Index: src/distrib/sets/lists/base/shl.mi
===================================================================
RCS file: /home/chs/netbsd/cvs/src/distrib/sets/lists/base/shl.mi,v
retrieving revision 1.823
diff -u -p -r1.823 shl.mi
--- src/distrib/sets/lists/base/shl.mi	10 Oct 2017 19:31:56 -0000	1.823
+++ src/distrib/sets/lists/base/shl.mi	10 Oct 2017 20:14:55 -0000
@@ -854,6 +854,9 @@
 ./usr/lib/libzfs.so				base-zfs-shlib		compatfile,zfs
 ./usr/lib/libzfs.so.0				base-zfs-shlib		compatfile,zfs
 ./usr/lib/libzfs.so.0.0				base-zfs-shlib		compatfile,zfs
+./usr/lib/libzfs_core.so			base-zfs-shlib		compatfile,zfs
+./usr/lib/libzfs_core.so.0			base-zfs-shlib		compatfile,zfs
+./usr/lib/libzfs_core.so.0.0			base-zfs-shlib		compatfile,zfs
 ./usr/lib/libzpool.so				base-zfs-shlib		compatfile,zfs
 ./usr/lib/libzpool.so.0				base-zfs-shlib		compatfile,zfs
 ./usr/lib/libzpool.so.0.0			base-zfs-shlib		compatfile,zfs
Index: src/distrib/sets/lists/comp/mi
===================================================================
RCS file: /home/chs/netbsd/cvs/src/distrib/sets/lists/comp/mi,v
retrieving revision 1.2151
diff -u -p -r1.2151 mi
--- src/distrib/sets/lists/comp/mi	10 Oct 2017 19:31:56 -0000	1.2151
+++ src/distrib/sets/lists/comp/mi	10 Oct 2017 20:14:55 -0000
@@ -3674,6 +3674,8 @@
 ./usr/lib/libz_p.a				comp-c-proflib		compatfile,profile
 ./usr/lib/libzfs.a				comp-zfs-lib		compatfile,zfs
 ./usr/lib/libzfs_p.a				comp-zfs-proflib	compatfile,zfs,profile
+./usr/lib/libzfs_core.a				comp-zfs-lib		compatfile,zfs
+./usr/lib/libzfs_core_p.a			comp-zfs-proflib	compatfile,zfs,profile
 ./usr/lib/libzpool.a				comp-zfs-lib		compatfile,zfs
 ./usr/lib/libzpool_p.a				comp-zfs-proflib	compatfile,zfs,profile
 ./usr/lib/pkgconfig				comp-c-lib
Index: src/distrib/sets/lists/comp/shl.mi
===================================================================
RCS file: /home/chs/netbsd/cvs/src/distrib/sets/lists/comp/shl.mi,v
retrieving revision 1.308
diff -u -p -r1.308 shl.mi
--- src/distrib/sets/lists/comp/shl.mi	10 Oct 2017 19:31:56 -0000	1.308
+++ src/distrib/sets/lists/comp/shl.mi	10 Oct 2017 20:14:55 -0000
@@ -274,7 +274,8 @@
 ./usr/lib/libwrap_pic.a				comp-c-piclib		compatfile,picinstall
 ./usr/lib/libz_pic.a				comp-c-piclib		compatfile,picinstall
 ./usr/lib/libzfs_pic.a				comp-zfs-piclib		compatfile,picinstall,zfs
-./usr/lib/libzpool_pic.a			comp-zfs-piclib		compatfile,zfs,picinstall
+./usr/lib/libzfs_core_pic.a			comp-zfs-piclib		compatfile,picinstall,zfs
+./usr/lib/libzpool_pic.a			comp-zfs-piclib		compatfile,picinstall,zfs
 ./usr/libexec/liblto_plugin.so			comp-c-bin              gcc
 ./usr/libexec/liblto_plugin.so.0		comp-c-bin              gcc
 ./usr/libexec/liblto_plugin.so.0.0		comp-c-bin              gcc
Index: src/distrib/sets/lists/debug/mi
===================================================================
RCS file: /home/chs/netbsd/cvs/src/distrib/sets/lists/debug/mi,v
retrieving revision 1.227
diff -u -p -r1.227 mi
--- src/distrib/sets/lists/debug/mi	10 Oct 2017 19:31:56 -0000	1.227
+++ src/distrib/sets/lists/debug/mi	10 Oct 2017 20:14:55 -0000
@@ -264,6 +264,7 @@
 ./usr/lib/liby_g.a				comp-c-debuglib		debuglib,compatfile
 ./usr/lib/libz_g.a				comp-c-debuglib		debuglib,compatfile
 ./usr/lib/libzfs_g.a				comp-c-debuglib		debuglib,compatfile,zfs
+./usr/lib/libzfs_core_g.a			comp-c-debuglib		debuglib,compatfile,zfs
 ./usr/lib/libzpool_g.a				comp-c-debuglib		debuglib,compatfile,zfs
 ./usr/libdata/debug/bin/cat.debug		comp-util-debug		debug
 ./usr/libdata/debug/bin/chio.debug		comp-util-debug		debug
Index: src/distrib/sets/lists/debug/shl.mi
===================================================================
RCS file: /home/chs/netbsd/cvs/src/distrib/sets/lists/debug/shl.mi,v
retrieving revision 1.185
diff -u -p -r1.185 shl.mi
--- src/distrib/sets/lists/debug/shl.mi	10 Oct 2017 19:31:57 -0000	1.185
+++ src/distrib/sets/lists/debug/shl.mi	10 Oct 2017 20:14:55 -0000
@@ -289,6 +289,7 @@
 ./usr/libdata/debug/usr/lib/libwrap.so.1.0.debug		comp-net-debug	debug,compatfile
 ./usr/libdata/debug/usr/lib/libz.so.1.0.debug			comp-sys-debug	debug,compatfile
 ./usr/libdata/debug/usr/lib/libzfs.so.0.0.debug			comp-zfs-debug	debug,compatfile,zfs
+./usr/libdata/debug/usr/lib/libzfs_core.so.0.0.debug		comp-zfs-debug	debug,compatfile,zfs
 ./usr/libdata/debug/usr/lib/libzpool.so.0.0.debug		comp-zfs-debug	debug,compatfile,zfs
 ./usr/libdata/debug/usr/lib/npf/ext_log.so.0.0.debug		comp-obsolete	debug,compatfile,npf,obsolete
 ./usr/libdata/debug/usr/lib/npf/ext_normalise.so.0.0.debug	comp-obsolete	debug,compatfile,npf,obsolete
Index: src/external/bsd/libproc/dist/libproc.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/bsd/libproc/dist/libproc.h,v
retrieving revision 1.3
diff -u -p -r1.3 libproc.h
--- src/external/bsd/libproc/dist/libproc.h	9 Jun 2017 01:17:25 -0000	1.3
+++ src/external/bsd/libproc/dist/libproc.h	10 Jun 2017 00:35:09 -0000
@@ -51,6 +51,11 @@ typedef void (*proc_child_func)(void *);
 #define PS_DEAD		5
 #define PS_LOST		6
 
+/* Flags for proc_attach(). */
+#define	PATTACH_FORCE	0x01
+#define	PATTACH_RDONLY	0x02
+#define	PATTACH_NOSTOP	0x04
+
 /* Reason values for proc_detach(). */
 #define PRELEASE_HANG	1
 #define PRELEASE_KILL	2
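
These attach flags track FreeBSD's libproc. Assuming the FreeBSD-style prototype int proc_attach(pid_t pid, int flags, struct proc_handle **pphdl), a caller ORs them into the flags argument; a minimal sketch with a hypothetical helper:

	#include <libproc.h>

	/* Hedged sketch: attach read-only and leave the target running. */
	static int
	attach_quietly(pid_t pid, struct proc_handle **hdl)
	{
		return proc_attach(pid, PATTACH_RDONLY | PATTACH_NOSTOP, hdl);
	}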
Index: src/external/cddl/osnet/Makefile.inc
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/Makefile.inc,v
retrieving revision 1.3
diff -u -p -r1.3 Makefile.inc
--- src/external/cddl/osnet/Makefile.inc	23 Jan 2016 21:22:45 -0000	1.3
+++ src/external/cddl/osnet/Makefile.inc	10 Jun 2017 06:04:19 -0000
@@ -1,8 +1,10 @@
-# $FreeBSD: src/cddl/Makefile.inc,v 1.6.2.1 2009/08/03 08:13:06 kensmith Exp $
+# $FreeBSD: head/cddl/Makefile.inc 270358 2014-08-22 20:04:51Z delphij $
 
 WARNS?=5
+
 OSNETDIR=	${NETBSDSRCDIR}/external/cddl/osnet
 
 OPENSOLARIS_USR_DISTDIR=${OSNETDIR}/dist
 OPENSOLARIS_SYS_DISTDIR=${OSNETDIR}/dist
+
 CPPFLAGS+=-Wno-unknown-pragmas -Wno-sign-compare -D_KERNTYPES
Index: src/external/cddl/osnet/Makefile.zfs
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/Makefile.zfs,v
retrieving revision 1.4
diff -u -p -r1.4 Makefile.zfs
--- src/external/cddl/osnet/Makefile.zfs	5 Sep 2012 23:08:42 -0000	1.4
+++ src/external/cddl/osnet/Makefile.zfs	10 Jun 2017 05:39:13 -0000
@@ -6,15 +6,13 @@
 NOGCCERROR=	yes
 
 # Basic compilation stuff.
-CPPFLAGS+=	"-D__va_list=va_list"
-CPPFLAGS+=	"-Doffsetof(s, m)=((size_t)(&(((s *)0)->m)))"
 CPPFLAGS+=	-std=c99
 
 # Pick a VTOC format - ick.
 CPPFLAGS+=	-D_SUNOS_VTOC_16
 CPPFLAGS+=	-D_PROPLIB_ZFS_CONFLICT
 
-CFLAGS+=	 -O0 -fno-inline
+#CFLAGS+=	 -O0 -fno-inline
 
 #DBG=	-g
 
@@ -36,14 +34,24 @@ CPPFLAGS+=	-I${ZFSDIR}/dist/lib/libshare
 CPPFLAGS+=	-I${ZFSDIR}/dist/lib/libumem
 CPPFLAGS+=	-I${ZFSDIR}/dist/lib/libuutil/common
 CPPFLAGS+=	-I${ZFSDIR}/dist/lib/libzfs/common
+CPPFLAGS+=	-I${ZFSDIR}/dist/lib/libzfs_core/common
 CPPFLAGS+=	-I${ZFSDIR}/dist/lib/libzpool/common
 
 CPPFLAGS+=      -I${ZFSDIR}/dist/common
 
+CWARNFLAGS+=	-Wno-missing-field-initializers
+CWARNFLAGS+=	-Wno-strict-prototypes
+CWARNFLAGS+=	-Wno-cast-qual
+CWARNFLAGS+=	-Wno-discarded-qualifiers
+CWARNFLAGS+=	-Wno-switch
+CWARNFLAGS+=	-Wno-missing-prototypes
+CWARNFLAGS+=	-Wno-unused-variable
+CWARNFLAGS+=	-Wno-shadow
+
 LIBAVL_SRCDIR= ${ZFSDIR}/lib/libavl
 LIBNVPAIR_SRCDIR= ${ZFSDIR}/lib/libnvpair
 LIBUMEM_SRCDIR= ${ZFSDIR}/lib/libumem
 LIBUUTIL_SRCDIR= ${ZFSDIR}/lib/libuutil
 LIBZFS_SRCDIR= ${ZFSDIR}/lib/libzfs
+LIBZFS_CORE_SRCDIR= ${ZFSDIR}/lib/libzfs_core
 LIBZPOOL_SRCDIR= ${ZFSDIR}/lib/libzpool
-
Index: src/external/cddl/osnet/dev/cyclic/cyclic.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/cyclic/cyclic.c,v
retrieving revision 1.7
diff -u -p -r1.7 cyclic.c
--- src/external/cddl/osnet/dev/cyclic/cyclic.c	1 Feb 2017 21:59:09 -0000	1.7
+++ src/external/cddl/osnet/dev/cyclic/cyclic.c	11 Jun 2017 12:07:53 -0000
@@ -23,7 +23,7 @@
  *
  * Portions Copyright 2008 John Birrell <jb@freebsd.org>
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/cddl/dev/cyclic/cyclic.c 227293 2011-11-07 06:44:47Z ed $
  *
  * This is a simplified version of the cyclic timer subsystem from
  * OpenSolaris. In the FreeBSD version, we don't use interrupt levels.
@@ -352,7 +352,6 @@
 #define mtx_unlock_spin(x) mutex_spin_exit(x)
 #define mtx_destroy(x) mutex_destroy(x)
 
-#define ASSERT(x) KASSERT(x)
 #define SYSINIT(a1, a2, a3, a4, a5)
 #define SYSUNINIT(a1, a2, a3, a4, a5)
 #define CPU_FOREACH(var) \
Index: src/external/cddl/osnet/dev/cyclic/cyclic_test.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/cyclic/cyclic_test.c,v
retrieving revision 1.2
diff -u -p -r1.2 cyclic_test.c
--- src/external/cddl/osnet/dev/cyclic/cyclic_test.c	21 Feb 2010 01:46:33 -0000	1.2
+++ src/external/cddl/osnet/dev/cyclic/cyclic_test.c	11 Jun 2017 12:07:24 -0000
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/cddl/dev/cyclic/cyclic_test.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/cyclic/cyclic_test.c 179260 2008-05-23 22:21:58Z jb $
  *
  */
 
Index: src/external/cddl/osnet/dev/cyclic/arm/cyclic_machdep.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/cyclic/arm/cyclic_machdep.c,v
retrieving revision 1.1
diff -u -p -r1.1 cyclic_machdep.c
--- src/external/cddl/osnet/dev/cyclic/arm/cyclic_machdep.c	5 Mar 2014 06:35:44 -0000	1.1
+++ src/external/cddl/osnet/dev/cyclic/arm/cyclic_machdep.c	11 Jun 2017 12:13:45 -0000
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/cddl/dev/cyclic/i386/cyclic_machdep.c 222813 2011-06-07 08:46:13Z attilio $
  *
  */
 
Index: src/external/cddl/osnet/dev/cyclic/i386/cyclic_machdep.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/cyclic/i386/cyclic_machdep.c,v
retrieving revision 1.4
diff -u -p -r1.4 cyclic_machdep.c
--- src/external/cddl/osnet/dev/cyclic/i386/cyclic_machdep.c	2 Dec 2012 01:05:16 -0000	1.4
+++ src/external/cddl/osnet/dev/cyclic/i386/cyclic_machdep.c	11 Jun 2017 12:09:34 -0000
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/cddl/dev/cyclic/i386/cyclic_machdep.c 222813 2011-06-07 08:46:13Z attilio $
  *
  */
 
Index: src/external/cddl/osnet/dev/dtmalloc/dtmalloc.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtmalloc/dtmalloc.c,v
retrieving revision 1.2
diff -u -p -r1.2 dtmalloc.c
--- src/external/cddl/osnet/dev/dtmalloc/dtmalloc.c	21 Feb 2010 01:46:33 -0000	1.2
+++ src/external/cddl/osnet/dev/dtmalloc/dtmalloc.c	10 Jun 2017 16:13:27 -0000
@@ -22,7 +22,7 @@
  *
  * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
  *
- * $FreeBSD: src/sys/cddl/dev/dtmalloc/dtmalloc.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtmalloc/dtmalloc.c 252325 2013-06-28 03:14:40Z markj $
  *
  */
 
@@ -30,6 +30,7 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
+#include <sys/ctype.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
@@ -113,8 +114,17 @@ dtmalloc_type_cb(struct malloc_type *mtp
 {
 	char name[DTRACE_FUNCNAMELEN];
 	struct malloc_type_internal *mtip = mtp->ks_handle;
+	int i;
 
+	/*
+	 * malloc_type descriptions are allowed to contain whitespace, but
+	 * DTrace probe identifiers are not, so replace the whitespace with
+	 * underscores.
+	 */
 	strlcpy(name, mtp->ks_shortdesc, sizeof(name));
+	for (i = 0; name[i] != 0; i++)
+		if (isspace(name[i]))
+			name[i] = '_';
 
 	if (dtrace_probe_lookup(dtmalloc_id, NULL, name, "malloc") != 0)
 		return;
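
The sanitization is a single in-place pass over the copied description, so e.g. a shortdesc of "foo bar" yields the probe function name "foo_bar". The same transformation in isolation (userland sketch, hypothetical helper name):

	#include <ctype.h>
	#include <string.h>

	/* Copy desc into name, mapping whitespace to '_' as above. */
	static void
	desc_to_probe_name(char *name, size_t namelen, const char *desc)
	{
		int i;

		strlcpy(name, desc, namelen);
		for (i = 0; name[i] != '\0'; i++)
			if (isspace((unsigned char)name[i]))
				name[i] = '_';
	}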
Index: src/external/cddl/osnet/dev/dtrace/dtrace_anon.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_anon.c,v
retrieving revision 1.2
diff -u -p -r1.2 dtrace_anon.c
--- src/external/cddl/osnet/dev/dtrace/dtrace_anon.c	21 Feb 2010 01:46:33 -0000	1.2
+++ src/external/cddl/osnet/dev/dtrace/dtrace_anon.c	12 Apr 2017 15:45:10 -0000
@@ -20,7 +20,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_anon.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_anon.c 179237 2008-05-23 05:59:42Z jb $
  */
 
 /*
Index: src/external/cddl/osnet/dev/dtrace/dtrace_cddl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_cddl.h,v
retrieving revision 1.2
diff -u -p -r1.2 dtrace_cddl.h
--- src/external/cddl/osnet/dev/dtrace/dtrace_cddl.h	21 Feb 2010 01:46:33 -0000	1.2
+++ src/external/cddl/osnet/dev/dtrace/dtrace_cddl.h	16 Jun 2017 17:07:29 -0000
@@ -20,7 +20,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_cddl.h,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_cddl.h 292388 2015-12-17 00:00:27Z markj $
  *
  */
 
@@ -28,6 +28,11 @@
 #define	_DTRACE_CDDL_H_
 
 #include <sys/proc.h>
+#include <sys/note.h>
+
+#define SYSCTL_NODE(...)
+#define SYSCTL_DECL(...)
+#define SYSCTL_INT(...)
 
 #define LOCK_LEVEL	10
 
@@ -38,6 +43,7 @@ typedef struct kdtrace_proc {
 	int		p_dtrace_probes;	/* Are there probes for this proc? */
 	u_int64_t	p_dtrace_count;		/* Number of DTrace tracepoints */
 	void		*p_dtrace_helpers;	/* DTrace helpers, if any */
+	int		p_dtrace_model;
 
 } kdtrace_proc_t;
 
@@ -61,6 +67,9 @@ typedef struct kdtrace_thread {
 					/* Handling a return probe. */
 			u_int8_t	_td_dtrace_ast;
 					/* Saved ast flag. */
+#ifdef __amd64__
+			u_int8_t	_td_dtrace_reg;
+#endif
 		} _tds;
 		u_long	_td_dtrace_ft;	/* Bitwise or of these flags. */
 	} _tdu;
@@ -69,6 +78,7 @@ typedef struct kdtrace_thread {
 #define	td_dtrace_step	_tdu._tds._td_dtrace_step
 #define	td_dtrace_ret	_tdu._tds._td_dtrace_ret
 #define	td_dtrace_ast	_tdu._tds._td_dtrace_ast
+#define	td_dtrace_reg	_tdu._tds._td_dtrace_reg
 
 	uintptr_t	td_dtrace_pc;	/* DTrace saved pc from fasttrap. */
 	uintptr_t	td_dtrace_npc;	/* DTrace next pc from fasttrap. */
@@ -76,8 +86,12 @@ typedef struct kdtrace_thread {
 					/* DTrace per-thread scratch location. */
 	uintptr_t	td_dtrace_astpc;
 					/* DTrace return sequence location. */
+#ifdef __amd64__
+	uintptr_t	td_dtrace_regv;
+#endif
 	u_int64_t	td_hrtime;	/* Last time on cpu. */
-	int		td_errno;	/* Syscall return value. */
+	void		*td_dtrace_sscr; /* Saved scratch space location. */
+	void		*td_systrace_args; /* syscall probe arguments. */
 } kdtrace_thread_t;
 
 /*
@@ -86,21 +100,47 @@ typedef struct kdtrace_thread {
  * that the separation on FreeBSD is a licensing constraint designed to
  * keep the GENERIC kernel BSD licensed.
  */
-#define	t_dtrace_vtime	l_dtrace->td_dtrace_vtime
-#define	t_dtrace_start	l_dtrace->td_dtrace_start
-#define	t_dtrace_stop	l_dtrace->td_dtrace_stop
-#define	t_dtrace_sig	l_dtrace->td_dtrace_sig
-#define	t_predcache	l_dtrace->td_predcache
-#define p_dtrace_helpers	p_dtrace->p_dtrace_helpers
+#define td_dtrace l_dtrace
+#define	t_dtrace_vtime	td_dtrace->td_dtrace_vtime
+#define	t_dtrace_start	td_dtrace->td_dtrace_start
+#define	t_dtrace_stop	td_dtrace->td_dtrace_stop
+#define	t_dtrace_sig	td_dtrace->td_dtrace_sig
+#define	t_predcache	td_dtrace->td_predcache
+#define	t_dtrace_ft	td_dtrace->td_dtrace_ft
+#define	t_dtrace_on	td_dtrace->td_dtrace_on
+#define	t_dtrace_step	td_dtrace->td_dtrace_step
+#define	t_dtrace_ret	td_dtrace->td_dtrace_ret
+#define	t_dtrace_ast	td_dtrace->td_dtrace_ast
+#define	t_dtrace_reg	td_dtrace->td_dtrace_reg
+#define	t_dtrace_pc	td_dtrace->td_dtrace_pc
+#define	t_dtrace_npc	td_dtrace->td_dtrace_npc
+#define	t_dtrace_scrpc	td_dtrace->td_dtrace_scrpc
+#define	t_dtrace_astpc	td_dtrace->td_dtrace_astpc
+#define	t_dtrace_regv	td_dtrace->td_dtrace_regv
+#define	t_dtrace_sscr	td_dtrace->td_dtrace_sscr
+#define	t_dtrace_systrace_args	td_dtrace->td_systrace_args
+#define	p_dtrace_helpers	p_dtrace->p_dtrace_helpers
+#define	p_dtrace_count	p_dtrace->p_dtrace_count
+#define	p_dtrace_probes	p_dtrace->p_dtrace_probes
+#define	p_model		p_dtrace->p_dtrace_model
+
+#define	DATAMODEL_NATIVE	0
+#ifdef __amd64__
+#define	DATAMODEL_LP64		0
+#define	DATAMODEL_ILP32		1
+#else
+#define	DATAMODEL_LP64		1
+#define	DATAMODEL_ILP32		0
+#endif
 
 /*
- * Definitions for fields in struct proc which are named differntly in FreeBSD.
+ * Definitions for fields in struct proc which are named differently in FreeBSD.
  */
 //#define	p_cred		p_ucred
 #define	p_parent	p_pptr
 
 /*
- * Definitions for fields in struct thread which are named differntly in NetBSD.
+ * Definitions for fields in struct thread which are named differently in NetBSD.
  */
 #define	t_procp		l_proc
 #define	t_tid		l_lid
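
The data-model constants are laid out so that DATAMODEL_NATIVE (0) always equals the model the kernel itself was built with: LP64 on amd64, ILP32 otherwise. That also makes a zero-initialized p_dtrace_model mean "native". A hedged sketch of the intended test, using the p_model alias defined above:

	/* Hedged sketch: does the traced process use the 32-bit model? */
	static int
	proc_is_ilp32(struct proc *p)
	{
		return (p->p_model == DATAMODEL_ILP32);
	}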
Index: src/external/cddl/osnet/dev/dtrace/dtrace_clone.c
===================================================================
RCS file: src/external/cddl/osnet/dev/dtrace/dtrace_clone.c
diff -N src/external/cddl/osnet/dev/dtrace/dtrace_clone.c
--- src/external/cddl/osnet/dev/dtrace/dtrace_clone.c	21 Feb 2010 01:46:33 -0000	1.2
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,63 +0,0 @@
-/*	$NetBSD: dtrace_clone.c,v 1.2 2010/02/21 01:46:33 darran Exp $	*/
-
-/*-
- * Copyright (C) 2006 John Birrell <jb@freebsd.org>.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice(s), this list of conditions and the following disclaimer as
- *    the first lines of this file unmodified other than the possible 
- *    addition of one or more copyright notices.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice(s), this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
- * DAMAGE.
- *
- * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_clone.c,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $
- *
- */
-
-static void
-dtrace_clone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev)
-{
-	int u = -1;
-	size_t len;
-
-	if (*dev != NULL)
-		return;
-
-	len = strlen(name);
-
-	if (len != 6 && len != 13)
-		return;
-
-	if (bcmp(name,"dtrace",6) != 0)
-		return;
-
-	if (len == 13 && bcmp(name,"dtrace/dtrace",13) != 0)
-		return;
-
-	/* Clone the device to the new minor number. */
-	if (clone_create(&dtrace_clones, &dtrace_cdevsw, &u, dev, 0) != 0)
-		/* Create the /dev/dtrace/dtraceNN entry. */
-		*dev = make_dev_cred(&dtrace_cdevsw, u, cred,
-		     UID_ROOT, GID_WHEEL, 0600, "dtrace/dtrace%d", u);
-	if (*dev != NULL) {
-		dev_ref(*dev);
-		(*dev)->si_flags |= SI_CHEAPCLONE;
-	}
-}
Index: src/external/cddl/osnet/dev/dtrace/dtrace_debug.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_debug.c,v
retrieving revision 1.8
diff -u -p -r1.8 dtrace_debug.c
--- src/external/cddl/osnet/dev/dtrace/dtrace_debug.c	5 Mar 2014 06:06:42 -0000	1.8
+++ src/external/cddl/osnet/dev/dtrace/dtrace_debug.c	10 May 2017 11:09:52 -0000
@@ -27,33 +27,35 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_debug.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_debug.c 315208 2017-03-13 18:43:00Z markj $
  *
  */
 
 static char const hex2ascii_data[] = "0123456789abcdefghijklmnopqrstuvwxyz";
 #define	hex2ascii(hex)	(hex2ascii_data[hex])
+#define MAXCPU MAXCPUS
 
 #ifdef DEBUG
 
 #define DTRACE_DEBUG_BUFR_SIZE	(32 * 1024)
 
 struct dtrace_debug_data {
+	u_long lock __aligned(CACHE_LINE_SIZE);
 	char bufr[DTRACE_DEBUG_BUFR_SIZE];
 	char *first;
 	char *last;
 	char *next;
-} dtrace_debug_data[MAXCPUS];
+} dtrace_debug_data[MAXCPU];
 
 static char dtrace_debug_bufr[DTRACE_DEBUG_BUFR_SIZE];
 
-static volatile u_long	dtrace_debug_flag[MAXCPUS];
-
 static void
 dtrace_debug_lock(int cpu)
 {
-	/* FIXME: use atomic_cmpset_ulong once we have it  */
-	while (atomic_cas_ulong(&dtrace_debug_flag[cpu], 0, 1) == 0)
+	void *tid;
+
+	tid = curlwp;
+	while (atomic_cas_ptr(&dtrace_debug_data[cpu].lock, 0, tid) != 0)
 		/* Loop until the lock is obtained. */
 		;
 }
@@ -61,7 +63,9 @@ dtrace_debug_lock(int cpu)
 static void
 dtrace_debug_unlock(int cpu)
 {
-	dtrace_debug_flag[cpu] = 0;
+
+	membar_producer();
+	dtrace_debug_data[cpu].lock = 0;
 }
 
 static void
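
dtrace_debug_lock() is a hand-rolled spinlock: NetBSD's atomic_cas_ptr() returns the word's previous value, so the acquire succeeds exactly when it returns 0 (hence the retry while it returns nonzero), and the release uses membar_producer() to publish the buffered writes before clearing the owner. The pattern in isolation (hedged sketch, hypothetical names):

	#include <sys/atomic.h>

	/* Spin until the CAS observes 0 and installs our identity. */
	static void
	debug_lock(volatile void **lockp, void *tid)
	{
		while (atomic_cas_ptr(lockp, NULL, tid) != NULL)
			continue;
	}

	/* Publish prior stores, then mark the lock free. */
	static void
	debug_unlock(volatile void **lockp)
	{
		membar_producer();
		*lockp = NULL;
	}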
@@ -83,25 +87,26 @@ dtrace_debug_init(void *dummy)
 	}
 }
 
-//SYSINIT(dtrace_debug_init, SI_SUB_KDTRACE, SI_ORDER_ANY, dtrace_debug_init, NULL);
-//SYSINIT(dtrace_debug_smpinit, SI_SUB_SMP, SI_ORDER_ANY, dtrace_debug_init, NULL);
+#ifdef __FreeBSD__
+SYSINIT(dtrace_debug_init, SI_SUB_KDTRACE, SI_ORDER_ANY, dtrace_debug_init, NULL);
+SYSINIT(dtrace_debug_smpinit, SI_SUB_SMP, SI_ORDER_ANY, dtrace_debug_init, NULL);
+#endif
 
 static void
 dtrace_debug_output(void)
 {
 	char *p;
+	int i;
 	struct dtrace_debug_data *d;
 	uintptr_t count;
 	CPU_INFO_ITERATOR cpuind;
 	struct cpu_info *cinfo;
-	cpuid_t cpuid;
 
 	for (CPU_INFO_FOREACH(cpuind, cinfo)) {
-	    	cpuid = cpu_index(cinfo);
+	    	i = cpu_index(cinfo);
+		dtrace_debug_lock(i);
 
-		dtrace_debug_lock(cpuid);
-
-		d = &dtrace_debug_data[cpuid];
+		d = &dtrace_debug_data[i];
 
 		count = 0;
 
@@ -129,7 +134,7 @@ dtrace_debug_output(void)
 		d->first = d->bufr;
 		d->next = d->bufr;
 
-		dtrace_debug_unlock(cpuid);
+		dtrace_debug_unlock(i);
 
 		if (count > 0) {
 			char *last = dtrace_debug_bufr + count;
@@ -158,10 +163,11 @@ dtrace_debug_output(void)
  */
 
 static __inline void
-dtrace_debug__putc(char c)
+dtrace_debug__putc(int cpu, char c)
 {
-	struct dtrace_debug_data *d = &dtrace_debug_data[cpu_number()];
+	struct dtrace_debug_data *d;
 
+	d = &dtrace_debug_data[cpu];
 	*d->next++ = c;
 
 	if (d->next == d->last)
@@ -179,24 +185,30 @@ dtrace_debug__putc(char c)
 static void __used
 dtrace_debug_putc(char c)
 {
-	dtrace_debug_lock(cpu_number());
+	int cpu;
+
+	cpu = cpu_number();
+	dtrace_debug_lock(cpu);
 
-	dtrace_debug__putc(c);
+	dtrace_debug__putc(cpu, c);
 
-	dtrace_debug_unlock(cpu_number());
+	dtrace_debug_unlock(cpu);
 }
 
 static void __used
 dtrace_debug_puts(const char *s)
 {
-	dtrace_debug_lock(cpu_number());
+	int cpu;
+
+	cpu = cpu_number();
+	dtrace_debug_lock(cpu);
 
 	while (*s != '\0')
-		dtrace_debug__putc(*s++);
+		dtrace_debug__putc(cpu, *s++);
 
-	dtrace_debug__putc('\0');
+	dtrace_debug__putc(cpu, '\0');
 
-	dtrace_debug_unlock(cpu_number());
+	dtrace_debug_unlock(cpu);
 }
 
 /*
@@ -205,30 +217,30 @@ dtrace_debug_puts(const char *s)
  * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse
  * order; return an optional length and a pointer to the last character
  * written in the buffer (i.e., the first character of the string).
- * The buffer pointed to by `xbuf' must have length >= MAXNBUF.
+ * The buffer pointed to by `nbuf' must have length >= MAXNBUF.
  */
 static char *
-dtrace_debug_ksprintn(char *xbuf, uintmax_t num, int base, int *lenp, int upper)
+dtrace_debug_ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper)
 {
 	char *p, c;
 
-	p = xbuf;
+	p = nbuf;
 	*p = '\0';
 	do {
 		c = hex2ascii(num % base);
 		*++p = upper ? toupper(c) : c;
 	} while (num /= base);
 	if (lenp)
-		*lenp = p - xbuf;
+		*lenp = p - nbuf;
 	return (p);
 }
 
 #define MAXNBUF (sizeof(intmax_t) * NBBY + 1)
 
 static void
-dtrace_debug_vprintf(const char *fmt, va_list ap)
+dtrace_debug_vprintf(int cpu, const char *fmt, va_list ap)
 {
-	char xbuf[MAXNBUF];
+	char nbuf[MAXNBUF];
 	const char *p, *percent, *q;
 	u_char *up;
 	int ch, n;
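
As a worked example, dtrace_debug_ksprintn(nbuf, 500, 10, &len, 0) stores '\0', '0', '0', '5' into nbuf (terminator first, then digits least-significant first), sets len to 3, and returns a pointer to the '5'; the callers below recover the correct digit order by walking that pointer backwards with dtrace_debug__putc(cpu, *p--).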
@@ -250,10 +262,10 @@ dtrace_debug_vprintf(const char *fmt, va
 		width = 0;
 		while ((ch = (u_char)*fmt++) != '%' || stop) {
 			if (ch == '\0') {
-				dtrace_debug__putc('\0');
+				dtrace_debug__putc(cpu, '\0');
 				return;
 			}
-			dtrace_debug__putc(ch);
+			dtrace_debug__putc(cpu, ch);
 		}
 		percent = fmt - 1;
 		qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0;
@@ -273,7 +285,7 @@ reswitch:	switch (ch = (u_char)*fmt++) {
 			ladjust = 1;
 			goto reswitch;
 		case '%':
-			dtrace_debug__putc(ch);
+			dtrace_debug__putc(cpu, ch);
 			break;
 		case '*':
 			if (!dot) {
@@ -307,8 +319,8 @@ reswitch:	switch (ch = (u_char)*fmt++) {
 		case 'b':
 			num = (u_int)va_arg(ap, int);
 			p = va_arg(ap, char *);
-			for (q = dtrace_debug_ksprintn(xbuf, num, *p++, NULL, 0); *q;)
-				dtrace_debug__putc(*q--);
+			for (q = dtrace_debug_ksprintn(nbuf, num, *p++, NULL, 0); *q;)
+				dtrace_debug__putc(cpu, *q--);
 
 			if (num == 0)
 				break;
@@ -316,19 +328,19 @@ reswitch:	switch (ch = (u_char)*fmt++) {
 			for (tmp = 0; *p;) {
 				n = *p++;
 				if (num & (1 << (n - 1))) {
-					dtrace_debug__putc(tmp ? ',' : '<');
+					dtrace_debug__putc(cpu, tmp ? ',' : '<');
 					for (; (n = *p) > ' '; ++p)
-						dtrace_debug__putc(n);
+						dtrace_debug__putc(cpu, n);
 					tmp = 1;
 				} else
 					for (; *p > ' '; ++p)
 						continue;
 			}
 			if (tmp)
-				dtrace_debug__putc('>');
+				dtrace_debug__putc(cpu, '>');
 			break;
 		case 'c':
-			dtrace_debug__putc(va_arg(ap, int));
+			dtrace_debug__putc(cpu, va_arg(ap, int));
 			break;
 		case 'D':
 			up = va_arg(ap, u_char *);
@@ -336,12 +348,12 @@ reswitch:	switch (ch = (u_char)*fmt++) {
 			if (!width)
 				width = 16;
 			while(width--) {
-				dtrace_debug__putc(hex2ascii(*up >> 4));
-				dtrace_debug__putc(hex2ascii(*up & 0x0f));
+				dtrace_debug__putc(cpu, hex2ascii(*up >> 4));
+				dtrace_debug__putc(cpu, hex2ascii(*up & 0x0f));
 				up++;
 				if (width)
 					for (q=p;*q;q++)
-						dtrace_debug__putc(*q);
+						dtrace_debug__putc(cpu, *q);
 			}
 			break;
 		case 'd':
@@ -413,12 +425,12 @@ reswitch:	switch (ch = (u_char)*fmt++) {
 
 			if (!ladjust && width > 0)
 				while (width--)
-					dtrace_debug__putc(padc);
+					dtrace_debug__putc(cpu, padc);
 			while (n--)
-				dtrace_debug__putc(*p++);
+				dtrace_debug__putc(cpu, *p++);
 			if (ladjust && width > 0)
 				while (width--)
-					dtrace_debug__putc(padc);
+					dtrace_debug__putc(cpu, padc);
 			break;
 		case 't':
 			tflag = 1;
@@ -479,7 +491,7 @@ number:
 				neg = 1;
 				num = -(intmax_t)num;
 			}
-			p = dtrace_debug_ksprintn(xbuf, num, base, &tmp, upper);
+			p = dtrace_debug_ksprintn(nbuf, num, base, &tmp, upper);
 			if (sharpflag && num != 0) {
 				if (base == 8)
 					tmp++;
@@ -492,32 +504,32 @@ number:
 			if (!ladjust && padc != '0' && width
 			    && (width -= tmp) > 0)
 				while (width--)
-					dtrace_debug__putc(padc);
+					dtrace_debug__putc(cpu, padc);
 			if (neg)
-				dtrace_debug__putc('-');
+				dtrace_debug__putc(cpu, '-');
 			if (sharpflag && num != 0) {
 				if (base == 8) {
-					dtrace_debug__putc('0');
+					dtrace_debug__putc(cpu, '0');
 				} else if (base == 16) {
-					dtrace_debug__putc('0');
-					dtrace_debug__putc('x');
+					dtrace_debug__putc(cpu, '0');
+					dtrace_debug__putc(cpu, 'x');
 				}
 			}
 			if (!ladjust && width && (width -= tmp) > 0)
 				while (width--)
-					dtrace_debug__putc(padc);
+					dtrace_debug__putc(cpu, padc);
 
 			while (*p)
-				dtrace_debug__putc(*p--);
+				dtrace_debug__putc(cpu, *p--);
 
 			if (ladjust && width && (width -= tmp) > 0)
 				while (width--)
-					dtrace_debug__putc(padc);
+					dtrace_debug__putc(cpu, padc);
 
 			break;
 		default:
 			while (percent < fmt)
-				dtrace_debug__putc(*percent++);
+				dtrace_debug__putc(cpu, *percent++);
 			/*
 			 * Since we ignore a formatting argument it is no 
 			 * longer safe to obey the remaining formatting
@@ -529,23 +541,25 @@ number:
 		}
 	}
 
-	dtrace_debug__putc('\0');
+	dtrace_debug__putc(cpu, '\0');
 }
 
 void
 dtrace_debug_printf(const char *fmt, ...)
 {
 	va_list ap;
+	int cpu;
 
-	dtrace_debug_lock(cpu_number());
+	cpu = cpu_number();
+	dtrace_debug_lock(cpu);
 
 	va_start(ap, fmt);
 
-	dtrace_debug_vprintf(fmt, ap);
+	dtrace_debug_vprintf(cpu, fmt, ap);
 
 	va_end(ap);
 
-	dtrace_debug_unlock(cpu_number());
+	dtrace_debug_unlock(cpu);
 }
 
 #else
@@ -554,4 +568,9 @@ dtrace_debug_printf(const char *fmt, ...
 #define dtrace_debug_puts(_s)
 #define dtrace_debug_printf(fmt, ...)
 
+static void
+dtrace_debug_init(void *dummy)
+{
+}
+
 #endif
Index: src/external/cddl/osnet/dev/dtrace/dtrace_hacks.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_hacks.c,v
retrieving revision 1.5
diff -u -p -r1.5 dtrace_hacks.c
--- src/external/cddl/osnet/dev/dtrace/dtrace_hacks.c	23 Jun 2016 06:44:52 -0000	1.5
+++ src/external/cddl/osnet/dev/dtrace/dtrace_hacks.c	5 May 2017 11:52:00 -0000
@@ -1,12 +1,13 @@
 /*	$NetBSD: dtrace_hacks.c,v 1.5 2016/06/23 06:44:52 pgoyette Exp $	*/
 
-/* $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_hacks.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ */
+/* $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_hacks.c 281916 2015-04-24 03:19:30Z markj $ */
 /* XXX Hacks.... */
 
 dtrace_cacheid_t dtrace_predcache_id;
 
 boolean_t
-priv_policy_only(const cred_t *a, int b, boolean_t c)
+priv_policy_only(const cred_t *cr, int b, boolean_t c)
 {
-	return 1;
+
+	return kauth_authorize_generic(cr, KAUTH_GENERIC_ISSUSER, NULL) == 0;
 }
Index: src/external/cddl/osnet/dev/dtrace/dtrace_ioctl.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_ioctl.c,v
retrieving revision 1.6
diff -u -p -r1.6 dtrace_ioctl.c
--- src/external/cddl/osnet/dev/dtrace/dtrace_ioctl.c	30 Sep 2015 20:59:13 -0000	1.6
+++ src/external/cddl/osnet/dev/dtrace/dtrace_ioctl.c	11 Jun 2017 11:47:33 -0000
@@ -20,18 +20,106 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_ioctl.c,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_ioctl.c 313262 2017-02-05 02:39:12Z markj $
  *
  */
 
-static int dtrace_verbose_ioctl=0;
-//SYSCTL_INT(_debug_dtrace, OID_AUTO, verbose_ioctl, CTLFLAG_RW, &dtrace_verbose_ioctl, 0, "");
+static int dtrace_verbose_ioctl;
+SYSCTL_INT(_debug_dtrace, OID_AUTO, verbose_ioctl, CTLFLAG_RW,
+    &dtrace_verbose_ioctl, 0, "log DTrace ioctls");
+
+#define pfind(pid) proc_find((pid))
 
 #define DTRACE_IOCTL_PRINTF(fmt, ...)	if (dtrace_verbose_ioctl) printf(fmt, ## __VA_ARGS__ )
 
+#ifdef __FreeBSD__
+static int
+dtrace_ioctl_helper(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
+    struct thread *td)
+#endif
+#ifdef __NetBSD__
+static int
+dtrace_ioctl_helper(dev_t dev, u_long cmd, caddr_t addr, int flags)
+#endif
+{
+	struct proc *p;
+	dof_helper_t *dhp;
+	dof_hdr_t *dof;
+	int rval;
+
+	dhp = NULL;
+	dof = NULL;
+	rval = 0;
+	switch (cmd) {
+	case DTRACEHIOC_ADDDOF:
+		dhp = (dof_helper_t *)addr;
+		addr = (caddr_t)(uintptr_t)dhp->dofhp_dof;
+		p = curproc;
+		if (p->p_pid == dhp->dofhp_pid) {
+			dof = dtrace_dof_copyin((uintptr_t)addr, &rval);
+		} else {
+#ifdef __FreeBSD__
+			p = pfind(dhp->dofhp_pid);
+			if (p == NULL)
+				return (EINVAL);
+			if (!P_SHOULDSTOP(p) ||
+			    (p->p_flag & (P_TRACED | P_WEXIT)) != P_TRACED ||
+			    p->p_pptr != curproc) {
+				PROC_UNLOCK(p);
+				return (EINVAL);
+			}
+			_PHOLD(p);
+			PROC_UNLOCK(p);
+			dof = dtrace_dof_copyin_proc(p, (uintptr_t)addr, &rval);
+#endif
+#ifdef __NetBSD__
+			dof = dtrace_dof_copyin_pid(dhp->dofhp_pid, addr, &rval);
+#endif
+		}
+
+		if (dof == NULL) {
+#ifdef __FreeBSD__
+			if (p != curproc)
+				PRELE(p);
+#endif
+			break;
+		}
+
+		mutex_enter(&dtrace_lock);
+		if ((rval = dtrace_helper_slurp(dof, dhp, p)) != -1) {
+			dhp->dofhp_gen = rval;
+			rval = 0;
+		} else {
+			rval = EINVAL;
+		}
+		mutex_exit(&dtrace_lock);
+#ifdef __FreeBSD__
+		if (p != curproc)
+			PRELE(p);
+#endif
+		break;
+	case DTRACEHIOC_REMOVE:
+		mutex_enter(&dtrace_lock);
+		rval = dtrace_helper_destroygen(NULL, *(int *)(uintptr_t)addr);
+		mutex_exit(&dtrace_lock);
+		break;
+	default:
+		rval = ENOTTY;
+		break;
+	}
+	return (rval);
+}
+
 /* ARGSUSED */
+#ifdef __FreeBSD__
+static int
+dtrace_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
+    int flags __unused, struct thread *td)
+#endif
+#ifdef __NetBSD__
 static int
 dtrace_ioctl(struct file *fp, u_long cmd, void *addr)
+#endif
 {
 	dtrace_state_t *state = (dtrace_state_t *)fp->f_data;
 	int error = 0;
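
Userland drives this helper path with a dof_helper_t; a hedged sketch of the registration step (in practice issued by libdtrace's drti glue at program startup; DOF construction and error handling elided, field use inferred from the handler above):

	#include <sys/ioctl.h>
	#include <sys/dtrace.h>		/* dof_hdr_t, dof_helper_t */
	#include <string.h>
	#include <unistd.h>

	/* Hedged sketch: register helper DOF for the calling process. */
	static int
	register_helper_dof(int helper_fd, dof_hdr_t *dof)
	{
		dof_helper_t dh;

		memset(&dh, 0, sizeof(dh));
		dh.dofhp_dof = (uintptr_t)dof;	/* image the handler copies in */
		dh.dofhp_addr = (uintptr_t)dof;	/* load address of the DOF */
		dh.dofhp_pid = getpid();	/* matches curproc in the handler */

		if (ioctl(helper_fd, DTRACEHIOC_ADDDOF, &dh) == -1)
			return (-1);
		return (dh.dofhp_gen);	/* generation for DTRACEHIOC_REMOVE */
	}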
@@ -224,6 +312,7 @@ dtrace_ioctl(struct file *fp, u_long cmd
 			desc.dtbd_drops = buf->dtb_drops;
 			desc.dtbd_errors = buf->dtb_errors;
 			desc.dtbd_oldest = buf->dtb_xamot_offset;
+			desc.dtbd_timestamp = dtrace_gethrtime();
 
 			mutex_exit(&dtrace_lock);
 
@@ -278,6 +367,7 @@ dtrace_ioctl(struct file *fp, u_long cmd
 		desc.dtbd_drops = buf->dtb_xamot_drops;
 		desc.dtbd_errors = buf->dtb_xamot_errors;
 		desc.dtbd_oldest = 0;
+		desc.dtbd_timestamp = buf->dtb_switched;
 
 		mutex_exit(&dtrace_lock);
 
@@ -361,7 +451,8 @@ dtrace_ioctl(struct file *fp, u_long cmd
 			return (EBUSY);
 		}
 
-		if (dtrace_dof_slurp(dof, vstate, curlwp->l_cred, &enab, 0, B_TRUE) != 0) {
+		if (dtrace_dof_slurp(dof, vstate, CRED(), &enab, 0, 0,
+		    B_TRUE) != 0) {
 			mutex_exit(&dtrace_lock);
 			mutex_exit(&cpu_lock);
 			dtrace_dof_destroy(dof);
@@ -528,19 +619,25 @@ dtrace_ioctl(struct file *fp, u_long cmd
 			return (EINVAL);
 
 		mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
 		mutex_enter(&mod_lock);
+#endif
 		mutex_enter(&dtrace_lock);
 
 		if (desc->dtargd_id > dtrace_nprobes) {
 			mutex_exit(&dtrace_lock);
+#ifdef illumos
 			mutex_exit(&mod_lock);
+#endif
 			mutex_exit(&dtrace_provider_lock);
 			return (EINVAL);
 		}
 
 		if ((probe = dtrace_probes[desc->dtargd_id - 1]) == NULL) {
 			mutex_exit(&dtrace_lock);
+#ifdef illumos
 			mutex_exit(&mod_lock);
+#endif
 			mutex_exit(&dtrace_provider_lock);
 			return (EINVAL);
 		}
@@ -564,7 +661,9 @@ dtrace_ioctl(struct file *fp, u_long cmd
 			    probe->dtpr_id, probe->dtpr_arg, desc);
 		}
 
+#ifdef illumos
 		mutex_exit(&mod_lock);
+#endif
 		mutex_exit(&dtrace_provider_lock);
 
 		return (0);
@@ -710,7 +809,7 @@ again:
 	case DTRACEIOC_STATUS: {
 		dtrace_status_t *stat = (dtrace_status_t *) addr;
 		dtrace_dstate_t *dstate;
-		int j;
+		int i, j;
 		uint64_t nerrs;
 		CPU_INFO_ITERATOR cpuind;
 		struct cpu_info *cinfo;
@@ -742,24 +841,25 @@ again:
 		dstate = &state->dts_vstate.dtvs_dynvars;
 
 		for (CPU_INFO_FOREACH(cpuind, cinfo)) {
-		    	int ci = cpu_index(cinfo);
-			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[ci];
+		    	i = cpu_index(cinfo);
+
+			dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
 
 			stat->dtst_dyndrops += dcpu->dtdsc_drops;
 			stat->dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
 			stat->dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
 
-			if (state->dts_buffer[ci].dtb_flags & DTRACEBUF_FULL)
+			if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
 				stat->dtst_filled++;
 
-			nerrs += state->dts_buffer[ci].dtb_errors;
+			nerrs += state->dts_buffer[i].dtb_errors;
 
 			for (j = 0; j < state->dts_nspeculations; j++) {
 				dtrace_speculation_t *spec;
 				dtrace_buffer_t *buf;
 
 				spec = &state->dts_speculations[j];
-				buf = &spec->dtsp_buffer[ci];
+				buf = &spec->dtsp_buffer[i];
 				stat->dtst_specdrops += buf->dtb_xamot_drops;
 			}
 		}
@@ -777,15 +877,16 @@ again:
 		return (0);
 	}
 	case DTRACEIOC_STOP: {
+		int rval;
 		processorid_t *cpuid = (processorid_t *) addr;
 
 		DTRACE_IOCTL_PRINTF("%s(%d): DTRACEIOC_STOP\n",__func__,__LINE__);
 
 		mutex_enter(&dtrace_lock);
-		error = dtrace_state_stop(state, cpuid);
+		rval = dtrace_state_stop(state, cpuid);
 		mutex_exit(&dtrace_lock);
 
-		return (error);
+		return (rval);
 	}
 	default:
 		error = ENOTTY;
Index: src/external/cddl/osnet/dev/dtrace/dtrace_load.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_load.c,v
retrieving revision 1.3
diff -u -p -r1.3 dtrace_load.c
--- src/external/cddl/osnet/dev/dtrace/dtrace_load.c	31 Aug 2011 21:57:16 -0000	1.3
+++ src/external/cddl/osnet/dev/dtrace/dtrace_load.c	15 May 2017 23:58:54 -0000
@@ -20,7 +20,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_load.c,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_load.c 309069 2016-11-23 22:50:20Z gnn $
  *
  */
 
@@ -30,8 +30,32 @@ void dtrace_gethrtime_init(void *);
 
 int dtrace_helptrace_size=0;
 
-#ifndef mutex_init
-#define	mutex_init(a, b, c, d)	mutex_init(a, c, IPL_NONE)
+#ifdef __FreeBSD__
+#ifndef EARLY_AP_STARTUP
+static void
+dtrace_ap_start(void *dummy)
+{
+	int i;
+
+	mutex_enter(&cpu_lock);
+
+	/* Setup the rest of the CPUs. */
+	CPU_FOREACH(i) {
+		if (i == 0)
+			continue;
+
+		(void) dtrace_cpu_setup(CPU_CONFIG, i);
+	}
+
+	mutex_exit(&cpu_lock);
+}
+
+SYSINIT(dtrace_ap_start, SI_SUB_SMP, SI_ORDER_ANY, dtrace_ap_start, NULL);
+#endif
+#endif
+
+#ifdef __NetBSD__
+void *dtrace_modcb;
 #endif
 
 static void
@@ -44,6 +68,17 @@ dtrace_load(void *dummy)
 	dtrace_debug_init(NULL);
 	dtrace_gethrtime_init(NULL);
 
+#ifdef __FreeBSD__
+	/*
+	 * DTrace uses negative logic for the destructive mode switch, so it
+	 * is required to translate from the sysctl which uses positive logic.
+	 */ 
+	if (dtrace_allow_destructive)
+		dtrace_destructive_disallow = 0;
+	else
+		dtrace_destructive_disallow = 1;
+#endif
+
 	/* Hook into the trap handler. */
 	dtrace_trap_func = dtrace_trap;
 
@@ -53,11 +88,23 @@ dtrace_load(void *dummy)
 	/* Hang our hook for exceptions. */
 	dtrace_invop_init();
 
-	/*
-	 * XXX This is a short term hack to avoid having to comment
-	 * out lots and lots of lock/unlock calls.
-	 */
-	mutex_init(&mod_lock,"XXX mod_lock hack", MUTEX_DEFAULT, NULL);
+#ifdef __FreeBSD__
+	dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri, 0, 0, 0);
+
+	dtrace_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
+
+	/* Register callbacks for linker file load and unload events. */
+	dtrace_kld_load_tag = EVENTHANDLER_REGISTER(kld_load,
+	    dtrace_kld_load, NULL, EVENTHANDLER_PRI_ANY);
+	dtrace_kld_unload_try_tag = EVENTHANDLER_REGISTER(kld_unload_try,
+	    dtrace_kld_unload_try, NULL, EVENTHANDLER_PRI_ANY);
+#endif
+
+#ifdef __NetBSD__
+	dtrace_arena = vmem_create("dtrace", 1, INT_MAX, 1,
+			NULL, NULL, NULL, 0, VM_SLEEP, IPL_NONE);
+
+#endif
 
 	/*
 	 * Initialise the mutexes without 'witness' because the dtrace
@@ -70,7 +117,9 @@ dtrace_load(void *dummy)
 	mutex_init(&dtrace_lock,"dtrace probe state", MUTEX_DEFAULT, NULL);
 	mutex_init(&dtrace_provider_lock,"dtrace provider state", MUTEX_DEFAULT, NULL);
 	mutex_init(&dtrace_meta_lock,"dtrace meta-provider state", MUTEX_DEFAULT, NULL);
+#ifdef DEBUG
 	mutex_init(&dtrace_errlock,"dtrace error lock", MUTEX_DEFAULT, NULL);
+#endif
 
 	mutex_enter(&dtrace_provider_lock);
 	mutex_enter(&dtrace_lock);
@@ -78,9 +127,6 @@ dtrace_load(void *dummy)
 
 	ASSERT(MUTEX_HELD(&cpu_lock));
 
-	dtrace_arena = vmem_create("dtrace", 1, INT_MAX, 1,
-			NULL, NULL, NULL, 0, VM_SLEEP, IPL_NONE);
-
 	dtrace_state_cache = kmem_cache_create(__UNCONST("dtrace_state_cache"),
 	    sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
 	    NULL, NULL, NULL, NULL, NULL, 0);
@@ -130,19 +176,6 @@ dtrace_load(void *dummy)
 	    dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
 
 	mutex_exit(&cpu_lock);
-
-	/*
-	 * If DTrace helper tracing is enabled, we need to allocate the
-	 * trace buffer and initialize the values.
-	 */
-	if (dtrace_helptrace_enabled) {
-		ASSERT(dtrace_helptrace_buffer == NULL);
-		dtrace_helptrace_buffer =
-		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
-		dtrace_helptrace_next = 0;
-		dtrace_helptrace_size = dtrace_helptrace_bufsize;
-	}
-
 	mutex_exit(&dtrace_lock);
 	mutex_exit(&dtrace_provider_lock);
 
@@ -155,9 +188,17 @@ dtrace_load(void *dummy)
 
 	mutex_exit(&cpu_lock);
 
+#ifdef __NetBSD__
 	dtrace_anon_init(NULL);
-#if 0
-	dtrace_dev = make_dev(&dtrace_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "dtrace/dtrace");
+
+	dtrace_modcb = module_register_callbacks(dtrace_module_loaded,
+						 dtrace_module_unloaded);
+#endif
+#ifdef __FreeBSD__
+	dtrace_dev = make_dev(&dtrace_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+	    "dtrace/dtrace");
+	helper_dev = make_dev(&helper_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
+	    "dtrace/helper");
 #endif
 
 	return;
Index: src/external/cddl/osnet/dev/dtrace/dtrace_modevent.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_modevent.c,v
retrieving revision 1.5
diff -u -p -r1.5 dtrace_modevent.c
--- src/external/cddl/osnet/dev/dtrace/dtrace_modevent.c	28 Nov 2015 22:41:36 -0000	1.5
+++ src/external/cddl/osnet/dev/dtrace/dtrace_modevent.c	12 Apr 2017 15:44:25 -0000
@@ -20,7 +20,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_modevent.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_load.c 309069 2016-11-23 22:50:20Z gnn $
  *
  */
 
Index: src/external/cddl/osnet/dev/dtrace/dtrace_sysctl.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_sysctl.c,v
retrieving revision 1.3
diff -u -p -r1.3 dtrace_sysctl.c
--- src/external/cddl/osnet/dev/dtrace/dtrace_sysctl.c	23 Apr 2010 11:39:52 -0000	1.3
+++ src/external/cddl/osnet/dev/dtrace/dtrace_sysctl.c	12 Apr 2017 15:49:42 -0000
@@ -20,16 +20,10 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_sysctl.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_sysctl.c 309069 2016-11-23 22:50:20Z gnn $
  *
  */
 
-int	dtrace_debug = 0;
-#if 0
-TUNABLE_INT("debug.dtrace.debug", &dtrace_debug);
-SYSCTL_INT(_debug_dtrace, OID_AUTO, debug, CTLFLAG_RW, &dtrace_debug, 0, "");
-#endif
-
 #if 0	/* XXX TBD sysctl */
 /* Report registered DTrace providers. */
 static int
@@ -82,6 +76,27 @@ sysctl_dtrace_providers(SYSCTL_HANDLER_A
 	return (error);
 }
 
+SYSCTL_NODE(_debug, OID_AUTO, dtrace, CTLFLAG_RD, 0, "DTrace debug parameters");
+
 SYSCTL_PROC(_debug_dtrace, OID_AUTO, providers, CTLTYPE_STRING | CTLFLAG_RD,
-    0, 0, sysctl_dtrace_providers, "A", "");
+    0, 0, sysctl_dtrace_providers, "A", "available DTrace providers");
+
+SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RD, 0, "DTrace parameters");
+
+SYSCTL_INT(_kern_dtrace, OID_AUTO, err_verbose, CTLFLAG_RW,
+    &dtrace_err_verbose, 0,
+    "print DIF and DOF validation errors to the message buffer");
+
+SYSCTL_INT(_kern_dtrace, OID_AUTO, memstr_max, CTLFLAG_RW, &dtrace_memstr_max,
+    0, "largest allowed argument to memstr(), 0 indicates no limit");
+
+SYSCTL_QUAD(_kern_dtrace, OID_AUTO, dof_maxsize, CTLFLAG_RW,
+    &dtrace_dof_maxsize, 0, "largest allowed DOF table");
+
+SYSCTL_QUAD(_kern_dtrace, OID_AUTO, helper_actions_max, CTLFLAG_RW,
+    &dtrace_helper_actions_max, 0, "maximum number of allowed helper actions");
+
+SYSCTL_INT(_security_bsd, OID_AUTO, allow_destructive_dtrace, CTLFLAG_RDTUN,
+    &dtrace_allow_destructive, 1, "Allow destructive mode DTrace scripts");
+
 #endif
Index: src/external/cddl/osnet/dev/dtrace/dtrace_test.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_test.c,v
retrieving revision 1.2
diff -u -p -r1.2 dtrace_test.c
--- src/external/cddl/osnet/dev/dtrace/dtrace_test.c	21 Feb 2010 01:46:33 -0000	1.2
+++ src/external/cddl/osnet/dev/dtrace/dtrace_test.c	12 Apr 2017 15:53:25 -0000
@@ -24,18 +24,26 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_test.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_test.c 258622 2013-11-26 08:46:27Z avg $
  *
  */
-
 #include <sys/cdefs.h>
 #include <sys/types.h>
 #include <sys/param.h>
+#include <sys/systm.h>
+
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
+#include <sys/sdt.h>
+#include <sys/sysctl.h>
 #include <sys/vnode.h>
 
+SDT_PROVIDER_DEFINE(test);
+
+SDT_PROBE_DEFINE7(test, , , sdttest, "int", "int", "int", "int", "int",
+    "int", "int");
+
 /*
  * These are variables that the DTrace test suite references in the
  * Solaris kernel. We define them here so that the tests function 
@@ -47,6 +55,33 @@ typedef struct vnode vnode_t;
 vnode_t dummy;
 vnode_t *rootvp = &dummy;
 
+/*
+ * Test SDT probes with more than 5 arguments. On amd64, such probes require
+ * special handling since only the first 5 arguments will be passed to
+ * dtrace_probe() in registers; the rest must be fetched off the stack.
+ */
+static int
+dtrace_test_sdttest(SYSCTL_HANDLER_ARGS)
+{
+	int val, error;
+
+	val = 0;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error || req->newptr == NULL)
+		return (error);
+	else if (val == 0)
+		return (0);
+
+	SDT_PROBE7(test, , , sdttest, 1, 2, 3, 4, 5, 6, 7);
+
+	return (error);
+}
+
+static SYSCTL_NODE(_debug, OID_AUTO, dtracetest, CTLFLAG_RD, 0, "");
+
+SYSCTL_PROC(_debug_dtracetest, OID_AUTO, sdttest, CTLTYPE_INT | CTLFLAG_RW,
+    NULL, 0, dtrace_test_sdttest, "I", "Trigger the SDT test probe");
+
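+
+Once the sysctl is wired up, poking it from userland (e.g. sysctl
+debug.dtracetest.sdttest=1) fires test:::sdttest with the constant arguments
+1 through 7, so a one-liner such as
+dtrace -n 'test:::sdttest { trace(arg5); trace(arg6); }' can confirm that the
+sixth and seventh arguments are fetched off the stack intact.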
 static int
 dtrace_test_modevent(module_t mod, int type, void *data)
 {
Index: src/external/cddl/osnet/dev/dtrace/dtrace_unload.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_unload.c,v
retrieving revision 1.6
diff -u -p -r1.6 dtrace_unload.c
--- src/external/cddl/osnet/dev/dtrace/dtrace_unload.c	26 Feb 2015 09:10:52 -0000	1.6
+++ src/external/cddl/osnet/dev/dtrace/dtrace_unload.c	15 May 2017 23:59:03 -0000
@@ -20,19 +20,25 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_unload.c,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_unload.c 278166 2015-02-03 19:39:53Z pfg $
  *
  */
 
-extern int dtrace_probes_size;
-extern int dtrace_helptrace_size;
-
 static int
 dtrace_unload()
 {
 	dtrace_state_t *state;
 	int error = 0;
 
+#ifdef __FreeBSD__
+	destroy_dev(dtrace_dev);
+	destroy_dev(helper_dev);
+#endif
+
+#ifdef __NetBSD__
+	module_unregister_callbacks(dtrace_modcb);
+#endif
+
 	mutex_enter(&dtrace_provider_lock);
 	mutex_enter(&dtrace_lock);
 	mutex_enter(&cpu_lock);
@@ -52,6 +58,10 @@ dtrace_unload()
 	}
 
 	dtrace_provider = NULL;
+#ifdef __FreeBSD__
+	EVENTHANDLER_DEREGISTER(kld_load, dtrace_kld_load_tag);
+	EVENTHANDLER_DEREGISTER(kld_unload_try, dtrace_kld_unload_try_tag);
+#endif
 
 	if ((state = dtrace_anon_grab()) != NULL) {
 		/*
@@ -67,13 +77,8 @@ dtrace_unload()
 
 	mutex_exit(&cpu_lock);
 
-	if (dtrace_helptrace_enabled) {
-		kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_size);
-		dtrace_helptrace_buffer = NULL;
-	}
-
 	if (dtrace_probes != NULL) {
-		kmem_free(dtrace_probes, dtrace_probes_size);
+		kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
 		dtrace_probes = NULL;
 		dtrace_nprobes = 0;
 	}
@@ -87,7 +92,12 @@ dtrace_unload()
 
 	kmem_cache_destroy(dtrace_state_cache);
 
+#ifdef __FreeBSD__
+	delete_unrhdr(dtrace_arena);
+#endif
+#ifdef __NetBSD__
 	vmem_destroy(dtrace_arena);
+#endif
 
 	if (dtrace_toxrange != NULL) {
 		kmem_free(dtrace_toxrange,
@@ -107,10 +117,13 @@ dtrace_unload()
 	mutex_destroy(&dtrace_meta_lock);
 	mutex_destroy(&dtrace_provider_lock);
 	mutex_destroy(&dtrace_lock);
+#ifdef DEBUG
 	mutex_destroy(&dtrace_errlock);
+#endif
 
-	/* XXX Hack */
-	mutex_destroy(&mod_lock);
+#ifdef __FreeBSD__
+	taskq_destroy(dtrace_taskq);
+#endif
 
 	/* Reset our hook for exceptions. */
 	dtrace_invop_uninit();
Index: src/external/cddl/osnet/dev/dtrace/dtrace_vtime.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_vtime.c,v
retrieving revision 1.2
diff -u -p -r1.2 dtrace_vtime.c
--- src/external/cddl/osnet/dev/dtrace/dtrace_vtime.c	21 Feb 2010 01:46:33 -0000	1.2
+++ src/external/cddl/osnet/dev/dtrace/dtrace_vtime.c	12 Apr 2017 16:01:35 -0000
@@ -20,7 +20,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_vtime.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_vtime.c 179237 2008-05-23 05:59:42Z jb $
  */
 
 /*
Index: src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.c
===================================================================
RCS file: src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.c
diff -N src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.c
--- src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.c	18 Jul 2011 00:42:40 -0000	1.3
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,3195 +0,0 @@
-/*	$NetBSD: dis_tables.c,v 1.3 2011/07/18 00:42:40 christos Exp $	*/
-
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- *
- * $FreeBSD: src/sys/cddl/dev/dtrace/amd64/dis_tables.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-/*	Copyright (c) 1988 AT&T	*/
-/*	  All Rights Reserved  	*/
-
-
-#if defined(sun)
-#pragma ident	"@(#)dis_tables.c	1.11	06/03/02 SMI"
-#endif
-
-#include	"dis_tables.h"
-
-/* BEGIN CSTYLED */
-
-/*
- * Disassembly begins in dis_distable, which is equivalent to the One-byte
- * Opcode Map in the Intel IA32 ISA Reference (page A-6 in my copy).  The
- * decoding loops then traverse out through the other tables as necessary to
- * decode a given instruction.
- *
- * The behavior of this file can be controlled by one of the following flags:
- *
- * 	DIS_TEXT	Include text for disassembly
- * 	DIS_MEM		Include memory-size calculations
- *
- * Either or both of these can be defined.
- *
- * This file is not, and will never be, cstyled.  If anything, the tables should
- * be taken out another tab stop or two so nothing overlaps.
- */
-
-/*
- * These functions must be provided for the consumer to do disassembly.
- */
-#ifdef DIS_TEXT
-extern char *strncpy(char *, const char *, size_t);
-extern size_t strlen(const char *);
-extern int strcmp(const char *, const char *);
-extern int strncmp(const char *, const char *, size_t);
-extern size_t strlcat(char *, const char *, size_t);
-#endif
-
-
-#define		TERM 	NULL	/* used to indicate that the 'indirect' */
-				/* field terminates - no pointer.	*/
-
-/* Used to decode instructions. */
-typedef struct	instable {
-	const struct instable	*it_indirect;	/* for decode op codes */
-	uchar_t		it_adrmode;
-#ifdef DIS_TEXT
-	char		it_name[NCPS];
-	uint_t		it_suffix:1;		/* mneu + "w", "l", or "d" */
-#endif
-#ifdef DIS_MEM
-	uint_t		it_size:16;
-#endif
-	uint_t		it_invalid64:1;		/* opcode invalid in amd64 */
-	uint_t		it_always64:1;		/* 64 bit when in 64 bit mode */
-	uint_t		it_invalid32:1;		/* invalid in IA32 */
-	uint_t		it_stackop:1;		/* push/pop stack operation */
-} instable_t;
-
-/*
- * Instruction formats.
- */
-enum {
-	UNKNOWN,
-	MRw,
-	IMlw,
-	IMw,
-	IR,
-	OA,
-	AO,
-	MS,
-	SM,
-	Mv,
-	Mw,
-	M,		/* register or memory */
-	Mb,		/* register or memory, always byte sized */
-	MO,		/* memory only (no registers) */
-	PREF,
-	SWAPGS,
-	R,
-	RA,
-	SEG,
-	MR,
-	RM,
-	IA,
-	MA,
-	SD,
-	AD,
-	SA,
-	D,
-	INM,
-	SO,
-	BD,
-	I,
-	P,
-	V,
-	DSHIFT,		/* for double shift that has an 8-bit immediate */
-	U,
-	OVERRIDE,
-	NORM,		/* instructions w/o ModR/M byte, no memory access */
-	IMPLMEM,	/* instructions w/o ModR/M byte, implicit mem access */
-	O,		/* for call	*/
-	JTAB,		/* jump table 	*/
-	IMUL,		/* for 186 iimul instr  */
-	CBW,		/* so data16 can be evaluated for cbw and variants */
-	MvI,		/* for 186 logicals */
-	ENTER,		/* for 186 enter instr  */
-	RMw,		/* for 286 arpl instr */
-	Ib,		/* for push immediate byte */
-	F,		/* for 287 instructions */
-	FF,		/* for 287 instructions */
-	FFC,		/* for 287 instructions */
-	DM,		/* 16-bit data */
-	AM,		/* 16-bit addr */
-	LSEG,		/* for 3-bit seg reg encoding */
-	MIb,		/* for 386 logicals */
-	SREG,		/* for 386 special registers */
-	PREFIX,		/* a REP instruction prefix */
-	LOCK,		/* a LOCK instruction prefix */
-	INT3,		/* The int 3 instruction, which has a fake operand */
-	INTx,		/* The normal int instruction, with explicit int num */
-	DSHIFTcl,	/* for double shift that implicitly uses %cl */
-	CWD,		/* so data16 can be evaluated for cwd and variants */
-	RET,		/* single immediate 16-bit operand */
-	MOVZ,		/* for movs and movz, with different size operands */
-	XADDB,		/* for xaddb */
-	MOVSXZ,		/* AMD64 mov sign extend 32 to 64 bit instruction */
-
-/*
- * MMX/SIMD addressing modes.
- */
-
-	MMO,		/* Prefixable MMX/SIMD-Int	mm/mem	-> mm */
-	MMOIMPL,	/* Prefixable MMX/SIMD-Int	mm	-> mm (mem) */
-	MMO3P,		/* Prefixable MMX/SIMD-Int	mm	-> r32,imm8 */
-	MMOM3,		/* Prefixable MMX/SIMD-Int	mm	-> r32 	*/
-	MMOS,		/* Prefixable MMX/SIMD-Int	mm	-> mm/mem */
-	MMOMS,		/* Prefixable MMX/SIMD-Int	mm	-> mem */
-	MMOPM,		/* MMX/SIMD-Int			mm/mem	-> mm,imm8 */
-	MMOPRM,		/* Prefixable MMX/SIMD-Int	r32/mem	-> mm,imm8 */
-	MMOSH,		/* Prefixable MMX		mm,imm8	*/
-	MM,		/* MMX/SIMD-Int			mm/mem	-> mm	*/
-	MMS,		/* MMX/SIMD-Int			mm	-> mm/mem */
-	MMSH,		/* MMX				mm,imm8 */
-	XMMO,		/* Prefixable SIMD		xmm/mem	-> xmm */
-	XMMOS,		/* Prefixable SIMD		xmm	-> xmm/mem */
-	XMMOPM,		/* Prefixable SIMD		xmm/mem	w/to xmm,imm8 */
-	XMMOMX,		/* Prefixable SIMD		mm/mem	-> xmm */
-	XMMOX3,		/* Prefixable SIMD		xmm	-> r32 */
-	XMMOXMM,	/* Prefixable SIMD		xmm/mem	-> mm	*/
-	XMMOM,		/* Prefixable SIMD		xmm	-> mem */
-	XMMOMS,		/* Prefixable SIMD		mem	-> xmm */
-	XMM,		/* SIMD 			xmm/mem	-> xmm */
-	XMMXIMPL,	/* SIMD				xmm	-> xmm (mem) */
-	XMM3P,		/* SIMD				xmm	-> r32,imm8 */
-	XMMP,		/* SIMD 			xmm/mem w/to xmm,imm8 */
-	XMMPRM,		/* SIMD 			r32/mem -> xmm,imm8 */
-	XMMS,		/* SIMD				xmm	-> xmm/mem */
-	XMMM,		/* SIMD 			mem	-> xmm */
-	XMMMS,		/* SIMD				xmm	-> mem */
-	XMM3MX,		/* SIMD 			r32/mem -> xmm */
-	XMM3MXS,	/* SIMD 			xmm	-> r32/mem */
-	XMMSH,		/* SIMD 			xmm,imm8 */
-	XMMXM3,		/* SIMD 			xmm/mem -> r32 */
-	XMMX3,		/* SIMD 			xmm	-> r32 */
-	XMMXMM,		/* SIMD 			xmm/mem	-> mm */
-	XMMMX,		/* SIMD 			mm	-> xmm */
-	XMMXM,		/* SIMD 			xmm	-> mm */
-	XMMFENCE,	/* SIMD lfence or mfence */
-	XMMSFNC		/* SIMD sfence (none or mem) */
-};
-
-#define	FILL	0x90	/* Fill byte used for alignment (nop)	*/
-
-/*
-** Register numbers for the i386
-*/
-#define	EAX_REGNO 0
-#define	ECX_REGNO 1
-#define	EDX_REGNO 2
-#define	EBX_REGNO 3
-#define	ESP_REGNO 4
-#define	EBP_REGNO 5
-#define	ESI_REGNO 6
-#define	EDI_REGNO 7
-
-/*
- * modes for immediate values
- */
-#define	MODE_NONE	0
-#define	MODE_IPREL	1	/* signed IP relative value */
-#define	MODE_SIGNED	2	/* sign extended immediate */
-#define	MODE_IMPLIED	3	/* constant value implied from opcode */
-#define	MODE_OFFSET	4	/* offset part of an address */
-
-/*
- * The letters used in these macros are:
- *   IND - indirect to another table
- *   "T" - means to Terminate indirections (this is the final opcode)
- *   "S" - means "operand length suffix required"
- *   "NS" - means "no suffix" which is the operand length suffix of the opcode
- *   "Z" - means instruction size arg required
- *   "u" - means the opcode is invalid in IA32 but valid in amd64
- *   "x" - means the opcode is invalid in amd64, but not IA32
- *   "y" - means the operand size is always 64 bits in 64 bit mode
- *   "p" - means push/pop stack operation
- */
-
-#if defined(DIS_TEXT) && defined(DIS_MEM)
-#define	IND(table)		{table, 0, "", 0, 0, 0, 0, 0, 0}
-#define	INDx(table)		{table, 0, "", 0, 0, 1, 0, 0, 0}
-#define	TNS(name, amode)	{TERM, amode, name, 0, 0, 0, 0, 0, 0}
-#define	TNSu(name, amode)	{TERM, amode, name, 0, 0, 0, 0, 1, 0}
-#define	TNSx(name, amode)	{TERM, amode, name, 0, 0, 1, 0, 0, 0}
-#define	TNSy(name, amode)	{TERM, amode, name, 0, 0, 0, 1, 0, 0}
-#define	TNSyp(name, amode)	{TERM, amode, name, 0, 0, 0, 1, 0, 1}
-#define	TNSZ(name, amode, sz)	{TERM, amode, name, 0, sz, 0, 0, 0, 0}
-#define	TNSZy(name, amode, sz)	{TERM, amode, name, 0, sz, 0, 1, 0, 0}
-#define	TS(name, amode)		{TERM, amode, name, 1, 0, 0, 0, 0, 0}
-#define	TSx(name, amode)	{TERM, amode, name, 1, 0, 1, 0, 0, 0}
-#define	TSy(name, amode)	{TERM, amode, name, 1, 0, 0, 1, 0, 0}
-#define	TSp(name, amode)	{TERM, amode, name, 1, 0, 0, 0, 0, 1}
-#define	TSZ(name, amode, sz)	{TERM, amode, name, 1, sz, 0, 0, 0, 0}
-#define	TSZx(name, amode, sz)	{TERM, amode, name, 1, sz, 1, 0, 0, 0}
-#define	TSZy(name, amode, sz)	{TERM, amode, name, 1, sz, 0, 1, 0, 0}
-#define	INVALID			{TERM, UNKNOWN, "", 0, 0, 0, 0, 0}
-#elif defined(DIS_TEXT)
-#define	IND(table)		{table, 0, "", 0, 0, 0, 0, 0}
-#define	INDx(table)		{table, 0, "", 0, 1, 0, 0, 0}
-#define	TNS(name, amode)	{TERM, amode, name, 0, 0, 0, 0, 0}
-#define	TNSu(name, amode)	{TERM, amode, name, 0, 0, 0, 1, 0}
-#define	TNSx(name, amode)	{TERM, amode, name, 0, 1, 0, 0, 0}
-#define	TNSy(name, amode)	{TERM, amode, name, 0, 0, 1, 0, 0}
-#define	TNSyp(name, amode)	{TERM, amode, name, 0, 0, 1, 0, 1}
-#define	TNSZ(name, amode, sz)	{TERM, amode, name, 0, 0, 0, 0, 0}
-#define	TNSZy(name, amode, sz)	{TERM, amode, name, 0, 0, 1, 0, 0}
-#define	TS(name, amode)		{TERM, amode, name, 1, 0, 0, 0, 0}
-#define	TSx(name, amode)	{TERM, amode, name, 1, 1, 0, 0, 0}
-#define	TSy(name, amode)	{TERM, amode, name, 1, 0, 1, 0, 0}
-#define	TSp(name, amode)	{TERM, amode, name, 1, 0, 0, 0, 1}
-#define	TSZ(name, amode, sz)	{TERM, amode, name, 1, 0, 0, 0, 0}
-#define	TSZx(name, amode, sz)	{TERM, amode, name, 1, 1, 0, 0, 0}
-#define	TSZy(name, amode, sz)	{TERM, amode, name, 1, 0, 1, 0, 0}
-#define	INVALID			{TERM, UNKNOWN, "", 0, 0, 0, 0, 0}
-#elif defined(DIS_MEM)
-#define	IND(table)		{table, 0, 0, 0, 0, 0, 0}
-#define	INDx(table)		{table, 0, 0, 1, 0, 0, 0}
-#define	TNS(name, amode)	{TERM, amode,  0, 0, 0, 0, 0}
-#define	TNSu(name, amode)	{TERM, amode,  0, 0, 0, 1, 0}
-#define	TNSy(name, amode)	{TERM, amode,  0, 0, 1, 0, 0}
-#define	TNSyp(name, amode)	{TERM, amode,  0, 0, 1, 0, 1}
-#define	TNSx(name, amode)	{TERM, amode,  0, 1, 0, 0, 0}
-#define	TNSZ(name, amode, sz)	{TERM, amode, sz, 0, 0, 0, 0}
-#define	TNSZy(name, amode, sz)	{TERM, amode, sz, 0, 1, 0, 0}
-#define	TS(name, amode)		{TERM, amode,  0, 0, 0, 0, 0}
-#define	TSx(name, amode)	{TERM, amode,  0, 1, 0, 0, 0}
-#define	TSy(name, amode)	{TERM, amode,  0, 0, 1, 0, 0}
-#define	TSp(name, amode)	{TERM, amode,  0, 0, 0, 0, 1}
-#define	TSZ(name, amode, sz)	{TERM, amode, sz, 0, 0, 0, 0}
-#define	TSZx(name, amode, sz)	{TERM, amode, sz, 1, 0, 0, 0}
-#define	TSZy(name, amode, sz)	{TERM, amode, sz, 0, 1, 0, 0}
-#define	INVALID			{TERM, UNKNOWN, 0, 0, 0, 0, 0}
-#else
-#define	IND(table)		{table[0], 0, 0, 0, 0, 0}
-#define	INDx(table)		{table[0], 0, 1, 0, 0, 0}
-#define	TNS(name, amode)	{TERM, amode,  0, 0, 0, 0}
-#define	TNSu(name, amode)	{TERM, amode,  0, 0, 1, 0}
-#define	TNSy(name, amode)	{TERM, amode,  0, 1, 0, 0}
-#define	TNSyp(name, amode)	{TERM, amode,  0, 1, 0, 1}
-#define	TNSx(name, amode)	{TERM, amode,  1, 0, 0, 0}
-#define	TNSZ(name, amode, sz)	{TERM, amode,  0, 0, 0, 0}
-#define	TNSZy(name, amode, sz)	{TERM, amode,  0, 1, 0, 0}
-#define	TS(name, amode)		{TERM, amode,  0, 0, 0, 0}
-#define	TSx(name, amode)	{TERM, amode,  1, 0, 0, 0}
-#define	TSy(name, amode)	{TERM, amode,  0, 1, 0, 0}
-#define	TSp(name, amode)	{TERM, amode,  0, 0, 0, 1}
-#define	TSZ(name, amode, sz)	{TERM, amode,  0, 0, 0, 0}
-#define	TSZx(name, amode, sz)	{TERM, amode,  1, 0, 0, 0}
-#define	TSZy(name, amode, sz)	{TERM, amode,  0, 1, 0, 0}
-#define	INVALID			{TERM, UNKNOWN, 0, 0, 0, 0}
-#endif
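-
-/*
- * For example, with both DIS_TEXT and DIS_MEM defined, the table entry
- * TNSZ("lgdt",MO,6) used in dis_op0F01 below expands to
- *
- *	{TERM, MO, "lgdt", 0, 6, 0, 0, 0, 0}
- *
- * i.e. a terminal entry ("T") with no operand length suffix ("NS") and
- * a 6 byte size argument ("Z").
- */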
-
-#ifdef DIS_TEXT
-/*
- * This decodes the r_m field for modes 0, 1 and 2 in 16 bit mode
- */
-const char *const dis_addr16[3][8] = {
-"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di)", "",
-									"(%bx)",
-"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di", "(%bp)",
-									"(%bx)",
-"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di)", "(%bp)",
-									"(%bx)",
-};
-
-
-/*
- * This decodes 32 bit addressing mode r_m field for modes 0, 1, 2
- */
-const char *const dis_addr32_mode0[16] = {
-  "(%eax)", "(%ecx)", "(%edx)",  "(%ebx)",  "", "",        "(%esi)",  "(%edi)",
-  "(%r8d)", "(%r9d)", "(%r10d)", "(%r11d)", "", "",        "(%r14d)", "(%r15d)"
-};
-
-const char *const dis_addr32_mode12[16] = {
-  "(%eax)", "(%ecx)", "(%edx)",  "(%ebx)",  "", "(%ebp)",  "(%esi)",  "(%edi)",
-  "(%r8d)", "(%r9d)", "(%r10d)", "(%r11d)", "", "(%r13d)", "(%r14d)", "(%r15d)"
-};
-
-/*
- * This decodes 64 bit addressing mode r_m field for modes 0, 1, 2
- */
-const char *const dis_addr64_mode0[16] = {
- "(%rax)", "(%rcx)", "(%rdx)", "(%rbx)", "",       "(%rip)", "(%rsi)", "(%rdi)",
- "(%r8)",  "(%r9)",  "(%r10)", "(%r11)", "(%r12)", "(%rip)", "(%r14)", "(%r15)"
-};
-const char *const dis_addr64_mode12[16] = {
- "(%rax)", "(%rcx)", "(%rdx)", "(%rbx)", "",       "(%rbp)", "(%rsi)", "(%rdi)",
- "(%r8)",  "(%r9)",  "(%r10)", "(%r11)", "(%r12)", "(%r13)", "(%r14)", "(%r15)"
-};
-
-/*
- * decode for scale from SIB byte
- */
-const char *const dis_scale_factor[4] = { ")", ",2)", ",4)", ",8)" };
-
-/*
- * register decoding for normal references to registers (i.e. not addressing)
- */
-const char *const dis_REG8[16] = {
-	"%al",  "%cl",  "%dl",   "%bl",   "%ah",   "%ch",   "%dh",   "%bh",
-	"%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b"
-};
-
-const char *const dis_REG8_REX[16] = {
-	"%al",  "%cl",  "%dl",   "%bl",   "%spl",  "%bpl",  "%sil",  "%dil",
-	"%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b"
-};
-
-const char *const dis_REG16[16] = {
-	"%ax",  "%cx",  "%dx",   "%bx",   "%sp",   "%bp",   "%si",   "%di",
-	"%r8w", "%r9w", "%r10w", "%r11w", "%r12w", "%r13w", "%r14w", "%r15w"
-};
-
-const char *const dis_REG32[16] = {
-	"%eax", "%ecx", "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
-	"%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d"
-};
-
-const char *const dis_REG64[16] = {
-	"%rax", "%rcx", "%rdx",  "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
-	"%r8",  "%r9",  "%r10",  "%r11", "%r12", "%r13", "%r14", "%r15"
-};
-
-const char *const dis_DEBUGREG[16] = {
-	"%db0", "%db1", "%db2",  "%db3",  "%db4",  "%db5",  "%db6",  "%db7",
-	"%db8", "%db9", "%db10", "%db11", "%db12", "%db13", "%db14", "%db15"
-};
-
-const char *const dis_CONTROLREG[16] = {
-    "%cr0", "%cr1", "%cr2", "%cr3", "%cr4", "%cr5?", "%cr6?", "%cr7?",
-    "%cr8", "%cr9?", "%cr10?", "%cr11?", "%cr12?", "%cr13?", "%cr14?", "%cr15?"
-};
-
-const char *const dis_TESTREG[16] = {
-	"%tr0?", "%tr1?", "%tr2?", "%tr3", "%tr4", "%tr5", "%tr6", "%tr7",
-	"%tr0?", "%tr1?", "%tr2?", "%tr3", "%tr4", "%tr5", "%tr6", "%tr7"
-};
-
-const char *const dis_MMREG[16] = {
-	"%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7",
-	"%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
-};
-
-const char *const dis_XMMREG[16] = {
-    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
-    "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
-};
-
-const char *const dis_SEGREG[16] = {
-	"%es", "%cs", "%ss", "%ds", "%fs", "%gs", "<reserved>", "<reserved>",
-	"%es", "%cs", "%ss", "%ds", "%fs", "%gs", "<reserved>", "<reserved>"
-};
-
-/*
- * SIMD predicate suffixes
- */
-const char *const dis_PREDSUFFIX[8] = {
-	"eq", "lt", "le", "unord", "neq", "nlt", "nle", "ord"
-};
-
-#endif	/* DIS_TEXT */
-
-/*
- *	"decode table" for 64 bit mode MOVSXD instruction (opcode 0x63)
- */
-const instable_t dis_opMOVSLD = TNS("movslq",MOVSXZ);
-
-/*
- *	"decode table" for pause and clflush instructions
- */
-const instable_t dis_opPause = TNS("pause", NORM);
-
-/*
- *	Decode table for 0x0F00 opcodes
- */
-const instable_t dis_op0F00[8] = {
-
-/*  [0]  */	TNS("sldt",M),		TNS("str",M),		TNSy("lldt",M), 	TNSy("ltr",M),
-/*  [4]  */	TNSZ("verr",M,2),	TNSZ("verw",M,2),	INVALID,		INVALID,
-};
-
-
-/*
- *	Decode table for 0x0F01 opcodes
- */
-const instable_t dis_op0F01[8] = {
-
-/*  [0]  */	TNSZ("sgdt",MO,6),	TNSZ("sidt",MO,6), 	TNSZ("lgdt",MO,6),	TNSZ("lidt",MO,6),
-/*  [4]  */	TNSZ("smsw",M,2),	INVALID, 		TNSZ("lmsw",M,2),	TNS("invlpg",SWAPGS),
-};
-
-/*
- *	Decode table for 0x0F18 opcodes -- SIMD prefetch
- */
-const instable_t dis_op0F18[8] = {
-
-/*  [0]  */	TNS("prefetchnta",PREF),TNS("prefetcht0",PREF),	TNS("prefetcht1",PREF),	TNS("prefetcht2",PREF),
-/*  [4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-};
-
-/*
- * 	Decode table for 0x0FAE opcodes -- SIMD state save/restore
- */
-const instable_t dis_op0FAE[8] = {
-/*  [0]  */	TNSZ("fxsave",M,512),	TNSZ("fxrstor",M,512),	TNS("ldmxcsr",M),	TNS("stmxcsr",M),
-/*  [4]  */	INVALID,		TNS("lfence",XMMFENCE), TNS("mfence",XMMFENCE),	TNS("sfence",XMMSFNC),
-};
-
-/*
- *	Decode table for 0x0FBA opcodes
- */
-
-const instable_t dis_op0FBA[8] = {
-
-/*  [0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [4]  */	TS("bt",MIb),		TS("bts",MIb),		TS("btr",MIb),		TS("btc",MIb),
-};
-
-/*
- * 	Decode table for 0x0FC7 opcode
- */
-
-const instable_t dis_op0FC7[8] = {
-
-/*  [0]  */	INVALID,		TNS("cmpxchg8b",M),	INVALID,		INVALID,
-/*  [4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-};
-
-
-/*
- *	Decode table for 0x0FC8 opcode -- 486 bswap instruction
- *
- * bit pattern: 0000 1111 1100 1reg
- */
-const instable_t dis_op0FC8[4] = {
-/*  [0]  */	TNS("bswap",R),		INVALID,		INVALID,		INVALID,
-};
-
-/*
- *	Decode table for 0x0F71, 0x0F72, and 0x0F73 opcodes -- MMX instructions
- */
-const instable_t dis_op0F7123[4][8] = {
-{
-/*  [70].0 */	INVALID,		INVALID,		INVALID,		INVALID,
-/*      .4 */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [71].0 */	INVALID,		INVALID,		TNS("psrlw",MMOSH),	INVALID,
-/*      .4 */	TNS("psraw",MMOSH),	INVALID,		TNS("psllw",MMOSH),	INVALID,
-}, {
-/*  [72].0 */	INVALID,		INVALID,		TNS("psrld",MMOSH),	INVALID,
-/*      .4 */	TNS("psrad",MMOSH),	INVALID,		TNS("pslld",MMOSH),	INVALID,
-}, {
-/*  [73].0 */	INVALID,		INVALID,		TNS("psrlq",MMOSH),	TNS("INVALID",MMOSH),
-/*      .4 */	INVALID,		INVALID, 		TNS("psllq",MMOSH),	TNS("INVALID",MMOSH),
-} };
-
-/*
- *	Decode table for SIMD extensions to above 0x0F71-0x0F73 opcodes.
- */
-const instable_t dis_opSIMD7123[32] = {
-/* [70].0 */	INVALID,		INVALID,		INVALID,		INVALID,
-/*     .4 */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/* [71].0 */	INVALID,		INVALID,		TNS("psrlw",XMMSH),	INVALID,
-/*     .4 */	TNS("psraw",XMMSH),	INVALID,		TNS("psllw",XMMSH),	INVALID,
-
-/* [72].0 */	INVALID,		INVALID,		TNS("psrld",XMMSH),	INVALID,
-/*     .4 */	TNS("psrad",XMMSH),	INVALID,		TNS("pslld",XMMSH),	INVALID,
-
-/* [73].0 */	INVALID,		INVALID,		TNS("psrlq",XMMSH),	TNS("psrldq",XMMSH),
-/*     .4 */	INVALID,		INVALID,		TNS("psllq",XMMSH),	TNS("pslldq",XMMSH),
-};
-
-/*
- *	SIMD instructions have been wedged into the existing IA32 instruction
- *	set through the use of prefixes.  That is, while 0x0f 0x58 may be
- *	addps, 0xf3 0x0f 0x58 (literally, repz addps) is a completely different
- *	instruction - addss.  At present, three prefixes have been coopted in
- *	this manner - operand size (0x66), repnz (0xf2) and repz (0xf3).  The
- *	following tables are used to provide the prefixed instruction names.
- *	The arrays are sparse, but they're fast.
- */
-
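-/*
- *	For example, opcode 0x0f 0x58 decodes through dis_op0F below as
- *	addps, while 0x66 0x0f 0x58 resolves via dis_opSIMDdata16[0x58]
- *	to addpd, 0xf2 0x0f 0x58 via dis_opSIMDrepnz[0x58] to addsd, and
- *	0xf3 0x0f 0x58 via dis_opSIMDrepz[0x58] to addss.
- */
-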
-/*
- *	Decode table for SIMD instructions with the operand size (0x66) prefix.
- */
-const instable_t dis_opSIMDdata16[256] = {
-/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [10]  */	TNSZ("movupd",XMM,16),	TNSZ("movupd",XMMS,16),	TNSZ("movlpd",XMMM,8),	TNSZ("movlpd",XMMMS,8),
-/*  [14]  */	TNSZ("unpcklpd",XMM,16),TNSZ("unpckhpd",XMM,16),TNSZ("movhpd",XMMM,8),	TNSZ("movhpd",XMMMS,8),
-/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [28]  */	TNSZ("movapd",XMM,16),	TNSZ("movapd",XMMS,16),	TNSZ("cvtpi2pd",XMMOMX,8),TNSZ("movntpd",XMMOMS,16),
-/*  [2C]  */	TNSZ("cvttpd2pi",XMMXMM,16),TNSZ("cvtpd2pi",XMMXMM,16),TNSZ("ucomisd",XMM,8),TNSZ("comisd",XMM,8),
-
-/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [50]  */	TNS("movmskpd",XMMOX3),	TNSZ("sqrtpd",XMM,16),	INVALID,		INVALID,
-/*  [54]  */	TNSZ("andpd",XMM,16),	TNSZ("andnpd",XMM,16),	TNSZ("orpd",XMM,16),	TNSZ("xorpd",XMM,16),
-/*  [58]  */	TNSZ("addpd",XMM,16),	TNSZ("mulpd",XMM,16),	TNSZ("cvtpd2ps",XMM,16),TNSZ("cvtps2dq",XMM,16),
-/*  [5C]  */	TNSZ("subpd",XMM,16),	TNSZ("minpd",XMM,16),	TNSZ("divpd",XMM,16),	TNSZ("maxpd",XMM,16),
-
-/*  [60]  */	TNSZ("punpcklbw",XMM,16),TNSZ("punpcklwd",XMM,16),TNSZ("punpckldq",XMM,16),TNSZ("packsswb",XMM,16),
-/*  [64]  */	TNSZ("pcmpgtb",XMM,16),	TNSZ("pcmpgtw",XMM,16),	TNSZ("pcmpgtd",XMM,16),	TNSZ("packuswb",XMM,16),
-/*  [68]  */	TNSZ("punpckhbw",XMM,16),TNSZ("punpckhwd",XMM,16),TNSZ("punpckhdq",XMM,16),TNSZ("packssdw",XMM,16),
-/*  [6C]  */	TNSZ("punpcklqdq",XMM,16),TNSZ("punpckhqdq",XMM,16),TNSZ("movd",XMM3MX,4),TNSZ("movdqa",XMM,16),
-
-/*  [70]  */	TNSZ("pshufd",XMMP,16),	INVALID,		INVALID,		INVALID,
-/*  [74]  */	TNSZ("pcmpeqb",XMM,16),	TNSZ("pcmpeqw",XMM,16),	TNSZ("pcmpeqd",XMM,16),	INVALID,
-/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [7C]  */	INVALID,		INVALID,		TNSZ("movd",XMM3MXS,4),	TNSZ("movdqa",XMMS,16),
-
-/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [C0]  */	INVALID,		INVALID,		TNSZ("cmppd",XMMP,16),	INVALID,
-/*  [C4]  */	TNSZ("pinsrw",XMMPRM,2),TNS("pextrw",XMM3P),	TNSZ("shufpd",XMMP,16),	INVALID,
-/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [D0]  */	INVALID,		TNSZ("psrlw",XMM,16),	TNSZ("psrld",XMM,16),	TNSZ("psrlq",XMM,16),
-/*  [D4]  */	TNSZ("paddq",XMM,16),	TNSZ("pmullw",XMM,16),	TNSZ("movq",XMMS,8),	TNS("pmovmskb",XMMX3),
-/*  [D8]  */	TNSZ("psubusb",XMM,16),	TNSZ("psubusw",XMM,16),	TNSZ("pminub",XMM,16),	TNSZ("pand",XMM,16),
-/*  [DC]  */	TNSZ("paddusb",XMM,16),	TNSZ("paddusw",XMM,16),	TNSZ("pmaxub",XMM,16),	TNSZ("pandn",XMM,16),
-
-/*  [E0]  */	TNSZ("pavgb",XMM,16),	TNSZ("psraw",XMM,16),	TNSZ("psrad",XMM,16),	TNSZ("pavgw",XMM,16),
-/*  [E4]  */	TNSZ("pmulhuw",XMM,16),	TNSZ("pmulhw",XMM,16),	TNSZ("cvttpd2dq",XMM,16),TNSZ("movntdq",XMMS,16),
-/*  [E8]  */	TNSZ("psubsb",XMM,16),	TNSZ("psubsw",XMM,16),	TNSZ("pminsw",XMM,16),	TNSZ("por",XMM,16),
-/*  [EC]  */	TNSZ("paddsb",XMM,16),	TNSZ("paddsw",XMM,16),	TNSZ("pmaxsw",XMM,16),	TNSZ("pxor",XMM,16),
-
-/*  [F0]  */	INVALID,		TNSZ("psllw",XMM,16),	TNSZ("pslld",XMM,16),	TNSZ("psllq",XMM,16),
-/*  [F4]  */	TNSZ("pmuludq",XMM,16),	TNSZ("pmaddwd",XMM,16),	TNSZ("psadbw",XMM,16),	TNSZ("maskmovdqu", XMMXIMPL,16),
-/*  [F8]  */	TNSZ("psubb",XMM,16),	TNSZ("psubw",XMM,16),	TNSZ("psubd",XMM,16),	TNSZ("psubq",XMM,16),
-/*  [FC]  */	TNSZ("paddb",XMM,16),	TNSZ("paddw",XMM,16),	TNSZ("paddd",XMM,16),	INVALID,
-};
-
-/*
- *	Decode table for SIMD instructions with the repnz (0xf2) prefix.
- */
-const instable_t dis_opSIMDrepnz[256] = {
-/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [10]  */	TNSZ("movsd",XMM,8),	TNSZ("movsd",XMMS,8),	INVALID,		INVALID,
-/*  [14]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [28]  */	INVALID,		INVALID,		TNSZ("cvtsi2sd",XMM3MX,4),INVALID,
-/*  [2C]  */	TNSZ("cvttsd2si",XMMXM3,8),TNSZ("cvtsd2si",XMMXM3,8),INVALID,		INVALID,
-
-/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [50]  */	INVALID,		TNSZ("sqrtsd",XMM,8),	INVALID,		INVALID,
-/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [58]  */	TNSZ("addsd",XMM,8),	TNSZ("mulsd",XMM,8),	TNSZ("cvtsd2ss",XMM,8),	INVALID,
-/*  [5C]  */	TNSZ("subsd",XMM,8),	TNSZ("minsd",XMM,8),	TNSZ("divsd",XMM,8),	TNSZ("maxsd",XMM,8),
-
-/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [6C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [70]  */	TNSZ("pshuflw",XMMP,16),INVALID,		INVALID,		INVALID,
-/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [7C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [C0]  */	INVALID,		INVALID,		TNSZ("cmpsd",XMMP,8),	INVALID,
-/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [D4]  */	INVALID,		INVALID,		TNS("movdq2q",XMMXM),	INVALID,
-/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [DC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [E4]  */	INVALID,		INVALID,		TNSZ("cvtpd2dq",XMM,16),INVALID,
-/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [F0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [F4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-};
-
-/*
- *	Decode table for SIMD instructions with the repz (0xf3) prefix.
- */
-const instable_t dis_opSIMDrepz[256] = {
-/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [10]  */	TNSZ("movss",XMM,4),	TNSZ("movss",XMMS,4),	INVALID,		INVALID,
-/*  [14]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [28]  */	INVALID,		INVALID,		TNSZ("cvtsi2ss",XMM3MX,4),INVALID,
-/*  [2C]  */	TNSZ("cvttss2si",XMMXM3,4),TNSZ("cvtss2si",XMMXM3,4),INVALID,		INVALID,
-
-/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [50]  */	INVALID,		TNSZ("sqrtss",XMM,4),	TNSZ("rsqrtss",XMM,4),	TNSZ("rcpss",XMM,4),
-/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [58]  */	TNSZ("addss",XMM,4),	TNSZ("mulss",XMM,4),	TNSZ("cvtss2sd",XMM,4),	TNSZ("cvttps2dq",XMM,16),
-/*  [5C]  */	TNSZ("subss",XMM,4),	TNSZ("minss",XMM,4),	TNSZ("divss",XMM,4),	TNSZ("maxss",XMM,4),
-
-/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [6C]  */	INVALID,		INVALID,		INVALID,		TNSZ("movdqu",XMM,16),
-
-/*  [70]  */	TNSZ("pshufhw",XMMP,16),INVALID,		INVALID,		INVALID,
-/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [7C]  */	INVALID,		INVALID,		TNSZ("movq",XMM,8),	TNSZ("movdqu",XMMS,16),
-
-/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [C0]  */	INVALID,		INVALID,		TNSZ("cmpss",XMMP,4),	INVALID,
-/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [D4]  */	INVALID,		INVALID,		TNS("movq2dq",XMMMX),	INVALID,
-/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [DC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [E4]  */	INVALID,		INVALID,		TNSZ("cvtdq2pd",XMM,8),	INVALID,
-/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [F0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [F4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-};
-
-/*
- *	Decode table for 0x0F opcodes
- */
-
-const instable_t dis_op0F[16][16] = {
-{
-/*  [00]  */	IND(dis_op0F00),	IND(dis_op0F01),	TNS("lar",MR),		TNS("lsl",MR),
-/*  [04]  */	INVALID,		TNS("syscall",NORM),	TNS("clts",NORM),	TNS("sysret",NORM),
-/*  [08]  */	TNS("invd",NORM),	TNS("wbinvd",NORM),	INVALID,		TNS("ud2",NORM),
-/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [10]  */	TNSZ("movups",XMMO,16),	TNSZ("movups",XMMOS,16),TNSZ("movlps",XMMO,8),	TNSZ("movlps",XMMOS,8),
-/*  [14]  */	TNSZ("unpcklps",XMMO,16),TNSZ("unpckhps",XMMO,16),TNSZ("movhps",XMMOM,8),TNSZ("movhps",XMMOMS,8),
-/*  [18]  */	IND(dis_op0F18),	INVALID,		INVALID,		INVALID,
-/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [20]  */	TSy("mov",SREG),	TSy("mov",SREG),	TSy("mov",SREG),	TSy("mov",SREG),
-/*  [24]  */	TSx("mov",SREG),	INVALID,		TSx("mov",SREG),	INVALID,
-/*  [28]  */	TNSZ("movaps",XMMO,16),	TNSZ("movaps",XMMOS,16),TNSZ("cvtpi2ps",XMMOMX,8),TNSZ("movntps",XMMOS,16),
-/*  [2C]  */	TNSZ("cvttps2pi",XMMOXMM,8),TNSZ("cvtps2pi",XMMOXMM,8),TNSZ("ucomiss",XMMO,4),TNSZ("comiss",XMMO,4),
-}, {
-/*  [30]  */	TNS("wrmsr",NORM),	TNS("rdtsc",NORM),	TNS("rdmsr",NORM),	TNS("rdpmc",NORM),
-/*  [34]  */	TNSx("sysenter",NORM),	TNSx("sysexit",NORM),	INVALID,		INVALID,
-/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [40]  */	TS("cmovx.o",MR),	TS("cmovx.no",MR),	TS("cmovx.b",MR),	TS("cmovx.ae",MR),
-/*  [44]  */	TS("cmovx.e",MR),	TS("cmovx.ne",MR),	TS("cmovx.be",MR),	TS("cmovx.a",MR),
-/*  [48]  */	TS("cmovx.s",MR),	TS("cmovx.ns",MR),	TS("cmovx.pe",MR),	TS("cmovx.po",MR),
-/*  [4C]  */	TS("cmovx.l",MR),	TS("cmovx.ge",MR),	TS("cmovx.le",MR),	TS("cmovx.g",MR),
-}, {
-/*  [50]  */	TNS("movmskps",XMMOX3),	TNSZ("sqrtps",XMMO,16),	TNSZ("rsqrtps",XMMO,16),TNSZ("rcpps",XMMO,16),
-/*  [54]  */	TNSZ("andps",XMMO,16),	TNSZ("andnps",XMMO,16),	TNSZ("orps",XMMO,16),	TNSZ("xorps",XMMO,16),
-/*  [58]  */	TNSZ("addps",XMMO,16),	TNSZ("mulps",XMMO,16),	TNSZ("cvtps2pd",XMMO,8),TNSZ("cvtdq2ps",XMMO,16),
-/*  [5C]  */	TNSZ("subps",XMMO,16),	TNSZ("minps",XMMO,16),	TNSZ("divps",XMMO,16),	TNSZ("maxps",XMMO,16),
-}, {
-/*  [60]  */	TNSZ("punpcklbw",MMO,4),TNSZ("punpcklwd",MMO,4),TNSZ("punpckldq",MMO,4),TNSZ("packsswb",MMO,8),
-/*  [64]  */	TNSZ("pcmpgtb",MMO,8),	TNSZ("pcmpgtw",MMO,8),	TNSZ("pcmpgtd",MMO,8),	TNSZ("packuswb",MMO,8),
-/*  [68]  */	TNSZ("punpckhbw",MMO,8),TNSZ("punpckhwd",MMO,8),TNSZ("punpckhdq",MMO,8),TNSZ("packssdw",MMO,8),
-/*  [6C]  */	TNSZ("INVALID",MMO,0),	TNSZ("INVALID",MMO,0),	TNSZ("movd",MMO,4),	TNSZ("movq",MMO,8),
-}, {
-/*  [70]  */	TNSZ("pshufw",MMOPM,8),	TNS("psrXXX",MR),	TNS("psrXXX",MR),	TNS("psrXXX",MR),
-/*  [74]  */	TNSZ("pcmpeqb",MMO,8),	TNSZ("pcmpeqw",MMO,8),	TNSZ("pcmpeqd",MMO,8),	TNS("emms",NORM),
-/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [7C]  */	INVALID,		INVALID,		TNSZ("movd",MMOS,4),	TNSZ("movq",MMOS,8),
-}, {
-/*  [80]  */	TNS("jo",D),		TNS("jno",D),		TNS("jb",D),		TNS("jae",D),
-/*  [84]  */	TNS("je",D),		TNS("jne",D),		TNS("jbe",D),		TNS("ja",D),
-/*  [88]  */	TNS("js",D),		TNS("jns",D),		TNS("jp",D),		TNS("jnp",D),
-/*  [8C]  */	TNS("jl",D),		TNS("jge",D),		TNS("jle",D),		TNS("jg",D),
-}, {
-/*  [90]  */	TNS("seto",Mb),		TNS("setno",Mb),	TNS("setb",Mb),		TNS("setae",Mb),
-/*  [94]  */	TNS("sete",Mb),		TNS("setne",Mb),	TNS("setbe",Mb),	TNS("seta",Mb),
-/*  [98]  */	TNS("sets",Mb),		TNS("setns",Mb),	TNS("setp",Mb),		TNS("setnp",Mb),
-/*  [9C]  */	TNS("setl",Mb),		TNS("setge",Mb),	TNS("setle",Mb),	TNS("setg",Mb),
-}, {
-/*  [A0]  */	TSp("push",LSEG),	TSp("pop",LSEG),	TNS("cpuid",NORM),	TS("bt",RMw),
-/*  [A4]  */	TS("shld",DSHIFT),	TS("shld",DSHIFTcl),	INVALID,		INVALID,
-/*  [A8]  */	TSp("push",LSEG),	TSp("pop",LSEG),	TNS("rsm",NORM),	TS("bts",RMw),
-/*  [AC]  */	TS("shrd",DSHIFT),	TS("shrd",DSHIFTcl),	IND(dis_op0FAE),	TS("imul",MRw),
-}, {
-/*  [B0]  */	TNS("cmpxchgb",RMw),	TS("cmpxchg",RMw),	TS("lss",MR),		TS("btr",RMw),
-/*  [B4]  */	TS("lfs",MR),		TS("lgs",MR),		TS("movzb",MOVZ),	TNS("movzwl",MOVZ),
-/*  [B8]  */	INVALID,		INVALID,		IND(dis_op0FBA),	TS("btc",RMw),
-/*  [BC]  */	TS("bsf",MRw),		TS("bsr",MRw),		TS("movsb",MOVZ),	TNS("movswl",MOVZ),
-}, {
-/*  [C0]  */	TNS("xaddb",XADDB),	TS("xadd",RMw),		TNSZ("cmpps",XMMOPM,16),TNS("movnti",RM),
-/*  [C4]  */	TNSZ("pinsrw",MMOPRM,2),TNS("pextrw",MMO3P), 	TNSZ("shufps",XMMOPM,16),IND(dis_op0FC7),
-/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [D0]  */	INVALID,		TNSZ("psrlw",MMO,8),	TNSZ("psrld",MMO,8),	TNSZ("psrlq",MMO,8),
-/*  [D4]  */	TNSZ("paddq",MMO,8),	TNSZ("pmullw",MMO,8),	TNSZ("INVALID",MMO,0),	TNS("pmovmskb",MMOM3),
-/*  [D8]  */	TNSZ("psubusb",MMO,8),	TNSZ("psubusw",MMO,8),	TNSZ("pminub",MMO,8),	TNSZ("pand",MMO,8),
-/*  [DC]  */	TNSZ("paddusb",MMO,8),	TNSZ("paddusw",MMO,8),	TNSZ("pmaxub",MMO,8),	TNSZ("pandn",MMO,8),
-}, {
-/*  [E0]  */	TNSZ("pavgb",MMO,8),	TNSZ("psraw",MMO,8),	TNSZ("psrad",MMO,8),	TNSZ("pavgw",MMO,8),
-/*  [E4]  */	TNSZ("pmulhuw",MMO,8),	TNSZ("pmulhw",MMO,8),	TNS("INVALID",XMMO),	TNSZ("movntq",MMOMS,8),
-/*  [E8]  */	TNSZ("psubsb",MMO,8),	TNSZ("psubsw",MMO,8),	TNSZ("pminsw",MMO,8),	TNSZ("por",MMO,8),
-/*  [EC]  */	TNSZ("paddsb",MMO,8),	TNSZ("paddsw",MMO,8),	TNSZ("pmaxsw",MMO,8),	TNSZ("pxor",MMO,8),
-}, {
-/*  [F0]  */	INVALID,		TNSZ("psllw",MMO,8),	TNSZ("pslld",MMO,8),	TNSZ("psllq",MMO,8),
-/*  [F4]  */	TNSZ("pmuludq",MMO,8),	TNSZ("pmaddwd",MMO,8),	TNSZ("psadbw",MMO,8),	TNSZ("maskmovq",MMOIMPL,8),
-/*  [F8]  */	TNSZ("psubb",MMO,8),	TNSZ("psubw",MMO,8),	TNSZ("psubd",MMO,8),	TNSZ("psubq",MMO,8),
-/*  [FC]  */	TNSZ("paddb",MMO,8),	TNSZ("paddw",MMO,8),	TNSZ("paddd",MMO,8),	INVALID,
-} };
-
-
-/*
- *	Decode table for 0x80 opcodes
- */
-
-const instable_t dis_op80[8] = {
-
-/*  [0]  */	TNS("addb",IMlw),	TNS("orb",IMw),		TNS("adcb",IMlw),	TNS("sbbb",IMlw),
-/*  [4]  */	TNS("andb",IMw),	TNS("subb",IMlw),	TNS("xorb",IMw),	TNS("cmpb",IMlw),
-};
-
-
-/*
- *	Decode table for 0x81 opcodes.
- */
-
-const instable_t dis_op81[8] = {
-
-/*  [0]  */	TS("add",IMlw),		TS("or",IMw),		TS("adc",IMlw),		TS("sbb",IMlw),
-/*  [4]  */	TS("and",IMw),		TS("sub",IMlw),		TS("xor",IMw),		TS("cmp",IMlw),
-};
-
-
-/*
- *	Decode table for 0x82 opcodes.
- */
-
-const instable_t dis_op82[8] = {
-
-/*  [0]  */	TNSx("addb",IMlw),	TNSx("orb",IMlw),	TNSx("adcb",IMlw),	TNSx("sbbb",IMlw),
-/*  [4]  */	TNSx("andb",IMlw),	TNSx("subb",IMlw),	TNSx("xorb",IMlw),	TNSx("cmpb",IMlw),
-};
-/*
- *	Decode table for 0x83 opcodes.
- */
-
-const instable_t dis_op83[8] = {
-
-/*  [0]  */	TS("add",IMlw),		TS("or",IMlw),		TS("adc",IMlw),		TS("sbb",IMlw),
-/*  [4]  */	TS("and",IMlw),		TS("sub",IMlw),		TS("xor",IMlw),		TS("cmp",IMlw),
-};
-
-/*
- *	Decode table for 0xC0 opcodes.
- */
-
-const instable_t dis_opC0[8] = {
-
-/*  [0]  */	TNS("rolb",MvI),	TNS("rorb",MvI),	TNS("rclb",MvI),	TNS("rcrb",MvI),
-/*  [4]  */	TNS("shlb",MvI),	TNS("shrb",MvI),	INVALID,		TNS("sarb",MvI),
-};
-
-/*
- *	Decode table for 0xD0 opcodes.
- */
-
-const instable_t dis_opD0[8] = {
-
-/*  [0]  */	TNS("rolb",Mv),		TNS("rorb",Mv),		TNS("rclb",Mv),		TNS("rcrb",Mv),
-/*  [4]  */	TNS("shlb",Mv),		TNS("shrb",Mv),		TNS("salb",Mv),		TNS("sarb",Mv),
-};
-
-/*
- *	Decode table for 0xC1 opcodes.
- *	186 instruction set
- */
-
-const instable_t dis_opC1[8] = {
-
-/*  [0]  */	TS("rol",MvI),		TS("ror",MvI),		TS("rcl",MvI),		TS("rcr",MvI),
-/*  [4]  */	TS("shl",MvI),		TS("shr",MvI),		TS("sal",MvI),		TS("sar",MvI),
-};
-
-/*
- *	Decode table for 0xD1 opcodes.
- */
-
-const instable_t dis_opD1[8] = {
-
-/*  [0]  */	TS("rol",Mv),		TS("ror",Mv),		TS("rcl",Mv),		TS("rcr",Mv),
-/*  [4]  */	TS("shl",Mv),		TS("shr",Mv),		TS("sal",Mv),		TS("sar",Mv),
-};
-
-
-/*
- *	Decode table for 0xD2 opcodes.
- */
-
-const instable_t dis_opD2[8] = {
-
-/*  [0]  */	TNS("rolb",Mv),		TNS("rorb",Mv),		TNS("rclb",Mv),		TNS("rcrb",Mv),
-/*  [4]  */	TNS("shlb",Mv),		TNS("shrb",Mv),		TNS("salb",Mv),		TNS("sarb",Mv),
-};
-/*
- *	Decode table for 0xD3 opcodes.
- */
-
-const instable_t dis_opD3[8] = {
-
-/*  [0]  */	TS("rol",Mv),		TS("ror",Mv),		TS("rcl",Mv),		TS("rcr",Mv),
-/*  [4]  */	TS("shl",Mv),		TS("shr",Mv),		TS("salb",Mv),		TS("sar",Mv),
-};
-
-
-/*
- *	Decode table for 0xF6 opcodes.
- */
-
-const instable_t dis_opF6[8] = {
-
-/*  [0]  */	TNS("testb",IMw),	TNS("testb",IMw),	TNS("notb",Mw),		TNS("negb",Mw),
-/*  [4]  */	TNS("mulb",MA),		TNS("imulb",MA),	TNS("divb",MA),		TNS("idivb",MA),
-};
-
-
-/*
- *	Decode table for 0xF7 opcodes.
- */
-
-const instable_t dis_opF7[8] = {
-
-/*  [0]  */	TS("test",IMw),		TS("test",IMw),		TS("not",Mw),		TS("neg",Mw),
-/*  [4]  */	TS("mul",MA),		TS("imul",MA),		TS("div",MA),		TS("idiv",MA),
-};
-
-
-/*
- *	Decode table for 0xFE opcodes.
- */
-
-const instable_t dis_opFE[8] = {
-
-/*  [0]  */	TNS("incb",Mw),		TNS("decb",Mw),		INVALID,		INVALID,
-/*  [4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-};
-/*
- *	Decode table for 0xFF opcodes.
- */
-
-const instable_t dis_opFF[8] = {
-
-/*  [0]  */	TS("inc",Mw),		TS("dec",Mw),		TNSyp("call",INM),	TNS("lcall",INM),
-/*  [4]  */	TNSy("jmp",INM),	TNS("ljmp",INM),	TSp("push",M),		INVALID,
-};
-
-/* for 287 instructions, which are a mess to decode */
-
-const instable_t dis_opFP1n2[8][8] = {
-{
-/* bit pattern:	1101 1xxx MODxx xR/M */
-/*  [0,0] */	TNS("fadds",M),		TNS("fmuls",M),		TNS("fcoms",M),		TNS("fcomps",M),
-/*  [0,4] */	TNS("fsubs",M),		TNS("fsubrs",M),	TNS("fdivs",M),		TNS("fdivrs",M),
-}, {
-/*  [1,0]  */	TNS("flds",M),		INVALID,		TNS("fsts",M),		TNS("fstps",M),
-/*  [1,4]  */	TNSZ("fldenv",M,28),	TNSZ("fldcw",M,2),	TNSZ("fnstenv",M,28),	TNSZ("fnstcw",M,2),
-}, {
-/*  [2,0]  */	TNS("fiaddl",M),	TNS("fimull",M),	TNS("ficoml",M),	TNS("ficompl",M),
-/*  [2,4]  */	TNS("fisubl",M),	TNS("fisubrl",M),	TNS("fidivl",M),	TNS("fidivrl",M),
-}, {
-/*  [3,0]  */	TNS("fildl",M),		INVALID,		TNS("fistl",M),		TNS("fistpl",M),
-/*  [3,4]  */	INVALID,		TNSZ("fldt",M,10),	INVALID,		TNSZ("fstpt",M,10),
-}, {
-/*  [4,0]  */	TNSZ("faddl",M,8),	TNSZ("fmull",M,8),	TNSZ("fcoml",M,8),	TNSZ("fcompl",M,8),
-/*  [4,1]  */	TNSZ("fsubl",M,8),	TNSZ("fsubrl",M,8),	TNSZ("fdivl",M,8),	TNSZ("fdivrl",M,8),
-}, {
-/*  [5,0]  */	TNSZ("fldl",M,8),	INVALID,		TNSZ("fstl",M,8),	TNSZ("fstpl",M,8),
-/*  [5,4]  */	TNSZ("frstor",M,108),	INVALID,		TNSZ("fnsave",M,108),	TNSZ("fnstsw",M,2),
-}, {
-/*  [6,0]  */	TNSZ("fiadd",M,2),	TNSZ("fimul",M,2),	TNSZ("ficom",M,2),	TNSZ("ficomp",M,2),
-/*  [6,4]  */	TNSZ("fisub",M,2),	TNSZ("fisubr",M,2),	TNSZ("fidiv",M,2),	TNSZ("fidivr",M,2),
-}, {
-/*  [7,0]  */	TNSZ("fild",M,2),	INVALID,		TNSZ("fist",M,2),	TNSZ("fistp",M,2),
-/*  [7,4]  */	TNSZ("fbld",M,10),	TNSZ("fildll",M,8),	TNSZ("fbstp",M,10),	TNSZ("fistpll",M,8),
-} };
-
-const instable_t dis_opFP3[8][8] = {
-{
-/* bit  pattern:	1101 1xxx 11xx xREG */
-/*  [0,0]  */	TNS("fadd",FF),		TNS("fmul",FF),		TNS("fcom",F),		TNS("fcomp",F),
-/*  [0,4]  */	TNS("fsub",FF),		TNS("fsubr",FF),	TNS("fdiv",FF),		TNS("fdivr",FF),
-}, {
-/*  [1,0]  */	TNS("fld",F),		TNS("fxch",F),		TNS("fnop",NORM),	TNS("fstp",F),
-/*  [1,4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [2,0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [2,4]  */	INVALID,		TNS("fucompp",NORM),	INVALID,		INVALID,
-}, {
-/*  [3,0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [3,4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [4,0]  */	TNS("fadd",FF),		TNS("fmul",FF),		TNS("fcom",F),		TNS("fcomp",F),
-/*  [4,4]  */	TNS("fsub",FF),		TNS("fsubr",FF),	TNS("fdiv",FF),		TNS("fdivr",FF),
-}, {
-/*  [5,0]  */	TNS("ffree",F),		TNS("fxch",F),		TNS("fst",F),		TNS("fstp",F),
-/*  [5,4]  */	TNS("fucom",F),		TNS("fucomp",F),	INVALID,		INVALID,
-}, {
-/*  [6,0]  */	TNS("faddp",FF),	TNS("fmulp",FF),	TNS("fcomp",F),		TNS("fcompp",NORM),
-/*  [6,4]  */	TNS("fsubp",FF),	TNS("fsubrp",FF),	TNS("fdivp",FF),	TNS("fdivrp",FF),
-}, {
-/*  [7,0]  */	TNS("ffree",F),		TNS("fxch",F),		TNS("fstp",F),		TNS("fstp",F),
-/*  [7,4]  */	TNS("fnstsw",M),	TNS("fucomip",FFC),	TNS("fcomip",FFC),	INVALID,
-} };
-
-const instable_t dis_opFP4[4][8] = {
-{
-/* bit pattern:	1101 1001 111x xxxx */
-/*  [0,0]  */	TNS("fchs",NORM),	TNS("fabs",NORM),	INVALID,		INVALID,
-/*  [0,4]  */	TNS("ftst",NORM),	TNS("fxam",NORM),	TNS("ftstp",NORM),	INVALID,
-}, {
-/*  [1,0]  */	TNS("fld1",NORM),	TNS("fldl2t",NORM),	TNS("fldl2e",NORM),	TNS("fldpi",NORM),
-/*  [1,4]  */	TNS("fldlg2",NORM),	TNS("fldln2",NORM),	TNS("fldz",NORM),	INVALID,
-}, {
-/*  [2,0]  */	TNS("f2xm1",NORM),	TNS("fyl2x",NORM),	TNS("fptan",NORM),	TNS("fpatan",NORM),
-/*  [2,4]  */	TNS("fxtract",NORM),	TNS("fprem1",NORM),	TNS("fdecstp",NORM),	TNS("fincstp",NORM),
-}, {
-/*  [3,0]  */	TNS("fprem",NORM),	TNS("fyl2xp1",NORM),	TNS("fsqrt",NORM),	TNS("fsincos",NORM),
-/*  [3,4]  */	TNS("frndint",NORM),	TNS("fscale",NORM),	TNS("fsin",NORM),	TNS("fcos",NORM),
-} };
-
-const instable_t dis_opFP5[8] = {
-/* bit pattern:	1101 1011 111x xxxx */
-/*  [0]  */	TNS("feni",NORM),	TNS("fdisi",NORM),	TNS("fnclex",NORM),	TNS("fninit",NORM),
-/*  [4]  */	TNS("fsetpm",NORM),	TNS("frstpm",NORM),	INVALID,		INVALID,
-};
-
-const instable_t dis_opFP6[8] = {
-/* bit pattern:	1101 1011 11yy yxxx */
-/*  [00]  */	TNS("fcmov.nb",FF),	TNS("fcmov.ne",FF),	TNS("fcmov.nbe",FF),	TNS("fcmov.nu",FF),
-/*  [04]  */	INVALID,		TNS("fucomi",F),	TNS("fcomi",F),		INVALID,
-};
-
-const instable_t dis_opFP7[8] = {
-/* bit pattern:	1101 1010 11yy yxxx */
-/*  [00]  */	TNS("fcmov.b",FF),	TNS("fcmov.e",FF),	TNS("fcmov.be",FF),	TNS("fcmov.u",FF),
-/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
-};
-
-/*
- *	Main decode table for the op codes.  The first two nibbles
- *	will be used as an index into the table.  If there is
- *	a need to further decode an instruction, the array to be
- *	referenced is indicated with the other two entries being
- *	empty.
- */
-
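-/*
- *	For example, the opcode byte 0x89 is split into its two nibbles
- *	and looked up as dis_distable[0x8][0x9], the TS("mov",RMw) entry
- *	below.
- */
-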
-const instable_t dis_distable[16][16] = {
-{
-/* [0,0] */	TNS("addb",RMw),	TS("add",RMw),		TNS("addb",MRw),	TS("add",MRw),
-/* [0,4] */	TNS("addb",IA),		TS("add",IA),		TSx("push",SEG),	TSx("pop",SEG),
-/* [0,8] */	TNS("orb",RMw),		TS("or",RMw),		TNS("orb",MRw),		TS("or",MRw),
-/* [0,C] */	TNS("orb",IA),		TS("or",IA),		TSx("push",SEG),	IND(&dis_op0F[0][0]),
-}, {
-/* [1,0] */	TNS("adcb",RMw),	TS("adc",RMw),		TNS("adcb",MRw),	TS("adc",MRw),
-/* [1,4] */	TNS("adcb",IA),		TS("adc",IA),		TSx("push",SEG),	TSx("pop",SEG),
-/* [1,8] */	TNS("sbbb",RMw),	TS("sbb",RMw),		TNS("sbbb",MRw),	TS("sbb",MRw),
-/* [1,C] */	TNS("sbbb",IA),		TS("sbb",IA),		TSx("push",SEG),	TSx("pop",SEG),
-}, {
-/* [2,0] */	TNS("andb",RMw),	TS("and",RMw),		TNS("andb",MRw),	TS("and",MRw),
-/* [2,4] */	TNS("andb",IA),		TS("and",IA),		TNSx("%es:",OVERRIDE),	TNSx("daa",NORM),
-/* [2,8] */	TNS("subb",RMw),	TS("sub",RMw),		TNS("subb",MRw),	TS("sub",MRw),
-/* [2,C] */	TNS("subb",IA),		TS("sub",IA),		TNSx("%cs:",OVERRIDE),	TNSx("das",NORM),
-}, {
-/* [3,0] */	TNS("xorb",RMw),	TS("xor",RMw),		TNS("xorb",MRw),	TS("xor",MRw),
-/* [3,4] */	TNS("xorb",IA),		TS("xor",IA),		TNSx("%ss:",OVERRIDE),	TNSx("aaa",NORM),
-/* [3,8] */	TNS("cmpb",RMw),	TS("cmp",RMw),		TNS("cmpb",MRw),	TS("cmp",MRw),
-/* [3,C] */	TNS("cmpb",IA),		TS("cmp",IA),		TNSx("%ds:",OVERRIDE),	TNSx("aas",NORM),
-}, {
-/* [4,0] */	TSx("inc",R),		TSx("inc",R),		TSx("inc",R),		TSx("inc",R),
-/* [4,4] */	TSx("inc",R),		TSx("inc",R),		TSx("inc",R),		TSx("inc",R),
-/* [4,8] */	TSx("dec",R),		TSx("dec",R),		TSx("dec",R),		TSx("dec",R),
-/* [4,C] */	TSx("dec",R),		TSx("dec",R),		TSx("dec",R),		TSx("dec",R),
-}, {
-/* [5,0] */	TSp("push",R),		TSp("push",R),		TSp("push",R),		TSp("push",R),
-/* [5,4] */	TSp("push",R),		TSp("push",R),		TSp("push",R),		TSp("push",R),
-/* [5,8] */	TSp("pop",R),		TSp("pop",R),		TSp("pop",R),		TSp("pop",R),
-/* [5,C] */	TSp("pop",R),		TSp("pop",R),		TSp("pop",R),		TSp("pop",R),
-}, {
-/* [6,0] */	TSZx("pusha",IMPLMEM,28),TSZx("popa",IMPLMEM,28), TSx("bound",MR),	TNS("arpl",RMw),
-/* [6,4] */	TNS("%fs:",OVERRIDE),	TNS("%gs:",OVERRIDE),	TNS("data16",DM),	TNS("addr16",AM),
-/* [6,8] */	TSp("push",I),		TS("imul",IMUL),	TSp("push",Ib),		TS("imul",IMUL),
-/* [6,C] */	TNSZ("insb",IMPLMEM,1),	TSZ("ins",IMPLMEM,4),	TNSZ("outsb",IMPLMEM,1),TSZ("outs",IMPLMEM,4),
-}, {
-/* [7,0] */	TNSy("jo",BD),		TNSy("jno",BD),		TNSy("jb",BD),		TNSy("jae",BD),
-/* [7,4] */	TNSy("je",BD),		TNSy("jne",BD),		TNSy("jbe",BD),		TNSy("ja",BD),
-/* [7,8] */	TNSy("js",BD),		TNSy("jns",BD),		TNSy("jp",BD),		TNSy("jnp",BD),
-/* [7,C] */	TNSy("jl",BD),		TNSy("jge",BD),		TNSy("jle",BD),		TNSy("jg",BD),
-}, {
-/* [8,0] */	IND(dis_op80),		IND(dis_op81),		INDx(dis_op82),		IND(dis_op83),
-/* [8,4] */	TNS("testb",RMw),	TS("test",RMw),		TNS("xchgb",RMw),	TS("xchg",RMw),
-/* [8,8] */	TNS("movb",RMw),	TS("mov",RMw),		TNS("movb",MRw),	TS("mov",MRw),
-/* [8,C] */	TNS("movw",SM),		TS("lea",MR),		TNS("movw",MS),		TSp("pop",M),
-}, {
-/* [9,0] */	TNS("nop",NORM),	TS("xchg",RA),		TS("xchg",RA),		TS("xchg",RA),
-/* [9,4] */	TS("xchg",RA),		TS("xchg",RA),		TS("xchg",RA),		TS("xchg",RA),
-/* [9,8] */	TNS("cXtX",CBW),	TNS("cXtX",CWD),	TNSx("lcall",SO),	TNS("fwait",NORM),
-/* [9,C] */	TSZy("pushf",IMPLMEM,4),TSZy("popf",IMPLMEM,4),	TNSx("sahf",NORM),	TNSx("lahf",NORM),
-}, {
-/* [A,0] */	TNS("movb",OA),		TS("mov",OA),		TNS("movb",AO),		TS("mov",AO),
-/* [A,4] */	TNSZ("movsb",SD,1),	TS("movs",SD),		TNSZ("cmpsb",SD,1),	TS("cmps",SD),
-/* [A,8] */	TNS("testb",IA),	TS("test",IA),		TNS("stosb",AD),	TS("stos",AD),
-/* [A,C] */	TNS("lodsb",SA),	TS("lods",SA),		TNS("scasb",AD),	TS("scas",AD),
-}, {
-/* [B,0] */	TNS("movb",IR),		TNS("movb",IR),		TNS("movb",IR),		TNS("movb",IR),
-/* [B,4] */	TNS("movb",IR),		TNS("movb",IR),		TNS("movb",IR),		TNS("movb",IR),
-/* [B,8] */	TS("mov",IR),		TS("mov",IR),		TS("mov",IR),		TS("mov",IR),
-/* [B,C] */	TS("mov",IR),		TS("mov",IR),		TS("mov",IR),		TS("mov",IR),
-}, {
-/* [C,0] */	IND(dis_opC0),		IND(dis_opC1), 		TNSyp("ret",RET),	TNSyp("ret",NORM),
-/* [C,4] */	TNSx("les",MR),		TNSx("lds",MR),		TNS("movb",IMw),	TS("mov",IMw),
-/* [C,8] */	TNSyp("enter",ENTER),	TNSyp("leave",NORM),	TNS("lret",RET),	TNS("lret",NORM),
-/* [C,C] */	TNS("int",INT3),	TNS("int",INTx),	TNSx("into",NORM),	TNS("iret",NORM),
-}, {
-/* [D,0] */	IND(dis_opD0),		IND(dis_opD1),		IND(dis_opD2),		IND(dis_opD3),
-/* [D,4] */	TNSx("aam",U),		TNSx("aad",U),		TNSx("falc",NORM),	TNSZ("xlat",IMPLMEM,1),
-
-/* 287 instructions.  Note that although the indirect field		*/
-/* indicates opFP1n2 for further decoding, this is not necessarily	*/
-/* the case since the opFP arrays are not partitioned according to key1	*/
-/* and key2.  opFP1n2 is given only to indicate that we haven't		*/
-/* finished decoding the instruction.					*/
-/* [D,8] */	IND(&dis_opFP1n2[0][0]),	IND(&dis_opFP1n2[0][0]),	IND(&dis_opFP1n2[0][0]),	IND(&dis_opFP1n2[0][0]),
-/* [D,C] */	IND(&dis_opFP1n2[0][0]),	IND(&dis_opFP1n2[0][0]),	IND(&dis_opFP1n2[0][0]),	IND(&dis_opFP1n2[0][0]),
-}, {
-/* [E,0] */	TNSy("loopnz",BD),	TNSy("loopz",BD),	TNSy("loop",BD),	TNSy("jcxz",BD),
-/* [E,4] */	TNS("inb",P),		TS("in",P),		TNS("outb",P),		TS("out",P),
-/* [E,8] */	TNSyp("call",D),	TNSy("jmp",D),		TNSx("ljmp",SO),		TNSy("jmp",BD),
-/* [E,C] */	TNS("inb",V),		TS("in",V),		TNS("outb",V),		TS("out",V),
-}, {
-/* [F,0] */	TNS("lock",LOCK),	TNS("icebp", NORM),	TNS("repnz",PREFIX),	TNS("repz",PREFIX),
-/* [F,4] */	TNS("hlt",NORM),	TNS("cmc",NORM),	IND(dis_opF6),		IND(dis_opF7),
-/* [F,8] */	TNS("clc",NORM),	TNS("stc",NORM),	TNS("cli",NORM),	TNS("sti",NORM),
-/* [F,C] */	TNS("cld",NORM),	TNS("std",NORM),	IND(dis_opFE),		IND(dis_opFF),
-} };
-
-/* END CSTYLED */
-
-/*
- * common functions to decode and disassemble an x86 or amd64 instruction
- */
-
-/*
- * These are the individual fields of a REX prefix. Note that a REX
- * prefix with none of these set is still needed to:
- *	- use the MOVSXD (sign extend 32 to 64 bits) instruction
- *	- access the %sil, %dil, %bpl, %spl registers
- */
-#define	REX_W 0x08	/* 64 bit operand size when set */
-#define	REX_R 0x04	/* high order bit extension of ModRM reg field */
-#define	REX_X 0x02	/* high order bit extension of SIB index field */
-#define	REX_B 0x01	/* extends ModRM r_m, SIB base, or opcode reg */
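-
-/*
- * For example, in 64 bit mode the prefix byte 0x4c sets REX_W and REX_R,
- * so a following 0x89 (mov) uses 64 bit operands and takes its reg field
- * from the extended register set; the sequence 4c 89 07 comes out as
- * something like "movq %r8,(%rdi)".
- */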
-
-static uint_t opnd_size;	/* SIZE16, SIZE32 or SIZE64 */
-static uint_t addr_size;	/* SIZE16, SIZE32 or SIZE64 */
-
-/*
- * Even in 64 bit mode, usually only 4 byte immediate operands are supported.
- */
-static int isize[] = {1, 2, 4, 4};
-static int isize64[] = {1, 2, 4, 8};
-
-/*
- * Just a bunch of useful macros.
- */
-#define	WBIT(x)	(x & 0x1)		/* to get w bit	*/
-#define	REGNO(x) (x & 0x7)		/* to get 3 bit register */
-#define	VBIT(x)	((x)>>1 & 0x1)		/* to get 'v' bit */
-#define	OPSIZE(osize, wbit) ((wbit) ? isize[osize] : 1)
-#define	OPSIZE64(osize, wbit) ((wbit) ? isize64[osize] : 1)
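-
-/*
- * For example, the add opcodes 0x00-0x03 in dis_distable follow this
- * encoding: WBIT separates the byte forms (0x00, 0x02) from the
- * opnd_size forms (0x01, 0x03), and VBIT separates the two directions,
- * i.e. the RMw entries (0x00, 0x01) from the MRw entries (0x02, 0x03).
- */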
-
-#define	REG_ONLY 3	/* mode to indicate a register operand (not memory) */
-
-#define	BYTE_OPND	0	/* w-bit value indicating byte register */
-#define	LONG_OPND	1	/* w-bit value indicating opnd_size register */
-#define	MM_OPND		2	/* "value" used to indicate a mmx reg */
-#define	XMM_OPND	3	/* "value" used to indicate a xmm reg */
-#define	SEG_OPND	4	/* "value" used to indicate a segment reg */
-#define	CONTROL_OPND	5	/* "value" used to indicate a control reg */
-#define	DEBUG_OPND	6	/* "value" used to indicate a debug reg */
-#define	TEST_OPND	7	/* "value" used to indicate a test reg */
-#define	WORD_OPND	8	/* w-bit value indicating word size reg */
-
-/*
- * Get the next byte and separate the op code into the high and low nibbles.
- */
-static int
-dtrace_get_opcode(dis86_t *x, uint_t *high, uint_t *low)
-{
-	int byte;
-
-	/*
-	 * x86 instructions have a maximum length of 15 bytes.  Bail out if
-	 * we try to read more.
-	 */
-	if (x->d86_len >= 15)
-		return (x->d86_error = 1);
-
-	if (x->d86_error)
-		return (1);
-	byte = x->d86_get_byte(x->d86_data);
-	if (byte < 0)
-		return (x->d86_error = 1);
-	x->d86_bytes[x->d86_len++] = byte;
-	*low = byte & 0xf;		/* ----xxxx low 4 bits */
-	*high = byte >> 4 & 0xf;	/* xxxx---- bits 7 to 4 */
-	return (0);
-}
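-
-/*
- * For example, fetching the byte 0xc3 (ret) sets *high to 0xc and *low
- * to 0x3, which the caller then uses to index dis_distable[0xc][0x3].
- */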
-
-/*
- * Get and decode an SIB (scaled index base) byte
- */
-static void
-dtrace_get_SIB(dis86_t *x, uint_t *ss, uint_t *index, uint_t *base)
-{
-	int byte;
-
-	if (x->d86_error)
-		return;
-
-	byte = x->d86_get_byte(x->d86_data);
-	if (byte < 0) {
-		x->d86_error = 1;
-		return;
-	}
-	x->d86_bytes[x->d86_len++] = byte;
-
-	*base = byte & 0x7;
-	*index = (byte >> 3) & 0x7;
-	*ss = (byte >> 6) & 0x3;
-}
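-
-/*
- * For example, the SIB byte 0x9c splits into ss = 2, index = 3 and
- * base = 4, which with 64 bit registers and no REX bits is later
- * rendered by dtrace_get_operand() as "(%rsp,%rbx,4)".
- */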
-
-/*
- * Get the byte following the op code and separate it into the
- * mode, register, and r/m fields.  The ModRM byte has the same
- * 2-3-3 bit layout as the SIB byte, which is why dtrace_get_SIB()
- * is reused below to split it.
- */
-static void
-dtrace_get_modrm(dis86_t *x, uint_t *mode, uint_t *reg, uint_t *r_m)
-{
-	if (x->d86_got_modrm == 0) {
-		if (x->d86_rmindex == -1)
-			x->d86_rmindex = x->d86_len;
-		dtrace_get_SIB(x, mode, reg, r_m);
-		x->d86_got_modrm = 1;
-	}
-}
-
-/*
- * Adjust register selection based on any REX prefix bits present.
- */
-/*ARGSUSED*/
-static void
-dtrace_rex_adjust(uint_t rex_prefix, uint_t mode, uint_t *reg, uint_t *r_m)
-{
-	if (reg != NULL && r_m == NULL) {
-		if (rex_prefix & REX_B)
-			*reg += 8;
-	} else {
-		if (reg != NULL && (REX_R & rex_prefix) != 0)
-			*reg += 8;
-		if (r_m != NULL && (REX_B & rex_prefix) != 0)
-			*r_m += 8;
-	}
-}
-
-/*
- * Get an immediate operand of the given size, with sign extension.
- */
-static void
-dtrace_imm_opnd(dis86_t *x, int wbit, int size, int opindex)
-{
-	int i;
-	int byte;
-	int valsize = 0;
-
-	if (x->d86_numopnds < opindex + 1)
-		x->d86_numopnds = opindex + 1;
-
-	switch (wbit) {
-	case BYTE_OPND:
-		valsize = 1;
-		break;
-	case LONG_OPND:
-		if (x->d86_opnd_size == SIZE16)
-			valsize = 2;
-		else if (x->d86_opnd_size == SIZE32)
-			valsize = 4;
-		else
-			valsize = 8;
-		break;
-	case MM_OPND:
-	case XMM_OPND:
-	case SEG_OPND:
-	case CONTROL_OPND:
-	case DEBUG_OPND:
-	case TEST_OPND:
-		valsize = size;
-		break;
-	case WORD_OPND:
-		valsize = 2;
-		break;
-	}
-	if (valsize < size)
-		valsize = size;
-
-	if (x->d86_error)
-		return;
-	x->d86_opnd[opindex].d86_value = 0;
-	for (i = 0; i < size; ++i) {
-		byte = x->d86_get_byte(x->d86_data);
-		if (byte < 0) {
-			x->d86_error = 1;
-			return;
-		}
-		x->d86_bytes[x->d86_len++] = byte;
-		x->d86_opnd[opindex].d86_value |= (uint64_t)byte << (i * 8);
-	}
-	/* Do sign extension */
-	if (x->d86_bytes[x->d86_len - 1] & 0x80) {
-		for (; i < valsize; i++)
-			x->d86_opnd[opindex].d86_value |=
-		    (uint64_t)0xff << (i * 8);
-	}
-#ifdef DIS_TEXT
-	x->d86_opnd[opindex].d86_mode = MODE_SIGNED;
-	x->d86_opnd[opindex].d86_value_size = valsize;
-	x->d86_imm_bytes += size;
-#endif
-}
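-
-/*
- * For example, a byte immediate of 0xf0 fetched with wbit == LONG_OPND
- * and a 32 bit operand size gets a valsize of 4; since bit 7 of the
- * last byte is set, the three remaining bytes are filled with 0xff and
- * the stored value becomes 0xfffffff0.
- */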
-
-/*
- * Get an ip relative operand of the given size, with sign extension.
- */
-static void
-dtrace_disp_opnd(dis86_t *x, int wbit, int size, int opindex)
-{
-	dtrace_imm_opnd(x, wbit, size, opindex);
-#ifdef DIS_TEXT
-	x->d86_opnd[opindex].d86_mode = MODE_IPREL;
-#endif
-}
-
-/*
- * Check to see if there is a segment override prefix pending.
- * If so, print it in the current 'operand' location and set
- * the override flag back to false.
- */
-/*ARGSUSED*/
-static void
-dtrace_check_override(dis86_t *x, int opindex)
-{
-#ifdef DIS_TEXT
-	if (x->d86_seg_prefix) {
-		(void) strlcat(x->d86_opnd[opindex].d86_prefix,
-		    x->d86_seg_prefix, PFIXLEN);
-	}
-#endif
-	x->d86_seg_prefix = NULL;
-}
-
-
-/*
- * Process a single instruction Register or Memory operand.
- *
- * mode = addressing mode from ModRM byte
- * r_m = r_m (or reg if mode == 3) field from ModRM byte
- * wbit = indicates which register set (8 bit, 16 bit, ... MMX, etc.) to use
- * opindex = index of the operand that we are processing (0, 1 or 2)
- *
- * the value of reg or r_m must have already been adjusted for any REX prefix.
- */
-/*ARGSUSED*/
-static void
-dtrace_get_operand(dis86_t *x, uint_t mode, uint_t r_m, int wbit, int opindex)
-{
-	int have_SIB = 0;	/* flag presence of scale-index-byte */
-	uint_t ss;		/* scale-factor from opcode */
-	uint_t index;		/* index register number */
-	uint_t base;		/* base register number */
-	int dispsize;   	/* size of displacement in bytes */
-#ifdef DIS_TEXT
-	char *opnd = x->d86_opnd[opindex].d86_opnd;
-#endif
-
-	if (x->d86_numopnds < opindex + 1)
-		x->d86_numopnds = opindex + 1;
-
-	if (x->d86_error)
-		return;
-
-	/*
-	 * first handle a simple register
-	 */
-	if (mode == REG_ONLY) {
-#ifdef DIS_TEXT
-		switch (wbit) {
-		case MM_OPND:
-			(void) strlcat(opnd, dis_MMREG[r_m], OPLEN);
-			break;
-		case XMM_OPND:
-			(void) strlcat(opnd, dis_XMMREG[r_m], OPLEN);
-			break;
-		case SEG_OPND:
-			(void) strlcat(opnd, dis_SEGREG[r_m], OPLEN);
-			break;
-		case CONTROL_OPND:
-			(void) strlcat(opnd, dis_CONTROLREG[r_m], OPLEN);
-			break;
-		case DEBUG_OPND:
-			(void) strlcat(opnd, dis_DEBUGREG[r_m], OPLEN);
-			break;
-		case TEST_OPND:
-			(void) strlcat(opnd, dis_TESTREG[r_m], OPLEN);
-			break;
-		case BYTE_OPND:
-			if (x->d86_rex_prefix == 0)
-				(void) strlcat(opnd, dis_REG8[r_m], OPLEN);
-			else
-				(void) strlcat(opnd, dis_REG8_REX[r_m], OPLEN);
-			break;
-		case WORD_OPND:
-			(void) strlcat(opnd, dis_REG16[r_m], OPLEN);
-			break;
-		case LONG_OPND:
-			if (x->d86_opnd_size == SIZE16)
-				(void) strlcat(opnd, dis_REG16[r_m], OPLEN);
-			else if (x->d86_opnd_size == SIZE32)
-				(void) strlcat(opnd, dis_REG32[r_m], OPLEN);
-			else
-				(void) strlcat(opnd, dis_REG64[r_m], OPLEN);
-			break;
-		}
-#endif /* DIS_TEXT */
-		return;
-	}
-
-	/*
-	 * if symbolic representation, skip override prefix, if any
-	 */
-	dtrace_check_override(x, opindex);
-
-	/*
-	 * Handle 16 bit memory references first, since they decode
-	 * the mode values more simply.
-	 * mode 1 is r_m + 8 bit displacement
-	 * mode 2 is r_m + 16 bit displacement
-	 * mode 0 is just r_m, unless r_m is 6 which is 16 bit disp
-	 */
-	if (x->d86_addr_size == SIZE16) {
-		if ((mode == 0 && r_m == 6) || mode == 2)
-			dtrace_imm_opnd(x, WORD_OPND, 2, opindex);
-		else if (mode == 1)
-			dtrace_imm_opnd(x, BYTE_OPND, 1, opindex);
-#ifdef DIS_TEXT
-		if (mode == 0 && r_m == 6)
-			x->d86_opnd[opindex].d86_mode = MODE_SIGNED;
-		else if (mode == 0)
-			x->d86_opnd[opindex].d86_mode = MODE_NONE;
-		else
-			x->d86_opnd[opindex].d86_mode = MODE_OFFSET;
-		(void) strlcat(opnd, dis_addr16[mode][r_m], OPLEN);
-#endif
-		return;
-	}
-
-	/*
-	 * 32 and 64 bit addressing modes are more complex since they
-	 * can involve an SIB (scaled index and base) byte to decode.
-	 */
-	if (r_m == ESP_REGNO || r_m == ESP_REGNO + 8) {
-		have_SIB = 1;
-		dtrace_get_SIB(x, &ss, &index, &base);
-		if (x->d86_error)
-			return;
-		if (base != 5 || mode != 0)
-			if (x->d86_rex_prefix & REX_B)
-				base += 8;
-		if (x->d86_rex_prefix & REX_X)
-			index += 8;
-	} else {
-		base = r_m;
-	}
-
-	/*
-	 * Compute the displacement size and get its bytes
-	 */
-	dispsize = 0;
-
-	if (mode == 1)
-		dispsize = 1;
-	else if (mode == 2)
-		dispsize = 4;
-	else if ((r_m & 7) == EBP_REGNO ||
-	    (have_SIB && (base & 7) == EBP_REGNO))
-		dispsize = 4;
-
-	if (dispsize > 0) {
-		dtrace_imm_opnd(x, dispsize == 4 ? LONG_OPND : BYTE_OPND,
-		    dispsize, opindex);
-		if (x->d86_error)
-			return;
-	}
-
-#ifdef DIS_TEXT
-	if (dispsize > 0)
-		x->d86_opnd[opindex].d86_mode = MODE_OFFSET;
-
-	if (have_SIB == 0) {
-		if (x->d86_mode == SIZE32) {
-			if (mode == 0)
-				(void) strlcat(opnd, dis_addr32_mode0[r_m],
-				    OPLEN);
-			else
-				(void) strlcat(opnd, dis_addr32_mode12[r_m],
-				    OPLEN);
-		} else {
-			if (mode == 0)
-				(void) strlcat(opnd, dis_addr64_mode0[r_m],
-				    OPLEN);
-			else
-				(void) strlcat(opnd, dis_addr64_mode12[r_m],
-				    OPLEN);
-		}
-	} else {
-		uint_t need_paren = 0;
-		char **regs;
-		if (x->d86_mode == SIZE32) /* NOTE this is not addr_size! */
-			regs = (char **)dis_REG32;
-		else
-			regs = (char **)dis_REG64;
-
-		/*
-		 * print the base (if any)
-		 */
-		if (base == EBP_REGNO && mode == 0) {
-			if (index != ESP_REGNO) {
-				(void) strlcat(opnd, "(", OPLEN);
-				need_paren = 1;
-			}
-		} else {
-			(void) strlcat(opnd, "(", OPLEN);
-			(void) strlcat(opnd, regs[base], OPLEN);
-			need_paren = 1;
-		}
-
-		/*
-		 * print the index (if any)
-		 */
-		if (index != ESP_REGNO) {
-			(void) strlcat(opnd, ",", OPLEN);
-			(void) strlcat(opnd, regs[index], OPLEN);
-			(void) strlcat(opnd, dis_scale_factor[ss], OPLEN);
-		} else
-			if (need_paren)
-				(void) strlcat(opnd, ")", OPLEN);
-	}
-#endif
-}
-
-/*
- * Operand sequence for standard instruction involving one register
- * and one register/memory operand.
- * wbit indicates a byte(0) or opnd_size(1) operation
- * vbit indicates direction (0 for "opcode r,r_m") or (1 for "opcode r_m, r")
- */
-#define	STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, vbit)  {	\
-		dtrace_get_modrm(x, &mode, &reg, &r_m);			\
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);	\
-		dtrace_get_operand(x, mode, r_m, wbit, vbit);		\
-		dtrace_get_operand(x, REG_ONLY, reg, wbit, 1 - vbit);	\
-}
-
-/*
- * Similar to above, but allows for the two operands to be of different
- * classes (i.e. wbit).
- *	wbit is for the r_m operand
- *	w2 is for the reg operand
- */
-#define	MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, w2, vbit)	{	\
-		dtrace_get_modrm(x, &mode, &reg, &r_m);			\
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);	\
-		dtrace_get_operand(x, mode, r_m, wbit, vbit);		\
-		dtrace_get_operand(x, REG_ONLY, reg, w2, 1 - vbit);	\
-}
-
-/*
- * Similar, but for 2 operands plus an immediate.
- */
-#define	THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, immsize) { \
-		dtrace_get_modrm(x, &mode, &reg, &r_m);			\
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);	\
-		dtrace_get_operand(x, mode, r_m, wbit, 1);		\
-		dtrace_get_operand(x, REG_ONLY, reg, w2, 2);		\
-		dtrace_imm_opnd(x, wbit, immsize, 0);			\
-}
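-
-/*
- * In each of these macros vbit (or the fixed indices in THREEOPERAND)
- * selects the operand order: the r/m operand lands in slot vbit and the
- * register operand in slot 1 - vbit, which is how opcode pairs such as
- * 0x88/0x8a (movb, RMw versus MRw) get their two directions.
- */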
-
-/*
- * Disassemble a single x86 or amd64 instruction.
- *
- * Mode determines the default operating mode (SIZE16, SIZE32 or SIZE64)
- * for interpreting instructions.
- *
- * returns non-zero for bad opcode
- */
-int
-dtrace_disx86(dis86_t *x, uint_t cpu_mode)
-{
-	const instable_t *dp = NULL;	/* decode table being used */
-#ifdef DIS_TEXT
-	uint_t i;
-#endif
-#ifdef DIS_MEM
-	uint_t nomem = 0;
-#define	NOMEM	(nomem = 1)
-#else
-#define	NOMEM	/* nothing */
-#endif
-	uint_t wbit = 0;	/* opcode wbit, 0 is 8 bit, !0 for opnd_size */
-	uint_t w2;		/* wbit value for second operand */
-	uint_t vbit;
-	uint_t mode = 0;	/* mode value from ModRM byte */
-	uint_t reg;		/* reg value from ModRM byte */
-	uint_t r_m;		/* r_m value from ModRM byte */
-
-	uint_t opcode1;		/* high nibble of 1st byte */
-	uint_t opcode2;		/* low nibble of 1st byte */
-	uint_t opcode3;		/* extra opcode bits usually from ModRM byte */
-	uint_t opcode4;		/* high nibble of 2nd byte */
-	uint_t opcode5 = 0;	/* low nibble of 2nd byte */	/* XXX: gcc */
-	uint_t opcode6;		/* high nibble of 3rd byte */
-	uint_t opcode7 = 0;	/* low nibble of 3rd byte */	/* XXX: gcc */
-	uint_t opcode_bytes = 1;
-
-	/*
-	 * legacy prefixes come in 5 flavors; there should be at most one of each
-	 */
-	uint_t	opnd_size_prefix = 0;
-	uint_t	addr_size_prefix = 0;
-	uint_t	segment_prefix = 0;
-	uint_t	lock_prefix = 0;
-	uint_t	rep_prefix = 0;
-	uint_t	rex_prefix = 0;	/* amd64 register extension prefix */
-	size_t	off;
-
-	x->d86_len = 0;
-	x->d86_rmindex = -1;
-	x->d86_error = 0;
-#ifdef DIS_TEXT
-	x->d86_numopnds = 0;
-	x->d86_seg_prefix = NULL;
-	x->d86_mneu[0] = 0;
-	for (i = 0; i < 3; ++i) {
-		x->d86_opnd[i].d86_opnd[0] = 0;
-		x->d86_opnd[i].d86_prefix[0] = 0;
-		x->d86_opnd[i].d86_value_size = 0;
-		x->d86_opnd[i].d86_value = 0;
-		x->d86_opnd[i].d86_mode = MODE_NONE;
-	}
-#endif
-	x->d86_error = 0;
-	x->d86_memsize = 0;
-
-	if (cpu_mode == SIZE16) {
-		opnd_size = SIZE16;
-		addr_size = SIZE16;
-	} else if (cpu_mode == SIZE32) {
-		opnd_size = SIZE32;
-		addr_size = SIZE32;
-	} else {
-		opnd_size = SIZE32;
-		addr_size = SIZE64;
-	}
-
-	/*
-	 * Get one opcode byte and check for zero padding that follows
-	 * jump tables.
-	 */
-	if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0)
-		goto error;
-
-	if (opcode1 == 0 && opcode2 == 0 &&
-	    x->d86_check_func != NULL && x->d86_check_func(x->d86_data)) {
-#ifdef DIS_TEXT
-		(void) strncpy(x->d86_mneu, ".byte\t0", OPLEN);
-#endif
-		goto done;
-	}
-
-	/*
-	 * Gather up legacy x86 prefix bytes.
-	 */
-	for (;;) {
-		uint_t *which_prefix = NULL;
-
-		dp = &dis_distable[opcode1][opcode2];
-
-		switch (dp->it_adrmode) {
-		case PREFIX:
-			which_prefix = &rep_prefix;
-			break;
-		case LOCK:
-			which_prefix = &lock_prefix;
-			break;
-		case OVERRIDE:
-			which_prefix = &segment_prefix;
-#ifdef DIS_TEXT
-			x->d86_seg_prefix = (char *)dp->it_name;
-#endif
-			if (dp->it_invalid64 && cpu_mode == SIZE64)
-				goto error;
-			break;
-		case AM:
-			which_prefix = &addr_size_prefix;
-			break;
-		case DM:
-			which_prefix = &opnd_size_prefix;
-			break;
-		}
-		if (which_prefix == NULL)
-			break;
-		*which_prefix = (opcode1 << 4) | opcode2;
-		if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0)
-			goto error;
-	}
-
-	/*
-	 * Handle amd64 mode PREFIX values.
-	 * Some of the segment prefixes are no-ops (only FS/GS actually work).
-	 * We might have a REX prefix (opcodes 0x40-0x4f).
-	 */
-	if (cpu_mode == SIZE64) {
-		if (segment_prefix != 0x64 && segment_prefix != 0x65)
-			segment_prefix = 0;
-
-		if (opcode1 == 0x4) {
-			rex_prefix = (opcode1 << 4) | opcode2;
-			if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0)
-				goto error;
-			dp = &dis_distable[opcode1][opcode2];
-		}
-	}
-
-	/*
-	 * Deal with selection of operand and address size now.
-	 * Note that the REX.W bit being set causes opnd_size_prefix to be
-	 * ignored.
-	 */
-	if (cpu_mode == SIZE64) {
-		if (rex_prefix & 0x08)
-			opnd_size = SIZE64;
-		else if (opnd_size_prefix)
-			opnd_size = SIZE16;
-
-		if (addr_size_prefix)
-			addr_size = SIZE32;
-	} else if (cpu_mode == SIZE32) {
-		if (opnd_size_prefix)
-			opnd_size = SIZE16;
-		if (addr_size_prefix)
-			addr_size = SIZE16;
-	} else {
-		if (opnd_size_prefix)
-			opnd_size = SIZE32;
-		if (addr_size_prefix)
-			addr_size = SIZE32;
-	}
-
-	/*
-	 * The pause instruction - a repz'd nop.  This doesn't fit
-	 * with any of the other prefix goop added for SSE, so we'll
-	 * special-case it here.
-	 */
-	if (rep_prefix == 0xf3 && opcode1 == 0x9 && opcode2 == 0x0) {
-		rep_prefix = 0;
-		dp = &dis_opPause;
-	}
-
-	/*
-	 * Some 386 instructions have 2 bytes of opcode before the mod_r/m
-	 * byte so we may need to perform a table indirection.
-	 */
-	if (dp->it_indirect == dis_op0F[0]) {
-		if (dtrace_get_opcode(x, &opcode4, &opcode5) != 0)
-			goto error;
-		opcode_bytes = 2;
-		if (opcode4 == 0x7 && opcode5 >= 0x1 && opcode5 <= 0x3) {
-			uint_t	subcode;
-
-			if (dtrace_get_opcode(x, &opcode6, &opcode7) != 0)
-				goto error;
-			opcode_bytes = 3;
-			subcode = ((opcode6 & 0x3) << 1) |
-			    ((opcode7 & 0x8) >> 3);
-			dp = &dis_op0F7123[opcode5][subcode];
-		} else if ((opcode4 == 0xc) && (opcode5 >= 0x8)) {
-			dp = &dis_op0FC8[0];
-		} else {
-			dp = &dis_op0F[opcode4][opcode5];
-		}
-	}
-
-	/*
-	 * If still not at a TERM decode entry, then a ModRM byte
-	 * exists and its fields further decode the instruction.
-	 */
-	x->d86_got_modrm = 0;
-	if (dp->it_indirect != TERM) {
-		dtrace_get_modrm(x, &mode, &opcode3, &r_m);
-		if (x->d86_error)
-			goto error;
-		reg = opcode3;
-
-		/*
-		 * decode 287 instructions (D8-DF) from opcodeN
-		 */
-		if (opcode1 == 0xD && opcode2 >= 0x8) {
-			if (opcode2 == 0xB && mode == 0x3 && opcode3 == 4)
-				dp = &dis_opFP5[r_m];
-			else if (opcode2 == 0xA && mode == 0x3 && opcode3 < 4)
-				dp = &dis_opFP7[opcode3];
-			else if (opcode2 == 0xB && mode == 0x3)
-				dp = &dis_opFP6[opcode3];
-			else if (opcode2 == 0x9 && mode == 0x3 && opcode3 >= 4)
-				dp = &dis_opFP4[opcode3 - 4][r_m];
-			else if (mode == 0x3)
-				dp = &dis_opFP3[opcode2 - 8][opcode3];
-			else
-				dp = &dis_opFP1n2[opcode2 - 8][opcode3];
-		} else {
-			dp = dp->it_indirect + opcode3;
-		}
-	}
-
-	/*
-	 * In amd64 mode, the ARPL opcode is changed to MOVSXD
-	 * (sign-extend 32-bit to 64-bit)
-	 */
-	if (cpu_mode == SIZE64 && opcode1 == 0x6 && opcode2 == 0x3)
-		dp = &dis_opMOVSLD;
-
-	/*
-	 * at this point we should have a correct (or invalid) opcode
-	 */
-	if ((cpu_mode == SIZE64 && dp->it_invalid64) ||
-	    (cpu_mode != SIZE64 && dp->it_invalid32))
-		goto error;
-	if (dp->it_indirect != TERM)
-		goto error;
-
-	/*
-	 * deal with MMX/SSE opcodes which are changed by prefixes
-	 */
-	switch (dp->it_adrmode) {
-	case MMO:
-	case MMOIMPL:
-	case MMO3P:
-	case MMOM3:
-	case MMOMS:
-	case MMOPM:
-	case MMOPRM:
-	case MMOS:
-	case XMMO:
-	case XMMOM:
-	case XMMOMS:
-	case XMMOPM:
-	case XMMOS:
-	case XMMOMX:
-	case XMMOX3:
-	case XMMOXMM:
-		/*
-		 * This is horrible.  Some SIMD instructions take the
-		 * form 0x0F 0x?? ..., which is easily decoded using the
-		 * existing tables.  Other SIMD instructions use various
-		 * prefix bytes to overload existing instructions.  For
-		 * example, addps is 0F, 58, whereas addss is F3 (repz),
-		 * 0F, 58.  Presumably someone got a raise for this.
-		 *
-		 * If we see one of the instructions which can be
-		 * modified in this way (if we've got one of the SIMDO*
-		 * address modes), we'll check to see if the last prefix
-		 * was a repz.  If it was, we strip the prefix from the
-		 * mnemonic, and we indirect using the dis_opSIMDrepz
-		 * table.
-		 */
-
-		/*
-		 * Calculate our offset in dis_op0F
-		 */
-		if ((uintptr_t)dp - (uintptr_t)dis_op0F > sizeof (dis_op0F))
-			goto error;
-
-		off = ((uintptr_t)dp - (uintptr_t)dis_op0F) /
-		    sizeof (instable_t);
-
-		/*
-		 * Rewrite if this instruction used one of the magic prefixes.
-		 */
-		if (rep_prefix) {
-			if (rep_prefix == 0xf2)
-				dp = &dis_opSIMDrepnz[off];
-			else
-				dp = &dis_opSIMDrepz[off];
-			rep_prefix = 0;
-		} else if (opnd_size_prefix) {
-			dp = &dis_opSIMDdata16[off];
-			opnd_size_prefix = 0;
-			if (opnd_size == SIZE16)
-				opnd_size = SIZE32;
-		}
-		break;
-
-	case MMOSH:
-		/*
-		 * As with the "normal" SIMD instructions, the MMX
-		 * shuffle instructions are overloaded.  These
-		 * instructions, however, are special in that they use
-		 * an extra byte, and thus an extra table.  As of this
-		 * writing, they only use the opnd_size prefix.
-		 */
-
-		/*
-		 * Calculate our offset in dis_op0F7123
-		 */
-		if ((uintptr_t)dp - (uintptr_t)dis_op0F7123 >
-		    sizeof (dis_op0F7123))
-			goto error;
-
-		if (opnd_size_prefix) {
-			off = ((uintptr_t)dp - (uintptr_t)dis_op0F7123) /
-			    sizeof (instable_t);
-			dp = &dis_opSIMD7123[off];
-			opnd_size_prefix = 0;
-			if (opnd_size == SIZE16)
-				opnd_size = SIZE32;
-		}
-		break;
-	}
-
-	/*
-	 * In 64 bit mode, some opcodes automatically use opnd_size == SIZE64.
-	 */
-	if (cpu_mode == SIZE64)
-		if (dp->it_always64 || (opnd_size == SIZE32 && dp->it_stackop))
-			opnd_size = SIZE64;
-
-#ifdef DIS_TEXT
-	/*
-	 * At this point most instructions can format the opcode mnemonic
-	 * including the prefixes.
-	 */
-	if (lock_prefix)
-		(void) strlcat(x->d86_mneu, "lock ", OPLEN);
-
-	if (rep_prefix == 0xf2)
-		(void) strlcat(x->d86_mneu, "repnz ", OPLEN);
-	else if (rep_prefix == 0xf3)
-		(void) strlcat(x->d86_mneu, "repz ", OPLEN);
-
-	if (cpu_mode == SIZE64 && addr_size_prefix)
-		(void) strlcat(x->d86_mneu, "addr32 ", OPLEN);
-
-	if (dp->it_adrmode != CBW &&
-	    dp->it_adrmode != CWD &&
-	    dp->it_adrmode != XMMSFNC) {
-		if (strcmp(dp->it_name, "INVALID") == 0)
-			goto error;
-		(void) strlcat(x->d86_mneu, dp->it_name, OPLEN);
-		if (dp->it_suffix) {
-			char *types[] = {"", "w", "l", "q"};
-			if (opcode_bytes == 2 && opcode4 == 4) {
-				/* It's a cmovx.yy. Replace the suffix x */
-				for (i = 5; i < OPLEN; i++) {
-					if (x->d86_mneu[i] == '.')
-						break;
-				}
-				x->d86_mneu[i - 1] = *types[opnd_size];
-			} else {
-				(void) strlcat(x->d86_mneu, types[opnd_size],
-				    OPLEN);
-			}
-		}
-	}
-#endif
-
-	/*
-	 * Process operands based on the addressing modes.
-	 */
-	x->d86_mode = cpu_mode;
-	x->d86_rex_prefix = rex_prefix;
-	x->d86_opnd_size = opnd_size;
-	x->d86_addr_size = addr_size;
-	vbit = 0;		/* initialize for mem/reg -> reg */
-	switch (dp->it_adrmode) {
-		/*
-		 * amd64 instruction to sign extend 32 bit reg/mem operands
-		 * into 64 bit register values
-		 */
-	case MOVSXZ:
-#ifdef DIS_TEXT
-		if (rex_prefix == 0)
-			(void) strncpy(x->d86_mneu, "movzld", OPLEN);
-#endif
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-		x->d86_opnd_size = SIZE64;
-		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
-		x->d86_opnd_size = opnd_size = SIZE32;
-		wbit = LONG_OPND;
-		dtrace_get_operand(x, mode, r_m, wbit, 0);
-		break;
-
-		/*
-		 * movsbl movsbw movsbq (0x0FBE) or movswl movswq (0x0FBF)
-	 * movzbl movzbw movzbq (0x0FB6) or movzwl movzwq (0x0FB7)
-		 * wbit lives in 2nd byte, note that operands
-		 * are different sized
-		 */
-	case MOVZ:
-		if (rex_prefix & REX_W) {
-			/* target register size = 64 bit */
-			x->d86_mneu[5] = 'q';
-		}
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
-		x->d86_opnd_size = opnd_size = SIZE16;
-		wbit = WBIT(opcode5);
-		dtrace_get_operand(x, mode, r_m, wbit, 0);
-		break;
-
-	/*
-	 * imul instruction, with either 8-bit or longer immediate
-	 * opcode 0x6B for byte, sign-extended displacement, 0x69 for word(s)
-	 */
-	case IMUL:
-		wbit = LONG_OPND;
-		THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND,
-		    OPSIZE(opnd_size, opcode2 == 0x9));
-		break;
-
-	/* memory or register operand to register, with 'w' bit	*/
-	case MRw:
-		wbit = WBIT(opcode2);
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0);
-		break;
-
-	/* register to memory or register operand, with 'w' bit	*/
-	/* arpl happens to fit here also because it is odd */
-	case RMw:
-		if (opcode_bytes == 2)
-			wbit = WBIT(opcode5);
-		else
-			wbit = WBIT(opcode2);
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1);
-		break;
-
-	/* xaddb instruction */
-	case XADDB:
-		wbit = 0;
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1);
-		break;
-
-	/* MMX register to memory or register operand		*/
-	case MMS:
-	case MMOS:
-#ifdef DIS_TEXT
-		wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND;
-#else
-		wbit = LONG_OPND;
-#endif
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 1);
-		break;
-
-	/* MMX register to memory */
-	case MMOMS:
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		if (mode == REG_ONLY)
-			goto error;
-		wbit = MM_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 1);
-		break;
-
-	/* Double shift. Has immediate operand specifying the shift. */
-	case DSHIFT:
-		wbit = LONG_OPND;
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 2);
-		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
-		dtrace_imm_opnd(x, wbit, 1, 0);
-		break;
-
-	/*
-	 * Double shift. With no immediate operand, specifies using %cl.
-	 */
-	case DSHIFTcl:
-		wbit = LONG_OPND;
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1);
-		break;
-
-	/* immediate to memory or register operand */
-	case IMlw:
-		wbit = WBIT(opcode2);
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 1);
-		/*
-		 * Have long immediate for opcode 0x81, but not 0x80 nor 0x83
-		 */
-		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, opcode2 == 1), 0);
-		break;
-
-	/* immediate to memory or register operand with the	*/
-	/* 'w' bit present					*/
-	case IMw:
-		wbit = WBIT(opcode2);
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 1);
-		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, wbit), 0);
-		break;
-
-	/* immediate to register with register in low 3 bits	*/
-	/* of op code						*/
-	case IR:
-		/* w-bit here (with regs) is bit 3 */
-		wbit = opcode2 >> 3 & 0x1;
-		reg = REGNO(opcode2);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, NULL);
-		mode = REG_ONLY;
-		r_m = reg;
-		dtrace_get_operand(x, mode, r_m, wbit, 1);
-		dtrace_imm_opnd(x, wbit, OPSIZE64(opnd_size, wbit), 0);
-		break;
-
-	/* MMX immediate shift of register */
-	case MMSH:
-	case MMOSH:
-		wbit = MM_OPND;
-		goto mm_shift;	/* in next case */
-
-	/* SIMD immediate shift of register */
-	case XMMSH:
-		wbit = XMM_OPND;
-mm_shift:
-		reg = REGNO(opcode7);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, NULL);
-		dtrace_get_operand(x, REG_ONLY, reg, wbit, 1);
-		dtrace_imm_opnd(x, wbit, 1, 0);
-		NOMEM;
-		break;
-
-	/* accumulator to memory operand */
-	case AO:
-		vbit = 1;
-		/*FALLTHROUGH*/
-
-	/* memory operand to accumulator */
-	case OA:
-		wbit = WBIT(opcode2);
-		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1 - vbit);
-		dtrace_imm_opnd(x, wbit, OPSIZE64(addr_size, LONG_OPND), vbit);
-#ifdef DIS_TEXT
-		x->d86_opnd[vbit].d86_mode = MODE_OFFSET;
-#endif
-		break;
-
-
-	/* segment register to memory or register operand */
-	case SM:
-		vbit = 1;
-		/*FALLTHROUGH*/
-
-	/* memory or register operand to segment register */
-	case MS:
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, LONG_OPND, vbit);
-		dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 1 - vbit);
-		break;
-
-	/*
-	 * rotate or shift instructions, which may shift by 1 or
-	 * consult the cl register, depending on the 'v' bit
-	 */
-	case Mv:
-		vbit = VBIT(opcode2);
-		wbit = WBIT(opcode2);
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 1);
-#ifdef DIS_TEXT
-		if (vbit) {
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "%cl", OPLEN);
-		} else {
-			x->d86_opnd[0].d86_mode = MODE_SIGNED;
-			x->d86_opnd[0].d86_value_size = 1;
-			x->d86_opnd[0].d86_value = 1;
-		}
-#endif
-		break;
-	/*
-	 * immediate rotate or shift instructions
-	 */
-	case MvI:
-		wbit = WBIT(opcode2);
-normal_imm_mem:
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 1);
-		dtrace_imm_opnd(x, wbit, 1, 0);
-		break;
-
-	/* bit test instructions */
-	case MIb:
-		wbit = LONG_OPND;
-		goto normal_imm_mem;
-
-	/* single memory or register operand with 'w' bit present */
-	case Mw:
-		wbit = WBIT(opcode2);
-just_mem:
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 0);
-		break;
-
-	case SWAPGS:
-		if (cpu_mode == SIZE64 && mode == 3 && r_m == 0) {
-#ifdef DIS_TEXT
-			(void) strncpy(x->d86_mneu, "swapgs", OPLEN);
-#endif
-			NOMEM;
-			break;
-		}
-		/*FALLTHROUGH*/
-
-	/* prefetch instruction - memory operand, but no memory access */
-	case PREF:
-		NOMEM;
-		/*FALLTHROUGH*/
-
-	/* single memory or register operand */
-	case M:
-		wbit = LONG_OPND;
-		goto just_mem;
-
-	/* single memory or register byte operand */
-	case Mb:
-		wbit = BYTE_OPND;
-		goto just_mem;
-
-	case MO:
-		/* Similar to M, but only memory (no direct registers) */
-		wbit = LONG_OPND;
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		if (mode == 3)
-			goto error;
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 0);
-		break;
-
-	/* move special register to register or reverse if vbit */
-	case SREG:
-		switch (opcode5) {
-
-		case 2:
-			vbit = 1;
-			/*FALLTHROUGH*/
-		case 0:
-			wbit = CONTROL_OPND;
-			break;
-
-		case 3:
-			vbit = 1;
-			/*FALLTHROUGH*/
-		case 1:
-			wbit = DEBUG_OPND;
-			break;
-
-		case 6:
-			vbit = 1;
-			/*FALLTHROUGH*/
-		case 4:
-			wbit = TEST_OPND;
-			break;
-
-		}
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-		dtrace_get_operand(x, REG_ONLY, reg, wbit, vbit);
-		dtrace_get_operand(x, REG_ONLY, r_m, LONG_OPND, 1 - vbit);
-		NOMEM;
-		break;
-
-	/*
-	 * single register operand with register in the low 3
-	 * bits of op code
-	 */
-	case R:
-		if (opcode_bytes == 2)
-			reg = REGNO(opcode5);
-		else
-			reg = REGNO(opcode2);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, NULL);
-		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0);
-		NOMEM;
-		break;
-
-	/*
-	 * register to accumulator with register in the low 3
-	 * bits of op code, xchg instructions
-	 */
-	case RA:
-		NOMEM;
-		reg = REGNO(opcode2);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, NULL);
-		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0);
-		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, LONG_OPND, 1);
-		break;
-
-	/*
-	 * single segment register operand, with register in
-	 * bits 3-4 of op code byte
-	 */
-	case SEG:
-		NOMEM;
-		reg = (x->d86_bytes[x->d86_len - 1] >> 3) & 0x3;
-		dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 0);
-		break;
-
-	/*
-	 * single segment register operand, with register in
-	 * bits 3-5 of op code
-	 */
-	case LSEG:
-		NOMEM;
-		/* long seg reg from opcode */
-		reg = (x->d86_bytes[x->d86_len - 1] >> 3) & 0x7;
-		dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 0);
-		break;
-
-	/* memory or register operand to register */
-	case MR:
-		wbit = LONG_OPND;
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0);
-		break;
-
-	case RM:
-		wbit = LONG_OPND;
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1);
-		break;
-
-	/* MMX/SIMD-Int memory or mm reg to mm reg		*/
-	case MM:
-	case MMO:
-#ifdef DIS_TEXT
-		wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND;
-#else
-		wbit = LONG_OPND;
-#endif
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 0);
-		break;
-
-	case MMOIMPL:
-#ifdef DIS_TEXT
-		wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND;
-#else
-		wbit = LONG_OPND;
-#endif
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		if (mode != REG_ONLY)
-			goto error;
-
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 0);
-		dtrace_get_operand(x, REG_ONLY, reg, MM_OPND, 1);
-		mode = 0;	/* change for memory access size... */
-		break;
-
-	/* MMX/SIMD-Int and SIMD-FP predicated mm reg to r32 */
-	case MMO3P:
-		wbit = MM_OPND;
-		goto xmm3p;
-	case XMM3P:
-		wbit = XMM_OPND;
-xmm3p:
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		if (mode != REG_ONLY)
-			goto error;
-
-		THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 1);
-		NOMEM;
-		break;
-
-	/* MMX/SIMD-Int predicated r32/mem to mm reg */
-	case MMOPRM:
-		wbit = LONG_OPND;
-		w2 = MM_OPND;
-		goto xmmprm;
-	case XMMPRM:
-		wbit = LONG_OPND;
-		w2 = XMM_OPND;
-xmmprm:
-		THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, 1);
-		break;
-
-	/* MMX/SIMD-Int predicated mm/mem to mm reg */
-	case MMOPM:
-		wbit = w2 = MM_OPND;
-		goto xmmprm;
-
-	/* MMX/SIMD-Int mm reg to r32 */
-	case MMOM3:
-		NOMEM;
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		if (mode != REG_ONLY)
-			goto error;
-		wbit = MM_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 0);
-		break;
-
-	/* SIMD memory or xmm reg operand to xmm reg		*/
-	case XMM:
-	case XMMO:
-	case XMMXIMPL:
-		wbit = XMM_OPND;
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0);
-
-		if (dp->it_adrmode == XMMXIMPL && mode != REG_ONLY)
-			goto error;
-
-#ifdef DIS_TEXT
-		/*
-		 * movlps and movhlps share opcodes.  They differ in the
-		 * addressing modes allowed for their operands.
-		 * movhps and movlhps behave similarly.
-		 */
-		if (mode == REG_ONLY) {
-			if (strcmp(dp->it_name, "movlps") == 0)
-				(void) strncpy(x->d86_mneu, "movhlps", OPLEN);
-			else if (strcmp(dp->it_name, "movhps") == 0)
-				(void) strncpy(x->d86_mneu, "movlhps", OPLEN);
-		}
-#endif
-		if (dp->it_adrmode == XMMXIMPL)
-			mode = 0;	/* change for memory access size... */
-		break;
-
-	/* SIMD xmm reg to memory or xmm reg */
-	case XMMS:
-	case XMMOS:
-	case XMMMS:
-	case XMMOMS:
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-#ifdef DIS_TEXT
-		if ((strcmp(dp->it_name, "movlps") == 0 ||
-		    strcmp(dp->it_name, "movhps") == 0 ||
-		    strcmp(dp->it_name, "movntps") == 0) &&
-		    mode == REG_ONLY)
-			goto error;
-#endif
-		wbit = XMM_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1);
-		break;
-
-	/* SIMD memory to xmm reg */
-	case XMMM:
-	case XMMOM:
-		wbit = XMM_OPND;
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-#ifdef DIS_TEXT
-		if (mode == REG_ONLY) {
-			if (strcmp(dp->it_name, "movhps") == 0)
-				(void) strncpy(x->d86_mneu, "movlhps", OPLEN);
-			else
-				goto error;
-		}
-#endif
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0);
-		break;
-
-	/* SIMD memory or r32 to xmm reg			*/
-	case XMM3MX:
-		wbit = LONG_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0);
-		break;
-
-	case XMM3MXS:
-		wbit = LONG_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1);
-		break;
-
-	/* SIMD memory or mm reg to xmm reg			*/
-	case XMMOMX:
-	/* SIMD mm to xmm */
-	case XMMMX:
-		wbit = MM_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0);
-		break;
-
-	/* SIMD memory or xmm reg to mm reg			*/
-	case XMMXMM:
-	case XMMOXMM:
-	case XMMXM:
-		wbit = XMM_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 0);
-		break;
-
-
-	/* SIMD memory or xmm reg to r32			*/
-	case XMMXM3:
-		wbit = XMM_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 0);
-		break;
-
-	/* SIMD xmm to r32					*/
-	case XMMX3:
-	case XMMOX3:
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		if (mode != REG_ONLY)
-			goto error;
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-		dtrace_get_operand(x, mode, r_m, XMM_OPND, 0);
-		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
-		NOMEM;
-		break;
-
-	/* SIMD predicated memory or xmm reg with/to xmm reg */
-	case XMMP:
-	case XMMOPM:
-		wbit = XMM_OPND;
-		THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1);
-
-#ifdef DIS_TEXT
-		/*
-		 * cmpps and cmpss vary their instruction name based
-		 * on the value of imm8.  Other XMMP instructions,
-		 * such as shufps, require explicit specification of
-		 * the predicate.
-		 */
-		if (dp->it_name[0] == 'c' &&
-		    dp->it_name[1] == 'm' &&
-		    dp->it_name[2] == 'p' &&
-		    strlen(dp->it_name) == 5) {
-			uchar_t pred = x->d86_opnd[0].d86_value & 0xff;
-
-			if (pred >= (sizeof (dis_PREDSUFFIX) / sizeof (char *)))
-				goto error;
-
-			(void) strncpy(x->d86_mneu, "cmp", OPLEN);
-			(void) strlcat(x->d86_mneu, dis_PREDSUFFIX[pred],
-			    OPLEN);
-			(void) strlcat(x->d86_mneu,
-			    dp->it_name + strlen(dp->it_name) - 2,
-			    OPLEN);
-			x->d86_opnd[0] = x->d86_opnd[1];
-			x->d86_opnd[1] = x->d86_opnd[2];
-			x->d86_numopnds = 2;
-		}
-#endif
-		break;
-
-	/* immediate operand to accumulator */
-	case IA:
-		wbit = WBIT(opcode2);
-		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1);
-		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, wbit), 0);
-		NOMEM;
-		break;
-
-	/* memory or register operand to accumulator */
-	case MA:
-		wbit = WBIT(opcode2);
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 0);
-		break;
-
-	/* si register to di register used to reference memory		*/
-	case SD:
-#ifdef DIS_TEXT
-		dtrace_check_override(x, 0);
-		x->d86_numopnds = 2;
-		if (addr_size == SIZE64) {
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%rsi)",
-			    OPLEN);
-			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%rdi)",
-			    OPLEN);
-		} else if (addr_size == SIZE32) {
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%esi)",
-			    OPLEN);
-			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%edi)",
-			    OPLEN);
-		} else {
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%si)",
-			    OPLEN);
-			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%di)",
-			    OPLEN);
-		}
-#endif
-		wbit = LONG_OPND;
-		break;
-
-	/* accumulator to di register				*/
-	case AD:
-		wbit = WBIT(opcode2);
-#ifdef DIS_TEXT
-		dtrace_check_override(x, 1);
-		x->d86_numopnds = 2;
-		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 0);
-		if (addr_size == SIZE64)
-			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%rdi)",
-			    OPLEN);
-		else if (addr_size == SIZE32)
-			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%edi)",
-			    OPLEN);
-		else
-			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%di)",
-			    OPLEN);
-#endif
-		break;
-
-	/* si register to accumulator				*/
-	case SA:
-		wbit = WBIT(opcode2);
-#ifdef DIS_TEXT
-		dtrace_check_override(x, 0);
-		x->d86_numopnds = 2;
-		if (addr_size == SIZE64)
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%rsi)",
-			    OPLEN);
-		else if (addr_size == SIZE32)
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%esi)",
-			    OPLEN);
-		else
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%si)",
-			    OPLEN);
-		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1);
-#endif
-		break;
-
-	/*
-	 * single operand, a 16/32 bit displacement
-	 */
-	case D:
-		wbit = LONG_OPND;
-		dtrace_disp_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 0);
-		NOMEM;
-		break;
-
-	/* jmp/call indirect to memory or register operand		*/
-	case INM:
-#ifdef DIS_TEXT
-		(void) strlcat(x->d86_opnd[0].d86_prefix, "*", OPLEN);
-#endif
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, LONG_OPND, 0);
-		wbit = LONG_OPND;
-		break;
-
-	/*
-	 * for long jumps and long calls -- a new code segment
-	 * register and an offset in IP -- stored in object
-	 * code in reverse order. Note - not valid in amd64
-	 */
-	case SO:
-		dtrace_check_override(x, 1);
-		wbit = LONG_OPND;
-		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 1);
-#ifdef DIS_TEXT
-		x->d86_opnd[1].d86_mode = MODE_SIGNED;
-#endif
-		/* will now get segment operand */
-		dtrace_imm_opnd(x, wbit, 2, 0);
-		break;
-
-	/*
-	 * jmp/call. single operand, 8 bit displacement.
-	 * added to current EIP in 'compoff'
-	 */
-	case BD:
-		dtrace_disp_opnd(x, BYTE_OPND, 1, 0);
-		NOMEM;
-		break;
-
-	/* single 32/16 bit immediate operand			*/
-	case I:
-		wbit = LONG_OPND;
-		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 0);
-		break;
-
-	/* single 8 bit immediate operand			*/
-	case Ib:
-		wbit = LONG_OPND;
-		dtrace_imm_opnd(x, wbit, 1, 0);
-		break;
-
-	case ENTER:
-		wbit = LONG_OPND;
-		dtrace_imm_opnd(x, wbit, 2, 0);
-		dtrace_imm_opnd(x, wbit, 1, 1);
-		switch (opnd_size) {
-		case SIZE64:
-			x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 8;
-			break;
-		case SIZE32:
-			x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 4;
-			break;
-		case SIZE16:
-			x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 2;
-			break;
-		}
-
-		break;
-
-	/* 16-bit immediate operand */
-	case RET:
-		wbit = LONG_OPND;
-		dtrace_imm_opnd(x, wbit, 2, 0);
-		break;
-
-	/* single 8 bit port operand				*/
-	case P:
-		dtrace_check_override(x, 0);
-		dtrace_imm_opnd(x, BYTE_OPND, 1, 0);
-		NOMEM;
-		break;
-
-	/* single operand, dx register (variable port instruction) */
-	case V:
-		x->d86_numopnds = 1;
-		dtrace_check_override(x, 0);
-#ifdef DIS_TEXT
-		(void) strlcat(x->d86_opnd[0].d86_opnd, "(%dx)", OPLEN);
-#endif
-		NOMEM;
-		break;
-
-	/*
-	 * The int instruction, which has two forms:
-	 * int 3 (breakpoint) or
-	 * int n, where n is indicated in the subsequent
-	 * byte (format Ib).  In the int 3 instruction (opcode 0xCC),
-	 * although the 3 looks like an operand,
-	 * it is implied by the opcode and must be converted
-	 * to the correct base and output.
-	 */
-	case INT3:
-#ifdef DIS_TEXT
-		x->d86_numopnds = 1;
-		x->d86_opnd[0].d86_mode = MODE_SIGNED;
-		x->d86_opnd[0].d86_value_size = 1;
-		x->d86_opnd[0].d86_value = 3;
-#endif
-		NOMEM;
-		break;
-
-	/* single 8 bit immediate operand			*/
-	case INTx:
-		dtrace_imm_opnd(x, BYTE_OPND, 1, 0);
-		NOMEM;
-		break;
-
-	/* an unused byte must be discarded */
-	case U:
-		if (x->d86_get_byte(x->d86_data) < 0)
-			goto error;
-		x->d86_len++;
-		NOMEM;
-		break;
-
-	case CBW:
-#ifdef DIS_TEXT
-		if (opnd_size == SIZE16)
-			(void) strlcat(x->d86_mneu, "cbtw", OPLEN);
-		else if (opnd_size == SIZE32)
-			(void) strlcat(x->d86_mneu, "cwtl", OPLEN);
-		else
-			(void) strlcat(x->d86_mneu, "cltq", OPLEN);
-#endif
-		wbit = LONG_OPND;
-		NOMEM;
-		break;
-
-	case CWD:
-#ifdef DIS_TEXT
-		if (opnd_size == SIZE16)
-			(void) strlcat(x->d86_mneu, "cwtd", OPLEN);
-		else if (opnd_size == SIZE32)
-			(void) strlcat(x->d86_mneu, "cltd", OPLEN);
-		else
-			(void) strlcat(x->d86_mneu, "cqtd", OPLEN);
-#endif
-		wbit = LONG_OPND;
-		NOMEM;
-		break;
-
-	case XMMSFNC:
-		/*
-		 * The mnemonic is 'sfence' if mode is REG_ONLY.  If mode isn't
-		 * REG_ONLY, the mnemonic should be 'clflush'.
-		 */
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-
-		/* sfence doesn't take operands */
-#ifdef DIS_TEXT
-		if (mode == REG_ONLY) {
-			(void) strlcat(x->d86_mneu, "sfence", OPLEN);
-		} else {
-			(void) strlcat(x->d86_mneu, "clflush", OPLEN);
-			dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-			dtrace_get_operand(x, mode, r_m, BYTE_OPND, 0);
-			NOMEM;
-		}
-#else
-		if (mode != REG_ONLY) {
-			dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-			dtrace_get_operand(x, mode, r_m, BYTE_OPND, 0);
-			NOMEM;
-		}
-#endif
-		break;
-
-	/*
-	 * no disassembly, the mnemonic was all there was so go on
-	 */
-	case NORM:
-		if (dp->it_invalid32 && cpu_mode != SIZE64)
-			goto error;
-		NOMEM;
-		/*FALLTHROUGH*/
-	case IMPLMEM:
-		break;
-
-	case XMMFENCE:
-		/*
-		 * Only the following exact byte sequences are allowed:
-		 *
-		 * 	0f ae e8	lfence
-		 * 	0f ae f0	mfence
-		 */
-		if ((uint8_t)x->d86_bytes[x->d86_len - 1] != 0xe8 &&
-		    (uint8_t)x->d86_bytes[x->d86_len - 1] != 0xf0)
-			goto error;
-
-		break;
-
-
-	/* float reg */
-	case F:
-#ifdef DIS_TEXT
-		x->d86_numopnds = 1;
-		(void) strlcat(x->d86_opnd[0].d86_opnd, "%st(X)", OPLEN);
-		x->d86_opnd[0].d86_opnd[4] = r_m + '0';
-#endif
-		NOMEM;
-		break;
-
-	/* float reg to float reg, with ret bit present */
-	case FF:
-		vbit = opcode2 >> 2 & 0x1;	/* vbit = 1: st -> st(i) */
-		/*FALLTHROUGH*/
-	case FFC:				/* case for vbit always = 0 */
-#ifdef DIS_TEXT
-		x->d86_numopnds = 2;
-		(void) strlcat(x->d86_opnd[1 - vbit].d86_opnd, "%st", OPLEN);
-		(void) strlcat(x->d86_opnd[vbit].d86_opnd, "%st(X)", OPLEN);
-		x->d86_opnd[vbit].d86_opnd[4] = r_m + '0';
-#endif
-		NOMEM;
-		break;
-
-	/* an invalid op code */
-	case AM:
-	case DM:
-	case OVERRIDE:
-	case PREFIX:
-	case UNKNOWN:
-		NOMEM;
-	default:
-		goto error;
-	} /* end switch */
-	if (x->d86_error)
-		goto error;
-
-done:
-#ifdef DIS_MEM
-	/*
-	 * compute the size of any memory accessed by the instruction
-	 */
-	if (x->d86_memsize != 0) {
-		return (0);
-	} else if (dp->it_stackop) {
-		switch (opnd_size) {
-		case SIZE16:
-			x->d86_memsize = 2;
-			break;
-		case SIZE32:
-			x->d86_memsize = 4;
-			break;
-		case SIZE64:
-			x->d86_memsize = 8;
-			break;
-		}
-	} else if (nomem || mode == REG_ONLY) {
-		x->d86_memsize = 0;
-
-	} else if (dp->it_size != 0) {
-		/*
-		 * In 64 bit mode descriptor table entries
-		 * go up to 10 bytes and popf/pushf are always 8 bytes
-		 */
-		if (x->d86_mode == SIZE64 && dp->it_size == 6)
-			x->d86_memsize = 10;
-		else if (x->d86_mode == SIZE64 && opcode1 == 0x9 &&
-		    (opcode2 == 0xc || opcode2 == 0xd))
-			x->d86_memsize = 8;
-		else
-			x->d86_memsize = dp->it_size;
-
-	} else if (wbit == 0) {
-		x->d86_memsize = 1;
-
-	} else if (wbit == LONG_OPND) {
-		if (opnd_size == SIZE64)
-			x->d86_memsize = 8;
-		else if (opnd_size == SIZE32)
-			x->d86_memsize = 4;
-		else
-			x->d86_memsize = 2;
-
-	} else if (wbit == SEG_OPND) {
-		x->d86_memsize = 4;
-
-	} else {
-		x->d86_memsize = 8;
-	}
-#endif
-	return (0);
-
-error:
-#ifdef DIS_TEXT
-	(void) strlcat(x->d86_mneu, "undef", OPLEN);
-#endif
-	return (1);
-}
-
-#ifdef DIS_TEXT
-
-/*
- * Some instructions should have immediate operands printed
- * as unsigned integers. We compare against this table.
- */
-static char *unsigned_ops[] = {
-	"or", "and", "xor", "test", "in", "out", "lcall", "ljmp",
-	"rcr", "rcl", "ror", "rol", "shl", "shr", "sal", "psr", "psl",
-	0
-};
-
-static int
-isunsigned_op(char *opcode)
-{
-	char *where;
-	int i;
-	int is_unsigned = 0;
-
-	/*
-	 * Work back to start of last mnemonic, since we may have
-	 * prefixes on some opcodes.
-	 */
-	where = opcode + strlen(opcode) - 1;
-	while (where > opcode && *where != ' ')
-		--where;
-	if (*where == ' ')
-		++where;
-
-	for (i = 0; unsigned_ops[i]; ++i) {
-		if (strncmp(where, unsigned_ops[i],
-		    strlen(unsigned_ops[i])))
-			continue;
-		is_unsigned = 1;
-		break;
-	}
-	return (is_unsigned);
-}
-
-/* ARGSUSED */
-void
-dtrace_disx86_str(dis86_t *dis, uint_t mode, uintptr_t pc, char *buf,
-    size_t buflen)
-{
-	int i;
-
-	dis->d86_sprintf_func(buf, buflen, "%-6s ", dis->d86_mneu);
-
-	/*
-	 * For PC-relative jumps, the pc is really the next pc after executing
-	 * this instruction, so increment it appropriately.
-	 */
-	pc += dis->d86_len;
-
-	for (i = 0; i < dis->d86_numopnds; i++) {
-		d86opnd_t *op = &dis->d86_opnd[i];
-		int64_t sv;
-		uint64_t mask;
-
-		if (i != 0)
-			(void) strlcat(buf, ",", buflen);
-
-		(void) strlcat(buf, op->d86_prefix, buflen);
-
-		sv = op->d86_value;
-
-		switch (op->d86_mode) {
-
-		case MODE_NONE:
-
-			(void) strlcat(buf, op->d86_opnd, buflen);
-			break;
-
-		case MODE_SIGNED:
-		case MODE_IMPLIED:
-		case MODE_OFFSET:
-
-			if (dis->d86_seg_prefix)
-				(void) strlcat(buf, dis->d86_seg_prefix,
-				    buflen);
-
-			switch (op->d86_value_size) {
-			case 1:
-				sv = (int8_t)sv;
-				mask = 0xff;
-				break;
-			case 2:
-				sv = (int16_t)sv;
-				mask = 0xffff;
-				break;
-			case 4:
-				sv = (int32_t)sv;
-				mask = 0xffffffff;
-				break;
-			case 8:
-				mask = 0xffffffffffffffffULL;
-				break;
-			}
-
-			if (op->d86_mode == MODE_SIGNED ||
-			    op->d86_mode == MODE_IMPLIED)
-				(void) strlcat(buf, "$", buflen);
-
-			if (sv < 0 && sv > -0xffff &&
-			    !isunsigned_op(dis->d86_mneu)) {
-				dis->d86_sprintf_func(buf + strlen(buf),
-				    buflen - strlen(buf),
-				    (dis->d86_flags & DIS_OP_OCTAL) ?
-				    "-0%llo" : "-0x%llx", -sv & mask);
-			} else {
-				dis->d86_sprintf_func(buf + strlen(buf),
-				    buflen - strlen(buf),
-				    (dis->d86_flags & DIS_OP_OCTAL) ?
-				    "0%llo" : "0x%llx", sv & mask);
-			}
-			(void) strlcat(buf, op->d86_opnd, buflen);
-			break;
-
-		case MODE_IPREL:
-
-			switch (op->d86_value_size) {
-			case 1:
-				sv = (int8_t)sv;
-				break;
-			case 2:
-				sv = (int16_t)sv;
-				break;
-			case 4:
-				sv = (int32_t)sv;
-				break;
-			}
-
-			if (sv < 0)
-				dis->d86_sprintf_func(buf + strlen(buf),
-				    buflen - strlen(buf),
-				    (dis->d86_flags & DIS_OP_OCTAL) ?
-				    "-0%llo" : "-0x%llx", -sv - dis->d86_len);
-			else
-				dis->d86_sprintf_func(buf + strlen(buf),
-				    buflen - strlen(buf),
-				    (dis->d86_flags & DIS_OP_OCTAL) ?
-				    "+0%llo" : "+0x%llx", sv + dis->d86_len);
-
-			(void) strlcat(buf, "\t<", buflen);
-
-			if (dis->d86_sym_lookup == NULL ||
-			    dis->d86_sym_lookup(dis->d86_data, pc + sv,
-			    buf + strlen(buf), buflen - strlen(buf)) != 0)
-				dis->d86_sprintf_func(buf + strlen(buf),
-				    buflen - strlen(buf),
-				    (dis->d86_flags & DIS_OP_OCTAL) ?
-				    "0%llo" : "0x%llx", pc + sv);
-
-			(void) strlcat(buf, ">", buflen);
-
-			break;
-		}
-	}
-}
-
-#endif /* DIS_TEXT */
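
For reference, the ModRM and SIB parsing done by dtrace_get_modrm() and
dtrace_get_SIB() in the code above reduces to fixed bit fields: ModRM is
mode(2)/reg(3)/r_m(3), SIB is ss(2)/index(3)/base(3), and the REX.X/REX.B
bits widen index and base, as in the "index += 8" and "base += 8"
adjustments. A minimal sketch of that decomposition follows; the helper
names are hypothetical and not taken from this code.

#include <stdint.h>

/* Split a ModRM byte into mode (2 bits), reg (3 bits) and r_m (3 bits). */
static void
modrm_fields(uint8_t b, unsigned *mode, unsigned *reg, unsigned *r_m)
{
	*mode = (b >> 6) & 0x3;
	*reg  = (b >> 3) & 0x7;
	*r_m  = b & 0x7;
}

/*
 * Split a SIB byte into ss/index/base; REX.X (0x2) and REX.B (0x1)
 * widen index and base to 4 bits.  Note the real decoder skips the
 * REX.B extension for the base == 5, mode == 0 (disp32-only) form.
 */
static void
sib_fields(uint8_t b, uint8_t rex, unsigned *ss, unsigned *index,
    unsigned *base)
{
	*ss    = (b >> 6) & 0x3;
	*index = ((b >> 3) & 0x7) + ((rex & 0x2) ? 8 : 0);	/* REX.X */
	*base  = (b & 0x7) + ((rex & 0x1) ? 8 : 0);		/* REX.B */
}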
Index: src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.h
===================================================================
RCS file: src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.h
diff -N src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.h
--- src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.h	21 Feb 2010 01:46:33 -0000	1.2
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,114 +0,0 @@
-/*	$NetBSD: dis_tables.h,v 1.2 2010/02/21 01:46:33 darran Exp $	*/
-
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- *
- * $FreeBSD: src/sys/cddl/dev/dtrace/amd64/dis_tables.h,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-/*	Copyright (c) 1988 AT&T	*/
-/*	  All Rights Reserved  	*/
-
-
-#ifndef _DIS_TABLES_H
-#define	_DIS_TABLES_H
-
-#if defined(sun)
-#pragma ident	"@(#)dis_tables.h	1.7	06/03/02 SMI"
-#endif
-
-/*
- * Constants and prototypes for the IA32 disassembler backend.  See dis_tables.c
- * for usage information and documentation.
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/types.h>
-#include <sys/param.h>
-
-/*
- * values for cpu mode
- */
-#define	SIZE16	1
-#define	SIZE32	2
-#define	SIZE64	3
-
-#define	OPLEN	256
-#define	PFIXLEN	  8
-#define	NCPS	12	/* number of chars per symbol	*/
-
-/*
- * data structures that must be provided to dtrace_dis86()
- */
-typedef struct d86opnd {
-	char		d86_opnd[OPLEN];	/* symbolic rep of operand */
-	char		d86_prefix[PFIXLEN];	/* any prefix string or "" */
-	uint_t		d86_mode;		/* mode for immediate */
-	uint_t		d86_value_size;		/* size in bytes of d86_value */
-	uint64_t	d86_value;		/* immediate value of opnd */
-} d86opnd_t;
-
-typedef struct dis86 {
-	uint_t		d86_mode;
-	uint_t		d86_error;
-	uint_t		d86_len;		/* instruction length */
-	int		d86_rmindex;		/* index of modrm byte or -1 */
-	uint_t		d86_memsize;		/* size of memory referenced */
-	char		d86_bytes[16];		/* bytes of instruction */
-	char		d86_mneu[OPLEN];
-	uint_t		d86_numopnds;
-	uint_t		d86_rex_prefix;		/* value of REX prefix if !0 */
-	char		*d86_seg_prefix;	/* segment prefix, if any */
-	uint_t		d86_opnd_size;
-	uint_t		d86_addr_size;
-	uint_t		d86_got_modrm;
-	struct d86opnd	d86_opnd[3];		/* up to 3 operands */
-	int		(*d86_check_func)(void *);
-	int		(*d86_get_byte)(void *);
-#ifdef DIS_TEXT
-	int		(*d86_sym_lookup)(void *, uint64_t, char *, size_t);
-	int		(*d86_sprintf_func)(char *, size_t, const char *, ...);
-	int		d86_flags;
-	uint_t		d86_imm_bytes;
-#endif
-	void		*d86_data;
-} dis86_t;
-
-extern int dtrace_disx86(dis86_t *x, uint_t cpu_mode);
-
-#define	DIS_OP_OCTAL	0x1	/* Print all numbers in octal */
-
-#ifdef DIS_TEXT
-extern void dtrace_disx86_str(dis86_t *x, uint_t cpu_mode, uintptr_t pc,
-    char *buf, size_t len);
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _DIS_TABLES_H */
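
The consumer contract for dis86_t is small: point d86_data at a byte
cursor, supply d86_get_byte (and optionally d86_check_func), call
dtrace_disx86(), then read d86_len or d86_memsize.  A minimal sketch of
such a caller, mirroring the dtrace_dis_isize() consumer that this patch
removes from instr_size.c below; the helper names here are hypothetical.

#include <sys/types.h>
#include <dis_tables.h>		/* as instr_size.c did */

/* Fetch the next byte from the instruction stream; p is a uchar_t ** cursor. */
static int
get_byte(void *p)
{
	uchar_t **instr = p;

	return (*(*instr)++);
}

/* Length in bytes of the 64-bit-mode instruction at 'instr', or -1. */
static int
insn_length(uchar_t *instr)
{
	dis86_t x;

	x.d86_data = (void **)&instr;
	x.d86_get_byte = get_byte;
	x.d86_check_func = NULL;

	if (dtrace_disx86(&x, SIZE64) != 0)
		return (-1);		/* bad opcode */
	return (x.d86_len);
}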
Index: src/external/cddl/osnet/dev/dtrace/amd64/dtrace_asm.S
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/amd64/dtrace_asm.S,v
retrieving revision 1.7
diff -u -p -r1.7 dtrace_asm.S
--- src/external/cddl/osnet/dev/dtrace/amd64/dtrace_asm.S	27 Feb 2017 06:46:59 -0000	1.7
+++ src/external/cddl/osnet/dev/dtrace/amd64/dtrace_asm.S	8 May 2017 22:42:29 -0000
@@ -22,7 +22,7 @@
  *
  * Portions Copyright 2008 John Birrell <jb@freebsd.org>
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/amd64/dtrace_asm.S,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/amd64/dtrace_asm.S 298171 2016-04-17 23:08:47Z markj $
  *
  */
 /*
Index: src/external/cddl/osnet/dev/dtrace/amd64/dtrace_isa.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/amd64/dtrace_isa.c,v
retrieving revision 1.6
diff -u -p -r1.6 dtrace_isa.c
--- src/external/cddl/osnet/dev/dtrace/amd64/dtrace_isa.c	27 Feb 2017 06:46:59 -0000	1.6
+++ src/external/cddl/osnet/dev/dtrace/amd64/dtrace_isa.c	20 Apr 2017 11:07:10 -0000
@@ -21,7 +21,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/amd64/dtrace_isa.c,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/amd64/dtrace_isa.c 298171 2016-04-17 23:08:47Z markj $
  */
 /*
  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
@@ -38,6 +38,8 @@
 
 #include <machine/vmparam.h>
 
+#include "regset.h"
+
 uint8_t dtrace_fuword8_nocheck(void *);
 uint16_t dtrace_fuword16_nocheck(void *);
 uint32_t dtrace_fuword32_nocheck(void *);
@@ -54,6 +56,8 @@ struct amd64_frame {
 
 typedef unsigned long vm_offset_t;
 
+int	dtrace_ustackdepth_max = 2048;
+
 void
 dtrace_getpcstack(pc_t *pcstack, int pcstack_limit, int aframes,
     uint32_t *intrpc)
@@ -107,14 +111,25 @@ static int
 dtrace_getustack_common(uint64_t *pcstack, int pcstack_limit, uintptr_t pc,
     uintptr_t sp)
 {
+	uintptr_t oldsp;
 	volatile uint16_t *flags =
 	    (volatile uint16_t *)&cpu_core[cpu_number()].cpuc_dtrace_flags;
 	int ret = 0;
 
 	ASSERT(pcstack == NULL || pcstack_limit > 0);
+	ASSERT(dtrace_ustackdepth_max > 0);
 
 	while (pc != 0) {
-		ret++;
+		/*
+		 * We limit the number of times we can go around this
+		 * loop to account for a circular stack.
+		 */
+		if (ret++ >= dtrace_ustackdepth_max) {
+			*flags |= CPU_DTRACE_BADSTACK;
+			cpu_core[cpu_number()].cpuc_dtrace_illval = sp;
+			break;
+		}
+
 		if (pcstack != NULL) {
 			*pcstack++ = (uint64_t)pc;
 			pcstack_limit--;
@@ -125,10 +140,18 @@ dtrace_getustack_common(uint64_t *pcstac
 		if (sp == 0)
 			break;
 
+		oldsp = sp;
+
 		pc = dtrace_fuword64((void *)(sp +
 			offsetof(struct amd64_frame, f_retaddr)));
 		sp = dtrace_fuword64((void *)sp);
 
+		if (sp == oldsp) {
+			*flags |= CPU_DTRACE_BADSTACK;
+			cpu_core[cpu_number()].cpuc_dtrace_illval = sp;
+			break;
+		}
+
 		/*
 		 * This is totally bogus:  if we faulted, we're going to clear
 		 * the fault and break.  This is to deal with the apparently
@@ -467,11 +490,10 @@ dtrace_getstackdepth(int aframes)
 		return depth - aframes;
 }
 
-#ifdef notyet
 ulong_t
-dtrace_getreg(struct regs *rp, uint_t reg)
+dtrace_getreg(struct trapframe *rp, uint_t reg)
 {
-#if defined(__amd64)
+	/* CHUQ skipped */
 	int regmap[] = {
 		REG_GS,		/* GS */
 		REG_FS,		/* FS */
@@ -507,72 +529,62 @@ dtrace_getreg(struct regs *rp, uint_t re
 
 	switch (reg) {
 	case REG_RDI:
-		return (rp->r_rdi);
+		return (rp->tf_rdi);
 	case REG_RSI:
-		return (rp->r_rsi);
+		return (rp->tf_rsi);
 	case REG_RDX:
-		return (rp->r_rdx);
+		return (rp->tf_rdx);
 	case REG_RCX:
-		return (rp->r_rcx);
+		return (rp->tf_rcx);
 	case REG_R8:
-		return (rp->r_r8);
+		return (rp->tf_r8);
 	case REG_R9:
-		return (rp->r_r9);
+		return (rp->tf_r9);
 	case REG_RAX:
-		return (rp->r_rax);
+		return (rp->tf_rax);
 	case REG_RBX:
-		return (rp->r_rbx);
+		return (rp->tf_rbx);
 	case REG_RBP:
-		return (rp->r_rbp);
+		return (rp->tf_rbp);
 	case REG_R10:
-		return (rp->r_r10);
+		return (rp->tf_r10);
 	case REG_R11:
-		return (rp->r_r11);
+		return (rp->tf_r11);
 	case REG_R12:
-		return (rp->r_r12);
+		return (rp->tf_r12);
 	case REG_R13:
-		return (rp->r_r13);
+		return (rp->tf_r13);
 	case REG_R14:
-		return (rp->r_r14);
+		return (rp->tf_r14);
 	case REG_R15:
-		return (rp->r_r15);
+		return (rp->tf_r15);
 	case REG_DS:
-		return (rp->r_ds);
+		return (rp->tf_ds);
 	case REG_ES:
-		return (rp->r_es);
+		return (rp->tf_es);
 	case REG_FS:
-		return (rp->r_fs);
+		return (rp->tf_fs);
 	case REG_GS:
-		return (rp->r_gs);
+		return (rp->tf_gs);
 	case REG_TRAPNO:
-		return (rp->r_trapno);
+		return (rp->tf_trapno);
 	case REG_ERR:
-		return (rp->r_err);
+		return (rp->tf_err);
 	case REG_RIP:
-		return (rp->r_rip);
+		return (rp->tf_rip);
 	case REG_CS:
-		return (rp->r_cs);
+		return (rp->tf_cs);
 	case REG_SS:
-		return (rp->r_ss);
+		return (rp->tf_ss);
 	case REG_RFL:
-		return (rp->r_rfl);
+		return (rp->tf_rflags);
 	case REG_RSP:
-		return (rp->r_rsp);
+		return (rp->tf_rsp);
 	default:
 		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
 		return (0);
 	}
-
-#else
-	if (reg > SS) {
-		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
-		return (0);
-	}
-
-	return ((&rp->r_gs)[reg]);
-#endif
 }
-#endif
 
 static int
 dtrace_copycheck(uintptr_t uaddr, uintptr_t kaddr, size_t size)
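
The two guards added to dtrace_getustack_common() above, a hard cap of
dtrace_ustackdepth_max iterations and a check that the saved frame
pointer actually advanced, are what keep a corrupt or circular user
stack from spinning forever in probe context.  A minimal sketch of the
bounded frame-pointer walk, with hypothetical structure and function names:

struct frame {
	struct frame	*f_next;	/* saved frame pointer */
	void		*f_retaddr;	/* saved return address */
};

static int
walk_frames(const struct frame *fp, void **pcs, int limit, int maxdepth)
{
	int depth = 0;

	while (fp != NULL && depth < limit) {
		if (depth >= maxdepth)		/* cap for circular stacks */
			break;
		pcs[depth++] = fp->f_retaddr;
		if (fp->f_next == fp)		/* frame chain did not advance */
			break;
		fp = fp->f_next;
	}
	return (depth);
}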
Index: src/external/cddl/osnet/dev/dtrace/amd64/dtrace_subr.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/amd64/dtrace_subr.c,v
retrieving revision 1.8
diff -u -p -r1.8 dtrace_subr.c
--- src/external/cddl/osnet/dev/dtrace/amd64/dtrace_subr.c	27 Feb 2017 06:46:59 -0000	1.8
+++ src/external/cddl/osnet/dev/dtrace/amd64/dtrace_subr.c	20 Apr 2017 11:58:41 -0000
@@ -21,7 +21,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/amd64/dtrace_subr.c,v 1.3.2.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/amd64/dtrace_subr.c 313850 2017-02-17 03:27:20Z markj $
  *
  */
 /*
@@ -29,6 +29,11 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/proc.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/types.h>
@@ -38,18 +43,13 @@
 #include <sys/xcall.h>
 #include <sys/cpu.h>
 #include <sys/cpuvar.h>
-//#include <sys/smp.h>
 #include <sys/dtrace_impl.h>
 #include <sys/dtrace_bsd.h>
 #include <machine/frame.h>
 #include <machine/cpu_counter.h>
-#include <uvm/uvm_pglist.h>
-#include <uvm/uvm_prot.h>
-#include <uvm/uvm_pmap.h>
+#include <machine/cpufunc.h>
 
 extern uintptr_t 	kernelbase;
-extern uintptr_t 	dtrace_in_probe_addr;
-extern int		dtrace_in_probe;
 
 int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t);
 
@@ -61,6 +61,7 @@ typedef struct dtrace_invop_hdlr {
 dtrace_invop_hdlr_t *dtrace_invop_hdlr;
 
 void dtrace_gethrtime_init(void *);
+void dtrace_getnanotime(struct timespec *);
 
 int
 dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax)
@@ -160,122 +161,6 @@ dtrace_sync(void)
 }
 
 #ifdef notyet
-int (*dtrace_fasttrap_probe_ptr)(struct regs *);
-int (*dtrace_pid_probe_ptr)(struct regs *);
-int (*dtrace_return_probe_ptr)(struct regs *);
-
-void
-dtrace_user_probe(struct regs *rp, caddr_t addr, processorid_t cpuid)
-{
-	krwlock_t *rwp;
-	proc_t *p = curproc;
-	extern void trap(struct regs *, caddr_t, processorid_t);
-
-	if (USERMODE(rp->r_cs) || (rp->r_ps & PS_VM)) {
-		if (curthread->t_cred != p->p_cred) {
-			cred_t *oldcred = curthread->t_cred;
-			/*
-			 * DTrace accesses t_cred in probe context.  t_cred
-			 * must always be either NULL, or point to a valid,
-			 * allocated cred structure.
-			 */
-			curthread->t_cred = crgetcred();
-			crfree(oldcred);
-		}
-	}
-
-	if (rp->r_trapno == T_DTRACE_RET) {
-		uint8_t step = curthread->t_dtrace_step;
-		uint8_t ret = curthread->t_dtrace_ret;
-		uintptr_t npc = curthread->t_dtrace_npc;
-
-		if (curthread->t_dtrace_ast) {
-			aston(curthread);
-			curthread->t_sig_check = 1;
-		}
-
-		/*
-		 * Clear all user tracing flags.
-		 */
-		curthread->t_dtrace_ft = 0;
-
-		/*
-		 * If we weren't expecting to take a return probe trap, kill
-		 * the process as though it had just executed an unassigned
-		 * trap instruction.
-		 */
-		if (step == 0) {
-			tsignal(curthread, SIGILL);
-			return;
-		}
-
-		/*
-		 * If we hit this trap unrelated to a return probe, we're
-		 * just here to reset the AST flag since we deferred a signal
-		 * until after we logically single-stepped the instruction we
-		 * copied out.
-		 */
-		if (ret == 0) {
-			rp->r_pc = npc;
-			return;
-		}
-
-		/*
-		 * We need to wait until after we've called the
-		 * dtrace_return_probe_ptr function pointer to set %pc.
-		 */
-		rwp = &CPU->cpu_ft_lock;
-		rw_enter(rwp, RW_READER);
-		if (dtrace_return_probe_ptr != NULL)
-			(void) (*dtrace_return_probe_ptr)(rp);
-		rw_exit(rwp);
-		rp->r_pc = npc;
-
-	} else if (rp->r_trapno == T_DTRACE_PROBE) {
-		rwp = &CPU->cpu_ft_lock;
-		rw_enter(rwp, RW_READER);
-		if (dtrace_fasttrap_probe_ptr != NULL)
-			(void) (*dtrace_fasttrap_probe_ptr)(rp);
-		rw_exit(rwp);
-
-	} else if (rp->r_trapno == T_BPTFLT) {
-		uint8_t instr;
-		rwp = &CPU->cpu_ft_lock;
-
-		/*
-		 * The DTrace fasttrap provider uses the breakpoint trap
-		 * (int 3). We let DTrace take the first crack at handling
-		 * this trap; if it's not a probe that DTrace knows about,
-		 * we call into the trap() routine to handle it like a
-		 * breakpoint placed by a conventional debugger.
-		 */
-		rw_enter(rwp, RW_READER);
-		if (dtrace_pid_probe_ptr != NULL &&
-		    (*dtrace_pid_probe_ptr)(rp) == 0) {
-			rw_exit(rwp);
-			return;
-		}
-		rw_exit(rwp);
-
-		/*
-		 * If the instruction that caused the breakpoint trap doesn't
-		 * look like an int 3 anymore, it may be that this tracepoint
-		 * was removed just after the user thread executed it. In
-		 * that case, return to user land to retry the instruction.
-		 */
-		if (fuword8((void *)(rp->r_pc - 1), &instr) == 0 &&
-		    instr != FASTTRAP_INSTR) {
-			rp->r_pc--;
-			return;
-		}
-
-		trap(rp, addr, cpuid);
-
-	} else {
-		trap(rp, addr, cpuid);
-	}
-}
-
 void
 dtrace_safe_synchronous_signal(void)
 {
@@ -321,14 +206,15 @@ dtrace_safe_defer_signal(void)
 	}
 
 	/*
-	 * If we've executed the original instruction, but haven't performed
-	 * the jmp back to t->t_dtrace_npc or the clean up of any registers
-	 * used to emulate %rip-relative instructions in 64-bit mode, do that
-	 * here and take the signal right away. We detect this condition by
-	 * seeing if the program counter is the range [scrpc + isz, astpc).
+	 * If we have executed the original instruction, but we have performed
+	 * neither the jmp back to t->t_dtrace_npc nor the clean up of any
+	 * registers used to emulate %rip-relative instructions in 64-bit mode,
+	 * we'll save ourselves some effort by doing that here and taking the
+	 * signal right away.  We detect this condition by seeing if the program
+	 * counter is in the range [scrpc + isz, astpc).
 	 */
-	if (t->t_dtrace_astpc - rp->r_pc <
-	    t->t_dtrace_astpc - t->t_dtrace_scrpc - isz) {
+	if (rp->r_pc >= t->t_dtrace_scrpc + isz &&
+	    rp->r_pc < t->t_dtrace_astpc) {
 #ifdef __amd64
 		/*
 		 * If there is a scratch register and we're on the
@@ -451,7 +337,6 @@ dtrace_gethrtime_init(void *arg)
 	 * another 32-bit integer without overflowing 64-bit.
 	 * Thus minimum supported TSC frequency is 62.5MHz.
 	 */
-	//KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)), ("TSC frequency is too low"));
 	KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)));
 
 	/*
@@ -472,27 +357,6 @@ dtrace_gethrtime_init(void *arg)
 		/* use skew relative to cpu 0 */
 		tsc_skew[cpu_index(cinfo)] = cinfo->ci_data.cpu_cc_skew;
 	}
-
-	/* Already handled in x86/tsc.c for ci_data.cpu_cc_skew */
-#if 0
-	for (i = 0; i <= mp_maxid; i++) {
-		if (i == curcpu)
-			continue;
-
-		if (pcpu_find(i) == NULL)
-			continue;
-
-		map = 0;
-		map |= (1 << curcpu);
-		map |= (1 << i);
-
-		smp_rendezvous_cpus(map, dtrace_gethrtime_init_sync,
-		    dtrace_gethrtime_init_cpu,
-		    smp_no_rendevous_barrier, (void *)(uintptr_t) i);
-
-		tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc;
-	}
-#endif
 }
 
 /*
@@ -525,14 +389,20 @@ dtrace_gethrtime()
 uint64_t
 dtrace_gethrestime(void)
 {
-	printf("%s(%d): XXX\n",__func__,__LINE__);
-	return (0);
+	struct timespec current_time;
+
+	dtrace_getnanotime(&current_time);
+
+	return (current_time.tv_sec * 1000000000ULL + current_time.tv_nsec);
 }
 
 /* Function to handle DTrace traps during probes. See amd64/amd64/trap.c */
 int
 dtrace_trap(struct trapframe *frame, u_int type)
 {
+	bool nofault;
+	cpuid_t cpuid = cpu_number();	/* current cpu id */
+
 	/*
 	 * A trap can occur while DTrace executes a probe. Before
 	 * executing the probe, DTrace blocks re-scheduling and sets
@@ -543,19 +413,19 @@ dtrace_trap(struct trapframe *frame, u_i
 	 * Check if DTrace has enabled 'no-fault' mode:
 	 *
 	 */
-	if ((cpu_core[cpu_number()].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) {
+	nofault = (cpu_core[cpuid].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0;
+	if (nofault) {
+		KASSERTMSG((read_rflags() & PSL_I) == 0, "interrupts enabled");
+
 		/*
 		 * There are only a couple of trap types that are expected.
 		 * All the rest will be handled in the usual way.
 		 */
 		switch (type) {
-		/* Privileged instruction fault. */
-		case T_PRIVINFLT:
-			break;
 		/* General protection fault. */
 		case T_PROTFLT:
 			/* Flag an illegal operation. */
-			cpu_core[cpu_number()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
+			cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
 
 			/*
 			 * Offset the instruction pointer to the instruction
@@ -566,8 +436,8 @@ dtrace_trap(struct trapframe *frame, u_i
 		/* Page fault. */
 		case T_PAGEFLT:
 			/* Flag a bad address. */
-			cpu_core[cpu_number()].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR;
-			cpu_core[cpu_number()].cpuc_dtrace_illval = rcr2();
+			cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR;
+			cpu_core[cpuid].cpuc_dtrace_illval = rcr2();
 
 			/*
 			 * Offset the instruction pointer to the instruction
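
The KASSERT kept in dtrace_gethrtime_init() above encodes the 62.5MHz
floor its comment mentions: NANOSEC >> (32 - SCALE_SHIFT) must come out
to 62,500,000, and since 10^9 / 62,500,000 = 16 = 2^4, the shift in use
is 28.  A worked check of that arithmetic; SCALE_SHIFT's value is
inferred from the comment, as its definition is outside this hunk.

#include <assert.h>

#define	NANOSEC		1000000000ULL
#define	SCALE_SHIFT	28		/* inferred from the 62.5MHz comment */

int
main(void)
{
	/* 10^9 >> 4 == 62,500,000: the minimum supported TSC frequency. */
	assert((NANOSEC >> (32 - SCALE_SHIFT)) == 62500000ULL);
	return (0);
}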
Index: src/external/cddl/osnet/dev/dtrace/amd64/instr_size.c
===================================================================
RCS file: src/external/cddl/osnet/dev/dtrace/amd64/instr_size.c
diff -N src/external/cddl/osnet/dev/dtrace/amd64/instr_size.c
--- src/external/cddl/osnet/dev/dtrace/amd64/instr_size.c	21 Feb 2010 01:46:33 -0000	1.2
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,134 +0,0 @@
-/*	$NetBSD: instr_size.c,v 1.2 2010/02/21 01:46:33 darran Exp $	*/
-
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- *
- * $FreeBSD: src/sys/cddl/dev/dtrace/amd64/instr_size.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-/*	Copyright (c) 1988 AT&T	*/
-/*	  All Rights Reserved	*/
-
-
-#if defined(sun)
-#pragma ident	"@(#)instr_size.c	1.14	05/07/08 SMI"
-#endif
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/proc.h>
-#if defined(sun)
-#include <sys/cmn_err.h>
-#include <sys/archsystm.h>
-#include <sys/copyops.h>
-#include <vm/seg_enum.h>
-#include <sys/privregs.h>
-#else
-typedef	u_int			model_t;
-#define	DATAMODEL_NATIVE	0
-int dtrace_instr_size(uchar_t *);
-#endif
-
-#include <dis_tables.h>
-
-/*
- * This subsystem (with the minor exception of the instr_size() function) is
- * is called from DTrace probe context.  This imposes several requirements on
- * the implementation:
- *
- * 1. External subsystems and functions may not be referenced.  The one current
- *    exception is for cmn_err, but only to signal the detection of table
- *    errors.  Assuming the tables are correct, no combination of input is to
- *    trigger a cmn_err call.
- *
- * 2. These functions can't be allowed to be traced.  To prevent this,
- *    all functions in the probe path (everything except instr_size()) must
- *    have names that begin with "dtrace_".
- */
-
-typedef enum dis_isize {
-	DIS_ISIZE_INSTR,
-	DIS_ISIZE_OPERAND
-} dis_isize_t;
-
-
-/*
- * get a byte from instruction stream
- */
-static int
-dtrace_dis_get_byte(void *p)
-{
-	int ret;
-	uchar_t **instr = p;
-
-	ret = **instr;
-	*instr += 1;
-
-	return (ret);
-}
-
-/*
- * Returns either the size of a given instruction, in bytes, or the size of that
- * instruction's memory access (if any), depending on the value of `which'.
- * If a programming error in the tables is detected, the system will panic to
- * ease diagnosis.  Invalid instructions will not be flagged.  They will appear
- * to have an instruction size between 1 and the actual size, and will be
- * reported as having no memory impact.
- */
-/* ARGSUSED2 */
-static int
-dtrace_dis_isize(uchar_t *instr, dis_isize_t which, model_t model, int *rmindex)
-{
-	int sz;
-	dis86_t	x;
-	uint_t mode = SIZE64;
-
-#if defined(sun)
-	mode = (model == DATAMODEL_LP64) ? SIZE64 : SIZE32;
-#endif
-
-	x.d86_data = (void **)&instr;
-	x.d86_get_byte = dtrace_dis_get_byte;
-	x.d86_check_func = NULL;
-
-	if (dtrace_disx86(&x, mode) != 0)
-		return (-1);
-
-	if (which == DIS_ISIZE_INSTR)
-		sz = x.d86_len;		/* length of the instruction */
-	else
-		sz = x.d86_memsize;	/* length of memory operand */
-
-	if (rmindex != NULL)
-		*rmindex = x.d86_rmindex;
-	return (sz);
-}
-
-int
-dtrace_instr_size(uchar_t *instr)
-{
-	return (dtrace_dis_isize(instr, DIS_ISIZE_INSTR, DATAMODEL_NATIVE,
-	    NULL));
-}
Index: src/external/cddl/osnet/dev/dtrace/arm/dtrace_asm.S
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/arm/dtrace_asm.S,v
retrieving revision 1.6
diff -u -p -r1.6 dtrace_asm.S
--- src/external/cddl/osnet/dev/dtrace/arm/dtrace_asm.S	23 Jun 2016 04:35:35 -0000	1.6
+++ src/external/cddl/osnet/dev/dtrace/arm/dtrace_asm.S	25 Apr 2017 02:37:35 -0000
@@ -19,7 +19,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/cddl/dev/dtrace/arm/dtrace_asm.S 308427 2016-11-07 20:02:18Z gonzo $
  */
 /*
  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
@@ -28,7 +28,6 @@
 
 #define _ASM
 #define _LOCORE
-#define LOCORE
 
 #include <sys/cpuvar_defs.h>
 #include <sys/dtrace.h>
@@ -36,6 +35,16 @@
 #include <machine/asm.h>
 #include <arm/armreg.h>
 
+#define	PSR_I	I32_bit
+#define	PSR_F	F32_bit
+
+#ifdef	__ARM_BIG_ENDIAN
+#define	__BIG_ENDIAN 1
+#endif
+
+#define EENTRY(x)	ENTRY_NP(x)
+#define EEND(x)		/* nothing */
+
 /*
 void dtrace_membar_producer(void)
 */
@@ -56,7 +65,7 @@ dtrace_icookie_t dtrace_interrupt_disabl
 ENTRY(dtrace_interrupt_disable)
 	mrs	r0, cpsr
 	mov	r1, r0
-	orr	r1, r1, #(I32_bit|F32_bit)
+	orr	r1, r1, #(PSR_I | PSR_F)
 	msr	cpsr_c, r1
 	RET
 END(dtrace_interrupt_disable)
@@ -65,44 +74,15 @@ END(dtrace_interrupt_disable)
 void dtrace_interrupt_enable(dtrace_icookie_t cookie)
 */
 ENTRY(dtrace_interrupt_enable)
-	and	r0, r0, #(I32_bit|F32_bit)
+	and	r0, r0, #(PSR_I | PSR_F)
 	mrs	r1, cpsr
-	bic	r1, r1, #(I32_bit|F32_bit)
+	bic	r1, r1, #(PSR_I | PSR_F)
 	orr	r1, r1, r0
 	msr	cpsr_c, r1
 	RET
 END(dtrace_interrupt_enable)
 
 /*
-uint32_t dtrace_cas32(uint32_t *target, uint32_t cmp, uint32_t new)
-XXX: just disable interrupts for now, add proper implementation for
-ARMv6/ARMv7 later
-*/
-ENTRY_NP(dtrace_casptr)
-ENTRY(dtrace_cas32)
-	stmfd	sp!, {r4, r5}
-
-	mrs	r3, cpsr
-	mov	r4, r3
-	orr	r4, r4, #(I32_bit|F32_bit)
-	msr	cpsr_c, r4
-
-	ldr	r5, [r0]
-	cmp	r5, r1
-	movne	r0, r5
-	bne	2f
-
-	str	r2, [r0]
-	mov	r0, r5
-
-2:
-	msr	cpsr_c, r3
-	ldmfd	sp!, {r4, r5}
-	RET
-END(dtrace_cas32)
-END(dtrace_casptr)
-
-/*
 uint8_t
 dtrace_fuword8_nocheck(void *addr)
 */
@@ -135,20 +115,21 @@ END(dtrace_fuword32_nocheck)
 /*
 uint64_t
 dtrace_fuword64_nocheck(void *addr)
-XXX: add byteorder check
 */
 ENTRY(dtrace_fuword64_nocheck)
 	ldm	r0, {r2, r3}
 
 	mov	r0, r2
 	mov	r1, r3
-#if 0
-/* little endian */
-	mov	r0, r2
-	mov	r1, r3
+#if defined(__BIG_ENDIAN)
 /* big endian */
 	mov	r0, r3
 	mov	r1, r2
+#else
+/* little endian */
+	mov	r0, r2
+	mov	r1, r3
 #endif
 	RET
 END(dtrace_fuword64_nocheck)
@@ -159,21 +140,20 @@ dtrace_copy(uintptr_t uaddr, uintptr_t k
 */
 ENTRY(dtrace_copy)
 	stmfd   sp!, {r4-r5}			/* stack is 8 byte aligned */
-	teq     r2, #0x00000000
-	mov     r5, #0x00000000
-	beq     2f
-
-1:	ldrb    r4, [r0], #0x0001
-	add     r5, r5, #0x00000001
-	strb    r4, [r1], #0x0001
-	teqne   r5, r2
-	bne     1b
+	teq	r2, #0x00000000
+	mov	r5, #0x00000000
+	beq	2f
+
+1:	ldrb	r4, [r0], #0x0001
+	add	r5, r5, #0x00000001
+	strb	r4, [r1], #0x0001
+	teqne	r5, r2
+	bne	1b
 
-2:	ldmfd   sp!, {r4-r5}			/* stack is 8 byte aligned */
+2:	ldmfd	sp!, {r4-r5}			/* stack is 8 byte aligned */
 	RET
 END(dtrace_copy)
 
-
 /*
 void
 dtrace_copystr(uintptr_t uaddr, uintptr_t kaddr, size_t size,
@@ -181,48 +161,22 @@ dtrace_copystr(uintptr_t uaddr, uintptr_
 XXX: Check for flags?
 */
 ENTRY(dtrace_copystr)
-	stmfd   sp!, {r4-r5}			/* stack is 8 byte aligned */
-	teq     r2, #0x00000000
-	mov     r5, #0x00000000
-	beq     2f
-
-1:	ldrb    r4, [r0], #0x0001
-	add     r5, r5, #0x00000001
-	teq     r4, #0x00000000
-	strb    r4, [r1], #0x0001
-	teqne   r5, r2
-	bne     1b
+	stmfd	sp!, {r4-r5}			/* stack is 8 byte aligned */
+	teq	r2, #0x00000000
+	mov	r5, #0x00000000
+	beq	2f
+
+1:	ldrb	r4, [r0], #0x0001
+	add	r5, r5, #0x00000001
+	teq	r4, #0x00000000
+	strb	r4, [r1], #0x0001
+	teqne	r5, r2
+	bne	1b
 
-2:	ldmfd   sp!, {r4-r5}			/* stack is 8 byte aligned */
+2:	ldmfd	sp!, {r4-r5}			/* stack is 8 byte aligned */
 	RET
 END(dtrace_copystr)
 
-
-/*
-void dtrace_invop_init(void)
-*/
-ENTRY(dtrace_invop_init)
-	ldr	r1, .Ldtrace_invop
-	ldr	r2, .Ldtrace_invop_jump_addr
-	str	r1, [r2]
-	RET
-	.align	0
-.Ldtrace_invop:
-	.word	dtrace_invop
-.Ldtrace_invop_jump_addr:
-	.word	dtrace_invop_jump_addr
-END(dtrace_invop_init)
-
-/*
-void dtrace_invop_uninit(void)
-*/
-ENTRY(dtrace_invop_uninit)
-	mov	r0, #0
-	ldr	r1, .Ldtrace_invop_jump_addr
-	str	r0, [r1]
-	RET
-END(dtrace_invop_uninit)
-
 /*
 uintptr_t
 dtrace_caller(int aframes)
@@ -231,3 +185,53 @@ ENTRY(dtrace_caller)
 	mov	r0, #-1
 	RET
 END(dtrace_caller)
+
+/*
+uint32_t
+dtrace_cas32(uint32_t *target, uint32_t cmp, uint32_t new)
+
+void *
+dtrace_casptr(volatile void *target, volatile void *cmp, volatile void *new)
+*/
+EENTRY(dtrace_casptr)
+ENTRY(dtrace_cas32)
+#if __ARM_ARCH >= 6
+
+1:	ldrex	r3, [r0]	/* Load target */
+	cmp	r3, r1		/* Check if *target == cmp */
+	bne	2f		/* No, return */
+	strex	ip, r2, [r0]	/* Store new to target */
+	cmp	ip, #0		/* Did the store succeed? */
+	bne	1b		/* No, try again */
+2:	mov	r0, r3		/* Return the value loaded from target */
+	RET
+
+#else
+
+	/*
+	 * We don't support MP on CPUs older than v6, so just disable interrupts
+	 * and use non-atomic instructions.
+	 */
+
+	stmfd	sp!, {r4, r5}
+
+	mrs	r3, cpsr
+	mov	r4, r3
+	orr	r4, r4, #(PSR_I | PSR_F)
+	msr	cpsr_c, r4
+
+	ldr	r5, [r0]
+	cmp	r5, r1
+	movne	r0, r5
+	bne	2f
+
+	str	r2, [r0]
+	mov	r0, r5
+
+2:
+	msr	cpsr_c, r3
+	ldmfd	sp!, {r4, r5}
+	RET
+#endif
+END(dtrace_cas32)
+EEND(dtrace_casptr)
Index: src/external/cddl/osnet/dev/dtrace/arm/dtrace_isa.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/arm/dtrace_isa.c,v
retrieving revision 1.5
diff -u -p -r1.5 dtrace_isa.c
--- src/external/cddl/osnet/dev/dtrace/arm/dtrace_isa.c	2 Oct 2015 22:15:18 -0000	1.5
+++ src/external/cddl/osnet/dev/dtrace/arm/dtrace_isa.c	5 Jul 2017 20:28:13 -0000
@@ -19,7 +19,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/cddl/dev/dtrace/arm/dtrace_isa.c 295882 2016-02-22 09:08:04Z skra $
  */
 /*
  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
@@ -100,8 +100,6 @@ dtrace_getpcstack(pc_t *pcstack, int pcs
 		 * that generated the stack frame.  We hope for the best.
 		 */
 		scp = frame[FR_SCP];
-		printf("--> %08x\n", (uint32_t)scp);
-
 		if (aframes > 0) {
 			aframes--;
 			if ((aframes == 0) && (caller != 0)) {
@@ -109,7 +107,6 @@ dtrace_getpcstack(pc_t *pcstack, int pcs
 			}
 		}
 		else {
-			printf("++ --> %08x\n", (uint32_t)scp);
 			pcstack[depth++] = scp;
 		}
 
@@ -154,13 +151,13 @@ dtrace_getpcstack(pc_t *pcstack, int pcs
 void
 dtrace_getupcstack(uint64_t *pcstack, int pcstack_limit)
 {
-	printf("unimplemented\n");
+	printf("IMPLEMENT ME: %s\n", __func__);
 }
 
 int
 dtrace_getustackdepth(void)
 {
-	printf("unimplemented\n");
+	printf("IMPLEMENT ME: %s\n", __func__);
 	return (0);
 }
 
@@ -174,8 +171,9 @@ dtrace_getufpstack(uint64_t *pcstack, ui
 uint64_t
 dtrace_getarg(int arg, int aframes)
 {
-	printf("unimplemented\n");
+/*	struct arm_frame *fp = (struct arm_frame *)dtrace_getfp();*/
 
+	printf("IMPLEMENT ME: %s\n", __func__);
 	return (0);
 }
 
@@ -227,6 +225,7 @@ dtrace_getstackdepth(int aframes)
 ulong_t
 dtrace_getreg(struct trapframe *rp, uint_t reg)
 {
+	printf("IMPLEMENT ME: %s\n", __func__);
 
 	return (0);
 }
Index: src/external/cddl/osnet/dev/dtrace/arm/dtrace_subr.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/arm/dtrace_subr.c,v
retrieving revision 1.3
diff -u -p -r1.3 dtrace_subr.c
--- src/external/cddl/osnet/dev/dtrace/arm/dtrace_subr.c	27 Feb 2017 06:47:00 -0000	1.3
+++ src/external/cddl/osnet/dev/dtrace/arm/dtrace_subr.c	10 May 2017 10:10:04 -0000
@@ -21,7 +21,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/cddl/dev/dtrace/arm/dtrace_subr.c 308457 2016-11-08 23:59:41Z bdrewery $
  *
  */
 /*
@@ -47,12 +47,26 @@
 #include <uvm/uvm_prot.h>
 #include <uvm/uvm_pmap.h>
 
+#define FAULT_ALIGN	FAULT_ALIGN_0
 extern uintptr_t 	kernelbase;
 extern uintptr_t 	dtrace_in_probe_addr;
 extern int		dtrace_in_probe;
+
+void dtrace_gethrtime_init(void *arg);
+
+#define	DELAYBRANCH(x)	((int)(x) < 0)
+
+#define	BIT_PC		15
+#define	BIT_LR		14
+#define	BIT_SP		13
+
 extern dtrace_id_t	dtrace_probeid_error;
+extern int (*dtrace_invop_jump_addr)(struct trapframe *);
+extern void dtrace_getnanotime(struct timespec *tsp);
 
 int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t);
+void dtrace_invop_init(void);
+void dtrace_invop_uninit(void);
 
 typedef struct dtrace_invop_hdlr {
 	int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t);
@@ -61,8 +75,6 @@ typedef struct dtrace_invop_hdlr {
 
 dtrace_invop_hdlr_t *dtrace_invop_hdlr;
 
-void dtrace_gethrtime_init(void *arg);
-
 int
 dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax)
 {
@@ -76,6 +88,7 @@ dtrace_invop(uintptr_t addr, struct trap
 	return (0);
 }
 
 void
 dtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t))
 {
@@ -114,6 +127,7 @@ dtrace_invop_remove(int (*func)(uintptr_
 	kmem_free(hdlr, sizeof (dtrace_invop_hdlr_t));
 }
 
+/*ARGSUSED*/
 void
 dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit))
 {
@@ -156,7 +170,7 @@ dtrace_sync_func(void)
 void
 dtrace_sync(void)
 {
-        dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL);
+	dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL);
 }
 
 /*
@@ -167,36 +181,35 @@ dtrace_sync(void)
  * Returns nanoseconds since boot.
  */
 uint64_t
-dtrace_gethrtime()
+dtrace_gethrtime(void)
 {
-	struct      timespec curtime;
+	struct	timespec curtime;
 
 	nanouptime(&curtime);
 
 	return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec);
-
 }
 
 uint64_t
 dtrace_gethrestime(void)
 {
-	struct      timespec curtime;
+	struct timespec current_time;
 
-	getnanotime(&curtime);
+	dtrace_getnanotime(&current_time);
 
-	return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec);
+	return (current_time.tv_sec * 1000000000ULL + current_time.tv_nsec);
 }
 
 /* Function to handle DTrace traps during probes. Not used on ARM yet */
 int
 dtrace_trap(struct trapframe *frame, u_int type)
 {
-	cpuid_t cpuid = cpu_number();	/* current cpu id */
+	cpuid_t curcpu_id = cpu_number();	/* current cpu id */
 
 	/*
 	 * A trap can occur while DTrace executes a probe. Before
 	 * executing the probe, DTrace blocks re-scheduling and sets
-	 * a flag in it's per-cpu flags to indicate that it doesn't
+	 * a flag in its per-cpu flags to indicate that it doesn't
 	 * want to fault. On returning from the probe, the no-fault
 	 * flag is cleared and finally re-scheduling is enabled.
 	 *
@@ -204,24 +217,23 @@ dtrace_trap(struct trapframe *frame, u_i
 	 *
 	 */
 
-	if ((cpu_core[cpuid].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) {
+	if ((cpu_core[curcpu_id].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) {
 		/*
 		 * There are only a couple of trap types that are expected.
 		 * All the rest will be handled in the usual way.
 		 */
 		switch (type) {
 		/* Page fault. */
-		case 0:
+		case FAULT_ALIGN:
 			/* Flag a bad address. */
-			cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR;
-			cpu_core[cpuid].cpuc_dtrace_illval = 0;
+			cpu_core[curcpu_id].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR;
+			cpu_core[curcpu_id].cpuc_dtrace_illval = 0;
 
 			/*
 			 * Offset the instruction pointer to the instruction
 			 * following the one causing the fault.
 			 */
-			panic("%s", __func__);
-			// frame->pc += sizeof(int);
+			frame->tf_pc += sizeof(int);
 			return (1);
 		default:
 			/* Handle all other traps in the usual way. */
@@ -248,3 +260,349 @@ dtrace_gethrtime_init(void *arg)
 {
 	/* FIXME */
 }
+
+static uint32_t
+dtrace_expand_imm(uint32_t imm12)
+{
+	uint32_t unrot = imm12 & 0xff;
+	int amount = 2 * (imm12 >> 8);
+
+	if (amount)
+		return (unrot >> amount) | (unrot << (32 - amount));
+	else
+		return unrot;
+}
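+
+/*
+ * For example, imm12 = 0x102 encodes the byte 0x02 rotated right by
+ * 2 * 0x1 = 2 bits: dtrace_expand_imm(0x102) returns
+ * (0x02 >> 2) | (0x02 << 30) = 0x80000000.
+ */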
+
+static uint32_t
+dtrace_add_with_carry(uint32_t x, uint32_t y, int carry_in,
+	int *carry_out, int *overflow)
+{
+	uint32_t result;
+	uint64_t unsigned_sum = (uint64_t)x + y + (uint32_t)carry_in;
+	int64_t signed_sum = (int64_t)(int32_t)x + (int32_t)y + (int32_t)carry_in;
+	KASSERT(carry_in == 1);
+
+	result = (uint32_t)(unsigned_sum & 0xffffffff);
+	*carry_out = ((uint64_t)result == unsigned_sum) ? 0 : 1;
+	*overflow = ((int64_t)(int32_t)result == signed_sum) ? 0 : 1;
+
+	return result;
+}
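+
+/*
+ * This follows the ARM ARM AddWithCarry() pseudo-function.  For a
+ * compare of equal values, e.g. 5 - 5 evaluated as 5 + ~5 + 1, the
+ * unsigned sum is 0x100000000, which truncates to result = 0 with
+ * carry_out = 1 (no borrow) and overflow = 0.
+ */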
+
+static void
+dtrace_invop_emulate(int invop, struct trapframe *frame)
+{
+	uint32_t op = invop;
+#if 1
+	/* nbsd encoding */
+	uint32_t code = op >> 28;
+	uint32_t data = op;
+#else
+	/* fbsd encoding */
+	uint32_t code = op & DTRACE_INVOP_MASK;
+	uint32_t data = DTRACE_INVOP_DATA(invop);
+#endif
+
+	switch (code) {
+	case DTRACE_INVOP_MOV_IP_SP:
+		/* mov ip, sp */
+		frame->tf_ip = frame->tf_svc_sp;
+		frame->tf_pc += 4;
+		break;
+	case DTRACE_INVOP_BX_LR:
+		/* bx lr */
+		frame->tf_pc = frame->tf_svc_lr;
+		break;
+	case DTRACE_INVOP_MOV_PC_LR:
+		/* mov pc, lr */
+		frame->tf_pc = frame->tf_svc_lr;
+		break;
+	case DTRACE_INVOP_LDM:
+		/* ldm sp, {..., pc} */
+		/* FALLTHRU */
+	case DTRACE_INVOP_POPM: {
+		/* ldmib sp, {..., pc} */
+		uint32_t register_list = (op & 0xffff);
+		uint32_t *sp = (uint32_t *)(intptr_t)frame->tf_svc_sp;
+		uint32_t *regs = &frame->tf_r0;
+		int i;
+
+		/* POPM */
+		if (code == DTRACE_INVOP_POPM)
+			sp++;
+
+		for (i = 0; i <= 12; i++) {
+			if (register_list & (1 << i))
+				regs[i] = *sp++;
+		}
+		if (register_list & (1 << 13))
+			frame->tf_svc_sp = *sp++;
+		if (register_list & (1 << 14))
+			frame->tf_svc_lr = *sp++;
+		frame->tf_pc = *sp;
+		break;
+	}
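+	/*
+	 * E.g. for "ldm sp, {r4, r5, pc}" above, register_list is
+	 * 0x8030: r4 = sp[0], r5 = sp[1], and the saved return
+	 * address sp[2] becomes the new pc.
+	 */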
+	case DTRACE_INVOP_LDR_IMM: {
+		/* ldr r?, [{pc,r?}, #?] */
+		uint32_t rt = (op >> 12) & 0xf;
+		uint32_t rn = (op >> 16) & 0xf;
+		uint32_t imm = op & 0xfff;
+		uint32_t *regs = &frame->tf_r0;
+		KDASSERT(rt <= 12);
+		KDASSERT(rn == 15 || rn <= 12);
+		if (rn == 15)
+			regs[rt] = *((uint32_t *)(intptr_t)(frame->tf_pc + 8 + imm));
+		else
+			regs[rt] = *((uint32_t *)(intptr_t)(regs[rn] + imm));
+		frame->tf_pc += 4;
+		break;
+	}
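+	/*
+	 * E.g. "ldr r0, [pc, #4]" at address A loads the word at
+	 * A + 12, since the pc is read as A + 8 on ARM.
+	 */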
+	case DTRACE_INVOP_MOVW: {
+		/* movw r?, #? */
+		uint32_t rd = (op >> 12) & 0xf;
+		uint32_t imm = (op & 0xfff) | ((op & 0xf0000) >> 4);
+		uint32_t *regs = &frame->tf_r0;
+		KDASSERT(rd <= 12);
+		regs[rd] = imm;
+		frame->tf_pc += 4;
+		break;
+	}
+	case DTRACE_INVOP_MOV_IMM: {
+		/* mov r?, #? */
+		uint32_t rd = (op >> 12) & 0xf;
+		uint32_t imm = dtrace_expand_imm(op & 0xfff);
+		uint32_t *regs = &frame->tf_r0;
+		KDASSERT(rd <= 12);
+		regs[rd] = imm;
+		frame->tf_pc += 4;
+		break;
+	}
+	case DTRACE_INVOP_CMP_IMM: {
+		/* cmp r?, #? */
+		uint32_t rn = (op >> 16) & 0xf;
+		uint32_t *regs = &frame->tf_r0;
+		uint32_t imm = dtrace_expand_imm(op & 0xfff);
+		uint32_t spsr = frame->tf_spsr;
+		uint32_t result;
+		int carry;
+		int overflow;
+		/*
+		 * (result, carry, overflow) = AddWithCarry(R[n], NOT(imm32), '1');
+		 * APSR.N = result<31>;
+		 * APSR.Z = IsZeroBit(result);
+		 * APSR.C = carry;
+		 * APSR.V = overflow;
+		 */
+		KDASSERT(rn <= 12);
+		result = dtrace_add_with_carry(regs[rn], ~imm, 1, &carry, &overflow);
+		if (result & 0x80000000)
+			spsr |= PSR_N_bit;
+		else
+			spsr &= ~PSR_N_bit;
+		if (result == 0)
+			spsr |= PSR_Z_bit;
+		else
+			spsr &= ~PSR_Z_bit;
+		if (carry)
+			spsr |= PSR_C_bit;
+		else
+			spsr &= ~PSR_C_bit;
+		if (overflow)
+			spsr |= PSR_V_bit;
+		else
+			spsr &= ~PSR_V_bit;
+
+#if 0
+		aprint_normal("pc=%x Rn=%x imm=%x %c%c%c%c\n", frame->tf_pc, regs[rn], imm,
+		    (spsr & PSR_N_bit) ? 'N' : 'n',
+		    (spsr & PSR_Z_bit) ? 'Z' : 'z',
+		    (spsr & PSR_C_bit) ? 'C' : 'c',
+		    (spsr & PSR_V_bit) ? 'V' : 'v');
+#endif
+		frame->tf_spsr = spsr;
+		frame->tf_pc += 4;
+		break;
+	}
+	case DTRACE_INVOP_B: {
+		/* b ??? */
+		uint32_t imm = (op & 0x00ffffff) << 2;
+		int32_t diff;
+		/* SignExtend(imm26, 32) */
+		if (imm & 0x02000000)
+			imm |= 0xfc000000;
+		diff = (int32_t)imm;
+		frame->tf_pc += 8 + diff;
+		break;
+	}
+	case DTRACE_INVOP_PUSHM: {
+		/* push {...} */
+		uint32_t register_list = (op & 0xffff);
+		uint32_t *sp = (uint32_t *)(intptr_t)frame->tf_svc_sp;
+		uint32_t *regs = &frame->tf_r0;
+		int i;
+		int count = 0;
+
+#if 0
+		if ((op & 0x0fff0fff) == 0x052d0004) {
+			/* A2: str r4, [sp, #-4]! */
+			*(sp - 1) = regs[4];
+			frame->tf_pc += 4;
+			break;
+		}
+#endif
+
+		for (i = 0; i < 16; i++) {
+			if (register_list & (1 << i))
+				count++;
+		}
+		sp -= count;
+
+		for (i = 0; i <= 12; i++) {
+			if (register_list & (1 << i))
+				*sp++ = regs[i];
+		}
+		if (register_list & (1 << 13))
+			*sp++ = frame->tf_svc_sp;
+		if (register_list & (1 << 14))
+			*sp++ = frame->tf_svc_lr;
+		if (register_list & (1 << 15))
+			*sp = frame->tf_pc + 8;
+
+		/* make sure the caches and memory are in sync */
+		cpu_dcache_wbinv_range(frame->tf_svc_sp - count * 4, count * 4);
+
+		/* In case the current page tables have been modified ... */
+		cpu_tlb_flushID();
+		cpu_cpwait();
+
+		frame->tf_svc_sp -= count * 4;
+		frame->tf_pc += 4;
+
+		break;
+	}
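+	/*
+	 * E.g. "push {r4, lr}" has register_list = 0x4010, so count
+	 * is 2: r4 and lr are stored at sp[-2] and sp[-1] and the
+	 * stack pointer drops by 8.
+	 */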
+	default:
+		KDASSERTMSG(0, "invop 0x%08x code %u tf %p", invop, code, frame);
+	}
+}
+
+static int
+dtrace_invop_start(struct trapframe *frame)
+{
+#if 0
+	register_t *r0, *sp;
+	int data, invop, reg, update_sp;
+#endif
+	int invop;
+
+	invop = dtrace_invop(frame->tf_pc, frame, frame->tf_r0);
+
+	dtrace_invop_emulate(invop, frame);
+
+#if 0
+	switch (invop & DTRACE_INVOP_MASK) {
+	case DTRACE_INVOP_PUSHM:
+		sp = (register_t *)frame->tf_svc_sp;
+		r0 = &frame->tf_r0;
+		data = DTRACE_INVOP_DATA(invop);
+
+		/*
+		 * Store the pc, lr, and sp. These have their own
+		 * entries in the struct.
+		 */
+		if (data & (1 << BIT_PC)) {
+			sp--;
+			*sp = frame->tf_pc;
+		}
+		if (data & (1 << BIT_LR)) {
+			sp--;
+			*sp = frame->tf_svc_lr;
+		}
+		if (data & (1 << BIT_SP)) {
+			sp--;
+			*sp = frame->tf_svc_sp;
+		}
+
+		/* Store the general registers */
+		for (reg = 12; reg >= 0; reg--) {
+			if (data & (1 << reg)) {
+				sp--;
+				*sp = r0[reg];
+			}
+		}
+
+		/* Update the stack pointer and program counter to continue */
+		frame->tf_svc_sp = (register_t)sp;
+		frame->tf_pc += 4;
+		break;
+	case DTRACE_INVOP_POPM:
+		sp = (register_t *)frame->tf_svc_sp;
+		r0 = &frame->tf_r0;
+		data = DTRACE_INVOP_DATA(invop);
+
+		/* Read the general registers */
+		for (reg = 0; reg <= 12; reg++) {
+			if (data & (1 << reg)) {
+				r0[reg] = *sp;
+				sp++;
+			}
+		}
+
+		/*
+		 * Set the stack pointer. If we don't update it here we will
+		 * need to update it at the end as the instruction would do
+		 */
+		update_sp = 1;
+		if (data & (1 << BIT_SP)) {
+			frame->tf_svc_sp = *sp;
+			*sp++;
+			update_sp = 0;
+		}
+
+		/* Update the link register, we need to use the correct copy */
+		if (data & (1 << BIT_LR)) {
+			frame->tf_svc_lr = *sp;
+			*sp++;
+		}
+		/*
+		 * And the program counter. If it's not in the list skip over
+		 * it when we return so to not hit this again.
+		 */
+		if (data & (1 << BIT_PC)) {
+			frame->tf_pc = *sp;
+			*sp++;
+		} else
+			frame->tf_pc += 4;
+
+		/* Update the stack pointer if we haven't already done so */
+		if (update_sp)
+			frame->tf_svc_sp = (register_t)sp;
+		break;
+	case DTRACE_INVOP_B:
+		data = DTRACE_INVOP_DATA(invop) & 0x00ffffff;
+		/* Sign extend the data */
+		if ((data & (1 << 23)) != 0)
+			data |= 0xff000000;
+		/* The data is the number of 4-byte words to change the pc */
+		data *= 4;
+		data += 8;
+		frame->tf_pc += data;
+		break;
+
+	default:
+		return (-1);
+		break;
+	}
+#endif
+
+	return (0);
+}
+
+void
+dtrace_invop_init(void)
+{
+	dtrace_invop_jump_addr = dtrace_invop_start;
+}
+
+void
+dtrace_invop_uninit(void)
+{
+	dtrace_invop_jump_addr = NULL;
+}
Index: src/external/cddl/osnet/dev/dtrace/arm/regset.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/arm/regset.h,v
retrieving revision 1.1
diff -u -p -r1.1 regset.h
--- src/external/cddl/osnet/dev/dtrace/arm/regset.h	21 Jun 2013 19:16:00 -0000	1.1
+++ src/external/cddl/osnet/dev/dtrace/arm/regset.h	12 Apr 2017 09:46:41 -0000
@@ -19,7 +19,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD$ 
+ * $FreeBSD: head/sys/cddl/dev/dtrace/arm/regset.h 278529 2015-02-10 19:41:30Z gnn $ 
  */
 /*
  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
@@ -42,18 +42,13 @@
 extern "C" {
 #endif
 
-/*
- * XXX: define registers properly 
- */
-
 #if 0
-#define REG_PC  PC
-#define REG_FP  EBP
-#define REG_SP  SP
-#define REG_PS  EFL
-#define REG_R0  EAX
-#define REG_R1  EDX
-#endif
+#define REG_LINK  R14
+#define REG_SP  R13
+#define REG_PS  R0
+#define REG_R0  R0
+#define REG_R1  R1
+#endif
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dev/dtrace/i386/dis_tables.c
===================================================================
RCS file: src/external/cddl/osnet/dev/dtrace/i386/dis_tables.c
diff -N src/external/cddl/osnet/dev/dtrace/i386/dis_tables.c
--- src/external/cddl/osnet/dev/dtrace/i386/dis_tables.c	20 Jul 2011 19:51:57 -0000	1.3
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,3195 +0,0 @@
-/*	$NetBSD: dis_tables.c,v 1.3 2011/07/20 19:51:57 tron Exp $	*/
-
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- *
- * $FreeBSD: src/sys/cddl/dev/dtrace/i386/dis_tables.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-/*	Copyright (c) 1988 AT&T	*/
-/*	  All Rights Reserved  	*/
-
-
-#if defined(sun)
-#pragma ident	"@(#)dis_tables.c	1.11	06/03/02 SMI"
-#endif
-
-#include	"dis_tables.h"
-
-/* BEGIN CSTYLED */
-
-/*
- * Disassembly begins in dis_distable, which is equivalent to the One-byte
- * Opcode Map in the Intel IA32 ISA Reference (page A-6 in my copy).  The
- * decoding loops then traverse out through the other tables as necessary to
- * decode a given instruction.
- *
- * The behavior of this file can be controlled by one of the following flags:
- *
- * 	DIS_TEXT	Include text for disassembly
- * 	DIS_MEM		Include memory-size calculations
- *
- * Either or both of these can be defined.
- *
- * This file is not, and will never be, cstyled.  If anything, the tables should
- * be taken out another tab stop or two so nothing overlaps.
- */
-
-/*
- * These functions must be provided for the consumer to do disassembly.
- */
-#ifdef DIS_TEXT
-extern char *strncpy(char *, const char *, size_t);
-extern size_t strlen(const char *);
-extern int strcmp(const char *, const char *);
-extern int strncmp(const char *, const char *, size_t);
-extern size_t strlcat(char *, const char *, size_t);
-#endif
-
-
-#define		TERM 	NULL	/* used to indicate that the 'indirect' */
-				/* field terminates - no pointer.	*/
-
-/* Used to decode instructions. */
-typedef struct	instable {
-	const struct instable	*it_indirect;	/* for decode op codes */
-	uchar_t		it_adrmode;
-#ifdef DIS_TEXT
-	char		it_name[NCPS];
-	uint_t		it_suffix:1;		/* mneu + "w", "l", or "d" */
-#endif
-#ifdef DIS_MEM
-	uint_t		it_size:16;
-#endif
-	uint_t		it_invalid64:1;		/* opcode invalid in amd64 */
-	uint_t		it_always64:1;		/* 64 bit when in 64 bit mode */
-	uint_t		it_invalid32:1;		/* invalid in IA32 */
-	uint_t		it_stackop:1;		/* push/pop stack operation */
-} instable_t;
-
-/*
- * Instruction formats.
- */
-enum {
-	UNKNOWN,
-	MRw,
-	IMlw,
-	IMw,
-	IR,
-	OA,
-	AO,
-	MS,
-	SM,
-	Mv,
-	Mw,
-	M,		/* register or memory */
-	Mb,		/* register or memory, always byte sized */
-	MO,		/* memory only (no registers) */
-	PREF,
-	SWAPGS,
-	R,
-	RA,
-	SEG,
-	MR,
-	RM,
-	IA,
-	MA,
-	SD,
-	AD,
-	SA,
-	D,
-	INM,
-	SO,
-	BD,
-	I,
-	P,
-	V,
-	DSHIFT,		/* for double shift that has an 8-bit immediate */
-	U,
-	OVERRIDE,
-	NORM,		/* instructions w/o ModR/M byte, no memory access */
-	IMPLMEM,	/* instructions w/o ModR/M byte, implicit mem access */
-	O,		/* for call	*/
-	JTAB,		/* jump table 	*/
-	IMUL,		/* for 186 iimul instr  */
-	CBW,		/* so data16 can be evaluated for cbw and variants */
-	MvI,		/* for 186 logicals */
-	ENTER,		/* for 186 enter instr  */
-	RMw,		/* for 286 arpl instr */
-	Ib,		/* for push immediate byte */
-	F,		/* for 287 instructions */
-	FF,		/* for 287 instructions */
-	FFC,		/* for 287 instructions */
-	DM,		/* 16-bit data */
-	AM,		/* 16-bit addr */
-	LSEG,		/* for 3-bit seg reg encoding */
-	MIb,		/* for 386 logicals */
-	SREG,		/* for 386 special registers */
-	PREFIX,		/* a REP instruction prefix */
-	LOCK,		/* a LOCK instruction prefix */
-	INT3,		/* The int 3 instruction, which has a fake operand */
-	INTx,		/* The normal int instruction, with explicit int num */
-	DSHIFTcl,	/* for double shift that implicitly uses %cl */
-	CWD,		/* so data16 can be evaluated for cwd and variants */
-	RET,		/* single immediate 16-bit operand */
-	MOVZ,		/* for movs and movz, with different size operands */
-	XADDB,		/* for xaddb */
-	MOVSXZ,		/* AMD64 mov sign extend 32 to 64 bit instruction */
-
-/*
- * MMX/SIMD addressing modes.
- */
-
-	MMO,		/* Prefixable MMX/SIMD-Int	mm/mem	-> mm */
-	MMOIMPL,	/* Prefixable MMX/SIMD-Int	mm	-> mm (mem) */
-	MMO3P,		/* Prefixable MMX/SIMD-Int	mm	-> r32,imm8 */
-	MMOM3,		/* Prefixable MMX/SIMD-Int	mm	-> r32 	*/
-	MMOS,		/* Prefixable MMX/SIMD-Int	mm	-> mm/mem */
-	MMOMS,		/* Prefixable MMX/SIMD-Int	mm	-> mem */
-	MMOPM,		/* MMX/SIMD-Int			mm/mem	-> mm,imm8 */
-	MMOPRM,		/* Prefixable MMX/SIMD-Int	r32/mem	-> mm,imm8 */
-	MMOSH,		/* Prefixable MMX		mm,imm8	*/
-	MM,		/* MMX/SIMD-Int			mm/mem	-> mm	*/
-	MMS,		/* MMX/SIMD-Int			mm	-> mm/mem */
-	MMSH,		/* MMX				mm,imm8 */
-	XMMO,		/* Prefixable SIMD		xmm/mem	-> xmm */
-	XMMOS,		/* Prefixable SIMD		xmm	-> xmm/mem */
-	XMMOPM,		/* Prefixable SIMD		xmm/mem	w/to xmm,imm8 */
-	XMMOMX,		/* Prefixable SIMD		mm/mem	-> xmm */
-	XMMOX3,		/* Prefixable SIMD		xmm	-> r32 */
-	XMMOXMM,	/* Prefixable SIMD		xmm/mem	-> mm	*/
-	XMMOM,		/* Prefixable SIMD		xmm	-> mem */
-	XMMOMS,		/* Prefixable SIMD		mem	-> xmm */
-	XMM,		/* SIMD 			xmm/mem	-> xmm */
-	XMMXIMPL,	/* SIMD				xmm	-> xmm (mem) */
-	XMM3P,		/* SIMD				xmm	-> r32,imm8 */
-	XMMP,		/* SIMD 			xmm/mem w/to xmm,imm8 */
-	XMMPRM,		/* SIMD 			r32/mem -> xmm,imm8 */
-	XMMS,		/* SIMD				xmm	-> xmm/mem */
-	XMMM,		/* SIMD 			mem	-> xmm */
-	XMMMS,		/* SIMD				xmm	-> mem */
-	XMM3MX,		/* SIMD 			r32/mem -> xmm */
-	XMM3MXS,	/* SIMD 			xmm	-> r32/mem */
-	XMMSH,		/* SIMD 			xmm,imm8 */
-	XMMXM3,		/* SIMD 			xmm/mem -> r32 */
-	XMMX3,		/* SIMD 			xmm	-> r32 */
-	XMMXMM,		/* SIMD 			xmm/mem	-> mm */
-	XMMMX,		/* SIMD 			mm	-> xmm */
-	XMMXM,		/* SIMD 			xmm	-> mm */
-	XMMFENCE,	/* SIMD lfence or mfence */
-	XMMSFNC		/* SIMD sfence (none or mem) */
-};
-
-#define	FILL	0x90	/* Fill byte used for alignment (nop)	*/
-
-/*
-** Register numbers for the i386
-*/
-#define	EAX_REGNO 0
-#define	ECX_REGNO 1
-#define	EDX_REGNO 2
-#define	EBX_REGNO 3
-#define	ESP_REGNO 4
-#define	EBP_REGNO 5
-#define	ESI_REGNO 6
-#define	EDI_REGNO 7
-
-/*
- * modes for immediate values
- */
-#define	MODE_NONE	0
-#define	MODE_IPREL	1	/* signed IP relative value */
-#define	MODE_SIGNED	2	/* sign extended immediate */
-#define	MODE_IMPLIED	3	/* constant value implied from opcode */
-#define	MODE_OFFSET	4	/* offset part of an address */
-
-/*
- * The letters used in these macros are:
- *   IND - indirect to another to another table
- *   "T" - means to Terminate indirections (this is the final opcode)
- *   "S" - means "operand length suffix required"
- *   "NS" - means "no suffix" which is the operand length suffix of the opcode
- *   "Z" - means instruction size arg required
- *   "u" - means the opcode is invalid in IA32 but valid in amd64
- *   "x" - means the opcode is invalid in amd64, but not IA32
- *   "y" - means the operand size is always 64 bits in 64 bit mode
- *   "p" - means push/pop stack operation
- */
-
-#if defined(DIS_TEXT) && defined(DIS_MEM)
-#define	IND(table)		{table, 0, "", 0, 0, 0, 0, 0, 0}
-#define	INDx(table)		{table, 0, "", 0, 0, 1, 0, 0, 0}
-#define	TNS(name, amode)	{TERM, amode, name, 0, 0, 0, 0, 0, 0}
-#define	TNSu(name, amode)	{TERM, amode, name, 0, 0, 0, 0, 1, 0}
-#define	TNSx(name, amode)	{TERM, amode, name, 0, 0, 1, 0, 0, 0}
-#define	TNSy(name, amode)	{TERM, amode, name, 0, 0, 0, 1, 0, 0}
-#define	TNSyp(name, amode)	{TERM, amode, name, 0, 0, 0, 1, 0, 1}
-#define	TNSZ(name, amode, sz)	{TERM, amode, name, 0, sz, 0, 0, 0, 0}
-#define	TNSZy(name, amode, sz)	{TERM, amode, name, 0, sz, 0, 1, 0, 0}
-#define	TS(name, amode)		{TERM, amode, name, 1, 0, 0, 0, 0, 0}
-#define	TSx(name, amode)	{TERM, amode, name, 1, 0, 1, 0, 0, 0}
-#define	TSy(name, amode)	{TERM, amode, name, 1, 0, 0, 1, 0, 0}
-#define	TSp(name, amode)	{TERM, amode, name, 1, 0, 0, 0, 0, 1}
-#define	TSZ(name, amode, sz)	{TERM, amode, name, 1, sz, 0, 0, 0, 0}
-#define	TSZx(name, amode, sz)	{TERM, amode, name, 1, sz, 1, 0, 0, 0}
-#define	TSZy(name, amode, sz)	{TERM, amode, name, 1, sz, 0, 1, 0, 0}
-#define	INVALID			{TERM, UNKNOWN, "", 0, 0, 0, 0, 0}
-#elif defined(DIS_TEXT)
-#define	IND(table)		{table, 0, "", 0, 0, 0, 0, 0}
-#define	INDx(table)		{table, 0, "", 0, 1, 0, 0, 0}
-#define	TNS(name, amode)	{TERM, amode, name, 0, 0, 0, 0, 0}
-#define	TNSu(name, amode)	{TERM, amode, name, 0, 0, 0, 1, 0}
-#define	TNSx(name, amode)	{TERM, amode, name, 0, 1, 0, 0, 0}
-#define	TNSy(name, amode)	{TERM, amode, name, 0, 0, 1, 0, 0}
-#define	TNSyp(name, amode)	{TERM, amode, name, 0, 0, 1, 0, 1}
-#define	TNSZ(name, amode, sz)	{TERM, amode, name, 0, 0, 0, 0, 0}
-#define	TNSZy(name, amode, sz)	{TERM, amode, name, 0, 0, 1, 0, 0}
-#define	TS(name, amode)		{TERM, amode, name, 1, 0, 0, 0, 0}
-#define	TSx(name, amode)	{TERM, amode, name, 1, 1, 0, 0, 0}
-#define	TSy(name, amode)	{TERM, amode, name, 1, 0, 1, 0, 0}
-#define	TSp(name, amode)	{TERM, amode, name, 1, 0, 0, 0, 1}
-#define	TSZ(name, amode, sz)	{TERM, amode, name, 1, 0, 0, 0, 0}
-#define	TSZx(name, amode, sz)	{TERM, amode, name, 1, 1, 0, 0, 0}
-#define	TSZy(name, amode, sz)	{TERM, amode, name, 1, 0, 1, 0, 0}
-#define	INVALID			{TERM, UNKNOWN, "", 0, 0, 0, 0, 0}
-#elif defined(DIS_MEM)
-#define	IND(table)		{table, 0, 0, 0, 0, 0, 0}
-#define	INDx(table)		{table, 0, 0, 1, 0, 0, 0}
-#define	TNS(name, amode)	{TERM, amode,  0, 0, 0, 0, 0}
-#define	TNSu(name, amode)	{TERM, amode,  0, 0, 0, 1, 0}
-#define	TNSy(name, amode)	{TERM, amode,  0, 0, 1, 0, 0}
-#define	TNSyp(name, amode)	{TERM, amode,  0, 0, 1, 0, 1}
-#define	TNSx(name, amode)	{TERM, amode,  0, 1, 0, 0, 0}
-#define	TNSZ(name, amode, sz)	{TERM, amode, sz, 0, 0, 0, 0}
-#define	TNSZy(name, amode, sz)	{TERM, amode, sz, 0, 1, 0, 0}
-#define	TS(name, amode)		{TERM, amode,  0, 0, 0, 0, 0}
-#define	TSx(name, amode)	{TERM, amode,  0, 1, 0, 0, 0}
-#define	TSy(name, amode)	{TERM, amode,  0, 0, 1, 0, 0}
-#define	TSp(name, amode)	{TERM, amode,  0, 0, 0, 0, 1}
-#define	TSZ(name, amode, sz)	{TERM, amode, sz, 0, 0, 0, 0}
-#define	TSZx(name, amode, sz)	{TERM, amode, sz, 1, 0, 0, 0}
-#define	TSZy(name, amode, sz)	{TERM, amode, sz, 0, 1, 0, 0}
-#define	INVALID			{TERM, UNKNOWN, 0, 0, 0, 0, 0}
-#else
-#define	IND(table)		{table[0], 0, 0, 0, 0, 0}
-#define	INDx(table)		{table[0], 0, 1, 0, 0, 0}
-#define	TNS(name, amode)	{TERM, amode,  0, 0, 0, 0}
-#define	TNSu(name, amode)	{TERM, amode,  0, 0, 1, 0}
-#define	TNSy(name, amode)	{TERM, amode,  0, 1, 0, 0}
-#define	TNSyp(name, amode)	{TERM, amode,  0, 1, 0, 1}
-#define	TNSx(name, amode)	{TERM, amode,  1, 0, 0, 0}
-#define	TNSZ(name, amode, sz)	{TERM, amode,  0, 0, 0, 0}
-#define	TNSZy(name, amode, sz)	{TERM, amode,  0, 1, 0, 0}
-#define	TS(name, amode)		{TERM, amode,  0, 0, 0, 0}
-#define	TSx(name, amode)	{TERM, amode,  1, 0, 0, 0}
-#define	TSy(name, amode)	{TERM, amode,  0, 1, 0, 0}
-#define	TSp(name, amode)	{TERM, amode,  0, 0, 0, 1}
-#define	TSZ(name, amode, sz)	{TERM, amode,  0, 0, 0, 0}
-#define	TSZx(name, amode, sz)	{TERM, amode,  1, 0, 0, 0}
-#define	TSZy(name, amode, sz)	{TERM, amode,  0, 1, 0, 0}
-#define	INVALID			{TERM, UNKNOWN, 0, 0, 0, 0}
-#endif
-
-#ifdef DIS_TEXT
-/*
- * this decodes the r_m field for mode's 0, 1, 2 in 16 bit mode
- */
-const char *const dis_addr16[3][8] = {
-"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di)", "",
-									"(%bx)",
-"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di", "(%bp)",
-									"(%bx)",
-"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di)", "(%bp)",
-									"(%bx)",
-};
-
-
-/*
- * This decodes 32 bit addressing mode r_m field for modes 0, 1, 2
- */
-const char *const dis_addr32_mode0[16] = {
-  "(%eax)", "(%ecx)", "(%edx)",  "(%ebx)",  "", "",        "(%esi)",  "(%edi)",
-  "(%r8d)", "(%r9d)", "(%r10d)", "(%r11d)", "", "",        "(%r14d)", "(%r15d)"
-};
-
-const char *const dis_addr32_mode12[16] = {
-  "(%eax)", "(%ecx)", "(%edx)",  "(%ebx)",  "", "(%ebp)",  "(%esi)",  "(%edi)",
-  "(%r8d)", "(%r9d)", "(%r10d)", "(%r11d)", "", "(%r13d)", "(%r14d)", "(%r15d)"
-};
-
-/*
- * This decodes 64 bit addressing mode r_m field for modes 0, 1, 2
- */
-const char *const dis_addr64_mode0[16] = {
- "(%rax)", "(%rcx)", "(%rdx)", "(%rbx)", "",       "(%rip)", "(%rsi)", "(%rdi)",
- "(%r8)",  "(%r9)",  "(%r10)", "(%r11)", "(%r12)", "(%rip)", "(%r14)", "(%r15)"
-};
-const char *const dis_addr64_mode12[16] = {
- "(%rax)", "(%rcx)", "(%rdx)", "(%rbx)", "",       "(%rbp)", "(%rsi)", "(%rdi)",
- "(%r8)",  "(%r9)",  "(%r10)", "(%r11)", "(%r12)", "(%r13)", "(%r14)", "(%r15)"
-};
-
-/*
- * decode for scale from SIB byte
- */
-const char *const dis_scale_factor[4] = { ")", ",2)", ",4)", ",8)" };
-
-/*
- * register decoding for normal references to registers (ie. not addressing)
- */
-const char *const dis_REG8[16] = {
-	"%al",  "%cl",  "%dl",   "%bl",   "%ah",   "%ch",   "%dh",   "%bh",
-	"%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b"
-};
-
-const char *const dis_REG8_REX[16] = {
-	"%al",  "%cl",  "%dl",   "%bl",   "%spl",  "%bpl",  "%sil",  "%dil",
-	"%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b"
-};
-
-const char *const dis_REG16[16] = {
-	"%ax",  "%cx",  "%dx",   "%bx",   "%sp",   "%bp",   "%si",   "%di",
-	"%r8w", "%r9w", "%r10w", "%r11w", "%r12w", "%r13w", "%r14w", "%r15w"
-};
-
-const char *const dis_REG32[16] = {
-	"%eax", "%ecx", "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
-	"%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d"
-};
-
-const char *const dis_REG64[16] = {
-	"%rax", "%rcx", "%rdx",  "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
-	"%r8",  "%r9",  "%r10",  "%r11", "%r12", "%r13", "%r14", "%r15"
-};
-
-const char *const dis_DEBUGREG[16] = {
-	"%db0", "%db1", "%db2",  "%db3",  "%db4",  "%db5",  "%db6",  "%db7",
-	"%db8", "%db9", "%db10", "%db11", "%db12", "%db13", "%db14", "%db15"
-};
-
-const char *const dis_CONTROLREG[16] = {
-    "%cr0", "%cr1", "%cr2", "%cr3", "%cr4", "%cr5?", "%cr6?", "%cr7?",
-    "%cr8", "%cr9?", "%cr10?", "%cr11?", "%cr12?", "%cr13?", "%cr14?", "%cr15?"
-};
-
-const char *const dis_TESTREG[16] = {
-	"%tr0?", "%tr1?", "%tr2?", "%tr3", "%tr4", "%tr5", "%tr6", "%tr7",
-	"%tr0?", "%tr1?", "%tr2?", "%tr3", "%tr4", "%tr5", "%tr6", "%tr7"
-};
-
-const char *const dis_MMREG[16] = {
-	"%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7",
-	"%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
-};
-
-const char *const dis_XMMREG[16] = {
-    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
-    "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
-};
-
-const char *const dis_SEGREG[16] = {
-	"%es", "%cs", "%ss", "%ds", "%fs", "%gs", "<reserved>", "<reserved>",
-	"%es", "%cs", "%ss", "%ds", "%fs", "%gs", "<reserved>", "<reserved>"
-};
-
-/*
- * SIMD predicate suffixes
- */
-const char *const dis_PREDSUFFIX[8] = {
-	"eq", "lt", "le", "unord", "neq", "nlt", "nle", "ord"
-};
-
-
-
-#endif	/* DIS_TEXT */
-
-
-
-
-/*
- *	"decode table" for 64 bit mode MOVSXD instruction (opcode 0x63)
- */
-const instable_t dis_opMOVSLD = TNS("movslq",MOVSXZ);
-
-/*
- *	"decode table" for pause and clflush instructions
- */
-const instable_t dis_opPause = TNS("pause", NORM);
-
-/*
- *	Decode table for 0x0F00 opcodes
- */
-const instable_t dis_op0F00[8] = {
-
-/*  [0]  */	TNS("sldt",M),		TNS("str",M),		TNSy("lldt",M), 	TNSy("ltr",M),
-/*  [4]  */	TNSZ("verr",M,2),	TNSZ("verw",M,2),	INVALID,		INVALID,
-};
-
-
-/*
- *	Decode table for 0x0F01 opcodes
- */
-const instable_t dis_op0F01[8] = {
-
-/*  [0]  */	TNSZ("sgdt",MO,6),	TNSZ("sidt",MO,6), 	TNSZ("lgdt",MO,6),	TNSZ("lidt",MO,6),
-/*  [4]  */	TNSZ("smsw",M,2),	INVALID, 		TNSZ("lmsw",M,2),	TNS("invlpg",SWAPGS),
-};
-
-/*
- *	Decode table for 0x0F18 opcodes -- SIMD prefetch
- */
-const instable_t dis_op0F18[8] = {
-
-/*  [0]  */	TNS("prefetchnta",PREF),TNS("prefetcht0",PREF),	TNS("prefetcht1",PREF),	TNS("prefetcht2",PREF),
-/*  [4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-};
-
-/*
- * 	Decode table for 0x0FAE opcodes -- SIMD state save/restore
- */
-const instable_t dis_op0FAE[8] = {
-/*  [0]  */	TNSZ("fxsave",M,512),	TNSZ("fxrstor",M,512),	TNS("ldmxcsr",M),	TNS("stmxcsr",M),
-/*  [4]  */	INVALID,		TNS("lfence",XMMFENCE), TNS("mfence",XMMFENCE),	TNS("sfence",XMMSFNC),
-};
-
-/*
- *	Decode table for 0x0FBA opcodes
- */
-
-const instable_t dis_op0FBA[8] = {
-
-/*  [0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [4]  */	TS("bt",MIb),		TS("bts",MIb),		TS("btr",MIb),		TS("btc",MIb),
-};
-
-/*
- * 	Decode table for 0x0FC7 opcode
- */
-
-const instable_t dis_op0FC7[8] = {
-
-/*  [0]  */	INVALID,		TNS("cmpxchg8b",M),	INVALID,		INVALID,
-/*  [4]  */	INVALID,		INVALID,	INVALID,		 INVALID,
-};
-
-
-/*
- *	Decode table for 0x0FC8 opcode -- 486 bswap instruction
- *
- *bit pattern: 0000 1111 1100 1reg
- */
-const instable_t dis_op0FC8[4] = {
-/*  [0]  */	TNS("bswap",R),		INVALID,		INVALID,		INVALID,
-};
-
-/*
- *	Decode table for 0x0F71, 0x0F72, and 0x0F73 opcodes -- MMX instructions
- */
-const instable_t dis_op0F7123[4][8] = {
-{
-/*  [70].0 */	INVALID,		INVALID,		INVALID,		INVALID,
-/*      .4 */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [71].0 */	INVALID,		INVALID,		TNS("psrlw",MMOSH),	INVALID,
-/*      .4 */	TNS("psraw",MMOSH),	INVALID,		TNS("psllw",MMOSH),	INVALID,
-}, {
-/*  [72].0 */	INVALID,		INVALID,		TNS("psrld",MMOSH),	INVALID,
-/*      .4 */	TNS("psrad",MMOSH),	INVALID,		TNS("pslld",MMOSH),	INVALID,
-}, {
-/*  [73].0 */	INVALID,		INVALID,		TNS("psrlq",MMOSH),	TNS("INVALID",MMOSH),
-/*      .4 */	INVALID,		INVALID, 		TNS("psllq",MMOSH),	TNS("INVALID",MMOSH),
-} };
-
-/*
- *	Decode table for SIMD extensions to above 0x0F71-0x0F73 opcodes.
- */
-const instable_t dis_opSIMD7123[32] = {
-/* [70].0 */	INVALID,		INVALID,		INVALID,		INVALID,
-/*     .4 */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/* [71].0 */	INVALID,		INVALID,		TNS("psrlw",XMMSH),	INVALID,
-/*     .4 */	TNS("psraw",XMMSH),	INVALID,		TNS("psllw",XMMSH),	INVALID,
-
-/* [72].0 */	INVALID,		INVALID,		TNS("psrld",XMMSH),	INVALID,
-/*     .4 */	TNS("psrad",XMMSH),	INVALID,		TNS("pslld",XMMSH),	INVALID,
-
-/* [73].0 */	INVALID,		INVALID,		TNS("psrlq",XMMSH),	TNS("psrldq",XMMSH),
-/*     .4 */	INVALID,		INVALID,		TNS("psllq",XMMSH),	TNS("pslldq",XMMSH),
-};
-
-/*
- *	SIMD instructions have been wedged into the existing IA32 instruction
- *	set through the use of prefixes.  That is, while 0xf0 0x58 may be
- *	addps, 0xf3 0xf0 0x58 (literally, repz addps) is a completely different
- *	instruction - addss.  At present, three prefixes have been coopted in
- *	this manner - address size (0x66), repnz (0xf2) and repz (0xf3).  The
- *	following tables are used to provide the prefixed instruction names.
- *	The arrays are sparse, but they're fast.
- */
-
-/*
- *	Decode table for SIMD instructions with the address size (0x66) prefix.
- */
-const instable_t dis_opSIMDdata16[256] = {
-/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [10]  */	TNSZ("movupd",XMM,16),	TNSZ("movupd",XMMS,16),	TNSZ("movlpd",XMMM,8),	TNSZ("movlpd",XMMMS,8),
-/*  [14]  */	TNSZ("unpcklpd",XMM,16),TNSZ("unpckhpd",XMM,16),TNSZ("movhpd",XMMM,8),	TNSZ("movhpd",XMMMS,8),
-/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [28]  */	TNSZ("movapd",XMM,16),	TNSZ("movapd",XMMS,16),	TNSZ("cvtpi2pd",XMMOMX,8),TNSZ("movntpd",XMMOMS,16),
-/*  [2C]  */	TNSZ("cvttpd2pi",XMMXMM,16),TNSZ("cvtpd2pi",XMMXMM,16),TNSZ("ucomisd",XMM,8),TNSZ("comisd",XMM,8),
-
-/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [50]  */	TNS("movmskpd",XMMOX3),	TNSZ("sqrtpd",XMM,16),	INVALID,		INVALID,
-/*  [54]  */	TNSZ("andpd",XMM,16),	TNSZ("andnpd",XMM,16),	TNSZ("orpd",XMM,16),	TNSZ("xorpd",XMM,16),
-/*  [58]  */	TNSZ("addpd",XMM,16),	TNSZ("mulpd",XMM,16),	TNSZ("cvtpd2ps",XMM,16),TNSZ("cvtps2dq",XMM,16),
-/*  [5C]  */	TNSZ("subpd",XMM,16),	TNSZ("minpd",XMM,16),	TNSZ("divpd",XMM,16),	TNSZ("maxpd",XMM,16),
-
-/*  [60]  */	TNSZ("punpcklbw",XMM,16),TNSZ("punpcklwd",XMM,16),TNSZ("punpckldq",XMM,16),TNSZ("packsswb",XMM,16),
-/*  [64]  */	TNSZ("pcmpgtb",XMM,16),	TNSZ("pcmpgtw",XMM,16),	TNSZ("pcmpgtd",XMM,16),	TNSZ("packuswb",XMM,16),
-/*  [68]  */	TNSZ("punpckhbw",XMM,16),TNSZ("punpckhwd",XMM,16),TNSZ("punpckhdq",XMM,16),TNSZ("packssdw",XMM,16),
-/*  [6C]  */	TNSZ("punpcklqdq",XMM,16),TNSZ("punpckhqdq",XMM,16),TNSZ("movd",XMM3MX,4),TNSZ("movdqa",XMM,16),
-
-/*  [70]  */	TNSZ("pshufd",XMMP,16),	INVALID,		INVALID,		INVALID,
-/*  [74]  */	TNSZ("pcmpeqb",XMM,16),	TNSZ("pcmpeqw",XMM,16),	TNSZ("pcmpeqd",XMM,16),	INVALID,
-/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [7C]  */	INVALID,		INVALID,		TNSZ("movd",XMM3MXS,4),	TNSZ("movdqa",XMMS,16),
-
-/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [C0]  */	INVALID,		INVALID,		TNSZ("cmppd",XMMP,16),	INVALID,
-/*  [C4]  */	TNSZ("pinsrw",XMMPRM,2),TNS("pextrw",XMM3P),	TNSZ("shufpd",XMMP,16),	INVALID,
-/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [D0]  */	INVALID,		TNSZ("psrlw",XMM,16),	TNSZ("psrld",XMM,16),	TNSZ("psrlq",XMM,16),
-/*  [D4]  */	TNSZ("paddq",XMM,16),	TNSZ("pmullw",XMM,16),	TNSZ("movq",XMMS,8),	TNS("pmovmskb",XMMX3),
-/*  [D8]  */	TNSZ("psubusb",XMM,16),	TNSZ("psubusw",XMM,16),	TNSZ("pminub",XMM,16),	TNSZ("pand",XMM,16),
-/*  [DC]  */	TNSZ("paddusb",XMM,16),	TNSZ("paddusw",XMM,16),	TNSZ("pmaxub",XMM,16),	TNSZ("pandn",XMM,16),
-
-/*  [E0]  */	TNSZ("pavgb",XMM,16),	TNSZ("psraw",XMM,16),	TNSZ("psrad",XMM,16),	TNSZ("pavgw",XMM,16),
-/*  [E4]  */	TNSZ("pmulhuw",XMM,16),	TNSZ("pmulhw",XMM,16),	TNSZ("cvttpd2dq",XMM,16),TNSZ("movntdq",XMMS,16),
-/*  [E8]  */	TNSZ("psubsb",XMM,16),	TNSZ("psubsw",XMM,16),	TNSZ("pminsw",XMM,16),	TNSZ("por",XMM,16),
-/*  [EC]  */	TNSZ("paddsb",XMM,16),	TNSZ("paddsw",XMM,16),	TNSZ("pmaxsw",XMM,16),	TNSZ("pxor",XMM,16),
-
-/*  [F0]  */	INVALID,		TNSZ("psllw",XMM,16),	TNSZ("pslld",XMM,16),	TNSZ("psllq",XMM,16),
-/*  [F4]  */	TNSZ("pmuludq",XMM,16),	TNSZ("pmaddwd",XMM,16),	TNSZ("psadbw",XMM,16),	TNSZ("maskmovdqu", XMMXIMPL,16),
-/*  [F8]  */	TNSZ("psubb",XMM,16),	TNSZ("psubw",XMM,16),	TNSZ("psubd",XMM,16),	TNSZ("psubq",XMM,16),
-/*  [FC]  */	TNSZ("paddb",XMM,16),	TNSZ("paddw",XMM,16),	TNSZ("paddd",XMM,16),	INVALID,
-};
-
-/*
- *	Decode table for SIMD instructions with the repnz (0xf2) prefix.
- */
-const instable_t dis_opSIMDrepnz[256] = {
-/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [10]  */	TNSZ("movsd",XMM,8),	TNSZ("movsd",XMMS,8),	INVALID,		INVALID,
-/*  [14]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [28]  */	INVALID,		INVALID,		TNSZ("cvtsi2sd",XMM3MX,4),INVALID,
-/*  [2C]  */	TNSZ("cvttsd2si",XMMXM3,8),TNSZ("cvtsd2si",XMMXM3,8),INVALID,		INVALID,
-
-/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [50]  */	INVALID,		TNSZ("sqrtsd",XMM,8),	INVALID,		INVALID,
-/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [58]  */	TNSZ("addsd",XMM,8),	TNSZ("mulsd",XMM,8),	TNSZ("cvtsd2ss",XMM,8),	INVALID,
-/*  [5C]  */	TNSZ("subsd",XMM,8),	TNSZ("minsd",XMM,8),	TNSZ("divsd",XMM,8),	TNSZ("maxsd",XMM,8),
-
-/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [6C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [70]  */	TNSZ("pshuflw",XMMP,16),INVALID,		INVALID,		INVALID,
-/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [7C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [C0]  */	INVALID,		INVALID,		TNSZ("cmpsd",XMMP,8),	INVALID,
-/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [D4]  */	INVALID,		INVALID,		TNS("movdq2q",XMMXM),	INVALID,
-/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [DC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [E4]  */	INVALID,		INVALID,		TNSZ("cvtpd2dq",XMM,16),INVALID,
-/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [F0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [F4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-};
-
-/*
- *	Decode table for SIMD instructions with the repz (0xf3) prefix.
- */
-const instable_t dis_opSIMDrepz[256] = {
-/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [10]  */	TNSZ("movss",XMM,4),	TNSZ("movss",XMMS,4),	INVALID,		INVALID,
-/*  [14]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [28]  */	INVALID,		INVALID,		TNSZ("cvtsi2ss",XMM3MX,4),INVALID,
-/*  [2C]  */	TNSZ("cvttss2si",XMMXM3,4),TNSZ("cvtss2si",XMMXM3,4),INVALID,		INVALID,
-
-/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [50]  */	INVALID,		TNSZ("sqrtss",XMM,4),	TNSZ("rsqrtss",XMM,4),	TNSZ("rcpss",XMM,4),
-/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [58]  */	TNSZ("addss",XMM,4),	TNSZ("mulss",XMM,4),	TNSZ("cvtss2sd",XMM,4),	TNSZ("cvttps2dq",XMM,16),
-/*  [5C]  */	TNSZ("subss",XMM,4),	TNSZ("minss",XMM,4),	TNSZ("divss",XMM,4),	TNSZ("maxss",XMM,4),
-
-/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [6C]  */	INVALID,		INVALID,		INVALID,		TNSZ("movdqu",XMM,16),
-
-/*  [70]  */	TNSZ("pshufhw",XMMP,16),INVALID,		INVALID,		INVALID,
-/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [7C]  */	INVALID,		INVALID,		TNSZ("movq",XMM,8),	TNSZ("movdqu",XMMS,16),
-
-/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [C0]  */	INVALID,		INVALID,		TNSZ("cmpss",XMMP,4),	INVALID,
-/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [D4]  */	INVALID,		INVALID,		TNS("movq2dq",XMMMX),	INVALID,
-/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [DC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [E4]  */	INVALID,		INVALID,		TNSZ("cvtdq2pd",XMM,8),	INVALID,
-/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-
-/*  [F0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [F4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-};
-
-/*
- *	Decode table for 0x0F opcodes
- */
-
-const instable_t dis_op0F[16][16] = {
-{
-/*  [00]  */	IND(dis_op0F00),	IND(dis_op0F01),	TNS("lar",MR),		TNS("lsl",MR),
-/*  [04]  */	INVALID,		TNS("syscall",NORM),	TNS("clts",NORM),	TNS("sysret",NORM),
-/*  [08]  */	TNS("invd",NORM),	TNS("wbinvd",NORM),	INVALID,		TNS("ud2",NORM),
-/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [10]  */	TNSZ("movups",XMMO,16),	TNSZ("movups",XMMOS,16),TNSZ("movlps",XMMO,8),	TNSZ("movlps",XMMOS,8),
-/*  [14]  */	TNSZ("unpcklps",XMMO,16),TNSZ("unpckhps",XMMO,16),TNSZ("movhps",XMMOM,8),TNSZ("movhps",XMMOMS,8),
-/*  [18]  */	IND(dis_op0F18),	INVALID,		INVALID,		INVALID,
-/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [20]  */	TSy("mov",SREG),	TSy("mov",SREG),	TSy("mov",SREG),	TSy("mov",SREG),
-/*  [24]  */	TSx("mov",SREG),	INVALID,		TSx("mov",SREG),	INVALID,
-/*  [28]  */	TNSZ("movaps",XMMO,16),	TNSZ("movaps",XMMOS,16),TNSZ("cvtpi2ps",XMMOMX,8),TNSZ("movntps",XMMOS,16),
-/*  [2C]  */	TNSZ("cvttps2pi",XMMOXMM,8),TNSZ("cvtps2pi",XMMOXMM,8),TNSZ("ucomiss",XMMO,4),TNSZ("comiss",XMMO,4),
-}, {
-/*  [30]  */	TNS("wrmsr",NORM),	TNS("rdtsc",NORM),	TNS("rdmsr",NORM),	TNS("rdpmc",NORM),
-/*  [34]  */	TNSx("sysenter",NORM),	TNSx("sysexit",NORM),	INVALID,		INVALID,
-/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [40]  */	TS("cmovx.o",MR),	TS("cmovx.no",MR),	TS("cmovx.b",MR),	TS("cmovx.ae",MR),
-/*  [44]  */	TS("cmovx.e",MR),	TS("cmovx.ne",MR),	TS("cmovx.be",MR),	TS("cmovx.a",MR),
-/*  [48]  */	TS("cmovx.s",MR),	TS("cmovx.ns",MR),	TS("cmovx.pe",MR),	TS("cmovx.po",MR),
-/*  [4C]  */	TS("cmovx.l",MR),	TS("cmovx.ge",MR),	TS("cmovx.le",MR),	TS("cmovx.g",MR),
-}, {
-/*  [50]  */	TNS("movmskps",XMMOX3),	TNSZ("sqrtps",XMMO,16),	TNSZ("rsqrtps",XMMO,16),TNSZ("rcpps",XMMO,16),
-/*  [54]  */	TNSZ("andps",XMMO,16),	TNSZ("andnps",XMMO,16),	TNSZ("orps",XMMO,16),	TNSZ("xorps",XMMO,16),
-/*  [58]  */	TNSZ("addps",XMMO,16),	TNSZ("mulps",XMMO,16),	TNSZ("cvtps2pd",XMMO,8),TNSZ("cvtdq2ps",XMMO,16),
-/*  [5C]  */	TNSZ("subps",XMMO,16),	TNSZ("minps",XMMO,16),	TNSZ("divps",XMMO,16),	TNSZ("maxps",XMMO,16),
-}, {
-/*  [60]  */	TNSZ("punpcklbw",MMO,4),TNSZ("punpcklwd",MMO,4),TNSZ("punpckldq",MMO,4),TNSZ("packsswb",MMO,8),
-/*  [64]  */	TNSZ("pcmpgtb",MMO,8),	TNSZ("pcmpgtw",MMO,8),	TNSZ("pcmpgtd",MMO,8),	TNSZ("packuswb",MMO,8),
-/*  [68]  */	TNSZ("punpckhbw",MMO,8),TNSZ("punpckhwd",MMO,8),TNSZ("punpckhdq",MMO,8),TNSZ("packssdw",MMO,8),
-/*  [6C]  */	TNSZ("INVALID",MMO,0),	TNSZ("INVALID",MMO,0),	TNSZ("movd",MMO,4),	TNSZ("movq",MMO,8),
-}, {
-/*  [70]  */	TNSZ("pshufw",MMOPM,8),	TNS("psrXXX",MR),	TNS("psrXXX",MR),	TNS("psrXXX",MR),
-/*  [74]  */	TNSZ("pcmpeqb",MMO,8),	TNSZ("pcmpeqw",MMO,8),	TNSZ("pcmpeqd",MMO,8),	TNS("emms",NORM),
-/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [7C]  */	INVALID,		INVALID,		TNSZ("movd",MMOS,4),	TNSZ("movq",MMOS,8),
-}, {
-/*  [80]  */	TNS("jo",D),		TNS("jno",D),		TNS("jb",D),		TNS("jae",D),
-/*  [84]  */	TNS("je",D),		TNS("jne",D),		TNS("jbe",D),		TNS("ja",D),
-/*  [88]  */	TNS("js",D),		TNS("jns",D),		TNS("jp",D),		TNS("jnp",D),
-/*  [8C]  */	TNS("jl",D),		TNS("jge",D),		TNS("jle",D),		TNS("jg",D),
-}, {
-/*  [90]  */	TNS("seto",Mb),		TNS("setno",Mb),	TNS("setb",Mb),		TNS("setae",Mb),
-/*  [94]  */	TNS("sete",Mb),		TNS("setne",Mb),	TNS("setbe",Mb),	TNS("seta",Mb),
-/*  [98]  */	TNS("sets",Mb),		TNS("setns",Mb),	TNS("setp",Mb),		TNS("setnp",Mb),
-/*  [9C]  */	TNS("setl",Mb),		TNS("setge",Mb),	TNS("setle",Mb),	TNS("setg",Mb),
-}, {
-/*  [A0]  */	TSp("push",LSEG),	TSp("pop",LSEG),	TNS("cpuid",NORM),	TS("bt",RMw),
-/*  [A4]  */	TS("shld",DSHIFT),	TS("shld",DSHIFTcl),	INVALID,		INVALID,
-/*  [A8]  */	TSp("push",LSEG),	TSp("pop",LSEG),	TNS("rsm",NORM),	TS("bts",RMw),
-/*  [AC]  */	TS("shrd",DSHIFT),	TS("shrd",DSHIFTcl),	IND(dis_op0FAE),	TS("imul",MRw),
-}, {
-/*  [B0]  */	TNS("cmpxchgb",RMw),	TS("cmpxchg",RMw),	TS("lss",MR),		TS("btr",RMw),
-/*  [B4]  */	TS("lfs",MR),		TS("lgs",MR),		TS("movzb",MOVZ),	TNS("movzwl",MOVZ),
-/*  [B8]  */	INVALID,		INVALID,		IND(dis_op0FBA),	TS("btc",RMw),
-/*  [BC]  */	TS("bsf",MRw),		TS("bsr",MRw),		TS("movsb",MOVZ),	TNS("movswl",MOVZ),
-}, {
-/*  [C0]  */	TNS("xaddb",XADDB),	TS("xadd",RMw),		TNSZ("cmpps",XMMOPM,16),TNS("movnti",RM),
-/*  [C4]  */	TNSZ("pinsrw",MMOPRM,2),TNS("pextrw",MMO3P), 	TNSZ("shufps",XMMOPM,16),IND(dis_op0FC7),
-/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [D0]  */	INVALID,		TNSZ("psrlw",MMO,8),	TNSZ("psrld",MMO,8),	TNSZ("psrlq",MMO,8),
-/*  [D4]  */	TNSZ("paddq",MMO,8),	TNSZ("pmullw",MMO,8),	TNSZ("INVALID",MMO,0),	TNS("pmovmskb",MMOM3),
-/*  [D8]  */	TNSZ("psubusb",MMO,8),	TNSZ("psubusw",MMO,8),	TNSZ("pminub",MMO,8),	TNSZ("pand",MMO,8),
-/*  [DC]  */	TNSZ("paddusb",MMO,8),	TNSZ("paddusw",MMO,8),	TNSZ("pmaxub",MMO,8),	TNSZ("pandn",MMO,8),
-}, {
-/*  [E0]  */	TNSZ("pavgb",MMO,8),	TNSZ("psraw",MMO,8),	TNSZ("psrad",MMO,8),	TNSZ("pavgw",MMO,8),
-/*  [E4]  */	TNSZ("pmulhuw",MMO,8),	TNSZ("pmulhw",MMO,8),	TNS("INVALID",XMMO),	TNSZ("movntq",MMOMS,8),
-/*  [E8]  */	TNSZ("psubsb",MMO,8),	TNSZ("psubsw",MMO,8),	TNSZ("pminsw",MMO,8),	TNSZ("por",MMO,8),
-/*  [EC]  */	TNSZ("paddsb",MMO,8),	TNSZ("paddsw",MMO,8),	TNSZ("pmaxsw",MMO,8),	TNSZ("pxor",MMO,8),
-}, {
-/*  [F0]  */	INVALID,		TNSZ("psllw",MMO,8),	TNSZ("pslld",MMO,8),	TNSZ("psllq",MMO,8),
-/*  [F4]  */	TNSZ("pmuludq",MMO,8),	TNSZ("pmaddwd",MMO,8),	TNSZ("psadbw",MMO,8),	TNSZ("maskmovq",MMOIMPL,8),
-/*  [F8]  */	TNSZ("psubb",MMO,8),	TNSZ("psubw",MMO,8),	TNSZ("psubd",MMO,8),	TNSZ("psubq",MMO,8),
-/*  [FC]  */	TNSZ("paddb",MMO,8),	TNSZ("paddw",MMO,8),	TNSZ("paddd",MMO,8),	INVALID,
-} };
-
-
-/*
- *	Decode table for 0x80 opcodes
- */
-
-const instable_t dis_op80[8] = {
-
-/*  [0]  */	TNS("addb",IMlw),	TNS("orb",IMw),		TNS("adcb",IMlw),	TNS("sbbb",IMlw),
-/*  [4]  */	TNS("andb",IMw),	TNS("subb",IMlw),	TNS("xorb",IMw),	TNS("cmpb",IMlw),
-};
-
-
-/*
- *	Decode table for 0x81 opcodes.
- */
-
-const instable_t dis_op81[8] = {
-
-/*  [0]  */	TS("add",IMlw),		TS("or",IMw),		TS("adc",IMlw),		TS("sbb",IMlw),
-/*  [4]  */	TS("and",IMw),		TS("sub",IMlw),		TS("xor",IMw),		TS("cmp",IMlw),
-};
-
-
-/*
- *	Decode table for 0x82 opcodes.
- */
-
-const instable_t dis_op82[8] = {
-
-/*  [0]  */	TNSx("addb",IMlw),	TNSx("orb",IMlw),	TNSx("adcb",IMlw),	TNSx("sbbb",IMlw),
-/*  [4]  */	TNSx("andb",IMlw),	TNSx("subb",IMlw),	TNSx("xorb",IMlw),	TNSx("cmpb",IMlw),
-};
-/*
- *	Decode table for 0x83 opcodes.
- */
-
-const instable_t dis_op83[8] = {
-
-/*  [0]  */	TS("add",IMlw),		TS("or",IMlw),		TS("adc",IMlw),		TS("sbb",IMlw),
-/*  [4]  */	TS("and",IMlw),		TS("sub",IMlw),		TS("xor",IMlw),		TS("cmp",IMlw),
-};
-
-/*
- *	Decode table for 0xC0 opcodes.
- */
-
-const instable_t dis_opC0[8] = {
-
-/*  [0]  */	TNS("rolb",MvI),	TNS("rorb",MvI),	TNS("rclb",MvI),	TNS("rcrb",MvI),
-/*  [4]  */	TNS("shlb",MvI),	TNS("shrb",MvI),	INVALID,		TNS("sarb",MvI),
-};
-
-/*
- *	Decode table for 0xD0 opcodes.
- */
-
-const instable_t dis_opD0[8] = {
-
-/*  [0]  */	TNS("rolb",Mv),		TNS("rorb",Mv),		TNS("rclb",Mv),		TNS("rcrb",Mv),
-/*  [4]  */	TNS("shlb",Mv),		TNS("shrb",Mv),		TNS("salb",Mv),		TNS("sarb",Mv),
-};
-
-/*
- *	Decode table for 0xC1 opcodes.
- *	186 instruction set
- */
-
-const instable_t dis_opC1[8] = {
-
-/*  [0]  */	TS("rol",MvI),		TS("ror",MvI),		TS("rcl",MvI),		TS("rcr",MvI),
-/*  [4]  */	TS("shl",MvI),		TS("shr",MvI),		TS("sal",MvI),		TS("sar",MvI),
-};
-
-/*
- *	Decode table for 0xD1 opcodes.
- */
-
-const instable_t dis_opD1[8] = {
-
-/*  [0]  */	TS("rol",Mv),		TS("ror",Mv),		TS("rcl",Mv),		TS("rcr",Mv),
-/*  [4]  */	TS("shl",Mv),		TS("shr",Mv),		TS("sal",Mv),		TS("sar",Mv),
-};
-
-
-/*
- *	Decode table for 0xD2 opcodes.
- */
-
-const instable_t dis_opD2[8] = {
-
-/*  [0]  */	TNS("rolb",Mv),		TNS("rorb",Mv),		TNS("rclb",Mv),		TNS("rcrb",Mv),
-/*  [4]  */	TNS("shlb",Mv),		TNS("shrb",Mv),		TNS("salb",Mv),		TNS("sarb",Mv),
-};
-/*
- *	Decode table for 0xD3 opcodes.
- */
-
-const instable_t dis_opD3[8] = {
-
-/*  [0]  */	TS("rol",Mv),		TS("ror",Mv),		TS("rcl",Mv),		TS("rcr",Mv),
-/*  [4]  */	TS("shl",Mv),		TS("shr",Mv),		TS("salb",Mv),		TS("sar",Mv),
-};
-
-
-/*
- *	Decode table for 0xF6 opcodes.
- */
-
-const instable_t dis_opF6[8] = {
-
-/*  [0]  */	TNS("testb",IMw),	TNS("testb",IMw),	TNS("notb",Mw),		TNS("negb",Mw),
-/*  [4]  */	TNS("mulb",MA),		TNS("imulb",MA),	TNS("divb",MA),		TNS("idivb",MA),
-};
-
-
-/*
- *	Decode table for 0xF7 opcodes.
- */
-
-const instable_t dis_opF7[8] = {
-
-/*  [0]  */	TS("test",IMw),		TS("test",IMw),		TS("not",Mw),		TS("neg",Mw),
-/*  [4]  */	TS("mul",MA),		TS("imul",MA),		TS("div",MA),		TS("idiv",MA),
-};
-
-
-/*
- *	Decode table for 0xFE opcodes.
- */
-
-const instable_t dis_opFE[8] = {
-
-/*  [0]  */	TNS("incb",Mw),		TNS("decb",Mw),		INVALID,		INVALID,
-/*  [4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-};
-/*
- *	Decode table for 0xFF opcodes.
- */
-
-const instable_t dis_opFF[8] = {
-
-/*  [0]  */	TS("inc",Mw),		TS("dec",Mw),		TNSyp("call",INM),	TNS("lcall",INM),
-/*  [4]  */	TNSy("jmp",INM),	TNS("ljmp",INM),	TSp("push",M),		INVALID,
-};
-
-/* for 287 instructions, which are a mess to decode */
-
-const instable_t dis_opFP1n2[8][8] = {
-{
-/* bit pattern:	1101 1xxx MODxx xR/M */
-/*  [0,0] */	TNS("fadds",M),		TNS("fmuls",M),		TNS("fcoms",M),		TNS("fcomps",M),
-/*  [0,4] */	TNS("fsubs",M),		TNS("fsubrs",M),	TNS("fdivs",M),		TNS("fdivrs",M),
-}, {
-/*  [1,0]  */	TNS("flds",M),		INVALID,		TNS("fsts",M),		TNS("fstps",M),
-/*  [1,4]  */	TNSZ("fldenv",M,28),	TNSZ("fldcw",M,2),	TNSZ("fnstenv",M,28),	TNSZ("fnstcw",M,2),
-}, {
-/*  [2,0]  */	TNS("fiaddl",M),	TNS("fimull",M),	TNS("ficoml",M),	TNS("ficompl",M),
-/*  [2,4]  */	TNS("fisubl",M),	TNS("fisubrl",M),	TNS("fidivl",M),	TNS("fidivrl",M),
-}, {
-/*  [3,0]  */	TNS("fildl",M),		INVALID,		TNS("fistl",M),		TNS("fistpl",M),
-/*  [3,4]  */	INVALID,		TNSZ("fldt",M,10),	INVALID,		TNSZ("fstpt",M,10),
-}, {
-/*  [4,0]  */	TNSZ("faddl",M,8),	TNSZ("fmull",M,8),	TNSZ("fcoml",M,8),	TNSZ("fcompl",M,8),
-/*  [4,4]  */	TNSZ("fsubl",M,8),	TNSZ("fsubrl",M,8),	TNSZ("fdivl",M,8),	TNSZ("fdivrl",M,8),
-}, {
-/*  [5,0]  */	TNSZ("fldl",M,8),	INVALID,		TNSZ("fstl",M,8),	TNSZ("fstpl",M,8),
-/*  [5,4]  */	TNSZ("frstor",M,108),	INVALID,		TNSZ("fnsave",M,108),	TNSZ("fnstsw",M,2),
-}, {
-/*  [6,0]  */	TNSZ("fiadd",M,2),	TNSZ("fimul",M,2),	TNSZ("ficom",M,2),	TNSZ("ficomp",M,2),
-/*  [6,4]  */	TNSZ("fisub",M,2),	TNSZ("fisubr",M,2),	TNSZ("fidiv",M,2),	TNSZ("fidivr",M,2),
-}, {
-/*  [7,0]  */	TNSZ("fild",M,2),	INVALID,		TNSZ("fist",M,2),	TNSZ("fistp",M,2),
-/*  [7,4]  */	TNSZ("fbld",M,10),	TNSZ("fildll",M,8),	TNSZ("fbstp",M,10),	TNSZ("fistpll",M,8),
-} };
-
-const instable_t dis_opFP3[8][8] = {
-{
-/* bit  pattern:	1101 1xxx 11xx xREG */
-/*  [0,0]  */	TNS("fadd",FF),		TNS("fmul",FF),		TNS("fcom",F),		TNS("fcomp",F),
-/*  [0,4]  */	TNS("fsub",FF),		TNS("fsubr",FF),	TNS("fdiv",FF),		TNS("fdivr",FF),
-}, {
-/*  [1,0]  */	TNS("fld",F),		TNS("fxch",F),		TNS("fnop",NORM),	TNS("fstp",F),
-/*  [1,4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [2,0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [2,4]  */	INVALID,		TNS("fucompp",NORM),	INVALID,		INVALID,
-}, {
-/*  [3,0]  */	INVALID,		INVALID,		INVALID,		INVALID,
-/*  [3,4]  */	INVALID,		INVALID,		INVALID,		INVALID,
-}, {
-/*  [4,0]  */	TNS("fadd",FF),		TNS("fmul",FF),		TNS("fcom",F),		TNS("fcomp",F),
-/*  [4,4]  */	TNS("fsub",FF),		TNS("fsubr",FF),	TNS("fdiv",FF),		TNS("fdivr",FF),
-}, {
-/*  [5,0]  */	TNS("ffree",F),		TNS("fxch",F),		TNS("fst",F),		TNS("fstp",F),
-/*  [5,4]  */	TNS("fucom",F),		TNS("fucomp",F),	INVALID,		INVALID,
-}, {
-/*  [6,0]  */	TNS("faddp",FF),	TNS("fmulp",FF),	TNS("fcomp",F),		TNS("fcompp",NORM),
-/*  [6,4]  */	TNS("fsubp",FF),	TNS("fsubrp",FF),	TNS("fdivp",FF),	TNS("fdivrp",FF),
-}, {
-/*  [7,0]  */	TNS("ffree",F),		TNS("fxch",F),		TNS("fstp",F),		TNS("fstp",F),
-/*  [7,4]  */	TNS("fnstsw",M),	TNS("fucomip",FFC),	TNS("fcomip",FFC),	INVALID,
-} };
-
-const instable_t dis_opFP4[4][8] = {
-{
-/* bit pattern:	1101 1001 111x xxxx */
-/*  [0,0]  */	TNS("fchs",NORM),	TNS("fabs",NORM),	INVALID,		INVALID,
-/*  [0,4]  */	TNS("ftst",NORM),	TNS("fxam",NORM),	TNS("ftstp",NORM),	INVALID,
-}, {
-/*  [1,0]  */	TNS("fld1",NORM),	TNS("fldl2t",NORM),	TNS("fldl2e",NORM),	TNS("fldpi",NORM),
-/*  [1,4]  */	TNS("fldlg2",NORM),	TNS("fldln2",NORM),	TNS("fldz",NORM),	INVALID,
-}, {
-/*  [2,0]  */	TNS("f2xm1",NORM),	TNS("fyl2x",NORM),	TNS("fptan",NORM),	TNS("fpatan",NORM),
-/*  [2,4]  */	TNS("fxtract",NORM),	TNS("fprem1",NORM),	TNS("fdecstp",NORM),	TNS("fincstp",NORM),
-}, {
-/*  [3,0]  */	TNS("fprem",NORM),	TNS("fyl2xp1",NORM),	TNS("fsqrt",NORM),	TNS("fsincos",NORM),
-/*  [3,4]  */	TNS("frndint",NORM),	TNS("fscale",NORM),	TNS("fsin",NORM),	TNS("fcos",NORM),
-} };
-
-const instable_t dis_opFP5[8] = {
-/* bit pattern:	1101 1011 111x xxxx */
-/*  [0]  */	TNS("feni",NORM),	TNS("fdisi",NORM),	TNS("fnclex",NORM),	TNS("fninit",NORM),
-/*  [4]  */	TNS("fsetpm",NORM),	TNS("frstpm",NORM),	INVALID,		INVALID,
-};
-
-const instable_t dis_opFP6[8] = {
-/* bit pattern:	1101 1011 11yy yxxx */
-/*  [00]  */	TNS("fcmov.nb",FF),	TNS("fcmov.ne",FF),	TNS("fcmov.nbe",FF),	TNS("fcmov.nu",FF),
-/*  [04]  */	INVALID,		TNS("fucomi",F),	TNS("fcomi",F),		INVALID,
-};
-
-const instable_t dis_opFP7[8] = {
-/* bit pattern:	1101 1010 11yy yxxx */
-/*  [00]  */	TNS("fcmov.b",FF),	TNS("fcmov.e",FF),	TNS("fcmov.be",FF),	TNS("fcmov.u",FF),
-/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
-};
-
-/*
- *	Main decode table for the op codes.  The first two nibbles
- *	will be used as an index into the table.  If there is
- *	a need to further decode an instruction, the array to be
- *	referenced is indicated with the other two entries being
- *	empty.
- */
-
-const instable_t dis_distable[16][16] = {
-{
-/* [0,0] */	TNS("addb",RMw),	TS("add",RMw),		TNS("addb",MRw),	TS("add",MRw),
-/* [0,4] */	TNS("addb",IA),		TS("add",IA),		TSx("push",SEG),	TSx("pop",SEG),
-/* [0,8] */	TNS("orb",RMw),		TS("or",RMw),		TNS("orb",MRw),		TS("or",MRw),
-/* [0,C] */	TNS("orb",IA),		TS("or",IA),		TSx("push",SEG),	IND(&dis_op0F[0][0]),
-}, {
-/* [1,0] */	TNS("adcb",RMw),	TS("adc",RMw),		TNS("adcb",MRw),	TS("adc",MRw),
-/* [1,4] */	TNS("adcb",IA),		TS("adc",IA),		TSx("push",SEG),	TSx("pop",SEG),
-/* [1,8] */	TNS("sbbb",RMw),	TS("sbb",RMw),		TNS("sbbb",MRw),	TS("sbb",MRw),
-/* [1,C] */	TNS("sbbb",IA),		TS("sbb",IA),		TSx("push",SEG),	TSx("pop",SEG),
-}, {
-/* [2,0] */	TNS("andb",RMw),	TS("and",RMw),		TNS("andb",MRw),	TS("and",MRw),
-/* [2,4] */	TNS("andb",IA),		TS("and",IA),		TNSx("%es:",OVERRIDE),	TNSx("daa",NORM),
-/* [2,8] */	TNS("subb",RMw),	TS("sub",RMw),		TNS("subb",MRw),	TS("sub",MRw),
-/* [2,C] */	TNS("subb",IA),		TS("sub",IA),		TNSx("%cs:",OVERRIDE),	TNSx("das",NORM),
-}, {
-/* [3,0] */	TNS("xorb",RMw),	TS("xor",RMw),		TNS("xorb",MRw),	TS("xor",MRw),
-/* [3,4] */	TNS("xorb",IA),		TS("xor",IA),		TNSx("%ss:",OVERRIDE),	TNSx("aaa",NORM),
-/* [3,8] */	TNS("cmpb",RMw),	TS("cmp",RMw),		TNS("cmpb",MRw),	TS("cmp",MRw),
-/* [3,C] */	TNS("cmpb",IA),		TS("cmp",IA),		TNSx("%ds:",OVERRIDE),	TNSx("aas",NORM),
-}, {
-/* [4,0] */	TSx("inc",R),		TSx("inc",R),		TSx("inc",R),		TSx("inc",R),
-/* [4,4] */	TSx("inc",R),		TSx("inc",R),		TSx("inc",R),		TSx("inc",R),
-/* [4,8] */	TSx("dec",R),		TSx("dec",R),		TSx("dec",R),		TSx("dec",R),
-/* [4,C] */	TSx("dec",R),		TSx("dec",R),		TSx("dec",R),		TSx("dec",R),
-}, {
-/* [5,0] */	TSp("push",R),		TSp("push",R),		TSp("push",R),		TSp("push",R),
-/* [5,4] */	TSp("push",R),		TSp("push",R),		TSp("push",R),		TSp("push",R),
-/* [5,8] */	TSp("pop",R),		TSp("pop",R),		TSp("pop",R),		TSp("pop",R),
-/* [5,C] */	TSp("pop",R),		TSp("pop",R),		TSp("pop",R),		TSp("pop",R),
-}, {
-/* [6,0] */	TSZx("pusha",IMPLMEM,28),TSZx("popa",IMPLMEM,28), TSx("bound",MR),	TNS("arpl",RMw),
-/* [6,4] */	TNS("%fs:",OVERRIDE),	TNS("%gs:",OVERRIDE),	TNS("data16",DM),	TNS("addr16",AM),
-/* [6,8] */	TSp("push",I),		TS("imul",IMUL),	TSp("push",Ib),	TS("imul",IMUL),
-/* [6,C] */	TNSZ("insb",IMPLMEM,1),	TSZ("ins",IMPLMEM,4),	TNSZ("outsb",IMPLMEM,1),TSZ("outs",IMPLMEM,4),
-}, {
-/* [7,0] */	TNSy("jo",BD),		TNSy("jno",BD),		TNSy("jb",BD),		TNSy("jae",BD),
-/* [7,4] */	TNSy("je",BD),		TNSy("jne",BD),		TNSy("jbe",BD),		TNSy("ja",BD),
-/* [7,8] */	TNSy("js",BD),		TNSy("jns",BD),		TNSy("jp",BD),		TNSy("jnp",BD),
-/* [7,C] */	TNSy("jl",BD),		TNSy("jge",BD),		TNSy("jle",BD),		TNSy("jg",BD),
-}, {
-/* [8,0] */	IND(dis_op80),		IND(dis_op81),		INDx(dis_op82),		IND(dis_op83),
-/* [8,4] */	TNS("testb",RMw),	TS("test",RMw),		TNS("xchgb",RMw),	TS("xchg",RMw),
-/* [8,8] */	TNS("movb",RMw),	TS("mov",RMw),		TNS("movb",MRw),	TS("mov",MRw),
-/* [8,C] */	TNS("movw",SM),		TS("lea",MR),		TNS("movw",MS),		TSp("pop",M),
-}, {
-/* [9,0] */	TNS("nop",NORM),	TS("xchg",RA),		TS("xchg",RA),		TS("xchg",RA),
-/* [9,4] */	TS("xchg",RA),		TS("xchg",RA),		TS("xchg",RA),		TS("xchg",RA),
-/* [9,8] */	TNS("cXtX",CBW),	TNS("cXtX",CWD),	TNSx("lcall",SO),	TNS("fwait",NORM),
-/* [9,C] */	TSZy("pushf",IMPLMEM,4),TSZy("popf",IMPLMEM,4),	TNSx("sahf",NORM),	TNSx("lahf",NORM),
-}, {
-/* [A,0] */	TNS("movb",OA),		TS("mov",OA),		TNS("movb",AO),		TS("mov",AO),
-/* [A,4] */	TNSZ("movsb",SD,1),	TS("movs",SD),		TNSZ("cmpsb",SD,1),	TS("cmps",SD),
-/* [A,8] */	TNS("testb",IA),	TS("test",IA),		TNS("stosb",AD),	TS("stos",AD),
-/* [A,C] */	TNS("lodsb",SA),	TS("lods",SA),		TNS("scasb",AD),	TS("scas",AD),
-}, {
-/* [B,0] */	TNS("movb",IR),		TNS("movb",IR),		TNS("movb",IR),		TNS("movb",IR),
-/* [B,4] */	TNS("movb",IR),		TNS("movb",IR),		TNS("movb",IR),		TNS("movb",IR),
-/* [B,8] */	TS("mov",IR),		TS("mov",IR),		TS("mov",IR),		TS("mov",IR),
-/* [B,C] */	TS("mov",IR),		TS("mov",IR),		TS("mov",IR),		TS("mov",IR),
-}, {
-/* [C,0] */	IND(dis_opC0),		IND(dis_opC1), 		TNSyp("ret",RET),	TNSyp("ret",NORM),
-/* [C,4] */	TNSx("les",MR),		TNSx("lds",MR),		TNS("movb",IMw),	TS("mov",IMw),
-/* [C,8] */	TNSyp("enter",ENTER),	TNSyp("leave",NORM),	TNS("lret",RET),	TNS("lret",NORM),
-/* [C,C] */	TNS("int",INT3),	TNS("int",INTx),	TNSx("into",NORM),	TNS("iret",NORM),
-}, {
-/* [D,0] */	IND(dis_opD0),		IND(dis_opD1),		IND(dis_opD2),		IND(dis_opD3),
-/* [D,4] */	TNSx("aam",U),		TNSx("aad",U),		TNSx("falc",NORM),	TNSZ("xlat",IMPLMEM,1),
-
-/* 287 instructions.  Note that although the indirect field		*/
-/* indicates opFP1n2 for further decoding, this is not necessarily	*/
-/* the case since the opFP arrays are not partitioned according to key1	*/
-/* and key2.  opFP1n2 is given only to indicate that we haven't		*/
-/* finished decoding the instruction.					*/
-/* [D,8] */	IND(&dis_opFP1n2[0][0]),	IND(&dis_opFP1n2[0][0]),	IND(&dis_opFP1n2[0][0]),	IND(&dis_opFP1n2[0][0]),
-/* [D,C] */	IND(&dis_opFP1n2[0][0]),	IND(&dis_opFP1n2[0][0]),	IND(&dis_opFP1n2[0][0]),	IND(&dis_opFP1n2[0][0]),
-}, {
-/* [E,0] */	TNSy("loopnz",BD),	TNSy("loopz",BD),	TNSy("loop",BD),	TNSy("jcxz",BD),
-/* [E,4] */	TNS("inb",P),		TS("in",P),		TNS("outb",P),		TS("out",P),
-/* [E,8] */	TNSyp("call",D),	TNSy("jmp",D),		TNSx("ljmp",SO),		TNSy("jmp",BD),
-/* [E,C] */	TNS("inb",V),		TS("in",V),		TNS("outb",V),		TS("out",V),
-}, {
-/* [F,0] */	TNS("lock",LOCK),	TNS("icebp", NORM),	TNS("repnz",PREFIX),	TNS("repz",PREFIX),
-/* [F,4] */	TNS("hlt",NORM),	TNS("cmc",NORM),	IND(dis_opF6),		IND(dis_opF7),
-/* [F,8] */	TNS("clc",NORM),	TNS("stc",NORM),	TNS("cli",NORM),	TNS("sti",NORM),
-/* [F,C] */	TNS("cld",NORM),	TNS("std",NORM),	IND(dis_opFE),		IND(dis_opFF),
-} };
-
-/* END CSTYLED */
-
-/*
- * common functions to decode and disassemble an x86 or amd64 instruction
- */
-
-/*
- * These are the individual fields of a REX prefix. Note that a REX
- * prefix with none of these set is still needed to:
- *	- use the MOVSXD (sign extend 32 to 64 bits) instruction
- *	- access the %sil, %dil, %bpl, %spl registers
- */
-#define	REX_W 0x08	/* 64 bit operand size when set */
-#define	REX_R 0x04	/* high order bit extension of ModRM reg field */
-#define	REX_X 0x02	/* high order bit extension of SIB index field */
-#define	REX_B 0x01	/* extends ModRM r_m, SIB base, or opcode reg */
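-
-/*
- * A minimal sketch of how these bits are consumed (rex_extend() is
- * hypothetical, not part of this file; dtrace_rex_adjust() below is the
- * real code path): each of REX_R/REX_X/REX_B simply supplies bit 3 of
- * one of the 3 bit register fields.
- */
-static uint_t
-rex_extend(uint_t rex, uint_t rexbit, uint_t field)
-{
-	/* rexbit is one of REX_R, REX_X or REX_B; field is 3 bits wide */
-	return (((rex & rexbit) ? 8 : 0) | (field & 0x7));
-}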
-
-static uint_t opnd_size;	/* SIZE16, SIZE32 or SIZE64 */
-static uint_t addr_size;	/* SIZE16, SIZE32 or SIZE64 */
-
-/*
- * Even in 64 bit mode, usually only 4 byte immediate operands are supported.
- */
-static int isize[] = {1, 2, 4, 4};
-static int isize64[] = {1, 2, 4, 8};
-
-/*
- * Just a bunch of useful macros.
- */
-#define	WBIT(x)	(x & 0x1)		/* to get w bit	*/
-#define	REGNO(x) (x & 0x7)		/* to get 3 bit register */
-#define	VBIT(x)	((x)>>1 & 0x1)		/* to get 'v' bit */
-#define	OPSIZE(osize, wbit) ((wbit) ? isize[osize] : 1)
-#define	OPSIZE64(osize, wbit) ((wbit) ? isize64[osize] : 1)
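-
-/*
- * Worked example, assuming the apparent size encoding implied by the
- * arrays above (SIZE16 == 1, SIZE32 == 2, SIZE64 == 3):
- * OPSIZE(SIZE32, 1) is isize[2] == 4, OPSIZE64(SIZE64, 1) is
- * isize64[3] == 8, and a wbit of 0 always selects a single byte.
- */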
-
-#define	REG_ONLY 3	/* mode to indicate a register operand (not memory) */
-
-#define	BYTE_OPND	0	/* w-bit value indicating byte register */
-#define	LONG_OPND	1	/* w-bit value indicating opnd_size register */
-#define	MM_OPND		2	/* "value" used to indicate a mmx reg */
-#define	XMM_OPND	3	/* "value" used to indicate a xmm reg */
-#define	SEG_OPND	4	/* "value" used to indicate a segment reg */
-#define	CONTROL_OPND	5	/* "value" used to indicate a control reg */
-#define	DEBUG_OPND	6	/* "value" used to indicate a debug reg */
-#define	TEST_OPND	7	/* "value" used to indicate a test reg */
-#define	WORD_OPND	8	/* w-bit value indicating word size reg */
-
-/*
- * Get the next byte and separate the op code into the high and low nibbles.
- */
-static int
-dtrace_get_opcode(dis86_t *x, uint_t *high, uint_t *low)
-{
-	int byte;
-
-	/*
-	 * x86 instructions have a maximum length of 15 bytes.  Bail out if
-	 * we try to read more.
-	 */
-	if (x->d86_len >= 15)
-		return (x->d86_error = 1);
-
-	if (x->d86_error)
-		return (1);
-	byte = x->d86_get_byte(x->d86_data);
-	if (byte < 0)
-		return (x->d86_error = 1);
-	x->d86_bytes[x->d86_len++] = byte;
-	*low = byte & 0xf;		/* ----xxxx low 4 bits */
-	*high = byte >> 4 & 0xf;	/* xxxx---- bits 7 to 4 */
-	return (0);
-}
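-
-/*
- * For example, the byte 0x0f splits into *high == 0x0 and *low == 0xf,
- * selecting the two byte opcode escape entry in dis_distable above.
- */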
-
-/*
- * Get and decode an SIB (scaled index base) byte
- */
-static void
-dtrace_get_SIB(dis86_t *x, uint_t *ss, uint_t *index, uint_t *base)
-{
-	int byte;
-
-	if (x->d86_error)
-		return;
-
-	byte = x->d86_get_byte(x->d86_data);
-	if (byte < 0) {
-		x->d86_error = 1;
-		return;
-	}
-	x->d86_bytes[x->d86_len++] = byte;
-
-	*base = byte & 0x7;
-	*index = (byte >> 3) & 0x7;
-	*ss = (byte >> 6) & 0x3;
-}
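-
-/*
- * For example, an SIB byte of 0x9c (binary 10 011 100) yields
- * ss == 2 (scale by 4), index == 3 and base == 4.
- */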
-
-/*
- * Get the byte following the op code and separate it into the
- * mode, register, and r/m fields.
- */
-static void
-dtrace_get_modrm(dis86_t *x, uint_t *mode, uint_t *reg, uint_t *r_m)
-{
-	if (x->d86_got_modrm == 0) {
-		if (x->d86_rmindex == -1)
-			x->d86_rmindex = x->d86_len;
-		dtrace_get_SIB(x, mode, reg, r_m);
-		x->d86_got_modrm = 1;
-	}
-}
-
-/*
- * Adjust register selection based on any REX prefix bits present.
- */
-/*ARGSUSED*/
-static void
-dtrace_rex_adjust(uint_t rex_prefix, uint_t mode, uint_t *reg, uint_t *r_m)
-{
-	if (reg != NULL && r_m == NULL) {
-		if (rex_prefix & REX_B)
-			*reg += 8;
-	} else {
-		if (reg != NULL && (REX_R & rex_prefix) != 0)
-			*reg += 8;
-		if (r_m != NULL && (REX_B & rex_prefix) != 0)
-			*r_m += 8;
-	}
-}
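-
-/*
- * For example, with rex_prefix == 0x44 (REX.R set) and a ModRM reg
- * field of 1, *reg becomes 9 (%r9 rather than %rcx at 64 bit
- * operand size).
- */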
-
-/*
- * Get an immediate operand of the given size, with sign extension.
- */
-static void
-dtrace_imm_opnd(dis86_t *x, int wbit, int size, int opindex)
-{
-	int i;
-	int byte;
-	int valsize = 0;
-
-	if (x->d86_numopnds < opindex + 1)
-		x->d86_numopnds = opindex + 1;
-
-	switch (wbit) {
-	case BYTE_OPND:
-		valsize = 1;
-		break;
-	case LONG_OPND:
-		if (x->d86_opnd_size == SIZE16)
-			valsize = 2;
-		else if (x->d86_opnd_size == SIZE32)
-			valsize = 4;
-		else
-			valsize = 8;
-		break;
-	case MM_OPND:
-	case XMM_OPND:
-	case SEG_OPND:
-	case CONTROL_OPND:
-	case DEBUG_OPND:
-	case TEST_OPND:
-		valsize = size;
-		break;
-	case WORD_OPND:
-		valsize = 2;
-		break;
-	}
-	if (valsize < size)
-		valsize = size;
-
-	if (x->d86_error)
-		return;
-	x->d86_opnd[opindex].d86_value = 0;
-	for (i = 0; i < size; ++i) {
-		byte = x->d86_get_byte(x->d86_data);
-		if (byte < 0) {
-			x->d86_error = 1;
-			return;
-		}
-		x->d86_bytes[x->d86_len++] = byte;
-		x->d86_opnd[opindex].d86_value |= (uint64_t)byte << (i * 8);
-	}
-	/* Do sign extension */
-	if (x->d86_bytes[x->d86_len - 1] & 0x80) {
-		for (; i < valsize; i++)
-			x->d86_opnd[opindex].d86_value |=
-			    (uint64_t)0xff << (i* 8);
-	}
-#ifdef DIS_TEXT
-	x->d86_opnd[opindex].d86_mode = MODE_SIGNED;
-	x->d86_opnd[opindex].d86_value_size = valsize;
-	x->d86_imm_bytes += size;
-#endif
-}
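-
-/*
- * Sign extension example: a 1 byte immediate of 0x80 read with
- * valsize == 4 comes out of the loop above as 0xffffff80.
- */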
-
-/*
- * Get an ip relative operand of the given size, with sign extension.
- */
-static void
-dtrace_disp_opnd(dis86_t *x, int wbit, int size, int opindex)
-{
-	dtrace_imm_opnd(x, wbit, size, opindex);
-#ifdef DIS_TEXT
-	x->d86_opnd[opindex].d86_mode = MODE_IPREL;
-#endif
-}
-
-/*
- * Check to see if there is a segment override prefix pending.
- * If so, print it in the current 'operand' location and set
- * the override flag back to false.
- */
-/*ARGSUSED*/
-static void
-dtrace_check_override(dis86_t *x, int opindex)
-{
-#ifdef DIS_TEXT
-	if (x->d86_seg_prefix) {
-		(void) strlcat(x->d86_opnd[opindex].d86_prefix,
-		    x->d86_seg_prefix, PFIXLEN);
-	}
-#endif
-	x->d86_seg_prefix = NULL;
-}
-
-
-/*
- * Process a single instruction Register or Memory operand.
- *
- * mode = addressing mode from ModRM byte
- * r_m = r_m (or reg if mode == 3) field from ModRM byte
- * wbit = indicates which register set (8 bit, 16 bit, ... MMX, etc.) to use.
- * opindex = index of the operand being processed (0, 1 or 2)
- *
- * The value of reg or r_m must already have been adjusted for any REX prefix.
- */
-/*ARGSUSED*/
-static void
-dtrace_get_operand(dis86_t *x, uint_t mode, uint_t r_m, int wbit, int opindex)
-{
-	int have_SIB = 0;	/* flag presence of scale-index-byte */
-	uint_t ss;		/* scale-factor from opcode */
-	uint_t index;		/* index register number */
-	uint_t base;		/* base register number */
-	int dispsize;   	/* size of displacement in bytes */
-#ifdef DIS_TEXT
-	char *opnd = x->d86_opnd[opindex].d86_opnd;
-#endif
-
-	if (x->d86_numopnds < opindex + 1)
-		x->d86_numopnds = opindex + 1;
-
-	if (x->d86_error)
-		return;
-
-	/*
-	 * first handle a simple register
-	 */
-	if (mode == REG_ONLY) {
-#ifdef DIS_TEXT
-		switch (wbit) {
-		case MM_OPND:
-			(void) strlcat(opnd, dis_MMREG[r_m], OPLEN);
-			break;
-		case XMM_OPND:
-			(void) strlcat(opnd, dis_XMMREG[r_m], OPLEN);
-			break;
-		case SEG_OPND:
-			(void) strlcat(opnd, dis_SEGREG[r_m], OPLEN);
-			break;
-		case CONTROL_OPND:
-			(void) strlcat(opnd, dis_CONTROLREG[r_m], OPLEN);
-			break;
-		case DEBUG_OPND:
-			(void) strlcat(opnd, dis_DEBUGREG[r_m], OPLEN);
-			break;
-		case TEST_OPND:
-			(void) strlcat(opnd, dis_TESTREG[r_m], OPLEN);
-			break;
-		case BYTE_OPND:
-			if (x->d86_rex_prefix == 0)
-				(void) strlcat(opnd, dis_REG8[r_m], OPLEN);
-			else
-				(void) strlcat(opnd, dis_REG8_REX[r_m], OPLEN);
-			break;
-		case WORD_OPND:
-			(void) strlcat(opnd, dis_REG16[r_m], OPLEN);
-			break;
-		case LONG_OPND:
-			if (x->d86_opnd_size == SIZE16)
-				(void) strlcat(opnd, dis_REG16[r_m], OPLEN);
-			else if (x->d86_opnd_size == SIZE32)
-				(void) strlcat(opnd, dis_REG32[r_m], OPLEN);
-			else
-				(void) strlcat(opnd, dis_REG64[r_m], OPLEN);
-			break;
-		}
-#endif /* DIS_TEXT */
-		return;
-	}
-
-	/*
-	 * if symbolic representation, skip override prefix, if any
-	 */
-	dtrace_check_override(x, opindex);
-
-	/*
-	 * Handle 16 bit memory references first, since they decode
-	 * the mode values more simply.
-	 * mode 1 is r_m + 8 bit displacement
-	 * mode 2 is r_m + 16 bit displacement
-	 * mode 0 is just r_m, unless r_m is 6 which is 16 bit disp
-	 */
-	if (x->d86_addr_size == SIZE16) {
-		if ((mode == 0 && r_m == 6) || mode == 2)
-			dtrace_imm_opnd(x, WORD_OPND, 2, opindex);
-		else if (mode == 1)
-			dtrace_imm_opnd(x, BYTE_OPND, 1, opindex);
-#ifdef DIS_TEXT
-		if (mode == 0 && r_m == 6)
-			x->d86_opnd[opindex].d86_mode = MODE_SIGNED;
-		else if (mode == 0)
-			x->d86_opnd[opindex].d86_mode = MODE_NONE;
-		else
-			x->d86_opnd[opindex].d86_mode = MODE_OFFSET;
-		(void) strlcat(opnd, dis_addr16[mode][r_m], OPLEN);
-#endif
-		return;
-	}
-
-	/*
-	 * 32 and 64 bit addressing modes are more complex since they
-	 * can involve an SIB (scaled index and base) byte to decode.
-	 */
-	if (r_m == ESP_REGNO || r_m == ESP_REGNO + 8) {
-		have_SIB = 1;
-		dtrace_get_SIB(x, &ss, &index, &base);
-		if (x->d86_error)
-			return;
-		if (base != 5 || mode != 0)
-			if (x->d86_rex_prefix & REX_B)
-				base += 8;
-		if (x->d86_rex_prefix & REX_X)
-			index += 8;
-	} else {
-		base = r_m;
-	}
-
-	/*
-	 * Compute the displacement size and get its bytes
-	 */
-	dispsize = 0;
-
-	if (mode == 1)
-		dispsize = 1;
-	else if (mode == 2)
-		dispsize = 4;
-	else if ((r_m & 7) == EBP_REGNO ||
-	    (have_SIB && (base & 7) == EBP_REGNO))
-		dispsize = 4;
-
-	if (dispsize > 0) {
-		dtrace_imm_opnd(x, dispsize == 4 ? LONG_OPND : BYTE_OPND,
-		    dispsize, opindex);
-		if (x->d86_error)
-			return;
-	}
-
-#ifdef DIS_TEXT
-	if (dispsize > 0)
-		x->d86_opnd[opindex].d86_mode = MODE_OFFSET;
-
-	if (have_SIB == 0) {
-		if (x->d86_mode == SIZE32) {
-			if (mode == 0)
-				(void) strlcat(opnd, dis_addr32_mode0[r_m],
-				    OPLEN);
-			else
-				(void) strlcat(opnd, dis_addr32_mode12[r_m],
-				    OPLEN);
-		} else {
-			if (mode == 0)
-				(void) strlcat(opnd, dis_addr64_mode0[r_m],
-				    OPLEN);
-			else
-				(void) strlcat(opnd, dis_addr64_mode12[r_m],
-				    OPLEN);
-		}
-	} else {
-		uint_t need_paren = 0;
-		char **regs;
-		if (x->d86_mode == SIZE32) /* NOTE this is not addr_size! */
-			regs = (char **)dis_REG32;
-		else
-			regs = (char **)dis_REG64;
-
-		/*
-		 * print the base (if any)
-		 */
-		if (base == EBP_REGNO && mode == 0) {
-			if (index != ESP_REGNO) {
-				(void) strlcat(opnd, "(", OPLEN);
-				need_paren = 1;
-			}
-		} else {
-			(void) strlcat(opnd, "(", OPLEN);
-			(void) strlcat(opnd, regs[base], OPLEN);
-			need_paren = 1;
-		}
-
-		/*
-		 * print the index (if any)
-		 */
-		if (index != ESP_REGNO) {
-			(void) strlcat(opnd, ",", OPLEN);
-			(void) strlcat(opnd, regs[index], OPLEN);
-			(void) strlcat(opnd, dis_scale_factor[ss], OPLEN);
-		} else
-			if (need_paren)
-				(void) strlcat(opnd, ")", OPLEN);
-	}
-#endif
-}
-
-/*
- * Operand sequence for standard instruction involving one register
- * and one register/memory operand.
- * wbit indicates a byte(0) or opnd_size(1) operation
- * vbit indicates direction: 0 for "opcode r, r_m", 1 for "opcode r_m, r"
- */
-#define	STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, vbit)  {	\
-		dtrace_get_modrm(x, &mode, &reg, &r_m);			\
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);	\
-		dtrace_get_operand(x, mode, r_m, wbit, vbit);		\
-		dtrace_get_operand(x, REG_ONLY, reg, wbit, 1 - vbit);	\
-}
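-
-/*
- * Expansion sketch: for "add %ecx, (%rax)" (opcode 0x01, ModRM 0x08:
- * mode 0, reg 1, r_m 0) a vbit of 1 places the memory operand (%rax)
- * in slot 1 (the destination) and %ecx in slot 0.
- */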
-
-/*
- * Similar to above, but allows for the two operands to be of different
- * classes (ie. wbit).
- *	wbit is for the r_m operand
- *	w2 is for the reg operand
- */
-#define	MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, w2, vbit)	{	\
-		dtrace_get_modrm(x, &mode, &reg, &r_m);			\
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);	\
-		dtrace_get_operand(x, mode, r_m, wbit, vbit);		\
-		dtrace_get_operand(x, REG_ONLY, reg, w2, 1 - vbit);	\
-}
-
-/*
- * Similar, but for 2 operands plus an immediate.
- */
-#define	THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, immsize) { \
-		dtrace_get_modrm(x, &mode, &reg, &r_m);			\
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);	\
-		dtrace_get_operand(x, mode, r_m, wbit, 1);		\
-		dtrace_get_operand(x, REG_ONLY, reg, w2, 2);		\
-		dtrace_imm_opnd(x, wbit, immsize, 0);			\
-}
-
-/*
- * Disassemble a single x86 or amd64 instruction.
- *
- * Mode determines the default operating mode (SIZE16, SIZE32 or SIZE64)
- * for interpreting instructions.
- *
- * returns non-zero for bad opcode
- */
-int
-dtrace_disx86(dis86_t *x, uint_t cpu_mode)
-{
-	const instable_t *dp = NULL;	/* decode table being used */
-#ifdef DIS_TEXT
-	uint_t i;
-#endif
-#ifdef DIS_MEM
-	uint_t nomem = 0;
-#define	NOMEM	(nomem = 1)
-#else
-#define	NOMEM	/* nothing */
-#endif
-	uint_t wbit = 0;	/* opcode wbit, 0 is 8 bit, !0 for opnd_size */
-	uint_t w2;		/* wbit value for second operand */
-	uint_t vbit;
-	uint_t mode = 0;	/* mode value from ModRM byte */
-	uint_t reg;		/* reg value from ModRM byte */
-	uint_t r_m;		/* r_m value from ModRM byte */
-
-	uint_t opcode1;		/* high nibble of 1st byte */
-	uint_t opcode2;		/* low nibble of 1st byte */
-	uint_t opcode3;		/* extra opcode bits usually from ModRM byte */
-	uint_t opcode4;		/* high nibble of 2nd byte */
-	uint_t opcode5 = 0xff;	/* low nibble of 2nd byte */
-	uint_t opcode6;		/* high nibble of 3rd byte */
-	uint_t opcode7 = 0xff;	/* low nibble of 3rd byte */
-	uint_t opcode_bytes = 1;
-
-	/*
-	 * legacy prefixes come in 5 flavors; there should be at most one of each
-	 */
-	uint_t	opnd_size_prefix = 0;
-	uint_t	addr_size_prefix = 0;
-	uint_t	segment_prefix = 0;
-	uint_t	lock_prefix = 0;
-	uint_t	rep_prefix = 0;
-	uint_t	rex_prefix = 0;	/* amd64 register extension prefix */
-	size_t	off;
-
-	x->d86_len = 0;
-	x->d86_rmindex = -1;
-	x->d86_error = 0;
-#ifdef DIS_TEXT
-	x->d86_numopnds = 0;
-	x->d86_seg_prefix = NULL;
-	x->d86_mneu[0] = 0;
-	for (i = 0; i < 3; ++i) {
-		x->d86_opnd[i].d86_opnd[0] = 0;
-		x->d86_opnd[i].d86_prefix[0] = 0;
-		x->d86_opnd[i].d86_value_size = 0;
-		x->d86_opnd[i].d86_value = 0;
-		x->d86_opnd[i].d86_mode = MODE_NONE;
-	}
-#endif
-	x->d86_error = 0;
-	x->d86_memsize = 0;
-
-	if (cpu_mode == SIZE16) {
-		opnd_size = SIZE16;
-		addr_size = SIZE16;
-	} else if (cpu_mode == SIZE32) {
-		opnd_size = SIZE32;
-		addr_size = SIZE32;
-	} else {
-		opnd_size = SIZE32;
-		addr_size = SIZE64;
-	}
-
-	/*
-	 * Get one opcode byte and check for zero padding that follows
-	 * jump tables.
-	 */
-	if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0)
-		goto error;
-
-	if (opcode1 == 0 && opcode2 == 0 &&
-	    x->d86_check_func != NULL && x->d86_check_func(x->d86_data)) {
-#ifdef DIS_TEXT
-		(void) strncpy(x->d86_mneu, ".byte\t0", OPLEN);
-#endif
-		goto done;
-	}
-
-	/*
-	 * Gather up legacy x86 prefix bytes.
-	 */
-	for (;;) {
-		uint_t *which_prefix = NULL;
-
-		dp = &dis_distable[opcode1][opcode2];
-
-		switch (dp->it_adrmode) {
-		case PREFIX:
-			which_prefix = &rep_prefix;
-			break;
-		case LOCK:
-			which_prefix = &lock_prefix;
-			break;
-		case OVERRIDE:
-			which_prefix = &segment_prefix;
-#ifdef DIS_TEXT
-			x->d86_seg_prefix = (char *)dp->it_name;
-#endif
-			if (dp->it_invalid64 && cpu_mode == SIZE64)
-				goto error;
-			break;
-		case AM:
-			which_prefix = &addr_size_prefix;
-			break;
-		case DM:
-			which_prefix = &opnd_size_prefix;
-			break;
-		}
-		if (which_prefix == NULL)
-			break;
-		*which_prefix = (opcode1 << 4) | opcode2;
-		if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0)
-			goto error;
-	}
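-
-	/*
-	 * E.g. for the byte sequence 66 0f 58 c1 the 0x66 byte is
-	 * collected as opnd_size_prefix here, leaving opcode1 == 0x0
-	 * and opcode2 == 0xf for the table walk below.
-	 */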
-
-	/*
-	 * Handle amd64 mode PREFIX values.
-	 * Some of the segment prefixes are no-ops (only FS/GS actually work).
-	 * We might have a REX prefix (opcodes 0x40-0x4f)
-	 */
-	if (cpu_mode == SIZE64) {
-		if (segment_prefix != 0x64 && segment_prefix != 0x65)
-			segment_prefix = 0;
-
-		if (opcode1 == 0x4) {
-			rex_prefix = (opcode1 << 4) | opcode2;
-			if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0)
-				goto error;
-			dp = &dis_distable[opcode1][opcode2];
-		}
-	}
-
-	/*
-	 * Deal with selection of operand and address size now.
-	 * Note that the REX.W bit being set causes opnd_size_prefix to be
-	 * ignored.
-	 */
-	if (cpu_mode == SIZE64) {
-		if (rex_prefix & 0x08)
-			opnd_size = SIZE64;
-		else if (opnd_size_prefix)
-			opnd_size = SIZE16;
-
-		if (addr_size_prefix)
-			addr_size = SIZE32;
-	} else if (cpu_mode == SIZE32) {
-		if (opnd_size_prefix)
-			opnd_size = SIZE16;
-		if (addr_size_prefix)
-			addr_size = SIZE16;
-	} else {
-		if (opnd_size_prefix)
-			opnd_size = SIZE32;
-		if (addr_size_prefix)
-			addr_size = SIZE32;
-	}
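-
-	/*
-	 * Example: in 64 bit mode a 0x66 prefix followed by a REX
-	 * prefix with REX.W set (e.g. 66 48 ...) still yields
-	 * opnd_size == SIZE64, since REX.W wins above.
-	 */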
-
-	/*
-	 * The pause instruction - a repz'd nop.  This doesn't fit
-	 * with any of the other prefix goop added for SSE, so we'll
-	 * special-case it here.
-	 */
-	if (rep_prefix == 0xf3 && opcode1 == 0x9 && opcode2 == 0x0) {
-		rep_prefix = 0;
-		dp = &dis_opPause;
-	}
-
-	/*
-	 * Some 386 instructions have 2 bytes of opcode before the mod_r/m
-	 * byte so we may need to perform a table indirection.
-	 */
-	if (dp->it_indirect == dis_op0F[0]) {
-		if (dtrace_get_opcode(x, &opcode4, &opcode5) != 0)
-			goto error;
-		opcode_bytes = 2;
-		if (opcode4 == 0x7 && opcode5 >= 0x1 && opcode5 <= 0x3) {
-			uint_t	subcode;
-
-			if (dtrace_get_opcode(x, &opcode6, &opcode7) != 0)
-				goto error;
-			opcode_bytes = 3;
-			subcode = ((opcode6 & 0x3) << 1) |
-			    ((opcode7 & 0x8) >> 3);
-			dp = &dis_op0F7123[opcode5][subcode];
-		} else if ((opcode4 == 0xc) && (opcode5 >= 0x8)) {
-			dp = &dis_op0FC8[0];
-		} else {
-			dp = &dis_op0F[opcode4][opcode5];
-		}
-	}
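-
-	/*
-	 * Example: psrlw $2, %mm2 encodes as 0f 71 d2 02.  Here
-	 * opcode4/opcode5 are 0x7/0x1, the ModRM byte 0xd2 gives
-	 * opcode6 == 0xd and opcode7 == 0x2, so subcode == 2 and the
-	 * entry used is dis_op0F7123[1][2].
-	 */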
-
-	/*
-	 * If still not at a TERM decode entry, then a ModRM byte
-	 * exists and its fields further decode the instruction.
-	 */
-	x->d86_got_modrm = 0;
-	if (dp->it_indirect != TERM) {
-		dtrace_get_modrm(x, &mode, &opcode3, &r_m);
-		if (x->d86_error)
-			goto error;
-		reg = opcode3;
-
-		/*
-		 * decode 287 instructions (D8-DF) from opcodeN
-		 */
-		if (opcode1 == 0xD && opcode2 >= 0x8) {
-			if (opcode2 == 0xB && mode == 0x3 && opcode3 == 4)
-				dp = &dis_opFP5[r_m];
-			else if (opcode2 == 0xA && mode == 0x3 && opcode3 < 4)
-				dp = &dis_opFP7[opcode3];
-			else if (opcode2 == 0xB && mode == 0x3)
-				dp = &dis_opFP6[opcode3];
-			else if (opcode2 == 0x9 && mode == 0x3 && opcode3 >= 4)
-				dp = &dis_opFP4[opcode3 - 4][r_m];
-			else if (mode == 0x3)
-				dp = &dis_opFP3[opcode2 - 8][opcode3];
-			else
-				dp = &dis_opFP1n2[opcode2 - 8][opcode3];
-		} else {
-			dp = dp->it_indirect + opcode3;
-		}
-	}
-
-	/*
-	 * In amd64 mode, the ARPL opcode is replaced by MOVSXD
-	 * (sign extend 32 bit to 64 bit)
-	 */
-	if (cpu_mode == SIZE64 && opcode1 == 0x6 && opcode2 == 0x3)
-		dp = &dis_opMOVSLD;
-
-	/*
-	 * at this point we should have a correct (or invalid) opcode
-	 */
-	if ((cpu_mode == SIZE64 && dp->it_invalid64) ||
-	    (cpu_mode != SIZE64 && dp->it_invalid32))
-		goto error;
-	if (dp->it_indirect != TERM)
-		goto error;
-
-	/*
-	 * deal with MMX/SSE opcodes which are changed by prefixes
-	 */
-	switch (dp->it_adrmode) {
-	case MMO:
-	case MMOIMPL:
-	case MMO3P:
-	case MMOM3:
-	case MMOMS:
-	case MMOPM:
-	case MMOPRM:
-	case MMOS:
-	case XMMO:
-	case XMMOM:
-	case XMMOMS:
-	case XMMOPM:
-	case XMMOS:
-	case XMMOMX:
-	case XMMOX3:
-	case XMMOXMM:
-		/*
-		 * This is horrible.  Some SIMD instructions take the
-		 * form 0x0F 0x?? ..., which is easily decoded using the
-		 * existing tables.  Other SIMD instructions use various
-		 * prefix bytes to overload existing instructions.  For
-		 * example, addps is 0F, 58, whereas addss is F3 (repz),
-		 * 0F, 58.  Presumably someone got a raise for this.
-		 *
-		 * If we see one of the instructions which can be
-		 * modified in this way (if we've got one of the SIMDO*
-		 * address modes), we'll check to see if the last prefix
-		 * was a repz.  If it was, we strip the prefix from the
-		 * mnemonic, and we indirect using the dis_opSIMDrepz
-		 * table.
-		 */
-
-		/*
-		 * Calculate our offset in dis_op0F
-		 */
-		if ((uintptr_t)dp - (uintptr_t)dis_op0F > sizeof (dis_op0F))
-			goto error;
-
-		off = ((uintptr_t)dp - (uintptr_t)dis_op0F) /
-		    sizeof (instable_t);
-
-		/*
-		 * Rewrite if this instruction used one of the magic prefixes.
-		 */
-		if (rep_prefix) {
-			if (rep_prefix == 0xf2)
-				dp = &dis_opSIMDrepnz[off];
-			else
-				dp = &dis_opSIMDrepz[off];
-			rep_prefix = 0;
-		} else if (opnd_size_prefix) {
-			dp = &dis_opSIMDdata16[off];
-			opnd_size_prefix = 0;
-			if (opnd_size == SIZE16)
-				opnd_size = SIZE32;
-		}
-		break;
-
-	case MMOSH:
-		/*
-		 * As with the "normal" SIMD instructions, the MMX
-		 * shuffle instructions are overloaded.  These
-		 * instructions, however, are special in that they use
-		 * an extra byte, and thus an extra table.  As of this
-		 * writing, they only use the opnd_size prefix.
-		 */
-
-		/*
-		 * Calculate our offset in dis_op0F7123
-		 */
-		if ((uintptr_t)dp - (uintptr_t)dis_op0F7123 >
-		    sizeof (dis_op0F7123))
-			goto error;
-
-		if (opnd_size_prefix) {
-			off = ((uintptr_t)dp - (uintptr_t)dis_op0F7123) /
-			    sizeof (instable_t);
-			dp = &dis_opSIMD7123[off];
-			opnd_size_prefix = 0;
-			if (opnd_size == SIZE16)
-				opnd_size = SIZE32;
-		}
-		break;
-	}
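-
-	/*
-	 * Continuing the earlier example: a bare 0f 58 decodes as
-	 * addps; with a leading f3 it is rewritten to addss via
-	 * dis_opSIMDrepz, with f2 to addsd via dis_opSIMDrepnz, and
-	 * with 66 to addpd via dis_opSIMDdata16.
-	 */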
-
-	/*
-	 * In 64 bit mode, some opcodes automatically use opnd_size == SIZE64.
-	 */
-	if (cpu_mode == SIZE64)
-		if (dp->it_always64 || (opnd_size == SIZE32 && dp->it_stackop))
-			opnd_size = SIZE64;
-
-#ifdef DIS_TEXT
-	/*
-	 * At this point most instructions can format the opcode mnemonic
-	 * including the prefixes.
-	 */
-	if (lock_prefix)
-		(void) strlcat(x->d86_mneu, "lock ", OPLEN);
-
-	if (rep_prefix == 0xf2)
-		(void) strlcat(x->d86_mneu, "repnz ", OPLEN);
-	else if (rep_prefix == 0xf3)
-		(void) strlcat(x->d86_mneu, "repz ", OPLEN);
-
-	if (cpu_mode == SIZE64 && addr_size_prefix)
-		(void) strlcat(x->d86_mneu, "addr32 ", OPLEN);
-
-	if (dp->it_adrmode != CBW &&
-	    dp->it_adrmode != CWD &&
-	    dp->it_adrmode != XMMSFNC) {
-		if (strcmp(dp->it_name, "INVALID") == 0)
-			goto error;
-		(void) strlcat(x->d86_mneu, dp->it_name, OPLEN);
-		if (dp->it_suffix) {
-			char *types[] = {"", "w", "l", "q"};
-			if (opcode_bytes == 2 && opcode4 == 4) {
-				/* It's a cmovx.yy. Replace the suffix x */
-				for (i = 5; i < OPLEN; i++) {
-					if (x->d86_mneu[i] == '.')
-						break;
-				}
-				x->d86_mneu[i - 1] = *types[opnd_size];
-			} else {
-				(void) strlcat(x->d86_mneu, types[opnd_size],
-				    OPLEN);
-			}
-		}
-	}
-#endif
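-
-	/*
-	 * E.g. an it_suffix entry with opnd_size == SIZE32 picks up
-	 * an 'l' here ("add" prints as "addl"); for the cmovx.yy
-	 * family the 'x' before the '.' is overwritten with w/l/q
-	 * instead of appending.
-	 */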
-
-	/*
-	 * Process operands based on the addressing modes.
-	 */
-	x->d86_mode = cpu_mode;
-	x->d86_rex_prefix = rex_prefix;
-	x->d86_opnd_size = opnd_size;
-	x->d86_addr_size = addr_size;
-	vbit = 0;		/* initialize for mem/reg -> reg */
-	switch (dp->it_adrmode) {
-		/*
-		 * amd64 instruction to sign extend 32 bit reg/mem operands
-		 * into 64 bit register values
-		 */
-	case MOVSXZ:
-#ifdef DIS_TEXT
-		if (rex_prefix == 0)
-			(void) strncpy(x->d86_mneu, "movzld", OPLEN);
-#endif
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-		x->d86_opnd_size = SIZE64;
-		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
-		x->d86_opnd_size = opnd_size = SIZE32;
-		wbit = LONG_OPND;
-		dtrace_get_operand(x, mode, r_m, wbit, 0);
-		break;
-
-		/*
-		 * movsbl movsbw movsbq (0x0FBE) or movswl movswq (0x0FBF)
-		 * movzbl movzbw movzbq (0x0FB6) or movzwl movzwq (0x0FB7)
-		 * wbit lives in 2nd byte, note that operands
-		 * are different sized
-		 */
-	case MOVZ:
-		if (rex_prefix & REX_W) {
-			/* target register size = 64 bit */
-			x->d86_mneu[5] = 'q';
-		}
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
-		x->d86_opnd_size = opnd_size = SIZE16;
-		wbit = WBIT(opcode5);
-		dtrace_get_operand(x, mode, r_m, wbit, 0);
-		break;
-
-	/*
-	 * imul instruction, with either 8-bit or longer immediate
-	 * opcode 0x6B for a byte, sign-extended immediate, 0x69 for word(s)
-	 */
-	case IMUL:
-		wbit = LONG_OPND;
-		THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND,
-		    OPSIZE(opnd_size, opcode2 == 0x9));
-		break;
-
-	/* memory or register operand to register, with 'w' bit	*/
-	case MRw:
-		wbit = WBIT(opcode2);
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0);
-		break;
-
-	/* register to memory or register operand, with 'w' bit	*/
-	/* arpl happens to fit here also because it is odd */
-	case RMw:
-		if (opcode_bytes == 2)
-			wbit = WBIT(opcode5);
-		else
-			wbit = WBIT(opcode2);
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1);
-		break;
-
-	/* xaddb instruction */
-	case XADDB:
-		wbit = 0;
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1);
-		break;
-
-	/* MMX register to memory or register operand		*/
-	case MMS:
-	case MMOS:
-#ifdef DIS_TEXT
-		wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND;
-#else
-		wbit = LONG_OPND;
-#endif
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 1);
-		break;
-
-	/* MMX register to memory */
-	case MMOMS:
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		if (mode == REG_ONLY)
-			goto error;
-		wbit = MM_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 1);
-		break;
-
-	/* Double shift. Has immediate operand specifying the shift. */
-	case DSHIFT:
-		wbit = LONG_OPND;
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 2);
-		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
-		dtrace_imm_opnd(x, wbit, 1, 0);
-		break;
-
-	/*
-	 * Double shift. With no immediate operand, specifies using %cl.
-	 */
-	case DSHIFTcl:
-		wbit = LONG_OPND;
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1);
-		break;
-
-	/* immediate to memory or register operand */
-	case IMlw:
-		wbit = WBIT(opcode2);
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 1);
-		/*
-		 * Have long immediate for opcode 0x81, but not 0x80 nor 0x83
-		 */
-		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, opcode2 == 1), 0);
-		break;
-
-	/* immediate to memory or register operand with the	*/
-	/* 'w' bit present					*/
-	case IMw:
-		wbit = WBIT(opcode2);
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 1);
-		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, wbit), 0);
-		break;
-
-	/* immediate to register with register in low 3 bits	*/
-	/* of op code						*/
-	case IR:
-		/* w-bit here (with regs) is bit 3 */
-		wbit = opcode2 >> 3 & 0x1;
-		reg = REGNO(opcode2);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, NULL);
-		mode = REG_ONLY;
-		r_m = reg;
-		dtrace_get_operand(x, mode, r_m, wbit, 1);
-		dtrace_imm_opnd(x, wbit, OPSIZE64(opnd_size, wbit), 0);
-		break;
-
-	/* MMX immediate shift of register */
-	case MMSH:
-	case MMOSH:
-		wbit = MM_OPND;
-		goto mm_shift;	/* in next case */
-
-	/* SIMD immediate shift of register */
-	case XMMSH:
-		wbit = XMM_OPND;
-mm_shift:
-		reg = REGNO(opcode7);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, NULL);
-		dtrace_get_operand(x, REG_ONLY, reg, wbit, 1);
-		dtrace_imm_opnd(x, wbit, 1, 0);
-		NOMEM;
-		break;
-
-	/* accumulator to memory operand */
-	case AO:
-		vbit = 1;
-		/*FALLTHROUGH*/
-
-	/* memory operand to accumulator */
-	case OA:
-		wbit = WBIT(opcode2);
-		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1 - vbit);
-		dtrace_imm_opnd(x, wbit, OPSIZE64(addr_size, LONG_OPND), vbit);
-#ifdef DIS_TEXT
-		x->d86_opnd[vbit].d86_mode = MODE_OFFSET;
-#endif
-		break;
-
-
-	/* segment register to memory or register operand */
-	case SM:
-		vbit = 1;
-		/*FALLTHROUGH*/
-
-	/* memory or register operand to segment register */
-	case MS:
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, LONG_OPND, vbit);
-		dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 1 - vbit);
-		break;
-
-	/*
-	 * rotate or shift instructions, which may shift by 1 or
-	 * consult the cl register, depending on the 'v' bit
-	 */
-	case Mv:
-		vbit = VBIT(opcode2);
-		wbit = WBIT(opcode2);
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 1);
-#ifdef DIS_TEXT
-		if (vbit) {
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "%cl", OPLEN);
-		} else {
-			x->d86_opnd[0].d86_mode = MODE_SIGNED;
-			x->d86_opnd[0].d86_value_size = 1;
-			x->d86_opnd[0].d86_value = 1;
-		}
-#endif
-		break;
-	/*
-	 * immediate rotate or shift instructions
-	 */
-	case MvI:
-		wbit = WBIT(opcode2);
-normal_imm_mem:
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 1);
-		dtrace_imm_opnd(x, wbit, 1, 0);
-		break;
-
-	/* bit test instructions */
-	case MIb:
-		wbit = LONG_OPND;
-		goto normal_imm_mem;
-
-	/* single memory or register operand with 'w' bit present */
-	case Mw:
-		wbit = WBIT(opcode2);
-just_mem:
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 0);
-		break;
-
-	case SWAPGS:
-		if (cpu_mode == SIZE64 && mode == 3 && r_m == 0) {
-#ifdef DIS_TEXT
-			(void) strncpy(x->d86_mneu, "swapgs", OPLEN);
-#endif
-			NOMEM;
-			break;
-		}
-		/*FALLTHROUGH*/
-
-	/* prefetch instruction - memory operand, but no memory access */
-	case PREF:
-		NOMEM;
-		/*FALLTHROUGH*/
-
-	/* single memory or register operand */
-	case M:
-		wbit = LONG_OPND;
-		goto just_mem;
-
-	/* single memory or register byte operand */
-	case Mb:
-		wbit = BYTE_OPND;
-		goto just_mem;
-
-	case MO:
-		/* Similar to M, but only memory (no direct registers) */
-		wbit = LONG_OPND;
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		if (mode == 3)
-			goto error;
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 0);
-		break;
-
-	/* move special register to register or reverse if vbit */
-	case SREG:
-		switch (opcode5) {
-
-		case 2:
-			vbit = 1;
-			/*FALLTHROUGH*/
-		case 0:
-			wbit = CONTROL_OPND;
-			break;
-
-		case 3:
-			vbit = 1;
-			/*FALLTHROUGH*/
-		case 1:
-			wbit = DEBUG_OPND;
-			break;
-
-		case 6:
-			vbit = 1;
-			/*FALLTHROUGH*/
-		case 4:
-			wbit = TEST_OPND;
-			break;
-
-		}
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-		dtrace_get_operand(x, REG_ONLY, reg, wbit, vbit);
-		dtrace_get_operand(x, REG_ONLY, r_m, LONG_OPND, 1 - vbit);
-		NOMEM;
-		break;
-
-	/*
-	 * single register operand with register in the low 3
-	 * bits of op code
-	 */
-	case R:
-		if (opcode_bytes == 2)
-			reg = REGNO(opcode5);
-		else
-			reg = REGNO(opcode2);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, NULL);
-		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0);
-		NOMEM;
-		break;
-
-	/*
-	 * register to accumulator with register in the low 3
-	 * bits of op code, xchg instructions
-	 */
-	case RA:
-		NOMEM;
-		reg = REGNO(opcode2);
-		dtrace_rex_adjust(rex_prefix, mode, &reg, NULL);
-		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0);
-		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, LONG_OPND, 1);
-		break;
-
-	/*
-	 * single segment register operand, with register in
-	 * bits 3-4 of op code byte
-	 */
-	case SEG:
-		NOMEM;
-		reg = (x->d86_bytes[x->d86_len - 1] >> 3) & 0x3;
-		dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 0);
-		break;
-
-	/*
-	 * single segment register operand, with register in
-	 * bits 3-5 of op code
-	 */
-	case LSEG:
-		NOMEM;
-		/* long seg reg from opcode */
-		reg = (x->d86_bytes[x->d86_len - 1] >> 3) & 0x7;
-		dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 0);
-		break;
-
-	/* memory or register operand to register */
-	case MR:
-		wbit = LONG_OPND;
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0);
-		break;
-
-	case RM:
-		wbit = LONG_OPND;
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1);
-		break;
-
-	/* MMX/SIMD-Int memory or mm reg to mm reg		*/
-	case MM:
-	case MMO:
-#ifdef DIS_TEXT
-		wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND;
-#else
-		wbit = LONG_OPND;
-#endif
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 0);
-		break;
-
-	case MMOIMPL:
-#ifdef DIS_TEXT
-		wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND;
-#else
-		wbit = LONG_OPND;
-#endif
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		if (mode != REG_ONLY)
-			goto error;
-
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 0);
-		dtrace_get_operand(x, REG_ONLY, reg, MM_OPND, 1);
-		mode = 0;	/* change for memory access size... */
-		break;
-
-	/* MMX/SIMD-Int and SIMD-FP predicated mm reg to r32 */
-	case MMO3P:
-		wbit = MM_OPND;
-		goto xmm3p;
-	case XMM3P:
-		wbit = XMM_OPND;
-xmm3p:
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		if (mode != REG_ONLY)
-			goto error;
-
-		THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 1);
-		NOMEM;
-		break;
-
-	/* MMX/SIMD-Int predicated r32/mem to mm reg */
-	case MMOPRM:
-		wbit = LONG_OPND;
-		w2 = MM_OPND;
-		goto xmmprm;
-	case XMMPRM:
-		wbit = LONG_OPND;
-		w2 = XMM_OPND;
-xmmprm:
-		THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, 1);
-		break;
-
-	/* MMX/SIMD-Int predicated mm/mem to mm reg */
-	case MMOPM:
-		wbit = w2 = MM_OPND;
-		goto xmmprm;
-
-	/* MMX/SIMD-Int mm reg to r32 */
-	case MMOM3:
-		NOMEM;
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		if (mode != REG_ONLY)
-			goto error;
-		wbit = MM_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 0);
-		break;
-
-	/* SIMD memory or xmm reg operand to xmm reg		*/
-	case XMM:
-	case XMMO:
-	case XMMXIMPL:
-		wbit = XMM_OPND;
-		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0);
-
-		if (dp->it_adrmode == XMMXIMPL && mode != REG_ONLY)
-			goto error;
-
-#ifdef DIS_TEXT
-		/*
-		 * movlps and movhlps share opcodes.  They differ in the
-		 * addressing modes allowed for their operands.
-		 * movhps and movlhps behave similarly.
-		 */
-		if (mode == REG_ONLY) {
-			if (strcmp(dp->it_name, "movlps") == 0)
-				(void) strncpy(x->d86_mneu, "movhlps", OPLEN);
-			else if (strcmp(dp->it_name, "movhps") == 0)
-				(void) strncpy(x->d86_mneu, "movlhps", OPLEN);
-		}
-#endif
-		if (dp->it_adrmode == XMMXIMPL)
-			mode = 0;	/* change for memory access size... */
-		break;
-
-	/* SIMD xmm reg to memory or xmm reg */
-	case XMMS:
-	case XMMOS:
-	case XMMMS:
-	case XMMOMS:
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-#ifdef DIS_TEXT
-		if ((strcmp(dp->it_name, "movlps") == 0 ||
-		    strcmp(dp->it_name, "movhps") == 0 ||
-		    strcmp(dp->it_name, "movntps") == 0) &&
-		    mode == REG_ONLY)
-			goto error;
-#endif
-		wbit = XMM_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1);
-		break;
-
-	/* SIMD memory to xmm reg */
-	case XMMM:
-	case XMMOM:
-		wbit = XMM_OPND;
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-#ifdef DIS_TEXT
-		if (mode == REG_ONLY) {
-			if (strcmp(dp->it_name, "movhps") == 0)
-				(void) strncpy(x->d86_mneu, "movlhps", OPLEN);
-			else
-				goto error;
-		}
-#endif
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0);
-		break;
-
-	/* SIMD memory or r32 to xmm reg			*/
-	case XMM3MX:
-		wbit = LONG_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0);
-		break;
-
-	case XMM3MXS:
-		wbit = LONG_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1);
-		break;
-
-	/* SIMD memory or mm reg to xmm reg			*/
-	case XMMOMX:
-	/* SIMD mm to xmm */
-	case XMMMX:
-		wbit = MM_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0);
-		break;
-
-	/* SIMD memory or xmm reg to mm reg			*/
-	case XMMXMM:
-	case XMMOXMM:
-	case XMMXM:
-		wbit = XMM_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 0);
-		break;
-
-
-	/* SIMD memory or xmm reg to r32			*/
-	case XMMXM3:
-		wbit = XMM_OPND;
-		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 0);
-		break;
-
-	/* SIMD xmm to r32					*/
-	case XMMX3:
-	case XMMOX3:
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-		if (mode != REG_ONLY)
-			goto error;
-		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-		dtrace_get_operand(x, mode, r_m, XMM_OPND, 0);
-		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
-		NOMEM;
-		break;
-
-	/* SIMD predicated memory or xmm reg with/to xmm reg */
-	case XMMP:
-	case XMMOPM:
-		wbit = XMM_OPND;
-		THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1);
-
-#ifdef DIS_TEXT
-		/*
-		 * cmpps and cmpss vary their instruction name based
-		 * on the value of imm8.  Other XMMP instructions,
-		 * such as shufps, require explicit specification of
-		 * the predicate.
-		 */
-		if (dp->it_name[0] == 'c' &&
-		    dp->it_name[1] == 'm' &&
-		    dp->it_name[2] == 'p' &&
-		    strlen(dp->it_name) == 5) {
-			uchar_t pred = x->d86_opnd[0].d86_value & 0xff;
-
-			if (pred >= (sizeof (dis_PREDSUFFIX) / sizeof (char *)))
-				goto error;
-
-			(void) strncpy(x->d86_mneu, "cmp", OPLEN);
-			(void) strlcat(x->d86_mneu, dis_PREDSUFFIX[pred],
-			    OPLEN);
-			(void) strlcat(x->d86_mneu,
-			    dp->it_name + strlen(dp->it_name) - 2,
-			    OPLEN);
-			x->d86_opnd[0] = x->d86_opnd[1];
-			x->d86_opnd[1] = x->d86_opnd[2];
-			x->d86_numopnds = 2;
-		}
-#endif
-		break;
-
-	/* immediate operand to accumulator */
-	case IA:
-		wbit = WBIT(opcode2);
-		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1);
-		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, wbit), 0);
-		NOMEM;
-		break;
-
-	/* memory or register operand to accumulator */
-	case MA:
-		wbit = WBIT(opcode2);
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, wbit, 0);
-		break;
-
-	/* si register to di register used to reference memory		*/
-	case SD:
-#ifdef DIS_TEXT
-		dtrace_check_override(x, 0);
-		x->d86_numopnds = 2;
-		if (addr_size == SIZE64) {
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%rsi)",
-			    OPLEN);
-			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%rdi)",
-			    OPLEN);
-		} else if (addr_size == SIZE32) {
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%esi)",
-			    OPLEN);
-			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%edi)",
-			    OPLEN);
-		} else {
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%si)",
-			    OPLEN);
-			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%di)",
-			    OPLEN);
-		}
-#endif
-		wbit = LONG_OPND;
-		break;
-
-	/* accumulator to di register				*/
-	case AD:
-		wbit = WBIT(opcode2);
-#ifdef DIS_TEXT
-		dtrace_check_override(x, 1);
-		x->d86_numopnds = 2;
-		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 0);
-		if (addr_size == SIZE64)
-			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%rdi)",
-			    OPLEN);
-		else if (addr_size == SIZE32)
-			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%edi)",
-			    OPLEN);
-		else
-			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%di)",
-			    OPLEN);
-#endif
-		break;
-
-	/* si register to accumulator				*/
-	case SA:
-		wbit = WBIT(opcode2);
-#ifdef DIS_TEXT
-		dtrace_check_override(x, 0);
-		x->d86_numopnds = 2;
-		if (addr_size == SIZE64)
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%rsi)",
-			    OPLEN);
-		else if (addr_size == SIZE32)
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%esi)",
-			    OPLEN);
-		else
-			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%si)",
-			    OPLEN);
-		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1);
-#endif
-		break;
-
-	/*
-	 * single operand, a 16/32 bit displacement
-	 */
-	case D:
-		wbit = LONG_OPND;
-		dtrace_disp_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 0);
-		NOMEM;
-		break;
-
-	/* jmp/call indirect to memory or register operand		*/
-	case INM:
-#ifdef DIS_TEXT
-		(void) strlcat(x->d86_opnd[0].d86_prefix, "*", OPLEN);
-#endif
-		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
-		dtrace_get_operand(x, mode, r_m, LONG_OPND, 0);
-		wbit = LONG_OPND;
-		break;
-
-	/*
-	 * for long jumps and long calls -- a new code segment
-	 * register and an offset in IP -- stored in object
-	 * code in reverse order. Note - not valid in amd64
-	 */
-	case SO:
-		dtrace_check_override(x, 1);
-		wbit = LONG_OPND;
-		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 1);
-#ifdef DIS_TEXT
-		x->d86_opnd[1].d86_mode = MODE_SIGNED;
-#endif
-		/* will now get segment operand */
-		dtrace_imm_opnd(x, wbit, 2, 0);
-		break;
-
-	/*
-	 * jmp/call. single operand, 8 bit displacement.
-	 * added to current EIP in 'compofff'
-	 */
-	case BD:
-		dtrace_disp_opnd(x, BYTE_OPND, 1, 0);
-		NOMEM;
-		break;
-
-	/* single 32/16 bit immediate operand			*/
-	case I:
-		wbit = LONG_OPND;
-		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 0);
-		break;
-
-	/* single 8 bit immediate operand			*/
-	case Ib:
-		wbit = LONG_OPND;
-		dtrace_imm_opnd(x, wbit, 1, 0);
-		break;
-
-	case ENTER:
-		wbit = LONG_OPND;
-		dtrace_imm_opnd(x, wbit, 2, 0);
-		dtrace_imm_opnd(x, wbit, 1, 1);
-		switch (opnd_size) {
-		case SIZE64:
-			x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 8;
-			break;
-		case SIZE32:
-			x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 4;
-			break;
-		case SIZE16:
-			x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 2;
-			break;
-		}
-
-		break;
-
-	/* 16-bit immediate operand */
-	case RET:
-		wbit = LONG_OPND;
-		dtrace_imm_opnd(x, wbit, 2, 0);
-		break;
-
-	/* single 8 bit port operand				*/
-	case P:
-		dtrace_check_override(x, 0);
-		dtrace_imm_opnd(x, BYTE_OPND, 1, 0);
-		NOMEM;
-		break;
-
-	/* single operand, dx register (variable port instruction) */
-	case V:
-		x->d86_numopnds = 1;
-		dtrace_check_override(x, 0);
-#ifdef DIS_TEXT
-		(void) strlcat(x->d86_opnd[0].d86_opnd, "(%dx)", OPLEN);
-#endif
-		NOMEM;
-		break;
-
-	/*
-	 * The int instruction, which has two forms:
-	 * int 3 (breakpoint) or
-	 * int n, where n is indicated in the subsequent
-	 * byte (format Ib).  The int 3 instruction (opcode 0xCC),
-	 * where, although the 3 looks  like an operand,
-	 * it is implied by the opcode. It must be converted
-	 * to the correct base and output.
-	 */
-	case INT3:
-#ifdef DIS_TEXT
-		x->d86_numopnds = 1;
-		x->d86_opnd[0].d86_mode = MODE_SIGNED;
-		x->d86_opnd[0].d86_value_size = 1;
-		x->d86_opnd[0].d86_value = 3;
-#endif
-		NOMEM;
-		break;
-
-	/* single 8 bit immediate operand			*/
-	case INTx:
-		dtrace_imm_opnd(x, BYTE_OPND, 1, 0);
-		NOMEM;
-		break;
-
-	/* an unused byte must be discarded */
-	case U:
-		if (x->d86_get_byte(x->d86_data) < 0)
-			goto error;
-		x->d86_len++;
-		NOMEM;
-		break;
-
-	case CBW:
-#ifdef DIS_TEXT
-		if (opnd_size == SIZE16)
-			(void) strlcat(x->d86_mneu, "cbtw", OPLEN);
-		else if (opnd_size == SIZE32)
-			(void) strlcat(x->d86_mneu, "cwtl", OPLEN);
-		else
-			(void) strlcat(x->d86_mneu, "cltq", OPLEN);
-#endif
-		wbit = LONG_OPND;
-		NOMEM;
-		break;
-
-	case CWD:
-#ifdef DIS_TEXT
-		if (opnd_size == SIZE16)
-			(void) strlcat(x->d86_mneu, "cwtd", OPLEN);
-		else if (opnd_size == SIZE32)
-			(void) strlcat(x->d86_mneu, "cltd", OPLEN);
-		else
-			(void) strlcat(x->d86_mneu, "cqtd", OPLEN);
-#endif
-		wbit = LONG_OPND;
-		NOMEM;
-		break;
-
-	case XMMSFNC:
-		/*
-		 * sfence is sfence if mode is REG_ONLY.  If mode isn't
-		 * REG_ONLY, mnemonic should be 'clflush'.
-		 */
-		dtrace_get_modrm(x, &mode, &reg, &r_m);
-
-		/* sfence doesn't take operands */
-#ifdef DIS_TEXT
-		if (mode == REG_ONLY) {
-			(void) strlcat(x->d86_mneu, "sfence", OPLEN);
-		} else {
-			(void) strlcat(x->d86_mneu, "clflush", OPLEN);
-			dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-			dtrace_get_operand(x, mode, r_m, BYTE_OPND, 0);
-			NOMEM;
-		}
-#else
-		if (mode != REG_ONLY) {
-			dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
-			dtrace_get_operand(x, mode, r_m, BYTE_OPND, 0);
-			NOMEM;
-		}
-#endif
-		break;
-
-	/*
-	 * no disassembly, the mnemonic was all there was so go on
-	 */
-	case NORM:
-		if (dp->it_invalid32 && cpu_mode != SIZE64)
-			goto error;
-		NOMEM;
-		/*FALLTHROUGH*/
-	case IMPLMEM:
-		break;
-
-	case XMMFENCE:
-		/*
-		 * Only the following exact byte sequences are allowed:
-		 *
-		 * 	0f ae e8	lfence
-		 * 	0f ae f0	mfence
-		 */
-		if ((uint8_t)x->d86_bytes[x->d86_len - 1] != 0xe8 &&
-		    (uint8_t)x->d86_bytes[x->d86_len - 1] != 0xf0)
-			goto error;
-
-		break;
-
-
-	/* float reg */
-	case F:
-#ifdef DIS_TEXT
-		x->d86_numopnds = 1;
-		(void) strlcat(x->d86_opnd[0].d86_opnd, "%st(X)", OPLEN);
-		x->d86_opnd[0].d86_opnd[4] = r_m + '0';
-#endif
-		NOMEM;
-		break;
-
-	/* float reg to float reg, with ret bit present */
-	case FF:
-		vbit = opcode2 >> 2 & 0x1;	/* vbit = 1: st -> st(i) */
-		/*FALLTHROUGH*/
-	case FFC:				/* case for vbit always = 0 */
-#ifdef DIS_TEXT
-		x->d86_numopnds = 2;
-		(void) strlcat(x->d86_opnd[1 - vbit].d86_opnd, "%st", OPLEN);
-		(void) strlcat(x->d86_opnd[vbit].d86_opnd, "%st(X)", OPLEN);
-		x->d86_opnd[vbit].d86_opnd[4] = r_m + '0';
-#endif
-		NOMEM;
-		break;
-
-	/* an invalid op code */
-	case AM:
-	case DM:
-	case OVERRIDE:
-	case PREFIX:
-	case UNKNOWN:
-		NOMEM;
-	default:
-		goto error;
-	} /* end switch */
-	if (x->d86_error)
-		goto error;
-
-done:
-#ifdef DIS_MEM
-	/*
-	 * compute the size of any memory accessed by the instruction
-	 */
-	if (x->d86_memsize != 0) {
-		return (0);
-	} else if (dp->it_stackop) {
-		switch (opnd_size) {
-		case SIZE16:
-			x->d86_memsize = 2;
-			break;
-		case SIZE32:
-			x->d86_memsize = 4;
-			break;
-		case SIZE64:
-			x->d86_memsize = 8;
-			break;
-		}
-	} else if (nomem || mode == REG_ONLY) {
-		x->d86_memsize = 0;
-
-	} else if (dp->it_size != 0) {
-		/*
-		 * In 64 bit mode descriptor table entries
-		 * go up to 10 bytes and popf/pushf are always 8 bytes
-		 */
-		if (x->d86_mode == SIZE64 && dp->it_size == 6)
-			x->d86_memsize = 10;
-		else if (x->d86_mode == SIZE64 && opcode1 == 0x9 &&
-		    (opcode2 == 0xc || opcode2 == 0xd))
-			x->d86_memsize = 8;
-		else
-			x->d86_memsize = dp->it_size;
-
-	} else if (wbit == 0) {
-		x->d86_memsize = 1;
-
-	} else if (wbit == LONG_OPND) {
-		if (opnd_size == SIZE64)
-			x->d86_memsize = 8;
-		else if (opnd_size == SIZE32)
-			x->d86_memsize = 4;
-		else
-			x->d86_memsize = 2;
-
-	} else if (wbit == SEG_OPND) {
-		x->d86_memsize = 4;
-
-	} else {
-		x->d86_memsize = 8;
-	}
-#endif
-	return (0);
-
-error:
-#ifdef DIS_TEXT
-	(void) strlcat(x->d86_mneu, "undef", OPLEN);
-#endif
-	return (1);
-}
-
-#ifdef DIS_TEXT
-
-/*
- * Some instructions should have immediate operands printed
- * as unsigned integers. We compare against this table.
- */
-static char *unsigned_ops[] = {
-	"or", "and", "xor", "test", "in", "out", "lcall", "ljmp",
-	"rcr", "rcl", "ror", "rol", "shl", "shr", "sal", "psr", "psl",
-	0
-};
-
-static int
-isunsigned_op(char *opcode)
-{
-	char *where;
-	int i;
-	int is_unsigned = 0;
-
-	/*
-	 * Work back to start of last mnemonic, since we may have
-	 * prefixes on some opcodes.
-	 */
-	where = opcode + strlen(opcode) - 1;
-	while (where > opcode && *where != ' ')
-		--where;
-	if (*where == ' ')
-		++where;
-
-	for (i = 0; unsigned_ops[i]; ++i) {
-		if (strncmp(where, unsigned_ops[i],
-		    strlen(unsigned_ops[i])))
-			continue;
-		is_unsigned = 1;
-		break;
-	}
-	return (is_unsigned);
-}
-
-/* ARGSUSED */
-void
-dtrace_disx86_str(dis86_t *dis, uint_t mode, uintptr_t pc, char *buf,
-    size_t buflen)
-{
-	int i;
-
-	dis->d86_sprintf_func(buf, buflen, "%-6s ", dis->d86_mneu);
-
-	/*
-	 * For PC-relative jumps, the pc is really the next pc after executing
-	 * this instruction, so increment it appropriately.
-	 */
-	pc += dis->d86_len;
-
-	for (i = 0; i < dis->d86_numopnds; i++) {
-		d86opnd_t *op = &dis->d86_opnd[i];
-		int64_t sv;
-		uint64_t mask;
-
-		if (i != 0)
-			(void) strlcat(buf, ",", buflen);
-
-		(void) strlcat(buf, op->d86_prefix, buflen);
-
-		sv = op->d86_value;
-
-		switch (op->d86_mode) {
-
-		case MODE_NONE:
-
-			(void) strlcat(buf, op->d86_opnd, buflen);
-			break;
-
-		case MODE_SIGNED:
-		case MODE_IMPLIED:
-		case MODE_OFFSET:
-
-			if (dis->d86_seg_prefix)
-				(void) strlcat(buf, dis->d86_seg_prefix,
-				    buflen);
-
-			switch (op->d86_value_size) {
-			case 1:
-				sv = (int8_t)sv;
-				mask = 0xff;
-				break;
-			case 2:
-				sv = (int16_t)sv;
-				mask = 0xffff;
-				break;
-			case 4:
-				sv = (int32_t)sv;
-				mask = 0xffffffff;
-				break;
-			case 8:
-				mask = 0xffffffffffffffffULL;
-				break;
-			}
-
-			if (op->d86_mode == MODE_SIGNED ||
-			    op->d86_mode == MODE_IMPLIED)
-				(void) strlcat(buf, "$", buflen);
-
-			if (sv < 0 && sv > -0xffff &&
-			    !isunsigned_op(dis->d86_mneu)) {
-				dis->d86_sprintf_func(buf + strlen(buf),
-				    buflen - strlen(buf),
-				    (dis->d86_flags & DIS_OP_OCTAL) ?
-				    "-0%llo" : "-0x%llx", -sv & mask);
-			} else {
-				dis->d86_sprintf_func(buf + strlen(buf),
-				    buflen - strlen(buf),
-				    (dis->d86_flags & DIS_OP_OCTAL) ?
-				    "0%llo" : "0x%llx", sv & mask);
-			}
-			(void) strlcat(buf, op->d86_opnd, buflen);
-			break;
-
-		case MODE_IPREL:
-
-			switch (op->d86_value_size) {
-			case 1:
-				sv = (int8_t)sv;
-				break;
-			case 2:
-				sv = (int16_t)sv;
-				break;
-			case 4:
-				sv = (int32_t)sv;
-				break;
-			}
-
-			if (sv < 0)
-				dis->d86_sprintf_func(buf + strlen(buf),
-				    buflen - strlen(buf),
-				    (dis->d86_flags & DIS_OP_OCTAL) ?
-				    "-0%llo" : "-0x%llx", -sv - dis->d86_len);
-			else
-				dis->d86_sprintf_func(buf + strlen(buf),
-				    buflen - strlen(buf),
-				    (dis->d86_flags & DIS_OP_OCTAL) ?
-				    "+0%llo" : "+0x%llx", sv + dis->d86_len);
-
-			(void) strlcat(buf, "\t<", buflen);
-
-			if (dis->d86_sym_lookup == NULL ||
-			    dis->d86_sym_lookup(dis->d86_data, pc + sv,
-			    buf + strlen(buf), buflen - strlen(buf)) != 0)
-				dis->d86_sprintf_func(buf + strlen(buf),
-				    buflen - strlen(buf),
-				    (dis->d86_flags & DIS_OP_OCTAL) ?
-				    "0%llo" : "0x%llx", pc + sv);
-
-			(void) strlcat(buf, ">", buflen);
-
-			break;
-		}
-	}
-}
-
-#endif /* DIS_TEXT */
Index: src/external/cddl/osnet/dev/dtrace/i386/dis_tables.h
===================================================================
RCS file: src/external/cddl/osnet/dev/dtrace/i386/dis_tables.h
diff -N src/external/cddl/osnet/dev/dtrace/i386/dis_tables.h
--- src/external/cddl/osnet/dev/dtrace/i386/dis_tables.h	21 Feb 2010 01:46:33 -0000	1.2
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,114 +0,0 @@
-/*	$NetBSD: dis_tables.h,v 1.2 2010/02/21 01:46:33 darran Exp $	*/
-
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- *
- * $FreeBSD: src/sys/cddl/dev/dtrace/i386/dis_tables.h,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-/*	Copyright (c) 1988 AT&T	*/
-/*	  All Rights Reserved  	*/
-
-
-#ifndef _DIS_TABLES_H
-#define	_DIS_TABLES_H
-
-#if defined(sun)
-#pragma ident	"@(#)dis_tables.h	1.7	06/03/02 SMI"
-#endif
-
-/*
- * Constants and prototypes for the IA32 disassembler backend.  See dis_tables.c
- * for usage information and documentation.
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <sys/types.h>
-#include <sys/param.h>
-
-/*
- * values for cpu mode
- */
-#define	SIZE16	1
-#define	SIZE32	2
-#define	SIZE64	3
-
-#define	OPLEN	256
-#define	PFIXLEN	  8
-#define	NCPS	12	/* number of chars per symbol	*/
-
-/*
- * data structures that must be provided to dtrace_dis86()
- */
-typedef struct d86opnd {
-	char		d86_opnd[OPLEN];	/* symbolic rep of operand */
-	char		d86_prefix[PFIXLEN];	/* any prefix string or "" */
-	uint_t		d86_mode;		/* mode for immediate */
-	uint_t		d86_value_size;		/* size in bytes of d86_value */
-	uint64_t	d86_value;		/* immediate value of opnd */
-} d86opnd_t;
-
-typedef struct dis86 {
-	uint_t		d86_mode;
-	uint_t		d86_error;
-	uint_t		d86_len;		/* instruction length */
-	int		d86_rmindex;		/* index of modrm byte or -1 */
-	uint_t		d86_memsize;		/* size of memory referenced */
-	char		d86_bytes[16];		/* bytes of instruction */
-	char		d86_mneu[OPLEN];
-	uint_t		d86_numopnds;
-	uint_t		d86_rex_prefix;		/* value of REX prefix if !0 */
-	char		*d86_seg_prefix;	/* segment prefix, if any */
-	uint_t		d86_opnd_size;
-	uint_t		d86_addr_size;
-	uint_t		d86_got_modrm;
-	struct d86opnd	d86_opnd[3];		/* up to 3 operands */
-	int		(*d86_check_func)(void *);
-	int		(*d86_get_byte)(void *);
-#ifdef DIS_TEXT
-	int		(*d86_sym_lookup)(void *, uint64_t, char *, size_t);
-	int		(*d86_sprintf_func)(char *, size_t, const char *, ...);
-	int		d86_flags;
-	uint_t		d86_imm_bytes;
-#endif
-	void		*d86_data;
-} dis86_t;
-
-extern int dtrace_disx86(dis86_t *x, uint_t cpu_mode);
-
-#define	DIS_OP_OCTAL	0x1	/* Print all numbers in octal */
-
-#ifdef DIS_TEXT
-extern void dtrace_disx86_str(dis86_t *x, uint_t cpu_mode, uintptr_t pc,
-    char *buf, size_t len);
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _DIS_TABLES_H */
Index: src/external/cddl/osnet/dev/dtrace/i386/dtrace_asm.S
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/i386/dtrace_asm.S,v
retrieving revision 1.6
diff -u -p -r1.6 dtrace_asm.S
--- src/external/cddl/osnet/dev/dtrace/i386/dtrace_asm.S	27 Feb 2017 06:47:00 -0000	1.6
+++ src/external/cddl/osnet/dev/dtrace/i386/dtrace_asm.S	12 Apr 2017 16:19:26 -0000
@@ -21,7 +21,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/i386/dtrace_asm.S,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/i386/dtrace_asm.S 298171 2016-04-17 23:08:47Z markj $
  */
 /*
  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
@@ -146,11 +146,11 @@ invop_leave:
 	movl	8(%esp), %eax		/* load calling EIP */
 	incl	%eax			/* increment over LOCK prefix */
 	movl	%eax, -8(%ebx)		/* store calling EIP */
-	movl	%ebx, -4(%esp)		/* temporarily store new %esp */
+	subl	$8, %ebx		/* adjust for three pushes, one pop */
+	movl	%ebx, 8(%esp)		/* temporarily store new %esp */
 	popl	%ebx			/* pop off temp */
 	popl	%eax			/* pop off temp */
-	movl	-12(%esp), %esp		/* set stack pointer */
-	subl	$8, %esp		/* adjust for three pushes, one pop */
+	movl	(%esp), %esp		/* set stack pointer */
 	iret				/* return from interrupt */
 invop_nop:
 	/*
@@ -193,19 +193,7 @@ uint32_t dtrace_cas32(uint32_t *target, 
 */
 
 	ENTRY(dtrace_cas32)
-	movl	4(%esp), %edx
-	movl	8(%esp), %eax
-	movl	12(%esp), %ecx
-	lock
-	cmpxchgl %ecx, (%edx)
-	ret
-	END(dtrace_cas32)
-
-/*
-uint32_t dtrace_casptr(uint32_t *target, uint32_t cmp, uint32_t new)
-*/
-
-	ENTRY(dtrace_casptr)
+	ALTENTRY(dtrace_casptr)
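+	/*
+	 * On i386 a pointer is 32 bits wide, so dtrace_casptr can be an
+	 * alternate entry point into the dtrace_cas32 implementation
+	 * rather than a duplicate of it.
+	 */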
 	movl	4(%esp), %edx
 	movl	8(%esp), %eax
 	movl	12(%esp), %ecx
@@ -213,7 +201,7 @@ uint32_t dtrace_casptr(uint32_t *target,
 	cmpxchgl %ecx, (%edx)
 	ret
 	END(dtrace_casptr)
-
+	END(dtrace_cas32)
 
 /*
 uintptr_t dtrace_caller(int aframes)
Index: src/external/cddl/osnet/dev/dtrace/i386/dtrace_isa.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/i386/dtrace_isa.c,v
retrieving revision 1.5
diff -u -p -r1.5 dtrace_isa.c
--- src/external/cddl/osnet/dev/dtrace/i386/dtrace_isa.c	27 Feb 2017 06:47:00 -0000	1.5
+++ src/external/cddl/osnet/dev/dtrace/i386/dtrace_isa.c	8 May 2017 08:27:51 -0000
@@ -21,7 +21,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/i386/dtrace_isa.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/i386/dtrace_isa.c 298171 2016-04-17 23:08:47Z markj $
  */
 /*
  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
@@ -36,6 +36,8 @@
 #include <machine/vmparam.h>
 #include <machine/pmap.h>
 
+#include "regset.h"
+
 uintptr_t kernelbase = (uintptr_t)KERNBASE;
 
 #define INKERNEL(va) \
@@ -54,6 +56,8 @@ uint16_t dtrace_fuword16_nocheck(void *)
 uint32_t dtrace_fuword32_nocheck(void *);
 uint64_t dtrace_fuword64_nocheck(void *);
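+
+/*
+ * Upper bound on the number of frames dtrace_getustack_common() will
+ * walk, so a corrupt or circular user stack cannot wedge a probe.
+ */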
 
+int	dtrace_ustackdepth_max = 2048;
+
 void
 dtrace_getpcstack(pc_t *pcstack, int pcstack_limit, int aframes,
     uint32_t *intrpc)
@@ -112,11 +116,13 @@ dtrace_getustack_common(uint64_t *pcstac
 	uintptr_t oldcontext = lwp->lwp_oldcontext; /* XXX signal stack. */
 	size_t s1, s2;
 #endif
+	uintptr_t oldsp;
 	volatile uint16_t *flags =
 	    (volatile uint16_t *)&cpu_core[cpu_number()].cpuc_dtrace_flags;
 	int ret = 0;
 
 	ASSERT(pcstack == NULL || pcstack_limit > 0);
+	ASSERT(dtrace_ustackdepth_max > 0);
 
 #ifdef notyet /* XXX signal stack. */
 	if (p->p_model == DATAMODEL_NATIVE) {
@@ -129,7 +135,16 @@ dtrace_getustack_common(uint64_t *pcstac
 #endif
 
 	while (pc != 0) {
-		ret++;
+		/*
+		 * We limit the number of times we can go around this
+		 * loop to account for a circular stack.
+		 */
+		if (ret++ >= dtrace_ustackdepth_max) {
+			*flags |= CPU_DTRACE_BADSTACK;
+			cpu_core[cpu_number()].cpuc_dtrace_illval = sp;
+			break;
+		}
+
 		if (pcstack != NULL) {
 			*pcstack++ = (uint64_t)pc;
 			pcstack_limit--;
@@ -140,6 +155,8 @@ dtrace_getustack_common(uint64_t *pcstac
 		if (sp == 0)
 			break;
 
+		oldsp = sp;
+
 #ifdef notyet /* XXX signal stack. */ 
 		if (oldcontext == sp + s1 || oldcontext == sp + s2) {
 			if (p->p_model == DATAMODEL_NATIVE) {
@@ -178,6 +195,12 @@ dtrace_getustack_common(uint64_t *pcstac
 		sp = dtrace_fuword32((void *)sp);
 #endif /* ! notyet */
 
+		if (sp == oldsp) {
+			*flags |= CPU_DTRACE_BADSTACK;
+			cpu_core[cpu_number()].cpuc_dtrace_illval = sp;
+			break;
+		}
+
 		/*
 		 * This is totally bogus:  if we faulted, we're going to clear
 		 * the fault and break.  This is to deal with the apparently
@@ -242,7 +265,7 @@ dtrace_getupcstack(uint64_t *pcstack, in
 		pc = dtrace_fuword32((void *) sp);
 	}
 
-	n = dtrace_getustack_common(pcstack, pcstack_limit, pc, fp);
+	n = dtrace_getustack_common(pcstack, pcstack_limit, pc, sp);
 	ASSERT(n >= 0);
 	ASSERT(n <= pcstack_limit);
 
@@ -429,6 +452,7 @@ dtrace_getarg(int arg, int aframes)
 			stack = (uintptr_t *)&frame->tf_esp + 1;
 			goto load;
 		}
+
 	}
 
 	/*
@@ -480,112 +504,102 @@ dtrace_getstackdepth(int aframes)
 		return depth - aframes;
 }
 
-#ifdef notyet
 ulong_t
-dtrace_getreg(struct regs *rp, uint_t reg)
+dtrace_getreg(struct trapframe *rp, uint_t reg)
 {
-#if defined(__amd64)
-	int regmap[] = {
-		REG_GS,		/* GS */
-		REG_FS,		/* FS */
-		REG_ES,		/* ES */
-		REG_DS,		/* DS */
-		REG_RDI,	/* EDI */
-		REG_RSI,	/* ESI */
-		REG_RBP,	/* EBP */
-		REG_RSP,	/* ESP */
-		REG_RBX,	/* EBX */
-		REG_RDX,	/* EDX */
-		REG_RCX,	/* ECX */
-		REG_RAX,	/* EAX */
-		REG_TRAPNO,	/* TRAPNO */
-		REG_ERR,	/* ERR */
-		REG_RIP,	/* EIP */
-		REG_CS,		/* CS */
-		REG_RFL,	/* EFL */
-		REG_RSP,	/* UESP */
-		REG_SS		/* SS */
+	struct pcb *pcb;
+	int regmap[] = {  /* Order is dependent on reg.d */
+		REG_GS,		/* 0  GS */
+		REG_FS,		/* 1  FS */
+		REG_ES,		/* 2  ES */
+		REG_DS,		/* 3  DS */
+		REG_RDI,	/* 4  EDI */
+		REG_RSI,	/* 5  ESI */
+		REG_RBP,	/* 6  EBP, REG_FP */
+		REG_RSP,	/* 7  ESP */
+		REG_RBX,	/* 8  EBX */
+		REG_RDX,	/* 9  EDX, REG_R1 */
+		REG_RCX,	/* 10 ECX */
+		REG_RAX,	/* 11 EAX, REG_R0 */
+		REG_TRAPNO,	/* 12 TRAPNO */
+		REG_ERR,	/* 13 ERR */
+		REG_RIP,	/* 14 EIP, REG_PC */
+		REG_CS,		/* 15 CS */
+		REG_RFL,	/* 16 EFL, REG_PS */
+		REG_RSP,	/* 17 UESP, REG_SP */
+		REG_SS		/* 18 SS */
 	};
 
-	if (reg <= SS) {
-		if (reg >= sizeof (regmap) / sizeof (int)) {
-			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
-			return (0);
-		}
+	if (reg > SS) {
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
+		return (0);
+	}
 
-		reg = regmap[reg];
-	} else {
-		reg -= SS + 1;
+	if (reg >= sizeof (regmap) / sizeof (int)) {
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
+		return (0);
 	}
 
-	switch (reg) {
+	reg = regmap[reg];
+
+	switch (reg) {
+	case REG_GS:
+#ifdef __FreeBSD__
+		if ((pcb = curthread->td_pcb) == NULL) {
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
+			return (0);
+		}
+		return (pcb->pcb_gs);
+#endif
+#ifdef __NetBSD__
+		return (rp->tf_gs);
+#endif
+	case REG_FS:
+		return (rp->tf_fs);
+	case REG_ES:
+		return (rp->tf_es);
+	case REG_DS:
+		return (rp->tf_ds);
 	case REG_RDI:
-		return (rp->r_rdi);
+		return (rp->tf_edi);
 	case REG_RSI:
-		return (rp->r_rsi);
-	case REG_RDX:
-		return (rp->r_rdx);
+		return (rp->tf_esi);
+	case REG_RBP:
+		return (rp->tf_ebp);
+	case REG_RSP:
+#ifdef __FreeBSD__
+		return (rp->tf_isp);
+#endif
+#ifdef __NetBSD__
+		return (rp->tf_esp);
+#endif
+	case REG_RBX:
+		return (rp->tf_ebx);
 	case REG_RCX:
-		return (rp->r_rcx);
-	case REG_R8:
-		return (rp->r_r8);
-	case REG_R9:
-		return (rp->r_r9);
+		return (rp->tf_ecx);
 	case REG_RAX:
-		return (rp->r_rax);
-	case REG_RBX:
-		return (rp->r_rbx);
-	case REG_RBP:
-		return (rp->r_rbp);
-	case REG_R10:
-		return (rp->r_r10);
-	case REG_R11:
-		return (rp->r_r11);
-	case REG_R12:
-		return (rp->r_r12);
-	case REG_R13:
-		return (rp->r_r13);
-	case REG_R14:
-		return (rp->r_r14);
-	case REG_R15:
-		return (rp->r_r15);
-	case REG_DS:
-		return (rp->r_ds);
-	case REG_ES:
-		return (rp->r_es);
-	case REG_FS:
-		return (rp->r_fs);
-	case REG_GS:
-		return (rp->r_gs);
+		return (rp->tf_eax);
 	case REG_TRAPNO:
-		return (rp->r_trapno);
+		return (rp->tf_trapno);
 	case REG_ERR:
-		return (rp->r_err);
+		return (rp->tf_err);
 	case REG_RIP:
-		return (rp->r_rip);
+		return (rp->tf_eip);
 	case REG_CS:
-		return (rp->r_cs);
-	case REG_SS:
-		return (rp->r_ss);
+		return (rp->tf_cs);
 	case REG_RFL:
-		return (rp->r_rfl);
+		return (rp->tf_eflags);
+#if 0
 	case REG_RSP:
-		return (rp->r_rsp);
+		return (rp->tf_esp);
+#endif
+	case REG_SS:
+		return (rp->tf_ss);
 	default:
 		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
 		return (0);
 	}
-
-#else
-	if (reg > SS) {
-		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
-		return (0);
-	}
-
-	return ((&rp->r_gs)[reg]);
-#endif
 }
-#endif
 
 static int
 dtrace_copycheck(uintptr_t uaddr, uintptr_t kaddr, size_t size)
Index: src/external/cddl/osnet/dev/dtrace/i386/dtrace_subr.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/i386/dtrace_subr.c,v
retrieving revision 1.8
diff -u -p -r1.8 dtrace_subr.c
--- src/external/cddl/osnet/dev/dtrace/i386/dtrace_subr.c	27 Feb 2017 06:47:00 -0000	1.8
+++ src/external/cddl/osnet/dev/dtrace/i386/dtrace_subr.c	19 Apr 2017 17:15:40 -0000
@@ -21,7 +21,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/dev/dtrace/i386/dtrace_subr.c,v 1.3.2.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/dtrace/i386/dtrace_subr.c 313850 2017-02-17 03:27:20Z markj $
  *
  */
 /*
@@ -29,6 +29,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ */
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/types.h>
@@ -38,7 +42,6 @@
 #include <sys/xcall.h>
 #include <sys/cpu.h>
 #include <sys/cpuvar.h>
-//#include <sys/smp.h>
 #include <sys/dtrace_impl.h>
 #include <sys/dtrace_bsd.h>
 #include <machine/cpu.h>
@@ -51,8 +54,8 @@
 #include <x86/include/cpu_counter.h>
 
 extern uintptr_t 	kernelbase;
-extern uintptr_t 	dtrace_in_probe_addr;
-extern int		dtrace_in_probe;
+
+extern void dtrace_getnanotime(struct timespec *tsp);
 
 int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t);
 
@@ -162,122 +165,6 @@ dtrace_sync(void)
 }
 
 #ifdef notyet
-int (*dtrace_fasttrap_probe_ptr)(struct regs *);
-int (*dtrace_pid_probe_ptr)(struct regs *);
-int (*dtrace_return_probe_ptr)(struct regs *);
-
-void
-dtrace_user_probe(struct regs *rp, caddr_t addr, processorid_t cpuid)
-{
-	krwlock_t *rwp;
-	proc_t *p = curproc;
-	extern void trap(struct regs *, caddr_t, processorid_t);
-
-	if (USERMODE(rp->r_cs) || (rp->r_ps & PS_VM)) {
-		if (curthread->t_cred != p->p_cred) {
-			cred_t *oldcred = curthread->t_cred;
-			/*
-			 * DTrace accesses t_cred in probe context.  t_cred
-			 * must always be either NULL, or point to a valid,
-			 * allocated cred structure.
-			 */
-			curthread->t_cred = crgetcred();
-			crfree(oldcred);
-		}
-	}
-
-	if (rp->r_trapno == T_DTRACE_RET) {
-		uint8_t step = curthread->t_dtrace_step;
-		uint8_t ret = curthread->t_dtrace_ret;
-		uintptr_t npc = curthread->t_dtrace_npc;
-
-		if (curthread->t_dtrace_ast) {
-			aston(curthread);
-			curthread->t_sig_check = 1;
-		}
-
-		/*
-		 * Clear all user tracing flags.
-		 */
-		curthread->t_dtrace_ft = 0;
-
-		/*
-		 * If we weren't expecting to take a return probe trap, kill
-		 * the process as though it had just executed an unassigned
-		 * trap instruction.
-		 */
-		if (step == 0) {
-			tsignal(curthread, SIGILL);
-			return;
-		}
-
-		/*
-		 * If we hit this trap unrelated to a return probe, we're
-		 * just here to reset the AST flag since we deferred a signal
-		 * until after we logically single-stepped the instruction we
-		 * copied out.
-		 */
-		if (ret == 0) {
-			rp->r_pc = npc;
-			return;
-		}
-
-		/*
-		 * We need to wait until after we've called the
-		 * dtrace_return_probe_ptr function pointer to set %pc.
-		 */
-		rwp = &CPU->cpu_ft_lock;
-		rw_enter(rwp, RW_READER);
-		if (dtrace_return_probe_ptr != NULL)
-			(void) (*dtrace_return_probe_ptr)(rp);
-		rw_exit(rwp);
-		rp->r_pc = npc;
-
-	} else if (rp->r_trapno == T_DTRACE_PROBE) {
-		rwp = &CPU->cpu_ft_lock;
-		rw_enter(rwp, RW_READER);
-		if (dtrace_fasttrap_probe_ptr != NULL)
-			(void) (*dtrace_fasttrap_probe_ptr)(rp);
-		rw_exit(rwp);
-
-	} else if (rp->r_trapno == T_BPTFLT) {
-		uint8_t instr;
-		rwp = &CPU->cpu_ft_lock;
-
-		/*
-		 * The DTrace fasttrap provider uses the breakpoint trap
-		 * (int 3). We let DTrace take the first crack at handling
-		 * this trap; if it's not a probe that DTrace knowns about,
-		 * we call into the trap() routine to handle it like a
-		 * breakpoint placed by a conventional debugger.
-		 */
-		rw_enter(rwp, RW_READER);
-		if (dtrace_pid_probe_ptr != NULL &&
-		    (*dtrace_pid_probe_ptr)(rp) == 0) {
-			rw_exit(rwp);
-			return;
-		}
-		rw_exit(rwp);
-
-		/*
-		 * If the instruction that caused the breakpoint trap doesn't
-		 * look like an int 3 anymore, it may be that this tracepoint
-		 * was removed just after the user thread executed it. In
-		 * that case, return to user land to retry the instuction.
-		 */
-		if (fuword8((void *)(rp->r_pc - 1), &instr) == 0 &&
-		    instr != FASTTRAP_INSTR) {
-			rp->r_pc--;
-			return;
-		}
-
-		trap(rp, addr, cpuid);
-
-	} else {
-		trap(rp, addr, cpuid);
-	}
-}
-
 void
 dtrace_safe_synchronous_signal(void)
 {
@@ -323,14 +210,15 @@ dtrace_safe_defer_signal(void)
 	}
 
 	/*
-	 * If we've executed the original instruction, but haven't performed
-	 * the jmp back to t->t_dtrace_npc or the clean up of any registers
-	 * used to emulate %rip-relative instructions in 64-bit mode, do that
-	 * here and take the signal right away. We detect this condition by
-	 * seeing if the program counter is the range [scrpc + isz, astpc).
+	 * If we have executed the original instruction, but we have performed
+	 * neither the jmp back to t->t_dtrace_npc nor the clean up of any
+	 * registers used to emulate %rip-relative instructions in 64-bit mode,
+	 * we'll save ourselves some effort by doing that here and taking the
+	 * signal right away.  We detect this condition by seeing if the program
+	 * counter is in the range [scrpc + isz, astpc).
 	 */
-	if (t->t_dtrace_astpc - rp->r_pc <
-	    t->t_dtrace_astpc - t->t_dtrace_scrpc - isz) {
+	if (rp->r_pc >= t->t_dtrace_scrpc + isz &&
+	    rp->r_pc < t->t_dtrace_astpc) {
 #ifdef __amd64
 		/*
 		 * If there is a scratch register and we're on the
@@ -376,10 +264,8 @@ dtrace_safe_defer_signal(void)
 }
 #endif
 
-#if 0
 static int64_t	tgt_cpu_tsc;
 static int64_t	hst_cpu_tsc;
-#endif
 static int64_t	tsc_skew[MAXCPUS];
 static uint64_t	nsec_scale;
 
@@ -395,29 +281,6 @@ dtrace_rdtsc(void)
 	return (rv);
 }
 
-#if 0
-static void
-dtrace_gethrtime_init_sync(void *arg)
-{
-#ifdef CHECK_SYNC
-	/*
-	 * Delay this function from returning on one
-	 * of the CPUs to check that the synchronisation
-	 * works.
-	 */
-	uintptr_t cpu = (uintptr_t) arg;
-
-	if (cpu == curcpu) {
-		int i;
-		for (i = 0; i < 1000000000; i++)
-			tgt_cpu_tsc = dtrace_rdtsc();
-		tgt_cpu_tsc = 0;
-	}
-#endif
-}
-#endif
-
-#if 0
 static void
 dtrace_gethrtime_init_cpu(void *arg)
 {
@@ -428,7 +291,6 @@ dtrace_gethrtime_init_cpu(void *arg)
 	else
 		hst_cpu_tsc = dtrace_rdtsc();
 }
-#endif
 
 void
 dtrace_gethrtime_init(void *arg)
@@ -452,8 +314,8 @@ dtrace_gethrtime_init(void *arg)
 	 * another 32-bit integer without overflowing 64-bit.
 	 * Thus minimum supported TSC frequency is 62.5MHz.
 	 */
-	//KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)), ("TSC frequency is too low"));
-	KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)));
+	KASSERTMSG(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)),
+	    "TSC frequency is too low");
 
 	/*
 	 * We scale up NANOSEC/tsc_f ratio to preserve as much precision
@@ -476,26 +338,37 @@ dtrace_gethrtime_init(void *arg)
 
 	/* Already handled in x86/tsc.c for ci_data.cpu_cc_skew */
 #if 0
-	for (i = 0; i <= mp_maxid; i++) {
+	/* The current CPU is the reference one. */
+	sched_pin();
+	tsc_skew[curcpu] = 0;
+	CPU_FOREACH(i) {
 		if (i == curcpu)
 			continue;
 
-		if (pcpu_find(i) == NULL)
-			continue;
-
-		map = 0;
-		map |= (1 << curcpu);
-		map |= (1 << i);
+		pc = pcpu_find(i);
+		CPU_SETOF(PCPU_GET(cpuid), &map);
+		CPU_SET(pc->pc_cpuid, &map);
 
-		smp_rendezvous_cpus(map, dtrace_gethrtime_init_sync,
+		smp_rendezvous_cpus(map, NULL,
 		    dtrace_gethrtime_init_cpu,
 		    smp_no_rendevous_barrier, (void *)(uintptr_t) i);
 
 		tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc;
 	}
+	sched_unpin();
 #endif
 }
 
+#ifdef __FreeBSD__
+#ifdef EARLY_AP_STARTUP
+SYSINIT(dtrace_gethrtime_init, SI_SUB_DTRACE, SI_ORDER_ANY,
+    dtrace_gethrtime_init, NULL);
+#else
+SYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init,
+    NULL);
+#endif
+#endif
+
 /*
  * DTrace needs a high resolution time function which can
  * be called from a probe context and guaranteed not to have
@@ -526,27 +399,33 @@ dtrace_gethrtime()
 uint64_t
 dtrace_gethrestime(void)
 {
-	printf("%s(%d): XXX\n",__func__,__LINE__);
-	return (0);
+	struct timespec current_time;
+
+	dtrace_getnanotime(&current_time);
+
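+	/* Fold the timespec into a single 64-bit count of nanoseconds. */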
+	return (current_time.tv_sec * 1000000000ULL + current_time.tv_nsec);
 }
 
 /* Function to handle DTrace traps during probes. See i386/i386/trap.c */
 int
 dtrace_trap(struct trapframe *frame, u_int type)
 {
+	bool nofault;
 	cpuid_t cpuid = cpu_number();	/* current cpu id */
 
 	/*
 	 * A trap can occur while DTrace executes a probe. Before
 	 * executing the probe, DTrace blocks re-scheduling and sets
-	 * a flag in it's per-cpu flags to indicate that it doesn't
-	 * want to fault. On returning from the the probe, the no-fault
+	 * a flag in its per-cpu flags to indicate that it doesn't
+	 * want to fault. On returning from the probe, the no-fault
 	 * flag is cleared and finally re-scheduling is enabled.
 	 *
 	 * Check if DTrace has enabled 'no-fault' mode:
-	 *
 	 */
-	if ((cpu_core[cpuid].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) {
+	nofault = (cpu_core[cpuid].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0;
+	if (nofault) {
+		KASSERTMSG((read_eflags() & PSL_I) == 0, "interrupts enabled");
+
 		/*
 		 * There are only a couple of trap types that are expected.
 		 * All the rest will be handled in the usual way.
Index: src/external/cddl/osnet/dev/dtrace/i386/instr_size.c
===================================================================
RCS file: src/external/cddl/osnet/dev/dtrace/i386/instr_size.c
diff -N src/external/cddl/osnet/dev/dtrace/i386/instr_size.c
--- src/external/cddl/osnet/dev/dtrace/i386/instr_size.c	21 Feb 2010 01:46:33 -0000	1.2
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,134 +0,0 @@
-/*	$NetBSD: instr_size.c,v 1.2 2010/02/21 01:46:33 darran Exp $	*/
-
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- *
- * $FreeBSD: src/sys/cddl/dev/dtrace/i386/instr_size.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
- */
-/*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-/*	Copyright (c) 1988 AT&T	*/
-/*	  All Rights Reserved	*/
-
-
-#if defined(sun)
-#pragma ident	"@(#)instr_size.c	1.14	05/07/08 SMI"
-#endif
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/proc.h>
-#if defined(sun)
-#include <sys/cmn_err.h>
-#include <sys/archsystm.h>
-#include <sys/copyops.h>
-#include <vm/seg_enum.h>
-#include <sys/privregs.h>
-#else
-typedef	u_int			model_t;
-#define	DATAMODEL_NATIVE	0
-int dtrace_instr_size(uchar_t *);
-#endif
-
-#include <dis_tables.h>
-
-/*
- * This subsystem (with the minor exception of the instr_size() function) is
- * is called from DTrace probe context.  This imposes several requirements on
- * the implementation:
- *
- * 1. External subsystems and functions may not be referenced.  The one current
- *    exception is for cmn_err, but only to signal the detection of table
- *    errors.  Assuming the tables are correct, no combination of input is to
- *    trigger a cmn_err call.
- *
- * 2. These functions can't be allowed to be traced.  To prevent this,
- *    all functions in the probe path (everything except instr_size()) must
- *    have names that begin with "dtrace_".
- */
-
-typedef enum dis_isize {
-	DIS_ISIZE_INSTR,
-	DIS_ISIZE_OPERAND
-} dis_isize_t;
-
-
-/*
- * get a byte from instruction stream
- */
-static int
-dtrace_dis_get_byte(void *p)
-{
-	int ret;
-	uchar_t **instr = p;
-
-	ret = **instr;
-	*instr += 1;
-
-	return (ret);
-}
-
-/*
- * Returns either the size of a given instruction, in bytes, or the size of that
- * instruction's memory access (if any), depending on the value of `which'.
- * If a programming error in the tables is detected, the system will panic to
- * ease diagnosis.  Invalid instructions will not be flagged.  They will appear
- * to have an instruction size between 1 and the actual size, and will be
- * reported as having no memory impact.
- */
-/* ARGSUSED2 */
-static int
-dtrace_dis_isize(uchar_t *instr, dis_isize_t which, model_t model, int *rmindex)
-{
-	int sz;
-	dis86_t	x;
-	uint_t mode = SIZE32;
-
-#if defined(sun)
-	mode = (model == DATAMODEL_LP64) ? SIZE64 : SIZE32;
-#endif
-
-	x.d86_data = (void **)&instr;
-	x.d86_get_byte = dtrace_dis_get_byte;
-	x.d86_check_func = NULL;
-
-	if (dtrace_disx86(&x, mode) != 0)
-		return (-1);
-
-	if (which == DIS_ISIZE_INSTR)
-		sz = x.d86_len;		/* length of the instruction */
-	else
-		sz = x.d86_memsize;	/* length of memory operand */
-
-	if (rmindex != NULL)
-		*rmindex = x.d86_rmindex;
-	return (sz);
-}
-
-int
-dtrace_instr_size(uchar_t *instr)
-{
-	return (dtrace_dis_isize(instr, DIS_ISIZE_INSTR, DATAMODEL_NATIVE,
-	    NULL));
-}
Index: src/external/cddl/osnet/dev/dtrace/x86/dis_tables.c
===================================================================
RCS file: src/external/cddl/osnet/dev/dtrace/x86/dis_tables.c
diff -N src/external/cddl/osnet/dev/dtrace/x86/dis_tables.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dev/dtrace/x86/dis_tables.c	2 Mar 2017 10:54:26 -0000
@@ -0,0 +1,5597 @@
+/*
+ *
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+/*
+ * Copyright (c) 2010, Intel Corporation.
+ * All rights reserved.
+ */
+
+/*	Copyright (c) 1988 AT&T	*/
+/*	  All Rights Reserved  	*/
+
+/*
+ * $FreeBSD: head/sys/cddl/dev/dtrace/x86/dis_tables.c 313133 2017-02-03 03:22:47Z markj $
+ */
+
+#include	"dis_tables.h"
+
+/* BEGIN CSTYLED */
+
+/*
+ * Disassembly begins in dis_distable, which is equivalent to the One-byte
+ * Opcode Map in the Intel IA32 ISA Reference (page A-6 in my copy).  The
+ * decoding loops then traverse out through the other tables as necessary to
+ * decode a given instruction.
+ *
+ * The behavior of this file can be controlled by the following flags:
+ *
+ * 	DIS_TEXT	Include text for disassembly
+ * 	DIS_MEM		Include memory-size calculations
+ *
+ * Either or both of these can be defined.
+ *
+ * This file is not, and will never be, cstyled.  If anything, the tables should
+ * be taken out another tab stop or two so nothing overlaps.
+ */
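+
+/*
+ * Illustrative usage sketch (it mirrors the former i386/instr_size.c
+ * wrapper and is not code compiled here): a consumer points d86_data
+ * at a byte cursor, supplies a fetch callback, and reads d86_len back
+ * out once dtrace_disx86() succeeds.
+ *
+ *	static int
+ *	dis_get_byte(void *p)
+ *	{
+ *		uchar_t **instr = p;
+ *		int ret = **instr;	(fetch one opcode byte)
+ *
+ *		*instr += 1;		(advance the cursor)
+ *		return (ret);
+ *	}
+ *
+ *	dis86_t x;
+ *	uchar_t *instr = <start of instruction>;
+ *
+ *	x.d86_data = (void **)&instr;
+ *	x.d86_get_byte = dis_get_byte;
+ *	x.d86_check_func = NULL;
+ *	if (dtrace_disx86(&x, SIZE32) == 0)
+ *		size = x.d86_len;	(bytes the instruction occupies)
+ */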
+
+/*
+ * These functions must be provided for the consumer to do disassembly.
+ */
+#ifdef DIS_TEXT
+extern char *strncpy(char *, const char *, size_t);
+extern size_t strlen(const char *);
+extern int strcmp(const char *, const char *);
+extern int strncmp(const char *, const char *, size_t);
+extern size_t strlcat(char *, const char *, size_t);
+#endif
+
+
+#define		TERM 	0	/* used to indicate that the 'indirect' */
+				/* field terminates - no pointer.	*/
+
+/* Used to decode instructions. */
+typedef struct	instable {
+	struct instable	*it_indirect;	/* for decode op codes */
+	uchar_t		it_adrmode;
+#ifdef DIS_TEXT
+	char		it_name[NCPS];
+	uint_t		it_suffix:1;		/* mnem + "w", "l", or "d" */
+#endif
+#ifdef DIS_MEM
+	uint_t		it_size:16;
+#endif
+	uint_t		it_invalid64:1;		/* opcode invalid in amd64 */
+	uint_t		it_always64:1;		/* 64 bit when in 64 bit mode */
+	uint_t		it_invalid32:1;		/* invalid in IA32 */
+	uint_t		it_stackop:1;		/* push/pop stack operation */
+	uint_t		it_vexwoxmm:1;		/* VEX instructions that don't use XMM/YMM */
+	uint_t		it_avxsuf:1;		/* AVX suffix required */
+} instable_t;
+
+/*
+ * Instruction formats.
+ */
+enum {
+	UNKNOWN,
+	MRw,
+	IMlw,
+	IMw,
+	IR,
+	OA,
+	AO,
+	MS,
+	SM,
+	Mv,
+	Mw,
+	M,		/* register or memory */
+	MG9,		/* register or memory in group 9 (prefix optional) */
+	Mb,		/* register or memory, always byte sized */
+	MO,		/* memory only (no registers) */
+	PREF,
+	SWAPGS_RDTSCP,
+	MONITOR_MWAIT,
+	R,
+	RA,
+	SEG,
+	MR,
+	RM,
+	RM_66r,		/* RM, but with a required 0x66 prefix */ 
+	IA,
+	MA,
+	SD,
+	AD,
+	SA,
+	D,
+	INM,
+	SO,
+	BD,
+	I,
+	P,
+	V,
+	DSHIFT,		/* for double shift that has an 8-bit immediate */
+	U,
+	OVERRIDE,
+	NORM,		/* instructions w/o ModR/M byte, no memory access */
+	IMPLMEM,	/* instructions w/o ModR/M byte, implicit mem access */
+	O,		/* for call	*/
+	JTAB,		/* jump table 	*/
+	IMUL,		/* for 186 iimul instr  */
+	CBW,		/* so data16 can be evaluated for cbw and variants */
+	MvI,		/* for 186 logicals */
+	ENTER,		/* for 186 enter instr  */
+	RMw,		/* for 286 arpl instr */
+	Ib,		/* for push immediate byte */
+	F,		/* for 287 instructions */
+	FF,		/* for 287 instructions */
+	FFC,		/* for 287 instructions */
+	DM,		/* 16-bit data */
+	AM,		/* 16-bit addr */
+	LSEG,		/* for 3-bit seg reg encoding */
+	MIb,		/* for 386 logicals */
+	SREG,		/* for 386 special registers */
+	PREFIX,		/* a REP instruction prefix */
+	LOCK,		/* a LOCK instruction prefix */
+	INT3,		/* The int 3 instruction, which has a fake operand */
+	INTx,		/* The normal int instruction, with explicit int num */
+	DSHIFTcl,	/* for double shift that implicitly uses %cl */
+	CWD,		/* so data16 can be evaluated for cwd and variants */
+	RET,		/* single immediate 16-bit operand */
+	MOVZ,		/* for movs and movz, with different size operands */
+	CRC32,		/* for crc32, with different size operands */
+	XADDB,		/* for xaddb */
+	MOVSXZ,		/* AMD64 mov sign extend 32 to 64 bit instruction */
+	MOVBE,		/* movbe instruction */
+
+/*
+ * MMX/SIMD addressing modes.
+ */
+
+	MMO,		/* Prefixable MMX/SIMD-Int	mm/mem	-> mm */
+	MMOIMPL,	/* Prefixable MMX/SIMD-Int	mm	-> mm (mem) */
+	MMO3P,		/* Prefixable MMX/SIMD-Int	mm	-> r32,imm8 */
+	MMOM3,		/* Prefixable MMX/SIMD-Int	mm	-> r32 	*/
+	MMOS,		/* Prefixable MMX/SIMD-Int	mm	-> mm/mem */
+	MMOMS,		/* Prefixable MMX/SIMD-Int	mm	-> mem */
+	MMOPM,		/* MMX/SIMD-Int			mm/mem	-> mm,imm8 */
+	MMOPM_66o,	/* MMX/SIMD-Int 0x66 optional	mm/mem	-> mm,imm8 */
+	MMOPRM,		/* Prefixable MMX/SIMD-Int	r32/mem	-> mm,imm8 */
+	MMOSH,		/* Prefixable MMX		mm,imm8	*/
+	MM,		/* MMX/SIMD-Int			mm/mem	-> mm	*/
+	MMS,		/* MMX/SIMD-Int			mm	-> mm/mem */
+	MMSH,		/* MMX				mm,imm8 */
+	XMMO,		/* Prefixable SIMD		xmm/mem	-> xmm */
+	XMMOS,		/* Prefixable SIMD		xmm	-> xmm/mem */
+	XMMOPM,		/* Prefixable SIMD		xmm/mem	w/to xmm,imm8 */
+	XMMOMX,		/* Prefixable SIMD		mm/mem	-> xmm */
+	XMMOX3,		/* Prefixable SIMD		xmm	-> r32 */
+	XMMOXMM,	/* Prefixable SIMD		xmm/mem	-> mm	*/
+	XMMOM,		/* Prefixable SIMD		xmm	-> mem */
+	XMMOMS,		/* Prefixable SIMD		mem	-> xmm */
+	XMM,		/* SIMD 			xmm/mem	-> xmm */
+	XMM_66r,	/* SIMD 0x66 prefix required	xmm/mem	-> xmm */
+	XMM_66o,	/* SIMD 0x66 prefix optional 	xmm/mem	-> xmm */
+	XMMXIMPL,	/* SIMD				xmm	-> xmm (mem) */
+	XMM3P,		/* SIMD				xmm	-> r32,imm8 */
+	XMM3PM_66r,	/* SIMD 0x66 prefix required	xmm	-> r32/mem,imm8 */
+	XMMP,		/* SIMD 			xmm/mem w/to xmm,imm8 */
+	XMMP_66o,	/* SIMD 0x66 prefix optional	xmm/mem w/to xmm,imm8 */
+	XMMP_66r,	/* SIMD 0x66 prefix required	xmm/mem w/to xmm,imm8 */
+	XMMPRM,		/* SIMD 			r32/mem -> xmm,imm8 */
+	XMMPRM_66r,	/* SIMD 0x66 prefix required	r32/mem -> xmm,imm8 */
+	XMMS,		/* SIMD				xmm	-> xmm/mem */
+	XMMM,		/* SIMD 			mem	-> xmm */
+	XMMM_66r,	/* SIMD	0x66 prefix required	mem	-> xmm */
+	XMMMS,		/* SIMD				xmm	-> mem */
+	XMM3MX,		/* SIMD 			r32/mem -> xmm */
+	XMM3MXS,	/* SIMD 			xmm	-> r32/mem */
+	XMMSH,		/* SIMD 			xmm,imm8 */
+	XMMXM3,		/* SIMD 			xmm/mem -> r32 */
+	XMMX3,		/* SIMD 			xmm	-> r32 */
+	XMMXMM,		/* SIMD 			xmm/mem	-> mm */
+	XMMMX,		/* SIMD 			mm	-> xmm */
+	XMMXM,		/* SIMD 			xmm	-> mm */
+	XMMX2I,		/* SIMD				xmm -> xmm, imm, imm */
+	XMM2I,		/* SIMD				xmm, imm, imm */
+	XMMFENCE,	/* SIMD lfence or mfence */
+	XMMSFNC,	/* SIMD sfence (none or mem) */
+	XGETBV_XSETBV,
+	VEX_NONE,	/* VEX  no operand */
+	VEX_MO,		/* VEX	mod_rm		               -> implicit reg */
+	VEX_RMrX,	/* VEX  VEX.vvvv, mod_rm               -> mod_reg */
+	VEX_VRMrX,	/* VEX  mod_rm, VEX.vvvv               -> mod_rm */
+	VEX_RRX,	/* VEX  VEX.vvvv, mod_reg              -> mod_rm */
+	VEX_RMRX,	/* VEX  VEX.vvvv, mod_rm, imm8[7:4]    -> mod_reg */
+	VEX_MX,         /* VEX  mod_rm                         -> mod_reg */
+	VEX_MXI,        /* VEX  mod_rm, imm8                   -> mod_reg */
+	VEX_XXI,        /* VEX  mod_rm, imm8                   -> VEX.vvvv */
+	VEX_MR,         /* VEX  mod_rm                         -> mod_reg */
+	VEX_RRI,        /* VEX  mod_reg, mod_rm                -> implicit(eflags/r32) */
+	VEX_RX,         /* VEX  mod_reg                        -> mod_rm */
+	VEX_RR,         /* VEX  mod_rm                         -> mod_reg */
+	VEX_RRi,        /* VEX  mod_rm, imm8                   -> mod_reg */
+	VEX_RM,         /* VEX  mod_reg                        -> mod_rm */
+	VEX_RIM,	/* VEX  mod_reg, imm8                  -> mod_rm */
+	VEX_RRM,        /* VEX  VEX.vvvv, mod_reg              -> mod_rm */
+	VEX_RMX,        /* VEX  VEX.vvvv, mod_rm               -> mod_reg */
+	VEX_SbVM,	/* VEX  SIB, VEX.vvvv                  -> mod_rm */
+	VMx,		/* vmcall/vmlaunch/vmresume/vmxoff */
+	VMxo,		/* VMx instruction with optional prefix */
+	SVM,		/* AMD SVM instructions */
+	BLS,		/* BLSR, BLSMSK, BLSI */
+	FMA,		/* FMA instructions, all VEX_RMrX */
+	ADX		/* ADX instructions, support REX.w, mod_rm->mod_reg */
+};
+
+/*
+ * VEX prefixes
+ */
+#define VEX_2bytes	0xC5	/* the first byte of two-byte form */
+#define VEX_3bytes	0xC4	/* the first byte of three-byte form */
+
+#define	FILL	0x90	/* Fill byte used for alignment (nop)	*/
+
+/*
+** Register numbers for the i386
+*/
+#define	EAX_REGNO 0
+#define	ECX_REGNO 1
+#define	EDX_REGNO 2
+#define	EBX_REGNO 3
+#define	ESP_REGNO 4
+#define	EBP_REGNO 5
+#define	ESI_REGNO 6
+#define	EDI_REGNO 7
+
+/*
+ * modes for immediate values
+ */
+#define	MODE_NONE	0
+#define	MODE_IPREL	1	/* signed IP relative value */
+#define	MODE_SIGNED	2	/* sign extended immediate */
+#define	MODE_IMPLIED	3	/* constant value implied from opcode */
+#define	MODE_OFFSET	4	/* offset part of an address */
+#define	MODE_RIPREL	5	/* like IPREL, but from %rip (amd64) */
+
+/*
+ * The letters used in these macros are:
+ *   IND - indirect to another table
+ *   "T" - means to Terminate indirections (this is the final opcode)
+ *   "S" - means "operand length suffix required"
+ *   "Sa" - means AVX2 suffix (d/q) required
+ *   "NS" - means "no suffix" which is the operand length suffix of the opcode
+ *   "Z" - means instruction size arg required
+ *   "u" - means the opcode is invalid in IA32 but valid in amd64
+ *   "x" - means the opcode is invalid in amd64, but not IA32
+ *   "y" - means the operand size is always 64 bits in 64 bit mode
+ *   "p" - means push/pop stack operation
+ *   "vr" - means VEX instruction that operates on normal registers, not fpu
+ */
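+
+/*
+ * Two worked examples from the tables below: TNSZ("fxsave",M,512)
+ * names a Terminal entry with No operand-length Suffix and a memory
+ * siZe of 512 bytes, while TNSy("lldt",M) is terminal, unsuffixed,
+ * and ("y") always takes a 64-bit operand in 64-bit mode.
+ */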
+
+#if defined(DIS_TEXT) && defined(DIS_MEM)
+#define	IND(table)		{(instable_t *)table, 0, "", 0, 0, 0, 0, 0, 0}
+#define	INDx(table)		{(instable_t *)table, 0, "", 0, 0, 1, 0, 0, 0}
+#define	TNS(name, amode)	{TERM, amode, name, 0, 0, 0, 0, 0, 0}
+#define	TNSu(name, amode)	{TERM, amode, name, 0, 0, 0, 0, 1, 0}
+#define	TNSx(name, amode)	{TERM, amode, name, 0, 0, 1, 0, 0, 0}
+#define	TNSy(name, amode)	{TERM, amode, name, 0, 0, 0, 1, 0, 0}
+#define	TNSyp(name, amode)	{TERM, amode, name, 0, 0, 0, 1, 0, 1}
+#define	TNSZ(name, amode, sz)	{TERM, amode, name, 0, sz, 0, 0, 0, 0}
+#define	TNSZy(name, amode, sz)	{TERM, amode, name, 0, sz, 0, 1, 0, 0}
+#define	TNSZvr(name, amode, sz)	{TERM, amode, name, 0, sz, 0, 0, 0, 0, 1}
+#define	TS(name, amode)		{TERM, amode, name, 1, 0, 0, 0, 0, 0}
+#define	TSx(name, amode)	{TERM, amode, name, 1, 0, 1, 0, 0, 0}
+#define	TSy(name, amode)	{TERM, amode, name, 1, 0, 0, 1, 0, 0}
+#define	TSp(name, amode)	{TERM, amode, name, 1, 0, 0, 0, 0, 1}
+#define	TSZ(name, amode, sz)	{TERM, amode, name, 1, sz, 0, 0, 0, 0}
+#define	TSaZ(name, amode, sz)	{TERM, amode, name, 1, sz, 0, 0, 0, 0, 0, 1}
+#define	TSZx(name, amode, sz)	{TERM, amode, name, 1, sz, 1, 0, 0, 0}
+#define	TSZy(name, amode, sz)	{TERM, amode, name, 1, sz, 0, 1, 0, 0}
+#define	INVALID			{TERM, UNKNOWN, "", 0, 0, 0, 0, 0}
+#elif defined(DIS_TEXT)
+#define	IND(table)		{(instable_t *)table, 0, "", 0, 0, 0, 0, 0}
+#define	INDx(table)		{(instable_t *)table, 0, "", 0, 1, 0, 0, 0}
+#define	TNS(name, amode)	{TERM, amode, name, 0, 0, 0, 0, 0}
+#define	TNSu(name, amode)	{TERM, amode, name, 0, 0, 0, 1, 0}
+#define	TNSx(name, amode)	{TERM, amode, name, 0, 1, 0, 0, 0}
+#define	TNSy(name, amode)	{TERM, amode, name, 0, 0, 1, 0, 0}
+#define	TNSyp(name, amode)	{TERM, amode, name, 0, 0, 1, 0, 1}
+#define	TNSZ(name, amode, sz)	{TERM, amode, name, 0, 0, 0, 0, 0}
+#define	TNSZy(name, amode, sz)	{TERM, amode, name, 0, 0, 1, 0, 0}
+#define	TNSZvr(name, amode, sz)	{TERM, amode, name, 0, 0, 0, 0, 0, 1}
+#define	TS(name, amode)		{TERM, amode, name, 1, 0, 0, 0, 0}
+#define	TSx(name, amode)	{TERM, amode, name, 1, 1, 0, 0, 0}
+#define	TSy(name, amode)	{TERM, amode, name, 1, 0, 1, 0, 0}
+#define	TSp(name, amode)	{TERM, amode, name, 1, 0, 0, 0, 1}
+#define	TSZ(name, amode, sz)	{TERM, amode, name, 1, 0, 0, 0, 0}
+#define	TSaZ(name, amode, sz)	{TERM, amode, name, 1, 0, 0, 0, 0, 0, 1}
+#define	TSZx(name, amode, sz)	{TERM, amode, name, 1, 1, 0, 0, 0}
+#define	TSZy(name, amode, sz)	{TERM, amode, name, 1, 0, 1, 0, 0}
+#define	INVALID			{TERM, UNKNOWN, "", 0, 0, 0, 0, 0}
+#elif defined(DIS_MEM)
+#define	IND(table)		{(instable_t *)table, 0, 0, 0, 0, 0, 0}
+#define	INDx(table)		{(instable_t *)table, 0, 0, 1, 0, 0, 0}
+#define	TNS(name, amode)	{TERM, amode,  0, 0, 0, 0, 0}
+#define	TNSu(name, amode)	{TERM, amode,  0, 0, 0, 1, 0}
+#define	TNSy(name, amode)	{TERM, amode,  0, 0, 1, 0, 0}
+#define	TNSyp(name, amode)	{TERM, amode,  0, 0, 1, 0, 1}
+#define	TNSx(name, amode)	{TERM, amode,  0, 1, 0, 0, 0}
+#define	TNSZ(name, amode, sz)	{TERM, amode, sz, 0, 0, 0, 0}
+#define	TNSZy(name, amode, sz)	{TERM, amode, sz, 0, 1, 0, 0}
+#define	TNSZvr(name, amode, sz)	{TERM, amode, sz, 0, 0, 0, 0, 1}
+#define	TS(name, amode)		{TERM, amode,  0, 0, 0, 0, 0}
+#define	TSx(name, amode)	{TERM, amode,  0, 1, 0, 0, 0}
+#define	TSy(name, amode)	{TERM, amode,  0, 0, 1, 0, 0}
+#define	TSp(name, amode)	{TERM, amode,  0, 0, 0, 0, 1}
+#define	TSZ(name, amode, sz)	{TERM, amode, sz, 0, 0, 0, 0}
+#define	TSaZ(name, amode, sz)	{TERM, amode, sz, 0, 0, 0, 0, 0, 1}
+#define	TSZx(name, amode, sz)	{TERM, amode, sz, 1, 0, 0, 0}
+#define	TSZy(name, amode, sz)	{TERM, amode, sz, 0, 1, 0, 0}
+#define	INVALID			{TERM, UNKNOWN, 0, 0, 0, 0, 0}
+#else
+#define	IND(table)		{(instable_t *)table, 0, 0, 0, 0, 0}
+#define	INDx(table)		{(instable_t *)table, 0, 1, 0, 0, 0}
+#define	TNS(name, amode)	{TERM, amode,  0, 0, 0, 0}
+#define	TNSu(name, amode)	{TERM, amode,  0, 0, 1, 0}
+#define	TNSy(name, amode)	{TERM, amode,  0, 1, 0, 0}
+#define	TNSyp(name, amode)	{TERM, amode,  0, 1, 0, 1}
+#define	TNSx(name, amode)	{TERM, amode,  1, 0, 0, 0}
+#define	TNSZ(name, amode, sz)	{TERM, amode,  0, 0, 0, 0}
+#define	TNSZy(name, amode, sz)	{TERM, amode,  0, 1, 0, 0}
+#define	TNSZvr(name, amode, sz)	{TERM, amode,  0, 0, 0, 0, 1}
+#define	TS(name, amode)		{TERM, amode,  0, 0, 0, 0}
+#define	TSx(name, amode)	{TERM, amode,  1, 0, 0, 0}
+#define	TSy(name, amode)	{TERM, amode,  0, 1, 0, 0}
+#define	TSp(name, amode)	{TERM, amode,  0, 0, 0, 1}
+#define	TSZ(name, amode, sz)	{TERM, amode,  0, 0, 0, 0}
+#define	TSaZ(name, amode, sz)	{TERM, amode,  0, 0, 0, 0, 0, 1}
+#define	TSZx(name, amode, sz)	{TERM, amode,  1, 0, 0, 0}
+#define	TSZy(name, amode, sz)	{TERM, amode,  0, 1, 0, 0}
+#define	INVALID			{TERM, UNKNOWN, 0, 0, 0, 0}
+#endif
+
+#ifdef DIS_TEXT
+/*
+ * this decodes the r_m field for modes 0, 1, 2 in 16 bit mode
+ */
+const char *const dis_addr16[3][8] = {
+"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di)", "",
+									"(%bx)",
+"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di", "(%bp)",
+									"(%bx)",
+"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di)", "(%bp)",
+									"(%bx)",
+};
+
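+/*
+ * For example, in mode 0 the r_m value 6 selects a direct 16-bit
+ * address rather than a register pair, hence the empty string in the
+ * first row; modes 1 and 2 use "(%bp)" in that slot instead.
+ */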
+
+/*
+ * This decodes 32 bit addressing mode r_m field for modes 0, 1, 2
+ */
+const char *const dis_addr32_mode0[16] = {
+  "(%eax)", "(%ecx)", "(%edx)",  "(%ebx)",  "", "",        "(%esi)",  "(%edi)",
+  "(%r8d)", "(%r9d)", "(%r10d)", "(%r11d)", "", "",        "(%r14d)", "(%r15d)"
+};
+
+const char *const dis_addr32_mode12[16] = {
+  "(%eax)", "(%ecx)", "(%edx)",  "(%ebx)",  "", "(%ebp)",  "(%esi)",  "(%edi)",
+  "(%r8d)", "(%r9d)", "(%r10d)", "(%r11d)", "", "(%r13d)", "(%r14d)", "(%r15d)"
+};
+
+/*
+ * This decodes 64 bit addressing mode r_m field for modes 0, 1, 2
+ */
+const char *const dis_addr64_mode0[16] = {
+ "(%rax)", "(%rcx)", "(%rdx)", "(%rbx)", "",       "(%rip)", "(%rsi)", "(%rdi)",
+ "(%r8)",  "(%r9)",  "(%r10)", "(%r11)", "(%r12)", "(%rip)", "(%r14)", "(%r15)"
+};
+const char *const dis_addr64_mode12[16] = {
+ "(%rax)", "(%rcx)", "(%rdx)", "(%rbx)", "",       "(%rbp)", "(%rsi)", "(%rdi)",
+ "(%r8)",  "(%r9)",  "(%r10)", "(%r11)", "(%r12)", "(%r13)", "(%r14)", "(%r15)"
+};
+
+/*
+ * decode for scale from SIB byte
+ */
+const char *const dis_scale_factor[4] = { ")", ",2)", ",4)", ",8)" };
+
+/*
+ * decode for scale from VSIB byte, note that we always include the scale factor
+ * to match gas.
+ */
+const char *const dis_vscale_factor[4] = { ",1)", ",2)", ",4)", ",8)" };
+
+/*
+ * register decoding for normal references to registers (i.e. not addressing)
+ */
+const char *const dis_REG8[16] = {
+	"%al",  "%cl",  "%dl",   "%bl",   "%ah",   "%ch",   "%dh",   "%bh",
+	"%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b"
+};
+
+const char *const dis_REG8_REX[16] = {
+	"%al",  "%cl",  "%dl",   "%bl",   "%spl",  "%bpl",  "%sil",  "%dil",
+	"%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b"
+};
+
+const char *const dis_REG16[16] = {
+	"%ax",  "%cx",  "%dx",   "%bx",   "%sp",   "%bp",   "%si",   "%di",
+	"%r8w", "%r9w", "%r10w", "%r11w", "%r12w", "%r13w", "%r14w", "%r15w"
+};
+
+const char *const dis_REG32[16] = {
+	"%eax", "%ecx", "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
+	"%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d"
+};
+
+const char *const dis_REG64[16] = {
+	"%rax", "%rcx", "%rdx",  "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
+	"%r8",  "%r9",  "%r10",  "%r11", "%r12", "%r13", "%r14", "%r15"
+};
+
+const char *const dis_DEBUGREG[16] = {
+	"%db0", "%db1", "%db2",  "%db3",  "%db4",  "%db5",  "%db6",  "%db7",
+	"%db8", "%db9", "%db10", "%db11", "%db12", "%db13", "%db14", "%db15"
+};
+
+const char *const dis_CONTROLREG[16] = {
+    "%cr0", "%cr1", "%cr2", "%cr3", "%cr4", "%cr5?", "%cr6?", "%cr7?",
+    "%cr8", "%cr9?", "%cr10?", "%cr11?", "%cr12?", "%cr13?", "%cr14?", "%cr15?"
+};
+
+const char *const dis_TESTREG[16] = {
+	"%tr0?", "%tr1?", "%tr2?", "%tr3", "%tr4", "%tr5", "%tr6", "%tr7",
+	"%tr0?", "%tr1?", "%tr2?", "%tr3", "%tr4", "%tr5", "%tr6", "%tr7"
+};
+
+const char *const dis_MMREG[16] = {
+	"%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7",
+	"%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
+};
+
+const char *const dis_XMMREG[16] = {
+    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+    "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
+};
+
+const char *const dis_YMMREG[16] = {
+    "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7",
+    "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15"
+};
+
+const char *const dis_SEGREG[16] = {
+	"%es", "%cs", "%ss", "%ds", "%fs", "%gs", "<reserved>", "<reserved>",
+	"%es", "%cs", "%ss", "%ds", "%fs", "%gs", "<reserved>", "<reserved>"
+};
+
+/*
+ * SIMD predicate suffixes
+ */
+const char *const dis_PREDSUFFIX[8] = {
+	"eq", "lt", "le", "unord", "neq", "nlt", "nle", "ord"
+};
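+
+/*
+ * These pair with the cmp{ps,ss,pd,sd} family: a compare whose imm8
+ * predicate is 1, for instance, is rendered as "cmp" + "lt" + the
+ * original two-character suffix, e.g. "cmpltps".
+ */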
+
+const char *const dis_AVXvgrp7[3][8] = {
+	/*0	1	2		3		4		5	6		7*/
+/*71*/	{"",	"",	"vpsrlw",	"",		"vpsraw",	"",	"vpsllw",	""},
+/*72*/	{"",	"",	"vpsrld",	"",		"vpsrad",	"",	"vpslld",	""},
+/*73*/	{"",	"",	"vpsrlq",	"vpsrldq",	"",		"",	"vpsllq",	"vpslldq"}
+};
+
+#endif	/* DIS_TEXT */
+
+/*
+ *	"decode table" for 64 bit mode MOVSXD instruction (opcode 0x63)
+ */
+const instable_t dis_opMOVSLD = TNS("movslq",MOVSXZ);
+
+/*
+ *	"decode table" for pause and clflush instructions
+ */
+const instable_t dis_opPause = TNS("pause", NORM);
+
+/*
+ *	Decode table for 0x0F00 opcodes
+ */
+const instable_t dis_op0F00[8] = {
+
+/*  [0]  */	TNS("sldt",M),		TNS("str",M),		TNSy("lldt",M), 	TNSy("ltr",M),
+/*  [4]  */	TNSZ("verr",M,2),	TNSZ("verw",M,2),	INVALID,		INVALID,
+};
+
+
+/*
+ *	Decode table for 0x0F01 opcodes
+ */
+const instable_t dis_op0F01[8] = {
+
+/*  [0]  */	TNSZ("sgdt",VMx,6),	TNSZ("sidt",MONITOR_MWAIT,6),	TNSZ("lgdt",XGETBV_XSETBV,6),	TNSZ("lidt",SVM,6),
+/*  [4]  */	TNSZ("smsw",M,2),	INVALID, 		TNSZ("lmsw",M,2),	TNS("invlpg",SWAPGS_RDTSCP),
+};
+
+/*
+ *	Decode table for 0x0F18 opcodes -- SIMD prefetch
+ */
+const instable_t dis_op0F18[8] = {
+
+/*  [0]  */	TNS("prefetchnta",PREF),TNS("prefetcht0",PREF),	TNS("prefetcht1",PREF),	TNS("prefetcht2",PREF),
+/*  [4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+
+/*
+ * 	Decode table for 0x0FAE opcodes -- SIMD state save/restore
+ */
+const instable_t dis_op0FAE[8] = {
+/*  [0]  */	TNSZ("fxsave",M,512),	TNSZ("fxrstor",M,512),	TNS("ldmxcsr",M),	TNS("stmxcsr",M),
+/*  [4]  */	TNSZ("xsave",M,512),	TNS("lfence",XMMFENCE), TNS("mfence",XMMFENCE),	TNS("sfence",XMMSFNC),
+};
+
+/*
+ *	Decode table for 0x0FBA opcodes
+ */
+
+const instable_t dis_op0FBA[8] = {
+
+/*  [0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4]  */	TS("bt",MIb),		TS("bts",MIb),		TS("btr",MIb),		TS("btc",MIb),
+};
+
+/*
+ * 	Decode table for 0x0FC7 opcode (group 9)
+ */
+
+const instable_t dis_op0FC7[8] = {
+
+/*  [0]  */	INVALID,		TNS("cmpxchg8b",M),	INVALID,		INVALID,
+/*  [4]  */	INVALID,		INVALID,		TNS("vmptrld",MG9),	TNS("vmptrst",MG9),
+};
+
+/*
+ * 	Decode table for 0x0FC7 opcode (group 9) mode 3
+ */
+
+const instable_t dis_op0FC7m3[8] = {
+
+/*  [0]  */	INVALID,		INVALID,	INVALID,		INVALID,
+/*  [4]  */	INVALID,		INVALID,	TNS("rdrand",MG9),	TNS("rdseed", MG9),
+};
+
+/*
+ * 	Decode table for 0x0FC7 opcode with 0x66 prefix
+ */
+
+const instable_t dis_op660FC7[8] = {
+
+/*  [0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4]  */	INVALID,		INVALID,		TNS("vmclear",M),	INVALID,
+};
+
+/*
+ * 	Decode table for 0x0FC7 opcode with 0xF3 prefix
+ */
+
+const instable_t dis_opF30FC7[8] = {
+
+/*  [0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4]  */	INVALID,		INVALID,		TNS("vmxon",M),		INVALID,
+};
+
+/*
+ *	Decode table for 0x0FC8 opcode -- 486 bswap instruction
+ *
+ * bit pattern: 0000 1111 1100 1reg
+ */
+const instable_t dis_op0FC8[4] = {
+/*  [0]  */	TNS("bswap",R),		INVALID,		INVALID,		INVALID,
+};
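+
+/*
+ * Illustrative sketch, not part of the decoder proper (the helper name is
+ * hypothetical): bswap encodes its register in the low three bits of the
+ * opcode byte rather than in a ModR/M byte, so 0x0FC8 through 0x0FCF all
+ * decode through the single entry above.
+ */
+static int
+dis_bswap_reg(unsigned char opcode)
+{
+	return (opcode & 0x7);	/* extended to 16 registers by REX.B */
+}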
+
+/*
+ *	Decode table for 0x0F71, 0x0F72, and 0x0F73 opcodes -- MMX instructions
+ */
+const instable_t dis_op0F7123[4][8] = {
+{
+/*  [70].0 */	INVALID,		INVALID,		INVALID,		INVALID,
+/*      .4 */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [71].0 */	INVALID,		INVALID,		TNS("psrlw",MMOSH),	INVALID,
+/*      .4 */	TNS("psraw",MMOSH),	INVALID,		TNS("psllw",MMOSH),	INVALID,
+}, {
+/*  [72].0 */	INVALID,		INVALID,		TNS("psrld",MMOSH),	INVALID,
+/*      .4 */	TNS("psrad",MMOSH),	INVALID,		TNS("pslld",MMOSH),	INVALID,
+}, {
+/*  [73].0 */	INVALID,		INVALID,		TNS("psrlq",MMOSH),	TNS("INVALID",MMOSH),
+/*      .4 */	INVALID,		INVALID, 		TNS("psllq",MMOSH),	TNS("INVALID",MMOSH),
+} };
+
+/*
+ *	Decode table for SIMD extensions to above 0x0F71-0x0F73 opcodes.
+ */
+const instable_t dis_opSIMD7123[32] = {
+/* [70].0 */	INVALID,		INVALID,		INVALID,		INVALID,
+/*     .4 */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/* [71].0 */	INVALID,		INVALID,		TNS("psrlw",XMMSH),	INVALID,
+/*     .4 */	TNS("psraw",XMMSH),	INVALID,		TNS("psllw",XMMSH),	INVALID,
+
+/* [72].0 */	INVALID,		INVALID,		TNS("psrld",XMMSH),	INVALID,
+/*     .4 */	TNS("psrad",XMMSH),	INVALID,		TNS("pslld",XMMSH),	INVALID,
+
+/* [73].0 */	INVALID,		INVALID,		TNS("psrlq",XMMSH),	TNS("psrldq",XMMSH),
+/*     .4 */	INVALID,		INVALID,		TNS("psllq",XMMSH),	TNS("pslldq",XMMSH),
+};
+
+/*
+ *	SIMD instructions have been wedged into the existing IA32 instruction
+ *	set through the use of prefixes.  That is, while 0x0f 0x58 may be
+ *	addps, 0xf3 0x0f 0x58 (literally, repz addps) is a completely different
+ *	instruction - addss.  At present, three prefixes have been co-opted in
+ *	this manner - operand size (0x66), repnz (0xf2) and repz (0xf3).  The
+ *	following tables are used to provide the prefixed instruction names.
+ *	The arrays are sparse, but they're fast.
+ */
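+
+/*
+ * Illustrative sketch, not the decoder's actual dispatch (the helper name
+ * is hypothetical): once the 0x0f escape byte has been seen, the mandatory
+ * prefix observed selects which of the three tables below supplies the
+ * instruction name.
+ */
+extern const instable_t dis_opSIMDdata16[256];
+extern const instable_t dis_opSIMDrepnz[256];
+extern const instable_t dis_opSIMDrepz[256];
+
+static const instable_t *
+dis_simd_table(int prefix)
+{
+	switch (prefix) {
+	case 0x66:	return (dis_opSIMDdata16);	/* operand size */
+	case 0xf2:	return (dis_opSIMDrepnz);	/* repnz */
+	case 0xf3:	return (dis_opSIMDrepz);	/* repz */
+	default:	return (NULL);	/* use the plain 0x0f table */
+	}
+}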
+
+/*
+ *	Decode table for SIMD instructions with the operand size (0x66) prefix.
+ */
+const instable_t dis_opSIMDdata16[256] = {
+/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [10]  */	TNSZ("movupd",XMM,16),	TNSZ("movupd",XMMS,16),	TNSZ("movlpd",XMMM,8),	TNSZ("movlpd",XMMMS,8),
+/*  [14]  */	TNSZ("unpcklpd",XMM,16),TNSZ("unpckhpd",XMM,16),TNSZ("movhpd",XMMM,8),	TNSZ("movhpd",XMMMS,8),
+/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [28]  */	TNSZ("movapd",XMM,16),	TNSZ("movapd",XMMS,16),	TNSZ("cvtpi2pd",XMMOMX,8),TNSZ("movntpd",XMMOMS,16),
+/*  [2C]  */	TNSZ("cvttpd2pi",XMMXMM,16),TNSZ("cvtpd2pi",XMMXMM,16),TNSZ("ucomisd",XMM,8),TNSZ("comisd",XMM,8),
+
+/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [50]  */	TNS("movmskpd",XMMOX3),	TNSZ("sqrtpd",XMM,16),	INVALID,		INVALID,
+/*  [54]  */	TNSZ("andpd",XMM,16),	TNSZ("andnpd",XMM,16),	TNSZ("orpd",XMM,16),	TNSZ("xorpd",XMM,16),
+/*  [58]  */	TNSZ("addpd",XMM,16),	TNSZ("mulpd",XMM,16),	TNSZ("cvtpd2ps",XMM,16),TNSZ("cvtps2dq",XMM,16),
+/*  [5C]  */	TNSZ("subpd",XMM,16),	TNSZ("minpd",XMM,16),	TNSZ("divpd",XMM,16),	TNSZ("maxpd",XMM,16),
+
+/*  [60]  */	TNSZ("punpcklbw",XMM,16),TNSZ("punpcklwd",XMM,16),TNSZ("punpckldq",XMM,16),TNSZ("packsswb",XMM,16),
+/*  [64]  */	TNSZ("pcmpgtb",XMM,16),	TNSZ("pcmpgtw",XMM,16),	TNSZ("pcmpgtd",XMM,16),	TNSZ("packuswb",XMM,16),
+/*  [68]  */	TNSZ("punpckhbw",XMM,16),TNSZ("punpckhwd",XMM,16),TNSZ("punpckhdq",XMM,16),TNSZ("packssdw",XMM,16),
+/*  [6C]  */	TNSZ("punpcklqdq",XMM,16),TNSZ("punpckhqdq",XMM,16),TNSZ("movd",XMM3MX,4),TNSZ("movdqa",XMM,16),
+
+/*  [70]  */	TNSZ("pshufd",XMMP,16),	INVALID,		INVALID,		INVALID,
+/*  [74]  */	TNSZ("pcmpeqb",XMM,16),	TNSZ("pcmpeqw",XMM,16),	TNSZ("pcmpeqd",XMM,16),	INVALID,
+/*  [78]  */	TNSZ("extrq",XMM2I,16),	TNSZ("extrq",XMM,16), INVALID,		INVALID,
+/*  [7C]  */	TNSZ("haddpd",XMM,16),	TNSZ("hsubpd",XMM,16),	TNSZ("movd",XMM3MXS,4),	TNSZ("movdqa",XMMS,16),
+
+/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [C0]  */	INVALID,		INVALID,		TNSZ("cmppd",XMMP,16),	INVALID,
+/*  [C4]  */	TNSZ("pinsrw",XMMPRM,2),TNS("pextrw",XMM3P),	TNSZ("shufpd",XMMP,16),	INVALID,
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [D0]  */	TNSZ("addsubpd",XMM,16),TNSZ("psrlw",XMM,16),	TNSZ("psrld",XMM,16),	TNSZ("psrlq",XMM,16),
+/*  [D4]  */	TNSZ("paddq",XMM,16),	TNSZ("pmullw",XMM,16),	TNSZ("movq",XMMS,8),	TNS("pmovmskb",XMMX3),
+/*  [D8]  */	TNSZ("psubusb",XMM,16),	TNSZ("psubusw",XMM,16),	TNSZ("pminub",XMM,16),	TNSZ("pand",XMM,16),
+/*  [DC]  */	TNSZ("paddusb",XMM,16),	TNSZ("paddusw",XMM,16),	TNSZ("pmaxub",XMM,16),	TNSZ("pandn",XMM,16),
+
+/*  [E0]  */	TNSZ("pavgb",XMM,16),	TNSZ("psraw",XMM,16),	TNSZ("psrad",XMM,16),	TNSZ("pavgw",XMM,16),
+/*  [E4]  */	TNSZ("pmulhuw",XMM,16),	TNSZ("pmulhw",XMM,16),	TNSZ("cvttpd2dq",XMM,16),TNSZ("movntdq",XMMS,16),
+/*  [E8]  */	TNSZ("psubsb",XMM,16),	TNSZ("psubsw",XMM,16),	TNSZ("pminsw",XMM,16),	TNSZ("por",XMM,16),
+/*  [EC]  */	TNSZ("paddsb",XMM,16),	TNSZ("paddsw",XMM,16),	TNSZ("pmaxsw",XMM,16),	TNSZ("pxor",XMM,16),
+
+/*  [F0]  */	INVALID,		TNSZ("psllw",XMM,16),	TNSZ("pslld",XMM,16),	TNSZ("psllq",XMM,16),
+/*  [F4]  */	TNSZ("pmuludq",XMM,16),	TNSZ("pmaddwd",XMM,16),	TNSZ("psadbw",XMM,16),	TNSZ("maskmovdqu", XMMXIMPL,16),
+/*  [F8]  */	TNSZ("psubb",XMM,16),	TNSZ("psubw",XMM,16),	TNSZ("psubd",XMM,16),	TNSZ("psubq",XMM,16),
+/*  [FC]  */	TNSZ("paddb",XMM,16),	TNSZ("paddw",XMM,16),	TNSZ("paddd",XMM,16),	INVALID,
+};
+
+const instable_t dis_opAVX660F[256] = {
+/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [10]  */	TNSZ("vmovupd",VEX_MX,16),	TNSZ("vmovupd",VEX_RX,16),	TNSZ("vmovlpd",VEX_RMrX,8),	TNSZ("vmovlpd",VEX_RM,8),
+/*  [14]  */	TNSZ("vunpcklpd",VEX_RMrX,16),TNSZ("vunpckhpd",VEX_RMrX,16),TNSZ("vmovhpd",VEX_RMrX,8),	TNSZ("vmovhpd",VEX_RM,8),
+/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [28]  */	TNSZ("vmovapd",VEX_MX,16),	TNSZ("vmovapd",VEX_RX,16),	INVALID,		TNSZ("vmovntpd",VEX_RM,16),
+/*  [2C]  */	INVALID,		INVALID,		TNSZ("vucomisd",VEX_MX,8),TNSZ("vcomisd",VEX_MX,8),
+
+/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [50]  */	TNS("vmovmskpd",VEX_MR),	TNSZ("vsqrtpd",VEX_MX,16),	INVALID,		INVALID,
+/*  [54]  */	TNSZ("vandpd",VEX_RMrX,16),	TNSZ("vandnpd",VEX_RMrX,16),	TNSZ("vorpd",VEX_RMrX,16),	TNSZ("vxorpd",VEX_RMrX,16),
+/*  [58]  */	TNSZ("vaddpd",VEX_RMrX,16),	TNSZ("vmulpd",VEX_RMrX,16),	TNSZ("vcvtpd2ps",VEX_MX,16),TNSZ("vcvtps2dq",VEX_MX,16),
+/*  [5C]  */	TNSZ("vsubpd",VEX_RMrX,16),	TNSZ("vminpd",VEX_RMrX,16),	TNSZ("vdivpd",VEX_RMrX,16),	TNSZ("vmaxpd",VEX_RMrX,16),
+
+/*  [60]  */	TNSZ("vpunpcklbw",VEX_RMrX,16),TNSZ("vpunpcklwd",VEX_RMrX,16),TNSZ("vpunpckldq",VEX_RMrX,16),TNSZ("vpacksswb",VEX_RMrX,16),
+/*  [64]  */	TNSZ("vpcmpgtb",VEX_RMrX,16),	TNSZ("vpcmpgtw",VEX_RMrX,16),	TNSZ("vpcmpgtd",VEX_RMrX,16),	TNSZ("vpackuswb",VEX_RMrX,16),
+/*  [68]  */	TNSZ("vpunpckhbw",VEX_RMrX,16),TNSZ("vpunpckhwd",VEX_RMrX,16),TNSZ("vpunpckhdq",VEX_RMrX,16),TNSZ("vpackssdw",VEX_RMrX,16),
+/*  [6C]  */	TNSZ("vpunpcklqdq",VEX_RMrX,16),TNSZ("vpunpckhqdq",VEX_RMrX,16),TNSZ("vmovd",VEX_MX,4),TNSZ("vmovdqa",VEX_MX,16),
+
+/*  [70]  */	TNSZ("vpshufd",VEX_MXI,16),	TNSZ("vgrp71",VEX_XXI,16),	TNSZ("vgrp72",VEX_XXI,16),		TNSZ("vgrp73",VEX_XXI,16),
+/*  [74]  */	TNSZ("vpcmpeqb",VEX_RMrX,16),	TNSZ("vpcmpeqw",VEX_RMrX,16),	TNSZ("vpcmpeqd",VEX_RMrX,16),	INVALID,
+/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [7C]  */	TNSZ("vhaddpd",VEX_RMrX,16),	TNSZ("vhsubpd",VEX_RMrX,16),	TNSZ("vmovd",VEX_RR,4),	TNSZ("vmovdqa",VEX_RX,16),
+
+/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [C0]  */	INVALID,		INVALID,		TNSZ("vcmppd",VEX_RMRX,16),	INVALID,
+/*  [C4]  */	TNSZ("vpinsrw",VEX_RMRX,2),TNS("vpextrw",VEX_MR),	TNSZ("vshufpd",VEX_RMRX,16),	INVALID,
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [D0]  */	TNSZ("vaddsubpd",VEX_RMrX,16),TNSZ("vpsrlw",VEX_RMrX,16),	TNSZ("vpsrld",VEX_RMrX,16),	TNSZ("vpsrlq",VEX_RMrX,16),
+/*  [D4]  */	TNSZ("vpaddq",VEX_RMrX,16),	TNSZ("vpmullw",VEX_RMrX,16),	TNSZ("vmovq",VEX_RX,8),	TNS("vpmovmskb",VEX_MR),
+/*  [D8]  */	TNSZ("vpsubusb",VEX_RMrX,16),	TNSZ("vpsubusw",VEX_RMrX,16),	TNSZ("vpminub",VEX_RMrX,16),	TNSZ("vpand",VEX_RMrX,16),
+/*  [DC]  */	TNSZ("vpaddusb",VEX_RMrX,16),	TNSZ("vpaddusw",VEX_RMrX,16),	TNSZ("vpmaxub",VEX_RMrX,16),	TNSZ("vpandn",VEX_RMrX,16),
+
+/*  [E0]  */	TNSZ("vpavgb",VEX_RMrX,16),	TNSZ("vpsraw",VEX_RMrX,16),	TNSZ("vpsrad",VEX_RMrX,16),	TNSZ("vpavgw",VEX_RMrX,16),
+/*  [E4]  */	TNSZ("vpmulhuw",VEX_RMrX,16),	TNSZ("vpmulhw",VEX_RMrX,16),	TNSZ("vcvttpd2dq",VEX_MX,16),TNSZ("vmovntdq",VEX_RM,16),
+/*  [E8]  */	TNSZ("vpsubsb",VEX_RMrX,16),	TNSZ("vpsubsw",VEX_RMrX,16),	TNSZ("vpminsw",VEX_RMrX,16),	TNSZ("vpor",VEX_RMrX,16),
+/*  [EC]  */	TNSZ("vpaddsb",VEX_RMrX,16),	TNSZ("vpaddsw",VEX_RMrX,16),	TNSZ("vpmaxsw",VEX_RMrX,16),	TNSZ("vpxor",VEX_RMrX,16),
+
+/*  [F0]  */	INVALID,		TNSZ("vpsllw",VEX_RMrX,16),	TNSZ("vpslld",VEX_RMrX,16),	TNSZ("vpsllq",VEX_RMrX,16),
+/*  [F4]  */	TNSZ("vpmuludq",VEX_RMrX,16),	TNSZ("vpmaddwd",VEX_RMrX,16),	TNSZ("vpsadbw",VEX_RMrX,16),	TNS("vmaskmovdqu",VEX_MX),
+/*  [F8]  */	TNSZ("vpsubb",VEX_RMrX,16),	TNSZ("vpsubw",VEX_RMrX,16),	TNSZ("vpsubd",VEX_RMrX,16),	TNSZ("vpsubq",VEX_RMrX,16),
+/*  [FC]  */	TNSZ("vpaddb",VEX_RMrX,16),	TNSZ("vpaddw",VEX_RMrX,16),	TNSZ("vpaddd",VEX_RMrX,16),	INVALID,
+};
+
+/*
+ *	Decode table for SIMD instructions with the repnz (0xf2) prefix.
+ */
+const instable_t dis_opSIMDrepnz[256] = {
+/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [10]  */	TNSZ("movsd",XMM,8),	TNSZ("movsd",XMMS,8),	TNSZ("movddup",XMM,8),	INVALID,
+/*  [14]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [28]  */	INVALID,		INVALID,		TNSZ("cvtsi2sd",XMM3MX,4),TNSZ("movntsd",XMMMS,8),
+/*  [2C]  */	TNSZ("cvttsd2si",XMMXM3,8),TNSZ("cvtsd2si",XMMXM3,8),INVALID,		INVALID,
+
+/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [50]  */	INVALID,		TNSZ("sqrtsd",XMM,8),	INVALID,		INVALID,
+/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [58]  */	TNSZ("addsd",XMM,8),	TNSZ("mulsd",XMM,8),	TNSZ("cvtsd2ss",XMM,8),	INVALID,
+/*  [5C]  */	TNSZ("subsd",XMM,8),	TNSZ("minsd",XMM,8),	TNSZ("divsd",XMM,8),	TNSZ("maxsd",XMM,8),
+
+/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [6C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [70]  */	TNSZ("pshuflw",XMMP,16),INVALID,		INVALID,		INVALID,
+/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [78]  */	TNSZ("insertq",XMMX2I,16),TNSZ("insertq",XMM,8),INVALID,		INVALID,
+/*  [7C]  */	TNSZ("haddps",XMM,16),	TNSZ("hsubps",XMM,16),	INVALID,		INVALID,
+
+/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [C0]  */	INVALID,		INVALID,		TNSZ("cmpsd",XMMP,8),	INVALID,
+/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [D0]  */	TNSZ("addsubps",XMM,16),INVALID,		INVALID,		INVALID,
+/*  [D4]  */	INVALID,		INVALID,		TNS("movdq2q",XMMXM),	INVALID,
+/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [DC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E4]  */	INVALID,		INVALID,		TNSZ("cvtpd2dq",XMM,16),INVALID,
+/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [F0]  */	TNS("lddqu",XMMM),	INVALID,		INVALID,		INVALID,
+/*  [F4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+
+const instable_t dis_opAVXF20F[256] = {
+/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [10]  */	TNSZ("vmovsd",VEX_RMrX,8),	TNSZ("vmovsd",VEX_RRX,8),	TNSZ("vmovddup",VEX_MX,8),	INVALID,
+/*  [14]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [28]  */	INVALID,		INVALID,		TNSZ("vcvtsi2sd",VEX_RMrX,4),INVALID,
+/*  [2C]  */	TNSZ("vcvttsd2si",VEX_MR,8),TNSZ("vcvtsd2si",VEX_MR,8),INVALID,		INVALID,
+
+/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [50]  */	INVALID,		TNSZ("vsqrtsd",VEX_RMrX,8),	INVALID,		INVALID,
+/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [58]  */	TNSZ("vaddsd",VEX_RMrX,8),	TNSZ("vmulsd",VEX_RMrX,8),	TNSZ("vcvtsd2ss",VEX_RMrX,8),	INVALID,
+/*  [5C]  */	TNSZ("vsubsd",VEX_RMrX,8),	TNSZ("vminsd",VEX_RMrX,8),	TNSZ("vdivsd",VEX_RMrX,8),	TNSZ("vmaxsd",VEX_RMrX,8),
+
+/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [6C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [70]  */	TNSZ("vpshuflw",VEX_MXI,16),INVALID,		INVALID,		INVALID,
+/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [7C]  */	TNSZ("vhaddps",VEX_RMrX,8),	TNSZ("vhsubps",VEX_RMrX,8),	INVALID,		INVALID,
+
+/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [C0]  */	INVALID,		INVALID,		TNSZ("vcmpsd",VEX_RMRX,8),	INVALID,
+/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [D0]  */	TNSZ("vaddsubps",VEX_RMrX,8),	INVALID,		INVALID,		INVALID,
+/*  [D4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [DC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E4]  */	INVALID,		INVALID,		TNSZ("vcvtpd2dq",VEX_MX,16),INVALID,
+/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [F0]  */	TNSZ("vlddqu",VEX_MX,16),	INVALID,		INVALID,		INVALID,
+/*  [F4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+
+const instable_t dis_opAVXF20F3A[256] = {
+/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [10]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [14]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [28]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [2C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [50]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [58]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [5C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [6C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [70]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [7C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [C0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [DC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [F0]  */	TNSZvr("rorx",VEX_MXI,6),INVALID,		INVALID,		INVALID,
+/*  [F4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+
+const instable_t dis_opAVXF20F38[256] = {
+/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [10]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [14]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [28]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [2C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [50]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [58]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [5C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [6C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [70]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [7C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [C0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [DC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [F0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F4]  */	INVALID,		TNSZvr("pdep",VEX_RMrX,5),TNSZvr("mulx",VEX_RMrX,5),TNSZvr("shrx",VEX_VRMrX,5),
+/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+
+const instable_t dis_opAVXF30F38[256] = {
+/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [10]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [14]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [28]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [2C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [50]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [58]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [5C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [6C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [70]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [7C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [C0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [DC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [F0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F4]  */	INVALID,		TNSZvr("pext",VEX_RMrX,5),INVALID,		TNSZvr("sarx",VEX_VRMrX,5),
+/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+
+/*
+ *	Decode table for SIMD instructions with the repz (0xf3) prefix.
+ */
+const instable_t dis_opSIMDrepz[256] = {
+/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [10]  */	TNSZ("movss",XMM,4),	TNSZ("movss",XMMS,4),	TNSZ("movsldup",XMM,16),INVALID,
+/*  [14]  */	INVALID,		INVALID,		TNSZ("movshdup",XMM,16),INVALID,
+/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [28]  */	INVALID,		INVALID,		TNSZ("cvtsi2ss",XMM3MX,4),TNSZ("movntss",XMMMS,4),
+/*  [2C]  */	TNSZ("cvttss2si",XMMXM3,4),TNSZ("cvtss2si",XMMXM3,4),INVALID,		INVALID,
+
+/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [50]  */	INVALID,		TNSZ("sqrtss",XMM,4),	TNSZ("rsqrtss",XMM,4),	TNSZ("rcpss",XMM,4),
+/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [58]  */	TNSZ("addss",XMM,4),	TNSZ("mulss",XMM,4),	TNSZ("cvtss2sd",XMM,4),	TNSZ("cvttps2dq",XMM,16),
+/*  [5C]  */	TNSZ("subss",XMM,4),	TNSZ("minss",XMM,4),	TNSZ("divss",XMM,4),	TNSZ("maxss",XMM,4),
+
+/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [6C]  */	INVALID,		INVALID,		INVALID,		TNSZ("movdqu",XMM,16),
+
+/*  [70]  */	TNSZ("pshufhw",XMMP,16),INVALID,		INVALID,		INVALID,
+/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [7C]  */	INVALID,		INVALID,		TNSZ("movq",XMM,8),	TNSZ("movdqu",XMMS,16),
+
+/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B8]  */	TS("popcnt",MRw),	INVALID,		INVALID,		INVALID,
+/*  [BC]  */	TNSZ("tzcnt",MRw,5),	TS("lzcnt",MRw),	INVALID,		INVALID,
+
+/*  [C0]  */	INVALID,		INVALID,		TNSZ("cmpss",XMMP,4),	INVALID,
+/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D4]  */	INVALID,		INVALID,		TNS("movq2dq",XMMMX),	INVALID,
+/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [DC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E4]  */	INVALID,		INVALID,		TNSZ("cvtdq2pd",XMM,8),	INVALID,
+/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [F0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+
+const instable_t dis_opAVXF30F[256] = {
+/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [10]  */	TNSZ("vmovss",VEX_RMrX,4),	TNSZ("vmovss",VEX_RRX,4),	TNSZ("vmovsldup",VEX_MX,4),	INVALID,
+/*  [14]  */	INVALID,		INVALID,		TNSZ("vmovshdup",VEX_MX,4),	INVALID,
+/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [28]  */	INVALID,		INVALID,		TNSZ("vcvtsi2ss",VEX_RMrX,4),INVALID,
+/*  [2C]  */	TNSZ("vcvttss2si",VEX_MR,4),TNSZ("vcvtss2si",VEX_MR,4),INVALID,		INVALID,
+
+/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [50]  */	INVALID,		TNSZ("vsqrtss",VEX_RMrX,4),	TNSZ("vrsqrtss",VEX_RMrX,4),	TNSZ("vrcpss",VEX_RMrX,4),
+/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [58]  */	TNSZ("vaddss",VEX_RMrX,4),	TNSZ("vmulss",VEX_RMrX,4),	TNSZ("vcvtss2sd",VEX_RMrX,4),	TNSZ("vcvttps2dq",VEX_MX,16),
+/*  [5C]  */	TNSZ("vsubss",VEX_RMrX,4),	TNSZ("vminss",VEX_RMrX,4),	TNSZ("vdivss",VEX_RMrX,4),	TNSZ("vmaxss",VEX_RMrX,4),
+
+/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [6C]  */	INVALID,		INVALID,		INVALID,		TNSZ("vmovdqu",VEX_MX,16),
+
+/*  [70]  */	TNSZ("vpshufhw",VEX_MXI,16),INVALID,		INVALID,		INVALID,
+/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [7C]  */	INVALID,		INVALID,		TNSZ("vmovq",VEX_MX,8),	TNSZ("vmovdqu",VEX_RX,16),
+
+/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [C0]  */	INVALID,		INVALID,		TNSZ("vcmpss",VEX_RMRX,4),	INVALID,
+/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [DC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E4]  */	INVALID,		INVALID,		TNSZ("vcvtdq2pd",VEX_MX,8),	INVALID,
+/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [F0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+
+/*
+ * The following two tables are used to decode crc32 and movbe,
+ * since they share the same opcodes.
+ */
+const instable_t dis_op0F38F0[2] = {
+/*  [00]  */	TNS("crc32b",CRC32),
+		TS("movbe",MOVBE),
+};
+
+const instable_t dis_op0F38F1[2] = {
+/*  [00]  */	TS("crc32",CRC32),
+		TS("movbe",MOVBE),
+};
+
+/*
+ * The following table is used to distinguish between adox and adcx, which
+ * share the same opcode.
+ */
+const instable_t dis_op0F38F6[2] = {
+/*  [00]  */	TNS("adcx",ADX),
+		TNS("adox",ADX),
+};
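+
+/*
+ * Illustrative sketch, not the decoder's actual code path (the helper name
+ * is hypothetical): the mandatory prefix picks the row in the two-entry
+ * tables above.  A 0xf2 prefix on 0x0F38F0/F1 selects crc32 and its absence
+ * selects movbe; likewise 0x66 on 0x0F38F6 selects adcx and 0xf3 selects
+ * adox.
+ */
+static const instable_t *
+dis_sel_0f38f0(int rep_prefix)
+{
+	return (&dis_op0F38F0[rep_prefix == 0xf2 ? 0 : 1]);
+}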
+
+const instable_t dis_op0F38[256] = {
+/*  [00]  */	TNSZ("pshufb",XMM_66o,16),TNSZ("phaddw",XMM_66o,16),TNSZ("phaddd",XMM_66o,16),TNSZ("phaddsw",XMM_66o,16),
+/*  [04]  */	TNSZ("pmaddubsw",XMM_66o,16),TNSZ("phsubw",XMM_66o,16),	TNSZ("phsubd",XMM_66o,16),TNSZ("phsubsw",XMM_66o,16),
+/*  [08]  */	TNSZ("psignb",XMM_66o,16),TNSZ("psignw",XMM_66o,16),TNSZ("psignd",XMM_66o,16),TNSZ("pmulhrsw",XMM_66o,16),
+/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [10]  */	TNSZ("pblendvb",XMM_66r,16),INVALID,		INVALID,		INVALID,
+/*  [14]  */	TNSZ("blendvps",XMM_66r,16),TNSZ("blendvpd",XMM_66r,16),INVALID,	TNSZ("ptest",XMM_66r,16),
+/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [1C]  */	TNSZ("pabsb",XMM_66o,16),TNSZ("pabsw",XMM_66o,16),TNSZ("pabsd",XMM_66o,16),INVALID,
+
+/*  [20]  */	TNSZ("pmovsxbw",XMM_66r,16),TNSZ("pmovsxbd",XMM_66r,16),TNSZ("pmovsxbq",XMM_66r,16),TNSZ("pmovsxwd",XMM_66r,16),
+/*  [24]  */	TNSZ("pmovsxwq",XMM_66r,16),TNSZ("pmovsxdq",XMM_66r,16),INVALID,	INVALID,
+/*  [28]  */	TNSZ("pmuldq",XMM_66r,16),TNSZ("pcmpeqq",XMM_66r,16),TNSZ("movntdqa",XMMM_66r,16),TNSZ("packusdw",XMM_66r,16),
+/*  [2C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [30]  */	TNSZ("pmovzxbw",XMM_66r,16),TNSZ("pmovzxbd",XMM_66r,16),TNSZ("pmovzxbq",XMM_66r,16),TNSZ("pmovzxwd",XMM_66r,16),
+/*  [34]  */	TNSZ("pmovzxwq",XMM_66r,16),TNSZ("pmovzxdq",XMM_66r,16),INVALID,	TNSZ("pcmpgtq",XMM_66r,16),
+/*  [38]  */	TNSZ("pminsb",XMM_66r,16),TNSZ("pminsd",XMM_66r,16),TNSZ("pminuw",XMM_66r,16),TNSZ("pminud",XMM_66r,16),
+/*  [3C]  */	TNSZ("pmaxsb",XMM_66r,16),TNSZ("pmaxsd",XMM_66r,16),TNSZ("pmaxuw",XMM_66r,16),TNSZ("pmaxud",XMM_66r,16),
+
+/*  [40]  */	TNSZ("pmulld",XMM_66r,16),TNSZ("phminposuw",XMM_66r,16),INVALID,	INVALID,
+/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [50]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [58]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [5C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [6C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [70]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [7C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [80]  */	TNSy("invept", RM_66r),	TNSy("invvpid", RM_66r),TNSy("invpcid", RM_66r),INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [C0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C8]  */	TNSZ("sha1nexte",XMM,16),TNSZ("sha1msg1",XMM,16),TNSZ("sha1msg2",XMM,16),TNSZ("sha256rnds2",XMM,16),
+/*  [CC]  */	TNSZ("sha256msg1",XMM,16),TNSZ("sha256msg2",XMM,16),INVALID,		INVALID,
+
+/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D8]  */	INVALID,		INVALID,		INVALID,		TNSZ("aesimc",XMM_66r,16),
+/*  [DC]  */	TNSZ("aesenc",XMM_66r,16),TNSZ("aesenclast",XMM_66r,16),TNSZ("aesdec",XMM_66r,16),TNSZ("aesdeclast",XMM_66r,16),
+
+/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F0]  */	IND(dis_op0F38F0),	IND(dis_op0F38F1),	INVALID,		INVALID,
+/*  [F4]  */	INVALID,		INVALID,		IND(dis_op0F38F6),	INVALID,
+/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+
+const instable_t dis_opAVX660F38[256] = {
+/*  [00]  */	TNSZ("vpshufb",VEX_RMrX,16),TNSZ("vphaddw",VEX_RMrX,16),TNSZ("vphaddd",VEX_RMrX,16),TNSZ("vphaddsw",VEX_RMrX,16),
+/*  [04]  */	TNSZ("vpmaddubsw",VEX_RMrX,16),TNSZ("vphsubw",VEX_RMrX,16),	TNSZ("vphsubd",VEX_RMrX,16),TNSZ("vphsubsw",VEX_RMrX,16),
+/*  [08]  */	TNSZ("vpsignb",VEX_RMrX,16),TNSZ("vpsignw",VEX_RMrX,16),TNSZ("vpsignd",VEX_RMrX,16),TNSZ("vpmulhrsw",VEX_RMrX,16),
+/*  [0C]  */	TNSZ("vpermilps",VEX_RMrX,8),TNSZ("vpermilpd",VEX_RMrX,16),TNSZ("vtestps",VEX_RRI,8),	TNSZ("vtestpd",VEX_RRI,16),
+
+/*  [10]  */	INVALID,		INVALID,		INVALID,		TNSZ("vcvtph2ps",VEX_MX,16),
+/*  [14]  */	INVALID,		INVALID,		TNSZ("vpermps",VEX_RMrX,16),TNSZ("vptest",VEX_RRI,16),
+/*  [18]  */	TNSZ("vbroadcastss",VEX_MX,4),TNSZ("vbroadcastsd",VEX_MX,8),TNSZ("vbroadcastf128",VEX_MX,16),INVALID,
+/*  [1C]  */	TNSZ("vpabsb",VEX_MX,16),TNSZ("vpabsw",VEX_MX,16),TNSZ("vpabsd",VEX_MX,16),INVALID,
+
+/*  [20]  */	TNSZ("vpmovsxbw",VEX_MX,16),TNSZ("vpmovsxbd",VEX_MX,16),TNSZ("vpmovsxbq",VEX_MX,16),TNSZ("vpmovsxwd",VEX_MX,16),
+/*  [24]  */	TNSZ("vpmovsxwq",VEX_MX,16),TNSZ("vpmovsxdq",VEX_MX,16),INVALID,	INVALID,
+/*  [28]  */	TNSZ("vpmuldq",VEX_RMrX,16),TNSZ("vpcmpeqq",VEX_RMrX,16),TNSZ("vmovntdqa",VEX_MX,16),TNSZ("vpackusdw",VEX_RMrX,16),
+/*  [2C]  */	TNSZ("vmaskmovps",VEX_RMrX,8),TNSZ("vmaskmovpd",VEX_RMrX,16),TNSZ("vmaskmovps",VEX_RRM,8),TNSZ("vmaskmovpd",VEX_RRM,16),
+
+/*  [30]  */	TNSZ("vpmovzxbw",VEX_MX,16),TNSZ("vpmovzxbd",VEX_MX,16),TNSZ("vpmovzxbq",VEX_MX,16),TNSZ("vpmovzxwd",VEX_MX,16),
+/*  [34]  */	TNSZ("vpmovzxwq",VEX_MX,16),TNSZ("vpmovzxdq",VEX_MX,16),TNSZ("vpermd",VEX_RMrX,16),TNSZ("vpcmpgtq",VEX_RMrX,16),
+/*  [38]  */	TNSZ("vpminsb",VEX_RMrX,16),TNSZ("vpminsd",VEX_RMrX,16),TNSZ("vpminuw",VEX_RMrX,16),TNSZ("vpminud",VEX_RMrX,16),
+/*  [3C]  */	TNSZ("vpmaxsb",VEX_RMrX,16),TNSZ("vpmaxsd",VEX_RMrX,16),TNSZ("vpmaxuw",VEX_RMrX,16),TNSZ("vpmaxud",VEX_RMrX,16),
+
+/*  [40]  */	TNSZ("vpmulld",VEX_RMrX,16),TNSZ("vphminposuw",VEX_MX,16),INVALID,	INVALID,
+/*  [44]  */	INVALID,		TSaZ("vpsrlv",VEX_RMrX,16),TNSZ("vpsravd",VEX_RMrX,16),TSaZ("vpsllv",VEX_RMrX,16),
+/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [50]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [58]  */	TNSZ("vpbroadcastd",VEX_MX,16),TNSZ("vpbroadcastq",VEX_MX,16),TNSZ("vbroadcasti128",VEX_MX,16),INVALID,
+/*  [5C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [6C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [70]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [78]  */	TNSZ("vpbroadcastb",VEX_MX,16),TNSZ("vpbroadcastw",VEX_MX,16),INVALID,	INVALID,
+/*  [7C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	TSaZ("vpmaskmov",VEX_RMrX,16),INVALID,		TSaZ("vpmaskmov",VEX_RRM,16),INVALID,
+
+/*  [90]  */	TNSZ("vpgatherd",VEX_SbVM,16),TNSZ("vpgatherq",VEX_SbVM,16),TNSZ("vgatherdp",VEX_SbVM,16),TNSZ("vgatherqp",VEX_SbVM,16),
+/*  [94]  */	INVALID,		INVALID,		TNSZ("vfmaddsub132p",FMA,16),TNSZ("vfmsubadd132p",FMA,16),
+/*  [98]  */	TNSZ("vfmadd132p",FMA,16),TNSZ("vfmadd132s",FMA,16),TNSZ("vfmsub132p",FMA,16),TNSZ("vfmsub132s",FMA,16),
+/*  [9C]  */	TNSZ("vfnmadd132p",FMA,16),TNSZ("vfnmadd132s",FMA,16),TNSZ("vfnmsub132p",FMA,16),TNSZ("vfnmsub132s",FMA,16),
+
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		TNSZ("vfmaddsub213p",FMA,16),TNSZ("vfmsubadd213p",FMA,16),
+/*  [A8]  */	TNSZ("vfmadd213p",FMA,16),TNSZ("vfmadd213s",FMA,16),TNSZ("vfmsub213p",FMA,16),TNSZ("vfmsub213s",FMA,16),
+/*  [AC]  */	TNSZ("vfnmadd213p",FMA,16),TNSZ("vfnmadd213s",FMA,16),TNSZ("vfnmsub213p",FMA,16),TNSZ("vfnmsub213s",FMA,16),
+
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		TNSZ("vfmaddsub231p",FMA,16),TNSZ("vfmsubadd231p",FMA,16),
+/*  [B8]  */	TNSZ("vfmadd231p",FMA,16),TNSZ("vfmadd231s",FMA,16),TNSZ("vfmsub231p",FMA,16),TNSZ("vfmsub231s",FMA,16),
+/*  [BC]  */	TNSZ("vfnmadd231p",FMA,16),TNSZ("vfnmadd231s",FMA,16),TNSZ("vfnmsub231p",FMA,16),TNSZ("vfnmsub231s",FMA,16),
+
+/*  [C0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D8]  */	INVALID,		INVALID,		INVALID,		TNSZ("vaesimc",VEX_MX,16),
+/*  [DC]  */	TNSZ("vaesenc",VEX_RMrX,16),TNSZ("vaesenclast",VEX_RMrX,16),TNSZ("vaesdec",VEX_RMrX,16),TNSZ("vaesdeclast",VEX_RMrX,16),
+
+/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F0]  */	IND(dis_op0F38F0),	IND(dis_op0F38F1),	INVALID,		INVALID,
+/*  [F4]  */	INVALID,		INVALID,		INVALID,		TNSZvr("shlx",VEX_VRMrX,5),
+/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+
+const instable_t dis_op0F3A[256] = {
+/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [08]  */	TNSZ("roundps",XMMP_66r,16),TNSZ("roundpd",XMMP_66r,16),TNSZ("roundss",XMMP_66r,16),TNSZ("roundsd",XMMP_66r,16),
+/*  [0C]  */	TNSZ("blendps",XMMP_66r,16),TNSZ("blendpd",XMMP_66r,16),TNSZ("pblendw",XMMP_66r,16),TNSZ("palignr",XMMP_66o,16),
+
+/*  [10]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [14]  */	TNSZ("pextrb",XMM3PM_66r,8),TNSZ("pextrw",XMM3PM_66r,16),TSZ("pextr",XMM3PM_66r,16),TNSZ("extractps",XMM3PM_66r,16),
+/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [20]  */	TNSZ("pinsrb",XMMPRM_66r,8),TNSZ("insertps",XMMP_66r,16),TSZ("pinsr",XMMPRM_66r,16),INVALID,
+/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [28]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [2C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [40]  */	TNSZ("dpps",XMMP_66r,16),TNSZ("dppd",XMMP_66r,16),TNSZ("mpsadbw",XMMP_66r,16),INVALID,
+/*  [44]  */	TNSZ("pclmulqdq",XMMP_66r,16),INVALID,		INVALID,		INVALID,
+/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [50]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [58]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [5C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [60]  */	TNSZ("pcmpestrm",XMMP_66r,16),TNSZ("pcmpestri",XMMP_66r,16),TNSZ("pcmpistrm",XMMP_66r,16),TNSZ("pcmpistri",XMMP_66r,16),
+/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [6C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [70]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [7C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [C0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	TNSZ("sha1rnds4",XMMP,16),INVALID,		INVALID,		INVALID,
+
+/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [DC]  */	INVALID,		INVALID,		INVALID,		TNSZ("aeskeygenassist",XMMP_66r,16),
+
+/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [F0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+
+const instable_t dis_opAVX660F3A[256] = {
+/*  [00]  */	TNSZ("vpermq",VEX_MXI,16),TNSZ("vpermpd",VEX_MXI,16),TNSZ("vpblendd",VEX_RMRX,16),INVALID,
+/*  [04]  */	TNSZ("vpermilps",VEX_MXI,8),TNSZ("vpermilpd",VEX_MXI,16),TNSZ("vperm2f128",VEX_RMRX,16),INVALID,
+/*  [08]  */	TNSZ("vroundps",VEX_MXI,16),TNSZ("vroundpd",VEX_MXI,16),TNSZ("vroundss",VEX_RMRX,16),TNSZ("vroundsd",VEX_RMRX,16),
+/*  [0C]  */	TNSZ("vblendps",VEX_RMRX,16),TNSZ("vblendpd",VEX_RMRX,16),TNSZ("vpblendw",VEX_RMRX,16),TNSZ("vpalignr",VEX_RMRX,16),
+
+/*  [10]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [14]  */	TNSZ("vpextrb",VEX_RRi,8),TNSZ("vpextrw",VEX_RRi,16),TNSZ("vpextrd",VEX_RRi,16),TNSZ("vextractps",VEX_RM,16),
+/*  [18]  */	TNSZ("vinsertf128",VEX_RMRX,16),TNSZ("vextractf128",VEX_RX,16),INVALID,		INVALID,
+/*  [1C]  */	INVALID,		TNSZ("vcvtps2ph",VEX_RX,16),		INVALID,		INVALID,
+
+/*  [20]  */	TNSZ("vpinsrb",VEX_RMRX,8),TNSZ("vinsertps",VEX_RMRX,16),TNSZ("vpinsrd",VEX_RMRX,16),INVALID,
+/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [28]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [2C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [38]  */	TNSZ("vinserti128",VEX_RMRX,16),TNSZ("vextracti128",VEX_RIM,16),INVALID,		INVALID,
+/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [40]  */	TNSZ("vdpps",VEX_RMRX,16),TNSZ("vdppd",VEX_RMRX,16),TNSZ("vmpsadbw",VEX_RMRX,16),INVALID,
+/*  [44]  */	TNSZ("vpclmulqdq",VEX_RMRX,16),INVALID,		TNSZ("vperm2i128",VEX_RMRX,16),INVALID,
+/*  [48]  */	INVALID,		INVALID,		TNSZ("vblendvps",VEX_RMRX,8),	TNSZ("vblendvpd",VEX_RMRX,16),
+/*  [4C]  */	TNSZ("vpblendvb",VEX_RMRX,16),INVALID,		INVALID,		INVALID,
+
+/*  [50]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [54]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [58]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [5C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [60]  */	TNSZ("vpcmpestrm",VEX_MXI,16),TNSZ("vpcmpestri",VEX_MXI,16),TNSZ("vpcmpistrm",VEX_MXI,16),TNSZ("vpcmpistri",VEX_MXI,16),
+/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [6C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [70]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [74]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [7C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [AC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [C0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [DC]  */	INVALID,		INVALID,		INVALID,		TNSZ("vaeskeygenassist",VEX_MXI,16),
+
+/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+
+/*  [F0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+
+/*
+ * 	Decode table for 0x0F0D which uses the reg field of the ModRM byte
+ * 	to indicate a sub-code.
+ */
+const instable_t dis_op0F0D[8] = {
+/*  [00]  */	INVALID,		TNS("prefetchw",PREF),	TNS("prefetchwt1",PREF),INVALID,
+/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
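+
+/*
+ * Example (illustrative): the sequence 0F 0D 08 has ModRM 0x08, whose reg
+ * field is 1, selecting "prefetchw" from the table above.
+ */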
+
+/*
+ *	Decode table for 0x0F opcodes
+ */
+
+const instable_t dis_op0F[16][16] = {
+{
+/*  [00]  */	IND(dis_op0F00),	IND(dis_op0F01),	TNS("lar",MR),		TNS("lsl",MR),
+/*  [04]  */	INVALID,		TNS("syscall",NORM),	TNS("clts",NORM),	TNS("sysret",NORM),
+/*  [08]  */	TNS("invd",NORM),	TNS("wbinvd",NORM),	INVALID,		TNS("ud2",NORM),
+/*  [0C]  */	INVALID,		IND(dis_op0F0D),	INVALID,		INVALID,
+}, {
+/*  [10]  */	TNSZ("movups",XMMO,16),	TNSZ("movups",XMMOS,16),TNSZ("movlps",XMMO,8),	TNSZ("movlps",XMMOS,8),
+/*  [14]  */	TNSZ("unpcklps",XMMO,16),TNSZ("unpckhps",XMMO,16),TNSZ("movhps",XMMOM,8),TNSZ("movhps",XMMOMS,8),
+/*  [18]  */	IND(dis_op0F18),	INVALID,		INVALID,		INVALID,
+/*  [1C]  */	INVALID,		INVALID,		INVALID,		TS("nop",Mw),
+}, {
+/*  [20]  */	TSy("mov",SREG),	TSy("mov",SREG),	TSy("mov",SREG),	TSy("mov",SREG),
+/*  [24]  */	TSx("mov",SREG),	INVALID,		TSx("mov",SREG),	INVALID,
+/*  [28]  */	TNSZ("movaps",XMMO,16),	TNSZ("movaps",XMMOS,16),TNSZ("cvtpi2ps",XMMOMX,8),TNSZ("movntps",XMMOS,16),
+/*  [2C]  */	TNSZ("cvttps2pi",XMMOXMM,8),TNSZ("cvtps2pi",XMMOXMM,8),TNSZ("ucomiss",XMMO,4),TNSZ("comiss",XMMO,4),
+}, {
+/*  [30]  */	TNS("wrmsr",NORM),	TNS("rdtsc",NORM),	TNS("rdmsr",NORM),	TNS("rdpmc",NORM),
+/*  [34]  */	TNSx("sysenter",NORM),	TNSx("sysexit",NORM),	INVALID,		INVALID,
+/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [40]  */	TS("cmovx.o",MR),	TS("cmovx.no",MR),	TS("cmovx.b",MR),	TS("cmovx.ae",MR),
+/*  [44]  */	TS("cmovx.e",MR),	TS("cmovx.ne",MR),	TS("cmovx.be",MR),	TS("cmovx.a",MR),
+/*  [48]  */	TS("cmovx.s",MR),	TS("cmovx.ns",MR),	TS("cmovx.pe",MR),	TS("cmovx.po",MR),
+/*  [4C]  */	TS("cmovx.l",MR),	TS("cmovx.ge",MR),	TS("cmovx.le",MR),	TS("cmovx.g",MR),
+}, {
+/*  [50]  */	TNS("movmskps",XMMOX3),	TNSZ("sqrtps",XMMO,16),	TNSZ("rsqrtps",XMMO,16),TNSZ("rcpps",XMMO,16),
+/*  [54]  */	TNSZ("andps",XMMO,16),	TNSZ("andnps",XMMO,16),	TNSZ("orps",XMMO,16),	TNSZ("xorps",XMMO,16),
+/*  [58]  */	TNSZ("addps",XMMO,16),	TNSZ("mulps",XMMO,16),	TNSZ("cvtps2pd",XMMO,8),TNSZ("cvtdq2ps",XMMO,16),
+/*  [5C]  */	TNSZ("subps",XMMO,16),	TNSZ("minps",XMMO,16),	TNSZ("divps",XMMO,16),	TNSZ("maxps",XMMO,16),
+}, {
+/*  [60]  */	TNSZ("punpcklbw",MMO,4),TNSZ("punpcklwd",MMO,4),TNSZ("punpckldq",MMO,4),TNSZ("packsswb",MMO,8),
+/*  [64]  */	TNSZ("pcmpgtb",MMO,8),	TNSZ("pcmpgtw",MMO,8),	TNSZ("pcmpgtd",MMO,8),	TNSZ("packuswb",MMO,8),
+/*  [68]  */	TNSZ("punpckhbw",MMO,8),TNSZ("punpckhwd",MMO,8),TNSZ("punpckhdq",MMO,8),TNSZ("packssdw",MMO,8),
+/*  [6C]  */	TNSZ("INVALID",MMO,0),	TNSZ("INVALID",MMO,0),	TNSZ("movd",MMO,4),	TNSZ("movq",MMO,8),
+}, {
+/*  [70]  */	TNSZ("pshufw",MMOPM,8),	TNS("psrXXX",MR),	TNS("psrXXX",MR),	TNS("psrXXX",MR),
+/*  [74]  */	TNSZ("pcmpeqb",MMO,8),	TNSZ("pcmpeqw",MMO,8),	TNSZ("pcmpeqd",MMO,8),	TNS("emms",NORM),
+/*  [78]  */	TNSy("vmread",RM),	TNSy("vmwrite",MR),	INVALID,		INVALID,
+/*  [7C]  */	INVALID,		INVALID,		TNSZ("movd",MMOS,4),	TNSZ("movq",MMOS,8),
+}, {
+/*  [80]  */	TNS("jo",D),		TNS("jno",D),		TNS("jb",D),		TNS("jae",D),
+/*  [84]  */	TNS("je",D),		TNS("jne",D),		TNS("jbe",D),		TNS("ja",D),
+/*  [88]  */	TNS("js",D),		TNS("jns",D),		TNS("jp",D),		TNS("jnp",D),
+/*  [8C]  */	TNS("jl",D),		TNS("jge",D),		TNS("jle",D),		TNS("jg",D),
+}, {
+/*  [90]  */	TNS("seto",Mb),		TNS("setno",Mb),	TNS("setb",Mb),		TNS("setae",Mb),
+/*  [94]  */	TNS("sete",Mb),		TNS("setne",Mb),	TNS("setbe",Mb),	TNS("seta",Mb),
+/*  [98]  */	TNS("sets",Mb),		TNS("setns",Mb),	TNS("setp",Mb),		TNS("setnp",Mb),
+/*  [9C]  */	TNS("setl",Mb),		TNS("setge",Mb),	TNS("setle",Mb),	TNS("setg",Mb),
+}, {
+/*  [A0]  */	TSp("push",LSEG),	TSp("pop",LSEG),	TNS("cpuid",NORM),	TS("bt",RMw),
+/*  [A4]  */	TS("shld",DSHIFT),	TS("shld",DSHIFTcl),	INVALID,		INVALID,
+/*  [A8]  */	TSp("push",LSEG),	TSp("pop",LSEG),	TNS("rsm",NORM),	TS("bts",RMw),
+/*  [AC]  */	TS("shrd",DSHIFT),	TS("shrd",DSHIFTcl),	IND(dis_op0FAE),	TS("imul",MRw),
+}, {
+/*  [B0]  */	TNS("cmpxchgb",RMw),	TS("cmpxchg",RMw),	TS("lss",MR),		TS("btr",RMw),
+/*  [B4]  */	TS("lfs",MR),		TS("lgs",MR),		TS("movzb",MOVZ),	TNS("movzwl",MOVZ),
+/*  [B8]  */	TNS("INVALID",MRw),	INVALID,		IND(dis_op0FBA),	TS("btc",RMw),
+/*  [BC]  */	TS("bsf",MRw),		TS("bsr",MRw),		TS("movsb",MOVZ),	TNS("movswl",MOVZ),
+}, {
+/*  [C0]  */	TNS("xaddb",XADDB),	TS("xadd",RMw),		TNSZ("cmpps",XMMOPM,16),TNS("movnti",RM),
+/*  [C4]  */	TNSZ("pinsrw",MMOPRM,2),TNS("pextrw",MMO3P), 	TNSZ("shufps",XMMOPM,16),IND(dis_op0FC7),
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [D0]  */	INVALID,		TNSZ("psrlw",MMO,8),	TNSZ("psrld",MMO,8),	TNSZ("psrlq",MMO,8),
+/*  [D4]  */	TNSZ("paddq",MMO,8),	TNSZ("pmullw",MMO,8),	TNSZ("INVALID",MMO,0),	TNS("pmovmskb",MMOM3),
+/*  [D8]  */	TNSZ("psubusb",MMO,8),	TNSZ("psubusw",MMO,8),	TNSZ("pminub",MMO,8),	TNSZ("pand",MMO,8),
+/*  [DC]  */	TNSZ("paddusb",MMO,8),	TNSZ("paddusw",MMO,8),	TNSZ("pmaxub",MMO,8),	TNSZ("pandn",MMO,8),
+}, {
+/*  [E0]  */	TNSZ("pavgb",MMO,8),	TNSZ("psraw",MMO,8),	TNSZ("psrad",MMO,8),	TNSZ("pavgw",MMO,8),
+/*  [E4]  */	TNSZ("pmulhuw",MMO,8),	TNSZ("pmulhw",MMO,8),	TNS("INVALID",XMMO),	TNSZ("movntq",MMOMS,8),
+/*  [E8]  */	TNSZ("psubsb",MMO,8),	TNSZ("psubsw",MMO,8),	TNSZ("pminsw",MMO,8),	TNSZ("por",MMO,8),
+/*  [EC]  */	TNSZ("paddsb",MMO,8),	TNSZ("paddsw",MMO,8),	TNSZ("pmaxsw",MMO,8),	TNSZ("pxor",MMO,8),
+}, {
+/*  [F0]  */	INVALID,		TNSZ("psllw",MMO,8),	TNSZ("pslld",MMO,8),	TNSZ("psllq",MMO,8),
+/*  [F4]  */	TNSZ("pmuludq",MMO,8),	TNSZ("pmaddwd",MMO,8),	TNSZ("psadbw",MMO,8),	TNSZ("maskmovq",MMOIMPL,8),
+/*  [F8]  */	TNSZ("psubb",MMO,8),	TNSZ("psubw",MMO,8),	TNSZ("psubd",MMO,8),	TNSZ("psubq",MMO,8),
+/*  [FC]  */	TNSZ("paddb",MMO,8),	TNSZ("paddw",MMO,8),	TNSZ("paddd",MMO,8),	INVALID,
+} };
+
+const instable_t dis_opAVX0F[16][16] = {
+{
+/*  [00]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [08]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [0C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [10]  */	TNSZ("vmovups",VEX_MX,16),	TNSZ("vmovups",VEX_RM,16),TNSZ("vmovlps",VEX_RMrX,8),	TNSZ("vmovlps",VEX_RM,8),
+/*  [14]  */	TNSZ("vunpcklps",VEX_RMrX,16),TNSZ("vunpckhps",VEX_RMrX,16),TNSZ("vmovhps",VEX_RMrX,8),TNSZ("vmovhps",VEX_RM,8),
+/*  [18]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [1C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [20]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [24]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [28]  */	TNSZ("vmovaps",VEX_MX,16),	TNSZ("vmovaps",VEX_RX,16),INVALID,		TNSZ("vmovntps",VEX_RM,16),
+/*  [2C]  */	INVALID,		INVALID,		TNSZ("vucomiss",VEX_MX,4),TNSZ("vcomiss",VEX_MX,4),
+}, {
+/*  [30]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [34]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [38]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [3C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [40]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [44]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [48]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [4C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [50]  */	TNS("vmovmskps",VEX_MR),	TNSZ("vsqrtps",VEX_MX,16),	TNSZ("vrsqrtps",VEX_MX,16),TNSZ("vrcpps",VEX_MX,16),
+/*  [54]  */	TNSZ("vandps",VEX_RMrX,16),	TNSZ("vandnps",VEX_RMrX,16),	TNSZ("vorps",VEX_RMrX,16),	TNSZ("vxorps",VEX_RMrX,16),
+/*  [58]  */	TNSZ("vaddps",VEX_RMrX,16),	TNSZ("vmulps",VEX_RMrX,16),	TNSZ("vcvtps2pd",VEX_MX,8),TNSZ("vcvtdq2ps",VEX_MX,16),
+/*  [5C]  */	TNSZ("vsubps",VEX_RMrX,16),	TNSZ("vminps",VEX_RMrX,16),	TNSZ("vdivps",VEX_RMrX,16),	TNSZ("vmaxps",VEX_RMrX,16),
+}, {
+/*  [60]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [64]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [68]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [6C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [70]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [74]  */	INVALID,		INVALID,		INVALID,		TNS("vzeroupper", VEX_NONE),
+/*  [78]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [7C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [80]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [84]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [88]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [8C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [90]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [94]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [98]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [9C]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [A0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [A8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [AC]  */	INVALID,		INVALID,		TNSZ("vldmxcsr",VEX_MO,2),		INVALID,
+}, {
+/*  [B0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [B8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [BC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [C0]  */	INVALID,		INVALID,		TNSZ("vcmpps",VEX_RMRX,16),INVALID,
+/*  [C4]  */	INVALID,		INVALID,	 	TNSZ("vshufps",VEX_RMRX,16),INVALID,
+/*  [C8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [CC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [D0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [D8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [DC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [E0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [E8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [EC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [F0]  */	INVALID,		INVALID,		TNSZvr("andn",VEX_RMrX,5),TNSZvr("bls",BLS,5),
+/*  [F4]  */	INVALID,		TNSZvr("bzhi",VEX_VRMrX,5),INVALID,		TNSZvr("bextr",VEX_VRMrX,5),
+/*  [F8]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [FC]  */	INVALID,		INVALID,		INVALID,		INVALID,
+} };
+
+/*
+ *	Decode table for 0x80 opcodes
+ */
+
+const instable_t dis_op80[8] = {
+
+/*  [0]  */	TNS("addb",IMlw),	TNS("orb",IMw),		TNS("adcb",IMlw),	TNS("sbbb",IMlw),
+/*  [4]  */	TNS("andb",IMw),	TNS("subb",IMlw),	TNS("xorb",IMw),	TNS("cmpb",IMlw),
+};
+
+
+/*
+ *	Decode table for 0x81 opcodes.
+ */
+
+const instable_t dis_op81[8] = {
+
+/*  [0]  */	TS("add",IMlw),		TS("or",IMw),		TS("adc",IMlw),		TS("sbb",IMlw),
+/*  [4]  */	TS("and",IMw),		TS("sub",IMlw),		TS("xor",IMw),		TS("cmp",IMlw),
+};
+
+
+/*
+ *	Decode table for 0x82 opcodes.
+ */
+
+const instable_t dis_op82[8] = {
+
+/*  [0]  */	TNSx("addb",IMlw),	TNSx("orb",IMlw),	TNSx("adcb",IMlw),	TNSx("sbbb",IMlw),
+/*  [4]  */	TNSx("andb",IMlw),	TNSx("subb",IMlw),	TNSx("xorb",IMlw),	TNSx("cmpb",IMlw),
+};
+/*
+ *	Decode table for 0x83 opcodes.
+ */
+
+const instable_t dis_op83[8] = {
+
+/*  [0]  */	TS("add",IMlw),		TS("or",IMlw),		TS("adc",IMlw),		TS("sbb",IMlw),
+/*  [4]  */	TS("and",IMlw),		TS("sub",IMlw),		TS("xor",IMlw),		TS("cmp",IMlw),
+};
+
+/*
+ *	Decode table for 0xC0 opcodes.
+ */
+
+const instable_t dis_opC0[8] = {
+
+/*  [0]  */	TNS("rolb",MvI),	TNS("rorb",MvI),	TNS("rclb",MvI),	TNS("rcrb",MvI),
+/*  [4]  */	TNS("shlb",MvI),	TNS("shrb",MvI),	INVALID,		TNS("sarb",MvI),
+};
+
+/*
+ *	Decode table for 0xD0 opcodes.
+ */
+
+const instable_t dis_opD0[8] = {
+
+/*  [0]  */	TNS("rolb",Mv),		TNS("rorb",Mv),		TNS("rclb",Mv),		TNS("rcrb",Mv),
+/*  [4]  */	TNS("shlb",Mv),		TNS("shrb",Mv),		TNS("salb",Mv),		TNS("sarb",Mv),
+};
+
+/*
+ *	Decode table for 0xC1 opcodes.
+ *	186 instruction set
+ */
+
+const instable_t dis_opC1[8] = {
+
+/*  [0]  */	TS("rol",MvI),		TS("ror",MvI),		TS("rcl",MvI),		TS("rcr",MvI),
+/*  [4]  */	TS("shl",MvI),		TS("shr",MvI),		TS("sal",MvI),		TS("sar",MvI),
+};
+
+/*
+ *	Decode table for 0xD1 opcodes.
+ */
+
+const instable_t dis_opD1[8] = {
+
+/*  [0]  */	TS("rol",Mv),		TS("ror",Mv),		TS("rcl",Mv),		TS("rcr",Mv),
+/*  [4]  */	TS("shl",Mv),		TS("shr",Mv),		TS("sal",Mv),		TS("sar",Mv),
+};
+
+
+/*
+ *	Decode table for 0xD2 opcodes.
+ */
+
+const instable_t dis_opD2[8] = {
+
+/*  [0]  */	TNS("rolb",Mv),		TNS("rorb",Mv),		TNS("rclb",Mv),		TNS("rcrb",Mv),
+/*  [4]  */	TNS("shlb",Mv),		TNS("shrb",Mv),		TNS("salb",Mv),		TNS("sarb",Mv),
+};
+/*
+ *	Decode table for 0xD3 opcodes.
+ */
+
+const instable_t dis_opD3[8] = {
+
+/*  [0]  */	TS("rol",Mv),		TS("ror",Mv),		TS("rcl",Mv),		TS("rcr",Mv),
+/*  [4]  */	TS("shl",Mv),		TS("shr",Mv),		TS("sal",Mv),		TS("sar",Mv),
+};
+
+
+/*
+ *	Decode table for 0xF6 opcodes.
+ */
+
+const instable_t dis_opF6[8] = {
+
+/*  [0]  */	TNS("testb",IMw),	TNS("testb",IMw),	TNS("notb",Mw),		TNS("negb",Mw),
+/*  [4]  */	TNS("mulb",MA),		TNS("imulb",MA),	TNS("divb",MA),		TNS("idivb",MA),
+};
+
+
+/*
+ *	Decode table for 0xF7 opcodes.
+ */
+
+const instable_t dis_opF7[8] = {
+
+/*  [0]  */	TS("test",IMw),		TS("test",IMw),		TS("not",Mw),		TS("neg",Mw),
+/*  [4]  */	TS("mul",MA),		TS("imul",MA),		TS("div",MA),		TS("idiv",MA),
+};
+
+
+/*
+ *	Decode table for 0xFE opcodes.
+ */
+
+const instable_t dis_opFE[8] = {
+
+/*  [0]  */	TNS("incb",Mw),		TNS("decb",Mw),		INVALID,		INVALID,
+/*  [4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+/*
+ *	Decode table for 0xFF opcodes.
+ */
+
+const instable_t dis_opFF[8] = {
+
+/*  [0]  */	TS("inc",Mw),		TS("dec",Mw),		TNSyp("call",INM),	TNS("lcall",INM),
+/*  [4]  */	TNSy("jmp",INM),	TNS("ljmp",INM),	TSp("push",M),		INVALID,
+};
+
+/* for 287 instructions, which are a mess to decode */
+
+const instable_t dis_opFP1n2[8][8] = {
+{
+/* bit pattern:	1101 1xxx MODxx xR/M */
+/*  [0,0] */	TNS("fadds",M),		TNS("fmuls",M),		TNS("fcoms",M),		TNS("fcomps",M),
+/*  [0,4] */	TNS("fsubs",M),		TNS("fsubrs",M),	TNS("fdivs",M),		TNS("fdivrs",M),
+}, {
+/*  [1,0]  */	TNS("flds",M),		INVALID,		TNS("fsts",M),		TNS("fstps",M),
+/*  [1,4]  */	TNSZ("fldenv",M,28),	TNSZ("fldcw",M,2),	TNSZ("fnstenv",M,28),	TNSZ("fnstcw",M,2),
+}, {
+/*  [2,0]  */	TNS("fiaddl",M),	TNS("fimull",M),	TNS("ficoml",M),	TNS("ficompl",M),
+/*  [2,4]  */	TNS("fisubl",M),	TNS("fisubrl",M),	TNS("fidivl",M),	TNS("fidivrl",M),
+}, {
+/*  [3,0]  */	TNS("fildl",M),		TNSZ("fisttpl",M,4),	TNS("fistl",M),		TNS("fistpl",M),
+/*  [3,4]  */	INVALID,		TNSZ("fldt",M,10),	INVALID,		TNSZ("fstpt",M,10),
+}, {
+/*  [4,0]  */	TNSZ("faddl",M,8),	TNSZ("fmull",M,8),	TNSZ("fcoml",M,8),	TNSZ("fcompl",M,8),
+/*  [4,4]  */	TNSZ("fsubl",M,8),	TNSZ("fsubrl",M,8),	TNSZ("fdivl",M,8),	TNSZ("fdivrl",M,8),
+}, {
+/*  [5,0]  */	TNSZ("fldl",M,8),	TNSZ("fisttpll",M,8),	TNSZ("fstl",M,8),	TNSZ("fstpl",M,8),
+/*  [5,4]  */	TNSZ("frstor",M,108),	INVALID,		TNSZ("fnsave",M,108),	TNSZ("fnstsw",M,2),
+}, {
+/*  [6,0]  */	TNSZ("fiadd",M,2),	TNSZ("fimul",M,2),	TNSZ("ficom",M,2),	TNSZ("ficomp",M,2),
+/*  [6,4]  */	TNSZ("fisub",M,2),	TNSZ("fisubr",M,2),	TNSZ("fidiv",M,2),	TNSZ("fidivr",M,2),
+}, {
+/*  [7,0]  */	TNSZ("fild",M,2),	TNSZ("fisttp",M,2),	TNSZ("fist",M,2),	TNSZ("fistp",M,2),
+/*  [7,4]  */	TNSZ("fbld",M,10),	TNSZ("fildll",M,8),	TNSZ("fbstp",M,10),	TNSZ("fistpll",M,8),
+} };
+
+const instable_t dis_opFP3[8][8] = {
+{
+/* bit pattern:	1101 1xxx 11xx xREG */
+/*  [0,0]  */	TNS("fadd",FF),		TNS("fmul",FF),		TNS("fcom",F),		TNS("fcomp",F),
+/*  [0,4]  */	TNS("fsub",FF),		TNS("fsubr",FF),	TNS("fdiv",FF),		TNS("fdivr",FF),
+}, {
+/*  [1,0]  */	TNS("fld",F),		TNS("fxch",F),		TNS("fnop",NORM),	TNS("fstp",F),
+/*  [1,4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [2,0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [2,4]  */	INVALID,		TNS("fucompp",NORM),	INVALID,		INVALID,
+}, {
+/*  [3,0]  */	INVALID,		INVALID,		INVALID,		INVALID,
+/*  [3,4]  */	INVALID,		INVALID,		INVALID,		INVALID,
+}, {
+/*  [4,0]  */	TNS("fadd",FF),		TNS("fmul",FF),		TNS("fcom",F),		TNS("fcomp",F),
+/*  [4,4]  */	TNS("fsub",FF),		TNS("fsubr",FF),	TNS("fdiv",FF),		TNS("fdivr",FF),
+}, {
+/*  [5,0]  */	TNS("ffree",F),		TNS("fxch",F),		TNS("fst",F),		TNS("fstp",F),
+/*  [5,4]  */	TNS("fucom",F),		TNS("fucomp",F),	INVALID,		INVALID,
+}, {
+/*  [6,0]  */	TNS("faddp",FF),	TNS("fmulp",FF),	TNS("fcomp",F),		TNS("fcompp",NORM),
+/*  [6,4]  */	TNS("fsubp",FF),	TNS("fsubrp",FF),	TNS("fdivp",FF),	TNS("fdivrp",FF),
+}, {
+/*  [7,0]  */	TNS("ffreep",F),		TNS("fxch",F),		TNS("fstp",F),		TNS("fstp",F),
+/*  [7,4]  */	TNS("fnstsw",M),	TNS("fucomip",FFC),	TNS("fcomip",FFC),	INVALID,
+} };
+
+const instable_t dis_opFP4[4][8] = {
+{
+/* bit pattern:	1101 1001 111x xxxx */
+/*  [0,0]  */	TNS("fchs",NORM),	TNS("fabs",NORM),	INVALID,		INVALID,
+/*  [0,4]  */	TNS("ftst",NORM),	TNS("fxam",NORM),	TNS("ftstp",NORM),	INVALID,
+}, {
+/*  [1,0]  */	TNS("fld1",NORM),	TNS("fldl2t",NORM),	TNS("fldl2e",NORM),	TNS("fldpi",NORM),
+/*  [1,4]  */	TNS("fldlg2",NORM),	TNS("fldln2",NORM),	TNS("fldz",NORM),	INVALID,
+}, {
+/*  [2,0]  */	TNS("f2xm1",NORM),	TNS("fyl2x",NORM),	TNS("fptan",NORM),	TNS("fpatan",NORM),
+/*  [2,4]  */	TNS("fxtract",NORM),	TNS("fprem1",NORM),	TNS("fdecstp",NORM),	TNS("fincstp",NORM),
+}, {
+/*  [3,0]  */	TNS("fprem",NORM),	TNS("fyl2xp1",NORM),	TNS("fsqrt",NORM),	TNS("fsincos",NORM),
+/*  [3,4]  */	TNS("frndint",NORM),	TNS("fscale",NORM),	TNS("fsin",NORM),	TNS("fcos",NORM),
+} };
+
+const instable_t dis_opFP5[8] = {
+/* bit pattern:	1101 1011 111x xxxx */
+/*  [0]  */	TNS("feni",NORM),	TNS("fdisi",NORM),	TNS("fnclex",NORM),	TNS("fninit",NORM),
+/*  [4]  */	TNS("fsetpm",NORM),	TNS("frstpm",NORM),	INVALID,		INVALID,
+};
+
+const instable_t dis_opFP6[8] = {
+/* bit pattern:	1101 1011 11yy yxxx */
+/*  [00]  */	TNS("fcmov.nb",FF),	TNS("fcmov.ne",FF),	TNS("fcmov.nbe",FF),	TNS("fcmov.nu",FF),
+/*  [04]  */	INVALID,		TNS("fucomi",F),	TNS("fcomi",F),		INVALID,
+};
+
+const instable_t dis_opFP7[8] = {
+/* bit pattern:	1101 1010 11yy yxxx */
+/*  [00]  */	TNS("fcmov.b",FF),	TNS("fcmov.e",FF),	TNS("fcmov.be",FF),	TNS("fcmov.u",FF),
+/*  [04]  */	INVALID,		INVALID,		INVALID,		INVALID,
+};
+
+/*
+ *	Main decode table for the op codes.  The first two nibbles
+ *	will be used as an index into the table.  If there is
+ *	a need to further decode an instruction, the array to be
+ *	referenced is indicated with the other two entries being
+ *	empty.
+ */
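+
+/*
+ *	For example, opcode 0x90 indexes dis_distable[0x9][0x0], which is
+ *	"nop", while opcode 0x0F indexes [0x0][0xF], an IND() entry that
+ *	redirects decoding to the two-byte table dis_op0F.
+ */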
+
+const instable_t dis_distable[16][16] = {
+{
+/* [0,0] */	TNS("addb",RMw),	TS("add",RMw),		TNS("addb",MRw),	TS("add",MRw),
+/* [0,4] */	TNS("addb",IA),		TS("add",IA),		TSx("push",SEG),	TSx("pop",SEG),
+/* [0,8] */	TNS("orb",RMw),		TS("or",RMw),		TNS("orb",MRw),		TS("or",MRw),
+/* [0,C] */	TNS("orb",IA),		TS("or",IA),		TSx("push",SEG),	IND(dis_op0F),
+}, {
+/* [1,0] */	TNS("adcb",RMw),	TS("adc",RMw),		TNS("adcb",MRw),	TS("adc",MRw),
+/* [1,4] */	TNS("adcb",IA),		TS("adc",IA),		TSx("push",SEG),	TSx("pop",SEG),
+/* [1,8] */	TNS("sbbb",RMw),	TS("sbb",RMw),		TNS("sbbb",MRw),	TS("sbb",MRw),
+/* [1,C] */	TNS("sbbb",IA),		TS("sbb",IA),		TSx("push",SEG),	TSx("pop",SEG),
+}, {
+/* [2,0] */	TNS("andb",RMw),	TS("and",RMw),		TNS("andb",MRw),	TS("and",MRw),
+/* [2,4] */	TNS("andb",IA),		TS("and",IA),		TNSx("%es:",OVERRIDE),	TNSx("daa",NORM),
+/* [2,8] */	TNS("subb",RMw),	TS("sub",RMw),		TNS("subb",MRw),	TS("sub",MRw),
+/* [2,C] */	TNS("subb",IA),		TS("sub",IA),		TNS("%cs:",OVERRIDE),	TNSx("das",NORM),
+}, {
+/* [3,0] */	TNS("xorb",RMw),	TS("xor",RMw),		TNS("xorb",MRw),	TS("xor",MRw),
+/* [3,4] */	TNS("xorb",IA),		TS("xor",IA),		TNSx("%ss:",OVERRIDE),	TNSx("aaa",NORM),
+/* [3,8] */	TNS("cmpb",RMw),	TS("cmp",RMw),		TNS("cmpb",MRw),	TS("cmp",MRw),
+/* [3,C] */	TNS("cmpb",IA),		TS("cmp",IA),		TNSx("%ds:",OVERRIDE),	TNSx("aas",NORM),
+}, {
+/* [4,0] */	TSx("inc",R),		TSx("inc",R),		TSx("inc",R),		TSx("inc",R),
+/* [4,4] */	TSx("inc",R),		TSx("inc",R),		TSx("inc",R),		TSx("inc",R),
+/* [4,8] */	TSx("dec",R),		TSx("dec",R),		TSx("dec",R),		TSx("dec",R),
+/* [4,C] */	TSx("dec",R),		TSx("dec",R),		TSx("dec",R),		TSx("dec",R),
+}, {
+/* [5,0] */	TSp("push",R),		TSp("push",R),		TSp("push",R),		TSp("push",R),
+/* [5,4] */	TSp("push",R),		TSp("push",R),		TSp("push",R),		TSp("push",R),
+/* [5,8] */	TSp("pop",R),		TSp("pop",R),		TSp("pop",R),		TSp("pop",R),
+/* [5,C] */	TSp("pop",R),		TSp("pop",R),		TSp("pop",R),		TSp("pop",R),
+}, {
+/* [6,0] */	TSZx("pusha",IMPLMEM,28),TSZx("popa",IMPLMEM,28), TSx("bound",MR),	TNS("arpl",RMw),
+/* [6,4] */	TNS("%fs:",OVERRIDE),	TNS("%gs:",OVERRIDE),	TNS("data16",DM),	TNS("addr16",AM),
+/* [6,8] */	TSp("push",I),		TS("imul",IMUL),	TSp("push",Ib),	TS("imul",IMUL),
+/* [6,C] */	TNSZ("insb",IMPLMEM,1),	TSZ("ins",IMPLMEM,4),	TNSZ("outsb",IMPLMEM,1),TSZ("outs",IMPLMEM,4),
+}, {
+/* [7,0] */	TNSy("jo",BD),		TNSy("jno",BD),		TNSy("jb",BD),		TNSy("jae",BD),
+/* [7,4] */	TNSy("je",BD),		TNSy("jne",BD),		TNSy("jbe",BD),		TNSy("ja",BD),
+/* [7,8] */	TNSy("js",BD),		TNSy("jns",BD),		TNSy("jp",BD),		TNSy("jnp",BD),
+/* [7,C] */	TNSy("jl",BD),		TNSy("jge",BD),		TNSy("jle",BD),		TNSy("jg",BD),
+}, {
+/* [8,0] */	IND(dis_op80),		IND(dis_op81),		INDx(dis_op82),		IND(dis_op83),
+/* [8,4] */	TNS("testb",RMw),	TS("test",RMw),		TNS("xchgb",RMw),	TS("xchg",RMw),
+/* [8,8] */	TNS("movb",RMw),	TS("mov",RMw),		TNS("movb",MRw),	TS("mov",MRw),
+/* [8,C] */	TNS("movw",SM),		TS("lea",MR),		TNS("movw",MS),		TSp("pop",M),
+}, {
+/* [9,0] */	TNS("nop",NORM),	TS("xchg",RA),		TS("xchg",RA),		TS("xchg",RA),
+/* [9,4] */	TS("xchg",RA),		TS("xchg",RA),		TS("xchg",RA),		TS("xchg",RA),
+/* [9,8] */	TNS("cXtX",CBW),	TNS("cXtX",CWD),	TNSx("lcall",SO),	TNS("fwait",NORM),
+/* [9,C] */	TSZy("pushf",IMPLMEM,4),TSZy("popf",IMPLMEM,4),	TNS("sahf",NORM),	TNS("lahf",NORM),
+}, {
+/* [A,0] */	TNS("movb",OA),		TS("mov",OA),		TNS("movb",AO),		TS("mov",AO),
+/* [A,4] */	TNSZ("movsb",SD,1),	TS("movs",SD),		TNSZ("cmpsb",SD,1),	TS("cmps",SD),
+/* [A,8] */	TNS("testb",IA),	TS("test",IA),		TNS("stosb",AD),	TS("stos",AD),
+/* [A,C] */	TNS("lodsb",SA),	TS("lods",SA),		TNS("scasb",AD),	TS("scas",AD),
+}, {
+/* [B,0] */	TNS("movb",IR),		TNS("movb",IR),		TNS("movb",IR),		TNS("movb",IR),
+/* [B,4] */	TNS("movb",IR),		TNS("movb",IR),		TNS("movb",IR),		TNS("movb",IR),
+/* [B,8] */	TS("mov",IR),		TS("mov",IR),		TS("mov",IR),		TS("mov",IR),
+/* [B,C] */	TS("mov",IR),		TS("mov",IR),		TS("mov",IR),		TS("mov",IR),
+}, {
+/* [C,0] */	IND(dis_opC0),		IND(dis_opC1), 		TNSyp("ret",RET),	TNSyp("ret",NORM),
+/* [C,4] */	TNSx("les",MR),		TNSx("lds",MR),		TNS("movb",IMw),	TS("mov",IMw),
+/* [C,8] */	TNSyp("enter",ENTER),	TNSyp("leave",NORM),	TNS("lret",RET),	TNS("lret",NORM),
+/* [C,C] */	TNS("int",INT3),	TNS("int",INTx),	TNSx("into",NORM),	TNS("iret",NORM),
+}, {
+/* [D,0] */	IND(dis_opD0),		IND(dis_opD1),		IND(dis_opD2),		IND(dis_opD3),
+/* [D,4] */	TNSx("aam",U),		TNSx("aad",U),		TNSx("salc",NORM),	TNSZ("xlat",IMPLMEM,1),
+
+/* 287 instructions.  Note that although the indirect field		*/
+/* indicates opFP1n2 for further decoding, this is not necessarily	*/
+/* the case since the opFP arrays are not partitioned according to key1	*/
+/* and key2.  opFP1n2 is given only to indicate that we haven't		*/
+/* finished decoding the instruction.					*/
+/* [D,8] */	IND(dis_opFP1n2),	IND(dis_opFP1n2),	IND(dis_opFP1n2),	IND(dis_opFP1n2),
+/* [D,C] */	IND(dis_opFP1n2),	IND(dis_opFP1n2),	IND(dis_opFP1n2),	IND(dis_opFP1n2),
+}, {
+/* [E,0] */	TNSy("loopnz",BD),	TNSy("loopz",BD),	TNSy("loop",BD),	TNSy("jcxz",BD),
+/* [E,4] */	TNS("inb",P),		TS("in",P),		TNS("outb",P),		TS("out",P),
+/* [E,8] */	TNSyp("call",D),	TNSy("jmp",D),		TNSx("ljmp",SO),		TNSy("jmp",BD),
+/* [E,C] */	TNS("inb",V),		TS("in",V),		TNS("outb",V),		TS("out",V),
+}, {
+/* [F,0] */	TNS("lock",LOCK),	TNS("icebp", NORM),	TNS("repnz",PREFIX),	TNS("repz",PREFIX),
+/* [F,4] */	TNS("hlt",NORM),	TNS("cmc",NORM),	IND(dis_opF6),		IND(dis_opF7),
+/* [F,8] */	TNS("clc",NORM),	TNS("stc",NORM),	TNS("cli",NORM),	TNS("sti",NORM),
+/* [F,C] */	TNS("cld",NORM),	TNS("std",NORM),	IND(dis_opFE),		IND(dis_opFF),
+} };
+
+/* END CSTYLED */
+
+/*
+ * common functions to decode and disassemble an x86 or amd64 instruction
+ */
+
+/*
+ * These are the individual fields of a REX prefix. Note that a REX
+ * prefix with none of these set is still needed to:
+ *	- use the MOVSXD (sign extend 32 to 64 bits) instruction
+ *	- access the %sil, %dil, %bpl, %spl registers
+ */
+#define	REX_W 0x08	/* 64 bit operand size when set */
+#define	REX_R 0x04	/* high order bit extension of ModRM reg field */
+#define	REX_X 0x02	/* high order bit extension of SIB index field */
+#define	REX_B 0x01	/* extends ModRM r_m, SIB base, or opcode reg */
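+
+/*
+ * Example (illustrative): the REX byte 0x4D sets REX_W|REX_R|REX_B, so the
+ * instruction uses 64-bit operands and both the ModRM reg and r_m fields
+ * gain a fourth, high-order bit, reaching registers %r8-%r15.
+ */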
+
+/*
+ * These are the individual fields of a VEX prefix.
+ */
+#define	VEX_R 0x08	/* REX.R in 1's complement form */
+#define	VEX_X 0x04	/* REX.X in 1's complement form */
+#define	VEX_B 0x02	/* REX.B in 1's complement form */
+/* Vector Length, 0: scalar or 128-bit vector, 1: 256-bit vector */
+#define	VEX_L 0x04
+#define	VEX_W 0x08	/* opcode specific, use like REX.W */
+#define	VEX_m 0x1F	/* VEX m-mmmm field */
+#define	VEX_v 0x78	/* VEX register specifier */
+#define	VEX_p 0x03	/* VEX pp field, opcode extension */
+
+/* VEX m-mmmm field, only used by the three-byte prefix */
+#define	VEX_m_0F 0x01   /* implied 0F leading opcode byte */
+#define	VEX_m_0F38 0x02 /* implied 0F 38 leading opcode byte */
+#define	VEX_m_0F3A 0x03 /* implied 0F 3A leading opcode byte */
+
+/* VEX pp field, providing equivalent functionality of a SIMD prefix */
+#define	VEX_p_66 0x01
+#define	VEX_p_F3 0x02
+#define	VEX_p_F2 0x03
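+
+/*
+ * Worked example (illustrative): the two-byte prefix C5 F8 followed by
+ * opcode 77 decodes as vzeroupper: vex.r = 1 (REX.R clear), vvvv = 1111
+ * (no extra operand), VEX_L = 0 and pp = 00 (no implied SIMD prefix).
+ */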
+
+/*
+ * Even in 64 bit mode, usually only 4 byte immediate operands are supported.
+ */
+static int isize[] = {1, 2, 4, 4};
+static int isize64[] = {1, 2, 4, 8};
+
+/*
+ * Just a bunch of useful macros.
+ */
+#define	WBIT(x)	(x & 0x1)		/* to get w bit	*/
+#define	REGNO(x) (x & 0x7)		/* to get 3 bit register */
+#define	VBIT(x)	((x)>>1 & 0x1)		/* to get 'v' bit */
+#define	OPSIZE(osize, wbit) ((wbit) ? isize[osize] : 1)
+#define	OPSIZE64(osize, wbit) ((wbit) ? isize64[osize] : 1)
+
+#define	REG_ONLY 3	/* mode to indicate a register operand (not memory) */
+
+#define	BYTE_OPND	0	/* w-bit value indicating byte register */
+#define	LONG_OPND	1	/* w-bit value indicating opnd_size register */
+#define	MM_OPND		2	/* "value" used to indicate a mmx reg */
+#define	XMM_OPND	3	/* "value" used to indicate a xmm reg */
+#define	SEG_OPND	4	/* "value" used to indicate a segment reg */
+#define	CONTROL_OPND	5	/* "value" used to indicate a control reg */
+#define	DEBUG_OPND	6	/* "value" used to indicate a debug reg */
+#define	TEST_OPND	7	/* "value" used to indicate a test reg */
+#define	WORD_OPND	8	/* w-bit value indicating word size reg */
+#define	YMM_OPND	9	/* "value" used to indicate a ymm reg */
+
+/*
+ * The AVX2 gather instructions are a bit of a mess. While there's a pattern,
+ * there's not really a consistent scheme that we can use to know what the mode
+ * is supposed to be for a given type. Various instructions, like VPGATHERDD,
+ * always match the value of VEX_L. Other instructions like VPGATHERDQ, have
+ * some registers match VEX_L, but the VSIB is always XMM.
+ *
+ * The simplest way to deal with this is to just define a table based on the
+ * instruction opcodes, which are 0x90-0x93, so we subtract 0x90 to index into
+ * them.
+ *
+ * We further have to subdivide this based on the value of VEX_W and the value
+ * of VEX_L. The array is constructed to be indexed as:
+ * 	[opcode - 0x90][VEX_W][VEX_L].
+ */
+typedef struct dis_gather_regs {
+	uint_t dgr_arg0;	/* src reg */
+	uint_t dgr_arg1;	/* vsib reg */
+	uint_t dgr_arg2;	/* dst reg */
+	char   *dgr_suffix;	/* suffix to append */
+} dis_gather_regs_t;
+
+static dis_gather_regs_t dis_vgather[4][2][2] = {
+	{
+		/* op 0x90, W.0 */
+		{
+			{ XMM_OPND, XMM_OPND, XMM_OPND, "d" },
+			{ YMM_OPND, YMM_OPND, YMM_OPND, "d" }
+		},
+		/* op 0x90, W.1 */
+		{
+			{ XMM_OPND, XMM_OPND, XMM_OPND, "q" },
+			{ YMM_OPND, XMM_OPND, YMM_OPND, "q" }
+		}
+	},
+	{
+		/* op 0x91, W.0 */
+		{
+			{ XMM_OPND, XMM_OPND, XMM_OPND, "d" },
+			{ XMM_OPND, YMM_OPND, XMM_OPND, "d" },
+		},
+		/* op 0x91, W.1 */
+		{
+			{ XMM_OPND, XMM_OPND, XMM_OPND, "q" },
+			{ YMM_OPND, YMM_OPND, YMM_OPND, "q" },
+		}
+	},
+	{
+		/* op 0x92, W.0 */
+		{
+			{ XMM_OPND, XMM_OPND, XMM_OPND, "s" },
+			{ YMM_OPND, YMM_OPND, YMM_OPND, "s" }
+		},
+		/* op 0x92, W.1 */
+		{
+			{ XMM_OPND, XMM_OPND, XMM_OPND, "d" },
+			{ YMM_OPND, XMM_OPND, YMM_OPND, "d" }
+		}
+	},
+	{
+		/* op 0x93, W.0 */
+		{
+			{ XMM_OPND, XMM_OPND, XMM_OPND, "s" },
+			{ XMM_OPND, YMM_OPND, XMM_OPND, "s" }
+		},
+		/* op 0x93, W.1 */
+		{
+			{ XMM_OPND, XMM_OPND, XMM_OPND, "d" },
+			{ YMM_OPND, YMM_OPND, YMM_OPND, "d" }
+		}
+	}
+};
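+
+/*
+ * For example, VPGATHERDQ (opcode 0x90 with VEX_W set) under VEX_L = 1 is
+ * dis_vgather[0][1][1]: its source and destination are YMM registers while
+ * the VSIB index register stays XMM, as described in the comment above.
+ */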
+
+/*
+ * Get the next byte and separate the op code into the high and low nibbles.
+ */
+static int
+dtrace_get_opcode(dis86_t *x, uint_t *high, uint_t *low)
+{
+	int byte;
+
+	/*
+	 * x86 instructions have a maximum length of 15 bytes.  Bail out if
+	 * we try to read more.
+	 */
+	if (x->d86_len >= 15)
+		return (x->d86_error = 1);
+
+	if (x->d86_error)
+		return (1);
+	byte = x->d86_get_byte(x->d86_data);
+	if (byte < 0)
+		return (x->d86_error = 1);
+	x->d86_bytes[x->d86_len++] = byte;
+	*low = byte & 0xf;		/* ----xxxx low 4 bits */
+	*high = byte >> 4 & 0xf;	/* xxxx---- bits 7 to 4 */
+	return (0);
+}
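+
+/*
+ * For example, the opcode byte 0x0F splits into *high = 0x0 and *low = 0xF,
+ * which later indexes dis_distable[0x0][0xF], the IND(dis_op0F) entry that
+ * escapes to the two-byte opcode table.
+ */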
+
+/*
+ * Get and decode an SIB (scaled index base) byte
+ */
+static void
+dtrace_get_SIB(dis86_t *x, uint_t *ss, uint_t *index, uint_t *base)
+{
+	int byte;
+
+	if (x->d86_error)
+		return;
+
+	byte = x->d86_get_byte(x->d86_data);
+	if (byte < 0) {
+		x->d86_error = 1;
+		return;
+	}
+	x->d86_bytes[x->d86_len++] = byte;
+
+	*base = byte & 0x7;
+	*index = (byte >> 3) & 0x7;
+	*ss = (byte >> 6) & 0x3;
+}
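+
+/*
+ * Illustrative decoding: an SIB byte of 0xD8 (binary 11 011 000) yields
+ * ss = 3 (scale by 8), index = 3 and base = 0; with no REX bits set that
+ * is (%rax,%rbx,8) in 64-bit mode.
+ */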
+
+/*
+ * Get the byte following the op code and separate it into the
+ * mode, register, and r/m fields.
+ */
+static void
+dtrace_get_modrm(dis86_t *x, uint_t *mode, uint_t *reg, uint_t *r_m)
+{
+	if (x->d86_got_modrm == 0) {
+		if (x->d86_rmindex == -1)
+			x->d86_rmindex = x->d86_len;
+		dtrace_get_SIB(x, mode, reg, r_m);
+		x->d86_got_modrm = 1;
+	}
+}
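+
+/*
+ * The ModRM byte shares the SIB byte's 2/3/3 bit layout, which is why
+ * dtrace_get_SIB() is reused to split it: e.g. ModRM 0xC3 decodes as
+ * mode = 3 (REG_ONLY), reg = 0, r_m = 3.
+ */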
+
+/*
+ * Adjust register selection based on any REX prefix bits present.
+ */
+/*ARGSUSED*/
+static void
+dtrace_rex_adjust(uint_t rex_prefix, uint_t mode, uint_t *reg, uint_t *r_m)
+{
+	if (reg != NULL && r_m == NULL) {
+		if (rex_prefix & REX_B)
+			*reg += 8;
+	} else {
+		if (reg != NULL && (REX_R & rex_prefix) != 0)
+			*reg += 8;
+		if (r_m != NULL && (REX_B & rex_prefix) != 0)
+			*r_m += 8;
+	}
+}
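+
+/*
+ * For example, with REX_B set a ModRM r_m value of 3 (%rbx) is adjusted
+ * to 11, selecting %r11 from the extended amd64 register file.
+ */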
+
+/*
+ * Adjust register selection based on any VEX prefix bits present.
+ * Notes: VEX.R, VEX.X and VEX.B use the inverted form compared with REX prefix
+ */
+/*ARGSUSED*/
+static void
+dtrace_vex_adjust(uint_t vex_byte1, uint_t mode, uint_t *reg, uint_t *r_m)
+{
+	if (reg != NULL && r_m == NULL) {
+		if (!(vex_byte1 & VEX_B))
+			*reg += 8;
+	} else {
+		if (reg != NULL && ((VEX_R & vex_byte1) == 0))
+			*reg += 8;
+		if (r_m != NULL && ((VEX_B & vex_byte1) == 0))
+			*r_m += 8;
+	}
+}
+
+/*
+ * Get an immediate operand of the given size, with sign extension.
+ */
+static void
+dtrace_imm_opnd(dis86_t *x, int wbit, int size, int opindex)
+{
+	int i;
+	int byte;
+	int valsize;
+
+	if (x->d86_numopnds < opindex + 1)
+		x->d86_numopnds = opindex + 1;
+
+	switch (wbit) {
+	case BYTE_OPND:
+		valsize = 1;
+		break;
+	case LONG_OPND:
+		if (x->d86_opnd_size == SIZE16)
+			valsize = 2;
+		else if (x->d86_opnd_size == SIZE32)
+			valsize = 4;
+		else
+			valsize = 8;
+		break;
+	case MM_OPND:
+	case XMM_OPND:
+	case YMM_OPND:
+	case SEG_OPND:
+	case CONTROL_OPND:
+	case DEBUG_OPND:
+	case TEST_OPND:
+		valsize = size;
+		break;
+	case WORD_OPND:
+		valsize = 2;
+		break;
+	}
+	if (valsize < size)
+		valsize = size;
+
+	if (x->d86_error)
+		return;
+	x->d86_opnd[opindex].d86_value = 0;
+	for (i = 0; i < size; ++i) {
+		byte = x->d86_get_byte(x->d86_data);
+		if (byte < 0) {
+			x->d86_error = 1;
+			return;
+		}
+		x->d86_bytes[x->d86_len++] = byte;
+		x->d86_opnd[opindex].d86_value |= (uint64_t)byte << (i * 8);
+	}
+	/* Do sign extension */
+	if (x->d86_bytes[x->d86_len - 1] & 0x80) {
+		for (; i < sizeof (uint64_t); i++)
+			x->d86_opnd[opindex].d86_value |=
+			    (uint64_t)0xff << (i * 8);
+	}
+#ifdef DIS_TEXT
+	x->d86_opnd[opindex].d86_mode = MODE_SIGNED;
+	x->d86_opnd[opindex].d86_value_size = valsize;
+	x->d86_imm_bytes += size;
+#endif
+}
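+
+/*
+ * Worked example: a one-byte immediate of 0xFE has its top bit set, so the
+ * sign-extension loop above fills the remaining bytes with 0xff and
+ * d86_value becomes 0xfffffffffffffffe, i.e. -2 as a signed operand.
+ */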
+
+/*
+ * Get an ip relative operand of the given size, with sign extension.
+ */
+static void
+dtrace_disp_opnd(dis86_t *x, int wbit, int size, int opindex)
+{
+	dtrace_imm_opnd(x, wbit, size, opindex);
+#ifdef DIS_TEXT
+	x->d86_opnd[opindex].d86_mode = MODE_IPREL;
+#endif
+}
+
+/*
+ * Check to see if there is a segment override prefix pending.
+ * If so, print it in the current 'operand' location and set
+ * the override flag back to false.
+ */
+/*ARGSUSED*/
+static void
+dtrace_check_override(dis86_t *x, int opindex)
+{
+#ifdef DIS_TEXT
+	if (x->d86_seg_prefix) {
+		(void) strlcat(x->d86_opnd[opindex].d86_prefix,
+		    x->d86_seg_prefix, PFIXLEN);
+	}
+#endif
+	x->d86_seg_prefix = NULL;
+}
+
+
+/*
+ * Process a single instruction Register or Memory operand.
+ *
+ * mode = addressing mode from ModRM byte
+ * r_m = r_m (or reg if mode == 3) field from ModRM byte
+ * wbit = indicates which register set (8 bit, 16 bit, ... MMX, etc.) to use.
+ * opindex = index of the operand that we are processing (0, 1 or 2)
+ *
+ * the value of reg or r_m must have already been adjusted for any REX prefix.
+ */
+/*ARGSUSED*/
+static void
+dtrace_get_operand(dis86_t *x, uint_t mode, uint_t r_m, int wbit, int opindex)
+{
+	int have_SIB = 0;	/* flag presence of scale-index-byte */
+	uint_t ss;		/* scale-factor from opcode */
+	uint_t index;		/* index register number */
+	uint_t base;		/* base register number */
+	int dispsize;   	/* size of displacement in bytes */
+#ifdef DIS_TEXT
+	char *opnd = x->d86_opnd[opindex].d86_opnd;
+#endif
+
+	if (x->d86_numopnds < opindex + 1)
+		x->d86_numopnds = opindex + 1;
+
+	if (x->d86_error)
+		return;
+
+	/*
+	 * first handle a simple register
+	 */
+	if (mode == REG_ONLY) {
+#ifdef DIS_TEXT
+		switch (wbit) {
+		case MM_OPND:
+			(void) strlcat(opnd, dis_MMREG[r_m], OPLEN);
+			break;
+		case XMM_OPND:
+			(void) strlcat(opnd, dis_XMMREG[r_m], OPLEN);
+			break;
+		case YMM_OPND:
+			(void) strlcat(opnd, dis_YMMREG[r_m], OPLEN);
+			break;
+		case SEG_OPND:
+			(void) strlcat(opnd, dis_SEGREG[r_m], OPLEN);
+			break;
+		case CONTROL_OPND:
+			(void) strlcat(opnd, dis_CONTROLREG[r_m], OPLEN);
+			break;
+		case DEBUG_OPND:
+			(void) strlcat(opnd, dis_DEBUGREG[r_m], OPLEN);
+			break;
+		case TEST_OPND:
+			(void) strlcat(opnd, dis_TESTREG[r_m], OPLEN);
+			break;
+		case BYTE_OPND:
+			if (x->d86_rex_prefix == 0)
+				(void) strlcat(opnd, dis_REG8[r_m], OPLEN);
+			else
+				(void) strlcat(opnd, dis_REG8_REX[r_m], OPLEN);
+			break;
+		case WORD_OPND:
+			(void) strlcat(opnd, dis_REG16[r_m], OPLEN);
+			break;
+		case LONG_OPND:
+			if (x->d86_opnd_size == SIZE16)
+				(void) strlcat(opnd, dis_REG16[r_m], OPLEN);
+			else if (x->d86_opnd_size == SIZE32)
+				(void) strlcat(opnd, dis_REG32[r_m], OPLEN);
+			else
+				(void) strlcat(opnd, dis_REG64[r_m], OPLEN);
+			break;
+		}
+#endif /* DIS_TEXT */
+		return;
+	}
+
+	/*
+	 * if symbolic representation, skip override prefix, if any
+	 */
+	dtrace_check_override(x, opindex);
+
+	/*
+	 * Handle 16 bit memory references first, since they decode
+	 * the mode values more simply.
+	 * mode 1 is r_m + 8 bit displacement
+	 * mode 2 is r_m + 16 bit displacement
+	 * mode 0 is just r_m, unless r_m is 6 which is 16 bit disp
+	 */
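+	/*
+	 * Illustrative 16-bit case: ModRM 0x06 (mode 0, r_m 6) encodes no
+	 * base register at all; the two bytes that follow are an absolute
+	 * 16-bit displacement, fetched by the first branch below.
+	 */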
+	if (x->d86_addr_size == SIZE16) {
+		if ((mode == 0 && r_m == 6) || mode == 2)
+			dtrace_imm_opnd(x, WORD_OPND, 2, opindex);
+		else if (mode == 1)
+			dtrace_imm_opnd(x, BYTE_OPND, 1, opindex);
+#ifdef DIS_TEXT
+		if (mode == 0 && r_m == 6)
+			x->d86_opnd[opindex].d86_mode = MODE_SIGNED;
+		else if (mode == 0)
+			x->d86_opnd[opindex].d86_mode = MODE_NONE;
+		else
+			x->d86_opnd[opindex].d86_mode = MODE_OFFSET;
+		(void) strlcat(opnd, dis_addr16[mode][r_m], OPLEN);
+#endif
+		return;
+	}
+
+	/*
+	 * 32 and 64 bit addressing modes are more complex since they
+	 * can involve an SIB (scaled index and base) byte to decode.
+	 */
+	if (r_m == ESP_REGNO || r_m == ESP_REGNO + 8) {
+		have_SIB = 1;
+		dtrace_get_SIB(x, &ss, &index, &base);
+		if (x->d86_error)
+			return;
+		if (base != 5 || mode != 0)
+			if (x->d86_rex_prefix & REX_B)
+				base += 8;
+		if (x->d86_rex_prefix & REX_X)
+			index += 8;
+	} else {
+		base = r_m;
+	}
+
+	/*
+	 * Compute the displacement size and get its bytes
+	 */
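+	/*
+	 * Note the mode 0 quirk: an r_m (or SIB base) of EBP_REGNO has no
+	 * bare-register form; that encoding is reused for a 32-bit
+	 * displacement (RIP-relative in 64-bit mode), hence the extra
+	 * dispsize = 4 cases below.
+	 */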
+	dispsize = 0;
+
+	if (mode == 1)
+		dispsize = 1;
+	else if (mode == 2)
+		dispsize = 4;
+	else if ((r_m & 7) == EBP_REGNO ||
+	    (have_SIB && (base & 7) == EBP_REGNO))
+		dispsize = 4;
+
+	if (dispsize > 0) {
+		dtrace_imm_opnd(x, dispsize == 4 ? LONG_OPND : BYTE_OPND,
+		    dispsize, opindex);
+		if (x->d86_error)
+			return;
+	}
+
+#ifdef DIS_TEXT
+	if (dispsize > 0)
+		x->d86_opnd[opindex].d86_mode = MODE_OFFSET;
+
+	if (have_SIB == 0) {
+		if (x->d86_mode == SIZE32) {
+			if (mode == 0)
+				(void) strlcat(opnd, dis_addr32_mode0[r_m],
+				    OPLEN);
+			else
+				(void) strlcat(opnd, dis_addr32_mode12[r_m],
+				    OPLEN);
+		} else {
+			if (mode == 0) {
+				(void) strlcat(opnd, dis_addr64_mode0[r_m],
+				    OPLEN);
+				if (r_m == 5) {
+					x->d86_opnd[opindex].d86_mode =
+					    MODE_RIPREL;
+				}
+			} else {
+				(void) strlcat(opnd, dis_addr64_mode12[r_m],
+				    OPLEN);
+			}
+		}
+	} else {
+		uint_t need_paren = 0;
+		char **regs;
+		char **bregs;
+		const char *const *sf;
+		if (x->d86_mode == SIZE32) /* NOTE this is not addr_size! */
+			regs = (char **)dis_REG32;
+		else
+			regs = (char **)dis_REG64;
+
+		if (x->d86_vsib != 0) {
+			if (wbit == YMM_OPND) /* NOTE this is not addr_size! */
+				bregs = (char **)dis_YMMREG;
+			else
+				bregs = (char **)dis_XMMREG;
+			sf = dis_vscale_factor;
+		} else {
+			bregs = regs;
+			sf = dis_scale_factor;
+		}
+
+		/*
+		 * print the base (if any)
+		 */
+		if (base == EBP_REGNO && mode == 0) {
+			if (index != ESP_REGNO || x->d86_vsib != 0) {
+				(void) strlcat(opnd, "(", OPLEN);
+				need_paren = 1;
+			}
+		} else {
+			(void) strlcat(opnd, "(", OPLEN);
+			(void) strlcat(opnd, regs[base], OPLEN);
+			need_paren = 1;
+		}
+
+		/*
+		 * print the index (if any)
+		 */
+		if (index != ESP_REGNO || x->d86_vsib) {
+			(void) strlcat(opnd, ",", OPLEN);
+			(void) strlcat(opnd, bregs[index], OPLEN);
+			(void) strlcat(opnd, sf[ss], OPLEN);
+		} else
+			if (need_paren)
+				(void) strlcat(opnd, ")", OPLEN);
+	}
+#endif
+}
+
+/*
+ * Operand sequence for standard instruction involving one register
+ * and one register/memory operand.
+ * wbit indicates a byte(0) or opnd_size(1) operation
+ * vbit indicates direction (0 for "opcode r,r_m") or (1 for "opcode r_m, r")
+ */
+#define	STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, vbit)  {	\
+		dtrace_get_modrm(x, &mode, &reg, &r_m);			\
+		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);	\
+		dtrace_get_operand(x, mode, r_m, wbit, vbit);		\
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 1 - vbit);	\
+}
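+
+/*
+ * Illustrative expansion: with vbit = 0 the macro above places the r_m
+ * operand at index 0 and the reg operand at index 1; vbit = 1 swaps them,
+ * giving the two operand orders described in the comment.
+ */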
+
+/*
+ * Similar to above, but allows for the two operands to be of different
+ * classes (ie. wbit).
+ *	wbit is for the r_m operand
+ *	w2 is for the reg operand
+ */
+#define	MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, w2, vbit)	{	\
+		dtrace_get_modrm(x, &mode, &reg, &r_m);			\
+		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);	\
+		dtrace_get_operand(x, mode, r_m, wbit, vbit);		\
+		dtrace_get_operand(x, REG_ONLY, reg, w2, 1 - vbit);	\
+}
+
+/*
+ * Similar, but for 2 operands plus an immediate.
+ * vbit indicates direction
+ * 	0 for "opcode imm, r, r_m" or
+ *	1 for "opcode imm, r_m, r"
+ */
+#define	THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, immsize, vbit) { \
+		dtrace_get_modrm(x, &mode, &reg, &r_m);			\
+		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);	\
+		dtrace_get_operand(x, mode, r_m, wbit, 2-vbit);		\
+		dtrace_get_operand(x, REG_ONLY, reg, w2, 1+vbit);	\
+		dtrace_imm_opnd(x, wbit, immsize, 0);			\
+}
+
+/*
+ * Similar, but for 2 operands plus two immediates.
+ */
+#define	FOUROPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, immsize) { \
+		dtrace_get_modrm(x, &mode, &reg, &r_m);			\
+		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);	\
+		dtrace_get_operand(x, mode, r_m, wbit, 2);		\
+		dtrace_get_operand(x, REG_ONLY, reg, w2, 3);		\
+		dtrace_imm_opnd(x, wbit, immsize, 1);			\
+		dtrace_imm_opnd(x, wbit, immsize, 0);			\
+}
+
+/*
+ * One operand plus two immediates.
+ */
+#define	ONEOPERAND_TWOIMM(x, mode, reg, r_m, rex_prefix, wbit, immsize) { \
+		dtrace_get_modrm(x, &mode, &reg, &r_m);			\
+		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);	\
+		dtrace_get_operand(x, mode, r_m, wbit, 2);		\
+		dtrace_imm_opnd(x, wbit, immsize, 1);			\
+		dtrace_imm_opnd(x, wbit, immsize, 0);			\
+}
+
+/*
+ * Disassemble a single x86 or amd64 instruction.
+ *
+ * Mode determines the default operating mode (SIZE16, SIZE32 or SIZE64)
+ * for interpreting instructions.
+ *
+ * returns non-zero for bad opcode
+ */
+int
+dtrace_disx86(dis86_t *x, uint_t cpu_mode)
+{
+	instable_t *dp;		/* decode table being used */
+#ifdef DIS_TEXT
+	uint_t i;
+#endif
+#ifdef DIS_MEM
+	uint_t nomem = 0;
+#define	NOMEM	(nomem = 1)
+#else
+#define	NOMEM	/* nothing */
+#endif
+	uint_t opnd_size;	/* SIZE16, SIZE32 or SIZE64 */
+	uint_t addr_size;	/* SIZE16, SIZE32 or SIZE64 */
+	uint_t wbit;		/* opcode wbit, 0 is 8 bit, !0 for opnd_size */
+	uint_t w2;		/* wbit value for second operand */
+	uint_t vbit;
+	uint_t mode = 0;	/* mode value from ModRM byte */
+	uint_t reg;		/* reg value from ModRM byte */
+	uint_t r_m;		/* r_m value from ModRM byte */
+
+	uint_t opcode1;		/* high nibble of 1st byte */
+	uint_t opcode2;		/* low nibble of 1st byte */
+	uint_t opcode3;		/* extra opcode bits usually from ModRM byte */
+	uint_t opcode4;		/* high nibble of 2nd byte */
+	uint_t opcode5;		/* low nibble of 2nd byte */
+	uint_t opcode6;		/* high nibble of 3rd byte */
+	uint_t opcode7;		/* low nibble of 3rd byte */
+	uint_t opcode_bytes = 1;
+
+	/*
+	 * legacy prefixes come in 5 flavors, you should have only one of each
+	 */
+	uint_t	opnd_size_prefix = 0;
+	uint_t	addr_size_prefix = 0;
+	uint_t	segment_prefix = 0;
+	uint_t	lock_prefix = 0;
+	uint_t	rep_prefix = 0;
+	uint_t	rex_prefix = 0;	/* amd64 register extension prefix */
+
+	/*
+	 * Intel VEX instruction encoding prefix and fields
+	 */
+
+	/* 0xC4 introduces the 3-byte prefix, 0xC5 the 2-byte prefix */
+	uint_t vex_prefix = 0;
+
+	/*
+	 * VEX prefix byte 1, includes vex.r, vex.x and vex.b
+	 * (for 3 bytes prefix)
+	 */
+	uint_t vex_byte1 = 0;
+
+	/*
+	 * In 32-bit mode, we must prefetch the next byte to
+	 * distinguish between AVX (VEX prefix) and les/lds
+	 */
+	uint_t vex_prefetch = 0;
+
+	uint_t vex_m = 0;
+	uint_t vex_v = 0;
+	uint_t vex_p = 0;
+	uint_t vex_R = 1;
+	uint_t vex_X = 1;
+	uint_t vex_B = 1;
+	uint_t vex_W = 0;
+	uint_t vex_L;
+	dis_gather_regs_t *vreg;
+
+#ifdef	DIS_TEXT
+	/* Instruction name for BLS* family of instructions */
+	char *blsinstr;
+#endif
+
+	size_t	off;
+
+	instable_t dp_mmx;
+
+	x->d86_len = 0;
+	x->d86_rmindex = -1;
+	x->d86_error = 0;
+#ifdef DIS_TEXT
+	x->d86_numopnds = 0;
+	x->d86_seg_prefix = NULL;
+	x->d86_mnem[0] = 0;
+	for (i = 0; i < 4; ++i) {
+		x->d86_opnd[i].d86_opnd[0] = 0;
+		x->d86_opnd[i].d86_prefix[0] = 0;
+		x->d86_opnd[i].d86_value_size = 0;
+		x->d86_opnd[i].d86_value = 0;
+		x->d86_opnd[i].d86_mode = MODE_NONE;
+	}
+#endif
+	x->d86_rex_prefix = 0;
+	x->d86_got_modrm = 0;
+	x->d86_memsize = 0;
+	x->d86_vsib = 0;
+
+	if (cpu_mode == SIZE16) {
+		opnd_size = SIZE16;
+		addr_size = SIZE16;
+	} else if (cpu_mode == SIZE32) {
+		opnd_size = SIZE32;
+		addr_size = SIZE32;
+	} else {
+		opnd_size = SIZE32;
+		addr_size = SIZE64;
+	}
+
+	/*
+	 * Get one opcode byte and check for zero padding that follows
+	 * jump tables.
+	 */
+	if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0)
+		goto error;
+
+	if (opcode1 == 0 && opcode2 == 0 &&
+	    x->d86_check_func != NULL && x->d86_check_func(x->d86_data)) {
+#ifdef DIS_TEXT
+		(void) strncpy(x->d86_mnem, ".byte\t0", OPLEN);
+#endif
+		goto done;
+	}
+
+	/*
+	 * Gather up legacy x86 prefix bytes.
+	 */
+	for (;;) {
+		uint_t *which_prefix = NULL;
+
+		dp = (instable_t *)&dis_distable[opcode1][opcode2];
+
+		switch (dp->it_adrmode) {
+		case PREFIX:
+			which_prefix = &rep_prefix;
+			break;
+		case LOCK:
+			which_prefix = &lock_prefix;
+			break;
+		case OVERRIDE:
+			which_prefix = &segment_prefix;
+#ifdef DIS_TEXT
+			x->d86_seg_prefix = (char *)dp->it_name;
+#endif
+			if (dp->it_invalid64 && cpu_mode == SIZE64)
+				goto error;
+			break;
+		case AM:
+			which_prefix = &addr_size_prefix;
+			break;
+		case DM:
+			which_prefix = &opnd_size_prefix;
+			break;
+		}
+		if (which_prefix == NULL)
+			break;
+		*which_prefix = (opcode1 << 4) | opcode2;
+		if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0)
+			goto error;
+	}
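+
+	/*
+	 * For example, in the sequence 66 0F 58 (addpd), the 0x66 byte is
+	 * consumed above as opnd_size_prefix (dis_distable maps it to the
+	 * "data16"/DM entry) and the loop exits with opcode1/opcode2
+	 * holding 0x0F.
+	 */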
+
+	/*
+	 * Handle amd64 mode PREFIX values.
+	 * Some of the segment prefixes are no-ops (only FS/GS actually work).
+	 * We might have a REX prefix (opcodes 0x40-0x4f)
+	 */
+	if (cpu_mode == SIZE64) {
+		if (segment_prefix != 0x64 && segment_prefix != 0x65)
+			segment_prefix = 0;
+
+		if (opcode1 == 0x4) {
+			rex_prefix = (opcode1 << 4) | opcode2;
+			if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0)
+				goto error;
+			dp = (instable_t *)&dis_distable[opcode1][opcode2];
+		} else if (opcode1 == 0xC &&
+		    (opcode2 == 0x4 || opcode2 == 0x5)) {
+			/* AVX instructions */
+			vex_prefix = (opcode1 << 4) | opcode2;
+			x->d86_rex_prefix = 0x40;
+		}
+	} else if (opcode1 == 0xC && (opcode2 == 0x4 || opcode2 == 0x5)) {
+		/* LDS, LES or AVX */
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		vex_prefetch = 1;
+
+		if (mode == REG_ONLY) {
+			/* AVX */
+			vex_prefix = (opcode1 << 4) | opcode2;
+			x->d86_rex_prefix = 0x40;
+			opcode3 = (((mode << 3) | reg)>>1) & 0x0F;
+			opcode4 = ((reg << 3) | r_m) & 0x0F;
+		}
+	}
+
+	if (vex_prefix == VEX_2bytes) {
+		if (!vex_prefetch) {
+			if (dtrace_get_opcode(x, &opcode3, &opcode4) != 0)
+				goto error;
+		}
+		vex_R = ((opcode3 & VEX_R) & 0x0F) >> 3;
+		vex_L = ((opcode4 & VEX_L) & 0x0F) >> 2;
+		vex_v = (((opcode3 << 4) | opcode4) & VEX_v) >> 3;
+		vex_p = opcode4 & VEX_p;
+		/*
+		 * The vex.x and vex.b bits are not present in the two-byte
+		 * vex prefix; their default values are 1
+		 */
+		vex_byte1 = (opcode3 & VEX_R) | VEX_X | VEX_B;
+
+		if (vex_R == 0)
+			x->d86_rex_prefix |= REX_R;
+
+		if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0)
+			goto error;
+
+		switch (vex_p) {
+			case VEX_p_66:
+				dp = (instable_t *)
+				    &dis_opAVX660F[(opcode1 << 4) | opcode2];
+				break;
+			case VEX_p_F3:
+				dp = (instable_t *)
+				    &dis_opAVXF30F[(opcode1 << 4) | opcode2];
+				break;
+			case VEX_p_F2:
+				dp = (instable_t *)
+				    &dis_opAVXF20F[(opcode1 << 4) | opcode2];
+				break;
+			default:
+				dp = (instable_t *)
+				    &dis_opAVX0F[opcode1][opcode2];
+
+		}
+
+	} else if (vex_prefix == VEX_3bytes) {
+		if (!vex_prefetch) {
+			if (dtrace_get_opcode(x, &opcode3, &opcode4) != 0)
+				goto error;
+		}
+		vex_R = (opcode3 & VEX_R) >> 3;
+		vex_X = (opcode3 & VEX_X) >> 2;
+		vex_B = (opcode3 & VEX_B) >> 1;
+		vex_m = (((opcode3 << 4) | opcode4) & VEX_m);
+		vex_byte1 = opcode3 & (VEX_R | VEX_X | VEX_B);
+
+		if (vex_R == 0)
+			x->d86_rex_prefix |= REX_R;
+		if (vex_X == 0)
+			x->d86_rex_prefix |= REX_X;
+		if (vex_B == 0)
+			x->d86_rex_prefix |= REX_B;
+
+		if (dtrace_get_opcode(x, &opcode5, &opcode6) != 0)
+			goto error;
+		vex_W = (opcode5 & VEX_W) >> 3;
+		vex_L = (opcode6 & VEX_L) >> 2;
+		vex_v = (((opcode5 << 4) | opcode6) & VEX_v) >> 3;
+		vex_p = opcode6 & VEX_p;
+
+		if (vex_W)
+			x->d86_rex_prefix |= REX_W;
+
+		/* Only these three vex_m values valid; others are reserved */
+		if ((vex_m != VEX_m_0F) && (vex_m != VEX_m_0F38) &&
+		    (vex_m != VEX_m_0F3A))
+			goto error;
+
+		if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0)
+			goto error;
+
+		switch (vex_p) {
+			case VEX_p_66:
+				if (vex_m == VEX_m_0F) {
+					dp = (instable_t *)
+					    &dis_opAVX660F
+					    [(opcode1 << 4) | opcode2];
+				} else if (vex_m == VEX_m_0F38) {
+					dp = (instable_t *)
+					    &dis_opAVX660F38
+					    [(opcode1 << 4) | opcode2];
+				} else if (vex_m == VEX_m_0F3A) {
+					dp = (instable_t *)
+					    &dis_opAVX660F3A
+					    [(opcode1 << 4) | opcode2];
+				} else {
+					goto error;
+				}
+				break;
+			case VEX_p_F3:
+				if (vex_m == VEX_m_0F) {
+					dp = (instable_t *)
+					    &dis_opAVXF30F
+					    [(opcode1 << 4) | opcode2];
+				} else if (vex_m == VEX_m_0F38) {
+					dp = (instable_t *)
+					    &dis_opAVXF30F38
+					    [(opcode1 << 4) | opcode2];
+				} else {
+					goto error;
+				}
+				break;
+			case VEX_p_F2:
+				if (vex_m == VEX_m_0F) {
+					dp = (instable_t *)
+					    &dis_opAVXF20F
+					    [(opcode1 << 4) | opcode2];
+				} else if (vex_m == VEX_m_0F3A) {
+					dp = (instable_t *)
+					    &dis_opAVXF20F3A
+					    [(opcode1 << 4) | opcode2];
+				} else if (vex_m == VEX_m_0F38) {
+					dp = (instable_t *)
+					    &dis_opAVXF20F38
+					    [(opcode1 << 4) | opcode2];
+				} else {
+					goto error;
+				}
+				break;
+			default:
+				dp = (instable_t *)
+				    &dis_opAVX0F[opcode1][opcode2];
+
+		}
+	}
+	if (vex_prefix) {
+		if (dp->it_vexwoxmm) {
+			wbit = LONG_OPND;
+		} else {
+			if (vex_L)
+				wbit = YMM_OPND;
+			else
+				wbit = XMM_OPND;
+		}
+	}
+
+	/*
+	 * Deal with selection of operand and address size now.
+	 * Note that the REX.W bit being set causes opnd_size_prefix to be
+	 * ignored.
+	 */
+	if (cpu_mode == SIZE64) {
+		if ((rex_prefix & REX_W) || vex_W)
+			opnd_size = SIZE64;
+		else if (opnd_size_prefix)
+			opnd_size = SIZE16;
+
+		if (addr_size_prefix)
+			addr_size = SIZE32;
+	} else if (cpu_mode == SIZE32) {
+		if (opnd_size_prefix)
+			opnd_size = SIZE16;
+		if (addr_size_prefix)
+			addr_size = SIZE16;
+	} else {
+		if (opnd_size_prefix)
+			opnd_size = SIZE32;
+		if (addr_size_prefix)
+			addr_size = SIZE32;
+	}
+	/*
+	 * The pause instruction - a repz'd nop.  This doesn't fit
+	 * with any of the other prefix goop added for SSE, so we'll
+	 * special-case it here.
+	 */
+	if (rep_prefix == 0xf3 && opcode1 == 0x9 && opcode2 == 0x0) {
+		rep_prefix = 0;
+		dp = (instable_t *)&dis_opPause;
+	}
+
+	/*
+	 * Some 386 instructions have 2 bytes of opcode before the mod_r/m
+	 * byte so we may need to perform a table indirection.
+	 */
+	if (dp->it_indirect == (instable_t *)dis_op0F) {
+		if (dtrace_get_opcode(x, &opcode4, &opcode5) != 0)
+			goto error;
+		opcode_bytes = 2;
+		if (opcode4 == 0x7 && opcode5 >= 0x1 && opcode5 <= 0x3) {
+			uint_t	subcode;
+
+			if (dtrace_get_opcode(x, &opcode6, &opcode7) != 0)
+				goto error;
+			opcode_bytes = 3;
+			subcode = ((opcode6 & 0x3) << 1) |
+			    ((opcode7 & 0x8) >> 3);
+			dp = (instable_t *)&dis_op0F7123[opcode5][subcode];
+		} else if ((opcode4 == 0xc) && (opcode5 >= 0x8)) {
+			dp = (instable_t *)&dis_op0FC8[0];
+		} else if ((opcode4 == 0x3) && (opcode5 == 0xA)) {
+			opcode_bytes = 3;
+			if (dtrace_get_opcode(x, &opcode6, &opcode7) != 0)
+				goto error;
+			if (opnd_size == SIZE16)
+				opnd_size = SIZE32;
+
+			dp = (instable_t *)&dis_op0F3A[(opcode6<<4)|opcode7];
+#ifdef DIS_TEXT
+			if (strcmp(dp->it_name, "INVALID") == 0)
+				goto error;
+#endif
+			switch (dp->it_adrmode) {
+				case XMMP:
+					break;
+				case XMMP_66r:
+				case XMMPRM_66r:
+				case XMM3PM_66r:
+					if (opnd_size_prefix == 0) {
+						goto error;
+					}
+					break;
+				case XMMP_66o:
+					if (opnd_size_prefix == 0) {
+						/* SSSE3 MMX instructions */
+						dp_mmx = *dp;
+						dp = &dp_mmx;
+						dp->it_adrmode = MMOPM_66o;
+#ifdef	DIS_MEM
+						dp->it_size = 8;
+#endif
+					}
+					break;
+				default:
+					goto error;
+			}
+		} else if ((opcode4 == 0x3) && (opcode5 == 0x8)) {
+			opcode_bytes = 3;
+			if (dtrace_get_opcode(x, &opcode6, &opcode7) != 0)
+				goto error;
+			dp = (instable_t *)&dis_op0F38[(opcode6<<4)|opcode7];
+
+			/*
+			 * Both crc32 and movbe have the same 3rd opcode
+			 * byte of either 0xF0 or 0xF1, so we use another
+			 * indirection to distinguish between the two.
+			 */
+			if (dp->it_indirect == (instable_t *)dis_op0F38F0 ||
+			    dp->it_indirect == (instable_t *)dis_op0F38F1) {
+
+				dp = dp->it_indirect;
+				if (rep_prefix != 0xF2) {
+					/* It is movbe */
+					dp++;
+				}
+			}
+
+			/*
+			 * The adx family of instructions (adcx and adox)
+			 * continue the classic Intel tradition of abusing
+			 * arbitrary prefixes without actually meaning the
+			 * prefix bit. Therefore, if we find either the
+			 * opnd_size_prefix or rep_prefix we end up zeroing it
+			 * out after making our determination so as to ensure
+			 * that we don't get confused and accidentally print
+			 * repz prefixes and the like on these instructions.
+			 *
+			 * In addition, these instructions are actually much
+			 * closer to AVX instructions in semantics. Importantly,
+			 * they always default to having 32-bit operands.
+			 * However, if the CPU is in 64-bit mode, then and only
+			 * then, REX.w promotes the operands to 64 bits and
+			 * REX.r allows access to registers r8-r15.
+			 */
+			if (dp->it_indirect == (instable_t *)dis_op0F38F6) {
+				dp = dp->it_indirect;
+				if (opnd_size_prefix == 0 &&
+				    rep_prefix == 0xf3) {
+					/* It is adox */
+					dp++;
+				} else if (opnd_size_prefix != 0x66 &&
+				    rep_prefix != 0) {
+					/* It isn't adcx */
+					goto error;
+				}
+				opnd_size_prefix = 0;
+				rep_prefix = 0;
+				opnd_size = SIZE32;
+				if (rex_prefix & REX_W)
+					opnd_size = SIZE64;
+			}
+
+#ifdef DIS_TEXT
+			if (strcmp(dp->it_name, "INVALID") == 0)
+				goto error;
+#endif
+			switch (dp->it_adrmode) {
+				case ADX:
+				case XMM:
+					break;
+				case RM_66r:
+				case XMM_66r:
+				case XMMM_66r:
+					if (opnd_size_prefix == 0) {
+						goto error;
+					}
+					break;
+				case XMM_66o:
+					if (opnd_size_prefix == 0) {
+						/* SSSE3 MMX instructions */
+						dp_mmx = *dp;
+						dp = &dp_mmx;
+						dp->it_adrmode = MM;
+#ifdef	DIS_MEM
+						dp->it_size = 8;
+#endif
+					}
+					break;
+				case CRC32:
+					if (rep_prefix != 0xF2) {
+						goto error;
+					}
+					rep_prefix = 0;
+					break;
+				case MOVBE:
+					if (rep_prefix != 0x0) {
+						goto error;
+					}
+					break;
+				default:
+					goto error;
+			}
+		} else {
+			dp = (instable_t *)&dis_op0F[opcode4][opcode5];
+		}
+	}
+
+	/*
+	 * If still not at a TERM decode entry, then a ModRM byte
+	 * exists and its fields further decode the instruction.
+	 */
+	x->d86_got_modrm = 0;
+	if (dp->it_indirect != TERM) {
+		dtrace_get_modrm(x, &mode, &opcode3, &r_m);
+		if (x->d86_error)
+			goto error;
+		reg = opcode3;
+
+		/*
+		 * decode 287 instructions (D8-DF) from opcodeN
+		 */
+		if (opcode1 == 0xD && opcode2 >= 0x8) {
+			if (opcode2 == 0xB && mode == 0x3 && opcode3 == 4)
+				dp = (instable_t *)&dis_opFP5[r_m];
+			else if (opcode2 == 0xA && mode == 0x3 && opcode3 < 4)
+				dp = (instable_t *)&dis_opFP7[opcode3];
+			else if (opcode2 == 0xB && mode == 0x3)
+				dp = (instable_t *)&dis_opFP6[opcode3];
+			else if (opcode2 == 0x9 && mode == 0x3 && opcode3 >= 4)
+				dp = (instable_t *)&dis_opFP4[opcode3 - 4][r_m];
+			else if (mode == 0x3)
+				dp = (instable_t *)
+				    &dis_opFP3[opcode2 - 8][opcode3];
+			else
+				dp = (instable_t *)
+				    &dis_opFP1n2[opcode2 - 8][opcode3];
+		} else {
+			dp = (instable_t *)dp->it_indirect + opcode3;
+		}
+	}
+
+	/*
+	 * In amd64 mode, the ARPL opcode is reused as MOVSXD
+	 * (sign extend 32 bit to 64 bit)
+	 */
+	if ((vex_prefix == 0) && cpu_mode == SIZE64 &&
+	    opcode1 == 0x6 && opcode2 == 0x3)
+		dp = (instable_t *)&dis_opMOVSLD;
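+	/*
+	 * For example, in 64-bit mode 48 63 c3 decodes as
+	 * movslq %ebx, %rax rather than an arpl form.
+	 */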
+
+	/*
+	 * at this point we should have a correct (or invalid) opcode
+	 */
+	if ((cpu_mode == SIZE64 && dp->it_invalid64) ||
+	    (cpu_mode != SIZE64 && dp->it_invalid32))
+		goto error;
+	if (dp->it_indirect != TERM)
+		goto error;
+
+	/*
+	 * Deal with MMX/SSE opcodes which are changed by prefixes. Note, we do
+	 * need to include UNKNOWN below, as we may have instructions that
+	 * actually have a prefix, but don't exist in any other form.
+	 */
+	switch (dp->it_adrmode) {
+	case UNKNOWN:
+	case MMO:
+	case MMOIMPL:
+	case MMO3P:
+	case MMOM3:
+	case MMOMS:
+	case MMOPM:
+	case MMOPRM:
+	case MMOS:
+	case XMMO:
+	case XMMOM:
+	case XMMOMS:
+	case XMMOPM:
+	case XMMOS:
+	case XMMOMX:
+	case XMMOX3:
+	case XMMOXMM:
+		/*
+		 * This is horrible.  Some SIMD instructions take the
+		 * form 0x0F 0x?? ..., which is easily decoded using the
+		 * existing tables.  Other SIMD instructions use various
+		 * prefix bytes to overload existing instructions.  For
+		 * example, addps is 0F, 58, whereas addss is F3 (repz),
+		 * 0F, 58.  Presumably someone got a raise for this.
+		 *
+		 * If we see one of the instructions which can be
+		 * modified in this way (if we've got one of the SIMDO*
+		 * address modes), we'll check to see if the last prefix
+		 * was a repz.  If it was, we strip the prefix from the
+		 * mnemonic, and we indirect using the dis_opSIMDrepz
+		 * table.
+		 */
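+		/*
+		 * For instance, 0F 58 is addps; the same offset into
+		 * dis_opSIMDdata16 (0x66) yields addpd, dis_opSIMDrepz
+		 * (0xF3) yields addss, and dis_opSIMDrepnz (0xF2)
+		 * yields addsd.
+		 */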
+
+		/*
+		 * Calculate our offset in dis_op0F
+		 */
+		if ((uintptr_t)dp - (uintptr_t)dis_op0F > sizeof (dis_op0F))
+			goto error;
+
+		off = ((uintptr_t)dp - (uintptr_t)dis_op0F) /
+		    sizeof (instable_t);
+
+		/*
+		 * Rewrite if this instruction used one of the magic prefixes.
+		 */
+		if (rep_prefix) {
+			if (rep_prefix == 0xf2)
+				dp = (instable_t *)&dis_opSIMDrepnz[off];
+			else
+				dp = (instable_t *)&dis_opSIMDrepz[off];
+			rep_prefix = 0;
+		} else if (opnd_size_prefix) {
+			dp = (instable_t *)&dis_opSIMDdata16[off];
+			opnd_size_prefix = 0;
+			if (opnd_size == SIZE16)
+				opnd_size = SIZE32;
+		}
+		break;
+
+	case MG9:
+		/*
+		 * More horribleness: the group 9 (0x0F 0xC7) instructions are
+		 * allowed an optional prefix of 0x66 or 0xF3.  This is similar
+		 * to the SIMD business described above, but with a different
+		 * addressing mode (and an indirect table), so we deal with it
+		 * separately (if similarly).
+		 *
+		 * Intel further complicated this with the release of Ivy Bridge
+		 * where they overloaded these instructions based on the ModR/M
+		 * bytes. The VMX instructions have a mode of 0 since they are
+		 * memory instructions but rdrand instructions have a mode of
+		 * 0b11 (REG_ONLY) because they only operate on registers. While
+		 * there are different prefix formats, for now it is sufficient
+		 * to use a single different table.
+		 */
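+		/*
+		 * For example, 0F C7 /1 on memory is cmpxchg8b
+		 * (cmpxchg16b with REX.W), while /6 is vmptrld with no
+		 * prefix, vmclear with 0x66, vmxon with 0xF3, and
+		 * rdrand when the mode is REG_ONLY.
+		 */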
+
+		/*
+		 * Calculate our offset in dis_op0FC7 (the group 9 table)
+		 */
+		if ((uintptr_t)dp - (uintptr_t)dis_op0FC7 > sizeof (dis_op0FC7))
+			goto error;
+
+		off = ((uintptr_t)dp - (uintptr_t)dis_op0FC7) /
+		    sizeof (instable_t);
+
+		/*
+		 * If we have a mode of 0b11 then we have to rewrite this.
+		 */
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		if (mode == REG_ONLY) {
+			dp = (instable_t *)&dis_op0FC7m3[off];
+			break;
+		}
+
+		/*
+		 * Rewrite if this instruction used one of the magic prefixes.
+		 */
+		if (rep_prefix) {
+			if (rep_prefix == 0xf3)
+				dp = (instable_t *)&dis_opF30FC7[off];
+			else
+				goto error;
+			rep_prefix = 0;
+		} else if (opnd_size_prefix) {
+			dp = (instable_t *)&dis_op660FC7[off];
+			opnd_size_prefix = 0;
+			if (opnd_size == SIZE16)
+				opnd_size = SIZE32;
+		}
+		break;
+
+	case MMOSH:
+		/*
+		 * As with the "normal" SIMD instructions, the MMX
+		 * shuffle instructions are overloaded.  These
+		 * instructions, however, are special in that they use
+		 * an extra byte, and thus an extra table.  As of this
+		 * writing, they only use the opnd_size prefix.
+		 */
+
+		/*
+		 * Calculate our offset in dis_op0F7123
+		 */
+		if ((uintptr_t)dp - (uintptr_t)dis_op0F7123 >
+		    sizeof (dis_op0F7123))
+			goto error;
+
+		if (opnd_size_prefix) {
+			off = ((uintptr_t)dp - (uintptr_t)dis_op0F7123) /
+			    sizeof (instable_t);
+			dp = (instable_t *)&dis_opSIMD7123[off];
+			opnd_size_prefix = 0;
+			if (opnd_size == SIZE16)
+				opnd_size = SIZE32;
+		}
+		break;
+	case MRw:
+		if (rep_prefix) {
+			if (rep_prefix == 0xf3) {
+
+				/*
+				 * Calculate our offset in dis_op0F
+				 */
+				if ((uintptr_t)dp - (uintptr_t)dis_op0F
+				    > sizeof (dis_op0F))
+					goto error;
+
+				off = ((uintptr_t)dp - (uintptr_t)dis_op0F) /
+				    sizeof (instable_t);
+
+				dp = (instable_t *)&dis_opSIMDrepz[off];
+				rep_prefix = 0;
+			} else {
+				goto error;
+			}
+		}
+		break;
+	}
+
+	/*
+	 * In 64 bit mode, some opcodes automatically use opnd_size == SIZE64.
+	 */
+	if (cpu_mode == SIZE64)
+		if (dp->it_always64 || (opnd_size == SIZE32 && dp->it_stackop))
+			opnd_size = SIZE64;
+
+#ifdef DIS_TEXT
+	/*
+	 * At this point most instructions can format the opcode mnemonic
+	 * including the prefixes.
+	 */
+	if (lock_prefix)
+		(void) strlcat(x->d86_mnem, "lock ", OPLEN);
+
+	if (rep_prefix == 0xf2)
+		(void) strlcat(x->d86_mnem, "repnz ", OPLEN);
+	else if (rep_prefix == 0xf3)
+		(void) strlcat(x->d86_mnem, "repz ", OPLEN);
+
+	if (cpu_mode == SIZE64 && addr_size_prefix)
+		(void) strlcat(x->d86_mnem, "addr32 ", OPLEN);
+
+	if (dp->it_adrmode != CBW &&
+	    dp->it_adrmode != CWD &&
+	    dp->it_adrmode != XMMSFNC) {
+		if (strcmp(dp->it_name, "INVALID") == 0)
+			goto error;
+		(void) strlcat(x->d86_mnem, dp->it_name, OPLEN);
+		if (dp->it_avxsuf && dp->it_suffix) {
+			(void) strlcat(x->d86_mnem, vex_W != 0 ? "q" : "d",
+			    OPLEN);
+		} else if (dp->it_suffix) {
+			char *types[] = {"", "w", "l", "q"};
+			if (opcode_bytes == 2 && opcode4 == 4) {
+				/* It's a cmovx.yy. Replace the suffix x */
+				for (i = 5; i < OPLEN; i++) {
+					if (x->d86_mnem[i] == '.')
+						break;
+				}
+				x->d86_mnem[i - 1] = *types[opnd_size];
+			} else if ((opnd_size == 2) && (opcode_bytes == 3) &&
+			    ((opcode6 == 1 && opcode7 == 6) ||
+			    (opcode6 == 2 && opcode7 == 2))) {
+				/*
+				 * To handle PINSRD and PEXTRD
+				 */
+				(void) strlcat(x->d86_mnem, "d", OPLEN);
+			} else {
+				(void) strlcat(x->d86_mnem, types[opnd_size],
+				    OPLEN);
+			}
+		}
+	}
+#endif
+
+	/*
+	 * Process operands based on the addressing modes.
+	 */
+	x->d86_mode = cpu_mode;
+	/*
+	 * In vex mode the rex_prefix has no meaning
+	 */
+	if (!vex_prefix)
+		x->d86_rex_prefix = rex_prefix;
+	x->d86_opnd_size = opnd_size;
+	x->d86_addr_size = addr_size;
+	vbit = 0;		/* initialize for mem/reg -> reg */
+	switch (dp->it_adrmode) {
+		/*
+		 * amd64 instruction to sign extend 32 bit reg/mem operands
+		 * into 64 bit register values
+		 */
+	case MOVSXZ:
+#ifdef DIS_TEXT
+		if (rex_prefix == 0)
+			(void) strncpy(x->d86_mnem, "movzld", OPLEN);
+#endif
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
+		x->d86_opnd_size = SIZE64;
+		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
+		x->d86_opnd_size = opnd_size = SIZE32;
+		wbit = LONG_OPND;
+		dtrace_get_operand(x, mode, r_m, wbit, 0);
+		break;
+
+		/*
+		 * movsbl movsbw movsbq (0x0FBE) or movswl movswq (0x0FBF)
+		 * movzbl movzbw movzbq (0x0FB6) or movzwl movzwq (0x0FB7)
+		 * wbit lives in 2nd byte, note that operands
+		 * are different sized
+		 */
+	case MOVZ:
+		if (rex_prefix & REX_W) {
+			/* target register size = 64 bit */
+			x->d86_mnem[5] = 'q';
+		}
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
+		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
+		x->d86_opnd_size = opnd_size = SIZE16;
+		wbit = WBIT(opcode5);
+		dtrace_get_operand(x, mode, r_m, wbit, 0);
+		break;
+	case CRC32:
+		opnd_size = SIZE32;
+		if (rex_prefix & REX_W)
+			opnd_size = SIZE64;
+		x->d86_opnd_size = opnd_size;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
+		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
+		wbit = WBIT(opcode7);
+		if (opnd_size_prefix)
+			x->d86_opnd_size = opnd_size = SIZE16;
+		dtrace_get_operand(x, mode, r_m, wbit, 0);
+		break;
+	case MOVBE:
+		opnd_size = SIZE32;
+		if (rex_prefix & REX_W)
+			opnd_size = SIZE64;
+		x->d86_opnd_size = opnd_size;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
+		wbit = WBIT(opcode7);
+		if (opnd_size_prefix)
+			x->d86_opnd_size = opnd_size = SIZE16;
+		if (wbit) {
+			/* reg -> mem */
+			dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0);
+			dtrace_get_operand(x, mode, r_m, wbit, 1);
+		} else {
+			/* mem -> reg */
+			dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
+			dtrace_get_operand(x, mode, r_m, wbit, 0);
+		}
+		break;
+
+	/*
+	 * imul instruction, with either 8-bit or longer immediate
+	 * opcode 0x6B for byte, sign-extended displacement, 0x69 for word(s)
+	 */
+	case IMUL:
+		wbit = LONG_OPND;
+		THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND,
+		    OPSIZE(opnd_size, opcode2 == 0x9), 1);
+		break;
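+	/*
+	 * For example, 6b c3 05 is imull $0x5, %ebx, %eax; the 0x6B
+	 * form sign-extends its one-byte immediate while the 0x69 form
+	 * carries a full operand-sized immediate.
+	 */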
+
+	/* memory or register operand to register, with 'w' bit	*/
+	case MRw:
+	case ADX:
+		wbit = WBIT(opcode2);
+		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0);
+		break;
+
+	/* register to memory or register operand, with 'w' bit	*/
+	/* arpl happens to fit here also because it is odd */
+	case RMw:
+		if (opcode_bytes == 2)
+			wbit = WBIT(opcode5);
+		else
+			wbit = WBIT(opcode2);
+		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1);
+		break;
+
+	/* xaddb instruction */
+	case XADDB:
+		wbit = 0;
+		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1);
+		break;
+
+	/* MMX register to memory or register operand		*/
+	case MMS:
+	case MMOS:
+#ifdef DIS_TEXT
+		wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND;
+#else
+		wbit = LONG_OPND;
+#endif
+		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 1);
+		break;
+
+	/* MMX register to memory */
+	case MMOMS:
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		if (mode == REG_ONLY)
+			goto error;
+		wbit = MM_OPND;
+		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 1);
+		break;
+
+	/* Double shift. Has immediate operand specifying the shift. */
+	case DSHIFT:
+		wbit = LONG_OPND;
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
+		dtrace_get_operand(x, mode, r_m, wbit, 2);
+		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
+		dtrace_imm_opnd(x, wbit, 1, 0);
+		break;
+
+	/*
+	 * Double shift. With no immediate operand, specifies using %cl.
+	 */
+	case DSHIFTcl:
+		wbit = LONG_OPND;
+		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1);
+		break;
+
+	/* immediate to memory or register operand */
+	case IMlw:
+		wbit = WBIT(opcode2);
+		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
+		dtrace_get_operand(x, mode, r_m, wbit, 1);
+		/*
+		 * Have long immediate for opcode 0x81, but not 0x80 nor 0x83
+		 */
+		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, opcode2 == 1), 0);
+		break;
+
+	/* immediate to memory or register operand with the	*/
+	/* 'w' bit present					*/
+	case IMw:
+		wbit = WBIT(opcode2);
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
+		dtrace_get_operand(x, mode, r_m, wbit, 1);
+		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, wbit), 0);
+		break;
+
+	/* immediate to register with register in low 3 bits	*/
+	/* of op code						*/
+	case IR:
+		/* w-bit here (with regs) is bit 3 */
+		wbit = opcode2 >> 3 & 0x1;
+		reg = REGNO(opcode2);
+		dtrace_rex_adjust(rex_prefix, mode, &reg, NULL);
+		mode = REG_ONLY;
+		r_m = reg;
+		dtrace_get_operand(x, mode, r_m, wbit, 1);
+		dtrace_imm_opnd(x, wbit, OPSIZE64(opnd_size, wbit), 0);
+		break;
+
+	/* MMX immediate shift of register */
+	case MMSH:
+	case MMOSH:
+		wbit = MM_OPND;
+		goto mm_shift;	/* in next case */
+
+	/* SIMD immediate shift of register */
+	case XMMSH:
+		wbit = XMM_OPND;
+mm_shift:
+		reg = REGNO(opcode7);
+		dtrace_rex_adjust(rex_prefix, mode, &reg, NULL);
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 1);
+		dtrace_imm_opnd(x, wbit, 1, 0);
+		NOMEM;
+		break;
+
+	/* accumulator to memory operand */
+	case AO:
+		vbit = 1;
+		/*FALLTHROUGH*/
+
+	/* memory operand to accumulator */
+	case OA:
+		wbit = WBIT(opcode2);
+		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1 - vbit);
+		dtrace_imm_opnd(x, wbit, OPSIZE64(addr_size, LONG_OPND), vbit);
+#ifdef DIS_TEXT
+		x->d86_opnd[vbit].d86_mode = MODE_OFFSET;
+#endif
+		break;
+
+	/* segment register to memory or register operand */
+	case SM:
+		vbit = 1;
+		/*FALLTHROUGH*/
+
+	/* memory or register operand to segment register */
+	case MS:
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
+		dtrace_get_operand(x, mode, r_m, LONG_OPND, vbit);
+		dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 1 - vbit);
+		break;
+
+	/*
+	 * rotate or shift instructions, which may shift by 1 or
+	 * consult the cl register, depending on the 'v' bit
+	 */
+	case Mv:
+		vbit = VBIT(opcode2);
+		wbit = WBIT(opcode2);
+		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
+		dtrace_get_operand(x, mode, r_m, wbit, 1);
+#ifdef DIS_TEXT
+		if (vbit) {
+			(void) strlcat(x->d86_opnd[0].d86_opnd, "%cl", OPLEN);
+		} else {
+			x->d86_opnd[0].d86_mode = MODE_SIGNED;
+			x->d86_opnd[0].d86_value_size = 1;
+			x->d86_opnd[0].d86_value = 1;
+		}
+#endif
+		break;
+	/*
+	 * immediate rotate or shift instructions
+	 */
+	case MvI:
+		wbit = WBIT(opcode2);
+normal_imm_mem:
+		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
+		dtrace_get_operand(x, mode, r_m, wbit, 1);
+		dtrace_imm_opnd(x, wbit, 1, 0);
+		break;
+
+	/* bit test instructions */
+	case MIb:
+		wbit = LONG_OPND;
+		goto normal_imm_mem;
+
+	/* single memory or register operand with 'w' bit present */
+	case Mw:
+		wbit = WBIT(opcode2);
+just_mem:
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
+		dtrace_get_operand(x, mode, r_m, wbit, 0);
+		break;
+
+	case SWAPGS_RDTSCP:
+		if (cpu_mode == SIZE64 && mode == 3 && r_m == 0) {
+#ifdef DIS_TEXT
+			(void) strncpy(x->d86_mnem, "swapgs", OPLEN);
+#endif
+			NOMEM;
+			break;
+		} else if (mode == 3 && r_m == 1) {
+#ifdef DIS_TEXT
+			(void) strncpy(x->d86_mnem, "rdtscp", OPLEN);
+#endif
+			NOMEM;
+			break;
+		}
+
+		/*FALLTHROUGH*/
+
+	/* prefetch instruction - memory operand, but no memory access */
+	case PREF:
+		NOMEM;
+		/*FALLTHROUGH*/
+
+	/* single memory or register operand */
+	case M:
+	case MG9:
+		wbit = LONG_OPND;
+		goto just_mem;
+
+	/* single memory or register byte operand */
+	case Mb:
+		wbit = BYTE_OPND;
+		goto just_mem;
+
+	case VMx:
+		if (mode == 3) {
+#ifdef DIS_TEXT
+			char *vminstr;
+
+			switch (r_m) {
+			case 1:
+				vminstr = "vmcall";
+				break;
+			case 2:
+				vminstr = "vmlaunch";
+				break;
+			case 3:
+				vminstr = "vmresume";
+				break;
+			case 4:
+				vminstr = "vmxoff";
+				break;
+			default:
+				goto error;
+			}
+
+			(void) strncpy(x->d86_mnem, vminstr, OPLEN);
+#else
+			if (r_m < 1 || r_m > 4)
+				goto error;
+#endif
+
+			NOMEM;
+			break;
+		}
+		/*FALLTHROUGH*/
+	case SVM:
+		if (mode == 3) {
+#ifdef DIS_TEXT
+			char *vinstr;
+
+			switch (r_m) {
+			case 0:
+				vinstr = "vmrun";
+				break;
+			case 1:
+				vinstr = "vmmcall";
+				break;
+			case 2:
+				vinstr = "vmload";
+				break;
+			case 3:
+				vinstr = "vmsave";
+				break;
+			case 4:
+				vinstr = "stgi";
+				break;
+			case 5:
+				vinstr = "clgi";
+				break;
+			case 6:
+				vinstr = "skinit";
+				break;
+			case 7:
+				vinstr = "invlpga";
+				break;
+			}
+
+			(void) strncpy(x->d86_mnem, vinstr, OPLEN);
+#endif
+			NOMEM;
+			break;
+		}
+		/*FALLTHROUGH*/
+	case MONITOR_MWAIT:
+		if (mode == 3) {
+			if (r_m == 0) {
+#ifdef DIS_TEXT
+				(void) strncpy(x->d86_mnem, "monitor", OPLEN);
+#endif
+				NOMEM;
+				break;
+			} else if (r_m == 1) {
+#ifdef DIS_TEXT
+				(void) strncpy(x->d86_mnem, "mwait", OPLEN);
+#endif
+				NOMEM;
+				break;
+			} else if (r_m == 2) {
+#ifdef DIS_TEXT
+				(void) strncpy(x->d86_mnem, "clac", OPLEN);
+#endif
+				NOMEM;
+				break;
+			} else if (r_m == 3) {
+#ifdef DIS_TEXT
+				(void) strncpy(x->d86_mnem, "stac", OPLEN);
+#endif
+				NOMEM;
+				break;
+			} else {
+				goto error;
+			}
+		}
+		/*FALLTHROUGH*/
+	case XGETBV_XSETBV:
+		if (mode == 3) {
+			if (r_m == 0) {
+#ifdef DIS_TEXT
+				(void) strncpy(x->d86_mnem, "xgetbv", OPLEN);
+#endif
+				NOMEM;
+				break;
+			} else if (r_m == 1) {
+#ifdef DIS_TEXT
+				(void) strncpy(x->d86_mnem, "xsetbv", OPLEN);
+#endif
+				NOMEM;
+				break;
+			} else {
+				goto error;
+			}
+
+		}
+		/*FALLTHROUGH*/
+	case MO:
+		/* Similar to M, but only memory (no direct registers) */
+		wbit = LONG_OPND;
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		if (mode == 3)
+			goto error;
+		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
+		dtrace_get_operand(x, mode, r_m, wbit, 0);
+		break;
+
+	/* move special register to register or reverse if vbit */
+	case SREG:
+		switch (opcode5) {
+
+		case 2:
+			vbit = 1;
+			/*FALLTHROUGH*/
+		case 0:
+			wbit = CONTROL_OPND;
+			break;
+
+		case 3:
+			vbit = 1;
+			/*FALLTHROUGH*/
+		case 1:
+			wbit = DEBUG_OPND;
+			break;
+
+		case 6:
+			vbit = 1;
+			/*FALLTHROUGH*/
+		case 4:
+			wbit = TEST_OPND;
+			break;
+
+		}
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, vbit);
+		dtrace_get_operand(x, REG_ONLY, r_m, LONG_OPND, 1 - vbit);
+		NOMEM;
+		break;
+
+	/*
+	 * single register operand with register in the low 3
+	 * bits of op code
+	 */
+	case R:
+		if (opcode_bytes == 2)
+			reg = REGNO(opcode5);
+		else
+			reg = REGNO(opcode2);
+		dtrace_rex_adjust(rex_prefix, mode, &reg, NULL);
+		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0);
+		NOMEM;
+		break;
+
+	/*
+	 * register to accumulator with register in the low 3
+	 * bits of op code, xchg instructions
+	 */
+	case RA:
+		NOMEM;
+		reg = REGNO(opcode2);
+		dtrace_rex_adjust(rex_prefix, mode, &reg, NULL);
+		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0);
+		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, LONG_OPND, 1);
+		break;
+
+	/*
+	 * single segment register operand, with register in
+	 * bits 3-4 of op code byte
+	 */
+	case SEG:
+		NOMEM;
+		reg = (x->d86_bytes[x->d86_len - 1] >> 3) & 0x3;
+		dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 0);
+		break;
+
+	/*
+	 * single segment register operand, with register in
+	 * bits 3-5 of op code
+	 */
+	case LSEG:
+		NOMEM;
+		/* long seg reg from opcode */
+		reg = (x->d86_bytes[x->d86_len - 1] >> 3) & 0x7;
+		dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 0);
+		break;
+
+	/* memory or register operand to register */
+	case MR:
+		if (vex_prefetch)
+			x->d86_got_modrm = 1;
+		wbit = LONG_OPND;
+		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0);
+		break;
+
+	case RM:
+	case RM_66r:
+		wbit = LONG_OPND;
+		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1);
+		break;
+
+	/* MMX/SIMD-Int memory or mm reg to mm reg		*/
+	case MM:
+	case MMO:
+#ifdef DIS_TEXT
+		wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND;
+#else
+		wbit = LONG_OPND;
+#endif
+		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 0);
+		break;
+
+	case MMOIMPL:
+#ifdef DIS_TEXT
+		wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND;
+#else
+		wbit = LONG_OPND;
+#endif
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		if (mode != REG_ONLY)
+			goto error;
+
+		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
+		dtrace_get_operand(x, mode, r_m, wbit, 0);
+		dtrace_get_operand(x, REG_ONLY, reg, MM_OPND, 1);
+		mode = 0;	/* change for memory access size... */
+		break;
+
+	/* MMX/SIMD-Int and SIMD-FP predicated mm reg to r32 */
+	case MMO3P:
+		wbit = MM_OPND;
+		goto xmm3p;
+	case XMM3P:
+		wbit = XMM_OPND;
+xmm3p:
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		if (mode != REG_ONLY)
+			goto error;
+
+		THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 1,
+		    1);
+		NOMEM;
+		break;
+
+	case XMM3PM_66r:
+		THREEOPERAND(x, mode, reg, r_m, rex_prefix, LONG_OPND, XMM_OPND,
+		    1, 0);
+		break;
+
+	/* MMX/SIMD-Int predicated r32/mem to mm reg */
+	case MMOPRM:
+		wbit = LONG_OPND;
+		w2 = MM_OPND;
+		goto xmmprm;
+	case XMMPRM:
+	case XMMPRM_66r:
+		wbit = LONG_OPND;
+		w2 = XMM_OPND;
+xmmprm:
+		THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, 1, 1);
+		break;
+
+	/* MMX/SIMD-Int predicated mm/mem to mm reg */
+	case MMOPM:
+	case MMOPM_66o:
+		wbit = w2 = MM_OPND;
+		goto xmmprm;
+
+	/* MMX/SIMD-Int mm reg to r32 */
+	case MMOM3:
+		NOMEM;
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		if (mode != REG_ONLY)
+			goto error;
+		wbit = MM_OPND;
+		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 0);
+		break;
+
+	/* SIMD memory or xmm reg operand to xmm reg		*/
+	case XMM:
+	case XMM_66o:
+	case XMM_66r:
+	case XMMO:
+	case XMMXIMPL:
+		wbit = XMM_OPND;
+		STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0);
+
+		if (dp->it_adrmode == XMMXIMPL && mode != REG_ONLY)
+			goto error;
+
+#ifdef DIS_TEXT
+		/*
+		 * movlps and movhlps share opcodes.  They differ in the
+		 * addressing modes allowed for their operands.
+		 * movhps and movlhps behave similarly.
+		 */
+		if (mode == REG_ONLY) {
+			if (strcmp(dp->it_name, "movlps") == 0)
+				(void) strncpy(x->d86_mnem, "movhlps", OPLEN);
+			else if (strcmp(dp->it_name, "movhps") == 0)
+				(void) strncpy(x->d86_mnem, "movlhps", OPLEN);
+		}
+#endif
+		if (dp->it_adrmode == XMMXIMPL)
+			mode = 0;	/* change for memory access size... */
+		break;
+
+	/* SIMD xmm reg to memory or xmm reg */
+	case XMMS:
+	case XMMOS:
+	case XMMMS:
+	case XMMOMS:
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+#ifdef DIS_TEXT
+		if ((strcmp(dp->it_name, "movlps") == 0 ||
+		    strcmp(dp->it_name, "movhps") == 0 ||
+		    strcmp(dp->it_name, "movntps") == 0) &&
+		    mode == REG_ONLY)
+			goto error;
+#endif
+		wbit = XMM_OPND;
+		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1);
+		break;
+
+	/* SIMD memory to xmm reg */
+	case XMMM:
+	case XMMM_66r:
+	case XMMOM:
+		wbit = XMM_OPND;
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+#ifdef DIS_TEXT
+		if (mode == REG_ONLY) {
+			if (strcmp(dp->it_name, "movhps") == 0)
+				(void) strncpy(x->d86_mnem, "movlhps", OPLEN);
+			else
+				goto error;
+		}
+#endif
+		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0);
+		break;
+
+	/* SIMD memory or r32 to xmm reg			*/
+	case XMM3MX:
+		wbit = LONG_OPND;
+		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0);
+		break;
+
+	case XMM3MXS:
+		wbit = LONG_OPND;
+		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1);
+		break;
+
+	/* SIMD memory or mm reg to xmm reg			*/
+	case XMMOMX:
+	/* SIMD mm to xmm */
+	case XMMMX:
+		wbit = MM_OPND;
+		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0);
+		break;
+
+	/* SIMD memory or xmm reg to mm reg			*/
+	case XMMXMM:
+	case XMMOXMM:
+	case XMMXM:
+		wbit = XMM_OPND;
+		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 0);
+		break;
+
+	/* SIMD memory or xmm reg to r32			*/
+	case XMMXM3:
+		wbit = XMM_OPND;
+		MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 0);
+		break;
+
+	/* SIMD xmm to r32					*/
+	case XMMX3:
+	case XMMOX3:
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		if (mode != REG_ONLY)
+			goto error;
+		dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
+		dtrace_get_operand(x, mode, r_m, XMM_OPND, 0);
+		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1);
+		NOMEM;
+		break;
+
+	/* SIMD predicated memory or xmm reg with/to xmm reg */
+	case XMMP:
+	case XMMP_66r:
+	case XMMP_66o:
+	case XMMOPM:
+		wbit = XMM_OPND;
+		THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1,
+		    1);
+
+#ifdef DIS_TEXT
+		/*
+		 * cmpps and cmpss vary their instruction name based
+		 * on the value of imm8.  Other XMMP instructions,
+		 * such as shufps, require explicit specification of
+		 * the predicate.
+		 */
+		if (dp->it_name[0] == 'c' &&
+		    dp->it_name[1] == 'm' &&
+		    dp->it_name[2] == 'p' &&
+		    strlen(dp->it_name) == 5) {
+			uchar_t pred = x->d86_opnd[0].d86_value & 0xff;
+
+			if (pred >= (sizeof (dis_PREDSUFFIX) / sizeof (char *)))
+				goto error;
+
+			(void) strncpy(x->d86_mnem, "cmp", OPLEN);
+			(void) strlcat(x->d86_mnem, dis_PREDSUFFIX[pred],
+			    OPLEN);
+			(void) strlcat(x->d86_mnem,
+			    dp->it_name + strlen(dp->it_name) - 2,
+			    OPLEN);
+			x->d86_opnd[0] = x->d86_opnd[1];
+			x->d86_opnd[1] = x->d86_opnd[2];
+			x->d86_numopnds = 2;
+		}
+#endif
+		break;
+
+	case XMMX2I:
+		FOUROPERAND(x, mode, reg, r_m, rex_prefix, XMM_OPND, XMM_OPND,
+		    1);
+		NOMEM;
+		break;
+
+	case XMM2I:
+		ONEOPERAND_TWOIMM(x, mode, reg, r_m, rex_prefix, XMM_OPND, 1);
+		NOMEM;
+		break;
+
+	/* immediate operand to accumulator */
+	case IA:
+		wbit = WBIT(opcode2);
+		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1);
+		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, wbit), 0);
+		NOMEM;
+		break;
+
+	/* memory or register operand to accumulator */
+	case MA:
+		wbit = WBIT(opcode2);
+		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
+		dtrace_get_operand(x, mode, r_m, wbit, 0);
+		break;
+
+	/* si register to di register used to reference memory		*/
+	case SD:
+#ifdef DIS_TEXT
+		dtrace_check_override(x, 0);
+		x->d86_numopnds = 2;
+		if (addr_size == SIZE64) {
+			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%rsi)",
+			    OPLEN);
+			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%rdi)",
+			    OPLEN);
+		} else if (addr_size == SIZE32) {
+			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%esi)",
+			    OPLEN);
+			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%edi)",
+			    OPLEN);
+		} else {
+			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%si)",
+			    OPLEN);
+			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%di)",
+			    OPLEN);
+		}
+#endif
+		wbit = LONG_OPND;
+		break;
+
+	/* accumulator to di register				*/
+	case AD:
+		wbit = WBIT(opcode2);
+#ifdef DIS_TEXT
+		dtrace_check_override(x, 1);
+		x->d86_numopnds = 2;
+		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 0);
+		if (addr_size == SIZE64)
+			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%rdi)",
+			    OPLEN);
+		else if (addr_size == SIZE32)
+			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%edi)",
+			    OPLEN);
+		else
+			(void) strlcat(x->d86_opnd[1].d86_opnd, "(%di)",
+			    OPLEN);
+#endif
+		break;
+
+	/* si register to accumulator				*/
+	case SA:
+		wbit = WBIT(opcode2);
+#ifdef DIS_TEXT
+		dtrace_check_override(x, 0);
+		x->d86_numopnds = 2;
+		if (addr_size == SIZE64)
+			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%rsi)",
+			    OPLEN);
+		else if (addr_size == SIZE32)
+			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%esi)",
+			    OPLEN);
+		else
+			(void) strlcat(x->d86_opnd[0].d86_opnd, "(%si)",
+			    OPLEN);
+		dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1);
+#endif
+		break;
+
+	/*
+	 * single operand, a 16/32 bit displacement
+	 */
+	case D:
+		wbit = LONG_OPND;
+		dtrace_disp_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 0);
+		NOMEM;
+		break;
+
+	/* jmp/call indirect to memory or register operand		*/
+	case INM:
+#ifdef DIS_TEXT
+		(void) strlcat(x->d86_opnd[0].d86_prefix, "*", OPLEN);
+#endif
+		dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m);
+		dtrace_get_operand(x, mode, r_m, LONG_OPND, 0);
+		wbit = LONG_OPND;
+		break;
+
+	/*
+	 * for long jumps and long calls -- a new code segment
+	 * register and an offset in IP -- stored in object
+	 * code in reverse order. Note - not valid in amd64
+	 */
+	case SO:
+		dtrace_check_override(x, 1);
+		wbit = LONG_OPND;
+		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 1);
+#ifdef DIS_TEXT
+		x->d86_opnd[1].d86_mode = MODE_SIGNED;
+#endif
+		/* will now get segment operand */
+		dtrace_imm_opnd(x, wbit, 2, 0);
+		break;
+
+	/*
+	 * jmp/call. single operand, 8 bit displacement.
+	 * added to current EIP in 'compoff'
+	 */
+	case BD:
+		dtrace_disp_opnd(x, BYTE_OPND, 1, 0);
+		NOMEM;
+		break;
+
+	/* single 32/16 bit immediate operand			*/
+	case I:
+		wbit = LONG_OPND;
+		dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 0);
+		break;
+
+	/* single 8 bit immediate operand			*/
+	case Ib:
+		wbit = LONG_OPND;
+		dtrace_imm_opnd(x, wbit, 1, 0);
+		break;
+
+	case ENTER:
+		wbit = LONG_OPND;
+		dtrace_imm_opnd(x, wbit, 2, 0);
+		dtrace_imm_opnd(x, wbit, 1, 1);
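+		/*
+		 * The one-byte immediate is the nesting level: enter
+		 * pushes %rbp plus that many further frame-pointer
+		 * slots, so e.g. "enter $0x20, $0x3" in 64-bit mode
+		 * writes (3 + 1) * 8 bytes of stack; the 0x20 bytes of
+		 * locals are allocated but not written.
+		 */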
+		switch (opnd_size) {
+		case SIZE64:
+			x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 8;
+			break;
+		case SIZE32:
+			x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 4;
+			break;
+		case SIZE16:
+			x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 2;
+			break;
+		}
+
+		break;
+
+	/* 16-bit immediate operand */
+	case RET:
+		wbit = LONG_OPND;
+		dtrace_imm_opnd(x, wbit, 2, 0);
+		break;
+
+	/* single 8 bit port operand				*/
+	case P:
+		dtrace_check_override(x, 0);
+		dtrace_imm_opnd(x, BYTE_OPND, 1, 0);
+		NOMEM;
+		break;
+
+	/* single operand, dx register (variable port instruction) */
+	case V:
+		x->d86_numopnds = 1;
+		dtrace_check_override(x, 0);
+#ifdef DIS_TEXT
+		(void) strlcat(x->d86_opnd[0].d86_opnd, "(%dx)", OPLEN);
+#endif
+		NOMEM;
+		break;
+
+	/*
+	 * The int instruction, which has two forms:
+	 * int 3 (breakpoint) or
+	 * int n, where n is indicated in the subsequent
+	 * byte (format Ib).  In the int 3 instruction (opcode 0xCC),
+	 * although the 3 looks like an operand, it is implied by the
+	 * opcode and must be converted to the correct base for output.
+	 */
+	case INT3:
+#ifdef DIS_TEXT
+		x->d86_numopnds = 1;
+		x->d86_opnd[0].d86_mode = MODE_SIGNED;
+		x->d86_opnd[0].d86_value_size = 1;
+		x->d86_opnd[0].d86_value = 3;
+#endif
+		NOMEM;
+		break;
+
+	/* single 8 bit immediate operand			*/
+	case INTx:
+		dtrace_imm_opnd(x, BYTE_OPND, 1, 0);
+		NOMEM;
+		break;
+
+	/* an unused byte must be discarded */
+	case U:
+		if (x->d86_get_byte(x->d86_data) < 0)
+			goto error;
+		x->d86_len++;
+		NOMEM;
+		break;
+
+	case CBW:
+#ifdef DIS_TEXT
+		if (opnd_size == SIZE16)
+			(void) strlcat(x->d86_mnem, "cbtw", OPLEN);
+		else if (opnd_size == SIZE32)
+			(void) strlcat(x->d86_mnem, "cwtl", OPLEN);
+		else
+			(void) strlcat(x->d86_mnem, "cltq", OPLEN);
+#endif
+		wbit = LONG_OPND;
+		NOMEM;
+		break;
+
+	case CWD:
+#ifdef DIS_TEXT
+		if (opnd_size == SIZE16)
+			(void) strlcat(x->d86_mnem, "cwtd", OPLEN);
+		else if (opnd_size == SIZE32)
+			(void) strlcat(x->d86_mnem, "cltd", OPLEN);
+		else
+			(void) strlcat(x->d86_mnem, "cqtd", OPLEN);
+#endif
+		wbit = LONG_OPND;
+		NOMEM;
+		break;
+
+	case XMMSFNC:
+		/*
+		 * The mnemonic is 'sfence' if mode is REG_ONLY; otherwise
+		 * it is 'clflush'.
+		 */
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+
+		/* sfence doesn't take operands */
+#ifdef DIS_TEXT
+		if (mode == REG_ONLY) {
+			(void) strlcat(x->d86_mnem, "sfence", OPLEN);
+		} else {
+			(void) strlcat(x->d86_mnem, "clflush", OPLEN);
+			dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
+			dtrace_get_operand(x, mode, r_m, BYTE_OPND, 0);
+			NOMEM;
+		}
+#else
+		if (mode != REG_ONLY) {
+			dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
+			dtrace_get_operand(x, mode, r_m, LONG_OPND, 0);
+			NOMEM;
+		}
+#endif
+		break;
+
+	/*
+	 * no disassembly, the mnemonic was all there was so go on
+	 */
+	case NORM:
+		if (dp->it_invalid32 && cpu_mode != SIZE64)
+			goto error;
+		NOMEM;
+		/*FALLTHROUGH*/
+	case IMPLMEM:
+		break;
+
+	case XMMFENCE:
+		/*
+		 * XRSTOR and LFENCE share the same opcode but differ in mode
+		 */
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+
+		if (mode == REG_ONLY) {
+			/*
+			 * Only the following exact byte sequences are allowed:
+			 *
+			 * 	0f ae e8	lfence
+			 * 	0f ae f0	mfence
+			 */
+			if ((uint8_t)x->d86_bytes[x->d86_len - 1] != 0xe8 &&
+			    (uint8_t)x->d86_bytes[x->d86_len - 1] != 0xf0)
+				goto error;
+		} else {
+#ifdef DIS_TEXT
+			(void) strncpy(x->d86_mnem, "xrstor", OPLEN);
+#endif
+			dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m);
+			dtrace_get_operand(x, mode, r_m, BYTE_OPND, 0);
+		}
+		break;
+
+	/* float reg */
+	case F:
+#ifdef DIS_TEXT
+		x->d86_numopnds = 1;
+		(void) strlcat(x->d86_opnd[0].d86_opnd, "%st(X)", OPLEN);
+		x->d86_opnd[0].d86_opnd[4] = r_m + '0';
+#endif
+		NOMEM;
+		break;
+
+	/* float reg to float reg, with ret bit present */
+	case FF:
+		vbit = opcode2 >> 2 & 0x1;	/* vbit = 1: st -> st(i) */
+		/*FALLTHROUGH*/
+	case FFC:				/* case for vbit always = 0 */
+#ifdef DIS_TEXT
+		x->d86_numopnds = 2;
+		(void) strlcat(x->d86_opnd[1 - vbit].d86_opnd, "%st", OPLEN);
+		(void) strlcat(x->d86_opnd[vbit].d86_opnd, "%st(X)", OPLEN);
+		x->d86_opnd[vbit].d86_opnd[4] = r_m + '0';
+#endif
+		NOMEM;
+		break;
+
+	/* AVX instructions */
+	case VEX_MO:
+		/* op(ModR/M.r/m) */
+		x->d86_numopnds = 1;
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+#ifdef DIS_TEXT
+		if ((dp == &dis_opAVX0F[0xA][0xE]) && (reg == 3))
+			(void) strncpy(x->d86_mnem, "vstmxcsr", OPLEN);
+#endif
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+		dtrace_get_operand(x, mode, r_m, wbit, 0);
+		break;
+	case VEX_RMrX:
+	case FMA:
+		/* ModR/M.reg := op(VEX.vvvv, ModR/M.r/m) */
+		x->d86_numopnds = 3;
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+
+		/*
+		 * In classic Intel fashion, the opcodes for all of the FMA
+		 * instructions all have two possible mnemonics which vary by
+		 * one letter, which is selected based on the value of the wbit.
+		 * When wbit is one, they have the 'd' suffix and when 'wbit' is
+		 * 0, they have the 's' suffix. Otherwise, the FMA instructions
+		 * are all a standard VEX_RMrX.
+		 */
+#ifdef DIS_TEXT
+		if (dp->it_adrmode == FMA) {
+			size_t len = strlen(dp->it_name);
+			(void) strncpy(x->d86_mnem, dp->it_name, OPLEN);
+			if (len + 1 < OPLEN) {
+				(void) strncpy(x->d86_mnem + len,
+				    vex_W != 0 ? "d" : "s", OPLEN - len);
+			}
+		}
+#endif
+
+		if (mode != REG_ONLY) {
+			if ((dp == &dis_opAVXF20F[0x10]) ||
+			    (dp == &dis_opAVXF30F[0x10])) {
+				/* vmovsd <m64>, <xmm> */
+				/* or vmovss <m64>, <xmm> */
+				x->d86_numopnds = 2;
+				goto L_VEX_MX;
+			}
+		}
+
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 2);
+		/*
+		 * VEX prefix uses the 1's complement form to encode the
+		 * XMM/YMM regs
+		 */
+		dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 1);
+
+		if ((dp == &dis_opAVXF20F[0x2A]) ||
+		    (dp == &dis_opAVXF30F[0x2A])) {
+			/*
+			 * vcvtsi2sd <reg/mem>, <xmm>, <xmm> or
+			 * vcvtsi2ss <reg/mem>, <xmm>, <xmm>
+			 */
+			wbit = LONG_OPND;
+		}
+#ifdef DIS_TEXT
+		else if ((mode == REG_ONLY) &&
+		    (dp == &dis_opAVX0F[0x1][0x6])) {	/* vmovlhps */
+			(void) strncpy(x->d86_mnem, "vmovlhps", OPLEN);
+		} else if ((mode == REG_ONLY) &&
+		    (dp == &dis_opAVX0F[0x1][0x2])) {	/* vmovhlps */
+			(void) strncpy(x->d86_mnem, "vmovhlps", OPLEN);
+		}
+#endif
+		dtrace_get_operand(x, mode, r_m, wbit, 0);
+
+		break;
+
+	case VEX_VRMrX:
+		/* ModR/M.reg := op(MODR/M.r/m, VEX.vvvv) */
+		x->d86_numopnds = 3;
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 2);
+		/*
+		 * VEX prefix uses the 1's complement form to encode the
+		 * XMM/YMM regs
+		 */
+		dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 0);
+
+		dtrace_get_operand(x, mode, r_m, wbit, 1);
+		break;
+
+	case VEX_SbVM:
+		/* ModR/M.reg := op(MODR/M.r/m, VSIB, VEX.vvvv) */
+		x->d86_numopnds = 3;
+		x->d86_vsib = 1;
+
+		/*
+		 * All instructions that use VSIB are currently a mess. See the
+		 * comment around the dis_gather_regs_t structure definition.
+		 */
+
+		vreg = &dis_vgather[opcode2][vex_W][vex_L];
+
+#ifdef DIS_TEXT
+		(void) strncpy(x->d86_mnem, dp->it_name, OPLEN);
+		(void) strlcat(x->d86_mnem + strlen(dp->it_name),
+		    vreg->dgr_suffix, OPLEN - strlen(dp->it_name));
+#endif
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+
+		dtrace_get_operand(x, REG_ONLY, reg, vreg->dgr_arg2, 2);
+		/*
+		 * VEX prefix uses the 1's complement form to encode the
+		 * XMM/YMM regs
+		 */
+		dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), vreg->dgr_arg0,
+		    0);
+		dtrace_get_operand(x, mode, r_m, vreg->dgr_arg1, 1);
+		break;
+
+	case VEX_RRX:
+		/* ModR/M.rm := op(VEX.vvvv, ModR/M.reg) */
+		x->d86_numopnds = 3;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+
+		if (mode != REG_ONLY) {
+			if ((dp == &dis_opAVXF20F[0x11]) ||
+			    (dp == &dis_opAVXF30F[0x11])) {
+				/* vmovsd <xmm>, <m64> */
+				/* or vmovss <xmm>, <m64> */
+				x->d86_numopnds = 2;
+				goto L_VEX_RM;
+			}
+		}
+
+		dtrace_get_operand(x, mode, r_m, wbit, 2);
+		dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 1);
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 0);
+		break;
+
+	case VEX_RMRX:
+		/* ModR/M.reg := op(VEX.vvvv, ModR/M.r_m, imm8[7:4]) */
+		x->d86_numopnds = 4;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 3);
+		dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 2);
+		if (dp == &dis_opAVX660F3A[0x18]) {
+			/* vinsertf128 <imm8>, <xmm>, <ymm>, <ymm> */
+			dtrace_get_operand(x, mode, r_m, XMM_OPND, 1);
+		} else if ((dp == &dis_opAVX660F3A[0x20]) ||
+		    (dp == & dis_opAVX660F[0xC4])) {
+			/* vpinsrb <imm8>, <reg/mm>, <xmm>, <xmm> */
+			/* or vpinsrw <imm8>, <reg/mm>, <xmm>, <xmm> */
+			dtrace_get_operand(x, mode, r_m, LONG_OPND, 1);
+		} else if (dp == &dis_opAVX660F3A[0x22]) {
+			/* vpinsrd/q <imm8>, <reg/mm>, <xmm>, <xmm> */
+#ifdef DIS_TEXT
+			if (vex_W)
+				x->d86_mnem[6] = 'q';
+#endif
+			dtrace_get_operand(x, mode, r_m, LONG_OPND, 1);
+		} else {
+			dtrace_get_operand(x, mode, r_m, wbit, 1);
+		}
+
+		/* one byte immediate number */
+		dtrace_imm_opnd(x, wbit, 1, 0);
+
+		/* vblendvps, vblendvpd, vpblendvb use imm8 to encode the regs */
+		if ((dp == &dis_opAVX660F3A[0x4A]) ||
+		    (dp == &dis_opAVX660F3A[0x4B]) ||
+		    (dp == &dis_opAVX660F3A[0x4C])) {
+#ifdef DIS_TEXT
+			int regnum = (x->d86_opnd[0].d86_value & 0xF0) >> 4;
+#endif
+			x->d86_opnd[0].d86_mode = MODE_NONE;
+#ifdef DIS_TEXT
+			if (vex_L)
+				(void) strncpy(x->d86_opnd[0].d86_opnd,
+				    dis_YMMREG[regnum], OPLEN);
+			else
+				(void) strncpy(x->d86_opnd[0].d86_opnd,
+				    dis_XMMREG[regnum], OPLEN);
+#endif
+		}
+		break;
+
+	case VEX_MX:
+		/* ModR/M.reg := op(ModR/M.rm) */
+		x->d86_numopnds = 2;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+L_VEX_MX:
+
+		if ((dp == &dis_opAVXF20F[0xE6]) ||
+		    (dp == &dis_opAVX660F[0x5A]) ||
+		    (dp == &dis_opAVX660F[0xE6])) {
+			/* vcvtpd2dq <ymm>, <xmm> */
+			/* or vcvtpd2ps <ymm>, <xmm> */
+			/* or vcvttpd2dq <ymm>, <xmm> */
+			dtrace_get_operand(x, REG_ONLY, reg, XMM_OPND, 1);
+			dtrace_get_operand(x, mode, r_m, wbit, 0);
+		} else if ((dp == &dis_opAVXF30F[0xE6]) ||
+		    (dp == &dis_opAVX0F[0x5][0xA]) ||
+		    (dp == &dis_opAVX660F38[0x13]) ||
+		    (dp == &dis_opAVX660F38[0x18]) ||
+		    (dp == &dis_opAVX660F38[0x19]) ||
+		    (dp == &dis_opAVX660F38[0x58]) ||
+		    (dp == &dis_opAVX660F38[0x78]) ||
+		    (dp == &dis_opAVX660F38[0x79]) ||
+		    (dp == &dis_opAVX660F38[0x59])) {
+			/* vcvtdq2pd <xmm>, <ymm> */
+			/* or vcvtps2pd <xmm>, <ymm> */
+			/* or vcvtph2ps <xmm>, <ymm> */
+			/* or vbroadcasts* <xmm>, <ymm> */
+			dtrace_get_operand(x, REG_ONLY, reg, wbit, 1);
+			dtrace_get_operand(x, mode, r_m, XMM_OPND, 0);
+		} else if (dp == &dis_opAVX660F[0x6E]) {
+			/* vmovd/q <reg/mem 32/64>, <xmm> */
+#ifdef DIS_TEXT
+			if (vex_W)
+				x->d86_mnem[4] = 'q';
+#endif
+			dtrace_get_operand(x, REG_ONLY, reg, wbit, 1);
+			dtrace_get_operand(x, mode, r_m, LONG_OPND, 0);
+		} else {
+			dtrace_get_operand(x, REG_ONLY, reg, wbit, 1);
+			dtrace_get_operand(x, mode, r_m, wbit, 0);
+		}
+
+		break;
+
+	case VEX_MXI:
+		/* ModR/M.reg := op(ModR/M.rm, imm8) */
+		x->d86_numopnds = 3;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 2);
+		dtrace_get_operand(x, mode, r_m, wbit, 1);
+
+		/* one byte immediate number */
+		dtrace_imm_opnd(x, wbit, 1, 0);
+		break;
+
+	case VEX_XXI:
+		/* VEX.vvvv := op(ModR/M.rm, imm8) */
+		x->d86_numopnds = 3;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+#ifdef DIS_TEXT
+		(void) strncpy(x->d86_mnem, dis_AVXvgrp7[opcode2 - 1][reg],
+		    OPLEN);
+#endif
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+
+		dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 2);
+		dtrace_get_operand(x, REG_ONLY, r_m, wbit, 1);
+
+		/* one byte immediate number */
+		dtrace_imm_opnd(x, wbit, 1, 0);
+		break;
+
+	case VEX_MR:
+		/* ModR/M.reg (reg32/64) := op(ModR/M.rm) */
+		if (dp == &dis_opAVX660F[0xC5]) {
+			/* vpextrw <imm8>, <xmm>, <reg> */
+			x->d86_numopnds = 2;
+			vbit = 2;
+		} else {
+			x->d86_numopnds = 2;
+			vbit = 1;
+		}
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+		dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, vbit);
+		dtrace_get_operand(x, mode, r_m, wbit, vbit - 1);
+
+		if (vbit == 2)
+			dtrace_imm_opnd(x, wbit, 1, 0);
+
+		break;
+
+	case VEX_RRI:
+		/* implicit(eflags/r32) := op(ModR/M.reg, ModR/M.rm) */
+		x->d86_numopnds = 2;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 1);
+		dtrace_get_operand(x, mode, r_m, wbit, 0);
+		break;
+
+	case VEX_RX:
+		/* ModR/M.rm := op(ModR/M.reg) */
+		/* vextractf128 || vcvtps2ph */
+		if (dp == &dis_opAVX660F3A[0x19] ||
+		    dp == &dis_opAVX660F3A[0x1d]) {
+			x->d86_numopnds = 3;
+
+			dtrace_get_modrm(x, &mode, &reg, &r_m);
+			dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+
+			dtrace_get_operand(x, mode, r_m, XMM_OPND, 2);
+			dtrace_get_operand(x, REG_ONLY, reg, wbit, 1);
+
+			/* one byte immediate number */
+			dtrace_imm_opnd(x, wbit, 1, 0);
+			break;
+		}
+
+		x->d86_numopnds = 2;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+		dtrace_get_operand(x, mode, r_m, wbit, 1);
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 0);
+		break;
+
+	case VEX_RR:
+		/* ModR/M.rm := op(ModR/M.reg) */
+		x->d86_numopnds = 2;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+
+		if (dp == &dis_opAVX660F[0x7E]) {
+			/* vmovd/q <reg/mem 32/64>, <xmm> */
+#ifdef DIS_TEXT
+			if (vex_W)
+				x->d86_mnem[4] = 'q';
+#endif
+			dtrace_get_operand(x, mode, r_m, LONG_OPND, 1);
+		} else
+			dtrace_get_operand(x, mode, r_m, wbit, 1);
+
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 0);
+		break;
+
+	case VEX_RRi:
+		/* ModR/M.rm := op(ModR/M.reg, imm) */
+		x->d86_numopnds = 3;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+
+#ifdef DIS_TEXT
+		if (dp == &dis_opAVX660F3A[0x16]) {
+			/* vpextrd/q <imm>, <xmm>, <reg/mem 32/64> */
+			if (vex_W)
+				x->d86_mnem[6] = 'q';
+		}
+#endif
+		dtrace_get_operand(x, mode, r_m, LONG_OPND, 2);
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 1);
+
+		/* one byte immediate number */
+		dtrace_imm_opnd(x, wbit, 1, 0);
+		break;
+	case VEX_RIM:
+		/* ModR/M.rm := op(ModR/M.reg, imm) */
+		x->d86_numopnds = 3;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+
+		dtrace_get_operand(x, mode, r_m, XMM_OPND, 2);
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 1);
+		/* one byte immediate number */
+		dtrace_imm_opnd(x, wbit, 1, 0);
+		break;
+
+	case VEX_RM:
+		/* ModR/M.rm := op(ModR/M.reg) */
+		if (dp == &dis_opAVX660F3A[0x17]) {	/* vextractps */
+			x->d86_numopnds = 3;
+
+			dtrace_get_modrm(x, &mode, &reg, &r_m);
+			dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+
+			dtrace_get_operand(x, mode, r_m, LONG_OPND, 2);
+			dtrace_get_operand(x, REG_ONLY, reg, wbit, 1);
+			/* one byte immediate number */
+			dtrace_imm_opnd(x, wbit, 1, 0);
+			break;
+		}
+		x->d86_numopnds = 2;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+L_VEX_RM:
+		vbit = 1;
+		dtrace_get_operand(x, mode, r_m, wbit, vbit);
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, vbit - 1);
+
+		break;
+
+	case VEX_RRM:
+		/* ModR/M.rm := op(VEX.vvvv, ModR/M.reg) */
+		x->d86_numopnds = 3;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+		dtrace_get_operand(x, mode, r_m, wbit, 2);
+		/* VEX uses the 1's complement form to encode the XMM/YMM regs */
+		dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 1);
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 0);
+		break;
+
+	case VEX_RMX:
+		/* ModR/M.reg := op(VEX.vvvv, ModR/M.rm) */
+		x->d86_numopnds = 3;
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+		dtrace_get_operand(x, REG_ONLY, reg, wbit, 2);
+		dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 1);
+		dtrace_get_operand(x, REG_ONLY, r_m, wbit, 0);
+		break;
+
+	case VEX_NONE:
+#ifdef DIS_TEXT
+		if (vex_L)
+			(void) strncpy(x->d86_mnem, "vzeroall", OPLEN);
+#endif
+		break;
+	case BLS: {
+
+		/*
+		 * The BLS instructions are VEX instructions that are based on
+		 * VEX.0F38.F3; however, they are considered special group 17
+		 * and like everything else, they use the bits in 3-5 of the
+		 * MOD R/M to determine the sub instruction. Unlike many others
+		 * like the VMX instructions, these are valid both for memory
+		 * and register forms.
+		 */
+
+		dtrace_get_modrm(x, &mode, &reg, &r_m);
+		dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m);
+
+		switch (reg) {
+		case 1:
+#ifdef	DIS_TEXT
+			blsinstr = "blsr";
+#endif
+			break;
+		case 2:
+#ifdef	DIS_TEXT
+			blsinstr = "blsmsk";
+#endif
+			break;
+		case 3:
+#ifdef	DIS_TEXT
+			blsinstr = "blsi";
+#endif
+			break;
+		default:
+			goto error;
+		}
+
+		x->d86_numopnds = 2;
+#ifdef DIS_TEXT
+		(void) strncpy(x->d86_mnem, blsinstr, OPLEN);
+#endif
+		dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 1);
+		dtrace_get_operand(x, mode, r_m, wbit, 0);
+		break;
+	}
+	/* an invalid op code */
+	case AM:
+	case DM:
+	case OVERRIDE:
+	case PREFIX:
+	case UNKNOWN:
+		NOMEM;
+	default:
+		goto error;
+	} /* end switch */
+	if (x->d86_error)
+		goto error;
+
+done:
+#ifdef DIS_MEM
+	/*
+	 * compute the size of any memory accessed by the instruction
+	 */
+	if (x->d86_memsize != 0) {
+		return (0);
+	} else if (dp->it_stackop) {
+		switch (opnd_size) {
+		case SIZE16:
+			x->d86_memsize = 2;
+			break;
+		case SIZE32:
+			x->d86_memsize = 4;
+			break;
+		case SIZE64:
+			x->d86_memsize = 8;
+			break;
+		}
+	} else if (nomem || mode == REG_ONLY) {
+		x->d86_memsize = 0;
+
+	} else if (dp->it_size != 0) {
+		/*
+		 * In 64 bit mode descriptor table entries
+		 * go up to 10 bytes and popf/pushf are always 8 bytes
+		 */
+		if (x->d86_mode == SIZE64 && dp->it_size == 6)
+			x->d86_memsize = 10;
+		else if (x->d86_mode == SIZE64 && opcode1 == 0x9 &&
+		    (opcode2 == 0xc || opcode2 == 0xd))
+			x->d86_memsize = 8;
+		else
+			x->d86_memsize = dp->it_size;
+
+	} else if (wbit == 0) {
+		x->d86_memsize = 1;
+
+	} else if (wbit == LONG_OPND) {
+		if (opnd_size == SIZE64)
+			x->d86_memsize = 8;
+		else if (opnd_size == SIZE32)
+			x->d86_memsize = 4;
+		else
+			x->d86_memsize = 2;
+
+	} else if (wbit == SEG_OPND) {
+		x->d86_memsize = 4;
+
+	} else {
+		x->d86_memsize = 8;
+	}
+#endif
+	return (0);
+
+error:
+#ifdef DIS_TEXT
+	(void) strlcat(x->d86_mnem, "undef", OPLEN);
+#endif
+	return (1);
+}
+
+#ifdef DIS_TEXT
+
+/*
+ * Some instructions should have immediate operands printed
+ * as unsigned integers. We compare against this table.
+ */
+static char *unsigned_ops[] = {
+	"or", "and", "xor", "test", "in", "out", "lcall", "ljmp",
+	"rcr", "rcl", "ror", "rol", "shl", "shr", "sal", "psr", "psl",
+	0
+};
+
+
+static int
+isunsigned_op(char *opcode)
+{
+	char *where;
+	int i;
+	int is_unsigned = 0;
+
+	/*
+	 * Work back to start of last mnemonic, since we may have
+	 * prefixes on some opcodes.
+	 */
+	where = opcode + strlen(opcode) - 1;
+	while (where > opcode && *where != ' ')
+		--where;
+	if (*where == ' ')
+		++where;
+
+	for (i = 0; unsigned_ops[i]; ++i) {
+		if (strncmp(where, unsigned_ops[i],
+		    strlen(unsigned_ops[i])))
+			continue;
+		is_unsigned = 1;
+		break;
+	}
+	return (is_unsigned);
+}
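+
+/*
+ * For example, given the mnemonic "lock xorl", the loop above backs up
+ * to "xorl", which matches the "xor" entry, so the immediate is treated
+ * as unsigned.
+ */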
+
+/*
+ * Print a numeric immediate into end of buf, maximum length buflen.
+ * The immediate may be an address or a displacement.  Mask is set
+ * for address size.  If the immediate is a "small negative", or
+ * if it's a negative displacement of any magnitude, print as -<absval>.
+ * Respect the "octal" flag.  "Small negative" is defined as "in the
+ * interval [NEG_LIMIT, 0)".
+ *
+ * Also, "isunsigned_op()" instructions never print negatives.
+ *
+ * Return whether we decided to print a negative value or not.
+ */
+
+#define	NEG_LIMIT	-255
+enum {IMM, DISP};
+enum {POS, TRY_NEG};
+
+static int
+print_imm(dis86_t *dis, uint64_t usv, uint64_t mask, char *buf,
+    size_t buflen, int disp, int try_neg)
+{
+	int curlen;
+	int64_t sv = (int64_t)usv;
+	int octal = dis->d86_flags & DIS_F_OCTAL;
+
+	curlen = strlen(buf);
+
+	if (try_neg == TRY_NEG && sv < 0 &&
+	    (disp || sv >= NEG_LIMIT) &&
+	    !isunsigned_op(dis->d86_mnem)) {
+		dis->d86_sprintf_func(buf + curlen, buflen - curlen,
+		    octal ? "-0%llo" : "-0x%llx", (-sv) & mask);
+		return (1);
+	} else {
+		if (disp == DISP)
+			dis->d86_sprintf_func(buf + curlen, buflen - curlen,
+			    octal ? "+0%llo" : "+0x%llx", usv & mask);
+		else
+			dis->d86_sprintf_func(buf + curlen, buflen - curlen,
+			    octal ? "0%llo" : "0x%llx", usv & mask);
+		return (0);
+	}
+}
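+
+/*
+ * For example, the -1 in "push $-1" is a small negative (sv >= NEG_LIMIT)
+ * and prints as "-0x1", while the same value as an immediate of "or" (an
+ * unsigned_ops entry) prints in its unsigned form.
+ */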
+
+
+static int
+log2(int size)
+{
+	switch (size) {
+	case 1: return (0);
+	case 2: return (1);
+	case 4: return (2);
+	case 8: return (3);
+	}
+	return (0);
+}
+
+/* ARGSUSED */
+void
+dtrace_disx86_str(dis86_t *dis, uint_t mode, uint64_t pc, char *buf,
+    size_t buflen)
+{
+	uint64_t reltgt = 0;
+	uint64_t tgt = 0;
+	int curlen;
+	int (*lookup)(void *, uint64_t, char *, size_t);
+	int i;
+	int64_t sv;
+	uint64_t usv, mask, save_mask, save_usv;
+	static uint64_t masks[] =
+	    {0xffU, 0xffffU, 0xffffffffU, 0xffffffffffffffffULL};
+	save_usv = 0;
+
+	dis->d86_sprintf_func(buf, buflen, "%-6s ", dis->d86_mnem);
+
+	/*
+	 * For PC-relative jumps, the pc is really the next pc after executing
+	 * this instruction, so increment it appropriately.
+	 */
+	pc += dis->d86_len;
+
+	for (i = 0; i < dis->d86_numopnds; i++) {
+		d86opnd_t *op = &dis->d86_opnd[i];
+
+		if (i != 0)
+			(void) strlcat(buf, ",", buflen);
+
+		(void) strlcat(buf, op->d86_prefix, buflen);
+
+		/*
+		 * sv is for the signed, possibly-truncated immediate or
+		 * displacement; usv retains the original size and
+		 * unsignedness for symbol lookup.
+		 */
+
+		sv = usv = op->d86_value;
+
+		/*
+		 * About masks: for immediates that represent
+		 * addresses, the appropriate display size is
+		 * the effective address size of the instruction.
+		 * This includes MODE_OFFSET, MODE_IPREL, and
+		 * MODE_RIPREL.  Immediates that are simply
+		 * immediate values should display in the operand's
+		 * size, however, since they don't represent addresses.
+		 */
+
+		/* d86_addr_size is SIZEnn, which is log2(real size) */
+		mask = masks[dis->d86_addr_size];
+
+		/* d86_value_size and d86_imm_bytes are in bytes */
+		if (op->d86_mode == MODE_SIGNED ||
+		    op->d86_mode == MODE_IMPLIED)
+			mask = masks[log2(op->d86_value_size)];
+
+		switch (op->d86_mode) {
+
+		case MODE_NONE:
+
+			(void) strlcat(buf, op->d86_opnd, buflen);
+			break;
+
+		case MODE_SIGNED:
+		case MODE_IMPLIED:
+		case MODE_OFFSET:
+
+			tgt = usv;
+
+			if (dis->d86_seg_prefix)
+				(void) strlcat(buf, dis->d86_seg_prefix,
+				    buflen);
+
+			if (op->d86_mode == MODE_SIGNED ||
+			    op->d86_mode == MODE_IMPLIED) {
+				(void) strlcat(buf, "$", buflen);
+			}
+
+			if (print_imm(dis, usv, mask, buf, buflen,
+			    IMM, TRY_NEG) &&
+			    (op->d86_mode == MODE_SIGNED ||
+			    op->d86_mode == MODE_IMPLIED)) {
+
+				/*
+				 * We printed a negative value for an
+				 * immediate that wasn't a
+				 * displacement.  Note that fact so we can
+				 * print the positive value as an
+				 * annotation.
+				 */
+
+				save_usv = usv;
+				save_mask = mask;
+			}
+			(void) strlcat(buf, op->d86_opnd, buflen);
+
+			break;
+
+		case MODE_IPREL:
+		case MODE_RIPREL:
+
+			reltgt = pc + sv;
+
+			switch (mode) {
+			case SIZE16:
+				reltgt = (uint16_t)reltgt;
+				break;
+			case SIZE32:
+				reltgt = (uint32_t)reltgt;
+				break;
+			}
+
+			(void) print_imm(dis, usv, mask, buf, buflen,
+			    DISP, TRY_NEG);
+
+			if (op->d86_mode == MODE_RIPREL)
+				(void) strlcat(buf, "(%rip)", buflen);
+			break;
+		}
+	}
+
+	/*
+	 * The symbol lookups may result in false positives,
+	 * particularly on object files, where small numbers may match
+	 * the 0-relative non-relocated addresses of symbols.
+	 */
+
+	lookup = dis->d86_sym_lookup;
+	if (tgt != 0) {
+		if ((dis->d86_flags & DIS_F_NOIMMSYM) == 0 &&
+		    lookup(dis->d86_data, tgt, NULL, 0) == 0) {
+			(void) strlcat(buf, "\t<", buflen);
+			curlen = strlen(buf);
+			lookup(dis->d86_data, tgt, buf + curlen,
+			    buflen - curlen);
+			(void) strlcat(buf, ">", buflen);
+		}
+
+		/*
+		 * If we printed a negative immediate above, print the
+		 * positive in case our heuristic was unhelpful
+		 */
+		if (save_usv) {
+			(void) strlcat(buf, "\t<", buflen);
+			(void) print_imm(dis, save_usv, save_mask, buf, buflen,
+			    IMM, POS);
+			(void) strlcat(buf, ">", buflen);
+		}
+	}
+
+	if (reltgt != 0) {
+		/* Print symbol or effective address for reltgt */
+
+		(void) strlcat(buf, "\t<", buflen);
+		curlen = strlen(buf);
+		lookup(dis->d86_data, reltgt, buf + curlen,
+		    buflen - curlen);
+		(void) strlcat(buf, ">", buflen);
+	}
+}
+
+#endif /* DIS_TEXT */
Index: src/external/cddl/osnet/dev/dtrace/x86/dis_tables.h
===================================================================
RCS file: src/external/cddl/osnet/dev/dtrace/x86/dis_tables.h
diff -N src/external/cddl/osnet/dev/dtrace/x86/dis_tables.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dev/dtrace/x86/dis_tables.h	20 Apr 2017 11:49:47 -0000
@@ -0,0 +1,112 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*	Copyright (c) 1988 AT&T	*/
+/*	  All Rights Reserved  	*/
+
+/*
+ * $FreeBSD: head/sys/cddl/dev/dtrace/x86/dis_tables.h 313133 2017-02-03 03:22:47Z markj $
+ */
+
+#ifndef _DIS_TABLES_H
+#define	_DIS_TABLES_H
+
+/*
+ * Constants and prototypes for the IA32 disassembler backend.  See dis_tables.c
+ * for usage information and documentation.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/proc.h>
+#include <sys/param.h>
+
+/*
+ * values for cpu mode
+ */
+#define	SIZE16	1
+#define	SIZE32	2
+#define	SIZE64	3
+
+#define	OPLEN	256
+#define	PFIXLEN	  8
+#define	NCPS	20	/* number of chars per symbol	*/
+
+/*
+ * data structures that must be provided to dtrace_disx86()
+ */
+typedef struct d86opnd {
+	char		d86_opnd[OPLEN];	/* symbolic rep of operand */
+	char		d86_prefix[PFIXLEN];	/* any prefix string or "" */
+	uint_t		d86_mode;		/* mode for immediate */
+	uint_t		d86_value_size;		/* size in bytes of d86_value */
+	uint64_t	d86_value;		/* immediate value of opnd */
+} d86opnd_t;
+
+typedef struct dis86 {
+	uint_t		d86_mode;
+	uint_t		d86_error;
+	uint_t		d86_len;		/* instruction length */
+	int		d86_rmindex;		/* index of modrm byte or -1 */
+	uint_t		d86_memsize;		/* size of memory referenced */
+	char		d86_bytes[16];		/* bytes of instruction */
+	char		d86_mnem[OPLEN];
+	uint_t		d86_numopnds;
+	uint_t		d86_rex_prefix;		/* value of REX prefix if !0 */
+	char		*d86_seg_prefix;	/* segment prefix, if any */
+	uint_t		d86_opnd_size;
+	uint_t		d86_addr_size;
+	uint_t		d86_got_modrm;
+	uint_t		d86_vsib;		/* Has a VSIB */
+	struct d86opnd	d86_opnd[4];		/* up to 4 operands */
+	int		(*d86_check_func)(void *);
+	int		(*d86_get_byte)(void *);
+#ifdef DIS_TEXT
+	int		(*d86_sym_lookup)(void *, uint64_t, char *, size_t);
+	int		(*d86_sprintf_func)(char *, size_t, const char *, ...);
+	int		d86_flags;
+	uint_t		d86_imm_bytes;
+#endif
+	void		*d86_data;
+} dis86_t;
+
+extern int dtrace_disx86(dis86_t *x, uint_t cpu_mode);
+
+#define	DIS_F_OCTAL	0x1	/* Print all numbers in octal */
+#define	DIS_F_NOIMMSYM	0x2	/* Don't print symbols for immediates (.o) */
+
+#ifdef DIS_TEXT
+extern void dtrace_disx86_str(dis86_t *x, uint_t cpu_mode, uint64_t pc,
+    char *buf, size_t len);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DIS_TABLES_H */
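
A side note on the SIZEnn constants: they double as log2 of the size in bytes
(2, 4 and 8 bytes for 16-, 32- and 64-bit), which is what lets the DIS_TEXT
code above index its mask tables with d86_addr_size directly.  A quick
standalone check:

	#include <stdio.h>

	#define	SIZE16	1
	#define	SIZE32	2
	#define	SIZE64	3

	int
	main(void)
	{
		/* 1 << SIZEnn recovers the operand size in bytes: 2, 4, 8. */
		printf("%d %d %d\n", 1 << SIZE16, 1 << SIZE32, 1 << SIZE64);
		return 0;
	}
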
Index: src/external/cddl/osnet/dev/dtrace/x86/instr_size.c
===================================================================
RCS file: src/external/cddl/osnet/dev/dtrace/x86/instr_size.c
diff -N src/external/cddl/osnet/dev/dtrace/x86/instr_size.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dev/dtrace/x86/instr_size.c	20 Apr 2017 11:58:22 -0000
@@ -0,0 +1,149 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD: head/sys/cddl/dev/dtrace/x86/instr_size.c 303050 2016-07-20 00:02:10Z markj $
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*	Copyright (c) 1988 AT&T	*/
+/*	  All Rights Reserved	*/
+
+
+#ifdef illumos
+#pragma ident	"@(#)instr_size.c	1.14	05/07/08 SMI"
+#endif
+
+#include <sys/types.h>
+#include <sys/proc.h>
+#include <sys/param.h>
+#ifdef illumos
+#include <sys/cmn_err.h>
+#include <sys/archsystm.h>
+#include <sys/copyops.h>
+#include <vm/seg_enum.h>
+#include <sys/privregs.h>
+#endif
+#ifdef __FreeBSD__
+#include <sys/cred.h>
+#include <cddl/dev/dtrace/dtrace_cddl.h>
+
+typedef	u_int			model_t;
+#define	DATAMODEL_NATIVE	0
+int dtrace_instr_size(uchar_t *);
+int dtrace_instr_size_isa(uchar_t *, model_t, int *);
+#endif
+#ifdef __NetBSD__
+#include <dtrace_cddl.h>
+
+typedef	u_int			model_t;
+#define	DATAMODEL_NATIVE	0
+int dtrace_instr_size(uchar_t *);
+int dtrace_instr_size_isa(uchar_t *, model_t, int *);
+#endif
+
+#include <dis_tables.h>
+
+/*
+ * This subsystem (with the minor exception of the instr_size() function)
+ * is called from DTrace probe context.  This imposes several requirements on
+ * the implementation:
+ *
+ * 1. External subsystems and functions may not be referenced.  The one current
+ *    exception is for cmn_err, but only to signal the detection of table
+ *    errors.  Assuming the tables are correct, no combination of input is to
+ *    trigger a cmn_err call.
+ *
+ * 2. These functions can't be allowed to be traced.  To prevent this,
+ *    all functions in the probe path (everything except instr_size()) must
+ *    have names that begin with "dtrace_".
+ */
+
+typedef enum dis_isize {
+	DIS_ISIZE_INSTR,
+	DIS_ISIZE_OPERAND
+} dis_isize_t;
+
+
+/*
+ * get a byte from instruction stream
+ */
+static int
+dtrace_dis_get_byte(void *p)
+{
+	int ret;
+	uchar_t **instr = p;
+
+	ret = **instr;
+	*instr += 1;
+
+	return (ret);
+}
+
+/*
+ * Returns either the size of a given instruction, in bytes, or the size of that
+ * instruction's memory access (if any), depending on the value of `which'.
+ * If a programming error in the tables is detected, the system will panic to
+ * ease diagnosis.  Invalid instructions will not be flagged.  They will appear
+ * to have an instruction size between 1 and the actual size, and will be
+ * reported as having no memory impact.
+ */
+/* ARGSUSED2 */
+static int
+dtrace_dis_isize(uchar_t *instr, dis_isize_t which, model_t model, int *rmindex)
+{
+	int sz;
+	dis86_t	x;
+	uint_t mode;
+
+	mode = (model == DATAMODEL_LP64) ? SIZE64 : SIZE32;
+
+	x.d86_data = (void **)&instr;
+	x.d86_get_byte = dtrace_dis_get_byte;
+	x.d86_check_func = NULL;
+
+	if (dtrace_disx86(&x, mode) != 0)
+		return (-1);
+
+	if (which == DIS_ISIZE_INSTR)
+		sz = x.d86_len;		/* length of the instruction */
+	else
+		sz = x.d86_memsize;	/* length of memory operand */
+
+	if (rmindex != NULL)
+		*rmindex = x.d86_rmindex;
+	return (sz);
+}
+
+int
+dtrace_instr_size_isa(uchar_t *instr, model_t model, int *rmindex)
+{
+	return (dtrace_dis_isize(instr, DIS_ISIZE_INSTR, model, rmindex));
+}
+
+int
+dtrace_instr_size(uchar_t *instr)
+{
+	return (dtrace_dis_isize(instr, DIS_ISIZE_INSTR, DATAMODEL_NATIVE,
+	    NULL));
+}
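
Callers of dtrace_instr_size() typically use it to walk a function body one
decoded instruction at a time; the pre-refactor x86 FBT scan in the fbt.c
hunks below steps through a symbol this way while looking for the
frame-pointer push.  A minimal sketch of that loop (scan_for_opcode is a
hypothetical helper, and uchar_t is the Solaris type these files use; outside
the osnet compat headers, substitute unsigned char):

	#include <sys/types.h>

	int dtrace_instr_size(uchar_t *);

	/*
	 * Advance through [instr, limit) one instruction at a time,
	 * stopping at the first occurrence of the given opcode byte.
	 * Returns NULL if decoding fails before the byte is found.
	 */
	static uchar_t *
	scan_for_opcode(uchar_t *instr, uchar_t *limit, uchar_t opcode)
	{
		int size;

		while (instr < limit) {
			if (*instr == opcode)
				return instr;
			if ((size = dtrace_instr_size(instr)) <= 0)
				return NULL;	/* bad disassembly; bail */
			instr += size;
		}
		return NULL;
	}
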
Index: src/external/cddl/osnet/dev/dtrace/x86/regset.h
===================================================================
RCS file: src/external/cddl/osnet/dev/dtrace/x86/regset.h
diff -N src/external/cddl/osnet/dev/dtrace/x86/regset.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dev/dtrace/x86/regset.h	8 May 2017 08:17:51 -0000
@@ -0,0 +1,178 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD: head/sys/cddl/dev/dtrace/x86/regset.h 277300 2015-01-17 14:44:59Z smh $ 
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
+
+/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T		*/
+/*	All Rights Reserved	*/
+
+#ifndef	_REGSET_H
+#define	_REGSET_H
+
+/*
+ * #pragma ident	"@(#)regset.h	1.11	05/06/08 SMI"
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The names and offsets defined here should be specified by the
+ * AMD64 ABI suppl.
+ *
+ * We make fsbase and gsbase part of the lwp context (since they're
+ * the only way to access the full 64-bit address range via the segment
+ * registers) and thus belong here too.  However we treat them as
+ * read-only; if %fs or %gs are updated, the results of the descriptor
+ * table lookup that those updates implicitly cause will be reflected
+ * in the corresponding fsbase and/or gsbase values the next time the
+ * context can be inspected.  However it is NOT possible to override
+ * the fsbase/gsbase settings via this interface.
+ *
+ * Direct modification of the base registers (thus overriding the
+ * descriptor table base address) can be achieved with _lwp_setprivate.
+ */
+
+#define	REG_GSBASE	27
+#define	REG_FSBASE	26
+#ifdef illumos
+#define	REG_DS		25
+#define	REG_ES		24
+
+#define	REG_GS		23
+#define	REG_FS		22
+#define	REG_SS		21
+#define	REG_RSP		20
+#define	REG_RFL		19
+#define	REG_CS		18
+#define	REG_RIP		17
+#define	REG_ERR		16
+#define	REG_TRAPNO	15
+#define	REG_RAX		14
+#define	REG_RCX		13
+#define	REG_RDX		12
+#define	REG_RBX		11
+#define	REG_RBP		10
+#define	REG_RSI		9
+#define	REG_RDI		8
+#define	REG_R8		7
+#define	REG_R9		6
+#define	REG_R10		5
+#define	REG_R11		4
+#define	REG_R12		3
+#define	REG_R13		2
+#define	REG_R14		1
+#define	REG_R15		0
+#else	/* !illumos */
+#define	REG_SS		25
+#define	REG_RSP		24
+#define	REG_RFL		23
+#define	REG_CS		22
+#define	REG_RIP		21
+#define	REG_DS		20
+#define	REG_ES		19
+#define	REG_ERR		18
+#define	REG_GS		17
+#define	REG_FS		16
+#define	REG_TRAPNO	15
+#define	REG_RAX		14
+#define	REG_RCX		13
+#define	REG_RDX		12
+#define	REG_RBX		11
+#define	REG_RBP		10
+#define	REG_RSI		9
+#define	REG_RDI		8
+#define	REG_R8		7
+#define	REG_R9		6
+#define	REG_R10		5
+#define	REG_R11		4
+#define	REG_R12		3
+#define	REG_R13		2
+#define	REG_R14		1
+#define	REG_R15		0
+#endif	/* illumos */
+
+/*
+ * The names and offsets defined here are specified by the i386 ABI suppl.
+ */
+
+#ifdef illumos
+#define	SS		18	/* only stored on a privilege transition */
+#define	UESP		17	/* only stored on a privilege transition */
+#define	EFL		16
+#define	CS		15
+#define	EIP		14
+#define	ERR		13
+#define	TRAPNO		12
+#define	EAX		11
+#define	ECX		10
+#define	EDX		9
+#define	EBX		8
+#define	ESP		7
+#define	EBP		6
+#define	ESI		5
+#define	EDI		4
+#define	DS		3
+#define	ES		2
+#define	FS		1
+#define	GS		0
+#else	/* !illumos */
+#define	GS		18
+#define	SS		17	/* only stored on a privilege transition */
+#define	UESP		16	/* only stored on a privilege transition */
+#define	EFL		15
+#define	CS		14
+#define	EIP		13
+#define	ERR		12
+#define	TRAPNO		11
+#define	EAX		10
+#define	ECX		9
+#define	EDX		8
+#define	EBX		7
+#define	ESP		6
+#define	EBP		5
+#define	ESI		4
+#define	EDI		3
+#define	DS		2
+#define	ES		1
+#define	FS		0
+#endif	/* illumos */
+
+#define REG_PC  EIP
+#define REG_FP  EBP
+#define REG_SP  UESP
+#define REG_PS  EFL
+#define REG_R0  EAX
+#define REG_R1  EDX
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _REGSET_H */
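
Since these REG_* values are simply indices into a flat register array,
consumers stay layout-agnostic across the illumos and NetBSD orderings as long
as they always go through the symbolic names.  A hedged sketch, with greg_t
and gregs standing in for whatever array type the consumer really has:

	#include <stdint.h>

	#include "regset.h"

	typedef uintptr_t greg_t;

	/* Fetch the saved i386 program counter; REG_PC aliases EIP above. */
	static greg_t
	get_pc(const greg_t *gregs)
	{
		return gregs[REG_PC];
	}
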
Index: src/external/cddl/osnet/dev/fbt/fbt.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/fbt/fbt.c,v
retrieving revision 1.22
diff -u -p -r1.22 fbt.c
--- src/external/cddl/osnet/dev/fbt/fbt.c	27 Feb 2017 06:47:00 -0000	1.22
+++ src/external/cddl/osnet/dev/fbt/fbt.c	5 Jul 2017 00:25:12 -0000
@@ -23,7 +23,7 @@
  * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
  * Portions Copyright 2010 Darran Hunt darran@NetBSD.org
  *
- * $FreeBSD: src/sys/cddl/dev/fbt/fbt.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/fbt/fbt.c 309786 2016-12-10 03:13:11Z markj $
  *
  */
 
@@ -33,6 +33,7 @@
  */
 
 #include <sys/cdefs.h>
+#include <sys/proc.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
@@ -57,21 +58,6 @@
 #include <sys/uio.h>
 #include <sys/unistd.h>
 
-#include <machine/cpu.h>
-#if defined(__i386__) || defined(__amd64__)
-#include <machine/cpufunc.h>
-#include <machine/specialreg.h>
-#if 0
-#include <x86/cpuvar.h>
-#endif
-#include <x86/cputypes.h>
-#elif __arm__
-#include <machine/trap.h>
-#include <arm/cpufunc.h>
-#include <arm/armreg.h>
-#include <arm/frame.h>
-#endif
-
 #define ELFSIZE ARCH_ELFSIZE
 #include <sys/exec_elf.h>
 
@@ -80,65 +66,19 @@
 #include <sys/kern_ctf.h>
 #include <sys/dtrace_impl.h>
 
-mod_ctf_t *modptr;
-
-MALLOC_DEFINE(M_FBT, "fbt", "Function Boundary Tracing");
+#include "fbt.h"
 
-#if defined(__i386__) || defined(__amd64__)
-#define	FBT_PUSHL_EBP		0x55
-#define	FBT_MOVL_ESP_EBP0_V0	0x8b
-#define	FBT_MOVL_ESP_EBP1_V0	0xec
-#define	FBT_MOVL_ESP_EBP0_V1	0x89
-#define	FBT_MOVL_ESP_EBP1_V1	0xe5
-#define	FBT_REX_RSP_RBP		0x48
-
-#define	FBT_POPL_EBP		0x5d
-#define	FBT_RET			0xc3
-#define	FBT_RET_IMM16		0xc2
-#define	FBT_LEAVE		0xc9
-#endif
-
-#ifdef __amd64__
-#define	FBT_PATCHVAL		0xcc
-#elif defined(__i386__)
-#define	FBT_PATCHVAL		0xf0
-
-#elif defined(__arm__)
-#define	FBT_PATCHVAL		DTRACE_BREAKPOINT
-
-/* entry and return */
-#define	FBT_BX_LR_P(insn)	(((insn) & ~INSN_COND_MASK) == 0x012fff1e)
-#define	FBT_B_LABEL_P(insn)	(((insn) & 0xff000000) == 0xea000000)
-/* entry */
-#define	FBT_MOV_IP_SP_P(insn)	((insn) == 0xe1a0c00d)
-/* index=1, add=1, wback=0 */
-#define	FBT_LDR_IMM_P(insn)	(((insn) & 0xfff00000) == 0xe5900000)
-#define	FBT_MOVW_P(insn)	(((insn) & 0xfff00000) == 0xe3000000)
-#define	FBT_MOV_IMM_P(insn)	(((insn) & 0xffff0000) == 0xe3a00000)
-#define	FBT_CMP_IMM_P(insn)	(((insn) & 0xfff00000) == 0xe3500000)
-#define	FBT_PUSH_P(insn)	(((insn) & 0xffff0000) == 0xe92d0000)
-/* return */
-/* cond=always, writeback=no, rn=sp and register_list includes pc */
-#define	FBT_LDM_P(insn)	(((insn) & 0x0fff8000) == 0x089d8000)
-#define	FBT_LDMIB_P(insn)	(((insn) & 0x0fff8000) == 0x099d8000)
-#define	FBT_MOV_PC_LR_P(insn)	(((insn) & ~INSN_COND_MASK) == 0x01a0f00e)
-/* cond=always, writeback=no, rn=sp and register_list includes lr, but not pc */
-#define	FBT_LDM_LR_P(insn)	(((insn) & 0xffffc000) == 0xe89d4000)
-#define	FBT_LDMIB_LR_P(insn)	(((insn) & 0xffffc000) == 0xe99d4000)
-
-/* rval = insn | invop_id (overwriting cond with invop ID) */
-#define	BUILD_RVAL(insn, id)	(((insn) & ~INSN_COND_MASK) | __SHIFTIN((id), INSN_COND_MASK))
-/* encode cond in the first byte */
-#define	PATCHVAL_ENCODE_COND(insn)	(FBT_PATCHVAL | __SHIFTOUT((insn), INSN_COND_MASK))
+mod_ctf_t *modptr;
 
-#else
-#error "architecture not supported"
-#endif
+dtrace_provider_id_t	fbt_id;
+fbt_probe_t		**fbt_probetab;
+int			fbt_probetab_mask;
+static int			fbt_probetab_size;
 
 static dev_type_open(fbt_open);
 static int	fbt_unload(void);
 static void	fbt_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *);
-static void	fbt_provide_module(void *, dtrace_modctl_t *);
+static void	fbt_provide_module(void *, modctl_t *);
 static void	fbt_destroy(void *, dtrace_id_t, void *);
 static int	fbt_enable(void *, dtrace_id_t, void *);
 static void	fbt_disable(void *, dtrace_id_t, void *);
@@ -146,11 +86,6 @@ static void	fbt_load(void);
 static void	fbt_suspend(void *, dtrace_id_t, void *);
 static void	fbt_resume(void *, dtrace_id_t, void *);
 
-#define	FBT_ENTRY	"entry"
-#define	FBT_RETURN	"return"
-#define	FBT_ADDR2NDX(addr)	((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask)
-#define	FBT_PROBETAB_SIZE	0x8000		/* 32k entries -- 128K total */
-
 static const struct cdevsw fbt_cdevsw = {
 	.d_open		= fbt_open,
 	.d_close	= noclose,
@@ -187,259 +122,71 @@ static dtrace_pops_t fbt_pops = {
 	fbt_destroy
 };
 
-typedef struct fbt_probe {
-	struct fbt_probe *fbtp_hashnext;
-#if defined(__i386__) || defined(__amd64__)
-	uint8_t		*fbtp_patchpoint;
-	int8_t		fbtp_rval;
-	uint8_t		fbtp_patchval;
-	uint8_t		fbtp_savedval;
-#elif __arm__
-	uint32_t	*fbtp_patchpoint;
-	int32_t		fbtp_rval;
-	uint32_t	fbtp_patchval;
-	uint32_t	fbtp_savedval;
-#endif
-	uintptr_t	fbtp_roffset;
-	dtrace_id_t	fbtp_id;
-	const char	*fbtp_name;
-	dtrace_modctl_t	*fbtp_ctl;
-	int		fbtp_loadcnt;
-	int		fbtp_primary;
-	int		fbtp_invop_cnt;
-	int		fbtp_symindx;
-	struct fbt_probe *fbtp_next;
-} fbt_probe_t;
-
-#ifdef notyet
-static struct cdev		*fbt_cdev;
+#ifdef __FreeBSD__
 static int			fbt_verbose = 0;
-#endif
-static dtrace_provider_id_t	fbt_id;
-static fbt_probe_t		**fbt_probetab;
-static int			fbt_probetab_size;
-static int			fbt_probetab_mask;
-
-#ifdef __arm__
-extern void (* dtrace_emulation_jump_addr)(int, struct trapframe *);
 
-static uint32_t
-expand_imm(uint32_t imm12)
-{
-	uint32_t unrot = imm12 & 0xff;
-	int amount = 2 * (imm12 >> 8);
+static struct cdev		*fbt_cdev;
+#endif /* __FreeBSD__ */
 
-	if (amount)
-		return (unrot >> amount) | (unrot << (32 - amount));
-	else
-		return unrot;
-}
+#ifdef __NetBSD__
+specificdata_key_t fbt_module_key;
 
-static uint32_t
-add_with_carry(uint32_t x, uint32_t y, int carry_in,
-	int *carry_out, int *overflow)
-{
-	uint32_t result;
-	uint64_t unsigned_sum = x + y + (uint32_t)carry_in;
-	int64_t signed_sum = (int32_t)x + (int32_t)y + (int32_t)carry_in;
-	KASSERT(carry_in == 1);
-
-	result = (uint32_t)(unsigned_sum & 0xffffffff);
-	*carry_out = ((uint64_t)result == unsigned_sum) ? 1 : 0;
-	*overflow = ((int64_t)result == signed_sum) ? 0 : 1;
-	
-	return result;
-}
+#define version xversion	/* keep "version" parameters from shadowing the kernel's version[] */
+#endif /* __NetBSD__ */
 
-static void
-fbt_emulate(int _op, struct trapframe *frame)
+int
+fbt_excluded(const char *name)
 {
-	uint32_t op = _op;
 
-	switch (op >> 28) {
-	case DTRACE_INVOP_MOV_IP_SP:
-		/* mov ip, sp */
-		frame->tf_ip = frame->tf_svc_sp;
-		frame->tf_pc += 4;
-		break;
-	case DTRACE_INVOP_BX_LR:
-		/* bx lr */
-		frame->tf_pc = frame->tf_svc_lr;
-		break;
-	case DTRACE_INVOP_MOV_PC_LR:
-		/* mov pc, lr */
-		frame->tf_pc = frame->tf_svc_lr;
-		break;
-	case DTRACE_INVOP_LDM:
-		/* ldm sp, {..., pc} */
-		/* FALLTHRU */
-	case DTRACE_INVOP_LDMIB: {
-		/* ldmib sp, {..., pc} */
-		uint32_t register_list = (op & 0xffff);
-		uint32_t *sp = (uint32_t *)(intptr_t)frame->tf_svc_sp;
-		uint32_t *regs = &frame->tf_r0;
-		int i;
-
-		/* IDMIB */
-		if ((op >> 28) == 5)
-			sp++;
-
-		for (i=0; i <= 12; i++) {
-			if (register_list & (1 << i))
-				regs[i] = *sp++;
-		}
-		if (register_list & (1 << 13))
-			frame->tf_svc_sp = *sp++;
-		if (register_list & (1 << 14))
-			frame->tf_svc_lr = *sp++;
-		frame->tf_pc = *sp;
-		break;
-	}
-	case DTRACE_INVOP_LDR_IMM: {
-		/* ldr r?, [{pc,r?}, #?] */
-		uint32_t rt = (op >> 12) & 0xf;
-		uint32_t rn = (op >> 16) & 0xf;
-		uint32_t imm = op & 0xfff;
-		uint32_t *regs = &frame->tf_r0;
-		KDASSERT(rt <= 12);
-		KDASSERT(rn == 15 || rn =< 12);
-		if (rn == 15)
-			regs[rt] = *((uint32_t *)(intptr_t)(frame->tf_pc + 8 + imm));
-		else
-			regs[rt] = *((uint32_t *)(intptr_t)(regs[rn] + imm));
-		frame->tf_pc += 4;
-		break;
-	}
-	case DTRACE_INVOP_MOVW: {
-		/* movw r?, #? */
-		uint32_t rd = (op >> 12) & 0xf;
-		uint32_t imm = (op & 0xfff) | ((op & 0xf0000) >> 4);
-		uint32_t *regs = &frame->tf_r0;
-		KDASSERT(rd <= 12);
-		regs[rd] = imm;
-		frame->tf_pc += 4;
-		break;
-	}
-	case DTRACE_INVOP_MOV_IMM: {
-		/* mov r?, #? */
-		uint32_t rd = (op >> 12) & 0xf;
-		uint32_t imm = expand_imm(op & 0xfff);
-		uint32_t *regs = &frame->tf_r0;
-		KDASSERT(rd <= 12);
-		regs[rd] = imm;
-		frame->tf_pc += 4;
-		break;
-	}
-	case DTRACE_INVOP_CMP_IMM: {
-		/* cmp r?, #? */
-		uint32_t rn = (op >> 16) & 0xf;
-		uint32_t *regs = &frame->tf_r0;
-		uint32_t imm = expand_imm(op & 0xfff);
-		uint32_t spsr = frame->tf_spsr;
-		uint32_t result;
-		int carry;
-		int overflow;
+	if (strncmp(name, "dtrace_", 7) == 0 &&
+	    strncmp(name, "dtrace_safe_", 12) != 0) {
 		/*
-		 * (result, carry, overflow) = AddWithCarry(R[n], NOT(imm32), ’1’);
-		 * APSR.N = result<31>;
-		 * APSR.Z = IsZeroBit(result);
-		 * APSR.C = carry;
-		 * APSR.V = overflow; 
+		 * Anything beginning with "dtrace_" may be called
+		 * from probe context unless it explicitly indicates
+		 * that it won't be called from probe context by
+		 * using the prefix "dtrace_safe_".
 		 */
-		KDASSERT(rn <= 12);
-		result = add_with_carry(regs[rn], ~imm, 1, &carry, &overflow);
-		if (result & 0x80000000)
-			spsr |= PSR_N_bit;
-		else
-			spsr &= ~PSR_N_bit;
-		if (result == 0)
-			spsr |= PSR_Z_bit;
-		else
-			spsr &= ~PSR_Z_bit;
-		if (carry)
-			spsr |= PSR_C_bit;
-		else
-			spsr &= ~PSR_C_bit;
-		if (overflow)
-			spsr |= PSR_V_bit;
-		else
-			spsr &= ~PSR_V_bit;
-
-#if 0
-		aprint_normal("pc=%x Rn=%x imm=%x %c%c%c%c\n", frame->tf_pc, regs[rn], imm,
-		    (spsr & PSR_N_bit) ? 'N' : 'n',
-		    (spsr & PSR_Z_bit) ? 'Z' : 'z',
-		    (spsr & PSR_C_bit) ? 'C' : 'c',
-		    (spsr & PSR_V_bit) ? 'V' : 'v');
-#endif
-		frame->tf_spsr = spsr;
-		frame->tf_pc += 4;
-		break;
-	}
-	case DTRACE_INVOP_B_LABEL: {
-		/* b ??? */
-		uint32_t imm = (op & 0x00ffffff) << 2;
-		int32_t diff;
-		/* SignExtend(imm26, 32) */
-		if (imm & 0x02000000)
-			imm |= 0xfc000000;
-		diff = (int32_t)imm;
-		frame->tf_pc += 8 + diff;
-		break;
+		return (1);
 	}
-	/* FIXME: push will overwrite trapframe... */
-	case DTRACE_INVOP_PUSH: {
-		/* push {...} */
-		uint32_t register_list = (op & 0xffff);
-		uint32_t *sp = (uint32_t *)(intptr_t)frame->tf_svc_sp;
-		uint32_t *regs = &frame->tf_r0;
-		int i;
-		int count = 0;
-
-#if 0
-		if ((op & 0x0fff0fff) == 0x052d0004) {
-			/* A2: str r4, [sp, #-4]! */
-			*(sp - 1) = regs[4];
-			frame->tf_pc += 4;
-			break;
-		}
-#endif
 
-		for (i=0; i < 16; i++) {
-			if (register_list & (1 << i))
-				count++;
-		}
-		sp -= count;
+#ifdef __FreeBSD__
+	/*
+	 * Lock owner methods may be called from probe context.
+	 */
+	if (strcmp(name, "owner_mtx") == 0 ||
+	    strcmp(name, "owner_rm") == 0 ||
+	    strcmp(name, "owner_rw") == 0 ||
+	    strcmp(name, "owner_sx") == 0)
+		return (1);
 
-		for (i=0; i <= 12; i++) {
-			if (register_list & (1 << i))
-				*sp++ = regs[i];
-		}
-		if (register_list & (1 << 13))
-			*sp++ = frame->tf_svc_sp;
-		if (register_list & (1 << 14))
-			*sp++ = frame->tf_svc_lr;
-		if (register_list & (1 << 15))
-			*sp = frame->tf_pc + 8;
-
-		/* make sure the caches and memory are in sync */
-		cpu_dcache_wbinv_range(frame->tf_svc_sp, count * 4);
-
-		/* In case the current page tables have been modified ... */
-		cpu_tlb_flushID();
-		cpu_cpwait();
+	/*
+	 * When DTrace is built into the kernel we need to exclude
+	 * the FBT functions from instrumentation.
+	 */
+#ifndef _KLD_MODULE
+	if (strncmp(name, "fbt_", 4) == 0)
+		return (1);
+#endif
+#endif
 
-		frame->tf_svc_sp -= count * 4;
-		frame->tf_pc += 4;
+#ifdef __NetBSD__
+	if (name[0] == '_' && name[1] == '_')
+		return (1);
 
-		break;
-	}
-	default:
-		KDASSERTMSG(0, "op=%u\n", op >> 28);
+	if (strcmp(name, "cpu_index") == 0 ||
+	    strncmp(name, "db_", 3) == 0 ||
+	    strncmp(name, "ddb_", 4) == 0 ||
+	    strncmp(name, "kdb_", 4) == 0 ||
+	    strncmp(name, "lockdebug_", 10) == 0 ||
+	    strncmp(name, "kauth_", 6) == 0 ||
+	    strncmp(name, "ktext_write", 11) == 0) {
+		return (1);
 	}
-}
 #endif
 
+	return (0);
+}
+
 static void
 fbt_doubletrap(void)
 {
@@ -450,489 +197,70 @@ fbt_doubletrap(void)
 		fbt = fbt_probetab[i];
 
 		for (; fbt != NULL; fbt = fbt->fbtp_next)
-			*fbt->fbtp_patchpoint = fbt->fbtp_savedval;
-	}
-}
-
-
-static int
-fbt_invop(uintptr_t addr, struct trapframe *frame, uintptr_t rval)
-{
-	solaris_cpu_t *cpu;
-	uintptr_t *stack;
-	uintptr_t arg0, arg1, arg2, arg3, arg4;
-	fbt_probe_t *fbt;
-
-#ifdef __amd64__
-	stack = (uintptr_t *)frame->tf_rsp;
-#endif
-#ifdef __i386__
-	/* Skip hardware-saved registers. */
-	stack = (uintptr_t *)&frame->tf_esp;
-#endif
-#ifdef __arm__
-	stack = (uintptr_t *)frame->tf_svc_sp;
-#endif
-
-	cpu = &solaris_cpu[cpu_number()];
-	fbt = fbt_probetab[FBT_ADDR2NDX(addr)];
-	for (; fbt != NULL; fbt = fbt->fbtp_hashnext) {
-		if ((uintptr_t)fbt->fbtp_patchpoint == addr) {
-			fbt->fbtp_invop_cnt++;
-			if (fbt->fbtp_roffset == 0) {
-#ifdef __amd64__
-				/* fbt->fbtp_rval == DTRACE_INVOP_PUSHQ_RBP */
-				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
-				cpu->cpu_dtrace_caller = stack[0];
-				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT |
-				    CPU_DTRACE_BADADDR);
-
-				arg0 = frame->tf_rdi;
-				arg1 = frame->tf_rsi;
-				arg2 = frame->tf_rdx;
-				arg3 = frame->tf_rcx;
-				arg4 = frame->tf_r8;
-#else
-				int i = 0;
-
-				/*
-				 * When accessing the arguments on the stack,
-				 * we must protect against accessing beyond
-				 * the stack.  We can safely set NOFAULT here
-				 * -- we know that interrupts are already
-				 * disabled.
-				 */
-				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
-				cpu->cpu_dtrace_caller = stack[i++];
-				arg0 = stack[i++];
-				arg1 = stack[i++];
-				arg2 = stack[i++];
-				arg3 = stack[i++];
-				arg4 = stack[i++];
-				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT |
-				    CPU_DTRACE_BADADDR);
-#endif
-
-				dtrace_probe(fbt->fbtp_id, arg0, arg1,
-				    arg2, arg3, arg4);
-
-				cpu->cpu_dtrace_caller = 0;
-			} else {
-#ifdef __amd64__
-				/*
-				 * On amd64, we instrument the ret, not the
-				 * leave.  We therefore need to set the caller
-				 * to ensure that the top frame of a stack()
-				 * action is correct.
-				 */
-				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
-				cpu->cpu_dtrace_caller = stack[0];
-				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT |
-				    CPU_DTRACE_BADADDR);
-#endif
-
-				dtrace_probe(fbt->fbtp_id, fbt->fbtp_roffset,
-				    rval, 0, 0, 0);
-				cpu->cpu_dtrace_caller = 0;
-			}
-
-			return (fbt->fbtp_rval);
-		}
+			fbt_patch_tracepoint(fbt, fbt->fbtp_savedval);
 	}
-
-	return (0);
 }
 
-#if defined(__i386__) || defined(__amd64__)
-static int
-fbt_provide_module_cb(const char *name, int symindx, void *value,
-	uint32_t symsize, int type, void *opaque)
+#ifdef __FreeBSD__
+static void
+fbt_provide_module(void *arg, modctl_t *lf)
 {
-	fbt_probe_t *fbt, *retfbt;
-	u_int8_t *instr, *limit;
-	dtrace_modctl_t *mod = opaque;
-	const char *modname = mod->mod_info->mi_name;
-	int j;
-	int size;
-
-	/* got a function? */
-	if (ELF_ST_TYPE(type) != STT_FUNC) {
-	    return 0;
-	}
-
-	if (strncmp(name, "dtrace_", 7) == 0 &&
-	    strncmp(name, "dtrace_safe_", 12) != 0) {
-		/*
-		 * Anything beginning with "dtrace_" may be called
-		 * from probe context unless it explicitly indicates
-		 * that it won't be called from probe context by
-		 * using the prefix "dtrace_safe_".
-		 */
-		return (0);
-	}
+	char modname[MAXPATHLEN];
+	int i;
+	size_t len;
 
-	if (name[0] == '_' && name[1] == '_')
-		return (0);
+	strlcpy(modname, lf->filename, sizeof(modname));
+	len = strlen(modname);
+	if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
+		modname[len - 3] = '\0';
 
 	/*
-	 * Exclude some more symbols which can be called from probe context.
+	 * Employees of dtrace and their families are ineligible.  Void
+	 * where prohibited.
 	 */
-	if (strcmp(name, "x86_curcpu") == 0 /* CPU */
-	    || strcmp(name, "x86_curlwp") == 0 /* curproc, curlwp, curthread */
-	    || strcmp(name, "cpu_index") == 0 /* cpu_number, curcpu_id */
-	    || strncmp(name, "db_", 3) == 0 /* debugger */
-	    || strncmp(name, "ddb_", 4) == 0 /* debugger */
-	    || strncmp(name, "kdb_", 4) == 0 /* debugger */
-	    || strncmp(name, "lockdebug_", 10) == 0 /* lockdebug XXX for now */
-	    || strncmp(name, "kauth_", 5) == 0 /* CRED XXX for now */
-	    ) {
-		return 0;
-	}
-
-	instr = (u_int8_t *) value;
-	limit = (u_int8_t *) value + symsize;
-
-#ifdef __amd64__
-	while (instr < limit) {
-		if (*instr == FBT_PUSHL_EBP)
-			break;
-
-		if ((size = dtrace_instr_size(instr)) <= 0)
-			break;
-
-		instr += size;
-	}
-
-	if (instr >= limit || *instr != FBT_PUSHL_EBP) {
-		/*
-		 * We either don't save the frame pointer in this
-		 * function, or we ran into some disassembly
-		 * screw-up.  Either way, we bail.
-		 */
-		return (0);
-	}
-#else
-	if (instr[0] != FBT_PUSHL_EBP) {
-		return (0);
-	}
-
-	if (!(instr[1] == FBT_MOVL_ESP_EBP0_V0 &&
-	    instr[2] == FBT_MOVL_ESP_EBP1_V0) &&
-	    !(instr[1] == FBT_MOVL_ESP_EBP0_V1 &&
-	    instr[2] == FBT_MOVL_ESP_EBP1_V1)) {
-		return (0);
-	}
-#endif
-	fbt = malloc(sizeof (fbt_probe_t), M_FBT, M_WAITOK | M_ZERO);
-	fbt->fbtp_name = name;
-	fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
-	    name, FBT_ENTRY, 3, fbt);
-	fbt->fbtp_patchpoint = instr;
-	fbt->fbtp_ctl = mod;
-	/* fbt->fbtp_loadcnt = lf->loadcnt; */
-	fbt->fbtp_rval = DTRACE_INVOP_PUSHL_EBP;
-	fbt->fbtp_savedval = *instr;
-	fbt->fbtp_patchval = FBT_PATCHVAL;
-	fbt->fbtp_symindx = symindx;
-
-	fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
-	fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
-	mod->mod_fbtentries++;
-
-	retfbt = NULL;
-
-	while (instr < limit) {
-		if (instr >= limit)
-			return (0);
-
-		/*
-		 * If this disassembly fails, then we've likely walked off into
-		 * a jump table or some other unsuitable area.  Bail out of the
-		 * disassembly now.
-		 */
-		if ((size = dtrace_instr_size(instr)) <= 0)
-			return (0);
-
-#ifdef __amd64__
-		/*
-		 * We only instrument "ret" on amd64 -- we don't yet instrument
-		 * ret imm16, largely because the compiler doesn't seem to
-		 * (yet) emit them in the kernel...
-		 */
-		if (*instr != FBT_RET) {
-			instr += size;
-			continue;
-		}
-#else
-		if (!(size == 1 &&
-		    (*instr == FBT_POPL_EBP || *instr == FBT_LEAVE) &&
-		    (*(instr + 1) == FBT_RET ||
-		    *(instr + 1) == FBT_RET_IMM16))) {
-			instr += size;
-			continue;
-		}
-#endif
-
-		/*
-		 * We (desperately) want to avoid erroneously instrumenting a
-		 * jump table, especially given that our markers are pretty
-		 * short:  two bytes on x86, and just one byte on amd64.  To
-		 * determine if we're looking at a true instruction sequence
-		 * or an inline jump table that happens to contain the same
-		 * byte sequences, we resort to some heuristic sleeze:  we
-		 * treat this instruction as being contained within a pointer,
-		 * and see if that pointer points to within the body of the
-		 * function.  If it does, we refuse to instrument it.
-		 */
-		for (j = 0; j < sizeof (uintptr_t); j++) {
-			caddr_t check = (caddr_t) instr - j;
-			uint8_t *ptr;
-
-			if (check < (caddr_t)value)
-				break;
-
-			if (check + sizeof (caddr_t) > (caddr_t)limit)
-				continue;
-
-			ptr = *(uint8_t **)check;
-
-			if (ptr >= (uint8_t *) value && ptr < limit) {
-				instr += size;
-				continue;
-			}
-		}
-
-		/*
-		 * We have a winner!
-		 */
-		fbt = malloc(sizeof (fbt_probe_t), M_FBT, M_WAITOK | M_ZERO);
-		fbt->fbtp_name = name;
-
-		if (retfbt == NULL) {
-			fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
-			    name, FBT_RETURN, 3, fbt);
-		} else {
-			retfbt->fbtp_next = fbt;
-			fbt->fbtp_id = retfbt->fbtp_id;
-		}
-
-		retfbt = fbt;
-		fbt->fbtp_patchpoint = instr;
-		fbt->fbtp_ctl = mod;
-		/* fbt->fbtp_loadcnt = lf->loadcnt; */
-		fbt->fbtp_symindx = symindx;
-
-#ifndef __amd64__
-		if (*instr == FBT_POPL_EBP) {
-			fbt->fbtp_rval = DTRACE_INVOP_POPL_EBP;
-		} else {
-			ASSERT(*instr == FBT_LEAVE);
-			fbt->fbtp_rval = DTRACE_INVOP_LEAVE;
-		}
-		fbt->fbtp_roffset =
-		    (uintptr_t)(instr - (uint8_t *) value) + 1;
-
-#else
-		ASSERT(*instr == FBT_RET);
-		fbt->fbtp_rval = DTRACE_INVOP_RET;
-		fbt->fbtp_roffset =
-		    (uintptr_t)(instr - (uint8_t *) value);
-#endif
-
-		fbt->fbtp_savedval = *instr;
-		fbt->fbtp_patchval = FBT_PATCHVAL;
-		fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
-		fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
-
-		mod->mod_fbtentries++;
-
-		instr += size;
-	}
-
-	return 0;
-}
-
-#elif defined(__arm__)
-
-static int
-fbt_provide_module_cb(const char *name, int symindx, void *value,
-	uint32_t symsize, int type, void *opaque)
-{
-	fbt_probe_t *fbt, *retfbt;
-	uint32_t *instr, *limit;
-	bool was_ldm_lr = false;
-	dtrace_modctl_t *mod = opaque;
-	const char *modname = mod->mod_info->mi_name;
-	int size;
-
-	/* got a function? */
-	if (ELF_ST_TYPE(type) != STT_FUNC) {
-	    return 0;
-	}
-
-	if (strncmp(name, "dtrace_", 7) == 0 &&
-	    strncmp(name, "dtrace_safe_", 12) != 0) {
-		/*
-		 * Anything beginning with "dtrace_" may be called
-		 * from probe context unless it explicitly indicates
-		 * that it won't be called from probe context by
-		 * using the prefix "dtrace_safe_".
-		 */
-		return (0);
-	}
-
-	if (name[0] == '_' && name[1] == '_')
-		return (0);
+	if (strcmp(modname, "dtrace") == 0)
+		return;
 
 	/*
-	 * Exclude some more symbols which can be called from probe context.
+	 * To register with DTrace, a module must list 'dtrace' as a
+	 * dependency in order for the kernel linker to resolve
+	 * symbols like dtrace_register(). All modules with such a
+	 * dependency are ineligible for FBT tracing.
 	 */
-	if (strncmp(name, "db_", 3) == 0 /* debugger */
-	    || strncmp(name, "ddb_", 4) == 0 /* debugger */
-	    || strncmp(name, "kdb_", 4) == 0 /* debugger */
-	    || strncmp(name, "lockdebug_", 10) == 0 /* lockdebug XXX for now */
-	    || strncmp(name, "kauth_", 5) == 0 /* CRED XXX for now */
-	    /* Sensitive functions on ARM */
-	    || strncmp(name, "_spl", 4) == 0
-	    || strcmp(name, "binuptime") == 0
-	    || strcmp(name, "dosoftints") == 0
-	    || strcmp(name, "fbt_emulate") == 0
-	    || strcmp(name, "nanouptime") == 0
-	    || strcmp(name, "undefinedinstruction") == 0
-	    || strncmp(name, "dmt_", 4) == 0 /* omap */
-	    || strncmp(name, "mvsoctmr_", 9) == 0 /* marvell */
-	    ) {
-		return 0;
-	}
-
-	instr = (uint32_t *) value;
-	limit = (uint32_t *)((uintptr_t)value + symsize);
-
-	if (!FBT_MOV_IP_SP_P(*instr)
-	    && !FBT_BX_LR_P(*instr)
-	    && !FBT_MOVW_P(*instr)
-	    && !FBT_MOV_IMM_P(*instr)
-	    && !FBT_B_LABEL_P(*instr)
-	    && !FBT_LDR_IMM_P(*instr)
-	    && !FBT_CMP_IMM_P(*instr)
-	    /* && !FBT_PUSH_P(*instr) */
-	    ) {
-		return 0;
-	}
-
-	fbt = malloc(sizeof (fbt_probe_t), M_FBT, M_WAITOK | M_ZERO);
-	fbt->fbtp_name = name;
-	fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
-	    name, FBT_ENTRY, 3, fbt);
-	fbt->fbtp_patchpoint = instr;
-	fbt->fbtp_ctl = mod;
-	/* fbt->fbtp_loadcnt = lf->loadcnt; */
-	if (FBT_MOV_IP_SP_P(*instr))
-		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOV_IP_SP);
-	else if (FBT_LDR_IMM_P(*instr))
-		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_LDR_IMM);
-	else if (FBT_MOVW_P(*instr))
-		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOVW);
-	else if (FBT_MOV_IMM_P(*instr))
-		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOV_IMM);
-	else if (FBT_CMP_IMM_P(*instr))
-		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_CMP_IMM);
-	else if (FBT_BX_LR_P(*instr))
-		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_BX_LR);
-	else if (FBT_PUSH_P(*instr))
-		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_PUSH);
-	else if (FBT_B_LABEL_P(*instr))
-		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_B_LABEL);
-
-	fbt->fbtp_patchval = PATCHVAL_ENCODE_COND(*instr);
-	fbt->fbtp_savedval = *instr;
-	fbt->fbtp_symindx = symindx;
-
-	fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
-	fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
-	mod->mod_fbtentries++;
-
-	retfbt = NULL;
-
-	while (instr < limit) {
-		if (instr >= limit)
-			return (0);
-
-		size = 1;
-
-		if (!FBT_BX_LR_P(*instr)
-		    && !FBT_MOV_PC_LR_P(*instr)
-		    && !FBT_LDM_P(*instr)
-		    && !FBT_LDMIB_P(*instr)
-		    && !(was_ldm_lr && FBT_B_LABEL_P(*instr))
-		    ) {
-			if (FBT_LDM_LR_P(*instr) || FBT_LDMIB_LR_P(*instr))
-				was_ldm_lr = true;
-			else
-				was_ldm_lr = false;
-			instr += size;
-			continue;
-		}
+	for (i = 0; i < lf->ndeps; i++)
+		if (strncmp(lf->deps[i]->filename, "dtrace", 6) == 0)
+			return;
 
+	if (lf->fbt_nentries) {
 		/*
-		 * We have a winner!
+		 * This module has some FBT entries allocated; we're afraid
+		 * to screw with it.
 		 */
-		fbt = malloc(sizeof (fbt_probe_t), M_FBT, M_WAITOK | M_ZERO);
-		fbt->fbtp_name = name;
-
-		if (retfbt == NULL) {
-			fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
-			    name, FBT_RETURN, 3, fbt);
-		} else {
-			retfbt->fbtp_next = fbt;
-			fbt->fbtp_id = retfbt->fbtp_id;
-		}
-
-		retfbt = fbt;
-		fbt->fbtp_patchpoint = instr;
-		fbt->fbtp_ctl = mod;
-		/* fbt->fbtp_loadcnt = lf->loadcnt; */
-		fbt->fbtp_symindx = symindx;
-
-		if (FBT_BX_LR_P(*instr))
-			fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_BX_LR);
-		else if (FBT_MOV_PC_LR_P(*instr))
-			fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOV_PC_LR);
-		else if (FBT_LDM_P(*instr))
-			fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_LDM);
-		else if (FBT_LDMIB_P(*instr))
-			fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_LDMIB);
-		else if (FBT_B_LABEL_P(*instr))
-			fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_B_LABEL);
-
-		fbt->fbtp_roffset = (uintptr_t)(instr - (uint32_t *) value);
-		fbt->fbtp_patchval = PATCHVAL_ENCODE_COND(*instr);
-
-		fbt->fbtp_savedval = *instr;
-		fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
-		fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
-
-		mod->mod_fbtentries++;
-
-		instr += size;
-		was_ldm_lr = false;
+		return;
 	}
 
-	return 0;
+	/*
+	 * List the functions in the module and the symbol values.
+	 */
+	(void) linker_file_function_listall(lf, fbt_provide_module_function, modname);
 }
-#else
-#error "architecture not supported"
 #endif
-
-
+#ifdef __NetBSD__
 static void
-fbt_provide_module(void *arg, dtrace_modctl_t *mod)
+fbt_provide_module(void *arg, modctl_t *mod)
 {
+	struct fbt_ksyms_arg fka;
+	struct mod_ctf *mc;
 	char modname[MAXPATHLEN];
 	int i;
 	size_t len;
 
-	strlcpy(modname, mod->mod_info->mi_name, sizeof(modname));
+	if (mod_ctf_get(mod, &mc)) {
+		printf("fbt: no CTF data for module %s\n", module_name(mod));
+		return;
+	}
+
+	strlcpy(modname, module_name(mod), sizeof(modname));
 	len = strlen(modname);
 	if (len > 5 && strcmp(modname + len - 5, ".kmod") == 0)
 		modname[len - 5] = '\0';
@@ -958,36 +286,52 @@ fbt_provide_module(void *arg, dtrace_mod
 	 * dependency are ineligible for FBT tracing.
 	 */
 	for (i = 0; i < mod->mod_nrequired; i++) {
-		if (strncmp(mod->mod_required[i]->mod_info->mi_name,
+		if (strncmp(module_name(mod->mod_required[i]),
 			    "dtrace", 6) == 0)
 			return;
 	}
-
-	if (mod->mod_fbtentries) {
-		/*
-		 * This module has some FBT entries allocated; we're afraid
-		 * to screw with it.
-		 */
+	if (mc->fbt_provided) {
 		return;
 	}
 
 	/*
 	 * List the functions in the module and the symbol values.
 	 */
-	ksyms_mod_foreach(modname, fbt_provide_module_cb, mod);
+	memset(&fka, 0, sizeof(fka));
+	fka.fka_mod = mod;
+	fka.fka_mc = mc;
+	ksyms_mod_foreach(modname, fbt_provide_module_cb, &fka);
+	mc->fbt_provided = true;
+}
+
+static void
+fbt_module_dtor(void *arg)
+{
+	mod_ctf_t *mc = arg;
+
+	if (mc->ctfalloc)
+		free(mc->ctftab, M_TEMP);
+	kmem_free(mc, sizeof(*mc));
 }
+#endif
 
 static void
 fbt_destroy(void *arg, dtrace_id_t id, void *parg)
 {
 	fbt_probe_t *fbt = parg, *next, *hash, *last;
-	dtrace_modctl_t *ctl;
+	modctl_t *ctl;
 	int ndx;
 
 	do {
 		ctl = fbt->fbtp_ctl;
 
+#ifdef __FreeBSD__
 		ctl->mod_fbtentries--;
+#endif
+#ifdef __NetBSD__
+		mod_ctf_t *mc = module_getspecific(ctl, fbt_module_key);
+		mc->fbt_provided = false;
+#endif
 
 		/*
 		 * Now we need to remove this probe from the fbt_probetab.
@@ -1009,26 +353,21 @@ fbt_destroy(void *arg, dtrace_id_t id, v
 		}
 
 		next = fbt->fbtp_next;
-		free(fbt, M_FBT);
+		kmem_free(fbt, sizeof(*fbt));
 
 		fbt = next;
 	} while (fbt != NULL);
 }
 
-#if defined(__i386__) || defined(__amd64__)
-
 static int
 fbt_enable(void *arg, dtrace_id_t id, void *parg)
 {
 	fbt_probe_t *fbt = parg;
-#if 0
-	dtrace_modctl_t *ctl = fbt->fbtp_ctl;
-#endif
-	u_long psl;
-	u_long cr0;
+	modctl_t *ctl = fbt->fbtp_ctl;
 
-
-#if 0	/* XXX TBD */
+#ifdef __NetBSD__
+	module_hold(ctl);
+#else
 	ctl->nenabled++;
 
 	/*
@@ -1040,33 +379,15 @@ fbt_enable(void *arg, dtrace_id_t id, vo
 		if (fbt_verbose) {
 			printf("fbt is failing for probe %s "
 			    "(module %s reloaded)",
-			    fbt->fbtp_name, ctl->filename);
+			    fbt->fbtp_name, module_name(ctl));
 		}
 
-		return;
+		return 0;
 	}
 #endif
 
-	/* Disable interrupts. */
-	psl = x86_read_psl();
-	x86_disable_intr();
-
-	/* Disable write protection in supervisor mode. */
-	cr0 = rcr0();
-	lcr0(cr0 & ~CR0_WP);
-
-	for (; fbt != NULL; fbt = fbt->fbtp_next) {
-		*fbt->fbtp_patchpoint = fbt->fbtp_patchval;
-	}
-
-	/* Write back and invalidate cache, flush pipelines. */
-	wbinvd();
-	x86_flush();
-	x86_write_psl(psl);
-
-	/* Re-enable write protection. */
-	lcr0(cr0);
-
+	for (; fbt != NULL; fbt = fbt->fbtp_next)
+		fbt_patch_tracepoint(fbt, fbt->fbtp_patchval);
 	return 0;
 }
 
@@ -1074,240 +395,60 @@ static void
 fbt_disable(void *arg, dtrace_id_t id, void *parg)
 {
 	fbt_probe_t *fbt = parg;
-#if 0
-	dtrace_modctl_t *ctl = fbt->fbtp_ctl;
-#endif
-	u_long psl;
-	u_long cr0;
+	modctl_t *ctl = fbt->fbtp_ctl;
 
-#if 0	/* XXX TBD */
+#ifndef __NetBSD__
 	ASSERT(ctl->nenabled > 0);
 	ctl->nenabled--;
 
 	if ((ctl->loadcnt != fbt->fbtp_loadcnt))
 		return;
 #endif
-	/* Disable interrupts. */
-	psl = x86_read_psl();
-	x86_disable_intr();
-
-	/* Disable write protection in supervisor mode. */
-	cr0 = rcr0();
-	lcr0(cr0 & ~CR0_WP);
 
 	for (; fbt != NULL; fbt = fbt->fbtp_next)
-		*fbt->fbtp_patchpoint = fbt->fbtp_savedval;
+		fbt_patch_tracepoint(fbt, fbt->fbtp_savedval);
 
-	/* Write back and invalidate cache, flush pipelines. */
-	wbinvd();
-	x86_flush();
-	x86_write_psl(psl);
-
-	/* Re-enable write protection. */
-	lcr0(cr0);
+#ifdef __NetBSD__
+	module_rele(ctl);
+#endif
 }
 
 static void
 fbt_suspend(void *arg, dtrace_id_t id, void *parg)
 {
 	fbt_probe_t *fbt = parg;
-#if 0
-	dtrace_modctl_t *ctl = fbt->fbtp_ctl;
-#endif
-	u_long psl;
-	u_long cr0;
+#ifndef __NetBSD__
+	modctl_t *ctl = fbt->fbtp_ctl;
 
-#if 0	/* XXX TBD */
 	ASSERT(ctl->nenabled > 0);
 
 	if ((ctl->loadcnt != fbt->fbtp_loadcnt))
 		return;
 #endif
 
-	/* Disable interrupts. */
-	psl = x86_read_psl();
-	x86_disable_intr();
-
-	/* Disable write protection in supervisor mode. */
-	cr0 = rcr0();
-	lcr0(cr0 & ~CR0_WP);
-
 	for (; fbt != NULL; fbt = fbt->fbtp_next)
-		*fbt->fbtp_patchpoint = fbt->fbtp_savedval;
-
-	/* Write back and invalidate cache, flush pipelines. */
-	wbinvd();
-	x86_flush();
-	x86_write_psl(psl);
-
-	/* Re-enable write protection. */
-	lcr0(cr0);
+		fbt_patch_tracepoint(fbt, fbt->fbtp_savedval);
 }
 
 static void
 fbt_resume(void *arg, dtrace_id_t id, void *parg)
 {
 	fbt_probe_t *fbt = parg;
-#if 0
-	dtrace_modctl_t *ctl = fbt->fbtp_ctl;
-#endif
-	u_long psl;
-	u_long cr0;
+#ifndef __NetBSD__
+	modctl_t *ctl = fbt->fbtp_ctl;
 
-#if 0	/* XXX TBD */
 	ASSERT(ctl->nenabled > 0);
 
 	if ((ctl->loadcnt != fbt->fbtp_loadcnt))
 		return;
 #endif
-	/* Disable interrupts. */
-	psl = x86_read_psl();
-	x86_disable_intr();
-
-	/* Disable write protection in supervisor mode. */
-	cr0 = rcr0();
-	lcr0(cr0 & ~CR0_WP);
 
 	for (; fbt != NULL; fbt = fbt->fbtp_next)
-		*fbt->fbtp_patchpoint = fbt->fbtp_patchval;
-
-	/* Write back and invalidate cache, flush pipelines. */
-	wbinvd();
-	x86_flush();
-	x86_write_psl(psl);
-
-	/* Re-enable write protection. */
-	lcr0(cr0);
+		fbt_patch_tracepoint(fbt, fbt->fbtp_patchval);
 }
 
-#elif defined(__arm__)
-
 static int
-fbt_enable(void *arg, dtrace_id_t id, void *parg)
-{
-	fbt_probe_t *fbt = parg;
-#if 0
-	dtrace_modctl_t *ctl = fbt->fbtp_ctl;
-#endif
-	dtrace_icookie_t c;
-
-
-#if 0	/* XXX TBD */
-	ctl->nenabled++;
-
-	/*
-	 * Now check that our modctl has the expected load count.  If it
-	 * doesn't, this module must have been unloaded and reloaded -- and
-	 * we're not going to touch it.
-	 */
-	if (ctl->loadcnt != fbt->fbtp_loadcnt) {
-		if (fbt_verbose) {
-			printf("fbt is failing for probe %s "
-			    "(module %s reloaded)",
-			    fbt->fbtp_name, ctl->filename);
-		}
-
-		return;
-	}
-#endif
-
-	c = dtrace_interrupt_disable();
-
-	for (fbt = parg; fbt != NULL; fbt = fbt->fbtp_next) {
-		*fbt->fbtp_patchpoint = fbt->fbtp_patchval;
-		cpu_idcache_wbinv_range((vaddr_t)fbt->fbtp_patchpoint, 4);
-	}
-
-	dtrace_interrupt_enable(c);
-
-	return 0;
-}
-
-static void
-fbt_disable(void *arg, dtrace_id_t id, void *parg)
-{
-	fbt_probe_t *fbt = parg;
-#if 0
-	dtrace_modctl_t *ctl = fbt->fbtp_ctl;
-#endif
-	dtrace_icookie_t c;
-
-#if 0	/* XXX TBD */
-	ASSERT(ctl->nenabled > 0);
-	ctl->nenabled--;
-
-	if ((ctl->loadcnt != fbt->fbtp_loadcnt))
-		return;
-#endif
-
-	c = dtrace_interrupt_disable();
-
-	for (; fbt != NULL; fbt = fbt->fbtp_next) {
-		*fbt->fbtp_patchpoint = fbt->fbtp_savedval;
-		cpu_idcache_wbinv_range((vaddr_t)fbt->fbtp_patchpoint, 4);
-	}
-
-	dtrace_interrupt_enable(c);
-}
-
-static void
-fbt_suspend(void *arg, dtrace_id_t id, void *parg)
-{
-	fbt_probe_t *fbt = parg;
-#if 0
-	dtrace_modctl_t *ctl = fbt->fbtp_ctl;
-#endif
-	dtrace_icookie_t c;
-
-#if 0	/* XXX TBD */
-	ASSERT(ctl->nenabled > 0);
-
-	if ((ctl->loadcnt != fbt->fbtp_loadcnt))
-		return;
-#endif
-
-	c = dtrace_interrupt_disable();
-
-	for (; fbt != NULL; fbt = fbt->fbtp_next) {
-		*fbt->fbtp_patchpoint = fbt->fbtp_savedval;
-		cpu_idcache_wbinv_range((vaddr_t)fbt->fbtp_patchpoint, 4);
-	}
-
-	dtrace_interrupt_enable(c);
-}
-
-static void
-fbt_resume(void *arg, dtrace_id_t id, void *parg)
-{
-	fbt_probe_t *fbt = parg;
-#if 0
-	dtrace_modctl_t *ctl = fbt->fbtp_ctl;
-#endif
-	dtrace_icookie_t c;
-
-#if 0	/* XXX TBD */
-	ASSERT(ctl->nenabled > 0);
-
-	if ((ctl->loadcnt != fbt->fbtp_loadcnt))
-		return;
-#endif
-
-	c = dtrace_interrupt_disable();
-
-	for (; fbt != NULL; fbt = fbt->fbtp_next) {
-		*fbt->fbtp_patchpoint = fbt->fbtp_patchval;
-		cpu_idcache_wbinv_range((vaddr_t)fbt->fbtp_patchpoint, 4);
-	}
-
-	dtrace_interrupt_enable(c);
-}
-
-#else
-#error "architecture not supported"
-#endif
-
-static int
-fbt_ctfoff_init(dtrace_modctl_t *mod, mod_ctf_t *mc)
+fbt_ctfoff_init(modctl_t *mod, mod_ctf_t *mc)
 {
 	const Elf_Sym *symp = mc->symtab;
 	const ctf_header_t *hp = (const ctf_header_t *) mc->ctftab;
@@ -1323,19 +464,16 @@ fbt_ctfoff_init(dtrace_modctl_t *mod, mo
 	/* Sanity check. */
 	if (hp->cth_magic != CTF_MAGIC) {
 		printf("Bad magic value in CTF data of '%s'\n",
-			mod->mod_info->mi_name);
+		    module_name(mod));
 		return (EINVAL);
 	}
 
 	if (mc->symtab == NULL) {
-		printf("No symbol table in '%s'\n",
-			mod->mod_info->mi_name);
+		printf("No symbol table in '%s'\n", module_name(mod));
 		return (EINVAL);
 	}
 
-	if ((ctfoff = malloc(sizeof(uint32_t) * nsyms, M_FBT, M_WAITOK)) == NULL)
-		return (ENOMEM);
-
+	ctfoff = malloc(sizeof(uint32_t) * nsyms, M_FBT, M_WAITOK);
 	mc->ctfoffp = ctfoff;
 
 	for (i = 0; i < nsyms; i++, ctfoff++, symp++) {
@@ -1346,9 +484,10 @@ fbt_ctfoff_init(dtrace_modctl_t *mod, mo
 				continue;
 			}
 
-			/* CTF expects the pre-sorted symbol ordering, 
+			/*
+			 * CTF expects the unsorted symbol ordering,
 			 * so map it from that to the current sorted
-			 * and trimmed symbol table.
+			 * symbol table.
 			 * ctfoff[new-ind] = oldind symbol info.
 			 */
 
@@ -1359,6 +498,12 @@ fbt_ctfoff_init(dtrace_modctl_t *mod, mo
 			ctfoff = &mc->ctfoffp[mc->nmap[i]-1];
 		}
 
+		/*
+		 * Note that due to how kern_ksyms.c adjusts st_name
+		 * to be the offset into a virtual combined strtab,
+		 * st_name will never be 0 for loaded modules.
+		 */
+
 		if (symp->st_name == 0 || symp->st_shndx == SHN_UNDEF) {
 			*ctfoff = 0xffffffff;
 			continue;
@@ -1408,12 +553,12 @@ fbt_ctfoff_init(dtrace_modctl_t *mod, mo
 }
 
 static ssize_t
-fbt_get_ctt_size(uint8_t xversion, const ctf_type_t *tp, ssize_t *sizep,
+fbt_get_ctt_size(uint8_t version, const ctf_type_t *tp, ssize_t *sizep,
     ssize_t *incrementp)
 {
 	ssize_t size, increment;
 
-	if (xversion > CTF_VERSION_1 &&
+	if (version > CTF_VERSION_1 &&
 	    tp->ctt_size == CTF_LSIZE_SENT) {
 		size = CTF_TYPE_LSIZE(tp);
 		increment = sizeof (ctf_type_t);
@@ -1442,6 +587,7 @@ fbt_typoff_init(mod_ctf_t *mc)
 	uint32_t *xp;
 	ulong_t pop[CTF_K_MAX + 1] = { 0 };
 
+
 	/* Sanity check. */
 	if (hp->cth_magic != CTF_MAGIC)
 		return (EINVAL);
@@ -1522,17 +668,19 @@ fbt_typoff_init(mod_ctf_t *mc)
 			vbytes = 0;
 			break;
 		default:
-			printf("%s(%d): detected invalid CTF kind -- %u\n", __func__, __LINE__, kind);
+			printf("%s(%d): detected invalid CTF kind -- %u\n",
+			       __func__, __LINE__, kind);
 			return (EIO);
 		}
 		tp = (ctf_type_t *)((uintptr_t)tp + increment + vbytes);
 		pop[kind]++;
 	}
 
+	/* account for a sentinel value below */
+	ctf_typemax++;
 	mc->typlen = ctf_typemax;
 
-	if ((xp = malloc(sizeof(uint32_t) * ctf_typemax, M_FBT, M_ZERO | M_WAITOK)) == NULL)
-		return (ENOMEM);
+	xp = malloc(sizeof(uint32_t) * ctf_typemax, M_FBT, M_ZERO | M_WAITOK);
 
 	mc->typoffp = xp;
 
@@ -1854,11 +1002,7 @@ ctf_decl_push(ctf_decl_t *cd, mod_ctf_t 
 		prec = CTF_PREC_BASE;
 	}
 
-	if ((cdp = malloc(sizeof (ctf_decl_node_t), M_FBT, M_WAITOK)) == NULL) {
-		cd->cd_err = EAGAIN;
-		return;
-	}
-
+	cdp = malloc(sizeof(*cdp), M_FBT, M_WAITOK);
 	cdp->cd_type = type;
 	cdp->cd_kind = kind;
 	cdp->cd_n = n;
@@ -2002,8 +1146,8 @@ fbt_getargdesc(void *arg __unused, dtrac
 {
 	const ushort_t *dp;
 	fbt_probe_t *fbt = parg;
-	mod_ctf_t mc;
-	dtrace_modctl_t *ctl = fbt->fbtp_ctl;
+	mod_ctf_t *mc;
+	modctl_t *ctl = fbt->fbtp_ctl;
 	int ndx = desc->dtargd_ndx;
 	int symindx = fbt->fbtp_symindx;
 	uint32_t *ctfoff;
@@ -2011,41 +1155,44 @@ fbt_getargdesc(void *arg __unused, dtrac
 	ushort_t info, kind, n;
 	int nsyms;
 
+	if (fbt->fbtp_roffset != 0 && desc->dtargd_ndx == 0) {
+		(void) strcpy(desc->dtargd_native, "int");
+		return;
+	}
+
 	desc->dtargd_ndx = DTRACE_ARGNONE;
 
-	/* Get a pointer to the CTF data and it's length. */
+	/* Get a pointer to the CTF data and its length. */
 	if (mod_ctf_get(ctl, &mc) != 0) {
-		static int report=0;
+		static int report = 0;
 		if (report < 1) {
-		    report++;
-		    printf("FBT: Error no CTF section found in module \"%s\"\n",
-			    ctl->mod_info->mi_name);
+			report++;
+			printf("FBT: Error no CTF section found in module \"%s\"\n",
+			    module_name(ctl));
 		}
 		/* No CTF data? Something wrong? *shrug* */
 		return;
 	}
 
-	nsyms = (mc.nmap != NULL) ? mc.nmapsize : mc.nsym;
+	nsyms = (mc->nmap != NULL) ? mc->nmapsize : mc->nsym;
 
 	/* Check if this module hasn't been initialised yet. */
-	if (mc.ctfoffp == NULL) {
+	if (mc->ctfoffp == NULL) {
 		/*
 		 * Initialise the CTF object and function symindx to
 		 * byte offset array.
 		 */
-		if (fbt_ctfoff_init(ctl, &mc) != 0) {
+		if (fbt_ctfoff_init(ctl, mc) != 0)
 			return;
-		}
 
 		/* Initialise the CTF type to byte offset array. */
-		if (fbt_typoff_init(&mc) != 0) {
+		if (fbt_typoff_init(mc) != 0)
 			return;
-		}
 	}
 
-	ctfoff = mc.ctfoffp;
+	ctfoff = mc->ctfoffp;
 
-	if (ctfoff == NULL || mc.typoffp == NULL) {
+	if (ctfoff == NULL || mc->typoffp == NULL) {
 		return;
 	}
 
@@ -2057,7 +1204,7 @@ fbt_getargdesc(void *arg __unused, dtrac
 	if ((offset = ctfoff[symindx]) == 0xffffffff)
 		return;
 
-	dp = (const ushort_t *)(mc.ctftab + offset + sizeof(ctf_header_t));
+	dp = (const ushort_t *)(mc->ctftab + offset + sizeof(ctf_header_t));
 
 	info = *dp++;
 	kind = CTF_INFO_KIND(info);
@@ -2075,23 +1222,50 @@ fbt_getargdesc(void *arg __unused, dtrac
 		return;
 	}
 
-	/* Check if the requested argument doesn't exist. */
-	if (ndx >= n)
-		return;
+	if (fbt->fbtp_roffset != 0) {
+		/* Only return type is available for args[1] in return probe. */
+		if (ndx > 1)
+			return;
+		ASSERT(ndx == 1);
+	} else {
+		/* Check if the requested argument doesn't exist. */
+		if (ndx >= n)
+			return;
 
-	/* Skip the return type and arguments up to the one requested. */
-	dp += ndx + 1;
+		/* Skip the return type and arguments up to the one requested. */
+		dp += ndx + 1;
+	}
 
-	if (fbt_type_name(&mc, *dp, desc->dtargd_native, sizeof(desc->dtargd_native)) > 0) {
+	if (fbt_type_name(mc, *dp, desc->dtargd_native, sizeof(desc->dtargd_native)) > 0)
 		desc->dtargd_ndx = ndx;
-	}
 
 	return;
 }
 
+#ifdef __FreeBSD__
+static int
+fbt_linker_file_cb(linker_file_t lf, void *arg)
+{
+
+	fbt_provide_module(arg, lf);
+
+	return (0);
+}
+#endif
+
 static void
 fbt_load(void)
 {
+
+#ifdef __FreeBSD__
+	/* Create the /dev/dtrace/fbt entry. */
+	fbt_cdev = make_dev(&fbt_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+	    "dtrace/fbt");
+#endif
+#ifdef __NetBSD__
+	(void) module_specific_key_create(&fbt_module_key, fbt_module_dtor);
+#endif
+
 	/* Default the probe table size if not specified. */
 	if (fbt_probetab_size == 0)
 		fbt_probetab_size = FBT_PROBETAB_SIZE;
@@ -2105,9 +1279,6 @@ fbt_load(void)
 
 	dtrace_doubletrap_func = fbt_doubletrap;
 	dtrace_invop_add(fbt_invop);
-#ifdef __arm__
-	dtrace_emulation_jump_addr = fbt_emulate;
-#endif
 
 	if (dtrace_register("fbt", &fbt_attr, DTRACE_PRIV_USER,
 	    NULL, &fbt_pops, NULL, &fbt_id) != 0)
@@ -2120,9 +1291,6 @@ fbt_unload(void)
 {
 	int error = 0;
 
-#ifdef __arm__
-	dtrace_emulation_jump_addr = NULL;
-#endif
 	/* De-register the invalid opcode handler. */
 	dtrace_invop_remove(fbt_invop);
 
@@ -2137,6 +1305,12 @@ fbt_unload(void)
 	fbt_probetab = NULL;
 	fbt_probetab_mask = 0;
 
+#ifdef __FreeBSD__
+	destroy_dev(fbt_cdev);
+#endif
+#ifdef __NetBSD__
+	(void) module_specific_key_delete(fbt_module_key);
+#endif
 	return (error);
 }
 
@@ -2170,4 +1344,15 @@ fbt_open(dev_t dev, int flags, int mode,
 	return (0);
 }
 
+#ifdef __FreeBSD__
+SYSINIT(fbt_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, fbt_load, NULL);
+SYSUNINIT(fbt_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, fbt_unload, NULL);
+
+DEV_MODULE(fbt, fbt_modevent, NULL);
+MODULE_VERSION(fbt, 1);
+MODULE_DEPEND(fbt, dtrace, 1, 1, 1);
+MODULE_DEPEND(fbt, opensolaris, 1, 1, 1);
+#endif
+#ifdef __NetBSD__
 MODULE(MODULE_CLASS_MISC, dtrace_fbt, "dtrace,zlib");
+#endif
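
The enable/disable/suspend/resume bodies all collapse above because the
machine-dependent write moves into fbt_patch_tracepoint(), now provided by the
per-port fbt_isa.c.  Its x86 body is not part of this excerpt, but judging
from the sequences deleted above it would look roughly like this (a sketch
reassembled from the removed code, not the committed function):

	void
	fbt_patch_tracepoint(fbt_probe_t *fbt, fbt_patchval_t val)
	{
		u_long psl, cr0;

		/* Disable interrupts. */
		psl = x86_read_psl();
		x86_disable_intr();

		/* Disable write protection in supervisor mode. */
		cr0 = rcr0();
		lcr0(cr0 & ~CR0_WP);

		*fbt->fbtp_patchpoint = val;

		/* Write back and invalidate cache, flush pipelines. */
		wbinvd();
		x86_flush();
		x86_write_psl(psl);

		/* Re-enable write protection. */
		lcr0(cr0);
	}
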
Index: src/external/cddl/osnet/dev/fbt/fbt.h
===================================================================
RCS file: src/external/cddl/osnet/dev/fbt/fbt.h
diff -N src/external/cddl/osnet/dev/fbt/fbt.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dev/fbt/fbt.h	20 Jun 2017 13:43:33 -0000
@@ -0,0 +1,82 @@
+/*	$NetBSD$	*/
+
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
+ *
+ * $FreeBSD: head/sys/cddl/dev/fbt/fbt.h 298171 2016-04-17 23:08:47Z markj $
+ *
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _FBT_H_
+#define _FBT_H_
+
+#include "fbt_isa.h"
+
+typedef struct fbt_probe {
+	struct fbt_probe *fbtp_hashnext;
+	fbt_patchval_t	*fbtp_patchpoint;
+	fbt_patchval_t	fbtp_rval;
+	fbt_patchval_t	fbtp_patchval;
+	fbt_patchval_t	fbtp_savedval;
+	uintptr_t	fbtp_roffset;
+	dtrace_id_t	fbtp_id;
+	const char	*fbtp_name;
+	modctl_t	*fbtp_ctl;
+	int		fbtp_loadcnt;
+	int		fbtp_symindx;
+	struct fbt_probe *fbtp_next;
+} fbt_probe_t;
+
+struct fbt_ksyms_arg {
+	modctl_t *fka_mod;
+	void *fka_mc;
+};
+
+struct linker_file;
+struct linker_symval;
+struct trapframe;
+
+int	fbt_invop(uintptr_t, struct trapframe *, uintptr_t);
+void	fbt_patch_tracepoint(fbt_probe_t *, fbt_patchval_t);
+int	fbt_provide_module_function(struct linker_file *, int,
+	    struct linker_symval *, void *);
+int	fbt_provide_module_cb(const char *, int, void *,
+	    uint32_t, int, void *);
+int	fbt_excluded(const char *);
+
+extern dtrace_provider_id_t	fbt_id;
+extern fbt_probe_t		**fbt_probetab;
+extern int			fbt_probetab_mask;
+
+#define	FBT_ADDR2NDX(addr)	((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask)
+#define	FBT_PROBETAB_SIZE	0x8000		/* 32k entries -- 128K total */
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_FBT);
+#endif
+
+#endif
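
The probe table declared here is a chained hash keyed by patch-point
address: FBT_ADDR2NDX() discards the low four bits and masks with
fbt_probetab_mask, so the table size must stay a power of two.  A
compilable userland sketch of the same indexing, using a made-up
patch-point address:

#include <stdint.h>
#include <stdio.h>

#define	PROBETAB_SIZE	0x8000		/* power of two, as above */
#define	ADDR2NDX(a, m)	((((uintptr_t)(a)) >> 4) & (m))

int
main(void)
{
	uintptr_t mask = PROBETAB_SIZE - 1;	/* 0x7fff */
	uintptr_t patchpoint = 0x80321af0;	/* made-up text address */

	printf("bucket %#lx\n", (unsigned long)ADDR2NDX(patchpoint, mask));
	return 0;
}

Patch points whose addresses differ only in the low four bits share a
bucket and are told apart by walking the fbtp_hashnext chain, as
fbt_invop() does.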
Index: src/external/cddl/osnet/dev/fbt/arm/fbt_isa.c
===================================================================
RCS file: src/external/cddl/osnet/dev/fbt/arm/fbt_isa.c
diff -N src/external/cddl/osnet/dev/fbt/arm/fbt_isa.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dev/fbt/arm/fbt_isa.c	20 Jun 2017 19:14:24 -0000
@@ -0,0 +1,398 @@
+/*	$NetBSD$	*/
+
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
+ * Portions Copyright 2013 Justin Hibbits jhibbits@freebsd.org
+ * Portions Copyright 2013 Howard Su howardsu@freebsd.org
+ *
+ * $FreeBSD: head/sys/cddl/dev/fbt/arm/fbt_isa.c 312378 2017-01-18 13:27:24Z andrew $
+ *
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kmem.h>
+
+#include <sys/dtrace.h>
+
+#include <machine/trap.h>
+#include <arm/cpufunc.h>
+#include <arm/armreg.h>
+#include <arm/frame.h>
+#include <uvm/uvm_extern.h>
+
+#include "fbt.h"
+
+#define	FBT_PUSHM		0xe92d0000
+#define	FBT_POPM		0xe8bd0000
+#define	FBT_JUMP		0xea000000
+#define	FBT_SUBSP		0xe24dd000
+
+#define	FBT_ENTRY	"entry"
+#define	FBT_RETURN	"return"
+
+int
+fbt_invop(uintptr_t addr, struct trapframe *frame, uintptr_t rval)
+{
+	solaris_cpu_t *cpu = &solaris_cpu[cpu_number()];
+	fbt_probe_t *fbt = fbt_probetab[FBT_ADDR2NDX(addr)];
+	register_t fifthparam;
+
+	for (; fbt != NULL; fbt = fbt->fbtp_hashnext) {
+		if ((uintptr_t)fbt->fbtp_patchpoint == addr) {
+			if (fbt->fbtp_roffset == 0) {
+				/* Get 5th parameter from stack */
+				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+				fifthparam = *(register_t *)frame->tf_svc_sp;
+				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT |
+				    CPU_DTRACE_BADADDR);
+
+				cpu->cpu_dtrace_caller = frame->tf_svc_lr;
+				dtrace_probe(fbt->fbtp_id, frame->tf_r0,
+					     frame->tf_r1, frame->tf_r2,
+					     frame->tf_r3, fifthparam);
+			} else {
+				/* XXX set caller */
+				cpu->cpu_dtrace_caller = 0;
+				dtrace_probe(fbt->fbtp_id, fbt->fbtp_roffset,
+				    rval, 0, 0, 0);
+			}
+
+			cpu->cpu_dtrace_caller = 0;
+			return (fbt->fbtp_rval);
+		}
+	}
+
+	return (0);
+}
+
+void
+fbt_patch_tracepoint(fbt_probe_t *fbt, fbt_patchval_t val)
+{
+	dtrace_icookie_t c;
+
+	c = dtrace_interrupt_disable();
+
+	ktext_write(fbt->fbtp_patchpoint, &val, sizeof (val));
+
+	dtrace_interrupt_enable(c);
+}
+
+#ifdef __FreeBSD__
+
+int
+fbt_provide_module_function(linker_file_t lf, int symindx,
+    linker_symval_t *symval, void *opaque)
+{
+	char *modname = opaque;
+	const char *name = symval->name;
+	fbt_probe_t *fbt, *retfbt;
+	uint32_t *instr, *limit;
+	int popm;
+
+	if (fbt_excluded(name))
+		return (0);
+
+	instr = (uint32_t *)symval->value;
+	limit = (uint32_t *)(symval->value + symval->size);
+
+	/*
+	 * va_arg functions have a first instruction of
+	 * sub sp, sp, #?
+	 */
+	if ((*instr & 0xfffff000) == FBT_SUBSP)
+		instr++;
+
+	/*
+	 * Check whether the instruction is a pushm that saves LR.
+	 */
+	if ((*instr & 0xffff0000) != FBT_PUSHM ||
+	    (*instr & (1 << LR)) == 0)
+		return (0);
+
+	fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP);
+	fbt->fbtp_name = name;
+	fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
+	    name, FBT_ENTRY, 5, fbt);
+	fbt->fbtp_patchpoint = instr;
+	fbt->fbtp_ctl = lf;
+	fbt->fbtp_loadcnt = lf->loadcnt;
+	fbt->fbtp_savedval = *instr;
+	fbt->fbtp_patchval = FBT_BREAKPOINT;
+	fbt->fbtp_rval = DTRACE_INVOP_PUSHM;
+	fbt->fbtp_symindx = symindx;
+
+	fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
+	fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
+
+	lf->fbt_nentries++;
+
+	popm = FBT_POPM | ((*instr) & 0x3FFF) | 0x8000;
+
+	retfbt = NULL;
+again:
+	for (; instr < limit; instr++) {
+		if (*instr == popm)
+			break;
+		else if ((*instr & 0xff000000) == FBT_JUMP) {
+			uint32_t *target, *start;
+			int offset;
+
+			offset = (*instr & 0xffffff);
+			offset <<= 8;
+			offset /= 256;	/* sign-extended word offset */
+			target = instr + (2 + offset);
+			start = (uint32_t *)symval->value;
+			if (target >= limit || target < start)
+				break;
+		}
+	}
+
+	if (instr >= limit)
+		return (0);
+
+	/*
+	 * We have a winner!
+	 */
+	fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP);
+	fbt->fbtp_name = name;
+	if (retfbt == NULL) {
+		fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
+		    name, FBT_RETURN, 5, fbt);
+	} else {
+		retfbt->fbtp_next = fbt;
+		fbt->fbtp_id = retfbt->fbtp_id;
+	}
+	retfbt = fbt;
+
+	fbt->fbtp_patchpoint = instr;
+	fbt->fbtp_ctl = lf;
+	fbt->fbtp_loadcnt = lf->loadcnt;
+	fbt->fbtp_symindx = symindx;
+	if ((*instr & 0xff000000) == FBT_JUMP)
+		fbt->fbtp_rval = DTRACE_INVOP_B;
+	else
+		fbt->fbtp_rval = DTRACE_INVOP_POPM;
+	fbt->fbtp_savedval = *instr;
+	fbt->fbtp_patchval = FBT_BREAKPOINT;
+	fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
+	fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
+
+	lf->fbt_nentries++;
+
+	instr++;
+	goto again;
+}
+
+#endif /* __FreeBSD__ */
+
+#ifdef __NetBSD__
+
+#define	FBT_PATCHVAL		DTRACE_BREAKPOINT
+
+/* entry and return */
+#define	FBT_BX_LR_P(insn)	(((insn) & ~INSN_COND_MASK) == 0x012fff1e)
+#define	FBT_B_LABEL_P(insn)	(((insn) & 0xff000000) == 0xea000000)
+/* entry */
+#define	FBT_MOV_IP_SP_P(insn)	((insn) == 0xe1a0c00d)
+/* index=1, add=1, wback=0 */
+#define	FBT_LDR_IMM_P(insn)	(((insn) & 0xfff00000) == 0xe5900000)
+#define	FBT_MOVW_P(insn)	(((insn) & 0xfff00000) == 0xe3000000)
+#define	FBT_MOV_IMM_P(insn)	(((insn) & 0xffff0000) == 0xe3a00000)
+#define	FBT_CMP_IMM_P(insn)	(((insn) & 0xfff00000) == 0xe3500000)
+#define	FBT_PUSH_P(insn)	(((insn) & 0xffff0000) == 0xe92d0000)
+/* return */
+/* writeback=no, rn=sp and register_list includes pc (any condition) */
+#define	FBT_LDM_P(insn)	(((insn) & 0x0fff8000) == 0x089d8000)
+#define	FBT_LDMIB_P(insn)	(((insn) & 0x0fff8000) == 0x099d8000)
+#define	FBT_MOV_PC_LR_P(insn)	(((insn) & ~INSN_COND_MASK) == 0x01a0f00e)
+/* cond=always, writeback=no, rn=sp and register_list includes lr, but not pc */
+#define	FBT_LDM_LR_P(insn)	(((insn) & 0xffffc000) == 0xe89d4000)
+#define	FBT_LDMIB_LR_P(insn)	(((insn) & 0xffffc000) == 0xe99d4000)
+
+/* rval = insn | invop_id (overwriting cond with invop ID) */
+#define	BUILD_RVAL(insn, id)	(((insn) & ~INSN_COND_MASK) | __SHIFTIN((id), INSN_COND_MASK))
+/* encode cond in the first byte */
+#define	PATCHVAL_ENCODE_COND(insn)	(FBT_PATCHVAL | __SHIFTOUT((insn), INSN_COND_MASK))
+
+int
+fbt_provide_module_cb(const char *name, int symindx, void *value,
+	uint32_t symsize, int type, void *opaque)
+{
+	fbt_probe_t *fbt, *retfbt;
+	uint32_t *instr, *limit;
+	bool was_ldm_lr = false;
+	int size;
+
+	struct fbt_ksyms_arg *fka = opaque;
+	modctl_t *mod = fka->fka_mod;
+	const char *modname = module_name(mod);
+
+	/* got a function? */
+	if (ELF_ST_TYPE(type) != STT_FUNC)
+		return 0;
+
+	if (fbt_excluded(name))
+		return (0);
+
+	/*
+	 * Exclude some more symbols which can be called from probe context.
+	 */
+	if (strncmp(name, "_spl", 4) == 0 ||
+	    strcmp(name, "binuptime") == 0 ||
+	    strcmp(name, "nanouptime") == 0 ||
+	    strcmp(name, "dosoftints") == 0 ||
+	    strcmp(name, "fbt_emulate") == 0 ||
+	    strcmp(name, "undefinedinstruction") == 0 ||
+	    strncmp(name, "dmt_", 4) == 0 /* omap */ ||
+	    strncmp(name, "mvsoctmr_", 9) == 0 /* marvell */ ) {
+		return 0;
+	}
+
+	instr = (uint32_t *) value;
+	limit = (uint32_t *)((uintptr_t)value + symsize);
+
+	if (!FBT_MOV_IP_SP_P(*instr)
+	    && !FBT_BX_LR_P(*instr)
+	    && !FBT_MOVW_P(*instr)
+	    && !FBT_MOV_IMM_P(*instr)
+	    && !FBT_B_LABEL_P(*instr)
+	    && !FBT_LDR_IMM_P(*instr)
+	    && !FBT_CMP_IMM_P(*instr)
+	    && !FBT_PUSH_P(*instr)
+	    ) {
+		return 0;
+	}
+
+	fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP);
+	fbt->fbtp_name = name;
+	fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
+	    name, FBT_ENTRY, 5, fbt);
+	fbt->fbtp_patchpoint = instr;
+	fbt->fbtp_ctl = mod;
+	/* fbt->fbtp_loadcnt = lf->loadcnt; */
+	if (FBT_MOV_IP_SP_P(*instr))
+		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOV_IP_SP);
+	else if (FBT_LDR_IMM_P(*instr))
+		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_LDR_IMM);
+	else if (FBT_MOVW_P(*instr))
+		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOVW);
+	else if (FBT_MOV_IMM_P(*instr))
+		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOV_IMM);
+	else if (FBT_CMP_IMM_P(*instr))
+		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_CMP_IMM);
+	else if (FBT_BX_LR_P(*instr))
+		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_BX_LR);
+	else if (FBT_PUSH_P(*instr))
+		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_PUSHM);
+	else if (FBT_B_LABEL_P(*instr))
+		fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_B);
+	else
+		KASSERT(0);
+
+	KASSERTMSG((fbt->fbtp_rval >> 28) != 0,
+		   "fbt %p insn 0x%x name %s rval 0x%08x",
+		   fbt, *instr, name, fbt->fbtp_rval);
+
+	fbt->fbtp_patchval = PATCHVAL_ENCODE_COND(*instr);
+	fbt->fbtp_savedval = *instr;
+	fbt->fbtp_symindx = symindx;
+
+	fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
+	fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
+
+	retfbt = NULL;
+
+	while (instr < limit) {
+		size = 1;
+
+		if (!FBT_BX_LR_P(*instr)
+		    && !FBT_MOV_PC_LR_P(*instr)
+		    && !FBT_LDM_P(*instr)
+		    && !FBT_LDMIB_P(*instr)
+		    && !(was_ldm_lr && FBT_B_LABEL_P(*instr))
+		    ) {
+			if (FBT_LDM_LR_P(*instr) || FBT_LDMIB_LR_P(*instr))
+				was_ldm_lr = true;
+			else
+				was_ldm_lr = false;
+			instr += size;
+			continue;
+		}
+
+		/*
+		 * We have a winner!
+		 */
+		fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP);
+		fbt->fbtp_name = name;
+
+		if (retfbt == NULL) {
+			fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
+			    name, FBT_RETURN, 5, fbt);
+		} else {
+			retfbt->fbtp_next = fbt;
+			fbt->fbtp_id = retfbt->fbtp_id;
+		}
+
+		retfbt = fbt;
+		fbt->fbtp_patchpoint = instr;
+		fbt->fbtp_ctl = mod;
+		/* fbt->fbtp_loadcnt = lf->loadcnt; */
+		fbt->fbtp_symindx = symindx;
+
+		if (FBT_BX_LR_P(*instr))
+			fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_BX_LR);
+		else if (FBT_MOV_PC_LR_P(*instr))
+			fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOV_PC_LR);
+		else if (FBT_LDM_P(*instr))
+			fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_LDM);
+		else if (FBT_LDMIB_P(*instr))
+			fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_POPM);
+		else if (FBT_B_LABEL_P(*instr))
+			fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_B);
+		else
+			KASSERT(0);
+
+		KASSERTMSG((fbt->fbtp_rval >> 28) != 0, "fbt %p name %s rval 0x%08x",
+			   fbt, name, fbt->fbtp_rval);
+
+		fbt->fbtp_roffset = (uintptr_t)(instr - (uint32_t *) value);
+		fbt->fbtp_patchval = PATCHVAL_ENCODE_COND(*instr);
+
+		fbt->fbtp_savedval = *instr;
+		fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
+		fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
+
+		instr += size;
+		was_ldm_lr = false;
+	}
+
+	return 0;
+}
+
+#endif /* __NetBSD__ */
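
One subtlety in the FreeBSD return-probe scan above is deciding where a
B instruction lands: the low 24 bits are a signed word offset relative
to PC, and on A32 the PC reads as the instruction's address plus 8.  A
standalone sketch of that decoding (0xeafffffe encodes "b .", a branch
to itself):

#include <stdint.h>
#include <stdio.h>

static uintptr_t
arm_b_target(uintptr_t pc, uint32_t insn)
{
	/* Sign-extend the 24-bit immediate; it counts words. */
	int32_t imm24 = (int32_t)(insn << 8) >> 8;

	/* The target is relative to PC, which reads as pc + 8. */
	return pc + 8 + ((intptr_t)imm24 << 2);
}

int
main(void)
{
	/* imm24 == -2, so the branch resolves to its own address. */
	printf("target %#lx\n",
	    (unsigned long)arm_b_target(0x8000, 0xeafffffe));
	return 0;
}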
Index: src/external/cddl/osnet/dev/fbt/arm/fbt_isa.h
===================================================================
RCS file: src/external/cddl/osnet/dev/fbt/arm/fbt_isa.h
diff -N src/external/cddl/osnet/dev/fbt/arm/fbt_isa.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dev/fbt/arm/fbt_isa.h	12 Apr 2017 18:56:59 -0000
@@ -0,0 +1,32 @@
+/*	$NetBSD$	*/
+
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD: head/sys/cddl/dev/fbt/arm/fbt_isa.h 278529 2015-02-10 19:41:30Z gnn $
+ *
+ */
+
+#ifndef _FBT_ISA_H_
+#define _FBT_ISA_H_
+
+typedef uint32_t fbt_patchval_t;
+
+#endif
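
fbt_patchval_t is a full 32-bit word on arm, which is what lets the
NetBSD provider's BUILD_RVAL() and PATCHVAL_ENCODE_COND() macros
smuggle extra state through the trap: the invop ID replaces the
condition nibble of the saved instruction, and the original condition
rides in the low nibble of the breakpoint word.  A self-contained
sketch of that packing; the breakpoint pattern and invop ID below are
placeholders, not NetBSD's real values:

#include <stdint.h>
#include <stdio.h>

#define	INSN_COND_MASK	0xf0000000u
#define	COND_SHIFT	28

static uint32_t
build_rval(uint32_t insn, uint32_t invop_id)
{
	/* Overwrite the condition nibble with the invop ID. */
	return (insn & ~INSN_COND_MASK) | (invop_id << COND_SHIFT);
}

static uint32_t
patchval_encode_cond(uint32_t breakpoint, uint32_t insn)
{
	/* Stash the original condition in the low nibble. */
	return breakpoint | ((insn & INSN_COND_MASK) >> COND_SHIFT);
}

int
main(void)
{
	uint32_t bx_lr = 0xe12fff1e;	/* bx lr, cond == AL (0xe) */
	uint32_t bkpt = 0xd7f00000;	/* placeholder breakpoint word */

	printf("rval %#x patchval %#x\n",
	    build_rval(bx_lr, 9), patchval_encode_cond(bkpt, bx_lr));
	return 0;
}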
Index: src/external/cddl/osnet/dev/fbt/x86/fbt_isa.c
===================================================================
RCS file: src/external/cddl/osnet/dev/fbt/x86/fbt_isa.c
diff -N src/external/cddl/osnet/dev/fbt/x86/fbt_isa.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dev/fbt/x86/fbt_isa.c	15 Jun 2017 19:39:26 -0000
@@ -0,0 +1,421 @@
+/*	$NetBSD$	*/
+
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
+ *
+ * $FreeBSD: head/sys/cddl/dev/fbt/x86/fbt_isa.c 309785 2016-12-10 03:11:05Z markj $
+ *
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/proc.h>
+#include <sys/param.h>
+#include <sys/cpu.h>
+#include <sys/module.h>
+#include <sys/kmem.h>
+
+#include <sys/dtrace.h>
+
+#if 1
+#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
+#if 0
+#include <x86/cpuvar.h>
+#endif
+#include <x86/cputypes.h>
+#endif
+
+#include "fbt.h"
+
+#define	FBT_PUSHL_EBP		0x55
+#define	FBT_MOVL_ESP_EBP0_V0	0x8b
+#define	FBT_MOVL_ESP_EBP1_V0	0xec
+#define	FBT_MOVL_ESP_EBP0_V1	0x89
+#define	FBT_MOVL_ESP_EBP1_V1	0xe5
+#define	FBT_REX_RSP_RBP		0x48
+
+#define	FBT_POPL_EBP		0x5d
+#define	FBT_RET			0xc3
+#define	FBT_RET_IMM16		0xc2
+#define	FBT_LEAVE		0xc9
+
+#ifdef __amd64__
+#define	FBT_PATCHVAL		0xcc
+#else
+#define	FBT_PATCHVAL		0xf0
+#endif
+
+#define	FBT_ENTRY	"entry"
+#define	FBT_RETURN	"return"
+
+int
+fbt_invop(uintptr_t addr, struct trapframe *frame, uintptr_t rval)
+{
+	solaris_cpu_t *cpu;
+	uintptr_t *stack;
+	uintptr_t arg0, arg1, arg2, arg3, arg4;
+	fbt_probe_t *fbt;
+
+#ifdef __amd64__
+	stack = (uintptr_t *)frame->tf_rsp;
+#else
+	/* Skip hardware-saved registers. */
+#ifdef __NetBSD__
+	stack = (uintptr_t *)&frame->tf_esp;
+#else
+	stack = (uintptr_t *)frame->tf_isp + 3;
+#endif
+#endif
+
+	cpu = &solaris_cpu[cpu_number()];
+	fbt = fbt_probetab[FBT_ADDR2NDX(addr)];
+	for (; fbt != NULL; fbt = fbt->fbtp_hashnext) {
+		if ((uintptr_t)fbt->fbtp_patchpoint == addr) {
+			if (fbt->fbtp_roffset == 0) {
+#ifdef __amd64__
+				/* fbt->fbtp_rval == DTRACE_INVOP_PUSHQ_RBP */
+				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+				cpu->cpu_dtrace_caller = stack[0];
+				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT |
+				    CPU_DTRACE_BADADDR);
+
+				arg0 = frame->tf_rdi;
+				arg1 = frame->tf_rsi;
+				arg2 = frame->tf_rdx;
+				arg3 = frame->tf_rcx;
+				arg4 = frame->tf_r8;
+#else
+				int i = 0;
+
+				/*
+				 * When accessing the arguments on the stack,
+				 * we must protect against accessing beyond
+				 * the stack.  We can safely set NOFAULT here
+				 * -- we know that interrupts are already
+				 * disabled.
+				 */
+				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+				cpu->cpu_dtrace_caller = stack[i++];
+				arg0 = stack[i++];
+				arg1 = stack[i++];
+				arg2 = stack[i++];
+				arg3 = stack[i++];
+				arg4 = stack[i++];
+				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT |
+				    CPU_DTRACE_BADADDR);
+#endif
+
+				dtrace_probe(fbt->fbtp_id, arg0, arg1,
+				    arg2, arg3, arg4);
+
+				cpu->cpu_dtrace_caller = 0;
+			} else {
+#ifdef __amd64__
+				/*
+				 * On amd64, we instrument the ret, not the
+				 * leave.  We therefore need to set the caller
+				 * to ensure that the top frame of a stack()
+				 * action is correct.
+				 */
+				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+				cpu->cpu_dtrace_caller = stack[0];
+				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT |
+				    CPU_DTRACE_BADADDR);
+#endif
+
+				dtrace_probe(fbt->fbtp_id, fbt->fbtp_roffset,
+				    rval, 0, 0, 0);
+				cpu->cpu_dtrace_caller = 0;
+			}
+
+			return (fbt->fbtp_rval);
+		}
+	}
+
+	return (0);
+}
+
+#ifdef __FreeBSD__
+void
+fbt_patch_tracepoint(fbt_probe_t *fbt, fbt_patchval_t val)
+{
+
+	*fbt->fbtp_patchpoint = val;
+}
+#endif
+
+#ifdef __NetBSD__
+void
+fbt_patch_tracepoint(fbt_probe_t *fbt, fbt_patchval_t val)
+{
+	u_long psl;
+	u_long cr0;
+
+	/* Disable interrupts. */
+	psl = x86_read_psl();
+	x86_disable_intr();
+
+	/* Disable write protection in supervisor mode. */
+	cr0 = rcr0();
+	lcr0(cr0 & ~CR0_WP);
+
+	for (; fbt != NULL; fbt = fbt->fbtp_next) {
+		*fbt->fbtp_patchpoint = val;
+	}
+
+	/* Write back and invalidate cache, flush pipelines. */
+	wbinvd();
+	x86_flush();
+	x86_write_psl(psl);
+
+	/* Re-enable write protection. */
+	lcr0(cr0);
+}
+#endif
+
+#ifdef __FreeBSD__
+int
+fbt_provide_module_function(linker_file_t lf, int symindx,
+    linker_symval_t *symval, void *opaque)
+#endif
+#ifdef __NetBSD__
+int
+fbt_provide_module_cb(const char *name, int symindx, void *value,
+    uint32_t symsize, int type, void *opaque)
+#endif
+{
+	fbt_probe_t *fbt, *retfbt;
+	u_int8_t *instr, *limit;
+	int j;
+	int size;
+
+#ifdef __FreeBSD__
+	char *modname = opaque;
+	const char *name = symval->name;
+	size_t symsize = symval->size;
+	void *value = symval->value;
+
+	/*
+	 * trap_check() is a wrapper for DTrace's fault handler, so we don't
+	 * want to be able to instrument it.
+	 */
+	if (strcmp(name, "trap_check") == 0)
+		return (0);
+#endif
+#ifdef __NetBSD__
+	struct fbt_ksyms_arg *fka = opaque;
+	modctl_t *mod = fka->fka_mod;
+	const char *modname = module_name(mod);
+
+	/* got a function? */
+	if (ELF_ST_TYPE(type) != STT_FUNC)
+		return 0;
+
+	/*
+	 * Exclude some more symbols which can be called from probe context.
+	 */
+	if (strcmp(name, "x86_curcpu") == 0 ||
+	    strcmp(name, "x86_curlwp") == 0) {
+		return 0;
+	}
+#endif
+
+	if (fbt_excluded(name))
+		return (0);
+
+	instr = (u_int8_t *) value;
+	limit = (u_int8_t *) value + symsize;
+
+#ifdef __amd64__
+	while (instr < limit) {
+		if (*instr == FBT_PUSHL_EBP)
+			break;
+
+		if ((size = dtrace_instr_size(instr)) <= 0)
+			break;
+
+		instr += size;
+	}
+
+	if (instr >= limit || *instr != FBT_PUSHL_EBP) {
+		/*
+		 * We either don't save the frame pointer in this
+		 * function, or we ran into some disassembly
+		 * screw-up.  Either way, we bail.
+		 */
+		return (0);
+	}
+#else
+	if (instr[0] != FBT_PUSHL_EBP)
+		return (0);
+
+	if (!(instr[1] == FBT_MOVL_ESP_EBP0_V0 &&
+	    instr[2] == FBT_MOVL_ESP_EBP1_V0) &&
+	    !(instr[1] == FBT_MOVL_ESP_EBP0_V1 &&
+	    instr[2] == FBT_MOVL_ESP_EBP1_V1))
+		return (0);
+#endif
+
+	fbt = kmem_zalloc(sizeof (*fbt), KM_SLEEP);
+	fbt->fbtp_name = name;
+	fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
+	    name, FBT_ENTRY, 3, fbt);
+	fbt->fbtp_patchpoint = instr;
+#ifdef __FreeBSD__
+	fbt->fbtp_ctl = lf;
+	fbt->fbtp_loadcnt = lf->loadcnt;
+#endif
+#ifdef __NetBSD__
+	fbt->fbtp_ctl = mod;
+#endif
+	fbt->fbtp_rval = DTRACE_INVOP_PUSHL_EBP;
+	fbt->fbtp_savedval = *instr;
+	fbt->fbtp_patchval = FBT_PATCHVAL;
+	fbt->fbtp_symindx = symindx;
+
+	fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
+	fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
+#ifdef __FreeBSD__
+	lf->fbt_nentries++;
+#endif
+
+	retfbt = NULL;
+again:
+	if (instr >= limit)
+		return (0);
+
+	/*
+	 * If this disassembly fails, then we've likely walked off into
+	 * a jump table or some other unsuitable area.  Bail out of the
+	 * disassembly now.
+	 */
+	if ((size = dtrace_instr_size(instr)) <= 0)
+		return (0);
+
+#ifdef __amd64__
+	/*
+	 * We only instrument "ret" on amd64 -- we don't yet instrument
+	 * ret imm16, largely because the compiler doesn't seem to
+	 * (yet) emit them in the kernel...
+	 */
+	if (*instr != FBT_RET) {
+		instr += size;
+		goto again;
+	}
+#else
+	if (!(size == 1 &&
+	    (*instr == FBT_POPL_EBP || *instr == FBT_LEAVE) &&
+	    (*(instr + 1) == FBT_RET ||
+	    *(instr + 1) == FBT_RET_IMM16))) {
+		instr += size;
+		goto again;
+	}
+#endif
+
+	/*
+	 * We (desperately) want to avoid erroneously instrumenting a
+	 * jump table, especially given that our markers are pretty
+	 * short:  two bytes on x86, and just one byte on amd64.  To
+	 * determine if we're looking at a true instruction sequence
+	 * or an inline jump table that happens to contain the same
+	 * byte sequences, we resort to some heuristic sleaze:  we
+	 * treat this instruction as being contained within a pointer,
+	 * and see if that pointer points to within the body of the
+	 * function.  If it does, we refuse to instrument it.
+	 */
+	for (j = 0; j < sizeof (uintptr_t); j++) {
+		caddr_t check = (caddr_t) instr - j;
+		uint8_t *ptr;
+
+		if (check < (caddr_t)value)
+			break;
+
+		if (check + sizeof (caddr_t) > (caddr_t)limit)
+			continue;
+
+		ptr = *(uint8_t **)check;
+
+		if (ptr >= (uint8_t *) value && ptr < limit) {
+			instr += size;
+			goto again;
+		}
+	}
+
+	/*
+	 * We have a winner!
+	 */
+	fbt = kmem_zalloc(sizeof (*fbt), KM_SLEEP);
+	fbt->fbtp_name = name;
+
+	if (retfbt == NULL) {
+		fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
+		    name, FBT_RETURN, 3, fbt);
+	} else {
+		retfbt->fbtp_next = fbt;
+		fbt->fbtp_id = retfbt->fbtp_id;
+	}
+
+	retfbt = fbt;
+	fbt->fbtp_patchpoint = instr;
+#ifdef __FreeBSD__
+	fbt->fbtp_ctl = lf;
+	fbt->fbtp_loadcnt = lf->loadcnt;
+#endif
+#ifdef __NetBSD__
+	fbt->fbtp_ctl = mod;
+#endif
+	fbt->fbtp_symindx = symindx;
+
+#ifndef __amd64__
+	if (*instr == FBT_POPL_EBP) {
+		fbt->fbtp_rval = DTRACE_INVOP_POPL_EBP;
+	} else {
+		ASSERT(*instr == FBT_LEAVE);
+		fbt->fbtp_rval = DTRACE_INVOP_LEAVE;
+	}
+	fbt->fbtp_roffset =
+	    (uintptr_t)(instr - (uint8_t *) value) + 1;
+
+#else
+	ASSERT(*instr == FBT_RET);
+	fbt->fbtp_rval = DTRACE_INVOP_RET;
+	fbt->fbtp_roffset =
+		(uintptr_t)(instr - (uint8_t *) value);
+#endif
+
+	fbt->fbtp_savedval = *instr;
+	fbt->fbtp_patchval = FBT_PATCHVAL;
+	fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
+	fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
+
+#ifdef __FreeBSD__
+	lf->fbt_nentries++;
+#endif
+
+	instr += size;
+	goto again;
+}
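
The jump-table guard above deserves restating on its own: before
patching a candidate return sequence, the provider re-reads the
surrounding bytes at every possible pointer alignment and refuses to
instrument if any such pointer would point back into the function
body.  A userland restatement of the check, where func and size stand
in for the symbol's value and size and memcpy() performs the unaligned
load:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

static bool
looks_like_jump_table(const uint8_t *instr, const uint8_t *func,
    size_t size)
{
	const uint8_t *limit = func + size;
	uintptr_t ptr;

	for (size_t j = 0; j < sizeof(uintptr_t); j++) {
		const uint8_t *check = instr - j;

		if (check < func)
			break;
		if (check + sizeof(uintptr_t) > limit)
			continue;
		memcpy(&ptr, check, sizeof(ptr));
		if (ptr >= (uintptr_t)func && ptr < (uintptr_t)limit)
			return true;	/* refuse to instrument */
	}
	return false;
}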
Index: src/external/cddl/osnet/dev/fbt/x86/fbt_isa.h
===================================================================
RCS file: src/external/cddl/osnet/dev/fbt/x86/fbt_isa.h
diff -N src/external/cddl/osnet/dev/fbt/x86/fbt_isa.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dev/fbt/x86/fbt_isa.h	19 Apr 2017 17:15:48 -0000
@@ -0,0 +1,32 @@
+/*	$NetBSD$	*/
+
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD: head/sys/cddl/dev/fbt/x86/fbt_isa.h 270067 2014-08-16 21:42:55Z markj $
+ *
+ */
+
+#ifndef _FBT_ISA_H_
+#define _FBT_ISA_H_
+
+typedef uint8_t fbt_patchval_t;
+
+#endif
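
The two i386 entry patterns matched by the provider exist because
"movl %esp, %ebp" has two legal encodings, 8b ec and 89 e5 (the
direction-bit twins), and different assemblers pick different ones.  A
byte-level sketch of the entry match:

#include <stdbool.h>
#include <stdint.h>

#define	PUSHL_EBP	0x55

static bool
is_fbt_entry(const uint8_t *p)
{
	if (p[0] != PUSHL_EBP)
		return false;
	/* movl %esp, %ebp: either 8b ec or 89 e5. */
	return (p[1] == 0x8b && p[2] == 0xec) ||
	    (p[1] == 0x89 && p[2] == 0xe5);
}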
Index: src/external/cddl/osnet/dev/lockstat/lockstat.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/lockstat/lockstat.c,v
retrieving revision 1.8
diff -u -p -r1.8 lockstat.c
--- src/external/cddl/osnet/dev/lockstat/lockstat.c	9 Mar 2015 01:42:26 -0000	1.8
+++ src/external/cddl/osnet/dev/lockstat/lockstat.c	20 Apr 2017 14:17:11 -0000
@@ -29,6 +29,7 @@
 __KERNEL_RCSID(0, "$NetBSD: lockstat.c,v 1.8 2015/03/09 01:42:26 christos Exp $");
 
 #include <sys/types.h>
+#include <sys/proc.h>
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/dtrace.h>
@@ -41,8 +42,6 @@ __KERNEL_RCSID(0, "$NetBSD: lockstat.c,v
 #define NLOCKSTAT 1
 #include <dev/lockstat.h>
 
-#define	ASSERT	KASSERT
-
 typedef struct lockstat_probe {
 	const char	*lsp_func;
 	const char	*lsp_name;
@@ -100,7 +99,7 @@ lockstat_disable(void *arg, dtrace_id_t 
 
 /*ARGSUSED*/
 static void
-lockstat_provide(void *arg, const dtrace_probedesc_t *desc)
+lockstat_provide(void *arg, dtrace_probedesc_t *desc)
 {
 	int i = 0;
 
Index: src/external/cddl/osnet/dev/profile/profile.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/profile/profile.c,v
retrieving revision 1.7
diff -u -p -r1.7 profile.c
--- src/external/cddl/osnet/dev/profile/profile.c	7 Jan 2017 21:39:52 -0000	1.7
+++ src/external/cddl/osnet/dev/profile/profile.c	6 May 2017 23:59:31 -0000
@@ -22,7 +22,7 @@
  *
  * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
  *
- * $FreeBSD: src/sys/cddl/dev/profile/profile.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/dev/profile/profile.c 300618 2016-05-24 16:41:37Z br $
  *
  */
 
@@ -44,7 +44,9 @@
 #include <sys/kernel.h>
 #include <sys/kmem.h>
 #include <sys/kthread.h>
-#include <sys/syslimits.h>
+#ifdef __FreeBSD__
+#include <sys/limits.h>
+#endif
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
@@ -55,17 +57,22 @@
 #include <sys/selinfo.h>
 #ifdef __FreeBSD__
 #include <sys/smp.h>
+#include <sys/sysctl.h>
 #endif
 #include <sys/uio.h>
 #include <sys/unistd.h>
+#ifdef __FreeBSD__
+#include <machine/cpu.h>
+#include <machine/stdarg.h>
+#endif
 
 #ifdef __NetBSD__
+#include <sys/syslimits.h>
 #include <sys/atomic.h>
 #include <sys/cpu.h>
-#define ASSERT(x) KASSERT(x)
+#include <sys/cyclic.h>
 #endif
 
-#include <sys/cyclic.h>
 #include <sys/dtrace.h>
 #include <sys/dtrace_bsd.h>
 
@@ -109,7 +116,7 @@
  */
 #ifdef __FreeBSD__
 #ifdef __amd64
-#define	PROF_ARTIFICIAL_FRAMES	7
+#define	PROF_ARTIFICIAL_FRAMES	10
 #else
 #ifdef __i386
 #define	PROF_ARTIFICIAL_FRAMES	6
@@ -123,8 +130,39 @@
 #endif
 #endif
 #endif
+
+#ifdef __mips
+/*
+ * This value is bogus; it exists only to make the module compile on mips.
+ */
+#define	PROF_ARTIFICIAL_FRAMES	3
+#endif
+
+#ifdef __powerpc__
+/*
+ * This value is bogus; it exists only to make the module compile on powerpc.
+ */
+#define	PROF_ARTIFICIAL_FRAMES	3
+#endif
+
+struct profile_probe_percpu;
+
+#ifdef __arm__
+#define	PROF_ARTIFICIAL_FRAMES	3
+#endif
+
+#ifdef __aarch64__
+/* TODO: verify */
+#define	PROF_ARTIFICIAL_FRAMES	10
+#endif
+
+#ifdef __riscv__
+/* TODO: verify */
+#define	PROF_ARTIFICIAL_FRAMES	10
 #endif
 
+#endif /* __FreeBSD__ */
+
 #ifdef __NetBSD__
 #define	PROF_ARTIFICIAL_FRAMES	3
 #endif
@@ -133,14 +176,25 @@ typedef struct profile_probe {
 	char		prof_name[PROF_NAMELEN];
 	dtrace_id_t	prof_id;
 	int		prof_kind;
+#if defined(illumos) || defined(__NetBSD__)
 	hrtime_t	prof_interval;
 	cyclic_id_t	prof_cyclic;
+#endif
+#ifdef __FreeBSD__
+	sbintime_t	prof_interval;
+	struct callout	prof_cyclic;
+	sbintime_t	prof_expected;
+	struct profile_probe_percpu **prof_pcpus;
+#endif
 } profile_probe_t;
 
 typedef struct profile_probe_percpu {
 	hrtime_t	profc_expected;
 	hrtime_t	profc_interval;
 	profile_probe_t	*profc_probe;
+#ifdef __FreeBSD__
+	struct callout	profc_cyclic;
+#endif
 } profile_probe_percpu_t;
 
 #ifdef __FreeBSD__
@@ -152,7 +206,7 @@ static void	profile_destroy(void *, dtra
 static int	profile_enable(void *, dtrace_id_t, void *);
 static void	profile_disable(void *, dtrace_id_t, void *);
 static void	profile_load(void *);
-static void	profile_provide(void *, const dtrace_probedesc_t *);
+static void	profile_provide(void *, dtrace_probedesc_t *);
 
 static int profile_rates[] = {
     97, 199, 499, 997, 1999,
@@ -213,8 +267,105 @@ static struct cdev		*profile_cdev;
 #endif
 static dtrace_provider_id_t	profile_id;
 static hrtime_t			profile_interval_min = NANOSEC / 5000;	/* 5000 hz */
-static int			profile_aframes = 0;			/* override */
+static int			profile_aframes = PROF_ARTIFICIAL_FRAMES;
+
+#ifdef __FreeBSD__
+SYSCTL_DECL(_kern_dtrace);
+SYSCTL_NODE(_kern_dtrace, OID_AUTO, profile, CTLFLAG_RD, 0, "DTrace profile parameters");
+SYSCTL_INT(_kern_dtrace_profile, OID_AUTO, aframes, CTLFLAG_RW, &profile_aframes,
+    0, "Skipped frames for profile provider");
+
+static sbintime_t
+nsec_to_sbt(hrtime_t nsec)
+{
+	time_t sec;
+
+	/*
+	 * We need to calculate nsec * 2^32 / 10^9
+	 * Seconds and nanoseconds are split to avoid overflow.
+	 */
+	sec = nsec / NANOSEC;
+	nsec = nsec % NANOSEC;
+	return (((sbintime_t)sec << 32) | ((sbintime_t)nsec << 32) / NANOSEC);
+}
+
+static hrtime_t
+sbt_to_nsec(sbintime_t sbt)
+{
+
+	return ((sbt >> 32) * NANOSEC +
+	    (((uint32_t)sbt * (hrtime_t)NANOSEC) >> 32));
+}
+
+static void
+profile_fire(void *arg)
+{
+	profile_probe_percpu_t *pcpu = arg;
+	profile_probe_t *prof = pcpu->profc_probe;
+	hrtime_t late;
+	struct trapframe *frame;
+	uintfptr_t pc, upc;
+
+#ifdef illumos
+	late = gethrtime() - pcpu->profc_expected;
+#else
+	late = sbt_to_nsec(sbinuptime() - pcpu->profc_expected);
+#endif
+
+	pc = 0;
+	upc = 0;
+
+	/*
+	 * td_intr_frame can be unset if this is a catch up event
+	 * after waking up from idle sleep.
+	 * This can only happen on a CPU idle thread.
+	 */
+	frame = curthread->td_intr_frame;
+	if (frame != NULL) {
+		if (TRAPF_USERMODE(frame))
+			upc = TRAPF_PC(frame);
+		else
+			pc = TRAPF_PC(frame);
+	}
+	dtrace_probe(prof->prof_id, pc, upc, late, 0, 0);
+
+	pcpu->profc_expected += pcpu->profc_interval;
+	callout_schedule_sbt_curcpu(&pcpu->profc_cyclic,
+	    pcpu->profc_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
+}
+
+static void
+profile_tick(void *arg)
+{
+	profile_probe_t *prof = arg;
+	struct trapframe *frame;
+	uintfptr_t pc, upc;
+
+	pc = 0;
+	upc = 0;
+
+	/*
+	 * td_intr_frame can be unset if this is a catch up event
+	 * after waking up from idle sleep.
+	 * This can only happen on a CPU idle thread.
+	 */
+	frame = curthread->td_intr_frame;
+	if (frame != NULL) {
+		if (TRAPF_USERMODE(frame))
+			upc = TRAPF_PC(frame);
+		else
+			pc = TRAPF_PC(frame);
+	}
+	dtrace_probe(prof->prof_id, pc, upc, 0, 0, 0);
+
+	prof->prof_expected += prof->prof_interval;
+	callout_schedule_sbt(&prof->prof_cyclic,
+	    prof->prof_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
+}
+
+#endif
 
+#ifdef __NetBSD__
 static void
 profile_fire(void *arg)
 {
@@ -240,6 +391,8 @@ profile_tick(void *arg)
 	    c->cpu_profile_upc, 0, 0, 0);
 }
 
+#endif
+
 static void
 profile_create(hrtime_t interval, char *name, int kind)
 {
@@ -259,24 +412,29 @@ profile_create(hrtime_t interval, char *
 
 	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
 	(void) strcpy(prof->prof_name, name);
+#ifdef __FreeBSD__
+	prof->prof_interval = nsec_to_sbt(interval);
+	callout_init(&prof->prof_cyclic, 1);
+#else
 	prof->prof_interval = interval;
 	prof->prof_cyclic = CYCLIC_NONE;
+#endif
 	prof->prof_kind = kind;
 	prof->prof_id = dtrace_probe_create(profile_id,
 	    NULL, NULL, name,
-	    profile_aframes ? profile_aframes : PROF_ARTIFICIAL_FRAMES, prof);
+	    profile_aframes, prof);
 }
 
 /*ARGSUSED*/
 static void
-profile_provide(void *arg, const dtrace_probedesc_t *desc)
+profile_provide(void *arg, dtrace_probedesc_t *desc)
 {
 	int i, j, rate, kind;
 	hrtime_t val = 0, mult = 1, len = 0;
 	char *name, *suffix = NULL;
 
 	const struct {
-		const char *prefix;
+		char *prefix;
 		int kind;
 	} types[] = {
 		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
@@ -285,7 +443,7 @@ profile_provide(void *arg, const dtrace_
 	};
 
 	const struct {
-		const char *name;
+		char *name;
 		hrtime_t mult;
 	} suffixes[] = {
 		{ "ns", 	NANOSEC / NANOSEC },
@@ -333,7 +491,7 @@ profile_provide(void *arg, const dtrace_
 		return;
 	}
 
-	name = (char *)desc->dtpd_name;
+	name = desc->dtpd_name;
 
 	for (i = 0; types[i].prefix != NULL; i++) {
 		len = strlen(types[i].prefix);
@@ -405,13 +563,19 @@ profile_destroy(void *arg, dtrace_id_t i
 {
 	profile_probe_t *prof = parg;
 
+#ifdef __FreeBSD__
+	ASSERT(!callout_active(&prof->prof_cyclic) && prof->prof_pcpus == NULL);
+#else
 	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
+#endif
 	kmem_free(prof, sizeof (profile_probe_t));
 
 	ASSERT(profile_total >= 1);
 	atomic_add_32(&profile_total, -1);
 }
 
+#ifndef __FreeBSD__
+
 /*ARGSUSED*/
 static void
 profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
@@ -488,6 +652,81 @@ profile_disable(void *arg, dtrace_id_t i
 	prof->prof_cyclic = CYCLIC_NONE;
 }
 
+#else
+
+static void
+profile_enable_omni(profile_probe_t *prof)
+{
+	profile_probe_percpu_t *pcpu;
+	int cpu;
+
+	prof->prof_pcpus = kmem_zalloc((mp_maxid + 1) * sizeof(pcpu), KM_SLEEP);
+	CPU_FOREACH(cpu) {
+		pcpu = kmem_zalloc(sizeof(profile_probe_percpu_t), KM_SLEEP);
+		prof->prof_pcpus[cpu] = pcpu;
+		pcpu->profc_probe = prof;
+		pcpu->profc_expected = sbinuptime() + prof->prof_interval;
+		pcpu->profc_interval = prof->prof_interval;
+		callout_init(&pcpu->profc_cyclic, 1);
+		callout_reset_sbt_on(&pcpu->profc_cyclic,
+		    pcpu->profc_expected, 0, profile_fire, pcpu,
+		    cpu, C_DIRECT_EXEC | C_ABSOLUTE);
+	}
+}
+
+static void
+profile_disable_omni(profile_probe_t *prof)
+{
+	profile_probe_percpu_t *pcpu;
+	int cpu;
+
+	ASSERT(prof->prof_pcpus != NULL);
+	CPU_FOREACH(cpu) {
+		pcpu = prof->prof_pcpus[cpu];
+		ASSERT(pcpu->profc_probe == prof);
+		ASSERT(callout_active(&pcpu->profc_cyclic));
+		callout_stop(&pcpu->profc_cyclic);
+		callout_drain(&pcpu->profc_cyclic);
+		kmem_free(pcpu, sizeof(profile_probe_percpu_t));
+	}
+	kmem_free(prof->prof_pcpus, (mp_maxid + 1) * sizeof(pcpu));
+	prof->prof_pcpus = NULL;
+}
+
+/* ARGSUSED */
+static void
+profile_enable(void *arg, dtrace_id_t id, void *parg)
+{
+	profile_probe_t *prof = parg;
+
+	if (prof->prof_kind == PROF_TICK) {
+		prof->prof_expected = sbinuptime() + prof->prof_interval;
+		callout_reset_sbt(&prof->prof_cyclic,
+		    prof->prof_expected, 0, profile_tick, prof,
+		    C_DIRECT_EXEC | C_ABSOLUTE);
+	} else {
+		ASSERT(prof->prof_kind == PROF_PROFILE);
+		profile_enable_omni(prof);
+	}
+}
+
+/* ARGSUSED */
+static void
+profile_disable(void *arg, dtrace_id_t id, void *parg)
+{
+	profile_probe_t *prof = parg;
+
+	if (prof->prof_kind == PROF_TICK) {
+		ASSERT(callout_active(&prof->prof_cyclic));
+		callout_stop(&prof->prof_cyclic);
+		callout_drain(&prof->prof_cyclic);
+	} else {
+		ASSERT(prof->prof_kind == PROF_PROFILE);
+		profile_disable_omni(prof);
+	}
+}
+#endif
+
 static void
 profile_load(void *dummy)
 {
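
The FreeBSD side of the profile changes keeps intervals as sbintime_t,
a 32.32 fixed-point format with whole seconds in the high word and a
binary fraction in the low word; splitting seconds from nanoseconds
keeps nsec * 2^32 from overflowing 64 bits.  A userland mirror of the
nsec_to_sbt()/sbt_to_nsec() helpers from the hunk above, with int64_t
standing in for sbintime_t and hrtime_t:

#include <inttypes.h>
#include <stdio.h>

#define	NANOSEC	1000000000LL

static int64_t
nsec_to_sbt(int64_t nsec)
{
	int64_t sec = nsec / NANOSEC;

	/* Split to keep (nsec << 32) inside 64 bits. */
	nsec %= NANOSEC;
	return (sec << 32) | ((nsec << 32) / NANOSEC);
}

static int64_t
sbt_to_nsec(int64_t sbt)
{
	return (sbt >> 32) * NANOSEC +
	    (((uint32_t)sbt * (int64_t)NANOSEC) >> 32);
}

int
main(void)
{
	int64_t interval = NANOSEC / 5000;	/* the 5000 hz floor */

	printf("sbt %#" PRIx64 ", back %" PRId64 " ns\n",
	    nsec_to_sbt(interval), sbt_to_nsec(nsec_to_sbt(interval)));
	return 0;
}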
Index: src/external/cddl/osnet/dev/sdt/sdt.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/sdt/sdt.c,v
retrieving revision 1.18
diff -u -p -r1.18 sdt.c
--- src/external/cddl/osnet/dev/sdt/sdt.c	7 Jan 2017 21:39:52 -0000	1.18
+++ src/external/cddl/osnet/dev/sdt/sdt.c	20 Apr 2017 13:42:05 -0000
@@ -20,7 +20,7 @@
  *
  * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
  *
- * $FreeBSD: head/sys/cddl/dev/sdt/sdt.c 285703 2015-07-19 22:14:09Z markj $
+ * $FreeBSD: head/sys/cddl/dev/sdt/sdt.c 297771 2016-04-10 01:24:27Z markj $
  *
  */
 
@@ -42,6 +42,7 @@
 __KERNEL_RCSID(0, "$NetBSD: sdt.c,v 1.18 2017/01/07 21:39:52 christos Exp $");
 
 #include <sys/cdefs.h>
+#include <sys/proc.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 
@@ -72,7 +73,7 @@ __KERNEL_RCSID(0, "$NetBSD: sdt.c,v 1.18
 
 /* DTrace methods. */
 static void	sdt_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *);
-static void	sdt_provide_probes(void *, const dtrace_probedesc_t *);
+static void	sdt_provide_probes(void *, dtrace_probedesc_t *);
 static void	sdt_destroy(void *, dtrace_id_t, void *);
 static int	sdt_enable(void *, dtrace_id_t, void *);
 static void	sdt_disable(void *, dtrace_id_t, void *);
@@ -115,7 +116,7 @@ static dtrace_pops_t sdt_pops = {
 static int
 sdt_open(dev_t dev, int flags, int mode, struct lwp *l)
 {
-	return (0);
+	return 0;
 }
 
 static const struct cdevsw sdt_cdevsw = {
@@ -137,8 +138,8 @@ static const struct cdevsw sdt_cdevsw = 
 static TAILQ_HEAD(, sdt_provider) sdt_prov_list;
 
 #ifdef __FreeBSD__
-eventhandler_tag	sdt_kld_load_tag;
-eventhandler_tag	sdt_kld_unload_try_tag;
+static eventhandler_tag	sdt_kld_load_tag;
+static eventhandler_tag	sdt_kld_unload_try_tag;
 #endif
 
 #ifdef __NetBSD__
@@ -191,6 +192,12 @@ sdt_create_probe(struct sdt_probe *probe
 	char *to;
 	size_t len;
 
+	if (probe->version != (int)sizeof(*probe)) {
+		printf("ignoring probe %p, version %d expected %zu\n",
+		    probe, probe->version, sizeof(*probe));
+		return;
+	}
+
 	TAILQ_FOREACH(prov, &sdt_prov_list, prov_entry)
 		if (strcmp(prov->name, probe->prov->name) == 0)
 			break;
@@ -214,6 +221,8 @@ sdt_create_probe(struct sdt_probe *probe
 	 * in the C compiler, so we have to respect const vs non-const.
 	 */
 	strlcpy(func, probe->func, sizeof(func));
+	if (func[0] == '\0')
+		strcpy(func, "none");
 
 	from = probe->name;
 	to = name;
@@ -239,7 +248,7 @@ sdt_create_probe(struct sdt_probe *probe
  * requires one of provide_probes and provide_module to be defined.
  */
 static void
-sdt_provide_probes(void *arg, const dtrace_probedesc_t *desc)
+sdt_provide_probes(void *arg, dtrace_probedesc_t *desc)
 {
 }
 
@@ -248,10 +257,6 @@ sdt_enable(void *arg __unused, dtrace_id
 {
 	struct sdt_probe *probe = parg;
 
-#ifdef SDT_DEBUG
-	printf("sdt: %s\n", __func__);
-#endif
-
 	probe->id = id;
 #ifdef __FreeBSD__
 	probe->sdtp_lf->nenabled++;
@@ -268,13 +273,6 @@ sdt_disable(void *arg __unused, dtrace_i
 
 #ifdef __FreeBSD__
 	SDT_KASSERT(probe->sdtp_lf->nenabled > 0, ("no probes enabled"));
-#endif
-
-#ifdef SDT_DEBUG
-	printf("sdt: %s\n", __func__);
-#endif
-
-#ifdef __FreeBSD__
 	if (strcmp(probe->prov->name, "lockstat") == 0)
 		lockstat_enabled--;
 	probe->sdtp_lf->nenabled--;
@@ -288,16 +286,6 @@ sdt_getargdesc(void *arg, dtrace_id_t id
 	struct sdt_argtype *argtype;
 	struct sdt_probe *probe = parg;
 
-#ifdef SDT_DEBUG
-	printf("sdt: %s probe %d\n", __func__, id);
-	printf("%s: probe %d (%s:%s:%s:%s).%d\n",
-		__func__, id,
-		probe->provider,
-		probe->module,
-		probe->function,
-		probe->name,
-		desc->dtargd_ndx);
-#endif
 	if (desc->dtargd_ndx >= probe->n_args) {
 		desc->dtargd_ndx = DTRACE_ARGNONE;
 		return;
@@ -488,12 +476,19 @@ sdt_load(void)
 {
 	TAILQ_INIT(&sdt_prov_list);
 
-	sdt_init(dtrace_probe);
 #ifdef __FreeBSD__
+	sdt_probe_func = dtrace_probe;
+
+	sdt_kld_load_tag = EVENTHANDLER_REGISTER(kld_load, sdt_kld_load, NULL,
+	    EVENTHANDLER_PRI_ANY);
+	sdt_kld_unload_try_tag = EVENTHANDLER_REGISTER(kld_unload_try,
+	    sdt_kld_unload_try, NULL, EVENTHANDLER_PRI_ANY);
+
 	/* Pick up probes from the kernel and already-loaded linker files. */
 	linker_file_foreach(sdt_linker_file_cb, NULL);
 #endif
 #ifdef __NetBSD__
+	sdt_init(dtrace_probe);
 	sdt_link_set_load();
 #endif
 }
@@ -504,53 +499,51 @@ sdt_unload(void)
 	struct sdt_provider *prov, *tmp;
 	int ret;
 
-	sdt_exit();
+#ifdef __FreeBSD__
+	EVENTHANDLER_DEREGISTER(kld_load, sdt_kld_load_tag);
+	EVENTHANDLER_DEREGISTER(kld_unload_try, sdt_kld_unload_try_tag);
+
+	sdt_probe_func = sdt_probe_stub;
+#endif
 
 #ifdef __NetBSD__
+	sdt_exit();
+
 	sdt_link_set_unload();
 #endif
 	TAILQ_FOREACH_SAFE(prov, &sdt_prov_list, prov_entry, tmp) {
 		ret = dtrace_unregister(prov->id);
 		if (ret != 0)
-			return ret;
+			return (ret);
 		TAILQ_REMOVE(&sdt_prov_list, prov, prov_entry);
 		free(__UNCONST(prov->name), M_SDT);
 		free(prov, M_SDT);
 	}
 
-	return 0;
+	return (0);
 }
 
 #ifdef __FreeBSD__
 static int
 sdt_modevent(module_t mod __unused, int type, void *data __unused)
 {
-	int error = 0;
 
 	switch (type) {
 	case MOD_LOAD:
-		sdt_load();
-		break;
-
 	case MOD_UNLOAD:
-		error = sdt_unload();
-		break;
-
 	case MOD_SHUTDOWN:
-		break;
-
+		return (0);
 	default:
-		error = EOPNOTSUPP;
-		break;
+		return (EOPNOTSUPP);
 	}
-
-	return (error);
 }
 
+SYSINIT(sdt_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, sdt_load, NULL);
+SYSUNINIT(sdt_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, sdt_unload, NULL);
+
 DEV_MODULE(sdt, sdt_modevent, NULL);
 MODULE_VERSION(sdt, 1);
 MODULE_DEPEND(sdt, dtrace, 1, 1, 1);
-MODULE_DEPEND(sdt, opensolaris, 1, 1, 1);
 #endif
 
 #ifdef __NetBSD__
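
The version test added to sdt_create_probe() is the usual
sizeof-as-ABI-version trick: each probe record carries the sizeof() of
its own struct from the compilation that produced it, and the provider
skips records built against a different layout rather than misreading
them.  A minimal illustration; struct sdt_probe_v is a hypothetical
stand-in, not the real struct sdt_probe:

#include <stdbool.h>
#include <stdio.h>

struct sdt_probe_v {
	int		version;	/* sizeof(struct sdt_probe_v) */
	const char	*name;
};

static bool
probe_version_ok(const struct sdt_probe_v *p)
{
	if (p->version != (int)sizeof(*p)) {
		printf("ignoring probe %s, version %d expected %zu\n",
		    p->name, p->version, sizeof(*p));
		return false;
	}
	return true;
}

int
main(void)
{
	struct sdt_probe_v ok = { (int)sizeof(struct sdt_probe_v), "demo" };
	struct sdt_probe_v stale = { 12, "stale" };

	return (probe_version_ok(&ok) && !probe_version_ok(&stale)) ? 0 : 1;
}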
Index: src/external/cddl/osnet/dev/systrace/systrace.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/systrace/systrace.c,v
retrieving revision 1.9
diff -u -p -r1.9 systrace.c
--- src/external/cddl/osnet/dev/systrace/systrace.c	7 Jan 2017 21:39:52 -0000	1.9
+++ src/external/cddl/osnet/dev/systrace/systrace.c	20 Apr 2017 14:31:12 -0000
@@ -22,8 +22,6 @@
  *
  * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
  *
- * $FreeBSD: src/sys/cddl/dev/systrace/systrace.c,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $
- *
  */
 
 /*
@@ -32,6 +30,9 @@
  */
 
 #include <sys/cdefs.h>
+/* __FBSDID("$FreeBSD: head/sys/cddl/dev/systrace/systrace.c 306220 2016-09-22 23:22:53Z markj $"); */
+
+#include <sys/proc.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
@@ -55,12 +56,82 @@
 #include <sys/unistd.h>
 
 #include <sys/dtrace.h>
+#include "dtrace_cddl.h"
 
 #include "emultrace.h"
 
 #define	CONCAT(x,y)	__CONCAT(x,y)
 #define	STRING(s)	__STRING(s)
 
+#ifdef __FreeBSD__
+#ifdef LINUX_SYSTRACE
+#if defined(__amd64__)
+#include <amd64/linux/linux.h>
+#include <amd64/linux/linux_proto.h>
+#include <amd64/linux/linux_syscalls.c>
+#include <amd64/linux/linux_systrace_args.c>
+#elif defined(__i386__)
+#include <i386/linux/linux.h>
+#include <i386/linux/linux_proto.h>
+#include <i386/linux/linux_syscalls.c>
+#include <i386/linux/linux_systrace_args.c>
+#else
+#error Only i386 and amd64 are supported.
+#endif
+#define	MODNAME		"linux"
+extern struct sysent linux_sysent[];
+#define	MAXSYSCALL	LINUX_SYS_MAXSYSCALL
+#define	SYSCALLNAMES	linux_syscallnames
+#define	SYSENT		linux_sysent
+#elif defined(LINUX32_SYSTRACE)
+#if defined(__amd64__)
+#include <amd64/linux32/linux.h>
+#include <amd64/linux32/linux32_proto.h>
+#include <amd64/linux32/linux32_syscalls.c>
+#include <amd64/linux32/linux32_systrace_args.c>
+#else
+#error Only amd64 is supported.
+#endif
+#define	MODNAME		"linux32"
+extern struct sysent linux32_sysent[];
+#define	MAXSYSCALL	LINUX32_SYS_MAXSYSCALL
+#define	SYSCALLNAMES	linux32_syscallnames
+#define	SYSENT		linux32_sysent
+#elif defined(FREEBSD32_SYSTRACE)
+/*
+ * The syscall arguments are processed into a DTrace argument array
+ * using a generated function. See sys/kern/makesyscalls.sh.
+ */
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_util.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_systrace_args.c>
+extern const char *freebsd32_syscallnames[];
+#define	MODNAME		"freebsd32"
+#define	MAXSYSCALL	FREEBSD32_SYS_MAXSYSCALL
+#define	SYSCALLNAMES	freebsd32_syscallnames
+#define	SYSENT		freebsd32_sysent
+#else
+/*
+ * The syscall arguments are processed into a DTrace argument array
+ * using a generated function. See sys/kern/makesyscalls.sh.
+ */
+#include <sys/syscall.h>
+#include <kern/systrace_args.c>
+#define	MODNAME		"freebsd"
+#define	MAXSYSCALL	SYS_MAXSYSCALL
+#define	SYSCALLNAMES	syscallnames
+#define	SYSENT		sysent
+#define	NATIVE_ABI
+#endif
+
+#define	PROVNAME	"syscall"
+#define	DEVNAME	        "dtrace/systrace/" MODNAME
+#endif /* __FreeBSD__ */
+
+#ifdef __NetBSD__
+#include <sys/syscallargs.h>
+
 #ifndef NATIVE
 extern const char	* const CONCAT(emulname,_syscallnames)[];
 extern const char	* const CONCAT(alt,CONCAT(emulname,_syscallnames))[];
@@ -87,6 +158,8 @@ extern const char	* const altsyscallname
 #define	MODCMD		CONCAT(MODNAME,_modcmd)
 #define EMUL		CONCAT(emul_,emulname)
 extern struct emul 	EMUL;
+#define curthread	curlwp
+#endif /* __NetBSD__ */
 
 #define	SYSTRACE_ARTIFICIAL_FRAMES	1
 
@@ -102,12 +175,20 @@ extern struct emul 	EMUL;
 
 static int	systrace_unload(void);
 static void	systrace_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *);
-static void	systrace_provide(void *, const dtrace_probedesc_t *);
+static uint64_t	systrace_getargval(void *, dtrace_id_t, void *, int, int);
+static void	systrace_provide(void *, dtrace_probedesc_t *);
 static void	systrace_destroy(void *, dtrace_id_t, void *);
 static int	systrace_enable(void *, dtrace_id_t, void *);
 static void	systrace_disable(void *, dtrace_id_t, void *);
 static void	systrace_load(void *);
 
+#ifdef __FreeBSD__
+static union {
+	const char	**p_constnames;
+	char		**pp_syscallnames;
+} uglyhack = { SYSCALLNAMES };
+#endif
+
 static dtrace_pattr_t systrace_attr = {
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
@@ -124,7 +205,7 @@ static dtrace_pops_t systrace_pops = {
 	NULL,
 	NULL,
 	systrace_getargdesc,
-	NULL,
+	systrace_getargval,
 	NULL,
 	systrace_destroy
 };
@@ -138,6 +219,57 @@ static dtrace_provider_id_t	systrace_id;
  *       array the syscall comes from. It could be a standard syscall or a
  *       compat syscall from something like Linux.
  */
+#ifdef __FreeBSD__
+#ifdef NATIVE_ABI
+static void
+systrace_probe(struct syscall_args *sa, enum systrace_probe_t type, int retval)
+{
+	uint64_t uargs[nitems(sa->args)];
+	dtrace_id_t id;
+	int n_args, sysnum;
+
+	sysnum = sa->code;
+	memset(uargs, 0, sizeof(uargs));
+
+	if (type == SYSTRACE_ENTRY) {
+		if ((id = sa->callp->sy_entry) == DTRACE_IDNONE)
+			return;
+
+		if (sa->callp->sy_systrace_args_func != NULL)
+			/*
+			 * Convert the syscall parameters using the registered
+			 * function.
+			 */
+			(*sa->callp->sy_systrace_args_func)(sysnum, sa->args,
+			    uargs, &n_args);
+		else
+			/*
+			 * Use the built-in system call argument conversion
+			 * function to translate the syscall structure fields
+			 * into the array of 64-bit values that DTrace expects.
+			 */
+			systrace_args(sysnum, sa->args, uargs, &n_args);
+		/*
+		 * Save probe arguments now so that we can retrieve them if
+		 * the getargval method is called from further down the stack.
+		 */
+		curthread->t_dtrace_systrace_args = uargs;
+	} else {
+		if ((id = sa->callp->sy_return) == DTRACE_IDNONE)
+			return;
+
+		curthread->t_dtrace_systrace_args = NULL;
+		/* Set arg0 and arg1 as the return value of this syscall. */
+		uargs[0] = uargs[1] = retval;
+	}
+
+	/* Process the probe using the converted arguments. */
+	dtrace_probe(id, uargs[0], uargs[1], uargs[2], uargs[3], uargs[4]);
+}
+#endif /* NATIVE_ABI */
+#endif /* __FreeBSD__ */
+
+#ifdef __NetBSD__
 static void
 systrace_probe(uint32_t id, register_t sysnum, const struct sysent *se,
     const void *params, const register_t *ret, int error)
@@ -160,24 +292,47 @@ systrace_probe(uint32_t id, register_t s
 	/* XXX: fix for more arguments! */
 	dtrace_probe(id, uargs[0], uargs[1], uargs[2], uargs[3], uargs[4]);
 }
+#endif
 
 static void
 systrace_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc)
 {
 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
+
 	if (SYSTRACE_ISENTRY((uintptr_t)parg))
-		systrace_entry_setargdesc(sysnum, desc->dtargd_ndx, 
+		systrace_entry_setargdesc(sysnum, desc->dtargd_ndx,
 		    desc->dtargd_native, sizeof(desc->dtargd_native));
 	else
-		systrace_return_setargdesc(sysnum, desc->dtargd_ndx, 
+		systrace_return_setargdesc(sysnum, desc->dtargd_ndx,
 		    desc->dtargd_native, sizeof(desc->dtargd_native));
 
 	if (desc->dtargd_native[0] == '\0')
 		desc->dtargd_ndx = DTRACE_ARGNONE;
 }
 
+static uint64_t
+systrace_getargval(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
+{
+	uint64_t *uargs;
+
+	uargs = curthread->t_dtrace_systrace_args;
+	if (uargs == NULL)
+		/* This is a return probe. */
+		return (0);
+#ifdef __FreeBSD__
+	if (argno >= nitems(((struct syscall_args *)NULL)->args))
+		return (0);
+#endif
+#ifdef __NetBSD__
+	if (argno >= SYS_MAXSYSARGS)
+		return (0);
+#endif
+
+	return (uargs[argno]);
+}
+
 static void
-systrace_provide(void *arg, const dtrace_probedesc_t *desc)
+systrace_provide(void *arg, dtrace_probedesc_t *desc)
 {
 	int i;
 
@@ -185,6 +340,20 @@ systrace_provide(void *arg, const dtrace
 		return;
 
 	for (i = 0; i < MAXSYSCALL; i++) {
+#ifdef __FreeBSD__
+		if (dtrace_probe_lookup(systrace_id, MODNAME,
+		    uglyhack.pp_syscallnames[i], "entry") != 0)
+			continue;
+
+		(void)dtrace_probe_create(systrace_id, MODNAME,
+		    uglyhack.pp_syscallnames[i], "entry",
+		    SYSTRACE_ARTIFICIAL_FRAMES,
+		    (void *)((uintptr_t)SYSTRACE_ENTRY(i)));
+		(void)dtrace_probe_create(systrace_id, MODNAME,
+		    uglyhack.pp_syscallnames[i], "return",
+		    SYSTRACE_ARTIFICIAL_FRAMES,
+		    (void *)((uintptr_t)SYSTRACE_RETURN(i)));
+#else
 		const char *name = ALTSYSCALLNAMES[i] ? ALTSYSCALLNAMES[i] :
 		    SYSCALLNAMES[i];
 		if (dtrace_probe_lookup(systrace_id, NULL, name, "entry") != 0)
@@ -196,6 +365,7 @@ systrace_provide(void *arg, const dtrace
 		(void) dtrace_probe_create(systrace_id, NULL,
 		    name, "return", SYSTRACE_ARTIFICIAL_FRAMES,
 		    (void *)(intptr_t)SYSTRACE_RETURN(i));
+#endif
 	}
 }
 
@@ -222,10 +392,16 @@ systrace_enable(void *arg, dtrace_id_t i
 {
 	int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg);
 
+#ifdef __FreeBSD__
+	if (SYSENT[sysnum].sy_systrace_args_func == NULL)
+		SYSENT[sysnum].sy_systrace_args_func = systrace_args;
+#endif
+
 	if (SYSTRACE_ISENTRY((uintptr_t)parg))
 		SYSENT[sysnum].sy_entry = id;
 	else
 		SYSENT[sysnum].sy_return = id;
+
 	return 0;
 }
 
@@ -241,11 +417,16 @@ systrace_disable(void *arg, dtrace_id_t 
 static void
 systrace_load(void *dummy)
 {
-	if (dtrace_register(PROVNAME, &systrace_attr, DTRACE_PRIV_USER,
-	    NULL, &systrace_pops, NULL, &systrace_id) != 0)
+	if (dtrace_register(PROVNAME, &systrace_attr, DTRACE_PRIV_USER, NULL,
+	    &systrace_pops, NULL, &systrace_id) != 0)
 		return;
 
+#ifdef NATIVE_ABI
+	systrace_probe_func = systrace_probe;
+#endif
+#ifdef __NetBSD__
 	EMUL.e_dtrace_syscall = systrace_probe;
+#endif
 }
 
 
@@ -254,14 +435,80 @@ systrace_unload()
 {
 	int error;
 
+#ifdef NATIVE_ABI
+	systrace_probe_func = NULL;
+#endif
+#ifdef __NetBSD__
+	EMUL.e_dtrace_syscall = NULL;
+#endif
+
 	if ((error = dtrace_unregister(systrace_id)) != 0)
 		return (error);
 
-	EMUL.e_dtrace_syscall = NULL;
-
 	return error;
 }
 
+#ifdef __FreeBSD__
+static int
+systrace_modevent(module_t mod __unused, int type, void *data __unused)
+{
+	int error;
+
+	error = 0;
+	switch (type) {
+	case MOD_LOAD:
+		break;
+
+	case MOD_UNLOAD:
+		break;
+
+	case MOD_SHUTDOWN:
+		break;
+
+	default:
+		error = EOPNOTSUPP;
+		break;
+
+	}
+	return (error);
+}
+
+SYSINIT(systrace_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
+    systrace_load, NULL);
+SYSUNINIT(systrace_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
+    systrace_unload, NULL);
+
+#ifdef LINUX_SYSTRACE
+DEV_MODULE(systrace_linux, systrace_modevent, NULL);
+MODULE_VERSION(systrace_linux, 1);
+#ifdef __amd64__
+MODULE_DEPEND(systrace_linux, linux64, 1, 1, 1);
+#else
+MODULE_DEPEND(systrace_linux, linux, 1, 1, 1);
+#endif
+MODULE_DEPEND(systrace_linux, dtrace, 1, 1, 1);
+MODULE_DEPEND(systrace_linux, opensolaris, 1, 1, 1);
+#elif defined(LINUX32_SYSTRACE)
+DEV_MODULE(systrace_linux32, systrace_modevent, NULL);
+MODULE_VERSION(systrace_linux32, 1);
+MODULE_DEPEND(systrace_linux32, linux, 1, 1, 1);
+MODULE_DEPEND(systrace_linux32, dtrace, 1, 1, 1);
+MODULE_DEPEND(systrace_linux32, opensolaris, 1, 1, 1);
+#elif defined(FREEBSD32_SYSTRACE)
+DEV_MODULE(systrace_freebsd32, systrace_modevent, NULL);
+MODULE_VERSION(systrace_freebsd32, 1);
+MODULE_DEPEND(systrace_freebsd32, dtrace, 1, 1, 1);
+MODULE_DEPEND(systrace_freebsd32, opensolaris, 1, 1, 1);
+#else
+DEV_MODULE(systrace, systrace_modevent, NULL);
+MODULE_VERSION(systrace, 1);
+MODULE_DEPEND(systrace, dtrace, 1, 1, 1);
+MODULE_DEPEND(systrace, opensolaris, 1, 1, 1);
+#endif
+#endif /* __FreeBSD__ */
+
+#ifdef __NetBSD__
+
 static int
 MODCMD(modcmd_t cmd, void *data)
 {
@@ -282,3 +529,5 @@ MODCMD(modcmd_t cmd, void *data)
 }
 
 MODULE(MODULE_CLASS_MISC, MODNAME, MODDEP)
+
+#endif /* __NetBSD__ */
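
systrace_getargval() works because systrace_probe() stashes a pointer
to the converted 64-bit argument array in the current thread before
firing the entry probe, and clears it on return, so a getargval call
made deeper in the same stack can find the arguments again.  A sketch
of that hand-off, with a _Thread_local variable standing in for the
t_dtrace_systrace_args field:

#include <inttypes.h>
#include <stdio.h>

#define	MAXARGS	8

static _Thread_local uint64_t *systrace_args_stash;

static void
probe_entry(uint64_t uargs[MAXARGS])
{
	/* Visible to anything called while the probe fires. */
	systrace_args_stash = uargs;
}

static void
probe_return(void)
{
	/* Return probes have no stashed entry arguments. */
	systrace_args_stash = NULL;
}

static uint64_t
getargval(int argno)
{
	if (systrace_args_stash == NULL || argno >= MAXARGS)
		return 0;
	return systrace_args_stash[argno];
}

int
main(void)
{
	uint64_t args[MAXARGS] = { 42, 7 };

	probe_entry(args);
	printf("arg0 %" PRIu64 "\n", getargval(0));
	probe_return();
	printf("after return %" PRIu64 "\n", getargval(0));
	return 0;
}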
Index: src/external/cddl/osnet/dist/cmd/dtrace/dtrace.1
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/dtrace/dtrace.1,v
retrieving revision 1.3
diff -u -p -r1.3 dtrace.1
--- src/external/cddl/osnet/dist/cmd/dtrace/dtrace.1	12 May 2017 21:01:36 -0000	1.3
+++ src/external/cddl/osnet/dist/cmd/dtrace/dtrace.1	17 May 2017 00:00:52 -0000
@@ -2,7 +2,7 @@
 .\" CDDL HEADER START
 .\"
 .\" The contents of this file are subject to the terms of the
-.\" Common Development and Distribution License (the "License").  
+.\" Common Development and Distribution License (the "License").
 .\" You may not use this file except in compliance with the License.
 .\"
 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
@@ -18,653 +18,660 @@
 .\"
 .\" CDDL HEADER END
 .\" Copyright (c) 2006, Sun Microsystems, Inc. All Rights Reserved.
-.TH dtrace 1 "5 Sep 2006" "SunOS 5.11" "System Administration Commands"
-.SH NAME
-dtrace \- DTrace dynamic tracing compiler and tracing utility
-.SH SYNOPSIS
-.LP
-.nf
-\fBdtrace\fR [\fB-32\fR | \fB-64\fR] [\fB-aACeFGHhlqSvVwZ\fR] [\fB-b\fR \fIbufsz\fR] [\fB-c\fR \fIcmd\fR] 
-    [\fB-D\fR \fIname\fR [\fI=value\fR]] [\fB-I\fR \fIpath\fR] [\fB-L\fR \fIpath\fR] [\fB-o\fR \fIoutput\fR] 
-    [\fB-s\fR \fIscript\fR] [\fB-U\fR \fIname\fR] [\fB-x\fR \fIarg\fR [\fI=val\fR]] 
-    [\fB-X\fR a | c | s | t] [\fB-p\fR \fIpid\fR] 
-    [\fB-P\fR \fIprovider\fR [[\fIpredicate\fR] \fIaction\fR]] 
-    [\fB-m\fR [\fIprovider:\fR] \fImodule\fR [[\fIpredicate\fR] \fIaction\fR]] 
-    [\fB-f\fR [[\fIprovider:\fR] \fImodule:\fR] \fIfunction\fR [[\fIpredicate\fR] \fIaction\fR]] 
-    [\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR [[\fIpredicate\fR] \fIaction\fR]] 
-    [\fB-i\fR \fIprobe-id\fR [[\fIpredicate\fR] \fIaction\fR]]
-.fi
-
-.SH DESCRIPTION
-.sp
-.LP
-DTrace is a comprehensive dynamic tracing framework for the Solaris Operating System. DTrace provides a powerful infrastructure that permits administrators, developers, and service personnel to concisely answer arbitrary questions about the behavior of the operating system and user programs. 
-.sp
-.LP
-The \fISolaris Dynamic Tracing Guide\fR describes how to use DTrace to observe, debug, and tune system behavior. Refer to this book for a detailed description of DTrace features, including the bundled DTrace observability
-tools, instrumentation providers, and the D programming language.
-.sp
-.LP
-The \fBdtrace\fR command provides a generic interface to the essential services provided by the DTrace facility, including:
-.RS +4
-.TP
-.ie t \(bu
-.el o
+.\"
+.\" $FreeBSD: head/cddl/contrib/opensolaris/cmd/dtrace/dtrace.1 281705 2015-04-18 21:00:36Z markj $
+.\"
+.Dd April 18, 2015
+.Dt DTRACE 1
+.Os
+.Sh NAME
+.Nm dtrace
+.Nd dynamic tracing compiler and tracing utility
+.Sh SYNOPSIS
+.Nm
+.Op Fl 32 | Fl 64
+.Op Fl aACeFGhHlqSvVwZ
+.Op Fl b Ar bufsz
+.Op Fl c Ar cmd
+.Op Fl D Ar name Op Ns = Ns value
+.Op Fl I Ar path
+.Op Fl L Ar path
+.Op Fl o Ar output
+.Op Fl s Ar script
+.Op Fl U Ar name
+.Op Fl x Ar arg Op Ns = Ns value
+.Op Fl X Cm a | c | s | t
+.Op Fl p Ar pid
+.Op Fl P Ar provider Oo Oo Ar predicate Oc Ar action Oc
+.Op Fl m Oo Ar provider : Oc Ar module Oo Oo Ar predicate Oc Ar action Oc
+.Op Fl f Oo Oo Ar provider : Oc Ar module : Oc Ar function Oo Oo Ar predicate \
+    Oc Ar action Oc
+.Op Fl n Oo Oo Oo Ar provider : Oc Ar module : Oc Ar function : Oc Ar name \
+    Oo Oo Ar predicate Oc Ar action Oc
+.Op Fl i Ar probe-id Oo Oo Ar predicate Oc Ar action Oc
+.Sh DESCRIPTION
+DTrace is a comprehensive dynamic tracing framework ported from Solaris.
+DTrace provides a powerful infrastructure that permits administrators,
+developers, and service personnel to concisely answer arbitrary questions about
+the behavior of the operating system and user programs.
+.Pp
+The
+.Nm
+command provides a generic interface to the essential services provided by the
+DTrace facility, including:
+.Bl -bullet -offset indent
+.It
 Options that list the set of probes and providers currently published by DTrace
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
-Options that enable probes directly using any of the probe description specifiers (provider, module, function, name)
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
-Options that run the D compiler and compile one or more D program files or programs written directly on the command line
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
+.It
+Options that enable probes directly using any of the probe description
+specifiers (provider, module, function, name)
+.It
+Options that run the D compiler and compile one or more D program files or
+programs written directly on the command line
+.It
 Options that generate anonymous tracing programs
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
+.It
 Options that generate program stability reports
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
-Options that modify DTrace tracing and buffering behavior and enable additional D compiler features
-.RE
-.sp
-.LP
-You can use \fBdtrace\fR to create D scripts by using it in a \fB#!\fR declaration to create an interpreter file. You can also use \fBdtrace\fR to attempt to compile D programs and determine their properties without actually enabling tracing using the \fB-e\fR option. See \fBOPTIONS\fR. See the \fISolaris Dynamic Tracing Guide\fR for detailed examples of how to use the \fBdtrace\fR utility to perform these tasks.
-.SH OPTIONS
-.sp
-.LP
-The arguments accepted by the \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, and \fB-i\fR options can include an optional D language \fIpredicate\fR enclosed in slashes \fB//\fR and optional D language \fIaction\fR statement list enclosed in braces \fB{}\fR. D program code specified on the command line must be appropriately quoted to avoid interpretation of meta-characters by the shell. 
-.sp
-.LP
+.It
+Options that modify DTrace tracing and buffering behavior and enable
+additional D compiler features
+.El
+.Pp
+You can use
+.Nm
+to create D scripts by using it in a shebang declaration to create an
+interpreter file.
+You can also use
+.Nm
+to attempt to compile D programs and determine their properties without
+actually enabling traces using the
+.Fl e
+option.
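+.Pp
+For example, a minimal interpreter file (a sketch; the interpreter path and
+the probes traced are illustrative) might look like:
+.Bd -literal -offset indent
+#!/usr/sbin/dtrace -s
+
+syscall:::entry
+{
+	@counts[execname] = count();
+}
+.Ed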
+.Sh OPTIONS
+The arguments accepted by the
+.Fl P ,
+.Fl m ,
+.Fl f ,
+.Fl n ,
+and
+.Fl i
+options can include an optional D language
+.Ar predicate
+enclosed in slashes and an optional D language
+.Ar action
+statement list enclosed in braces.
+D program code specified on the command line must be appropriately quoted to
+avoid interpretation of meta-characters by the shell.
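+.Pp
+For example, the following invocation (the probe and predicate are purely
+illustrative) combines a predicate and an action, quoted for the shell:
+.Bd -literal -offset indent
+# dtrace -n 'syscall::open:entry /execname == "sh"/ { trace(copyinstr(arg0)); }'
+.Ed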
+.Pp
 The following options are supported:
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-32\fR | \fB-64\fR\fR
-.ad
-.sp .6
-.RS 4n
-The D compiler produces programs using the native data model of the operating system kernel. You can use the \fBisainfo\fR \fB-b\fR command to determine the current operating system data model.  If the \fB-32\fR option is specified, \fBdtrace\fR forces
-the D compiler to compile a D program using the 32-bit data model. If the \fB-64\fR option is specified, \fBdtrace\fR forces the D compiler to compile a D program using the 64-bit data model. These options are typically not required as \fBdtrace\fR selects the
-native data model as the default. The data model affects the sizes of integer types and other language properties. D programs compiled for either data model can be executed on both 32-bit and 64-bit kernels. The \fB-32\fR and \fB-64\fR options also determine the ELF file format
-(ELF32 or ELF64) produced by the \fB-G\fR option.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-a\fR\fR
-.ad
-.sp .6
-.RS 4n
-Claim anonymous tracing state and display the traced data. You can combine the \fB-a\fR option with the \fB-e\fR option to force \fBdtrace\fR to exit immediately after consuming the anonymous tracing state rather than continuing to wait for new
-data. See the \fISolaris Dynamic Tracing Guide\fR for more information about anonymous tracing.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-A\fR\fR
-.ad
-.sp .6
-.RS 4n
-Generate \fBdriver.conf\fR(4) directives for anonymous tracing. This option constructs a set of \fBdtrace\fR(7D) configuration file directives to enable the specified probes for anonymous tracing and then exits. By default, \fBdtrace\fR attempts to store the directives to the file \fB/kernel/drv/dtrace.conf\fR. You can modify this behavior if you use the \fB-o\fR option to specify an alternate output file.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-b\fR \fIbufsz\fR\fR
-.ad
-.sp .6
-.RS 4n
-Set principal trace buffer size (\fIbufsz\fR). The trace buffer size can include any of the size suffixes \fBk\fR, \fBm\fR, \fBg\fR, or \fBt\fR.  If the buffer space cannot be allocated, \fBdtrace\fR attempts
-to reduce the buffer size or exit depending on the setting of the \fBbufresize\fR property.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-c\fR \fIcmd\fR\fR
-.ad
-.sp .6
-.RS 4n
-Run the specified command \fIcmd\fR and exit upon its completion. If more than one \fB-c\fR option is present on the command line, \fBdtrace\fR exits when all commands have exited, reporting the exit status for each child process as it
-terminates. The process-ID of the first command is made available to any D programs specified on the command line or using the \fB-s\fR option through the \fB$target\fR macro variable. Refer to the \fISolaris Dynamic Tracing Guide\fR for more information
-on macro variables.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-C\fR\fR
-.ad
-.sp .6
-.RS 4n
-Run the C preprocessor \fBcpp\fR(1) over D programs before compiling them. You can pass options to the C preprocessor using the \fB-D\fR, \fB-U\fR, \fB-I\fR, and \fB-H\fR options. You can select the degree of C standard conformance if you use the \fB-X\fR option. For a description of the set of tokens defined by the D compiler when invoking the C preprocessor, see \fB-X\fR.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-D\fR \fIname\fR \fB[=\fR\fIvalue\fR\fB]\fR\fR
-.ad
-.sp .6
-.RS 4n
-Define \fIname\fR when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). If you specify the equals sign (\fB=\fR)
-and additional \fIvalue\fR, the name is assigned the corresponding value. This option passes the \fB-D\fR option to each \fBcpp\fR invocation.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-e\fR\fR
-.ad
-.sp .6
-.RS 4n
-Exit after compiling any requests and consuming anonymous tracing state (\fB-a\fR option) but prior to enabling any probes. You can combine this option with the \fB-a\fR option to print anonymous tracing data and exit. You can also combine this option with D
-compiler options. This combination verifies that the programs compile without actually executing them and enabling the corresponding instrumentation.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-f\fR\fB[[\fR\fIprovider\fR\fB:]\fR\fImodule\fR\fB:]\fR\fIfunction\fR\fB[[\fR\fIpredicate\fR\fB]\fR\fIaction\fR\fB]]\fR\fR
-.ad
-.sp .6
-.RS 4n
-Specify function name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module:function\fR, \fImodule:function\fR, or \fIfunction\fR.
-Unspecified probe description fields are left blank and match any probes regardless of the values in those fields. If no qualifiers other than \fIfunction\fR are specified in the description, all probes with the corresponding \fIfunction\fR are matched.
-The \fB-f\fR argument can be suffixed with an optional D probe clause. You can specify more than one \fB-f\fR option on the command line at a time.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-F\fR\fR
-.ad
-.sp .6
-.RS 4n
-Coalesce trace output by identifying function entry and return. Function entry probe reports are indented and their output is prefixed with \fB->\fR. Function return probe reports are unindented and their output is prefixed with \fB<-\fR\&. System call
-entry probe reports are indented and their output is prefixed with \fB=>\fR. System call return probe reports are unindented and their output is prefixed with \fB<=\fR\&.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-G\fR\fR
-.ad
-.sp .6
-.RS 4n
-Generate an ELF file containing an embedded DTrace program. The DTrace probes specified in the program are saved inside of a relocatable ELF object which can be linked into another program. If the \fB-o\fR option is present, the ELF file is saved using the pathname specified
-as the argument for this operand. If the \fB-o\fR option is not present and the DTrace program is contained with a file whose name is \fB\fIfilename\fR.d\fR, then the ELF file is saved using the name \fB\fIfilename\fR.o\fR.
-Otherwise the ELF file is saved using the name \fBd.out\fR.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-H\fR\fR
-.ad
-.sp .6
-.RS 4n
-Print the pathnames of included files when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). This option passes the \fB-H\fR option
-to each \fBcpp\fR invocation, causing it to display the list of pathnames, one for each line, to \fBstderr\fR.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-h\fR\fR
-.ad
-.sp .6
-.RS 4n
-Generate a header file containing macros that correspond to probes in the specified provider definitions. This option should be used to generate a header file that is included by other source files for later use with the \fB-G\fR option. If the \fB-o\fR option
-is present, the header file is saved using the pathname specified as the argument for that option. If the \fB-o\fR option is not present and the DTrace program is contained with a file whose name is \fIfilename\fR\fB\&.d\fR, then the header file is saved
-using the name \fIfilename\fR\fB\&.h\fR.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-i\fR \fIprobe-id\fR\fB[[\fR\fIpredicate\fR] \fIaction\fR\fB]\fR\fR
-.ad
-.sp .6
-.RS 4n
-Specify probe identifier (\fIprobe-id\fR) to trace or list (\fB-l\fR option). You can specify probe IDs using decimal integers as shown by \fBdtrace\fR \fB-l\fR. The \fB-i\fR argument can be suffixed with an optional
-D probe clause. You can specify more than one \fB-i\fR option at a time.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-I\fR \fIpath\fR\fR
-.ad
-.sp .6
-.RS 4n
-Add the specified directory \fIpath\fR to the search path for \fB#include\fR files when invoking \fBcpp\fR(1) (enabled
-using the \fB-C\fR option). This option passes the \fB-I\fR option to each \fBcpp\fR invocation. The specified \fIpath\fR is inserted into the search path ahead of the default directory list.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-L\fR \fIpath\fR\fR
-.ad
-.sp .6
-.RS 4n
-Add the specified directory \fIpath\fR to the search path for DTrace libraries. DTrace libraries are used to contain common definitions that can be used when writing D programs. The specified \fIpath\fR is added after the default library
-search path.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-l\fR\fR
-.ad
-.sp .6
-.RS 4n
-List probes instead of enabling them. If the \fB-l\fR option is specified, \fBdtrace\fR produces a report of the probes matching the descriptions given using the \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, \fB-i\fR,
-and \fB-s\fR options. If none of these options are specified, this option lists all probes.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-m\fR [[\fIprovider:\fR] \fImodule:\fR [[\fIpredicate\fR] \fIaction\fR]]\fR
-.ad
-.sp .6
-.RS 4n
-Specify module name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module\fR or \fImodule\fR. Unspecified probe description fields are left blank and match
-any probes regardless of the values in those fields. If no qualifiers other than \fImodule\fR are specified in the description, all probes with a corresponding \fImodule\fR are matched. The \fB-m\fR argument can be suffixed with an optional D
-probe clause. More than one \fB-m\fR option can be specified on the command line at a time.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR [[\fIpredicate\fR] \fIaction\fR]\fR
-.ad
-.sp .6
-.RS 4n
-Specify probe name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module:function:name\fR, \fImodule:function:name\fR, \fIfunction:name\fR,
-or \fIname\fR.  Unspecified probe description fields are left blank and match any probes regardless of the values in those fields. If no qualifiers other than \fIname\fR are specified in the description, all probes with a corresponding \fIname\fR are
-matched. The \fB-n\fR argument can be suffixed with an optional D probe clause. More than one \fB-n\fR option can be specified on the command line at a time.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-o\fR \fIoutput\fR\fR
-.ad
-.sp .6
-.RS 4n
-Specify the \fIoutput\fR file for the \fB-A\fR , \fB-G\fR, and \fB-l\fR options, or for the traced data itself. If the \fB-A\fR option is present and \fB-o\fR is not present, the default output file is \fB/kernel/drv/dtrace.conf\fR. If the \fB-G\fR option is present and the \fB-s\fR option's argument is of the form \fB\fIfilename\fR.d\fR and \fB-o\fR is not present, the default output file is \fB\fIfilename\fR.o\fR.
-Otherwise the default output file is \fBd.out\fR.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-p\fR \fIpid\fR\fR
-.ad
-.sp .6
-.RS 4n
-Grab the specified process-ID \fIpid\fR, cache its symbol tables, and exit upon its completion. If more than one \fB-p\fR option is present on the command line, \fBdtrace\fR exits when all commands have exited, reporting the exit status
-for each process as it terminates. The first process-ID is made available to any D programs specified on the command line or using the \fB-s\fR option through the \fB$target\fR macro variable. Refer to the \fISolaris Dynamic Tracing Guide\fR for
-more information on macro variables.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-P\fR \fIprovider\fR \fB[[\fR\fIpredicate\fR\fB]\fR \fIaction\fR]\fR
-.ad
-.sp .6
-.RS 4n
-Specify provider name to trace or list (\fB-l\fR option). The remaining probe description fields module, function, and name are left blank and match any probes regardless of the values in those fields. The \fB-P\fR argument can be suffixed with an optional D
-probe clause. You can specify more than one \fB-P\fR option on the command line at a time.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-q\fR\fR
-.ad
-.sp .6
-.RS 4n
-Set quiet mode. \fBdtrace\fR suppresses messages such as the number of probes matched by the specified options and D programs and does not print column headers, the CPU ID, the probe ID, or insert newlines into the output. Only data traced and formatted by D program
-statements such as \fBtrace()\fR and \fBprintf()\fR is displayed to \fBstdout\fR.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-s\fR\fR
-.ad
-.sp .6
-.RS 4n
-Compile the specified D program source file. If the \fB-e\fR option is present, the program is compiled but instrumentation is not enabled. If the \fB-l\fR option is present, the program is compiled and the set of probes matched by it is listed, but instrumentation
-is not enabled. If none of \fB-e\fR, \fB-l\fR, \fB-G\fR, or \fB-A\fR are present, the instrumentation specified by the D program is enabled and tracing begins.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-S\fR\fR
-.ad
-.sp .6
-.RS 4n
-Show D compiler intermediate code. The D compiler produces a report of the intermediate code generated for each D program to \fBstderr\fR.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-U\fR \fIname\fR\fR
-.ad
-.sp .6
-.RS 4n
-Undefine the specified \fIname\fR when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). This option passes the \fB-U\fR option to each \fBcpp\fR invocation.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-v\fR\fR
-.ad
-.sp .6
-.RS 4n
-Set verbose mode. If the \fB-v\fR option is specified, \fBdtrace\fR produces a program stability report showing the minimum interface stability and dependency level for the specified D programs. DTrace stability levels are explained in further detail in the \fISolaris Dynamic Tracing Guide\fR.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-V\fR\fR
-.ad
-.sp .6
-.RS 4n
-Report the highest D programming interface version supported by \fBdtrace\fR. The version information is printed to \fBstdout\fR and the \fBdtrace\fR command exits. Refer to the \fISolaris Dynamic Tracing Guide\fR for
-more information about DTrace versioning features.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-w\fR\fR
-.ad
-.sp .6
-.RS 4n
-Permit destructive actions in D programs specified using the \fB-s\fR, \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, or \fB-i\fR options. If the \fB-w\fR option is not specified, \fBdtrace\fR does not
-permit the compilation or enabling of a D program that contains destructive actions.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-x\fR \fIarg\fR [\fI=val\fR]\fR
-.ad
-.sp .6
-.RS 4n
-Enable or modify a DTrace runtime option or D compiler option. The list of options is found in the \fISolaris Dynamic Tracing Guide\fR.  Boolean options are enabled by specifying their name. Options with values are set by separating the option name and
-value with an equals sign (\fB=\fR).
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-X\fR \fBa | c | s | t\fR\fR
-.ad
-.sp .6
-.RS 4n
-Specify the degree of conformance to the ISO C standard that should be selected when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option).
-The \fB-X\fR option argument affects the value and presence of the \fB__STDC__\fR macro depending upon the value of the argument letter.
-.sp
-The \fB-X\fR option supports the following arguments:
-.sp
-.ne 2
-.mk
-.na
-\fB\fBa\fR\fR
-.ad
-.RS 5n
-.rt  
-Default. ISO C plus K&R compatibility extensions, with semantic changes required by ISO C. This is the default mode if \fB-X\fR is not specified. The predefined macro \fB__STDC__\fR has a value of 0 when \fBcpp\fR is invoked in conjunction
-with the \fB-Xa\fR option.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fBc\fR\fR
-.ad
-.RS 5n
-.rt  
-Conformance. Strictly conformant ISO C, without K&R C compatibility extensions. The predefined macro \fB__STDC__\fR has a value of 1 when \fBcpp\fR is invoked in conjunction with the \fB-Xc\fR option.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fBs\fR\fR
-.ad
-.RS 5n
-.rt  
-K&R C only. The macro \fB__STDC__\fR is not defined when \fBcpp\fR is invoked in conjunction with the \fB-Xs\fR option.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fBt\fR\fR
-.ad
-.RS 5n
-.rt  
-Transition. ISO C plus K&R C compatibility extensions, without semantic changes required by ISO C. The predefined macro \fB__STDC__\fR has a value of 0 when \fBcpp\fR is invoked in conjunction with the \fB-Xt\fR option.
-.RE
-
-As the \fB-X\fR option only affects how the D compiler invokes the C preprocessor, the \fB-Xa\fR and \fB-Xt\fR options are equivalent from the perspective of D and both are provided only to ease re-use of settings from a C build environment.
-.sp
-Regardless of the \fB-X\fR mode, the following additional C preprocessor definitions are always specified and valid in all modes:
-.RS +4
-.TP
-.ie t \(bu
-.el o
-\fB__sun\fR
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
-\fB__unix\fR
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
-\fB__SVR4\fR
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
-\fB__sparc\fR (on SPARC systems only)
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
-\fB__sparcv9\fR (on SPARC systems only when 64-bit programs are compiled)
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
-\fB__i386\fR (on x86 systems only when 32-bit programs are compiled)
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
-\fB__amd64\fR (on x86 systems only when 64-bit programs are compiled)
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
-\fB__\fI`uname -s`\fR_\fI`uname -r`\fR\fR (for example, \fB__SunOS_5_10\fR)
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
-\fB__SUNW_D=1\fR
-.RE
-.RS +4
-.TP
-.ie t \(bu
-.el o
-\fB__SUNW_D_VERSION=0x\fIMMmmmuuu\fR\fR
-.sp
-Where \fIMM\fR is the major release value in hexadecimal, \fImmm\fR is the minor release value in hexadecimal, and \fIuuu\fR is the
-micro release value in hexadecimal. Refer to the \fISolaris Dynamic Tracing Guide\fR for more information about DTrace versioning.
-.RE
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB-Z\fR\fR
-.ad
-.sp .6
-.RS 4n
-Permit probe descriptions that match zero probes. If the \fB-Z\fR option is not specified, \fBdtrace\fR reports an error and exits if any probe descriptions specified in D program files (\fB-s\fR option) or on the command line (\fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, or \fB-i\fR options) contain descriptions that do not match any known probes.
-.RE
-
-.SH OPERANDS
-.sp
-.LP
-You can specify zero or more additional arguments on the \fBdtrace\fR command line to define a set of macro variables (\fB$1\fR, \fB$2\fR, and so forth). The additional arguments can be used in D programs specified using the \fB-s\fR option
-or on the command line. The use of macro variables is described further in the \fISolaris Dynamic Tracing Guide\fR.
-.SH EXIT STATUS
-.sp
-.LP
-The following exit values are returned:
-.sp
-.ne 2
-.mk
-.na
-\fB0\fR
-.ad
-.RS 5n
-.rt  
-Successful completion. 
-.sp
-For D program requests, an exit status of \fB0\fR indicates that programs were successfully compiled, probes were successfully enabled, or anonymous state was successfully retrieved. \fBdtrace\fR returns \fB0\fR even if the specified tracing requests
-encountered errors or drops.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB1\fR\fR
-.ad
-.RS 5n
-.rt  
+.Bl -tag -width indent
+.It Fl 32 | Fl 64
+The D compiler produces programs using the native data model of the operating
+system kernel.
+If the
+.Fl 32
+option is specified,
+.Nm
+forces the D compiler to compile a D program using the 32-bit data model.
+If the
+.Fl 64
+option is specified,
+.Nm
+forces the D compiler to compile a D program using the 64-bit data model.
+These options are typically not required as
+.Nm
+selects the native data model as the default.
+The data model affects the sizes of integer types and other language properties.
+D programs compiled for either data model can be executed on both 32-bit and
+64-bit kernels.
+The
+.Fl 32
+and
+.Fl 64
+options also determine the
+.Xr elf 5
+file format (ELF32 or ELF64) produced by the
+.Fl G
+option.
+.It Fl a
+Claim anonymous tracing state and display the traced data.
+You can combine the
+.Fl a
+option with the
+.Fl e
+option to force
+.Nm
+to exit immediately after consuming the anonymous tracing state rather than
+continuing to wait for new data.
+.It Fl A
+Generate directives for anonymous tracing.
+This option constructs a set of dtrace configuration file directives to enable
+the specified probes for anonymous tracing and then exits.
+By default,
+.Nm
+attempts to store the directives to the file
+.Pa /boot/dtrace.dof .
+This behavior can be modified using the
+.Fl o
+option to specify an alternate output file.
+.It Fl b Ar bufsz
+Set the principal trace buffer size to
+.Ar bufsz .
+The trace buffer size can include any of the size suffixes k, m, g, or t.
+If the buffer space cannot be allocated,
+.Nm dtrace
+attempts to reduce the buffer size or exit depending on the setting of the
+bufresize property.
+.It Fl c Ar cmd
+Run the specified command
+.Ar cmd
+and exit upon its completion.
+If more than one
+.Fl c
+option is present on the command line,
+.Nm dtrace
+exits when all commands have exited, reporting the exit status for each child
+process as it terminates.
+The process ID of the first command is made available to any D programs
+specified on the command line or using the
+.Fl s
+option through the
+.Li $target
+macro variable.
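+.Pp
+For example, the following sketch counts the system calls made by an
+arbitrary traced command:
+.Bd -literal -offset indent
+# dtrace -c date -n 'syscall:::entry /pid == $target/ { @[probefunc] = count(); }'
+.Ed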
+.It Fl C
+Run the C preprocessor
+.Xr cpp 1
+over D programs before compiling them.
+You can pass options to the C preprocessor using the
+.Fl D ,
+.Fl U ,
+.Fl I ,
+and
+.Fl H
+options.
+You can select the degree of C standard conformance if you use the
+.Fl X
+option.
+For a description of the set of tokens defined by the D compiler when invoking
+the C preprocessor, see
+.Fl X .
+.It Fl D Ar name Op Ns = Ns value
+Define
+.Ar name
+when invoking
+.Xr cpp 1
+(enabled using the
+.Fl C
+option).
+If you specify an additional
+.Ar value ,
+the name is assigned the corresponding value.
+This option passes the
+.Fl D
+option to each
+.Xr cpp 1
+invocation.
+.It Fl e
+Exit after compiling any requests and consuming anonymous tracing state
+.Fl ( a
+option) but prior to enabling any probes.
+You can combine this option with the
+.Fl a
+option to print anonymous tracing data and exit.
+You can also combine this option with D compiler options.
+This combination verifies that the programs compile without actually executing
+them and enabling the corresponding instrumentation.
+.It Fl f Oo Oo Ar provider : Oc Ar module : Oc Ar function Oo Oo Ar predicate \
+    Oc Ar action Oc
+Specify function name to trace or list
+.Fl ( l
+option).
+The corresponding argument can include any of the probe description forms
+.Ar provider:module:function ,
+.Ar module:function ,
+or
+.Ar function .
+Unspecified probe description fields are left blank and match any probes
+regardless of the values in those fields.
+If no qualifiers other than
+.Ar function
+are specified in the description, all probes with the corresponding
+.Ar function
+are matched.
+The
+.Fl f
+argument can be suffixed with an optional D probe clause.
+You can specify more than one
+.Fl f
+option on the command line at a time.
+.It Fl F
+Coalesce trace output by identifying function entry and return.
+Function entry probe reports are indented and their output is prefixed with
+.Ql -> .
+Function return probe reports are unindented and their output is prefixed with
+.Ql <- .
+System call entry probe reports are indented and their output is prefixed with
+.Ql => .
+System call return probe reports are unindented and their output is prefixed
+with
+.Ql <= .
+.It Fl G
+Generate an ELF file containing an embedded DTrace program.
+The DTrace probes specified in the program are saved inside of a relocatable ELF
+object which can be linked into another program.
+If the
+.Fl o
+option is present, the ELF file is saved using the pathname specified as the
+argument for this operand.
+If the
+.Fl o
+option is not present and the DTrace program is contained with a file whose name
+is
+.Ar filename.d ,
+then the ELF file is saved using the name
+.Ar filename.o .
+Otherwise the ELF file is saved using the name
+.Ar d.out .
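+.Pp
+For example, assuming a provider definition in a hypothetical
+.Pa probes.d ,
+the following produces a relocatable
+.Pa probes.o
+for linking:
+.Dl # dtrace -G -s probes.d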
+.It Fl h
+Generate a header file containing macros that correspond to probes in the
+specified provider definitions.
+This option should be used to generate a header file that is included by other
+source files for later use with the
+.Fl G
+option.
+If the
+.Fl o
+option is present, the header file is saved using the pathname specified as the
+argument for that option.
+If the
+.Fl o
+option is not present and the DTrace program is contained within a file whose
+name is
+.Ar filename.d ,
+then the header file is saved using the name
+.Ar filename.h .
+.It Fl H
+Print the pathnames of included files when invoking
+.Xr cpp 1
+(enabled using the
+.Fl C
+option).
+This option passes the
+.Fl H
+option to each
+.Xr cpp 1
+invocation, causing it to display the list of pathnames, one for each line, to
+standard error.
+.It Fl i Ar probe-id Op Oo Ar predicate Oc Ar action
+Specify probe identifier
+.Ar ( probe-id )
+to trace or list
+.Fl ( l
+option).
+You can specify probe IDs using decimal integers as shown by
+.Ql "dtrace -l" .
+The
+.Fl i
+argument can be suffixed with an optional D probe clause.
+You can specify more than one
+.Fl i
+option at a time.
+.It Fl I Ar path
+Add the specified directory
+.Ar path
+to the search path for #include files when invoking
+.Xr cpp 1
+(enabled using the
+.Fl C
+option).
+This option passes the
+.Fl I
+option to each
+.Xr cpp 1
+invocation.
+The specified
+.Ar path
+is inserted into the search path ahead of the default directory list.
+.It Fl l
+List probes instead of enabling them.
+If the
+.Fl l
+option is specified,
+.Nm
+produces a report of the probes matching the descriptions given using the
+.Fl P , m , f , n , i ,
+and
+.Fl s
+options.
+If none of these options are specified, this option lists all probes.
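+.Pp
+For example, to list only the probes of one provider (the
+.Li syscall
+provider is used here as an illustration):
+.Dl # dtrace -l -P syscall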
+.It Fl L Ar path
+Add the specified directory
+.Ar path
+to the search path for DTrace libraries.
+DTrace libraries are used to contain common definitions that can be used when
+writing D programs.
+The specified
+.Ar path
+is added after the default library search path.
+.It Fl m Oo Ar provider : Oc Ar module Oo Oo Ar predicate Oc Ar action Oc
+Specify module name to trace or list
+.Fl ( l
+option).
+The corresponding argument can include any of the probe description forms
+.Ar provider:module
+or
+.Ar module .
+Unspecified probe description fields are left blank and match any probes
+regardless of the values in those fields.
+If no qualifiers other than
+.Ar module
+are specified in the description, all probes with a corresponding
+.Ar module
+are matched.
+The
+.Fl m
+argument can be suffixed with an optional D probe clause.
+More than one
+.Fl m
+option can be specified on the command line at a time.
+.It Fl n Oo Oo Oo Ar provider : Oc Ar module : Oc Ar function : Oc Ar name \
+    Oo Oo Ar predicate Oc Ar action Oc
+Specify probe name to trace or list
+.Fl ( l
+option).
+The corresponding argument can include any of the probe description forms
+.Ar provider:module:function:name , module:function:name , function:name ,
+or
+.Ar name .
+Unspecified probe description fields are left blank and match any probes
+regardless of the values in those fields.
+If no qualifiers other than
+.Ar name
+are specified in the description, all probes with a corresponding
+.Ar name
+are matched.
+The
+.Fl n
+argument can be suffixed with an optional D probe clause.
+More than one
+.Fl n
+option can be specified on the command line at a time.
+.It Fl o Ar output
+Specify the
+.Ar output
+file for the
+.Fl A , G ,
+and
+.Fl l
+options, or for the traced data itself.
+If the
+.Fl A
+option is present and
+.Fl o
+is not present, the default output file is
+.Pa /boot/dtrace.dof .
+If the
+.Fl G
+option is present and the
+.Fl s
+option's argument is of the form
+.Ar filename.d
+and
+.Fl o
+is not present, the default output file is
+.Ar filename.o .
+Otherwise the default output file is
+.Ar d.out .
+.It Fl p Ar pid
+Grab the specified process ID
+.Ar pid ,
+cache its symbol tables, and exit upon its completion.
+If more than one
+.Fl p
+option is present on the command line,
+.Nm
+exits when all commands have exited, reporting the exit status for each process
+as it terminates.
+The first process ID is made available to any D programs specified on the
+command line or using the
+.Fl s
+option through the
+.Li $target
+macro variable.
+.It Fl P Ar provider Oo Oo Ar predicate Oc Ar action Oc
+Specify provider name to trace or list
+.Fl ( l
+option).
+The remaining probe description fields module, function, and name are left
+blank and match any probes regardless of the values in those fields.
+The
+.Fl P
+argument can be suffixed with an optional D probe clause.
+You can specify more than one
+.Fl P
+option on the command line at a time.
+.It Fl q
+Set quiet mode.
+.Nm
+suppresses messages such as the number of probes matched by the specified
+options and D programs and does not print column headers, the CPU ID, the probe
+ID, or insert newlines into the output.
+Only data traced and formatted by D program statements such as
+.Ql trace()
+and
+.Ql printf()
+is displayed to standard output.
+.It Fl s Ar script
+Compile the specified D program source file.
+If the
+.Fl e
+option is present, the program is compiled but instrumentation is not enabled.
+If the
+.Fl l
+option is present, the program is compiled and the set of probes matched by it
+is listed, but instrumentation is not enabled.
+If none of
+.Fl e , l , G ,
+or
+.Fl A
+are present, the instrumentation specified by the D program is enabled and
+tracing begins.
+.It Fl S
+Show D compiler intermediate code.
+The D compiler produces a report of the intermediate code generated for each D
+program to standard error.
+.It Fl U Ar name
+Undefine the specified
+.Ar name
+when invoking
+.Xr cpp 1
+(enabled using the
+.Fl C
+option).
+This option passes the
+.Fl U
+option to each
+.Xr cpp 1
+invocation.
+.It Fl v
+Set verbose mode.
+If the
+.Fl v
+option is specified,
+.Nm
+produces a program stability report showing the minimum interface stability and
+dependency level for the specified D programs.
+.It Fl V
+Report the highest D programming interface version supported by
+.Nm .
+The version information is printed to standard output and the
+.Nm
+command exits.
+.It Fl w
+Permit destructive actions in D programs specified using the
+.Fl s , P , m , f , n ,
+or
+.Fl i
+options.
+If the
+.Fl w
+option is not specified,
+.Nm
+does not permit the compilation or enabling of a D program that contains
+destructive actions.
+.It Fl x Ar arg Op Ns = Ns value
+Enable or modify a DTrace runtime option or D compiler option.
+Boolean options are enabled by specifying their name.
+Options with values are set by separating the option name and value with an
+equals sign (=).
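+.Pp
+For example, the following sets the standard
+.Li bufpolicy
+runtime option to use a ring buffer (an illustrative invocation):
+.Dl # dtrace -x bufpolicy=ring -n syscall:::entry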
+.It Fl X Cm a | c | s | t
+Specify the degree of conformance to the ISO C standard that should be selected
+when invoking
+.Xr cpp 1
+(enabled using the
+.Fl C
+option).
+The
+.Fl X
+option argument affects the value and presence of the __STDC__ macro depending
+upon the value of the argument letter.
+.Pp
+The
+.Fl X
+option supports the following arguments:
+.Bl -tag -width indent
+.It a
+Default.
+ISO C plus K&R compatibility extensions, with semantic changes required by ISO
+C.
+This is the default mode if
+.Fl X
+is not specified.
+The predefined macro __STDC__ has a value of 0 when
+.Xr cpp 1
+is invoked in conjunction with the
+.Fl Xa
+option.
+.It c
+Conformance.
+Strictly conformant ISO C, without K&R C compatibility extensions.
+The predefined macro __STDC__ has a value of 1 when
+.Xr cpp 1
+is invoked in conjunction with the
+.Fl \&Xc
+option.
+.It s
+K&R C only.
+The macro __STDC__ is not defined when
+.Xr cpp 1
+is invoked in conjunction with the
+.Fl Xs
+option.
+.It t
+Transition.
+ISO C plus K&R C compatibility extensions, without semantic changes required by
+ISO C.
+The predefined macro __STDC__ has a value of 0 when
+.Xr cpp 1
+is invoked in conjunction with the
+.Fl Xt
+option.
+.El
+.Pp
+As the
+.Fl X
+option only affects how the D compiler invokes the C preprocessor, the
+.Fl Xa
+and
+.Fl Xt
+options are equivalent from the perspective of D and both are provided only to
+ease re-use of settings from a C build environment.
+.Pp
+Regardless of the
+.Fl X
+mode, the following additional C preprocessor definitions are always specified
+and valid in all modes:
+.Bl -bullet -offset indent
+.It
+__sun
+.It
+__unix
+.It
+__SVR4
+.It
+__sparc (on SPARC systems only)
+.It
+__sparcv9 (on SPARC systems only when 64-bit programs are compiled)
+.It
+__i386 (on x86 systems only when 32-bit programs are compiled)
+.It
+__amd64 (on x86 systems only when 64-bit programs are compiled)
+.It
+__`uname -s`_`uname -r` (for example,
+.Ql FreeBSD_9.2-RELEASE ) .
+.It
+__SUNW_D=1
+.It
+.No __SUNW_D_VERSION=0x Ns Ar MMmmmuuu
+.Pp
+Where
+.Ar MM
+is the major release value in hexadecimal,
+.Ar mmm
+is the minor release value in hexadecimal, and
+.Ar uuu
+is the micro release value in hexadecimal.
+.El
+.It Fl Z
+Permit probe descriptions that match zero probes.
+If the
+.Fl Z
+option is not specified,
+.Nm
+reports an error and exits if any probe descriptions specified in D program
+files
+.Fl ( s
+option) or on the command line
+.Fl ( P , m , f , n ,
+or
+.Fl i
+options) contain descriptions that do not match any known probes.
+.El
+.Sh OPERANDS
+You can specify zero or more additional arguments on the
+.Nm
+command line to define a set of macro variables
+.Li ( $1 , $2 ,
+and so forth).
+The additional arguments can be used in D programs specified using the
+.Fl s
+option or on the command line.
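+.Pp
+For example, with a hypothetical script that references the
+.Li $1
+macro variable, the first additional argument supplies its value:
+.Dl # dtrace -s script.d 123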
+.Sh FILES
+.Bl -tag -width /boot/dtrace.dof -compact
+.It Pa /boot/dtrace.dof
+File for anonymous tracing directives.
+.El
+.Sh EXIT STATUS
+The following exit statuses are returned:
+.Bl -tag -width indent
+.It 0
+Successful completion.
+.Pp
+For D program requests, an exit status of 0 indicates that programs were
+successfully compiled, probes were successfully enabled, or anonymous state
+was successfully retrieved.
+.Nm
+returns 0 even if the specified tracing requests encountered errors or drops.
+.It 1
 An error occurred.
-.sp
-For D program requests, an exit status of \fB1\fR indicates that program compilation failed or that the specified request could not be satisfied.
-.RE
-
-.sp
-.ne 2
-.mk
-.na
-\fB\fB2\fR\fR
-.ad
-.RS 5n
-.rt  
+.Pp
+For D program requests, an exit status of 1 indicates that program compilation
+failed or that the specified request could not be satisfied.
+.It 2
 Invalid command line options or arguments were specified.
-.RE
-
-.SH ATTRIBUTES
-.sp
-.LP
-See \fBattributes\fR(5) for descriptions of the following attributes:
-.sp
-
-.sp
-.TS
-tab() box;
-cw(2.75i) |cw(2.75i) 
-lw(2.75i) |lw(2.75i) 
-.
-ATTRIBUTE TYPEATTRIBUTE VALUE
-_
-AvailabilitySUNWdtrc
-_
-Interface StabilitySee below.
-.TE
-
-.sp
-.LP
-The command-line syntax is Committed. The human-readable output is Uncommitted.
-.SH SEE ALSO
-.sp
-.LP
-\fBcpp\fR(1), \fBisainfo\fR(1), \fBlibdtrace\fR(3LIB), \fBdriver.conf\fR(4), \fBattributes\fR(5), \fBdtrace\fR(7D)
-.sp
-.LP
-\fISolaris Dynamic Tracing Guide\fR
+.El
+.Sh SEE ALSO
+.Xr cpp 1 ,
+.Xr dtruss 1 ,
+.Xr elf 5 ,
+.Xr SDT 9
+.Rs
+.%T Solaris Dynamic Tracing Guide
+.Re
Index: src/external/cddl/osnet/dist/cmd/dtrace/dtrace.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/dtrace/dtrace.c,v
retrieving revision 1.10
diff -u -p -r1.10 dtrace.c
--- src/external/cddl/osnet/dist/cmd/dtrace/dtrace.c	5 Jun 2017 21:19:32 -0000	1.10
+++ src/external/cddl/osnet/dist/cmd/dtrace/dtrace.c	7 Jun 2017 18:38:01 -0000
@@ -23,8 +23,10 @@
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
-
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ */
 
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -41,13 +43,16 @@
 #include <fcntl.h>
 #include <errno.h>
 #include <signal.h>
-#if defined(sun)
+#ifdef illumos
 #include <alloca.h>
 #endif
 #include <libgen.h>
-#if defined(sun)
+#ifdef illumos
 #include <libproc.h>
 #endif
+#ifdef __FreeBSD__
+#include <spawn.h>
+#endif
 
 typedef struct dtrace_cmd {
 	void (*dc_func)(struct dtrace_cmd *);	/* function to compile arg */
@@ -88,6 +93,9 @@ static int g_flowindent;
 static int g_intr;
 static int g_impatient;
 static int g_newline;
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+static int g_siginfo;
+#endif
 static int g_total;
 static int g_cflags;
 static int g_oflags;
@@ -99,7 +107,7 @@ static int g_grabanon = 0;
 static const char *g_ofile = NULL;
 static FILE *g_ofp;
 static dtrace_hdl_t *g_dtp;
-#if defined(sun)
+#ifdef illumos
 static char *g_etcfile = "/etc/system";
 static const char *g_etcbegin = "* vvvv Added by DTrace";
 static const char *g_etcend = "* ^^^^ Added by DTrace";
@@ -195,6 +203,13 @@ fatal(const char *fmt, ...)
 	verror(fmt, ap);
 	va_end(ap);
 
+	/*
+	 * Close the DTrace handle to ensure that any controlled processes are
+	 * correctly restored and continued.
+	 */
+	if (g_dtp)
+		dtrace_close(g_dtp);
+
 	exit(E_ERROR);
 }
 
@@ -202,7 +217,7 @@ fatal(const char *fmt, ...)
 static void __printflike(1, 2) __dead
 dfatal(const char *fmt, ...)
 {
-#if !defined(sun) && defined(NEED_ERRLOC)
+#if !defined(illumos) && defined(NEED_ERRLOC)
 	char *p_errfile = NULL;
 	int errline = 0;
 #endif
@@ -223,7 +238,7 @@ dfatal(const char *fmt, ...)
 		(void) fprintf(stderr, "%s\n",
 		    dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
 	}
-#if !defined(sun) && defined(NEED_ERRLOC)
+#if !defined(illumos) && defined(NEED_ERRLOC)
 	dt_get_errloc(g_dtp, &p_errfile, &errline);
 	if (p_errfile != NULL)
 		printf("File '%s', line %d\n", p_errfile, errline);
@@ -388,7 +403,42 @@ dof_prune(const char *fname)
 	free(buf);
 }
 
-#if defined(sun)
+#ifdef __FreeBSD__
+/*
+ * Use nextboot(8) to tell the loader to load DTrace kernel modules during
+ * the next boot of the system. The nextboot(8) configuration is removed during
+ * boot, so it will not persist indefinitely.
+ */
+static void
+bootdof_add(void)
+{
+	char * const nbargv[] = {
+		"nextboot", "-a",
+		"-e", "dtraceall_load=\"YES\"",
+		"-e", "dtrace_dof_load=\"YES\"",
+		"-e", "dtrace_dof_name=\"/boot/dtrace.dof\"",
+		"-e", "dtrace_dof_type=\"dtrace_dof\"",
+		NULL,
+	};
+	pid_t child;
+	int err, status;
+
+	err = posix_spawnp(&child, "nextboot", NULL, NULL, nbargv,
+	    NULL);
+	if (err != 0) {
+		error("failed to execute nextboot: %s", strerror(err));
+		exit(E_ERROR);
+	}
+
+	if (waitpid(child, &status, 0) != child)
+		fatal("waiting for nextboot");
+	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+		error("nextboot returned with status %d", status);
+		exit(E_ERROR);
+	}
+}
+#endif
+#ifdef illumos
 static void
 etcsystem_prune(void)
 {
@@ -499,12 +549,13 @@ etcsystem_add(void)
 
 	error("added forceload directives to %s\n", g_ofile);
 }
-#endif
+#endif /* illumos */
 
 static void
 print_probe_info(const dtrace_probeinfo_t *p)
 {
 	char buf[BUFSIZ];
+	char *user;
 	int i;
 
 	oprintf("\n\tProbe Description Attributes\n");
@@ -528,10 +579,14 @@ print_probe_info(const dtrace_probeinfo_
 	oprintf("\n\tArgument Types\n");
 
 	for (i = 0; i < p->dtp_argc; i++) {
+		if (p->dtp_argv[i].dtt_flags & DTT_FL_USER)
+			user = "userland ";
+		else
+			user = "";
 		if (ctf_type_name(p->dtp_argv[i].dtt_ctfp,
 		    p->dtp_argv[i].dtt_type, buf, sizeof (buf)) == NULL)
 			(void) strlcpy(buf, "(unknown)", sizeof (buf));
-		oprintf("\t\targs[%d]: %s\n", i, buf);
+		oprintf("\t\targs[%d]: %s%s\n", i, user, buf);
 	}
 
 	if (p->dtp_argc == 0)
@@ -629,25 +684,26 @@ anon_prog(const dtrace_cmd_t *dcp, dof_h
 	p = (uchar_t *)dof;
 	q = p + dof->dofh_loadsz;
 
-#if defined(sun)
-	oprintf("dof-data-%d=0x%x", n, *p++);
-
-	while (p < q)
-		oprintf(",0x%x", *p++);
-
-	oprintf(";\n");
-#else
+#ifdef __FreeBSD__
 	/*
-	 * On FreeBSD, the DOF data is handled as a kernel environment (kenv)
-	 * string. We use two hex characters per DOF byte.
+	 * On FreeBSD, the DOF file is read directly during boot - just write
+	 * two hex characters per byte.
 	 */
-	oprintf("dof-data-%d=%02x", n, *p++);
+	oprintf("dof-data-%d=", n);
 
 	while (p < q)
 		oprintf("%02x", *p++);
 
 	oprintf("\n");
 #endif
+#ifdef illumos
+	oprintf("dof-data-%d=0x%x", n, *p++);
+
+	while (p < q)
+		oprintf(",0x%x", *p++);
+
+	oprintf(";\n");
+#endif
 
 	dtrace_dof_destroy(g_dtp, dof);
 }
@@ -671,9 +727,12 @@ link_prog(dtrace_cmd_t *dcp)
 		p[0] = '\0'; /* strip .d suffix */
 		(void) snprintf(dcp->dc_ofile, sizeof (dcp->dc_ofile),
 		    "%s.o", basename(dcp->dc_arg));
+	} else if (g_cmdc > 1) {
+		(void) snprintf(dcp->dc_ofile, sizeof (dcp->dc_ofile),
+		    "d.out.%td", dcp - g_cmdv);
 	} else {
 		(void) snprintf(dcp->dc_ofile, sizeof (dcp->dc_ofile),
-		    g_cmdc > 1 ?  "%s.%d" : "%s", "d.out", (int)(dcp - g_cmdv));
+		    "d.out");
 	}
 
 	if (dtrace_program_link(g_dtp, dcp->dc_prog, DTRACE_D_PROBES,
@@ -693,6 +752,9 @@ list_probe(dtrace_hdl_t *dtp, const dtra
 	if (g_verbose && dtrace_probe_info(dtp, pdp, &p) == 0)
 		print_probe_info(&p);
 
+	if (g_intr != 0)
+		return (1);
+
 	return (0);
 }
 
@@ -868,16 +930,16 @@ setopthandler(const dtrace_setoptdata_t 
 #define	BUFDUMPSTR(ptr, field) \
 	(void) printf("%s: %20s => ", g_pname, #field);	\
 	if ((ptr)->field != NULL) {			\
-		const char *xc = (ptr)->field;		\
+		const char *c = (ptr)->field;		\
 		(void) printf("\"");			\
 		do {					\
-			if (*xc == '\n') {		\
+			if (*c == '\n') {		\
 				(void) printf("\\n");	\
 				continue;		\
 			}				\
 							\
-			(void) printf("%c", *xc);	\
-		} while (*xc++ != '\0');		\
+			(void) printf("%c", *c);	\
+		} while (*c++ != '\0');			\
 		(void) printf("\"\n");			\
 	} else {					\
 		(void) printf("<NULL>\n");		\
@@ -914,7 +976,7 @@ bufhandler(const dtrace_bufdata_t *bufda
 	    { "AGGFORMAT",	DTRACE_BUFDATA_AGGFORMAT },
 	    { "AGGLAST",	DTRACE_BUFDATA_AGGLAST },
 	    { "???",		UINT32_MAX },
-	    { NULL,		0 }
+	    { NULL }
 	};
 
 	if (bufdata->dtbda_probe != NULL) {
@@ -971,7 +1033,7 @@ bufhandler(const dtrace_bufdata_t *bufda
 			uint8_t *data;
 			int lim = rec->dtrd_size;
 
-			(void) snprintf(buf, end - buf, "%d (data: ", rec->dtrd_offset);
+			(void) sprintf(buf, "%d (data: ", rec->dtrd_offset);
 			c = buf + strlen(buf);
 
 			if (lim > sizeof (uint64_t))
@@ -1070,7 +1132,7 @@ chew(const dtrace_probedata_t *data, voi
 			(void) snprintf(name, sizeof (name), "%s:%s",
 			    pd->dtpd_func, pd->dtpd_name);
 
-			oprintf("%3d %6d %32s ", (int)cpu, pd->dtpd_id, name);
+			oprintf("%3d %6d %32s ", cpu, pd->dtpd_id, name);
 		}
 	} else {
 		int indent = data->dtpda_indent;
@@ -1090,7 +1152,7 @@ chew(const dtrace_probedata_t *data, voi
 			    data->dtpda_prefix, pd->dtpd_func);
 		}
 
-		oprintf("%3d %-41s ", (int)cpu, name);
+		oprintf("%3d %-41s ", cpu, name);
 	}
 
 	return (DTRACE_CONSUME_THIS);
@@ -1102,19 +1164,19 @@ go(void)
 	int i;
 
 	struct {
-		const char *name;
-		const char *optname;
+		char *name;
+		char *optname;
 		dtrace_optval_t val;
 	} bufs[] = {
-		{ "buffer size", "bufsize", 0 },
-		{ "aggregation size", "aggsize", 0 },
-		{ "speculation size", "specsize", 0 },
-		{ "dynamic variable size", "dynvarsize", 0 },
-		{ NULL, NULL, 0 }
+		{ "buffer size", "bufsize" },
+		{ "aggregation size", "aggsize" },
+		{ "speculation size", "specsize" },
+		{ "dynamic variable size", "dynvarsize" },
+		{ NULL }
 	}, rates[] = {
-		{ "cleaning rate", "cleanrate", 0 },
-		{ "status rate", "statusrate", 0 },
-		{ NULL, NULL ,0 }
+		{ "cleaning rate", "cleanrate" },
+		{ "status rate", "statusrate" },
+		{ NULL }
 	};
 
 	for (i = 0; bufs[i].name != NULL; i++) {
@@ -1159,7 +1221,7 @@ go(void)
 
 	for (i = 0; rates[i].name != NULL; i++) {
 		dtrace_optval_t nval;
-		const char *dir;
+		char *dir;
 
 		if (rates[i].val == DTRACEOPT_UNSET)
 			continue;
@@ -1203,11 +1265,48 @@ intr(int signo)
 		g_impatient = 1;
 }
 
+#ifdef __FreeBSD__
+static void
+siginfo(int signo __unused)
+{
+
+	g_siginfo++;
+	g_newline = 1;
+}
+#endif
+
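+/*
+ * Install handlers for the signals that interrupt dtrace (SIGINT and
+ * SIGTERM, plus SIGPIPE, SIGUSR1 and SIGINFO on FreeBSD), taking care not
+ * to override a disposition of SIG_IGN.
+ */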
+static void
+installsighands(void)
+{
+	struct sigaction act, oact;
+
+	(void) sigemptyset(&act.sa_mask);
+	act.sa_flags = 0;
+	act.sa_handler = intr;
+
+	if (sigaction(SIGINT, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN)
+		(void) sigaction(SIGINT, &act, NULL);
+
+	if (sigaction(SIGTERM, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN)
+		(void) sigaction(SIGTERM, &act, NULL);
+
+#ifdef __FreeBSD__
+	if (sigaction(SIGPIPE, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN)
+		(void) sigaction(SIGPIPE, &act, NULL);
+
+	if (sigaction(SIGUSR1, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN)
+		(void) sigaction(SIGUSR1, &act, NULL);
+
+	act.sa_handler = siginfo;
+	if (sigaction(SIGINFO, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN)
+		(void) sigaction(SIGINFO, &act, NULL);
+#endif
+}
+
 int
 main(int argc, char *argv[])
 {
 	dtrace_bufdesc_t buf;
-	struct sigaction act, oact;
 	dtrace_status_t status[2];
 	dtrace_optval_t opt;
 	dtrace_cmd_t *dcp;
@@ -1399,6 +1498,7 @@ main(int argc, char *argv[])
 	(void) dtrace_setopt(g_dtp, "bufsize", "4m");
 	(void) dtrace_setopt(g_dtp, "aggsize", "4m");
 #endif
+	(void) dtrace_setopt(g_dtp, "temporal", "yes");
 
 	/*
 	 * If -G is specified, enable -xlink=dynamic and -xunodefs to permit
@@ -1676,19 +1776,19 @@ main(int argc, char *argv[])
 
 	case DMODE_ANON:
 		if (g_ofile == NULL)
-#if defined(sun)
+#ifdef illumos
 			g_ofile = "/kernel/drv/dtrace.conf";
-#else
+#endif
+#ifdef __FreeBSD__
 			/*
 			 * On FreeBSD, anonymous DOF data is written to
-			 * the DTrace DOF file that the boot loader will
-			 * read if booting with the DTrace option.
+			 * the DTrace DOF file.
 			 */
 			g_ofile = "/boot/dtrace.dof";
 #endif
 
 		dof_prune(g_ofile); /* strip out any old DOF directives */
-#if defined(sun)
+#ifdef illumos
 		etcsystem_prune(); /* string out any forceload directives */
 #endif
 
@@ -1721,7 +1821,11 @@ main(int argc, char *argv[])
 		 * that itself contains a #pragma D option quiet.
 		 */
 		error("saved anonymous enabling in %s\n", g_ofile);
-#if defined(sun)
+
+#ifdef __FreeBSD__
+		bootdof_add();
+#endif
+#ifdef illumos
 		etcsystem_add();
 		error("run update_drv(1M) or reboot to enable changes\n");
 #endif
@@ -1758,6 +1862,8 @@ main(int argc, char *argv[])
 		if (g_ofile != NULL && (g_ofp = fopen(g_ofile, "a")) == NULL)
 			fatal("failed to open output file '%s'", g_ofile);
 
+		installsighands();
+
 		oprintf("%5s %10s %17s %33s %s\n",
 		    "ID", "PROVIDER", "MODULE", "FUNCTION", "NAME");
 
@@ -1779,7 +1885,7 @@ main(int argc, char *argv[])
 		}
 
 		if (g_ofile == NULL) {
-			char *pv;
+			char *p;
 
 			if (g_cmdc > 1) {
 				(void) fprintf(stderr, "%s: -h requires an "
@@ -1789,8 +1895,8 @@ main(int argc, char *argv[])
 				return (E_USAGE);
 			}
 
-			if ((pv = strrchr(g_cmdv[0].dc_arg, '.')) == NULL ||
-			    strcmp(pv, ".d") != 0) {
+			if ((p = strrchr(g_cmdv[0].dc_arg, '.')) == NULL ||
+			    strcmp(p, ".d") != 0) {
 				(void) fprintf(stderr, "%s: -h requires an "
 				    "output file if no scripts are "
 				    "specified\n", g_pname);
@@ -1798,9 +1904,9 @@ main(int argc, char *argv[])
 				return (E_USAGE);
 			}
 
-			pv[0] = '\0'; /* strip .d suffix */
-			g_ofile = pv = g_cmdv[0].dc_ofile;
-			(void) snprintf(pv, sizeof (g_cmdv[0].dc_ofile),
+			p[0] = '\0'; /* strip .d suffix */
+			g_ofile = p = g_cmdv[0].dc_ofile;
+			(void) snprintf(p, sizeof (g_cmdv[0].dc_ofile),
 			    "%s.h", basename(g_cmdv[0].dc_arg));
 		}
 
@@ -1843,20 +1949,7 @@ main(int argc, char *argv[])
 	if (opt != DTRACEOPT_UNSET)
 		notice("allowing destructive actions\n");
 
-	(void) sigemptyset(&act.sa_mask);
-	act.sa_flags = 0;
-	act.sa_handler = intr;
-
-	if (sigaction(SIGINT, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN)
-		(void) sigaction(SIGINT, &act, NULL);
-
-	if (sigaction(SIGTERM, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN)
-		(void) sigaction(SIGTERM, &act, NULL);
-
-#if !defined(sun)
-	if (sigaction(SIGUSR1, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN)
-		(void) sigaction(SIGUSR1, &act, NULL);
-#endif
+	installsighands();
 
 	/*
 	 * Now that tracing is active and we are ready to consume trace data,
@@ -1872,6 +1965,13 @@ main(int argc, char *argv[])
 		if (!g_intr && !done)
 			dtrace_sleep(g_dtp);
 
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+		if (g_siginfo) {
+			(void)dtrace_aggregate_print(g_dtp, g_ofp, NULL);
+			g_siginfo = 0;
+		}
+#endif
+
 		if (g_newline) {
 			/*
 			 * Output a newline just to make the output look
Index: src/external/cddl/osnet/dist/cmd/zdb/zdb.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zdb/zdb.c,v
retrieving revision 1.6
diff -u -p -r1.6 zdb.c
--- src/external/cddl/osnet/dist/cmd/zdb/zdb.c	28 Mar 2014 03:18:24 -0000	1.6
+++ src/external/cddl/osnet/dist/cmd/zdb/zdb.c	27 Mar 2017 06:26:21 -0000
@@ -18,12 +18,15 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <stdio.h>
+#include <unistd.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
 #include <ctype.h>
@@ -34,6 +37,9 @@
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab_impl.h>
@@ -52,22 +58,30 @@
 #include <sys/zfs_fuid.h>
 #include <sys/arc.h>
 #include <sys/ddt.h>
-#undef ZFS_MAXNAMELEN
+#include <sys/zfeature.h>
+#include <zfs_comutil.h>
 #undef verify
 #include <libzfs.h>
 
-#define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \
-    zio_compress_table[(idx)].ci_name : "UNKNOWN")
-#define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \
-    zio_checksum_table[(idx)].ci_name : "UNKNOWN")
-#define	ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \
-    dmu_ot[(idx)].ot_name : "UNKNOWN")
-#define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : DMU_OT_NUMTYPES)
+#define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
+	zio_compress_table[(idx)].ci_name : "UNKNOWN")
+#define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ?	\
+	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
+#define	ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ?	\
+	dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ?	\
+	dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN")
+#define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) :		\
+	(((idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA) ?	\
+	DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))
 
 #ifndef lint
-extern int zfs_recover;
+extern boolean_t zfs_recover;
+extern uint64_t zfs_arc_max, zfs_arc_meta_limit;
+extern int zfs_vdev_async_read_max_active;
 #else
-int zfs_recover;
+boolean_t zfs_recover;
+uint64_t zfs_arc_max, zfs_arc_meta_limit;
+int zfs_vdev_async_read_max_active;
 #endif
 
 const char cmdname[] = "zdb";
@@ -76,9 +90,12 @@ uint8_t dump_opt[256];
 typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
 
 extern void dump_intent_log(zilog_t *);
-uint64_t *zopt_object = NULL;
-int zopt_objects = 0;
-libzfs_handle_t *g_zfs;
+static uint64_t *zopt_object = NULL;
+static int zopt_objects = 0;
+static libzfs_handle_t *g_zfs;
+static uint64_t max_inflight = 1000;
+
+static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *);
 
 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
@@ -100,13 +117,17 @@ static void
 usage(void)
 {
 	(void) fprintf(stderr,
-	    "Usage: %s [-CumdibcsvhL] poolname [object...]\n"
-	    "       %s [-div] dataset [object...]\n"
-	    "       %s -m [-L] poolname [vdev [metaslab...]]\n"
-	    "       %s -R poolname vdev:offset:size[:flags]\n"
-	    "       %s -S poolname\n"
-	    "       %s -l [-u] device\n"
-	    "       %s -C\n\n",
+	    "Usage: %s [-CumMdibcsDvhLXFPAG] [-t txg] [-e [-p path...]] "
+	    "[-U config] [-I inflight I/Os] [-x dumpdir] poolname [object...]\n"
+	    "       %s [-divPA] [-e -p path...] [-U config] dataset "
+	    "[object...]\n"
+	    "       %s -mM [-LXFPA] [-t txg] [-e [-p path...]] [-U config] "
+	    "poolname [vdev [metaslab...]]\n"
+	    "       %s -R [-A] [-e [-p path...]] poolname "
+	    "vdev:offset:size[:flags]\n"
+	    "       %s -S [-PA] [-e [-p path...]] [-U config] poolname\n"
+	    "       %s -l [-uA] device\n"
+	    "       %s -C [-A] [-U config]\n\n",
 	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
 
 	(void) fprintf(stderr, "    Dataset name must include at least one "
@@ -123,9 +144,11 @@ usage(void)
 	(void) fprintf(stderr, "        -h pool history\n");
 	(void) fprintf(stderr, "        -b block statistics\n");
 	(void) fprintf(stderr, "        -m metaslabs\n");
+	(void) fprintf(stderr, "        -M metaslab groups\n");
 	(void) fprintf(stderr, "        -c checksum all metadata (twice for "
 	    "all data) blocks\n");
 	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
+	(void) fprintf(stderr, "        -D dedup statistics\n");
 	(void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
 	(void) fprintf(stderr, "        -v verbose (applies to all others)\n");
 	(void) fprintf(stderr, "        -l dump label contents\n");
@@ -134,7 +157,7 @@ usage(void)
 	(void) fprintf(stderr, "        -R read and display block from a "
 	    "device\n\n");
 	(void) fprintf(stderr, "    Below options are intended for use "
-	    "with other options (except -l):\n");
+	    "with other options:\n");
 	(void) fprintf(stderr, "        -A ignore assertions (-A), enable "
 	    "panic recovery (-AA) or both (-AAA)\n");
 	(void) fprintf(stderr, "        -F attempt automatic rewind within "
@@ -147,14 +170,31 @@ usage(void)
 	    "has altroot/not in a cachefile\n");
 	(void) fprintf(stderr, "        -p <path> -- use one or more with "
 	    "-e to specify path to vdev dir\n");
+	(void) fprintf(stderr, "        -x <dumpdir> -- "
+	    "dump all read blocks into specified directory\n");
+	(void) fprintf(stderr, "        -P print numbers in parseable form\n");
 	(void) fprintf(stderr, "        -t <txg> -- highest txg to use when "
 	    "searching for uberblocks\n");
+	(void) fprintf(stderr, "        -I <number of inflight I/Os> -- "
+	    "specify the maximum number of "
+	    "checksumming I/Os [default is 1000]\n");
+	(void) fprintf(stderr, "        -G dump zfs_dbgmsg buffer before "
+	    "exiting\n");
 	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
 	    "to make only that option verbose\n");
 	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
 	exit(1);
 }
 
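+/*
+ * Dump the zfs_dbgmsg buffer when -G was given; fatal() also calls this
+ * so debug messages are not lost on error exits.
+ */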
+static void
+dump_debug_buffer()
+{
+	if (dump_opt['G']) {
+		(void) printf("\n");
+		zfs_dbgmsg_print("zdb");
+	}
+}
+
 /*
  * Called for usage errors that are discovered after a call to spa_open(),
  * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
@@ -171,6 +211,8 @@ fatal(const char *fmt, ...)
 	va_end(ap);
 	(void) fprintf(stderr, "\n");
 
+	dump_debug_buffer();
+
 	exit(1);
 }
 
@@ -193,18 +235,48 @@ dump_packed_nvlist(objset_t *os, uint64_
 	nvlist_free(nv);
 }
 
-const char dump_zap_stars[] = "****************************************";
-const int dump_zap_width = sizeof (dump_zap_stars) - 1;
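+/*
+ * Object viewer for SPA history offsets: decode the bonus buffer as a
+ * spa_history_phys_t and print its cursor fields.
+ */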
+/* ARGSUSED */
+static void
+dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	spa_history_phys_t *shp = data;
+
+	if (shp == NULL)
+		return;
+
+	(void) printf("\t\tpool_create_len = %llu\n",
+	    (u_longlong_t)shp->sh_pool_create_len);
+	(void) printf("\t\tphys_max_off = %llu\n",
+	    (u_longlong_t)shp->sh_phys_max_off);
+	(void) printf("\t\tbof = %llu\n",
+	    (u_longlong_t)shp->sh_bof);
+	(void) printf("\t\teof = %llu\n",
+	    (u_longlong_t)shp->sh_eof);
+	(void) printf("\t\trecords_lost = %llu\n",
+	    (u_longlong_t)shp->sh_records_lost);
+}
 
 static void
-dump_zap_histogram(uint64_t histo[ZAP_HISTOGRAM_SIZE])
+zdb_nicenum(uint64_t num, char *buf)
+{
+	if (dump_opt['P'])
+		(void) sprintf(buf, "%llu", (u_longlong_t)num);
+	else
+		nicenum(num, buf);
+}
+
+const char histo_stars[] = "****************************************";
+const int histo_width = sizeof (histo_stars) - 1;
+
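+/*
+ * Print histo[] as rows of stars scaled to histo_width, from the first
+ * through the last non-empty bucket; "offset" biases the printed bucket
+ * index (used to label space map buckets relative to sm_shift).
+ */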
+static void
+dump_histogram(const uint64_t *histo, int size, int offset)
 {
 	int i;
-	int minidx = ZAP_HISTOGRAM_SIZE - 1;
+	int minidx = size - 1;
 	int maxidx = 0;
 	uint64_t max = 0;
 
-	for (i = 0; i < ZAP_HISTOGRAM_SIZE; i++) {
+	for (i = 0; i < size; i++) {
 		if (histo[i] > max)
 			max = histo[i];
 		if (histo[i] > 0 && i > maxidx)
@@ -213,12 +285,14 @@ dump_zap_histogram(uint64_t histo[ZAP_HI
 			minidx = i;
 	}
 
-	if (max < dump_zap_width)
-		max = dump_zap_width;
+	if (max < histo_width)
+		max = histo_width;
 
-	for (i = minidx; i <= maxidx; i++)
-		(void) printf("\t\t\t%u: %6llu %s\n", i, (u_longlong_t)histo[i],
-		    &dump_zap_stars[(max - histo[i]) * dump_zap_width / max]);
+	for (i = minidx; i <= maxidx; i++) {
+		(void) printf("\t\t\t%3u: %6llu %s\n",
+		    i + offset, (u_longlong_t)histo[i],
+		    &histo_stars[(max - histo[i]) * histo_width / max]);
+	}
 }
 
 static void
@@ -269,19 +343,19 @@ dump_zap_stats(objset_t *os, uint64_t ob
 	    (u_longlong_t)zs.zs_salt);
 
 	(void) printf("\t\tLeafs with 2^n pointers:\n");
-	dump_zap_histogram(zs.zs_leafs_with_2n_pointers);
+	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
 
 	(void) printf("\t\tBlocks with n*5 entries:\n");
-	dump_zap_histogram(zs.zs_blocks_with_n5_entries);
+	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
 
 	(void) printf("\t\tBlocks n/10 full:\n");
-	dump_zap_histogram(zs.zs_blocks_n_tenths_full);
+	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
 
 	(void) printf("\t\tEntries with n chunks:\n");
-	dump_zap_histogram(zs.zs_entries_using_n_chunks);
+	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
 
 	(void) printf("\t\tBuckets with n entries:\n");
-	dump_zap_histogram(zs.zs_buckets_with_n_entries);
+	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
 }
 
 /*ARGSUSED*/
@@ -359,6 +433,79 @@ dump_zap(objset_t *os, uint64_t object, 
 	zap_cursor_fini(&zc);
 }
 
+static void
+dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	bpobj_phys_t *bpop = data;
+	char bytes[32], comp[32], uncomp[32];
+
+	if (bpop == NULL)
+		return;
+
+	zdb_nicenum(bpop->bpo_bytes, bytes);
+	zdb_nicenum(bpop->bpo_comp, comp);
+	zdb_nicenum(bpop->bpo_uncomp, uncomp);
+
+	(void) printf("\t\tnum_blkptrs = %llu\n",
+	    (u_longlong_t)bpop->bpo_num_blkptrs);
+	(void) printf("\t\tbytes = %s\n", bytes);
+	if (size >= BPOBJ_SIZE_V1) {
+		(void) printf("\t\tcomp = %s\n", comp);
+		(void) printf("\t\tuncomp = %s\n", uncomp);
+	}
+	if (size >= sizeof (*bpop)) {
+		(void) printf("\t\tsubobjs = %llu\n",
+		    (u_longlong_t)bpop->bpo_subobjs);
+		(void) printf("\t\tnum_subobjs = %llu\n",
+		    (u_longlong_t)bpop->bpo_num_subobjs);
+	}
+
+	if (dump_opt['d'] < 5)
+		return;
+
+	for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) {
+		char blkbuf[BP_SPRINTF_LEN];
+		blkptr_t bp;
+
+		int err = dmu_read(os, object,
+		    i * sizeof (bp), sizeof (bp), &bp, 0);
+		if (err != 0) {
+			(void) printf("got error %u from dmu_read\n", err);
+			break;
+		}
+		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp);
+		(void) printf("\t%s\n", blkbuf);
+	}
+}
+
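+/*
+ * Print a bpobj's subobj array: read the whole array and list the
+ * entries through the last nonzero ID, skipping any trailing zeroes.
+ */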
+/* ARGSUSED */
+static void
+dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dmu_object_info_t doi;
+
+	VERIFY0(dmu_object_info(os, object, &doi));
+	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
+
+	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
+	if (err != 0) {
+		(void) printf("got error %u from dmu_read\n", err);
+		kmem_free(subobjs, doi.doi_max_offset);
+		return;
+	}
+
+	int64_t last_nonzero = -1;
+	for (uint64_t i = 0; i < doi.doi_max_offset / 8; i++) {
+		if (subobjs[i] != 0)
+			last_nonzero = i;
+	}
+
+	for (int64_t i = 0; i <= last_nonzero; i++) {
+		(void) printf("\t%llu\n", (longlong_t)subobjs[i]);
+	}
+	kmem_free(subobjs, doi.doi_max_offset);
+}
+
 /*ARGSUSED*/
 static void
 dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
@@ -369,6 +516,71 @@ dump_ddt_zap(objset_t *os, uint64_t obje
 
 /*ARGSUSED*/
 static void
+dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = ", attr.za_name);
+		if (attr.za_num_integers == 0) {
+			(void) printf("\n");
+			continue;
+		}
+		(void) printf(" %llx : [%d:%d:%d]\n",
+		    (u_longlong_t)attr.za_first_integer,
+		    (int)ATTR_LENGTH(attr.za_first_integer),
+		    (int)ATTR_BSWAP(attr.za_first_integer),
+		    (int)ATTR_NUM(attr.za_first_integer));
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*ARGSUSED*/
+static void
+dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	uint16_t *layout_attrs;
+	int i;
+
+	dump_zap_stats(os, object);
+	(void) printf("\n");
+
+	for (zap_cursor_init(&zc, os, object);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		(void) printf("\t\t%s = [", attr.za_name);
+		if (attr.za_num_integers == 0) {
+			(void) printf("\n");
+			continue;
+		}
+
+		VERIFY(attr.za_integer_length == 2);
+		layout_attrs = umem_zalloc(attr.za_num_integers *
+		    attr.za_integer_length, UMEM_NOFAIL);
+
+		VERIFY(zap_lookup(os, object, attr.za_name,
+		    attr.za_integer_length,
+		    attr.za_num_integers, layout_attrs) == 0);
+
+		for (i = 0; i != attr.za_num_integers; i++)
+			(void) printf(" %d ", (int)layout_attrs[i]);
+		(void) printf("]\n");
+		umem_free(layout_attrs,
+		    attr.za_num_integers * attr.za_integer_length);
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*ARGSUSED*/
+static void
 dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
 {
 	zap_cursor_t zc;
@@ -405,26 +617,89 @@ dump_zpldir(objset_t *os, uint64_t objec
 	zap_cursor_fini(&zc);
 }
 
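+/*
+ * Count the leaf-vdev DTL space maps whose on-disk header is a full
+ * space_map_phys_t, i.e. those carrying the histogram added by the
+ * SPACEMAP_HISTOGRAM feature.
+ */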
+int
+get_dtl_refcount(vdev_t *vd)
+{
+	int refcount = 0;
+
+	if (vd->vdev_ops->vdev_op_leaf) {
+		space_map_t *sm = vd->vdev_dtl_sm;
+
+		if (sm != NULL &&
+		    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
+			return (1);
+		return (0);
+	}
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		refcount += get_dtl_refcount(vd->vdev_child[c]);
+	return (refcount);
+}
+
+int
+get_metaslab_refcount(vdev_t *vd)
+{
+	int refcount = 0;
+
+	if (vd->vdev_top == vd && !vd->vdev_removing) {
+		for (int m = 0; m < vd->vdev_ms_count; m++) {
+			space_map_t *sm = vd->vdev_ms[m]->ms_sm;
+
+			if (sm != NULL &&
+			    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
+				refcount++;
+		}
+	}
+	for (int c = 0; c < vd->vdev_children; c++)
+		refcount += get_metaslab_refcount(vd->vdev_child[c]);
+
+	return (refcount);
+}
+
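+/*
+ * Cross-check the SPACEMAP_HISTOGRAM feature refcount against the number
+ * of DTL and metaslab space maps found with histogram-bearing headers;
+ * returns 2 on a mismatch so the pool is reported inconsistent.
+ */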
+static int
+verify_spacemap_refcounts(spa_t *spa)
+{
+	uint64_t expected_refcount = 0;
+	uint64_t actual_refcount;
+
+	(void) feature_get_refcount(spa,
+	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
+	    &expected_refcount);
+	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
+	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
+
+	if (expected_refcount != actual_refcount) {
+		(void) printf("space map refcount mismatch: expected %lld != "
+		    "actual %lld\n",
+		    (longlong_t)expected_refcount,
+		    (longlong_t)actual_refcount);
+		return (2);
+	}
+	return (0);
+}
+
 static void
-dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
+dump_spacemap(objset_t *os, space_map_t *sm)
 {
 	uint64_t alloc, offset, entry;
-	uint8_t mapshift = sm->sm_shift;
-	uint64_t mapstart = sm->sm_start;
 	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
 			    "INVALID", "INVALID", "INVALID", "INVALID" };
 
-	if (smo->smo_object == 0)
+	if (sm == NULL)
 		return;
 
 	/*
 	 * Print out the freelist entries in both encoded and decoded form.
 	 */
 	alloc = 0;
-	for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
-		VERIFY(0 == dmu_read(os, smo->smo_object, offset,
+	for (offset = 0; offset < space_map_length(sm);
+	    offset += sizeof (entry)) {
+		uint8_t mapshift = sm->sm_shift;
+
+		VERIFY0(dmu_read(os, space_map_object(sm), offset,
 		    sizeof (entry), &entry, DMU_READ_PREFETCH));
 		if (SM_DEBUG_DECODE(entry)) {
 			(void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
 			    (u_longlong_t)(offset / sizeof (entry)),
 			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
@@ -436,10 +711,10 @@ dump_spacemap(objset_t *os, space_map_ob
 			    (u_longlong_t)(offset / sizeof (entry)),
 			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
 			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
-			    mapshift) + mapstart),
+			    mapshift) + sm->sm_start),
 			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
-			    mapshift) + mapstart + (SM_RUN_DECODE(entry) <<
-			    mapshift)),
+			    mapshift) + sm->sm_start +
+			    (SM_RUN_DECODE(entry) << mapshift)),
 			    (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
 			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
 				alloc += SM_RUN_DECODE(entry) << mapshift;
@@ -447,26 +722,28 @@ dump_spacemap(objset_t *os, space_map_ob
 				alloc -= SM_RUN_DECODE(entry) << mapshift;
 		}
 	}
-	if (alloc != smo->smo_alloc) {
+	if (alloc != space_map_allocated(sm)) {
 		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
 		    "with space map summary (%llu)\n",
-		    (u_longlong_t)smo->smo_alloc, (u_longlong_t)alloc);
+		    (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
 	}
 }
 
 static void
 dump_metaslab_stats(metaslab_t *msp)
 {
-	char maxbuf[5];
-	space_map_t *sm = &msp->ms_map;
-	avl_tree_t *t = sm->sm_pp_root;
-	int free_pct = sm->sm_space * 100 / sm->sm_size;
+	char maxbuf[32];
+	range_tree_t *rt = msp->ms_tree;
+	avl_tree_t *t = &msp->ms_size_tree;
+	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
-	nicenum(space_map_maxsize(sm), maxbuf, sizeof(maxbuf));
+	zdb_nicenum(metaslab_block_maxsize(msp), maxbuf);
 
 	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
 	    "segments", avl_numnodes(t), "maxsize", maxbuf,
 	    "freepct", free_pct);
+	(void) printf("\tIn-memory histogram:\n");
+	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
 }
 
 static void
@@ -474,33 +751,45 @@ dump_metaslab(metaslab_t *msp)
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
-	space_map_t *sm = &msp->ms_map;
-	space_map_obj_t *smo = &msp->ms_smo;
-	char freebuf[5];
+	space_map_t *sm = msp->ms_sm;
+	char freebuf[32];
 
-	nicenum(sm->sm_size - smo->smo_alloc, freebuf, sizeof(freebuf));
+	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf);
 
 	(void) printf(
 	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
-	    (u_longlong_t)(sm->sm_start / sm->sm_size),
-	    (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf);
+	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
+	    (u_longlong_t)space_map_object(sm), freebuf);
 
-	if (dump_opt['m'] > 1 && !dump_opt['L']) {
+	if (dump_opt['m'] > 2 && !dump_opt['L']) {
 		mutex_enter(&msp->ms_lock);
-		space_map_load_wait(sm);
-		if (!sm->sm_loaded)
-			VERIFY(space_map_load(sm, zfs_metaslab_ops,
-			    SM_FREE, smo, spa->spa_meta_objset) == 0);
+		metaslab_load_wait(msp);
+		if (!msp->ms_loaded) {
+			VERIFY0(metaslab_load(msp));
+			range_tree_stat_verify(msp->ms_tree);
+		}
 		dump_metaslab_stats(msp);
-		space_map_unload(sm);
+		metaslab_unload(msp);
 		mutex_exit(&msp->ms_lock);
 	}
 
-	if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
-		ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift));
+	if (dump_opt['m'] > 1 && sm != NULL &&
+	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+		/*
+		 * The space map histogram represents free space in chunks
+		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
+		 */
+		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
+		    (u_longlong_t)msp->ms_fragmentation);
+		dump_histogram(sm->sm_phys->smp_histogram,
+		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
+	}
+
+	if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
+		ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
 
 		mutex_enter(&msp->ms_lock);
-		dump_spacemap(spa->spa_meta_objset, smo, sm);
+		dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
 		mutex_exit(&msp->ms_lock);
 	}
 }
@@ -518,6 +807,47 @@ print_vdev_metaslab_header(vdev_t *vd)
 }
 
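+/*
+ * -M: verify the in-core metaslab group histograms, then print each
+ * top-level vdev's metaslab count and fragmentation plus a pool total.
+ */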
 static void
+dump_metaslab_groups(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	metaslab_class_t *mc = spa_normal_class(spa);
+	uint64_t fragmentation;
+
+	metaslab_class_histogram_verify(mc);
+
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
+
+		if (mg->mg_class != mc)
+			continue;
+
+		metaslab_group_histogram_verify(mg);
+		mg->mg_fragmentation = metaslab_group_fragmentation(mg);
+
+		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
+		    "fragmentation",
+		    (u_longlong_t)tvd->vdev_id,
+		    (u_longlong_t)tvd->vdev_ms_count);
+		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
+			(void) printf("%3s\n", "-");
+		} else {
+			(void) printf("%3llu%%\n",
+			    (u_longlong_t)mg->mg_fragmentation);
+		}
+		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
+	}
+
+	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
+	fragmentation = metaslab_class_fragmentation(mc);
+	if (fragmentation == ZFS_FRAG_INVALID)
+		(void) printf("\t%3s\n", "-");
+	else
+		(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
+	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
+}
+
+static void
 dump_metaslabs(spa_t *spa)
 {
 	vdev_t *vd, *rvd = spa->spa_root_vdev;
@@ -572,7 +902,7 @@ dump_dde(const ddt_t *ddt, const ddt_ent
 		if (ddp->ddp_phys_birth == 0)
 			continue;
 		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
-		snprintf_blkptr(blkbuf, sizeof(blkbuf), &blk);
+		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
 		(void) printf("index %llx refcnt %llu %s %s\n",
 		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
 		    types[p], blkbuf);
@@ -617,13 +947,15 @@ dump_ddt(ddt_t *ddt, enum ddt_type type,
 		return;
 	ASSERT(error == 0);
 
-	count = ddt_object_count(ddt, type, class);
+	error = ddt_object_count(ddt, type, class, &count);
+	ASSERT(error == 0);
+	if (count == 0)
+		return;
+
 	dspace = doi.doi_physical_blocks_512 << 9;
 	mspace = doi.doi_fill_count * doi.doi_data_block_size;
 
-	ASSERT(count != 0);	/* we should have destroyed it */
-
-	ddt_object_name(ddt, type, class, name, sizeof(name));
+	ddt_object_name(ddt, type, class, name);
 
 	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
 	    name,
@@ -687,9 +1019,9 @@ dump_all_ddts(spa_t *spa)
 }
 
 static void
-dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
+dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
 {
-	char *prefix = (void *)sm;
+	char *prefix = arg;
 
 	(void) printf("%s [%llu,%llu) length %llu\n",
 	    prefix,
@@ -719,28 +1051,32 @@ dump_dtl(vdev_t *vd, int indent)
 	    required ? "DTL-required" : "DTL-expendable");
 
 	for (int t = 0; t < DTL_TYPES; t++) {
-		space_map_t *sm = &vd->vdev_dtl[t];
-		if (sm->sm_space == 0)
+		range_tree_t *rt = vd->vdev_dtl[t];
+		if (range_tree_space(rt) == 0)
 			continue;
 		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
 		    indent + 2, "", name[t]);
-		mutex_enter(sm->sm_lock);
-		space_map_walk(sm, dump_dtl_seg, (void *)prefix);
-		mutex_exit(sm->sm_lock);
+		mutex_enter(rt->rt_lock);
+		range_tree_walk(rt, dump_dtl_seg, prefix);
+		mutex_exit(rt->rt_lock);
 		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
-			dump_spacemap(spa->spa_meta_objset,
-			    &vd->vdev_dtl_smo, sm);
+			dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm);
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		dump_dtl(vd->vdev_child[c], indent + 4);
 }
 
+/* from spa_history.c: spa_history_create_obj() */
+#define	HIS_BUF_LEN_DEF	(128 << 10)
+#define	HIS_BUF_LEN_MAX	(1 << 30)
+
 static void
 dump_history(spa_t *spa)
 {
 	nvlist_t **events = NULL;
-	char buf[SPA_MAXBLOCKSIZE];
+	char *buf = NULL;
+	uint64_t bufsize = HIS_BUF_LEN_DEF;
 	uint64_t resid, len, off = 0;
 	uint_t num = 0;
 	int error;
@@ -749,8 +1085,11 @@ dump_history(spa_t *spa)
 	char tbuf[30];
 	char internalstr[MAXPATHLEN];
 
+	if ((buf = malloc(bufsize)) == NULL) {
+		(void) fprintf(stderr, "Unable to read history: "
+		    "out of memory\n");
+		return;
+	}
 	do {
-		len = sizeof (buf);
+		len = bufsize;
 
 		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
 			(void) fprintf(stderr, "Unable to read history: "
@@ -760,34 +1099,52 @@ dump_history(spa_t *spa)
 
 		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
 			break;
-
 		off -= resid;
+
+		/*
+		 * If the history block is too big, double the buffer
+		 * size and try again.
+		 */
+		if (resid == len) {
+			free(buf);
+			buf = NULL;
+
+			bufsize <<= 1;
+			if ((bufsize >= HIS_BUF_LEN_MAX) ||
+			    ((buf = malloc(bufsize)) == NULL)) {
+				(void) fprintf(stderr, "Unable to read history: "
+				    "out of memory\n");
+				return;
+			}
+		}
 	} while (len != 0);
+	free(buf);
 
 	(void) printf("\nHistory:\n");
 	for (int i = 0; i < num; i++) {
 		uint64_t time, txg, ievent;
 		char *cmd, *intstr;
+		boolean_t printed = B_FALSE;
 
 		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
 		    &time) != 0)
-			continue;
+			goto next;
 		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
 		    &cmd) != 0) {
 			if (nvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
-				continue;
+				goto next;
 			verify(nvlist_lookup_uint64(events[i],
 			    ZPOOL_HIST_TXG, &txg) == 0);
 			verify(nvlist_lookup_string(events[i],
 			    ZPOOL_HIST_INT_STR, &intstr) == 0);
-			if (ievent >= LOG_END)
-				continue;
+			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
+				goto next;
 
 			(void) snprintf(internalstr,
 			    sizeof (internalstr),
 			    "[internal %s txg:%lld] %s",
-			    hist_event_table[ievent], txg,
+			    zfs_history_event_names[ievent], txg,
 			    intstr);
 			cmd = internalstr;
 		}
@@ -795,6 +1152,14 @@ dump_history(spa_t *spa)
 		(void) localtime_r(&tsec, &t);
 		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
 		(void) printf("%s %s\n", tbuf, cmd);
+		printed = B_TRUE;
+
+next:
+		if (dump_opt['h'] > 1) {
+			if (!printed)
+				(void) printf("unrecognized record:\n");
+			dump_nvlist(events[i], 2);
+		}
 	}
 }
 
@@ -805,7 +1170,8 @@ dump_dnode(objset_t *os, uint64_t object
 }
 
 static uint64_t
-blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb)
+blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
+    const zbookmark_phys_t *zb)
 {
 	if (dnp == NULL) {
 		ASSERT(zb->zb_level < 0);
@@ -822,47 +1188,63 @@ blkid2offset(const dnode_phys_t *dnp, co
 }
 
 static void
-snprintf_blkptr_compact(char *blkbuf, size_t blklen, blkptr_t *bp)
+snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
 {
-	dva_t *dva = bp->blk_dva;
+	const dva_t *dva = bp->blk_dva;
 	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
-	size_t len;
 
-	if (dump_opt['b'] >= 5) {
-		snprintf_blkptr(blkbuf, blklen, bp);
+	if (dump_opt['b'] >= 6) {
+		snprintf_blkptr(blkbuf, buflen, bp);
 		return;
 	}
 
-	blkbuf[0] = '\0';
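+	/*
+	 * Embedded block pointers carry their payload in the bp itself,
+	 * so there are no DVAs to decode.
+	 */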
+	if (BP_IS_EMBEDDED(bp)) {
+		(void) sprintf(blkbuf,
+		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
+		    (int)BPE_GET_ETYPE(bp),
+		    (u_longlong_t)BPE_GET_LSIZE(bp),
+		    (u_longlong_t)BPE_GET_PSIZE(bp),
+		    (u_longlong_t)bp->blk_birth);
+		return;
+	}
 
-	len = 0;
-	for (int i = 0; i < ndvas; i++) {
-		len += snprintf(blkbuf + len, blklen - len, "%llu:%llx:%llx ",
+	blkbuf[0] = '\0';
+	for (int i = 0; i < ndvas; i++)
+		(void) snprintf(blkbuf + strlen(blkbuf),
+		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
 		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
 		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
 		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
-		if (len > blklen)
-			len = blklen;
-	}
 
-	snprintf(blkbuf + len, blklen - len,
-	    "%llxL/%llxP F=%llu B=%llu/%llu",
-	    (u_longlong_t)BP_GET_LSIZE(bp),
-	    (u_longlong_t)BP_GET_PSIZE(bp),
-	    (u_longlong_t)bp->blk_fill,
-	    (u_longlong_t)bp->blk_birth,
-	    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
+	if (BP_IS_HOLE(bp)) {
+		(void) snprintf(blkbuf + strlen(blkbuf),
+		    buflen - strlen(blkbuf),
+		    "%llxL B=%llu",
+		    (u_longlong_t)BP_GET_LSIZE(bp),
+		    (u_longlong_t)bp->blk_birth);
+	} else {
+		(void) snprintf(blkbuf + strlen(blkbuf),
+		    buflen - strlen(blkbuf),
+		    "%llxL/%llxP F=%llu B=%llu/%llu",
+		    (u_longlong_t)BP_GET_LSIZE(bp),
+		    (u_longlong_t)BP_GET_PSIZE(bp),
+		    (u_longlong_t)BP_GET_FILL(bp),
+		    (u_longlong_t)bp->blk_birth,
+		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
+	}
 }
 
 static void
-print_indirect(blkptr_t *bp, const zbookmark_t *zb,
+print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb,
     const dnode_phys_t *dnp)
 {
 	char blkbuf[BP_SPRINTF_LEN];
 	int l;
 
-	ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
-	ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+	if (!BP_IS_EMBEDDED(bp)) {
+		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
+		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+	}
 
 	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
 
@@ -876,13 +1258,13 @@ print_indirect(blkptr_t *bp, const zbook
 		}
 	}
 
-	snprintf_blkptr_compact(blkbuf, sizeof(blkbuf), bp);
+	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
 	(void) printf("%s\n", blkbuf);
 }
 
 static int
 visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
-    blkptr_t *bp, const zbookmark_t *zb)
+    blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	int err = 0;
 
@@ -891,23 +1273,24 @@ visit_indirect(spa_t *spa, const dnode_p
 
 	print_indirect(bp, zb, dnp);
 
-	if (BP_GET_LEVEL(bp) > 0) {
-		uint32_t flags = ARC_WAIT;
+	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
+		arc_flags_t flags = ARC_FLAG_WAIT;
 		int i;
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 		arc_buf_t *buf;
 		uint64_t fill = 0;
 
-		err = arc_read_nolock(NULL, spa, bp, arc_getbuf_func, &buf,
+		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err)
 			return (err);
+		ASSERT(buf->b_data);
 
 		/* recursively visit blocks below this */
 		cbp = buf->b_data;
 		for (i = 0; i < epb; i++, cbp++) {
-			zbookmark_t czb;
+			zbookmark_phys_t czb;
 
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
@@ -915,11 +1298,11 @@ visit_indirect(spa_t *spa, const dnode_p
 			err = visit_indirect(spa, dnp, cbp, &czb);
 			if (err)
 				break;
-			fill += cbp->blk_fill;
+			fill += BP_GET_FILL(cbp);
 		}
 		if (!err)
-			ASSERT3U(fill, ==, bp->blk_fill);
-		(void) arc_buf_remove_ref(buf, &buf);
+			ASSERT3U(fill, ==, BP_GET_FILL(bp));
+		arc_buf_destroy(buf, &buf);
 	}
 
 	return (err);
@@ -931,7 +1314,7 @@ dump_indirect(dnode_t *dn)
 {
 	dnode_phys_t *dnp = dn->dn_phys;
 	int j;
-	zbookmark_t czb;
+	zbookmark_phys_t czb;
 
 	(void) printf("Indirect blocks:\n");
 
@@ -952,7 +1335,7 @@ dump_dsl_dir(objset_t *os, uint64_t obje
 {
 	dsl_dir_phys_t *dd = data;
 	time_t crtime;
-	char nice[6];
+	char nice[32];
 
 	if (dd == NULL)
 		return;
@@ -969,15 +1352,15 @@ dump_dsl_dir(objset_t *os, uint64_t obje
 	    (u_longlong_t)dd->dd_origin_obj);
 	(void) printf("\t\tchild_dir_zapobj = %llu\n",
 	    (u_longlong_t)dd->dd_child_dir_zapobj);
-	nicenum(dd->dd_used_bytes, nice, sizeof(nice));
+	zdb_nicenum(dd->dd_used_bytes, nice);
 	(void) printf("\t\tused_bytes = %s\n", nice);
-	nicenum(dd->dd_compressed_bytes, nice, sizeof(nice));
+	zdb_nicenum(dd->dd_compressed_bytes, nice);
 	(void) printf("\t\tcompressed_bytes = %s\n", nice);
-	nicenum(dd->dd_uncompressed_bytes, nice, sizeof(nice));
+	zdb_nicenum(dd->dd_uncompressed_bytes, nice);
 	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
-	nicenum(dd->dd_quota, nice, sizeof(nice));
+	zdb_nicenum(dd->dd_quota, nice);
 	(void) printf("\t\tquota = %s\n", nice);
-	nicenum(dd->dd_reserved, nice, sizeof(nice));
+	zdb_nicenum(dd->dd_reserved, nice);
 	(void) printf("\t\treserved = %s\n", nice);
 	(void) printf("\t\tprops_zapobj = %llu\n",
 	    (u_longlong_t)dd->dd_props_zapobj);
@@ -987,7 +1370,7 @@ dump_dsl_dir(objset_t *os, uint64_t obje
 	    (u_longlong_t)dd->dd_flags);
 
 #define	DO(which) \
-	nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, sizeof(nice)); \
+	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \
 	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
 	DO(HEAD);
 	DO(SNAP);
@@ -1003,7 +1386,7 @@ dump_dsl_dataset(objset_t *os, uint64_t 
 {
 	dsl_dataset_phys_t *ds = data;
 	time_t crtime;
-	char used[6], compressed[6], uncompressed[6], unique[6];
+	char used[32], compressed[32], uncompressed[32], unique[32];
 	char blkbuf[BP_SPRINTF_LEN];
 
 	if (ds == NULL)
@@ -1011,11 +1394,11 @@ dump_dsl_dataset(objset_t *os, uint64_t 
 
 	ASSERT(size == sizeof (*ds));
 	crtime = ds->ds_creation_time;
-	nicenum(ds->ds_used_bytes, used, sizeof(used));
-	nicenum(ds->ds_compressed_bytes, compressed, sizeof(compressed));
-	nicenum(ds->ds_uncompressed_bytes, uncompressed, sizeof(uncompressed));
-	nicenum(ds->ds_unique_bytes, unique, sizeof(unique));
-	snprintf_blkptr(blkbuf, sizeof(blkbuf), &ds->ds_bp);
+	zdb_nicenum(ds->ds_referenced_bytes, used);
+	zdb_nicenum(ds->ds_compressed_bytes, compressed);
+	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed);
+	zdb_nicenum(ds->ds_unique_bytes, unique);
+	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
 
 	(void) printf("\t\tdir_obj = %llu\n",
 	    (u_longlong_t)ds->ds_dir_obj);
@@ -1053,63 +1436,166 @@ dump_dsl_dataset(objset_t *os, uint64_t 
 	(void) printf("\t\tbp = %s\n", blkbuf);
 }
 
+/* ARGSUSED */
+static int
+dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	char blkbuf[BP_SPRINTF_LEN];
+
+	if (bp->blk_birth != 0) {
+		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+		(void) printf("\t%s\n", blkbuf);
+	}
+	return (0);
+}
+
 static void
-dump_bplist(objset_t *mos, uint64_t object, char *name)
+dump_bptree(objset_t *os, uint64_t obj, char *name)
 {
-	bplist_t bpl = { 0 };
-	blkptr_t blk, *bp = &blk;
-	uint64_t itor = 0;
-	char bytes[6];
-	char comp[6];
-	char uncomp[6];
+	char bytes[32];
+	bptree_phys_t *bt;
+	dmu_buf_t *db;
 
 	if (dump_opt['d'] < 3)
 		return;
 
-	bplist_init(&bpl);
-	VERIFY(0 == bplist_open(&bpl, mos, object));
-	if (bplist_empty(&bpl)) {
-		bplist_close(&bpl);
-		bplist_fini(&bpl);
+	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+	zdb_nicenum(bt->bt_bytes, bytes);
+	(void) printf("\n    %s: %llu datasets, %s\n",
+	    name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
+	dmu_buf_rele(db, FTAG);
+
+	if (dump_opt['d'] < 5)
 		return;
-	}
 
-	nicenum(bpl.bpl_phys->bpl_bytes, bytes, sizeof(bytes));
-	if (bpl.bpl_dbuf->db_size == sizeof (bplist_phys_t)) {
-		nicenum(bpl.bpl_phys->bpl_comp, comp, sizeof(comp));
-		nicenum(bpl.bpl_phys->bpl_uncomp, uncomp, sizeof(uncomp));
-		(void) printf("\n    %s: %llu entries, %s (%s/%s comp)\n",
-		    name, (u_longlong_t)bpl.bpl_phys->bpl_entries,
+	(void) printf("\n");
+
+	(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
+}
+
+/* ARGSUSED */
+static int
+dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	char blkbuf[BP_SPRINTF_LEN];
+
+	ASSERT(bp->blk_birth != 0);
+	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
+	(void) printf("\t%s\n", blkbuf);
+	return (0);
+}
+
+static void
+dump_full_bpobj(bpobj_t *bpo, char *name, int indent)
+{
+	char bytes[32];
+	char comp[32];
+	char uncomp[32];
+
+	if (dump_opt['d'] < 3)
+		return;
+
+	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes);
+	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
+		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
+		(void) printf("    %*s: object %llu, %llu local blkptrs, "
+		    "%llu subobjs in object %llu, %s (%s/%s comp)\n",
+		    indent * 8, name,
+		    (u_longlong_t)bpo->bpo_object,
+		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+		    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
+		    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
 		    bytes, comp, uncomp);
+
+		for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
+			uint64_t subobj;
+			bpobj_t subbpo;
+			int error;
+			VERIFY0(dmu_read(bpo->bpo_os,
+			    bpo->bpo_phys->bpo_subobjs,
+			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
+			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
+			if (error != 0) {
+				(void) printf("ERROR %u while trying to open "
+				    "subobj id %llu\n",
+				    error, (u_longlong_t)subobj);
+				continue;
+			}
+			dump_full_bpobj(&subbpo, "subobj", indent + 1);
+			bpobj_close(&subbpo);
+		}
 	} else {
-		(void) printf("\n    %s: %llu entries, %s\n",
-		    name, (u_longlong_t)bpl.bpl_phys->bpl_entries, bytes);
+		(void) printf("    %*s: object %llu, %llu blkptrs, %s\n",
+		    indent * 8, name,
+		    (u_longlong_t)bpo->bpo_object,
+		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+		    bytes);
 	}
 
-	if (dump_opt['d'] < 5) {
-		bplist_close(&bpl);
-		bplist_fini(&bpl);
+	if (dump_opt['d'] < 5)
 		return;
+
+	if (indent == 0) {
+		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
+		(void) printf("\n");
 	}
+}
 
-	(void) printf("\n");
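+/*
+ * Print a dataset's deadlist: old-format lists are a single bpobj,
+ * new-format lists an AVL tree of per-mintxg bpobjs.
+ */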
+static void
+dump_deadlist(dsl_deadlist_t *dl)
+{
+	dsl_deadlist_entry_t *dle;
+	uint64_t unused;
+	char bytes[32];
+	char comp[32];
+	char uncomp[32];
 
-	while (bplist_iterate(&bpl, &itor, bp) == 0) {
-		char blkbuf[BP_SPRINTF_LEN];
+	if (dump_opt['d'] < 3)
+		return;
 
-		ASSERT(bp->blk_birth != 0);
-		snprintf_blkptr_compact(blkbuf, sizeof(blkbuf), bp);
-		(void) printf("\tItem %3llu: %s\n",
-		    (u_longlong_t)itor - 1, blkbuf);
+	if (dl->dl_oldfmt) {
+		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
+		return;
 	}
 
-	bplist_close(&bpl);
-	bplist_fini(&bpl);
+	zdb_nicenum(dl->dl_phys->dl_used, bytes);
+	zdb_nicenum(dl->dl_phys->dl_comp, comp);
+	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp);
+	(void) printf("\n    Deadlist: %s (%s/%s comp)\n",
+	    bytes, comp, uncomp);
+
+	if (dump_opt['d'] < 4)
+		return;
+
+	(void) printf("\n");
+
+	/* force the tree to be loaded */
+	dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
+
+	for (dle = avl_first(&dl->dl_tree); dle;
+	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
+		if (dump_opt['d'] >= 5) {
+			char buf[128];
+			(void) snprintf(buf, sizeof (buf), "mintxg %llu -> "
+			    "obj %llu", (longlong_t)dle->dle_mintxg,
+			    (longlong_t)dle->dle_bpobj.bpo_object);
+			dump_full_bpobj(&dle->dle_bpobj, buf, 0);
+		} else {
+			(void) printf("mintxg %llu -> obj %llu\n",
+			    (longlong_t)dle->dle_mintxg,
+			    (longlong_t)dle->dle_bpobj.bpo_object);
+		}
+	}
 }
 
 static avl_tree_t idx_tree;
 static avl_tree_t domain_tree;
 static boolean_t fuid_table_loaded;
+static boolean_t sa_loaded;
+sa_attr_type_t *sa_attr_table;
 
 static void
 fuid_table_destroy()
@@ -1124,7 +1610,7 @@ fuid_table_destroy()
  * print uid or gid information.
  * For normal POSIX id just the id is printed in decimal format.
  * For CIFS files with FUID the fuid is printed in hex followed by
- * the doman-rid string.
+ * the domain-rid string.
  */
 static void
 print_idstr(uint64_t id, const char *id_type)
@@ -1142,12 +1628,12 @@ print_idstr(uint64_t id, const char *id_
 }
 
 static void
-dump_uidgid(objset_t *os, znode_phys_t *zp)
+dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
 {
 	uint32_t uid_idx, gid_idx;
 
-	uid_idx = FUID_INDEX(zp->zp_uid);
-	gid_idx = FUID_INDEX(zp->zp_gid);
+	uid_idx = FUID_INDEX(uid);
+	gid_idx = FUID_INDEX(gid);
 
 	/* Load domain table, if not already loaded */
 	if (!fuid_table_loaded && (uid_idx || gid_idx)) {
@@ -1162,50 +1648,111 @@ dump_uidgid(objset_t *os, znode_phys_t *
 		fuid_table_loaded = B_TRUE;
 	}
 
-	print_idstr(zp->zp_uid, "uid");
-	print_idstr(zp->zp_gid, "gid");
+	print_idstr(uid, "uid");
+	print_idstr(gid, "gid");
 }
 
 /*ARGSUSED*/
 static void
 dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
 {
-	znode_phys_t *zp = data;
-	time_t z_crtime, z_atime, z_mtime, z_ctime;
 	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
+	sa_handle_t *hdl;
+	uint64_t xattr, rdev, gen;
+	uint64_t uid, gid, mode, fsize, parent, links;
+	uint64_t pflags;
+	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
+	time_t z_crtime, z_atime, z_mtime, z_ctime;
+	sa_bulk_attr_t bulk[12];
+	int idx = 0;
 	int error;
 
-	ASSERT(size >= sizeof (znode_phys_t));
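+	/*
+	 * SA znodes have no fixed bonus layout; set up the objset's SA
+	 * attribute table once and reuse it for every object.
+	 */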
+	if (!sa_loaded) {
+		uint64_t sa_attrs = 0;
+		uint64_t version;
+
+		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+		    8, 1, &version) == 0);
+		if (version >= ZPL_VERSION_SA) {
+			VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
+			    8, 1, &sa_attrs) == 0);
+		}
+		if ((error = sa_setup(os, sa_attrs, zfs_attr_table,
+		    ZPL_END, &sa_attr_table)) != 0) {
+			(void) printf("sa_setup failed errno %d, can't "
+			    "display znode contents\n", error);
+			return;
+		}
+		sa_loaded = B_TRUE;
+	}
+
+	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
+		(void) printf("Failed to get handle for SA znode\n");
+		return;
+	}
+
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
+	    &links, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
+	    &mode, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
+	    NULL, &parent, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
+	    &fsize, 8);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
+	    acctm, 16);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
+	    modtm, 16);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
+	    crtm, 16);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
+	    chgtm, 16);
+	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
+	    &pflags, 8);
+
+	if (sa_bulk_lookup(hdl, bulk, idx)) {
+		(void) sa_handle_destroy(hdl);
+		return;
+	}
 
 	error = zfs_obj_to_path(os, object, path, sizeof (path));
 	if (error != 0) {
 		(void) snprintf(path, sizeof (path), "\?\?\?<object#%llu>",
 		    (u_longlong_t)object);
 	}
-
 	if (dump_opt['d'] < 3) {
 		(void) printf("\t%s\n", path);
+		(void) sa_handle_destroy(hdl);
 		return;
 	}
 
-	z_crtime = (time_t)zp->zp_crtime[0];
-	z_atime = (time_t)zp->zp_atime[0];
-	z_mtime = (time_t)zp->zp_mtime[0];
-	z_ctime = (time_t)zp->zp_ctime[0];
+	z_crtime = (time_t)crtm[0];
+	z_atime = (time_t)acctm[0];
+	z_mtime = (time_t)modtm[0];
+	z_ctime = (time_t)chgtm[0];
 
 	(void) printf("\tpath	%s\n", path);
-	dump_uidgid(os, zp);
+	dump_uidgid(os, uid, gid);
 	(void) printf("\tatime	%s", ctime(&z_atime));
 	(void) printf("\tmtime	%s", ctime(&z_mtime));
 	(void) printf("\tctime	%s", ctime(&z_ctime));
 	(void) printf("\tcrtime	%s", ctime(&z_crtime));
-	(void) printf("\tgen	%llu\n", (u_longlong_t)zp->zp_gen);
-	(void) printf("\tmode	%llo\n", (u_longlong_t)zp->zp_mode);
-	(void) printf("\tsize	%llu\n", (u_longlong_t)zp->zp_size);
-	(void) printf("\tparent	%llu\n", (u_longlong_t)zp->zp_parent);
-	(void) printf("\tlinks	%llu\n", (u_longlong_t)zp->zp_links);
-	(void) printf("\txattr	%llu\n", (u_longlong_t)zp->zp_xattr);
-	(void) printf("\trdev	0x%016llx\n", (u_longlong_t)zp->zp_rdev);
+	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
+	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
+	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
+	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
+	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
+	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
+	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
+	    sizeof (uint64_t)) == 0)
+		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
+	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
+	    sizeof (uint64_t)) == 0)
+		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
+	sa_handle_destroy(hdl);
 }
 
 /*ARGSUSED*/
@@ -1226,8 +1773,8 @@ static object_viewer_t *object_viewer[DM
 	dump_uint64,		/* object array			*/
 	dump_none,		/* packed nvlist		*/
 	dump_packed_nvlist,	/* packed nvlist size		*/
-	dump_none,		/* bplist			*/
-	dump_none,		/* bplist header		*/
+	dump_none,		/* bpobj			*/
+	dump_bpobj,		/* bpobj header			*/
 	dump_none,		/* SPA space map header		*/
 	dump_none,		/* SPA space map		*/
 	dump_none,		/* ZIL intent log		*/
@@ -1251,7 +1798,7 @@ static object_viewer_t *object_viewer[DM
 	dump_zap,		/* other ZAP			*/
 	dump_zap,		/* persistent error log		*/
 	dump_uint8,		/* SPA history			*/
-	dump_uint64,		/* SPA history offsets		*/
+	dump_history_offsets,	/* SPA history offsets		*/
 	dump_zap,		/* Pool properties		*/
 	dump_zap,		/* DSL permissions		*/
 	dump_acl,		/* ZFS ACL			*/
@@ -1265,7 +1812,17 @@ static object_viewer_t *object_viewer[DM
 	dump_zap,		/* snapshot refcount tags	*/
 	dump_ddt_zap,		/* DDT ZAP object		*/
 	dump_zap,		/* DDT statistics		*/
-	dump_unknown		/* Unknown type, must be last	*/
+	dump_znode,		/* SA object			*/
+	dump_zap,		/* SA Master Node		*/
+	dump_sa_attrs,		/* SA attribute registration	*/
+	dump_sa_layouts,	/* SA attribute layouts		*/
+	dump_zap,		/* DSL scrub translations	*/
+	dump_none,		/* fake dedup BP		*/
+	dump_zap,		/* deadlist			*/
+	dump_none,		/* deadlist hdr			*/
+	dump_zap,		/* dsl clones			*/
+	dump_bpobj_subobjs,	/* bpobj subobjs		*/
+	dump_unknown,		/* Unknown type, must be last	*/
 };
 
 static void
@@ -1276,7 +1833,8 @@ dump_object(objset_t *os, uint64_t objec
 	dnode_t *dn;
 	void *bonus = NULL;
 	size_t bsize = 0;
-	char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], fill[7];
+	char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
+	char bonus_size[32];
 	char aux[50];
 	int error;
 
@@ -1288,7 +1846,7 @@ dump_object(objset_t *os, uint64_t objec
 	}
 
 	if (object == 0) {
-		dn = os->os_meta_dnode;
+		dn = DMU_META_DNODE(os);
 	} else {
 		error = dmu_bonus_hold(os, object, FTAG, &db);
 		if (error)
@@ -1296,16 +1854,16 @@ dump_object(objset_t *os, uint64_t objec
 			    object, error);
 		bonus = db->db_data;
 		bsize = db->db_size;
-		dn = ((dmu_buf_impl_t *)db)->db_dnode;
+		dn = DB_DNODE((dmu_buf_impl_t *)db);
 	}
 	dmu_object_info_from_dnode(dn, &doi);
 
-	nicenum(doi.doi_metadata_block_size, iblk, sizeof(iblk));
-	nicenum(doi.doi_data_block_size, dblk, sizeof(dblk));
-	nicenum(doi.doi_max_offset, lsize, sizeof(lsize));
-	nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof(asize));
-	nicenum(doi.doi_bonus_size, bonus_size, sizeof(bonus_size));
-	(void) snprintf(fill, sizeof(fill), "%6.2f", 100.0 * doi.doi_fill_count *
+	zdb_nicenum(doi.doi_metadata_block_size, iblk);
+	zdb_nicenum(doi.doi_data_block_size, dblk);
+	zdb_nicenum(doi.doi_max_offset, lsize);
+	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize);
+	zdb_nicenum(doi.doi_bonus_size, bonus_size);
+	(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
 	    doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
 	    doi.doi_max_offset);
 
@@ -1332,11 +1890,13 @@ dump_object(objset_t *os, uint64_t objec
 	}
 
 	if (verbosity >= 4) {
-		(void) printf("\tdnode flags: %s%s\n",
+		(void) printf("\tdnode flags: %s%s%s\n",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
 		    "USED_BYTES " : "",
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
-		    "USERUSED_ACCOUNTED " : "");
+		    "USERUSED_ACCOUNTED " : "",
+		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
+		    "SPILL_BLKPTR" : "");
 		(void) printf("\tdnode maxblkid: %llu\n",
 		    (longlong_t)dn->dn_phys->dn_maxblkid);
 
@@ -1364,7 +1924,7 @@ dump_object(objset_t *os, uint64_t objec
 		}
 
 		for (;;) {
-			char segsize[6];
+			char segsize[32];
 			error = dnode_next_offset(dn,
 			    0, &start, minlvl, blkfill, 0);
 			if (error)
@@ -1372,7 +1932,7 @@ dump_object(objset_t *os, uint64_t objec
 			end = start;
 			error = dnode_next_offset(dn,
 			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
-			nicenum(end - start, segsize, sizeof(segsize));
+			zdb_nicenum(end - start, segsize);
 			(void) printf("\t\tsegment [%016llx, %016llx)"
 			    " size %5s\n", (u_longlong_t)start,
 			    (u_longlong_t)end, segsize);
@@ -1395,39 +1955,38 @@ dump_dir(objset_t *os)
 	dmu_objset_stats_t dds;
 	uint64_t object, object_count;
 	uint64_t refdbytes, usedobjs, scratch;
-	char numbuf[8];
+	char numbuf[32];
 	char blkbuf[BP_SPRINTF_LEN + 20];
-	char osname[MAXNAMELEN];
+	char osname[ZFS_MAX_DATASET_NAME_LEN];
 	char *type = "UNKNOWN";
 	int verbosity = dump_opt['d'];
 	int print_header = 1;
 	int i, error;
-	size_t len;
 
+	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 	dmu_objset_fast_stat(os, &dds);
+	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 
 	if (dds.dds_type < DMU_OST_NUMTYPES)
 		type = objset_types[dds.dds_type];
 
 	if (dds.dds_type == DMU_OST_META) {
 		dds.dds_creation_txg = TXG_INITIAL;
-		usedobjs = os->os_rootbp->blk_fill;
-		refdbytes = os->os_spa->spa_dsl_pool->
-		    dp_mos_dir->dd_phys->dd_used_bytes;
+		usedobjs = BP_GET_FILL(os->os_rootbp);
+		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
+		    dd_used_bytes;
 	} else {
 		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
 	}
 
-	ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill);
+	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
 
-	nicenum(refdbytes, numbuf, sizeof(numbuf));
+	zdb_nicenum(refdbytes, numbuf);
 
 	if (verbosity >= 4) {
-		size_t blklen = sizeof(blkbuf);
-		len = snprintf(blkbuf, blklen, ", rootbp ");
-		if (len > blklen)
-			len = blklen;
-		snprintf_blkptr(blkbuf + len, blklen - len, os->os_rootbp);
+		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
+		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
+		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
 	} else {
 		blkbuf[0] = '\0';
 	}
@@ -1452,19 +2011,18 @@ dump_dir(objset_t *os)
 		dump_intent_log(dmu_objset_zil(os));
 
 	if (dmu_objset_ds(os) != NULL)
-		dump_bplist(dmu_objset_pool(os)->dp_meta_objset,
-		    dmu_objset_ds(os)->ds_phys->ds_deadlist_obj, "Deadlist");
+		dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);
 
 	if (verbosity < 2)
 		return;
 
-	if (os->os_rootbp->blk_birth == 0)
+	if (BP_IS_HOLE(os->os_rootbp))
 		return;
 
 	dump_object(os, 0, verbosity, &print_header);
 	object_count = 0;
-	if (os->os_userused_dnode &&
-	    os->os_userused_dnode->dn_type != 0) {
+	if (DMU_USERUSED_DNODE(os) != NULL &&
+	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
 		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
 		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
 	}
@@ -1499,7 +2057,7 @@ dump_uberblock(uberblock_t *ub, const ch
 	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
 	if (dump_opt['u'] >= 3) {
 		char blkbuf[BP_SPRINTF_LEN];
-		snprintf_blkptr(blkbuf, sizeof(blkbuf), &ub->ub_rootbp);
+		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
 		(void) printf("\trootbp = %s\n", blkbuf);
 	}
 	(void) printf(footer ? footer : "");
@@ -1538,13 +2096,13 @@ dump_cachefile(const char *cachefile)
 	nvlist_t *config;
 
 	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
-		(void) printf("cannot open '%s': %s\n", cachefile,
+		(void) fprintf(stderr, "cannot open '%s': %s\n", cachefile,
 		    strerror(errno));
 		exit(1);
 	}
 
 	if (fstat64(fd, &statbuf) != 0) {
-		(void) printf("failed to stat '%s': %s\n", cachefile,
+		(void) fprintf(stderr, "failed to stat '%s': %s\n", cachefile,
 		    strerror(errno));
 		exit(1);
 	}
@@ -1604,19 +2162,41 @@ dump_label(const char *dev)
 {
 	int fd;
 	vdev_label_t label;
-	char *buf = label.vl_vdev_phys.vp_nvlist;
+	char *path, *buf = label.vl_vdev_phys.vp_nvlist;
 	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
 	struct stat64 statbuf;
 	uint64_t psize, ashift;
+	int len = strlen(dev) + 1;
+
+	if (strncmp(dev, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0) {
+		len++;
+		path = malloc(len);
+		(void) snprintf(path, len, "%s%s", ZFS_RDISK_ROOTD,
+		    dev + strlen(ZFS_DISK_ROOTD));
+	} else {
+		path = strdup(dev);
+	}
 
-	if ((fd = open64(dev, O_RDONLY)) < 0) {
-		(void) printf("cannot open '%s': %s\n", dev, strerror(errno));
+	if ((fd = open64(path, O_RDONLY)) < 0) {
+		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
+		free(path);
 		exit(1);
 	}
 
 	if (fstat64(fd, &statbuf) != 0) {
-		(void) printf("failed to stat '%s': %s\n", dev,
+		(void) printf("failed to stat '%s': %s\n", path,
 		    strerror(errno));
+		free(path);
+		(void) close(fd);
+		exit(1);
+	}
+
+	if (S_ISBLK(statbuf.st_mode)) {
+		(void) printf("cannot use '%s': character device required\n",
+		    path);
+		free(path);
+		(void) close(fd);
+		exit(1);
 	}
 
 	psize = statbuf.st_size;
@@ -1652,8 +2232,13 @@ dump_label(const char *dev)
 		if (dump_opt['u'])
 			dump_label_uberblocks(&label, ashift);
 	}
+
+	free(path);
+	(void) close(fd);
 }
 
+static uint64_t dataset_feature_count[SPA_FEATURES];
+
 /*ARGSUSED*/
 static int
 dump_one_dir(const char *dsname, void *arg)
@@ -1666,20 +2251,34 @@ dump_one_dir(const char *dsname, void *a
 		(void) printf("Could not open %s, error %d\n", dsname, error);
 		return (0);
 	}
+
+	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+		if (!dmu_objset_ds(os)->ds_feature_inuse[f])
+			continue;
+		ASSERT(spa_feature_table[f].fi_flags &
+		    ZFEATURE_FLAG_PER_DATASET);
+		dataset_feature_count[f]++;
+	}
+
 	dump_dir(os);
 	dmu_objset_disown(os, FTAG);
 	fuid_table_destroy();
+	sa_loaded = B_FALSE;
 	return (0);
 }
 
 /*
  * Block statistics.
  */
+#define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
 typedef struct zdb_blkstats {
-	uint64_t	zb_asize;
-	uint64_t	zb_lsize;
-	uint64_t	zb_psize;
-	uint64_t	zb_count;
+	uint64_t zb_asize;
+	uint64_t zb_lsize;
+	uint64_t zb_psize;
+	uint64_t zb_count;
+	uint64_t zb_gangs;
+	uint64_t zb_ditto_samevdev;
+	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
 } zdb_blkstats_t;
 
 /*
@@ -1687,11 +2286,13 @@ typedef struct zdb_blkstats {
  */
 #define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
 #define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
-#define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 2)
+#define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
+#define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)
 
 static char *zdb_ot_extname[] = {
 	"deferred free",
 	"dedup ditto",
+	"other",
 	"Total",
 };
 
@@ -1701,13 +2302,20 @@ typedef struct zdb_cb {
 	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
 	uint64_t	zcb_dedup_asize;
 	uint64_t	zcb_dedup_blocks;
+	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
+	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
+	    [BPE_PAYLOAD_SIZE];
+	uint64_t	zcb_start;
+	uint64_t	zcb_lastprint;
+	uint64_t	zcb_totalasize;
 	uint64_t	zcb_errors[256];
 	int		zcb_readfails;
 	int		zcb_haderrors;
+	spa_t		*zcb_spa;
 } zdb_cb_t;
 
 static void
-zdb_count_block(spa_t *spa, zilog_t *zilog, zdb_cb_t *zcb, const blkptr_t *bp,
+zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
     dmu_object_type_t type)
 {
 	uint64_t refcnt = 0;
@@ -1720,12 +2328,50 @@ zdb_count_block(spa_t *spa, zilog_t *zil
 	for (int i = 0; i < 4; i++) {
 		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
 		int t = (i & 1) ? type : ZDB_OT_TOTAL;
+		int equal;
 		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
 
 		zb->zb_asize += BP_GET_ASIZE(bp);
 		zb->zb_lsize += BP_GET_LSIZE(bp);
 		zb->zb_psize += BP_GET_PSIZE(bp);
 		zb->zb_count++;
+
+		/*
+		 * The histogram is only big enough to record blocks up to
+		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
+		 * "other", bucket.
+		 */
+		int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
+		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
+		zb->zb_psize_histogram[idx]++;
+
+		zb->zb_gangs += BP_COUNT_GANG(bp);
+
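+		/*
+		 * Count blocks whose ditto copies landed on the same
+		 * top-level vdev, where they add no redundancy.
+		 */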
+		switch (BP_GET_NDVAS(bp)) {
+		case 2:
+			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1]))
+				zb->zb_ditto_samevdev++;
+			break;
+		case 3:
+			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2]));
+			if (equal != 0)
+				zb->zb_ditto_samevdev++;
+			break;
+		}
+
+	}
+
+	if (BP_IS_EMBEDDED(bp)) {
+		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
+		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
+		    [BPE_GET_PSIZE(bp)]++;
+		return;
 	}
 
 	if (dump_opt['L'])
@@ -1735,7 +2381,7 @@ zdb_count_block(spa_t *spa, zilog_t *zil
 		ddt_t *ddt;
 		ddt_entry_t *dde;
 
-		ddt = ddt_select(spa, bp);
+		ddt = ddt_select(zcb->zcb_spa, bp);
 		ddt_enter(ddt);
 		dde = ddt_lookup(ddt, bp, B_FALSE);
 
@@ -1751,115 +2397,149 @@ zdb_count_block(spa_t *spa, zilog_t *zil
 		ddt_exit(ddt);
 	}
 
-	VERIFY3U(zio_wait(zio_claim(NULL, spa,
-	    refcnt ? 0 : spa_first_txg(spa),
+	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+	    refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
 	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
 }
 
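+/*
+ * Completion callback for the async reads issued by -c: free the data
+ * buffer, drop spa_scrub_inflight, and record any I/O error.
+ */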
+/* ARGSUSED */
+static void
+zdb_blkptr_done(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	blkptr_t *bp = zio->io_bp;
+	int ioerr = zio->io_error;
+	zdb_cb_t *zcb = zio->io_private;
+	zbookmark_phys_t *zb = &zio->io_bookmark;
+
+	zio_data_buf_free(zio->io_data, zio->io_size);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_inflight--;
+	cv_broadcast(&spa->spa_scrub_io_cv);
+
+	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+		char blkbuf[BP_SPRINTF_LEN];
+
+		zcb->zcb_haderrors = 1;
+		zcb->zcb_errors[ioerr]++;
+
+		if (dump_opt['b'] >= 2)
+			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+		else
+			blkbuf[0] = '\0';
+
+		(void) printf("zdb_blkptr_cb: "
+		    "Got error %d reading "
+		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
+		    ioerr,
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)zb->zb_object,
+		    (u_longlong_t)zb->zb_level,
+		    (u_longlong_t)zb->zb_blkid,
+		    blkbuf);
+	}
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+/* ARGSUSED */
 static int
 zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	zdb_cb_t *zcb = arg;
-	char blkbuf[BP_SPRINTF_LEN];
 	dmu_object_type_t type;
 	boolean_t is_metadata;
 
 	if (bp == NULL)
 		return (0);
 
+	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
+		char blkbuf[BP_SPRINTF_LEN];
+		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+		(void) printf("objset %llu object %llu "
+		    "level %lld offset 0x%llx %s\n",
+		    (u_longlong_t)zb->zb_objset,
+		    (u_longlong_t)zb->zb_object,
+		    (longlong_t)zb->zb_level,
+		    (u_longlong_t)blkid2offset(dnp, bp, zb),
+		    blkbuf);
+	}
+
+	if (BP_IS_HOLE(bp))
+		return (0);
+
 	type = BP_GET_TYPE(bp);
 
-	zdb_count_block(spa, zilog, zcb, bp, type);
+	zdb_count_block(zcb, zilog, bp,
+	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
 
-	is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata);
+	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
 
-	if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) {
-		int ioerr;
+	if (!BP_IS_EMBEDDED(bp) &&
+	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
 		size_t size = BP_GET_PSIZE(bp);
-		void *data = malloc(size);
+		void *data = zio_data_buf_alloc(size);
 		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
 
 		/* If it's an intent log block, failure is expected. */
 		if (zb->zb_level == ZB_ZIL_LEVEL)
 			flags |= ZIO_FLAG_SPECULATIVE;
 
-		ioerr = zio_wait(zio_read(NULL, spa, bp, data, size,
-		    NULL, NULL, ZIO_PRIORITY_ASYNC_READ, flags, zb));
-
-		free(data);
-
-		if (ioerr && !(flags & ZIO_FLAG_SPECULATIVE)) {
-			zcb->zcb_haderrors = 1;
-			zcb->zcb_errors[ioerr]++;
-
-			if (dump_opt['b'] >= 2)
-				snprintf_blkptr(blkbuf, sizeof(blkbuf), bp);
-			else
-				blkbuf[0] = '\0';
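+		/*
+		 * Throttle the traversal: wait until the number of
+		 * outstanding reads drops back below max_inflight (-I).
+		 */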
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight > max_inflight)
+			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+		spa->spa_scrub_inflight++;
+		mutex_exit(&spa->spa_scrub_lock);
 
-			(void) printf("zdb_blkptr_cb: "
-			    "Got error %d reading "
-			    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
-			    ioerr,
-			    (u_longlong_t)zb->zb_objset,
-			    (u_longlong_t)zb->zb_object,
-			    (u_longlong_t)zb->zb_level,
-			    (u_longlong_t)zb->zb_blkid,
-			    blkbuf);
-		}
+		zio_nowait(zio_read(NULL, spa, bp, data, size,
+		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
 	}
 
 	zcb->zcb_readfails = 0;
 
-	if (dump_opt['b'] >= 4) {
-		snprintf_blkptr(blkbuf, sizeof(blkbuf), bp);
-		(void) printf("objset %llu object %llu "
-		    "level %lld offset 0x%llx %s\n",
-		    (u_longlong_t)zb->zb_objset,
-		    (u_longlong_t)zb->zb_object,
-		    (longlong_t)zb->zb_level,
-		    (u_longlong_t)blkid2offset(dnp, bp, zb),
-		    blkbuf);
+	/* only call gethrtime() every 100 blocks */
+	static int iters;
+	if (++iters > 100)
+		iters = 0;
+	else
+		return (0);
+
+	if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
+		uint64_t now = gethrtime();
+		char buf[10];
+		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
+		int kb_per_sec =
+		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
+		int sec_remaining =
+		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
+
+		zfs_nicenum(bytes, buf, sizeof (buf));
+		(void) fprintf(stderr,
+		    "\r%5s completed (%4dMB/s) "
+		    "estimated time remaining: %uhr %02umin %02usec        ",
+		    buf, kb_per_sec / 1024,
+		    sec_remaining / 60 / 60,
+		    sec_remaining / 60 % 60,
+		    sec_remaining % 60);
+
+		zcb->zcb_lastprint = now;
 	}
 
 	return (0);
 }
 
 static void
-zdb_leak(space_map_t *sm, uint64_t start, uint64_t size)
+zdb_leak(void *arg, uint64_t start, uint64_t size)
 {
-	vdev_t *vd = sm->sm_ppd;
+	vdev_t *vd = arg;
 
 	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
 }
 
-/* ARGSUSED */
-static void
-zdb_space_map_load(space_map_t *sm)
-{
-}
-
-static void
-zdb_space_map_unload(space_map_t *sm)
-{
-	space_map_vacate(sm, zdb_leak, sm);
-}
-
-/* ARGSUSED */
-static void
-zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
-}
-
-static space_map_ops_t zdb_space_map_ops = {
-	zdb_space_map_load,
-	zdb_space_map_unload,
-	NULL,	/* alloc */
-	zdb_space_map_claim,
-	NULL,	/* free */
-	NULL	/* maxsize */
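+/*
+ * An op vector with a NULL alloc hook, installed on the normal and log
+ * metaslab classes below so that nothing can allocate from a metaslab
+ * while zdb has repurposed its ms_tree for leak tracking.
+ */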
+static metaslab_ops_t zdb_metaslab_ops = {
+	NULL	/* alloc */
 };
 
 static void
@@ -1884,8 +2564,7 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *
 			ddt_bp_create(ddb.ddb_checksum,
 			    &dde.dde_key, ddp, &blk);
 			if (p == DDT_PHYS_DITTO) {
-				zdb_count_block(spa, NULL, zcb, &blk,
-				    ZDB_OT_DITTO);
+				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
 			} else {
 				zcb->zcb_dedup_asize +=
 				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
@@ -1906,21 +2585,63 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *
 static void
 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
+	zcb->zcb_spa = spa;
+
 	if (!dump_opt['L']) {
 		vdev_t *rvd = spa->spa_root_vdev;
-		for (int c = 0; c < rvd->vdev_children; c++) {
+
+		/*
+		 * We are going to be changing the meaning of the metaslab's
+		 * ms_tree.  Ensure that the allocator doesn't try to
+		 * use the tree.
+		 */
+		spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
+		spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
+
+		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *vd = rvd->vdev_child[c];
-			for (int m = 0; m < vd->vdev_ms_count; m++) {
+			metaslab_group_t *mg = vd->vdev_mg;
+			for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
 				metaslab_t *msp = vd->vdev_ms[m];
+				ASSERT3P(msp->ms_group, ==, mg);
 				mutex_enter(&msp->ms_lock);
-				space_map_unload(&msp->ms_map);
-				VERIFY(space_map_load(&msp->ms_map,
-				    &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo,
-				    spa->spa_meta_objset) == 0);
-				msp->ms_map.sm_ppd = vd;
+				metaslab_unload(msp);
+
+				/*
+				 * For leak detection, we overload the metaslab
+				 * ms_tree to contain allocated segments
+				 * instead of free segments. As a result,
+				 * we can't use the normal metaslab_load/unload
+				 * interfaces.
+				 */
+				if (msp->ms_sm != NULL) {
+					(void) fprintf(stderr,
+					    "\rloading space map for "
+					    "vdev %llu of %llu, "
+					    "metaslab %llu of %llu ...",
+					    (u_longlong_t)c,
+					    (u_longlong_t)rvd->vdev_children,
+					    (u_longlong_t)m,
+					    (u_longlong_t)vd->vdev_ms_count);
+
+					/*
+					 * We don't want to spend the CPU
+					 * manipulating the size-ordered
+					 * tree, so clear the range_tree
+					 * ops.
+					 */
+					msp->ms_tree->rt_ops = NULL;
+					VERIFY0(space_map_load(msp->ms_sm,
+					    msp->ms_tree, SM_ALLOC));
+
+					if (!msp->ms_loaded) {
+						msp->ms_loaded = B_TRUE;
+					}
+				}
 				mutex_exit(&msp->ms_lock);
 			}
 		}
+		(void) fprintf(stderr, "\n");
 	}
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
@@ -1937,16 +2658,50 @@ zdb_leak_fini(spa_t *spa)
 		vdev_t *rvd = spa->spa_root_vdev;
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *vd = rvd->vdev_child[c];
+			metaslab_group_t *mg = vd->vdev_mg;
 			for (int m = 0; m < vd->vdev_ms_count; m++) {
 				metaslab_t *msp = vd->vdev_ms[m];
+				ASSERT3P(mg, ==, msp->ms_group);
 				mutex_enter(&msp->ms_lock);
-				space_map_unload(&msp->ms_map);
+
+				/*
+				 * The ms_tree has been overloaded to
+				 * contain allocated segments. Now that we
+				 * finished traversing all blocks, any
+				 * block that remains in the ms_tree
+				 * represents an allocated block that we
+				 * did not claim during the traversal.
+				 * Claimed blocks would have been removed
+				 * from the ms_tree.
+				 */
+				range_tree_vacate(msp->ms_tree, zdb_leak, vd);
+
+				if (msp->ms_loaded) {
+					msp->ms_loaded = B_FALSE;
+				}
+
 				mutex_exit(&msp->ms_lock);
 			}
 		}
 	}
 }
 
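+/*
+ * bpobj/bptree iteration callback: account each deferred-free block
+ * under the ZDB_OT_DEFERRED bucket.
+ */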
+/* ARGSUSED */
+static int
+count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	zdb_cb_t *zcb = arg;
+
+	if (dump_opt['b'] >= 5) {
+		char blkbuf[BP_SPRINTF_LEN];
+		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
+		(void) printf("[%s] %s\n",
+		    "deferred free", blkbuf);
+	}
+	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
+	return (0);
+}
+
 static int
 dump_block_stats(spa_t *spa)
 {
@@ -1954,9 +2709,9 @@ dump_block_stats(spa_t *spa)
 	zdb_blkstats_t *zb, *tzb;
 	uint64_t norm_alloc, norm_space, total_alloc, total_found;
 	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
-	int leaks = 0;
+	boolean_t leaks = B_FALSE;
 
-	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
+	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
 	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
 	    (dump_opt['c'] == 1) ? "metadata " : "",
 	    dump_opt['c'] ? "checksums " : "",
@@ -1976,32 +2731,39 @@ dump_block_stats(spa_t *spa)
 	/*
-	 * If there's a deferred-free bplist, process that first.
+	 * If there's a deferred-free bpobj, process that first.
 	 */
-	if (spa->spa_deferred_bplist_obj != 0) {
-		bplist_t *bpl = &spa->spa_deferred_bplist;
-		blkptr_t blk;
-		uint64_t itor = 0;
-
-		VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset,
-		    spa->spa_deferred_bplist_obj));
-
-		while (bplist_iterate(bpl, &itor, &blk) == 0) {
-			if (dump_opt['b'] >= 4) {
-				char blkbuf[BP_SPRINTF_LEN];
-				snprintf_blkptr(blkbuf, sizeof(blkbuf), &blk);
-				(void) printf("[%s] %s\n",
-				    "deferred free", blkbuf);
-			}
-			zdb_count_block(spa, NULL, &zcb, &blk, ZDB_OT_DEFERRED);
-		}
-
-		bplist_close(bpl);
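+	/*
+	 * Deferred frees may now live in three places: the spa-level
+	 * deferred bpobj, the DSL pool's free bpobj (pools at
+	 * SPA_VERSION_DEADLISTS and later), and the bptree used by
+	 * async destroy.  Count the blocks recorded in each.
+	 */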
+	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
+	    count_block_cb, &zcb, NULL);
+	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+		    count_block_cb, &zcb, NULL);
+	}
+	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
+		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
+		    &zcb, NULL));
 	}
 
 	if (dump_opt['c'] > 1)
 		flags |= TRAVERSE_PREFETCH_DATA;
 
+	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
+	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
 	zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
 
+	/*
+	 * If we've traversed the data blocks then we need to wait for the
+	 * corresponding I/Os to complete.  We leverage "The Godfather" zio
+	 * to wait on all outstanding async I/Os.
+	 */
+	if (dump_opt['c']) {
+		for (int i = 0; i < max_ncpus; i++) {
+			(void) zio_wait(spa->spa_async_zio_root[i]);
+			spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+			    ZIO_FLAG_GODFATHER);
+		}
+	}
+
 	if (zcb.zcb_haderrors) {
 		(void) printf("\nError counts:\n\n");
 		(void) printf("\t%5s  %s\n", "errno", "count");
@@ -2037,7 +2799,7 @@ dump_block_stats(spa_t *spa)
 		    (u_longlong_t)total_alloc,
 		    (dump_opt['L']) ? "unreachable" : "leaked",
 		    (longlong_t)(total_alloc - total_found));
-		leaks = 1;
+		leaks = B_TRUE;
 	}
 
 	if (tzb->zb_count == 0)
@@ -2046,6 +2808,8 @@ dump_block_stats(spa_t *spa)
 	(void) printf("\n");
 	(void) printf("\tbp count:      %10llu\n",
 	    (u_longlong_t)tzb->zb_count);
+	(void) printf("\tganged count:  %10llu\n",
+	    (u_longlong_t)tzb->zb_gangs);
 	(void) printf("\tbp logical:    %10llu      avg: %6llu\n",
 	    (u_longlong_t)tzb->zb_lsize,
 	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
@@ -2067,13 +2831,36 @@ dump_block_stats(spa_t *spa)
 	(void) printf("\tSPA allocated: %10llu     used: %5.2f%%\n",
 	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
 
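+	/*
+	 * Embedded block pointers carry their payload inside the bp
+	 * itself, so they are tallied separately from the on-disk
+	 * block counts above.
+	 */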
+	for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
+		if (zcb.zcb_embedded_blocks[i] == 0)
+			continue;
+		(void) printf("\n");
+		(void) printf("\tadditional, non-pointer bps of type %u: "
+		    "%10llu\n",
+		    i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
+
+		if (dump_opt['b'] >= 3) {
+			(void) printf("\t number of (compressed) bytes:  "
+			    "number of bps\n");
+			dump_histogram(zcb.zcb_embedded_histogram[i],
+			    sizeof (zcb.zcb_embedded_histogram[i]) /
+			    sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
+		}
+	}
+
+	if (tzb->zb_ditto_samevdev != 0) {
+		(void) printf("\tDittoed blocks on same vdev: %llu\n",
+		    (u_longlong_t)tzb->zb_ditto_samevdev);
+	}
+
 	if (dump_opt['b'] >= 2) {
 		int l, t, level;
 		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
 		    "\t  avg\t comp\t%%Total\tType\n");
 
 		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
-			char csize[6], lsize[6], psize[6], asize[6], avg[6];
+			char csize[32], lsize[32], psize[32], asize[32];
+			char avg[32], gang[32];
 			char *typename;
 
 			if (t < DMU_OT_NUMTYPES)
@@ -2109,11 +2896,12 @@ dump_block_stats(spa_t *spa)
 				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
 					continue;
 
-				nicenum(zb->zb_count, csize, sizeof(csize));
-				nicenum(zb->zb_lsize, lsize, sizeof(lsize));
-				nicenum(zb->zb_psize, psize, sizeof(psize));
-				nicenum(zb->zb_asize, asize, sizeof(asize));
-				nicenum(zb->zb_asize / zb->zb_count, avg, sizeof(avg));
+				zdb_nicenum(zb->zb_count, csize);
+				zdb_nicenum(zb->zb_lsize, lsize);
+				zdb_nicenum(zb->zb_psize, psize);
+				zdb_nicenum(zb->zb_asize, asize);
+				zdb_nicenum(zb->zb_asize / zb->zb_count, avg);
+				zdb_nicenum(zb->zb_gangs, gang);
 
 				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
 				    "\t%5.2f\t%6.2f\t",
@@ -2126,6 +2914,19 @@ dump_block_stats(spa_t *spa)
 				else
 					(void) printf("    L%d %s\n",
 					    level, typename);
+
+				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
+					(void) printf("\t number of ganged "
+					    "blocks: %s\n", gang);
+				}
+
+				if (dump_opt['b'] >= 4) {
+					(void) printf("psize "
+					    "(in 512-byte sectors): "
+					    "number of blocks\n");
+					dump_histogram(zb->zb_psize_histogram,
+					    PSIZE_HISTO_SIZE, 0);
+				}
 			}
 		}
 	}
@@ -2153,25 +2954,25 @@ typedef struct zdb_ddt_entry {
 /* ARGSUSED */
 static int
 zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	avl_tree_t *t = arg;
 	avl_index_t where;
 	zdb_ddt_entry_t *zdde, zdde_search;
 
-	if (bp == NULL)
+	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 
 	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
 		(void) printf("traversing objset %llu, %llu objects, "
 		    "%lu blocks so far\n",
 		    (u_longlong_t)zb->zb_objset,
-		    (u_longlong_t)bp->blk_fill,
+		    (u_longlong_t)BP_GET_FILL(bp),
 		    avl_numnodes(t));
 	}
 
 	if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
-	    BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata)
+	    BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
 		return (0);
 
 	ddt_key_fill(&zdde_search.zdde_key, bp);
@@ -2226,7 +3027,8 @@ dump_simulated_ddt(spa_t *spa)
 		dds.dds_ref_psize = zdde->zdde_ref_psize;
 		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
 
-		ddt_stat_add(&ddh_total.ddh_stat[highbit(refcnt) - 1], &dds, 0);
+		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
+		    &dds, 0);
 
 		umem_free(zdde, sizeof (*zdde));
 	}
@@ -2269,28 +3071,72 @@ dump_zpool(spa_t *spa)
 
 	if (dump_opt['d'] > 2 || dump_opt['m'])
 		dump_metaslabs(spa);
+	if (dump_opt['M'])
+		dump_metaslab_groups(spa);
 
 	if (dump_opt['d'] || dump_opt['i']) {
 		dump_dir(dp->dp_meta_objset);
 		if (dump_opt['d'] >= 3) {
-			dump_bplist(dp->dp_meta_objset,
-			    spa->spa_deferred_bplist_obj, "Deferred frees");
+			dump_full_bpobj(&spa->spa_deferred_bpobj,
+			    "Deferred frees", 0);
+			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+				dump_full_bpobj(
+				    &spa->spa_dsl_pool->dp_free_bpobj,
+				    "Pool snapshot frees", 0);
+			}
+
+			if (spa_feature_is_active(spa,
+			    SPA_FEATURE_ASYNC_DESTROY)) {
+				dump_bptree(spa->spa_meta_objset,
+				    spa->spa_dsl_pool->dp_bptree_obj,
+				    "Pool dataset frees");
+			}
 			dump_dtl(spa->spa_root_vdev, 0);
 		}
 		(void) dmu_objset_find(spa_name(spa), dump_one_dir,
 		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
+
+		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+			uint64_t refcount;
+
+			if (!(spa_feature_table[f].fi_flags &
+			    ZFEATURE_FLAG_PER_DATASET)) {
+				ASSERT0(dataset_feature_count[f]);
+				continue;
+			}
+			(void) feature_get_refcount(spa,
+			    &spa_feature_table[f], &refcount);
+			if (dataset_feature_count[f] != refcount) {
+				(void) printf("%s feature refcount mismatch: "
+				    "%lld datasets != %lld refcount\n",
+				    spa_feature_table[f].fi_uname,
+				    (longlong_t)dataset_feature_count[f],
+				    (longlong_t)refcount);
+				rc = 2;
+			} else {
+				(void) printf("Verified %s feature refcount "
+				    "of %llu is correct\n",
+				    spa_feature_table[f].fi_uname,
+				    (u_longlong_t)refcount);
+			}
+		}
 	}
-	if (dump_opt['b'] || dump_opt['c'])
+	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
 		rc = dump_block_stats(spa);
 
+	if (rc == 0)
+		rc = verify_spacemap_refcounts(spa);
+
 	if (dump_opt['s'])
 		show_pool_stats(spa);
 
 	if (dump_opt['h'])
 		dump_history(spa);
 
-	if (rc != 0)
+	if (rc != 0) {
+		dump_debug_buffer();
 		exit(rc);
+	}
 }
 
 #define	ZDB_FLAG_CHECKSUM	0x0001
@@ -2312,7 +3158,7 @@ zdb_print_blkptr(blkptr_t *bp, int flags
 	if (flags & ZDB_FLAG_BSWAP)
 		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
 
-	snprintf_blkptr(blkbuf, sizeof(blkbuf), bp);
+	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 	(void) printf("%s\n", blkbuf);
 }
 
@@ -2508,6 +3354,7 @@ zdb_read_block(char *thing, spa_t *spa)
 				free(dup);
 				return;
 			}
+			i += p - &flagstr[i + 1]; /* skip over the number */
 		}
 	}
 
@@ -2666,7 +3513,7 @@ find_zpool(char **target, nvlist_t **con
 	nvlist_t *match = NULL;
 	char *name = NULL;
 	char *sepp = NULL;
-	char sep;
+	char sep = '\0';
 	int count = 0;
 	importargs_t args = { 0 };
 
@@ -2742,13 +3589,25 @@ main(int argc, char **argv)
 	nvlist_t *policy = NULL;
 	uint64_t max_txg = UINT64_MAX;
 	int rewind = ZPOOL_NEVER_REWIND;
+	char *spa_config_path_env;
+	boolean_t target_is_spa = B_TRUE;
 
 	(void) setrlimit(RLIMIT_NOFILE, &rl);
 	(void) enable_extended_FILE_stdio(-1, -1);
 
 	dprintf_setup(&argc, argv);
 
-	while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:")) != -1) {
+	/*
+	 * If the SPA_CONFIG_PATH environment variable is set, it overrides
+	 * the default spa_config_path setting.  If the -U flag is also
+	 * given, it takes precedence over the environment variable.
+	 */
+	spa_config_path_env = getenv("SPA_CONFIG_PATH");
+	if (spa_config_path_env != NULL)
+		spa_config_path = spa_config_path_env;
+
+	while ((c = getopt(argc, argv,
+	    "bcdhilmMI:suCDRSAFLXx:evp:t:U:PG")) != -1) {
 		switch (c) {
 		case 'b':
 		case 'c':
@@ -2761,8 +3620,10 @@ main(int argc, char **argv)
 		case 'u':
 		case 'C':
 		case 'D':
+		case 'M':
 		case 'R':
 		case 'S':
+		case 'G':
 			dump_opt[c]++;
 			dump_all = 0;
 			break;
@@ -2771,10 +3632,17 @@ main(int argc, char **argv)
 		case 'L':
 		case 'X':
 		case 'e':
+		case 'P':
 			dump_opt[c]++;
 			break;
-		case 'v':
-			verbose++;
+		case 'I':
+			max_inflight = strtoull(optarg, NULL, 0);
+			if (max_inflight == 0) {
+				(void) fprintf(stderr, "maximum number "
+				    "of inflight I/Os must be greater "
+				    "than 0\n");
+				usage();
+			}
 			break;
 		case 'p':
 			if (searchdirs == NULL) {
@@ -2802,6 +3670,12 @@ main(int argc, char **argv)
 		case 'U':
 			spa_config_path = optarg;
 			break;
+		case 'v':
+			verbose++;
+			break;
+		case 'x':
+			vn_dumpdir = optarg;
+			break;
 		default:
 			usage();
 			break;
@@ -2813,15 +3687,29 @@ main(int argc, char **argv)
 		usage();
 	}
 
+	/*
+	 * ZDB does not typically re-read blocks; therefore limit the ARC
+	 * to 256 MB, which can be used entirely for metadata.
+	 */
+	zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024;
+
+	/*
+	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
+	 * "zdb -b" uses traversal prefetch which uses async reads.
+	 * For good performance, let several of them be active at once.
+	 */
+	zfs_vdev_async_read_max_active = 10;
+
 	kernel_init(FREAD);
 	g_zfs = libzfs_init();
-	ASSERT(g_zfs != NULL);
+	if (g_zfs == NULL)
+		fatal("Failed to initialize zfs");
 
 	if (dump_all)
 		verbose = MAX(verbose, 1);
 
 	for (c = 0; c < 256; c++) {
-		if (dump_all && !strchr("elAFLRSX", c))
+		if (dump_all && !strchr("elAFLRSXP", c))
 			dump_opt[c] = 1;
 		if (dump_opt[c])
 			dump_opt[c] += verbose;
@@ -2875,13 +3763,31 @@ main(int argc, char **argv)
 				fatal("can't open '%s': %s",
 				    target, strerror(ENOMEM));
 			}
-			if ((error = spa_import(name, cfg, NULL)) != 0)
-				error = spa_import_verbatim(name, cfg, NULL);
+			if ((error = spa_import(name, cfg, NULL,
+			    ZFS_IMPORT_MISSING_LOG)) != 0) {
+				error = spa_import(name, cfg, NULL,
+				    ZFS_IMPORT_VERBATIM);
+			}
 		}
 	}
 
+	if (strpbrk(target, "/@") != NULL) {
+		size_t targetlen;
+
+		target_is_spa = B_FALSE;
+		/*
+		 * Remove any trailing slash.  Later code would get confused
+		 * by it, but we want to allow it so that "pool/" can
+		 * indicate that we want to dump the topmost filesystem,
+		 * rather than the whole pool.
+		 */
+		targetlen = strlen(target);
+		if (targetlen != 0 && target[targetlen - 1] == '/')
+			target[targetlen - 1] = '\0';
+	}
+
 	if (error == 0) {
-		if (strpbrk(target, "/@") == NULL || dump_opt['R']) {
+		if (target_is_spa || dump_opt['R']) {
 			error = spa_open_rewind(target, &spa, FTAG, policy,
 			    NULL);
 			if (error) {
@@ -2927,7 +3833,13 @@ main(int argc, char **argv)
 					    argv[i], strerror(errno));
 			}
 		}
-		(os != NULL) ? dump_dir(os) : dump_zpool(spa);
+		if (os != NULL) {
+			dump_dir(os);
+		} else if (zopt_objects > 0 && !dump_opt['m']) {
+			dump_dir(spa->spa_meta_objset);
+		} else {
+			dump_zpool(spa);
+		}
 	} else {
 		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
 		flagbits['c'] = ZDB_FLAG_CHECKSUM;
@@ -2945,6 +3857,9 @@ main(int argc, char **argv)
 	(os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
 
 	fuid_table_destroy();
+	sa_loaded = B_FALSE;
+
+	dump_debug_buffer();
 
 	libzfs_fini(g_zfs);
 	kernel_fini();
Index: src/external/cddl/osnet/dist/cmd/zdb/zdb_il.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zdb/zdb_il.c,v
retrieving revision 1.2
diff -u -p -r1.2 zdb_il.c
--- src/external/cddl/osnet/dist/cmd/zdb/zdb_il.c	28 Mar 2014 03:46:56 -0000	1.2
+++ src/external/cddl/osnet/dist/cmd/zdb/zdb_il.c	17 Jul 2014 16:19:55 -0000
@@ -24,6 +24,10 @@
  */
 
 /*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+/*
  * Print intent log header and statistics.
  */
 
@@ -47,7 +51,7 @@ print_log_bp(const blkptr_t *bp, const c
 {
 	char blkbuf[BP_SPRINTF_LEN];
 
-	snprintf_blkptr(blkbuf, sizeof(blkbuf), bp);
+	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
 	(void) printf("%s%s\n", prefix, blkbuf);
 }
 
@@ -118,7 +122,7 @@ zil_prt_rec_write(zilog_t *zilog, int tx
 {
 	char *data, *dlimit;
 	blkptr_t *bp = &lr->lr_blkptr;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 	char buf[SPA_MAXBLOCKSIZE];
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	int error;
@@ -132,6 +136,7 @@ zil_prt_rec_write(zilog_t *zilog, int tx
 
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 		(void) printf("%shas blkptr, %s\n", prefix,
+		    !BP_IS_HOLE(bp) &&
 		    bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
 		    "will claim" : "won't claim");
 		print_log_bp(bp, prefix);
@@ -139,8 +144,6 @@ zil_prt_rec_write(zilog_t *zilog, int tx
 		if (BP_IS_HOLE(bp)) {
 			(void) printf("\t\t\tLSIZE 0x%llx\n",
 			    (u_longlong_t)BP_GET_LSIZE(bp));
-		}
-		if (bp->blk_birth == 0) {
 			bzero(buf, sizeof (buf));
 			(void) printf("%s<hole>\n", prefix);
 			return;
@@ -314,7 +317,7 @@ print_log_block(zilog_t *zilog, blkptr_t
 	if (verbose >= 5) {
 		(void) strcpy(blkbuf, ", ");
 		snprintf_blkptr(blkbuf + strlen(blkbuf),
-		    sizeof(blkbuf) - strlen(blkbuf), bp);
+		    sizeof (blkbuf) - strlen(blkbuf), bp);
 	} else {
 		blkbuf[0] = '\0';
 	}
@@ -362,7 +365,7 @@ dump_intent_log(zilog_t *zilog)
 	int verbose = MAX(dump_opt['d'], dump_opt['i']);
 	int i;
 
-	if (zh->zh_log.blk_birth == 0 || verbose < 1)
+	if (BP_IS_HOLE(&zh->zh_log) || verbose < 1)
 		return;
 
 	(void) printf("\n    ZIL header: claim_txg %llu, "
Index: src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_iter.c
--- src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c	27 Feb 2010 22:29:21 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c	10 Oct 2016 11:14:24 -0000
@@ -18,9 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #include <libintl.h>
@@ -69,7 +72,7 @@ uu_avl_pool_t *avl_pool;
  * Include snaps if they were requested or if this a zfs list where types
  * were not specified and the "listsnapshots" property is set on this pool.
  */
-static int
+static boolean_t
 zfs_include_snapshots(zfs_handle_t *zhp, callback_data_t *cb)
 {
 	zpool_handle_t *zph;
@@ -89,8 +92,9 @@ static int
 zfs_callback(zfs_handle_t *zhp, void *data)
 {
 	callback_data_t *cb = data;
-	int dontclose = 0;
-	int include_snaps = zfs_include_snapshots(zhp, cb);
+	boolean_t should_close = B_TRUE;
+	boolean_t include_snaps = zfs_include_snapshots(zhp, cb);
+	boolean_t include_bmarks = (cb->cb_types & ZFS_TYPE_BOOKMARK);
 
 	if ((zfs_get_type(zhp) & cb->cb_types) ||
 	    ((zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) && include_snaps)) {
@@ -108,14 +112,15 @@ zfs_callback(zfs_handle_t *zhp, void *da
 					    cb->cb_props_table);
 
 				if (zfs_expand_proplist(zhp, cb->cb_proplist,
-				    (cb->cb_flags & ZFS_ITER_RECVD_PROPS))
+				    (cb->cb_flags & ZFS_ITER_RECVD_PROPS),
+				    (cb->cb_flags & ZFS_ITER_LITERAL_PROPS))
 				    != 0) {
 					free(node);
 					return (-1);
 				}
 			}
 			uu_avl_insert(cb->cb_avl, node, idx);
-			dontclose = 1;
+			should_close = B_FALSE;
 		} else {
 			free(node);
 		}
@@ -130,12 +135,18 @@ zfs_callback(zfs_handle_t *zhp, void *da
 		cb->cb_depth++;
 		if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM)
 			(void) zfs_iter_filesystems(zhp, zfs_callback, data);
-		if ((zfs_get_type(zhp) != ZFS_TYPE_SNAPSHOT) && include_snaps)
-			(void) zfs_iter_snapshots(zhp, zfs_callback, data);
+		if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT |
+		    ZFS_TYPE_BOOKMARK)) == 0) && include_snaps)
+			(void) zfs_iter_snapshots(zhp,
+			    (cb->cb_flags & ZFS_ITER_SIMPLE) != 0, zfs_callback,
+			    data);
+		if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT |
+		    ZFS_TYPE_BOOKMARK)) == 0) && include_bmarks)
+			(void) zfs_iter_bookmarks(zhp, zfs_callback, data);
 		cb->cb_depth--;
 	}
 
-	if (!dontclose)
+	if (should_close)
 		zfs_close(zhp);
 
 	return (0);
@@ -185,6 +196,14 @@ zfs_free_sort_columns(zfs_sort_column_t 
 	}
 }
 
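+/*
+ * Return B_TRUE when the requested sort reduces to a single sort by
+ * dataset name, which callers can use to choose a simpler iteration path.
+ */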
+boolean_t
+zfs_sort_only_by_name(const zfs_sort_column_t *sc)
+{
+
+	return (sc != NULL && sc->sc_next == NULL &&
+	    sc->sc_prop == ZFS_PROP_NAME);
+}
+
 /* ARGSUSED */
 static int
 zfs_compare(const void *larg, const void *rarg, void *unused)
@@ -225,7 +244,13 @@ zfs_compare(const void *larg, const void
 			lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG);
 			rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG);
 
-			if (lcreate < rcreate)
+			/*
+			 * If both lcreate and rcreate are 0 we don't have the
+			 * properties, so fall back to comparing the full names.
+			 */
+			if (lcreate == 0 && rcreate == 0)
+				ret = strcmp(lat + 1, rat + 1);
+			else if (lcreate < rcreate)
 				ret = -1;
 			else if (lcreate > rcreate)
 				ret = 1;
@@ -291,7 +316,14 @@ zfs_sort(const void *larg, const void *r
 			if (rvalid)
 				verify(nvlist_lookup_string(rval,
 				    ZPROP_VALUE, &rstr) == 0);
+		} else if (psc->sc_prop == ZFS_PROP_NAME) {
+			lvalid = rvalid = B_TRUE;
+
+			(void) strlcpy(lbuf, zfs_get_name(l), sizeof (lbuf));
+			(void) strlcpy(rbuf, zfs_get_name(r), sizeof (rbuf));
 
+			lstr = lbuf;
+			rstr = rbuf;
 		} else if (zfs_prop_is_string(psc->sc_prop)) {
 			lvalid = (zfs_prop_get(l, psc->sc_prop, lbuf,
 			    sizeof (lbuf), NULL, NULL, 0, B_TRUE) == 0);
@@ -351,11 +383,8 @@ zfs_for_each(int argc, char **argv, int 
 	avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t),
 	    offsetof(zfs_node_t, zn_avlnode), zfs_sort, UU_DEFAULT);
 
-	if (avl_pool == NULL) {
-		(void) fprintf(stderr,
-		    gettext("internal error: out of memory\n"));
-		exit(1);
-	}
+	if (avl_pool == NULL)
+		nomem();
 
 	cb.cb_sortcol = sortcol;
 	cb.cb_flags = flags;
@@ -400,11 +429,8 @@ zfs_for_each(int argc, char **argv, int 
 		    sizeof (cb.cb_props_table));
 	}
 
-	if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) {
-		(void) fprintf(stderr,
-		    gettext("internal error: out of memory\n"));
-		exit(1);
-	}
+	if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
+		nomem();
 
 	if (argc == 0) {
 		/*
@@ -454,11 +480,8 @@ zfs_for_each(int argc, char **argv, int 
 	/*
 	 * Finally, clean up the AVL tree.
 	 */
-	if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) {
-		(void) fprintf(stderr,
-		    gettext("internal error: out of memory"));
-		exit(1);
-	}
+	if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
+		nomem();
 
 	while ((node = uu_avl_walk_next(walk)) != NULL) {
 		uu_avl_remove(cb.cb_avl, node);
Index: src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_iter.h
--- src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h	27 Feb 2010 22:29:20 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h	10 Oct 2016 11:14:24 -0000
@@ -18,9 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #ifndef	ZFS_ITER_H
@@ -43,11 +46,14 @@ typedef struct zfs_sort_column {
 #define	ZFS_ITER_PROP_LISTSNAPS    (1 << 2)
 #define	ZFS_ITER_DEPTH_LIMIT	   (1 << 3)
 #define	ZFS_ITER_RECVD_PROPS	   (1 << 4)
+#define	ZFS_ITER_SIMPLE		   (1 << 5)
+#define	ZFS_ITER_LITERAL_PROPS	   (1 << 6)
 
 int zfs_for_each(int, char **, int options, zfs_type_t,
     zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *);
 int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t);
 void zfs_free_sort_columns(zfs_sort_column_t *);
+boolean_t zfs_sort_only_by_name(const zfs_sort_column_t *);
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dist/cmd/zfs/zfs_main.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zfs/zfs_main.c,v
retrieving revision 1.5
diff -u -p -r1.5 zfs_main.c
--- src/external/cddl/osnet/dist/cmd/zfs/zfs_main.c	10 Apr 2015 22:28:27 -0000	1.5
+++ src/external/cddl/osnet/dist/cmd/zfs/zfs_main.c	22 Apr 2017 16:42:58 -0000
@@ -20,8 +20,16 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2012 Milan Jurik. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland.  All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ * Copyright 2016 Nexenta Systems, Inc.
  */
 
 #include <assert.h>
@@ -41,24 +49,43 @@
 #include <zone.h>
 #include <grp.h>
 #include <pwd.h>
-#include <sys/mkdev.h>
+#include <signal.h>
+#include <sys/list.h>
 #include <sys/mntent.h>
 #include <sys/mnttab.h>
 #include <sys/mount.h>
 #include <sys/stat.h>
 #include <sys/fs/zfs.h>
+#include <sys/types.h>
+#include <time.h>
+#include <err.h>
+#ifdef __FreeBSD__
+#include <jail.h>
+#endif
+#ifdef __NetBSD__
+#include <sys/statvfs.h>
+#endif
 
 #include <libzfs.h>
+#include <libzfs_core.h>
+#include <zfs_prop.h>
+#include <zfs_deleg.h>
 #include <libuutil.h>
+#ifdef illumos
+#include <aclutils.h>
+#include <directory.h>
+#include <idmap.h>
+#endif
 
 #include "zfs_iter.h"
 #include "zfs_util.h"
+#include "zfs_comutil.h"
 
 libzfs_handle_t *g_zfs;
 
 static FILE *mnttab_file;
 static char history_str[HIS_MAX_RECORD_LEN];
-const char *pypath = "/usr/lib/zfs/pyzfs.py";
+static boolean_t log_history = B_TRUE;
 
 static int zfs_do_clone(int argc, char **argv);
 static int zfs_do_create(int argc, char **argv);
@@ -79,9 +106,17 @@ static int zfs_do_send(int argc, char **
 static int zfs_do_receive(int argc, char **argv);
 static int zfs_do_promote(int argc, char **argv);
 static int zfs_do_userspace(int argc, char **argv);
-static int zfs_do_python(int argc, char **argv);
+static int zfs_do_allow(int argc, char **argv);
+static int zfs_do_unallow(int argc, char **argv);
 static int zfs_do_hold(int argc, char **argv);
+static int zfs_do_holds(int argc, char **argv);
 static int zfs_do_release(int argc, char **argv);
+static int zfs_do_diff(int argc, char **argv);
+#ifdef __FreeBSD__
+static int zfs_do_jail(int argc, char **argv);
+static int zfs_do_unjail(int argc, char **argv);
+#endif
+static int zfs_do_bookmark(int argc, char **argv);
 
 /*
  * Enable a reasonable set of defaults for libumem debugging on DEBUG builds.
@@ -108,6 +143,8 @@ typedef enum {
 	HELP_GET,
 	HELP_INHERIT,
 	HELP_UPGRADE,
+	HELP_JAIL,
+	HELP_UNJAIL,
 	HELP_LIST,
 	HELP_MOUNT,
 	HELP_PROMOTE,
@@ -126,7 +163,9 @@ typedef enum {
 	HELP_GROUPSPACE,
 	HELP_HOLD,
 	HELP_HOLDS,
-	HELP_RELEASE
+	HELP_RELEASE,
+	HELP_DIFF,
+	HELP_BOOKMARK,
 } zfs_help_t;
 
 typedef struct zfs_command {
@@ -153,6 +192,7 @@ static zfs_command_t command_table[] = {
 	{ "clone",	zfs_do_clone,		HELP_CLONE		},
 	{ "promote",	zfs_do_promote,		HELP_PROMOTE		},
 	{ "rename",	zfs_do_rename,		HELP_RENAME		},
+	{ "bookmark",	zfs_do_bookmark,	HELP_BOOKMARK		},
 	{ NULL },
 	{ "list",	zfs_do_list,		HELP_LIST		},
 	{ NULL },
@@ -171,13 +211,19 @@ static zfs_command_t command_table[] = {
 	{ "send",	zfs_do_send,		HELP_SEND		},
 	{ "receive",	zfs_do_receive,		HELP_RECEIVE		},
 	{ NULL },
-	{ "allow",	zfs_do_python,		HELP_ALLOW		},
+	{ "allow",	zfs_do_allow,		HELP_ALLOW		},
 	{ NULL },
-	{ "unallow",	zfs_do_python,		HELP_UNALLOW		},
+	{ "unallow",	zfs_do_unallow,		HELP_UNALLOW		},
 	{ NULL },
 	{ "hold",	zfs_do_hold,		HELP_HOLD		},
-	{ "holds",	zfs_do_python,		HELP_HOLDS		},
+	{ "holds",	zfs_do_holds,		HELP_HOLDS		},
 	{ "release",	zfs_do_release,		HELP_RELEASE		},
+	{ "diff",	zfs_do_diff,		HELP_DIFF		},
+#ifdef __FreeBSD__
+	{ NULL },
+	{ "jail",	zfs_do_jail,		HELP_JAIL		},
+	{ "unjail",	zfs_do_unjail,		HELP_UNJAIL		},
+#endif
 };
 
 #define	NCOMMAND	(sizeof (command_table) / sizeof (command_table[0]))
@@ -192,16 +238,19 @@ get_usage(zfs_help_t idx)
 		return (gettext("\tclone [-p] [-o property=value] ... "
 		    "<snapshot> <filesystem|volume>\n"));
 	case HELP_CREATE:
-		return (gettext("\tcreate [-p] [-o property=value] ... "
+		return (gettext("\tcreate [-pu] [-o property=value] ... "
 		    "<filesystem>\n"
 		    "\tcreate [-ps] [-b blocksize] [-o property=value] ... "
 		    "-V <size> <volume>\n"));
 	case HELP_DESTROY:
-		return (gettext("\tdestroy [-rRf] <filesystem|volume>\n"
-		    "\tdestroy [-rRd] <snapshot>\n"));
+		return (gettext("\tdestroy [-fnpRrv] <filesystem|volume>\n"
+		    "\tdestroy [-dnpRrv] "
+		    "<filesystem|volume>@<snap>[%<snap>][,...]\n"
+		    "\tdestroy <filesystem|volume>#<bookmark>\n"));
 	case HELP_GET:
 		return (gettext("\tget [-rHp] [-d max] "
-		    "[-o \"all\" | field[,...]] [-s source[,...]]\n"
+		    "[-o \"all\" | field[,...]]\n"
+		    "\t    [-t type[,...]] [-s source[,...]]\n"
 		    "\t    <\"all\" | property[,...]> "
 		    "[filesystem|volume|snapshot] ...\n"));
 	case HELP_INHERIT:
@@ -210,10 +259,13 @@ get_usage(zfs_help_t idx)
 	case HELP_UPGRADE:
 		return (gettext("\tupgrade [-v]\n"
 		    "\tupgrade [-r] [-V version] <-a | filesystem ...>\n"));
+	case HELP_JAIL:
+		return (gettext("\tjail <jailid|jailname> <filesystem>\n"));
+	case HELP_UNJAIL:
+		return (gettext("\tunjail <jailid|jailname> <filesystem>\n"));
 	case HELP_LIST:
-		return (gettext("\tlist [-rH][-d max] "
-		    "[-o property[,...]] [-t type[,...]] [-s property] ...\n"
-		    "\t    [-S property] ... "
+		return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] "
+		    "[-s property]...\n\t    [-S property]... [-t type[,...]] "
 		    "[filesystem|volume|snapshot] ...\n"));
 	case HELP_MOUNT:
 		return (gettext("\tmount\n"
@@ -221,28 +273,35 @@ get_usage(zfs_help_t idx)
 	case HELP_PROMOTE:
 		return (gettext("\tpromote <clone-filesystem>\n"));
 	case HELP_RECEIVE:
-		return (gettext("\treceive [-vnF] <filesystem|volume|"
-		"snapshot>\n"
-		"\treceive [-vnF] -d <filesystem>\n"));
+		return (gettext("\treceive|recv [-vnsFu] <filesystem|volume|"
+		    "snapshot>\n"
+		    "\treceive|recv [-vnsFu] [-o origin=<snapshot>] [-d | -e] "
+		    "<filesystem>\n"
+		    "\treceive|recv -A <filesystem|volume>\n"));
 	case HELP_RENAME:
-		return (gettext("\trename <filesystem|volume|snapshot> "
+		return (gettext("\trename [-f] <filesystem|volume|snapshot> "
 		    "<filesystem|volume|snapshot>\n"
-		    "\trename -p <filesystem|volume> <filesystem|volume>\n"
-		    "\trename -r <snapshot> <snapshot>"));
+		    "\trename [-f] -p <filesystem|volume> <filesystem|volume>\n"
+		    "\trename -r <snapshot> <snapshot>\n"
+		    "\trename -u [-p] <filesystem> <filesystem>"));
 	case HELP_ROLLBACK:
 		return (gettext("\trollback [-rRf] <snapshot>\n"));
 	case HELP_SEND:
-		return (gettext("\tsend [-RDp] [-[iI] snapshot] <snapshot>\n"));
+		return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
+		    "<snapshot>\n"
+		    "\tsend [-Le] [-i snapshot|bookmark] "
+		    "<filesystem|volume|snapshot>\n"
+		    "\tsend [-nvPe] -t <receive_resume_token>\n"));
 	case HELP_SET:
-		return (gettext("\tset <property=value> "
+		return (gettext("\tset <property=value> ... "
 		    "<filesystem|volume|snapshot> ...\n"));
 	case HELP_SHARE:
 		return (gettext("\tshare <-a | filesystem>\n"));
 	case HELP_SNAPSHOT:
-		return (gettext("\tsnapshot [-r] [-o property=value] ... "
-		    "<filesystem@snapname|volume@snapname>\n"));
+		return (gettext("\tsnapshot|snap [-r] [-o property=value] ... "
+		    "<filesystem|volume>@<snap> ...\n"));
 	case HELP_UNMOUNT:
-		return (gettext("\tunmount [-f] "
+		return (gettext("\tunmount|umount [-f] "
 		    "<-a | filesystem|mountpoint>\n"));
 	case HELP_UNSHARE:
 		return (gettext("\tunshare "
@@ -268,41 +327,66 @@ get_usage(zfs_help_t idx)
 		    "\tunallow [-r] -s @setname [<perm|@setname>[,...]] "
 		    "<filesystem|volume>\n"));
 	case HELP_USERSPACE:
-		return (gettext("\tuserspace [-hniHp] [-o field[,...]] "
-		    "[-sS field] ... [-t type[,...]]\n"
-		    "\t    <filesystem|snapshot>\n"));
+		return (gettext("\tuserspace [-Hinp] [-o field[,...]] "
+		    "[-s field] ...\n"
+		    "\t    [-S field] ... [-t type[,...]] "
+		    "<filesystem|snapshot>\n"));
 	case HELP_GROUPSPACE:
-		return (gettext("\tgroupspace [-hniHpU] [-o field[,...]] "
-		    "[-sS field] ... [-t type[,...]]\n"
-		    "\t    <filesystem|snapshot>\n"));
+		return (gettext("\tgroupspace [-Hinp] [-o field[,...]] "
+		    "[-s field] ...\n"
+		    "\t    [-S field] ... [-t type[,...]] "
+		    "<filesystem|snapshot>\n"));
 	case HELP_HOLD:
 		return (gettext("\thold [-r] <tag> <snapshot> ...\n"));
 	case HELP_HOLDS:
-		return (gettext("\tholds [-r] <snapshot> ...\n"));
+		return (gettext("\tholds [-Hp] [-r|-d depth] "
+		    "<filesystem|volume|snapshot> ...\n"));
 	case HELP_RELEASE:
 		return (gettext("\trelease [-r] <tag> <snapshot> ...\n"));
+	case HELP_DIFF:
+		return (gettext("\tdiff [-FHt] <snapshot> "
+		    "[snapshot|filesystem]\n"));
+	case HELP_BOOKMARK:
+		return (gettext("\tbookmark <snapshot> <bookmark>\n"));
 	}
 
 	abort();
 	/* NOTREACHED */
 }
 
+void
+nomem(void)
+{
+	(void) fprintf(stderr, gettext("internal error: out of memory\n"));
+	exit(1);
+}
+
 /*
  * Utility function to guarantee malloc() success.
  */
 void *
 safe_malloc(size_t size)
 {
 	void *data;
 
-	if ((data = calloc(1, size)) == NULL) {
-		(void) fprintf(stderr, "internal error: out of memory\n");
-		exit(1);
-	}
+	if ((data = calloc(1, size)) == NULL)
+		nomem();
 
 	return (data);
 }
 
+static char *
+safe_strdup(char *str)
+{
+	char *dupstr = strdup(str);
+
+	if (dupstr == NULL)
+		nomem();
+
+	return (dupstr);
+}
+
 /*
  * Callback routine that will print out information for each of
  * the properties.
@@ -391,6 +475,8 @@ usage(boolean_t requested)
 		(void) fprintf(fp, "YES       NO   <size> | none\n");
 		(void) fprintf(fp, "\t%-15s ", "groupquota@...");
 		(void) fprintf(fp, "YES       NO   <size> | none\n");
+		(void) fprintf(fp, "\t%-15s ", "written@<snap>");
+		(void) fprintf(fp, " NO       NO   <size>\n");
 
 		(void) fprintf(fp, gettext("\nSizes are specified in bytes "
 		    "with standard units such as K, M, G, etc.\n"));
@@ -423,15 +509,18 @@ usage(boolean_t requested)
 	exit(requested ? 0 : 2);
 }
 
+/*
+ * Take a property=value argument string and add it to the given nvlist.
+ * Modifies the argument in place.
+ */
 static int
-parseprop(nvlist_t *props)
+parseprop(nvlist_t *props, char *propname)
 {
-	char *propname = optarg;
 	char *propval, *strval;
 
 	if ((propval = strchr(propname, '=')) == NULL) {
 		(void) fprintf(stderr, gettext("missing "
-		    "'=' for -o option\n"));
+		    "'=' for property=value argument\n"));
 		return (-1);
 	}
 	*propval = '\0';
@@ -441,11 +530,8 @@ parseprop(nvlist_t *props)
 		    "specified multiple times\n"), propname);
 		return (-1);
 	}
-	if (nvlist_add_string(props, propname, propval) != 0) {
-		(void) fprintf(stderr, gettext("internal "
-		    "error: out of memory\n"));
-		return (-1);
-	}
+	if (nvlist_add_string(props, propname, propval) != 0)
+		nomem();
 	return (0);
 }
 
@@ -458,7 +544,7 @@ parse_depth(char *opt, int *flags)
 	depth = (int)strtol(opt, &tmp, 0);
 	if (*tmp) {
 		(void) fprintf(stderr,
-		    gettext("%s is not an integer\n"), optarg);
+		    gettext("%s is not an integer\n"), opt);
 		usage(B_FALSE);
 	}
 	if (depth < 0) {
@@ -470,6 +556,71 @@ parse_depth(char *opt, int *flags)
 	return (depth);
 }
 
+#define	PROGRESS_DELAY 2		/* seconds */
+
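+/* A run of backspaces used to pull the cursor back over the last update. */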
+static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
+static time_t pt_begin;
+static char *pt_header = NULL;
+static boolean_t pt_shown;
+
+static void
+start_progress_timer(void)
+{
+	pt_begin = time(NULL) + PROGRESS_DELAY;
+	pt_shown = B_FALSE;
+}
+
+static void
+set_progress_header(char *header)
+{
+	assert(pt_header == NULL);
+	pt_header = safe_strdup(header);
+	if (pt_shown) {
+		(void) printf("%s: ", header);
+		(void) fflush(stdout);
+	}
+}
+
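+/*
+ * Progress output stays silent until PROGRESS_DELAY seconds have passed,
+ * so operations that finish quickly print nothing.
+ */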
+static void
+update_progress(char *update)
+{
+	if (!pt_shown && time(NULL) > pt_begin) {
+		int len = strlen(update);
+
+		(void) printf("%s: %s%*.*s", pt_header, update, len, len,
+		    pt_reverse);
+		(void) fflush(stdout);
+		pt_shown = B_TRUE;
+	} else if (pt_shown) {
+		int len = strlen(update);
+
+		(void) printf("%s%*.*s", update, len, len, pt_reverse);
+		(void) fflush(stdout);
+	}
+}
+
+static void
+finish_progress(char *done)
+{
+	if (pt_shown) {
+		(void) printf("%s\n", done);
+		(void) fflush(stdout);
+	}
+	free(pt_header);
+	pt_header = NULL;
+}
+
+/*
+ * Check if the dataset is mountable and should be automatically mounted.
+ */
+static boolean_t
+should_auto_mount(zfs_handle_t *zhp)
+{
+	if (!zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, zfs_get_type(zhp)))
+		return (B_FALSE);
+	return (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON);
+}
+
 /*
  * zfs clone [-p] [-o prop=value] ... <snap> <fs | vol>
  *
@@ -486,20 +637,17 @@ zfs_do_clone(int argc, char **argv)
 	zfs_handle_t *zhp = NULL;
 	boolean_t parents = B_FALSE;
 	nvlist_t *props;
-	int ret;
+	int ret = 0;
 	int c;
 
-	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
-		(void) fprintf(stderr, gettext("internal error: "
-		    "out of memory\n"));
-		return (1);
-	}
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
 
 	/* check options */
 	while ((c = getopt(argc, argv, "o:p")) != -1) {
 		switch (c) {
 		case 'o':
-			if (parseprop(props))
+			if (parseprop(props, optarg) != 0)
 				return (1);
 			break;
 		case 'p':
@@ -558,8 +706,22 @@ zfs_do_clone(int argc, char **argv)
 
 		clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_DATASET);
 		if (clone != NULL) {
-			if ((ret = zfs_mount(clone, NULL, 0)) == 0)
-				ret = zfs_share(clone);
+			/*
+			 * If the user doesn't want the dataset
+			 * automatically mounted, then skip the mount/share
+			 * step.
+			 */
+			if (should_auto_mount(clone)) {
+				if ((ret = zfs_mount(clone, NULL, 0)) != 0) {
+					(void) fprintf(stderr, gettext("clone "
+					    "successfully created, "
+					    "but not mounted\n"));
+				} else if ((ret = zfs_share(clone)) != 0) {
+					(void) fprintf(stderr, gettext("clone "
+					    "successfully created, "
+					    "but not shared\n"));
+				}
+			}
 			zfs_close(clone);
 		}
 	}
@@ -578,7 +740,7 @@ usage:
 }
 
 /*
- * zfs create [-p] [-o prop=value] ... fs
+ * zfs create [-pu] [-o prop=value] ... fs
  * zfs create [-ps] [-b blocksize] [-o prop=value] ... -V vol size
  *
  * Create a new dataset.  This command can be used to create filesystems
@@ -591,30 +753,29 @@ usage:
  * SPA_VERSION_REFRESERVATION, we set a refreservation instead.
  *
  * The '-p' flag creates all the non-existing ancestors of the target first.
+ *
+ * The '-u' flag prevents mounting of the newly created file system.
  */
 static int
 zfs_do_create(int argc, char **argv)
 {
 	zfs_type_t type = ZFS_TYPE_FILESYSTEM;
 	zfs_handle_t *zhp = NULL;
-	uint64_t volsize;
+	uint64_t volsize = 0;
 	int c;
 	boolean_t noreserve = B_FALSE;
 	boolean_t bflag = B_FALSE;
 	boolean_t parents = B_FALSE;
+	boolean_t nomount = B_FALSE;
 	int ret = 1;
 	nvlist_t *props;
 	uint64_t intval;
-	int canmount;
 
-	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
-		(void) fprintf(stderr, gettext("internal error: "
-		    "out of memory\n"));
-		return (1);
-	}
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":V:b:so:p")) != -1) {
+	while ((c = getopt(argc, argv, ":V:b:so:pu")) != -1) {
 		switch (c) {
 		case 'V':
 			type = ZFS_TYPE_VOLUME;
@@ -626,12 +787,8 @@ zfs_do_create(int argc, char **argv)
 			}
 
 			if (nvlist_add_uint64(props,
-			    zfs_prop_to_name(ZFS_PROP_VOLSIZE),
-			    intval) != 0) {
-				(void) fprintf(stderr, gettext("internal "
-				    "error: out of memory\n"));
-				goto error;
-			}
+			    zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0)
+				nomem();
 			volsize = intval;
 			break;
 		case 'p':
@@ -648,24 +805,23 @@ zfs_do_create(int argc, char **argv)
 
 			if (nvlist_add_uint64(props,
 			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
-			    intval) != 0) {
-				(void) fprintf(stderr, gettext("internal "
-				    "error: out of memory\n"));
-				goto error;
-			}
+			    intval) != 0)
+				nomem();
 			break;
 		case 'o':
-			if (parseprop(props))
+			if (parseprop(props, optarg) != 0)
 				goto error;
 			break;
 		case 's':
 			noreserve = B_TRUE;
 			break;
+		case 'u':
+			nomount = B_TRUE;
+			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing size "
 			    "argument\n"));
 			goto badusage;
-			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
@@ -678,6 +834,11 @@ zfs_do_create(int argc, char **argv)
 		    "used when creating a volume\n"));
 		goto badusage;
 	}
+	if (nomount && type != ZFS_TYPE_FILESYSTEM) {
+		(void) fprintf(stderr, gettext("'-u' can only be "
+		    "used when creating a file system\n"));
+		goto badusage;
+	}
 
 	argc -= optind;
 	argv += optind;
@@ -695,12 +856,14 @@ zfs_do_create(int argc, char **argv)
 
 	if (type == ZFS_TYPE_VOLUME && !noreserve) {
 		zpool_handle_t *zpool_handle;
+		nvlist_t *real_props = NULL;
 		uint64_t spa_version;
 		char *p;
 		zfs_prop_t resv_prop;
 		char *strval;
+		char msg[1024];
 
-		if (p = strchr(argv[0], '/'))
+		if ((p = strchr(argv[0], '/')) != NULL)
 			*p = '\0';
 		zpool_handle = zpool_open(g_zfs, argv[0]);
 		if (p != NULL)
@@ -709,21 +872,29 @@ zfs_do_create(int argc, char **argv)
 			goto error;
 		spa_version = zpool_get_prop_int(zpool_handle,
 		    ZPOOL_PROP_VERSION, NULL);
-		zpool_close(zpool_handle);
 		if (spa_version >= SPA_VERSION_REFRESERVATION)
 			resv_prop = ZFS_PROP_REFRESERVATION;
 		else
 			resv_prop = ZFS_PROP_RESERVATION;
-		volsize = zvol_volsize_to_reservation(volsize, props);
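+		/*
+		 * Validate the property list against the target pool before
+		 * sizing the reservation; the result depends on the
+		 * effective property values.
+		 */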
+
+		(void) snprintf(msg, sizeof (msg),
+		    gettext("cannot create '%s'"), argv[0]);
+		if (props && (real_props = zfs_valid_proplist(g_zfs, type,
+		    props, 0, NULL, zpool_handle, msg)) == NULL) {
+			zpool_close(zpool_handle);
+			goto error;
+		}
+		zpool_close(zpool_handle);
+
+		volsize = zvol_volsize_to_reservation(volsize, real_props);
+		nvlist_free(real_props);
 
 		if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
 		    &strval) != 0) {
 			if (nvlist_add_uint64(props,
 			    zfs_prop_to_name(resv_prop), volsize) != 0) {
-				(void) fprintf(stderr, gettext("internal "
-				    "error: out of memory\n"));
 				nvlist_free(props);
-				return (1);
+				nomem();
 			}
 		}
 	}
@@ -748,20 +919,17 @@ zfs_do_create(int argc, char **argv)
 
 	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
 		goto error;
-	/*
-	 * if the user doesn't want the dataset automatically mounted,
-	 * then skip the mount/share step
-	 */
 
-	canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
+	ret = 0;
 
 	/*
 	 * Mount and/or share the new filesystem as appropriate.  We provide a
 	 * verbose error message to let the user know that their filesystem was
 	 * in fact created, even if we failed to mount or share it.
+	 * If the user doesn't want the dataset automatically mounted,
+	 * then skip the mount/share step altogether.
 	 */
-	ret = 0;
-	if (canmount == ZFS_CANMOUNT_ON) {
+	if (!nomount && should_auto_mount(zhp)) {
 		if (zfs_mount(zhp, NULL, 0) != 0) {
 			(void) fprintf(stderr, gettext("filesystem "
 			    "successfully created, but not mounted\n"));
@@ -799,15 +967,25 @@ badusage:
  */
 typedef struct destroy_cbdata {
 	boolean_t	cb_first;
-	int		cb_force;
-	int		cb_recurse;
-	int		cb_error;
-	int		cb_needforce;
-	int		cb_doclones;
-	boolean_t	cb_closezhp;
+	boolean_t	cb_force;
+	boolean_t	cb_recurse;
+	boolean_t	cb_error;
+	boolean_t	cb_doclones;
 	zfs_handle_t	*cb_target;
-	char		*cb_snapname;
 	boolean_t	cb_defer_destroy;
+	boolean_t	cb_verbose;
+	boolean_t	cb_parsable;
+	boolean_t	cb_dryrun;
+	nvlist_t	*cb_nvl;
+	nvlist_t	*cb_batchedsnaps;
+
+	/* first snap in contiguous run */
+	char		*cb_firstsnap;
+	/* previous snap in contiguous run */
+	char		*cb_prevsnap;
+	int64_t		cb_snapused;
+	char		*cb_snapspec;
+	char		*cb_bookmark;
 } destroy_cbdata_t;
 
 /*
@@ -837,7 +1015,7 @@ destroy_check_dependent(zfs_handle_t *zh
 			(void) fprintf(stderr, gettext("use '-r' to destroy "
 			    "the following datasets:\n"));
 			cbp->cb_first = B_FALSE;
-			cbp->cb_error = 1;
+			cbp->cb_error = B_TRUE;
 		}
 
 		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
@@ -858,7 +1036,8 @@ destroy_check_dependent(zfs_handle_t *zh
 			(void) fprintf(stderr, gettext("use '-R' to destroy "
 			    "the following datasets:\n"));
 			cbp->cb_first = B_FALSE;
-			cbp->cb_error = 1;
+			cbp->cb_error = B_TRUE;
+			cbp->cb_dryrun = B_TRUE;
 		}
 
 		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
@@ -872,7 +1051,20 @@ out:
 static int
 destroy_callback(zfs_handle_t *zhp, void *data)
 {
-	destroy_cbdata_t *cbp = data;
+	destroy_cbdata_t *cb = data;
+	const char *name = zfs_get_name(zhp);
+
+	if (cb->cb_verbose) {
+		if (cb->cb_parsable) {
+			(void) printf("destroy\t%s\n", name);
+		} else if (cb->cb_dryrun) {
+			(void) printf(gettext("would destroy %s\n"),
+			    name);
+		} else {
+			(void) printf(gettext("will destroy %s\n"),
+			    name);
+		}
+	}
 
 	/*
 	 * Ignore pools (which we've already flagged as an error before getting
@@ -883,14 +1075,31 @@ destroy_callback(zfs_handle_t *zhp, void
 		zfs_close(zhp);
 		return (0);
 	}
+	if (cb->cb_dryrun) {
+		zfs_close(zhp);
+		return (0);
+	}
 
 	/*
-	 * Bail out on the first error.
+	 * We batch up all contiguous snapshots (even of different
+	 * filesystems) and destroy them with one ioctl.  We can't
+	 * simply do all snap deletions and then all fs deletions,
+	 * because we must delete a clone before its origin.
 	 */
-	if (zfs_unmount(zhp, NULL, cbp->cb_force ? MS_FORCE : 0) != 0 ||
-	    zfs_destroy(zhp, cbp->cb_defer_destroy) != 0) {
-		zfs_close(zhp);
-		return (-1);
+	if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) {
+		fnvlist_add_boolean(cb->cb_batchedsnaps, name);
+	} else {
+		int error = zfs_destroy_snaps_nvl(g_zfs,
+		    cb->cb_batchedsnaps, B_FALSE);
+		fnvlist_free(cb->cb_batchedsnaps);
+		cb->cb_batchedsnaps = fnvlist_alloc();
+
+		if (error != 0 ||
+		    zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 ||
+		    zfs_destroy(zhp, cb->cb_defer_destroy) != 0) {
+			zfs_close(zhp);
+			return (-1);
+		}
 	}
 
 	zfs_close(zhp);
@@ -898,66 +1107,183 @@ destroy_callback(zfs_handle_t *zhp, void
 }
 
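+/*
+ * Callback for zfs_iter_snapshots_sorted(): accumulate the space that the
+ * destroy would reclaim, measuring each contiguous run of doomed snapshots
+ * with a single lzc_snaprange_space() call.
+ */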
 static int
-destroy_snap_clones(zfs_handle_t *zhp, void *arg)
+destroy_print_cb(zfs_handle_t *zhp, void *arg)
 {
-	destroy_cbdata_t *cbp = arg;
-	char thissnap[MAXPATHLEN];
-	zfs_handle_t *szhp;
-	boolean_t closezhp = cbp->cb_closezhp;
-	int rv;
+	destroy_cbdata_t *cb = arg;
+	const char *name = zfs_get_name(zhp);
+	int err = 0;
 
-	(void) snprintf(thissnap, sizeof (thissnap),
-	    "%s@%s", zfs_get_name(zhp), cbp->cb_snapname);
+	if (nvlist_exists(cb->cb_nvl, name)) {
+		if (cb->cb_firstsnap == NULL)
+			cb->cb_firstsnap = strdup(name);
+		if (cb->cb_prevsnap != NULL)
+			free(cb->cb_prevsnap);
+		/* this snap continues the current range */
+		cb->cb_prevsnap = strdup(name);
+		if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL)
+			nomem();
+		if (cb->cb_verbose) {
+			if (cb->cb_parsable) {
+				(void) printf("destroy\t%s\n", name);
+			} else if (cb->cb_dryrun) {
+				(void) printf(gettext("would destroy %s\n"),
+				    name);
+			} else {
+				(void) printf(gettext("will destroy %s\n"),
+				    name);
+			}
+		}
+	} else if (cb->cb_firstsnap != NULL) {
+		/* end of this range */
+		uint64_t used = 0;
+		err = lzc_snaprange_space(cb->cb_firstsnap,
+		    cb->cb_prevsnap, &used);
+		cb->cb_snapused += used;
+		free(cb->cb_firstsnap);
+		cb->cb_firstsnap = NULL;
+		free(cb->cb_prevsnap);
+		cb->cb_prevsnap = NULL;
+	}
+	zfs_close(zhp);
+	return (err);
+}
 
-	libzfs_print_on_error(g_zfs, B_FALSE);
-	szhp = zfs_open(g_zfs, thissnap, ZFS_TYPE_SNAPSHOT);
-	libzfs_print_on_error(g_zfs, B_TRUE);
-	if (szhp) {
-		/*
-		 * Destroy any clones of this snapshot
-		 */
-		if (zfs_iter_dependents(szhp, B_FALSE, destroy_callback,
-		    cbp) != 0) {
-			zfs_close(szhp);
-			if (closezhp)
-				zfs_close(zhp);
-			return (-1);
+static int
+destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb)
+{
+	int err = 0;
+	assert(cb->cb_firstsnap == NULL);
+	assert(cb->cb_prevsnap == NULL);
+	err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb);
+	if (cb->cb_firstsnap != NULL) {
+		uint64_t used = 0;
+		if (err == 0) {
+			err = lzc_snaprange_space(cb->cb_firstsnap,
+			    cb->cb_prevsnap, &used);
 		}
-		zfs_close(szhp);
+		cb->cb_snapused += used;
+		free(cb->cb_firstsnap);
+		cb->cb_firstsnap = NULL;
+		free(cb->cb_prevsnap);
+		cb->cb_prevsnap = NULL;
 	}
+	return (err);
+}
 
-	cbp->cb_closezhp = B_TRUE;
-	rv = zfs_iter_filesystems(zhp, destroy_snap_clones, arg);
-	if (closezhp)
-		zfs_close(zhp);
-	return (rv);
+static int
+snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg)
+{
+	destroy_cbdata_t *cb = arg;
+	int err = 0;
+
+	/* Check for clones. */
+	if (!cb->cb_doclones && !cb->cb_defer_destroy) {
+		cb->cb_target = zhp;
+		cb->cb_first = B_TRUE;
+		err = zfs_iter_dependents(zhp, B_TRUE,
+		    destroy_check_dependent, cb);
+	}
+
+	if (err == 0) {
+		if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp)))
+			nomem();
+	}
+	zfs_close(zhp);
+	return (err);
+}
+
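+/*
+ * Expand the user's snapshot specification (which may name multiple
+ * snapshots and %-ranges) under this filesystem, recursing into child
+ * filesystems when -r was given.
+ */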
+static int
+gather_snapshots(zfs_handle_t *zhp, void *arg)
+{
+	destroy_cbdata_t *cb = arg;
+	int err = 0;
+
+	err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb);
+	if (err == ENOENT)
+		err = 0;
+	if (err != 0)
+		goto out;
+
+	if (cb->cb_verbose) {
+		err = destroy_print_snapshots(zhp, cb);
+		if (err != 0)
+			goto out;
+	}
+
+	if (cb->cb_recurse)
+		err = zfs_iter_filesystems(zhp, gather_snapshots, cb);
+
+out:
+	zfs_close(zhp);
+	return (err);
+}
+
+static int
+destroy_clones(destroy_cbdata_t *cb)
+{
+	nvpair_t *pair;
+	for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL);
+	    pair != NULL;
+	    pair = nvlist_next_nvpair(cb->cb_nvl, pair)) {
+		zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair),
+		    ZFS_TYPE_SNAPSHOT);
+		if (zhp != NULL) {
+			boolean_t defer = cb->cb_defer_destroy;
+			int err = 0;
+
+			/*
+			 * We can't defer destroy non-snapshots, so set it to
+			 * false while destroying the clones.
+			 */
+			cb->cb_defer_destroy = B_FALSE;
+			err = zfs_iter_dependents(zhp, B_FALSE,
+			    destroy_callback, cb);
+			cb->cb_defer_destroy = defer;
+			zfs_close(zhp);
+			if (err != 0)
+				return (err);
+		}
+	}
+	return (0);
 }
 
 static int
 zfs_do_destroy(int argc, char **argv)
 {
 	destroy_cbdata_t cb = { 0 };
+	int rv = 0;
+	int err = 0;
 	int c;
-	zfs_handle_t *zhp;
-	char *cp;
+	zfs_handle_t *zhp = NULL;
+	char *at, *pound;
 	zfs_type_t type = ZFS_TYPE_DATASET;
 
 	/* check options */
-	while ((c = getopt(argc, argv, "dfrR")) != -1) {
+	while ((c = getopt(argc, argv, "vpndfrR")) != -1) {
 		switch (c) {
+		case 'v':
+			cb.cb_verbose = B_TRUE;
+			break;
+		case 'p':
+			cb.cb_verbose = B_TRUE;
+			cb.cb_parsable = B_TRUE;
+			break;
+		case 'n':
+			cb.cb_dryrun = B_TRUE;
+			break;
 		case 'd':
 			cb.cb_defer_destroy = B_TRUE;
 			type = ZFS_TYPE_SNAPSHOT;
 			break;
 		case 'f':
-			cb.cb_force = 1;
+			cb.cb_force = B_TRUE;
 			break;
 		case 'r':
-			cb.cb_recurse = 1;
+			cb.cb_recurse = B_TRUE;
 			break;
 		case 'R':
-			cb.cb_recurse = 1;
-			cb.cb_doclones = 1;
+			cb.cb_recurse = B_TRUE;
+			cb.cb_doclones = B_TRUE;
 			break;
 		case '?':
 		default:
@@ -972,7 +1298,7 @@ zfs_do_destroy(int argc, char **argv)
 
 	/* check number of arguments */
 	if (argc == 0) {
-		(void) fprintf(stderr, gettext("missing path argument\n"));
+		(void) fprintf(stderr, gettext("missing dataset argument\n"));
 		usage(B_FALSE);
 	}
 	if (argc > 1) {
@@ -980,112 +1306,195 @@ zfs_do_destroy(int argc, char **argv)
 		usage(B_FALSE);
 	}
 
-	/*
-	 * If we are doing recursive destroy of a snapshot, then the
-	 * named snapshot may not exist.  Go straight to libzfs.
-	 */
-	if (cb.cb_recurse && (cp = strchr(argv[0], '@'))) {
-		int ret;
+	at = strchr(argv[0], '@');
+	pound = strchr(argv[0], '#');
+	if (at != NULL) {
+		/* Build the list of snaps to destroy in cb_nvl. */
+		cb.cb_nvl = fnvlist_alloc();
 
-		*cp = '\0';
-		if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
+		*at = '\0';
+		zhp = zfs_open(g_zfs, argv[0],
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (zhp == NULL)
 			return (1);
-		*cp = '@';
-		cp++;
 
-		if (cb.cb_doclones) {
-			boolean_t defer = cb.cb_defer_destroy;
+		cb.cb_snapspec = at + 1;
+		if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 ||
+		    cb.cb_error) {
+			rv = 1;
+			goto out;
+		}
+
+		if (nvlist_empty(cb.cb_nvl)) {
+			(void) fprintf(stderr, gettext("could not find any "
+			    "snapshots to destroy; check snapshot names.\n"));
+			rv = 1;
+			goto out;
+		}
 
-			/*
-			 * Temporarily ignore the defer_destroy setting since
-			 * it's not supported for clones.
-			 */
-			cb.cb_defer_destroy = B_FALSE;
-			cb.cb_snapname = cp;
-			if (destroy_snap_clones(zhp, &cb) != 0) {
-				zfs_close(zhp);
-				return (1);
+		if (cb.cb_verbose) {
+			char buf[16];
+			zfs_nicenum(cb.cb_snapused, buf, sizeof (buf));
+			if (cb.cb_parsable) {
+				(void) printf("reclaim\t%llu\n",
+				    cb.cb_snapused);
+			} else if (cb.cb_dryrun) {
+				(void) printf(gettext("would reclaim %s\n"),
+				    buf);
+			} else {
+				(void) printf(gettext("will reclaim %s\n"),
+				    buf);
 			}
-			cb.cb_defer_destroy = defer;
 		}
 
-		ret = zfs_destroy_snaps(zhp, cp, cb.cb_defer_destroy);
-		zfs_close(zhp);
-		if (ret) {
+		if (!cb.cb_dryrun) {
+			if (cb.cb_doclones) {
+				cb.cb_batchedsnaps = fnvlist_alloc();
+				err = destroy_clones(&cb);
+				if (err == 0) {
+					err = zfs_destroy_snaps_nvl(g_zfs,
+					    cb.cb_batchedsnaps, B_FALSE);
+				}
+				if (err != 0) {
+					rv = 1;
+					goto out;
+				}
+			}
+			if (err == 0) {
+				err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl,
+				    cb.cb_defer_destroy);
+			}
+		}
+
+		if (err != 0)
+			rv = 1;
+	} else if (pound != NULL) {
+		int err;
+		nvlist_t *nvl;
+
+		if (cb.cb_dryrun) {
 			(void) fprintf(stderr,
-			    gettext("no snapshots destroyed\n"));
+			    gettext("dry run is not supported with bookmarks\n"));
+			return (-1);
 		}
-		return (ret != 0);
-	}
 
-	/* Open the given dataset */
-	if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL)
-		return (1);
+		if (cb.cb_defer_destroy) {
+			(void) fprintf(stderr,
+			    gettext("defer destroy is not supported with bookmarks\n"));
+			return (-1);
+		}
 
-	cb.cb_target = zhp;
+		if (cb.cb_recurse) {
+			(void) fprintf(stderr,
+			    gettext("recursive destroy is not supported with bookmarks\n"));
+			return (-1);
+		}
 
-	/*
-	 * Perform an explicit check for pools before going any further.
-	 */
-	if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL &&
-	    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
-		(void) fprintf(stderr, gettext("cannot destroy '%s': "
-		    "operation does not apply to pools\n"),
-		    zfs_get_name(zhp));
-		(void) fprintf(stderr, gettext("use 'zfs destroy -r "
-		    "%s' to destroy all datasets in the pool\n"),
-		    zfs_get_name(zhp));
-		(void) fprintf(stderr, gettext("use 'zpool destroy %s' "
-		    "to destroy the pool itself\n"), zfs_get_name(zhp));
-		zfs_close(zhp);
-		return (1);
-	}
+		if (!zfs_bookmark_exists(argv[0])) {
+			(void) fprintf(stderr, gettext("bookmark '%s' "
+			    "does not exist.\n"), argv[0]);
+			return (1);
+		}
 
-	/*
-	 * Check for any dependents and/or clones.
-	 */
-	cb.cb_first = B_TRUE;
-	if (!cb.cb_doclones && !cb.cb_defer_destroy &&
-	    zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
-	    &cb) != 0) {
-		zfs_close(zhp);
-		return (1);
-	}
+		nvl = fnvlist_alloc();
+		fnvlist_add_boolean(nvl, argv[0]);
 
-	if (cb.cb_error || (!cb.cb_defer_destroy &&
-	    (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0))) {
-		zfs_close(zhp);
-		return (1);
-	}
+		err = lzc_destroy_bookmarks(nvl, NULL);
+		if (err != 0) {
+			(void) zfs_standard_error(g_zfs, err,
+			    "cannot destroy bookmark");
+		}
 
-	/*
-	 * Do the real thing.  The callback will close the handle regardless of
-	 * whether it succeeds or not.
-	 */
+		nvlist_free(cb.cb_nvl);
 
-	if (destroy_callback(zhp, &cb) != 0)
-		return (1);
+		return (err);
+	} else {
+		/* Open the given dataset */
+		if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL)
+			return (1);
 
-	return (0);
-}
+		cb.cb_target = zhp;
 
-static boolean_t
-is_recvd_column(zprop_get_cbdata_t *cbp)
-{
-	int i;
-	zfs_get_column_t col;
+		/*
+		 * Perform an explicit check for pools before going any further.
+		 */
+		if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL &&
+		    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
+			(void) fprintf(stderr, gettext("cannot destroy '%s': "
+			    "operation does not apply to pools\n"),
+			    zfs_get_name(zhp));
+			(void) fprintf(stderr, gettext("use 'zfs destroy -r "
+			    "%s' to destroy all datasets in the pool\n"),
+			    zfs_get_name(zhp));
+			(void) fprintf(stderr, gettext("use 'zpool destroy %s' "
+			    "to destroy the pool itself\n"), zfs_get_name(zhp));
+			rv = 1;
+			goto out;
+		}
 
-	for (i = 0; i < ZFS_GET_NCOLS &&
-	    (col = cbp->cb_columns[i]) != GET_COL_NONE; i++)
-		if (col == GET_COL_RECVD)
-			return (B_TRUE);
-	return (B_FALSE);
-}
+		/*
+		 * Check for any dependents and/or clones.
+		 */
+		cb.cb_first = B_TRUE;
+		if (!cb.cb_doclones &&
+		    zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent,
+		    &cb) != 0) {
+			rv = 1;
+			goto out;
+		}
 
-/*
- * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...]
- *	< all | property[,property]... > < fs | snap | vol > ...
- *
+		if (cb.cb_error) {
+			rv = 1;
+			goto out;
+		}
+
+		cb.cb_batchedsnaps = fnvlist_alloc();
+		if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback,
+		    &cb) != 0) {
+			rv = 1;
+			goto out;
+		}
+
+		/*
+		 * Do the real thing.  The callback will close the
+		 * handle regardless of whether it succeeds or not.
+		 */
+		err = destroy_callback(zhp, &cb);
+		zhp = NULL;
+		if (err == 0) {
+			err = zfs_destroy_snaps_nvl(g_zfs,
+			    cb.cb_batchedsnaps, cb.cb_defer_destroy);
+		}
+		if (err != 0)
+			rv = 1;
+	}
+
+out:
+	fnvlist_free(cb.cb_batchedsnaps);
+	fnvlist_free(cb.cb_nvl);
+	if (zhp != NULL)
+		zfs_close(zhp);
+	return (rv);
+}
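+
+/*
+ * Usage sketch (hypothetical pool name): the new -n/-v/-p flags let a
+ * destroy be previewed before anything is removed, e.g.
+ *
+ *	# zfs destroy -nv tank/fs@a%c
+ *	would reclaim 1.2G
+ *
+ * while -p emits the machine-readable "reclaim\t<bytes>" form instead.
+ * Only when -n is absent is the gathered nvlist actually passed to
+ * zfs_destroy_snaps_nvl().
+ */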
+
+static boolean_t
+is_recvd_column(zprop_get_cbdata_t *cbp)
+{
+	int i;
+	zfs_get_column_t col;
+
+	for (i = 0; i < ZFS_GET_NCOLS &&
+	    (col = cbp->cb_columns[i]) != GET_COL_NONE; i++)
+		if (col == GET_COL_RECVD)
+			return (B_TRUE);
+	return (B_FALSE);
+}
+
+/*
+ * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...]
+ *	< all | property[,property]... > < fs | snap | vol > ...
+ *
  *	-r	recurse over any child datasets
  *	-H	scripted mode.  Headers are stripped, and fields are separated
  *		by tabs instead of spaces.
@@ -1110,7 +1519,7 @@ get_callback(zfs_handle_t *zhp, void *da
 	char buf[ZFS_MAXPROPLEN];
 	char rbuf[ZFS_MAXPROPLEN];
 	zprop_source_t sourcetype;
-	char source[ZFS_MAXNAMELEN];
+	char source[ZFS_MAX_DATASET_NAME_LEN];
 	zprop_get_cbdata_t *cbp = data;
 	nvlist_t *user_props = zfs_get_user_props(zhp);
 	zprop_list_t *pl = cbp->cb_proplist;
@@ -1166,6 +1575,17 @@ get_callback(zfs_handle_t *zhp, void *da
 
 			zprop_print_one_property(zfs_get_name(zhp), cbp,
 			    pl->pl_user_prop, buf, sourcetype, source, NULL);
+		} else if (zfs_prop_written(pl->pl_user_prop)) {
+			sourcetype = ZPROP_SRC_LOCAL;
+
+			if (zfs_prop_get_written(zhp, pl->pl_user_prop,
+			    buf, sizeof (buf), cbp->cb_literal) != 0) {
+				sourcetype = ZPROP_SRC_NONE;
+				(void) strlcpy(buf, "-", sizeof (buf));
+			}
+
+			zprop_print_one_property(zfs_get_name(zhp), cbp,
+			    pl->pl_user_prop, buf, sourcetype, source, NULL);
 		} else {
 			if (nvlist_lookup_nvlist(user_props,
 			    pl->pl_user_prop, &propval) != 0) {
@@ -1210,9 +1630,10 @@ static int
 zfs_do_get(int argc, char **argv)
 {
 	zprop_get_cbdata_t cb = { 0 };
-	int i, c, flags = 0;
+	int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
+	int types = ZFS_TYPE_DATASET;
 	char *value, *fields;
-	int ret;
+	int ret = 0;
 	int limit = 0;
 	zprop_list_t fake_name = { 0 };
 
@@ -1227,7 +1648,7 @@ zfs_do_get(int argc, char **argv)
 	cb.cb_type = ZFS_TYPE_DATASET;
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":d:o:s:rHp")) != -1) {
+	while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) {
 		switch (c) {
 		case 'p':
 			cb.cb_literal = B_TRUE;
@@ -1302,7 +1723,7 @@ zfs_do_get(int argc, char **argv)
 				default:
 					(void) fprintf(stderr,
 					    gettext("invalid column name "
-					    "'%s'\n"), value);
+					    "'%s'\n"), suboptarg);
 					usage(B_FALSE);
 				}
 			}
@@ -1339,7 +1760,43 @@ zfs_do_get(int argc, char **argv)
 				default:
 					(void) fprintf(stderr,
 					    gettext("invalid source "
-					    "'%s'\n"), value);
+					    "'%s'\n"), suboptarg);
+					usage(B_FALSE);
+				}
+			}
+			break;
+
+		case 't':
+			types = 0;
+			flags &= ~ZFS_ITER_PROP_LISTSNAPS;
+			while (*optarg != '\0') {
+				static char *type_subopts[] = { "filesystem",
+				    "volume", "snapshot", "bookmark",
+				    "all", NULL };
+
+				switch (getsubopt(&optarg, type_subopts,
+				    &value)) {
+				case 0:
+					types |= ZFS_TYPE_FILESYSTEM;
+					break;
+				case 1:
+					types |= ZFS_TYPE_VOLUME;
+					break;
+				case 2:
+					types |= ZFS_TYPE_SNAPSHOT;
+					break;
+				case 3:
+					types |= ZFS_TYPE_BOOKMARK;
+					break;
+				case 4:
+					types = ZFS_TYPE_DATASET |
+					    ZFS_TYPE_BOOKMARK;
+					break;
+
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid type '%s'\n"),
+					    suboptarg);
 					usage(B_FALSE);
 				}
 			}
@@ -1388,7 +1845,7 @@ zfs_do_get(int argc, char **argv)
 	cb.cb_first = B_TRUE;
 
 	/* run for each object */
-	ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, 0,
+	ret = zfs_for_each(argc, argv, flags, types, NULL,
 	    &cb.cb_proplist, limit, get_callback, &cb);
 
 	if (cb.cb_proplist == &fake_name)
@@ -1449,7 +1906,7 @@ zfs_do_inherit(int argc, char **argv)
 	zfs_prop_t prop;
 	inherit_cbdata_t cb = { 0 };
 	char *propname;
-	int ret;
+	int ret = 0;
 	int flags = 0;
 	boolean_t received = B_FALSE;
 
@@ -1500,9 +1957,13 @@ zfs_do_inherit(int argc, char **argv)
 			if (prop == ZFS_PROP_QUOTA ||
 			    prop == ZFS_PROP_RESERVATION ||
 			    prop == ZFS_PROP_REFQUOTA ||
-			    prop == ZFS_PROP_REFRESERVATION)
+			    prop == ZFS_PROP_REFRESERVATION) {
 				(void) fprintf(stderr, gettext("use 'zfs set "
 				    "%s=none' to clear\n"), propname);
+				(void) fprintf(stderr, gettext("use 'zfs "
+				    "inherit -S %s' to revert to received "
+				    "value\n"), propname);
+			}
 			return (1);
 		}
 		if (received && (prop == ZFS_PROP_VOLSIZE ||
@@ -1538,7 +1999,7 @@ typedef struct upgrade_cbdata {
 	uint64_t cb_version;
 	boolean_t cb_newer;
 	boolean_t cb_foundone;
-	char cb_lastfs[ZFS_MAXNAMELEN];
+	char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN];
 } upgrade_cbdata_t;
 
 static int
@@ -1594,31 +2055,25 @@ upgrade_set_callback(zfs_handle_t *zhp, 
 {
 	upgrade_cbdata_t *cb = data;
 	int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
-	int i;
-	static struct { int zplver; int spaver; } table[] = {
-		{ZPL_VERSION_FUID, SPA_VERSION_FUID},
-		{ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
-		{0, 0}
-	};
+	int needed_spa_version;
+	int spa_version;
 
+	if (zfs_spa_version(zhp, &spa_version) < 0)
+		return (-1);
 
-	for (i = 0; table[i].zplver; i++) {
-		if (cb->cb_version >= table[i].zplver) {
-			int spa_version;
+	needed_spa_version = zfs_spa_version_map(cb->cb_version);
 
-			if (zfs_spa_version(zhp, &spa_version) < 0)
-				return (-1);
+	if (needed_spa_version < 0)
+		return (-1);
 
-			if (spa_version < table[i].spaver) {
-				/* can't upgrade */
-				(void) printf(gettext("%s: can not be "
-				    "upgraded; the pool version needs to first "
-				    "be upgraded\nto version %d\n\n"),
-				    zfs_get_name(zhp), table[i].spaver);
-				cb->cb_numfailed++;
-				return (0);
-			}
-		}
+	if (spa_version < needed_spa_version) {
+		/* can't upgrade */
+		(void) printf(gettext("%s: cannot be "
+		    "upgraded; the pool version first needs to "
+		    "be upgraded\nto version %d\n\n"),
+		    zfs_get_name(zhp), needed_spa_version);
+		cb->cb_numfailed++;
+		return (0);
 	}
 
 	/* upgrade */
@@ -1630,9 +2085,11 @@ upgrade_set_callback(zfs_handle_t *zhp, 
 			/*
 			 * If they did "zfs upgrade -a", then we could
 			 * be doing ioctls to different pools.  We need
-			 * to log this history once to each pool.
+			 * to log this history once to each pool, and bypass
+			 * the normal history logging that happens in main().
 			 */
-			verify(zpool_stage_history(g_zfs, history_str) == 0);
+			(void) zpool_log_history(g_zfs, history_str);
+			log_history = B_FALSE;
 		}
 		if (zfs_prop_set(zhp, "version", verstr) == 0)
 			cb->cb_numupgraded++;
@@ -1661,9 +2118,9 @@ zfs_do_upgrade(int argc, char **argv)
 {
 	boolean_t all = B_FALSE;
 	boolean_t showversions = B_FALSE;
-	int ret;
+	int ret = 0;
 	upgrade_cbdata_t cb = { 0 };
-	char c;
+	int c;
 	int flags = ZFS_ITER_ARGS_CAN_BE_PATHS;
 
 	/* check options */
@@ -1716,15 +2173,14 @@ zfs_do_upgrade(int argc, char **argv)
 		    "---------------\n");
 		(void) printf(gettext(" 1   Initial ZFS filesystem version\n"));
 		(void) printf(gettext(" 2   Enhanced directory entries\n"));
-		(void) printf(gettext(" 3   Case insensitive and File system "
-		    "unique identifier (FUID)\n"));
+		(void) printf(gettext(" 3   Case insensitive and filesystem "
+		    "user identifier (FUID)\n"));
 		(void) printf(gettext(" 4   userquota, groupquota "
 		    "properties\n"));
+		(void) printf(gettext(" 5   System attributes\n"));
 		(void) printf(gettext("\nFor more information on a particular "
-		    "version, including supported releases, see:\n\n"));
-		(void) printf("http://www.opensolaris.org/os/community/zfs/"
-		    "version/zpl/N\n\n");
-		(void) printf(gettext("Where 'N' is the version number.\n"));
+		    "version, including supported releases,\n"));
+		(void) printf("see the ZFS Administration Guide.\n\n");
 		ret = 0;
 	} else if (argc || all) {
 		/* Upgrade filesystems */
@@ -1768,183 +2224,820 @@ zfs_do_upgrade(int argc, char **argv)
 }
 
 /*
- * zfs userspace
+ * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
+ *               [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
+ * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...]
+ *                [-S field [-S field]...] [-t type[,...]] filesystem | snapshot
+ *
+ *	-H      Scripted mode; elide headers and separate columns by tabs.
+ *	-i	Translate SID to POSIX ID.
+ *	-n	Print numeric ID instead of user/group name.
+ *	-o      Control which fields to display.
+ *	-p	Use exact (parsable) numeric output.
+ *	-s      Specify sort columns, descending order.
+ *	-S      Specify sort columns, ascending order.
+ *	-t      Control which object types to display.
+ *
+ *	Displays space consumed by, and quotas on, each user in the specified
+ *	filesystem or snapshot.
  */
-static int
-userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
-{
-	zfs_userquota_prop_t *typep = arg;
-	zfs_userquota_prop_t p = *typep;
-	char *name = NULL;
-	char *ug, *propname;
-	char namebuf[32];
-	char sizebuf[32];
 
-	if (domain == NULL || domain[0] == '\0') {
-		if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) {
-			struct group *g = getgrgid(rid);
-			if (g)
-				name = g->gr_name;
-		} else {
-			struct passwd *p = getpwuid(rid);
-			if (p)
-				name = p->pw_name;
-		}
-	}
+/* us_field_types, us_field_hdr and us_field_names should be kept in sync */
+enum us_field_types {
+	USFIELD_TYPE,
+	USFIELD_NAME,
+	USFIELD_USED,
+	USFIELD_QUOTA
+};
+static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA" };
+static char *us_field_names[] = { "type", "name", "used", "quota" };
+#define	USFIELD_LAST	(sizeof (us_field_names) / sizeof (char *))
+
+#define	USTYPE_PSX_GRP	(1 << 0)
+#define	USTYPE_PSX_USR	(1 << 1)
+#define	USTYPE_SMB_GRP	(1 << 2)
+#define	USTYPE_SMB_USR	(1 << 3)
+#define	USTYPE_ALL	\
+	(USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR)
+
+static int us_type_bits[] = {
+	USTYPE_PSX_GRP,
+	USTYPE_PSX_USR,
+	USTYPE_SMB_GRP,
+	USTYPE_SMB_USR,
+	USTYPE_ALL
+};
+static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup",
+	"smbuser", "all" };
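+
+/*
+ * Sketch: the -t suboptions map one-to-one onto the USTYPE_* bits
+ * above, so e.g. "zfs userspace -t posixuser,smbgroup tank/fs" yields
+ * types == (USTYPE_PSX_USR | USTYPE_SMB_GRP); print_us_node() later
+ * skips any entry whose type bit is not in that mask.
+ */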
 
-	if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA)
-		ug = "group";
-	else
-		ug = "user";
+typedef struct us_node {
+	nvlist_t	*usn_nvl;
+	uu_avl_node_t	usn_avlnode;
+	uu_list_node_t	usn_listnode;
+} us_node_t;
+
+typedef struct us_cbdata {
+	nvlist_t	**cb_nvlp;
+	uu_avl_pool_t	*cb_avl_pool;
+	uu_avl_t	*cb_avl;
+	boolean_t	cb_numname;
+	boolean_t	cb_nicenum;
+	boolean_t	cb_sid2posix;
+	zfs_userquota_prop_t cb_prop;
+	zfs_sort_column_t *cb_sortcol;
+	size_t		cb_width[USFIELD_LAST];
+} us_cbdata_t;
+
+static boolean_t us_populated = B_FALSE;
+
+typedef struct {
+	zfs_sort_column_t *si_sortcol;
+	boolean_t	si_numname;
+} us_sort_info_t;
 
-	if (p == ZFS_PROP_USERUSED || p == ZFS_PROP_GROUPUSED)
-		propname = "used";
-	else
-		propname = "quota";
+static int
+us_field_index(char *field)
+{
+	int i;
 
-	if (name == NULL) {
-		(void) snprintf(namebuf, sizeof (namebuf),
-		    "%llu", (longlong_t)rid);
-		name = namebuf;
+	for (i = 0; i < USFIELD_LAST; i++) {
+		if (strcmp(field, us_field_names[i]) == 0)
+			return (i);
 	}
-	zfs_nicenum(space, sizebuf, sizeof (sizebuf));
 
-	(void) printf("%s %s %s%c%s %s\n", propname, ug, domain,
-	    domain[0] ? '-' : ' ', name, sizebuf);
-
-	return (0);
+	return (-1);
 }
 
 static int
-zfs_do_userspace(int argc, char **argv)
+us_compare(const void *larg, const void *rarg, void *unused)
 {
-	zfs_handle_t *zhp;
-	zfs_userquota_prop_t p;
-	int error;
+	const us_node_t *l = larg;
+	const us_node_t *r = rarg;
+	us_sort_info_t *si = (us_sort_info_t *)unused;
+	zfs_sort_column_t *sortcol = si->si_sortcol;
+	boolean_t numname = si->si_numname;
+	nvlist_t *lnvl = l->usn_nvl;
+	nvlist_t *rnvl = r->usn_nvl;
+	int rc = 0;
+	boolean_t lvb, rvb;
+
+	for (; sortcol != NULL; sortcol = sortcol->sc_next) {
+		char *lvstr = "";
+		char *rvstr = "";
+		uint32_t lv32 = 0;
+		uint32_t rv32 = 0;
+		uint64_t lv64 = 0;
+		uint64_t rv64 = 0;
+		zfs_prop_t prop = sortcol->sc_prop;
+		const char *propname = NULL;
+		boolean_t reverse = sortcol->sc_reverse;
+
+		switch (prop) {
+		case ZFS_PROP_TYPE:
+			propname = "type";
+			(void) nvlist_lookup_uint32(lnvl, propname, &lv32);
+			(void) nvlist_lookup_uint32(rnvl, propname, &rv32);
+			if (rv32 != lv32)
+				rc = (rv32 < lv32) ? 1 : -1;
+			break;
+		case ZFS_PROP_NAME:
+			propname = "name";
+			if (numname) {
+				(void) nvlist_lookup_uint64(lnvl, propname,
+				    &lv64);
+				(void) nvlist_lookup_uint64(rnvl, propname,
+				    &rv64);
+				if (rv64 != lv64)
+					rc = (rv64 < lv64) ? 1 : -1;
+			} else {
+				(void) nvlist_lookup_string(lnvl, propname,
+				    &lvstr);
+				(void) nvlist_lookup_string(rnvl, propname,
+				    &rvstr);
+				rc = strcmp(lvstr, rvstr);
+			}
+			break;
+		case ZFS_PROP_USED:
+		case ZFS_PROP_QUOTA:
+			if (!us_populated)
+				break;
+			if (prop == ZFS_PROP_USED)
+				propname = "used";
+			else
+				propname = "quota";
+			(void) nvlist_lookup_uint64(lnvl, propname, &lv64);
+			(void) nvlist_lookup_uint64(rnvl, propname, &rv64);
+			if (rv64 != lv64)
+				rc = (rv64 < lv64) ? 1 : -1;
+			break;
 
-	/*
-	 * Try the python version.  If the execv fails, we'll continue
-	 * and do a simplistic implementation.
-	 */
-	(void) execv(pypath, argv-1);
+		default:
+			break;
+		}
 
-	(void) printf("internal error: %s not found\n"
-	    "falling back on built-in implementation, "
-	    "some features will not work\n", pypath);
+		if (rc != 0) {
+			if (rc < 0)
+				return (reverse ? 1 : -1);
+			else
+				return (reverse ? -1 : 1);
+		}
+	}
 
-	if ((zhp = zfs_open(g_zfs, argv[argc-1], ZFS_TYPE_DATASET)) == NULL)
-		return (1);
+	/*
+	 * If entries still seem to be the same, check if they are of the same
+	 * type (smbentity is added only if we are doing SID to POSIX ID
+	 * translation where we can have duplicate type/name combinations).
+	 */
+	if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 &&
+	    nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 &&
+	    lvb != rvb)
+		return (lvb < rvb ? -1 : 1);
 
-	(void) printf("PROP TYPE NAME VALUE\n");
+	return (0);
+}
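+
+/*
+ * Sketch: us_compare() walks the sort-column list in order and returns
+ * on the first differing key, so with "zfs userspace -s used tank/fs"
+ * two nodes compare by their "used" nvpairs and only fall back to the
+ * default "type"/"name" columns (always appended in zfs_do_userspace())
+ * on a tie.  The "used"/"quota" keys take part only once us_populated
+ * is set, i.e. during the re-sort after all callbacks have run.
+ */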
 
-	for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
-		error = zfs_userspace(zhp, p, userspace_cb, &p);
-		if (error)
-			break;
+static inline const char *
+us_type2str(unsigned field_type)
+{
+	switch (field_type) {
+	case USTYPE_PSX_USR:
+		return ("POSIX User");
+	case USTYPE_PSX_GRP:
+		return ("POSIX Group");
+	case USTYPE_SMB_USR:
+		return ("SMB User");
+	case USTYPE_SMB_GRP:
+		return ("SMB Group");
+	default:
+		return ("Undefined");
 	}
-	return (error);
 }
 
-/*
- * list [-r][-d max] [-H] [-o property[,property]...] [-t type[,type]...]
- *      [-s property [-s property]...] [-S property [-S property]...]
- *      <dataset> ...
- *
- *	-r	Recurse over all children
- *	-d	Limit recursion by depth.
- *	-H	Scripted mode; elide headers and separate columns by tabs
- *	-o	Control which fields to display.
- *	-t	Control which object types to display.
- *	-s	Specify sort columns, descending order.
- *	-S	Specify sort columns, ascending order.
- *
- * When given no arguments, lists all filesystems in the system.
- * Otherwise, list the specified datasets, optionally recursing down them if
- * '-r' is specified.
- */
-typedef struct list_cbdata {
-	boolean_t	cb_first;
-	boolean_t	cb_scripted;
-	zprop_list_t	*cb_proplist;
-} list_cbdata_t;
-
-/*
- * Given a list of columns to display, output appropriate headers for each one.
- */
-static void
-print_header(zprop_list_t *pl)
+static int
+userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space)
 {
-	char headerbuf[ZFS_MAXPROPLEN];
-	const char *header;
-	int i;
-	boolean_t first = B_TRUE;
-	boolean_t right_justify;
+	us_cbdata_t *cb = (us_cbdata_t *)arg;
+	zfs_userquota_prop_t prop = cb->cb_prop;
+	char *name = NULL;
+	char *propname;
+	char sizebuf[32];
+	us_node_t *node;
+	uu_avl_pool_t *avl_pool = cb->cb_avl_pool;
+	uu_avl_t *avl = cb->cb_avl;
+	uu_avl_index_t idx;
+	nvlist_t *props;
+	us_node_t *n;
+	zfs_sort_column_t *sortcol = cb->cb_sortcol;
+	unsigned type = 0;
+	const char *typestr;
+	size_t namelen;
+	size_t typelen;
+	size_t sizelen;
+	int typeidx, nameidx, sizeidx;
+	us_sort_info_t sortinfo = { sortcol, cb->cb_numname };
+	boolean_t smbentity = B_FALSE;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	node = safe_malloc(sizeof (us_node_t));
+	uu_avl_node_init(node, &node->usn_avlnode, avl_pool);
+	node->usn_nvl = props;
+
+	if (domain != NULL && domain[0] != '\0') {
+		/* SMB */
+		char sid[MAXNAMELEN + 32];
+		uid_t id;
+#ifdef illumos
+		int err;
+		int flag = IDMAP_REQ_FLG_USE_CACHE;
+#endif
 
-	for (; pl != NULL; pl = pl->pl_next) {
-		if (!first) {
-			(void) printf("  ");
+		smbentity = B_TRUE;
+
+		(void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid);
+
+		if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
+			type = USTYPE_SMB_GRP;
+#ifdef illumos
+			err = sid_to_id(sid, B_FALSE, &id);
+#endif
 		} else {
-			first = B_FALSE;
+			type = USTYPE_SMB_USR;
+#ifdef illumos
+			err = sid_to_id(sid, B_TRUE, &id);
+#endif
 		}
 
-		right_justify = B_FALSE;
-		if (pl->pl_prop != ZPROP_INVAL) {
-			header = zfs_prop_column_name(pl->pl_prop);
-			right_justify = zfs_prop_align_right(pl->pl_prop);
+#ifdef illumos
+		if (err == 0) {
+			rid = id;
+			if (!cb->cb_sid2posix) {
+				if (type == USTYPE_SMB_USR) {
+					(void) idmap_getwinnamebyuid(rid, flag,
+					    &name, NULL);
+				} else {
+					(void) idmap_getwinnamebygid(rid, flag,
+					    &name, NULL);
+				}
+				if (name == NULL)
+					name = sid;
+			}
+		}
+#endif
+	}
+
+	if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') {
+		/* POSIX or -i */
+		if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) {
+			type = USTYPE_PSX_GRP;
+			if (!cb->cb_numname) {
+				struct group *g;
+
+				if ((g = getgrgid(rid)) != NULL)
+					name = g->gr_name;
+			}
 		} else {
-			for (i = 0; pl->pl_user_prop[i] != '\0'; i++)
-				headerbuf[i] = toupper(pl->pl_user_prop[i]);
-			headerbuf[i] = '\0';
-			header = headerbuf;
+			type = USTYPE_PSX_USR;
+			if (!cb->cb_numname) {
+				struct passwd *p;
+
+				if ((p = getpwuid(rid)) != NULL)
+					name = p->pw_name;
+			}
 		}
+	}
 
-		if (pl->pl_next == NULL && !right_justify)
-			(void) printf("%s", header);
-		else if (right_justify)
-			(void) printf("%*s", pl->pl_width, header);
-		else
-			(void) printf("%-*s", pl->pl_width, header);
+	/*
+	 * Make sure that the type/name combination is unique when doing
+	 * SID to POSIX ID translation (hence changing the type from SMB to
+	 * POSIX).
+	 */
+	if (cb->cb_sid2posix &&
+	    nvlist_add_boolean_value(props, "smbentity", smbentity) != 0)
+		nomem();
+
+	/* Calculate/update width of TYPE field */
+	typestr = us_type2str(type);
+	typelen = strlen(gettext(typestr));
+	typeidx = us_field_index("type");
+	if (typelen > cb->cb_width[typeidx])
+		cb->cb_width[typeidx] = typelen;
+	if (nvlist_add_uint32(props, "type", type) != 0)
+		nomem();
+
+	/* Calculate/update width of NAME field */
+	if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) {
+		if (nvlist_add_uint64(props, "name", rid) != 0)
+			nomem();
+		namelen = snprintf(NULL, 0, "%u", rid);
+	} else {
+		if (nvlist_add_string(props, "name", name) != 0)
+			nomem();
+		namelen = strlen(name);
+	}
+	nameidx = us_field_index("name");
+	if (namelen > cb->cb_width[nameidx])
+		cb->cb_width[nameidx] = namelen;
+
+	/*
+	 * Check if this type/name combination is in the list and update it;
+	 * otherwise add new node to the list.
+	 */
+	if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) {
+		uu_avl_insert(avl, node, idx);
+	} else {
+		nvlist_free(props);
+		free(node);
+		node = n;
+		props = node->usn_nvl;
 	}
 
-	(void) printf("\n");
+	/* Calculate/update width of USED/QUOTA fields */
+	if (cb->cb_nicenum)
+		zfs_nicenum(space, sizebuf, sizeof (sizebuf));
+	else
+		(void) snprintf(sizebuf, sizeof (sizebuf), "%llu", space);
+	sizelen = strlen(sizebuf);
+	if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED) {
+		propname = "used";
+		if (!nvlist_exists(props, "quota"))
+			(void) nvlist_add_uint64(props, "quota", 0);
+	} else {
+		propname = "quota";
+		if (!nvlist_exists(props, "used"))
+			(void) nvlist_add_uint64(props, "used", 0);
+	}
+	sizeidx = us_field_index(propname);
+	if (sizelen > cb->cb_width[sizeidx])
+		cb->cb_width[sizeidx] = sizelen;
+
+	if (nvlist_add_uint64(props, propname, space) != 0)
+		nomem();
+
+	return (0);
 }
 
-/*
- * Given a dataset and a list of fields, print out all the properties according
- * to the described layout.
- */
 static void
-print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted)
+print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types,
+    size_t *width, us_node_t *node)
 {
+	nvlist_t *nvl = node->usn_nvl;
+	char valstr[MAXNAMELEN];
 	boolean_t first = B_TRUE;
-	char property[ZFS_MAXPROPLEN];
-	nvlist_t *userprops = zfs_get_user_props(zhp);
-	nvlist_t *propval;
-	char *propstr;
-	boolean_t right_justify;
-	int width;
+	int cfield = 0;
+	int field;
+	uint32_t ustype;
+
+	/* Check type */
+	(void) nvlist_lookup_uint32(nvl, "type", &ustype);
+	if (!(ustype & types))
+		return;
+
+	while ((field = fields[cfield]) != USFIELD_LAST) {
+		nvpair_t *nvp = NULL;
+		data_type_t type;
+		uint32_t val32;
+		uint64_t val64;
+		char *strval = NULL;
+
+		while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+			if (strcmp(nvpair_name(nvp),
+			    us_field_names[field]) == 0)
+				break;
+		}
+
+		type = nvpair_type(nvp);
+		switch (type) {
+		case DATA_TYPE_UINT32:
+			(void) nvpair_value_uint32(nvp, &val32);
+			break;
+		case DATA_TYPE_UINT64:
+			(void) nvpair_value_uint64(nvp, &val64);
+			break;
+		case DATA_TYPE_STRING:
+			(void) nvpair_value_string(nvp, &strval);
+			break;
+		default:
+			(void) fprintf(stderr, "invalid data type\n");
+		}
+
+		switch (field) {
+		case USFIELD_TYPE:
+			strval = (char *)us_type2str(val32);
+			break;
+		case USFIELD_NAME:
+			if (type == DATA_TYPE_UINT64) {
+				(void) sprintf(valstr, "%llu", val64);
+				strval = valstr;
+			}
+			break;
+		case USFIELD_USED:
+		case USFIELD_QUOTA:
+			if (type == DATA_TYPE_UINT64) {
+				if (parsable) {
+					(void) sprintf(valstr, "%llu", val64);
+				} else {
+					zfs_nicenum(val64, valstr,
+					    sizeof (valstr));
+				}
+				if (field == USFIELD_QUOTA &&
+				    strcmp(valstr, "0") == 0)
+					strval = "none";
+				else
+					strval = valstr;
+			}
+			break;
+		}
 
-	for (; pl != NULL; pl = pl->pl_next) {
 		if (!first) {
 			if (scripted)
 				(void) printf("\t");
 			else
 				(void) printf("  ");
-		} else {
-			first = B_FALSE;
 		}
+		if (scripted)
+			(void) printf("%s", strval);
+		else if (field == USFIELD_TYPE || field == USFIELD_NAME)
+			(void) printf("%-*s", width[field], strval);
+		else
+			(void) printf("%*s", width[field], strval);
 
-		if (pl->pl_prop != ZPROP_INVAL) {
-			if (zfs_prop_get(zhp, pl->pl_prop, property,
-			    sizeof (property), NULL, NULL, 0, B_FALSE) != 0)
+		first = B_FALSE;
+		cfield++;
+	}
+
+	(void) printf("\n");
+}
+
+static void
+print_us(boolean_t scripted, boolean_t parsable, int *fields, int types,
+    size_t *width, boolean_t rmnode, uu_avl_t *avl)
+{
+	us_node_t *node;
+	const char *col;
+	int cfield = 0;
+	int field;
+
+	if (!scripted) {
+		boolean_t first = B_TRUE;
+
+		while ((field = fields[cfield]) != USFIELD_LAST) {
+			col = gettext(us_field_hdr[field]);
+			if (field == USFIELD_TYPE || field == USFIELD_NAME) {
+				(void) printf(first ? "%-*s" : "  %-*s",
+				    width[field], col);
+			} else {
+				(void) printf(first ? "%*s" : "  %*s",
+				    width[field], col);
+			}
+			first = B_FALSE;
+			cfield++;
+		}
+		(void) printf("\n");
+	}
+
+	for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) {
+		print_us_node(scripted, parsable, fields, types, width, node);
+		if (rmnode)
+			nvlist_free(node->usn_nvl);
+	}
+}
+
+static int
+zfs_do_userspace(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	zfs_userquota_prop_t p;
+
+	uu_avl_pool_t *avl_pool;
+	uu_avl_t *avl_tree;
+	uu_avl_walk_t *walk;
+	char *delim;
+	char deffields[] = "type,name,used,quota";
+	char *ofield = NULL;
+	char *tfield = NULL;
+	int cfield = 0;
+	int fields[256];
+	int i;
+	boolean_t scripted = B_FALSE;
+	boolean_t prtnum = B_FALSE;
+	boolean_t parsable = B_FALSE;
+	boolean_t sid2posix = B_FALSE;
+	int ret = 0;
+	int c;
+	zfs_sort_column_t *sortcol = NULL;
+	int types = USTYPE_PSX_USR | USTYPE_SMB_USR;
+	us_cbdata_t cb;
+	us_node_t *node;
+	us_node_t *rmnode;
+	uu_list_pool_t *listpool;
+	uu_list_t *list;
+	uu_avl_index_t idx = 0;
+	uu_list_index_t idx2 = 0;
+
+	if (argc < 2)
+		usage(B_FALSE);
+
+	if (strcmp(argv[0], "groupspace") == 0)
+		/* Toggle default group types */
+		types = USTYPE_PSX_GRP | USTYPE_SMB_GRP;
+
+	while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) {
+		switch (c) {
+		case 'n':
+			prtnum = B_TRUE;
+			break;
+		case 'H':
+			scripted = B_TRUE;
+			break;
+		case 'p':
+			parsable = B_TRUE;
+			break;
+		case 'o':
+			ofield = optarg;
+			break;
+		case 's':
+		case 'S':
+			if (zfs_add_sort_column(&sortcol, optarg,
+			    c == 's' ? B_FALSE : B_TRUE) != 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid field '%s'\n"), optarg);
+				usage(B_FALSE);
+			}
+			break;
+		case 't':
+			tfield = optarg;
+			break;
+		case 'i':
+			sid2posix = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing dataset name\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/* Use default output fields if not specified using -o */
+	if (ofield == NULL)
+		ofield = deffields;
+	do {
+		if ((delim = strchr(ofield, ',')) != NULL)
+			*delim = '\0';
+		if ((fields[cfield++] = us_field_index(ofield)) == -1) {
+			(void) fprintf(stderr, gettext("invalid type '%s' "
+			    "for -o option\n"), ofield);
+			return (-1);
+		}
+		if (delim != NULL)
+			ofield = delim + 1;
+	} while (delim != NULL);
+	fields[cfield] = USFIELD_LAST;
+
+	/* Override output types (-t option) */
+	if (tfield != NULL) {
+		types = 0;
+
+		do {
+			boolean_t found = B_FALSE;
+
+			if ((delim = strchr(tfield, ',')) != NULL)
+				*delim = '\0';
+			for (i = 0; i < sizeof (us_type_bits) / sizeof (int);
+			    i++) {
+				if (strcmp(tfield, us_type_names[i]) == 0) {
+					found = B_TRUE;
+					types |= us_type_bits[i];
+					break;
+				}
+			}
+			if (!found) {
+				(void) fprintf(stderr, gettext("invalid type "
+				    "'%s' for -t option\n"), tfield);
+				return (-1);
+			}
+			if (delim != NULL)
+				tfield = delim + 1;
+		} while (delim != NULL);
+	}
+
+	if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL)
+		return (1);
+
+	if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t),
+	    offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL)
+		nomem();
+	if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL)
+		nomem();
+
+	/* Always add default sorting columns */
+	(void) zfs_add_sort_column(&sortcol, "type", B_FALSE);
+	(void) zfs_add_sort_column(&sortcol, "name", B_FALSE);
+
+	cb.cb_sortcol = sortcol;
+	cb.cb_numname = prtnum;
+	cb.cb_nicenum = !parsable;
+	cb.cb_avl_pool = avl_pool;
+	cb.cb_avl = avl_tree;
+	cb.cb_sid2posix = sid2posix;
+
+	for (i = 0; i < USFIELD_LAST; i++)
+		cb.cb_width[i] = strlen(gettext(us_field_hdr[i]));
+
+	for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) {
+		if (((p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA) &&
+		    !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) ||
+		    ((p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) &&
+		    !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))))
+			continue;
+		cb.cb_prop = p;
+		if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0)
+			return (ret);
+	}
+
+	/* Sort the list */
+	if ((node = uu_avl_first(avl_tree)) == NULL)
+		return (0);
+
+	us_populated = B_TRUE;
+
+	listpool = uu_list_pool_create("tmplist", sizeof (us_node_t),
+	    offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT);
+	list = uu_list_create(listpool, NULL, UU_DEFAULT);
+	uu_list_node_init(node, &node->usn_listnode, listpool);
+
+	while (node != NULL) {
+		rmnode = node;
+		node = uu_avl_next(avl_tree, node);
+		uu_avl_remove(avl_tree, rmnode);
+		if (uu_list_find(list, rmnode, NULL, &idx2) == NULL)
+			uu_list_insert(list, rmnode, idx2);
+	}
+
+	for (node = uu_list_first(list); node != NULL;
+	    node = uu_list_next(list, node)) {
+		us_sort_info_t sortinfo = { sortcol, cb.cb_numname };
+
+		if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL)
+			uu_avl_insert(avl_tree, node, idx);
+	}
+
+	uu_list_destroy(list);
+	uu_list_pool_destroy(listpool);
+
+	/* Print and free node nvlist memory */
+	print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE,
+	    cb.cb_avl);
+
+	zfs_free_sort_columns(sortcol);
+
+	/* Clean up the AVL tree */
+	if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL)
+		nomem();
+
+	while ((node = uu_avl_walk_next(walk)) != NULL) {
+		uu_avl_remove(cb.cb_avl, node);
+		free(node);
+	}
+
+	uu_avl_walk_end(walk);
+	uu_avl_destroy(avl_tree);
+	uu_avl_pool_destroy(avl_pool);
+
+	return (ret);
+}
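+
+/*
+ * Usage sketch (hypothetical dataset and names): this replaces the old
+ * execv() of the python helper with a native implementation, e.g.
+ *
+ *	# zfs userspace -o name,used -s used tank/home
+ *	NAME    USED
+ *	alice   1.5G
+ *	bob      42M
+ *
+ * with per-field column widths computed incrementally in
+ * userspace_cb().
+ */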
+
+/*
+ * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] ...
+ *      [-t type[,...]] [filesystem|volume|snapshot] ...
+ *
+ *	-H	Scripted mode; elide headers and separate columns by tabs.
+ *	-p	Display values in parsable (literal) format.
+ *	-r	Recurse over all children.
+ *	-d	Limit recursion by depth.
+ *	-o	Control which fields to display.
+ *	-s	Specify sort columns, descending order.
+ *	-S	Specify sort columns, ascending order.
+ *	-t	Control which object types to display.
+ *
+ * When given no arguments, list all filesystems in the system.
+ * Otherwise, list the specified datasets, optionally recursing down them if
+ * '-r' is specified.
+ */
+typedef struct list_cbdata {
+	boolean_t	cb_first;
+	boolean_t	cb_literal;
+	boolean_t	cb_scripted;
+	zprop_list_t	*cb_proplist;
+} list_cbdata_t;
+
+/*
+ * Given a list of columns to display, output appropriate headers for each one.
+ */
+static void
+print_header(list_cbdata_t *cb)
+{
+	zprop_list_t *pl = cb->cb_proplist;
+	char headerbuf[ZFS_MAXPROPLEN];
+	const char *header;
+	int i;
+	boolean_t first = B_TRUE;
+	boolean_t right_justify;
+
+	for (; pl != NULL; pl = pl->pl_next) {
+		if (!first) {
+			(void) printf("  ");
+		} else {
+			first = B_FALSE;
+		}
+
+		right_justify = B_FALSE;
+		if (pl->pl_prop != ZPROP_INVAL) {
+			header = zfs_prop_column_name(pl->pl_prop);
+			right_justify = zfs_prop_align_right(pl->pl_prop);
+		} else {
+			for (i = 0; pl->pl_user_prop[i] != '\0'; i++)
+				headerbuf[i] = toupper(pl->pl_user_prop[i]);
+			headerbuf[i] = '\0';
+			header = headerbuf;
+		}
+
+		if (pl->pl_next == NULL && !right_justify)
+			(void) printf("%s", header);
+		else if (right_justify)
+			(void) printf("%*s", pl->pl_width, header);
+		else
+			(void) printf("%-*s", pl->pl_width, header);
+	}
+
+	(void) printf("\n");
+}
+
+/*
+ * Given a dataset and a list of fields, print out all the properties according
+ * to the described layout.
+ */
+static void
+print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb)
+{
+	zprop_list_t *pl = cb->cb_proplist;
+	boolean_t first = B_TRUE;
+	char property[ZFS_MAXPROPLEN];
+	nvlist_t *userprops = zfs_get_user_props(zhp);
+	nvlist_t *propval;
+	char *propstr;
+	boolean_t right_justify;
+
+	for (; pl != NULL; pl = pl->pl_next) {
+		if (!first) {
+			if (cb->cb_scripted)
+				(void) printf("\t");
+			else
+				(void) printf("  ");
+		} else {
+			first = B_FALSE;
+		}
+
+		if (pl->pl_prop == ZFS_PROP_NAME) {
+			(void) strlcpy(property, zfs_get_name(zhp),
+			    sizeof (property));
+			propstr = property;
+			right_justify = zfs_prop_align_right(pl->pl_prop);
+		} else if (pl->pl_prop != ZPROP_INVAL) {
+			if (zfs_prop_get(zhp, pl->pl_prop, property,
+			    sizeof (property), NULL, NULL, 0,
+			    cb->cb_literal) != 0)
 				propstr = "-";
 			else
 				propstr = property;
-
 			right_justify = zfs_prop_align_right(pl->pl_prop);
 		} else if (zfs_prop_userquota(pl->pl_user_prop)) {
 			if (zfs_prop_get_userquota(zhp, pl->pl_user_prop,
-			    property, sizeof (property), B_FALSE) != 0)
+			    property, sizeof (property), cb->cb_literal) != 0)
+				propstr = "-";
+			else
+				propstr = property;
+			right_justify = B_TRUE;
+		} else if (zfs_prop_written(pl->pl_user_prop)) {
+			if (zfs_prop_get_written(zhp, pl->pl_user_prop,
+			    property, sizeof (property), cb->cb_literal) != 0)
 				propstr = "-";
 			else
 				propstr = property;
@@ -1959,19 +3052,17 @@ print_dataset(zfs_handle_t *zhp, zprop_l
 			right_justify = B_FALSE;
 		}
 
-		width = pl->pl_width;
-
 		/*
 		 * If this is being called in scripted mode, or if this is the
 		 * last column and it is left-justified, don't include a width
 		 * format specifier.
 		 */
-		if (scripted || (pl->pl_next == NULL && !right_justify))
+		if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify))
 			(void) printf("%s", propstr);
 		else if (right_justify)
-			(void) printf("%*s", width, propstr);
+			(void) printf("%*s", pl->pl_width, propstr);
 		else
-			(void) printf("%-*s", width, propstr);
+			(void) printf("%-*s", pl->pl_width, propstr);
 	}
 
 	(void) printf("\n");
@@ -1987,11 +3078,11 @@ list_callback(zfs_handle_t *zhp, void *d
 
 	if (cbp->cb_first) {
 		if (!cbp->cb_scripted)
-			print_header(cbp->cb_proplist);
+			print_header(cbp);
 		cbp->cb_first = B_FALSE;
 	}
 
-	print_dataset(zhp, cbp->cb_proplist, cbp->cb_scripted);
+	print_dataset(zhp, cbp);
 
 	return (0);
 }
@@ -2000,7 +3091,6 @@ static int
 zfs_do_list(int argc, char **argv)
 {
 	int c;
-	boolean_t scripted = B_FALSE;
 	static char default_fields[] =
 	    "name,used,available,referenced,mountpoint";
 	int types = ZFS_TYPE_DATASET;
@@ -2009,16 +3099,20 @@ zfs_do_list(int argc, char **argv)
 	list_cbdata_t cb = { 0 };
 	char *value;
 	int limit = 0;
-	int ret;
+	int ret = 0;
 	zfs_sort_column_t *sortcol = NULL;
 	int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS;
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":d:o:rt:Hs:S:")) != -1) {
+	while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) {
 		switch (c) {
 		case 'o':
 			fields = optarg;
 			break;
+		case 'p':
+			cb.cb_literal = B_TRUE;
+			flags |= ZFS_ITER_LITERAL_PROPS;
+			break;
 		case 'd':
 			limit = parse_depth(optarg, &flags);
 			break;
@@ -2026,7 +3120,7 @@ zfs_do_list(int argc, char **argv)
 			flags |= ZFS_ITER_RECURSE;
 			break;
 		case 'H':
-			scripted = B_TRUE;
+			cb.cb_scripted = B_TRUE;
 			break;
 		case 's':
 			if (zfs_add_sort_column(&sortcol, optarg,
@@ -2050,7 +3144,8 @@ zfs_do_list(int argc, char **argv)
 			flags &= ~ZFS_ITER_PROP_LISTSNAPS;
 			while (*optarg != '\0') {
 				static char *type_subopts[] = { "filesystem",
-				    "volume", "snapshot", "all", NULL };
+				    "volume", "snapshot", "snap", "bookmark",
+				    "all", NULL };
 
 				switch (getsubopt(&optarg, type_subopts,
 				    &value)) {
@@ -2061,16 +3156,20 @@ zfs_do_list(int argc, char **argv)
 					types |= ZFS_TYPE_VOLUME;
 					break;
 				case 2:
+				case 3:
 					types |= ZFS_TYPE_SNAPSHOT;
 					break;
-				case 3:
-					types = ZFS_TYPE_DATASET;
+				case 4:
+					types |= ZFS_TYPE_BOOKMARK;
+					break;
+				case 5:
+					types = ZFS_TYPE_DATASET |
+					    ZFS_TYPE_BOOKMARK;
 					break;
-
 				default:
 					(void) fprintf(stderr,
 					    gettext("invalid type '%s'\n"),
-					    value);
+					    suboptarg);
 					usage(B_FALSE);
 				}
 			}
@@ -2094,6 +3193,13 @@ zfs_do_list(int argc, char **argv)
 		fields = default_fields;
 
 	/*
+	 * If we are only going to list snapshot names and sort by name,
+	 * then we can use the faster version.
+	 */
+	if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol))
+		flags |= ZFS_ITER_SIMPLE;
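+
+	/*
+	 * Sketch: this fast path is what makes e.g.
+	 * "zfs list -H -o name -t snapshot -s name" cheap on pools with
+	 * many snapshots; ZFS_ITER_SIMPLE lets the iterator avoid
+	 * loading the full property set for each dataset it visits.
+	 */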
+
+	/*
 	 * If "-o space" and no types were specified, don't display snapshots.
 	 */
 	if (strcmp(fields, "space") == 0 && types_specified == B_FALSE)
@@ -2108,7 +3214,6 @@ zfs_do_list(int argc, char **argv)
 	    != 0)
 		usage(B_FALSE);
 
-	cb.cb_scripted = scripted;
 	cb.cb_first = B_TRUE;
 
 	ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist,
@@ -2124,9 +3229,10 @@ zfs_do_list(int argc, char **argv)
 }
 
 /*
- * zfs rename <fs | snap | vol> <fs | snap | vol>
- * zfs rename -p <fs | vol> <fs | vol>
+ * zfs rename [-f] <fs | snap | vol> <fs | snap | vol>
+ * zfs rename [-f] -p <fs | vol> <fs | vol>
  * zfs rename -r <snap> <snap>
+ * zfs rename -u [-p] <fs> <fs>
  *
  * Renames the given dataset to another of the same type.
  *
@@ -2137,19 +3243,27 @@ static int
 zfs_do_rename(int argc, char **argv)
 {
 	zfs_handle_t *zhp;
+	renameflags_t flags = { 0 };
 	int c;
-	int ret;
-	boolean_t recurse = B_FALSE;
+	int ret = 0;
+	int types;
 	boolean_t parents = B_FALSE;
+	char *snapshot = NULL;
 
 	/* check options */
-	while ((c = getopt(argc, argv, "pr")) != -1) {
+	while ((c = getopt(argc, argv, "fpru")) != -1) {
 		switch (c) {
 		case 'p':
 			parents = B_TRUE;
 			break;
 		case 'r':
-			recurse = B_TRUE;
+			flags.recurse = B_TRUE;
+			break;
+		case 'u':
+			flags.nounmount = B_TRUE;
+			break;
+		case 'f':
+			flags.forceunmount = B_TRUE;
 			break;
 		case '?':
 		default:
@@ -2178,20 +3292,45 @@ zfs_do_rename(int argc, char **argv)
 		usage(B_FALSE);
 	}
 
-	if (recurse && parents) {
+	if (flags.recurse && parents) {
 		(void) fprintf(stderr, gettext("-p and -r options are mutually "
 		    "exclusive\n"));
 		usage(B_FALSE);
 	}
 
-	if (recurse && strchr(argv[0], '@') == 0) {
+	if (flags.recurse && strchr(argv[0], '@') == NULL) {
 		(void) fprintf(stderr, gettext("source dataset for recursive "
 		    "rename must be a snapshot\n"));
 		usage(B_FALSE);
 	}
 
-	if ((zhp = zfs_open(g_zfs, argv[0], parents ? ZFS_TYPE_FILESYSTEM |
-	    ZFS_TYPE_VOLUME : ZFS_TYPE_DATASET)) == NULL)
+	if (flags.nounmount && parents) {
+		(void) fprintf(stderr, gettext("-u and -p options are mutually "
+		    "exclusive\n"));
+		usage(B_FALSE);
+	}
+
+	if (flags.nounmount)
+		types = ZFS_TYPE_FILESYSTEM;
+	else if (parents)
+		types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
+	else
+		types = ZFS_TYPE_DATASET;
+
+	if (flags.recurse) {
+		/*
+		 * When we do a recursive rename it is fine if the given
+		 * snapshot doesn't exist on the dataset itself - it may
+		 * still exist on descendants.
+		 */
+
+		snapshot = strchr(argv[0], '@');
+		assert(snapshot != NULL);
+		*snapshot = '\0';
+		snapshot++;
+	}
+
+	if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL)
 		return (1);
 
 	/* If we were asked and the name looks good, try to create ancestors. */
@@ -2201,7 +3340,7 @@ zfs_do_rename(int argc, char **argv)
 		return (1);
 	}
 
-	ret = (zfs_rename(zhp, argv[1], recurse) != 0);
+	ret = (zfs_rename(zhp, snapshot, argv[1], flags) != 0);
 
 	zfs_close(zhp);
 	return (ret);
@@ -2217,7 +3356,7 @@ static int
 zfs_do_promote(int argc, char **argv)
 {
 	zfs_handle_t *zhp;
-	int ret;
+	int ret = 0;
 
 	/* check options */
 	if (argc > 1 && argv[1][0] == '-') {
@@ -2266,9 +3405,30 @@ typedef struct rollback_cbdata {
 	char		*cb_target;
 	int		cb_error;
 	boolean_t	cb_recurse;
-	boolean_t	cb_dependent;
 } rollback_cbdata_t;
 
+static int
+rollback_check_dependent(zfs_handle_t *zhp, void *data)
+{
+	rollback_cbdata_t *cbp = data;
+
+	if (cbp->cb_first && cbp->cb_recurse) {
+		(void) fprintf(stderr, gettext("cannot rollback to "
+		    "'%s': clones of previous snapshots exist\n"),
+		    cbp->cb_target);
+		(void) fprintf(stderr, gettext("use '-R' to "
+		    "force deletion of the following clones and "
+		    "dependents:\n"));
+		cbp->cb_first = 0;
+		cbp->cb_error = 1;
+	}
+
+	(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+
+	zfs_close(zhp);
+	return (0);
+}
+
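+/*
+ * Sketch: rollback_check() now only sees the target's snapshots and
+ * bookmarks (zfs_do_rollback() iterates the two separately below);
+ * clone dependents are reported through rollback_check_dependent()
+ * instead of the old cb_dependent re-entrancy flag.
+ */
+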
 /*
  * Report any snapshots more recent than the one specified.  Used when '-r' is
  * not specified.  We reuse this same callback for the snapshot dependents - if
@@ -2285,52 +3445,30 @@ rollback_check(zfs_handle_t *zhp, void *
 		return (0);
 	}
 
-	if (!cbp->cb_dependent) {
-		if (strcmp(zfs_get_name(zhp), cbp->cb_target) != 0 &&
-		    zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
-		    zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) >
-		    cbp->cb_create) {
-
-			if (cbp->cb_first && !cbp->cb_recurse) {
-				(void) fprintf(stderr, gettext("cannot "
-				    "rollback to '%s': more recent snapshots "
-				    "exist\n"),
-				    cbp->cb_target);
-				(void) fprintf(stderr, gettext("use '-r' to "
-				    "force deletion of the following "
-				    "snapshots:\n"));
-				cbp->cb_first = 0;
-				cbp->cb_error = 1;
-			}
-
-			if (cbp->cb_recurse) {
-				cbp->cb_dependent = B_TRUE;
-				if (zfs_iter_dependents(zhp, B_TRUE,
-				    rollback_check, cbp) != 0) {
-					zfs_close(zhp);
-					return (-1);
-				}
-				cbp->cb_dependent = B_FALSE;
-			} else {
-				(void) fprintf(stderr, "%s\n",
-				    zfs_get_name(zhp));
-			}
-		}
-	} else {
-		if (cbp->cb_first && cbp->cb_recurse) {
-			(void) fprintf(stderr, gettext("cannot rollback to "
-			    "'%s': clones of previous snapshots exist\n"),
+	if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) {
+		if (cbp->cb_first && !cbp->cb_recurse) {
+			(void) fprintf(stderr, gettext("cannot "
+			    "rollback to '%s': more recent snapshots "
+			    "or bookmarks exist\n"),
 			    cbp->cb_target);
-			(void) fprintf(stderr, gettext("use '-R' to "
-			    "force deletion of the following clones and "
-			    "dependents:\n"));
+			(void) fprintf(stderr, gettext("use '-r' to "
+			    "force deletion of the following "
+			    "snapshots and bookmarks:\n"));
 			cbp->cb_first = 0;
 			cbp->cb_error = 1;
 		}
 
-		(void) fprintf(stderr, "%s\n", zfs_get_name(zhp));
+		if (cbp->cb_recurse) {
+			if (zfs_iter_dependents(zhp, B_TRUE,
+			    rollback_check_dependent, cbp) != 0) {
+				zfs_close(zhp);
+				return (-1);
+			}
+		} else {
+			(void) fprintf(stderr, "%s\n",
+			    zfs_get_name(zhp));
+		}
 	}
-
 	zfs_close(zhp);
 	return (0);
 }
@@ -2338,12 +3476,12 @@ rollback_check(zfs_handle_t *zhp, void *
 static int
 zfs_do_rollback(int argc, char **argv)
 {
-	int ret;
+	int ret = 0;
 	int c;
 	boolean_t force = B_FALSE;
 	rollback_cbdata_t cb = { 0 };
 	zfs_handle_t *zhp, *snap;
-	char parentname[ZFS_MAXNAMELEN];
+	char parentname[ZFS_MAX_DATASET_NAME_LEN];
 	char *delim;
 
 	/* check options */
@@ -2400,7 +3538,9 @@ zfs_do_rollback(int argc, char **argv)
 	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
 	cb.cb_first = B_TRUE;
 	cb.cb_error = 0;
-	if ((ret = zfs_iter_children(zhp, rollback_check, &cb)) != 0)
+	if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb)) != 0)
+		goto out;
+	if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0)
 		goto out;
 
 	if ((ret = cb.cb_error) != 0)
@@ -2422,21 +3562,17 @@ out:
 }
 
 /*
- * zfs set property=value { fs | snap | vol } ...
+ * zfs set property=value ... { fs | snap | vol } ...
  *
- * Sets the given property for all datasets specified on the command line.
+ * Sets the given properties for all datasets specified on the command line.
  */
-typedef struct set_cbdata {
-	char		*cb_propname;
-	char		*cb_value;
-} set_cbdata_t;
 
 static int
 set_callback(zfs_handle_t *zhp, void *data)
 {
-	set_cbdata_t *cbp = data;
+	nvlist_t *props = data;
 
-	if (zfs_prop_set(zhp, cbp->cb_propname, cbp->cb_value) != 0) {
+	if (zfs_prop_set_list(zhp, props) != 0) {
 		switch (libzfs_errno(g_zfs)) {
 		case EZFS_MOUNTFAILED:
 			(void) fprintf(stderr, gettext("property may be set "
@@ -2455,8 +3591,9 @@ set_callback(zfs_handle_t *zhp, void *da
 static int
 zfs_do_set(int argc, char **argv)
 {
-	set_cbdata_t cb;
-	int ret;
+	nvlist_t *props = NULL;
+	int ds_start = -1; /* argv idx of first dataset arg */
+	int ret = 0;
 
 	/* check for options */
 	if (argc > 1 && argv[1][0] == '-') {
@@ -2467,39 +3604,86 @@ zfs_do_set(int argc, char **argv)
 
 	/* check number of arguments */
 	if (argc < 2) {
-		(void) fprintf(stderr, gettext("missing property=value "
-		    "argument\n"));
+		(void) fprintf(stderr, gettext("missing arguments\n"));
 		usage(B_FALSE);
 	}
 	if (argc < 3) {
-		(void) fprintf(stderr, gettext("missing dataset name\n"));
+		if (strchr(argv[1], '=') == NULL) {
+			(void) fprintf(stderr, gettext("missing property=value "
+			    "argument(s)\n"));
+		} else {
+			(void) fprintf(stderr, gettext("missing dataset "
+			    "name(s)\n"));
+		}
 		usage(B_FALSE);
 	}
 
-	/* validate property=value argument */
-	cb.cb_propname = argv[1];
-	if (((cb.cb_value = strchr(cb.cb_propname, '=')) == NULL) ||
-	    (cb.cb_value[1] == '\0')) {
-		(void) fprintf(stderr, gettext("missing value in "
-		    "property=value argument\n"));
+	/* validate argument order:  prop=val args followed by dataset args */
+	for (int i = 1; i < argc; i++) {
+		if (strchr(argv[i], '=') != NULL) {
+			if (ds_start > 0) {
+				/* out-of-order prop=val argument */
+				(void) fprintf(stderr, gettext("invalid "
+				    "argument order\n"));
+				usage(B_FALSE);
+			}
+		} else if (ds_start < 0) {
+			ds_start = i;
+		}
+	}
+	if (ds_start < 0) {
+		(void) fprintf(stderr, gettext("missing dataset name(s)\n"));
 		usage(B_FALSE);
 	}
 
-	*cb.cb_value = '\0';
-	cb.cb_value++;
-
-	if (*cb.cb_propname == '\0') {
-		(void) fprintf(stderr,
-		    gettext("missing property in property=value argument\n"));
-		usage(B_FALSE);
+	/* Populate a list of property settings */
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	for (int i = 1; i < ds_start; i++) {
+		if ((ret = parseprop(props, argv[i])) != 0)
+			goto error;
 	}
 
-	ret = zfs_for_each(argc - 2, argv + 2, 0,
-	    ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, &cb);
+	ret = zfs_for_each(argc - ds_start, argv + ds_start, 0,
+	    ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props);
 
+error:
+	nvlist_free(props);
 	return (ret);
 }
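+
+/*
+ * Usage sketch (hypothetical datasets): zfs_prop_set_list() lets one
+ * invocation apply several properties to several datasets, e.g.
+ *
+ *	# zfs set atime=off compression=on tank/a tank/b
+ *
+ * The argument scan above requires every prop=value pair to precede
+ * the first dataset name, so mixing the two orders is rejected.
+ */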
 
+typedef struct snap_cbdata {
+	nvlist_t *sd_nvl;
+	boolean_t sd_recursive;
+	const char *sd_snapname;
+} snap_cbdata_t;
+
+static int
+zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
+{
+	snap_cbdata_t *sd = arg;
+	char *name;
+	int rv = 0;
+	int error;
+
+	if (sd->sd_recursive &&
+	    zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname);
+	if (error == -1)
+		nomem();
+	fnvlist_add_boolean(sd->sd_nvl, name);
+	free(name);
+
+	if (sd->sd_recursive)
+		rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd);
+	zfs_close(zhp);
+	return (rv);
+}
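+
+/*
+ * Sketch (hypothetical pool name): for "zfs snapshot -r tank@backup",
+ * zfs_snapshot_cb() recurses through the children and fills sd_nvl with
+ * one boolean nvpair per snapshot to create ("tank@backup",
+ * "tank/home@backup", ...), skipping receive-inconsistent datasets in
+ * the recursive case.  zfs_snapshot_nvl() then creates the whole set in
+ * a single atomic operation.
+ */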
+
 /*
  * zfs snapshot [-r] [-o prop=value] ... <fs@snap>
  *
@@ -2509,26 +3693,27 @@ zfs_do_set(int argc, char **argv)
 static int
 zfs_do_snapshot(int argc, char **argv)
 {
-	boolean_t recursive = B_FALSE;
-	int ret;
-	char c;
+	int ret = 0;
+	int c;
 	nvlist_t *props;
+	snap_cbdata_t sd = { 0 };
+	boolean_t multiple_snaps = B_FALSE;
 
-	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) {
-		(void) fprintf(stderr, gettext("internal error: "
-		    "out of memory\n"));
-		return (1);
-	}
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
 
 	/* check options */
 	while ((c = getopt(argc, argv, "ro:")) != -1) {
 		switch (c) {
 		case 'o':
-			if (parseprop(props))
+			if (parseprop(props, optarg) != 0)
 				return (1);
 			break;
 		case 'r':
-			recursive = B_TRUE;
+			sd.sd_recursive = B_TRUE;
+			multiple_snaps = B_TRUE;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
@@ -2545,27 +3730,41 @@ zfs_do_snapshot(int argc, char **argv)
 		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
 		goto usage;
 	}
-	if (argc > 1) {
-		(void) fprintf(stderr, gettext("too many arguments\n"));
-		goto usage;
+
+	if (argc > 1)
+		multiple_snaps = B_TRUE;
+	for (; argc > 0; argc--, argv++) {
+		char *atp;
+		zfs_handle_t *zhp;
+
+		atp = strchr(argv[0], '@');
+		if (atp == NULL)
+			goto usage;
+		*atp = '\0';
+		sd.sd_snapname = atp + 1;
+		zhp = zfs_open(g_zfs, argv[0],
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (zhp == NULL)
+			goto usage;
+		if (zfs_snapshot_cb(zhp, &sd) != 0)
+			goto usage;
 	}
 
-	ret = zfs_snapshot(g_zfs, argv[0], recursive, props);
+	ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props);
+	nvlist_free(sd.sd_nvl);
 	nvlist_free(props);
-	if (ret && recursive)
+	if (ret != 0 && multiple_snaps)
 		(void) fprintf(stderr, gettext("no snapshots were created\n"));
 	return (ret != 0);
 
 usage:
+	nvlist_free(sd.sd_nvl);
 	nvlist_free(props);
 	usage(B_FALSE);
 	return (-1);
 }
 
 /*
- * zfs send [-vDp] -R [-i|-I <@snap>] <fs@snap>
- * zfs send [-vDp] [-i|-I <@snap>] <fs@snap>
- *
  * Send a backup stream to stdout.
  */
 static int
@@ -2573,13 +3772,16 @@ zfs_do_send(int argc, char **argv)
 {
 	char *fromname = NULL;
 	char *toname = NULL;
+	char *resume_token = NULL;
 	char *cp;
 	zfs_handle_t *zhp;
 	sendflags_t flags = { 0 };
 	int c, err;
+	nvlist_t *dbgnv = NULL;
+	boolean_t extraverbose = B_FALSE;
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":i:I:RDpv")) != -1) {
+	while ((c = getopt(argc, argv, ":i:I:RDpvnPLet:")) != -1) {
 		switch (c) {
 		case 'i':
 			if (fromname)
@@ -2598,12 +3800,31 @@ zfs_do_send(int argc, char **argv)
 		case 'p':
 			flags.props = B_TRUE;
 			break;
+		case 'P':
+			flags.parsable = B_TRUE;
+			flags.verbose = B_TRUE;
+			break;
 		case 'v':
+			if (flags.verbose)
+				extraverbose = B_TRUE;
 			flags.verbose = B_TRUE;
+			flags.progress = B_TRUE;
 			break;
 		case 'D':
 			flags.dedup = B_TRUE;
 			break;
+		case 'n':
+			flags.dryrun = B_TRUE;
+			break;
+		case 'L':
+			flags.largeblock = B_TRUE;
+			break;
+		case 'e':
+			flags.embed_data = B_TRUE;
+			break;
+		case 't':
+			resume_token = optarg;
+			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
@@ -2619,29 +3840,87 @@ zfs_do_send(int argc, char **argv)
 	argc -= optind;
 	argv += optind;
 
-	/* check number of arguments */
-	if (argc < 1) {
-		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
-		usage(B_FALSE);
-	}
-	if (argc > 1) {
-		(void) fprintf(stderr, gettext("too many arguments\n"));
-		usage(B_FALSE);
+	if (resume_token != NULL) {
+		if (fromname != NULL || flags.replicate || flags.props ||
+		    flags.dedup) {
+			(void) fprintf(stderr,
+			    gettext("invalid flags combined with -t\n"));
+			usage(B_FALSE);
+		}
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("no additional "
+			    "arguments are permitted with -t\n"));
+			usage(B_FALSE);
+		}
+	} else {
+		if (argc < 1) {
+			(void) fprintf(stderr,
+			    gettext("missing snapshot argument\n"));
+			usage(B_FALSE);
+		}
+		if (argc > 1) {
+			(void) fprintf(stderr, gettext("too many arguments\n"));
+			usage(B_FALSE);
+		}
 	}
 
-	if (isatty(STDOUT_FILENO)) {
+	if (!flags.dryrun && isatty(STDOUT_FILENO)) {
 		(void) fprintf(stderr,
 		    gettext("Error: Stream can not be written to a terminal.\n"
 		    "You must redirect standard output.\n"));
 		return (1);
 	}
 
-	cp = strchr(argv[0], '@');
-	if (cp == NULL) {
-		(void) fprintf(stderr,
-		    gettext("argument must be a snapshot\n"));
-		usage(B_FALSE);
+	if (resume_token != NULL) {
+		return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO,
+		    resume_token));
+	}
+
+	/*
+	 * Special case sending a filesystem, or from a bookmark.
+	 */
+	if (strchr(argv[0], '@') == NULL ||
+	    (fromname && strchr(fromname, '#') != NULL)) {
+		char frombuf[ZFS_MAX_DATASET_NAME_LEN];
+		enum lzc_send_flags lzc_flags = 0;
+
+		if (flags.replicate || flags.doall || flags.props ||
+		    flags.dedup || flags.dryrun || flags.verbose ||
+		    flags.progress) {
+			(void) fprintf(stderr,
+			    gettext("Error: "
+			    "Unsupported flag with filesystem or bookmark.\n"));
+			return (1);
+		}
+
+		zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET);
+		if (zhp == NULL)
+			return (1);
+
+		if (flags.largeblock)
+			lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK;
+		if (flags.embed_data)
+			lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
+
+		if (fromname != NULL &&
+		    (fromname[0] == '#' || fromname[0] == '@')) {
+			/*
+			 * Incremental source name begins with # or @.
+			 * Default to same fs as target.
+			 */
+			(void) strncpy(frombuf, argv[0], sizeof (frombuf));
+			cp = strchr(frombuf, '@');
+			if (cp != NULL)
+				*cp = '\0';
+			(void) strlcat(frombuf, fromname, sizeof (frombuf));
+			fromname = frombuf;
+		}
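+		/*
+		 * Illustrative example (hypothetical names): for
+		 * "zfs send -i #daily tank/fs@now" the block above
+		 * expands fromname "#daily" to "tank/fs#daily" before
+		 * the single-stream send below.
+		 */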
+		err = zfs_send_one(zhp, fromname, STDOUT_FILENO, lzc_flags);
+		zfs_close(zhp);
+		return (err != 0);
 	}
+
+	cp = strchr(argv[0], '@');
 	*cp = '\0';
 	toname = cp + 1;
 	zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
@@ -2654,7 +3933,7 @@ zfs_do_send(int argc, char **argv)
 	 * case if they specify the origin.
 	 */
 	if (fromname && (cp = strchr(fromname, '@')) != NULL) {
-		char origin[ZFS_MAXNAMELEN];
+		char origin[ZFS_MAX_DATASET_NAME_LEN];
 		zprop_source_t src;
 
 		(void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN,
@@ -2683,26 +3962,47 @@ zfs_do_send(int argc, char **argv)
 	if (flags.replicate && fromname == NULL)
 		flags.doall = B_TRUE;
 
-	err = zfs_send(zhp, fromname, toname, flags, STDOUT_FILENO, NULL, 0);
+	err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0,
+	    extraverbose ? &dbgnv : NULL);
+
+	if (extraverbose && dbgnv != NULL) {
+		/*
+		 * dump_nvlist prints to stdout, but that's been
+		 * redirected to a file.  Make it print to stderr
+		 * instead.
+		 */
+		(void) dup2(STDERR_FILENO, STDOUT_FILENO);
+		dump_nvlist(dbgnv, 0);
+		nvlist_free(dbgnv);
+	}
 	zfs_close(zhp);
 
 	return (err != 0);
 }
 
 /*
- * zfs receive [-denvF] <fs@snap>
- *
  * Restore a backup stream from stdin.
  */
 static int
 zfs_do_receive(int argc, char **argv)
 {
-	int c, err;
+	int c, err = 0;
 	recvflags_t flags = { 0 };
+	boolean_t abort_resumable = B_FALSE;
+
+	nvlist_t *props;
+	nvpair_t *nvp = NULL;
+
+	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":denuvF")) != -1) {
+	while ((c = getopt(argc, argv, ":o:denuvFsA")) != -1) {
 		switch (c) {
+		case 'o':
+			if (parseprop(props, optarg) != 0)
+				return (1);
+			break;
 		case 'd':
 			flags.isprefix = B_TRUE;
 			break;
@@ -2719,9 +4019,15 @@ zfs_do_receive(int argc, char **argv)
 		case 'v':
 			flags.verbose = B_TRUE;
 			break;
+		case 's':
+			flags.resumable = B_TRUE;
+			break;
 		case 'F':
 			flags.force = B_TRUE;
 			break;
+		case 'A':
+			abort_resumable = B_TRUE;
+			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
@@ -2747,6 +4053,51 @@ zfs_do_receive(int argc, char **argv)
 		usage(B_FALSE);
 	}
 
+	while ((nvp = nvlist_next_nvpair(props, nvp))) {
+		if (strcmp(nvpair_name(nvp), "origin") != 0) {
+			(void) fprintf(stderr, gettext("invalid option"));
+			usage(B_FALSE);
+		}
+	}
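+	/*
+	 * Illustrative example (hypothetical names):
+	 * "zfs recv -o origin=pool/fs@base pool/clone" is the only -o
+	 * form the check above accepts; any other property name is
+	 * rejected.
+	 */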
+
+	if (abort_resumable) {
+		if (flags.isprefix || flags.istail || flags.dryrun ||
+		    flags.resumable || flags.nomount) {
+			(void) fprintf(stderr, gettext("invalid option"));
+			usage(B_FALSE);
+		}
+
+		char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+		(void) snprintf(namebuf, sizeof (namebuf),
+		    "%s/%%recv", argv[0]);
+
+		if (zfs_dataset_exists(g_zfs, namebuf,
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) {
+			zfs_handle_t *zhp = zfs_open(g_zfs,
+			    namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+			if (zhp == NULL)
+				return (1);
+			err = zfs_destroy(zhp, B_FALSE);
+		} else {
+			zfs_handle_t *zhp = zfs_open(g_zfs,
+			    argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+			if (zhp == NULL)
+				usage(B_FALSE);
+			if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) ||
+			    zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+			    NULL, 0, NULL, NULL, 0, B_TRUE) == -1) {
+				(void) fprintf(stderr,
+				    gettext("'%s' does not have any "
+				    "resumable receive state to abort\n"),
+				    argv[0]);
+				return (1);
+			}
+			err = zfs_destroy(zhp, B_FALSE);
+		}
+
+		return (err != 0);
+	}
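+	/*
+	 * Note (editorial): the -A path above clears resumable receive
+	 * state either by destroying the partially received
+	 * "<fs>/%recv" clone or, when the state lives on the dataset
+	 * itself, by destroying the inconsistent dataset.
+	 */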
+
 	if (isatty(STDIN_FILENO)) {
 		(void) fprintf(stderr,
 		    gettext("Error: Backup stream can not be read "
@@ -2754,147 +4105,1685 @@ zfs_do_receive(int argc, char **argv)
 		    "You must redirect standard input.\n"));
 		return (1);
 	}
-
-	err = zfs_receive(g_zfs, argv[0], flags, STDIN_FILENO, NULL);
+	err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
 
 	return (err != 0);
 }
 
-static int
-zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
-{
-	int errors = 0;
-	int i;
-	const char *tag;
-	boolean_t recursive = B_FALSE;
-	boolean_t temphold = B_FALSE;
-	const char *opts = holding ? "rt" : "r";
-	int c;
+/*
+ * allow/unallow stuff
+ */
+/* copied from zfs/sys/dsl_deleg.h */
+#define	ZFS_DELEG_PERM_CREATE		"create"
+#define	ZFS_DELEG_PERM_DESTROY		"destroy"
+#define	ZFS_DELEG_PERM_SNAPSHOT		"snapshot"
+#define	ZFS_DELEG_PERM_ROLLBACK		"rollback"
+#define	ZFS_DELEG_PERM_CLONE		"clone"
+#define	ZFS_DELEG_PERM_PROMOTE		"promote"
+#define	ZFS_DELEG_PERM_RENAME		"rename"
+#define	ZFS_DELEG_PERM_MOUNT		"mount"
+#define	ZFS_DELEG_PERM_SHARE		"share"
+#define	ZFS_DELEG_PERM_SEND		"send"
+#define	ZFS_DELEG_PERM_RECEIVE		"receive"
+#define	ZFS_DELEG_PERM_ALLOW		"allow"
+#define	ZFS_DELEG_PERM_USERPROP		"userprop"
+#define	ZFS_DELEG_PERM_VSCAN		"vscan" /* ??? */
+#define	ZFS_DELEG_PERM_USERQUOTA	"userquota"
+#define	ZFS_DELEG_PERM_GROUPQUOTA	"groupquota"
+#define	ZFS_DELEG_PERM_USERUSED		"userused"
+#define	ZFS_DELEG_PERM_GROUPUSED	"groupused"
+#define	ZFS_DELEG_PERM_HOLD		"hold"
+#define	ZFS_DELEG_PERM_RELEASE		"release"
+#define	ZFS_DELEG_PERM_DIFF		"diff"
+#define	ZFS_DELEG_PERM_BOOKMARK		"bookmark"
+
+#define	ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE
+
+static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = {
+	{ ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW },
+	{ ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE },
+	{ ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE },
+	{ ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY },
+	{ ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF},
+	{ ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
+	{ ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT },
+	{ ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE },
+	{ ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE },
+	{ ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
+	{ ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },
+	{ ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
+	{ ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND },
+	{ ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
+	{ ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
+	{ ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK },
+
+	{ ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
+	{ ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
+	{ ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
+	{ ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
+	{ ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
+	{ NULL, ZFS_DELEG_NOTE_NONE }
+};
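+
+/*
+ * Note (editorial): allow_usage() below walks the first
+ * ZFS_NUM_DELEG_NOTES entries of this table to print the
+ * NAME/TYPE/NOTES rows of the permission help text.
+ */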
 
-	/* check options */
-	while ((c = getopt(argc, argv, opts)) != -1) {
-		switch (c) {
-		case 'r':
-			recursive = B_TRUE;
-			break;
-		case 't':
-			temphold = B_TRUE;
-			break;
-		case '?':
-			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
-			    optopt);
-			usage(B_FALSE);
-		}
-	}
+/* permission structure */
+typedef struct deleg_perm {
+	zfs_deleg_who_type_t	dp_who_type;
+	const char		*dp_name;
+	boolean_t		dp_local;
+	boolean_t		dp_descend;
+} deleg_perm_t;
+
+/* AVL node carrying one delegated permission */
+typedef struct deleg_perm_node {
+	deleg_perm_t		dpn_perm;
+
+	uu_avl_node_t		dpn_avl_node;
+} deleg_perm_node_t;
+
+typedef struct fs_perm fs_perm_t;
+
+/* permissions set */
+typedef struct who_perm {
+	zfs_deleg_who_type_t	who_type;
+	const char		*who_name;		/* id */
+	char			who_ug_name[256];	/* user/group name */
+	fs_perm_t		*who_fsperm;		/* uplink */
+
+	uu_avl_t		*who_deleg_perm_avl;	/* permissions */
+} who_perm_t;
+
+/* AVL node carrying the permissions of one "who" entry */
+typedef struct who_perm_node {
+	who_perm_t	who_perm;
+	uu_avl_node_t	who_avl_node;
+} who_perm_node_t;
+
+typedef struct fs_perm_set fs_perm_set_t;
+/* fs permissions */
+struct fs_perm {
+	const char		*fsp_name;
 
-	argc -= optind;
-	argv += optind;
+	uu_avl_t		*fsp_sc_avl;	/* sets,create */
+	uu_avl_t		*fsp_uge_avl;	/* user,group,everyone */
 
-	/* check number of arguments */
-	if (argc < 2)
-		usage(B_FALSE);
+	fs_perm_set_t		*fsp_set;	/* uplink */
+};
 
-	tag = argv[0];
-	--argc;
-	++argv;
+/* list node carrying the permissions of one file system */
+typedef struct fs_perm_node {
+	fs_perm_t	fspn_fsperm;
+	uu_avl_t	*fspn_avl;
+
+	uu_list_node_t	fspn_list_node;
+} fs_perm_node_t;
+
+/* top level structure */
+struct fs_perm_set {
+	uu_list_pool_t	*fsps_list_pool;
+	uu_list_t	*fsps_list; /* list of fs_perms */
+
+	uu_avl_pool_t	*fsps_named_set_avl_pool;
+	uu_avl_pool_t	*fsps_who_perm_avl_pool;
+	uu_avl_pool_t	*fsps_deleg_perm_avl_pool;
+};
 
-	if (holding && tag[0] == '.') {
-		/* tags starting with '.' are reserved for libzfs */
-		(void) fprintf(stderr, gettext("tag may not start with '.'\n"));
-		usage(B_FALSE);
+static inline const char *
+deleg_perm_type(zfs_deleg_note_t note)
+{
+	/* subcommands */
+	switch (note) {
+		/* SUBCOMMANDS */
+		/* OTHER */
+	case ZFS_DELEG_NOTE_GROUPQUOTA:
+	case ZFS_DELEG_NOTE_GROUPUSED:
+	case ZFS_DELEG_NOTE_USERPROP:
+	case ZFS_DELEG_NOTE_USERQUOTA:
+	case ZFS_DELEG_NOTE_USERUSED:
+		/* other */
+		return (gettext("other"));
+	default:
+		return (gettext("subcommand"));
 	}
+}
 
-	for (i = 0; i < argc; ++i) {
-		zfs_handle_t *zhp;
-		char parent[ZFS_MAXNAMELEN];
-		const char *delim;
-		char *path = argv[i];
-
-		delim = strchr(path, '@');
-		if (delim == NULL) {
-			(void) fprintf(stderr,
-			    gettext("'%s' is not a snapshot\n"), path);
-			++errors;
-			continue;
-		}
-		(void) strncpy(parent, path, delim - path);
-		parent[delim - path] = '\0';
-
-		zhp = zfs_open(g_zfs, parent,
-		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
-		if (zhp == NULL) {
-			++errors;
-			continue;
-		}
-		if (holding) {
-			if (zfs_hold(zhp, delim+1, tag, recursive,
-			    temphold, B_FALSE) != 0)
-				++errors;
-		} else {
-			if (zfs_release(zhp, delim+1, tag, recursive) != 0)
-				++errors;
-		}
-		zfs_close(zhp);
+static int
+who_type2weight(zfs_deleg_who_type_t who_type)
+{
+	int res;
+	switch (who_type) {
+		case ZFS_DELEG_NAMED_SET_SETS:
+		case ZFS_DELEG_NAMED_SET:
+			res = 0;
+			break;
+		case ZFS_DELEG_CREATE_SETS:
+		case ZFS_DELEG_CREATE:
+			res = 1;
+			break;
+		case ZFS_DELEG_USER_SETS:
+		case ZFS_DELEG_USER:
+			res = 2;
+			break;
+		case ZFS_DELEG_GROUP_SETS:
+		case ZFS_DELEG_GROUP:
+			res = 3;
+			break;
+		case ZFS_DELEG_EVERYONE_SETS:
+		case ZFS_DELEG_EVERYONE:
+			res = 4;
+			break;
+		default:
+			res = -1;
 	}
 
-	return (errors != 0);
+	return (res);
 }
 
-/*
- * zfs hold [-r] [-t] <tag> <snap> ...
- *
- *	-r	Recursively hold
- *	-t	Temporary hold (hidden option)
- *
- * Apply a user-hold with the given tag to the list of snapshots.
- */
+/* ARGSUSED */
 static int
-zfs_do_hold(int argc, char **argv)
+who_perm_compare(const void *larg, const void *rarg, void *unused)
 {
-	return (zfs_do_hold_rele_impl(argc, argv, B_TRUE));
+	const who_perm_node_t *l = larg;
+	const who_perm_node_t *r = rarg;
+	zfs_deleg_who_type_t ltype = l->who_perm.who_type;
+	zfs_deleg_who_type_t rtype = r->who_perm.who_type;
+	int lweight = who_type2weight(ltype);
+	int rweight = who_type2weight(rtype);
+	int res = lweight - rweight;
+	if (res == 0)
+		res = strncmp(l->who_perm.who_name, r->who_perm.who_name,
+		    ZFS_MAX_DELEG_NAME-1);
+
+	if (res == 0)
+		return (0);
+	if (res > 0)
+		return (1);
+	else
+		return (-1);
 }
 
-/*
- * zfs release [-r] <tag> <snap> ...
- *
- *	-r	Recursively release
- *
- * Release a user-hold with the given tag from the list of snapshots.
- */
+/* ARGSUSED */
 static int
-zfs_do_release(int argc, char **argv)
+deleg_perm_compare(const void *larg, const void *rarg, void *unused)
 {
-	return (zfs_do_hold_rele_impl(argc, argv, B_FALSE));
-}
+	const deleg_perm_node_t *l = larg;
+	const deleg_perm_node_t *r = rarg;
+	int res =  strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name,
+	    ZFS_MAX_DELEG_NAME-1);
 
-typedef struct get_all_cbdata {
-	zfs_handle_t	**cb_handles;
-	size_t		cb_alloc;
-	size_t		cb_used;
-	uint_t		cb_types;
-	boolean_t	cb_verbose;
-} get_all_cbdata_t;
+	if (res == 0)
+		return (0);
 
-#define	CHECK_SPINNER 30
-#define	SPINNER_TIME 3		/* seconds */
-#define	MOUNT_TIME 5		/* seconds */
+	if (res > 0)
+		return (1);
+	else
+		return (-1);
+}
 
-static int
-get_one_dataset(zfs_handle_t *zhp, void *data)
+static inline void
+fs_perm_set_init(fs_perm_set_t *fspset)
 {
-	static char spin[] = { '-', '\\', '|', '/' };
-	static int spinval = 0;
-	static int spincheck = 0;
+	bzero(fspset, sizeof (fs_perm_set_t));
+
+	if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool",
+	    sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node),
+	    NULL, UU_DEFAULT)) == NULL)
+		nomem();
+	if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL,
+	    UU_DEFAULT)) == NULL)
+		nomem();
+
+	if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create(
+	    "named_set_avl_pool", sizeof (who_perm_node_t), offsetof(
+	    who_perm_node_t, who_avl_node), who_perm_compare,
+	    UU_DEFAULT)) == NULL)
+		nomem();
+
+	if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create(
+	    "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof(
+	    who_perm_node_t, who_avl_node), who_perm_compare,
+	    UU_DEFAULT)) == NULL)
+		nomem();
+
+	if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create(
+	    "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof(
+	    deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT))
+	    == NULL)
+		nomem();
+}
+
+static inline void fs_perm_fini(fs_perm_t *);
+static inline void who_perm_fini(who_perm_t *);
+
+static inline void
+fs_perm_set_fini(fs_perm_set_t *fspset)
+{
+	fs_perm_node_t *node = uu_list_first(fspset->fsps_list);
+
+	while (node != NULL) {
+		fs_perm_node_t *next_node =
+		    uu_list_next(fspset->fsps_list, node);
+		fs_perm_t *fsperm = &node->fspn_fsperm;
+		fs_perm_fini(fsperm);
+		uu_list_remove(fspset->fsps_list, node);
+		free(node);
+		node = next_node;
+	}
+
+	uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool);
+	uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool);
+	uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool);
+}
+
+static inline void
+deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type,
+    const char *name)
+{
+	deleg_perm->dp_who_type = type;
+	deleg_perm->dp_name = name;
+}
+
+static inline void
+who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm,
+    zfs_deleg_who_type_t type, const char *name)
+{
+	uu_avl_pool_t	*pool;
+	pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool;
+
+	bzero(who_perm, sizeof (who_perm_t));
+
+	if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL,
+	    UU_DEFAULT)) == NULL)
+		nomem();
+
+	who_perm->who_type = type;
+	who_perm->who_name = name;
+	who_perm->who_fsperm = fsperm;
+}
+
+static inline void
+who_perm_fini(who_perm_t *who_perm)
+{
+	deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl);
+
+	while (node != NULL) {
+		deleg_perm_node_t *next_node =
+		    uu_avl_next(who_perm->who_deleg_perm_avl, node);
+
+		uu_avl_remove(who_perm->who_deleg_perm_avl, node);
+		free(node);
+		node = next_node;
+	}
+
+	uu_avl_destroy(who_perm->who_deleg_perm_avl);
+}
+
+static inline void
+fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname)
+{
+	uu_avl_pool_t	*nset_pool = fspset->fsps_named_set_avl_pool;
+	uu_avl_pool_t	*who_pool = fspset->fsps_who_perm_avl_pool;
+
+	bzero(fsperm, sizeof (fs_perm_t));
+
+	if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT))
+	    == NULL)
+		nomem();
+
+	if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT))
+	    == NULL)
+		nomem();
+
+	fsperm->fsp_set = fspset;
+	fsperm->fsp_name = fsname;
+}
+
+static inline void
+fs_perm_fini(fs_perm_t *fsperm)
+{
+	who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl);
+	while (node != NULL) {
+		who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl,
+		    node);
+		who_perm_t *who_perm = &node->who_perm;
+		who_perm_fini(who_perm);
+		uu_avl_remove(fsperm->fsp_sc_avl, node);
+		free(node);
+		node = next_node;
+	}
+
+	node = uu_avl_first(fsperm->fsp_uge_avl);
+	while (node != NULL) {
+		who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl,
+		    node);
+		who_perm_t *who_perm = &node->who_perm;
+		who_perm_fini(who_perm);
+		uu_avl_remove(fsperm->fsp_uge_avl, node);
+		free(node);
+		node = next_node;
+	}
+
+	uu_avl_destroy(fsperm->fsp_sc_avl);
+	uu_avl_destroy(fsperm->fsp_uge_avl);
+}
+
+static void
+set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node,
+    zfs_deleg_who_type_t who_type, const char *name, char locality)
+{
+	uu_avl_index_t idx = 0;
+
+	deleg_perm_node_t *found_node = NULL;
+	deleg_perm_t	*deleg_perm = &node->dpn_perm;
+
+	deleg_perm_init(deleg_perm, who_type, name);
+
+	if ((found_node = uu_avl_find(avl, node, NULL, &idx))
+	    == NULL)
+		uu_avl_insert(avl, node, idx);
+	else {
+		node = found_node;
+		deleg_perm = &node->dpn_perm;
+	}
+
+	switch (locality) {
+	case ZFS_DELEG_LOCAL:
+		deleg_perm->dp_local = B_TRUE;
+		break;
+	case ZFS_DELEG_DESCENDENT:
+		deleg_perm->dp_descend = B_TRUE;
+		break;
+	case ZFS_DELEG_NA:
+		break;
+	default:
+		assert(B_FALSE); /* invalid locality */
+	}
+}
+
+static inline int
+parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality)
+{
+	nvpair_t *nvp = NULL;
+	fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set;
+	uu_avl_t *avl = who_perm->who_deleg_perm_avl;
+	zfs_deleg_who_type_t who_type = who_perm->who_type;
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		const char *name = nvpair_name(nvp);
+		data_type_t type = nvpair_type(nvp);
+		uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool;
+		deleg_perm_node_t *node =
+		    safe_malloc(sizeof (deleg_perm_node_t));
+
+		assert(type == DATA_TYPE_BOOLEAN);
+
+		uu_avl_node_init(node, &node->dpn_avl_node, avl_pool);
+		set_deleg_perm_node(avl, node, who_type, name, locality);
+	}
+
+	return (0);
+}
+
+static inline int
+parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl)
+{
+	nvpair_t *nvp = NULL;
+	fs_perm_set_t *fspset = fsperm->fsp_set;
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		nvlist_t *nvl2 = NULL;
+		const char *name = nvpair_name(nvp);
+		uu_avl_t *avl = NULL;
+		uu_avl_pool_t *avl_pool = NULL;
+		zfs_deleg_who_type_t perm_type = name[0];
+		char perm_locality = name[1];
+		const char *perm_name = name + 3;
+		boolean_t is_set = B_TRUE;
+		who_perm_t *who_perm = NULL;
+
+		assert('$' == name[2]);
+
+		if (nvpair_value_nvlist(nvp, &nvl2) != 0)
+			return (-1);
+
+		switch (perm_type) {
+		case ZFS_DELEG_CREATE:
+		case ZFS_DELEG_CREATE_SETS:
+		case ZFS_DELEG_NAMED_SET:
+		case ZFS_DELEG_NAMED_SET_SETS:
+			avl_pool = fspset->fsps_named_set_avl_pool;
+			avl = fsperm->fsp_sc_avl;
+			break;
+		case ZFS_DELEG_USER:
+		case ZFS_DELEG_USER_SETS:
+		case ZFS_DELEG_GROUP:
+		case ZFS_DELEG_GROUP_SETS:
+		case ZFS_DELEG_EVERYONE:
+		case ZFS_DELEG_EVERYONE_SETS:
+			avl_pool = fspset->fsps_who_perm_avl_pool;
+			avl = fsperm->fsp_uge_avl;
+			break;
+
+		default:
+			assert(!"unhandled zfs_deleg_who_type_t");
+		}
+
+		if (is_set) {
+			who_perm_node_t *found_node = NULL;
+			who_perm_node_t *node = safe_malloc(
+			    sizeof (who_perm_node_t));
+			who_perm = &node->who_perm;
+			uu_avl_index_t idx = 0;
+
+			uu_avl_node_init(node, &node->who_avl_node, avl_pool);
+			who_perm_init(who_perm, fsperm, perm_type, perm_name);
+
+			if ((found_node = uu_avl_find(avl, node, NULL, &idx))
+			    == NULL) {
+				if (avl == fsperm->fsp_uge_avl) {
+					uid_t rid = 0;
+					struct passwd *p = NULL;
+					struct group *g = NULL;
+					const char *nice_name = NULL;
+
+					switch (perm_type) {
+					case ZFS_DELEG_USER_SETS:
+					case ZFS_DELEG_USER:
+						rid = atoi(perm_name);
+						p = getpwuid(rid);
+						if (p)
+							nice_name = p->pw_name;
+						break;
+					case ZFS_DELEG_GROUP_SETS:
+					case ZFS_DELEG_GROUP:
+						rid = atoi(perm_name);
+						g = getgrgid(rid);
+						if (g)
+							nice_name = g->gr_name;
+						break;
+
+					default:
+						break;
+					}
+
+					if (nice_name != NULL)
+						(void) strlcpy(
+						    node->who_perm.who_ug_name,
+						    nice_name, 256);
+				}
+
+				uu_avl_insert(avl, node, idx);
+			} else {
+				node = found_node;
+				who_perm = &node->who_perm;
+			}
+		}
+
+		(void) parse_who_perm(who_perm, nvl2, perm_locality);
+	}
+
+	return (0);
+}
+
+static inline int
+parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl)
+{
+	nvpair_t *nvp = NULL;
+	uu_avl_index_t idx = 0;
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		nvlist_t *nvl2 = NULL;
+		const char *fsname = nvpair_name(nvp);
+		data_type_t type = nvpair_type(nvp);
+		fs_perm_t *fsperm = NULL;
+		fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t));
+		if (node == NULL)
+			nomem();
+
+		fsperm = &node->fspn_fsperm;
+
+		assert(DATA_TYPE_NVLIST == type);
+
+		uu_list_node_init(node, &node->fspn_list_node,
+		    fspset->fsps_list_pool);
+
+		idx = uu_list_numnodes(fspset->fsps_list);
+		fs_perm_init(fsperm, fspset, fsname);
+
+		if (nvpair_value_nvlist(nvp, &nvl2) != 0)
+			return (-1);
+
+		(void) parse_fs_perm(fsperm, nvl2);
+
+		uu_list_insert(fspset->fsps_list, node, idx);
+	}
+
+	return (0);
+}
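+
+/*
+ * Illustrative sketch (hypothetical values) of the nvlist shape parsed
+ * above, as returned by zfs_get_fsacl():
+ *
+ *	"tank/data" -> {
+ *		"ul$1001" -> { "snapshot" -> true, "mount" -> true }
+ *		"gd$100"  -> { "send" -> true }
+ *	}
+ *
+ * i.e. dataset name -> "<who_type><locality>$<name>" -> boolean
+ * permission names, decoded by parse_fs_perm() above.
+ */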
+
+static inline const char *
+deleg_perm_comment(zfs_deleg_note_t note)
+{
+	const char *str = "";
+
+	/* subcommands */
+	switch (note) {
+		/* SUBCOMMANDS */
+	case ZFS_DELEG_NOTE_ALLOW:
+		str = gettext("Must also have the permission that is being"
+		    "\n\t\t\t\tallowed");
+		break;
+	case ZFS_DELEG_NOTE_CLONE:
+		str = gettext("Must also have the 'create' ability and 'mount'"
+		    "\n\t\t\t\tability in the origin file system");
+		break;
+	case ZFS_DELEG_NOTE_CREATE:
+		str = gettext("Must also have the 'mount' ability");
+		break;
+	case ZFS_DELEG_NOTE_DESTROY:
+		str = gettext("Must also have the 'mount' ability");
+		break;
+	case ZFS_DELEG_NOTE_DIFF:
+		str = gettext("Allows lookup of paths within a dataset;"
+		    "\n\t\t\t\tgiven an object number. Ordinary users need this"
+		    "\n\t\t\t\tin order to use zfs diff");
+		break;
+	case ZFS_DELEG_NOTE_HOLD:
+		str = gettext("Allows adding a user hold to a snapshot");
+		break;
+	case ZFS_DELEG_NOTE_MOUNT:
+		str = gettext("Allows mount/umount of ZFS datasets");
+		break;
+	case ZFS_DELEG_NOTE_PROMOTE:
+		str = gettext("Must also have the 'mount'\n\t\t\t\tand"
+		    " 'promote' ability in the origin file system");
+		break;
+	case ZFS_DELEG_NOTE_RECEIVE:
+		str = gettext("Must also have the 'mount' and 'create'"
+		    " ability");
+		break;
+	case ZFS_DELEG_NOTE_RELEASE:
+		str = gettext("Allows releasing a user hold which\n\t\t\t\t"
+		    "might destroy the snapshot");
+		break;
+	case ZFS_DELEG_NOTE_RENAME:
+		str = gettext("Must also have the 'mount' and 'create'"
+		    "\n\t\t\t\tability in the new parent");
+		break;
+	case ZFS_DELEG_NOTE_ROLLBACK:
+		str = gettext("");
+		break;
+	case ZFS_DELEG_NOTE_SEND:
+		str = gettext("");
+		break;
+	case ZFS_DELEG_NOTE_SHARE:
+		str = gettext("Allows sharing file systems over NFS or SMB"
+		    "\n\t\t\t\tprotocols");
+		break;
+	case ZFS_DELEG_NOTE_SNAPSHOT:
+		str = gettext("");
+		break;
+/*
+ *	case ZFS_DELEG_NOTE_VSCAN:
+ *		str = gettext("");
+ *		break;
+ */
+		/* OTHER */
+	case ZFS_DELEG_NOTE_GROUPQUOTA:
+		str = gettext("Allows accessing any groupquota@... property");
+		break;
+	case ZFS_DELEG_NOTE_GROUPUSED:
+		str = gettext("Allows reading any groupused@... property");
+		break;
+	case ZFS_DELEG_NOTE_USERPROP:
+		str = gettext("Allows changing any user property");
+		break;
+	case ZFS_DELEG_NOTE_USERQUOTA:
+		str = gettext("Allows accessing any userquota@... property");
+		break;
+	case ZFS_DELEG_NOTE_USERUSED:
+		str = gettext("Allows reading any userused@... property");
+		break;
+		/* other */
+	default:
+		str = "";
+	}
+
+	return (str);
+}
+
+struct allow_opts {
+	boolean_t local;
+	boolean_t descend;
+	boolean_t user;
+	boolean_t group;
+	boolean_t everyone;
+	boolean_t create;
+	boolean_t set;
+	boolean_t recursive; /* unallow only */
+	boolean_t prt_usage;
+
+	boolean_t prt_perms;
+	char *who;
+	char *perms;
+	const char *dataset;
+};
+
+static inline int
+prop_cmp(const void *a, const void *b)
+{
+	const char *str1 = *(const char **)a;
+	const char *str2 = *(const char **)b;
+	return (strcmp(str1, str2));
+}
+
+static void
+allow_usage(boolean_t un, boolean_t requested, const char *msg)
+{
+	const char *opt_desc[] = {
+		"-h", gettext("show this help message and exit"),
+		"-l", gettext("set permission locally"),
+		"-d", gettext("set permission for descents"),
+		"-u", gettext("set permission for user"),
+		"-g", gettext("set permission for group"),
+		"-e", gettext("set permission for everyone"),
+		"-c", gettext("set create time permission"),
+		"-s", gettext("define permission set"),
+		/* unallow only */
+		"-r", gettext("remove permissions recursively"),
+	};
+	size_t unallow_size = sizeof (opt_desc) / sizeof (char *);
+	size_t allow_size = unallow_size - 2;
+	const char *props[ZFS_NUM_PROPS];
+	int i;
+	size_t count = 0;
+	FILE *fp = requested ? stdout : stderr;
+	zprop_desc_t *pdtbl = zfs_prop_get_table();
+	const char *fmt = gettext("%-16s %-14s\t%s\n");
+
+	(void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW :
+	    HELP_ALLOW));
+	(void) fprintf(fp, gettext("Options:\n"));
+	for (i = 0; i < (un ? unallow_size : allow_size); i++) {
+		const char *opt = opt_desc[i++];
+		const char *optdsc = opt_desc[i];
+		(void) fprintf(fp, gettext("  %-10s  %s\n"), opt, optdsc);
+	}
+
+	(void) fprintf(fp, gettext("\nThe following permissions are "
+	    "supported:\n\n"));
+	(void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"),
+	    gettext("NOTES"));
+	for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) {
+		const char *perm_name = zfs_deleg_perm_tbl[i].z_perm;
+		zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note;
+		const char *perm_type = deleg_perm_type(perm_note);
+		const char *perm_comment = deleg_perm_comment(perm_note);
+		(void) fprintf(fp, fmt, perm_name, perm_type, perm_comment);
+	}
+
+	for (i = 0; i < ZFS_NUM_PROPS; i++) {
+		zprop_desc_t *pd = &pdtbl[i];
+		if (pd->pd_visible != B_TRUE)
+			continue;
+
+		if (pd->pd_attr == PROP_READONLY)
+			continue;
+
+		props[count++] = pd->pd_name;
+	}
+	props[count] = NULL;
+
+	qsort(props, count, sizeof (char *), prop_cmp);
+
+	for (i = 0; i < count; i++)
+		(void) fprintf(fp, fmt, props[i], gettext("property"), "");
+
+	if (msg != NULL)
+		(void) fprintf(fp, gettext("\nzfs: error: %s"), msg);
+
+	exit(requested ? 0 : 2);
+}
+
+static inline const char *
+munge_args(int argc, char **argv, boolean_t un, size_t expected_argc,
+    char **permsp)
+{
+	if (un && argc == expected_argc - 1)
+		*permsp = NULL;
+	else if (argc == expected_argc)
+		*permsp = argv[argc - 2];
+	else
+		allow_usage(un, B_FALSE,
+		    gettext("wrong number of parameters\n"));
+
+	return (argv[argc - 1]);
+}
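+
+/*
+ * Illustrative example (names are hypothetical): for
+ * "zfs allow alice snapshot,mount tank" (argc == 3 after getopt),
+ * *permsp becomes "snapshot,mount" and "tank" is returned; for
+ * "zfs unallow alice tank" (argc == 2, un set), *permsp is NULL,
+ * meaning every permission for that who is to be removed.
+ */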
+
+static void
+parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts)
+{
+	int uge_sum = opts->user + opts->group + opts->everyone;
+	int csuge_sum = opts->create + opts->set + uge_sum;
+	int ldcsuge_sum = csuge_sum + opts->local + opts->descend;
+	int all_sum = un ? ldcsuge_sum + opts->recursive : ldcsuge_sum;
+
+	if (uge_sum > 1)
+		allow_usage(un, B_FALSE,
+		    gettext("-u, -g, and -e are mutually exclusive\n"));
+
+	if (opts->prt_usage) {
+		if (argc == 0 && all_sum == 0)
+			allow_usage(un, B_TRUE, NULL);
+		else
+			usage(B_FALSE);
+	}
+
+	if (opts->set) {
+		if (csuge_sum > 1)
+			allow_usage(un, B_FALSE,
+			    gettext("invalid options combined with -s\n"));
+
+		opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
+		if (argv[0][0] != '@')
+			allow_usage(un, B_FALSE,
+			    gettext("invalid set name: missing '@' prefix\n"));
+		opts->who = argv[0];
+	} else if (opts->create) {
+		if (ldcsuge_sum > 1)
+			allow_usage(un, B_FALSE,
+			    gettext("invalid options combined with -c\n"));
+		opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+	} else if (opts->everyone) {
+		if (csuge_sum > 1)
+			allow_usage(un, B_FALSE,
+			    gettext("invalid options combined with -e\n"));
+		opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+	} else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone")
+	    == 0) {
+		opts->everyone = B_TRUE;
+		argc--;
+		argv++;
+		opts->dataset = munge_args(argc, argv, un, 2, &opts->perms);
+	} else if (argc == 1 && !un) {
+		opts->prt_perms = B_TRUE;
+		opts->dataset = argv[argc-1];
+	} else {
+		opts->dataset = munge_args(argc, argv, un, 3, &opts->perms);
+		opts->who = argv[0];
+	}
+
+	if (!opts->local && !opts->descend) {
+		opts->local = B_TRUE;
+		opts->descend = B_TRUE;
+	}
+}
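+
+/*
+ * Illustrative example: "zfs allow everyone mount tank" takes the
+ * uge_sum == 0 branch above, consuming the literal word "everyone" in
+ * place of -e, while a bare "zfs allow tank" selects print mode via
+ * opts->prt_perms.
+ */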
+
+static void
+store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend,
+    const char *who, char *perms, nvlist_t *top_nvl)
+{
+	int i;
+	char ld[2] = { '\0', '\0' };
+	char who_buf[MAXNAMELEN + 32];
+	char base_type = '\0';
+	char set_type = '\0';
+	nvlist_t *base_nvl = NULL;
+	nvlist_t *set_nvl = NULL;
+	nvlist_t *nvl;
+
+	if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+	if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) !=  0)
+		nomem();
+
+	switch (type) {
+	case ZFS_DELEG_NAMED_SET_SETS:
+	case ZFS_DELEG_NAMED_SET:
+		set_type = ZFS_DELEG_NAMED_SET_SETS;
+		base_type = ZFS_DELEG_NAMED_SET;
+		ld[0] = ZFS_DELEG_NA;
+		break;
+	case ZFS_DELEG_CREATE_SETS:
+	case ZFS_DELEG_CREATE:
+		set_type = ZFS_DELEG_CREATE_SETS;
+		base_type = ZFS_DELEG_CREATE;
+		ld[0] = ZFS_DELEG_NA;
+		break;
+	case ZFS_DELEG_USER_SETS:
+	case ZFS_DELEG_USER:
+		set_type = ZFS_DELEG_USER_SETS;
+		base_type = ZFS_DELEG_USER;
+		if (local)
+			ld[0] = ZFS_DELEG_LOCAL;
+		if (descend)
+			ld[1] = ZFS_DELEG_DESCENDENT;
+		break;
+	case ZFS_DELEG_GROUP_SETS:
+	case ZFS_DELEG_GROUP:
+		set_type = ZFS_DELEG_GROUP_SETS;
+		base_type = ZFS_DELEG_GROUP;
+		if (local)
+			ld[0] = ZFS_DELEG_LOCAL;
+		if (descend)
+			ld[1] = ZFS_DELEG_DESCENDENT;
+		break;
+	case ZFS_DELEG_EVERYONE_SETS:
+	case ZFS_DELEG_EVERYONE:
+		set_type = ZFS_DELEG_EVERYONE_SETS;
+		base_type = ZFS_DELEG_EVERYONE;
+		if (local)
+			ld[0] = ZFS_DELEG_LOCAL;
+		if (descend)
+			ld[1] = ZFS_DELEG_DESCENDENT;
+		break;
+
+	default:
+		assert(set_type != '\0' && base_type != '\0');
+	}
+
+	if (perms != NULL) {
+		char *curr = perms;
+		char *end = curr + strlen(perms);
+
+		while (curr < end) {
+			char *delim = strchr(curr, ',');
+			if (delim == NULL)
+				delim = end;
+			else
+				*delim = '\0';
+
+			if (curr[0] == '@')
+				nvl = set_nvl;
+			else
+				nvl = base_nvl;
+
+			(void) nvlist_add_boolean(nvl, curr);
+			if (delim != end)
+				*delim = ',';
+			curr = delim + 1;
+		}
+
+		for (i = 0; i < 2; i++) {
+			char locality = ld[i];
+			if (locality == 0)
+				continue;
+
+			if (!nvlist_empty(base_nvl)) {
+				if (who != NULL)
+					(void) snprintf(who_buf,
+					    sizeof (who_buf), "%c%c$%s",
+					    base_type, locality, who);
+				else
+					(void) snprintf(who_buf,
+					    sizeof (who_buf), "%c%c$",
+					    base_type, locality);
+
+				(void) nvlist_add_nvlist(top_nvl, who_buf,
+				    base_nvl);
+			}
+
+			if (!nvlist_empty(set_nvl)) {
+				if (who != NULL)
+					(void) snprintf(who_buf,
+					    sizeof (who_buf), "%c%c$%s",
+					    set_type, locality, who);
+				else
+					(void) snprintf(who_buf,
+					    sizeof (who_buf), "%c%c$",
+					    set_type, locality);
+
+				(void) nvlist_add_nvlist(top_nvl, who_buf,
+				    set_nvl);
+			}
+		}
+	} else {
+		for (i = 0; i < 2; i++) {
+			char locality = ld[i];
+			if (locality == 0)
+				continue;
+
+			if (who != NULL)
+				(void) snprintf(who_buf, sizeof (who_buf),
+				    "%c%c$%s", base_type, locality, who);
+			else
+				(void) snprintf(who_buf, sizeof (who_buf),
+				    "%c%c$", base_type, locality);
+			(void) nvlist_add_boolean(top_nvl, who_buf);
+
+			if (who != NULL)
+				(void) snprintf(who_buf, sizeof (who_buf),
+				    "%c%c$%s", set_type, locality, who);
+			else
+				(void) snprintf(who_buf, sizeof (who_buf),
+				    "%c%c$", set_type, locality);
+			(void) nvlist_add_boolean(top_nvl, who_buf);
+		}
+	}
+}
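+
+/*
+ * Illustrative example (uid 1001 is hypothetical, and 'u'/'l'/'d' are
+ * the character codes assumed from zfs_deleg.h): "zfs allow -u 1001
+ * snapshot,mount tank" stores nvlist keys of the form
+ * "<who_type><locality>$<who>", here "ul$1001" (local) and "ud$1001"
+ * (descendent), each mapping to an nvlist with the boolean entries
+ * "snapshot" and "mount".
+ */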
+
+static int
+construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
+{
+	if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	if (opts->set) {
+		store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local,
+		    opts->descend, opts->who, opts->perms, *nvlp);
+	} else if (opts->create) {
+		store_allow_perm(ZFS_DELEG_CREATE, opts->local,
+		    opts->descend, NULL, opts->perms, *nvlp);
+	} else if (opts->everyone) {
+		store_allow_perm(ZFS_DELEG_EVERYONE, opts->local,
+		    opts->descend, NULL, opts->perms, *nvlp);
+	} else {
+		char *curr = opts->who;
+		char *end = curr + strlen(curr);
+
+		while (curr < end) {
+			const char *who;
+			zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN;
+			char *endch;
+			char *delim = strchr(curr, ',');
+			char errbuf[256];
+			char id[64];
+			struct passwd *p = NULL;
+			struct group *g = NULL;
+
+			uid_t rid;
+			if (delim == NULL)
+				delim = end;
+			else
+				*delim = '\0';
+
+			rid = (uid_t)strtol(curr, &endch, 0);
+			if (opts->user) {
+				who_type = ZFS_DELEG_USER;
+				if (*endch != '\0')
+					p = getpwnam(curr);
+				else
+					p = getpwuid(rid);
+
+				if (p != NULL)
+					rid = p->pw_uid;
+				else {
+					(void) snprintf(errbuf, 256, gettext(
+					    "invalid user %s"), curr);
+					allow_usage(un, B_TRUE, errbuf);
+				}
+			} else if (opts->group) {
+				who_type = ZFS_DELEG_GROUP;
+				if (*endch != '\0')
+					g = getgrnam(curr);
+				else
+					g = getgrgid(rid);
+
+				if (g != NULL)
+					rid = g->gr_gid;
+				else {
+					(void) snprintf(errbuf, 256, gettext(
+					    "invalid group %s"),  curr);
+					allow_usage(un, B_TRUE, errbuf);
+				}
+			} else {
+				if (*endch != '\0') {
+					p = getpwnam(curr);
+				} else {
+					p = getpwuid(rid);
+				}
+
+				if (p == NULL) {
+					if (*endch != '\0') {
+						g = getgrnam(curr);
+					} else {
+						g = getgrgid(rid);
+					}
+				}
+
+				if (p != NULL) {
+					who_type = ZFS_DELEG_USER;
+					rid = p->pw_uid;
+				} else if (g != NULL) {
+					who_type = ZFS_DELEG_GROUP;
+					rid = g->gr_gid;
+				} else {
+					(void) snprintf(errbuf, 256, gettext(
+					    "invalid user/group %s"), curr);
+					allow_usage(un, B_TRUE, errbuf);
+				}
+			}
+
+			(void) sprintf(id, "%u", rid);
+			who = id;
+
+			store_allow_perm(who_type, opts->local,
+			    opts->descend, who, opts->perms, *nvlp);
+			curr = delim + 1;
+		}
+	}
+
+	return (0);
+}
+
+static void
+print_set_creat_perms(uu_avl_t *who_avl)
+{
+	const char *sc_title[] = {
+		gettext("Permission sets:\n"),
+		gettext("Create time permissions:\n"),
+		NULL
+	};
+	const char **title_ptr = sc_title;
+	who_perm_node_t *who_node = NULL;
+	int prev_weight = -1;
+
+	for (who_node = uu_avl_first(who_avl); who_node != NULL;
+	    who_node = uu_avl_next(who_avl, who_node)) {
+		uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
+		zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
+		const char *who_name = who_node->who_perm.who_name;
+		int weight = who_type2weight(who_type);
+		boolean_t first = B_TRUE;
+		deleg_perm_node_t *deleg_node;
+
+		if (prev_weight != weight) {
+			(void) printf("%s", *title_ptr++);
+			prev_weight = weight;
+		}
+
+		if (who_name == NULL || strnlen(who_name, 1) == 0)
+			(void) printf("\t");
+		else
+			(void) printf("\t%s ", who_name);
+
+		for (deleg_node = uu_avl_first(avl); deleg_node != NULL;
+		    deleg_node = uu_avl_next(avl, deleg_node)) {
+			if (first) {
+				(void) printf("%s",
+				    deleg_node->dpn_perm.dp_name);
+				first = B_FALSE;
+			} else
+				(void) printf(",%s",
+				    deleg_node->dpn_perm.dp_name);
+		}
+
+		(void) printf("\n");
+	}
+}
+
+static void
+print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend,
+    const char *title)
+{
+	who_perm_node_t *who_node = NULL;
+	boolean_t prt_title = B_TRUE;
+	uu_avl_walk_t *walk;
+
+	if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL)
+		nomem();
+
+	while ((who_node = uu_avl_walk_next(walk)) != NULL) {
+		const char *who_name = who_node->who_perm.who_name;
+		const char *nice_who_name = who_node->who_perm.who_ug_name;
+		uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl;
+		zfs_deleg_who_type_t who_type = who_node->who_perm.who_type;
+		char delim = ' ';
+		deleg_perm_node_t *deleg_node;
+		boolean_t prt_who = B_TRUE;
+
+		for (deleg_node = uu_avl_first(avl);
+		    deleg_node != NULL;
+		    deleg_node = uu_avl_next(avl, deleg_node)) {
+			if (local != deleg_node->dpn_perm.dp_local ||
+			    descend != deleg_node->dpn_perm.dp_descend)
+				continue;
+
+			if (prt_who) {
+				const char *who = NULL;
+				if (prt_title) {
+					prt_title = B_FALSE;
+					(void) printf("%s", title);
+				}
+
+				switch (who_type) {
+				case ZFS_DELEG_USER_SETS:
+				case ZFS_DELEG_USER:
+					who = gettext("user");
+					if (nice_who_name)
+						who_name  = nice_who_name;
+					break;
+				case ZFS_DELEG_GROUP_SETS:
+				case ZFS_DELEG_GROUP:
+					who = gettext("group");
+					if (nice_who_name)
+						who_name  = nice_who_name;
+					break;
+				case ZFS_DELEG_EVERYONE_SETS:
+				case ZFS_DELEG_EVERYONE:
+					who = gettext("everyone");
+					who_name = NULL;
+					break;
+
+				default:
+					assert(who != NULL);
+				}
+
+				prt_who = B_FALSE;
+				if (who_name == NULL)
+					(void) printf("\t%s", who);
+				else
+					(void) printf("\t%s %s", who, who_name);
+			}
+
+			(void) printf("%c%s", delim,
+			    deleg_node->dpn_perm.dp_name);
+			delim = ',';
+		}
+
+		if (!prt_who)
+			(void) printf("\n");
+	}
+
+	uu_avl_walk_end(walk);
+}
+
+static void
+print_fs_perms(fs_perm_set_t *fspset)
+{
+	fs_perm_node_t *node = NULL;
+	char buf[MAXNAMELEN + 32];
+	const char *dsname = buf;
+
+	for (node = uu_list_first(fspset->fsps_list); node != NULL;
+	    node = uu_list_next(fspset->fsps_list, node)) {
+		uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl;
+		uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl;
+		int left = 0;
+
+		(void) snprintf(buf, sizeof (buf),
+		    gettext("---- Permissions on %s "),
+		    node->fspn_fsperm.fsp_name);
+		(void) printf("%s", dsname);
+		left = 70 - strlen(buf);
+		while (left-- > 0)
+			(void) printf("-");
+		(void) printf("\n");
+
+		print_set_creat_perms(sc_avl);
+		print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE,
+		    gettext("Local permissions:\n"));
+		print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE,
+		    gettext("Descendent permissions:\n"));
+		print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE,
+		    gettext("Local+Descendent permissions:\n"));
+	}
+}
+
+static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL };
+
+struct deleg_perms {
+	boolean_t un;
+	nvlist_t *nvl;
+};
+
+static int
+set_deleg_perms(zfs_handle_t *zhp, void *data)
+{
+	struct deleg_perms *perms = (struct deleg_perms *)data;
+	zfs_type_t zfs_type = zfs_get_type(zhp);
+
+	if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME)
+		return (0);
+
+	return (zfs_set_fsacl(zhp, perms->un, perms->nvl));
+}
+
+static int
+zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un)
+{
+	zfs_handle_t *zhp;
+	nvlist_t *perm_nvl = NULL;
+	nvlist_t *update_perm_nvl = NULL;
+	int error = 1;
+	int c;
+	struct allow_opts opts = { 0 };
+
+	const char *optstr = un ? "ldugecsrh" : "ldugecsh";
+
+	/* check opts */
+	while ((c = getopt(argc, argv, optstr)) != -1) {
+		switch (c) {
+		case 'l':
+			opts.local = B_TRUE;
+			break;
+		case 'd':
+			opts.descend = B_TRUE;
+			break;
+		case 'u':
+			opts.user = B_TRUE;
+			break;
+		case 'g':
+			opts.group = B_TRUE;
+			break;
+		case 'e':
+			opts.everyone = B_TRUE;
+			break;
+		case 's':
+			opts.set = B_TRUE;
+			break;
+		case 'c':
+			opts.create = B_TRUE;
+			break;
+		case 'r':
+			opts.recursive = B_TRUE;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case 'h':
+			opts.prt_usage = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check arguments */
+	parse_allow_args(argc, argv, un, &opts);
+
+	/* try to open the dataset */
+	if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_VOLUME)) == NULL) {
+		(void) fprintf(stderr, "Failed to open dataset: %s\n",
+		    opts.dataset);
+		return (-1);
+	}
+
+	if (zfs_get_fsacl(zhp, &perm_nvl) != 0)
+		goto cleanup2;
+
+	fs_perm_set_init(&fs_perm_set);
+	if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) {
+		(void) fprintf(stderr, "Failed to parse fsacl permissions\n");
+		goto cleanup1;
+	}
+
+	if (opts.prt_perms)
+		print_fs_perms(&fs_perm_set);
+	else {
+		(void) construct_fsacl_list(un, &opts, &update_perm_nvl);
+		if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0)
+			goto cleanup0;
+
+		if (un && opts.recursive) {
+			struct deleg_perms data = { un, update_perm_nvl };
+			if (zfs_iter_filesystems(zhp, set_deleg_perms,
+			    &data) != 0)
+				goto cleanup0;
+		}
+	}
+
+	error = 0;
+
+cleanup0:
+	nvlist_free(perm_nvl);
+	nvlist_free(update_perm_nvl);
+cleanup1:
+	fs_perm_set_fini(&fs_perm_set);
+cleanup2:
+	zfs_close(zhp);
+
+	return (error);
+}
+
+static int
+zfs_do_allow(int argc, char **argv)
+{
+	return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE));
+}
+
+static int
+zfs_do_unallow(int argc, char **argv)
+{
+	return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE));
+}
+
+static int
+zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding)
+{
+	int errors = 0;
+	int i;
+	const char *tag;
+	boolean_t recursive = B_FALSE;
+	const char *opts = holding ? "rt" : "r";
+	int c;
+
+	/* check options */
+	while ((c = getopt(argc, argv, opts)) != -1) {
+		switch (c) {
+		case 'r':
+			recursive = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 2)
+		usage(B_FALSE);
+
+	tag = argv[0];
+	--argc;
+	++argv;
+
+	if (holding && tag[0] == '.') {
+		/* tags starting with '.' are reserved for libzfs */
+		(void) fprintf(stderr, gettext("tag may not start with '.'\n"));
+		usage(B_FALSE);
+	}
+
+	for (i = 0; i < argc; ++i) {
+		zfs_handle_t *zhp;
+		char parent[ZFS_MAX_DATASET_NAME_LEN];
+		const char *delim;
+		char *path = argv[i];
+
+		delim = strchr(path, '@');
+		if (delim == NULL) {
+			(void) fprintf(stderr,
+			    gettext("'%s' is not a snapshot\n"), path);
+			++errors;
+			continue;
+		}
+		(void) strncpy(parent, path, delim - path);
+		parent[delim - path] = '\0';
+
+		zhp = zfs_open(g_zfs, parent,
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (zhp == NULL) {
+			++errors;
+			continue;
+		}
+		if (holding) {
+			if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0)
+				++errors;
+		} else {
+			if (zfs_release(zhp, delim+1, tag, recursive) != 0)
+				++errors;
+		}
+		zfs_close(zhp);
+	}
+
+	return (errors != 0);
+}
+
+/*
+ * zfs hold [-r] [-t] <tag> <snap> ...
+ *
+ *	-r	Recursively hold
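+ *	-t	Accepted but ignored here (editorial note: the option
+ *		remains in the getopt string above yet has no case)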
+ *
+ * Apply a user-hold with the given tag to the list of snapshots.
+ */
+static int
+zfs_do_hold(int argc, char **argv)
+{
+	return (zfs_do_hold_rele_impl(argc, argv, B_TRUE));
+}
+
+/*
+ * zfs release [-r] <tag> <snap> ...
+ *
+ *	-r	Recursively release
+ *
+ * Release a user-hold with the given tag from the list of snapshots.
+ */
+static int
+zfs_do_release(int argc, char **argv)
+{
+	return (zfs_do_hold_rele_impl(argc, argv, B_FALSE));
+}
+
+typedef struct holds_cbdata {
+	boolean_t	cb_recursive;
+	const char	*cb_snapname;
+	nvlist_t	**cb_nvlp;
+	size_t		cb_max_namelen;
+	size_t		cb_max_taglen;
+} holds_cbdata_t;
+
+#define	STRFTIME_FMT_STR "%a %b %e %k:%M %Y"
+#define	DATETIME_BUF_LEN (32)
+/*
+ * Print the collected holds, one NAME/TAG/TIMESTAMP row per hold.
+ */
+static void
+print_holds(boolean_t scripted, boolean_t literal, size_t nwidth,
+    size_t tagwidth, nvlist_t *nvl)
+{
+	int i;
+	nvpair_t *nvp = NULL;
+	char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" };
+	const char *col;
+
+	if (!scripted) {
+		for (i = 0; i < 3; i++) {
+			col = gettext(hdr_cols[i]);
+			if (i < 2)
+				(void) printf("%-*s  ", i ? tagwidth : nwidth,
+				    col);
+			else
+				(void) printf("%s\n", col);
+		}
+	}
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		char *zname = nvpair_name(nvp);
+		nvlist_t *nvl2;
+		nvpair_t *nvp2 = NULL;
+		(void) nvpair_value_nvlist(nvp, &nvl2);
+		while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) {
+			char tsbuf[DATETIME_BUF_LEN];
+			char *tagname = nvpair_name(nvp2);
+			uint64_t val = 0;
+			time_t time;
+			struct tm t;
+			char sep = scripted ? '\t' : ' ';
+			size_t sepnum = scripted ? 1 : 2;
+
+			(void) nvpair_value_uint64(nvp2, &val);
+			if (literal)
+				(void) snprintf(tsbuf, DATETIME_BUF_LEN,
+				    "%llu", (u_longlong_t)val);
+			else {
+				time = (time_t)val;
+				(void) localtime_r(&time, &t);
+				(void) strftime(tsbuf, DATETIME_BUF_LEN,
+				    gettext(STRFTIME_FMT_STR), &t);
+			}
+
+			(void) printf("%-*s%*c%-*s%*c%s\n", nwidth, zname,
+			    sepnum, sep, tagwidth, tagname, sepnum, sep, tsbuf);
+		}
+	}
+}
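+
+/*
+ * Example output (illustrative values):
+ *
+ *	NAME             TAG   TIMESTAMP
+ *	tank/fs@snap1    keep  Tue Oct 10 13:02 2017
+ */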
+
+/*
+ * Callback invoked per dataset or snapshot: collect its holds into the
+ * top-level nvlist and track the column widths print_holds() needs.
+ */
+static int
+holds_callback(zfs_handle_t *zhp, void *data)
+{
+	holds_cbdata_t *cbp = data;
+	nvlist_t *top_nvl = *cbp->cb_nvlp;
+	nvlist_t *nvl = NULL;
+	nvpair_t *nvp = NULL;
+	const char *zname = zfs_get_name(zhp);
+	size_t znamelen = strlen(zname);
+
+	if (cbp->cb_recursive && cbp->cb_snapname != NULL) {
+		const char *snapname;
+		char *delim  = strchr(zname, '@');
+		if (delim == NULL)
+			return (0);
+
+		snapname = delim + 1;
+		if (strcmp(cbp->cb_snapname, snapname))
+			return (0);
+	}
+
+	if (zfs_get_holds(zhp, &nvl) != 0)
+		return (-1);
+
+	if (znamelen > cbp->cb_max_namelen)
+		cbp->cb_max_namelen  = znamelen;
+
+	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+		const char *tag = nvpair_name(nvp);
+		size_t taglen = strlen(tag);
+		if (taglen > cbp->cb_max_taglen)
+			cbp->cb_max_taglen  = taglen;
+	}
+
+	return (nvlist_add_nvlist(top_nvl, zname, nvl));
+}
+
+/*
+ * zfs holds [-Hp] [-r | -d max] <dataset|snap> ...
+ *
+ *	-H	Suppress header output
+ *	-p	Output literal values
+ *	-r	Recursively search for holds
+ *	-d max	Limit depth of recursive search
+ */
+static int
+zfs_do_holds(int argc, char **argv)
+{
+	int errors = 0;
+	int c;
+	int i;
+	boolean_t scripted = B_FALSE;
+	boolean_t literal = B_FALSE;
+	boolean_t recursive = B_FALSE;
+	const char *opts = "d:rHp";
+	nvlist_t *nvl;
+
+	int types = ZFS_TYPE_SNAPSHOT;
+	holds_cbdata_t cb = { 0 };
+
+	int limit = 0;
+	int ret = 0;
+	int flags = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, opts)) != -1) {
+		switch (c) {
+		case 'd':
+			limit = parse_depth(optarg, &flags);
+			recursive = B_TRUE;
+			break;
+		case 'r':
+			recursive = B_TRUE;
+			break;
+		case 'H':
+			scripted = B_TRUE;
+			break;
+		case 'p':
+			literal = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	if (recursive) {
+		types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
+		flags |= ZFS_ITER_RECURSE;
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1)
+		usage(B_FALSE);
+
+	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+		nomem();
+
+	for (i = 0; i < argc; ++i) {
+		char *snapshot = argv[i];
+		const char *delim;
+		const char *snapname = NULL;
+
+		delim = strchr(snapshot, '@');
+		if (delim != NULL) {
+			snapname = delim + 1;
+			if (recursive)
+				snapshot[delim - snapshot] = '\0';
+		}
+
+		cb.cb_recursive = recursive;
+		cb.cb_snapname = snapname;
+		cb.cb_nvlp = &nvl;
+
+		/*
+		 *  1. collect holds data, set format options
+		 */
+		ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit,
+		    holds_callback, &cb);
+		if (ret != 0)
+			++errors;
+	}
+
+	/*
+	 *  2. print holds data
+	 */
+	print_holds(scripted, literal, cb.cb_max_namelen, cb.cb_max_taglen,
+	    nvl);
+
+	if (nvlist_empty(nvl))
+		(void) printf(gettext("no datasets available\n"));
+
+	nvlist_free(nvl);
+
+	return (0 != errors);
+}
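+
+/*
+ * Illustrative example: "zfs holds -r tank@daily" truncates the
+ * argument to "tank" above, recurses over its descendents, and
+ * holds_callback() keeps only snapshots whose name after the '@' is
+ * exactly "daily".
+ */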
+
+#define	CHECK_SPINNER 30
+#define	SPINNER_TIME 3		/* seconds */
+#define	MOUNT_TIME 5		/* seconds */
+
+static int
+get_one_dataset(zfs_handle_t *zhp, void *data)
+{
+	static char *spin[] = { "-", "\\", "|", "/" };
+	static int spinval = 0;
+	static int spincheck = 0;
 	static time_t last_spin_time = (time_t)0;
-	get_all_cbdata_t *cbp = data;
+	get_all_cb_t *cbp = data;
 	zfs_type_t type = zfs_get_type(zhp);
 
 	if (cbp->cb_verbose) {
 		if (--spincheck < 0) {
 			time_t now = time(NULL);
 			if (last_spin_time + SPINNER_TIME < now) {
-				(void) printf("\b%c", spin[spinval++ % 4]);
-				(void) fflush(stdout);
+				update_progress(spin[spinval++ % 4]);
 				last_spin_time = now;
 			}
 			spincheck = CHECK_SPINNER;
@@ -2904,8 +5793,7 @@ get_one_dataset(zfs_handle_t *zhp, void 
 	/*
 	 * Iterate over any nested datasets.
 	 */
-	if (type == ZFS_TYPE_FILESYSTEM &&
-	    zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
+	if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) {
 		zfs_close(zhp);
 		return (1);
 	}
@@ -2913,83 +5801,32 @@ get_one_dataset(zfs_handle_t *zhp, void 
 	/*
 	 * Skip any datasets whose type does not match.
 	 */
-	if ((type & cbp->cb_types) == 0) {
+	if ((type & ZFS_TYPE_FILESYSTEM) == 0) {
 		zfs_close(zhp);
 		return (0);
 	}
-
-	if (cbp->cb_alloc == cbp->cb_used) {
-		zfs_handle_t **handles;
-
-		if (cbp->cb_alloc == 0)
-			cbp->cb_alloc = 64;
-		else
-			cbp->cb_alloc *= 2;
-
-		handles = safe_malloc(cbp->cb_alloc * sizeof (void *));
-
-		if (cbp->cb_handles) {
-			bcopy(cbp->cb_handles, handles,
-			    cbp->cb_used * sizeof (void *));
-			free(cbp->cb_handles);
-		}
-
-		cbp->cb_handles = handles;
-	}
-
-	cbp->cb_handles[cbp->cb_used++] = zhp;
+	libzfs_add_handle(cbp, zhp);
+	assert(cbp->cb_used <= cbp->cb_alloc);
 
 	return (0);
 }
 
 static void
-get_all_datasets(uint_t types, zfs_handle_t ***dslist, size_t *count,
-    boolean_t verbose)
+get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose)
 {
-	get_all_cbdata_t cb = { 0 };
-	cb.cb_types = types;
+	get_all_cb_t cb = { 0 };
 	cb.cb_verbose = verbose;
+	cb.cb_getone = get_one_dataset;
 
-	if (verbose) {
-		(void) printf("%s: *", gettext("Reading ZFS config"));
-		(void) fflush(stdout);
-	}
-
+	if (verbose)
+		set_progress_header(gettext("Reading ZFS config"));
 	(void) zfs_iter_root(g_zfs, get_one_dataset, &cb);
 
 	*dslist = cb.cb_handles;
 	*count = cb.cb_used;
 
-	if (verbose) {
-		(void) printf("\b%s\n", gettext("done."));
-	}
-}
-
-static int
-dataset_cmp(const void *a, const void *b)
-{
-	zfs_handle_t **za = (zfs_handle_t **)a;
-	zfs_handle_t **zb = (zfs_handle_t **)b;
-	char mounta[MAXPATHLEN];
-	char mountb[MAXPATHLEN];
-	boolean_t gota, gotb;
-
-	if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
-		verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
-		    sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
-	if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
-		verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
-		    sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
-
-	if (gota && gotb)
-		return (strcmp(mounta, mountb));
-
-	if (gota)
-		return (-1);
-	if (gotb)
-		return (1);
-
-	return (strcmp(zfs_get_name(a), zfs_get_name(b)));
+	if (verbose)
+		finish_progress(gettext("done."));
 }
 
 /*
@@ -3013,216 +5850,197 @@ share_mount_one(zfs_handle_t *zhp, int o
 	const char *cmdname = op == OP_SHARE ? "share" : "mount";
 	struct mnttab mnt;
 	uint64_t zoned, canmount;
-	zfs_type_t type = zfs_get_type(zhp);
 	boolean_t shared_nfs, shared_smb;
 
-	assert(type & (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME));
-
-	if (type == ZFS_TYPE_FILESYSTEM) {
-		/*
-		 * Check to make sure we can mount/share this dataset.  If we
-		 * are in the global zone and the filesystem is exported to a
-		 * local zone, or if we are in a local zone and the
-		 * filesystem is not exported, then it is an error.
-		 */
-		zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
-
-		if (zoned && getzoneid() == GLOBAL_ZONEID) {
-			if (!explicit)
-				return (0);
-
-			(void) fprintf(stderr, gettext("cannot %s '%s': "
-			    "dataset is exported to a local zone\n"), cmdname,
-			    zfs_get_name(zhp));
-			return (1);
-
-		} else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
-			if (!explicit)
-				return (0);
-
-			(void) fprintf(stderr, gettext("cannot %s '%s': "
-			    "permission denied\n"), cmdname,
-			    zfs_get_name(zhp));
-			return (1);
-		}
-
-		/*
-		 * Ignore any filesystems which don't apply to us. This
-		 * includes those with a legacy mountpoint, or those with
-		 * legacy share options.
-		 */
-		verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
-		    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
-		verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
-		    sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
-		verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
-		    sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
-
-		if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
-		    strcmp(smbshareopts, "off") == 0) {
-			if (!explicit)
-				return (0);
-
-			(void) fprintf(stderr, gettext("cannot share '%s': "
-			    "legacy share\n"), zfs_get_name(zhp));
-			(void) fprintf(stderr, gettext("use share(1M) to "
-			    "share this filesystem, or set "
-			    "sharenfs property on\n"));
-			return (1);
-		}
+	assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM);
 
-		/*
-		 * We cannot share or mount legacy filesystems. If the
-		 * shareopts is non-legacy but the mountpoint is legacy, we
-		 * treat it as a legacy share.
-		 */
-		if (strcmp(mountpoint, "legacy") == 0) {
-			if (!explicit)
-				return (0);
+	/*
+	 * Check to make sure we can mount/share this dataset.  If we
+	 * are in the global zone and the filesystem is exported to a
+	 * local zone, or if we are in a local zone and the
+	 * filesystem is not exported, then it is an error.
+	 */
+	zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
 
-			(void) fprintf(stderr, gettext("cannot %s '%s': "
-			    "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
-			(void) fprintf(stderr, gettext("use %s(1M) to "
-			    "%s this filesystem\n"), cmdname, cmdname);
-			return (1);
-		}
+	if (zoned && getzoneid() == GLOBAL_ZONEID) {
+		if (!explicit)
+			return (0);
 
-		if (strcmp(mountpoint, "none") == 0) {
-			if (!explicit)
-				return (0);
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "dataset is exported to a local zone\n"), cmdname,
+		    zfs_get_name(zhp));
+		return (1);
 
-			(void) fprintf(stderr, gettext("cannot %s '%s': no "
-			    "mountpoint set\n"), cmdname, zfs_get_name(zhp));
-			return (1);
-		}
+	} else if (!zoned && getzoneid() != GLOBAL_ZONEID) {
+		if (!explicit)
+			return (0);
 
-		/*
-		 * canmount	explicit	outcome
-		 * on		no		pass through
-		 * on		yes		pass through
-		 * off		no		return 0
-		 * off		yes		display error, return 1
-		 * noauto	no		return 0
-		 * noauto	yes		pass through
-		 */
-		canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
-		if (canmount == ZFS_CANMOUNT_OFF) {
-			if (!explicit)
-				return (0);
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "permission denied\n"), cmdname,
+		    zfs_get_name(zhp));
+		return (1);
+	}
 
-			(void) fprintf(stderr, gettext("cannot %s '%s': "
-			    "'canmount' property is set to 'off'\n"), cmdname,
-			    zfs_get_name(zhp));
-			return (1);
-		} else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) {
+	/*
+	 * Ignore any filesystems which don't apply to us. This
+	 * includes those with a legacy mountpoint, or those with
+	 * legacy share options.
+	 */
+	verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
+	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
+	verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts,
+	    sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
+	verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts,
+	    sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0);
+
+	if (op == OP_SHARE && strcmp(shareopts, "off") == 0 &&
+	    strcmp(smbshareopts, "off") == 0) {
+		if (!explicit)
 			return (0);
-		}
-
-		/*
-		 * At this point, we have verified that the mountpoint and/or
-		 * shareopts are appropriate for auto management. If the
-		 * filesystem is already mounted or shared, return (failing
-		 * for explicit requests); otherwise mount or share the
-		 * filesystem.
-		 */
-		switch (op) {
-		case OP_SHARE:
 
-			shared_nfs = zfs_is_shared_nfs(zhp, NULL);
-			shared_smb = zfs_is_shared_smb(zhp, NULL);
+		(void) fprintf(stderr, gettext("cannot share '%s': "
+		    "legacy share\n"), zfs_get_name(zhp));
+		(void) fprintf(stderr, gettext("to "
+		    "share this filesystem, set the "
+		    "sharenfs property on\n"));
+		return (1);
+	}
 
-			if (shared_nfs && shared_smb ||
-			    (shared_nfs && strcmp(shareopts, "on") == 0 &&
-			    strcmp(smbshareopts, "off") == 0) ||
-			    (shared_smb && strcmp(smbshareopts, "on") == 0 &&
-			    strcmp(shareopts, "off") == 0)) {
-				if (!explicit)
-					return (0);
+	/*
+	 * We cannot share or mount legacy filesystems. If the
+	 * shareopts is non-legacy but the mountpoint is legacy, we
+	 * treat it as a legacy share.
+	 */
+	if (strcmp(mountpoint, "legacy") == 0) {
+		if (!explicit)
+			return (0);
 
-				(void) fprintf(stderr, gettext("cannot share "
-				    "'%s': filesystem already shared\n"),
-				    zfs_get_name(zhp));
-				return (1);
-			}
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "legacy mountpoint\n"), cmdname, zfs_get_name(zhp));
+		(void) fprintf(stderr, gettext("use %s(8) to "
+		    "%s this filesystem\n"), cmdname, cmdname);
+		return (1);
+	}
 
-			if (!zfs_is_mounted(zhp, NULL) &&
-			    zfs_mount(zhp, NULL, 0) != 0)
-				return (1);
+	if (strcmp(mountpoint, "none") == 0) {
+		if (!explicit)
+			return (0);
 
-			if (protocol == NULL) {
-				if (zfs_shareall(zhp) != 0)
-					return (1);
-			} else if (strcmp(protocol, "nfs") == 0) {
-				if (zfs_share_nfs(zhp))
-					return (1);
-			} else if (strcmp(protocol, "smb") == 0) {
-				if (zfs_share_smb(zhp))
-					return (1);
-			} else {
-				(void) fprintf(stderr, gettext("cannot share "
-				    "'%s': invalid share type '%s' "
-				    "specified\n"),
-				    zfs_get_name(zhp), protocol);
-				return (1);
-			}
+		(void) fprintf(stderr, gettext("cannot %s '%s': no "
+		    "mountpoint set\n"), cmdname, zfs_get_name(zhp));
+		return (1);
+	}
 
-			break;
+	/*
+	 * canmount	explicit	outcome
+	 * on		no		pass through
+	 * on		yes		pass through
+	 * off		no		return 0
+	 * off		yes		display error, return 1
+	 * noauto	no		return 0
+	 * noauto	yes		pass through
+	 */
+	canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT);
+	if (canmount == ZFS_CANMOUNT_OFF) {
+		if (!explicit)
+			return (0);
 
-		case OP_MOUNT:
-			if (options == NULL)
-				mnt.mnt_mntopts = "";
-			else
-				mnt.mnt_mntopts = (char *)options;
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "'canmount' property is set to 'off'\n"), cmdname,
+		    zfs_get_name(zhp));
+		return (1);
+	} else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) {
+		return (0);
+	}
 
-			if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
-			    zfs_is_mounted(zhp, NULL)) {
-				if (!explicit)
-					return (0);
+	/*
+	 * If this filesystem is inconsistent and has a receive resume
+	 * token, we cannot mount it.
+	 */
+	if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) &&
+	    zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+	    NULL, 0, NULL, NULL, 0, B_TRUE) == 0) {
+		if (!explicit)
+			return (0);
 
-				(void) fprintf(stderr, gettext("cannot mount "
-				    "'%s': filesystem already mounted\n"),
-				    zfs_get_name(zhp));
-				return (1);
-			}
+		(void) fprintf(stderr, gettext("cannot %s '%s': "
+		    "Contains partially-completed state from "
+		    "\"zfs receive -s\", which can be resumed with "
+		    "\"zfs send -t\"\n"),
+		    cmdname, zfs_get_name(zhp));
+		return (1);
+	}
 
-			if (zfs_mount(zhp, options, flags) != 0)
-				return (1);
-			break;
-		}
-	} else {
-		assert(op == OP_SHARE);
+	/*
+	 * At this point, we have verified that the mountpoint and/or
+	 * shareopts are appropriate for auto management. If the
+	 * filesystem is already mounted or shared, return (failing
+	 * for explicit requests); otherwise mount or share the
+	 * filesystem.
+	 */
+	switch (op) {
+	case OP_SHARE:
 
-		/*
-		 * Ignore any volumes that aren't shared.
-		 */
-		verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, shareopts,
-		    sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0);
+		shared_nfs = zfs_is_shared_nfs(zhp, NULL);
+		shared_smb = zfs_is_shared_smb(zhp, NULL);
 
-		if (strcmp(shareopts, "off") == 0) {
+		if ((shared_nfs && shared_smb) ||
+		    (shared_nfs && strcmp(shareopts, "on") == 0 &&
+		    strcmp(smbshareopts, "off") == 0) ||
+		    (shared_smb && strcmp(smbshareopts, "on") == 0 &&
+		    strcmp(shareopts, "off") == 0)) {
 			if (!explicit)
 				return (0);
 
-			(void) fprintf(stderr, gettext("cannot share '%s': "
-			    "'shareiscsi' property not set\n"),
+			(void) fprintf(stderr, gettext("cannot share "
+			    "'%s': filesystem already shared\n"),
 			    zfs_get_name(zhp));
-			(void) fprintf(stderr, gettext("set 'shareiscsi' "
-			    "property or use iscsitadm(1M) to share this "
-			    "volume\n"));
 			return (1);
 		}
 
-		if (zfs_is_shared_iscsi(zhp)) {
+		if (!zfs_is_mounted(zhp, NULL) &&
+		    zfs_mount(zhp, NULL, 0) != 0)
+			return (1);
+
+		if (protocol == NULL) {
+			if (zfs_shareall(zhp) != 0)
+				return (1);
+		} else if (strcmp(protocol, "nfs") == 0) {
+			if (zfs_share_nfs(zhp))
+				return (1);
+		} else if (strcmp(protocol, "smb") == 0) {
+			if (zfs_share_smb(zhp))
+				return (1);
+		} else {
+			(void) fprintf(stderr, gettext("cannot share "
+			    "'%s': invalid share type '%s' "
+			    "specified\n"),
+			    zfs_get_name(zhp), protocol);
+			return (1);
+		}
+
+		break;
+
+	case OP_MOUNT:
+		if (options == NULL)
+			mnt.mnt_mntopts = "";
+		else
+			mnt.mnt_mntopts = (char *)options;
+
+		if (!hasmntopt(&mnt, MNTOPT_REMOUNT) &&
+		    zfs_is_mounted(zhp, NULL)) {
 			if (!explicit)
 				return (0);
 
-			(void) fprintf(stderr, gettext("cannot share "
-			    "'%s': volume already shared\n"),
+			(void) fprintf(stderr, gettext("cannot mount "
+			    "'%s': filesystem already mounted\n"),
 			    zfs_get_name(zhp));
 			return (1);
 		}
 
-		if (zfs_share_iscsi(zhp) != 0)
+		if (zfs_mount(zhp, options, flags) != 0)
 			return (1);
+		break;
 	}
 
 	return (0);
@@ -3234,19 +6052,16 @@ share_mount_one(zfs_handle_t *zhp, int o
 static void
 report_mount_progress(int current, int total)
 {
-	static int len;
-	static char *reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"
-	    "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b";
-	static time_t last_progress_time;
+	static time_t last_progress_time = 0;
 	time_t now = time(NULL);
+	char info[32];
 
 	/* report 1..n instead of 0..n-1 */
 	++current;
 
 	/* display header if we're here for the first time */
 	if (current == 1) {
-		(void) printf(gettext("Mounting ZFS filesystems: "));
-		len = 0;
+		set_progress_header(gettext("Mounting ZFS filesystems"));
 	} else if (current != total && last_progress_time + MOUNT_TIME >= now) {
 		/* too soon to report again */
 		return;
@@ -3254,13 +6069,12 @@ report_mount_progress(int current, int t
 
 	last_progress_time = now;
 
-	/* back up to prepare for overwriting */
-	if (len)
-		(void) printf("%*.*s", len, len, reverse);
-
-	/* We put a newline at the end if this is the last one.  */
-	len = printf("(%d/%d)%s", current, total, current == total ? "\n" : "");
-	(void) fflush(stdout);
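+	/* "(%d/%d)" expands to at most 25 characters plus NUL, within info[32] */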
+	(void) sprintf(info, "(%d/%d)", current, total);
+
+	if (current == total)
+		finish_progress(info);
+	else
+		update_progress(info);
 }
 
 static void
@@ -3289,7 +6103,7 @@ share_mount(int op, int argc, char **arg
 	boolean_t verbose = B_FALSE;
 	int c, ret = 0;
 	char *options = NULL;
-	int types, flags = 0;
+	int flags = 0;
 
 	/* check options */
 	while ((c = getopt(argc, argv, op == OP_MOUNT ? ":avo:O" : "a"))
@@ -3316,7 +6130,7 @@ share_mount(int op, int argc, char **arg
 			break;
 
 		case 'O':
-			flags |= MS_OVERLAY;
+			warnx("no overlay mount support on FreeBSD, ignoring");
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
@@ -3339,24 +6153,16 @@ share_mount(int op, int argc, char **arg
 		size_t i, count = 0;
 		char *protocol = NULL;
 
-		if (op == OP_MOUNT) {
-			types = ZFS_TYPE_FILESYSTEM;
-		} else if (argc > 0) {
-			if (strcmp(argv[0], "nfs") == 0 ||
-			    strcmp(argv[0], "smb") == 0) {
-				types = ZFS_TYPE_FILESYSTEM;
-			} else if (strcmp(argv[0], "iscsi") == 0) {
-				types = ZFS_TYPE_VOLUME;
-			} else {
+		if (op == OP_SHARE && argc > 0) {
+			if (strcmp(argv[0], "nfs") != 0 &&
+			    strcmp(argv[0], "smb") != 0) {
 				(void) fprintf(stderr, gettext("share type "
-				    "must be 'nfs', 'smb' or 'iscsi'\n"));
+				    "must be 'nfs' or 'smb'\n"));
 				usage(B_FALSE);
 			}
 			protocol = argv[0];
 			argc--;
 			argv++;
-		} else {
-			types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME;
 		}
 
 		if (argc != 0) {
@@ -3364,12 +6170,13 @@ share_mount(int op, int argc, char **arg
 			usage(B_FALSE);
 		}
 
-		get_all_datasets(types, &dslist, &count, verbose);
+		start_progress_timer();
+		get_all_datasets(&dslist, &count, verbose);
 
 		if (count == 0)
 			return (0);
 
-		qsort(dslist, count, sizeof (void *), dataset_cmp);
+		qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp);
 
 		for (i = 0; i < count; i++) {
 			if (verbose)
@@ -3383,8 +6190,7 @@ share_mount(int op, int argc, char **arg
 
 		free(dslist);
 	} else if (argc == 0) {
-		struct statvfs *sfs;
-		int i, n;
+		struct mnttab entry;
 
 		if ((op == OP_SHARE) || (options != NULL)) {
 			(void) fprintf(stderr, gettext("missing filesystem "
@@ -3397,33 +6203,27 @@ share_mount(int op, int argc, char **arg
 		 * display any active ZFS mounts.  We hide any snapshots, since
 		 * they are controlled automatically.
 		 */
-                if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) {
-			fprintf(stderr, "getmntinfo(): %s\n", strerror(errno));
-                        return (0);
-		}
-		for (i = 0; i < n; i++) {
-			if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0 ||
-			    strchr(sfs[i].f_mntfromname, '@') != NULL)
+		rewind(mnttab_file);
+		while (getmntent(mnttab_file, &entry) == 0) {
+			if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 ||
+			    strchr(entry.mnt_special, '@') != NULL)
 				continue;
 
-			(void) printf("%-30s  %s\n", sfs[i].f_mntfromname,
-			    sfs[i].f_mntonname);
+			(void) printf("%-30s  %s\n", entry.mnt_special,
+			    entry.mnt_mountp);
 		}
 
 	} else {
 		zfs_handle_t *zhp;
 
-		types = ZFS_TYPE_FILESYSTEM;
-		if (op == OP_SHARE)
-			types |= ZFS_TYPE_VOLUME;
-
 		if (argc > 1) {
 			(void) fprintf(stderr,
 			    gettext("too many arguments\n"));
 			usage(B_FALSE);
 		}
 
-		if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) {
+		if ((zhp = zfs_open(g_zfs, argv[0],
+		    ZFS_TYPE_FILESYSTEM)) == NULL) {
 			ret = 1;
 		} else {
 			ret = share_mount_one(zhp, op, flags, NULL, B_TRUE,
@@ -3436,7 +6236,7 @@ share_mount(int op, int argc, char **arg
 }
 
 /*
- * zfs mount -a [nfs | iscsi]
+ * zfs mount -a [nfs]
  * zfs mount filesystem
  *
  * Mount all filesystems, or mount the given filesystem.
@@ -3448,7 +6248,7 @@ zfs_do_mount(int argc, char **argv)
 }
 
 /*
- * zfs share -a [nfs | iscsi | smb]
+ * zfs share -a [nfs | smb]
  * zfs share filesystem
  *
  * Share all filesystems, or share the given filesystem.
@@ -3484,9 +6284,9 @@ static int
 unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual)
 {
 	zfs_handle_t *zhp;
-	int ret;
+	int ret = 0;
 	struct stat64 statbuf;
-	struct mnttab entry, search = { 0 };
+	struct extmnttab entry;
 	const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount";
 	ino_t path_inode;
 
@@ -3506,9 +6306,39 @@ unshare_unmount_path(int op, char *path,
 	/*
 	 * Search for the given (major,minor) pair in the mount table.
 	 */
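+	/*
+	 * On FreeBSD and NetBSD the entry is instead filled in from
+	 * statfs(2)/statvfs(2) on the path, via statfs2mnttab() and
+	 * statvfs2mnttab() respectively.
+	 */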
+#ifdef illumos
 	rewind(mnttab_file);
-	search.mnt_mountp = path;
-	if ((ret = getmntany(mnttab_file, &entry, &search)) == 0) {
+	while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) {
+		if (entry.mnt_major == major(statbuf.st_dev) &&
+		    entry.mnt_minor == minor(statbuf.st_dev))
+			break;
+	}
+#endif
+#ifdef __FreeBSD__
+	{
+		struct statfs sfs;
+
+		if (statfs(path, &sfs) != 0) {
+			(void) fprintf(stderr, "%s: %s\n", path,
+			    strerror(errno));
+			ret = -1;
+		} else {
+			/* convert only on success; sfs is uninitialized on failure */
+			statfs2mnttab(&sfs, &entry);
+		}
+	}
+#endif
+#ifdef __NetBSD__
+	{
+		struct statvfs sfs;
+
+		if (statvfs(path, &sfs) != 0) {
+			(void) fprintf(stderr, "%s: %s\n", path,
+			    strerror(errno));
+			ret = -1;
+		} else {
+			/* convert only on success; sfs is uninitialized on failure */
+			statvfs2mnttab(&sfs, &entry);
+		}
+	}
+#endif
+	if (ret != 0) {
 		if (op == OP_SHARE) {
 			(void) fprintf(stderr, gettext("cannot %s '%s': not "
 			    "currently mounted\n"), cmdname, path);
@@ -3556,8 +6386,10 @@ unshare_unmount_path(int op, char *path,
 		    strcmp(smbshare_prop, "off") == 0) {
 			(void) fprintf(stderr, gettext("cannot unshare "
 			    "'%s': legacy share\n"), path);
+#ifdef illumos
 			(void) fprintf(stderr, gettext("use "
 			    "unshare(1M) to unshare this filesystem\n"));
+#endif
 		} else if (!zfs_is_shared(zhp)) {
 			(void) fprintf(stderr, gettext("cannot unshare '%s': "
 			    "not currently shared\n"), path);
@@ -3576,7 +6408,7 @@ unshare_unmount_path(int op, char *path,
 			(void) fprintf(stderr, gettext("cannot unmount "
 			    "'%s': legacy mountpoint\n"),
 			    zfs_get_name(zhp));
-			(void) fprintf(stderr, gettext("use umount(1M) "
+			(void) fprintf(stderr, gettext("use umount(8) "
 			    "to unmount this filesystem\n"));
 		} else {
 			ret = zfs_unmountall(zhp, flags);
@@ -3598,9 +6430,9 @@ unshare_unmount(int op, int argc, char *
 	int do_all = 0;
 	int flags = 0;
 	int ret = 0;
-	int types, c;
+	int c;
 	zfs_handle_t *zhp;
-	char nfsiscsi_mnt_prop[ZFS_MAXPROPLEN];
+	char nfs_mnt_prop[ZFS_MAXPROPLEN];
 	char sharesmb[ZFS_MAXPROPLEN];
 
 	/* check options */
@@ -3637,10 +6469,9 @@ unshare_unmount(int op, int argc, char *
 		 * the special type (dataset name), and walk the result in
 		 * reverse to make sure to get any snapshots first.
 		 */
-		struct statvfs *sfs;
-		int i, n;
+		struct mnttab entry;
 		uu_avl_pool_t *pool;
-		uu_avl_t *tree;
+		uu_avl_t *tree = NULL;
 		unshare_unmount_node_t *node;
 		uu_avl_index_t idx;
 		uu_avl_walk_t *walk;
@@ -3650,66 +6481,61 @@ unshare_unmount(int op, int argc, char *
 			usage(B_FALSE);
 		}
 
-		if ((pool = uu_avl_pool_create("unmount_pool",
+		if (((pool = uu_avl_pool_create("unmount_pool",
 		    sizeof (unshare_unmount_node_t),
 		    offsetof(unshare_unmount_node_t, un_avlnode),
-		    unshare_unmount_compare,
-		    UU_DEFAULT)) == NULL) {
-			(void) fprintf(stderr, gettext("internal error: "
-			    "out of memory\n"));
-			exit(1);
-		}
-
-		if ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) {
-			(void) fprintf(stderr, gettext("internal error: "
-			    "out of memory\n"));
-			exit(1);
-		}
+		    unshare_unmount_compare, UU_DEFAULT)) == NULL) ||
+		    ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL))
+			nomem();
 
 		rewind(mnttab_file);
-		if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) {
-			(void) fprintf(stderr, gettext("internal error: "
-			    "getmntinfo() failed\n"));
-			exit(1);
-		}
-                for (i = 0; i < n; i++) {
+		while (getmntent(mnttab_file, &entry) == 0) {
 
 			/* ignore non-ZFS entries */
-			if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0)
+			if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
 				continue;
 
 			/* ignore snapshots */
-			if (strchr(sfs[i].f_mntfromname, '@') != NULL)
+			if (strchr(entry.mnt_special, '@') != NULL)
 				continue;
 
-			if ((zhp = zfs_open(g_zfs, sfs[i].f_mntfromname,
+			if ((zhp = zfs_open(g_zfs, entry.mnt_special,
 			    ZFS_TYPE_FILESYSTEM)) == NULL) {
 				ret = 1;
 				continue;
 			}
 
+			/*
+			 * Ignore datasets that are excluded/restricted by
+			 * parent pool name.
+			 */
+			if (zpool_skip_pool(zfs_get_pool_name(zhp))) {
+				zfs_close(zhp);
+				continue;
+			}
+
 			switch (op) {
 			case OP_SHARE:
 				verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
-				    nfsiscsi_mnt_prop,
-				    sizeof (nfsiscsi_mnt_prop),
+				    nfs_mnt_prop,
+				    sizeof (nfs_mnt_prop),
 				    NULL, NULL, 0, B_FALSE) == 0);
-				if (strcmp(nfsiscsi_mnt_prop, "off") != 0)
+				if (strcmp(nfs_mnt_prop, "off") != 0)
 					break;
 				verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
-				    nfsiscsi_mnt_prop,
-				    sizeof (nfsiscsi_mnt_prop),
+				    nfs_mnt_prop,
+				    sizeof (nfs_mnt_prop),
 				    NULL, NULL, 0, B_FALSE) == 0);
-				if (strcmp(nfsiscsi_mnt_prop, "off") == 0)
+				if (strcmp(nfs_mnt_prop, "off") == 0)
 					continue;
 				break;
 			case OP_MOUNT:
 				/* Ignore legacy mounts */
 				verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT,
-				    nfsiscsi_mnt_prop,
-				    sizeof (nfsiscsi_mnt_prop),
+				    nfs_mnt_prop,
+				    sizeof (nfs_mnt_prop),
 				    NULL, NULL, 0, B_FALSE) == 0);
-				if (strcmp(nfsiscsi_mnt_prop, "legacy") == 0)
+				if (strcmp(nfs_mnt_prop, "legacy") == 0)
 					continue;
 				/* Ignore canmount=noauto mounts */
 				if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) ==
@@ -3721,13 +6547,7 @@ unshare_unmount(int op, int argc, char *
 
 			node = safe_malloc(sizeof (unshare_unmount_node_t));
 			node->un_zhp = zhp;
-
-			if ((node->un_mountp = strdup(sfs[i].f_mntonname)) ==
-			    NULL) {
-				(void) fprintf(stderr, gettext("internal error:"
-				    " out of memory\n"));
-				exit(1);
-			}
+			node->un_mountp = safe_strdup(entry.mnt_mountp);
 
 			uu_avl_node_init(node, &node->un_avlnode, pool);
 
@@ -3745,11 +6565,8 @@ unshare_unmount(int op, int argc, char *
 		 * removing it from the AVL tree in the process.
 		 */
 		if ((walk = uu_avl_walk_start(tree,
-		    UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) {
-			(void) fprintf(stderr,
-			    gettext("internal error: out of memory"));
-			exit(1);
-		}
+		    UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL)
+			nomem();
 
 		while ((node = uu_avl_walk_next(walk)) != NULL) {
 			uu_avl_remove(tree, node);
@@ -3777,29 +6594,6 @@ unshare_unmount(int op, int argc, char *
 		uu_avl_destroy(tree);
 		uu_avl_pool_destroy(pool);
 
-		if (op == OP_SHARE) {
-			/*
-			 * Finally, unshare any volumes shared via iSCSI.
-			 */
-			zfs_handle_t **dslist = NULL;
-			size_t i, count = 0;
-
-			get_all_datasets(ZFS_TYPE_VOLUME, &dslist, &count,
-			    B_FALSE);
-
-			if (count != 0) {
-				qsort(dslist, count, sizeof (void *),
-				    dataset_cmp);
-
-				for (i = 0; i < count; i++) {
-					if (zfs_unshare_iscsi(dslist[i]) != 0)
-						ret = 1;
-					zfs_close(dslist[i]);
-				}
-
-				free(dslist);
-			}
-		}
 	} else {
 		if (argc != 1) {
 			if (argc == 0)
@@ -3821,91 +6615,65 @@ unshare_unmount(int op, int argc, char *
 			return (unshare_unmount_path(op, argv[0],
 			    flags, B_FALSE));
 
-		types = ZFS_TYPE_FILESYSTEM;
-		if (op == OP_SHARE)
-			types |= ZFS_TYPE_VOLUME;
-
-		if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL)
+		if ((zhp = zfs_open(g_zfs, argv[0],
+		    ZFS_TYPE_FILESYSTEM)) == NULL)
 			return (1);
 
-		if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
-			verify(zfs_prop_get(zhp, op == OP_SHARE ?
-			    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
-			    nfsiscsi_mnt_prop, sizeof (nfsiscsi_mnt_prop), NULL,
-			    NULL, 0, B_FALSE) == 0);
-
-			switch (op) {
-			case OP_SHARE:
-				verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
-				    nfsiscsi_mnt_prop,
-				    sizeof (nfsiscsi_mnt_prop),
-				    NULL, NULL, 0, B_FALSE) == 0);
-				verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
-				    sharesmb, sizeof (sharesmb), NULL, NULL,
-				    0, B_FALSE) == 0);
-
-				if (strcmp(nfsiscsi_mnt_prop, "off") == 0 &&
-				    strcmp(sharesmb, "off") == 0) {
-					(void) fprintf(stderr, gettext("cannot "
-					    "unshare '%s': legacy share\n"),
-					    zfs_get_name(zhp));
-					(void) fprintf(stderr, gettext("use "
-					    "unshare(1M) to unshare this "
-					    "filesystem\n"));
-					ret = 1;
-				} else if (!zfs_is_shared(zhp)) {
-					(void) fprintf(stderr, gettext("cannot "
-					    "unshare '%s': not currently "
-					    "shared\n"), zfs_get_name(zhp));
-					ret = 1;
-				} else if (zfs_unshareall(zhp) != 0) {
-					ret = 1;
-				}
-				break;
-
-			case OP_MOUNT:
-				if (strcmp(nfsiscsi_mnt_prop, "legacy") == 0) {
-					(void) fprintf(stderr, gettext("cannot "
-					    "unmount '%s': legacy "
-					    "mountpoint\n"), zfs_get_name(zhp));
-					(void) fprintf(stderr, gettext("use "
-					    "umount(1M) to unmount this "
-					    "filesystem\n"));
-					ret = 1;
-				} else if (!zfs_is_mounted(zhp, NULL)) {
-					(void) fprintf(stderr, gettext("cannot "
-					    "unmount '%s': not currently "
-					    "mounted\n"),
-					    zfs_get_name(zhp));
-					ret = 1;
-				} else if (zfs_unmountall(zhp, flags) != 0) {
-					ret = 1;
-				}
-				break;
-			}
-		} else {
-			assert(op == OP_SHARE);
+		verify(zfs_prop_get(zhp, op == OP_SHARE ?
+		    ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT,
+		    nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL,
+		    NULL, 0, B_FALSE) == 0);
 
-			verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI,
-			    nfsiscsi_mnt_prop, sizeof (nfsiscsi_mnt_prop),
+		switch (op) {
+		case OP_SHARE:
+			verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS,
+			    nfs_mnt_prop,
+			    sizeof (nfs_mnt_prop),
 			    NULL, NULL, 0, B_FALSE) == 0);
+			verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB,
+			    sharesmb, sizeof (sharesmb), NULL, NULL,
+			    0, B_FALSE) == 0);
 
-			if (strcmp(nfsiscsi_mnt_prop, "off") == 0) {
-				(void) fprintf(stderr, gettext("cannot unshare "
-				    "'%s': 'shareiscsi' property not set\n"),
+			if (strcmp(nfs_mnt_prop, "off") == 0 &&
+			    strcmp(sharesmb, "off") == 0) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "unshare '%s': legacy share\n"),
 				    zfs_get_name(zhp));
-				(void) fprintf(stderr, gettext("set "
-				    "'shareiscsi' property or use "
-				    "iscsitadm(1M) to share this volume\n"));
+#ifdef illumos
+				(void) fprintf(stderr, gettext("use "
+				    "unshare(1M) to unshare this "
+				    "filesystem\n"));
+#endif
+				ret = 1;
+			} else if (!zfs_is_shared(zhp)) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "unshare '%s': not currently "
+				    "shared\n"), zfs_get_name(zhp));
 				ret = 1;
-			} else if (!zfs_is_shared_iscsi(zhp)) {
+			} else if (zfs_unshareall(zhp) != 0) {
+				ret = 1;
+			}
+			break;
+
+		case OP_MOUNT:
+			if (strcmp(nfs_mnt_prop, "legacy") == 0) {
+				(void) fprintf(stderr, gettext("cannot "
+				    "unmount '%s': legacy "
+				    "mountpoint\n"), zfs_get_name(zhp));
+				(void) fprintf(stderr, gettext("use "
+				    "umount(8) to unmount this "
+				    "filesystem\n"));
+				ret = 1;
+			} else if (!zfs_is_mounted(zhp, NULL)) {
 				(void) fprintf(stderr, gettext("cannot "
-				    "unshare '%s': not currently shared\n"),
+				    "unmount '%s': not currently "
+				    "mounted\n"),
 				    zfs_get_name(zhp));
 				ret = 1;
-			} else if (zfs_unshare_iscsi(zhp) != 0) {
+			} else if (zfs_unmountall(zhp, flags) != 0) {
 				ret = 1;
 			}
+			break;
 		}
 
 		zfs_close(zhp);
@@ -3938,14 +6706,69 @@ zfs_do_unshare(int argc, char **argv)
 	return (unshare_unmount(OP_SHARE, argc, argv));
 }
 
+#ifdef __FreeBSD__
+/*
+ * Attach/detach the given dataset to/from the given jail
+ */
 /* ARGSUSED */
 static int
-zfs_do_python(int argc, char **argv)
+do_jail(int argc, char **argv, int attach)
 {
-	(void) execv(pypath, argv-1);
-	(void) printf("internal error: %s not found\n", pypath);
-	return (-1);
+	zfs_handle_t *zhp;
+	int jailid, ret;
+
+	/* check number of arguments */
+	if (argc < 3) {
+		(void) fprintf(stderr, gettext("missing argument(s)\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 3) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	jailid = jail_getid(argv[1]);
+	if (jailid < 0) {
+		(void) fprintf(stderr, gettext("invalid jail id or name\n"));
+		usage(B_FALSE);
+	}
+
+	zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM);
+	if (zhp == NULL)
+		return (1);
+
+	ret = (zfs_jail(zhp, jailid, attach) != 0);
+
+	zfs_close(zhp);
+	return (ret);
+}
+
+/*
+ * zfs jail jailid filesystem
+ *
+ * Attach the given dataset to the given jail
+ */
+/* ARGSUSED */
+static int
+zfs_do_jail(int argc, char **argv)
+{
+
+	return (do_jail(argc, argv, 1));
+}
+
+/*
+ * zfs unjail jailid filesystem
+ *
+ * Detach the given dataset from the given jail
+ */
+/* ARGSUSED */
+static int
+zfs_do_unjail(int argc, char **argv)
+{
+
+	return (do_jail(argc, argv, 0));
 }
+#endif /* __FreeBSD__ */
 
 /*
  * Called when invoked as /etc/fs/zfs/mount.  Do the mount if the mountpoint is
@@ -3957,7 +6780,7 @@ manual_mount(int argc, char **argv)
 	zfs_handle_t *zhp;
 	char mountpoint[ZFS_MAXPROPLEN];
 	char mntopts[MNT_LINE_MAX] = { '\0' };
-	int ret;
+	int ret = 0;
 	int c;
 	int flags = 0;
 	char *dataset, *path;
@@ -4018,7 +6841,7 @@ manual_mount(int argc, char **argv)
 	/* check for legacy mountpoint and complain appropriately */
 	ret = 0;
 	if (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) {
-		if (mount(dataset, path, MS_OPTIONSTR | flags, MNTTYPE_ZFS,
+		if (zmount(dataset, path, flags, MNTTYPE_ZFS,
 		    NULL, 0, mntopts, sizeof (mntopts)) != 0) {
 			(void) fprintf(stderr, gettext("mount failed: %s\n"),
 			    strerror(errno));
@@ -4026,12 +6849,12 @@ manual_mount(int argc, char **argv)
 		}
 	} else {
 		(void) fprintf(stderr, gettext("filesystem '%s' cannot be "
-		    "mounted using 'mount -F zfs'\n"), dataset);
+		    "mounted using 'mount -t zfs'\n"), dataset);
 		(void) fprintf(stderr, gettext("Use 'zfs set mountpoint=%s' "
 		    "instead.\n"), path);
-		(void) fprintf(stderr, gettext("If you must use 'mount -F zfs' "
-		    "or /etc/vfstab, use 'zfs set mountpoint=legacy'.\n"));
-		(void) fprintf(stderr, gettext("See zfs(1M) for more "
+		(void) fprintf(stderr, gettext("If you must use 'mount -t zfs' "
+		    "or /etc/fstab, use 'zfs set mountpoint=legacy'.\n"));
+		(void) fprintf(stderr, gettext("See zfs(8) for more "
 		    "information.\n"));
 		ret = 1;
 	}
@@ -4099,10 +6922,190 @@ find_command_idx(char *command, int *idx
 	return (1);
 }
 
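+/*
+ * zfs diff [-FHt] <snapshot> [snapshot|filesystem]
+ *
+ * Display the difference between the given snapshot and a later snapshot
+ * or the current filesystem contents: -F classifies file types, -H emits
+ * machine-parseable output, -t includes timestamps.
+ */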
+static int
+zfs_do_diff(int argc, char **argv)
+{
+	zfs_handle_t *zhp;
+	int flags = 0;
+	char *tosnap = NULL;
+	char *fromsnap = NULL;
+	char *atp, *copy;
+	int err = 0;
+	int c;
+
+	while ((c = getopt(argc, argv, "FHt")) != -1) {
+		switch (c) {
+		case 'F':
+			flags |= ZFS_DIFF_CLASSIFY;
+			break;
+		case 'H':
+			flags |= ZFS_DIFF_PARSEABLE;
+			break;
+		case 't':
+			flags |= ZFS_DIFF_TIMESTAMP;
+			break;
+		default:
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr,
+		    gettext("must provide at least one snapshot name\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	fromsnap = argv[0];
+	tosnap = (argc == 2) ? argv[1] : NULL;
+
+	copy = NULL;
+	if (*fromsnap != '@')
+		copy = strdup(fromsnap);
+	else if (tosnap)
+		copy = strdup(tosnap);
+	if (copy == NULL)
+		usage(B_FALSE);
+
+	if ((atp = strchr(copy, '@')) != NULL)
+		*atp = '\0';
+
+	if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL)
+		return (1);
+
+	free(copy);
+
+	/*
+	 * Ignore SIGPIPE so that the library can give us
+	 * information on any failure
+	 */
+	(void) sigignore(SIGPIPE);
+
+	err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags);
+
+	zfs_close(zhp);
+
+	return (err != 0);
+}
+
+/*
+ * zfs bookmark <fs@snap> <fs#bmark>
+ *
+ * Creates a bookmark with the given name from the given snapshot.
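+ * E.g., 'zfs bookmark pool/fs@snap pool/fs#mark' bookmarks the snapshot
+ * 'pool/fs@snap' as 'pool/fs#mark'.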
+ */
+static int
+zfs_do_bookmark(int argc, char **argv)
+{
+	char snapname[ZFS_MAX_DATASET_NAME_LEN];
+	zfs_handle_t *zhp;
+	nvlist_t *nvl;
+	int ret = 0;
+	int c;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "")) != -1) {
+		switch (c) {
+		case '?':
+			(void) fprintf(stderr,
+			    gettext("invalid option '%c'\n"), optopt);
+			goto usage;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing snapshot argument\n"));
+		goto usage;
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing bookmark argument\n"));
+		goto usage;
+	}
+
+	if (strchr(argv[1], '#') == NULL) {
+		(void) fprintf(stderr,
+		    gettext("invalid bookmark name '%s' -- "
+		    "must contain a '#'\n"), argv[1]);
+		goto usage;
+	}
+
+	if (argv[0][0] == '@') {
+		/*
+		 * Snapshot name begins with @.
+		 * Default to same fs as bookmark.
+		 */
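+		/* e.g. '@snap' with bookmark 'pool/fs#mark' yields 'pool/fs@snap' */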
+		(void) strncpy(snapname, argv[1], sizeof (snapname));
+		*strchr(snapname, '#') = '\0';
+		(void) strlcat(snapname, argv[0], sizeof (snapname));
+	} else {
+		(void) strncpy(snapname, argv[0], sizeof (snapname));
+	}
+	zhp = zfs_open(g_zfs, snapname, ZFS_TYPE_SNAPSHOT);
+	if (zhp == NULL)
+		goto usage;
+	zfs_close(zhp);
+
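+	/*
+	 * Bookmark creation is done through libzfs_core: lzc_bookmark()
+	 * takes an nvlist of bookmark -> source-snapshot pairs and returns
+	 * an errno value, mapped to a message below.
+	 */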
+	nvl = fnvlist_alloc();
+	fnvlist_add_string(nvl, argv[1], snapname);
+	ret = lzc_bookmark(nvl, NULL);
+	fnvlist_free(nvl);
+
+	if (ret != 0) {
+		const char *err_msg;
+		char errbuf[1024];
+
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "cannot create bookmark '%s'"), argv[1]);
+
+		switch (ret) {
+		case EXDEV:
+			err_msg = "bookmark is in a different pool";
+			break;
+		case EEXIST:
+			err_msg = "bookmark exists";
+			break;
+		case EINVAL:
+			err_msg = "invalid argument";
+			break;
+		case ENOTSUP:
+			err_msg = "bookmark feature not enabled";
+			break;
+		case ENOSPC:
+			err_msg = "out of space";
+			break;
+		default:
+			err_msg = "unknown error";
+			break;
+		}
+		(void) fprintf(stderr, "%s: %s\n", errbuf,
+		    dgettext(TEXT_DOMAIN, err_msg));
+	}
+
+	return (ret != 0);
+
+usage:
+	usage(B_FALSE);
+	return (-1);
+}
+
 int
 main(int argc, char **argv)
 {
-	int ret;
+	int ret = 0;
 	int i;
 	char *progname;
 	char *cmdname;
@@ -4118,8 +7121,7 @@ main(int argc, char **argv)
 		return (1);
 	}
 
-	zpool_set_history_str("zfs", argc, argv, history_str);
-	verify(zpool_stage_history(g_zfs, history_str) == 0);
+	zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
 
 	libzfs_print_on_error(g_zfs, B_TRUE);
 
@@ -4162,6 +7164,12 @@ main(int argc, char **argv)
 			cmdname = "receive";
 
 		/*
+		 * The 'snap' command is an alias for 'snapshot'
+		 */
+		if (strcmp(cmdname, "snap") == 0)
+			cmdname = "snapshot";
+
+		/*
 		 * Special case '-?'
 		 */
 		if (strcmp(cmdname, "-?") == 0)
@@ -4188,6 +7196,9 @@ main(int argc, char **argv)
 
 	(void) fclose(mnttab_file);
 
+	if (ret == 0 && log_history)
+		(void) zpool_log_history(g_zfs, history_str);
+
 	libzfs_fini(g_zfs);
 
 	/*
Index: src/external/cddl/osnet/dist/cmd/zfs/zfs_util.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zfs/zfs_util.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 zfs_util.h
--- src/external/cddl/osnet/dist/cmd/zfs/zfs_util.h	7 Aug 2009 18:32:18 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/cmd/zfs/zfs_util.h	12 Jun 2012 05:55:36 -0000
@@ -19,15 +19,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_ZFS_UTIL_H
 #define	_ZFS_UTIL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <libzfs.h>
 
 #ifdef	__cplusplus
@@ -35,6 +32,7 @@ extern "C" {
 #endif
 
 void * safe_malloc(size_t size);
+void nomem(void);
 libzfs_handle_t *g_zfs;
 
 #ifdef	__cplusplus
Index: src/external/cddl/osnet/dist/cmd/zpool/zpool_iter.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zpool/zpool_iter.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 zpool_iter.c
--- src/external/cddl/osnet/dist/cmd/zpool/zpool_iter.c	7 Aug 2009 18:32:18 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/cmd/zpool/zpool_iter.c	27 Mar 2016 02:48:25 -0000
@@ -22,9 +22,11 @@
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
+#include <solaris.h>
 #include <libintl.h>
 #include <libuutil.h>
 #include <stddef.h>
@@ -131,7 +133,8 @@ pool_list_get(int argc, char **argv, zpr
 		for (i = 0; i < argc; i++) {
 			zpool_handle_t *zhp;
 
-			if (zhp = zpool_open_canfail(g_zfs, argv[i])) {
+			if ((zhp = zpool_open_canfail(g_zfs, argv[i])) !=
+			    NULL) {
 				if (add_pool(zhp, zlp) != 0)
 					*err = B_TRUE;
 			} else {
Index: src/external/cddl/osnet/dist/cmd/zpool/zpool_main.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zpool/zpool_main.c,v
retrieving revision 1.2
diff -u -p -r1.2 zpool_main.c
--- src/external/cddl/osnet/dist/cmd/zpool/zpool_main.c	2 Jan 2013 10:33:49 -0000	1.2
+++ src/external/cddl/osnet/dist/cmd/zpool/zpool_main.c	5 May 2017 16:30:16 -0000
@@ -20,10 +20,16 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012 by Frederik Wessels. All rights reserved.
+ * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
+ * Copyright 2016 Nexenta Systems, Inc.
  */
 
+#include <solaris.h>
 #include <assert.h>
 #include <ctype.h>
 #include <dirent.h>
@@ -41,15 +47,16 @@
 #include <priv.h>
 #include <pwd.h>
 #include <zone.h>
+#include <sys/time.h>
+#include <zfs_prop.h>
 #include <sys/fs/zfs.h>
-
-#include <sys/ioctl.h>
 #include <sys/stat.h>
 
 #include <libzfs.h>
 
 #include "zpool_util.h"
 #include "zfs_comutil.h"
+#include "zfeature_common.h"
 
 #include "statcommon.h"
 
@@ -58,6 +65,7 @@ static int zpool_do_destroy(int, char **
 
 static int zpool_do_add(int, char **);
 static int zpool_do_remove(int, char **);
+static int zpool_do_labelclear(int, char **);
 
 static int zpool_do_list(int, char **);
 static int zpool_do_iostat(int, char **);
@@ -66,6 +74,9 @@ static int zpool_do_status(int, char **)
 static int zpool_do_online(int, char **);
 static int zpool_do_offline(int, char **);
 static int zpool_do_clear(int, char **);
+static int zpool_do_reopen(int, char **);
+
+static int zpool_do_reguid(int, char **);
 
 static int zpool_do_attach(int, char **);
 static int zpool_do_detach(int, char **);
@@ -114,6 +125,7 @@ typedef enum {
 	HELP_HISTORY,
 	HELP_IMPORT,
 	HELP_IOSTAT,
+	HELP_LABELCLEAR,
 	HELP_LIST,
 	HELP_OFFLINE,
 	HELP_ONLINE,
@@ -124,7 +136,9 @@ typedef enum {
 	HELP_UPGRADE,
 	HELP_GET,
 	HELP_SET,
-	HELP_SPLIT
+	HELP_SPLIT,
+	HELP_REGUID,
+	HELP_REOPEN
 } zpool_help_t;
 
 
@@ -150,6 +164,8 @@ static zpool_command_t command_table[] =
 	{ "add",	zpool_do_add,		HELP_ADD		},
 	{ "remove",	zpool_do_remove,	HELP_REMOVE		},
 	{ NULL },
+	{ "labelclear",	zpool_do_labelclear,	HELP_LABELCLEAR		},
+	{ NULL },
 	{ "list",	zpool_do_list,		HELP_LIST		},
 	{ "iostat",	zpool_do_iostat,	HELP_IOSTAT		},
 	{ "status",	zpool_do_status,	HELP_STATUS		},
@@ -157,6 +173,7 @@ static zpool_command_t command_table[] =
 	{ "online",	zpool_do_online,	HELP_ONLINE		},
 	{ "offline",	zpool_do_offline,	HELP_OFFLINE		},
 	{ "clear",	zpool_do_clear,		HELP_CLEAR		},
+	{ "reopen",	zpool_do_reopen,	HELP_REOPEN		},
 	{ NULL },
 	{ "attach",	zpool_do_attach,	HELP_ATTACH		},
 	{ "detach",	zpool_do_detach,	HELP_DETACH		},
@@ -168,6 +185,7 @@ static zpool_command_t command_table[] =
 	{ "import",	zpool_do_import,	HELP_IMPORT		},
 	{ "export",	zpool_do_export,	HELP_EXPORT		},
 	{ "upgrade",	zpool_do_upgrade,	HELP_UPGRADE		},
+	{ "reguid",	zpool_do_reguid,	HELP_REGUID		},
 	{ NULL },
 	{ "history",	zpool_do_history,	HELP_HISTORY		},
 	{ "get",	zpool_do_get,		HELP_GET		},
@@ -176,13 +194,14 @@ static zpool_command_t command_table[] =
 
 #define	NCOMMAND	(sizeof (command_table) / sizeof (command_table[0]))
 
-zpool_command_t *current_command;
+static zpool_command_t *current_command;
 static char history_str[HIS_MAX_RECORD_LEN];
-
+static boolean_t log_history = B_TRUE;
 static uint_t timestamp_fmt = NODATE;
 
 static const char *
-get_usage(zpool_help_t idx) {
+get_usage(zpool_help_t idx)
+{
 	switch (idx) {
 	case HELP_ADD:
 		return (gettext("\tadd [-fn] <pool> <vdev> ...\n"));
@@ -192,7 +211,7 @@ get_usage(zpool_help_t idx) {
 	case HELP_CLEAR:
 		return (gettext("\tclear [-nF] <pool> [device]\n"));
 	case HELP_CREATE:
-		return (gettext("\tcreate [-fn] [-o property=value] ... \n"
+		return (gettext("\tcreate [-fnd] [-o property=value] ... \n"
 		    "\t    [-O file-system-property=value] ... \n"
 		    "\t    [-m mountpoint] [-R root] <pool> <vdev> ...\n"));
 	case HELP_DESTROY:
@@ -205,44 +224,52 @@ get_usage(zpool_help_t idx) {
 		return (gettext("\thistory [-il] [<pool>] ...\n"));
 	case HELP_IMPORT:
 		return (gettext("\timport [-d dir] [-D]\n"
-		    "\timport [-d dir | -c cachefile] [-n] -F <pool | id>\n"
+		    "\timport [-d dir | -c cachefile] [-F [-n]] <pool | id>\n"
 		    "\timport [-o mntopts] [-o property=value] ... \n"
-		    "\t    [-d dir | -c cachefile] [-D] [-f] [-R root] -a\n"
+		    "\t    [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
+		    "[-R root] [-F [-n]] -a\n"
 		    "\timport [-o mntopts] [-o property=value] ... \n"
-		    "\t    [-d dir | -c cachefile] [-D] [-f] [-R root] "
-		    "<pool | id> [newpool]\n"));
+		    "\t    [-d dir | -c cachefile] [-D] [-f] [-m] [-N] "
+		    "[-R root] [-F [-n]]\n"
+		    "\t    <pool | id> [newpool]\n"));
 	case HELP_IOSTAT:
 		return (gettext("\tiostat [-v] [-T d|u] [pool] ... [interval "
 		    "[count]]\n"));
+	case HELP_LABELCLEAR:
+		return (gettext("\tlabelclear [-f] <vdev>\n"));
 	case HELP_LIST:
-		return (gettext("\tlist [-H] [-o property[,...]] "
-		    "[pool] ...\n"));
+		return (gettext("\tlist [-Hpv] [-o property[,...]] "
+		    "[-T d|u] [pool] ... [interval [count]]\n"));
 	case HELP_OFFLINE:
 		return (gettext("\toffline [-t] <pool> <device> ...\n"));
 	case HELP_ONLINE:
-		return (gettext("\tonline <pool> <device> ...\n"));
+		return (gettext("\tonline [-e] <pool> <device> ...\n"));
 	case HELP_REPLACE:
 		return (gettext("\treplace [-f] <pool> <device> "
 		    "[new-device]\n"));
 	case HELP_REMOVE:
 		return (gettext("\tremove <pool> <device> ...\n"));
+	case HELP_REOPEN:
+		return (gettext("\treopen <pool>\n"));
 	case HELP_SCRUB:
 		return (gettext("\tscrub [-s] <pool> ...\n"));
 	case HELP_STATUS:
-		return (gettext("\tstatus [-vx] [pool] ...\n"));
+		return (gettext("\tstatus [-vx] [-T d|u] [pool] ... [interval "
+		    "[count]]\n"));
 	case HELP_UPGRADE:
-		return (gettext("\tupgrade\n"
-		    "\tupgrade -v\n"
+		return (gettext("\tupgrade [-v]\n"
 		    "\tupgrade [-V version] <-a | pool ...>\n"));
 	case HELP_GET:
-		return (gettext("\tget <\"all\" | property[,...]> "
-		    "<pool> ...\n"));
+		return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] "
+		    "<\"all\" | property[,...]> <pool> ...\n"));
 	case HELP_SET:
 		return (gettext("\tset <property=value> <pool> \n"));
 	case HELP_SPLIT:
 		return (gettext("\tsplit [-n] [-R altroot] [-o mntopts]\n"
 		    "\t    [-o property=value] <pool> <newpool> "
 		    "[<device> ...]\n"));
+	case HELP_REGUID:
+		return (gettext("\treguid <pool>\n"));
 	}
 
 	abort();
@@ -316,6 +343,12 @@ usage(boolean_t requested)
 		/* Iterate over all properties */
 		(void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE,
 		    ZFS_TYPE_POOL);
+
+		(void) fprintf(fp, "\t%-15s   ", "feature@...");
+		(void) fprintf(fp, "YES   disabled | enabled | active\n");
+
+		(void) fprintf(fp, gettext("\nThe feature@ properties must be "
+		    "appended with a feature name.\nSee zpool-features(7).\n"));
 	}
 
 	/*
@@ -359,6 +392,18 @@ print_vdev_tree(zpool_handle_t *zhp, con
 	}
 }
 
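+/*
+ * Return B_TRUE if the given nvlist contains any 'feature@...' pool
+ * property; used below to reject mixing 'version' with feature properties.
+ */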
+static boolean_t
+prop_list_contains_feature(nvlist_t *proplist)
+{
+	nvpair_t *nvp;
+	for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp;
+	    nvp = nvlist_next_nvpair(proplist, nvp)) {
+		if (zpool_prop_feature(nvpair_name(nvp)))
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
 /*
  * Add a property pair (name, string-value) into a property nvlist.
  */
@@ -382,12 +427,34 @@ add_prop_list(const char *propname, char
 	proplist = *props;
 
 	if (poolprop) {
-		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) {
+		const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION);
+
+		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL &&
+		    !zpool_prop_feature(propname)) {
 			(void) fprintf(stderr, gettext("property '%s' is "
 			    "not a valid pool property\n"), propname);
 			return (2);
 		}
-		normnm = zpool_prop_to_name(prop);
+
+		/*
+		 * feature@ properties and version should not be specified
+		 * at the same time.
+		 */
+		if ((prop == ZPROP_INVAL && zpool_prop_feature(propname) &&
+		    nvlist_exists(proplist, vname)) ||
+		    (prop == ZPOOL_PROP_VERSION &&
+		    prop_list_contains_feature(proplist))) {
+			(void) fprintf(stderr, gettext("'feature@' and "
+			    "'version' properties cannot be specified "
+			    "together\n"));
+			return (2);
+		}
+
+		if (zpool_prop_feature(propname))
+			normnm = propname;
+		else
+			normnm = zpool_prop_to_name(prop);
 	} else {
 		if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) {
 			normnm = zfs_prop_to_name(fprop);
@@ -520,11 +587,10 @@ zpool_do_add(int argc, char **argv)
 }
 
 /*
- * zpool remove <pool> <vdev> ...
+ * zpool remove  <pool> <vdev> ...
  *
- * Removes the given vdev from the pool.  Currently, this only supports removing
- * spares and cache devices from the pool.  Eventually, we'll want to support
- * removing leaf vdevs (as an alias for 'detach') as well as toplevel vdevs.
+ * Removes the given vdev from the pool.  Currently, this supports removing
+ * spares, cache, and log devices from the pool.
  */
 int
 zpool_do_remove(int argc, char **argv)
@@ -560,7 +626,154 @@ zpool_do_remove(int argc, char **argv)
 }
 
 /*
- * zpool create [-fn] [-o property=value] ...
+ * zpool labelclear [-f] <vdev>
+ *
+ *	-f	Force clearing the label for vdevs that are members of
+ *		exported or foreign pools.
+ *
+ * Verifies that the vdev is not active and zeros out the label information
+ * on the device.
+ */
+int
+zpool_do_labelclear(int argc, char **argv)
+{
+	char vdev[MAXPATHLEN];
+	char *name = NULL;
+	struct stat st;
+	int c, fd, ret = 0;
+	nvlist_t *config;
+	pool_state_t state;
+	boolean_t inuse = B_FALSE;
+	boolean_t force = B_FALSE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "f")) != -1) {
+		switch (c) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		default:
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get vdev name */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing vdev name\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	/*
+	 * Check if we were given an absolute path and use it as is.
+	 * Otherwise, if the provided vdev name doesn't point to a file,
+	 * try prepending the disk device directory (and, on illumos,
+	 * appending s0).
+	 */
+	(void) strlcpy(vdev, argv[0], sizeof (vdev));
+	if (vdev[0] != '/' && stat(vdev, &st) != 0) {
+		char *s;
+
+		(void) snprintf(vdev, sizeof (vdev), "%s/%s",
+#ifdef illumos
+		    ZFS_DISK_ROOT, argv[0]);
+		if ((s = strrchr(argv[0], 's')) == NULL ||
+		    !isdigit(*(s + 1)))
+			(void) strlcat(vdev, "s0", sizeof (vdev));
+#else
+		    "/dev", argv[0]);
+#endif
+		if (stat(vdev, &st) != 0) {
+			(void) fprintf(stderr, gettext(
+			    "failed to find device %s, try specifying absolute "
+			    "path instead\n"), argv[0]);
+			return (1);
+		}
+	}
+
+	if ((fd = open(vdev, O_RDWR)) < 0) {
+		(void) fprintf(stderr, gettext("failed to open %s: %s\n"),
+		    vdev, strerror(errno));
+		return (1);
+	}
+
+	if (zpool_read_label(fd, &config) != 0 || config == NULL) {
+		(void) fprintf(stderr,
+		    gettext("failed to read label from %s\n"), vdev);
+		return (1);
+	}
+	nvlist_free(config);
+
+	ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse);
+	if (ret != 0) {
+		(void) fprintf(stderr,
+		    gettext("failed to check state for %s\n"), vdev);
+		return (1);
+	}
+
+	if (!inuse)
+		goto wipe_label;
+
+	switch (state) {
+	default:
+	case POOL_STATE_ACTIVE:
+	case POOL_STATE_SPARE:
+	case POOL_STATE_L2CACHE:
+		(void) fprintf(stderr, gettext(
+		    "%s is a member (%s) of pool \"%s\"\n"),
+		    vdev, zpool_pool_state_to_name(state), name);
+		ret = 1;
+		goto errout;
+
+	case POOL_STATE_EXPORTED:
+		if (force)
+			break;
+		(void) fprintf(stderr, gettext(
+		    "use '-f' to override the following error:\n"
+		    "%s is a member of exported pool \"%s\"\n"),
+		    vdev, name);
+		ret = 1;
+		goto errout;
+
+	case POOL_STATE_POTENTIALLY_ACTIVE:
+		if (force)
+			break;
+		(void) fprintf(stderr, gettext(
+		    "use '-f' to override the following error:\n"
+		    "%s is a member of potentially active pool \"%s\"\n"),
+		    vdev, name);
+		ret = 1;
+		goto errout;
+
+	case POOL_STATE_DESTROYED:
+		/* inuse should never be set for a destroyed pool */
+		assert(0);
+		break;
+	}
+
+wipe_label:
+	ret = zpool_clear_label(fd);
+	if (ret != 0) {
+		(void) fprintf(stderr,
+		    gettext("failed to clear label for %s\n"), vdev);
+	}
+
+errout:
+	free(name);
+	(void) close(fd);
+
+	return (ret);
+}
+
+/*
+ * zpool create [-fnd] [-o property=value] ...
  *		[-O file-system-property=value] ...
  *		[-R root] [-m mountpoint] <pool> <dev> ...
  *
@@ -569,8 +782,10 @@ zpool_do_remove(int argc, char **argv)
  *		were to be created.
  *      -R	Create a pool under an alternate root
  *      -m	Set default mountpoint for the root dataset.  By default it's
- *      	'/<pool>'
+ *		'/<pool>'
  *	-o	Set property=value.
+ *	-d	Don't automatically enable all supported pool features
+ *		(individual features can be enabled with -o).
  *	-O	Set fsproperty=value in the pool's root file system
  *
  * Creates the named pool according to the given vdev specification.  The
@@ -583,6 +798,7 @@ zpool_do_create(int argc, char **argv)
 {
 	boolean_t force = B_FALSE;
 	boolean_t dryrun = B_FALSE;
+	boolean_t enable_all_pool_feat = B_TRUE;
 	int c;
 	nvlist_t *nvroot = NULL;
 	char *poolname;
@@ -594,7 +810,7 @@ zpool_do_create(int argc, char **argv)
 	char *propval;
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":fnR:m:o:O:")) != -1) {
+	while ((c = getopt(argc, argv, ":fndR:m:o:O:")) != -1) {
 		switch (c) {
 		case 'f':
 			force = B_TRUE;
@@ -602,6 +818,9 @@ zpool_do_create(int argc, char **argv)
 		case 'n':
 			dryrun = B_TRUE;
 			break;
+		case 'd':
+			enable_all_pool_feat = B_FALSE;
+			break;
 		case 'R':
 			altroot = optarg;
 			if (add_prop_list(zpool_prop_to_name(
@@ -616,6 +835,7 @@ zpool_do_create(int argc, char **argv)
 				goto errout;
 			break;
 		case 'm':
+			/* Equivalent to -O mountpoint=optarg */
 			mountpoint = optarg;
 			break;
 		case 'o':
@@ -629,6 +849,23 @@ zpool_do_create(int argc, char **argv)
 
 			if (add_prop_list(optarg, propval, &props, B_TRUE))
 				goto errout;
+
+			/*
+			 * If the user is creating a pool that doesn't support
+			 * feature flags, don't enable any features.
+			 */
+			if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) {
+				char *end;
+				u_longlong_t ver;
+
+				ver = strtoull(propval, &end, 10);
+				if (*end == '\0' &&
+				    ver < SPA_VERSION_FEATURES) {
+					enable_all_pool_feat = B_FALSE;
+				}
+			}
+			if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT)
+				altroot = propval;
 			break;
 		case 'O':
 			if ((propval = strchr(optarg, '=')) == NULL) {
@@ -639,8 +876,18 @@ zpool_do_create(int argc, char **argv)
 			*propval = '\0';
 			propval++;
 
-			if (add_prop_list(optarg, propval, &fsprops, B_FALSE))
+			/*
+			 * Mountpoints are checked and then added later.
+			 * Uniquely among properties, they can be specified
+			 * more than once, to avoid conflict with -m.
+			 */
+			if (0 == strcmp(optarg,
+			    zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) {
+				mountpoint = propval;
+			} else if (add_prop_list(optarg, propval, &fsprops,
+			    B_FALSE)) {
 				goto errout;
+			}
 			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
@@ -694,7 +941,6 @@ zpool_do_create(int argc, char **argv)
 		goto errout;
 	}
 
-
 	if (altroot != NULL && altroot[0] != '/') {
 		(void) fprintf(stderr, gettext("invalid alternate root '%s': "
 		    "must be an absolute path\n"), altroot);
@@ -704,10 +950,11 @@ zpool_do_create(int argc, char **argv)
 	/*
 	 * Check the validity of the mountpoint and direct the user to use the
 	 * '-m' mountpoint option if it looks like its in use.
+	 * Ignore the checks if the '-f' option is given.
 	 */
-	if (mountpoint == NULL ||
+	if (!force && (mountpoint == NULL ||
 	    (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 &&
-	    strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) {
+	    strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0))) {
 		char buf[MAXPATHLEN];
 		DIR *dirp;
 
@@ -758,6 +1005,18 @@ zpool_do_create(int argc, char **argv)
 		}
 	}
 
+	/*
+	 * Now that the mountpoint's validity has been checked, ensure that
+	 * the property is set appropriately prior to creating the pool.
+	 */
+	if (mountpoint != NULL) {
+		ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT),
+		    mountpoint, &fsprops, B_FALSE);
+		if (ret != 0)
+			goto errout;
+	}
+
+	ret = 1;
 	if (dryrun) {
 		/*
 		 * For a dry run invocation, print out a basic message and run
@@ -776,16 +1035,35 @@ zpool_do_create(int argc, char **argv)
 		/*
 		 * Hand off to libzfs.
 		 */
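+		/*
+		 * Unless -d or a pre-feature-flags version was requested,
+		 * enable every feature in spa_feature_table by adding a
+		 * feature@<name>=enabled property for each one the user
+		 * did not already specify.
+		 */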
+		if (enable_all_pool_feat) {
+			spa_feature_t i;
+			for (i = 0; i < SPA_FEATURES; i++) {
+				char propname[MAXPATHLEN];
+				zfeature_info_t *feat = &spa_feature_table[i];
+
+				(void) snprintf(propname, sizeof (propname),
+				    "feature@%s", feat->fi_uname);
+
+				/*
+				 * Skip feature if user specified it manually
+				 * on the command line.
+				 */
+				if (nvlist_exists(props, propname))
+					continue;
+
+				ret = add_prop_list(propname,
+				    ZFS_FEATURE_ENABLED, &props, B_TRUE);
+				if (ret != 0)
+					goto errout;
+			}
+		}
+
+		ret = 1;
 		if (zpool_create(g_zfs, poolname,
 		    nvroot, props, fsprops) == 0) {
 			zfs_handle_t *pool = zfs_open(g_zfs, poolname,
 			    ZFS_TYPE_FILESYSTEM);
 			if (pool != NULL) {
-				if (mountpoint != NULL)
-					verify(zfs_prop_set(pool,
-					    zfs_prop_to_name(
-					    ZFS_PROP_MOUNTPOINT),
-					    mountpoint) == 0);
 				if (zfs_mount(pool, NULL, 0) == 0)
 					ret = zfs_shareall(pool);
 				zfs_close(pool);
@@ -869,7 +1147,10 @@ zpool_do_destroy(int argc, char **argv)
 		return (1);
 	}
 
-	ret = (zpool_destroy(zhp) != 0);
+	/* The history must be logged as part of the export */
+	log_history = B_FALSE;
+
+	ret = (zpool_destroy(zhp, history_str) != 0);
 
 	zpool_close(zhp);
 
@@ -933,10 +1214,13 @@ zpool_do_export(int argc, char **argv)
 			continue;
 		}
 
+		/* The history must be logged as part of the export */
+		log_history = B_FALSE;
+
 		if (hardforce) {
-			if (zpool_export_force(zhp) != 0)
+			if (zpool_export_force(zhp, history_str) != 0)
 				ret = 1;
-		} else if (zpool_export(zhp, force) != 0) {
+		} else if (zpool_export(zhp, force, history_str) != 0) {
 			ret = 1;
 		}
 
@@ -1044,21 +1328,23 @@ print_status_config(zpool_handle_t *zhp,
     int namewidth, int depth, boolean_t isspare)
 {
 	nvlist_t **child;
-	uint_t c, children;
+	uint_t c, vsc, children;
+	pool_scan_stat_t *ps = NULL;
 	vdev_stat_t *vs;
-	char rbuf[6], wbuf[6], cbuf[6], repaired[7];
+	char rbuf[6], wbuf[6], cbuf[6];
 	char *vname;
 	uint64_t notpresent;
+	uint64_t ashift;
 	spare_cbdata_t cb;
-	char *state;
-
-	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
-	    (uint64_t **)&vs, &c) == 0);
+	const char *state;
 
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &child, &children) != 0)
 		children = 0;
 
+	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &vsc) == 0);
+
 	state = zpool_state_to_name(vs->vs_state, vs->vs_aux);
 	if (isspare) {
 		/*
@@ -1082,10 +1368,11 @@ print_status_config(zpool_handle_t *zhp,
 	}
 
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
-	    &notpresent) == 0) {
+	    &notpresent) == 0 ||
+	    vs->vs_state <= VDEV_STATE_CANT_OPEN) {
 		char *path;
-		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
-		(void) printf("  was %s", path);
+		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0)
+			(void) printf("  was %s", path);
 	} else if (vs->vs_aux != 0) {
 		(void) printf("  ");
 
@@ -1106,6 +1393,14 @@ print_status_config(zpool_handle_t *zhp,
 			(void) printf(gettext("newer version"));
 			break;
 
+		case VDEV_AUX_UNSUP_FEAT:
+			(void) printf(gettext("unsupported feature(s)"));
+			break;
+
+		case VDEV_AUX_ASHIFT_TOO_BIG:
+			(void) printf(gettext("unsupported minimum blocksize"));
+			break;
+
 		case VDEV_AUX_SPARED:
 			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
 			    &cb.cb_guid) == 0);
@@ -1148,14 +1443,22 @@ print_status_config(zpool_handle_t *zhp,
 			(void) printf(gettext("corrupted data"));
 			break;
 		}
-	} else if (vs->vs_scrub_repaired != 0 && children == 0) {
-		/*
-		 * Report bytes resilvered/repaired on leaf devices.
-		 */
-		zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired));
-		(void) printf(gettext("  %s %s"), repaired,
-		    (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
-		    "resilvered" : "repaired");
+	} else if (children == 0 && !isspare &&
+	    VDEV_STAT_VALID(vs_physical_ashift, vsc) &&
+	    vs->vs_configured_ashift < vs->vs_physical_ashift) {
+		(void) printf(
+		    gettext("  block size: %dB configured, %dB native"),
+		    1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift);
+	}
+
+	(void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
+	    (uint64_t **)&ps, &c);
+
+	if (ps && ps->pss_state == DSS_SCANNING &&
+	    vs->vs_scan_processed != 0 && children == 0) {
+		(void) printf(gettext("  (%s)"),
+		    (ps->pss_func == POOL_SCAN_RESILVER) ?
+		    "resilvering" : "repairing");
 	}
 
 	(void) printf("\n");
@@ -1195,7 +1498,7 @@ print_import_config(const char *name, nv
 	    strcmp(type, VDEV_TYPE_HOLE) == 0)
 		return;
 
-	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
 	    (uint64_t **)&vs, &c) == 0);
 
 	(void) printf("\t%*s%-*s", depth, "", namewidth - depth, name);
@@ -1221,6 +1524,10 @@ print_import_config(const char *name, nv
 			(void) printf(gettext("newer version"));
 			break;
 
+		case VDEV_AUX_UNSUP_FEAT:
+			(void) printf(gettext("unsupported feature(s)"));
+			break;
+
 		case VDEV_AUX_ERR_EXCEEDED:
 			(void) printf(gettext("too many errors"));
 			break;
@@ -1324,6 +1631,7 @@ show_import(nvlist_t *config)
 	const char *health;
 	uint_t vsc;
 	int namewidth;
+	char *comment;
 
 	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
 	    &name) == 0);
@@ -1334,15 +1642,15 @@ show_import(nvlist_t *config)
 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvroot) == 0);
 
-	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
 	    (uint64_t **)&vs, &vsc) == 0);
 	health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
 
 	reason = zpool_import_status(config, &msgid);
 
-	(void) printf(gettext("  pool: %s\n"), name);
-	(void) printf(gettext("    id: %llu\n"), (u_longlong_t)guid);
-	(void) printf(gettext(" state: %s"), health);
+	(void) printf(gettext("   pool: %s\n"), name);
+	(void) printf(gettext("     id: %llu\n"), (u_longlong_t)guid);
+	(void) printf(gettext("  state: %s"), health);
 	if (pool_state == POOL_STATE_DESTROYED)
 		(void) printf(gettext(" (DESTROYED)"));
 	(void) printf("\n");
@@ -1351,56 +1659,87 @@ show_import(nvlist_t *config)
 	case ZPOOL_STATUS_MISSING_DEV_R:
 	case ZPOOL_STATUS_MISSING_DEV_NR:
 	case ZPOOL_STATUS_BAD_GUID_SUM:
-		(void) printf(gettext("status: One or more devices are missing "
-		    "from the system.\n"));
+		(void) printf(gettext(" status: One or more devices are "
+		    "missing from the system.\n"));
 		break;
 
 	case ZPOOL_STATUS_CORRUPT_LABEL_R:
 	case ZPOOL_STATUS_CORRUPT_LABEL_NR:
-		(void) printf(gettext("status: One or more devices contains "
+		(void) printf(gettext(" status: One or more devices contains "
 		    "corrupted data.\n"));
 		break;
 
 	case ZPOOL_STATUS_CORRUPT_DATA:
-		(void) printf(gettext("status: The pool data is corrupted.\n"));
+		(void) printf(
+		    gettext(" status: The pool data is corrupted.\n"));
 		break;
 
 	case ZPOOL_STATUS_OFFLINE_DEV:
-		(void) printf(gettext("status: One or more devices "
+		(void) printf(gettext(" status: One or more devices "
 		    "are offlined.\n"));
 		break;
 
 	case ZPOOL_STATUS_CORRUPT_POOL:
-		(void) printf(gettext("status: The pool metadata is "
+		(void) printf(gettext(" status: The pool metadata is "
 		    "corrupted.\n"));
 		break;
 
 	case ZPOOL_STATUS_VERSION_OLDER:
-		(void) printf(gettext("status: The pool is formatted using an "
-		    "older on-disk version.\n"));
+		(void) printf(gettext(" status: The pool is formatted using a "
+		    "legacy on-disk version.\n"));
 		break;
 
 	case ZPOOL_STATUS_VERSION_NEWER:
-		(void) printf(gettext("status: The pool is formatted using an "
+		(void) printf(gettext(" status: The pool is formatted using an "
 		    "incompatible version.\n"));
 		break;
 
+	case ZPOOL_STATUS_FEAT_DISABLED:
+		(void) printf(gettext(" status: Some supported features are "
+		    "not enabled on the pool.\n"));
+		break;
+
+	case ZPOOL_STATUS_UNSUP_FEAT_READ:
+		(void) printf(gettext(" status: The pool uses the following "
+		    "feature(s) not supported on this system:\n"));
+		zpool_print_unsup_feat(config);
+		break;
+
+	case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
+		(void) printf(gettext(" status: The pool can only be accessed "
+		    "in read-only mode on this system. It\n\tcannot be "
+		    "accessed in read-write mode because it uses the "
+		    "following\n\tfeature(s) not supported on this system:\n"));
+		zpool_print_unsup_feat(config);
+		break;
+
 	case ZPOOL_STATUS_HOSTID_MISMATCH:
-		(void) printf(gettext("status: The pool was last accessed by "
+		(void) printf(gettext(" status: The pool was last accessed by "
 		    "another system.\n"));
 		break;
 
 	case ZPOOL_STATUS_FAULTED_DEV_R:
 	case ZPOOL_STATUS_FAULTED_DEV_NR:
-		(void) printf(gettext("status: One or more devices are "
+		(void) printf(gettext(" status: One or more devices are "
 		    "faulted.\n"));
 		break;
 
 	case ZPOOL_STATUS_BAD_LOG:
-		(void) printf(gettext("status: An intent log record cannot be "
+		(void) printf(gettext(" status: An intent log record cannot be "
 		    "read.\n"));
 		break;
 
+	case ZPOOL_STATUS_RESILVERING:
+		(void) printf(gettext(" status: One or more devices were being "
+		    "resilvered.\n"));
+		break;
+
+	case ZPOOL_STATUS_NON_NATIVE_ASHIFT:
+		(void) printf(gettext(" status: One or more devices were "
+		    "configured to use a non-native block size.\n"
+		    "\tExpect reduced performance.\n"));
+		break;
+
 	default:
 		/*
 		 * No other status can be seen when importing pools.
@@ -1412,44 +1751,64 @@ show_import(nvlist_t *config)
 	 * Print out an action according to the overall state of the pool.
 	 */
 	if (vs->vs_state == VDEV_STATE_HEALTHY) {
-		if (reason == ZPOOL_STATUS_VERSION_OLDER)
-			(void) printf(gettext("action: The pool can be "
+		if (reason == ZPOOL_STATUS_VERSION_OLDER ||
+		    reason == ZPOOL_STATUS_FEAT_DISABLED) {
+			(void) printf(gettext(" action: The pool can be "
 			    "imported using its name or numeric identifier, "
 			    "though\n\tsome features will not be available "
 			    "without an explicit 'zpool upgrade'.\n"));
-		else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH)
-			(void) printf(gettext("action: The pool can be "
+		} else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) {
+			(void) printf(gettext(" action: The pool can be "
 			    "imported using its name or numeric "
 			    "identifier and\n\tthe '-f' flag.\n"));
-		else
-			(void) printf(gettext("action: The pool can be "
+		} else {
+			(void) printf(gettext(" action: The pool can be "
 			    "imported using its name or numeric "
 			    "identifier.\n"));
+		}
 	} else if (vs->vs_state == VDEV_STATE_DEGRADED) {
-		(void) printf(gettext("action: The pool can be imported "
+		(void) printf(gettext(" action: The pool can be imported "
 		    "despite missing or damaged devices.  The\n\tfault "
 		    "tolerance of the pool may be compromised if imported.\n"));
 	} else {
 		switch (reason) {
 		case ZPOOL_STATUS_VERSION_NEWER:
-			(void) printf(gettext("action: The pool cannot be "
+			(void) printf(gettext(" action: The pool cannot be "
 			    "imported.  Access the pool on a system running "
 			    "newer\n\tsoftware, or recreate the pool from "
 			    "backup.\n"));
 			break;
+		case ZPOOL_STATUS_UNSUP_FEAT_READ:
+			(void) printf(gettext(" action: The pool cannot be "
+			    "imported. Access the pool on a system that "
+			    "supports\n\tthe required feature(s), or recreate "
+			    "the pool from backup.\n"));
+			break;
+		case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
+			(void) printf(gettext(" action: The pool cannot be "
+			    "imported in read-write mode. Import the pool "
+			    "with\n"
+			    "\t\"-o readonly=on\", access the pool on a system "
+			    "that supports the\n\trequired feature(s), or "
+			    "recreate the pool from backup.\n"));
+			break;
 		case ZPOOL_STATUS_MISSING_DEV_R:
 		case ZPOOL_STATUS_MISSING_DEV_NR:
 		case ZPOOL_STATUS_BAD_GUID_SUM:
-			(void) printf(gettext("action: The pool cannot be "
+			(void) printf(gettext(" action: The pool cannot be "
 			    "imported. Attach the missing\n\tdevices and try "
 			    "again.\n"));
 			break;
 		default:
-			(void) printf(gettext("action: The pool cannot be "
+			(void) printf(gettext(" action: The pool cannot be "
 			    "imported due to damaged devices or data.\n"));
 		}
 	}
 
+	/* Print the comment attached to the pool. */
+	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
+		(void) printf(gettext("comment: %s\n"), comment);
+
 	/*
 	 * If the state is "closed" or "can't open", and the aux state
 	 * is "corrupt data":
@@ -1467,10 +1826,10 @@ show_import(nvlist_t *config)
 	}
 
 	if (msgid != NULL)
-		(void) printf(gettext("   see: http://www.sun.com/msg/%s\n"),
+		(void) printf(gettext("   see: http://illumos.org/msg/%s\n"),
 		    msgid);
 
-	(void) printf(gettext("config:\n\n"));
+	(void) printf(gettext(" config:\n\n"));
 
 	namewidth = max_width(NULL, nvroot, 0, 0);
 	if (namewidth < 10)
@@ -1494,7 +1853,7 @@ show_import(nvlist_t *config)
  */
 static int
 do_import(nvlist_t *config, const char *newname, const char *mntopts,
-    int force, nvlist_t *props, boolean_t do_verbatim)
+    nvlist_t *props, int flags)
 {
 	zpool_handle_t *zhp;
 	char *name;
@@ -1508,11 +1867,12 @@ do_import(nvlist_t *config, const char *
 	    ZPOOL_CONFIG_POOL_STATE, &state) == 0);
 	verify(nvlist_lookup_uint64(config,
 	    ZPOOL_CONFIG_VERSION, &version) == 0);
-	if (version > SPA_VERSION) {
+	if (!SPA_VERSION_IS_SUPPORTED(version)) {
 		(void) fprintf(stderr, gettext("cannot import '%s': pool "
-		    "is formatted using a newer ZFS version\n"), name);
+		    "is formatted using an unsupported ZFS version\n"), name);
 		return (1);
-	} else if (state != POOL_STATE_EXPORTED && !force) {
+	} else if (state != POOL_STATE_EXPORTED &&
+	    !(flags & ZFS_IMPORT_ANY_HOST)) {
 		uint64_t hostid;
 
 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID,
@@ -1546,7 +1906,7 @@ do_import(nvlist_t *config, const char *
 		}
 	}
 
-	if (zpool_import_props(g_zfs, config, newname, props, do_verbatim) != 0)
+	if (zpool_import_props(g_zfs, config, newname, props, flags) != 0)
 		return (1);
 
 	if (newname != NULL)
@@ -1556,6 +1916,7 @@ do_import(nvlist_t *config, const char *
 		return (1);
 
 	if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
+	    !(flags & ZFS_IMPORT_ONLY) &&
 	    zpool_enable_datasets(zhp, mntopts, 0) != 0) {
 		zpool_close(zhp);
 		return (1);
@@ -1597,6 +1958,11 @@ do_import(nvlist_t *config, const char *
  *
  *       -n     See if rewind would work, but don't actually rewind.
  *
+ *       -N     Import the pool but don't mount datasets.
+ *
+ *       -T     Specify a starting txg to use for import. This option is
+ *       	intentionally undocumented, for testing purposes only.
+ *
  *       -a	Import all pools found.
  *
  *       -o	Set property=value and/or temporary mount options (without '=').
@@ -1615,7 +1981,6 @@ zpool_do_import(int argc, char **argv)
 	boolean_t do_all = B_FALSE;
 	boolean_t do_destroyed = B_FALSE;
 	char *mntopts = NULL;
-	boolean_t do_force = B_FALSE;
 	nvpair_t *elem;
 	nvlist_t *config;
 	uint64_t searchguid = 0;
@@ -1625,17 +1990,18 @@ zpool_do_import(int argc, char **argv)
 	nvlist_t *policy = NULL;
 	nvlist_t *props = NULL;
 	boolean_t first;
-	boolean_t do_verbatim = B_FALSE;
+	int flags = ZFS_IMPORT_NORMAL;
 	uint32_t rewind_policy = ZPOOL_NO_REWIND;
 	boolean_t dryrun = B_FALSE;
 	boolean_t do_rewind = B_FALSE;
 	boolean_t xtreme_rewind = B_FALSE;
-	uint64_t pool_state;
+	uint64_t pool_state, txg = -1ULL;
 	char *cachefile = NULL;
 	importargs_t idata = { 0 };
+	char *endptr;
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":aCc:d:DEfFno:rR:VX")) != -1) {
+	while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:R:T:VX")) != -1) {
 		switch (c) {
 		case 'a':
 			do_all = B_TRUE;
@@ -1660,14 +2026,20 @@ zpool_do_import(int argc, char **argv)
 			do_destroyed = B_TRUE;
 			break;
 		case 'f':
-			do_force = B_TRUE;
+			flags |= ZFS_IMPORT_ANY_HOST;
 			break;
 		case 'F':
 			do_rewind = B_TRUE;
 			break;
+		case 'm':
+			flags |= ZFS_IMPORT_MISSING_LOG;
+			break;
 		case 'n':
 			dryrun = B_TRUE;
 			break;
+		case 'N':
+			flags |= ZFS_IMPORT_ONLY;
+			break;
 		case 'o':
 			if ((propval = strchr(optarg, '=')) != NULL) {
 				*propval = '\0';
@@ -1691,8 +2063,18 @@ zpool_do_import(int argc, char **argv)
 			    ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE))
 				goto error;
 			break;
+		case 'T':
+			errno = 0;
+			txg = strtoull(optarg, &endptr, 0);
+			if (errno != 0 || *endptr != '\0') {
+				(void) fprintf(stderr,
+				    gettext("invalid txg value\n"));
+				usage(B_FALSE);
+			}
+			rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND;
+			break;
 		case 'V':
-			do_verbatim = B_TRUE;
+			flags |= ZFS_IMPORT_VERBATIM;
 			break;
 		case 'X':
 			xtreme_rewind = B_TRUE;
@@ -1731,12 +2113,13 @@ zpool_do_import(int argc, char **argv)
 
 	/* In the future, we can capture further policy and include it here */
 	if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+	    nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, txg) != 0 ||
 	    nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0)
 		goto error;
 
 	if (searchdirs == NULL) {
 		searchdirs = safe_malloc(sizeof (char *));
-		searchdirs[0] = "/dev/dsk";
+		searchdirs[0] = "/dev";
 		nsearch = 1;
 	}
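The separate force/verbatim booleans are now folded into a single flags word passed through to libzfs. A standalone sketch of that convention; the ZFS_IMPORT_* names match the ones used above, but the numeric values here are assumptions rather than the libzfs.h definitions:

    #include <stdio.h>

    /* Assumed values for illustration; the real constants live in libzfs.h. */
    #define ZFS_IMPORT_NORMAL	0x0
    #define ZFS_IMPORT_VERBATIM	0x1
    #define ZFS_IMPORT_ANY_HOST	0x2
    #define ZFS_IMPORT_MISSING_LOG	0x4
    #define ZFS_IMPORT_ONLY		0x8

    int
    main(void)
    {
    	int flags = ZFS_IMPORT_NORMAL;

    	/* "zpool import -f -N tank" would set these bits: */
    	flags |= ZFS_IMPORT_ANY_HOST;	/* -f: ignore hostid mismatch */
    	flags |= ZFS_IMPORT_ONLY;	/* -N: import without mounting */

    	if (!(flags & ZFS_IMPORT_ONLY))
    		(void) printf("would call zpool_enable_datasets()\n");
    	else
    		(void) printf("datasets left unmounted\n");
    	return (0);
    }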
 
@@ -1784,8 +2167,10 @@ zpool_do_import(int argc, char **argv)
 
 		errno = 0;
 		searchguid = strtoull(argv[0], &endptr, 10);
-		if (errno != 0 || *endptr != '\0')
+		if (errno != 0 || *endptr != '\0') {
 			searchname = argv[0];
+			searchguid = 0;
+		}
 		found_config = NULL;
 
 		/*
@@ -1864,7 +2249,7 @@ zpool_do_import(int argc, char **argv)
 
 			if (do_all) {
 				err |= do_import(config, NULL, mntopts,
-				    do_force, props, do_verbatim);
+				    props, flags);
 			} else {
 				show_import(config);
 			}
@@ -1913,7 +2298,7 @@ zpool_do_import(int argc, char **argv)
 			err = B_TRUE;
 		} else {
 			err |= do_import(found_config, argc == 1 ? NULL :
-			    argv[1], mntopts, do_force, props, do_verbatim);
+			    argv[1], mntopts, props, flags);
 		}
 	}
 
@@ -1935,10 +2320,10 @@ error:
 }
 
 typedef struct iostat_cbdata {
-	zpool_list_t *cb_list;
-	int cb_verbose;
-	int cb_iteration;
+	boolean_t cb_verbose;
 	int cb_namewidth;
+	int cb_iteration;
+	zpool_list_t *cb_list;
 } iostat_cbdata_t;
 
 static void
@@ -1991,13 +2376,13 @@ print_vdev_stats(zpool_handle_t *zhp, co
 	char *vname;
 
 	if (oldnv != NULL) {
-		verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_STATS,
-		    (uint64_t **)&oldvs, &c) == 0);
+		verify(nvlist_lookup_uint64_array(oldnv,
+		    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0);
 	} else {
 		oldvs = &zerovs;
 	}
 
-	verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_STATS,
+	verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS,
 	    (uint64_t **)&newvs, &c) == 0);
 
 	if (strlen(name) + depth > cb->cb_namewidth)
@@ -2047,6 +2432,17 @@ print_vdev_stats(zpool_handle_t *zhp, co
 		return;
 
 	for (c = 0; c < children; c++) {
+		uint64_t ishole = B_FALSE, islog = B_FALSE;
+
+		(void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE,
+		    &ishole);
+
+		(void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG,
+		    &islog);
+
+		if (ishole || islog)
+			continue;
+
 		vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE);
 		print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
 		    newchild[c], cb, depth + 2);
@@ -2054,6 +2450,31 @@ print_vdev_stats(zpool_handle_t *zhp, co
 	}
 
 	/*
+	 * Log device section
+	 */
+
+	if (num_logs(newnv) > 0) {
+		(void) printf("%-*s      -      -      -      -      -      "
+		    "-\n", cb->cb_namewidth, "logs");
+
+		for (c = 0; c < children; c++) {
+			uint64_t islog = B_FALSE;
+			(void) nvlist_lookup_uint64(newchild[c],
+			    ZPOOL_CONFIG_IS_LOG, &islog);
+
+			if (islog) {
+				vname = zpool_vdev_name(g_zfs, zhp, newchild[c],
+				    B_FALSE);
+				print_vdev_stats(zhp, vname, oldnv ?
+				    oldchild[c] : NULL, newchild[c],
+				    cb, depth + 2);
+				free(vname);
+			}
+		}
+	}
+
+	/*
 	 * Include level 2 ARC devices in iostat output
 	 */
 	if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE,
@@ -2142,7 +2563,8 @@ get_namewidth(zpool_handle_t *zhp, void 
 		if (!cb->cb_verbose)
 			cb->cb_namewidth = strlen(zpool_get_name(zhp));
 		else
-			cb->cb_namewidth = max_width(zhp, nvroot, 0, 0);
+			cb->cb_namewidth = max_width(zhp, nvroot, 0,
+			    cb->cb_namewidth);
 	}
 
 	/*
@@ -2158,55 +2580,14 @@ get_namewidth(zpool_handle_t *zhp, void 
 }
 
 /*
- * zpool iostat [-T d|u] [-v] [pool] ... [interval [count]]
- *
- *	-T	Display a timestamp in date(1) or Unix format
- *	-v	Display statistics for individual vdevs
- *
- * This command can be tricky because we want to be able to deal with pool
- * creation/destruction as well as vdev configuration changes.  The bulk of this
- * processing is handled by the pool_list_* routines in zpool_iter.c.  We rely
- * on pool_list_update() to detect the addition of new pools.  Configuration
- * changes are all handled within libzfs.
+ * Parse the input string, get the 'interval' and 'count' value if there is one.
  */
-int
-zpool_do_iostat(int argc, char **argv)
+static void
+get_interval_count(int *argcp, char **argv, unsigned long *iv,
+    unsigned long *cnt)
 {
-	int c;
-	int ret;
-	int npools;
 	unsigned long interval = 0, count = 0;
-	zpool_list_t *list;
-	boolean_t verbose = B_FALSE;
-	iostat_cbdata_t cb;
-
-	/* check options */
-	while ((c = getopt(argc, argv, "T:v")) != -1) {
-		switch (c) {
-		case 'T':
-			if (optarg) {
-				if (*optarg == 'u')
-					timestamp_fmt = UDATE;
-				else if (*optarg == 'd')
-					timestamp_fmt = DDATE;
-				else
-					usage(B_FALSE);
-			} else {
-				usage(B_FALSE);
-			}
-			break;
-		case 'v':
-			verbose = B_TRUE;
-			break;
-		case '?':
-			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
-			    optopt);
-			usage(B_FALSE);
-		}
-	}
-
-	argc -= optind;
-	argv += optind;
+	int argc = *argcp;
 
 	/*
 	 * Determine if the last argument is an integer or a pool name
@@ -2223,7 +2604,6 @@ zpool_do_iostat(int argc, char **argv)
 				    "cannot be zero\n"));
 				usage(B_FALSE);
 			}
-
 			/*
 			 * Ignore the last parameter
 			 */
@@ -2240,7 +2620,7 @@ zpool_do_iostat(int argc, char **argv)
 
 	/*
 	 * If the last argument is also an integer, then we have both a count
-	 * and an integer.
+	 * and an interval.
 	 */
 	if (argc > 0 && isdigit(argv[argc - 1][0])) {
 		char *end;
@@ -2265,23 +2645,83 @@ zpool_do_iostat(int argc, char **argv)
 		}
 	}
 
-	/*
-	 * Construct the list of all interesting pools.
-	 */
-	ret = 0;
-	if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL)
-		return (1);
-
-	if (pool_list_count(list) == 0 && argc != 0) {
-		pool_list_free(list);
-		return (1);
-	}
+	*iv = interval;
+	*cnt = count;
+	*argcp = argc;
+}
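get_interval_count() factors out the trailing "[interval [count]]" convention so that iostat, list, and status can share it. The following is a simplified standalone sketch of that tail-parsing idea, not the function itself (which additionally rejects zero and malformed values via usage()):

    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Simplified sketch of the trailing "[interval [count]]" convention. */
    static void
    parse_tail(int *argcp, char **argv, unsigned long *iv, unsigned long *cnt)
    {
    	char *end;
    	unsigned long v;

    	/* Peel numeric arguments off the tail, last one first. */
    	while (*argcp > 0 && isdigit((unsigned char)argv[*argcp - 1][0])) {
    		v = strtoul(argv[*argcp - 1], &end, 10);
    		if (*end != '\0' || v == 0)
    			break;
    		*cnt = *iv;	/* an earlier tail value becomes the count */
    		*iv = v;
    		(*argcp)--;
    	}
    }

    int
    main(void)
    {
    	char *argv[] = { "tank", "5", "10" };
    	int argc = 3;
    	unsigned long interval = 0, count = 0;

    	parse_tail(&argc, argv, &interval, &count);
    	/* Prints: argc=1 interval=5 count=10 */
    	(void) printf("argc=%d interval=%lu count=%lu\n",
    	    argc, interval, count);
    	return (0);
    }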
 
-	if (pool_list_count(list) == 0 && interval == 0) {
-		pool_list_free(list);
-		(void) fprintf(stderr, gettext("no pools available\n"));
-		return (1);
-	}
+static void
+get_timestamp_arg(char c)
+{
+	if (c == 'u')
+		timestamp_fmt = UDATE;
+	else if (c == 'd')
+		timestamp_fmt = DDATE;
+	else
+		usage(B_FALSE);
+}
+
+/*
+ * zpool iostat [-v] [-T d|u] [pool] ... [interval [count]]
+ *
+ *	-v	Display statistics for individual vdevs
+ *	-T	Display a timestamp in date(1) or Unix format
+ *
+ * This command can be tricky because we want to be able to deal with pool
+ * creation/destruction as well as vdev configuration changes.  The bulk of this
+ * processing is handled by the pool_list_* routines in zpool_iter.c.  We rely
+ * on pool_list_update() to detect the addition of new pools.  Configuration
+ * changes are all handled within libzfs.
+ */
+int
+zpool_do_iostat(int argc, char **argv)
+{
+	int c;
+	int ret;
+	int npools;
+	unsigned long interval = 0, count = 0;
+	zpool_list_t *list;
+	boolean_t verbose = B_FALSE;
+	iostat_cbdata_t cb;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "T:v")) != -1) {
+		switch (c) {
+		case 'T':
+			get_timestamp_arg(*optarg);
+			break;
+		case 'v':
+			verbose = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	get_interval_count(&argc, argv, &interval, &count);
+
+	/*
+	 * Construct the list of all interesting pools.
+	 */
+	ret = 0;
+	if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL)
+		return (1);
+
+	if (pool_list_count(list) == 0 && argc != 0) {
+		pool_list_free(list);
+		return (1);
+	}
+
+	if (pool_list_count(list) == 0 && interval == 0) {
+		pool_list_free(list);
+		(void) fprintf(stderr, gettext("no pools available\n"));
+		return (1);
+	}
 
 	/*
 	 * Enter the main iostat loop.
@@ -2353,39 +2793,61 @@ zpool_do_iostat(int argc, char **argv)
 }
 
 typedef struct list_cbdata {
+	boolean_t	cb_verbose;
+	int		cb_namewidth;
 	boolean_t	cb_scripted;
-	boolean_t	cb_first;
 	zprop_list_t	*cb_proplist;
+	boolean_t	cb_literal;
 } list_cbdata_t;
 
 /*
  * Given a list of columns to display, output appropriate headers for each one.
  */
 static void
-print_header(zprop_list_t *pl)
+print_header(list_cbdata_t *cb)
 {
+	zprop_list_t *pl = cb->cb_proplist;
+	char headerbuf[ZPOOL_MAXPROPLEN];
 	const char *header;
 	boolean_t first = B_TRUE;
 	boolean_t right_justify;
+	size_t width = 0;
 
 	for (; pl != NULL; pl = pl->pl_next) {
-		if (pl->pl_prop == ZPROP_INVAL)
-			continue;
+		width = pl->pl_width;
+		if (first && cb->cb_verbose) {
+			/*
+			 * Reset the width to accommodate the verbose listing
+			 * of devices.
+			 */
+			width = cb->cb_namewidth;
+		}
 
 		if (!first)
 			(void) printf("  ");
 		else
 			first = B_FALSE;
 
-		header = zpool_prop_column_name(pl->pl_prop);
-		right_justify = zpool_prop_align_right(pl->pl_prop);
+		right_justify = B_FALSE;
+		if (pl->pl_prop != ZPROP_INVAL) {
+			header = zpool_prop_column_name(pl->pl_prop);
+			right_justify = zpool_prop_align_right(pl->pl_prop);
+		} else {
+			int i;
+
+			for (i = 0; pl->pl_user_prop[i] != '\0'; i++)
+				headerbuf[i] = toupper(pl->pl_user_prop[i]);
+			headerbuf[i] = '\0';
+			header = headerbuf;
+		}
 
 		if (pl->pl_next == NULL && !right_justify)
 			(void) printf("%s", header);
 		else if (right_justify)
-			(void) printf("%*s", pl->pl_width, header);
+			(void) printf("%*s", width, header);
 		else
-			(void) printf("%-*s", pl->pl_width, header);
+			(void) printf("%-*s", width, header);
 	}
 
 	(void) printf("\n");
@@ -2396,17 +2858,28 @@ print_header(zprop_list_t *pl)
  * to the described layout.
  */
 static void
-print_pool(zpool_handle_t *zhp, zprop_list_t *pl, int scripted)
+print_pool(zpool_handle_t *zhp, list_cbdata_t *cb)
 {
+	zprop_list_t *pl = cb->cb_proplist;
 	boolean_t first = B_TRUE;
 	char property[ZPOOL_MAXPROPLEN];
 	char *propstr;
 	boolean_t right_justify;
-	int width;
+	size_t width;
 
 	for (; pl != NULL; pl = pl->pl_next) {
+
+		width = pl->pl_width;
+		if (first && cb->cb_verbose) {
+			/*
+			 * Reset the width to accommodate the verbose listing
+			 * of devices.
+			 */
+			width = cb->cb_namewidth;
+		}
+
 		if (!first) {
-			if (scripted)
+			if (cb->cb_scripted)
 				(void) printf("\t");
 			else
 				(void) printf("  ");
@@ -2417,24 +2890,28 @@ print_pool(zpool_handle_t *zhp, zprop_li
 		right_justify = B_FALSE;
 		if (pl->pl_prop != ZPROP_INVAL) {
 			if (zpool_get_prop(zhp, pl->pl_prop, property,
-			    sizeof (property), NULL) != 0)
+			    sizeof (property), NULL, cb->cb_literal) != 0)
 				propstr = "-";
 			else
 				propstr = property;
 
 			right_justify = zpool_prop_align_right(pl->pl_prop);
+		} else if ((zpool_prop_feature(pl->pl_user_prop) ||
+		    zpool_prop_unsupported(pl->pl_user_prop)) &&
+		    zpool_prop_get_feature(zhp, pl->pl_user_prop, property,
+		    sizeof (property)) == 0) {
+			propstr = property;
 		} else {
 			propstr = "-";
 		}
 
-		width = pl->pl_width;
 
 		/*
 		 * If this is being called in scripted mode, or if this is the
 		 * last column and it is left-justified, don't include a width
 		 * format specifier.
 		 */
-		if (scripted || (pl->pl_next == NULL && !right_justify))
+		if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify))
 			(void) printf("%s", propstr);
 		else if (right_justify)
 			(void) printf("%*s", width, propstr);
@@ -2445,6 +2922,155 @@ print_pool(zpool_handle_t *zhp, zprop_li
 	(void) printf("\n");
 }
 
+static void
+print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted,
+    boolean_t valid)
+{
+	char propval[64];
+	boolean_t fixed;
+	size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL);
+
+	switch (prop) {
+	case ZPOOL_PROP_EXPANDSZ:
+		if (value == 0)
+			(void) strlcpy(propval, "-", sizeof (propval));
+		else
+			zfs_nicenum(value, propval, sizeof (propval));
+		break;
+	case ZPOOL_PROP_FRAGMENTATION:
+		if (value == ZFS_FRAG_INVALID) {
+			(void) strlcpy(propval, "-", sizeof (propval));
+		} else {
+			(void) snprintf(propval, sizeof (propval), "%llu%%",
+			    value);
+		}
+		break;
+	case ZPOOL_PROP_CAPACITY:
+		(void) snprintf(propval, sizeof (propval), "%llu%%", value);
+		break;
+	default:
+		zfs_nicenum(value, propval, sizeof (propval));
+	}
+
+	if (!valid)
+		(void) strlcpy(propval, "-", sizeof (propval));
+
+	if (scripted)
+		(void) printf("\t%s", propval);
+	else
+		(void) printf("  %*s", width, propval);
+}
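print_one_column() leans on zfs_nicenum() from libzfs for the byte-count columns and prints CAPACITY and FRAGMENTATION as literal percentages. For illustration, a rough standalone approximation of that humanized formatting; nicenum() below is a local helper, not libzfs's zfs_nicenum(), which covers more cases:

    #include <stdio.h>
    #include <stdint.h>

    /* Rough approximation of zfs_nicenum()-style humanization. */
    static void
    nicenum(uint64_t num, char *buf, size_t buflen)
    {
    	static const char units[] = "\0KMGTPE";
    	uint64_t n = num;
    	int i = 0;

    	while (n >= 1024 && units[i + 1] != '\0') {
    		n /= 1024;
    		i++;
    	}
    	if (i == 0)
    		(void) snprintf(buf, buflen, "%llu",
    		    (unsigned long long)num);
    	else if ((num & ((1ULL << (10 * i)) - 1)) == 0)
    		(void) snprintf(buf, buflen, "%llu%c",
    		    (unsigned long long)n, units[i]);
    	else
    		(void) snprintf(buf, buflen, "%.2f%c",
    		    (double)num / (1ULL << (10 * i)), units[i]);
    }

    int
    main(void)
    {
    	char buf[64];

    	nicenum(1536ULL * 1024 * 1024, buf, sizeof (buf));
    	(void) printf("%s\n", buf);	/* prints 1.50G */
    	return (0);
    }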
+
+void
+print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
+    list_cbdata_t *cb, int depth)
+{
+	nvlist_t **child;
+	vdev_stat_t *vs;
+	uint_t c, children;
+	char *vname;
+	boolean_t scripted = cb->cb_scripted;
+	uint64_t islog = B_FALSE;
+	boolean_t haslog = B_FALSE;
+	char *dashes = "%-*s      -      -      -         -      -      -\n";
+
+	verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) == 0);
+
+	if (name != NULL) {
+		boolean_t toplevel = (vs->vs_space != 0);
+		uint64_t cap;
+
+		if (scripted)
+			(void) printf("\t%s", name);
+		else if (strlen(name) + depth > cb->cb_namewidth)
+			(void) printf("%*s%s", depth, "", name);
+		else
+			(void) printf("%*s%s%*s", depth, "", name,
+			    (int)(cb->cb_namewidth - strlen(name) - depth), "");
+
+		/*
+		 * Print the properties for the individual vdevs. Some
+		 * properties are only applicable to toplevel vdevs. The
+		 * 'toplevel' boolean value is passed to the print_one_column()
+		 * to indicate that the value is valid.
+		 */
+		print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, scripted,
+		    toplevel);
+		print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, scripted,
+		    toplevel);
+		print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc,
+		    scripted, toplevel);
+		print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, scripted,
+		    B_TRUE);
+		print_one_column(ZPOOL_PROP_FRAGMENTATION,
+		    vs->vs_fragmentation, scripted,
+		    (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel));
+		cap = (vs->vs_space == 0) ? 0 :
+		    (vs->vs_alloc * 100 / vs->vs_space);
+		print_one_column(ZPOOL_PROP_CAPACITY, cap, scripted, toplevel);
+		(void) printf("\n");
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		return;
+
+	for (c = 0; c < children; c++) {
+		uint64_t ishole = B_FALSE;
+
+		if (nvlist_lookup_uint64(child[c],
+		    ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole)
+			continue;
+
+		if (nvlist_lookup_uint64(child[c],
+		    ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) {
+			haslog = B_TRUE;
+			continue;
+		}
+
+		vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
+		print_list_stats(zhp, vname, child[c], cb, depth + 2);
+		free(vname);
+	}
+
+	if (haslog == B_TRUE) {
+		/* LINTED E_SEC_PRINTF_VAR_FMT */
+		(void) printf(dashes, cb->cb_namewidth, "log");
+		for (c = 0; c < children; c++) {
+			if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+			    &islog) != 0 || !islog)
+				continue;
+			vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
+			print_list_stats(zhp, vname, child[c], cb, depth + 2);
+			free(vname);
+		}
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+	    &child, &children) == 0 && children > 0) {
+		/* LINTED E_SEC_PRINTF_VAR_FMT */
+		(void) printf(dashes, cb->cb_namewidth, "cache");
+		for (c = 0; c < children; c++) {
+			vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
+			print_list_stats(zhp, vname, child[c], cb, depth + 2);
+			free(vname);
+		}
+	}
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child,
+	    &children) == 0 && children > 0) {
+		/* LINTED E_SEC_PRINTF_VAR_FMT */
+		(void) printf(dashes, cb->cb_namewidth, "spare");
+		for (c = 0; c < children; c++) {
+			vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE);
+			print_list_stats(zhp, vname, child[c], cb, depth + 2);
+			free(vname);
+		}
+	}
+}
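Each auxiliary vdev group (log, cache, spare) is introduced by a dashes row built from the variable-width format string above. A tiny standalone demonstration of how %-*s pads the group name to the computed name column (a namewidth of 12 is a made-up example):

    #include <stdio.h>

    int
    main(void)
    {
    	const char *dashes =
    	    "%-*s      -      -      -         -      -      -\n";
    	int namewidth = 12;

    	/* The group name is left-justified and padded to namewidth. */
    	/* (Variable format string, hence the LINTED comment above.) */
    	(void) printf(dashes, namewidth, "log");
    	(void) printf(dashes, namewidth, "cache");
    	(void) printf(dashes, namewidth, "spare");
    	return (0);
    }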
+
 /*
  * Generic callback function to list a pool.
  */
@@ -2452,25 +3078,32 @@ int
 list_callback(zpool_handle_t *zhp, void *data)
 {
 	list_cbdata_t *cbp = data;
+	nvlist_t *config;
+	nvlist_t *nvroot;
 
-	if (cbp->cb_first) {
-		if (!cbp->cb_scripted)
-			print_header(cbp->cb_proplist);
-		cbp->cb_first = B_FALSE;
-	}
+	config = zpool_get_config(zhp, NULL);
+
+	print_pool(zhp, cbp);
+	if (!cbp->cb_verbose)
+		return (0);
 
-	print_pool(zhp, cbp->cb_proplist, cbp->cb_scripted);
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+	print_list_stats(zhp, NULL, nvroot, cbp, 0);
 
 	return (0);
 }
 
 /*
- * zpool list [-H] [-o prop[,prop]*] [pool] ...
+ * zpool list [-Hp] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]]
  *
  *	-H	Scripted mode.  Don't display headers, and separate properties
  *		by a single tab.
  *	-o	List of properties to display.  Defaults to
- *		"name,size,allocated,free,capacity,health,altroot"
+ *		"name,size,allocated,free,expandsize,fragmentation,capacity,"
+ *		"dedupratio,health,altroot"
+ * 	-p	Display values in parsable (exact) format.
+ *	-T	Display a timestamp in date(1) or Unix format
  *
  * List all pools in the system, whether or not they're healthy.  Output space
  * statistics for each one, as well as health status summary.
@@ -2482,11 +3115,15 @@ zpool_do_list(int argc, char **argv)
 	int ret;
 	list_cbdata_t cb = { 0 };
 	static char default_props[] =
-	    "name,size,allocated,free,capacity,dedupratio,health,altroot";
+	    "name,size,allocated,free,expandsize,fragmentation,capacity,"
+	    "dedupratio,health,altroot";
 	char *props = default_props;
+	unsigned long interval = 0, count = 0;
+	zpool_list_t *list;
+	boolean_t first = B_TRUE;
 
 	/* check options */
-	while ((c = getopt(argc, argv, ":Ho:")) != -1) {
+	while ((c = getopt(argc, argv, ":Ho:pT:v")) != -1) {
 		switch (c) {
 		case 'H':
 			cb.cb_scripted = B_TRUE;
@@ -2494,6 +3131,15 @@ zpool_do_list(int argc, char **argv)
 		case 'o':
 			props = optarg;
 			break;
+		case 'p':
+			cb.cb_literal = B_TRUE;
+			break;
+		case 'T':
+			get_timestamp_arg(*optarg);
+			break;
+		case 'v':
+			cb.cb_verbose = B_TRUE;
+			break;
 		case ':':
 			(void) fprintf(stderr, gettext("missing argument for "
 			    "'%c' option\n"), optopt);
@@ -2509,49 +3155,49 @@ zpool_do_list(int argc, char **argv)
 	argc -= optind;
 	argv += optind;
 
+	get_interval_count(&argc, argv, &interval, &count);
+
 	if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0)
 		usage(B_FALSE);
 
-	cb.cb_first = B_TRUE;
+	for (;;) {
+		if ((list = pool_list_get(argc, argv, &cb.cb_proplist,
+		    &ret)) == NULL)
+			return (1);
 
-	ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist,
-	    list_callback, &cb);
+		if (pool_list_count(list) == 0)
+			break;
 
-	zprop_free_list(cb.cb_proplist);
+		cb.cb_namewidth = 0;
+		(void) pool_list_iter(list, B_FALSE, get_namewidth, &cb);
 
-	if (argc == 0 && cb.cb_first && !cb.cb_scripted) {
-		(void) printf(gettext("no pools available\n"));
-		return (0);
-	}
+		if (timestamp_fmt != NODATE)
+			print_timestamp(timestamp_fmt);
 
-	return (ret);
-}
+		if (!cb.cb_scripted && (first || cb.cb_verbose)) {
+			print_header(&cb);
+			first = B_FALSE;
+		}
+		ret = pool_list_iter(list, B_TRUE, list_callback, &cb);
 
-static nvlist_t *
-zpool_get_vdev_by_name(nvlist_t *nv, char *name)
-{
-	nvlist_t **child;
-	uint_t c, children;
-	nvlist_t *match;
-	char *path;
+		if (interval == 0)
+			break;
 
-	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
-	    &child, &children) != 0) {
-		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
-		if (strncmp(name, "/dev/dsk/", 9) == 0)
-			name += 9;
-		if (strncmp(path, "/dev/dsk/", 9) == 0)
-			path += 9;
-		if (strcmp(name, path) == 0)
-			return (nv);
-		return (NULL);
-	}
-
-	for (c = 0; c < children; c++)
-		if ((match = zpool_get_vdev_by_name(child[c], name)) != NULL)
-			return (match);
+		if (count != 0 && --count == 0)
+			break;
+
+		pool_list_free(list);
+		(void) sleep(interval);
+	}
+
+	if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) {
+		(void) printf(gettext("no pools available\n"));
+		ret = 0;
+	}
 
-	return (NULL);
+	pool_list_free(list);
+	zprop_free_list(cb.cb_proplist);
+	return (ret);
 }
 
 static int
@@ -2769,8 +3415,7 @@ zpool_do_split(int argc, char **argv)
 			if (add_prop_list(
 			    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg,
 			    &props, B_TRUE) != 0) {
-				if (props)
-					nvlist_free(props);
+				nvlist_free(props);
 				usage(B_FALSE);
 			}
 			break;
@@ -2783,8 +3428,7 @@ zpool_do_split(int argc, char **argv)
 				propval++;
 				if (add_prop_list(optarg, propval,
 				    &props, B_TRUE) != 0) {
-					if (props)
-						nvlist_free(props);
+					nvlist_free(props);
 					usage(B_FALSE);
 				}
 			} else {
@@ -2857,7 +3501,7 @@ zpool_do_split(int argc, char **argv)
 	if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
 	    zpool_enable_datasets(zhp, mntopts, 0) != 0) {
 		ret = 1;
-		(void) fprintf(stderr, gettext("Split was succssful, but "
+		(void) fprintf(stderr, gettext("Split was successful, but "
 		    "the datasets could not all be mounted\n"));
 		(void) fprintf(stderr, gettext("Try doing '%s' with a "
 		    "different altroot\n"), "zpool import");
@@ -3086,51 +3730,20 @@ zpool_do_clear(int argc, char **argv)
 	return (ret);
 }
 
-typedef struct scrub_cbdata {
-	int	cb_type;
-	int	cb_argc;
-	char	**cb_argv;
-} scrub_cbdata_t;
-
-int
-scrub_callback(zpool_handle_t *zhp, void *data)
-{
-	scrub_cbdata_t *cb = data;
-	int err;
-
-	/*
-	 * Ignore faulted pools.
-	 */
-	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
-		(void) fprintf(stderr, gettext("cannot scrub '%s': pool is "
-		    "currently unavailable\n"), zpool_get_name(zhp));
-		return (1);
-	}
-
-	err = zpool_scrub(zhp, cb->cb_type);
-
-	return (err != 0);
-}
-
 /*
- * zpool scrub [-s] <pool> ...
- *
- *	-s	Stop.  Stops any in-progress scrub.
+ * zpool reguid <pool>
  */
 int
-zpool_do_scrub(int argc, char **argv)
+zpool_do_reguid(int argc, char **argv)
 {
 	int c;
-	scrub_cbdata_t cb;
-
-	cb.cb_type = POOL_SCRUB_EVERYTHING;
+	char *poolname;
+	zpool_handle_t *zhp;
+	int ret = 0;
 
 	/* check options */
-	while ((c = getopt(argc, argv, "s")) != -1) {
+	while ((c = getopt(argc, argv, "")) != -1) {
 		switch (c) {
-		case 's':
-			cb.cb_type = POOL_SCRUB_NONE;
-			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
@@ -3138,24 +3751,146 @@ zpool_do_scrub(int argc, char **argv)
 		}
 	}
 
-	cb.cb_argc = argc;
-	cb.cb_argv = argv;
 	argc -= optind;
 	argv += optind;
 
+	/* get pool name and check number of arguments */
 	if (argc < 1) {
-		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		(void) fprintf(stderr, gettext("missing pool name\n"));
 		usage(B_FALSE);
 	}
 
-	return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
-}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
 
-typedef struct status_cbdata {
-	int		cb_count;
-	boolean_t	cb_allpools;
-	boolean_t	cb_verbose;
-	boolean_t	cb_explain;
+	poolname = argv[0];
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+		return (1);
+
+	ret = zpool_reguid(zhp);
+
+	zpool_close(zhp);
+	return (ret);
+}
+
+
+/*
+ * zpool reopen <pool>
+ *
+ * Reopen the pool so that the kernel can update the sizes of all vdevs.
+ */
+int
+zpool_do_reopen(int argc, char **argv)
+{
+	int c;
+	int ret = 0;
+	zpool_handle_t *zhp;
+	char *pool;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "")) != -1) {
+		switch (c) {
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc--;
+	argv++;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	pool = argv[0];
+	if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL)
+		return (1);
+
+	ret = zpool_reopen(zhp);
+	zpool_close(zhp);
+	return (ret);
+}
+
+typedef struct scrub_cbdata {
+	int	cb_type;
+	int	cb_argc;
+	char	**cb_argv;
+} scrub_cbdata_t;
+
+int
+scrub_callback(zpool_handle_t *zhp, void *data)
+{
+	scrub_cbdata_t *cb = data;
+	int err;
+
+	/*
+	 * Ignore faulted pools.
+	 */
+	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+		(void) fprintf(stderr, gettext("cannot scrub '%s': pool is "
+		    "currently unavailable\n"), zpool_get_name(zhp));
+		return (1);
+	}
+
+	err = zpool_scan(zhp, cb->cb_type);
+
+	return (err != 0);
+}
+
+/*
+ * zpool scrub [-s] <pool> ...
+ *
+ *	-s	Stop.  Stops any in-progress scrub.
+ */
+int
+zpool_do_scrub(int argc, char **argv)
+{
+	int c;
+	scrub_cbdata_t cb;
+
+	cb.cb_type = POOL_SCAN_SCRUB;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "s")) != -1) {
+		switch (c) {
+		case 's':
+			cb.cb_type = POOL_SCAN_NONE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	cb.cb_argc = argc;
+	cb.cb_argv = argv;
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+
+	return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
+}
+
+typedef struct status_cbdata {
+	int		cb_count;
+	boolean_t	cb_allpools;
+	boolean_t	cb_verbose;
+	boolean_t	cb_explain;
 	boolean_t	cb_first;
 	boolean_t	cb_dedup_stats;
 } status_cbdata_t;
@@ -3164,62 +3899,112 @@ typedef struct status_cbdata {
  * Print out detailed scrub status.
  */
 void
-print_scrub_status(nvlist_t *nvroot)
+print_scan_status(pool_scan_stat_t *ps)
 {
-	vdev_stat_t *vs;
-	uint_t vsc;
-	time_t start, end, now;
+	time_t start, end;
+	uint64_t elapsed, mins_left, hours_left;
+	uint64_t pass_exam, examined, total;
+	uint_t rate;
 	double fraction_done;
-	uint64_t examined, total, minutes_left, minutes_taken;
-	char *scrub_type;
+	char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
 
-	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
-	    (uint64_t **)&vs, &vsc) == 0);
+	(void) printf(gettext("  scan: "));
 
-	/*
-	 * If there's never been a scrub, there's not much to say.
-	 */
-	if (vs->vs_scrub_end == 0 && vs->vs_scrub_type == POOL_SCRUB_NONE) {
+	/* If there's never been a scan, there's not much to say. */
+	if (ps == NULL || ps->pss_func == POOL_SCAN_NONE ||
+	    ps->pss_func >= POOL_SCAN_FUNCS) {
 		(void) printf(gettext("none requested\n"));
 		return;
 	}
 
-	scrub_type = (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
-	    "resilver" : "scrub";
+	start = ps->pss_start_time;
+	end = ps->pss_end_time;
+	zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf));
 
-	start = vs->vs_scrub_start;
-	end = vs->vs_scrub_end;
-	now = time(NULL);
-	examined = vs->vs_scrub_examined;
-	total = vs->vs_alloc;
-
-	if (end != 0) {
-		minutes_taken = (uint64_t)((end - start) / 60);
-
-		(void) printf(gettext("%s %s after %lluh%um with %llu errors "
-		    "on %s"),
-		    scrub_type, vs->vs_scrub_complete ? "completed" : "stopped",
+	assert(ps->pss_func == POOL_SCAN_SCRUB ||
+	    ps->pss_func == POOL_SCAN_RESILVER);
+	/*
+	 * Scan is finished or canceled.
+	 */
+	if (ps->pss_state == DSS_FINISHED) {
+		uint64_t minutes_taken = (end - start) / 60;
+		char *fmt = NULL;
+
+		if (ps->pss_func == POOL_SCAN_SCRUB) {
+			fmt = gettext("scrub repaired %s in %lluh%um with "
+			    "%llu errors on %s");
+		} else if (ps->pss_func == POOL_SCAN_RESILVER) {
+			fmt = gettext("resilvered %s in %lluh%um with "
+			    "%llu errors on %s");
+		}
+		/* LINTED */
+		(void) printf(fmt, processed_buf,
 		    (u_longlong_t)(minutes_taken / 60),
 		    (uint_t)(minutes_taken % 60),
-		    (u_longlong_t)vs->vs_scrub_errors, ctime(&end));
+		    (u_longlong_t)ps->pss_errors,
+		    ctime((time_t *)&end));
+		return;
+	} else if (ps->pss_state == DSS_CANCELED) {
+		if (ps->pss_func == POOL_SCAN_SCRUB) {
+			(void) printf(gettext("scrub canceled on %s"),
+			    ctime(&end));
+		} else if (ps->pss_func == POOL_SCAN_RESILVER) {
+			(void) printf(gettext("resilver canceled on %s"),
+			    ctime(&end));
+		}
 		return;
 	}
 
-	if (examined == 0)
-		examined = 1;
-	if (examined > total)
-		total = examined;
+	assert(ps->pss_state == DSS_SCANNING);
+
+	/*
+	 * Scan is in progress.
+	 */
+	if (ps->pss_func == POOL_SCAN_SCRUB) {
+		(void) printf(gettext("scrub in progress since %s"),
+		    ctime(&start));
+	} else if (ps->pss_func == POOL_SCAN_RESILVER) {
+		(void) printf(gettext("resilver in progress since %s"),
+		    ctime(&start));
+	}
 
+	examined = ps->pss_examined ? ps->pss_examined : 1;
+	total = ps->pss_to_examine;
 	fraction_done = (double)examined / total;
-	minutes_left = (uint64_t)((now - start) *
-	    (1 - fraction_done) / fraction_done / 60);
-	minutes_taken = (uint64_t)((now - start) / 60);
-
-	(void) printf(gettext("%s in progress for %lluh%um, %.2f%% done, "
-	    "%lluh%um to go\n"),
-	    scrub_type, (u_longlong_t)(minutes_taken / 60),
-	    (uint_t)(minutes_taken % 60), 100 * fraction_done,
-	    (u_longlong_t)(minutes_left / 60), (uint_t)(minutes_left % 60));
+
+	/* elapsed time for this pass */
+	elapsed = time(NULL) - ps->pss_pass_start;
+	elapsed = elapsed ? elapsed : 1;
+	pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
+	rate = pass_exam / elapsed;
+	rate = rate ? rate : 1;
+	mins_left = ((total - examined) / rate) / 60;
+	hours_left = mins_left / 60;
+
+	zfs_nicenum(examined, examined_buf, sizeof (examined_buf));
+	zfs_nicenum(total, total_buf, sizeof (total_buf));
+	zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
+
+	/*
+	 * do not print estimated time if hours_left is more than 30 days
+	 */
+	(void) printf(gettext("        %s scanned out of %s at %s/s"),
+	    examined_buf, total_buf, rate_buf);
+	if (hours_left < (30 * 24)) {
+		(void) printf(gettext(", %lluh%um to go\n"),
+		    (u_longlong_t)hours_left, (uint_t)(mins_left % 60));
+	} else {
+		(void) printf(gettext(
+		    ", (scan is slow, no estimated time)\n"));
+	}
+
+	if (ps->pss_func == POOL_SCAN_RESILVER) {
+		(void) printf(gettext("        %s resilvered, %.2f%% done\n"),
+		    processed_buf, 100 * fraction_done);
+	} else if (ps->pss_func == POOL_SCAN_SCRUB) {
+		(void) printf(gettext("        %s repaired, %.2f%% done\n"),
+		    processed_buf, 100 * fraction_done);
+	}
 }
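The time-remaining estimate is plain rate extrapolation over the current pass, with guards so a zero elapsed time, zero pass progress, or zero rate cannot divide by zero. A worked standalone example using assumed figures (1 TiB to examine, 300 GiB done after two hours):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
    	/* Assumed figures: 1 TiB to examine, 300 GiB done in 2 hours. */
    	uint64_t total = 1ULL << 40;
    	uint64_t examined = 300ULL << 30;
    	uint64_t pass_exam = examined;
    	uint64_t elapsed = 2 * 60 * 60;
    	uint64_t rate, mins_left, hours_left;

    	/* Same divide-by-zero guards as print_scan_status(). */
    	elapsed = elapsed ? elapsed : 1;
    	pass_exam = pass_exam ? pass_exam : 1;
    	rate = pass_exam / elapsed;
    	rate = rate ? rate : 1;
    	mins_left = ((total - examined) / rate) / 60;
    	hours_left = mins_left / 60;

    	/* Roughly 42.7 MiB/s, so about 4h49m remaining. */
    	(void) printf("%lluh%llum to go\n",
    	    (unsigned long long)hours_left,
    	    (unsigned long long)(mins_left % 60));
    	return (0);
    }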
 
 static void
@@ -3307,14 +4092,20 @@ print_dedup_stats(nvlist_t *config)
 
 	/*
 	 * If the pool was faulted then we may not have been able to
-	 * obtain the config. Otherwise, if have anything in the dedup
+	 * obtain the config. Otherwise, if we have anything in the dedup
 	 * table continue processing the stats.
 	 */
 	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS,
-	    (uint64_t **)&ddo, &c) != 0 || ddo->ddo_count == 0)
+	    (uint64_t **)&ddo, &c) != 0)
 		return;
 
 	(void) printf("\n");
+	(void) printf(gettext(" dedup: "));
+	if (ddo->ddo_count == 0) {
+		(void) printf(gettext("no DDT entries\n"));
+		return;
+	}
+
 	(void) printf("DDT entries %llu, size %llu on disk, %llu in core\n",
 	    (u_longlong_t)ddo->ddo_count,
 	    (u_longlong_t)ddo->ddo_dspace,
@@ -3333,7 +4124,7 @@ print_dedup_stats(nvlist_t *config)
  *        pool: tank
  *	status: DEGRADED
  *	reason: One or more devices ...
- *         see: http://www.sun.com/msg/ZFS-xxxx-01
+ *         see: http://illumos.org/msg/ZFS-xxxx-01
  *	config:
  *		mirror		DEGRADED
  *                c1t0d0	OK
@@ -3362,7 +4153,11 @@ status_callback(zpool_handle_t *zhp, voi
 	 * If we were given 'zpool status -x', only report those pools with
 	 * problems.
 	 */
-	if (reason == ZPOOL_STATUS_OK && cbp->cb_explain) {
+	if (cbp->cb_explain &&
+	    (reason == ZPOOL_STATUS_OK ||
+	    reason == ZPOOL_STATUS_VERSION_OLDER ||
+	    reason == ZPOOL_STATUS_NON_NATIVE_ASHIFT ||
+	    reason == ZPOOL_STATUS_FEAT_DISABLED)) {
 		if (!cbp->cb_allpools) {
 			(void) printf(gettext("pool '%s' is healthy\n"),
 			    zpool_get_name(zhp));
@@ -3379,7 +4174,7 @@ status_callback(zpool_handle_t *zhp, voi
 
 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvroot) == 0);
-	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
 	    (uint64_t **)&vs, &c) == 0);
 	health = zpool_state_to_name(vs->vs_state, vs->vs_aux);
 
@@ -3452,7 +4247,6 @@ status_callback(zpool_handle_t *zhp, voi
 		    "replace'.\n"));
 		break;
 
-
 	case ZPOOL_STATUS_RESILVERING:
 		(void) printf(gettext("status: One or more devices is "
 		    "currently being resilvered.  The pool will\n\tcontinue "
@@ -3478,12 +4272,13 @@ status_callback(zpool_handle_t *zhp, voi
 		break;
 
 	case ZPOOL_STATUS_VERSION_OLDER:
-		(void) printf(gettext("status: The pool is formatted using an "
-		    "older on-disk format.  The pool can\n\tstill be used, but "
-		    "some features are unavailable.\n"));
+		(void) printf(gettext("status: The pool is formatted using a "
+		    "legacy on-disk format.  The pool can\n\tstill be used, "
+		    "but some features are unavailable.\n"));
 		(void) printf(gettext("action: Upgrade the pool using 'zpool "
 		    "upgrade'.  Once this is done, the\n\tpool will no longer "
-		    "be accessible on older software versions.\n"));
+		    "be accessible on software that does not support feature\n"
+		    "\tflags.\n"));
 		break;
 
 	case ZPOOL_STATUS_VERSION_NEWER:
@@ -3495,6 +4290,41 @@ status_callback(zpool_handle_t *zhp, voi
 		    "backup.\n"));
 		break;
 
+	case ZPOOL_STATUS_FEAT_DISABLED:
+		(void) printf(gettext("status: Some supported features are not "
+		    "enabled on the pool. The pool can\n\tstill be used, but "
+		    "some features are unavailable.\n"));
+		(void) printf(gettext("action: Enable all features using "
+		    "'zpool upgrade'. Once this is done,\n\tthe pool may no "
+		    "longer be accessible by software that does not support\n\t"
+		    "the features. See zpool-features(7) for details.\n"));
+		break;
+
+	case ZPOOL_STATUS_UNSUP_FEAT_READ:
+		(void) printf(gettext("status: The pool cannot be accessed on "
+		    "this system because it uses the\n\tfollowing feature(s) "
+		    "not supported on this system:\n"));
+		zpool_print_unsup_feat(config);
+		(void) printf("\n");
+		(void) printf(gettext("action: Access the pool from a system "
+		    "that supports the required feature(s),\n\tor restore the "
+		    "pool from backup.\n"));
+		break;
+
+	case ZPOOL_STATUS_UNSUP_FEAT_WRITE:
+		(void) printf(gettext("status: The pool can only be accessed "
+		    "in read-only mode on this system. It\n\tcannot be "
+		    "accessed in read-write mode because it uses the "
+		    "following\n\tfeature(s) not supported on this system:\n"));
+		zpool_print_unsup_feat(config);
+		(void) printf("\n");
+		(void) printf(gettext("action: The pool cannot be accessed in "
+		    "read-write mode. Import the pool with\n"
+		    "\t\"-o readonly=on\", access the pool from a system that "
+		    "supports the\n\trequired feature(s), or restore the "
+		    "pool from backup.\n"));
+		break;
+
 	case ZPOOL_STATUS_FAULTED_DEV_R:
 		(void) printf(gettext("status: One or more devices are "
 		    "faulted in response to persistent errors.\n\tSufficient "
@@ -3534,6 +4364,15 @@ status_callback(zpool_handle_t *zhp, voi
 		    "'zpool clear'.\n"));
 		break;
 
+	case ZPOOL_STATUS_NON_NATIVE_ASHIFT:
+		(void) printf(gettext("status: One or more devices are "
+		    "configured to use a non-native block size.\n"
+		    "\tExpect reduced performance.\n"));
+		(void) printf(gettext("action: Replace affected devices with "
+		    "devices that support the\n\tconfigured block size, or "
+		    "migrate data to a properly configured\n\tpool.\n"));
+		break;
+
 	default:
 		/*
 		 * The remaining errors can't actually be generated, yet.
@@ -3542,7 +4381,7 @@ status_callback(zpool_handle_t *zhp, voi
 	}
 
 	if (msgid != NULL)
-		(void) printf(gettext("   see: http://www.sun.com/msg/%s\n"),
+		(void) printf(gettext("   see: http://illumos.org/msg/%s\n"),
 		    msgid);
 
 	if (config != NULL) {
@@ -3550,10 +4389,11 @@ status_callback(zpool_handle_t *zhp, voi
 		uint64_t nerr;
 		nvlist_t **spares, **l2cache;
 		uint_t nspares, nl2cache;
+		pool_scan_stat_t *ps = NULL;
 
-
-		(void) printf(gettext(" scrub: "));
-		print_scrub_status(nvroot);
+		(void) nvlist_lookup_uint64_array(nvroot,
+		    ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c);
+		print_scan_status(ps);
 
 		namewidth = max_width(zhp, nvroot, 0, 0);
 		if (namewidth < 10)
@@ -3621,11 +4461,12 @@ status_callback(zpool_handle_t *zhp, voi
 }
 
 /*
- * zpool status [-vx] [pool] ...
+ * zpool status [-vx] [-T d|u] [pool] ... [interval [count]]
  *
  *	-v	Display complete error logs
  *	-x	Display only pools with potential problems
  *	-D	Display dedup status (undocumented)
+ *	-T	Display a timestamp in date(1) or Unix format
  *
  * Describes the health status of all pools or some subset.
  */
@@ -3634,10 +4475,11 @@ zpool_do_status(int argc, char **argv)
 {
 	int c;
 	int ret;
+	unsigned long interval = 0, count = 0;
 	status_cbdata_t cb = { 0 };
 
 	/* check options */
-	while ((c = getopt(argc, argv, "vxD")) != -1) {
+	while ((c = getopt(argc, argv, "vxDT:")) != -1) {
 		switch (c) {
 		case 'v':
 			cb.cb_verbose = B_TRUE;
@@ -3648,6 +4490,9 @@ zpool_do_status(int argc, char **argv)
 		case 'D':
 			cb.cb_dedup_stats = B_TRUE;
 			break;
+		case 'T':
+			get_timestamp_arg(*optarg);
+			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
 			    optopt);
@@ -3658,72 +4503,280 @@ zpool_do_status(int argc, char **argv)
 	argc -= optind;
 	argv += optind;
 
-	cb.cb_first = B_TRUE;
+	get_interval_count(&argc, argv, &interval, &count);
 
 	if (argc == 0)
 		cb.cb_allpools = B_TRUE;
 
-	ret = for_each_pool(argc, argv, B_TRUE, NULL, status_callback, &cb);
+	cb.cb_first = B_TRUE;
 
-	if (argc == 0 && cb.cb_count == 0)
-		(void) printf(gettext("no pools available\n"));
-	else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
-		(void) printf(gettext("all pools are healthy\n"));
+	for (;;) {
+		if (timestamp_fmt != NODATE)
+			print_timestamp(timestamp_fmt);
 
-	return (ret);
+		ret = for_each_pool(argc, argv, B_TRUE, NULL,
+		    status_callback, &cb);
+
+		if (argc == 0 && cb.cb_count == 0)
+			(void) printf(gettext("no pools available\n"));
+		else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)
+			(void) printf(gettext("all pools are healthy\n"));
+
+		if (ret != 0)
+			return (ret);
+
+		if (interval == 0)
+			break;
+
+		if (count != 0 && --count == 0)
+			break;
+
+		(void) sleep(interval);
+	}
+
+	return (0);
 }
 
 typedef struct upgrade_cbdata {
-	int	cb_all;
-	int	cb_first;
-	int	cb_newer;
-	int	cb_argc;
-	uint64_t cb_version;
-	char	**cb_argv;
+	boolean_t	cb_first;
+	boolean_t	cb_unavail;
+	char		cb_poolname[ZFS_MAX_DATASET_NAME_LEN];
+	int		cb_argc;
+	uint64_t	cb_version;
+	char		**cb_argv;
 } upgrade_cbdata_t;
 
+#ifdef __FreeBSD__
+static int
+is_root_pool(zpool_handle_t *zhp)
+{
+	static struct statfs sfs;
+	static char *poolname = NULL;
+	static boolean_t stated = B_FALSE;
+	char *slash;
+
+	if (!stated) {
+		stated = B_TRUE;
+		if (statfs("/", &sfs) == -1) {
+			(void) fprintf(stderr,
+			    "Unable to stat root file system: %s.\n",
+			    strerror(errno));
+			return (0);
+		}
+		if (strcmp(sfs.f_fstypename, "zfs") != 0)
+			return (0);
+		poolname = sfs.f_mntfromname;
+		if ((slash = strchr(poolname, '/')) != NULL)
+			*slash = '\0';
+	}
+	return (poolname != NULL && strcmp(poolname, zpool_get_name(zhp)) == 0);
+}
+
+static void
+root_pool_upgrade_check(zpool_handle_t *zhp, char *poolname, int size)
+{
+
+	if (poolname[0] == '\0' && is_root_pool(zhp))
+		(void) strlcpy(poolname, zpool_get_name(zhp), size);
+}
+#endif	/* FreeBSD */
+
+static int
+upgrade_version(zpool_handle_t *zhp, uint64_t version)
+{
+	int ret;
+	nvlist_t *config;
+	uint64_t oldversion;
+
+	config = zpool_get_config(zhp, NULL);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+	    &oldversion) == 0);
+
+	assert(SPA_VERSION_IS_SUPPORTED(oldversion));
+	assert(oldversion < version);
+
+	ret = zpool_upgrade(zhp, version);
+	if (ret != 0)
+		return (ret);
+
+	if (version >= SPA_VERSION_FEATURES) {
+		(void) printf(gettext("Successfully upgraded "
+		    "'%s' from version %llu to feature flags.\n"),
+		    zpool_get_name(zhp), oldversion);
+	} else {
+		(void) printf(gettext("Successfully upgraded "
+		    "'%s' from version %llu to version %llu.\n"),
+		    zpool_get_name(zhp), oldversion, version);
+	}
+
+	return (0);
+}
+
+static int
+upgrade_enable_all(zpool_handle_t *zhp, int *countp)
+{
+	int i, ret, count;
+	boolean_t firstff = B_TRUE;
+	nvlist_t *enabled = zpool_get_features(zhp);
+
+	count = 0;
+	for (i = 0; i < SPA_FEATURES; i++) {
+		const char *fname = spa_feature_table[i].fi_uname;
+		const char *fguid = spa_feature_table[i].fi_guid;
+		if (!nvlist_exists(enabled, fguid)) {
+			char *propname;
+			verify(-1 != asprintf(&propname, "feature@%s", fname));
+			ret = zpool_set_prop(zhp, propname,
+			    ZFS_FEATURE_ENABLED);
+			if (ret != 0) {
+				free(propname);
+				return (ret);
+			}
+			count++;
+
+			if (firstff) {
+				(void) printf(gettext("Enabled the "
+				    "following features on '%s':\n"),
+				    zpool_get_name(zhp));
+				firstff = B_FALSE;
+			}
+			(void) printf(gettext("  %s\n"), fname);
+			free(propname);
+		}
+	}
+
+	if (countp != NULL)
+		*countp = count;
+	return (0);
+}
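upgrade_enable_all() turns on each missing feature by setting the pool property "feature@<name>" to ZFS_FEATURE_ENABLED. A hedged usage sketch built from the same libzfs calls used above; the pool name "tank" and the feature name "async_destroy" are placeholders, and error handling is minimal:

    #include <stdio.h>
    #include <stdlib.h>
    #include <libzfs.h>

    int
    main(void)
    {
    	libzfs_handle_t *hdl;
    	zpool_handle_t *zhp;
    	char *propname;
    	int ret = 1;

    	if ((hdl = libzfs_init()) == NULL)
    		return (1);
    	if ((zhp = zpool_open(hdl, "tank")) != NULL) {
    		if (asprintf(&propname, "feature@%s",
    		    "async_destroy") != -1) {
    			/* ZFS_FEATURE_ENABLED is the "enabled" value. */
    			ret = zpool_set_prop(zhp, propname,
    			    ZFS_FEATURE_ENABLED);
    			free(propname);
    		}
    		zpool_close(zhp);
    	}
    	libzfs_fini(hdl);
    	return (ret != 0);
    }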
+
 static int
 upgrade_cb(zpool_handle_t *zhp, void *arg)
 {
 	upgrade_cbdata_t *cbp = arg;
 	nvlist_t *config;
 	uint64_t version;
-	int ret = 0;
+	boolean_t printnl = B_FALSE;
+	int ret;
+
+	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+		(void) fprintf(stderr, gettext("cannot upgrade '%s': pool is "
+		    "currently unavailable.\n\n"), zpool_get_name(zhp));
+		cbp->cb_unavail = B_TRUE;
+		/* Allow iteration to continue. */
+		return (0);
+	}
 
 	config = zpool_get_config(zhp, NULL);
 	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 	    &version) == 0);
 
-	if (!cbp->cb_newer && version < SPA_VERSION) {
-		if (!cbp->cb_all) {
-			if (cbp->cb_first) {
-				(void) printf(gettext("The following pools are "
-				    "out of date, and can be upgraded.  After "
-				    "being\nupgraded, these pools will no "
-				    "longer be accessible by older software "
-				    "versions.\n\n"));
-				(void) printf(gettext("VER  POOL\n"));
-				(void) printf(gettext("---  ------------\n"));
-				cbp->cb_first = B_FALSE;
-			}
+	assert(SPA_VERSION_IS_SUPPORTED(version));
 
-			(void) printf("%2llu   %s\n", (u_longlong_t)version,
-			    zpool_get_name(zhp));
-		} else {
+	if (version < cbp->cb_version) {
+		cbp->cb_first = B_FALSE;
+		ret = upgrade_version(zhp, cbp->cb_version);
+		if (ret != 0)
+			return (ret);
+#ifdef __FreeBSD__
+		root_pool_upgrade_check(zhp, cbp->cb_poolname,
+		    sizeof (cbp->cb_poolname));
+#endif	/* __FreeBSD__ */
+		printnl = B_TRUE;
+
+#ifdef illumos
+		/*
+		 * If they did "zpool upgrade -a", then we could
+		 * be doing ioctls to different pools.  We need
+		 * to log this history once to each pool, and bypass
+		 * the normal history logging that happens in main().
+		 */
+		(void) zpool_log_history(g_zfs, history_str);
+		log_history = B_FALSE;
+#endif
+	}
+
+	if (cbp->cb_version >= SPA_VERSION_FEATURES) {
+		int count;
+		ret = upgrade_enable_all(zhp, &count);
+		if (ret != 0)
+			return (ret);
+
+		if (count > 0) {
+			cbp->cb_first = B_FALSE;
+			printnl = B_TRUE;
+#ifdef __FreeBSD__
+			root_pool_upgrade_check(zhp, cbp->cb_poolname,
+			    sizeof (cbp->cb_poolname));
+#endif	/* __FreeBSD__ */
+			/*
+			 * If they did "zpool upgrade -a", then we could
+			 * be doing ioctls to different pools.  We need
+			 * to log this history once to each pool, and bypass
+			 * the normal history logging that happens in main().
+			 */
+			(void) zpool_log_history(g_zfs, history_str);
+			log_history = B_FALSE;
+		}
+	}
+
+	if (printnl) {
+		(void) printf(gettext("\n"));
+	}
+
+	return (0);
+}
+
+static int
+upgrade_list_unavail(zpool_handle_t *zhp, void *arg)
+{
+	upgrade_cbdata_t *cbp = arg;
+
+	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+		if (cbp->cb_first) {
+			(void) fprintf(stderr, gettext("The following pools "
+			    "are unavailable and cannot be upgraded at this "
+			    "time.\n\n"));
+			(void) fprintf(stderr, gettext("POOL\n"));
+			(void) fprintf(stderr, gettext("------------\n"));
 			cbp->cb_first = B_FALSE;
-			ret = zpool_upgrade(zhp, cbp->cb_version);
-			if (!ret) {
-				(void) printf(gettext("Successfully upgraded "
-				    "'%s'\n\n"), zpool_get_name(zhp));
-			}
 		}
-	} else if (cbp->cb_newer && version > SPA_VERSION) {
-		assert(!cbp->cb_all);
+		(void) printf(gettext("%s\n"), zpool_get_name(zhp));
+		cbp->cb_unavail = B_TRUE;
+	}
+	return (0);
+}
+
+static int
+upgrade_list_older_cb(zpool_handle_t *zhp, void *arg)
+{
+	upgrade_cbdata_t *cbp = arg;
+	nvlist_t *config;
+	uint64_t version;
+
+	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+		/*
+		 * This will have been reported by upgrade_list_unavail so
+		 * just allow iteration to continue.
+		 */
+		cbp->cb_unavail = B_TRUE;
+		return (0);
+	}
 
+	config = zpool_get_config(zhp, NULL);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+	    &version) == 0);
+
+	assert(SPA_VERSION_IS_SUPPORTED(version));
+
+	if (version < SPA_VERSION_FEATURES) {
 		if (cbp->cb_first) {
 			(void) printf(gettext("The following pools are "
-			    "formatted using a newer software version and\n"
-			    "cannot be accessed on the current system.\n\n"));
+			    "formatted with legacy version numbers and can\n"
+			    "be upgraded to use feature flags.  After "
+			    "being upgraded, these pools\nwill no "
+			    "longer be accessible by software that does not "
+			    "support feature\nflags.\n\n"));
 			(void) printf(gettext("VER  POOL\n"));
 			(void) printf(gettext("---  ------------\n"));
 			cbp->cb_first = B_FALSE;
@@ -3733,48 +4786,142 @@ upgrade_cb(zpool_handle_t *zhp, void *ar
 		    zpool_get_name(zhp));
 	}
 
-	zpool_close(zhp);
-	return (ret);
+	return (0);
+}
+
+static int
+upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg)
+{
+	upgrade_cbdata_t *cbp = arg;
+	nvlist_t *config;
+	uint64_t version;
+
+	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+		/*
+		 * This will have been reported by upgrade_list_unavail so
+		 * just allow iteration to continue.
+		 */
+		cbp->cb_unavail = B_TRUE;
+		return (0);
+	}
+
+	config = zpool_get_config(zhp, NULL);
+	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+	    &version) == 0);
+
+	if (version >= SPA_VERSION_FEATURES) {
+		int i;
+		boolean_t poolfirst = B_TRUE;
+		nvlist_t *enabled = zpool_get_features(zhp);
+
+		for (i = 0; i < SPA_FEATURES; i++) {
+			const char *fguid = spa_feature_table[i].fi_guid;
+			const char *fname = spa_feature_table[i].fi_uname;
+			if (!nvlist_exists(enabled, fguid)) {
+				if (cbp->cb_first) {
+					(void) printf(gettext("\nSome "
+					    "supported features are not "
+					    "enabled on the following pools. "
+					    "Once a\nfeature is enabled the "
+					    "pool may become incompatible with "
+					    "software\nthat does not support "
+					    "the feature. See "
+					    "zpool-features(7) for "
+					    "details.\n\n"));
+					(void) printf(gettext("POOL  "
+					    "FEATURE\n"));
+					(void) printf(gettext("------"
+					    "---------\n"));
+					cbp->cb_first = B_FALSE;
+				}
+
+				if (poolfirst) {
+					(void) printf(gettext("%s\n"),
+					    zpool_get_name(zhp));
+					poolfirst = B_FALSE;
+				}
+
+				(void) printf(gettext("      %s\n"), fname);
+			}
+		}
+	}
+
+	return (0);
 }
 
 /* ARGSUSED */
 static int
 upgrade_one(zpool_handle_t *zhp, void *data)
 {
+	boolean_t printnl = B_FALSE;
 	upgrade_cbdata_t *cbp = data;
 	uint64_t cur_version;
 	int ret;
 
+	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+		(void) fprintf(stderr, gettext("cannot upgrade '%s': pool is "
+		    "currently unavailable.\n\n"), zpool_get_name(zhp));
+		cbp->cb_unavail = B_TRUE;
+		return (1);
+	}
+
 	if (strcmp("log", zpool_get_name(zhp)) == 0) {
 		(void) printf(gettext("'log' is now a reserved word\n"
 		    "Pool 'log' must be renamed using export and import"
-		    " to upgrade.\n"));
+		    " to upgrade.\n\n"));
 		return (1);
 	}
 
 	cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
 	if (cur_version > cbp->cb_version) {
 		(void) printf(gettext("Pool '%s' is already formatted "
-		    "using more current version '%llu'.\n"),
+		    "using more current version '%llu'.\n\n"),
 		    zpool_get_name(zhp), cur_version);
 		return (0);
 	}
-	if (cur_version == cbp->cb_version) {
+
+	if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) {
 		(void) printf(gettext("Pool '%s' is already formatted "
-		    "using the current version.\n"), zpool_get_name(zhp));
+		    "using version %llu.\n\n"), zpool_get_name(zhp),
+		    cbp->cb_version);
 		return (0);
 	}
 
-	ret = zpool_upgrade(zhp, cbp->cb_version);
+	if (cur_version != cbp->cb_version) {
+		printnl = B_TRUE;
+		ret = upgrade_version(zhp, cbp->cb_version);
+		if (ret != 0)
+			return (ret);
+#ifdef __FreeBSD__
+		root_pool_upgrade_check(zhp, cbp->cb_poolname,
+		    sizeof(cbp->cb_poolname));
+#endif	/* __FreeBSD__ */
+	}
+
+	if (cbp->cb_version >= SPA_VERSION_FEATURES) {
+		int count = 0;
+		ret = upgrade_enable_all(zhp, &count);
+		if (ret != 0)
+			return (ret);
+
+		if (count != 0) {
+			printnl = B_TRUE;
+#ifdef __FreeBSD__
+			root_pool_upgrade_check(zhp, cbp->cb_poolname,
+			    sizeof(cbp->cb_poolname));
+#endif	/* __FreeBSD__ */
+		} else if (cur_version == SPA_VERSION) {
+			(void) printf(gettext("Pool '%s' already has all "
+			    "supported features enabled.\n\n"),
+			    zpool_get_name(zhp));
+		}
+	}
 
-	if (!ret) {
-		(void) printf(gettext("Successfully upgraded '%s' "
-		    "from version %llu to version %llu\n\n"),
-		    zpool_get_name(zhp), (u_longlong_t)cur_version,
-		    (u_longlong_t)cbp->cb_version);
+	if (printnl) {
+		(void) printf(gettext("\n"));
 	}
 
-	return (ret != 0);
+	return (0);
 }
 
 /*
@@ -3793,6 +4940,7 @@ zpool_do_upgrade(int argc, char **argv)
 	upgrade_cbdata_t cb = { 0 };
 	int ret = 0;
 	boolean_t showversions = B_FALSE;
+	boolean_t upgradeall = B_FALSE;
 	char *end;
 
 
@@ -3800,15 +4948,15 @@ zpool_do_upgrade(int argc, char **argv)
 	while ((c = getopt(argc, argv, ":avV:")) != -1) {
 		switch (c) {
 		case 'a':
-			cb.cb_all = B_TRUE;
+			upgradeall = B_TRUE;
 			break;
 		case 'v':
 			showversions = B_TRUE;
 			break;
 		case 'V':
 			cb.cb_version = strtoll(optarg, &end, 10);
-			if (*end != '\0' || cb.cb_version > SPA_VERSION ||
-			    cb.cb_version < SPA_VERSION_1) {
+			if (*end != '\0' ||
+			    !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) {
 				(void) fprintf(stderr,
 				    gettext("invalid version '%s'\n"), optarg);
 				usage(B_FALSE);
@@ -3833,19 +4981,19 @@ zpool_do_upgrade(int argc, char **argv)
 
 	if (cb.cb_version == 0) {
 		cb.cb_version = SPA_VERSION;
-	} else if (!cb.cb_all && argc == 0) {
+	} else if (!upgradeall && argc == 0) {
 		(void) fprintf(stderr, gettext("-V option is "
 		    "incompatible with other arguments\n"));
 		usage(B_FALSE);
 	}
 
 	if (showversions) {
-		if (cb.cb_all || argc != 0) {
+		if (upgradeall || argc != 0) {
 			(void) fprintf(stderr, gettext("-v option is "
 			    "incompatible with other arguments\n"));
 			usage(B_FALSE);
 		}
-	} else if (cb.cb_all) {
+	} else if (upgradeall) {
 		if (argc != 0) {
 			(void) fprintf(stderr, gettext("-a option should not "
 			    "be used along with a pool name\n"));
@@ -3853,11 +5001,28 @@ zpool_do_upgrade(int argc, char **argv)
 		}
 	}
 
-	(void) printf(gettext("This system is currently running "
-	    "ZFS pool version %llu.\n\n"), SPA_VERSION);
-	cb.cb_first = B_TRUE;
+	(void) printf(gettext("This system supports ZFS pool feature "
+	    "flags.\n\n"));
 	if (showversions) {
-		(void) printf(gettext("The following versions are "
+		int i;
+
+		(void) printf(gettext("The following features are "
+		    "supported:\n\n"));
+		(void) printf(gettext("FEAT DESCRIPTION\n"));
+		(void) printf("----------------------------------------------"
+		    "---------------\n");
+		for (i = 0; i < SPA_FEATURES; i++) {
+			zfeature_info_t *fi = &spa_feature_table[i];
+			const char *ro =
+			    (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+			    " (read-only compatible)" : "";
+
+			(void) printf("%-37s%s\n", fi->fi_uname, ro);
+			(void) printf("     %s\n", fi->fi_desc);
+		}
+		(void) printf("\n");
+
+		(void) printf(gettext("The following legacy versions are also "
 		    "supported:\n\n"));
 		(void) printf(gettext("VER  DESCRIPTION\n"));
 		(void) printf("---  -----------------------------------------"
@@ -3890,50 +5055,89 @@ zpool_do_upgrade(int argc, char **argv)
 		(void) printf(gettext(" 21  Deduplication\n"));
 		(void) printf(gettext(" 22  Received properties\n"));
 		(void) printf(gettext(" 23  Slim ZIL\n"));
+		(void) printf(gettext(" 24  System attributes\n"));
+		(void) printf(gettext(" 25  Improved scrub stats\n"));
+		(void) printf(gettext(" 26  Improved snapshot deletion "
+		    "performance\n"));
+		(void) printf(gettext(" 27  Improved snapshot creation "
+		    "performance\n"));
+		(void) printf(gettext(" 28  Multiple vdev replacements\n"));
 		(void) printf(gettext("\nFor more information on a particular "
-		    "version, including supported releases, see:\n\n"));
-		(void) printf("http://www.opensolaris.org/os/community/zfs/"
-		    "version/N\n\n");
-		(void) printf(gettext("Where 'N' is the version number.\n"));
-	} else if (argc == 0) {
-		int notfound;
-
+		    "version, including supported releases,\n"));
+		(void) printf(gettext("see the ZFS Administration Guide.\n\n"));
+	} else if (argc == 0 && upgradeall) {
+		cb.cb_first = B_TRUE;
 		ret = zpool_iter(g_zfs, upgrade_cb, &cb);
-		notfound = cb.cb_first;
-
-		if (!cb.cb_all && ret == 0) {
-			if (!cb.cb_first)
-				(void) printf("\n");
-			cb.cb_first = B_TRUE;
-			cb.cb_newer = B_TRUE;
-			ret = zpool_iter(g_zfs, upgrade_cb, &cb);
-			if (!cb.cb_first) {
-				notfound = B_FALSE;
-				(void) printf("\n");
+		if (ret == 0 && cb.cb_first) {
+			if (cb.cb_version == SPA_VERSION) {
+				(void) printf(gettext("All %spools are already "
+				    "formatted using feature flags.\n\n"),
+				    cb.cb_unavail ? gettext("available ") : "");
+				(void) printf(gettext("Every %sfeature flags "
+				    "pool already has all supported features "
+				    "enabled.\n"),
+				    cb.cb_unavail ? gettext("available ") : "");
+			} else {
+				(void) printf(gettext("All pools are already "
+				    "formatted with version %llu or higher.\n"),
+				    cb.cb_version);
 			}
 		}
+	} else if (argc == 0) {
+		cb.cb_first = B_TRUE;
+		ret = zpool_iter(g_zfs, upgrade_list_unavail, &cb);
+		assert(ret == 0);
+
+		if (!cb.cb_first) {
+			(void) fprintf(stderr, "\n");
+		}
+
+		cb.cb_first = B_TRUE;
+		ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb);
+		assert(ret == 0);
+
+		if (cb.cb_first) {
+			(void) printf(gettext("All %spools are formatted using "
+			    "feature flags.\n\n"), cb.cb_unavail ?
+			    gettext("available ") : "");
+		} else {
+			(void) printf(gettext("\nUse 'zpool upgrade -v' "
+			    "for a list of available legacy versions.\n"));
+		}
 
-		if (ret == 0) {
-			if (notfound)
-				(void) printf(gettext("All pools are formatted "
-				    "using this version.\n"));
-			else if (!cb.cb_all)
-				(void) printf(gettext("Use 'zpool upgrade -v' "
-				    "for a list of available versions and "
-				    "their associated\nfeatures.\n"));
+		cb.cb_first = B_TRUE;
+		ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb);
+		assert(ret == 0);
+
+		if (cb.cb_first) {
+			(void) printf(gettext("Every %sfeature flags pool has "
+			    "all supported features enabled.\n"),
+			    cb.cb_unavail ? gettext("available ") : "");
+		} else {
+			(void) printf(gettext("\n"));
 		}
 	} else {
-		ret = for_each_pool(argc, argv, B_FALSE, NULL,
+		ret = for_each_pool(argc, argv, B_TRUE, NULL,
 		    upgrade_one, &cb);
 	}
 
+	if (cb.cb_poolname[0] != '\0') {
+		(void) printf(
+		    "If you boot from pool '%s', don't forget to update boot code.\n"
+		    "Assuming you use GPT partitioning and da0 is your boot disk\n"
+		    "the following command will do it:\n"
+		    "\n"
+		    "\tgpart bootcode -b /boot/pmbr -p /boot/gptzfsboot -i 1 da0\n\n",
+		    cb.cb_poolname);
+	}
+
 	return (ret);
 }
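
The listing path above makes three separate zpool_iter() passes (unavailable
pools, legacy-versioned pools, pools with disabled features), all sharing the
same callback shape: receive the open pool handle plus a cbdata cookie, and
return zero to let iteration continue.  A minimal sketch (the callback name
and counting body are hypothetical):

static int
count_pools_cb(zpool_handle_t *zhp, void *arg)
{
	int *count = arg;

	(*count)++;		/* visit every imported pool */
	return (0);		/* nonzero would abort the iteration */
}

/* e.g.:  int n = 0;  (void) zpool_iter(g_zfs, count_pools_cb, &n);  */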
 
 typedef struct hist_cbdata {
 	boolean_t first;
-	int longfmt;
-	int internal;
+	boolean_t longfmt;
+	boolean_t internal;
 } hist_cbdata_t;
 
 /*
@@ -3945,21 +5149,8 @@ get_history_one(zpool_handle_t *zhp, voi
 	nvlist_t *nvhis;
 	nvlist_t **records;
 	uint_t numrecords;
-	char *cmdstr;
-	char *pathstr;
-	uint64_t dst_time;
-	time_t tsec;
-	struct tm t;
-	char tbuf[30];
 	int ret, i;
-	uint64_t who;
-	struct passwd *pwd;
-	char *hostname;
-	char *zonename;
-	char internalstr[MAXPATHLEN];
 	hist_cbdata_t *cb = (hist_cbdata_t *)data;
-	uint64_t txg;
-	uint64_t ievent;
 
 	cb->first = B_FALSE;
 
@@ -3971,64 +5162,94 @@ get_history_one(zpool_handle_t *zhp, voi
 	verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD,
 	    &records, &numrecords) == 0);
 	for (i = 0; i < numrecords; i++) {
-		if (nvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME,
-		    &dst_time) != 0)
-			continue;
+		nvlist_t *rec = records[i];
+		char tbuf[30] = "";
 
-		/* is it an internal event or a standard event? */
-		if (nvlist_lookup_string(records[i], ZPOOL_HIST_CMD,
-		    &cmdstr) != 0) {
-			if (cb->internal == 0)
+		if (nvlist_exists(rec, ZPOOL_HIST_TIME)) {
+			time_t tsec;
+			struct tm t;
+
+			tsec = fnvlist_lookup_uint64(records[i],
+			    ZPOOL_HIST_TIME);
+			(void) localtime_r(&tsec, &t);
+			(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
+		}
+
+		if (nvlist_exists(rec, ZPOOL_HIST_CMD)) {
+			(void) printf("%s %s", tbuf,
+			    fnvlist_lookup_string(rec, ZPOOL_HIST_CMD));
+		} else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) {
+			int ievent =
+			    fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT);
+			if (!cb->internal)
 				continue;
-
-			if (nvlist_lookup_uint64(records[i],
-			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
+			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) {
+				(void) printf("%s unrecognized record:\n",
+				    tbuf);
+				dump_nvlist(rec, 4);
+				continue;
+			}
+			(void) printf("%s [internal %s txg:%lld] %s", tbuf,
+			    zfs_history_event_names[ievent],
+			    fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG),
+			    fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR));
+		} else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) {
+			if (!cb->internal)
+				continue;
+			(void) printf("%s [txg:%lld] %s", tbuf,
+			    fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG),
+			    fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME));
+			if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) {
+				(void) printf(" %s (%llu)",
+				    fnvlist_lookup_string(rec,
+				    ZPOOL_HIST_DSNAME),
+				    fnvlist_lookup_uint64(rec,
+				    ZPOOL_HIST_DSID));
+			}
+			(void) printf(" %s", fnvlist_lookup_string(rec,
+			    ZPOOL_HIST_INT_STR));
+		} else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) {
+			if (!cb->internal)
 				continue;
-			verify(nvlist_lookup_uint64(records[i],
-			    ZPOOL_HIST_TXG, &txg) == 0);
-			verify(nvlist_lookup_string(records[i],
-			    ZPOOL_HIST_INT_STR, &pathstr) == 0);
-			if (ievent >= LOG_END)
+			(void) printf("%s ioctl %s\n", tbuf,
+			    fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL));
+			if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) {
+				(void) printf("    input:\n");
+				dump_nvlist(fnvlist_lookup_nvlist(rec,
+				    ZPOOL_HIST_INPUT_NVL), 8);
+			}
+			if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) {
+				(void) printf("    output:\n");
+				dump_nvlist(fnvlist_lookup_nvlist(rec,
+				    ZPOOL_HIST_OUTPUT_NVL), 8);
+			}
+		} else {
+			if (!cb->internal)
 				continue;
-			(void) snprintf(internalstr,
-			    sizeof (internalstr),
-			    "[internal %s txg:%lld] %s",
-			    hist_event_table[ievent], txg,
-			    pathstr);
-			cmdstr = internalstr;
-		}
-		tsec = dst_time;
-		(void) localtime_r(&tsec, &t);
-		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
-		(void) printf("%s %s", tbuf, cmdstr);
+			(void) printf("%s unrecognized record:\n", tbuf);
+			dump_nvlist(rec, 4);
+		}
 
 		if (!cb->longfmt) {
 			(void) printf("\n");
 			continue;
 		}
 		(void) printf(" [");
-		if (nvlist_lookup_uint64(records[i],
-		    ZPOOL_HIST_WHO, &who) == 0) {
-			pwd = getpwuid((uid_t)who);
-			if (pwd)
-				(void) printf("user %s on",
-				    pwd->pw_name);
-			else
-				(void) printf("user %d on",
-				    (int)who);
-		} else {
-			(void) printf(gettext("no info]\n"));
-			continue;
+		if (nvlist_exists(rec, ZPOOL_HIST_WHO)) {
+			uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO);
+			struct passwd *pwd = getpwuid(who);
+			(void) printf("user %d ", (int)who);
+			if (pwd != NULL)
+				(void) printf("(%s) ", pwd->pw_name);
+		}
+		if (nvlist_exists(rec, ZPOOL_HIST_HOST)) {
+			(void) printf("on %s",
+			    fnvlist_lookup_string(rec, ZPOOL_HIST_HOST));
+		}
+		if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) {
+			(void) printf(":%s",
+			    fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE));
 		}
-		if (nvlist_lookup_string(records[i],
-		    ZPOOL_HIST_HOST, &hostname) == 0) {
-			(void) printf(" %s", hostname);
-		}
-		if (nvlist_lookup_string(records[i],
-		    ZPOOL_HIST_ZONE, &zonename) == 0) {
-			(void) printf(":%s", zonename);
-		}
-
 		(void) printf("]");
 		(void) printf("\n");
 	}
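
The history rewrite above replaces nvlist_lookup_*() calls and their error
checks with a guard-then-force pattern: nvlist_exists() tests whether a pair
is present, and the fnvlist_lookup_*() variants then fetch it, asserting if
it is absent.  In miniature (a sketch, using a field the code above reads):

	uint64_t txg = 0;

	/* fnvlist_lookup_*() aborts on a missing pair, so guard first. */
	if (nvlist_exists(rec, ZPOOL_HIST_TXG))
		txg = fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG);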
@@ -4043,8 +5264,6 @@ get_history_one(zpool_handle_t *zhp, voi
  *
  * Displays the history of commands that modified pools.
  */
-
-
 int
 zpool_do_history(int argc, char **argv)
 {
@@ -4057,10 +5276,10 @@ zpool_do_history(int argc, char **argv)
 	while ((c = getopt(argc, argv, "li")) != -1) {
 		switch (c) {
 		case 'l':
-			cbdata.longfmt = 1;
+			cbdata.longfmt = B_TRUE;
 			break;
 		case 'i':
-			cbdata.internal = 1;
+			cbdata.internal = B_TRUE;
 			break;
 		case '?':
 			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
@@ -4100,28 +5319,56 @@ get_callback(zpool_handle_t *zhp, void *
 		    pl == cbp->cb_proplist)
 			continue;
 
-		if (zpool_get_prop(zhp, pl->pl_prop,
-		    value, sizeof (value), &srctype) != 0)
-			continue;
+		if (pl->pl_prop == ZPROP_INVAL &&
+		    (zpool_prop_feature(pl->pl_user_prop) ||
+		    zpool_prop_unsupported(pl->pl_user_prop))) {
+			srctype = ZPROP_SRC_LOCAL;
+
+			if (zpool_prop_get_feature(zhp, pl->pl_user_prop,
+			    value, sizeof (value)) == 0) {
+				zprop_print_one_property(zpool_get_name(zhp),
+				    cbp, pl->pl_user_prop, value, srctype,
+				    NULL, NULL);
+			}
+		} else {
+			if (zpool_get_prop(zhp, pl->pl_prop, value,
+			    sizeof (value), &srctype, cbp->cb_literal) != 0)
+				continue;
 
-		zprop_print_one_property(zpool_get_name(zhp), cbp,
-		    zpool_prop_to_name(pl->pl_prop), value, srctype, NULL,
-		    NULL);
+			zprop_print_one_property(zpool_get_name(zhp), cbp,
+			    zpool_prop_to_name(pl->pl_prop), value, srctype,
+			    NULL, NULL);
+		}
 	}
 	return (0);
 }
 
+/*
+ * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> <pool> ...
+ *
+ *	-H	Scripted mode.  Don't display headers, and separate properties
+ *		by a single tab.
+ *	-o	List of columns to display.  Defaults to
+ *		"name,property,value,source".
+ *	-p	Display values in parsable (exact) format.
+ *
+ * Get properties for the specified pools, printing one line per pool
+ * and property in the columns selected with -o.
+ */
 int
 zpool_do_get(int argc, char **argv)
 {
 	zprop_get_cbdata_t cb = { 0 };
 	zprop_list_t fake_name = { 0 };
 	int ret;
-
-	if (argc < 3)
-		usage(B_FALSE);
+	int c, i;
+	char *value;
 
 	cb.cb_first = B_TRUE;
+
+	/*
+	 * Set up default columns and sources.
+	 */
 	cb.cb_sources = ZPROP_SRC_ALL;
 	cb.cb_columns[0] = GET_COL_NAME;
 	cb.cb_columns[1] = GET_COL_PROPERTY;
@@ -4129,10 +5376,89 @@ zpool_do_get(int argc, char **argv)
 	cb.cb_columns[3] = GET_COL_SOURCE;
 	cb.cb_type = ZFS_TYPE_POOL;
 
-	if (zprop_get_list(g_zfs, argv[1],  &cb.cb_proplist,
+	/* check options */
+	while ((c = getopt(argc, argv, ":Hpo:")) != -1) {
+		switch (c) {
+		case 'p':
+			cb.cb_literal = B_TRUE;
+			break;
+		case 'H':
+			cb.cb_scripted = B_TRUE;
+			break;
+		case 'o':
+			bzero(&cb.cb_columns, sizeof (cb.cb_columns));
+			i = 0;
+			while (*optarg != '\0') {
+				static char *col_subopts[] =
+				{ "name", "property", "value", "source",
+				"all", NULL };
+
+				if (i == ZFS_GET_NCOLS) {
+					(void) fprintf(stderr, gettext("too "
+					"many fields given to -o "
+					"option\n"));
+					usage(B_FALSE);
+				}
+
+				switch (getsubopt(&optarg, col_subopts,
+				    &value)) {
+				case 0:
+					cb.cb_columns[i++] = GET_COL_NAME;
+					break;
+				case 1:
+					cb.cb_columns[i++] = GET_COL_PROPERTY;
+					break;
+				case 2:
+					cb.cb_columns[i++] = GET_COL_VALUE;
+					break;
+				case 3:
+					cb.cb_columns[i++] = GET_COL_SOURCE;
+					break;
+				case 4:
+					if (i > 0) {
+						(void) fprintf(stderr,
+						    gettext("\"all\" conflicts "
+						    "with specific fields "
+						    "given to -o option\n"));
+						usage(B_FALSE);
+					}
+					cb.cb_columns[0] = GET_COL_NAME;
+					cb.cb_columns[1] = GET_COL_PROPERTY;
+					cb.cb_columns[2] = GET_COL_VALUE;
+					cb.cb_columns[3] = GET_COL_SOURCE;
+					i = ZFS_GET_NCOLS;
+					break;
+				default:
+					(void) fprintf(stderr,
+					    gettext("invalid column name "
+					    "'%s'\n"), suboptarg);
+					usage(B_FALSE);
+				}
+			}
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing property "
+		    "argument\n"));
+		usage(B_FALSE);
+	}
+
+	if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist,
 	    ZFS_TYPE_POOL) != 0)
 		usage(B_FALSE);
 
+	argc--;
+	argv++;
+
 	if (cb.cb_proplist != NULL) {
 		fake_name.pl_prop = ZPOOL_PROP_NAME;
 		fake_name.pl_width = strlen(gettext("NAME"));
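
The -o column parser in the hunk above is built on getsubopt(3), which
consumes one comma-separated token from the option string per call and
returns its index in a NULL-terminated token table (-1 for an unknown token,
which the BSD and illumos libcs leave in the global suboptarg).  A
standalone sketch under those assumptions, with hypothetical column names:

#include <err.h>
#include <stdlib.h>	/* getsubopt(3), suboptarg */

static char *cols[] = { "name", "value", NULL };

static void
parse_columns(char *opts)
{
	char *value;

	while (*opts != '\0') {
		switch (getsubopt(&opts, cols, &value)) {
		case 0:				/* "name" */
		case 1:				/* "value" */
			break;
		default:
			errx(1, "invalid column '%s'", suboptarg);
		}
	}
}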
@@ -4140,7 +5466,7 @@ zpool_do_get(int argc, char **argv)
 		cb.cb_proplist = &fake_name;
 	}
 
-	ret = for_each_pool(argc - 2, argv + 2, B_TRUE, &cb.cb_proplist,
+	ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist,
 	    get_callback, &cb);
 
 	if (cb.cb_proplist == &fake_name)
@@ -4236,7 +5562,7 @@ find_command_idx(char *command, int *idx
 int
 main(int argc, char **argv)
 {
-	int ret;
+	int ret = 0;
 	int i;
 	char *cmdname;
 
@@ -4269,8 +5595,7 @@ main(int argc, char **argv)
 	if (strcmp(cmdname, "-?") == 0)
 		usage(B_TRUE);
 
-	zpool_set_history_str("zpool", argc, argv, history_str);
-	verify(zpool_stage_history(g_zfs, history_str) == 0);
+	zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
 
 	/*
 	 * Run the appropriate command.
@@ -4287,16 +5612,18 @@ main(int argc, char **argv)
 		 * 'freeze' is a vile debugging abomination, so we treat
 		 * it as such.
 		 */
-		char buf[16384];
-		int fd = open(ZFS_DEV, O_RDWR);
-		(void) strcpy((void *)buf, argv[2]);
-		return (!!ioctl(fd, ZFS_IOC_POOL_FREEZE, buf));
+		zfs_cmd_t zc = { 0 };
+		(void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name));
+		return (!!zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc));
 	} else {
 		(void) fprintf(stderr, gettext("unrecognized "
 		    "command '%s'\n"), cmdname);
 		usage(B_FALSE);
 	}
 
+	if (ret == 0 && log_history)
+		(void) zpool_log_history(g_zfs, history_str);
+
 	libzfs_fini(g_zfs);
 
 	/*
Index: src/external/cddl/osnet/dist/cmd/zpool/zpool_util.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zpool/zpool_util.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zpool_util.h
--- src/external/cddl/osnet/dist/cmd/zpool/zpool_util.h	27 Feb 2010 22:29:23 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/cmd/zpool/zpool_util.h	12 Jun 2012 05:55:36 -0000
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	ZPOOL_UTIL_H
@@ -45,7 +44,7 @@ uint_t num_logs(nvlist_t *nv);
  */
 
 nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
-    boolean_t isreplace, boolean_t dryrun, int argc, char **argv);
+    boolean_t replacing, boolean_t dryrun, int argc, char **argv);
 nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname,
     nvlist_t *props, splitflags_t flags, int argc, char **argv);
 
Index: src/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c,v
retrieving revision 1.3
diff -u -p -r1.3 zpool_vdev.c
--- src/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c	27 Feb 2010 23:43:52 -0000	1.3
+++ src/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c	5 May 2017 16:51:20 -0000
@@ -20,8 +20,9 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
  */
 
 /*
@@ -70,15 +71,19 @@
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>
-#include <sys/efi_partition.h>
+#include <paths.h>
 #include <sys/stat.h>
-#include <sys/vtoc.h>
+#include <sys/disk.h>
 #include <sys/mntent.h>
+#ifdef __FreeBSD__
+#include <libgeom.h>
+#endif
+#ifdef __NetBSD__
+#include <sys/disklabel.h>
+#endif
 
 #include "zpool_util.h"
 
-#define	DISK_ROOT	"/dev/dsk"
-#define	RDISK_ROOT	"/dev/rdsk"
 #define	BACKUP_SLICE	"s2"
 
 /*
@@ -111,6 +116,7 @@ vdev_error(const char *fmt, ...)
 	va_end(ap);
 }
 
+#ifdef illumos
 static void
 libdiskmgt_error(int error)
 {
@@ -126,6 +132,155 @@ libdiskmgt_error(int error)
 }
 
 /*
+ * Validate a device, passing the bulk of the work off to libdiskmgt.
+ */
+static int
+check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare)
+{
+	char *msg;
+	int error = 0;
+	dm_who_type_t who;
+
+	if (force)
+		who = DM_WHO_ZPOOL_FORCE;
+	else if (isspare)
+		who = DM_WHO_ZPOOL_SPARE;
+	else
+		who = DM_WHO_ZPOOL;
+
+	if (dm_inuse((char *)path, &msg, who, &error) || error) {
+		if (error != 0) {
+			libdiskmgt_error(error);
+			return (0);
+		} else {
+			vdev_error("%s", msg);
+			free(msg);
+			return (-1);
+		}
+	}
+
+	/*
+	 * If we're given a whole disk, ignore overlapping slices since we're
+	 * about to label it anyway.
+	 */
+	error = 0;
+	if (!wholedisk && !force &&
+	    (dm_isoverlapping((char *)path, &msg, &error) || error)) {
+		if (error == 0) {
+			/* dm_isoverlapping returned -1 */
+			vdev_error(gettext("%s overlaps with %s\n"), path, msg);
+			free(msg);
+			return (-1);
+		} else if (error != ENODEV) {
+			/* libdiskmgt's devcache only handles physical drives */
+			libdiskmgt_error(error);
+			return (0);
+		}
+	}
+
+	return (0);
+}
+
+
+/*
+ * Validate a whole disk.  Iterate over all slices on the disk and make sure
+ * that none is in use by calling check_slice().
+ */
+static int
+check_disk(const char *name, dm_descriptor_t disk, int force, int isspare)
+{
+	dm_descriptor_t *drive, *media, *slice;
+	int err = 0;
+	int i;
+	int ret;
+
+	/*
+	 * Get the drive associated with this disk.  This should never fail,
+	 * because we already have an alias handle open for the device.
+	 */
+	if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
+	    &err)) == NULL || *drive == NULL) {
+		if (err)
+			libdiskmgt_error(err);
+		return (0);
+	}
+
+	if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
+	    &err)) == NULL) {
+		dm_free_descriptors(drive);
+		if (err)
+			libdiskmgt_error(err);
+		return (0);
+	}
+
+	dm_free_descriptors(drive);
+
+	/*
+	 * It is possible that the user has specified a removable media drive,
+	 * and the media is not present.
+	 */
+	if (*media == NULL) {
+		dm_free_descriptors(media);
+		vdev_error(gettext("'%s' has no media in drive\n"), name);
+		return (-1);
+	}
+
+	if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
+	    &err)) == NULL) {
+		dm_free_descriptors(media);
+		if (err)
+			libdiskmgt_error(err);
+		return (0);
+	}
+
+	dm_free_descriptors(media);
+
+	ret = 0;
+
+	/*
+	 * Iterate over all slices and report any errors.  We don't care about
+	 * overlapping slices because we are using the whole disk.
+	 */
+	for (i = 0; slice[i] != NULL; i++) {
+		char *name = dm_get_name(slice[i], &err);
+
+		if (check_slice(name, force, B_TRUE, isspare) != 0)
+			ret = -1;
+
+		dm_free_name(name);
+	}
+
+	dm_free_descriptors(slice);
+	return (ret);
+}
+
+/*
+ * Validate a device.
+ */
+static int
+check_device(const char *path, boolean_t force, boolean_t isspare)
+{
+	dm_descriptor_t desc;
+	int err;
+	char *dev;
+
+	/*
+	 * For whole disks, libdiskmgt does not include the leading dev path.
+	 */
+	dev = strrchr(path, '/');
+	assert(dev != NULL);
+	dev++;
+	if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) {
+		err = check_disk(path, desc, force, isspare);
+		dm_free_descriptor(desc);
+		return (err);
+	}
+
+	return (check_slice(path, force, B_FALSE, isspare));
+}
+#endif	/* illumos */
+
+/*
  * Check that a file is valid.  All we can do in this case is check that it's
  * not in use by another pool, and not in use by swap.
  */
@@ -139,7 +294,7 @@ check_file(const char *file, boolean_t f
 	pool_state_t state;
 	boolean_t inuse;
 
-#ifndef __NetBSD__
+#ifdef illumos
 	if (dm_inuse_swap(file, &err)) {
 		if (err)
 			libdiskmgt_error(err);
@@ -202,6 +357,18 @@ check_file(const char *file, boolean_t f
 	return (ret);
 }
 
+static int
+check_device(const char *name, boolean_t force, boolean_t isspare)
+{
+	char path[MAXPATHLEN];
+
+	if (strncmp(name, _PATH_DEV, sizeof(_PATH_DEV) - 1) != 0)
+		snprintf(path, sizeof(path), "%s%s", _PATH_DEV, name);
+	else
+		strlcpy(path, name, sizeof(path));
+
+	return (check_file(path, force, isspare));
+}
 
 /*
  * By "whole disk" we mean an entire physical disk (something we can
@@ -214,12 +381,13 @@ check_file(const char *file, boolean_t f
 static boolean_t
 is_whole_disk(const char *arg)
 {
+#ifdef illumos
 	struct dk_gpt *label;
 	int	fd;
 	char	path[MAXPATHLEN];
 
 	(void) snprintf(path, sizeof (path), "%s%s%s",
-	    RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE);
+	    ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE);
 	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0)
 		return (B_FALSE);
 	if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
@@ -229,6 +397,28 @@ is_whole_disk(const char *arg)
 	efi_free(label);
 	(void) close(fd);
 	return (B_TRUE);
+#endif
+#ifdef __FreeBSD__
+	int fd;
+
+	fd = g_open(arg, 0);
+	if (fd >= 0) {
+		g_close(fd);
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+#endif
+#ifdef __NetBSD__
+	struct disklabel dl;
+	int fd, rv;
+
+	if ((fd = open(arg, O_RDWR | O_NONBLOCK)) < 0)
+		return (B_FALSE);
+
+	rv = ioctl(fd, DIOCGDINFO, &dl);
+	close(fd);
+	return (rv == 0);
+#endif
 }
 
 /*
@@ -275,8 +465,10 @@ make_leaf_vdev(const char *arg, uint64_t
 		 * /dev/dsk/.  As part of this check, see if we've been given a
 		 * an entire disk (minus the slice number).
 		 */
-		(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT,
-		    arg);
+		if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
+			strlcpy(path, arg, sizeof (path));
+		else
+			snprintf(path, sizeof (path), "%s%s", _PATH_DEV, arg);
 		wholedisk = is_whole_disk(path);
 		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
 			/*
@@ -289,7 +481,7 @@ make_leaf_vdev(const char *arg, uint64_t
 			if (errno == ENOENT) {
 				(void) fprintf(stderr,
 				    gettext("cannot open '%s': no such "
-				    "device in %s\n"), arg, DISK_ROOT);
+				    "GEOM provider\n"), arg);
 				(void) fprintf(stderr,
 				    gettext("must be a full path or "
 				    "shorthand device name\n"));
@@ -303,6 +495,14 @@ make_leaf_vdev(const char *arg, uint64_t
 		}
 	}
 
+#ifdef __FreeBSD__
+	if (S_ISCHR(statbuf.st_mode)) {
+		statbuf.st_mode &= ~S_IFCHR;
+		statbuf.st_mode |= S_IFBLK;
+		wholedisk = B_FALSE;
+	}
+#endif
+
 	/*
 	 * Determine whether this is a device or a file.
 	 */
@@ -312,7 +512,7 @@ make_leaf_vdev(const char *arg, uint64_t
 		type = VDEV_TYPE_FILE;
 	} else {
 		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
-		    "block device or regular file\n"), path);
+		    "GEOM provider or regular file\n"), path);
 		return (NULL);
 	}
 
@@ -329,6 +529,7 @@ make_leaf_vdev(const char *arg, uint64_t
 		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
 		    (uint64_t)wholedisk) == 0);
 
+#ifdef have_devid
 	/*
 	 * For a whole disk, defer getting its devid until after labeling it.
 	 */
@@ -363,6 +564,7 @@ make_leaf_vdev(const char *arg, uint64_t
 
 		(void) close(fd);
 	}
+#endif
 
 	return (vdev);
 }
@@ -402,7 +604,9 @@ get_replication(nvlist_t *nvroot, boolea
 	uint_t c, children;
 	nvlist_t *nv;
 	char *type;
-	replication_level_t lastrep, rep, *ret;
+	replication_level_t lastrep = {0};
+	replication_level_t rep;
+	replication_level_t *ret;
 	boolean_t dontreport;
 
 	ret = safe_malloc(sizeof (replication_level_t));
@@ -410,7 +614,6 @@ get_replication(nvlist_t *nvroot, boolea
 	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 	    &top, &toplevels) == 0);
 
-	lastrep.zprl_type = NULL;
 	for (t = 0; t < toplevels; t++) {
 		uint64_t is_log = B_FALSE;
 
@@ -467,6 +670,7 @@ get_replication(nvlist_t *nvroot, boolea
 			dontreport = 0;
 			vdev_size = -1ULL;
 			for (c = 0; c < children; c++) {
+				boolean_t is_replacing, is_spare;
 				nvlist_t *cnv = child[c];
 				char *path;
 				struct stat64 statbuf;
@@ -483,16 +687,19 @@ get_replication(nvlist_t *nvroot, boolea
 				 * If this is a replacing or spare vdev, then
 				 * get the real first child of the vdev.
 				 */
-				if (strcmp(childtype,
-				    VDEV_TYPE_REPLACING) == 0 ||
-				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
+				is_replacing = strcmp(childtype,
+				    VDEV_TYPE_REPLACING) == 0;
+				is_spare = strcmp(childtype,
+				    VDEV_TYPE_SPARE) == 0;
+				if (is_replacing || is_spare) {
 					nvlist_t **rchild;
 					uint_t rchildren;
 
 					verify(nvlist_lookup_nvlist_array(cnv,
 					    ZPOOL_CONFIG_CHILDREN, &rchild,
 					    &rchildren) == 0);
-					assert(rchildren == 2);
+					assert((is_replacing && rchildren == 2)
+					    || (is_spare && rchildren >= 2));
 					cnv = rchild[0];
 
 					verify(nvlist_lookup_string(cnv,
@@ -725,6 +932,7 @@ check_replication(nvlist_t *config, nvli
 	return (ret);
 }
 
+#ifdef illumos
 /*
  * Go through and find any whole disks in the vdev specification, labelling them
  * as appropriate.  When constructing the vdev spec, we were unable to open this
@@ -828,6 +1036,7 @@ make_disks(zpool_handle_t *zhp, nvlist_t
 
 	return (0);
 }
+#endif	/* illumos */
 
 /*
  * Determine if the given path is a hot spare within the given configuration.
@@ -857,8 +1066,8 @@ is_spare(nvlist_t *config, const char *p
 		return (B_FALSE);
 	}
 	free(name);
-
 	(void) close(fd);
+
 	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
 	nvlist_free(label);
 
@@ -881,16 +1090,17 @@ is_spare(nvlist_t *config, const char *p
  * Go through and find any devices that are in use.  We rely on libdiskmgt for
  * the majority of this task.
  */
-static int
-check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
-    int isspare)
+static boolean_t
+is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
+    boolean_t replacing, boolean_t isspare)
 {
 	nvlist_t **child;
 	uint_t c, children;
 	char *type, *path;
-	int ret;
+	int ret = 0;
 	char buf[MAXPATHLEN];
 	uint64_t wholedisk;
+	boolean_t anyinuse = B_FALSE;
 
 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
 
@@ -904,44 +1114,48 @@ check_in_use(nvlist_t *config, nvlist_t 
 		 * hot spare within the same pool.  If so, we allow it
 		 * regardless of what libdiskmgt or zpool_in_use() says.
 		 */
-		if (isreplacing) {
+		if (replacing) {
+#ifdef illumos
 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 			    &wholedisk) == 0 && wholedisk)
 				(void) snprintf(buf, sizeof (buf), "%ss0",
 				    path);
 			else
+#endif
 				(void) strlcpy(buf, path, sizeof (buf));
+
 			if (is_spare(config, buf))
-				return (0);
+				return (B_FALSE);
 		}
 
-		if (strcmp(type, VDEV_TYPE_DISK) == 0 ||
-		    strcmp(type, VDEV_TYPE_FILE) == 0)
+		if (strcmp(type, VDEV_TYPE_DISK) == 0)
+			ret = check_device(path, force, isspare);
+		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
 			ret = check_file(path, force, isspare);
 
-		return (ret);
+		return (ret != 0);
 	}
 
 	for (c = 0; c < children; c++)
-		if ((ret = check_in_use(config, child[c], force,
-		    isreplacing, B_FALSE)) != 0)
-			return (ret);
+		if (is_device_in_use(config, child[c], force, replacing,
+		    B_FALSE))
+			anyinuse = B_TRUE;
 
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
 	    &child, &children) == 0)
 		for (c = 0; c < children; c++)
-			if ((ret = check_in_use(config, child[c], force,
-			    isreplacing, B_TRUE)) != 0)
-				return (ret);
+			if (is_device_in_use(config, child[c], force, replacing,
+			    B_TRUE))
+				anyinuse = B_TRUE;
 
 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
 	    &child, &children) == 0)
 		for (c = 0; c < children; c++)
-			if ((ret = check_in_use(config, child[c], force,
-			    isreplacing, B_FALSE)) != 0)
-				return (ret);
+			if (is_device_in_use(config, child[c], force, replacing,
+			    B_FALSE))
+				anyinuse = B_TRUE;
 
-	return (0);
+	return (anyinuse);
 }
 
 static const char *
@@ -1225,10 +1439,12 @@ split_mirror_vdev(zpool_handle_t *zhp, c
 			return (NULL);
 		}
 
+#ifdef illumos
 		if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
 			nvlist_free(newroot);
 			return (NULL);
 		}
+#endif
 
 		/* avoid any tricks in the spec */
 		verify(nvlist_lookup_nvlist_array(newroot,
@@ -1250,8 +1466,7 @@ split_mirror_vdev(zpool_handle_t *zhp, c
 	}
 
 	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
-		if (newroot != NULL)
-			nvlist_free(newroot);
+		nvlist_free(newroot);
 		return (NULL);
 	}
 
@@ -1270,7 +1485,7 @@ split_mirror_vdev(zpool_handle_t *zhp, c
  */
 nvlist_t *
 make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
-    boolean_t isreplacing, boolean_t dryrun, int argc, char **argv)
+    boolean_t replacing, boolean_t dryrun, int argc, char **argv)
 {
 	nvlist_t *newroot;
 	nvlist_t *poolconfig = NULL;
@@ -1293,8 +1508,7 @@ make_root_vdev(zpool_handle_t *zhp, int 
 	 * uses (such as a dedicated dump device) that even '-f' cannot
 	 * override.
 	 */
-	if (check_in_use(poolconfig, newroot, force, isreplacing,
-	    B_FALSE) != 0) {
+	if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
 		nvlist_free(newroot);
 		return (NULL);
 	}
@@ -1309,6 +1523,7 @@ make_root_vdev(zpool_handle_t *zhp, int 
 		return (NULL);
 	}
 
+#ifdef illumos
 	/*
 	 * Run through the vdev specification and label any whole disks found.
 	 */
@@ -1316,6 +1531,7 @@ make_root_vdev(zpool_handle_t *zhp, int 
 		nvlist_free(newroot);
 		return (NULL);
 	}
+#endif
 
 	return (newroot);
 }
Index: src/external/cddl/osnet/dist/cmd/ztest/ztest.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/ztest/ztest.c,v
retrieving revision 1.7
diff -u -p -r1.7 ztest.c
--- src/external/cddl/osnet/dist/cmd/ztest/ztest.c	28 Mar 2014 02:58:36 -0000	1.7
+++ src/external/cddl/osnet/dist/cmd/ztest/ztest.c	27 Mar 2017 06:26:21 -0000
@@ -19,8 +19,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>.  All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /*
@@ -50,7 +54,9 @@
  *     At random times, the child self-immolates with a SIGKILL.
  *     This is the software equivalent of pulling the power cord.
  *     The parent then runs the test again, using the existing
- *     storage pool, as many times as desired.
+ *     storage pool, as many times as desired. If backwards compatibility
+ *     testing is enabled ztest will sometimes run the "older" version
+ *     of ztest after a SIGKILL.
  *
  * (6) To verify that we don't have future leaks or temporal incursions,
  *     many of the functional tests record the transaction group number
@@ -67,9 +73,15 @@
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
  * to increase the pool capacity, fanout, and overall stress level.
  *
- * The -N(okill) option will suppress kills, so each child runs to completion.
- * This can be useful when you're trying to distinguish temporal incursions
- * from plain old race conditions.
+ * Use the -k option to set the desired frequency of kills.
+ *
+ * When ztest invokes itself it passes all relevant information through a
+ * temporary file which is mmap-ed in the child process. This allows shared
+ * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always
+ * stored at offset 0 of this file and contains information on the size and
+ * number of shared structures in the file. The information stored in this file
+ * must remain backwards compatible with older versions of ztest so that
+ * ztest can invoke them during backwards compatibility testing (-B).
  */
 
 #include <sys/zfs_context.h>
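
The comment above is the heart of ztest's multi-process design.  As a
hedged, self-contained illustration (not ztest's actual code, which passes
the descriptor along differently): a file-backed MAP_SHARED mapping can be
re-established after exec because the open file descriptor survives the
exec syscall and both mappings alias the same file pages.

#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	char fdstr[16], *p;
	int fd;

	if (argc == 2) {			/* re-invoked after exec */
		fd = atoi(argv[1]);		/* fd survived the exec */
		p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
		(void) printf("after exec: %s\n", p);	/* prints "hello" */
		return (0);
	}

	/* Error checking elided for brevity. */
	fd = open("/tmp/ztest.shared", O_RDWR | O_CREAT | O_TRUNC, 0600);
	(void) ftruncate(fd, 4096);
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	(void) strcpy(p, "hello");

	(void) snprintf(fdstr, sizeof (fdstr), "%d", fd);
	/* Assumes argv[0] names a runnable path to this binary. */
	(void) execl(argv[0], argv[0], fdstr, (char *)NULL);
	return (1);
}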
@@ -94,7 +106,12 @@
 #include <sys/metaslab_impl.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_scan.h>
+#include <sys/zio_checksum.h>
 #include <sys/refcount.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_userhold.h>
 #include <stdio.h>
 #include <stdio_ext.h>
 #include <stdlib.h>
@@ -104,30 +121,89 @@
 #include <dlfcn.h>
 #include <ctype.h>
 #include <math.h>
+#include <errno.h>
 #include <sys/fs/zfs.h>
 #include <libnvpair.h>
 
-static char cmdname[] = "ztest";
-static char *zopt_pool = cmdname;
+static int ztest_fd_data = -1;
+static int ztest_fd_rand = -1;
+
+typedef struct ztest_shared_hdr {
+	uint64_t	zh_hdr_size;
+	uint64_t	zh_opts_size;
+	uint64_t	zh_size;
+	uint64_t	zh_stats_size;
+	uint64_t	zh_stats_count;
+	uint64_t	zh_ds_size;
+	uint64_t	zh_ds_count;
+} ztest_shared_hdr_t;
+
+static ztest_shared_hdr_t *ztest_shared_hdr;
+
+typedef struct ztest_shared_opts {
+	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];
+	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];
+	char zo_alt_ztest[MAXNAMELEN];
+	char zo_alt_libpath[MAXNAMELEN];
+	uint64_t zo_vdevs;
+	uint64_t zo_vdevtime;
+	size_t zo_vdev_size;
+	int zo_ashift;
+	int zo_mirrors;
+	int zo_raidz;
+	int zo_raidz_parity;
+	int zo_datasets;
+	int zo_threads;
+	uint64_t zo_passtime;
+	uint64_t zo_killrate;
+	int zo_verbose;
+	int zo_init;
+	uint64_t zo_time;
+	uint64_t zo_maxloops;
+	uint64_t zo_metaslab_gang_bang;
+} ztest_shared_opts_t;
+
+static const ztest_shared_opts_t ztest_opts_defaults = {
+	.zo_pool = { 'z', 't', 'e', 's', 't', '\0' },
+	.zo_dir = { '/', 't', 'm', 'p', '\0' },
+	.zo_alt_ztest = { '\0' },
+	.zo_alt_libpath = { '\0' },
+	.zo_vdevs = 5,
+	.zo_ashift = SPA_MINBLOCKSHIFT,
+	.zo_mirrors = 2,
+	.zo_raidz = 4,
+	.zo_raidz_parity = 1,
+	.zo_vdev_size = SPA_MINDEVSIZE * 4,	/* 256m default size */
+	.zo_datasets = 7,
+	.zo_threads = 23,
+	.zo_passtime = 60,		/* 60 seconds */
+	.zo_killrate = 70,		/* 70% kill rate */
+	.zo_verbose = 0,
+	.zo_init = 1,
+	.zo_time = 300,			/* 5 minutes */
+	.zo_maxloops = 50,		/* max loops during spa_freeze() */
+	.zo_metaslab_gang_bang = 32 << 10
+};
+
+extern uint64_t metaslab_gang_bang;
+extern uint64_t metaslab_df_alloc_threshold;
+extern uint64_t zfs_deadman_synctime_ms;
+extern int metaslab_preload_limit;
+extern boolean_t zfs_compressed_arc_enabled;
+
+static ztest_shared_opts_t *ztest_shared_opts;
+static ztest_shared_opts_t ztest_opts;
+
+typedef struct ztest_shared_ds {
+	uint64_t	zd_seq;
+} ztest_shared_ds_t;
 
-static uint64_t zopt_vdevs = 5;
-static uint64_t zopt_vdevtime;
-static int zopt_ashift = SPA_MINBLOCKSHIFT;
-static int zopt_mirrors = 2;
-static int zopt_raidz = 4;
-static int zopt_raidz_parity = 1;
-static size_t zopt_vdev_size = SPA_MINDEVSIZE;
-static int zopt_datasets = 7;
-static int zopt_threads = 23;
-static uint64_t zopt_passtime = 60;	/* 60 seconds */
-static uint64_t zopt_killrate = 70;	/* 70% kill rate */
-static int zopt_verbose = 0;
-static int zopt_init = 1;
-static char *zopt_dir = "/tmp";
-static uint64_t zopt_time = 300;	/* 5 minutes */
+static ztest_shared_ds_t *ztest_shared_ds;
+#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])
 
 #define	BT_MAGIC	0x123456789abcdefULL
-#define	MAXFAULTS() (MAX(zs->zs_mirrors, 1) * (zopt_raidz_parity + 1) - 1)
+#define	MAXFAULTS() \
+	(MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)
 
 enum ztest_io_type {
 	ZTEST_IO_WRITE_TAG,
@@ -135,6 +211,7 @@ enum ztest_io_type {
 	ZTEST_IO_WRITE_ZEROES,
 	ZTEST_IO_TRUNCATE,
 	ZTEST_IO_SETATTR,
+	ZTEST_IO_REWRITE,
 	ZTEST_IO_TYPES
 };
 
@@ -192,18 +269,19 @@ typedef struct ztest_od {
 	uint64_t	od_crblocksize;
 	uint64_t	od_gen;
 	uint64_t	od_crgen;
-	char		od_name[MAXNAMELEN];
+	char		od_name[ZFS_MAX_DATASET_NAME_LEN];
 } ztest_od_t;
 
 /*
  * Per-dataset state.
  */
 typedef struct ztest_ds {
+	ztest_shared_ds_t *zd_shared;
 	objset_t	*zd_os;
+	rwlock_t	zd_zilog_lock;
 	zilog_t		*zd_zilog;
-	uint64_t	zd_seq;
 	ztest_od_t	*zd_od;		/* debugging aid */
-	char		zd_name[MAXNAMELEN];
+	char		zd_name[ZFS_MAX_DATASET_NAME_LEN];
 	mutex_t		zd_dirobj_lock;
 	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
 	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
@@ -218,11 +296,17 @@ typedef struct ztest_info {
 	ztest_func_t	*zi_func;	/* test function */
 	uint64_t	zi_iters;	/* iterations per execution */
 	uint64_t	*zi_interval;	/* execute every <interval> seconds */
-	uint64_t	zi_call_count;	/* per-pass count */
-	uint64_t	zi_call_time;	/* per-pass time */
-	uint64_t	zi_call_next;	/* next time to call this function */
 } ztest_info_t;
 
+typedef struct ztest_shared_callstate {
+	uint64_t	zc_count;	/* per-pass count */
+	uint64_t	zc_time;	/* per-pass time */
+	uint64_t	zc_next;	/* next time to call this function */
+} ztest_shared_callstate_t;
+
+static ztest_shared_callstate_t *ztest_shared_callstate;
+#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])
+
 /*
  * Note: these aren't static because we want dladdr() to work.
  */
@@ -233,6 +317,7 @@ ztest_func_t ztest_dmu_commit_callbacks;
 ztest_func_t ztest_zap;
 ztest_func_t ztest_zap_parallel;
 ztest_func_t ztest_zil_commit;
+ztest_func_t ztest_zil_remount;
 ztest_func_t ztest_dmu_read_write_zcopy;
 ztest_func_t ztest_dmu_objset_create_destroy;
 ztest_func_t ztest_dmu_prealloc;
@@ -252,6 +337,8 @@ ztest_func_t ztest_vdev_LUN_growth;
 ztest_func_t ztest_vdev_add_remove;
 ztest_func_t ztest_vdev_aux_add_remove;
 ztest_func_t ztest_split_pool;
+ztest_func_t ztest_reguid;
+ztest_func_t ztest_spa_upgrade;
 
 uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
 uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
@@ -268,6 +355,7 @@ ztest_info_t ztest_info[] = {
 	{ ztest_zap_parallel,			100,	&zopt_always	},
 	{ ztest_split_pool,			1,	&zopt_always	},
 	{ ztest_zil_commit,			1,	&zopt_incessant	},
+	{ ztest_zil_remount,			1,	&zopt_sometimes	},
 	{ ztest_dmu_read_write_zcopy,		1,	&zopt_often	},
 	{ ztest_dmu_objset_create_destroy,	1,	&zopt_often	},
 	{ ztest_dsl_prop_get_set,		1,	&zopt_often	},
@@ -281,13 +369,17 @@ ztest_info_t ztest_info[] = {
 	{ ztest_fault_inject,			1,	&zopt_sometimes	},
 	{ ztest_ddt_repair,			1,	&zopt_sometimes	},
 	{ ztest_dmu_snapshot_hold,		1,	&zopt_sometimes	},
+	{ ztest_reguid,				1,	&zopt_rarely	},
 	{ ztest_spa_rename,			1,	&zopt_rarely	},
 	{ ztest_scrub,				1,	&zopt_rarely	},
+	{ ztest_spa_upgrade,			1,	&zopt_rarely	},
 	{ ztest_dsl_dataset_promote_busy,	1,	&zopt_rarely	},
-	{ ztest_vdev_attach_detach,		1,	&zopt_rarely	},
+	{ ztest_vdev_attach_detach,		1,	&zopt_sometimes	},
 	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely	},
-	{ ztest_vdev_add_remove,		1,	&zopt_vdevtime	},
-	{ ztest_vdev_aux_add_remove,		1,	&zopt_vdevtime	},
+	{ ztest_vdev_add_remove,		1,
+	    &ztest_opts.zo_vdevtime				},
+	{ ztest_vdev_aux_add_remove,		1,
+	    &ztest_opts.zo_vdevtime				},
 };
 
 #define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))
@@ -305,8 +397,7 @@ typedef struct ztest_cb_list {
  * Stuff we need to share writably between parent and child.
  */
 typedef struct ztest_shared {
-	char		*zs_pool;
-	spa_t		*zs_spa;
+	boolean_t	zs_do_init;
 	hrtime_t	zs_proc_start;
 	hrtime_t	zs_proc_stop;
 	hrtime_t	zs_thread_start;
@@ -317,12 +408,11 @@ typedef struct ztest_shared {
 	uint64_t	zs_vdev_aux;
 	uint64_t	zs_alloc;
 	uint64_t	zs_space;
-	mutex_t		zs_vdev_lock;
-	rwlock_t	zs_name_lock;
-	ztest_info_t	zs_info[ZTEST_FUNCS];
 	uint64_t	zs_splits;
 	uint64_t	zs_mirrors;
-	ztest_ds_t	zs_zd[];
+	uint64_t	zs_metaslab_sz;
+	uint64_t	zs_metaslab_df_alloc_threshold;
+	uint64_t	zs_guid;
 } ztest_shared_t;
 
 #define	ID_PARALLEL	-1ULL
@@ -330,20 +420,26 @@ typedef struct ztest_shared {
 static char ztest_dev_template[] = "%s/%s.%llua";
 static char ztest_aux_template[] = "%s/%s.%s.%llu";
 ztest_shared_t *ztest_shared;
-uint64_t *ztest_seq;
 
-static int ztest_random_fd;
-static int ztest_dump_core = 1;
+static spa_t *ztest_spa = NULL;
+static ztest_ds_t *ztest_ds;
 
+static mutex_t ztest_vdev_lock;
+
+/*
+ * The ztest_name_lock protects the pool and dataset namespace used by
+ * the individual tests. To modify the namespace, consumers must grab
+ * this lock as writer. Grabbing the lock as reader will ensure that the
+ * namespace does not change while the lock is held.
+ */
+static rwlock_t ztest_name_lock;
+
+static boolean_t ztest_dump_core = B_TRUE;
 static boolean_t ztest_exiting;
 
 /* Global commit callback list */
 static ztest_cb_list_t zcl;
 
-extern uint64_t metaslab_gang_bang;
-extern uint64_t metaslab_df_alloc_threshold;
-static uint64_t metaslab_sz;
-
 enum ztest_object {
 	ZTEST_META_DNODE = 0,
 	ZTEST_DIROBJ,
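
The ztest_name_lock comment above states the discipline; in the Solaris
threads API this file uses (rwlock_t, rw_rdlock(), rw_wrlock(), rw_unlock()
from <synch.h>), the two sides look like this sketch, with placeholder
bodies:

	/* Reader: keep names stable while they are in use. */
	(void) rw_rdlock(&ztest_name_lock);
	/* ... open pools or datasets by name ... */
	(void) rw_unlock(&ztest_name_lock);

	/* Writer: any create, destroy, or rename in the namespace. */
	(void) rw_wrlock(&ztest_name_lock);
	/* ... e.g. a dataset rename test ... */
	(void) rw_unlock(&ztest_name_lock);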
@@ -378,21 +474,17 @@ fatal(int do_perror, char *message, ...)
 	va_list args;
 	int save_errno = errno;
 	char buf[FATAL_MSG_SZ];
-	size_t len, blklen = sizeof(buf);
 
 	(void) fflush(stdout);
 
 	va_start(args, message);
-	len = snprintf(buf, blklen, "ztest: ");
-	if (len > blklen)
-		len = blklen;
+	(void) sprintf(buf, "ztest: ");
 	/* LINTED */
-	len += vsnprintf(buf + len, blklen - len, message, args);
+	(void) vsprintf(buf + strlen(buf), message, args);
 	va_end(args);
-	if (len > blklen)
-		len = blklen;
 	if (do_perror) {
-		snprintf(buf + len, blklen - len, ": %s", strerror(save_errno));
+		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
+		    ": %s", strerror(save_errno));
 	}
 	(void) fprintf(stderr, "%s\n", buf);
 	fatal_msg = buf;			/* to ease debugging */
@@ -460,63 +552,71 @@ nicenumtoull(const char *buf)
 static void
 usage(boolean_t requested)
 {
+	const ztest_shared_opts_t *zo = &ztest_opts_defaults;
+
 	char nice_vdev_size[10];
 	char nice_gang_bang[10];
 	FILE *fp = requested ? stdout : stderr;
 
-	nicenum(zopt_vdev_size, nice_vdev_size, sizeof(nice_vdev_size));
-	nicenum(metaslab_gang_bang, nice_gang_bang, sizeof(nice_gang_bang));
+	nicenum(zo->zo_vdev_size, nice_vdev_size);
+	nicenum(zo->zo_metaslab_gang_bang, nice_gang_bang);
 
 	(void) fprintf(fp, "Usage: %s\n"
 	    "\t[-v vdevs (default: %llu)]\n"
 	    "\t[-s size_of_each_vdev (default: %s)]\n"
-	    "\t[-a alignment_shift (default: %d) (use 0 for random)]\n"
+	    "\t[-a alignment_shift (default: %d)] use 0 for random\n"
 	    "\t[-m mirror_copies (default: %d)]\n"
 	    "\t[-r raidz_disks (default: %d)]\n"
 	    "\t[-R raidz_parity (default: %d)]\n"
 	    "\t[-d datasets (default: %d)]\n"
 	    "\t[-t threads (default: %d)]\n"
 	    "\t[-g gang_block_threshold (default: %s)]\n"
-	    "\t[-i initialize pool i times (default: %d)]\n"
-	    "\t[-k kill percentage (default: %llu%%)]\n"
+	    "\t[-i init_count (default: %d)] initialize pool i times\n"
+	    "\t[-k kill_percentage (default: %llu%%)]\n"
 	    "\t[-p pool_name (default: %s)]\n"
-	    "\t[-f file directory for vdev files (default: %s)]\n"
-	    "\t[-V(erbose)] (use multiple times for ever more blather)\n"
-	    "\t[-E(xisting)] (use existing pool instead of creating new one)\n"
-	    "\t[-T time] total run time (default: %llu sec)\n"
-	    "\t[-P passtime] time per pass (default: %llu sec)\n"
+	    "\t[-f dir (default: %s)] file directory for vdev files\n"
+	    "\t[-V] verbose (use multiple times for ever more blather)\n"
+	    "\t[-E] use existing pool instead of creating new one\n"
+	    "\t[-T time (default: %llu sec)] total run time\n"
+	    "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
+	    "\t[-P passtime (default: %llu sec)] time per pass\n"
+	    "\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
 	    "\t[-h] (print help)\n"
 	    "",
-	    cmdname,
-	    (u_longlong_t)zopt_vdevs,			/* -v */
+	    zo->zo_pool,
+	    (u_longlong_t)zo->zo_vdevs,			/* -v */
 	    nice_vdev_size,				/* -s */
-	    zopt_ashift,				/* -a */
-	    zopt_mirrors,				/* -m */
-	    zopt_raidz,					/* -r */
-	    zopt_raidz_parity,				/* -R */
-	    zopt_datasets,				/* -d */
-	    zopt_threads,				/* -t */
+	    zo->zo_ashift,				/* -a */
+	    zo->zo_mirrors,				/* -m */
+	    zo->zo_raidz,				/* -r */
+	    zo->zo_raidz_parity,			/* -R */
+	    zo->zo_datasets,				/* -d */
+	    zo->zo_threads,				/* -t */
 	    nice_gang_bang,				/* -g */
-	    zopt_init,					/* -i */
-	    (u_longlong_t)zopt_killrate,		/* -k */
-	    zopt_pool,					/* -p */
-	    zopt_dir,					/* -f */
-	    (u_longlong_t)zopt_time,			/* -T */
-	    (u_longlong_t)zopt_passtime);		/* -P */
+	    zo->zo_init,				/* -i */
+	    (u_longlong_t)zo->zo_killrate,		/* -k */
+	    zo->zo_pool,				/* -p */
+	    zo->zo_dir,					/* -f */
+	    (u_longlong_t)zo->zo_time,			/* -T */
+	    (u_longlong_t)zo->zo_maxloops,		/* -F */
+	    (u_longlong_t)zo->zo_passtime);
 	exit(requested ? 0 : 1);
 }
 
 static void
 process_options(int argc, char **argv)
 {
+	char *path;
+	ztest_shared_opts_t *zo = &ztest_opts;
+
 	int opt;
 	uint64_t value;
+	char altdir[MAXNAMELEN] = { 0 };
 
-	/* By default, test gang blocks for blocks 32K and greater */
-	metaslab_gang_bang = 32 << 10;
+	bcopy(&ztest_opts_defaults, zo, sizeof (*zo));
 
 	while ((opt = getopt(argc, argv,
-	    "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:h")) != EOF) {
+	    "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:B:")) != EOF) {
 		value = 0;
 		switch (opt) {
 		case 'v':
@@ -532,59 +632,76 @@ process_options(int argc, char **argv)
 		case 'k':
 		case 'T':
 		case 'P':
+		case 'F':
 			value = nicenumtoull(optarg);
 		}
 		switch (opt) {
 		case 'v':
-			zopt_vdevs = value;
+			zo->zo_vdevs = value;
 			break;
 		case 's':
-			zopt_vdev_size = MAX(SPA_MINDEVSIZE, value);
+			zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
 			break;
 		case 'a':
-			zopt_ashift = value;
+			zo->zo_ashift = value;
 			break;
 		case 'm':
-			zopt_mirrors = value;
+			zo->zo_mirrors = value;
 			break;
 		case 'r':
-			zopt_raidz = MAX(1, value);
+			zo->zo_raidz = MAX(1, value);
 			break;
 		case 'R':
-			zopt_raidz_parity = MIN(MAX(value, 1), 3);
+			zo->zo_raidz_parity = MIN(MAX(value, 1), 3);
 			break;
 		case 'd':
-			zopt_datasets = MAX(1, value);
+			zo->zo_datasets = MAX(1, value);
 			break;
 		case 't':
-			zopt_threads = MAX(1, value);
+			zo->zo_threads = MAX(1, value);
 			break;
 		case 'g':
-			metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value);
+			zo->zo_metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1,
+			    value);
 			break;
 		case 'i':
-			zopt_init = value;
+			zo->zo_init = value;
 			break;
 		case 'k':
-			zopt_killrate = value;
+			zo->zo_killrate = value;
 			break;
 		case 'p':
-			zopt_pool = strdup(optarg);
+			(void) strlcpy(zo->zo_pool, optarg,
+			    sizeof (zo->zo_pool));
 			break;
 		case 'f':
-			zopt_dir = strdup(optarg);
+			path = realpath(optarg, NULL);
+			if (path == NULL) {
+				(void) fprintf(stderr, "error: %s: %s\n",
+				    optarg, strerror(errno));
+				usage(B_FALSE);
+			} else {
+				(void) strlcpy(zo->zo_dir, path,
+				    sizeof (zo->zo_dir));
+			}
 			break;
 		case 'V':
-			zopt_verbose++;
+			zo->zo_verbose++;
 			break;
 		case 'E':
-			zopt_init = 0;
+			zo->zo_init = 0;
 			break;
 		case 'T':
-			zopt_time = value;
+			zo->zo_time = value;
 			break;
 		case 'P':
-			zopt_passtime = MAX(1, value);
+			zo->zo_passtime = MAX(1, value);
+			break;
+		case 'F':
+			zo->zo_maxloops = MAX(1, value);
+			break;
+		case 'B':
+			(void) strlcpy(altdir, optarg, sizeof (altdir));
 			break;
 		case 'h':
 			usage(B_TRUE);
@@ -596,17 +713,75 @@ process_options(int argc, char **argv)
 		}
 	}
 
-	zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1);
+	zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1);
 
-	zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time * NANOSEC / zopt_vdevs :
+	zo->zo_vdevtime =
+	    (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
 	    UINT64_MAX >> 2);
+
+	if (strlen(altdir) > 0) {
+		char *cmd;
+		char *realaltdir;
+		char *bin;
+		char *ztest;
+		char *isa;
+		int isalen;
+
+		cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+		realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+
+		VERIFY(NULL != realpath(getexecname(), cmd));
+		if (0 != access(altdir, F_OK)) {
+			ztest_dump_core = B_FALSE;
+			fatal(B_TRUE, "invalid alternate ztest path: %s",
+			    altdir);
+		}
+		VERIFY(NULL != realpath(altdir, realaltdir));
+
+		/*
+		 * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest".
+		 * We want to extract <isa> to determine if we should use
+	 * 32-bit or 64-bit binaries.
+		 */
+		bin = strstr(cmd, "/usr/bin/");
+		ztest = strstr(bin, "/ztest");
+		isa = bin + 9;
+		isalen = ztest - isa;
+		(void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest),
+		    "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa);
+		(void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath),
+		    "%s/usr/lib/%.*s", realaltdir, isalen, isa);
+
+		if (0 != access(zo->zo_alt_ztest, X_OK)) {
+			ztest_dump_core = B_FALSE;
+			fatal(B_TRUE, "invalid alternate ztest: %s",
+			    zo->zo_alt_ztest);
+		} else if (0 != access(zo->zo_alt_libpath, X_OK)) {
+			ztest_dump_core = B_FALSE;
+			fatal(B_TRUE, "invalid alternate lib directory %s",
+			    zo->zo_alt_libpath);
+		}
+
+		umem_free(cmd, MAXPATHLEN);
+		umem_free(realaltdir, MAXPATHLEN);
+	}
 }
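
The extraction above assumes both "/usr/bin/" and "/ztest" occur in the resolved path and will fault otherwise; a minimal standalone sketch of the same parsing with explicit failure checks (the function name and error convention are illustrative, not part of ztest):

	#include <stdio.h>
	#include <string.h>

	/* Pull "<isa>" out of "<anything>/usr/bin/<isa>/ztest"; -1 if malformed. */
	static int
	extract_isa(const char *cmd, char *isa, size_t isalen)
	{
		const char *bin = strstr(cmd, "/usr/bin/");
		const char *zt;

		if (bin == NULL)
			return (-1);
		zt = strstr(bin + 9, "/ztest");	/* 9 == strlen("/usr/bin/") */
		if (zt == NULL)
			return (-1);
		(void) snprintf(isa, isalen, "%.*s",
		    (int)(zt - (bin + 9)), bin + 9);
		return (0);
	}
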
 
 static void
 ztest_kill(ztest_shared_t *zs)
 {
-	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa));
-	zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa));
+	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
+	zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));
+
+	/*
+	 * Before we kill off ztest, make sure that the config is updated;
+	 * otherwise a restart could import the pool from a stale cache
+	 * file.  See the comment above spa_config_sync().
+	 */
+	mutex_enter(&spa_namespace_lock);
+	spa_config_sync(ztest_spa, B_FALSE, B_FALSE);
+	mutex_exit(&spa_namespace_lock);
+
+	zfs_dbgmsg_print(FTAG);
 	(void) kill(getpid(), SIGKILL);
 }
 
@@ -615,10 +790,12 @@ ztest_random(uint64_t range)
 {
 	uint64_t r;
 
+	ASSERT3S(ztest_fd_rand, >=, 0);
+
 	if (range == 0)
 		return (0);
 
-	if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r))
+	if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
 		fatal(1, "short read from /dev/urandom");
 
 	return (r % range);
@@ -634,13 +811,13 @@ ztest_record_enospc(const char *s)
 static uint64_t
 ztest_get_ashift(void)
 {
-	if (zopt_ashift == 0)
-		return (SPA_MINBLOCKSHIFT + ztest_random(3));
-	return (zopt_ashift);
+	if (ztest_opts.zo_ashift == 0)
+		return (SPA_MINBLOCKSHIFT + ztest_random(5));
+	return (ztest_opts.zo_ashift);
 }
 
 static nvlist_t *
-make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift)
+make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
 {
 	char pathbuf[MAXPATHLEN];
 	uint64_t vdev;
@@ -654,12 +831,15 @@ make_vdev_file(char *path, char *aux, si
 
 		if (aux != NULL) {
 			vdev = ztest_shared->zs_vdev_aux;
-			(void) snprintf(path, sizeof(pathbuf), ztest_aux_template,
-			    zopt_dir, zopt_pool, aux, vdev);
+			(void) snprintf(path, sizeof (pathbuf),
+			    ztest_aux_template, ztest_opts.zo_dir,
+			    pool == NULL ? ztest_opts.zo_pool : pool,
+			    aux, vdev);
 		} else {
 			vdev = ztest_shared->zs_vdev_next_leaf++;
-			(void) snprintf(path, sizeof(pathbuf), ztest_dev_template,
-			    zopt_dir, zopt_pool, vdev);
+			(void) snprintf(path, sizeof (pathbuf),
+			    ztest_dev_template, ztest_opts.zo_dir,
+			    pool == NULL ? ztest_opts.zo_pool : pool, vdev);
 		}
 	}
 
@@ -681,23 +861,24 @@ make_vdev_file(char *path, char *aux, si
 }
 
 static nvlist_t *
-make_vdev_raidz(char *path, char *aux, size_t size, uint64_t ashift, int r)
+make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
+    uint64_t ashift, int r)
 {
 	nvlist_t *raidz, **child;
 	int c;
 
 	if (r < 2)
-		return (make_vdev_file(path, aux, size, ashift));
+		return (make_vdev_file(path, aux, pool, size, ashift));
 	child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);
 
 	for (c = 0; c < r; c++)
-		child[c] = make_vdev_file(path, aux, size, ashift);
+		child[c] = make_vdev_file(path, aux, pool, size, ashift);
 
 	VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_RAIDZ) == 0);
 	VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
-	    zopt_raidz_parity) == 0);
+	    ztest_opts.zo_raidz_parity) == 0);
 	VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
 	    child, r) == 0);
 
@@ -710,19 +891,19 @@ make_vdev_raidz(char *path, char *aux, s
 }
 
 static nvlist_t *
-make_vdev_mirror(char *path, char *aux, size_t size, uint64_t ashift,
-	int r, int m)
+make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
+    uint64_t ashift, int r, int m)
 {
 	nvlist_t *mirror, **child;
 	int c;
 
 	if (m < 1)
-		return (make_vdev_raidz(path, aux, size, ashift, r));
+		return (make_vdev_raidz(path, aux, pool, size, ashift, r));
 
 	child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);
 
 	for (c = 0; c < m; c++)
-		child[c] = make_vdev_raidz(path, aux, size, ashift, r);
+		child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r);
 
 	VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
 	VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
@@ -739,8 +920,8 @@ make_vdev_mirror(char *path, char *aux, 
 }
 
 static nvlist_t *
-make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift,
-	int log, int r, int m, int t)
+make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
+    int log, int r, int m, int t)
 {
 	nvlist_t *root, **child;
 	int c;
@@ -750,7 +931,8 @@ make_vdev_root(char *path, char *aux, si
 	child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);
 
 	for (c = 0; c < t; c++) {
-		child[c] = make_vdev_mirror(path, aux, size, ashift, r, m);
+		child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
+		    r, m);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
 		    log) == 0);
 	}
@@ -768,11 +950,40 @@ make_vdev_root(char *path, char *aux, si
 	return (root);
 }
 
+/*
+ * Find a random spa version. Returns back a random spa version in the
+ * range [initial_version, SPA_VERSION_FEATURES].
+ */
+static uint64_t
+ztest_random_spa_version(uint64_t initial_version)
+{
+	uint64_t version = initial_version;
+
+	if (version <= SPA_VERSION_BEFORE_FEATURES) {
+		version = version +
+		    ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1);
+	}
+
+	if (version > SPA_VERSION_BEFORE_FEATURES)
+		version = SPA_VERSION_FEATURES;
+
+	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
+	return (version);
+}
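
A hedged usage sketch of the contract (an illustrative self-check, not part of the test suite): starting at or below SPA_VERSION_BEFORE_FEATURES the result stays in [initial_version, SPA_VERSION_BEFORE_FEATURES]; any newer starting point collapses to SPA_VERSION_FEATURES.

	for (int i = 0; i < 1000; i++) {
		uint64_t v = ztest_random_spa_version(SPA_VERSION_INITIAL);

		ASSERT3U(v, >=, SPA_VERSION_INITIAL);
		ASSERT3U(v, <=, SPA_VERSION_BEFORE_FEATURES);
	}
	VERIFY3U(ztest_random_spa_version(SPA_VERSION_FEATURES), ==,
	    SPA_VERSION_FEATURES);
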
+
 static int
 ztest_random_blocksize(void)
 {
-	return (1 << (SPA_MINBLOCKSHIFT +
-	    ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)));
+	uint64_t block_shift;
+	/*
+	 * Choose a block size >= the ashift.
+	 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks.
+	 */
+	int maxbs = SPA_OLD_MAXBLOCKSHIFT;
+	if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE)
+		maxbs = 20;	/* 1 << 20 == 1MB */
+	block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1);
+	return (1 << (SPA_MINBLOCKSHIFT + block_shift));
 }
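
Worked through with concrete numbers: with spa_max_ashift == 9 on a pool that supports large blocks, block_shift is uniform over [0, 20 - 9], so the returned sizes span 1 << 9 = 512 bytes through 1 << 20 = 1 MB; with spa_max_ashift == 12 the range narrows to [0, 8] and the largest candidate drops to 1 << 17 = 128 KB.
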
 
 static int
@@ -822,20 +1033,18 @@ ztest_dsl_prop_set_uint64(char *osname, 
 	uint64_t curval;
 	int error;
 
-	error = dsl_prop_set(osname, propname,
-	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL),
-	    sizeof (value), 1, &value);
+	error = dsl_prop_set_int(osname, propname,
+	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);
 
 	if (error == ENOSPC) {
 		ztest_record_enospc(FTAG);
 		return (error);
 	}
-	ASSERT3U(error, ==, 0);
+	ASSERT0(error);
 
-	VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval),
-	    1, &curval, setpoint), ==, 0);
+	VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));
 
-	if (zopt_verbose >= 6) {
+	if (ztest_opts.zo_verbose >= 6) {
 		VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0);
 		(void) printf("%s %s = %s at '%s'\n",
 		    osname, propname, valname, setpoint);
@@ -845,9 +1054,9 @@ ztest_dsl_prop_set_uint64(char *osname, 
 }
 
 static int
-ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value)
+ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
 {
-	spa_t *spa = zs->zs_spa;
+	spa_t *spa = ztest_spa;
 	nvlist_t *props = NULL;
 	int error;
 
@@ -862,7 +1071,7 @@ ztest_spa_prop_set_uint64(ztest_shared_t
 		ztest_record_enospc(FTAG);
 		return (error);
 	}
-	ASSERT3U(error, ==, 0);
+	ASSERT0(error);
 
 	return (error);
 }
@@ -969,13 +1178,17 @@ ztest_range_unlock(rl_t *rl)
 }
 
 static void
-ztest_zd_init(ztest_ds_t *zd, objset_t *os)
+ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
 {
 	zd->zd_os = os;
 	zd->zd_zilog = dmu_objset_zil(os);
-	zd->zd_seq = 0;
+	zd->zd_shared = szd;
 	dmu_objset_name(os, zd->zd_name);
 
+	if (zd->zd_shared != NULL)
+		zd->zd_shared->zd_seq = 0;
+
+	VERIFY(rwlock_init(&zd->zd_zilog_lock, USYNC_THREAD, NULL) == 0);
 	VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);
 
 	for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
@@ -1065,13 +1278,13 @@ static void
 ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
     uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
 {
-	ASSERT(bt->bt_magic == BT_MAGIC);
-	ASSERT(bt->bt_objset == dmu_objset_id(os));
-	ASSERT(bt->bt_object == object);
-	ASSERT(bt->bt_offset == offset);
-	ASSERT(bt->bt_gen <= gen);
-	ASSERT(bt->bt_txg <= txg);
-	ASSERT(bt->bt_crtxg == crtxg);
+	ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
+	ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
+	ASSERT3U(bt->bt_object, ==, object);
+	ASSERT3U(bt->bt_offset, ==, offset);
+	ASSERT3U(bt->bt_gen, <=, gen);
+	ASSERT3U(bt->bt_txg, <=, txg);
+	ASSERT3U(bt->bt_crtxg, ==, crtxg);
 }
 
 static ztest_block_tag_t *
@@ -1098,7 +1311,7 @@ ztest_bt_bonus(dmu_buf_t *db)
 #define	lrz_bonustype	lr_rdev
 #define	lrz_bonuslen	lr_crtime[1]
 
-static uint64_t
+static void
 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
 {
 	char *name = (void *)(lr + 1);		/* name follows lr */
@@ -1106,40 +1319,41 @@ ztest_log_create(ztest_ds_t *zd, dmu_tx_
 	itx_t *itx;
 
 	if (zil_replaying(zd->zd_zilog, tx))
-		return (0);
+		return;
 
 	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) + namesize - sizeof (lr_t));
 
-	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+	zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
-static uint64_t
-ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr)
+static void
+ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
 {
 	char *name = (void *)(lr + 1);		/* name follows lr */
 	size_t namesize = strlen(name) + 1;
 	itx_t *itx;
 
 	if (zil_replaying(zd->zd_zilog, tx))
-		return (0);
+		return;
 
 	itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) + namesize - sizeof (lr_t));
 
-	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+	itx->itx_oid = object;
+	zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
-static uint64_t
+static void
 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
 {
 	itx_t *itx;
 	itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);
 
 	if (zil_replaying(zd->zd_zilog, tx))
-		return (0);
+		return;
 
 	if (lr->lr_length > ZIL_MAX_LOG_DATA)
 		write_state = WR_INDIRECT;
@@ -1157,42 +1371,43 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t
 	itx->itx_private = zd;
 	itx->itx_wr_state = write_state;
 	itx->itx_sync = (ztest_random(8) == 0);
-	itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
 
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) - sizeof (lr_t));
 
-	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+	zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
-static uint64_t
+static void
 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
 {
 	itx_t *itx;
 
 	if (zil_replaying(zd->zd_zilog, tx))
-		return (0);
+		return;
 
 	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) - sizeof (lr_t));
 
-	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+	itx->itx_sync = B_FALSE;
+	zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
-static uint64_t
+static void
 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
 {
 	itx_t *itx;
 
 	if (zil_replaying(zd->zd_zilog, tx))
-		return (0);
+		return;
 
 	itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) - sizeof (lr_t));
 
-	return (zil_itx_assign(zd->zd_zilog, itx, tx));
+	itx->itx_sync = B_FALSE;
+	zil_itx_assign(zd->zd_zilog, itx, tx);
 }
 
 /*
@@ -1324,7 +1539,7 @@ ztest_replay_remove(ztest_ds_t *zd, lr_r
 
 	VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));
 
-	(void) ztest_log_remove(zd, tx, lr);
+	(void) ztest_log_remove(zd, tx, lr, object);
 
 	dmu_tx_commit(tx);
 
@@ -1552,7 +1767,7 @@ ztest_replay_setattr(ztest_ds_t *zd, lr_
 
 	ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
 	ASSERT3U(lr->lr_size, <=, db->db_size);
-	VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0);
+	VERIFY0(dmu_set_bonus(db, lr->lr_size, tx));
 	bbt = ztest_bt_bonus(db);
 
 	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);
@@ -1671,9 +1886,16 @@ ztest_get_data(void *arg, lr_write_t *lr
 		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
 		    RL_READER);
 
-		error = dmu_buf_hold(os, object, offset, zgd, &db);
+		error = dmu_buf_hold(os, object, offset, zgd, &db,
+		    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
+			blkptr_t *obp = dmu_buf_get_blkptr(db);
+			if (obp) {
+				ASSERT(BP_IS_HOLE(bp));
+				*bp = *obp;
+			}
+
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 
@@ -1819,6 +2041,9 @@ ztest_remove(ztest_ds_t *zd, ztest_od_t 
 			continue;
 		}
 
+		/*
+		 * No object was found.
+		 */
 		if (od->od_object == 0)
 			continue;
 
@@ -1934,6 +2159,7 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t 
 static void
 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
 {
+	int err;
 	ztest_block_tag_t wbt;
 	dmu_object_info_t doi;
 	enum ztest_io_type io_type;
@@ -1951,6 +2177,8 @@ ztest_io(ztest_ds_t *zd, uint64_t object
 	if (ztest_random(2) == 0)
 		io_type = ZTEST_IO_WRITE_TAG;
 
+	(void) rw_rdlock(&zd->zd_zilog_lock);
+
 	switch (io_type) {
 
 	case ZTEST_IO_WRITE_TAG:
@@ -1984,8 +2212,29 @@ ztest_io(ztest_ds_t *zd, uint64_t object
 	case ZTEST_IO_SETATTR:
 		(void) ztest_setattr(zd, object);
 		break;
+
+	case ZTEST_IO_REWRITE:
+		(void) rw_rdlock(&ztest_name_lock);
+		err = ztest_dsl_prop_set_uint64(zd->zd_name,
+		    ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
+		    B_FALSE);
+		VERIFY(err == 0 || err == ENOSPC);
+		err = ztest_dsl_prop_set_uint64(zd->zd_name,
+		    ZFS_PROP_COMPRESSION,
+		    ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
+		    B_FALSE);
+		VERIFY(err == 0 || err == ENOSPC);
+		(void) rw_unlock(&ztest_name_lock);
+
+		VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
+		    DMU_READ_NO_PREFETCH));
+
+		(void) ztest_write(zd, object, offset, blocksize, data);
+		break;
 	}
 
+	(void) rw_unlock(&zd->zd_zilog_lock);
+
 	umem_free(data, blocksize);
 }
 
@@ -2040,7 +2289,9 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_
 {
 	zilog_t *zilog = zd->zd_zilog;
 
-	zil_commit(zilog, UINT64_MAX, ztest_random(ZTEST_OBJECTS));
+	(void) rw_rdlock(&zd->zd_zilog_lock);
+
+	zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
 
 	/*
 	 * Remember the committed values in zd, which is in parent/child
@@ -2048,9 +2299,42 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_
 	 * will verify that the log really does contain this record.
 	 */
 	mutex_enter(&zilog->zl_lock);
-	ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq);
-	zd->zd_seq = zilog->zl_commit_lr_seq;
+	ASSERT(zd->zd_shared != NULL);
+	ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq);
+	zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq;
 	mutex_exit(&zilog->zl_lock);
+
+	(void) rw_unlock(&zd->zd_zilog_lock);
+}
+
+/*
+ * This function is designed to simulate the operations that occur during a
+ * mount/unmount operation.  We hold the dataset across these operations in an
+ * attempt to expose any implicit assumptions about ZIL management.
+ */
+/* ARGSUSED */
+void
+ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
+{
+	objset_t *os = zd->zd_os;
+
+	/*
+	 * We grab the zd_dirobj_lock to ensure that no other thread is
+	 * updating the zil (i.e. adding in-memory log records) and the
+	 * zd_zilog_lock to block any I/O.
+	 */
+	VERIFY0(mutex_lock(&zd->zd_dirobj_lock));
+	(void) rw_wrlock(&zd->zd_zilog_lock);
+
+	/* zfsvfs_teardown() */
+	zil_close(zd->zd_zilog);
+
+	/* zfsvfs_setup() */
+	VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
+	zil_replay(os, zd, ztest_replay_vector);
+
+	(void) rw_unlock(&zd->zd_zilog_lock);
+	VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
 }
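
For contrast, the reader side that the write lock above quiesces, as used by ztest_io() and ztest_zil_commit() elsewhere in this change:

	(void) rw_rdlock(&zd->zd_zilog_lock);	/* shared among I/O threads */
	/* ... reads, writes, or zil_commit() against zd->zd_zilog ... */
	(void) rw_unlock(&zd->zd_zilog_lock);
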
 
 /*
@@ -2061,39 +2345,111 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_
 void
 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
-	ztest_shared_t *zs = ztest_shared;
+	ztest_shared_opts_t *zo = &ztest_opts;
 	spa_t *spa;
 	nvlist_t *nvroot;
 
 	/*
 	 * Attempt to create using a bad file.
 	 */
-	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
+	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
 	VERIFY3U(ENOENT, ==,
-	    spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
+	    spa_create("ztest_bad_file", nvroot, NULL, NULL));
 	nvlist_free(nvroot);
 
 	/*
 	 * Attempt to create using a bad mirror.
 	 */
-	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
+	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1);
 	VERIFY3U(ENOENT, ==,
-	    spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
+	    spa_create("ztest_bad_mirror", nvroot, NULL, NULL));
 	nvlist_free(nvroot);
 
 	/*
 	 * Attempt to create an existing pool.  It shouldn't matter
 	 * what's in the nvroot; we should fail with EEXIST.
 	 */
-	(void) rw_rdlock(&zs->zs_name_lock);
-	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
-	VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL));
+	(void) rw_rdlock(&ztest_name_lock);
+	nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
+	VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL));
 	nvlist_free(nvroot);
-	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
-	VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool));
+	VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
+	VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool));
+	spa_close(spa, FTAG);
+
+	(void) rw_unlock(&ztest_name_lock);
+}
+
+/* ARGSUSED */
+void
+ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa;
+	uint64_t initial_version = SPA_VERSION_INITIAL;
+	uint64_t version, newversion;
+	nvlist_t *nvroot, *props;
+	char *name;
+
+	VERIFY0(mutex_lock(&ztest_vdev_lock));
+	name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
+
+	/*
+	 * Clean up from previous runs.
+	 */
+	(void) spa_destroy(name);
+
+	nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
+	    0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
+
+	/*
+	 * If we're configuring a RAIDZ device, make sure that the
+	 * initial version is capable of supporting that feature.
+	 */
+	switch (ztest_opts.zo_raidz_parity) {
+	case 0:
+	case 1:
+		initial_version = SPA_VERSION_INITIAL;
+		break;
+	case 2:
+		initial_version = SPA_VERSION_RAIDZ2;
+		break;
+	case 3:
+		initial_version = SPA_VERSION_RAIDZ3;
+		break;
+	}
+
+	/*
+	 * Create a pool with a spa version that can be upgraded. Pick
+	 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES.
+	 */
+	do {
+		version = ztest_random_spa_version(initial_version);
+	} while (version > SPA_VERSION_BEFORE_FEATURES);
+
+	props = fnvlist_alloc();
+	fnvlist_add_uint64(props,
+	    zpool_prop_to_name(ZPOOL_PROP_VERSION), version);
+	VERIFY0(spa_create(name, nvroot, props, NULL));
+	fnvlist_free(nvroot);
+	fnvlist_free(props);
+
+	VERIFY0(spa_open(name, &spa, FTAG));
+	VERIFY3U(spa_version(spa), ==, version);
+	newversion = ztest_random_spa_version(version + 1);
+
+	if (ztest_opts.zo_verbose >= 4) {
+		(void) printf("upgrading spa version from %llu to %llu\n",
+		    (u_longlong_t)version, (u_longlong_t)newversion);
+	}
+
+	spa_upgrade(spa, newversion);
+	VERIFY3U(spa_version(spa), >, version);
+	VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
+	    zpool_prop_to_name(ZPOOL_PROP_VERSION)));
 	spa_close(spa, FTAG);
 
-	(void) rw_unlock(&zs->zs_name_lock);
+	strfree(name);
+	VERIFY0(mutex_unlock(&ztest_vdev_lock));
 }
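
The creation-time property plumbing used above, shown in isolation; the pool name and fixed version are hypothetical, nvroot is assumed to be a valid vdev tree, and the fnvlist_* variants abort internally on failure, which is why no error checks appear:

	nvlist_t *props = fnvlist_alloc();

	fnvlist_add_uint64(props,
	    zpool_prop_to_name(ZPOOL_PROP_VERSION), SPA_VERSION_28);
	VERIFY0(spa_create("tank_v28", nvroot, props, NULL));
	fnvlist_free(props);
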
 
 static vdev_t *
@@ -2140,14 +2496,14 @@ void
 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
-	spa_t *spa = zs->zs_spa;
+	spa_t *spa = ztest_spa;
 	uint64_t leaves;
 	uint64_t guid;
 	nvlist_t *nvroot;
 	int error;
 
-	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
-	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz;
+	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
+	leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
@@ -2172,9 +2528,9 @@ ztest_vdev_add_remove(ztest_ds_t *zd, ui
 		 * dmu_objset_destroy() to fail with EBUSY thus
 		 * leaving the dataset in an inconsistent state.
 		 */
-		VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0);
+		VERIFY(rw_wrlock(&ztest_name_lock) == 0);
 		error = spa_vdev_remove(spa, guid, B_FALSE);
-		VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0);
+		VERIFY(rw_unlock(&ztest_name_lock) == 0);
 
 		if (error && error != EEXIST)
 			fatal(0, "spa_vdev_remove() = %d", error);
@@ -2184,8 +2540,10 @@ ztest_vdev_add_remove(ztest_ds_t *zd, ui
 		/*
 		 * Make 1/4 of the devices be log devices.
 		 */
-		nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
-		    ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1);
+		nvroot = make_vdev_root(NULL, NULL, NULL,
+		    ztest_opts.zo_vdev_size, 0,
+		    ztest_random(4) == 0, ztest_opts.zo_raidz,
+		    zs->zs_mirrors, 1);
 
 		error = spa_vdev_add(spa, nvroot);
 		nvlist_free(nvroot);
@@ -2196,7 +2554,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, ui
 			fatal(0, "spa_vdev_add() = %d", error);
 	}
 
-	VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0);
+	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 }
 
 /*
@@ -2207,7 +2565,7 @@ void
 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
-	spa_t *spa = zs->zs_spa;
+	spa_t *spa = ztest_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	spa_aux_vdev_t *sav;
 	char *aux;
@@ -2222,7 +2580,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd
 		aux = ZPOOL_CONFIG_L2CACHE;
 	}
 
-	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
@@ -2239,8 +2597,9 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd
 		for (;;) {
 			char path[MAXPATHLEN];
 			int c;
-			(void) snprintf(path, sizeof(path), ztest_aux_template, zopt_dir,
-			    zopt_pool, aux, zs->zs_vdev_aux);
+			(void) snprintf(path, sizeof (path), ztest_aux_template,
+			    ztest_opts.zo_dir, ztest_opts.zo_pool, aux,
+			    zs->zs_vdev_aux);
 			for (c = 0; c < sav->sav_count; c++)
 				if (strcmp(sav->sav_vdevs[c]->vdev_path,
 				    path) == 0)
@@ -2258,8 +2617,8 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd
 		/*
 		 * Add a new device.
 		 */
-		nvlist_t *nvroot = make_vdev_root(NULL, aux,
-		    (zopt_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
+		nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
+		    (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
 		error = spa_vdev_add(spa, nvroot);
 		if (error != 0)
 			fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
@@ -2278,7 +2637,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd
 			fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
 	}
 
-	VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 }
 
 /*
@@ -2289,17 +2648,17 @@ void
 ztest_split_pool(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
-	spa_t *spa = zs->zs_spa;
+	spa_t *spa = ztest_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	nvlist_t *tree, **child, *config, *split, **schild;
 	uint_t c, children, schildren = 0, lastlogid = 0;
 	int error = 0;
 
-	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 
 	/* ensure we have a useable config; mirrors of raidz aren't supported */
-	if (zs->zs_mirrors < 3 || zopt_raidz > 1) {
-		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+	if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
+		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 		return;
 	}
 
@@ -2309,8 +2668,11 @@ ztest_split_pool(ztest_ds_t *zd, uint64_
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
 	/* generate a config from the existing config */
+	mutex_enter(&spa->spa_props_lock);
 	VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
 	    &tree) == 0);
+	mutex_exit(&spa->spa_props_lock);
+
 	VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
 	    &children) == 0);
 
@@ -2355,9 +2717,9 @@ ztest_split_pool(ztest_ds_t *zd, uint64_
 
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
-	(void) rw_wrlock(&zs->zs_name_lock);
+	(void) rw_wrlock(&ztest_name_lock);
 	error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
-	(void) rw_unlock(&zs->zs_name_lock);
+	(void) rw_unlock(&ztest_name_lock);
 
 	nvlist_free(config);
 
@@ -2370,7 +2732,7 @@ ztest_split_pool(ztest_ds_t *zd, uint64_
 		++zs->zs_splits;
 		--zs->zs_mirrors;
 	}
-	VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 
 }
 
@@ -2382,7 +2744,7 @@ void
 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
-	spa_t *spa = zs->zs_spa;
+	spa_t *spa = ztest_spa;
 	spa_aux_vdev_t *sav = &spa->spa_spares;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *pvd;
@@ -2391,7 +2753,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd,
 	uint64_t leaf, top;
 	uint64_t ashift = ztest_get_ashift();
 	uint64_t oldguid, pguid;
-	size_t oldsize, newsize;
+	uint64_t oldsize, newsize;
 	char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
 	int replacing;
 	int oldvd_has_siblings = B_FALSE;
@@ -2399,8 +2761,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd,
 	int oldvd_is_log;
 	int error, expected_error;
 
-	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
-	leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz;
+	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
+	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
@@ -2426,12 +2788,12 @@ ztest_vdev_attach_detach(ztest_ds_t *zd,
 	if (zs->zs_mirrors >= 1) {
 		ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
 		ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
-		oldvd = oldvd->vdev_child[leaf / zopt_raidz];
+		oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
 	}
-	if (zopt_raidz > 1) {
+	if (ztest_opts.zo_raidz > 1) {
 		ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
-		ASSERT(oldvd->vdev_children == zopt_raidz);
-		oldvd = oldvd->vdev_child[leaf % zopt_raidz];
+		ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
+		oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
 	}
 
 	/*
@@ -2460,7 +2822,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd,
 		if (error != 0 && error != ENODEV && error != EBUSY &&
 		    error != ENOTSUP)
 			fatal(0, "detach (%s) returned %d", oldpath, error);
-		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 		return;
 	}
 
@@ -2474,7 +2836,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd,
 		(void) strcpy(newpath, newvd->vdev_path);
 	} else {
 		(void) snprintf(newpath, sizeof (newpath), ztest_dev_template,
-		    zopt_dir, zopt_pool, top * leaves + leaf);
+		    ztest_opts.zo_dir, ztest_opts.zo_pool,
+		    top * leaves + leaf);
 		if (ztest_random(2) == 0)
 			newpath[strlen(newpath) - 1] = 'b';
 		newvd = vdev_lookup_by_path(rvd, newpath);
@@ -2523,7 +2886,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd,
 	/*
 	 * Build the nvlist describing newpath.
 	 */
-	root = make_vdev_root(newpath, NULL, newvd == NULL ? newsize : 0,
+	root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
 	    ashift, 0, 0, 0, 1);
 
 	error = spa_vdev_attach(spa, oldguid, root, replacing);
@@ -2549,11 +2912,11 @@ ztest_vdev_attach_detach(ztest_ds_t *zd,
 	if (error != expected_error && expected_error != EBUSY) {
 		fatal(0, "attach (%s %llu, %s %llu, %d) "
 		    "returned %d, expected %d",
-		    oldpath, (longlong_t)oldsize, newpath,
-		    (longlong_t)newsize, replacing, error, expected_error);
+		    oldpath, oldsize, newpath,
+		    newsize, replacing, error, expected_error);
 	}
 
-	VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 }
 
 /*
@@ -2576,7 +2939,7 @@ grow_vdev(vdev_t *vd, void *arg)
 	fsize = lseek(fd, 0, SEEK_END);
 	(void) ftruncate(fd, *newsize);
 
-	if (zopt_verbose >= 6) {
+	if (ztest_opts.zo_verbose >= 6) {
 		(void) printf("%s grew from %lu to %lu bytes\n",
 		    vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
 	}
@@ -2612,7 +2975,7 @@ online_vdev(vdev_t *vd, void *arg)
 	 * vdev_open fails is by checking the returned newstate.
 	 */
 	if (error || newstate != VDEV_STATE_HEALTHY) {
-		if (zopt_verbose >= 5) {
+		if (ztest_opts.zo_verbose >= 5) {
 			(void) printf("Unable to expand vdev, state %llu, "
 			    "error %d\n", (u_longlong_t)newstate, error);
 		}
@@ -2627,7 +2990,7 @@ online_vdev(vdev_t *vd, void *arg)
 	 * trying to online it.
 	 */
 	if (generation != spa->spa_config_generation) {
-		if (zopt_verbose >= 5) {
+		if (ztest_opts.zo_verbose >= 5) {
 			(void) printf("vdev configuration has changed, "
 			    "guid %llu, state %llu, expected gen %llu, "
 			    "got gen %llu\n",
@@ -2673,8 +3036,7 @@ vdev_walk_tree(vdev_t *vd, vdev_t *(*fun
 void
 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
 {
-	ztest_shared_t *zs = ztest_shared;
-	spa_t *spa = zs->zs_spa;
+	spa_t *spa = ztest_spa;
 	vdev_t *vd, *tvd;
 	metaslab_class_t *mc;
 	metaslab_group_t *mg;
@@ -2682,7 +3044,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, ui
 	uint64_t top;
 	uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
 
-	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 
 	top = ztest_random_vdev_top(spa, B_TRUE);
@@ -2708,16 +3070,16 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, ui
 	 * original size, and it has a valid psize.
 	 */
 	if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
-	    psize == 0 || psize >= 4 * zopt_vdev_size) {
+	    psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
 		spa_config_exit(spa, SCL_STATE, spa);
-		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 		return;
 	}
 	ASSERT(psize > 0);
 	newsize = psize + psize / 8;
 	ASSERT3U(newsize, >, psize);
 
-	if (zopt_verbose >= 6) {
+	if (ztest_opts.zo_verbose >= 6) {
 		(void) printf("Expanding LUN %s from %lu to %lu\n",
 		    vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
 	}
@@ -2730,12 +3092,12 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, ui
 	if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
 	    vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
 	    tvd->vdev_state != VDEV_STATE_HEALTHY) {
-		if (zopt_verbose >= 5) {
+		if (ztest_opts.zo_verbose >= 5) {
 			(void) printf("Could not expand LUN because "
 			    "the vdev configuration changed.\n");
 		}
 		spa_config_exit(spa, SCL_STATE, spa);
-		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 		return;
 	}
 
@@ -2764,12 +3126,12 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, ui
 	new_class_space = metaslab_class_get_space(mc);
 
 	if (tvd->vdev_mg != mg || mg->mg_class != mc) {
-		if (zopt_verbose >= 5) {
+		if (ztest_opts.zo_verbose >= 5) {
 			(void) printf("Could not verify LUN expansion due to "
 			    "intervening vdev offline or remove.\n");
 		}
 		spa_config_exit(spa, SCL_STATE, spa);
-		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 		return;
 	}
 
@@ -2787,17 +3149,17 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, ui
 		fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
 		    old_class_space, new_class_space);
 
-	if (zopt_verbose >= 5) {
+	if (ztest_opts.zo_verbose >= 5) {
 		char oldnumbuf[6], newnumbuf[6];
 
-		nicenum(old_class_space, oldnumbuf, sizeof(oldnumbuf));
-		nicenum(new_class_space, newnumbuf, sizeof(newnumbuf));
+		nicenum(old_class_space, oldnumbuf);
+		nicenum(new_class_space, newnumbuf);
 		(void) printf("%s grew from %s to %s\n",
 		    spa->spa_name, oldnumbuf, newnumbuf);
 	}
 
 	spa_config_exit(spa, SCL_STATE, spa);
-	VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 }
 
 /*
@@ -2814,6 +3176,22 @@ ztest_objset_create_cb(objset_t *os, voi
 	    DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
 }
 
+static int
+ztest_dataset_create(char *dsname)
+{
+	uint64_t zilset = ztest_random(100);
+	int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0,
+	    ztest_objset_create_cb, NULL);
+
+	/* ~80% of datasets keep the default sync semantics. */
+	if (err || zilset < 80)
+		return (err);
+
+	if (ztest_opts.zo_verbose >= 6)
+		(void) printf("Setting dataset %s to sync always\n", dsname);
+	return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
+	    ZFS_SYNC_ALWAYS, B_FALSE));
+}
+
 /* ARGSUSED */
 static int
 ztest_objset_destroy_cb(const char *name, void *arg)
@@ -2825,53 +3203,57 @@ ztest_objset_destroy_cb(const char *name
 	/*
 	 * Verify that the dataset contains a directory object.
 	 */
-	VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os));
+	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os));
 	error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
 	if (error != ENOENT) {
 		/* We could have crashed in the middle of destroying it */
-		ASSERT3U(error, ==, 0);
+		ASSERT0(error);
 		ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
 		ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
 	}
-	dmu_objset_rele(os, FTAG);
+	dmu_objset_disown(os, FTAG);
 
 	/*
 	 * Destroy the dataset.
 	 */
-	VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE));
+	if (strchr(name, '@') != NULL) {
+		VERIFY0(dsl_destroy_snapshot(name, B_FALSE));
+	} else {
+		VERIFY0(dsl_destroy_head(name));
+	}
 	return (0);
 }
 
 static boolean_t
 ztest_snapshot_create(char *osname, uint64_t id)
 {
-	char snapname[MAXNAMELEN];
+	char snapname[ZFS_MAX_DATASET_NAME_LEN];
 	int error;
 
-	(void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
-	    (u_longlong_t)id);
+	(void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id);
 
-	error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1,
-	    NULL, B_FALSE);
+	error = dmu_objset_snapshot_one(osname, snapname);
 	if (error == ENOSPC) {
 		ztest_record_enospc(FTAG);
 		return (B_FALSE);
 	}
-	if (error != 0 && error != EEXIST)
-		fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error);
+	if (error != 0 && error != EEXIST) {
+		fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname,
+		    snapname, error);
+	}
 	return (B_TRUE);
 }
 
 static boolean_t
 ztest_snapshot_destroy(char *osname, uint64_t id)
 {
-	char snapname[MAXNAMELEN];
+	char snapname[ZFS_MAX_DATASET_NAME_LEN];
 	int error;
 
-	(void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
+	(void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname,
 	    (u_longlong_t)id);
 
-	error = dmu_objset_destroy(snapname, B_FALSE);
+	error = dsl_destroy_snapshot(snapname, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
 	return (B_TRUE);
@@ -2881,18 +3263,17 @@ ztest_snapshot_destroy(char *osname, uin
 void
 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
-	ztest_shared_t *zs = ztest_shared;
 	ztest_ds_t zdtmp;
 	int iters;
 	int error;
 	objset_t *os, *os2;
-	char name[MAXNAMELEN];
+	char name[ZFS_MAX_DATASET_NAME_LEN];
 	zilog_t *zilog;
 
-	(void) rw_rdlock(&zs->zs_name_lock);
+	(void) rw_rdlock(&ztest_name_lock);
 
-	(void) snprintf(name, MAXNAMELEN, "%s/temp_%llu",
-	    zs->zs_pool, (u_longlong_t)id);
+	(void) snprintf(name, sizeof (name), "%s/temp_%llu",
+	    ztest_opts.zo_pool, (u_longlong_t)id);
 
 	/*
 	 * If this dataset exists from a previous run, process its replay log
@@ -2901,7 +3282,7 @@ ztest_dmu_objset_create_destroy(ztest_ds
 	 */
 	if (ztest_random(2) == 0 &&
 	    dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
-		ztest_zd_init(&zdtmp, os);
+		ztest_zd_init(&zdtmp, NULL, os);
 		zil_replay(os, &zdtmp, ztest_replay_vector);
 		ztest_zd_fini(&zdtmp);
 		dmu_objset_disown(os, FTAG);
@@ -2918,26 +3299,25 @@ ztest_dmu_objset_create_destroy(ztest_ds
 	/*
 	 * Verify that the destroyed dataset is no longer in the namespace.
 	 */
-	VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os));
+	VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
+	    FTAG, &os));
 
 	/*
 	 * Verify that we can create a new dataset.
 	 */
-	error = dmu_objset_create(name, DMU_OST_OTHER, 0,
-	    ztest_objset_create_cb, NULL);
+	error = ztest_dataset_create(name);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
-			(void) rw_unlock(&zs->zs_name_lock);
+			(void) rw_unlock(&ztest_name_lock);
 			return;
 		}
 		fatal(0, "dmu_objset_create(%s) = %d", name, error);
 	}
 
-	VERIFY3U(0, ==,
-	    dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
+	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
 
-	ztest_zd_init(&zdtmp, os);
+	ztest_zd_init(&zdtmp, NULL, os);
 
 	/*
 	 * Open the intent log for it.
@@ -2977,7 +3357,7 @@ ztest_dmu_objset_create_destroy(ztest_ds
 	dmu_objset_disown(os, FTAG);
 	ztest_zd_fini(&zdtmp);
 
-	(void) rw_unlock(&zs->zs_name_lock);
+	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
@@ -2986,12 +3366,10 @@ ztest_dmu_objset_create_destroy(ztest_ds
 void
 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
 {
-	ztest_shared_t *zs = ztest_shared;
-
-	(void) rw_rdlock(&zs->zs_name_lock);
+	(void) rw_rdlock(&ztest_name_lock);
 	(void) ztest_snapshot_destroy(zd->zd_name, id);
 	(void) ztest_snapshot_create(zd->zd_name, id);
-	(void) rw_unlock(&zs->zs_name_lock);
+	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
@@ -3000,34 +3378,39 @@ ztest_dmu_snapshot_create_destroy(ztest_
 void
 ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
 {
-	char snap1name[MAXNAMELEN];
-	char clone1name[MAXNAMELEN];
-	char snap2name[MAXNAMELEN];
-	char clone2name[MAXNAMELEN];
-	char snap3name[MAXNAMELEN];
+	char snap1name[ZFS_MAX_DATASET_NAME_LEN];
+	char clone1name[ZFS_MAX_DATASET_NAME_LEN];
+	char snap2name[ZFS_MAX_DATASET_NAME_LEN];
+	char clone2name[ZFS_MAX_DATASET_NAME_LEN];
+	char snap3name[ZFS_MAX_DATASET_NAME_LEN];
 	int error;
 
-	(void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id);
-	(void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id);
-	(void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id);
-	(void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id);
-	(void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id);
+	(void) snprintf(snap1name, sizeof (snap1name),
+	    "%s@s1_%llu", osname, id);
+	(void) snprintf(clone1name, sizeof (clone1name),
+	    "%s/c1_%llu", osname, id);
+	(void) snprintf(snap2name, sizeof (snap2name),
+	    "%s@s2_%llu", clone1name, id);
+	(void) snprintf(clone2name, sizeof (clone2name),
+	    "%s/c2_%llu", osname, id);
+	(void) snprintf(snap3name, sizeof (snap3name),
+	    "%s@s3_%llu", clone1name, id);
 
-	error = dmu_objset_destroy(clone2name, B_FALSE);
+	error = dsl_destroy_head(clone2name);
 	if (error && error != ENOENT)
-		fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error);
-	error = dmu_objset_destroy(snap3name, B_FALSE);
+		fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error);
+	error = dsl_destroy_snapshot(snap3name, B_FALSE);
 	if (error && error != ENOENT)
-		fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error);
-	error = dmu_objset_destroy(snap2name, B_FALSE);
+		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error);
+	error = dsl_destroy_snapshot(snap2name, B_FALSE);
 	if (error && error != ENOENT)
-		fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error);
-	error = dmu_objset_destroy(clone1name, B_FALSE);
+		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error);
+	error = dsl_destroy_head(clone1name);
 	if (error && error != ENOENT)
-		fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error);
-	error = dmu_objset_destroy(snap1name, B_FALSE);
+		fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error);
+	error = dsl_destroy_snapshot(snap1name, B_FALSE);
 	if (error && error != ENOENT)
-		fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error);
+		fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error);
 }
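
The destroy order here is deliberately leaf-first, since a snapshot with live clones cannot be destroyed: snap1 originates clone1, snap2 and snap3 belong to clone1, and snap3 originates clone2, so the teardown must run

	clone2  ->  snap3, snap2  ->  clone1  ->  snap1

which is exactly the sequence of calls above.
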
 
 /*
@@ -3036,29 +3419,31 @@ ztest_dsl_dataset_cleanup(char *osname, 
 void
 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
 {
-	ztest_shared_t *zs = ztest_shared;
-	objset_t *clone;
-	dsl_dataset_t *ds;
-	char snap1name[MAXNAMELEN];
-	char clone1name[MAXNAMELEN];
-	char snap2name[MAXNAMELEN];
-	char clone2name[MAXNAMELEN];
-	char snap3name[MAXNAMELEN];
+	objset_t *os;
+	char snap1name[ZFS_MAX_DATASET_NAME_LEN];
+	char clone1name[ZFS_MAX_DATASET_NAME_LEN];
+	char snap2name[ZFS_MAX_DATASET_NAME_LEN];
+	char clone2name[ZFS_MAX_DATASET_NAME_LEN];
+	char snap3name[ZFS_MAX_DATASET_NAME_LEN];
 	char *osname = zd->zd_name;
 	int error;
 
-	(void) rw_rdlock(&zs->zs_name_lock);
+	(void) rw_rdlock(&ztest_name_lock);
 
 	ztest_dsl_dataset_cleanup(osname, id);
 
-	(void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id);
-	(void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id);
-	(void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id);
-	(void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id);
-	(void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id);
+	(void) snprintf(snap1name, sizeof (snap1name),
+	    "%s@s1_%llu", osname, id);
+	(void) snprintf(clone1name, sizeof (clone1name),
+	    "%s/c1_%llu", osname, id);
+	(void) snprintf(snap2name, sizeof (snap2name),
+	    "%s@s2_%llu", clone1name, id);
+	(void) snprintf(clone2name, sizeof (clone2name),
+	    "%s/c2_%llu", osname, id);
+	(void) snprintf(snap3name, sizeof (snap3name),
+	    "%s@s3_%llu", clone1name, id);
 
-	error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1,
-	    NULL, B_FALSE);
+	error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1);
 	if (error && error != EEXIST) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
@@ -3067,12 +3452,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_
 		fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
 	}
 
-	error = dmu_objset_hold(snap1name, FTAG, &clone);
-	if (error)
-		fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error);
-
-	error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0);
-	dmu_objset_rele(clone, FTAG);
+	error = dmu_objset_clone(clone1name, snap1name);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
@@ -3081,8 +3461,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_
 		fatal(0, "dmu_objset_create(%s) = %d", clone1name, error);
 	}
 
-	error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1,
-	    NULL, B_FALSE);
+	error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1);
 	if (error && error != EEXIST) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
@@ -3091,8 +3470,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_
 		fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
 	}
 
-	error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1,
-	    NULL, B_FALSE);
+	error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1);
 	if (error && error != EEXIST) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
@@ -3101,12 +3479,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_
 		fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
 	}
 
-	error = dmu_objset_hold(snap3name, FTAG, &clone);
-	if (error)
-		fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
-
-	error = dmu_objset_clone(clone2name, dmu_objset_ds(clone), 0);
-	dmu_objset_rele(clone, FTAG);
+	error = dmu_objset_clone(clone2name, snap3name);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc(FTAG);
@@ -3115,19 +3488,24 @@ ztest_dsl_dataset_promote_busy(ztest_ds_
 		fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
 	}
 
-	error = dsl_dataset_own(snap1name, B_FALSE, FTAG, &ds);
+	error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os);
 	if (error)
-		fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error);
+		fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
 	error = dsl_dataset_promote(clone2name, NULL);
+	if (error == ENOSPC) {
+		dmu_objset_disown(os, FTAG);
+		ztest_record_enospc(FTAG);
+		goto out;
+	}
 	if (error != EBUSY)
 		fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
 		    error);
-	dsl_dataset_disown(ds, FTAG);
+	dmu_objset_disown(os, FTAG);
 
 out:
 	ztest_dsl_dataset_cleanup(osname, id);
 
-	(void) rw_unlock(&zs->zs_name_lock);
+	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
@@ -3219,7 +3597,8 @@ ztest_dmu_read_write(ztest_ds_t *zd, uin
 	 */
 	n = ztest_random(regions) * stride + ztest_random(width);
 	s = 1 + ztest_random(2 * width - 1);
-	dmu_prefetch(os, bigobj, n * chunksize, s * chunksize);
+	dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize,
+	    ZIO_PRIORITY_SYNC_READ);
 
 	/*
 	 * Pick a random index and compute the offsets into packobj and bigobj.
@@ -3247,10 +3626,10 @@ ztest_dmu_read_write(ztest_ds_t *zd, uin
 	 */
 	error = dmu_read(os, packobj, packoff, packsize, packbuf,
 	    DMU_READ_PREFETCH);
-	ASSERT3U(error, ==, 0);
+	ASSERT0(error);
 	error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
 	    DMU_READ_PREFETCH);
-	ASSERT3U(error, ==, 0);
+	ASSERT0(error);
 
 	/*
 	 * Get a tx for the mods to both packobj and bigobj.
@@ -3264,6 +3643,9 @@ ztest_dmu_read_write(ztest_ds_t *zd, uin
 	else
 		dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
 
+	/* This accounts for setting the checksum/compression. */
+	dmu_tx_hold_bonus(tx, bigobj);
+
 	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 	if (txg == 0) {
 		umem_free(packbuf, packsize);
@@ -3271,11 +3653,19 @@ ztest_dmu_read_write(ztest_ds_t *zd, uin
 		return;
 	}
 
-	dmu_object_set_checksum(os, bigobj,
-	    (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx);
+	enum zio_checksum cksum;
+	do {
+		cksum = (enum zio_checksum)
+		    ztest_random_dsl_prop(ZFS_PROP_CHECKSUM);
+	} while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+	dmu_object_set_checksum(os, bigobj, cksum, tx);
 
-	dmu_object_set_compress(os, bigobj,
-	    (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx);
+	enum zio_compress comp;
+	do {
+		comp = (enum zio_compress)
+		    ztest_random_dsl_prop(ZFS_PROP_COMPRESSION);
+	} while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS);
+	dmu_object_set_compress(os, bigobj, comp, tx);
 
 	/*
 	 * For each index from n to n + s, verify that the existing bufwad
@@ -3326,7 +3716,7 @@ ztest_dmu_read_write(ztest_ds_t *zd, uin
 	dmu_write(os, packobj, packoff, packsize, packbuf, tx);
 
 	if (freeit) {
-		if (zopt_verbose >= 7) {
+		if (ztest_opts.zo_verbose >= 7) {
 			(void) printf("freeing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
@@ -3335,7 +3725,7 @@ ztest_dmu_read_write(ztest_ds_t *zd, uin
 		}
 		VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
 	} else {
-		if (zopt_verbose >= 7) {
+		if (ztest_opts.zo_verbose >= 7) {
 			(void) printf("writing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
@@ -3560,10 +3950,10 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z
 		if (i != 0 || ztest_random(2) != 0) {
 			error = dmu_read(os, packobj, packoff,
 			    packsize, packbuf, DMU_READ_PREFETCH);
-			ASSERT3U(error, ==, 0);
+			ASSERT0(error);
 			error = dmu_read(os, bigobj, bigoff, bigsize,
 			    bigbuf, DMU_READ_PREFETCH);
-			ASSERT3U(error, ==, 0);
+			ASSERT0(error);
 		}
 		compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
 		    n, chunksize, txg);
@@ -3573,7 +3963,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z
 		 * Now write them out.
 		 */
 		dmu_write(os, packobj, packoff, packsize, packbuf, tx);
-		if (zopt_verbose >= 7) {
+		if (ztest_opts.zo_verbose >= 7) {
 			(void) printf("writing offset %llx size %llx"
 			    " txg %llx\n",
 			    (u_longlong_t)bigoff,
@@ -3597,7 +3987,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z
 
 			if (i == 1) {
 				VERIFY(dmu_buf_hold(os, bigobj, off,
-				    FTAG, &dbt) == 0);
+				    FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
 			}
 			if (i != 5) {
 				dmu_assign_arcbuf(bonus_db, off,
@@ -3764,8 +4154,8 @@ ztest_zap(ztest_ds_t *zd, uint64_t id)
 	ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
 
 	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
-	(void) snprintf(propname, sizeof(propname), "prop_%llu", (u_longlong_t)prop);
-	(void) snprintf(txgname, sizeof(txgname), "txg_%llu", (u_longlong_t)prop);
+	(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+	(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
 	bzero(value, sizeof (value));
 	last_txg = 0;
 
@@ -3826,15 +4216,15 @@ ztest_zap(ztest_ds_t *zd, uint64_t id)
 	 * Remove a random pair of entries.
 	 */
 	prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
-	(void) snprintf(propname, sizeof(propname), "prop_%llu", (u_longlong_t)prop);
-	(void) snprintf(txgname, sizeof(txgname), "txg_%llu", (u_longlong_t)prop);
+	(void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
+	(void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
 
 	error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
 
 	if (error == ENOENT)
 		return;
 
-	ASSERT3U(error, ==, 0);
+	ASSERT0(error);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
@@ -3869,7 +4259,7 @@ ztest_fzap(ztest_ds_t *zd, uint64_t id)
 	 * 2050 entries we should see ptrtbl growth and leaf-block split.
 	 */
 	for (int i = 0; i < 2050; i++) {
-		char name[MAXNAMELEN];
+		char name[ZFS_MAX_DATASET_NAME_LEN];
 		uint64_t value = i;
 		dmu_tx_t *tx;
 		int error;
@@ -3934,7 +4324,7 @@ ztest_zap_parallel(ztest_ds_t *zd, uint6
 	}
 
 	count = -1ULL;
-	VERIFY(zap_count(os, object, &count) == 0);
+	VERIFY0(zap_count(os, object, &count));
 	ASSERT(count != -1ULL);
 
 	/*
@@ -4030,7 +4420,7 @@ ztest_commit_callback(void *arg, int err
 	data->zcd_called = B_TRUE;
 
 	if (error == ECANCELED) {
-		ASSERT3U(data->zcd_txg, ==, 0);
+		ASSERT0(data->zcd_txg);
 		ASSERT(!data->zcd_added);
 
 		/*
@@ -4168,7 +4558,7 @@ ztest_dmu_commit_callbacks(ztest_ds_t *z
 	 */
 	tmp_cb = list_head(&zcl.zcl_callbacks);
 	if (tmp_cb != NULL &&
-	    tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) {
+	    (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) {
 		fatal(0, "Commit callback threshold exceeded, oldest txg: %"
 		    PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
 	}
@@ -4214,37 +4604,51 @@ ztest_dsl_prop_get_set(ztest_ds_t *zd, u
 		ZFS_PROP_COPIES,
 		ZFS_PROP_DEDUP
 	};
-	ztest_shared_t *zs = ztest_shared;
 
-	(void) rw_rdlock(&zs->zs_name_lock);
+	(void) rw_rdlock(&ztest_name_lock);
 
 	for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
 		(void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
 		    ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
 
-	(void) rw_unlock(&zs->zs_name_lock);
+	(void) rw_unlock(&ztest_name_lock);
 }
 
 /* ARGSUSED */
 void
 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
 {
-	ztest_shared_t *zs = ztest_shared;
 	nvlist_t *props = NULL;
 
-	(void) rw_rdlock(&zs->zs_name_lock);
+	(void) rw_rdlock(&ztest_name_lock);
 
-	(void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO,
+	(void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO,
 	    ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
 
-	VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0);
+	VERIFY0(spa_prop_get(ztest_spa, &props));
 
-	if (zopt_verbose >= 6)
+	if (ztest_opts.zo_verbose >= 6)
 		dump_nvlist(props, 4);
 
 	nvlist_free(props);
 
-	(void) rw_unlock(&zs->zs_name_lock);
+	(void) rw_unlock(&ztest_name_lock);
+}
+
+static int
+user_release_one(const char *snapname, const char *holdname)
+{
+	nvlist_t *snaps, *holds;
+	int error;
+
+	snaps = fnvlist_alloc();
+	holds = fnvlist_alloc();
+	fnvlist_add_boolean(holds, holdname);
+	fnvlist_add_nvlist(snaps, snapname, holds);
+	fnvlist_free(holds);
+	error = dsl_dataset_user_release(snaps, NULL);
+	fnvlist_free(snaps);
+	return (error);
 }
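
A hedged usage sketch of the helper; the snapshot and hold names are hypothetical, and callers below tolerate ESRCH and ENOENT because the hold or snapshot may already be gone:

	int error = user_release_one("tank/fs@sh1_7", "tag_7");

	if (error != 0 && error != ESRCH && error != ENOENT)
		fatal(0, "user_release_one() = %d", error);
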
 
 /*
@@ -4260,29 +4664,37 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, 
 	char fullname[100];
 	char clonename[100];
 	char tag[100];
-	char osname[MAXNAMELEN];
+	char osname[ZFS_MAX_DATASET_NAME_LEN];
+	nvlist_t *holds;
 
-	(void) rw_rdlock(&ztest_shared->zs_name_lock);
+	(void) rw_rdlock(&ztest_name_lock);
 
 	dmu_objset_name(os, osname);
 
-	(void) snprintf(snapname, 100, "sh1_%llu", id);
-	(void) snprintf(fullname, 100, "%s@%s", osname, snapname);
-	(void) snprintf(clonename, 100, "%s/ch1_%llu", osname, id);
-	(void) snprintf(tag, 100, "%tag_%llu", id);
+	(void) snprintf(snapname, sizeof (snapname), "sh1_%llu", id);
+	(void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
+	(void) snprintf(clonename, sizeof (clonename),
+	    "%s/ch1_%llu", osname, id);
+	(void) snprintf(tag, sizeof (tag), "tag_%llu", id);
 
 	/*
 	 * Clean up from any previous run.
 	 */
-	(void) dmu_objset_destroy(clonename, B_FALSE);
-	(void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
-	(void) dmu_objset_destroy(fullname, B_FALSE);
+	error = dsl_destroy_head(clonename);
+	if (error != ENOENT)
+		ASSERT0(error);
+	error = user_release_one(fullname, tag);
+	if (error != ESRCH && error != ENOENT)
+		ASSERT0(error);
+	error = dsl_destroy_snapshot(fullname, B_FALSE);
+	if (error != ENOENT)
+		ASSERT0(error);
 
 	/*
 	 * Create snapshot, clone it, mark snap for deferred destroy,
 	 * destroy clone, verify snap was also destroyed.
 	 */
-	error = dmu_objset_snapshot(osname, snapname, NULL, FALSE);
+	error = dmu_objset_snapshot_one(osname, snapname);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc("dmu_objset_snapshot");
@@ -4291,12 +4703,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, 
 		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
 	}
 
-	error = dmu_objset_hold(fullname, FTAG, &origin);
-	if (error)
-		fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
-
-	error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0);
-	dmu_objset_rele(origin, FTAG);
+	error = dmu_objset_clone(clonename, fullname);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc("dmu_objset_clone");
@@ -4305,15 +4712,15 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, 
 		fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
 	}
 
-	error = dmu_objset_destroy(fullname, B_TRUE);
+	error = dsl_destroy_snapshot(fullname, B_TRUE);
 	if (error) {
-		fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
+		fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
 		    fullname, error);
 	}
 
-	error = dmu_objset_destroy(clonename, B_FALSE);
+	error = dsl_destroy_head(clonename);
 	if (error)
-		fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error);
+		fatal(0, "dsl_destroy_head(%s) = %d", clonename, error);
 
 	error = dmu_objset_hold(fullname, FTAG, &origin);
 	if (error != ENOENT)
@@ -4324,7 +4731,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, 
 	 * destroy a held snapshot, mark for deferred destroy,
 	 * release hold, verify snapshot was destroyed.
 	 */
-	error = dmu_objset_snapshot(osname, snapname, NULL, FALSE);
+	error = dmu_objset_snapshot_one(osname, snapname);
 	if (error) {
 		if (error == ENOSPC) {
 			ztest_record_enospc("dmu_objset_snapshot");
@@ -4333,30 +4740,39 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, 
 		fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
 	}
 
-	error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE, B_TRUE);
-	if (error)
-		fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag);
+	holds = fnvlist_alloc();
+	fnvlist_add_string(holds, fullname, tag);
+	error = dsl_dataset_user_hold(holds, 0, NULL);
+	fnvlist_free(holds);
+
+	if (error == ENOSPC) {
+		ztest_record_enospc("dsl_dataset_user_hold");
+		goto out;
+	} else if (error) {
+		fatal(0, "dsl_dataset_user_hold(%s, %s) = %u",
+		    fullname, tag, error);
+	}
 
-	error = dmu_objset_destroy(fullname, B_FALSE);
+	error = dsl_destroy_snapshot(fullname, B_FALSE);
 	if (error != EBUSY) {
-		fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d",
+		fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
 		    fullname, error);
 	}
 
-	error = dmu_objset_destroy(fullname, B_TRUE);
+	error = dsl_destroy_snapshot(fullname, B_TRUE);
 	if (error) {
-		fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
+		fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
 		    fullname, error);
 	}
 
-	error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
+	error = user_release_one(fullname, tag);
 	if (error)
-		fatal(0, "dsl_dataset_user_release(%s)", fullname, tag);
+		fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error);
 
-	VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT);
+	VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);
 
 out:
-	(void) rw_unlock(&ztest_shared->zs_name_lock);
+	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
@@ -4367,16 +4783,16 @@ void
 ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
-	spa_t *spa = zs->zs_spa;
+	spa_t *spa = ztest_spa;
 	int fd;
 	uint64_t offset;
 	uint64_t leaves;
-	uint64_t bad = 0x1990c0ffeedecade;
+	uint64_t bad = 0x1990c0ffeedecadeULL;
 	uint64_t top, leaf;
 	char path0[MAXPATHLEN];
 	char pathrand[MAXPATHLEN];
 	size_t fsize;
-	int bshift = SPA_MAXBLOCKSHIFT + 2;	/* don't scrog all labels */
+	int bshift = SPA_MAXBLOCKSHIFT + 2;
 	int iters = 1000;
 	int maxfaults;
 	int mirror_save;
@@ -4384,15 +4800,23 @@ ztest_fault_inject(ztest_ds_t *zd, uint6
 	uint64_t guid0 = 0;
 	boolean_t islog = B_FALSE;
 
-	VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+	VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 	maxfaults = MAXFAULTS();
-	leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz;
+	leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
 	mirror_save = zs->zs_mirrors;
-	VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+	VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 
 	ASSERT(leaves >= 1);
 
 	/*
+	 * Grab the name lock as reader. There are some operations
+	 * which don't like to have their vdevs changed while
+	 * they are in progress (i.e. spa_change_guid). Those
+	 * operations will have grabbed the name lock as writer.
+	 */
+	(void) rw_rdlock(&ztest_name_lock);
+
+	/*
 	 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
@@ -4411,15 +4835,24 @@ ztest_fault_inject(ztest_ds_t *zd, uint6
 		 * and we'll write random garbage to the randomly chosen leaf.
 		 */
 		(void) snprintf(path0, sizeof (path0), ztest_dev_template,
-		    zopt_dir, zopt_pool, top * leaves + zs->zs_splits);
+		    ztest_opts.zo_dir, ztest_opts.zo_pool,
+		    top * leaves + zs->zs_splits);
 		(void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template,
-		    zopt_dir, zopt_pool, top * leaves + leaf);
+		    ztest_opts.zo_dir, ztest_opts.zo_pool,
+		    top * leaves + leaf);
 
 		vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
 		if (vd0 != NULL && vd0->vdev_top->vdev_islog)
 			islog = B_TRUE;
 
-		if (vd0 != NULL && maxfaults != 1) {
+		/*
+		 * If the top-level vdev needs to be resilvered
+		 * then we only allow faults on the device that is
+		 * resilvering.
+		 */
+		if (vd0 != NULL && maxfaults != 1 &&
+		    (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
+		    vd0->vdev_resilver_txg != 0)) {
 			/*
 			 * Make vd0 explicitly claim to be unreadable,
 			 * or unwriteable, or reach behind its back
@@ -4450,6 +4883,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint6
 
 		if (sav->sav_count == 0) {
 			spa_config_exit(spa, SCL_STATE, FTAG);
+			(void) rw_unlock(&ztest_name_lock);
 			return;
 		}
 		vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
@@ -4463,6 +4897,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint6
 	}
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
+	(void) rw_unlock(&ztest_name_lock);
 
 	/*
 	 * If we can tolerate two or more faults, or we're dealing
@@ -4482,14 +4917,25 @@ ztest_fault_inject(ztest_ds_t *zd, uint6
 			 * leaving the dataset in an inconsistent state.
 			 */
 			if (islog)
-				(void) rw_wrlock(&ztest_shared->zs_name_lock);
+				(void) rw_wrlock(&ztest_name_lock);
 
 			VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
 
 			if (islog)
-				(void) rw_unlock(&ztest_shared->zs_name_lock);
+				(void) rw_unlock(&ztest_name_lock);
 		} else {
+			/*
+			 * Ideally we would like to be able to randomly
+			 * call vdev_[on|off]line without holding locks
+			 * to force unpredictable failures, but the side
+			 * effects of vdev_[on|off]line prevent us from
+			 * doing so. We grab the ztest_vdev_lock here to
+			 * prevent a race between injection testing and
+			 * aux_vdev removal.
+			 */
+			VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 			(void) vdev_online(spa, guid0, 0, NULL);
+			VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 		}
 	}
 
@@ -4507,16 +4953,63 @@ ztest_fault_inject(ztest_ds_t *zd, uint6
 	fsize = lseek(fd, 0, SEEK_END);
 
 	while (--iters != 0) {
+		/*
+		 * The offset must be chosen carefully to ensure that
+		 * we do not inject a given logical block with errors
+		 * on two different leaf devices, because ZFS cannot
+		 * tolerate that (if maxfaults==1).
+		 *
+		 * We divide each leaf into chunks of size
+		 * (# leaves * SPA_MAXBLOCKSIZE * 4).  Within each chunk
+		 * there is a series of ranges to which we can inject errors.
+		 * Each range can accept errors on only a single leaf vdev.
+		 * The error injection ranges are separated by ranges
+		 * which we will not inject errors on any device (DMZs).
+		 * Each DMZ must be large enough such that a single block
+		 * cannot straddle it, so that a single block cannot be
+		 * a target in two different injection ranges (on different
+		 * leaf vdevs).
+		 *
+		 * For example, with 3 leaves, each chunk looks like:
+		 *    0 to  32M: injection range for leaf 0
+		 *  32M to  64M: DMZ - no injection allowed
+		 *  64M to  96M: injection range for leaf 1
+		 *  96M to 128M: DMZ - no injection allowed
+		 * 128M to 160M: injection range for leaf 2
+		 * 160M to 192M: DMZ - no injection allowed
+		 */
 		offset = ztest_random(fsize / (leaves << bshift)) *
 		    (leaves << bshift) + (leaf << bshift) +
 		    (ztest_random(1ULL << (bshift - 1)) & -8ULL);
 
-		if (offset >= fsize)
+		/*
+		 * Only allow damage to the labels at one end of the vdev.
+		 *
+		 * If all labels are damaged, the device will be totally
+		 * inaccessible, which will result in loss of data,
+		 * because we also damage (parts of) the other side of
+		 * the mirror/raidz.
+		 *
+		 * Additionally, we will always have both an even and an
+		 * odd label, so that we can handle crashes in the
+		 * middle of vdev_config_sync().
+		 */
+		if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE)
 			continue;
 
-		VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0);
+		/*
+		 * The two end labels are stored at the "end" of the disk, but
+		 * the end of the disk (vdev_psize) is aligned to
+		 * sizeof (vdev_label_t).
+		 */
+		uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
+		if ((leaf & 1) == 1 &&
+		    offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
+			continue;
+
+		VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
 		if (mirror_save != zs->zs_mirrors) {
-			VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+			VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 			(void) close(fd);
 			return;
 		}
@@ -4525,9 +5018,9 @@ ztest_fault_inject(ztest_ds_t *zd, uint6
 			fatal(1, "can't inject bad word at 0x%llx in %s",
 			    offset, pathrand);
 
-		VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0);
+		VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 
-		if (zopt_verbose >= 7)
+		if (ztest_opts.zo_verbose >= 7)
 			(void) printf("injected bad word into %s,"
 			    " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
 	}
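
The offset arithmetic in this hunk is easier to verify with concrete numbers. Below is a standalone sketch, not part of the patch, that prints the per-leaf layout described in the comment above; it assumes SPA_MAXBLOCKSHIFT is 24 (16M blocks), which is the value that makes the comment's 32M ranges come out.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	const int maxblockshift = 24;		/* assumed SPA_MAXBLOCKSHIFT */
	const int bshift = maxblockshift + 2;	/* 64M segment per leaf */
	const uint64_t leaves = 3;

	for (uint64_t leaf = 0; leaf < leaves; leaf++) {
		uint64_t seg = leaf << bshift;			/* segment base */
		uint64_t inj = seg + (1ULL << (bshift - 1));	/* end of injection half */
		(void) printf("leaf %llu: inject [%4lluM, %4lluM), "
		    "DMZ [%4lluM, %4lluM)\n",
		    (unsigned long long)leaf,
		    (unsigned long long)(seg >> 20),
		    (unsigned long long)(inj >> 20),
		    (unsigned long long)(inj >> 20),
		    (unsigned long long)((seg + (1ULL << bshift)) >> 20));
	}
	/* Chunks repeat every (leaves << bshift) bytes -- 192M here. */
	return (0);
}

This prints exactly the 0-32M/64-96M/128-160M injection ranges from the comment; the final "& -8ULL" in the patch then aligns the random offset to 8 bytes so the injected 64-bit bad word always lands on a word boundary.
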
@@ -4542,7 +5035,7 @@ void
 ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
 {
 	ztest_shared_t *zs = ztest_shared;
-	spa_t *spa = zs->zs_spa;
+	spa_t *spa = ztest_spa;
 	objset_t *os = zd->zd_os;
 	ztest_od_t od[1];
 	uint64_t object, blocksize, txg, pattern, psize;
@@ -4565,19 +5058,24 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_
 	 * Take the name lock as writer to prevent anyone else from changing
 	 * the pool and dataset properties we need to maintain during this test.
 	 */
-	(void) rw_wrlock(&zs->zs_name_lock);
+	(void) rw_wrlock(&ztest_name_lock);
 
 	if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
 	    B_FALSE) != 0 ||
 	    ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
 	    B_FALSE) != 0) {
-		(void) rw_unlock(&zs->zs_name_lock);
+		(void) rw_unlock(&ztest_name_lock);
 		return;
 	}
 
+	dmu_objset_stats_t dds;
+	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+	dmu_objset_fast_stat(os, &dds);
+	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+
 	object = od[0].od_object;
 	blocksize = od[0].od_blocksize;
-	pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os);
+	pattern = zs->zs_guid ^ dds.dds_guid;
 
 	ASSERT(object != 0);
 
@@ -4585,7 +5083,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_
 	dmu_tx_hold_write(tx, object, 0, copies * blocksize);
 	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 	if (txg == 0) {
-		(void) rw_unlock(&zs->zs_name_lock);
+		(void) rw_unlock(&ztest_name_lock);
 		return;
 	}
 
@@ -4594,7 +5092,12 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_
 	 */
 	for (int i = 0; i < copies; i++) {
 		uint64_t offset = i * blocksize;
-		VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db) == 0);
+		int error = dmu_buf_hold(os, object, offset, FTAG, &db,
+		    DMU_READ_NO_PREFETCH);
+		if (error != 0) {
+			fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u",
+			    os, (u_longlong_t)object, (u_longlong_t)offset, error);
+		}
 		ASSERT(db->db_offset == offset);
 		ASSERT(db->db_size == blocksize);
 		ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
@@ -4610,7 +5113,8 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_
 	/*
 	 * Find out what block we got.
 	 */
-	VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db) == 0);
+	VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db,
+	    DMU_READ_NO_PREFETCH));
 	blk = *((dmu_buf_impl_t *)db)->db_blkptr;
 	dmu_buf_rele(db, FTAG);
 
@@ -4627,7 +5131,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_
 
 	zio_buf_free(buf, psize);
 
-	(void) rw_unlock(&zs->zs_name_lock);
+	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
@@ -4637,12 +5141,41 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_
 void
 ztest_scrub(ztest_ds_t *zd, uint64_t id)
 {
-	ztest_shared_t *zs = ztest_shared;
-	spa_t *spa = zs->zs_spa;
+	spa_t *spa = ztest_spa;
 
-	(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
+	(void) spa_scan(spa, POOL_SCAN_SCRUB);
 	(void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
-	(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
+	(void) spa_scan(spa, POOL_SCAN_SCRUB);
+}
+
+/*
+ * Change the guid for the pool.
+ */
+/* ARGSUSED */
+void
+ztest_reguid(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa = ztest_spa;
+	uint64_t orig, load;
+	int error;
+
+	orig = spa_guid(spa);
+	load = spa_load_guid(spa);
+
+	(void) rw_wrlock(&ztest_name_lock);
+	error = spa_change_guid(spa);
+	(void) rw_unlock(&ztest_name_lock);
+
+	if (error != 0)
+		return;
+
+	if (ztest_opts.zo_verbose >= 4) {
+		(void) printf("Changed guid old %llu -> %llu\n",
+		    (u_longlong_t)orig, (u_longlong_t)spa_guid(spa));
+	}
+
+	VERIFY3U(orig, !=, spa_guid(spa));
+	VERIFY3U(load, ==, spa_load_guid(spa));
 }
 
 /*
@@ -4652,13 +5185,12 @@ ztest_scrub(ztest_ds_t *zd, uint64_t id)
 void
 ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
 {
-	ztest_shared_t *zs = ztest_shared;
 	char *oldname, *newname;
 	spa_t *spa;
 
-	(void) rw_wrlock(&zs->zs_name_lock);
+	(void) rw_wrlock(&ztest_name_lock);
 
-	oldname = zs->zs_pool;
+	oldname = ztest_opts.zo_pool;
 	newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
 	(void) strcpy(newname, oldname);
 	(void) strcat(newname, "_tmp");
@@ -4678,7 +5210,7 @@ ztest_spa_rename(ztest_ds_t *zd, uint64_
 	 */
 	VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
 
-	ASSERT(spa == zs->zs_spa);
+	ASSERT(spa == ztest_spa);
 	spa_close(spa, FTAG);
 
 	/*
@@ -4691,12 +5223,12 @@ ztest_spa_rename(ztest_ds_t *zd, uint64_
 	 */
 	VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
 
-	ASSERT(spa == zs->zs_spa);
+	ASSERT(spa == ztest_spa);
 	spa_close(spa, FTAG);
 
 	umem_free(newname, strlen(newname) + 1);
 
-	(void) rw_unlock(&zs->zs_name_lock);
+	(void) rw_unlock(&ztest_name_lock);
 }
 
 /*
@@ -4714,7 +5246,7 @@ ztest_run_zdb(char *pool)
 	int isalen;
 	FILE *fp;
 
-	(void) realpath(getexecname(), zdb);
+	(void) strlcpy(zdb, "/usr/bin/ztest", sizeof (zdb));
 
 	/* zdb lives in /usr/sbin, while ztest lives in /usr/bin */
 	bin = strstr(zdb, "/usr/bin/");
@@ -4723,22 +5255,24 @@ ztest_run_zdb(char *pool)
 	isalen = ztest - isa;
 	isa = strdup(isa);
 	/* LINTED */
-	(void) snprintf(bin, sizeof(zdb) - (bin - zdb),
-	    "/usr/sbin%.*s/zdb -bcc%s%s -U /tmp/zpool.cache %s",
+	(void) snprintf(bin, sizeof (zdb) - (bin - zdb),
+	    "/usr/sbin%.*s/zdb -bcc%s%s -d -U %s %s",
 	    isalen,
 	    isa,
-	    zopt_verbose >= 3 ? "s" : "",
-	    zopt_verbose >= 4 ? "v" : "",
+	    ztest_opts.zo_verbose >= 3 ? "s" : "",
+	    ztest_opts.zo_verbose >= 4 ? "v" : "",
+	    spa_config_path,
 	    pool);
 	free(isa);
 
-	if (zopt_verbose >= 5)
+	if (ztest_opts.zo_verbose >= 5)
 		(void) printf("Executing %s\n", strstr(zdb, "zdb "));
 
 	fp = popen(zdb, "r");
+	assert(fp != NULL);
 
 	while (fgets(zbuf, sizeof (zbuf), fp) != NULL)
-		if (zopt_verbose >= 3)
+		if (ztest_opts.zo_verbose >= 3)
 			(void) printf("%s", zbuf);
 
 	status = pclose(fp);
@@ -4758,12 +5292,12 @@ ztest_walk_pool_directory(char *header)
 {
 	spa_t *spa = NULL;
 
-	if (zopt_verbose >= 6)
+	if (ztest_opts.zo_verbose >= 6)
 		(void) printf("%s\n", header);
 
 	mutex_enter(&spa_namespace_lock);
 	while ((spa = spa_next(spa)) != NULL)
-		if (zopt_verbose >= 6)
+		if (ztest_opts.zo_verbose >= 6)
 			(void) printf("\t%s\n", spa_name(spa));
 	mutex_exit(&spa_namespace_lock);
 }
@@ -4774,8 +5308,9 @@ ztest_spa_import_export(char *oldname, c
 	nvlist_t *config, *newconfig;
 	uint64_t pool_guid;
 	spa_t *spa;
+	int error;
 
-	if (zopt_verbose >= 4) {
+	if (ztest_opts.zo_verbose >= 4) {
 		(void) printf("import/export: old = %s, new = %s\n",
 		    oldname, newname);
 	}
@@ -4794,7 +5329,7 @@ ztest_spa_import_export(char *oldname, c
 	 * Kick off a scrub to tickle scrub/export races.
 	 */
 	if (ztest_random(2) == 0)
-		(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
+		(void) spa_scan(spa, POOL_SCAN_SCRUB);
 
 	pool_guid = spa_guid(spa);
 	spa_close(spa, FTAG);
@@ -4818,19 +5353,24 @@ ztest_spa_import_export(char *oldname, c
 	/*
 	 * Import it under the new name.
 	 */
-	VERIFY3U(0, ==, spa_import(newname, config, NULL));
+	error = spa_import(newname, config, NULL, 0);
+	if (error != 0) {
+		dump_nvlist(config, 0);
+		fatal(B_FALSE, "couldn't import pool %s as %s: error %u",
+		    oldname, newname, error);
+	}
 
 	ztest_walk_pool_directory("pools after import");
 
 	/*
 	 * Try to import it again -- should fail with EEXIST.
 	 */
-	VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL));
+	VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0));
 
 	/*
 	 * Try to import it under a different name -- should fail with EEXIST.
 	 */
-	VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL));
+	VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0));
 
 	/*
 	 * Verify that the pool is no longer visible under the old name.
@@ -4850,7 +5390,7 @@ ztest_spa_import_export(char *oldname, c
 static void
 ztest_resume(spa_t *spa)
 {
-	if (spa_suspended(spa) && zopt_verbose >= 6)
+	if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6)
 		(void) printf("resuming from suspended state\n");
 	spa_vdev_state_enter(spa, SCL_NONE);
 	vdev_clear(spa, NULL);
@@ -4867,6 +5407,12 @@ ztest_resume_thread(void *arg)
 		if (spa_suspended(spa))
 			ztest_resume(spa);
 		(void) poll(NULL, 0, 100);
+
+		/*
+		 * Periodically change the zfs_compressed_arc_enabled setting.
+		 */
+		if (ztest_random(10) == 0)
+			zfs_compressed_arc_enabled = ztest_random(2);
 	}
 	return (NULL);
 }
@@ -4875,23 +5421,40 @@ static void *
 ztest_deadman_thread(void *arg)
 {
 	ztest_shared_t *zs = arg;
-	int grace = 300;
-	hrtime_t delta;
+	spa_t *spa = ztest_spa;
+	hrtime_t delta, total = 0;
 
-	delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace;
-
-	(void) poll(NULL, 0, (int)(1000 * delta));
+	for (;;) {
+		delta = zs->zs_thread_stop - zs->zs_thread_start +
+		    MSEC2NSEC(zfs_deadman_synctime_ms);
 
-	fatal(0, "failed to complete within %d seconds of deadline", grace);
+		(void) poll(NULL, 0, (int)NSEC2MSEC(delta));
 
-	return (NULL);
+		/*
+		 * If the pool is suspended then fail immediately. Otherwise,
+		 * check to see if the pool is making any progress. If
+		 * vdev_deadman() discovers that there hasn't been any recent
+		 * I/Os then it will end up aborting the tests.
+		 */
+		if (spa_suspended(spa) || spa->spa_root_vdev == NULL) {
+			fatal(0, "aborting test after %llu seconds because "
+			    "pool has transitioned to a suspended state.",
+			    zfs_deadman_synctime_ms / 1000);
+			return (NULL);
+		}
+		vdev_deadman(spa->spa_root_vdev);
+
+		total += zfs_deadman_synctime_ms / 1000;
+		(void) printf("ztest has been running for %lld seconds\n",
+		    total);
+	}
 }
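
The rewritten deadman is a watchdog loop rather than the old single timeout: sleep one sync deadline, fail hard if nothing is moving, otherwise log and go around again. A minimal sketch of that shape, with a hypothetical pool_made_progress() hook standing in for the spa_suspended()/vdev_deadman() checks:

#include <poll.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical progress hook; plays the role of vdev_deadman(). */
extern int pool_made_progress(void);

static void *
watchdog(void *arg)
{
	long long timeout_ms = *(long long *)arg;	/* e.g. 300000 */
	long long total_s = 0;

	for (;;) {
		(void) poll(NULL, 0, (int)timeout_ms);	/* coarse sleep */
		if (!pool_made_progress()) {
			(void) fprintf(stderr, "watchdog: no I/O progress\n");
			abort();	/* fail hard, like fatal() above */
		}
		total_s += timeout_ms / 1000;
		(void) printf("running for %lld seconds\n", total_s);
	}
	return (NULL);
}
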
 
 static void
-ztest_execute(ztest_info_t *zi, uint64_t id)
+ztest_execute(int test, ztest_info_t *zi, uint64_t id)
 {
-	ztest_shared_t *zs = ztest_shared;
-	ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets];
+	ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets];
+	ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test);
 	hrtime_t functime = gethrtime();
 
 	for (int i = 0; i < zi->zi_iters; i++)
@@ -4899,10 +5462,10 @@ ztest_execute(ztest_info_t *zi, uint64_t
 
 	functime = gethrtime() - functime;
 
-	atomic_add_64(&zi->zi_call_count, 1);
-	atomic_add_64(&zi->zi_call_time, functime);
+	atomic_add_64(&zc->zc_count, 1);
+	atomic_add_64(&zc->zc_time, functime);
 
-	if (zopt_verbose >= 4) {
+	if (ztest_opts.zo_verbose >= 4) {
 		Dl_info dli;
 		(void) dladdr((void *)zi->zi_func, &dli);
 		(void) printf("%6.2f sec in %s\n",
@@ -4913,11 +5476,13 @@ ztest_execute(ztest_info_t *zi, uint64_t
 static void *
 ztest_thread(void *arg)
 {
+	int rand;
 	uint64_t id = (uintptr_t)arg;
 	ztest_shared_t *zs = ztest_shared;
 	uint64_t call_next;
 	hrtime_t now;
 	ztest_info_t *zi;
+	ztest_shared_callstate_t *zc;
 
 	while ((now = gethrtime()) < zs->zs_thread_stop) {
 		/*
@@ -4935,13 +5500,16 @@ ztest_thread(void *arg)
 		/*
 		 * Pick a random function to execute.
 		 */
-		zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)];
-		call_next = zi->zi_call_next;
+		rand = ztest_random(ZTEST_FUNCS);
+		zi = &ztest_info[rand];
+		zc = ZTEST_GET_SHARED_CALLSTATE(rand);
+		call_next = zc->zc_next;
 
 		if (now >= call_next &&
-		    atomic_cas_64(&zi->zi_call_next, call_next, call_next +
-		    ztest_random(2 * zi->zi_interval[0] + 1)) == call_next)
-			ztest_execute(zi, id);
+		    atomic_cas_64(&zc->zc_next, call_next, call_next +
+		    ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) {
+			ztest_execute(rand, zi, id);
+		}
 	}
 
 	return (NULL);
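
The dispatch above is lock-free: every worker reads the shared zc_next deadline, and only the thread whose compare-and-swap advances it actually runs the test; losers simply pick another function on the next loop iteration. A sketch of the same claim pattern using C11 atomics (atomic_cas_64() in the patch plays the role of atomic_compare_exchange_strong()):

#include <stdatomic.h>
#include <stdint.h>

/* One shared slot per test function, like zc_next in the patch. */
static _Atomic uint64_t next_call;

/* Returns nonzero if the caller won the right to run the test now. */
static int
claim(uint64_t now, uint64_t interval)
{
	uint64_t expected = atomic_load(&next_call);

	if (now < expected)
		return (0);
	/* Only one thread can advance next_call from its current value. */
	return (atomic_compare_exchange_strong(&next_call, &expected,
	    expected + interval));
}
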
@@ -4950,17 +5518,17 @@ ztest_thread(void *arg)
 static void
 ztest_dataset_name(char *dsname, char *pool, int d)
 {
-	(void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d);
+	(void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d);
 }
 
 static void
-ztest_dataset_destroy(ztest_shared_t *zs, int d)
+ztest_dataset_destroy(int d)
 {
-	char name[MAXNAMELEN];
+	char name[ZFS_MAX_DATASET_NAME_LEN];
 
-	ztest_dataset_name(name, zs->zs_pool, d);
+	ztest_dataset_name(name, ztest_opts.zo_pool, d);
 
-	if (zopt_verbose >= 3)
+	if (ztest_opts.zo_verbose >= 3)
 		(void) printf("Destroying %s to free up space\n", name);
 
 	/*
@@ -4968,8 +5536,10 @@ ztest_dataset_destroy(ztest_shared_t *zs
 	 * ztest thread t operates on dataset (t % zopt_datasets),
 	 * so there may be more than one thing to clean up.
 	 */
-	for (int t = d; t < zopt_threads; t += zopt_datasets)
+	for (int t = d; t < ztest_opts.zo_threads;
+	    t += ztest_opts.zo_datasets) {
 		ztest_dsl_dataset_cleanup(name, t);
+	}
 
 	(void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
 	    DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
@@ -4997,32 +5567,31 @@ ztest_dataset_dirobj_verify(ztest_ds_t *
 }
 
 static int
-ztest_dataset_open(ztest_shared_t *zs, int d)
+ztest_dataset_open(int d)
 {
-	ztest_ds_t *zd = &zs->zs_zd[d];
-	uint64_t committed_seq = zd->zd_seq;
+	ztest_ds_t *zd = &ztest_ds[d];
+	uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq;
 	objset_t *os;
 	zilog_t *zilog;
-	char name[MAXNAMELEN];
+	char name[ZFS_MAX_DATASET_NAME_LEN];
 	int error;
 
-	ztest_dataset_name(name, zs->zs_pool, d);
+	ztest_dataset_name(name, ztest_opts.zo_pool, d);
 
-	(void) rw_rdlock(&zs->zs_name_lock);
+	(void) rw_rdlock(&ztest_name_lock);
 
-	error = dmu_objset_create(name, DMU_OST_OTHER, 0,
-	    ztest_objset_create_cb, NULL);
+	error = ztest_dataset_create(name);
 	if (error == ENOSPC) {
-		(void) rw_unlock(&zs->zs_name_lock);
+		(void) rw_unlock(&ztest_name_lock);
 		ztest_record_enospc(FTAG);
 		return (error);
 	}
 	ASSERT(error == 0 || error == EEXIST);
 
-	VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0);
-	(void) rw_unlock(&zs->zs_name_lock);
+	VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os));
+	(void) rw_unlock(&ztest_name_lock);
 
-	ztest_zd_init(zd, os);
+	ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);
 
 	zilog = zd->zd_zilog;
 
@@ -5037,7 +5606,7 @@ ztest_dataset_open(ztest_shared_t *zs, i
 
 	ztest_dataset_dirobj_verify(zd);
 
-	if (zopt_verbose >= 6)
+	if (ztest_opts.zo_verbose >= 6)
 		(void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
 		    zd->zd_name,
 		    (u_longlong_t)zilog->zl_parse_blk_count,
@@ -5055,12 +5624,12 @@ ztest_dataset_open(ztest_shared_t *zs, i
 }
 
 static void
-ztest_dataset_close(ztest_shared_t *zs, int d)
+ztest_dataset_close(int d)
 {
-	ztest_ds_t *zd = &zs->zs_zd[d];
+	ztest_ds_t *zd = &ztest_ds[d];
 
 	zil_close(zd->zd_zilog);
-	dmu_objset_rele(zd->zd_os, zd);
+	dmu_objset_disown(zd->zd_os, zd);
 
 	ztest_zd_fini(zd);
 }
@@ -5073,6 +5642,7 @@ ztest_run(ztest_shared_t *zs)
 {
 	thread_t *tid;
 	spa_t *spa;
+	objset_t *os;
 	thread_t resume_tid;
 	int error;
 
@@ -5081,15 +5651,18 @@ ztest_run(ztest_shared_t *zs)
 	/*
 	 * Initialize parent/child shared state.
 	 */
-	VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0);
-	VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0);
+	VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0);
+	VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
 
 	zs->zs_thread_start = gethrtime();
-	zs->zs_thread_stop = zs->zs_thread_start + zopt_passtime * NANOSEC;
+	zs->zs_thread_stop =
+	    zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC;
 	zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
 	zs->zs_thread_kill = zs->zs_thread_stop;
-	if (ztest_random(100) < zopt_killrate)
-		zs->zs_thread_kill -= ztest_random(zopt_passtime * NANOSEC);
+	if (ztest_random(100) < ztest_opts.zo_killrate) {
+		zs->zs_thread_kill -=
+		    ztest_random(ztest_opts.zo_passtime * NANOSEC);
+	}
 
 	(void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL);
 
@@ -5100,8 +5673,19 @@ ztest_run(ztest_shared_t *zs)
 	 * Open our pool.
 	 */
 	kernel_init(FREAD | FWRITE);
-	VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0);
-	zs->zs_spa = spa;
+	VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
+	spa->spa_debug = B_TRUE;
+	metaslab_preload_limit = ztest_random(20) + 1;
+	ztest_spa = spa;
+
+	dmu_objset_stats_t dds;
+	VERIFY0(dmu_objset_own(ztest_opts.zo_pool,
+	    DMU_OST_ANY, B_TRUE, FTAG, &os));
+	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+	dmu_objset_fast_stat(os, &dds);
+	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+	zs->zs_guid = dds.dds_guid;
+	dmu_objset_disown(os, FTAG);
 
 	spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
 
@@ -5146,21 +5730,23 @@ ztest_run(ztest_shared_t *zs)
 	 * If we got any ENOSPC errors on the previous run, destroy something.
 	 */
 	if (zs->zs_enospc_count != 0) {
-		int d = ztest_random(zopt_datasets);
-		ztest_dataset_destroy(zs, d);
+		int d = ztest_random(ztest_opts.zo_datasets);
+		ztest_dataset_destroy(d);
 	}
 	zs->zs_enospc_count = 0;
 
-	tid = umem_zalloc(zopt_threads * sizeof (thread_t), UMEM_NOFAIL);
+	tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t),
+	    UMEM_NOFAIL);
 
-	if (zopt_verbose >= 4)
+	if (ztest_opts.zo_verbose >= 4)
 		(void) printf("starting main threads...\n");
 
 	/*
 	 * Kick off all the tests that run in parallel.
 	 */
-	for (int t = 0; t < zopt_threads; t++) {
-		if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0)
+	for (int t = 0; t < ztest_opts.zo_threads; t++) {
+		if (t < ztest_opts.zo_datasets &&
+		    ztest_dataset_open(t) != 0)
 			return;
 		VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
 		    THR_BOUND, &tid[t]) == 0);
@@ -5170,18 +5756,19 @@ ztest_run(ztest_shared_t *zs)
 	 * Wait for all of the tests to complete.  We go in reverse order
 	 * so we don't close datasets while threads are still using them.
 	 */
-	for (int t = zopt_threads - 1; t >= 0; t--) {
+	for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) {
 		VERIFY(thr_join(tid[t], NULL, NULL) == 0);
-		if (t < zopt_datasets)
-			ztest_dataset_close(zs, t);
+		if (t < ztest_opts.zo_datasets)
+			ztest_dataset_close(t);
 	}
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 	zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
+	zfs_dbgmsg_print(FTAG);
 
-	umem_free(tid, zopt_threads * sizeof (thread_t));
+	umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t));
 
 	/* Kill the resume thread */
 	ztest_exiting = B_TRUE;
@@ -5192,8 +5779,10 @@ ztest_run(ztest_shared_t *zs)
 	 * Right before closing the pool, kick off a bunch of async I/O;
 	 * spa_close() should wait for it to complete.
 	 */
-	for (uint64_t object = 1; object < 50; object++)
-		dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
+	for (uint64_t object = 1; object < 50; object++) {
+		dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
+		    ZIO_PRIORITY_SYNC_READ);
+	}
 
 	spa_close(spa, FTAG);
 
@@ -5202,7 +5791,7 @@ ztest_run(ztest_shared_t *zs)
 	 */
 	mutex_enter(&spa_namespace_lock);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
-		if (zopt_verbose > 3)
+		if (ztest_opts.zo_verbose > 3)
 			(void) printf("spa_next: found %s\n", spa_name(spa));
 	mutex_exit(&spa_namespace_lock);
 
@@ -5211,27 +5800,38 @@ ztest_run(ztest_shared_t *zs)
 	 * different name.
 	 */
 	if (ztest_random(2) == 0) {
-		char name[MAXNAMELEN];
-		(void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool);
-		ztest_spa_import_export(zs->zs_pool, name);
-		ztest_spa_import_export(name, zs->zs_pool);
+		char name[ZFS_MAX_DATASET_NAME_LEN];
+		(void) snprintf(name, sizeof (name), "%s_import",
+		    ztest_opts.zo_pool);
+		ztest_spa_import_export(ztest_opts.zo_pool, name);
+		ztest_spa_import_export(name, ztest_opts.zo_pool);
 	}
 
 	kernel_fini();
+
+	list_destroy(&zcl.zcl_callbacks);
+
+	(void) _mutex_destroy(&zcl.zcl_callbacks_lock);
+
+	(void) rwlock_destroy(&ztest_name_lock);
+	(void) _mutex_destroy(&ztest_vdev_lock);
 }
 
 static void
-ztest_freeze(ztest_shared_t *zs)
+ztest_freeze(void)
 {
-	ztest_ds_t *zd = &zs->zs_zd[0];
+	ztest_ds_t *zd = &ztest_ds[0];
 	spa_t *spa;
+	int numloops = 0;
 
-	if (zopt_verbose >= 3)
+	if (ztest_opts.zo_verbose >= 3)
 		(void) printf("testing spa_freeze()...\n");
 
 	kernel_init(FREAD | FWRITE);
-	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
-	VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
+	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
+	VERIFY3U(0, ==, ztest_dataset_open(0));
+	spa->spa_debug = B_TRUE;
+	ztest_spa = spa;
 
 	/*
 	 * Force the first log block to be transactionally allocated.
@@ -5240,7 +5840,7 @@ ztest_freeze(ztest_shared_t *zs)
 	 */
 	while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
 		ztest_dmu_object_alloc_free(zd, 0);
-		zil_commit(zd->zd_zilog, UINT64_MAX, 0);
+		zil_commit(zd->zd_zilog, 0);
 	}
 
 	txg_wait_synced(spa_get_dsl(spa), 0);
@@ -5252,28 +5852,43 @@ ztest_freeze(ztest_shared_t *zs)
 	spa_freeze(spa);
 
 	/*
+	 * Because it is hard to predict how much space a write will actually
+	 * require beforehand, we leave ourselves some fudge space to write over
+	 * capacity.
+	 */
+	uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2;
+
+	/*
 	 * Run tests that generate log records but don't alter the pool config
 	 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
 	 * We do a txg_wait_synced() after each iteration to force the txg
 	 * to increase well beyond the last synced value in the uberblock.
 	 * The ZIL should be OK with that.
+	 *
+	 * Run a random number of times less than zo_maxloops and ensure we do
+	 * not run out of space on the pool.
 	 */
-	while (ztest_random(20) != 0) {
-		ztest_dmu_write_parallel(zd, 0);
-		ztest_dmu_object_alloc_free(zd, 0);
+	while (ztest_random(10) != 0 &&
+	    numloops++ < ztest_opts.zo_maxloops &&
+	    metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
+		ztest_od_t od;
+		ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
+		VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
+		ztest_io(zd, od.od_object,
+		    ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
 		txg_wait_synced(spa_get_dsl(spa), 0);
 	}
 
 	/*
 	 * Commit all of the changes we just generated.
 	 */
-	zil_commit(zd->zd_zilog, UINT64_MAX, 0);
+	zil_commit(zd->zd_zilog, 0);
 	txg_wait_synced(spa_get_dsl(spa), 0);
 
 	/*
 	 * Close our dataset and close the pool.
 	 */
-	ztest_dataset_close(zs, 0);
+	ztest_dataset_close(0);
 	spa_close(spa, FTAG);
 	kernel_fini();
 
@@ -5281,22 +5896,22 @@ ztest_freeze(ztest_shared_t *zs)
 	 * Open and close the pool and dataset to induce log replay.
 	 */
 	kernel_init(FREAD | FWRITE);
-	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
-	VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
-	ztest_dataset_close(zs, 0);
-	spa_close(spa, FTAG);
-	kernel_fini();
-
-	list_destroy(&zcl.zcl_callbacks);
+	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
+	ASSERT(spa_freeze_txg(spa) == UINT64_MAX);
+	VERIFY3U(0, ==, ztest_dataset_open(0));
+	ztest_dataset_close(0);
 
-	(void) _mutex_destroy(&zcl.zcl_callbacks_lock);
+	spa->spa_debug = B_TRUE;
+	ztest_spa = spa;
+	txg_wait_synced(spa_get_dsl(spa), 0);
+	ztest_reguid(NULL, 0);
 
-	(void) rwlock_destroy(&zs->zs_name_lock);
-	(void) _mutex_destroy(&zs->zs_vdev_lock);
+	spa_close(spa, FTAG);
+	kernel_fini();
 }
 
 void
-print_time(hrtime_t t, char *timebuf, size_t timelen)
+print_time(hrtime_t t, char *timebuf)
 {
 	hrtime_t s = t / NANOSEC;
 	hrtime_t m = s / 60;
@@ -5310,14 +5925,14 @@ print_time(hrtime_t t, char *timebuf, si
 	timebuf[0] = '\0';
 
 	if (d)
-		(void) snprintf(timebuf, timelen,
+		(void) sprintf(timebuf,
 		    "%llud%02lluh%02llum%02llus", d, h, m, s);
 	else if (h)
-		(void) snprintf(timebuf, timelen, "%lluh%02llum%02llus", h, m, s);
+		(void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s);
 	else if (m)
-		(void) snprintf(timebuf, timelen, "%llum%02llus", m, s);
+		(void) sprintf(timebuf, "%llum%02llus", m, s);
 	else
-		(void) snprintf(timebuf, timelen, "%llus", s);
+		(void) sprintf(timebuf, "%llus", s);
 }
 
 static nvlist_t *
@@ -5325,15 +5940,11 @@ make_random_props()
 {
 	nvlist_t *props;
 
-	if (ztest_random(2) == 0)
-		return (NULL);
-
 	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+	if (ztest_random(2) == 0)
+		return (props);
 	VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
 
-	(void) printf("props:\n");
-	dump_nvlist(props, 4);
-
 	return (props);
 }
 
@@ -5347,35 +5958,225 @@ ztest_init(ztest_shared_t *zs)
 	spa_t *spa;
 	nvlist_t *nvroot, *props;
 
-	VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0);
-	VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0);
+	VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0);
+	VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
 
 	kernel_init(FREAD | FWRITE);
 
 	/*
 	 * Create the storage pool.
 	 */
-	(void) spa_destroy(zs->zs_pool);
+	(void) spa_destroy(ztest_opts.zo_pool);
 	ztest_shared->zs_vdev_next_leaf = 0;
 	zs->zs_splits = 0;
-	zs->zs_mirrors = zopt_mirrors;
-	nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
-	    0, zopt_raidz, zs->zs_mirrors, 1);
+	zs->zs_mirrors = ztest_opts.zo_mirrors;
+	nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
+	    0, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
 	props = make_random_props();
-	VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, props, NULL, NULL));
+	for (int i = 0; i < SPA_FEATURES; i++) {
+		char buf[1024];
+		(void) snprintf(buf, sizeof (buf), "feature@%s",
+		    spa_feature_table[i].fi_uname);
+		VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
+	}
+	VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL));
 	nvlist_free(nvroot);
+	nvlist_free(props);
+
+	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
+	zs->zs_metaslab_sz =
+	    1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
 
-	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
-	metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
 	spa_close(spa, FTAG);
 
 	kernel_fini();
 
-	ztest_run_zdb(zs->zs_pool);
+	ztest_run_zdb(ztest_opts.zo_pool);
+
+	ztest_freeze();
+
+	ztest_run_zdb(ztest_opts.zo_pool);
+
+	(void) rwlock_destroy(&ztest_name_lock);
+	(void) _mutex_destroy(&ztest_vdev_lock);
+}
+
+static void
+setup_data_fd(void)
+{
+	static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
+
+	ztest_fd_data = mkstemp(ztest_name_data);
+	ASSERT3S(ztest_fd_data, >=, 0);
+	(void) unlink(ztest_name_data);
+}
+
+
+static int
+shared_data_size(ztest_shared_hdr_t *hdr)
+{
+	int size;
+
+	size = hdr->zh_hdr_size;
+	size += hdr->zh_opts_size;
+	size += hdr->zh_size;
+	size += hdr->zh_stats_size * hdr->zh_stats_count;
+	size += hdr->zh_ds_size * hdr->zh_ds_count;
+
+	return (size);
+}
+
+static void
+setup_hdr(void)
+{
+	int size;
+	ztest_shared_hdr_t *hdr;
+
+	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
+	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
+	ASSERT(hdr != MAP_FAILED);
+
+	VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));
+
+	hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
+	hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
+	hdr->zh_size = sizeof (ztest_shared_t);
+	hdr->zh_stats_size = sizeof (ztest_shared_callstate_t);
+	hdr->zh_stats_count = ZTEST_FUNCS;
+	hdr->zh_ds_size = sizeof (ztest_shared_ds_t);
+	hdr->zh_ds_count = ztest_opts.zo_datasets;
+
+	size = shared_data_size(hdr);
+	VERIFY3U(0, ==, ftruncate(ztest_fd_data, size));
+
+	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
+}
+
+static void
+setup_data(void)
+{
+	int size, offset;
+	ztest_shared_hdr_t *hdr;
+	uint8_t *buf;
+
+	hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
+	    PROT_READ, MAP_SHARED, ztest_fd_data, 0);
+	ASSERT(hdr != MAP_FAILED);
+
+	size = shared_data_size(hdr);
+
+	(void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
+	hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
+	    PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
+	ASSERT(hdr != MAP_FAILED);
+	buf = (uint8_t *)hdr;
+
+	offset = hdr->zh_hdr_size;
+	ztest_shared_opts = (void *)&buf[offset];
+	offset += hdr->zh_opts_size;
+	ztest_shared = (void *)&buf[offset];
+	offset += hdr->zh_size;
+	ztest_shared_callstate = (void *)&buf[offset];
+	offset += hdr->zh_stats_size * hdr->zh_stats_count;
+	ztest_shared_ds = (void *)&buf[offset];
+}
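
setup_hdr() and setup_data() build the cross-process state in two passes: map only the header, compute the full file size from the header's section sizes, then remap the whole region and slice it into sections by offset. The descriptor survives the exec in exec_child() below via the ZTEST_FD_DATA environment variable. The core trick, reduced to a fork()-only sketch with an illustrative one-field layout:

#include <sys/mman.h>
#include <sys/wait.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

typedef struct { long counter; } shared_state_t;	/* illustrative layout */

int
main(void)
{
	char path[] = "/tmp/shared.XXXXXX";
	int fd = mkstemp(path);
	(void) unlink(path);		/* file lives on through the fd */

	if (ftruncate(fd, sizeof (shared_state_t)) != 0)
		return (1);
	shared_state_t *st = mmap(NULL, sizeof (shared_state_t),
	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (st == MAP_FAILED)
		return (1);

	if (fork() == 0) {		/* child inherits fd and mapping */
		st->counter = 42;
		_exit(0);
	}
	(void) wait(NULL);
	(void) printf("child wrote %ld\n", st->counter);
	return (0);
}
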
+
+static boolean_t
+exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp)
+{
+	pid_t pid;
+	int status;
+	char *cmdbuf = NULL;
+
+	pid = fork();
+
+	if (cmd == NULL) {
+		cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
+		(void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
+		cmd = cmdbuf;
+	}
+
+	if (pid == -1)
+		fatal(1, "fork failed");
+
+	if (pid == 0) {	/* child */
+		char *emptyargv[2] = { cmd, NULL };
+		char fd_data_str[12];
+
+		struct rlimit rl = { 1024, 1024 };
+		(void) setrlimit(RLIMIT_NOFILE, &rl);
+
+		(void) close(ztest_fd_rand);
+		VERIFY3U(11, >=,
+		    snprintf(fd_data_str, 12, "%d", ztest_fd_data));
+		VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1));
+
+		(void) enable_extended_FILE_stdio(-1, -1);
+		if (libpath != NULL)
+			VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
+#ifdef illumos
+		(void) execv(cmd, emptyargv);
+#else
+		(void) execvp(cmd, emptyargv);
+#endif
+		ztest_dump_core = B_FALSE;
+		fatal(B_TRUE, "exec failed: %s", cmd);
+	}
+
+	if (cmdbuf != NULL) {
+		umem_free(cmdbuf, MAXPATHLEN);
+		cmd = NULL;
+	}
+
+	while (waitpid(pid, &status, 0) != pid)
+		continue;
+	if (statusp != NULL)
+		*statusp = status;
+
+	if (WIFEXITED(status)) {
+		if (WEXITSTATUS(status) != 0) {
+			(void) fprintf(stderr, "child exited with code %d\n",
+			    WEXITSTATUS(status));
+			exit(2);
+		}
+		return (B_FALSE);
+	} else if (WIFSIGNALED(status)) {
+		if (!ignorekill || WTERMSIG(status) != SIGKILL) {
+			(void) fprintf(stderr, "child died with signal %d\n",
+			    WTERMSIG(status));
+			exit(3);
+		}
+		return (B_TRUE);
+	} else {
+		(void) fprintf(stderr, "something strange happened to child\n");
+		exit(4);
+		/* NOTREACHED */
+	}
+}
+
+static void
+ztest_run_init(void)
+{
+	ztest_shared_t *zs = ztest_shared;
+
+	ASSERT(ztest_opts.zo_init != 0);
 
-	ztest_freeze(zs);
+	/*
+	 * Blow away any existing copy of zpool.cache
+	 */
+	(void) remove(spa_config_path);
 
-	ztest_run_zdb(zs->zs_pool);
+	/*
+	 * Create and initialize our storage pool.
+	 */
+	for (int i = 1; i <= ztest_opts.zo_init; i++) {
+		bzero(zs, sizeof (ztest_shared_t));
+		if (ztest_opts.zo_verbose >= 3 &&
+		    ztest_opts.zo_init != 1) {
+			(void) printf("ztest_init(), pass %d\n", i);
+		}
+		ztest_init(zs);
+	}
 }
 
 int
@@ -5383,63 +6184,98 @@ main(int argc, char **argv)
 {
 	int kills = 0;
 	int iters = 0;
+	int older = 0;
+	int newer = 0;
 	ztest_shared_t *zs;
-	size_t shared_size;
 	ztest_info_t *zi;
+	ztest_shared_callstate_t *zc;
 	char timebuf[100];
 	char numbuf[6];
 	spa_t *spa;
+	char *cmd;
+	boolean_t hasalt;
+	char *fd_data_str = getenv("ZTEST_FD_DATA");
 
 	(void) setvbuf(stdout, NULL, _IOLBF, 0);
 
-	/* Override location of zpool.cache */
-	spa_config_path = "/tmp/zpool.cache";
+	dprintf_setup(&argc, argv);
+	zfs_deadman_synctime_ms = 300000;
 
-	ztest_random_fd = open("/dev/urandom", O_RDONLY);
+	ztest_fd_rand = open("/dev/urandom", O_RDONLY);
+	ASSERT3S(ztest_fd_rand, >=, 0);
 
-	process_options(argc, argv);
+	if (!fd_data_str) {
+		process_options(argc, argv);
 
-	/*
-	 * Blow away any existing copy of zpool.cache
-	 */
-	if (zopt_init != 0)
-		(void) remove("/tmp/zpool.cache");
+		setup_data_fd();
+		setup_hdr();
+		setup_data();
+		bcopy(&ztest_opts, ztest_shared_opts,
+		    sizeof (*ztest_shared_opts));
+	} else {
+		ztest_fd_data = atoi(fd_data_str);
+		setup_data();
+		bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
+	}
+	ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);
 
-	shared_size = sizeof (*zs) + zopt_datasets * sizeof (ztest_ds_t);
+	/* Override location of zpool.cache */
+	VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache",
+	    ztest_opts.zo_dir), !=, -1);
 
-	zs = ztest_shared = (void *)mmap(0,
-	    P2ROUNDUP(shared_size, getpagesize()),
-	    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
+	ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
+	    UMEM_NOFAIL);
+	zs = ztest_shared;
+
+	if (fd_data_str) {
+		metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang;
+		metaslab_df_alloc_threshold =
+		    zs->zs_metaslab_df_alloc_threshold;
 
-	if (zopt_verbose >= 1) {
-		(void) printf("%llu vdevs, %d datasets, %d threads,"
-		    " %llu seconds...\n",
-		    (u_longlong_t)zopt_vdevs, zopt_datasets, zopt_threads,
-		    (u_longlong_t)zopt_time);
+		if (zs->zs_do_init)
+			ztest_run_init();
+		else
+			ztest_run(zs);
+		exit(0);
 	}
 
-	/*
-	 * Create and initialize our storage pool.
-	 */
-	for (int i = 1; i <= zopt_init; i++) {
-		bzero(zs, sizeof (ztest_shared_t));
-		if (zopt_verbose >= 3 && zopt_init != 1)
-			(void) printf("ztest_init(), pass %d\n", i);
-		zs->zs_pool = zopt_pool;
-		ztest_init(zs);
+	hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);
+
+	if (ztest_opts.zo_verbose >= 1) {
+		(void) printf("%llu vdevs, %d datasets, %d threads,"
+		    " %llu seconds...\n",
+		    (u_longlong_t)ztest_opts.zo_vdevs,
+		    ztest_opts.zo_datasets,
+		    ztest_opts.zo_threads,
+		    (u_longlong_t)ztest_opts.zo_time);
+	}
+
+	cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
+	(void) strlcpy(cmd, getexecname(), MAXNAMELEN);
+
+	zs->zs_do_init = B_TRUE;
+	if (strlen(ztest_opts.zo_alt_ztest) != 0) {
+		if (ztest_opts.zo_verbose >= 1) {
+			(void) printf("Executing older ztest for "
+			    "initialization: %s\n", ztest_opts.zo_alt_ztest);
+		}
+		VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
+		    ztest_opts.zo_alt_libpath, B_FALSE, NULL));
+	} else {
+		VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
 	}
+	zs->zs_do_init = B_FALSE;
 
-	zs->zs_pool = zopt_pool;
 	zs->zs_proc_start = gethrtime();
-	zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC;
+	zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;
 
 	for (int f = 0; f < ZTEST_FUNCS; f++) {
-		zi = &zs->zs_info[f];
-		*zi = ztest_info[f];
+		zi = &ztest_info[f];
+		zc = ZTEST_GET_SHARED_CALLSTATE(f);
 		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
-			zi->zi_call_next = UINT64_MAX;
+			zc->zc_next = UINT64_MAX;
 		else
-			zi->zi_call_next = zs->zs_proc_start +
+			zc->zc_next = zs->zs_proc_start +
 			    ztest_random(2 * zi->zi_interval[0] + 1);
 	}
 
@@ -5450,65 +6286,48 @@ main(int argc, char **argv)
 	 */
 	while (gethrtime() < zs->zs_proc_stop) {
 		int status;
-		pid_t pid;
+		boolean_t killed;
 
 		/*
 		 * Initialize the workload counters for each function.
 		 */
 		for (int f = 0; f < ZTEST_FUNCS; f++) {
-			zi = &zs->zs_info[f];
-			zi->zi_call_count = 0;
-			zi->zi_call_time = 0;
+			zc = ZTEST_GET_SHARED_CALLSTATE(f);
+			zc->zc_count = 0;
+			zc->zc_time = 0;
 		}
 
 		/* Set the allocation switch size */
-		metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1;
-
-		pid = fork();
-
-		if (pid == -1)
-			fatal(1, "fork failed");
+		zs->zs_metaslab_df_alloc_threshold =
+		    ztest_random(zs->zs_metaslab_sz / 4) + 1;
 
-		if (pid == 0) {	/* child */
-			struct rlimit rl = { 1024, 1024 };
-			(void) setrlimit(RLIMIT_NOFILE, &rl);
-			(void) enable_extended_FILE_stdio(-1, -1);
-			ztest_run(zs);
-			exit(0);
-		}
-
-		while (waitpid(pid, &status, 0) != pid)
-			continue;
-
-		if (WIFEXITED(status)) {
-			if (WEXITSTATUS(status) != 0) {
-				(void) fprintf(stderr,
-				    "child exited with code %d\n",
-				    WEXITSTATUS(status));
-				exit(2);
-			}
-		} else if (WIFSIGNALED(status)) {
-			if (WTERMSIG(status) != SIGKILL) {
-				(void) fprintf(stderr,
-				    "child died with signal %d\n",
-				    WTERMSIG(status));
-				exit(3);
+		if (!hasalt || ztest_random(2) == 0) {
+			if (hasalt && ztest_opts.zo_verbose >= 1) {
+				(void) printf("Executing newer ztest: %s\n",
+				    cmd);
 			}
-			kills++;
+			newer++;
+			killed = exec_child(cmd, NULL, B_TRUE, &status);
 		} else {
-			(void) fprintf(stderr, "something strange happened "
-			    "to child\n");
-			exit(4);
+			if (hasalt && ztest_opts.zo_verbose >= 1) {
+				(void) printf("Executing older ztest: %s\n",
+				    ztest_opts.zo_alt_ztest);
+			}
+			older++;
+			killed = exec_child(ztest_opts.zo_alt_ztest,
+			    ztest_opts.zo_alt_libpath, B_TRUE, &status);
 		}
 
+		if (killed)
+			kills++;
 		iters++;
 
-		if (zopt_verbose >= 1) {
+		if (ztest_opts.zo_verbose >= 1) {
 			hrtime_t now = gethrtime();
 
 			now = MIN(now, zs->zs_proc_stop);
-			print_time(zs->zs_proc_stop - now, timebuf, sizeof(timebuf));
-			nicenum(zs->zs_space, numbuf, sizeof(numbuf));
+			print_time(zs->zs_proc_stop - now, timebuf);
+			nicenum(zs->zs_space, numbuf);
 
 			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
 			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
@@ -5518,10 +6337,10 @@ main(int argc, char **argv)
 			    100.0 * zs->zs_alloc / zs->zs_space,
 			    numbuf,
 			    100.0 * (now - zs->zs_proc_start) /
-			    (zopt_time * NANOSEC), timebuf);
+			    (ztest_opts.zo_time * NANOSEC), timebuf);
 		}
 
-		if (zopt_verbose >= 2) {
+		if (ztest_opts.zo_verbose >= 2) {
 			(void) printf("\nWorkload summary:\n\n");
 			(void) printf("%7s %9s   %s\n",
 			    "Calls", "Time", "Function");
@@ -5530,11 +6349,12 @@ main(int argc, char **argv)
 			for (int f = 0; f < ZTEST_FUNCS; f++) {
 				Dl_info dli;
 
-				zi = &zs->zs_info[f];
-				print_time(zi->zi_call_time, timebuf, sizeof(timebuf));
+				zi = &ztest_info[f];
+				zc = ZTEST_GET_SHARED_CALLSTATE(f);
+				print_time(zc->zc_time, timebuf);
 				(void) dladdr((void *)zi->zi_func, &dli);
 				(void) printf("%7llu %9s   %s\n",
-				    (u_longlong_t)zi->zi_call_count, timebuf,
+				    (u_longlong_t)zc->zc_count, timebuf,
 				    dli.dli_sname);
 			}
 			(void) printf("\n");
@@ -5546,25 +6366,33 @@ main(int argc, char **argv)
 		 * instead of 'ztest'.  Do a blind rename in case this happened.
 		 */
 		kernel_init(FREAD);
-		if (spa_open(zopt_pool, &spa, FTAG) == 0) {
+		if (spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0) {
 			spa_close(spa, FTAG);
 		} else {
-			char tmpname[MAXNAMELEN];
+			char tmpname[ZFS_MAX_DATASET_NAME_LEN];
 			kernel_fini();
 			kernel_init(FREAD | FWRITE);
 			(void) snprintf(tmpname, sizeof (tmpname), "%s_tmp",
-			    zopt_pool);
-			(void) spa_rename(tmpname, zopt_pool);
+			    ztest_opts.zo_pool);
+			(void) spa_rename(tmpname, ztest_opts.zo_pool);
 		}
 		kernel_fini();
 
-		ztest_run_zdb(zopt_pool);
+		ztest_run_zdb(ztest_opts.zo_pool);
 	}
 
-	if (zopt_verbose >= 1) {
+	if (ztest_opts.zo_verbose >= 1) {
+		if (hasalt) {
+			(void) printf("%d runs of older ztest: %s\n", older,
+			    ztest_opts.zo_alt_ztest);
+			(void) printf("%d runs of newer ztest: %s\n", newer,
+			    cmd);
+		}
 		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
 		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
 	}
 
+	umem_free(cmd, MAXNAMELEN);
+
 	return (0);
 }
Index: src/external/cddl/osnet/dist/common/acl/acl_common.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/acl/acl_common.c,v
retrieving revision 1.5
diff -u -p -r1.5 acl_common.c
--- src/external/cddl/osnet/dist/common/acl/acl_common.c	20 Nov 2011 02:54:25 -0000	1.5
+++ src/external/cddl/osnet/dist/common/acl/acl_common.c	14 Jul 2016 01:26:26 -0000
@@ -19,21 +19,20 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
-#include <sys/debug.h>
 #include <sys/stat.h>
 #include <sys/avl.h>
+#include <sys/misc.h>
 #if defined(_KERNEL)
+#include <sys/kmem.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <acl/acl_common.h>
-#include <sys/kmem.h>
+#include <sys/debug.h>
 #else
 #include <errno.h>
 #include <stdlib.h>
@@ -151,166 +150,6 @@ typedef struct ace_list {
 	int seen; /* bitmask of all aclent_t a_type values seen */
 } ace_list_t;
 
-ace_t trivial_acl[] = {
-	{(uid_t)-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE},
-	{(uid_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES|
-	    ACE_WRITE_NAMED_ATTRS, ACE_OWNER, ACE_ACCESS_ALLOWED_ACE_TYPE},
-	{(uid_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
-	    ACE_ACCESS_DENIED_ACE_TYPE},
-	{(uid_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP,
-	    ACE_ACCESS_ALLOWED_ACE_TYPE},
-	{(uid_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER| ACE_WRITE_ATTRIBUTES|
-	    ACE_WRITE_NAMED_ATTRS, ACE_EVERYONE, ACE_ACCESS_DENIED_ACE_TYPE},
-	{(uid_t)-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
-	    ACE_SYNCHRONIZE, ACE_EVERYONE, ACE_ACCESS_ALLOWED_ACE_TYPE}
-};
-
-
-void
-adjust_ace_pair_common(void *pair, size_t access_off,
-    size_t pairsize, mode_t mode)
-{
-	char *datap = (char *)pair;
-	uint32_t *amask0 = (uint32_t *)(uintptr_t)(datap + access_off);
-	uint32_t *amask1 = (uint32_t *)(uintptr_t)(datap + pairsize +
-	    access_off);
-	if (mode & S_IROTH)
-		*amask1 |= ACE_READ_DATA;
-	else
-		*amask0 |= ACE_READ_DATA;
-	if (mode & S_IWOTH)
-		*amask1 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
-	else
-		*amask0 |= ACE_WRITE_DATA|ACE_APPEND_DATA;
-	if (mode & S_IXOTH)
-		*amask1 |= ACE_EXECUTE;
-	else
-		*amask0 |= ACE_EXECUTE;
-}
-
-void
-adjust_ace_pair(ace_t *pair, mode_t mode)
-{
-	adjust_ace_pair_common(pair, offsetof(ace_t, a_access_mask),
-	    sizeof (ace_t), mode);
-}
-
-static void
-ace_allow_deny_helper(uint16_t type, boolean_t *allow, boolean_t *deny)
-{
-	if (type == ACE_ACCESS_ALLOWED_ACE_TYPE)
-		*allow = B_TRUE;
-	else if (type == ACE_ACCESS_DENIED_ACE_TYPE)
-		*deny = B_TRUE;
-}
-
-/*
- * ace_trivial:
- * determine whether an ace_t acl is trivial
- *
- * Trivialness implies that the acl is composed of only
- * owner, group, everyone entries.  ACL can't
- * have read_acl denied, and write_owner/write_acl/write_attributes
- * can only be owner@ entry.
- */
-int
-ace_trivial_common(void *acep, int aclcnt,
-    uint64_t (*walk)(void *, uint64_t, int aclcnt,
-    uint16_t *, uint16_t *, uint32_t *))
-{
-	boolean_t owner_allow = B_FALSE;
-	boolean_t group_allow = B_FALSE;
-	boolean_t everyone_allow = B_FALSE;
-	boolean_t owner_deny = B_FALSE;
-	boolean_t group_deny = B_FALSE;
-	boolean_t everyone_deny = B_FALSE;
-	uint16_t flags;
-	uint32_t mask;
-	uint16_t type;
-	uint64_t cookie = 0;
-
-	while (cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask)) {
-		switch (flags & ACE_TYPE_FLAGS) {
-		case ACE_OWNER:
-			if (group_allow || group_deny || everyone_allow ||
-			    everyone_deny)
-				return (1);
-			ace_allow_deny_helper(type, &owner_allow, &owner_deny);
-			break;
-		case ACE_GROUP|ACE_IDENTIFIER_GROUP:
-			if (everyone_allow || everyone_deny &&
-			    (!owner_allow && !owner_deny))
-				return (1);
-			ace_allow_deny_helper(type, &group_allow, &group_deny);
-			break;
-
-		case ACE_EVERYONE:
-			if (!owner_allow && !owner_deny &&
-			    !group_allow && !group_deny)
-				return (1);
-			ace_allow_deny_helper(type,
-			    &everyone_allow, &everyone_deny);
-			break;
-		default:
-			return (1);
-
-		}
-
-		if (flags & (ACE_FILE_INHERIT_ACE|
-		    ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
-		    ACE_INHERIT_ONLY_ACE))
-			return (1);
-
-		/*
-		 * Special check for some special bits
-		 *
-		 * Don't allow anybody to deny reading basic
-		 * attributes or a files ACL.
-		 */
-		if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
-		    (type == ACE_ACCESS_DENIED_ACE_TYPE))
-			return (1);
-
-		/*
-		 * Allow on owner@ to allow
-		 * write_acl/write_owner/write_attributes
-		 */
-		if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
-		    (!(flags & ACE_OWNER) && (mask &
-		    (ACE_WRITE_OWNER|ACE_WRITE_ACL|ACE_WRITE_ATTRIBUTES))))
-			return (1);
-
-	}
-
-	if (!owner_allow || !owner_deny || !group_allow || !group_deny ||
-	    !everyone_allow || !everyone_deny)
-		return (1);
-
-	return (0);
-}
-
-uint64_t
-ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags,
-    uint16_t *type, uint32_t *mask)
-{
-	ace_t *acep = datap;
-
-	if (cookie >= aclcnt)
-		return (0);
-
-	*flags = acep[cookie].a_flags;
-	*type = acep[cookie].a_type;
-	*mask = acep[cookie++].a_access_mask;
-
-	return (cookie);
-}
-
-int
-ace_trivial(ace_t *acep, int aclcnt)
-{
-	return (ace_trivial_common(acep, aclcnt, ace_walk));
-}
-
 /*
  * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified.
  * v = Ptr to array/vector of objs
@@ -469,7 +308,6 @@ acl_free(acl_t *aclp)
 
 	cacl_free(aclp, sizeof (acl_t));
 }
-#endif
 
 static uint32_t
 access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow)
@@ -539,7 +377,7 @@ access_mask_set(int haswriteperm, int ha
  * by nfsace, assuming aclent_t -> nfsace semantics.
  */
 static uint32_t
-mode_to_ace_access(mode_t mode, int isdir, int isowner, int isallow)
+mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow)
 {
 	uint32_t access = 0;
 	int haswriteperm = 0;
@@ -582,7 +420,7 @@ mode_to_ace_access(mode_t mode, int isdi
 			access |= ACE_DELETE_CHILD;
 	}
 	/* exec */
-	if (mode & 01) {
+	if (mode & S_IXOTH) {
 		access |= ACE_EXECUTE;
 	}
 
@@ -833,7 +671,7 @@ out:
 }
 
 static int
-convert_aent_to_ace(aclent_t *aclentp, int aclcnt, int isdir,
+convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir,
     ace_t **retacep, int *retacecnt)
 {
 	ace_t *acep;
@@ -859,7 +697,7 @@ convert_aent_to_ace(aclent_t *aclentp, i
 		dfaclcnt = aclcnt - i;
 	}
 
-	if (dfaclcnt && isdir == 0) {
+	if (dfaclcnt && !isdir) {
 		return (EINVAL);
 	}
 
@@ -897,7 +735,7 @@ convert_aent_to_ace(aclent_t *aclentp, i
 }
 
 static int
-ace_mask_to_mode(uint32_t  mask, o_mode_t *modep, int isdir)
+ace_mask_to_mode(uint32_t  mask, o_mode_t *modep, boolean_t isdir)
 {
 	int error = 0;
 	o_mode_t mode = 0;
@@ -1194,7 +1032,7 @@ out:
 }
 
 static int
-ace_allow_to_mode(uint32_t mask, o_mode_t *modep, int isdir)
+ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir)
 {
 	/* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */
 	if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) !=
@@ -1207,7 +1045,7 @@ ace_allow_to_mode(uint32_t mask, o_mode_
 
 static int
 acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list,
-    uid_t owner, gid_t group, int isdir)
+    uid_t owner, gid_t group, boolean_t isdir)
 {
 	int error;
 	uint32_t  flips = ACE_POSIX_SUPPORTED_BITS;
@@ -1247,7 +1085,7 @@ out:
 
 static int
 ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt,
-    uid_t owner, gid_t group, int isdir)
+    uid_t owner, gid_t group, boolean_t isdir)
 {
 	int error = 0;
 	aclent_t *aent, *result = NULL;
@@ -1427,7 +1265,7 @@ acevals_compare(const void *va, const vo
 static int
 ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group,
     aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt,
-    int isdir)
+    boolean_t isdir)
 {
 	int error = 0;
 	ace_t *acep;
@@ -1622,7 +1460,7 @@ out:
 }
 
 static int
-convert_ace_to_aent(ace_t *acebufp, int acecnt, int isdir,
+convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir,
     uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt)
 {
 	int error = 0;
@@ -1664,7 +1502,7 @@ convert_ace_to_aent(ace_t *acebufp, int 
 
 
 int
-acl_translate(acl_t *aclp, int target_flavor, int isdir, uid_t owner,
+acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner,
     gid_t group)
 {
 	int aclcnt;
@@ -1726,3 +1564,202 @@ out:
 	return (error);
 #endif
 }
+#endif /* !_KERNEL */
+
+#define	SET_ACE(acl, index, who, mask, type, flags) { \
+	acl[0][index].a_who = (uint32_t)who; \
+	acl[0][index].a_type = type; \
+	acl[0][index].a_flags = flags; \
+	acl[0][index++].a_access_mask = mask; \
+}
+
+void
+acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks)
+{
+	uint32_t read_mask = ACE_READ_DATA;
+	uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA;
+	uint32_t execute_mask = ACE_EXECUTE;
+
+	(void) isdir;	/* will need this later */
+
+	masks->deny1 = 0;
+	if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
+		masks->deny1 |= read_mask;
+	if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
+		masks->deny1 |= write_mask;
+	if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
+		masks->deny1 |= execute_mask;
+
+	masks->deny2 = 0;
+	if (!(mode & S_IRGRP) && (mode & S_IROTH))
+		masks->deny2 |= read_mask;
+	if (!(mode & S_IWGRP) && (mode & S_IWOTH))
+		masks->deny2 |= write_mask;
+	if (!(mode & S_IXGRP) && (mode & S_IXOTH))
+		masks->deny2 |= execute_mask;
+
+	masks->allow0 = 0;
+	if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
+		masks->allow0 |= read_mask;
+	if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
+		masks->allow0 |= write_mask;
+	if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
+		masks->allow0 |= execute_mask;
+
+	masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
+	    ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
+	    ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
+	if (mode & S_IRUSR)
+		masks->owner |= read_mask;
+	if (mode & S_IWUSR)
+		masks->owner |= write_mask;
+	if (mode & S_IXUSR)
+		masks->owner |= execute_mask;
+
+	masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+	    ACE_SYNCHRONIZE;
+	if (mode & S_IRGRP)
+		masks->group |= read_mask;
+	if (mode & S_IWGRP)
+		masks->group |= write_mask;
+	if (mode & S_IXGRP)
+		masks->group |= execute_mask;
+
+	masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+	    ACE_SYNCHRONIZE;
+	if (mode & S_IROTH)
+		masks->everyone |= read_mask;
+	if (mode & S_IWOTH)
+		masks->everyone |= write_mask;
+	if (mode & S_IXOTH)
+		masks->everyone |= execute_mask;
+}
+
+int
+acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count)
+{
+	int		index = 0;
+	int		error;
+	trivial_acl_t	masks;
+
+	*count = 3;
+	acl_trivial_access_masks(mode, isdir, &masks);
+
+	if (masks.allow0)
+		(*count)++;
+	if (masks.deny1)
+		(*count)++;
+	if (masks.deny2)
+		(*count)++;
+
+	if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0)
+		return (error);
+
+	if (masks.allow0) {
+		SET_ACE(acl, index, -1, masks.allow0,
+		    ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER);
+	}
+	if (masks.deny1) {
+		SET_ACE(acl, index, -1, masks.deny1,
+		    ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER);
+	}
+	if (masks.deny2) {
+		SET_ACE(acl, index, -1, masks.deny2,
+		    ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP);
+	}
+
+	SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE,
+	    ACE_OWNER);
+	SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE,
+	    ACE_IDENTIFIER_GROUP|ACE_GROUP);
+	SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE,
+	    ACE_EVERYONE);
+
+	return (0);
+}
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t ACL is trivial
+ *
+ * Triviality implies that the ACL is composed of only
+ * owner, group and everyone entries.  The ACL can't
+ * have read_acl denied, and write_owner/write_acl/write_attributes
+ * may only appear on the owner@ entry.
+ */
+int
+ace_trivial_common(void *acep, int aclcnt,
+    uint64_t (*walk)(void *, uint64_t, int aclcnt,
+    uint16_t *, uint16_t *, uint32_t *))
+{
+	uint16_t flags;
+	uint32_t mask;
+	uint16_t type;
+	uint64_t cookie = 0;
+
+	while ((cookie = walk(acep, cookie, aclcnt,
+	    &flags, &type, &mask)) != 0) {
+		switch (flags & ACE_TYPE_FLAGS) {
+		case ACE_OWNER:
+		case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+		case ACE_EVERYONE:
+			break;
+		default:
+			return (1);
+
+		}
+
+		if (flags & (ACE_FILE_INHERIT_ACE|
+		    ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+		    ACE_INHERIT_ONLY_ACE))
+			return (1);
+
+		/*
+		 * Don't allow anybody to deny reading basic
+		 * attributes or a file's ACL.
+		 */
+		if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+		    (type == ACE_ACCESS_DENIED_ACE_TYPE))
+			return (1);
+
+		/*
+		 * Delete permissions are never set by default
+		 */
+		if (mask & (ACE_DELETE|ACE_DELETE_CHILD))
+			return (1);
+		/*
+		 * Only allow owner@ to have
+		 * write_acl/write_owner/write_attributes/write_xattr.
+		 */
+		if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+		    (!(flags & ACE_OWNER) && (mask &
+		    (ACE_WRITE_OWNER|ACE_WRITE_ACL|ACE_WRITE_ATTRIBUTES|
+		    ACE_WRITE_NAMED_ATTRS))))
+			return (1);
+
+	}
+	return (0);
+}
+
+uint64_t
+ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags,
+    uint16_t *type, uint32_t *mask)
+{
+	ace_t *acep = datap;
+
+	if (cookie >= aclcnt)
+		return (0);
+
+	*flags = acep[cookie].a_flags;
+	*type = acep[cookie].a_type;
+	*mask = acep[cookie++].a_access_mask;
+
+	return (cookie);
+}
+
+int
+ace_trivial(ace_t *acep, int aclcnt)
+{
+	return (ace_trivial_common(acep, aclcnt, ace_walk));
+}
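
The trivial-ACL constructor above is small enough to exercise from userland. A minimal sketch, assuming acl_common.h and this file are compiled into the test program and that the matching cacl_free() helper from acl_common.c is linked in; for mode 0640 no allow0/deny1/deny2 masks are generated, so exactly the three mandatory entries come back:

#include <sys/types.h>
#include <sys/acl.h>
#include <stdio.h>
#include "acl_common.h"

int
main(void)
{
	ace_t *acl;
	int count, i;

	/* rw-r-----: no deny/allow0 masks needed, so count is 3 */
	if (acl_trivial_create(0640, B_FALSE, &acl, &count) != 0)
		return (1);
	for (i = 0; i < count; i++)
		printf("flags 0x%x type %u mask 0x%x\n",
		    acl[i].a_flags, acl[i].a_type, acl[i].a_access_mask);
	cacl_free(acl, count * sizeof (ace_t));
	return (0);
}
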
Index: src/external/cddl/osnet/dist/common/acl/acl_common.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/acl/acl_common.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 acl_common.h
--- src/external/cddl/osnet/dist/common/acl/acl_common.h	7 Aug 2009 18:32:26 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/common/acl/acl_common.h	12 Jun 2012 05:57:26 -0000
@@ -19,16 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #ifndef	_ACL_COMMON_H
 #define	_ACL_COMMON_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
-
 #include <sys/types.h>
 #include <sys/acl.h>
 #include <sys/stat.h>
@@ -37,7 +34,14 @@
 extern "C" {
 #endif
 
-extern ace_t trivial_acl[6];
+typedef struct trivial_acl {
+	uint32_t	allow0;		/* allow mask for bits only in owner */
+	uint32_t	deny1;		/* deny mask for bits not in owner */
+	uint32_t	deny2;		/* deny mask for bits not in group */
+	uint32_t	owner;		/* allow mask matching mode */
+	uint32_t	group;		/* allow mask matching mode */
+	uint32_t	everyone;	/* allow mask matching mode */
+} trivial_acl_t;
 
 extern int acltrivial(const char *);
 extern void adjust_ace_pair(ace_t *pair, mode_t mode);
@@ -46,13 +50,17 @@ extern int ace_trivial(ace_t *acep, int 
 extern int ace_trivial_common(void *, int,
     uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *,
     uint32_t *mask));
+#if !defined(_KERNEL)
 extern acl_t *acl_alloc(acl_type_t);
 extern void acl_free(acl_t *aclp);
-extern int acl_translate(acl_t *aclp, int target_flavor,
-    int isdir, uid_t owner, gid_t group);
+extern int acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir,
+    uid_t owner, gid_t group);
+#endif	/* !_KERNEL */
 void ksort(caddr_t v, int n, int s, int (*f)());
 int cmp2acls(void *a, void *b);
-
+int acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count);
+void acl_trivial_access_masks(mode_t mode, boolean_t isdir,
+    trivial_acl_t *masks);
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dist/common/avl/avl.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/avl/avl.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 avl.c
--- src/external/cddl/osnet/dist/common/avl/avl.c	27 Feb 2010 22:29:42 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/common/avl/avl.c	20 Sep 2015 01:09:06 -0000
@@ -24,6 +24,11 @@
  */
 
 /*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+ */
+
+/*
  * AVL - generic AVL tree implementation for kernel use
  *
  * A complete description of AVL trees can be found in many CS textbooks.
@@ -37,7 +42,7 @@
  * insertion and deletion relatively efficiently. Searching the tree is
  * still a fast operation, roughly O(log(N)).
  *
- * The key to insertion and deletion is a set of tree maniuplations called
+ * The key to insertion and deletion is a set of tree manipulations called
  * rotations, which bring unbalanced subtrees back into the semi-balanced state.
  *
  * This implementation of AVL trees has the following peculiarities:
@@ -45,7 +50,7 @@
  *	- The AVL specific data structures are physically embedded as fields
  *	  in the "using" data structures.  To maintain generality the code
  *	  must constantly translate between "avl_node_t *" and containing
- *	  data structure "void *"s by adding/subracting the avl_offset.
+ *	  data structure "void *"s by adding/subtracting the avl_offset.
  *
  *	- Since the AVL data is always embedded in other structures, there is
  *	  no locking or memory allocation in the AVL routines. This must be
@@ -85,16 +90,22 @@
  *	  is a modified "avl_node_t *".  The bottom bit (normally 0 for a
  *	  pointer) is set to indicate if that the new node has a value greater
  *	  than the value of the indicated "avl_node_t *".
+ *
+ * Note - in addition to userland (e.g. libavl and libutil) and the kernel
+ * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module,
+ * which each have their own compilation environments and subsequent
+ * requirements. Each of these environments must be considered when adding
+ * dependencies from avl.c.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
+#include <sys/stdint.h>
 #include <sys/debug.h>
 #include <sys/avl.h>
-#include <sys/cmn_err.h>
 
 /*
- * Small arrays to translate between balance (or diff) values and child indeces.
+ * Small arrays to translate between balance (or diff) values and child indices.
  *
  * Code that deals with binary tree data structures will randomly use
  * left and right children when examining a tree.  C "if()" statements
@@ -114,7 +125,8 @@ static const int  avl_balance2child[]	= 
  *
 * - If there is a left child, go to it, then to its rightmost descendant.
  *
- * - otherwise we return thru parent nodes until we've come from a right child.
+ * - otherwise we return through parent nodes until we've come from a right
+ *   child.
  *
  * Return Value:
  * NULL - if at the end of the nodes
@@ -624,14 +636,17 @@ avl_add(avl_tree_t *tree, void *new_node
 	/*
 	 * This is unfortunate.  We want to call panic() here, even for
 	 * non-DEBUG kernels.  In userland, however, we can't depend on anything
-	 * in libc or else the rtld build process gets confused.  So, all we can
-	 * do in userland is resort to a normal ASSERT().
+	 * in libc or else the rtld build process gets confused.
+	 * Thankfully, rtld provides us with its own assfail() so we can use
+	 * that here.  We use assfail() directly to get a nice error message
+	 * in the core - much like what panic() does for crashdumps.
 	 */
 	if (avl_find(tree, new_node, &where) != NULL)
 #ifdef _KERNEL
 		panic("avl_find() succeeded inside avl_add()");
 #else
-		ASSERT(0);
+		(void) assfail("avl_find() succeeded inside avl_add()",
+		    __FILE__, __LINE__);
 #endif
 	avl_insert(tree, new_node, where);
 }
@@ -863,6 +878,24 @@ avl_update(avl_tree_t *t, void *obj)
 	return (B_FALSE);
 }
 
+void
+avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
+{
+	avl_node_t *temp_node;
+	ulong_t temp_numnodes;
+
+	ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
+	ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
+	ASSERT3U(tree1->avl_size, ==, tree2->avl_size);
+
+	temp_node = tree1->avl_root;
+	temp_numnodes = tree1->avl_numnodes;
+	tree1->avl_root = tree2->avl_root;
+	tree1->avl_numnodes = tree2->avl_numnodes;
+	tree2->avl_root = temp_node;
+	tree2->avl_numnodes = temp_numnodes;
+}
+
 /*
  * initialize a new AVL tree
  */
@@ -919,7 +952,7 @@ avl_is_empty(avl_tree_t *tree)
 
 /*
  * Post-order tree walk used to visit all tree nodes and destroy the tree
- * in post order. This is used for destroying a tree w/o paying any cost
+ * in post order. This is used for destroying a tree without paying any cost
  * for rebalancing it.
  *
  * example:
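
The new avl_swap() exchanges the roots and node counts of two compatible trees in constant time. A minimal userland sketch, assuming the avl.h header and avl.c from this import are built into the program (the item type, comparator, and key values are illustrative):

#include <sys/avl.h>
#include <stddef.h>
#include <stdio.h>

typedef struct item {
	int		i_key;
	avl_node_t	i_node;
} item_t;

static int
item_cmp(const void *a, const void *b)
{
	const item_t *l = a, *r = b;

	if (l->i_key < r->i_key)
		return (-1);
	return (l->i_key > r->i_key);
}

int
main(void)
{
	avl_tree_t t1, t2;
	item_t a = { .i_key = 1 }, b = { .i_key = 2 };

	/* avl_swap() asserts matching comparator, size and offset. */
	avl_create(&t1, item_cmp, sizeof (item_t), offsetof(item_t, i_node));
	avl_create(&t2, item_cmp, sizeof (item_t), offsetof(item_t, i_node));
	avl_add(&t1, &a);
	avl_add(&t1, &b);

	avl_swap(&t1, &t2);	/* O(1): only roots and counts move */
	printf("%lu %lu\n", avl_numnodes(&t1), avl_numnodes(&t2)); /* 0 2 */

	avl_remove(&t2, &a);
	avl_remove(&t2, &b);
	avl_destroy(&t1);
	avl_destroy(&t2);
	return (0);
}
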
Index: src/external/cddl/osnet/dist/common/ctf/ctf_types.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/ctf/ctf_types.c,v
retrieving revision 1.4
diff -u -p -r1.4 ctf_types.c
--- src/external/cddl/osnet/dist/common/ctf/ctf_types.c	27 Dec 2015 21:39:01 -0000	1.4
+++ src/external/cddl/osnet/dist/common/ctf/ctf_types.c	11 Apr 2017 09:42:30 -0000
@@ -647,11 +647,8 @@ ctf_type_compat(ctf_file_t *lfp, ctf_id_
 	}
 }
 
-/*
- * Return the type and offset for a given member of a STRUCT or UNION.
- */
-int
-ctf_member_info(ctf_file_t *fp, ctf_id_t type, const char *name,
+static int
+_ctf_member_info(ctf_file_t *fp, ctf_id_t type, const char *name, ulong_t off,
     ctf_membinfo_t *mip)
 {
 	ctf_file_t *ofp = fp;
@@ -676,9 +673,13 @@ ctf_member_info(ctf_file_t *fp, ctf_id_t
 		    ((uintptr_t)tp + increment);
 
 		for (n = LCTF_INFO_VLEN(fp, tp->ctt_info); n != 0; n--, mp++) {
+			if (mp->ctm_name == 0 &&
+			    _ctf_member_info(fp, mp->ctm_type, name,
+			    mp->ctm_offset + off, mip) == 0)
+				return (0);
 			if (strcmp(ctf_strptr(fp, mp->ctm_name), name) == 0) {
 				mip->ctm_type = mp->ctm_type;
-				mip->ctm_offset = mp->ctm_offset;
+				mip->ctm_offset = mp->ctm_offset + off;
 				return (0);
 			}
 		}
@@ -687,9 +688,14 @@ ctf_member_info(ctf_file_t *fp, ctf_id_t
 		    ((uintptr_t)tp + increment);
 
 		for (n = LCTF_INFO_VLEN(fp, tp->ctt_info); n != 0; n--, lmp++) {
+			if (lmp->ctlm_name == 0 &&
+			    _ctf_member_info(fp, lmp->ctlm_type, name,
+			    (ulong_t)CTF_LMEM_OFFSET(lmp) + off, mip) == 0)
+				return (0);
 			if (strcmp(ctf_strptr(fp, lmp->ctlm_name), name) == 0) {
 				mip->ctm_type = lmp->ctlm_type;
-				mip->ctm_offset = (ulong_t)CTF_LMEM_OFFSET(lmp);
+				mip->ctm_offset =
+				    (ulong_t)CTF_LMEM_OFFSET(lmp) + off;
 				return (0);
 			}
 		}
@@ -699,6 +705,17 @@ ctf_member_info(ctf_file_t *fp, ctf_id_t
 }
 
 /*
+ * Return the type and offset for a given member of a STRUCT or UNION.
+ */
+int
+ctf_member_info(ctf_file_t *fp, ctf_id_t type, const char *name,
+    ctf_membinfo_t *mip)
+{
+
+	return (_ctf_member_info(fp, type, name, 0, mip));
+}
+
+/*
  * Return the array type, index, and size information for the specified ARRAY.
  */
 int
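
The recursion above lets ctf_member_info() see through anonymous struct/union members (those whose ctm_name string offset is 0), accumulating each level's offset so the result is relative to the outermost type. A small illustration of the semantics, not of the libctf API itself; the struct is hypothetical:

#include <stddef.h>
#include <stdio.h>

struct outer {
	int a;
	union {			/* anonymous: ctm_name == 0 in CTF data */
		long b;
		char c;
	};
};

int
main(void)
{
	/*
	 * With the accumulated-offset recursion above, a CTF lookup of
	 * "b" in "struct outer" resolves through the anonymous union
	 * and reports b's position relative to outer, analogous to:
	 */
	printf("b at byte offset %zu\n", offsetof(struct outer, b));
	return (0);
}
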
Index: src/external/cddl/osnet/dist/common/nvpair/fnvpair.c
===================================================================
RCS file: src/external/cddl/osnet/dist/common/nvpair/fnvpair.c
diff -N src/external/cddl/osnet/dist/common/nvpair/fnvpair.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/common/nvpair/fnvpair.c	16 Jun 2017 17:49:37 -0000
@@ -0,0 +1,512 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/nvpair.h>
+#ifndef _KERNEL
+#include <sys/zfs_context.h>
+#include <stdlib.h>
+#else
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/param.h>
+#include <sys/debug.h>
+#endif
+
+/*
+ * "Force" nvlist wrapper.
+ *
+ * These functions wrap the nvlist_* functions with assertions that assume
+ * the operation is successful.  This allows the caller's code to be much
+ * more readable, especially for the fnvlist_lookup_* and fnvpair_value_*
+ * functions, which can return the requested value (rather than filling in
+ * a pointer).
+ *
+ * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate
+ * with KM_SLEEP.
+ *
+ * More wrappers should be added as needed -- for example
+ * nvlist_lookup_*_array and nvpair_value_*_array.
+ */
+
+nvlist_t *
+fnvlist_alloc(void)
+{
+	nvlist_t *nvl;
+	VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP));
+	return (nvl);
+}
+
+void
+fnvlist_free(nvlist_t *nvl)
+{
+	nvlist_free(nvl);
+}
+
+size_t
+fnvlist_size(nvlist_t *nvl)
+{
+	size_t size;
+	VERIFY0(nvlist_size(nvl, &size, NV_ENCODE_NATIVE));
+	return (size);
+}
+
+/*
+ * Returns allocated buffer of size *sizep.  Caller must free the buffer with
+ * fnvlist_pack_free().
+ */
+char *
+fnvlist_pack(nvlist_t *nvl, size_t *sizep)
+{
+	char *packed = NULL;
+	VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE,
+	    KM_SLEEP), ==, 0);
+	return (packed);
+}
+
+/*ARGSUSED*/
+void
+fnvlist_pack_free(char *pack, size_t size)
+{
+#ifdef _KERNEL
+	kmem_free(pack, size);
+#else
+	free(pack);
+#endif
+}
+
+nvlist_t *
+fnvlist_unpack(char *buf, size_t buflen)
+{
+	nvlist_t *rv;
+	VERIFY0(nvlist_unpack(buf, buflen, &rv, KM_SLEEP));
+	return (rv);
+}
+
+nvlist_t *
+fnvlist_dup(nvlist_t *nvl)
+{
+	nvlist_t *rv;
+	VERIFY0(nvlist_dup(nvl, &rv, KM_SLEEP));
+	return (rv);
+}
+
+void
+fnvlist_merge(nvlist_t *dst, nvlist_t *src)
+{
+	VERIFY0(nvlist_merge(dst, src, KM_SLEEP));
+}
+
+size_t
+fnvlist_num_pairs(nvlist_t *nvl)
+{
+	size_t count = 0;
+	nvpair_t *pair;
+
+	for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(nvl, pair))
+		count++;
+	return (count);
+}
+
+void
+fnvlist_add_boolean(nvlist_t *nvl, const char *name)
+{
+	VERIFY0(nvlist_add_boolean(nvl, name));
+}
+
+void
+fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val)
+{
+	VERIFY0(nvlist_add_boolean_value(nvl, name, val));
+}
+
+void
+fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val)
+{
+	VERIFY0(nvlist_add_byte(nvl, name, val));
+}
+
+void
+fnvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val)
+{
+	VERIFY0(nvlist_add_int8(nvl, name, val));
+}
+
+void
+fnvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val)
+{
+	VERIFY0(nvlist_add_uint8(nvl, name, val));
+}
+
+void
+fnvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val)
+{
+	VERIFY0(nvlist_add_int16(nvl, name, val));
+}
+
+void
+fnvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val)
+{
+	VERIFY0(nvlist_add_uint16(nvl, name, val));
+}
+
+void
+fnvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val)
+{
+	VERIFY0(nvlist_add_int32(nvl, name, val));
+}
+
+void
+fnvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val)
+{
+	VERIFY0(nvlist_add_uint32(nvl, name, val));
+}
+
+void
+fnvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val)
+{
+	VERIFY0(nvlist_add_int64(nvl, name, val));
+}
+
+void
+fnvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
+{
+	VERIFY0(nvlist_add_uint64(nvl, name, val));
+}
+
+void
+fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
+{
+	VERIFY0(nvlist_add_string(nvl, name, val));
+}
+
+void
+fnvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
+{
+	VERIFY0(nvlist_add_nvlist(nvl, name, val));
+}
+
+void
+fnvlist_add_nvpair(nvlist_t *nvl, nvpair_t *pair)
+{
+	VERIFY0(nvlist_add_nvpair(nvl, pair));
+}
+
+void
+fnvlist_add_boolean_array(nvlist_t *nvl, const char *name,
+    boolean_t *val, uint_t n)
+{
+	VERIFY0(nvlist_add_boolean_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *val, uint_t n)
+{
+	VERIFY0(nvlist_add_byte_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *val, uint_t n)
+{
+	VERIFY0(nvlist_add_int8_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *val, uint_t n)
+{
+	VERIFY0(nvlist_add_uint8_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n)
+{
+	VERIFY0(nvlist_add_int16_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint16_array(nvlist_t *nvl, const char *name,
+    uint16_t *val, uint_t n)
+{
+	VERIFY0(nvlist_add_uint16_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n)
+{
+	VERIFY0(nvlist_add_int32_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint32_array(nvlist_t *nvl, const char *name,
+    uint32_t *val, uint_t n)
+{
+	VERIFY0(nvlist_add_uint32_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n)
+{
+	VERIFY0(nvlist_add_int64_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint64_array(nvlist_t *nvl, const char *name,
+    uint64_t *val, uint_t n)
+{
+	VERIFY0(nvlist_add_uint64_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_string_array(nvlist_t *nvl, const char *name,
+    char * const *val, uint_t n)
+{
+	VERIFY0(nvlist_add_string_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name,
+    nvlist_t **val, uint_t n)
+{
+	VERIFY0(nvlist_add_nvlist_array(nvl, name, val, n));
+}
+
+void
+fnvlist_remove(nvlist_t *nvl, const char *name)
+{
+	VERIFY0(nvlist_remove_all(nvl, name));
+}
+
+void
+fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair)
+{
+	VERIFY0(nvlist_remove_nvpair(nvl, pair));
+}
+
+nvpair_t *
+fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name)
+{
+	nvpair_t *rv;
+	VERIFY0(nvlist_lookup_nvpair(nvl, name, &rv));
+	return (rv);
+}
+
+/* returns B_TRUE if the entry exists */
+boolean_t
+fnvlist_lookup_boolean(nvlist_t *nvl, const char *name)
+{
+	return (nvlist_lookup_boolean(nvl, name) == 0);
+}
+
+boolean_t
+fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name)
+{
+	boolean_t rv;
+	VERIFY0(nvlist_lookup_boolean_value(nvl, name, &rv));
+	return (rv);
+}
+
+uchar_t
+fnvlist_lookup_byte(nvlist_t *nvl, const char *name)
+{
+	uchar_t rv;
+	VERIFY0(nvlist_lookup_byte(nvl, name, &rv));
+	return (rv);
+}
+
+int8_t
+fnvlist_lookup_int8(nvlist_t *nvl, const char *name)
+{
+	int8_t rv;
+	VERIFY0(nvlist_lookup_int8(nvl, name, &rv));
+	return (rv);
+}
+
+int16_t
+fnvlist_lookup_int16(nvlist_t *nvl, const char *name)
+{
+	int16_t rv;
+	VERIFY0(nvlist_lookup_int16(nvl, name, &rv));
+	return (rv);
+}
+
+int32_t
+fnvlist_lookup_int32(nvlist_t *nvl, const char *name)
+{
+	int32_t rv;
+	VERIFY0(nvlist_lookup_int32(nvl, name, &rv));
+	return (rv);
+}
+
+int64_t
+fnvlist_lookup_int64(nvlist_t *nvl, const char *name)
+{
+	int64_t rv;
+	VERIFY0(nvlist_lookup_int64(nvl, name, &rv));
+	return (rv);
+}
+
+uint8_t
+fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name)
+{
+	uint8_t rv;
+	VERIFY0(nvlist_lookup_uint8(nvl, name, &rv));
+	return (rv);
+}
+
+uint16_t
+fnvlist_lookup_uint16(nvlist_t *nvl, const char *name)
+{
+	uint16_t rv;
+	VERIFY0(nvlist_lookup_uint16(nvl, name, &rv));
+	return (rv);
+}
+
+uint32_t
+fnvlist_lookup_uint32(nvlist_t *nvl, const char *name)
+{
+	uint32_t rv;
+	VERIFY0(nvlist_lookup_uint32(nvl, name, &rv));
+	return (rv);
+}
+
+uint64_t
+fnvlist_lookup_uint64(nvlist_t *nvl, const char *name)
+{
+	uint64_t rv;
+	VERIFY0(nvlist_lookup_uint64(nvl, name, &rv));
+	return (rv);
+}
+
+char *
+fnvlist_lookup_string(nvlist_t *nvl, const char *name)
+{
+	char *rv;
+	VERIFY0(nvlist_lookup_string(nvl, name, &rv));
+	return (rv);
+}
+
+nvlist_t *
+fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name)
+{
+	nvlist_t *rv;
+	VERIFY0(nvlist_lookup_nvlist(nvl, name, &rv));
+	return (rv);
+}
+
+boolean_t
+fnvpair_value_boolean_value(nvpair_t *nvp)
+{
+	boolean_t rv;
+	VERIFY0(nvpair_value_boolean_value(nvp, &rv));
+	return (rv);
+}
+
+uchar_t
+fnvpair_value_byte(nvpair_t *nvp)
+{
+	uchar_t rv;
+	VERIFY0(nvpair_value_byte(nvp, &rv));
+	return (rv);
+}
+
+int8_t
+fnvpair_value_int8(nvpair_t *nvp)
+{
+	int8_t rv;
+	VERIFY0(nvpair_value_int8(nvp, &rv));
+	return (rv);
+}
+
+int16_t
+fnvpair_value_int16(nvpair_t *nvp)
+{
+	int16_t rv;
+	VERIFY0(nvpair_value_int16(nvp, &rv));
+	return (rv);
+}
+
+int32_t
+fnvpair_value_int32(nvpair_t *nvp)
+{
+	int32_t rv;
+	VERIFY0(nvpair_value_int32(nvp, &rv));
+	return (rv);
+}
+
+int64_t
+fnvpair_value_int64(nvpair_t *nvp)
+{
+	int64_t rv;
+	VERIFY0(nvpair_value_int64(nvp, &rv));
+	return (rv);
+}
+
+uint8_t
+fnvpair_value_uint8_t(nvpair_t *nvp)
+{
+	uint8_t rv;
+	VERIFY0(nvpair_value_uint8(nvp, &rv));
+	return (rv);
+}
+
+uint16_t
+fnvpair_value_uint16(nvpair_t *nvp)
+{
+	uint16_t rv;
+	VERIFY0(nvpair_value_uint16(nvp, &rv));
+	return (rv);
+}
+
+uint32_t
+fnvpair_value_uint32(nvpair_t *nvp)
+{
+	uint32_t rv;
+	VERIFY0(nvpair_value_uint32(nvp, &rv));
+	return (rv);
+}
+
+uint64_t
+fnvpair_value_uint64(nvpair_t *nvp)
+{
+	uint64_t rv;
+	VERIFY0(nvpair_value_uint64(nvp, &rv));
+	return (rv);
+}
+
+char *
+fnvpair_value_string(nvpair_t *nvp)
+{
+	char *rv;
+	VERIFY0(nvpair_value_string(nvp, &rv));
+	return (rv);
+}
+
+nvlist_t *
+fnvpair_value_nvlist(nvpair_t *nvp)
+{
+	nvlist_t *rv;
+	VERIFY0(nvpair_value_nvlist(nvp, &rv));
+	return (rv);
+}
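
A minimal userland sketch of the wrappers in use, assuming libnvpair from this import is linked in (the pair names and values are illustrative). Lookup failures abort via the VERIFY0() assertions instead of returning an error, which is what makes call sites this terse:

#include <sys/nvpair.h>
#include <stdio.h>

int
main(void)
{
	nvlist_t *nvl = fnvlist_alloc();

	fnvlist_add_string(nvl, "name", "tank");
	fnvlist_add_uint64(nvl, "guid", 0xdeadbeefULL);

	/* No error paths: a missing pair would assert, not return. */
	printf("%s %llx\n", fnvlist_lookup_string(nvl, "name"),
	    (unsigned long long)fnvlist_lookup_uint64(nvl, "guid"));

	fnvlist_free(nvl);
	return (0);
}
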
Index: src/external/cddl/osnet/dist/common/nvpair/nvpair.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/nvpair/nvpair.c,v
retrieving revision 1.3
diff -u -p -r1.3 nvpair.c
--- src/external/cddl/osnet/dist/common/nvpair/nvpair.c	10 Apr 2015 22:29:35 -0000	1.3
+++ src/external/cddl/osnet/dist/common/nvpair/nvpair.c	10 Jun 2017 05:30:43 -0000
@@ -35,7 +35,6 @@
 
 #if defined(_KERNEL) && !defined(_BOOT)
 #include <sys/varargs.h>
-#include <sys/ddi.h>
 #include <sys/sunddi.h>
 #else
 #include <stdarg.h>
@@ -49,6 +48,14 @@
 #endif
 #define	skip_whitespace(p)	while ((*(p) == ' ') || (*(p) == '\t')) p++
 
+#if !defined(illumos) && !defined(_KERNEL)
+/*
+ * libnvpair is the lowest common denominator for ZFS-related libraries;
+ * defining aok here makes it usable by all of them.
+ */
+int aok;
+#endif
+
 /*
  * nvpair.c - Provides kernel & userland interfaces for manipulating
  *	name-value pairs.
Index: src/external/cddl/osnet/dist/common/unicode/u8_textprep.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/unicode/u8_textprep.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 u8_textprep.c
--- src/external/cddl/osnet/dist/common/unicode/u8_textprep.c	7 Aug 2009 18:32:31 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/common/unicode/u8_textprep.c	18 Apr 2017 15:05:00 -0000
@@ -23,7 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 
 /*
@@ -42,14 +41,13 @@
 #include <sys/systm.h>
 #include <sys/debug.h>
 #include <sys/kmem.h>
-#include <sys/ddi.h>
 #include <sys/sunddi.h>
 #else
-#include <sys/u8_textprep.h>
 #include <strings.h>
 #endif	/* _KERNEL */
 #include <sys/byteorder.h>
 #include <sys/errno.h>
+#include <sys/u8_textprep.h>
 #include <sys/u8_textprep_data.h>
 
 
@@ -144,10 +142,10 @@
 #define	U8_16BIT_TABLE_INDICATOR	(0x8000U)
 
 /* The following are some convenience macros. */
-#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
-	(u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \
-		(uint32_t)(b3) & 0x3F;
-
+#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3)  \
+	(u) = ((((uint32_t)(b1) & 0x0F) << 12) | \
+		(((uint32_t)(b2) & 0x3F) << 6)  | \
+		((uint32_t)(b3) & 0x3F));
 #define	U8_SIMPLE_SWAP(a, b, t) \
 	(t) = (a); \
 	(a) = (b); \
@@ -217,10 +215,10 @@ const int8_t u8_number_of_bytes[0x100] =
 /*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
 
-/*  	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
+/*	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
 
-/*  	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
+/*	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
 	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
 
 /*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
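
The macro rewrite above adds defensive parentheses around the assembled expression and each operand (the old form happened to parse correctly, but only by virtue of C operator precedence). A standalone check of the decode, with the macro copied verbatim:

#include <inttypes.h>
#include <stdio.h>

#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3)  \
	(u) = ((((uint32_t)(b1) & 0x0F) << 12) | \
		(((uint32_t)(b2) & 0x3F) << 6)  | \
		((uint32_t)(b3) & 0x3F));

int
main(void)
{
	uint32_t u;

	/* EURO SIGN: UTF-8 bytes E2 82 AC must yield U+20AC. */
	U8_PUT_3BYTES_INTO_UTF32(u, 0xE2, 0x82, 0xAC);
	printf("U+%04" PRIX32 "\n", u);
	return (0);
}
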
Index: src/external/cddl/osnet/dist/common/util/strtolctype.h
===================================================================
RCS file: src/external/cddl/osnet/dist/common/util/strtolctype.h
diff -N src/external/cddl/osnet/dist/common/util/strtolctype.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/common/util/strtolctype.h	4 Feb 2015 07:24:07 -0000
@@ -0,0 +1,79 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*	Copyright (c) 1988 AT&T	*/
+/*	  All Rights Reserved  	*/
+
+#ifndef	_COMMON_UTIL_CTYPE_H
+#define	_COMMON_UTIL_CTYPE_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * This header file contains a collection of macros that the strtou?ll?
+ * functions in common/util use to test characters.  What we need is a kernel
+ * version of ctype.h.
+ *
+ * NOTE: These macros are used within several DTrace probe context functions.
+ * They must not be altered to make function calls or perform actions not
+ * safe in probe context.
+ */
+
+#if defined(illumos) && (defined(_KERNEL) || defined(_BOOT))
+
+#define	isalnum(ch)	(isalpha(ch) || isdigit(ch))
+#define	isalpha(ch)	(isupper(ch) || islower(ch))
+#define	isdigit(ch)	((ch) >= '0' && (ch) <= '9')
+#define	islower(ch)	((ch) >= 'a' && (ch) <= 'z')
+#define	isspace(ch)	(((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \
+			((ch) == '\t') || ((ch) == '\f'))
+#define	isupper(ch)	((ch) >= 'A' && (ch) <= 'Z')
+#define	isxdigit(ch)	(isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
+			((ch) >= 'A' && (ch) <= 'F'))
+
+#endif	/* illumos && (_KERNEL || _BOOT) */
+
+#define	DIGIT(x)	\
+	(isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A')
+
+#define	MBASE	('z' - 'a' + 1 + 10)
+
+/*
+ * The following macro is a version of isalnum() that limits alphabetic
+ * characters to the ranges a-z and A-Z; locale dependent characters will not
+ * return 1.  The members of a-z and A-Z are assumed to be in ascending order
+ * and contiguous.
+ */
+#define	lisalnum(x)	\
+	(isdigit(x) || ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z'))
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _COMMON_UTIL_CTYPE_H */
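
DIGIT() and MBASE are what let the strtou?ll? parsers accept any radix up to base 36: digits map to 0-9 and letters of either case to 10-35. A quick standalone check, with the two macros copied from the header (using the host <ctype.h> in place of the kernel definitions above):

#include <ctype.h>
#include <stdio.h>

#define	DIGIT(x)	\
	(isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A')

#define	MBASE	('z' - 'a' + 1 + 10)

int
main(void)
{
	printf("%d %d %d %d\n",
	    DIGIT('7'),		/* 7 */
	    DIGIT('f'),		/* 15 */
	    DIGIT('Z'),		/* 35 */
	    MBASE);		/* 36: the largest supported radix */
	return (0);
}
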
Index: src/external/cddl/osnet/dist/common/zfs/zfeature_common.c
===================================================================
RCS file: src/external/cddl/osnet/dist/common/zfs/zfeature_common.c
diff -N src/external/cddl/osnet/dist/common/zfs/zfeature_common.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/common/zfs/zfeature_common.c	27 Jun 2017 23:57:12 -0000
@@ -0,0 +1,252 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <errno.h>
+#include <string.h>
+#endif
+#include <sys/debug.h>
+#include <sys/fs/zfs.h>
+#include <sys/types.h>
+#include "zfeature_common.h"
+
+/*
+ * Set to disable all feature checks while opening pools, allowing pools with
+ * unsupported features to be opened. Set for testing only.
+ */
+boolean_t zfeature_checks_disable = B_FALSE;
+
+zfeature_info_t spa_feature_table[SPA_FEATURES];
+
+/*
+ * Valid characters for feature guids. This list is mainly for aesthetic
+ * purposes and could be expanded in the future. There are different allowed
+ * characters in the guids reverse dns portion (before the colon) and its
+ * short name (after the colon).
+ */
+static int
+valid_char(char c, boolean_t after_colon)
+{
+	return ((c >= 'a' && c <= 'z') ||
+	    (c >= '0' && c <= '9') ||
+	    (after_colon && c == '_') ||
+	    (!after_colon && (c == '.' || c == '-')));
+}
+
+/*
+ * Every feature guid must contain exactly one colon which separates a reverse
+ * dns organization name from the feature's "short" name (e.g.
+ * "com.company:feature_name").
+ */
+boolean_t
+zfeature_is_valid_guid(const char *name)
+{
+	int i;
+	boolean_t has_colon = B_FALSE;
+
+	i = 0;
+	while (name[i] != '\0') {
+		char c = name[i++];
+		if (c == ':') {
+			if (has_colon)
+				return (B_FALSE);
+			has_colon = B_TRUE;
+			continue;
+		}
+		if (!valid_char(c, has_colon))
+			return (B_FALSE);
+	}
+
+	return (has_colon);
+}
+
+boolean_t
+zfeature_is_supported(const char *guid)
+{
+	if (zfeature_checks_disable)
+		return (B_TRUE);
+
+	for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+		zfeature_info_t *feature = &spa_feature_table[i];
+		if (strcmp(guid, feature->fi_guid) == 0)
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+int
+zfeature_lookup_name(const char *name, spa_feature_t *res)
+{
+	for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+		zfeature_info_t *feature = &spa_feature_table[i];
+		if (strcmp(name, feature->fi_uname) == 0) {
+			if (res != NULL)
+				*res = i;
+			return (0);
+		}
+	}
+
+	return (ENOENT);
+}
+
+boolean_t
+zfeature_depends_on(spa_feature_t fid, spa_feature_t check)
+{
+	zfeature_info_t *feature = &spa_feature_table[fid];
+
+	for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) {
+		if (feature->fi_depends[i] == check)
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+static void
+zfeature_register(spa_feature_t fid, const char *guid, const char *name,
+    const char *desc, zfeature_flags_t flags, const spa_feature_t *deps)
+{
+	zfeature_info_t *feature = &spa_feature_table[fid];
+	static spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
+
+	ASSERT(name != NULL);
+	ASSERT(desc != NULL);
+	ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 ||
+	    (flags & ZFEATURE_FLAG_MOS) == 0);
+	ASSERT3U(fid, <, SPA_FEATURES);
+	ASSERT(zfeature_is_valid_guid(guid));
+
+	if (deps == NULL)
+		deps = nodeps;
+
+	feature->fi_feature = fid;
+	feature->fi_guid = guid;
+	feature->fi_uname = name;
+	feature->fi_desc = desc;
+	feature->fi_flags = flags;
+	feature->fi_depends = deps;
+}
+
+void
+zpool_feature_init(void)
+{
+	zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
+	    "com.delphix:async_destroy", "async_destroy",
+	    "Destroy filesystems asynchronously.",
+	    ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+
+	zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
+	    "com.delphix:empty_bpobj", "empty_bpobj",
+	    "Snapshots use less space.",
+	    ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+
+	zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
+	    "org.illumos:lz4_compress", "lz4_compress",
+	    "LZ4 compression algorithm support.",
+	    ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, NULL);
+
+	zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
+	    "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
+	    "Crash dumps to multiple vdev pools.",
+	    0, NULL);
+
+	zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
+	    "com.delphix:spacemap_histogram", "spacemap_histogram",
+	    "Spacemaps maintain space histograms.",
+	    ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+
+	zfeature_register(SPA_FEATURE_ENABLED_TXG,
+	    "com.delphix:enabled_txg", "enabled_txg",
+	    "Record txg at which a feature is enabled",
+	    ZFEATURE_FLAG_READONLY_COMPAT, NULL);
+
+	static spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG,
+	    SPA_FEATURE_NONE };
+	zfeature_register(SPA_FEATURE_HOLE_BIRTH,
+	    "com.delphix:hole_birth", "hole_birth",
+	    "Retain hole birth txg for more precise zfs send",
+	    ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+	    hole_birth_deps);
+
+	zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
+	    "com.delphix:extensible_dataset", "extensible_dataset",
+	    "Enhanced dataset functionality, used by other features.",
+	    0, NULL);
+
+	static const spa_feature_t bookmarks_deps[] = {
+		SPA_FEATURE_EXTENSIBLE_DATASET,
+		SPA_FEATURE_NONE
+	};
+	zfeature_register(SPA_FEATURE_BOOKMARKS,
+	    "com.delphix:bookmarks", "bookmarks",
+	    "\"zfs bookmark\" command",
+	    ZFEATURE_FLAG_READONLY_COMPAT, bookmarks_deps);
+
+	static const spa_feature_t filesystem_limits_deps[] = {
+	    SPA_FEATURE_EXTENSIBLE_DATASET,
+	    SPA_FEATURE_NONE
+	};
+	zfeature_register(SPA_FEATURE_FS_SS_LIMIT,
+	    "com.joyent:filesystem_limits", "filesystem_limits",
+	    "Filesystem and snapshot limits.",
+	    ZFEATURE_FLAG_READONLY_COMPAT, filesystem_limits_deps);
+
+	zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
+	    "com.delphix:embedded_data", "embedded_data",
+	    "Blocks which compress very well use even less space.",
+	    ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+	    NULL);
+
+	static const spa_feature_t large_blocks_deps[] = {
+		SPA_FEATURE_EXTENSIBLE_DATASET,
+		SPA_FEATURE_NONE
+	};
+	zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
+	    "org.open-zfs:large_blocks", "large_blocks",
+	    "Support for blocks larger than 128KB.",
+	    ZFEATURE_FLAG_PER_DATASET, large_blocks_deps);
+#ifndef __NetBSD__
+	zfeature_register(SPA_FEATURE_SHA512,
+	    "org.illumos:sha512", "sha512",
+	    "SHA-512/256 hash algorithm.",
+	    ZFEATURE_FLAG_PER_DATASET, NULL);
+	zfeature_register(SPA_FEATURE_SKEIN,
+	    "org.illumos:skein", "skein",
+	    "Skein hash algorithm.",
+	    ZFEATURE_FLAG_PER_DATASET, NULL);
+#endif
+
+#ifdef illumos
+	zfeature_register(SPA_FEATURE_EDONR,
+	    "org.illumos:edonr", "edonr",
+	    "Edon-R hash algorithm.",
+	    ZFEATURE_FLAG_PER_DATASET, NULL);
+#endif
+}
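
The guid rules above reduce to: lowercase alphanumerics everywhere, '.' and '-' only in the reverse-DNS part, '_' only in the short name, and exactly one colon. A standalone sketch of the check, with valid_char() and the walk from zfeature_is_valid_guid() copied from the file (boolean_t replaced by int so it compiles without the ZFS headers):

#include <stdio.h>

static int
valid_char(char c, int after_colon)
{
	return ((c >= 'a' && c <= 'z') ||
	    (c >= '0' && c <= '9') ||
	    (after_colon && c == '_') ||
	    (!after_colon && (c == '.' || c == '-')));
}

static int
is_valid_guid(const char *name)
{
	int i = 0, has_colon = 0;

	while (name[i] != '\0') {
		char c = name[i++];
		if (c == ':') {
			if (has_colon)
				return (0);
			has_colon = 1;
			continue;
		}
		if (!valid_char(c, has_colon))
			return (0);
	}
	return (has_colon);
}

int
main(void)
{
	printf("%d\n", is_valid_guid("com.delphix:async_destroy")); /* 1 */
	printf("%d\n", is_valid_guid("async_destroy"));	  /* 0: no colon */
	printf("%d\n", is_valid_guid("com.joyent:bad-name")); /* 0: '-' */
	return (0);
}
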
Index: src/external/cddl/osnet/dist/common/zfs/zfeature_common.h
===================================================================
RCS file: src/external/cddl/osnet/dist/common/zfs/zfeature_common.h
diff -N src/external/cddl/osnet/dist/common/zfs/zfeature_common.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/common/zfs/zfeature_common.h	27 Jun 2017 23:57:31 -0000
@@ -0,0 +1,106 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#ifndef _ZFEATURE_COMMON_H
+#define	_ZFEATURE_COMMON_H
+
+#include <sys/fs/zfs.h>
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct zfeature_info;
+
+typedef enum spa_feature {
+	SPA_FEATURE_NONE = -1,
+	SPA_FEATURE_ASYNC_DESTROY,
+	SPA_FEATURE_EMPTY_BPOBJ,
+	SPA_FEATURE_LZ4_COMPRESS,
+	SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
+	SPA_FEATURE_SPACEMAP_HISTOGRAM,
+	SPA_FEATURE_ENABLED_TXG,
+	SPA_FEATURE_HOLE_BIRTH,
+	SPA_FEATURE_EXTENSIBLE_DATASET,
+	SPA_FEATURE_EMBEDDED_DATA,
+	SPA_FEATURE_BOOKMARKS,
+	SPA_FEATURE_FS_SS_LIMIT,
+	SPA_FEATURE_LARGE_BLOCKS,
+#ifndef __NetBSD__
+	SPA_FEATURE_SHA512,
+	SPA_FEATURE_SKEIN,
+#endif
+#ifdef illumos
+	SPA_FEATURE_EDONR,
+#endif
+	SPA_FEATURES
+} spa_feature_t;
+
+#define	SPA_FEATURE_DISABLED	(-1ULL)
+
+typedef enum zfeature_flags {
+	/* Can open pool readonly even if this feature is not supported. */
+	ZFEATURE_FLAG_READONLY_COMPAT =		(1 << 0),
+	/* Is this feature necessary to read the MOS? */
+	ZFEATURE_FLAG_MOS =			(1 << 1),
+	/* Activate this feature at the same time it is enabled. */
+	ZFEATURE_FLAG_ACTIVATE_ON_ENABLE =	(1 << 2),
+	/* Each dataset has a field set if it has ever used this feature. */
+	ZFEATURE_FLAG_PER_DATASET =		(1 << 3)
+} zfeature_flags_t;
+
+typedef struct zfeature_info {
+	spa_feature_t fi_feature;
+	const char *fi_uname;	/* User-facing feature name */
+	const char *fi_guid;	/* On-disk feature identifier */
+	const char *fi_desc;	/* Feature description */
+	zfeature_flags_t fi_flags;
+	/* array of dependencies, terminated by SPA_FEATURE_NONE */
+	const spa_feature_t *fi_depends;
+} zfeature_info_t;
+
+typedef int (zfeature_func_t)(zfeature_info_t *, void *);
+
+#define	ZFS_FEATURE_DEBUG
+
+extern zfeature_info_t spa_feature_table[SPA_FEATURES];
+
+extern boolean_t zfeature_is_valid_guid(const char *);
+
+extern boolean_t zfeature_is_supported(const char *);
+extern int zfeature_lookup_name(const char *, spa_feature_t *);
+extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t);
+
+extern void zpool_feature_init(void);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZFEATURE_COMMON_H */
Index: src/external/cddl/osnet/dist/common/zfs/zfs_comutil.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_comutil.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_comutil.c
--- src/external/cddl/osnet/dist/common/zfs/zfs_comutil.c	27 Feb 2010 22:29:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/common/zfs/zfs_comutil.c	23 Mar 2013 15:29:25 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 /*
@@ -37,8 +37,8 @@
 
 #include <sys/types.h>
 #include <sys/fs/zfs.h>
-#include <sys/int_limits.h>
 #include <sys/nvpair.h>
+#include "zfs_comutil.h"
 
 /*
  * Are there allocatable vdevs?
@@ -103,3 +103,104 @@ zpool_get_rewind_policy(nvlist_t *nvl, z
 	if (zrpp->zrp_request == 0)
 		zrpp->zrp_request = ZPOOL_NO_REWIND;
 }
+
+typedef struct zfs_version_spa_map {
+	int	version_zpl;
+	int	version_spa;
+} zfs_version_spa_map_t;
+
+/*
+ * Keep this table in monotonically increasing version number order.
+ */
+static zfs_version_spa_map_t zfs_version_table[] = {
+	{ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL},
+	{ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL},
+	{ZPL_VERSION_FUID, SPA_VERSION_FUID},
+	{ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
+	{ZPL_VERSION_SA, SPA_VERSION_SA},
+	{0, 0}
+};
+
+/*
+ * Return the max zpl version for a corresponding spa version
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_zpl_version_map(int spa_version)
+{
+	int i;
+	int version = -1;
+
+	for (i = 0; zfs_version_table[i].version_spa; i++) {
+		if (spa_version >= zfs_version_table[i].version_spa)
+			version = zfs_version_table[i].version_zpl;
+	}
+
+	return (version);
+}
+
+/*
+ * Return the min spa version for a corresponding zpl version.
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_spa_version_map(int zpl_version)
+{
+	int i;
+	int version = -1;
+
+	for (i = 0; zfs_version_table[i].version_zpl; i++) {
+		if (zfs_version_table[i].version_zpl >= zpl_version)
+			return (zfs_version_table[i].version_spa);
+	}
+
+	return (version);
+}
+
+/*
+ * This is the table of legacy internal event names; it should not be modified.
+ * The internal events are now stored in the history log as strings.
+ */
+const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = {
+	"invalid event",
+	"pool create",
+	"vdev add",
+	"pool remove",
+	"pool destroy",
+	"pool export",
+	"pool import",
+	"vdev attach",
+	"vdev replace",
+	"vdev detach",
+	"vdev online",
+	"vdev offline",
+	"vdev upgrade",
+	"pool clear",
+	"pool scrub",
+	"pool property set",
+	"create",
+	"clone",
+	"destroy",
+	"destroy_begin_sync",
+	"inherit",
+	"property set",
+	"quota set",
+	"permission update",
+	"permission remove",
+	"permission who remove",
+	"promote",
+	"receive",
+	"rename",
+	"reservation set",
+	"replay_inc_sync",
+	"replay_full_sync",
+	"rollback",
+	"snapshot",
+	"filesystem version upgrade",
+	"refquota set",
+	"refreservation set",
+	"pool scrub done",
+	"user hold",
+	"user release",
+	"pool split",
+};
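
The version-map walk is "last entry whose spa version the pool already meets" for zfs_zpl_version_map(), and "first entry at least as new as the requested zpl version" for zfs_spa_version_map(). A standalone sketch of the former, with the table's symbolic versions replaced by the numeric values they had in sys/fs/zfs.h around this import (quoted from memory, so treat them as illustrative):

#include <stdio.h>

static struct { int zpl; int spa; } map[] = {
	{ 1,  1 },	/* ZPL_VERSION_INITIAL,     SPA_VERSION_INITIAL */
	{ 2,  1 },	/* ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL */
	{ 3,  9 },	/* ZPL_VERSION_FUID,        SPA_VERSION_FUID */
	{ 4, 15 },	/* ZPL_VERSION_USERSPACE,   SPA_VERSION_USERSPACE */
	{ 5, 24 },	/* ZPL_VERSION_SA,          SPA_VERSION_SA */
	{ 0,  0 }
};

/* Same walk as zfs_zpl_version_map() above. */
static int
zpl_version_map(int spa_version)
{
	int i, version = -1;

	for (i = 0; map[i].spa; i++)
		if (spa_version >= map[i].spa)
			version = map[i].zpl;
	return (version);
}

int
main(void)
{
	printf("%d %d %d\n",
	    zpl_version_map(9),		/* 3: FUID-era pools */
	    zpl_version_map(14),	/* still 3 */
	    zpl_version_map(28));	/* 5 */
	return (0);
}
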
Index: src/external/cddl/osnet/dist/common/zfs/zfs_comutil.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_comutil.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_comutil.h
--- src/external/cddl/osnet/dist/common/zfs/zfs_comutil.h	27 Feb 2010 22:29:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/common/zfs/zfs_comutil.h	23 Mar 2013 15:29:25 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef	_ZFS_COMUTIL_H
@@ -36,6 +36,11 @@ extern "C" {
 extern boolean_t zfs_allocatable_devs(nvlist_t *);
 extern void zpool_get_rewind_policy(nvlist_t *, zpool_rewind_policy_t *);
 
+extern int zfs_zpl_version_map(int spa_version);
+extern int zfs_spa_version_map(int zpl_version);
+#define	ZFS_NUM_LEGACY_HISTORY_EVENTS 41
+extern const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS];
+
 #ifdef	__cplusplus
 }
 #endif
Index: src/external/cddl/osnet/dist/common/zfs/zfs_deleg.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_deleg.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_deleg.c
--- src/external/cddl/osnet/dist/common/zfs/zfs_deleg.c	27 Feb 2010 22:29:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/common/zfs/zfs_deleg.c	10 Oct 2016 11:10:02 -0000
@@ -19,10 +19,14 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  */
 
+#include <sys/zfs_context.h>
+
 #if defined(_KERNEL)
 #include <sys/systm.h>
 #include <sys/sunddi.h>
@@ -34,42 +38,34 @@
 #include <libnvpair.h>
 #include <ctype.h>
 #endif
-/* XXX includes zfs_context.h, so why bother with the above? */
 #include <sys/dsl_deleg.h>
 #include "zfs_prop.h"
 #include "zfs_deleg.h"
 #include "zfs_namecheck.h"
 
-/*
- * permission table
- *
- * Keep this table in sorted order
- *
- * This table is used for displaying all permissions for
- * zfs allow
- */
-
 zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
-	{ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW},
-	{ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE },
-	{ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE },
-	{ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY },
-	{ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT },
-	{ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE },
-	{ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE },
-	{ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME },
-	{ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK },
-	{ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT },
-	{ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE },
-	{ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE },
-	{ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP },
-	{ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA },
-	{ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA },
-	{ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED },
-	{ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED },
-	{ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD },
-	{ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE },
-	{NULL, ZFS_DELEG_NOTE_NONE }
+	{ZFS_DELEG_PERM_ALLOW},
+	{ZFS_DELEG_PERM_BOOKMARK},
+	{ZFS_DELEG_PERM_CLONE},
+	{ZFS_DELEG_PERM_CREATE},
+	{ZFS_DELEG_PERM_DESTROY},
+	{ZFS_DELEG_PERM_DIFF},
+	{ZFS_DELEG_PERM_MOUNT},
+	{ZFS_DELEG_PERM_PROMOTE},
+	{ZFS_DELEG_PERM_RECEIVE},
+	{ZFS_DELEG_PERM_RENAME},
+	{ZFS_DELEG_PERM_ROLLBACK},
+	{ZFS_DELEG_PERM_SNAPSHOT},
+	{ZFS_DELEG_PERM_SHARE},
+	{ZFS_DELEG_PERM_SEND},
+	{ZFS_DELEG_PERM_USERPROP},
+	{ZFS_DELEG_PERM_USERQUOTA},
+	{ZFS_DELEG_PERM_GROUPQUOTA},
+	{ZFS_DELEG_PERM_USERUSED},
+	{ZFS_DELEG_PERM_GROUPUSED},
+	{ZFS_DELEG_PERM_HOLD},
+	{ZFS_DELEG_PERM_RELEASE},
+	{NULL}
 };
 
 static int
@@ -182,8 +178,9 @@ zfs_deleg_verify_nvlist(nvlist_t *nvp)
 			    nvpair_name(perm_name));
 			if (error)
 				return (-1);
-		} while (perm_name = nvlist_next_nvpair(perms, perm_name));
-	} while (who = nvlist_next_nvpair(nvp, who));
+		} while ((perm_name = nvlist_next_nvpair(perms, perm_name))
+		    != NULL);
+	} while ((who = nvlist_next_nvpair(nvp, who)) != NULL);
 	return (0);
 }
 
Index: src/external/cddl/osnet/dist/common/zfs/zfs_deleg.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_deleg.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_deleg.h
--- src/external/cddl/osnet/dist/common/zfs/zfs_deleg.h	27 Feb 2010 22:29:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/common/zfs/zfs_deleg.h	13 Jan 2014 02:59:29 -0000
@@ -19,8 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef	_ZFS_DELEG_H
@@ -52,6 +53,7 @@ typedef enum {
 	ZFS_DELEG_NOTE_CLONE,
 	ZFS_DELEG_NOTE_PROMOTE,
 	ZFS_DELEG_NOTE_RENAME,
+	ZFS_DELEG_NOTE_SEND,
 	ZFS_DELEG_NOTE_RECEIVE,
 	ZFS_DELEG_NOTE_ALLOW,
 	ZFS_DELEG_NOTE_USERPROP,
@@ -63,6 +65,8 @@ typedef enum {
 	ZFS_DELEG_NOTE_GROUPUSED,
 	ZFS_DELEG_NOTE_HOLD,
 	ZFS_DELEG_NOTE_RELEASE,
+	ZFS_DELEG_NOTE_DIFF,
+	ZFS_DELEG_NOTE_BOOKMARK,
 	ZFS_DELEG_NOTE_NONE
 } zfs_deleg_note_t;
 
Index: src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 zfs_fletcher.c
--- src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c	27 Feb 2010 22:29:40 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c	22 Nov 2015 17:22:21 -0000
@@ -22,6 +22,9 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
 
 /*
  * Fletcher Checksums
@@ -131,8 +134,10 @@
 #include <sys/zio.h>
 #include <sys/spa.h>
 
+/*ARGSUSED*/
 void
-fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+fletcher_2_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
@@ -148,8 +153,10 @@ fletcher_2_native(const void *buf, uint6
 	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
 }
 
+/*ARGSUSED*/
 void
-fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+fletcher_2_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
@@ -165,8 +172,10 @@ fletcher_2_byteswap(const void *buf, uin
 	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
 }
 
+/*ARGSUSED*/
 void
-fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+fletcher_4_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
@@ -182,8 +191,10 @@ fletcher_4_native(const void *buf, uint6
 	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
 }
 
+/*ARGSUSED*/
 void
-fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+fletcher_4_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
Index: src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 zfs_fletcher.h
--- src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h	27 Feb 2010 22:29:40 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h	22 Nov 2015 17:22:21 -0000
@@ -22,6 +22,9 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
 
 #ifndef	_ZFS_FLETCHER_H
 #define	_ZFS_FLETCHER_H
@@ -37,14 +40,12 @@ extern "C" {
  * fletcher checksum functions
  */
 
-void fletcher_2_native(const void *, uint64_t, zio_cksum_t *);
-void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *);
-void fletcher_4_native(const void *, uint64_t, zio_cksum_t *);
-void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *);
-void fletcher_4_incremental_native(const void *, uint64_t,
-    zio_cksum_t *);
-void fletcher_4_incremental_byteswap(const void *, uint64_t,
-    zio_cksum_t *);
+void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *);
+void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
+void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *);
+void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
+void fletcher_4_incremental_native(const void *, uint64_t, zio_cksum_t *);
+void fletcher_4_incremental_byteswap(const void *, uint64_t, zio_cksum_t *);
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.c
===================================================================
RCS file: src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.c
diff -N src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.c	30 Apr 2017 03:32:02 -0000
@@ -0,0 +1,1380 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Xin Li <delphij@FreeBSD.org>. All rights reserved.
+ * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Portions Copyright 2005, 2010, Oracle and/or its affiliates.
+ * All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/cred.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+#include <sys/nvpair.h>
+#include <sys/dsl_deleg.h>
+#include <sys/zfs_ioctl.h>
+#include "zfs_namecheck.h"
+#include "zfs_ioctl_compat.h"
+
+static int zfs_version_ioctl = ZFS_IOCVER_CURRENT;
+SYSCTL_DECL(_vfs_zfs_version);
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl,
+    0, "ZFS_IOCTL_VERSION");
+
+/*
+ * FreeBSD zfs_cmd compatibility with older binaries:
+ * appropriately remap/extend the zfs_cmd_t structure.
+ */
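+/*
+ * The "get" direction converts the old structure found at "addr" (its
+ * layout selected by "cflag") into the current zfs_cmd_t "zc";
+ * zfs_cmd_compat_put() below performs the reverse conversion.
+ */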
+void
+zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag)
+{
+	zfs_cmd_v15_t *zc_c;
+	zfs_cmd_v28_t *zc28_c;
+	zfs_cmd_deadman_t *zcdm_c;
+	zfs_cmd_zcmd_t *zcmd_c;
+	zfs_cmd_edbp_t *edbp_c;
+	zfs_cmd_resume_t *resume_c;
+	zfs_cmd_inlanes_t *inlanes_c;
+
+	switch (cflag) {
+	case ZFS_CMD_COMPAT_INLANES:
+		inlanes_c = (void *)addr;
+		/* zc */
+		strlcpy(zc->zc_name, inlanes_c->zc_name, MAXPATHLEN);
+		strlcpy(zc->zc_value, inlanes_c->zc_value, MAXPATHLEN * 2);
+		strlcpy(zc->zc_string, inlanes_c->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zc->field = inlanes_c->field
+		FIELD_COPY(zc_nvlist_src);
+		FIELD_COPY(zc_nvlist_src_size);
+		FIELD_COPY(zc_nvlist_dst);
+		FIELD_COPY(zc_nvlist_dst_size);
+		FIELD_COPY(zc_nvlist_dst_filled);
+		FIELD_COPY(zc_pad2);
+		FIELD_COPY(zc_history);
+		FIELD_COPY(zc_guid);
+		FIELD_COPY(zc_nvlist_conf);
+		FIELD_COPY(zc_nvlist_conf_size);
+		FIELD_COPY(zc_cookie);
+		FIELD_COPY(zc_objset_type);
+		FIELD_COPY(zc_perm_action);
+		FIELD_COPY(zc_history_len);
+		FIELD_COPY(zc_history_offset);
+		FIELD_COPY(zc_obj);
+		FIELD_COPY(zc_iflags);
+		FIELD_COPY(zc_share);
+		FIELD_COPY(zc_jailid);
+		FIELD_COPY(zc_objset_stats);
+		FIELD_COPY(zc_begin_record);
+		FIELD_COPY(zc_inject_record);
+		FIELD_COPY(zc_defer_destroy);
+		FIELD_COPY(zc_flags);
+		FIELD_COPY(zc_action_handle);
+		FIELD_COPY(zc_cleanup_fd);
+		FIELD_COPY(zc_simple);
+		FIELD_COPY(zc_resumable);
+		FIELD_COPY(zc_sendobj);
+		FIELD_COPY(zc_fromobj);
+		FIELD_COPY(zc_createtxg);
+		FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+		break;
+
+	case ZFS_CMD_COMPAT_RESUME:
+		resume_c = (void *)addr;
+		/* zc */
+		strlcpy(zc->zc_name, resume_c->zc_name, MAXPATHLEN);
+		strlcpy(zc->zc_value, resume_c->zc_value, MAXPATHLEN * 2);
+		strlcpy(zc->zc_string, resume_c->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zc->field = resume_c->field
+		FIELD_COPY(zc_nvlist_src);
+		FIELD_COPY(zc_nvlist_src_size);
+		FIELD_COPY(zc_nvlist_dst);
+		FIELD_COPY(zc_nvlist_dst_size);
+		FIELD_COPY(zc_nvlist_dst_filled);
+		FIELD_COPY(zc_pad2);
+		FIELD_COPY(zc_history);
+		FIELD_COPY(zc_guid);
+		FIELD_COPY(zc_nvlist_conf);
+		FIELD_COPY(zc_nvlist_conf_size);
+		FIELD_COPY(zc_cookie);
+		FIELD_COPY(zc_objset_type);
+		FIELD_COPY(zc_perm_action);
+		FIELD_COPY(zc_history_len);
+		FIELD_COPY(zc_history_offset);
+		FIELD_COPY(zc_obj);
+		FIELD_COPY(zc_iflags);
+		FIELD_COPY(zc_share);
+		FIELD_COPY(zc_jailid);
+		FIELD_COPY(zc_objset_stats);
+		FIELD_COPY(zc_begin_record);
+		FIELD_COPY(zc_inject_record.zi_objset);
+		FIELD_COPY(zc_inject_record.zi_object);
+		FIELD_COPY(zc_inject_record.zi_start);
+		FIELD_COPY(zc_inject_record.zi_end);
+		FIELD_COPY(zc_inject_record.zi_guid);
+		FIELD_COPY(zc_inject_record.zi_level);
+		FIELD_COPY(zc_inject_record.zi_error);
+		FIELD_COPY(zc_inject_record.zi_type);
+		FIELD_COPY(zc_inject_record.zi_freq);
+		FIELD_COPY(zc_inject_record.zi_failfast);
+		strlcpy(zc->zc_inject_record.zi_func,
+		    resume_c->zc_inject_record.zi_func, MAXNAMELEN);
+		FIELD_COPY(zc_inject_record.zi_iotype);
+		FIELD_COPY(zc_inject_record.zi_duration);
+		FIELD_COPY(zc_inject_record.zi_timer);
+		zc->zc_inject_record.zi_nlanes = 1;
+		FIELD_COPY(zc_inject_record.zi_cmd);
+		FIELD_COPY(zc_inject_record.zi_pad);
+		FIELD_COPY(zc_defer_destroy);
+		FIELD_COPY(zc_flags);
+		FIELD_COPY(zc_action_handle);
+		FIELD_COPY(zc_cleanup_fd);
+		FIELD_COPY(zc_simple);
+		FIELD_COPY(zc_resumable);
+		FIELD_COPY(zc_sendobj);
+		FIELD_COPY(zc_fromobj);
+		FIELD_COPY(zc_createtxg);
+		FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+		break;
+
+	case ZFS_CMD_COMPAT_EDBP:
+		edbp_c = (void *)addr;
+		/* zc */
+		strlcpy(zc->zc_name, edbp_c->zc_name, MAXPATHLEN);
+		strlcpy(zc->zc_value, edbp_c->zc_value, MAXPATHLEN * 2);
+		strlcpy(zc->zc_string, edbp_c->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zc->field = edbp_c->field
+		FIELD_COPY(zc_nvlist_src);
+		FIELD_COPY(zc_nvlist_src_size);
+		FIELD_COPY(zc_nvlist_dst);
+		FIELD_COPY(zc_nvlist_dst_size);
+		FIELD_COPY(zc_nvlist_dst_filled);
+		FIELD_COPY(zc_pad2);
+		FIELD_COPY(zc_history);
+		FIELD_COPY(zc_guid);
+		FIELD_COPY(zc_nvlist_conf);
+		FIELD_COPY(zc_nvlist_conf_size);
+		FIELD_COPY(zc_cookie);
+		FIELD_COPY(zc_objset_type);
+		FIELD_COPY(zc_perm_action);
+		FIELD_COPY(zc_history_len);
+		FIELD_COPY(zc_history_offset);
+		FIELD_COPY(zc_obj);
+		FIELD_COPY(zc_iflags);
+		FIELD_COPY(zc_share);
+		FIELD_COPY(zc_jailid);
+		FIELD_COPY(zc_objset_stats);
+		zc->zc_begin_record.drr_u.drr_begin = edbp_c->zc_begin_record;
+		FIELD_COPY(zc_inject_record.zi_objset);
+		FIELD_COPY(zc_inject_record.zi_object);
+		FIELD_COPY(zc_inject_record.zi_start);
+		FIELD_COPY(zc_inject_record.zi_end);
+		FIELD_COPY(zc_inject_record.zi_guid);
+		FIELD_COPY(zc_inject_record.zi_level);
+		FIELD_COPY(zc_inject_record.zi_error);
+		FIELD_COPY(zc_inject_record.zi_type);
+		FIELD_COPY(zc_inject_record.zi_freq);
+		FIELD_COPY(zc_inject_record.zi_failfast);
+		strlcpy(zc->zc_inject_record.zi_func,
+		    edbp_c->zc_inject_record.zi_func, MAXNAMELEN);
+		FIELD_COPY(zc_inject_record.zi_iotype);
+		FIELD_COPY(zc_inject_record.zi_duration);
+		FIELD_COPY(zc_inject_record.zi_timer);
+		zc->zc_inject_record.zi_nlanes = 1;
+		FIELD_COPY(zc_inject_record.zi_cmd);
+		FIELD_COPY(zc_inject_record.zi_pad);
+		FIELD_COPY(zc_defer_destroy);
+		FIELD_COPY(zc_flags);
+		FIELD_COPY(zc_action_handle);
+		FIELD_COPY(zc_cleanup_fd);
+		FIELD_COPY(zc_simple);
+		zc->zc_resumable = B_FALSE;
+		FIELD_COPY(zc_sendobj);
+		FIELD_COPY(zc_fromobj);
+		FIELD_COPY(zc_createtxg);
+		FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+		break;
+
+	case ZFS_CMD_COMPAT_ZCMD:
+		zcmd_c = (void *)addr;
+		/* zc */
+		strlcpy(zc->zc_name, zcmd_c->zc_name, MAXPATHLEN);
+		strlcpy(zc->zc_value, zcmd_c->zc_value, MAXPATHLEN * 2);
+		strlcpy(zc->zc_string, zcmd_c->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zc->field = zcmd_c->field
+		FIELD_COPY(zc_nvlist_src);
+		FIELD_COPY(zc_nvlist_src_size);
+		FIELD_COPY(zc_nvlist_dst);
+		FIELD_COPY(zc_nvlist_dst_size);
+		FIELD_COPY(zc_nvlist_dst_filled);
+		FIELD_COPY(zc_pad2);
+		FIELD_COPY(zc_history);
+		FIELD_COPY(zc_guid);
+		FIELD_COPY(zc_nvlist_conf);
+		FIELD_COPY(zc_nvlist_conf_size);
+		FIELD_COPY(zc_cookie);
+		FIELD_COPY(zc_objset_type);
+		FIELD_COPY(zc_perm_action);
+		FIELD_COPY(zc_history_len);
+		FIELD_COPY(zc_history_offset);
+		FIELD_COPY(zc_obj);
+		FIELD_COPY(zc_iflags);
+		FIELD_COPY(zc_share);
+		FIELD_COPY(zc_jailid);
+		FIELD_COPY(zc_objset_stats);
+		zc->zc_begin_record.drr_u.drr_begin = zcmd_c->zc_begin_record;
+		FIELD_COPY(zc_inject_record.zi_objset);
+		FIELD_COPY(zc_inject_record.zi_object);
+		FIELD_COPY(zc_inject_record.zi_start);
+		FIELD_COPY(zc_inject_record.zi_end);
+		FIELD_COPY(zc_inject_record.zi_guid);
+		FIELD_COPY(zc_inject_record.zi_level);
+		FIELD_COPY(zc_inject_record.zi_error);
+		FIELD_COPY(zc_inject_record.zi_type);
+		FIELD_COPY(zc_inject_record.zi_freq);
+		FIELD_COPY(zc_inject_record.zi_failfast);
+		strlcpy(zc->zc_inject_record.zi_func,
+		    zcmd_c->zc_inject_record.zi_func, MAXNAMELEN);
+		FIELD_COPY(zc_inject_record.zi_iotype);
+		FIELD_COPY(zc_inject_record.zi_duration);
+		FIELD_COPY(zc_inject_record.zi_timer);
+		zc->zc_inject_record.zi_nlanes = 1;
+		FIELD_COPY(zc_inject_record.zi_cmd);
+		FIELD_COPY(zc_inject_record.zi_pad);
+
+		/* boolean_t -> uint32_t */
+		zc->zc_defer_destroy = (uint32_t)(zcmd_c->zc_defer_destroy);
+		zc->zc_flags = 0;
+
+		FIELD_COPY(zc_action_handle);
+		FIELD_COPY(zc_cleanup_fd);
+		FIELD_COPY(zc_simple);
+		zc->zc_resumable = B_FALSE;
+		FIELD_COPY(zc_sendobj);
+		FIELD_COPY(zc_fromobj);
+		FIELD_COPY(zc_createtxg);
+		FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+
+		break;
+
+	case ZFS_CMD_COMPAT_DEADMAN:
+		zcdm_c = (void *)addr;
+		/* zc */
+		strlcpy(zc->zc_name, zcdm_c->zc_name, MAXPATHLEN);
+		strlcpy(zc->zc_value, zcdm_c->zc_value, MAXPATHLEN * 2);
+		strlcpy(zc->zc_string, zcdm_c->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zc->field = zcdm_c->field
+		zc->zc_guid = zcdm_c->zc_guid;
+		zc->zc_nvlist_conf = zcdm_c->zc_nvlist_conf;
+		zc->zc_nvlist_conf_size = zcdm_c->zc_nvlist_conf_size;
+		zc->zc_nvlist_src = zcdm_c->zc_nvlist_src;
+		zc->zc_nvlist_src_size = zcdm_c->zc_nvlist_src_size;
+		zc->zc_nvlist_dst = zcdm_c->zc_nvlist_dst;
+		zc->zc_nvlist_dst_size = zcdm_c->zc_nvlist_dst_size;
+		zc->zc_cookie = zcdm_c->zc_cookie;
+		zc->zc_objset_type = zcdm_c->zc_objset_type;
+		zc->zc_perm_action = zcdm_c->zc_perm_action;
+		zc->zc_history = zcdm_c->zc_history;
+		zc->zc_history_len = zcdm_c->zc_history_len;
+		zc->zc_history_offset = zcdm_c->zc_history_offset;
+		zc->zc_obj = zcdm_c->zc_obj;
+		zc->zc_iflags = zcdm_c->zc_iflags;
+		zc->zc_share = zcdm_c->zc_share;
+		zc->zc_jailid = zcdm_c->zc_jailid;
+		zc->zc_objset_stats = zcdm_c->zc_objset_stats;
+		zc->zc_begin_record.drr_u.drr_begin = zcdm_c->zc_begin_record;
+		zc->zc_defer_destroy = zcdm_c->zc_defer_destroy;
+		(void)zcdm_c->zc_temphold;
+		zc->zc_action_handle = zcdm_c->zc_action_handle;
+		zc->zc_cleanup_fd = zcdm_c->zc_cleanup_fd;
+		zc->zc_simple = zcdm_c->zc_simple;
+		zc->zc_resumable = B_FALSE;
+		zc->zc_sendobj = zcdm_c->zc_sendobj;
+		zc->zc_fromobj = zcdm_c->zc_fromobj;
+		zc->zc_createtxg = zcdm_c->zc_createtxg;
+		zc->zc_stat = zcdm_c->zc_stat;
+		FIELD_COPY(zc_inject_record.zi_objset);
+		FIELD_COPY(zc_inject_record.zi_object);
+		FIELD_COPY(zc_inject_record.zi_start);
+		FIELD_COPY(zc_inject_record.zi_end);
+		FIELD_COPY(zc_inject_record.zi_guid);
+		FIELD_COPY(zc_inject_record.zi_level);
+		FIELD_COPY(zc_inject_record.zi_error);
+		FIELD_COPY(zc_inject_record.zi_type);
+		FIELD_COPY(zc_inject_record.zi_freq);
+		FIELD_COPY(zc_inject_record.zi_failfast);
+		strlcpy(zc->zc_inject_record.zi_func,
+		    zcdm_c->zc_inject_record.zi_func, MAXNAMELEN);
+		FIELD_COPY(zc_inject_record.zi_iotype);
+		FIELD_COPY(zc_inject_record.zi_duration);
+		FIELD_COPY(zc_inject_record.zi_timer);
+		zc->zc_inject_record.zi_nlanes = 1;
+		FIELD_COPY(zc_inject_record.zi_cmd);
+		FIELD_COPY(zc_inject_record.zi_pad);
+
+		/* we always assume zc_nvlist_dst_filled is true */
+		zc->zc_nvlist_dst_filled = B_TRUE;
+#undef FIELD_COPY
+		break;
+
+	case ZFS_CMD_COMPAT_V28:
+		zc28_c = (void *)addr;
+
+		/* zc */
+		strlcpy(zc->zc_name, zc28_c->zc_name, MAXPATHLEN);
+		strlcpy(zc->zc_value, zc28_c->zc_value, MAXPATHLEN * 2);
+		strlcpy(zc->zc_string, zc28_c->zc_string, MAXPATHLEN);
+		zc->zc_guid = zc28_c->zc_guid;
+		zc->zc_nvlist_conf = zc28_c->zc_nvlist_conf;
+		zc->zc_nvlist_conf_size = zc28_c->zc_nvlist_conf_size;
+		zc->zc_nvlist_src = zc28_c->zc_nvlist_src;
+		zc->zc_nvlist_src_size = zc28_c->zc_nvlist_src_size;
+		zc->zc_nvlist_dst = zc28_c->zc_nvlist_dst;
+		zc->zc_nvlist_dst_size = zc28_c->zc_nvlist_dst_size;
+		zc->zc_cookie = zc28_c->zc_cookie;
+		zc->zc_objset_type = zc28_c->zc_objset_type;
+		zc->zc_perm_action = zc28_c->zc_perm_action;
+		zc->zc_history = zc28_c->zc_history;
+		zc->zc_history_len = zc28_c->zc_history_len;
+		zc->zc_history_offset = zc28_c->zc_history_offset;
+		zc->zc_obj = zc28_c->zc_obj;
+		zc->zc_iflags = zc28_c->zc_iflags;
+		zc->zc_share = zc28_c->zc_share;
+		zc->zc_jailid = zc28_c->zc_jailid;
+		zc->zc_objset_stats = zc28_c->zc_objset_stats;
+		zc->zc_begin_record.drr_u.drr_begin = zc28_c->zc_begin_record;
+		zc->zc_defer_destroy = zc28_c->zc_defer_destroy;
+		(void)zc28_c->zc_temphold;
+		zc->zc_action_handle = zc28_c->zc_action_handle;
+		zc->zc_cleanup_fd = zc28_c->zc_cleanup_fd;
+		zc->zc_simple = zc28_c->zc_simple;
+		zc->zc_resumable = B_FALSE;
+		zc->zc_sendobj = zc28_c->zc_sendobj;
+		zc->zc_fromobj = zc28_c->zc_fromobj;
+		zc->zc_createtxg = zc28_c->zc_createtxg;
+		zc->zc_stat = zc28_c->zc_stat;
+
+		/* zc->zc_inject_record */
+		zc->zc_inject_record.zi_objset =
+		    zc28_c->zc_inject_record.zi_objset;
+		zc->zc_inject_record.zi_object =
+		    zc28_c->zc_inject_record.zi_object;
+		zc->zc_inject_record.zi_start =
+		    zc28_c->zc_inject_record.zi_start;
+		zc->zc_inject_record.zi_end =
+		    zc28_c->zc_inject_record.zi_end;
+		zc->zc_inject_record.zi_guid =
+		    zc28_c->zc_inject_record.zi_guid;
+		zc->zc_inject_record.zi_level =
+		    zc28_c->zc_inject_record.zi_level;
+		zc->zc_inject_record.zi_error =
+		    zc28_c->zc_inject_record.zi_error;
+		zc->zc_inject_record.zi_type =
+		    zc28_c->zc_inject_record.zi_type;
+		zc->zc_inject_record.zi_freq =
+		    zc28_c->zc_inject_record.zi_freq;
+		zc->zc_inject_record.zi_failfast =
+		    zc28_c->zc_inject_record.zi_failfast;
+		strlcpy(zc->zc_inject_record.zi_func,
+		    zc28_c->zc_inject_record.zi_func, MAXNAMELEN);
+		zc->zc_inject_record.zi_iotype =
+		    zc28_c->zc_inject_record.zi_iotype;
+		zc->zc_inject_record.zi_duration =
+		    zc28_c->zc_inject_record.zi_duration;
+		zc->zc_inject_record.zi_timer =
+		    zc28_c->zc_inject_record.zi_timer;
+		zc->zc_inject_record.zi_nlanes = 1;
+		zc->zc_inject_record.zi_cmd = ZINJECT_UNINITIALIZED;
+		zc->zc_inject_record.zi_pad = 0;
+		break;
+
+	case ZFS_CMD_COMPAT_V15:
+		zc_c = (void *)addr;
+
+		/* zc */
+		strlcpy(zc->zc_name, zc_c->zc_name, MAXPATHLEN);
+		strlcpy(zc->zc_value, zc_c->zc_value, MAXPATHLEN);
+		strlcpy(zc->zc_string, zc_c->zc_string, MAXPATHLEN);
+		zc->zc_guid = zc_c->zc_guid;
+		zc->zc_nvlist_conf = zc_c->zc_nvlist_conf;
+		zc->zc_nvlist_conf_size = zc_c->zc_nvlist_conf_size;
+		zc->zc_nvlist_src = zc_c->zc_nvlist_src;
+		zc->zc_nvlist_src_size = zc_c->zc_nvlist_src_size;
+		zc->zc_nvlist_dst = zc_c->zc_nvlist_dst;
+		zc->zc_nvlist_dst_size = zc_c->zc_nvlist_dst_size;
+		zc->zc_cookie = zc_c->zc_cookie;
+		zc->zc_objset_type = zc_c->zc_objset_type;
+		zc->zc_perm_action = zc_c->zc_perm_action;
+		zc->zc_history = zc_c->zc_history;
+		zc->zc_history_len = zc_c->zc_history_len;
+		zc->zc_history_offset = zc_c->zc_history_offset;
+		zc->zc_obj = zc_c->zc_obj;
+		zc->zc_share = zc_c->zc_share;
+		zc->zc_jailid = zc_c->zc_jailid;
+		zc->zc_objset_stats = zc_c->zc_objset_stats;
+		zc->zc_begin_record.drr_u.drr_begin = zc_c->zc_begin_record;
+
+		/* zc->zc_inject_record */
+		zc->zc_inject_record.zi_objset =
+		    zc_c->zc_inject_record.zi_objset;
+		zc->zc_inject_record.zi_object =
+		    zc_c->zc_inject_record.zi_object;
+		zc->zc_inject_record.zi_start =
+		    zc_c->zc_inject_record.zi_start;
+		zc->zc_inject_record.zi_end =
+		    zc_c->zc_inject_record.zi_end;
+		zc->zc_inject_record.zi_guid =
+		    zc_c->zc_inject_record.zi_guid;
+		zc->zc_inject_record.zi_level =
+		    zc_c->zc_inject_record.zi_level;
+		zc->zc_inject_record.zi_error =
+		    zc_c->zc_inject_record.zi_error;
+		zc->zc_inject_record.zi_type =
+		    zc_c->zc_inject_record.zi_type;
+		zc->zc_inject_record.zi_freq =
+		    zc_c->zc_inject_record.zi_freq;
+		zc->zc_inject_record.zi_failfast =
+		    zc_c->zc_inject_record.zi_failfast;
+		break;
+	}
+}
+
+void
+zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request,
+    const int cflag)
+{
+	zfs_cmd_v15_t *zc_c;
+	zfs_cmd_v28_t *zc28_c;
+	zfs_cmd_deadman_t *zcdm_c;
+	zfs_cmd_zcmd_t *zcmd_c;
+	zfs_cmd_edbp_t *edbp_c;
+	zfs_cmd_resume_t *resume_c;
+	zfs_cmd_inlanes_t *inlanes_c;
+
+	switch (cflag) {
+	case ZFS_CMD_COMPAT_INLANES:
+		inlanes_c = (void *)addr;
+		strlcpy(inlanes_c->zc_name, zc->zc_name, MAXPATHLEN);
+		strlcpy(inlanes_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
+		strlcpy(inlanes_c->zc_string, zc->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) inlanes_c->field = zc->field
+		FIELD_COPY(zc_nvlist_src);
+		FIELD_COPY(zc_nvlist_src_size);
+		FIELD_COPY(zc_nvlist_dst);
+		FIELD_COPY(zc_nvlist_dst_size);
+		FIELD_COPY(zc_nvlist_dst_filled);
+		FIELD_COPY(zc_pad2);
+		FIELD_COPY(zc_history);
+		FIELD_COPY(zc_guid);
+		FIELD_COPY(zc_nvlist_conf);
+		FIELD_COPY(zc_nvlist_conf_size);
+		FIELD_COPY(zc_cookie);
+		FIELD_COPY(zc_objset_type);
+		FIELD_COPY(zc_perm_action);
+		FIELD_COPY(zc_history_len);
+		FIELD_COPY(zc_history_offset);
+		FIELD_COPY(zc_obj);
+		FIELD_COPY(zc_iflags);
+		FIELD_COPY(zc_share);
+		FIELD_COPY(zc_jailid);
+		FIELD_COPY(zc_objset_stats);
+		FIELD_COPY(zc_begin_record);
+		FIELD_COPY(zc_inject_record);
+		FIELD_COPY(zc_defer_destroy);
+		FIELD_COPY(zc_flags);
+		FIELD_COPY(zc_action_handle);
+		FIELD_COPY(zc_cleanup_fd);
+		FIELD_COPY(zc_simple);
+		FIELD_COPY(zc_sendobj);
+		FIELD_COPY(zc_fromobj);
+		FIELD_COPY(zc_createtxg);
+		FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+		break;
+
+	case ZFS_CMD_COMPAT_RESUME:
+		resume_c = (void *)addr;
+		strlcpy(resume_c->zc_name, zc->zc_name, MAXPATHLEN);
+		strlcpy(resume_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
+		strlcpy(resume_c->zc_string, zc->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) resume_c->field = zc->field
+		FIELD_COPY(zc_nvlist_src);
+		FIELD_COPY(zc_nvlist_src_size);
+		FIELD_COPY(zc_nvlist_dst);
+		FIELD_COPY(zc_nvlist_dst_size);
+		FIELD_COPY(zc_nvlist_dst_filled);
+		FIELD_COPY(zc_pad2);
+		FIELD_COPY(zc_history);
+		FIELD_COPY(zc_guid);
+		FIELD_COPY(zc_nvlist_conf);
+		FIELD_COPY(zc_nvlist_conf_size);
+		FIELD_COPY(zc_cookie);
+		FIELD_COPY(zc_objset_type);
+		FIELD_COPY(zc_perm_action);
+		FIELD_COPY(zc_history_len);
+		FIELD_COPY(zc_history_offset);
+		FIELD_COPY(zc_obj);
+		FIELD_COPY(zc_iflags);
+		FIELD_COPY(zc_share);
+		FIELD_COPY(zc_jailid);
+		FIELD_COPY(zc_objset_stats);
+		FIELD_COPY(zc_begin_record);
+		FIELD_COPY(zc_inject_record.zi_objset);
+		FIELD_COPY(zc_inject_record.zi_object);
+		FIELD_COPY(zc_inject_record.zi_start);
+		FIELD_COPY(zc_inject_record.zi_end);
+		FIELD_COPY(zc_inject_record.zi_guid);
+		FIELD_COPY(zc_inject_record.zi_level);
+		FIELD_COPY(zc_inject_record.zi_error);
+		FIELD_COPY(zc_inject_record.zi_type);
+		FIELD_COPY(zc_inject_record.zi_freq);
+		FIELD_COPY(zc_inject_record.zi_failfast);
+		strlcpy(resume_c->zc_inject_record.zi_func,
+		    zc->zc_inject_record.zi_func, MAXNAMELEN);
+		FIELD_COPY(zc_inject_record.zi_iotype);
+		FIELD_COPY(zc_inject_record.zi_duration);
+		FIELD_COPY(zc_inject_record.zi_timer);
+		FIELD_COPY(zc_inject_record.zi_cmd);
+		FIELD_COPY(zc_inject_record.zi_pad);
+		FIELD_COPY(zc_defer_destroy);
+		FIELD_COPY(zc_flags);
+		FIELD_COPY(zc_action_handle);
+		FIELD_COPY(zc_cleanup_fd);
+		FIELD_COPY(zc_simple);
+		FIELD_COPY(zc_sendobj);
+		FIELD_COPY(zc_fromobj);
+		FIELD_COPY(zc_createtxg);
+		FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+		break;
+
+	case ZFS_CMD_COMPAT_EDBP:
+		edbp_c = (void *)addr;
+		strlcpy(edbp_c->zc_name, zc->zc_name, MAXPATHLEN);
+		strlcpy(edbp_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
+		strlcpy(edbp_c->zc_string, zc->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) edbp_c->field = zc->field
+		FIELD_COPY(zc_nvlist_src);
+		FIELD_COPY(zc_nvlist_src_size);
+		FIELD_COPY(zc_nvlist_dst);
+		FIELD_COPY(zc_nvlist_dst_size);
+		FIELD_COPY(zc_nvlist_dst_filled);
+		FIELD_COPY(zc_pad2);
+		FIELD_COPY(zc_history);
+		FIELD_COPY(zc_guid);
+		FIELD_COPY(zc_nvlist_conf);
+		FIELD_COPY(zc_nvlist_conf_size);
+		FIELD_COPY(zc_cookie);
+		FIELD_COPY(zc_objset_type);
+		FIELD_COPY(zc_perm_action);
+		FIELD_COPY(zc_history_len);
+		FIELD_COPY(zc_history_offset);
+		FIELD_COPY(zc_obj);
+		FIELD_COPY(zc_iflags);
+		FIELD_COPY(zc_share);
+		FIELD_COPY(zc_jailid);
+		FIELD_COPY(zc_objset_stats);
+		edbp_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
+		FIELD_COPY(zc_inject_record.zi_objset);
+		FIELD_COPY(zc_inject_record.zi_object);
+		FIELD_COPY(zc_inject_record.zi_start);
+		FIELD_COPY(zc_inject_record.zi_end);
+		FIELD_COPY(zc_inject_record.zi_guid);
+		FIELD_COPY(zc_inject_record.zi_level);
+		FIELD_COPY(zc_inject_record.zi_error);
+		FIELD_COPY(zc_inject_record.zi_type);
+		FIELD_COPY(zc_inject_record.zi_freq);
+		FIELD_COPY(zc_inject_record.zi_failfast);
+		strlcpy(edbp_c->zc_inject_record.zi_func,
+		    zc->zc_inject_record.zi_func, MAXNAMELEN);
+		FIELD_COPY(zc_inject_record.zi_iotype);
+		FIELD_COPY(zc_inject_record.zi_duration);
+		FIELD_COPY(zc_inject_record.zi_timer);
+		FIELD_COPY(zc_inject_record.zi_cmd);
+		FIELD_COPY(zc_inject_record.zi_pad);
+		FIELD_COPY(zc_defer_destroy);
+		FIELD_COPY(zc_flags);
+		FIELD_COPY(zc_action_handle);
+		FIELD_COPY(zc_cleanup_fd);
+		FIELD_COPY(zc_simple);
+		FIELD_COPY(zc_sendobj);
+		FIELD_COPY(zc_fromobj);
+		FIELD_COPY(zc_createtxg);
+		FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+		break;
+
+	case ZFS_CMD_COMPAT_ZCMD:
+		zcmd_c = (void *)addr;
+		/* zc */
+		strlcpy(zcmd_c->zc_name, zc->zc_name, MAXPATHLEN);
+		strlcpy(zcmd_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
+		strlcpy(zcmd_c->zc_string, zc->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zcmd_c->field = zc->field
+		FIELD_COPY(zc_nvlist_src);
+		FIELD_COPY(zc_nvlist_src_size);
+		FIELD_COPY(zc_nvlist_dst);
+		FIELD_COPY(zc_nvlist_dst_size);
+		FIELD_COPY(zc_nvlist_dst_filled);
+		FIELD_COPY(zc_pad2);
+		FIELD_COPY(zc_history);
+		FIELD_COPY(zc_guid);
+		FIELD_COPY(zc_nvlist_conf);
+		FIELD_COPY(zc_nvlist_conf_size);
+		FIELD_COPY(zc_cookie);
+		FIELD_COPY(zc_objset_type);
+		FIELD_COPY(zc_perm_action);
+		FIELD_COPY(zc_history_len);
+		FIELD_COPY(zc_history_offset);
+		FIELD_COPY(zc_obj);
+		FIELD_COPY(zc_iflags);
+		FIELD_COPY(zc_share);
+		FIELD_COPY(zc_jailid);
+		FIELD_COPY(zc_objset_stats);
+		zcmd_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
+		FIELD_COPY(zc_inject_record.zi_objset);
+		FIELD_COPY(zc_inject_record.zi_object);
+		FIELD_COPY(zc_inject_record.zi_start);
+		FIELD_COPY(zc_inject_record.zi_end);
+		FIELD_COPY(zc_inject_record.zi_guid);
+		FIELD_COPY(zc_inject_record.zi_level);
+		FIELD_COPY(zc_inject_record.zi_error);
+		FIELD_COPY(zc_inject_record.zi_type);
+		FIELD_COPY(zc_inject_record.zi_freq);
+		FIELD_COPY(zc_inject_record.zi_failfast);
+		strlcpy(zcmd_c->zc_inject_record.zi_func,
+		    zc->zc_inject_record.zi_func, MAXNAMELEN);
+		FIELD_COPY(zc_inject_record.zi_iotype);
+		FIELD_COPY(zc_inject_record.zi_duration);
+		FIELD_COPY(zc_inject_record.zi_timer);
+		FIELD_COPY(zc_inject_record.zi_cmd);
+		FIELD_COPY(zc_inject_record.zi_pad);
+
+		/* boolean_t -> uint32_t */
+		zcmd_c->zc_defer_destroy = (uint32_t)(zc->zc_defer_destroy);
+		zcmd_c->zc_temphold = 0;
+
+		FIELD_COPY(zc_action_handle);
+		FIELD_COPY(zc_cleanup_fd);
+		FIELD_COPY(zc_simple);
+		FIELD_COPY(zc_sendobj);
+		FIELD_COPY(zc_fromobj);
+		FIELD_COPY(zc_createtxg);
+		FIELD_COPY(zc_stat);
+#undef FIELD_COPY
+
+		break;
+
+	case ZFS_CMD_COMPAT_DEADMAN:
+		zcdm_c = (void *)addr;
+
+		strlcpy(zcdm_c->zc_name, zc->zc_name, MAXPATHLEN);
+		strlcpy(zcdm_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
+		strlcpy(zcdm_c->zc_string, zc->zc_string, MAXPATHLEN);
+
+#define FIELD_COPY(field) zcdm_c->field = zc->field
+		zcdm_c->zc_guid = zc->zc_guid;
+		zcdm_c->zc_nvlist_conf = zc->zc_nvlist_conf;
+		zcdm_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size;
+		zcdm_c->zc_nvlist_src = zc->zc_nvlist_src;
+		zcdm_c->zc_nvlist_src_size = zc->zc_nvlist_src_size;
+		zcdm_c->zc_nvlist_dst = zc->zc_nvlist_dst;
+		zcdm_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size;
+		zcdm_c->zc_cookie = zc->zc_cookie;
+		zcdm_c->zc_objset_type = zc->zc_objset_type;
+		zcdm_c->zc_perm_action = zc->zc_perm_action;
+		zcdm_c->zc_history = zc->zc_history;
+		zcdm_c->zc_history_len = zc->zc_history_len;
+		zcdm_c->zc_history_offset = zc->zc_history_offset;
+		zcdm_c->zc_obj = zc->zc_obj;
+		zcdm_c->zc_iflags = zc->zc_iflags;
+		zcdm_c->zc_share = zc->zc_share;
+		zcdm_c->zc_jailid = zc->zc_jailid;
+		zcdm_c->zc_objset_stats = zc->zc_objset_stats;
+		zcdm_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
+		zcdm_c->zc_defer_destroy = zc->zc_defer_destroy;
+		zcdm_c->zc_temphold = 0;
+		zcdm_c->zc_action_handle = zc->zc_action_handle;
+		zcdm_c->zc_cleanup_fd = zc->zc_cleanup_fd;
+		zcdm_c->zc_simple = zc->zc_simple;
+		zcdm_c->zc_sendobj = zc->zc_sendobj;
+		zcdm_c->zc_fromobj = zc->zc_fromobj;
+		zcdm_c->zc_createtxg = zc->zc_createtxg;
+		zcdm_c->zc_stat = zc->zc_stat;
+		FIELD_COPY(zc_inject_record.zi_objset);
+		FIELD_COPY(zc_inject_record.zi_object);
+		FIELD_COPY(zc_inject_record.zi_start);
+		FIELD_COPY(zc_inject_record.zi_end);
+		FIELD_COPY(zc_inject_record.zi_guid);
+		FIELD_COPY(zc_inject_record.zi_level);
+		FIELD_COPY(zc_inject_record.zi_error);
+		FIELD_COPY(zc_inject_record.zi_type);
+		FIELD_COPY(zc_inject_record.zi_freq);
+		FIELD_COPY(zc_inject_record.zi_failfast);
+		strlcpy(zcdm_c->zc_inject_record.zi_func,
+		    zc->zc_inject_record.zi_func, MAXNAMELEN);
+		FIELD_COPY(zc_inject_record.zi_iotype);
+		FIELD_COPY(zc_inject_record.zi_duration);
+		FIELD_COPY(zc_inject_record.zi_timer);
+		FIELD_COPY(zc_inject_record.zi_cmd);
+		FIELD_COPY(zc_inject_record.zi_pad);
+#undef FIELD_COPY
+#ifndef _KERNEL
+		if (request == ZFS_IOC_RECV)
+			strlcpy(zcdm_c->zc_top_ds,
+			    zc->zc_value + strlen(zc->zc_value) + 1,
+			    (MAXPATHLEN * 2) - strlen(zc->zc_value) - 1);
+#endif
+		break;
+
+	case ZFS_CMD_COMPAT_V28:
+		zc28_c = (void *)addr;
+
+		strlcpy(zc28_c->zc_name, zc->zc_name, MAXPATHLEN);
+		strlcpy(zc28_c->zc_value, zc->zc_value, MAXPATHLEN * 2);
+		strlcpy(zc28_c->zc_string, zc->zc_string, MAXPATHLEN);
+		zc28_c->zc_guid = zc->zc_guid;
+		zc28_c->zc_nvlist_conf = zc->zc_nvlist_conf;
+		zc28_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size;
+		zc28_c->zc_nvlist_src = zc->zc_nvlist_src;
+		zc28_c->zc_nvlist_src_size = zc->zc_nvlist_src_size;
+		zc28_c->zc_nvlist_dst = zc->zc_nvlist_dst;
+		zc28_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size;
+		zc28_c->zc_cookie = zc->zc_cookie;
+		zc28_c->zc_objset_type = zc->zc_objset_type;
+		zc28_c->zc_perm_action = zc->zc_perm_action;
+		zc28_c->zc_history = zc->zc_history;
+		zc28_c->zc_history_len = zc->zc_history_len;
+		zc28_c->zc_history_offset = zc->zc_history_offset;
+		zc28_c->zc_obj = zc->zc_obj;
+		zc28_c->zc_iflags = zc->zc_iflags;
+		zc28_c->zc_share = zc->zc_share;
+		zc28_c->zc_jailid = zc->zc_jailid;
+		zc28_c->zc_objset_stats = zc->zc_objset_stats;
+		zc28_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
+		zc28_c->zc_defer_destroy = zc->zc_defer_destroy;
+		zc28_c->zc_temphold = 0;
+		zc28_c->zc_action_handle = zc->zc_action_handle;
+		zc28_c->zc_cleanup_fd = zc->zc_cleanup_fd;
+		zc28_c->zc_simple = zc->zc_simple;
+		zc28_c->zc_sendobj = zc->zc_sendobj;
+		zc28_c->zc_fromobj = zc->zc_fromobj;
+		zc28_c->zc_createtxg = zc->zc_createtxg;
+		zc28_c->zc_stat = zc->zc_stat;
+#ifndef _KERNEL
+		if (request == ZFS_IOC_RECV)
+			strlcpy(zc28_c->zc_top_ds,
+			    zc->zc_value + strlen(zc->zc_value) + 1,
+			    MAXPATHLEN * 2 - strlen(zc->zc_value) - 1);
+#endif
+		/* zc_inject_record */
+		zc28_c->zc_inject_record.zi_objset =
+		    zc->zc_inject_record.zi_objset;
+		zc28_c->zc_inject_record.zi_object =
+		    zc->zc_inject_record.zi_object;
+		zc28_c->zc_inject_record.zi_start =
+		    zc->zc_inject_record.zi_start;
+		zc28_c->zc_inject_record.zi_end =
+		    zc->zc_inject_record.zi_end;
+		zc28_c->zc_inject_record.zi_guid =
+		    zc->zc_inject_record.zi_guid;
+		zc28_c->zc_inject_record.zi_level =
+		    zc->zc_inject_record.zi_level;
+		zc28_c->zc_inject_record.zi_error =
+		    zc->zc_inject_record.zi_error;
+		zc28_c->zc_inject_record.zi_type =
+		    zc->zc_inject_record.zi_type;
+		zc28_c->zc_inject_record.zi_freq =
+		    zc->zc_inject_record.zi_freq;
+		zc28_c->zc_inject_record.zi_failfast =
+		    zc->zc_inject_record.zi_failfast;
+		strlcpy(zc28_c->zc_inject_record.zi_func,
+		    zc->zc_inject_record.zi_func, MAXNAMELEN);
+		zc28_c->zc_inject_record.zi_iotype =
+		    zc->zc_inject_record.zi_iotype;
+		zc28_c->zc_inject_record.zi_duration =
+		    zc->zc_inject_record.zi_duration;
+		zc28_c->zc_inject_record.zi_timer =
+		    zc->zc_inject_record.zi_timer;
+		break;
+
+	case ZFS_CMD_COMPAT_V15:
+		zc_c = (void *)addr;
+
+		/* zc */
+		strlcpy(zc_c->zc_name, zc->zc_name, MAXPATHLEN);
+		strlcpy(zc_c->zc_value, zc->zc_value, MAXPATHLEN);
+		strlcpy(zc_c->zc_string, zc->zc_string, MAXPATHLEN);
+		zc_c->zc_guid = zc->zc_guid;
+		zc_c->zc_nvlist_conf = zc->zc_nvlist_conf;
+		zc_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size;
+		zc_c->zc_nvlist_src = zc->zc_nvlist_src;
+		zc_c->zc_nvlist_src_size = zc->zc_nvlist_src_size;
+		zc_c->zc_nvlist_dst = zc->zc_nvlist_dst;
+		zc_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size;
+		zc_c->zc_cookie = zc->zc_cookie;
+		zc_c->zc_objset_type = zc->zc_objset_type;
+		zc_c->zc_perm_action = zc->zc_perm_action;
+		zc_c->zc_history = zc->zc_history;
+		zc_c->zc_history_len = zc->zc_history_len;
+		zc_c->zc_history_offset = zc->zc_history_offset;
+		zc_c->zc_obj = zc->zc_obj;
+		zc_c->zc_share = zc->zc_share;
+		zc_c->zc_jailid = zc->zc_jailid;
+		zc_c->zc_objset_stats = zc->zc_objset_stats;
+		zc_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin;
+
+		/* zc_inject_record */
+		zc_c->zc_inject_record.zi_objset =
+		    zc->zc_inject_record.zi_objset;
+		zc_c->zc_inject_record.zi_object =
+		    zc->zc_inject_record.zi_object;
+		zc_c->zc_inject_record.zi_start =
+		    zc->zc_inject_record.zi_start;
+		zc_c->zc_inject_record.zi_end =
+		    zc->zc_inject_record.zi_end;
+		zc_c->zc_inject_record.zi_guid =
+		    zc->zc_inject_record.zi_guid;
+		zc_c->zc_inject_record.zi_level =
+		    zc->zc_inject_record.zi_level;
+		zc_c->zc_inject_record.zi_error =
+		    zc->zc_inject_record.zi_error;
+		zc_c->zc_inject_record.zi_type =
+		    zc->zc_inject_record.zi_type;
+		zc_c->zc_inject_record.zi_freq =
+		    zc->zc_inject_record.zi_freq;
+		zc_c->zc_inject_record.zi_failfast =
+		    zc->zc_inject_record.zi_failfast;
+
+		break;
+	}
+}
+
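+/*
+ * Helper: copy in (in the kernel) and unpack a packed nvlist located at
+ * "nvl"; in userland the packed buffer is simply used in place.
+ */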
+static int
+zfs_ioctl_compat_get_nvlist(uint64_t nvl, size_t size, int iflag,
+    nvlist_t **nvp)
+{
+	char *packed;
+	int error;
+	nvlist_t *list = NULL;
+
+	/*
+	 * Read in and unpack the user-supplied nvlist.
+	 */
+	if (size == 0)
+		return (EINVAL);
+
+#ifdef _KERNEL
+	packed = kmem_alloc(size, KM_SLEEP);
+	if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
+	    iflag)) != 0) {
+		kmem_free(packed, size);
+		return (error);
+	}
+#else
+	packed = (void *)(uintptr_t)nvl;
+#endif
+
+	error = nvlist_unpack(packed, size, &list, 0);
+
+#ifdef _KERNEL
+	kmem_free(packed, size);
+#endif
+
+	if (error != 0)
+		return (error);
+
+	*nvp = list;
+	return (0);
+}
+
+static int
+zfs_ioctl_compat_put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
+{
+	char *packed = NULL;
+	int error = 0;
+	size_t size;
+
+	VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0);
+
+#ifdef _KERNEL
+	packed = kmem_alloc(size, KM_SLEEP);
+	VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
+	    KM_SLEEP) == 0);
+
+	if (ddi_copyout(packed,
+	    (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0)
+		error = EFAULT;
+	kmem_free(packed, size);
+#else
+	packed = (void *)(uintptr_t)zc->zc_nvlist_dst;
+	VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
+	    0) == 0);
+#endif
+
+	zc->zc_nvlist_dst_size = size;
+	return (error);
+}
+
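+/*
+ * Recursively rename the vdev stats array: ZPOOL_CONFIG_VDEV_STATS is the
+ * current nvpair name, "stats" the v15 one.  In the kernel the current
+ * name is rewritten to the old one for old userland; in userland an old
+ * kernel's "stats" is rewritten to the current name.
+ */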
+static void
+zfs_ioctl_compat_fix_stats_nvlist(nvlist_t *nvl)
+{
+	nvlist_t **child;
+	nvlist_t *nvroot = NULL;
+	vdev_stat_t *vs;
+	uint_t c, children, nelem;
+
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		for (c = 0; c < children; c++) {
+			zfs_ioctl_compat_fix_stats_nvlist(child[c]);
+		}
+	}
+
+	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0)
+		zfs_ioctl_compat_fix_stats_nvlist(nvroot);
+#ifdef _KERNEL
+	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS,
+#else
+	if (nvlist_lookup_uint64_array(nvl, "stats",
+#endif
+	    (uint64_t **)&vs, &nelem) == 0) {
+		nvlist_add_uint64_array(nvl,
+#ifdef _KERNEL
+		    "stats",
+#else
+		    ZPOOL_CONFIG_VDEV_STATS,
+#endif
+		    (uint64_t *)vs, nelem);
+#ifdef _KERNEL
+		nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS,
+#else
+		nvlist_remove(nvl, "stats",
+#endif
+		    DATA_TYPE_UINT64_ARRAY);
+	}
+}
+
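+/*
+ * Unpack the nvlist in zc_nvlist_dst, apply the stats-name fixups above
+ * (iterating over each embedded pool nvlist for the v15
+ * ZFS_IOC_POOL_STATS result), and pack the result back into
+ * zc_nvlist_dst.
+ */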
+static int
+zfs_ioctl_compat_fix_stats(zfs_cmd_t *zc, const int nc)
+{
+	nvlist_t *nv, *nvp = NULL;
+	nvpair_t *elem;
+	int error;
+
+	if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst,
+	    zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0)
+		return (error);
+
+	if (nc == 5) { /* ZFS_IOC_POOL_STATS */
+		elem = NULL;
+		while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
+			if (nvpair_value_nvlist(elem, &nvp) == 0)
+				zfs_ioctl_compat_fix_stats_nvlist(nvp);
+		}
+		elem = NULL;
+	} else
+		zfs_ioctl_compat_fix_stats_nvlist(nv);
+
+	error = zfs_ioctl_compat_put_nvlist(zc, nv);
+
+	nvlist_free(nv);
+
+	return (error);
+}
+
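+/*
+ * Translate pool property names between the current and v15
+ * vocabularies.  In the kernel, rewrite "allocated"/"free" to the old
+ * "used"/"available" names for v15 consumers; in userland, map the old
+ * names a v15 kernel returns back to the current ones.
+ */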
+static int
+zfs_ioctl_compat_pool_get_props(zfs_cmd_t *zc)
+{
+	nvlist_t *nv, *nva = NULL;
+	int error;
+
+	if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst,
+	    zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0)
+		return (error);
+
+#ifdef _KERNEL
+	if (nvlist_lookup_nvlist(nv, "allocated", &nva) == 0) {
+		nvlist_add_nvlist(nv, "used", nva);
+		nvlist_remove(nv, "allocated", DATA_TYPE_NVLIST);
+	}
+
+	if (nvlist_lookup_nvlist(nv, "free", &nva) == 0) {
+		nvlist_add_nvlist(nv, "available", nva);
+		nvlist_remove(nv, "free", DATA_TYPE_NVLIST);
+	}
+#else
+	if (nvlist_lookup_nvlist(nv, "used", &nva) == 0) {
+		nvlist_add_nvlist(nv, "allocated", nva);
+		nvlist_remove(nv, "used", DATA_TYPE_NVLIST);
+	}
+
+	if (nvlist_lookup_nvlist(nv, "available", &nva) == 0) {
+		nvlist_add_nvlist(nv, "free", nva);
+		nvlist_remove(nv, "available", DATA_TYPE_NVLIST);
+	}
+#endif
+
+	error = zfs_ioctl_compat_put_nvlist(zc, nv);
+
+	nvlist_free(nv);
+
+	return (error);
+}
+
+#ifndef _KERNEL
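+/*
+ * Userland wrapper around ioctl(2) on the ZFS device.  Modern kernels
+ * take a zfs_iocparm_t describing the command buffer and ioctl version;
+ * for older kernels the current zfs_cmd_t is converted into the matching
+ * legacy structure before the call and back again afterwards.
+ */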
+int
+zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag)
+{
+	int nc, ret;
+	void *zc_c;
+	unsigned long ncmd;
+	zfs_iocparm_t zp;
+
+	switch (cflag) {
+	case ZFS_CMD_COMPAT_NONE:
+		ncmd = _IOWR('Z', request, struct zfs_iocparm);
+		zp.zfs_cmd = (uint64_t)(uintptr_t)zc;
+		zp.zfs_cmd_size = sizeof(zfs_cmd_t);
+		zp.zfs_ioctl_version = ZFS_IOCVER_CURRENT;
+		return (ioctl(fd, ncmd, &zp));
+	case ZFS_CMD_COMPAT_INLANES:
+		ncmd = _IOWR('Z', request, struct zfs_iocparm);
+		zp.zfs_cmd = (uint64_t)(uintptr_t)zc;
+		zp.zfs_cmd_size = sizeof(zfs_cmd_inlanes_t);
+		zp.zfs_ioctl_version = ZFS_IOCVER_INLANES;
+		return (ioctl(fd, ncmd, &zp));
+	case ZFS_CMD_COMPAT_RESUME:
+		ncmd = _IOWR('Z', request, struct zfs_iocparm);
+		zp.zfs_cmd = (uint64_t)(uintptr_t)zc;
+		zp.zfs_cmd_size = sizeof(zfs_cmd_resume_t);
+		zp.zfs_ioctl_version = ZFS_IOCVER_RESUME;
+		return (ioctl(fd, ncmd, &zp));
+	case ZFS_CMD_COMPAT_EDBP:
+		ncmd = _IOWR('Z', request, struct zfs_iocparm);
+		zp.zfs_cmd = (uint64_t)(uintptr_t)zc;
+		zp.zfs_cmd_size = sizeof(zfs_cmd_edbp_t);
+		zp.zfs_ioctl_version = ZFS_IOCVER_EDBP;
+		return (ioctl(fd, ncmd, &zp));
+	case ZFS_CMD_COMPAT_ZCMD:
+		ncmd = _IOWR('Z', request, struct zfs_iocparm);
+		zp.zfs_cmd = (uint64_t)(uintptr_t)zc;
+		zp.zfs_cmd_size = sizeof(zfs_cmd_zcmd_t);
+		zp.zfs_ioctl_version = ZFS_IOCVER_ZCMD;
+		return (ioctl(fd, ncmd, &zp));
+	case ZFS_CMD_COMPAT_LZC:
+		ncmd = _IOWR('Z', request, struct zfs_cmd);
+		return (ioctl(fd, ncmd, zc));
+	case ZFS_CMD_COMPAT_DEADMAN:
+		zc_c = malloc(sizeof(zfs_cmd_deadman_t));
+		ncmd = _IOWR('Z', request, struct zfs_cmd_deadman);
+		break;
+	case ZFS_CMD_COMPAT_V28:
+		zc_c = malloc(sizeof(zfs_cmd_v28_t));
+		ncmd = _IOWR('Z', request, struct zfs_cmd_v28);
+		break;
+	case ZFS_CMD_COMPAT_V15:
+		nc = zfs_ioctl_v28_to_v15[request];
+		zc_c = malloc(sizeof(zfs_cmd_v15_t));
+		ncmd = _IOWR('Z', nc, struct zfs_cmd_v15);
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	if (ZFS_IOCREQ(ncmd) == ZFS_IOC_COMPAT_FAIL) {
+		free(zc_c);
+		return (ENOTSUP);
+	}
+
+	zfs_cmd_compat_put(zc, (caddr_t)zc_c, request, cflag);
+
+	ret = ioctl(fd, ncmd, zc_c);
+	if (cflag == ZFS_CMD_COMPAT_V15 &&
+	    nc == ZFS_IOC_POOL_IMPORT)
+		ret = ioctl(fd, _IOWR('Z', ZFS_IOC_POOL_CONFIGS,
+		    struct zfs_cmd_v15), zc_c);
+	zfs_cmd_compat_get(zc, (caddr_t)zc_c, cflag);
+	free(zc_c);
+
+	if (cflag == ZFS_CMD_COMPAT_V15) {
+		switch (nc) {
+		case ZFS_IOC_POOL_IMPORT:
+		case ZFS_IOC_POOL_CONFIGS:
+		case ZFS_IOC_POOL_STATS:
+		case ZFS_IOC_POOL_TRYIMPORT:
+			zfs_ioctl_compat_fix_stats(zc, nc);
+			break;
+		case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */
+			zfs_ioctl_compat_pool_get_props(zc);
+			break;
+		}
+	}
+
+	return (ret);
+}
+#else /* _KERNEL */
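+/*
+ * Kernel-side hook run before an ioctl vector is dispatched: a create
+ * request with an origin set becomes ZFS_IOC_CLONE, and the v15 scrub
+ * ioctl is turned into a POOL_SCAN_SCRUB request.
+ */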
+int
+zfs_ioctl_compat_pre(zfs_cmd_t *zc, int *vec, const int cflag)
+{
+	int error = 0;
+
+	/* are we creating a clone? */
+	if (*vec == ZFS_IOC_CREATE && zc->zc_value[0] != '\0')
+		*vec = ZFS_IOC_CLONE;
+
+	if (cflag == ZFS_CMD_COMPAT_V15) {
+		switch (*vec) {
+
+		case 7: /* ZFS_IOC_POOL_SCRUB (v15) */
+			zc->zc_cookie = POOL_SCAN_SCRUB;
+			break;
+		}
+	}
+
+	return (error);
+}
+
+void
+zfs_ioctl_compat_post(zfs_cmd_t *zc, int vec, const int cflag)
+{
+	if (cflag == ZFS_CMD_COMPAT_V15) {
+		switch (vec) {
+		case ZFS_IOC_POOL_CONFIGS:
+		case ZFS_IOC_POOL_STATS:
+		case ZFS_IOC_POOL_TRYIMPORT:
+			zfs_ioctl_compat_fix_stats(zc, vec);
+			break;
+		case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */
+			zfs_ioctl_compat_pool_get_props(zc);
+			break;
+		}
+	}
+}
+
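+/*
+ * Build the new-style input nvlist for requests that legacy binaries
+ * still express through fixed zfs_cmd_t fields (create, clone, snapshot,
+ * destroy_snaps, hold, release, space_snaps).  The original "innvl", if
+ * any, is consumed: it is folded into the result or freed.
+ */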
+nvlist_t *
+zfs_ioctl_compat_innvl(zfs_cmd_t *zc, nvlist_t *innvl, const int vec,
+    const int cflag)
+{
+	nvlist_t *nvl, *tmpnvl, *hnvl;
+	nvpair_t *elem;
+	char *poolname, *snapname;
+	int err;
+
+	if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC ||
+	    cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP ||
+	    cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES)
+		goto out;
+
+	switch (vec) {
+	case ZFS_IOC_CREATE:
+		nvl = fnvlist_alloc();
+		fnvlist_add_int32(nvl, "type", zc->zc_objset_type);
+		if (innvl != NULL) {
+			fnvlist_add_nvlist(nvl, "props", innvl);
+			nvlist_free(innvl);
+		}
+		return (nvl);
+	case ZFS_IOC_CLONE:
+		nvl = fnvlist_alloc();
+		fnvlist_add_string(nvl, "origin", zc->zc_value);
+		if (innvl != NULL) {
+			fnvlist_add_nvlist(nvl, "props", innvl);
+			nvlist_free(innvl);
+		}
+		return (nvl);
+	case ZFS_IOC_SNAPSHOT:
+		if (innvl == NULL)
+			goto out;
+		nvl = fnvlist_alloc();
+		fnvlist_add_nvlist(nvl, "props", innvl);
+		tmpnvl = fnvlist_alloc();
+		snapname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value);
+		fnvlist_add_boolean(tmpnvl, snapname);
+		kmem_free(snapname, strlen(snapname) + 1);
+		/* check if we are doing a recursive snapshot */
+		if (zc->zc_cookie)
+			dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value,
+			    tmpnvl);
+		fnvlist_add_nvlist(nvl, "snaps", tmpnvl);
+		fnvlist_free(tmpnvl);
+		nvlist_free(innvl);
+		/* strip dataset part from zc->zc_name */
+		zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0';
+		return (nvl);
+	case ZFS_IOC_SPACE_SNAPS:
+		nvl = fnvlist_alloc();
+		fnvlist_add_string(nvl, "firstsnap", zc->zc_value);
+		if (innvl != NULL)
+			nvlist_free(innvl);
+		return (nvl);
+	case ZFS_IOC_DESTROY_SNAPS:
+		if (innvl == NULL && cflag == ZFS_CMD_COMPAT_DEADMAN)
+			goto out;
+		nvl = fnvlist_alloc();
+		if (innvl != NULL) {
+			fnvlist_add_nvlist(nvl, "snaps", innvl);
+		} else {
+			/*
+			 * We are probably called by even older binaries;
+			 * allocate and populate the nvlist with the
+			 * recursive snapshots.
+			 */
+			if (zfs_component_namecheck(zc->zc_value, NULL,
+			    NULL) == 0) {
+				tmpnvl = fnvlist_alloc();
+				if (dmu_get_recursive_snaps_nvl(zc->zc_name,
+				    zc->zc_value, tmpnvl) == 0)
+					fnvlist_add_nvlist(nvl, "snaps",
+					    tmpnvl);
+				nvlist_free(tmpnvl);
+			}
+		}
+		if (innvl != NULL)
+			nvlist_free(innvl);
+		/* strip dataset part from zc->zc_name */
+		zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0';
+		return (nvl);
+	case ZFS_IOC_HOLD:
+		nvl = fnvlist_alloc();
+		tmpnvl = fnvlist_alloc();
+		if (zc->zc_cleanup_fd != -1)
+			fnvlist_add_int32(nvl, "cleanup_fd",
+			    (int32_t)zc->zc_cleanup_fd);
+		if (zc->zc_cookie) {
+			hnvl = fnvlist_alloc();
+			if (dmu_get_recursive_snaps_nvl(zc->zc_name,
+			    zc->zc_value, hnvl) == 0) {
+				elem = NULL;
+				while ((elem = nvlist_next_nvpair(hnvl,
+				    elem)) != NULL) {
+					nvlist_add_string(tmpnvl,
+					    nvpair_name(elem), zc->zc_string);
+				}
+			}
+			nvlist_free(hnvl);
+		} else {
+			snapname = kmem_asprintf("%s@%s", zc->zc_name,
+			    zc->zc_value);
+			nvlist_add_string(tmpnvl, snapname, zc->zc_string);
+			kmem_free(snapname, strlen(snapname) + 1);
+		}
+		fnvlist_add_nvlist(nvl, "holds", tmpnvl);
+		nvlist_free(tmpnvl);
+		if (innvl != NULL)
+			nvlist_free(innvl);
+		/* strip dataset part from zc->zc_name */
+		zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0';
+		return (nvl);
+	case ZFS_IOC_RELEASE:
+		nvl = fnvlist_alloc();
+		tmpnvl = fnvlist_alloc();
+		if (zc->zc_cookie) {
+			hnvl = fnvlist_alloc();
+			if (dmu_get_recursive_snaps_nvl(zc->zc_name,
+			    zc->zc_value, hnvl) == 0) {
+				elem = NULL;
+				while ((elem = nvlist_next_nvpair(hnvl,
+				    elem)) != NULL) {
+					fnvlist_add_boolean(tmpnvl,
+					    zc->zc_string);
+					fnvlist_add_nvlist(nvl,
+					    nvpair_name(elem), tmpnvl);
+				}
+			}
+			nvlist_free(hnvl);
+		} else {
+			snapname = kmem_asprintf("%s@%s", zc->zc_name,
+			    zc->zc_value);
+			fnvlist_add_boolean(tmpnvl, zc->zc_string);
+			fnvlist_add_nvlist(nvl, snapname, tmpnvl);
+			kmem_free(snapname, strlen(snapname) + 1);
+		}
+		nvlist_free(tmpnvl);
+		if (innvl != NULL)
+			nvlist_free(innvl);
+		/* strip dataset part from zc->zc_name */
+		zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0';
+		return (nvl);
+	}
+out:
+	return (innvl);
+}
+
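+/*
+ * Inverse of zfs_ioctl_compat_innvl(): copy results a legacy binary
+ * expects in fixed zfs_cmd_t fields back out of the new-style output
+ * nvlist, handing an empty nvlist back in its place.
+ */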
+nvlist_t *
+zfs_ioctl_compat_outnvl(zfs_cmd_t *zc, nvlist_t *outnvl, const int vec,
+    const int cflag)
+{
+	nvlist_t *tmpnvl;
+
+	if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC ||
+	    cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP ||
+	    cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES)
+		return (outnvl);
+
+	switch (vec) {
+	case ZFS_IOC_SPACE_SNAPS:
+		(void) nvlist_lookup_uint64(outnvl, "used", &zc->zc_cookie);
+		(void) nvlist_lookup_uint64(outnvl, "compressed",
+		    &zc->zc_objset_type);
+		(void) nvlist_lookup_uint64(outnvl, "uncompressed",
+		    &zc->zc_perm_action);
+		nvlist_free(outnvl);
+		/* return empty outnvl */
+		tmpnvl = fnvlist_alloc();
+		return (tmpnvl);
+	case ZFS_IOC_CREATE:
+	case ZFS_IOC_CLONE:
+	case ZFS_IOC_HOLD:
+	case ZFS_IOC_RELEASE:
+		nvlist_free(outnvl);
+		/* return empty outnvl */
+		tmpnvl = fnvlist_alloc();
+		return (tmpnvl);
+	}
+
+	return (outnvl);
+}
+#endif /* _KERNEL */
Index: src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.h
===================================================================
RCS file: src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.h
diff -N src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.h	25 Apr 2017 23:40:33 -0000
@@ -0,0 +1,544 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2014 Xin Li <delphij@FreeBSD.org>.  All rights reserved.
+ * Copyright 2013 Martin Matuska <mm@FreeBSD.org>.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_ZFS_IOCTL_COMPAT_H
+#define	_SYS_ZFS_IOCTL_COMPAT_H
+
+#include <sys/cred.h>
+#include <sys/sunddi.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+#include <sys/dsl_deleg.h>
+#include <sys/zfs_ioctl.h>
+
+#ifdef _KERNEL
+#include <sys/nvpair.h>
+#endif  /* _KERNEL */
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Backwards ioctl compatibility
+ */
+
+/* ioctl versions for vfs.zfs.version.ioctl */
+#define	ZFS_IOCVER_UNDEF	-1
+#define	ZFS_IOCVER_NONE		0
+#define	ZFS_IOCVER_DEADMAN	1
+#define	ZFS_IOCVER_LZC		2
+#define	ZFS_IOCVER_ZCMD		3
+#define	ZFS_IOCVER_EDBP		4
+#define	ZFS_IOCVER_RESUME	5
+#define	ZFS_IOCVER_INLANES	6
+#define	ZFS_IOCVER_PAD		7
+#define	ZFS_IOCVER_CURRENT	ZFS_IOCVER_PAD
+
+/* compatibility conversion flag */
+#define	ZFS_CMD_COMPAT_NONE	0
+#define	ZFS_CMD_COMPAT_V15	1
+#define	ZFS_CMD_COMPAT_V28	2
+#define	ZFS_CMD_COMPAT_DEADMAN	3
+#define	ZFS_CMD_COMPAT_LZC	4
+#define	ZFS_CMD_COMPAT_ZCMD	5
+#define	ZFS_CMD_COMPAT_EDBP	6
+#define	ZFS_CMD_COMPAT_RESUME	7
+#define	ZFS_CMD_COMPAT_INLANES	8
+
+#define	ZFS_IOC_COMPAT_PASS	254
+#define	ZFS_IOC_COMPAT_FAIL	255
+
+#define	ZFS_IOCREQ(ioreq)	((ioreq) & 0xff)
+
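+/*
+ * Argument of the modern ZFS ioctl: a fixed-size descriptor carrying the
+ * ioctl version the caller speaks plus a pointer to, and the size of,
+ * the version-specific zfs_cmd structure.
+ */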
+typedef struct zfs_iocparm {
+	uint32_t	zfs_ioctl_version;
+	uint64_t	zfs_cmd;
+	uint64_t	zfs_cmd_size;
+} zfs_iocparm_t;
+
+typedef struct zinject_record_v15 {
+	uint64_t	zi_objset;
+	uint64_t	zi_object;
+	uint64_t	zi_start;
+	uint64_t	zi_end;
+	uint64_t	zi_guid;
+	uint32_t	zi_level;
+	uint32_t	zi_error;
+	uint64_t	zi_type;
+	uint32_t	zi_freq;
+	uint32_t	zi_failfast;
+} zinject_record_v15_t;
+
+typedef struct zfs_cmd_v15 {
+	char		zc_name[MAXPATHLEN];
+	char		zc_value[MAXPATHLEN];
+	char		zc_string[MAXNAMELEN];
+	uint64_t	zc_guid;
+	uint64_t	zc_nvlist_conf;		/* really (char *) */
+	uint64_t	zc_nvlist_conf_size;
+	uint64_t	zc_nvlist_src;		/* really (char *) */
+	uint64_t	zc_nvlist_src_size;
+	uint64_t	zc_nvlist_dst;		/* really (char *) */
+	uint64_t	zc_nvlist_dst_size;
+	uint64_t	zc_cookie;
+	uint64_t	zc_objset_type;
+	uint64_t	zc_perm_action;
+	uint64_t 	zc_history;		/* really (char *) */
+	uint64_t 	zc_history_len;
+	uint64_t	zc_history_offset;
+	uint64_t	zc_obj;
+	zfs_share_t	zc_share;
+	uint64_t	zc_jailid;
+	dmu_objset_stats_t zc_objset_stats;
+	struct drr_begin zc_begin_record;
+	zinject_record_v15_t zc_inject_record;
+} zfs_cmd_v15_t;
+
+typedef struct zinject_record_v28 {
+	uint64_t	zi_objset;
+	uint64_t	zi_object;
+	uint64_t	zi_start;
+	uint64_t	zi_end;
+	uint64_t	zi_guid;
+	uint32_t	zi_level;
+	uint32_t	zi_error;
+	uint64_t	zi_type;
+	uint32_t	zi_freq;
+	uint32_t	zi_failfast;
+	char		zi_func[MAXNAMELEN];
+	uint32_t	zi_iotype;
+	int32_t		zi_duration;
+	uint64_t	zi_timer;
+} zinject_record_v28_t;
+
+typedef struct zfs_cmd_v28 {
+	char		zc_name[MAXPATHLEN];
+	char		zc_value[MAXPATHLEN * 2];
+	char		zc_string[MAXNAMELEN];
+	char		zc_top_ds[MAXPATHLEN];
+	uint64_t	zc_guid;
+	uint64_t	zc_nvlist_conf;		/* really (char *) */
+	uint64_t	zc_nvlist_conf_size;
+	uint64_t	zc_nvlist_src;		/* really (char *) */
+	uint64_t	zc_nvlist_src_size;
+	uint64_t	zc_nvlist_dst;		/* really (char *) */
+	uint64_t	zc_nvlist_dst_size;
+	uint64_t	zc_cookie;
+	uint64_t	zc_objset_type;
+	uint64_t	zc_perm_action;
+	uint64_t 	zc_history;		/* really (char *) */
+	uint64_t 	zc_history_len;
+	uint64_t	zc_history_offset;
+	uint64_t	zc_obj;
+	uint64_t	zc_iflags;		/* internal to zfs(7fs) */
+	zfs_share_t	zc_share;
+	uint64_t	zc_jailid;
+	dmu_objset_stats_t zc_objset_stats;
+	struct drr_begin zc_begin_record;
+	zinject_record_v28_t zc_inject_record;
+	boolean_t	zc_defer_destroy;
+	boolean_t	zc_temphold;
+	uint64_t	zc_action_handle;
+	int		zc_cleanup_fd;
+	uint8_t		zc_simple;
+	uint8_t		zc_pad[3];		/* alignment */
+	uint64_t	zc_sendobj;
+	uint64_t	zc_fromobj;
+	uint64_t	zc_createtxg;
+	zfs_stat_t	zc_stat;
+} zfs_cmd_v28_t;
+
+typedef struct zinject_record_deadman {
+	uint64_t	zi_objset;
+	uint64_t	zi_object;
+	uint64_t	zi_start;
+	uint64_t	zi_end;
+	uint64_t	zi_guid;
+	uint32_t	zi_level;
+	uint32_t	zi_error;
+	uint64_t	zi_type;
+	uint32_t	zi_freq;
+	uint32_t	zi_failfast;
+	char		zi_func[MAXNAMELEN];
+	uint32_t	zi_iotype;
+	int32_t		zi_duration;
+	uint64_t	zi_timer;
+	uint32_t	zi_cmd;
+	uint32_t	zi_pad;
+} zinject_record_deadman_t;
+
+typedef struct zfs_cmd_deadman {
+	char		zc_name[MAXPATHLEN];
+	char		zc_value[MAXPATHLEN * 2];
+	char		zc_string[MAXNAMELEN];
+	char		zc_top_ds[MAXPATHLEN];
+	uint64_t	zc_guid;
+	uint64_t	zc_nvlist_conf;		/* really (char *) */
+	uint64_t	zc_nvlist_conf_size;
+	uint64_t	zc_nvlist_src;		/* really (char *) */
+	uint64_t	zc_nvlist_src_size;
+	uint64_t	zc_nvlist_dst;		/* really (char *) */
+	uint64_t	zc_nvlist_dst_size;
+	uint64_t	zc_cookie;
+	uint64_t	zc_objset_type;
+	uint64_t	zc_perm_action;
+	uint64_t 	zc_history;		/* really (char *) */
+	uint64_t 	zc_history_len;
+	uint64_t	zc_history_offset;
+	uint64_t	zc_obj;
+	uint64_t	zc_iflags;		/* internal to zfs(7fs) */
+	zfs_share_t	zc_share;
+	uint64_t	zc_jailid;
+	dmu_objset_stats_t zc_objset_stats;
+	struct drr_begin zc_begin_record;
+	/* zc_inject_record doesn't change in libzfs_core */
+	zinject_record_deadman_t zc_inject_record;
+	boolean_t	zc_defer_destroy;
+	boolean_t	zc_temphold;
+	uint64_t	zc_action_handle;
+	int		zc_cleanup_fd;
+	uint8_t		zc_simple;
+	uint8_t		zc_pad[3];		/* alignment */
+	uint64_t	zc_sendobj;
+	uint64_t	zc_fromobj;
+	uint64_t	zc_createtxg;
+	zfs_stat_t	zc_stat;
+} zfs_cmd_deadman_t;
+
+typedef struct zfs_cmd_zcmd {
+	char		zc_name[MAXPATHLEN];	/* name of pool or dataset */
+	uint64_t	zc_nvlist_src;		/* really (char *) */
+	uint64_t	zc_nvlist_src_size;
+	uint64_t	zc_nvlist_dst;		/* really (char *) */
+	uint64_t	zc_nvlist_dst_size;
+	boolean_t	zc_nvlist_dst_filled;	/* put an nvlist in dst? */
+	int		zc_pad2;
+
+	/*
+	 * The following members are for legacy ioctls which haven't been
+	 * converted to the new method.
+	 */
+	uint64_t	zc_history;		/* really (char *) */
+	char		zc_value[MAXPATHLEN * 2];
+	char		zc_string[MAXNAMELEN];
+	uint64_t	zc_guid;
+	uint64_t	zc_nvlist_conf;		/* really (char *) */
+	uint64_t	zc_nvlist_conf_size;
+	uint64_t	zc_cookie;
+	uint64_t	zc_objset_type;
+	uint64_t	zc_perm_action;
+	uint64_t	zc_history_len;
+	uint64_t	zc_history_offset;
+	uint64_t	zc_obj;
+	uint64_t	zc_iflags;		/* internal to zfs(7fs) */
+	zfs_share_t	zc_share;
+	uint64_t	zc_jailid;
+	dmu_objset_stats_t zc_objset_stats;
+	struct drr_begin zc_begin_record;
+	zinject_record_deadman_t zc_inject_record;
+	boolean_t	zc_defer_destroy;
+	boolean_t	zc_temphold;
+	uint64_t	zc_action_handle;
+	int		zc_cleanup_fd;
+	uint8_t		zc_simple;
+	uint8_t		zc_pad[3];		/* alignment */
+	uint64_t	zc_sendobj;
+	uint64_t	zc_fromobj;
+	uint64_t	zc_createtxg;
+	zfs_stat_t	zc_stat;
+} zfs_cmd_zcmd_t;
+
+typedef struct zfs_cmd_edbp {
+	char		zc_name[MAXPATHLEN];	/* name of pool or dataset */
+	uint64_t	zc_nvlist_src;		/* really (char *) */
+	uint64_t	zc_nvlist_src_size;
+	uint64_t	zc_nvlist_dst;		/* really (char *) */
+	uint64_t	zc_nvlist_dst_size;
+	boolean_t	zc_nvlist_dst_filled;	/* put an nvlist in dst? */
+	int		zc_pad2;
+
+	/*
+	 * The following members are for legacy ioctls which haven't been
+	 * converted to the new method.
+	 */
+	uint64_t	zc_history;		/* really (char *) */
+	char		zc_value[MAXPATHLEN * 2];
+	char		zc_string[MAXNAMELEN];
+	uint64_t	zc_guid;
+	uint64_t	zc_nvlist_conf;		/* really (char *) */
+	uint64_t	zc_nvlist_conf_size;
+	uint64_t	zc_cookie;
+	uint64_t	zc_objset_type;
+	uint64_t	zc_perm_action;
+	uint64_t	zc_history_len;
+	uint64_t	zc_history_offset;
+	uint64_t	zc_obj;
+	uint64_t	zc_iflags;		/* internal to zfs(7fs) */
+	zfs_share_t	zc_share;
+	uint64_t	zc_jailid;
+	dmu_objset_stats_t zc_objset_stats;
+	struct drr_begin zc_begin_record;
+	zinject_record_deadman_t zc_inject_record;
+	uint32_t	zc_defer_destroy;
+	uint32_t	zc_flags;
+	uint64_t	zc_action_handle;
+	int		zc_cleanup_fd;
+	uint8_t		zc_simple;
+	uint8_t		zc_pad[3];		/* alignment */
+	uint64_t	zc_sendobj;
+	uint64_t	zc_fromobj;
+	uint64_t	zc_createtxg;
+	zfs_stat_t	zc_stat;
+} zfs_cmd_edbp_t;
+
+typedef struct zfs_cmd_resume {
+	char		zc_name[MAXPATHLEN];	/* name of pool or dataset */
+	uint64_t	zc_nvlist_src;		/* really (char *) */
+	uint64_t	zc_nvlist_src_size;
+	uint64_t	zc_nvlist_dst;		/* really (char *) */
+	uint64_t	zc_nvlist_dst_size;
+	boolean_t	zc_nvlist_dst_filled;	/* put an nvlist in dst? */
+	int		zc_pad2;
+
+	/*
+	 * The following members are for legacy ioctls which haven't been
+	 * converted to the new method.
+	 */
+	uint64_t	zc_history;		/* really (char *) */
+	char		zc_value[MAXPATHLEN * 2];
+	char		zc_string[MAXNAMELEN];
+	uint64_t	zc_guid;
+	uint64_t	zc_nvlist_conf;		/* really (char *) */
+	uint64_t	zc_nvlist_conf_size;
+	uint64_t	zc_cookie;
+	uint64_t	zc_objset_type;
+	uint64_t	zc_perm_action;
+	uint64_t	zc_history_len;
+	uint64_t	zc_history_offset;
+	uint64_t	zc_obj;
+	uint64_t	zc_iflags;		/* internal to zfs(7fs) */
+	zfs_share_t	zc_share;
+	uint64_t	zc_jailid;
+	dmu_objset_stats_t zc_objset_stats;
+	dmu_replay_record_t zc_begin_record;
+	zinject_record_deadman_t zc_inject_record;
+	uint32_t	zc_defer_destroy;
+	uint32_t	zc_flags;
+	uint64_t	zc_action_handle;
+	int		zc_cleanup_fd;
+	uint8_t		zc_simple;
+	boolean_t	zc_resumable;
+	uint64_t	zc_sendobj;
+	uint64_t	zc_fromobj;
+	uint64_t	zc_createtxg;
+	zfs_stat_t	zc_stat;
+} zfs_cmd_resume_t;
+
+typedef struct zfs_cmd_inlanes {
+	char		zc_name[MAXPATHLEN];	/* name of pool or dataset */
+	uint64_t	zc_nvlist_src;		/* really (char *) */
+	uint64_t	zc_nvlist_src_size;
+	uint64_t	zc_nvlist_dst;		/* really (char *) */
+	uint64_t	zc_nvlist_dst_size;
+	boolean_t	zc_nvlist_dst_filled;	/* put an nvlist in dst? */
+	int		zc_pad2;
+
+	/*
+	 * The following members are for legacy ioctls which haven't been
+	 * converted to the new method.
+	 */
+	uint64_t	zc_history;		/* really (char *) */
+	char		zc_value[MAXPATHLEN * 2];
+	char		zc_string[MAXNAMELEN];
+	uint64_t	zc_guid;
+	uint64_t	zc_nvlist_conf;		/* really (char *) */
+	uint64_t	zc_nvlist_conf_size;
+	uint64_t	zc_cookie;
+	uint64_t	zc_objset_type;
+	uint64_t	zc_perm_action;
+	uint64_t	zc_history_len;
+	uint64_t	zc_history_offset;
+	uint64_t	zc_obj;
+	uint64_t	zc_iflags;		/* internal to zfs(7fs) */
+	zfs_share_t	zc_share;
+	uint64_t	zc_jailid;
+	dmu_objset_stats_t zc_objset_stats;
+	dmu_replay_record_t zc_begin_record;
+	zinject_record_t zc_inject_record;
+	uint32_t	zc_defer_destroy;
+	uint32_t	zc_flags;
+	uint64_t	zc_action_handle;
+	int		zc_cleanup_fd;
+	uint8_t		zc_simple;
+	boolean_t	zc_resumable;
+	uint64_t	zc_sendobj;
+	uint64_t	zc_fromobj;
+	uint64_t	zc_createtxg;
+	zfs_stat_t	zc_stat;
+} zfs_cmd_inlanes_t;
+
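+/*
+ * Translation tables between v15 and v28 ioctl numbers.  Each entry maps
+ * the source version's ioctl number (the index, noted in the comments)
+ * to its equivalent in the other version; ZFS_IOC_COMPAT_PASS marks
+ * requests treated as no-ops and ZFS_IOC_COMPAT_FAIL marks requests with
+ * no equivalent.
+ */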
+#ifdef _KERNEL
+static unsigned long zfs_ioctl_v15_to_v28[] = {
+	0,	/*  0 ZFS_IOC_POOL_CREATE */
+	1,	/*  1 ZFS_IOC_POOL_DESTROY */
+	2,	/*  2 ZFS_IOC_POOL_IMPORT */
+	3,	/*  3 ZFS_IOC_POOL_EXPORT */
+	4,	/*  4 ZFS_IOC_POOL_CONFIGS */
+	5,	/*  5 ZFS_IOC_POOL_STATS */
+	6,	/*  6 ZFS_IOC_POOL_TRYIMPORT */
+	7,	/*  7 ZFS_IOC_POOL_SCRUB */
+	8,	/*  8 ZFS_IOC_POOL_FREEZE */
+	9,	/*  9 ZFS_IOC_POOL_UPGRADE */
+	10,	/* 10 ZFS_IOC_POOL_GET_HISTORY */
+	11,	/* 11 ZFS_IOC_VDEV_ADD */
+	12,	/* 12 ZFS_IOC_VDEV_REMOVE */
+	13,	/* 13 ZFS_IOC_VDEV_SET_STATE */
+	14,	/* 14 ZFS_IOC_VDEV_ATTACH */
+	15,	/* 15 ZFS_IOC_VDEV_DETACH */
+	16,	/* 16 ZFS_IOC_VDEV_SETPATH */
+	18,	/* 17 ZFS_IOC_OBJSET_STATS */
+	19,	/* 18 ZFS_IOC_OBJSET_ZPLPROPS */
+	20, 	/* 19 ZFS_IOC_DATASET_LIST_NEXT */
+	21,	/* 20 ZFS_IOC_SNAPSHOT_LIST_NEXT */
+	22,	/* 21 ZFS_IOC_SET_PROP */
+	ZFS_IOC_COMPAT_PASS,	/* 22 ZFS_IOC_CREATE_MINOR */
+	ZFS_IOC_COMPAT_PASS,	/* 23 ZFS_IOC_REMOVE_MINOR */
+	23,	/* 24 ZFS_IOC_CREATE */
+	24,	/* 25 ZFS_IOC_DESTROY */
+	25,	/* 26 ZFS_IOC_ROLLBACK */
+	26,	/* 27 ZFS_IOC_RENAME */
+	27,	/* 28 ZFS_IOC_RECV */
+	28,	/* 29 ZFS_IOC_SEND */
+	29,	/* 30 ZFS_IOC_INJECT_FAULT */
+	30,	/* 31 ZFS_IOC_CLEAR_FAULT */
+	31,	/* 32 ZFS_IOC_INJECT_LIST_NEXT */
+	32,	/* 33 ZFS_IOC_ERROR_LOG */
+	33,	/* 34 ZFS_IOC_CLEAR */
+	34,	/* 35 ZFS_IOC_PROMOTE */
+	35,	/* 36 ZFS_IOC_DESTROY_SNAPS */
+	36,	/* 37 ZFS_IOC_SNAPSHOT */
+	37,	/* 38 ZFS_IOC_DSOBJ_TO_DSNAME */
+	38,	/* 39 ZFS_IOC_OBJ_TO_PATH */
+	39,	/* 40 ZFS_IOC_POOL_SET_PROPS */
+	40,	/* 41 ZFS_IOC_POOL_GET_PROPS */
+	41,	/* 42 ZFS_IOC_SET_FSACL */
+	42,	/* 43 ZFS_IOC_GET_FSACL */
+	ZFS_IOC_COMPAT_PASS,	/* 44 ZFS_IOC_ISCSI_PERM_CHECK */
+	43,	/* 45 ZFS_IOC_SHARE */
+	44,	/* 46 ZFS_IOC_INHERIT_PROP */
+	58,	/* 47 ZFS_IOC_JAIL */
+	59,	/* 48 ZFS_IOC_UNJAIL */
+	45,	/* 49 ZFS_IOC_SMB_ACL */
+	46,	/* 50 ZFS_IOC_USERSPACE_ONE */
+	47,	/* 51 ZFS_IOC_USERSPACE_MANY */
+	48,	/* 52 ZFS_IOC_USERSPACE_UPGRADE */
+	17,	/* 53 ZFS_IOC_VDEV_SETFRU */
+};
+
+#else	/* !_KERNEL */
+static unsigned long zfs_ioctl_v28_to_v15[] = {
+	0,	/*  0 ZFS_IOC_POOL_CREATE */
+	1,	/*  1 ZFS_IOC_POOL_DESTROY */
+	2,	/*  2 ZFS_IOC_POOL_IMPORT */
+	3,	/*  3 ZFS_IOC_POOL_EXPORT */
+	4,	/*  4 ZFS_IOC_POOL_CONFIGS */
+	5,	/*  5 ZFS_IOC_POOL_STATS */
+	6,	/*  6 ZFS_IOC_POOL_TRYIMPORT */
+	7,	/*  7 ZFS_IOC_POOL_SCAN */
+	8,	/*  8 ZFS_IOC_POOL_FREEZE */
+	9,	/*  9 ZFS_IOC_POOL_UPGRADE */
+	10,	/* 10 ZFS_IOC_POOL_GET_HISTORY */
+	11,	/* 11 ZFS_IOC_VDEV_ADD */
+	12,	/* 12 ZFS_IOC_VDEV_REMOVE */
+	13,	/* 13 ZFS_IOC_VDEV_SET_STATE */
+	14,	/* 14 ZFS_IOC_VDEV_ATTACH */
+	15,	/* 15 ZFS_IOC_VDEV_DETACH */
+	16,	/* 16 ZFS_IOC_VDEV_SETPATH */
+	53,	/* 17 ZFS_IOC_VDEV_SETFRU */
+	17,	/* 18 ZFS_IOC_OBJSET_STATS */
+	18,	/* 19 ZFS_IOC_OBJSET_ZPLPROPS */
+	19, 	/* 20 ZFS_IOC_DATASET_LIST_NEXT */
+	20,	/* 21 ZFS_IOC_SNAPSHOT_LIST_NEXT */
+	21,	/* 22 ZFS_IOC_SET_PROP */
+	24,	/* 23 ZFS_IOC_CREATE */
+	25,	/* 24 ZFS_IOC_DESTROY */
+	26,	/* 25 ZFS_IOC_ROLLBACK */
+	27,	/* 26 ZFS_IOC_RENAME */
+	28,	/* 27 ZFS_IOC_RECV */
+	29,	/* 28 ZFS_IOC_SEND */
+	30,	/* 29 ZFS_IOC_INJECT_FAULT */
+	31,	/* 30 ZFS_IOC_CLEAR_FAULT */
+	32,	/* 31 ZFS_IOC_INJECT_LIST_NEXT */
+	33,	/* 32 ZFS_IOC_ERROR_LOG */
+	34,	/* 33 ZFS_IOC_CLEAR */
+	35,	/* 34 ZFS_IOC_PROMOTE */
+	36,	/* 35 ZFS_IOC_DESTROY_SNAPS */
+	37,	/* 36 ZFS_IOC_SNAPSHOT */
+	38,	/* 37 ZFS_IOC_DSOBJ_TO_DSNAME */
+	39,	/* 38 ZFS_IOC_OBJ_TO_PATH */
+	40,	/* 39 ZFS_IOC_POOL_SET_PROPS */
+	41,	/* 40 ZFS_IOC_POOL_GET_PROPS */
+	42,	/* 41 ZFS_IOC_SET_FSACL */
+	43,	/* 42 ZFS_IOC_GET_FSACL */
+	45,	/* 43 ZFS_IOC_SHARE */
+	46,	/* 44 ZFS_IOC_INHERIT_PROP */
+	49,	/* 45 ZFS_IOC_SMB_ACL */
+	50,	/* 46 ZFS_IOC_USERSPACE_ONE */
+	51,	/* 47 ZFS_IOC_USERSPACE_MANY */
+	52,	/* 48 ZFS_IOC_USERSPACE_UPGRADE */
+	ZFS_IOC_COMPAT_FAIL,	/* 49 ZFS_IOC_HOLD */
+	ZFS_IOC_COMPAT_FAIL,	/* 50 ZFS_IOC_RELEASE */
+	ZFS_IOC_COMPAT_FAIL,	/* 51 ZFS_IOC_GET_HOLDS */
+	ZFS_IOC_COMPAT_FAIL,	/* 52 ZFS_IOC_OBJSET_RECVD_PROPS */
+	ZFS_IOC_COMPAT_FAIL,	/* 53 ZFS_IOC_VDEV_SPLIT */
+	ZFS_IOC_COMPAT_FAIL,	/* 54 ZFS_IOC_NEXT_OBJ */
+	ZFS_IOC_COMPAT_FAIL,	/* 55 ZFS_IOC_DIFF */
+	ZFS_IOC_COMPAT_FAIL,	/* 56 ZFS_IOC_TMP_SNAPSHOT */
+	ZFS_IOC_COMPAT_FAIL,	/* 57 ZFS_IOC_OBJ_TO_STATS */
+	47,	/* 58 ZFS_IOC_JAIL */
+	48,	/* 59 ZFS_IOC_UNJAIL */
+};
+#endif	/* ! _KERNEL */
+
+#ifdef _KERNEL
+int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int);
+void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int);
+nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int,
+    const int);
+nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int,
+    const int);
+#else
+int zcmd_ioctl_compat(int, int, zfs_cmd_t *, const int);
+#endif	/* _KERNEL */
+void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int);
+void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int, const int);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_ZFS_IOCTL_COMPAT_H */
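
For reference: zfs_ioctl_v15_to_v28[] and zfs_ioctl_v28_to_v15[] map ioctl
request numbers between the legacy v15 ABI and the v28 ABI, with
ZFS_IOC_COMPAT_PASS marking requests handled without translation and
ZFS_IOC_COMPAT_FAIL marking requests that have no counterpart. A minimal
lookup sketch follows; the sentinel value and the error convention are
assumptions, since neither is shown in this patch:

#include <errno.h>
#include <stddef.h>

#define	ZFS_IOC_COMPAT_FAIL	((unsigned long)-1)	/* assumed value */

static int
zfs_ioctl_translate(const unsigned long *table, size_t nentries,
    unsigned long request, unsigned long *out)
{
	if (request >= nentries)
		return (ENOTSUP);	/* request is newer than the table */
	if (table[request] == ZFS_IOC_COMPAT_FAIL)
		return (ENOTSUP);	/* no equivalent in the other ABI */
	*out = table[request];
	return (0);
}
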
Index: src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_namecheck.c
--- src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c	27 Feb 2010 22:29:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c	10 Oct 2016 11:10:02 -0000
@@ -22,6 +22,9 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
 
 /*
  * Common name validation routines for ZFS.  These routines are shared by the
@@ -62,11 +65,11 @@ valid_char(char c)
  * 	[-_.: ]
  */
 int
-snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
+zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what)
 {
 	const char *loc;
 
-	if (strlen(path) >= MAXNAMELEN) {
+	if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) {
 		if (why)
 			*why = NAME_ERR_TOOLONG;
 		return (-1);
@@ -113,7 +116,7 @@ permset_namecheck(const char *path, name
 		return (-1);
 	}
 
-	return (snapshot_namecheck(&path[1], why, what));
+	return (zfs_component_namecheck(&path[1], why, what));
 }
 
 /*
@@ -137,14 +140,9 @@ dataset_namecheck(const char *path, name
 
 	/*
 	 * Make sure the name is not too long.
-	 *
-	 * ZFS_MAXNAMELEN is the maximum dataset length used in the userland
-	 * which is the same as MAXNAMELEN used in the kernel.
-	 * If ZFS_MAXNAMELEN value is changed, make sure to cleanup all
-	 * places using MAXNAMELEN.
 	 */
 
-	if (strlen(path) >= MAXNAMELEN) {
+	if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) {
 		if (why)
 			*why = NAME_ERR_TOOLONG;
 		return (-1);
@@ -273,7 +271,7 @@ mountpoint_namecheck(const char *path, n
 		while (*end != '/' && *end != '\0')
 			end++;
 
-		if (end - start >= MAXNAMELEN) {
+		if (end - start >= ZFS_MAX_DATASET_NAME_LEN) {
 			if (why)
 				*why = NAME_ERR_TOOLONG;
 			return (-1);
@@ -298,13 +296,8 @@ pool_namecheck(const char *pool, nameche
 
 	/*
 	 * Make sure the name is not too long.
-	 *
-	 * ZPOOL_MAXNAMELEN is the maximum pool length used in the userland
-	 * which is the same as MAXNAMELEN used in the kernel.
-	 * If ZPOOL_MAXNAMELEN value is changed, make sure to cleanup all
-	 * places using MAXNAMELEN.
 	 */
-	if (strlen(pool) >= MAXNAMELEN) {
+	if (strlen(pool) >= ZFS_MAX_DATASET_NAME_LEN) {
 		if (why)
 			*why = NAME_ERR_TOOLONG;
 		return (-1);
Index: src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_namecheck.h
--- src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h	27 Feb 2010 22:29:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h	13 Jan 2014 02:59:29 -0000
@@ -22,6 +22,9 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
 
 #ifndef	_ZFS_NAMECHECK_H
 #define	_ZFS_NAMECHECK_H
@@ -48,7 +51,7 @@ typedef enum {
 int pool_namecheck(const char *, namecheck_err_t *, char *);
 int dataset_namecheck(const char *, namecheck_err_t *, char *);
 int mountpoint_namecheck(const char *, namecheck_err_t *);
-int snapshot_namecheck(const char *, namecheck_err_t *, char *);
+int zfs_component_namecheck(const char *, namecheck_err_t *, char *);
 int permset_namecheck(const char *, namecheck_err_t *, char *);
 
 #ifdef	__cplusplus
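
The rename from snapshot_namecheck() to zfs_component_namecheck() reflects
that the routine validates any single name component, not just snapshot
names, and the length checks now bound against ZFS_MAX_DATASET_NAME_LEN
rather than the kernel's MAXNAMELEN. A usage sketch against the renamed
interface; the fallback diagnostic text is illustrative only:

#include <stdio.h>
#include "zfs_namecheck.h"

static int
check_component(const char *name)
{
	namecheck_err_t why;
	char what;

	if (zfs_component_namecheck(name, &why, &what) == 0)
		return (0);
	if (why == NAME_ERR_TOOLONG)
		(void) fprintf(stderr, "'%s': component too long\n", name);
	else
		(void) fprintf(stderr, "'%s': invalid component ('%c')\n",
		    name, what);
	return (-1);
}
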
Index: src/external/cddl/osnet/dist/common/zfs/zfs_prop.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_prop.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_prop.c
--- src/external/cddl/osnet/dist/common/zfs/zfs_prop.c	27 Feb 2010 22:29:41 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/common/zfs/zfs_prop.c	28 Jun 2017 00:04:06 -0000
@@ -19,10 +19,15 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #include <sys/zio.h>
 #include <sys/spa.h>
 #include <sys/u8_textprep.h>
@@ -66,6 +71,14 @@ zfs_prop_init(void)
 		{ "fletcher2",	ZIO_CHECKSUM_FLETCHER_2 },
 		{ "fletcher4",	ZIO_CHECKSUM_FLETCHER_4 },
 		{ "sha256",	ZIO_CHECKSUM_SHA256 },
+		{ "noparity",	ZIO_CHECKSUM_NOPARITY },
+#ifndef __NetBSD__
+		{ "sha512",	ZIO_CHECKSUM_SHA512 },
+		{ "skein",	ZIO_CHECKSUM_SKEIN },
+#endif
+#ifdef illumos
+		{ "edonr",	ZIO_CHECKSUM_EDONR },
+#endif
 		{ NULL }
 	};
 
@@ -76,6 +89,18 @@ zfs_prop_init(void)
 		{ "sha256",	ZIO_CHECKSUM_SHA256 },
 		{ "sha256,verify",
 				ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
+#ifndef __NetBSD__
+		{ "sha512",	ZIO_CHECKSUM_SHA512 },
+		{ "sha512,verify",
+				ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY },
+		{ "skein",	ZIO_CHECKSUM_SKEIN },
+		{ "skein,verify",
+				ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY },
+#endif
+#ifdef illumos
+		{ "edonr,verify",
+				ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY },
+#endif
 		{ NULL }
 	};
 
@@ -94,6 +119,7 @@ zfs_prop_init(void)
 		{ "gzip-8",	ZIO_COMPRESS_GZIP_8 },
 		{ "gzip-9",	ZIO_COMPRESS_GZIP_9 },
 		{ "zle",	ZIO_COMPRESS_ZLE },
+		{ "lz4",	ZIO_COMPRESS_LZ4 },
 		{ NULL }
 	};
 
@@ -107,6 +133,7 @@ zfs_prop_init(void)
 		{ "discard",	ZFS_ACL_DISCARD },
 		{ "groupmask",	ZFS_ACL_GROUPMASK },
 		{ "passthrough", ZFS_ACL_PASSTHROUGH },
+		{ "restricted", ZFS_ACL_RESTRICTED },
 		{ NULL }
 	};
 
@@ -153,6 +180,7 @@ zfs_prop_init(void)
 		{ "2",		2 },
 		{ "3",		3 },
 		{ "4",		4 },
+		{ "5",		5 },
 		{ "current",	ZPL_VERSION },
 		{ NULL }
 	};
@@ -183,185 +211,260 @@ zfs_prop_init(void)
 		{ NULL }
 	};
 
+	static zprop_index_t sync_table[] = {
+		{ "standard",	ZFS_SYNC_STANDARD },
+		{ "always",	ZFS_SYNC_ALWAYS },
+		{ "disabled",	ZFS_SYNC_DISABLED },
+		{ NULL }
+	};
+
+	static zprop_index_t volmode_table[] = {
+		{ "default",	ZFS_VOLMODE_DEFAULT },
+		{ "geom",	ZFS_VOLMODE_GEOM },
+		{ "dev",	ZFS_VOLMODE_DEV },
+		{ "none",	ZFS_VOLMODE_NONE },
+		{ NULL }
+	};
+
+	static zprop_index_t redundant_metadata_table[] = {
+		{ "all",	ZFS_REDUNDANT_METADATA_ALL },
+		{ "most",	ZFS_REDUNDANT_METADATA_MOST },
+		{ NULL }
+	};
+
 	/* inherit index properties */
-	register_index(ZFS_PROP_CHECKSUM, "checksum", ZIO_CHECKSUM_DEFAULT,
+	zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata",
+	    ZFS_REDUNDANT_METADATA_ALL,
+	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    "all | most", "REDUND_MD",
+	    redundant_metadata_table);
+	zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
-	    "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM",
-	    checksum_table);
-	register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
+	    "standard | always | disabled", "SYNC",
+	    sync_table);
+	zprop_register_index(ZFS_PROP_CHECKSUM, "checksum",
+	    ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_VOLUME,
+	    "on | off | fletcher2 | fletcher4 | sha256 | sha512 | "
+	    "skein | edonr", "CHECKSUM", checksum_table);
+	zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
-	    "on | off | verify | sha256[,verify]", "DEDUP",
-	    dedup_table);
-	register_index(ZFS_PROP_COMPRESSION, "compression",
+	    "on | off | verify | sha256[,verify], sha512[,verify], "
+	    "skein[,verify], edonr,verify", "DEDUP", dedup_table);
+	zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
 	    ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
-	    "on | off | lzjb | gzip | gzip-[1-9] | zle", "COMPRESS",
-	    compress_table);
-	register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
+	    "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4",
+	    "COMPRESS", compress_table);
+	zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
 	    "hidden | visible", "SNAPDIR", snapdir_table);
-	register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_GROUPMASK,
-	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
-	    "discard | groupmask | passthrough", "ACLMODE", acl_mode_table);
-	register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED,
+	zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+	    "discard | groupmask | passthrough | restricted", "ACLMODE",
+	    acl_mode_table);
+	zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit",
+	    ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
 	    "discard | noallow | restricted | passthrough | passthrough-x",
 	    "ACLINHERIT", acl_inherit_table);
-	register_index(ZFS_PROP_COPIES, "copies", 1,
-	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT,
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "1 | 2 | 3", "COPIES", copies_table);
-	register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
+	zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
 	    ZFS_CACHE_ALL, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
 	    "all | none | metadata", "PRIMARYCACHE", cache_table);
-	register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
+	zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
 	    ZFS_CACHE_ALL, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
 	    "all | none | metadata", "SECONDARYCACHE", cache_table);
-	register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
+	zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "latency | throughput", "LOGBIAS", logbias_table);
+	zprop_register_index(ZFS_PROP_VOLMODE, "volmode",
+	    ZFS_VOLMODE_DEFAULT, PROP_INHERIT,
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
+	    "default | geom | dev | none", "VOLMODE", volmode_table);
 
 	/* inherit index (boolean) properties */
-	register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
-	register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES",
 	    boolean_table);
-	register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC",
 	    boolean_table);
-	register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
 	    boolean_table);
-	register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY",
 	    boolean_table);
-	register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT,
-	    ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table);
-	register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT,
+	    ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table);
+	zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR",
 	    boolean_table);
-	register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN",
 	    boolean_table);
-	register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
+	zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
 	    boolean_table);
 
 	/* default index properties */
-	register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
+	zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
-	    "1 | 2 | 3 | 4 | current", "VERSION", version_table);
-	register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
+	    "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table);
+	zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
 	    PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
 	    "CANMOUNT", canmount_table);
 
 	/* readonly index (boolean) properties */
-	register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
+	zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
 	    ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
-	register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
+	zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
 	    PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
 	    boolean_table);
 
 	/* set once index properties */
-	register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
+	zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
 	    PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
 	    "none | formC | formD | formKC | formKD", "NORMALIZATION",
 	    normalize_table);
-	register_index(ZFS_PROP_CASE, "casesensitivity", ZFS_CASE_SENSITIVE,
-	    PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+	zprop_register_index(ZFS_PROP_CASE, "casesensitivity",
+	    ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_SNAPSHOT,
 	    "sensitive | insensitive | mixed", "CASE", case_table);
 
 	/* set once index (boolean) properties */
-	register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
+	zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
 	    "on | off", "UTF8ONLY", boolean_table);
 
 	/* string properties */
-	register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
+	zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN");
-	register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT,
-	    ZFS_TYPE_FILESYSTEM, "<path> | legacy | none", "MOUNTPOINT");
-	register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", PROP_INHERIT,
-	    ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", "SHARENFS");
-	register_string(ZFS_PROP_SHAREISCSI, "shareiscsi", "off", PROP_INHERIT,
-	    ZFS_TYPE_DATASET, "on | off | type=<type>", "SHAREISCSI");
-	register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
-	    ZFS_TYPE_DATASET, "filesystem | volume | snapshot", "TYPE");
-	register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT,
-	    ZFS_TYPE_FILESYSTEM, "on | off | sharemgr(1M) options", "SHARESMB");
-	register_string(ZFS_PROP_MLSLABEL, "mlslabel", ZFS_MLSLABEL_DEFAULT,
-	    PROP_INHERIT, ZFS_TYPE_DATASET, "<sensitivity label>", "MLSLABEL");
+	zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY,
+	    ZFS_TYPE_SNAPSHOT, "<dataset>[,...]", "CLONES");
+	zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/",
+	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none",
+	    "MOUNTPOINT");
+	zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off",
+	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options",
+	    "SHARENFS");
+	zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
+	    ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
+	    "filesystem | volume | snapshot | bookmark", "TYPE");
+	zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off",
+	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+	    "on | off | sharemgr(1M) options", "SHARESMB");
+	zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel",
+	    ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET,
+	    "<sensitivity label>", "MLSLABEL");
+	zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN,
+	    "receive_resume_token",
+	    NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    "<string token>", "RESUMETOK");
 
 	/* readonly number properties */
-	register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
+	zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
 	    ZFS_TYPE_DATASET, "<size>", "USED");
-	register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
+	zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL");
-	register_number(ZFS_PROP_REFERENCED, "referenced", 0, PROP_READONLY,
-	    ZFS_TYPE_DATASET, "<size>", "REFER");
-	register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
+	zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0,
+	    PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "REFER");
+	zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
 	    PROP_READONLY, ZFS_TYPE_DATASET,
 	    "<1.00x or higher if compressed>", "RATIO");
-	register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
+	zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0,
+	    PROP_READONLY, ZFS_TYPE_DATASET,
+	    "<1.00x or higher if compressed>", "REFRATIO");
+	zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
 	    ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME,
 	    ZFS_TYPE_VOLUME, "512 to 128k, power of 2",	"VOLBLOCK");
-	register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY,
-	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDSNAP");
-	register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, PROP_READONLY,
-	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDDS");
-	register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, PROP_READONLY,
-	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDCHILD");
-	register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
+	zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0,
+	    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+	    "USEDSNAP");
+	zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0,
+	    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+	    "USEDDS");
+	zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0,
+	    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+	    "USEDCHILD");
+	zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
 	    PROP_READONLY,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
-	register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
+	zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
 	    ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");
+	zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY,
+	    ZFS_TYPE_DATASET, "<size>", "WRITTEN");
+	zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0,
+	    PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "LUSED");
+	zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced",
+	    0, PROP_READONLY, ZFS_TYPE_DATASET, "<size>", "LREFER");
 
 	/* default number properties */
-	register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
+	zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
 	    ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA");
-	register_number(ZFS_PROP_RESERVATION, "reservation", 0, PROP_DEFAULT,
-	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size> | none", "RESERV");
-	register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
+	zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0,
+	    PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    "<size> | none", "RESERV");
+	zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
 	    ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
-	register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
+	zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
 	    ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
-	register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
+	zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
 	    PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
 	    "<size> | none", "REFRESERV");
+	zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit",
+	    UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
+	    "<count> | none", "FSLIMIT");
+	zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit",
+	    UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    "<count> | none", "SSLIMIT");
+	zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count",
+	    UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
+	    "<count>", "FSCOUNT");
+	zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count",
+	    UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+	    "<count>", "SSCOUNT");
 
 	/* inherit number properties */
-	register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_MAXBLOCKSIZE,
-	    PROP_INHERIT,
-	    ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE");
+	zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
+	    SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
+	    ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
 
 	/* hidden properties */
-	register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
-	    PROP_READONLY, ZFS_TYPE_DATASET, "CREATETXG");
-	register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
+	zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER,
+	    PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "CREATETXG");
+	zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
 	    PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
-	register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
-	    PROP_READONLY, ZFS_TYPE_DATASET, "NAME");
-	register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING,
-	    PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
-	register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
+	zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
+	    PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME");
+	zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions",
+	    PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
+	zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
 	    PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME,
 	    "STMF_SBD_LU");
-	register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY,
-	    ZFS_TYPE_DATASET, "GUID");
-	register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
+	zprop_register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER,
+	    PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "GUID");
+	zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
 	    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
 	    "USERACCOUNTING");
-	register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
+	zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
 	    PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE");
-	register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER,
+	zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER,
 	    PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID");
+	zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent",
+	    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT");
+	zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING,
+	    PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP");
 
 	/* oddball properties */
-	register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL,
-	    PROP_READONLY, ZFS_TYPE_DATASET,
+	zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0,
+	    NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
 	    "<date>", "CREATION", B_FALSE, B_TRUE, NULL);
 }
 
@@ -443,6 +546,18 @@ zfs_prop_userquota(const char *name)
 }
 
 /*
+ * Returns true if this is a valid written@ property.
+ * Note that after the @, any character is valid (eg, another @, for
+ * written@pool/fs@origin).
+ */
+boolean_t
+zfs_prop_written(const char *name)
+{
+	static const char *prefix = "written@";
+	return (strncmp(name, prefix, strlen(prefix)) == 0);
+}
+
+/*
  * Tables of index types, plus functions to convert between the user view
  * (strings) and internal representation (uint64_t).
  */
Index: src/external/cddl/osnet/dist/common/zfs/zfs_prop.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_prop.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_prop.h
--- src/external/cddl/osnet/dist/common/zfs/zfs_prop.h	27 Feb 2010 22:29:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/common/zfs/zfs_prop.h	12 Jun 2012 05:57:26 -0000
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -98,16 +98,16 @@ zprop_desc_t *zpool_prop_get_table(void)
 /*
  * Common routines to initialize property tables
  */
-void register_impl(int, const char *, zprop_type_t, uint64_t,
+void zprop_register_impl(int, const char *, zprop_type_t, uint64_t,
     const char *, zprop_attr_t, int, const char *, const char *,
     boolean_t, boolean_t, const zprop_index_t *);
-void register_string(int, const char *, const char *, zprop_attr_t attr,
-    int, const char *, const char *);
-void register_number(int, const char *, uint64_t, zprop_attr_t, int,
+void zprop_register_string(int, const char *, const char *,
+    zprop_attr_t attr, int, const char *, const char *);
+void zprop_register_number(int, const char *, uint64_t, zprop_attr_t, int,
     const char *, const char *);
-void register_index(int, const char *, uint64_t, zprop_attr_t, int,
+void zprop_register_index(int, const char *, uint64_t, zprop_attr_t, int,
     const char *, const char *, const zprop_index_t *);
-void register_hidden(int, const char *, zprop_type_t, zprop_attr_t,
+void zprop_register_hidden(int, const char *, zprop_type_t, zprop_attr_t,
     int, const char *);
 
 /*
@@ -121,6 +121,8 @@ uint64_t zprop_random_value(int, uint64_
 const char *zprop_values(int, zfs_type_t);
 size_t zprop_width(int, boolean_t *, zfs_type_t);
 boolean_t zprop_valid_for_type(int, zfs_type_t);
+boolean_t zfs_prop_written(const char *name);
+
 
 #ifdef	__cplusplus
 }
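
zfs_prop_written() is a plain prefix test: anything after the '@' is
accepted, including a second '@' for written@pool/fs@origin forms. A small
self-check sketch, assuming zfs_prop.h is includable from a standalone
program:

#include <assert.h>
#include "zfs_prop.h"

int
main(void)
{
	assert(zfs_prop_written("written@monday"));
	assert(zfs_prop_written("written@pool/fs@origin"));
	assert(!zfs_prop_written("writtenat"));	/* prefix must be "written@" */
	return (0);
}
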
Index: src/external/cddl/osnet/dist/common/zfs/zpool_prop.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zpool_prop.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zpool_prop.c
--- src/external/cddl/osnet/dist/common/zfs/zpool_prop.c	27 Feb 2010 22:29:41 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/common/zfs/zpool_prop.c	27 Mar 2016 02:52:13 -0000
@@ -19,8 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zio.h>
@@ -64,53 +66,70 @@ zpool_prop_init(void)
 	};
 
 	/* string properties */
-	register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
+	zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
 	    ZFS_TYPE_POOL, "<path>", "ALTROOT");
-	register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
+	zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
 	    ZFS_TYPE_POOL, "<filesystem>", "BOOTFS");
-	register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, PROP_DEFAULT,
-	    ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
+	zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
+	zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "<comment-string>", "COMMENT");
 
 	/* readonly number properties */
-	register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
+	zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<size>", "SIZE");
-	register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY,
+	zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<size>", "FREE");
-	register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, PROP_READONLY,
-	    ZFS_TYPE_POOL, "<size>", "ALLOC");
-	register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
+	zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
+	    ZFS_TYPE_POOL, "<size>", "FREEING");
+	zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
+	    ZFS_TYPE_POOL, "<size>", "LEAKED");
+	zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
+	    PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
+	zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
+	    PROP_READONLY, ZFS_TYPE_POOL, "<size>", "EXPANDSZ");
+	zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0,
+	    PROP_READONLY, ZFS_TYPE_POOL, "<percent>", "FRAG");
+	zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<size>", "CAP");
-	register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
+	zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<guid>", "GUID");
-	register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
+	zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
 	    ZFS_TYPE_POOL, "<state>", "HEALTH");
-	register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, PROP_READONLY,
-	    ZFS_TYPE_POOL, "<1.00x or higher if deduped>", "DEDUP");
+	zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0,
+	    PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>",
+	    "DEDUP");
 
 	/* default number properties */
-	register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
+	zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
 	    PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
-	register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0,
+	zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0,
 	    PROP_DEFAULT, ZFS_TYPE_POOL, "<threshold (min 100)>", "DEDUPDITTO");
 
 	/* default index (boolean) properties */
-	register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT,
-	    ZFS_TYPE_POOL, "on | off", "DELEGATION", boolean_table);
-	register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, PROP_DEFAULT,
-	    ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
-	register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT,
-	    ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table);
-	register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, PROP_DEFAULT,
-	    ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);
+	zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION",
+	    boolean_table);
+	zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
+	zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS",
+	    boolean_table);
+	zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);
+	zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table);
 
 	/* default index properties */
-	register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
+	zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
 	    ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL,
 	    "wait | continue | panic", "FAILMODE", failuremode_table);
 
 	/* hidden properties */
-	register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
+	zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
 	    PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+	zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize",
+	    PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
 }
 
 /*
@@ -156,6 +175,26 @@ zpool_prop_default_numeric(zpool_prop_t 
 	return (zpool_prop_table[prop].pd_numdefault);
 }
 
+/*
+ * Returns true if this is a valid feature@ property.
+ */
+boolean_t
+zpool_prop_feature(const char *name)
+{
+	static const char *prefix = "feature@";
+	return (strncmp(name, prefix, strlen(prefix)) == 0);
+}
+
+/*
+ * Returns true if this is a valid unsupported@ property.
+ */
+boolean_t
+zpool_prop_unsupported(const char *name)
+{
+	static const char *prefix = "unsupported@";
+	return (strncmp(name, prefix, strlen(prefix)) == 0);
+}
+
 int
 zpool_prop_string_to_index(zpool_prop_t prop, const char *string,
     uint64_t *index)
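
zpool_prop_feature() and zpool_prop_unsupported() apply the same prefix
idiom to the pool-side feature@ and unsupported@ namespaces. A
classification sketch; the prototypes are restated by hand here, and
boolean_t is assumed to come from the tree's usual compat headers:

extern boolean_t zpool_prop_feature(const char *);
extern boolean_t zpool_prop_unsupported(const char *);

static const char *
classify_pool_prop(const char *name)
{
	if (zpool_prop_feature(name))
		return ("feature");
	if (zpool_prop_unsupported(name))
		return ("unsupported");
	return ("builtin or user property");
}
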
Index: src/external/cddl/osnet/dist/common/zfs/zprop_common.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zprop_common.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zprop_common.c
--- src/external/cddl/osnet/dist/common/zfs/zprop_common.c	27 Feb 2010 22:29:41 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/common/zfs/zprop_common.c	16 Jun 2017 17:20:43 -0000
@@ -19,9 +19,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
 
 /*
  * Common routines used by zfs and zpool property management.
@@ -39,7 +42,11 @@
 
 #if defined(_KERNEL)
 #include <sys/systm.h>
+#ifdef __FreeBSD__
+#include <sys/libkern.h>
+#else
 #include <util/qsort.h>
+#endif
 #else
 #include <stdlib.h>
 #include <string.h>
@@ -65,7 +72,7 @@ zprop_get_numprops(zfs_type_t type)
 }
 
 void
-register_impl(int prop, const char *name, zprop_type_t type,
+zprop_register_impl(int prop, const char *name, zprop_type_t type,
     uint64_t numdefault, const char *strdefault, zprop_attr_t attr,
     int objset_types, const char *values, const char *colname,
     boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl)
@@ -97,38 +104,40 @@ register_impl(int prop, const char *name
 }
 
 void
-register_string(int prop, const char *name, const char *def,
+zprop_register_string(int prop, const char *name, const char *def,
     zprop_attr_t attr, int objset_types, const char *values,
     const char *colname)
 {
-	register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
+	zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
 	    objset_types, values, colname, B_FALSE, B_TRUE, NULL);
 
 }
 
 void
-register_number(int prop, const char *name, uint64_t def, zprop_attr_t attr,
-    int objset_types, const char *values, const char *colname)
+zprop_register_number(int prop, const char *name, uint64_t def,
+    zprop_attr_t attr, int objset_types, const char *values,
+    const char *colname)
 {
-	register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
+	zprop_register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
 	    objset_types, values, colname, B_TRUE, B_TRUE, NULL);
 }
 
 void
-register_index(int prop, const char *name, uint64_t def, zprop_attr_t attr,
-    int objset_types, const char *values, const char *colname,
-    const zprop_index_t *idx_tbl)
+zprop_register_index(int prop, const char *name, uint64_t def,
+    zprop_attr_t attr, int objset_types, const char *values,
+    const char *colname, const zprop_index_t *idx_tbl)
 {
-	register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
+	zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
 	    objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl);
 }
 
 void
-register_hidden(int prop, const char *name, zprop_type_t type,
+zprop_register_hidden(int prop, const char *name, zprop_type_t type,
     zprop_attr_t attr, int objset_types, const char *colname)
 {
-	register_impl(prop, name, type, 0, NULL, attr,
-	    objset_types, NULL, colname, B_FALSE, B_FALSE, NULL);
+	zprop_register_impl(prop, name, type, 0, NULL, attr,
+	    objset_types, NULL, colname,
+	    type == PROP_TYPE_NUMBER, B_FALSE, NULL);
 }
 
 
@@ -161,7 +170,7 @@ int
 zprop_iter_common(zprop_func func, void *cb, boolean_t show_all,
     boolean_t ordered, zfs_type_t type)
 {
-	int i, num_props, size, prop;
+	int i, j, num_props, size, prop;
 	zprop_desc_t *prop_tbl;
 	zprop_desc_t **order;
 
@@ -176,7 +185,7 @@ zprop_iter_common(zprop_func func, void 
 		return (ZPROP_CONT);
 #endif
 
-	for (int j = 0; j < num_props; j++)
+	for (j = 0; j < num_props; j++)
 		order[j] = &prop_tbl[j];
 
 	if (ordered) {
Index: src/external/cddl/osnet/dist/head/nlist.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/head/nlist.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 nlist.h
--- src/external/cddl/osnet/dist/head/nlist.h	20 Feb 2010 04:33:53 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/head/nlist.h	12 Apr 2017 22:47:20 -0000
@@ -19,6 +19,9 @@
  *
  * CDDL HEADER END
  */
+/*
+ * Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ */
 /*	Copyright (c) 1988 AT&T	*/
 /*	  All Rights Reserved  	*/
 
@@ -26,8 +29,6 @@
 #ifndef _NLIST_H
 #define	_NLIST_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"	/* SVr4.0 1.8.2.4 */
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -41,11 +42,7 @@ struct nlist {
 	char		n_numaux;	/* number of aux. entries */
 };
 
-#if defined(__STDC__)
 extern int nlist(const char *, struct nlist *);
-#else	/* __STDC__ */
-extern int nlist();
-#endif  /* __STDC__ */
 
 #ifdef	__cplusplus
 }
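
With the pre-ANSI fallback removed, nlist() always carries the full
prototype. Classic usage looks like the sketch below; the image path and
symbol name are placeholders, and the "nonzero return or zero n_value means
not found" convention is the traditional one, assumed here:

#include <nlist.h>
#include <stdio.h>

int
main(void)
{
	struct nlist nl[] = {
		{ "_end" },	/* symbol to look up (placeholder) */
		{ NULL },	/* NULL name terminates the list */
	};

	if (nlist("/netbsd", nl) != 0 || nl[0].n_value == 0) {
		(void) fprintf(stderr, "symbol not found\n");
		return (1);
	}
	(void) printf("_end at %#lx\n", (unsigned long)nl[0].n_value);
	return (0);
}
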
Index: src/external/cddl/osnet/dist/lib/libctf/common/ctf_lib.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libctf/common/ctf_lib.c,v
retrieving revision 1.6
diff -u -p -r1.6 ctf_lib.c
--- src/external/cddl/osnet/dist/lib/libctf/common/ctf_lib.c	24 Sep 2015 14:20:18 -0000	1.6
+++ src/external/cddl/osnet/dist/lib/libctf/common/ctf_lib.c	12 Apr 2017 22:50:09 -0000
@@ -350,6 +350,7 @@ ctf_fdopen(int fd, int *errp)
 			if ((sp32 = malloc(nbytes)) == NULL || pread64(fd,
 			    sp32, nbytes, hdr.e64.e_shoff) != nbytes) {
 				free(sp);
+				free(sp32);
 				return (ctf_set_open_errno(errp, errno));
 			}
 
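
The added free(sp32) plugs a leak on the combined allocate-or-read failure
path; because free(NULL) is a no-op, both buffers can be released
unconditionally there. The same pattern in isolation, as a sketch:

#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

static void *
read_block(int fd, size_t nbytes, off_t off)
{
	void *p = malloc(nbytes);

	if (p == NULL || pread(fd, p, nbytes, off) != (ssize_t)nbytes) {
		free(p);	/* safe even when malloc() failed: p is NULL */
		return (NULL);
	}
	return (p);
}
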
Index: src/external/cddl/osnet/dist/lib/libdtrace/arm/dt_isadep.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/arm/dt_isadep.c,v
retrieving revision 1.3
diff -u -p -r1.3 dt_isadep.c
--- src/external/cddl/osnet/dist/lib/libdtrace/arm/dt_isadep.c	21 Feb 2015 15:13:20 -0000	1.3
+++ src/external/cddl/osnet/dist/lib/libdtrace/arm/dt_isadep.c	8 May 2017 12:44:43 -0000
@@ -22,6 +22,9 @@
 /*
  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2014 Howard Su
+ * Copyright 2015 George V. Neville-Neil
+ *
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -36,6 +39,10 @@
 #include <dt_impl.h>
 #include <dt_pid.h>
 
+#ifndef illumos
+#include <libproc_compat.h>
+#endif
+
 #define	OP(x)		((x) >> 30)
 #define	OP2(x)		(((x) >> 22) & 0x07)
 #define	COND(x)		(((x) >> 25) & 0x0f)
@@ -75,6 +82,8 @@ dt_pid_create_return_probe(struct ps_pro
 {
 
 	uint32_t *text;
+	int i;
+	int srdepth = 0;
 
 	dt_dprintf("%s: unimplemented\n", __func__);
 	return (DT_PROC_ERR);
@@ -83,13 +92,12 @@ dt_pid_create_return_probe(struct ps_pro
 		dt_dprintf("mr sparkle: malloc() failed\n");
 		return (DT_PROC_ERR);
 	}
-#ifdef DOODAD
+
 	if (Pread(P, text, symp->st_size, symp->st_value) != symp->st_size) {
 		dt_dprintf("mr sparkle: Pread() failed\n");
 		free(text);
 		return (DT_PROC_ERR);
 	}
-#endif
 
 	/*
 	 * Leave a dummy instruction in the last slot to simplify edge
@@ -179,4 +187,3 @@ dt_pid_create_glob_offset_probes(struct 
 
 	return (ftp->ftps_noffs);
 }
-
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/drti.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/drti.c,v
retrieving revision 1.7
diff -u -p -r1.7 drti.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/drti.c	1 Mar 2016 21:09:17 -0000	1.7
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/drti.c	12 Apr 2017 22:52:42 -0000
@@ -32,9 +32,7 @@
 #include <sys/ioctl.h>
 
 #include <stdarg.h>
-#define dprintf __hide_dprintf
 #include <stdio.h>
-#undef dprintf
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
@@ -68,7 +66,7 @@ extern dof_hdr_t __SUNW_dof;	/* DOF defi
 static boolean_t dof_init_debug = B_FALSE;	/* From DTRACE_DOF_INIT_DEBUG */
 
 static void __printflike(2,3)
-dprintf(int debug, const char *fmt, ...)
+dbg_printf(int debug, const char *fmt, ...)
 {
 	va_list ap;
 
@@ -122,13 +120,13 @@ dtrace_dof_init(void)
 		dof_init_debug = B_TRUE;
 
 	if (dlinfo(RTLD_SELF, RTLD_DI_LINKMAP, &lmp) == -1 || lmp == NULL) {
-		dprintf(1, "couldn't discover module name or address\n");
+		dbg_printf(1, "couldn't discover module name or address\n");
 		return;
 	}
 
 #ifdef illumos
 	if (dlinfo(RTLD_SELF, RTLD_DI_LMID, &lmid) == -1) {
-		dprintf(1, "couldn't discover link map ID\n");
+		dbg_printf(1, "couldn't discover link map ID\n");
 		return;
 	}
 #endif
@@ -142,7 +140,7 @@ dtrace_dof_init(void)
 	    dof->dofh_ident[DOF_ID_MAG1] != DOF_MAG_MAG1 ||
 	    dof->dofh_ident[DOF_ID_MAG2] != DOF_MAG_MAG2 ||
 	    dof->dofh_ident[DOF_ID_MAG3] != DOF_MAG_MAG3) {
-		dprintf(0, ".SUNW_dof section corrupt\n");
+		dbg_printf(0, ".SUNW_dof section corrupt\n");
 		return;
 	}
 
@@ -166,7 +164,7 @@ dtrace_dof_init(void)
 		devnamep = p;
 
 	if ((fd = open64(devnamep, O_RDWR)) < 0) {
-		dprintf(1, "failed to open helper device %s", devnamep);
+		dbg_printf(1, "failed to open helper device %s", devnamep);
 #ifdef illumos
 		/*
 		 * If the device path wasn't explicitly set, try again with
@@ -178,7 +176,7 @@ dtrace_dof_init(void)
 		devnamep = olddevname;
 
 		if ((fd = open64(devnamep, O_RDWR)) < 0) {
-			dprintf(1, "failed to open helper device %s", devnamep);
+			dbg_printf(1, "failed to open helper device %s", devnamep);
 			return;
 		}
 #else
@@ -186,9 +184,9 @@ dtrace_dof_init(void)
 #endif
 	}
 	if ((gen = ioctl(fd, DTRACEHIOC_ADDDOF, &dh)) == -1)
-		dprintf(1, "DTrace ioctl failed for DOF at %p", dof);
+		dbg_printf(1, "DTrace ioctl failed for DOF at %p", dof);
 	else {
-		dprintf(1, "DTrace ioctl succeeded for DOF at %p\n", dof);
+		dbg_printf(1, "DTrace ioctl succeeded for DOF at %p\n", dof);
 #if defined(__FreeBSD__) || defined(__NetBSD__)
 		gen = dh.dofhp_gen;
 #endif
@@ -209,14 +207,14 @@ dtrace_dof_fini(void)
 	int fd;
 
 	if ((fd = open64(devnamep, O_RDWR)) < 0) {
-		dprintf(1, "failed to open helper device %s", devnamep);
+		dbg_printf(1, "failed to open helper device %s", devnamep);
 		return;
 	}
 
 	if ((gen = ioctl(fd, DTRACEHIOC_REMOVE, &gen)) == -1)
-		dprintf(1, "DTrace ioctl failed to remove DOF (%d)\n", gen);
+		dbg_printf(1, "DTrace ioctl failed to remove DOF (%d)\n", gen);
 	else
-		dprintf(1, "DTrace ioctl removed DOF (%d)\n", gen);
+		dbg_printf(1, "DTrace ioctl removed DOF (%d)\n", gen);
 
 	(void) close(fd);
 }
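
The dprintf() to dbg_printf() rename is needed because <stdio.h> already
declares POSIX dprintf(3); the removed #define/#undef wrapper existed only
to hide that collision. A stand-in showing the shape of the renamed helper,
with the debug gating mirroring dof_init_debug in spirit (details assumed):

#include <stdarg.h>
#include <stdio.h>

static int debug_enabled;	/* stands in for dof_init_debug */

static void
dbg_printf(int debug, const char *fmt, ...)
{
	va_list ap;

	if (debug && !debug_enabled)
		return;		/* debug-level messages are opt-in */
	va_start(ap, fmt);
	(void) vfprintf(stderr, fmt, ap);
	va_end(ap);
}
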
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_aggregate.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_aggregate.c,v
retrieving revision 1.6
diff -u -p -r1.6 dt_aggregate.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_aggregate.c	24 Sep 2015 14:25:29 -0000	1.6
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_aggregate.c	20 Apr 2017 14:50:58 -0000
@@ -39,10 +39,8 @@
 #include <alloca.h>
 #else
 #include <sys/sysctl.h>
-#if defined(__FreeBSD__) || defined(__NetBSD__)
 #include <libproc_compat.h>
 #endif
-#endif
 #include <limits.h>
 
 #define	DTRACE_AHASHSIZE	32779		/* big 'ol prime */
@@ -188,7 +186,7 @@ dt_aggregate_lquantizedcmp(int64_t *lhs,
 {
 	long double lsum = dt_aggregate_lquantizedsum(lhs);
 	long double rsum = dt_aggregate_lquantizedsum(rhs);
-	int64_t lzero, rzero;
+	int64_t lzero = 0, rzero = 0;
 
 	if (lsum < rsum)
 		return (DT_LESSTHAN);
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_as.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_as.c,v
retrieving revision 1.4
diff -u -p -r1.4 dt_as.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_as.c	24 Sep 2015 14:25:29 -0000	1.4
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_as.c	12 Apr 2017 22:55:13 -0000
@@ -430,12 +430,9 @@ dt_as(dt_pcb_t *pcb)
 			if ((idp = dip->di_extern) == NULL)
 				continue; /* no relocation entry needed */
 
-/*###431 [cc] error: 'kbits' may be used uninitialized in this function [-Werror=maybe-uninitialized]%%%*/
 			if ((idp->di_flags & kmask) == kbits) {
 				nodef = knodef;
 				rp = krp++;
-/*###434 [cc] error: 'ubits' may be used uninitialized in this function [-Werror=maybe-uninitialized]%%%*/
-/*###434 [cc] error: 'umask' may be used uninitialized in this function [-Werror=maybe-uninitialized]%%%*/
 			} else if ((idp->di_flags & umask) == ubits) {
 				nodef = unodef;
 				rp = urp++;
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cc.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cc.c,v
retrieving revision 1.4
diff -u -p -r1.4 dt_cc.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cc.c	24 Sep 2015 14:25:29 -0000	1.4
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cc.c	12 Apr 2017 23:04:59 -0000
@@ -21,8 +21,9 @@
 
 /*
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent Inc. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2015 Gary Mills
  */
 
 /*
@@ -119,7 +120,6 @@ static const dtrace_diftype_t dt_int_rty
 static void *dt_compile(dtrace_hdl_t *, int, dtrace_probespec_t, void *,
     uint_t, int, char *const[], FILE *, const char *);
 
-
 /*ARGSUSED*/
 static int
 dt_idreset(dt_idhash_t *dhp, dt_ident_t *idp, void *ignored)
@@ -1058,46 +1058,6 @@ dt_action_printm(dtrace_hdl_t *dtp, dt_n
 }
 
 static void
-dt_action_printt(dtrace_hdl_t *dtp, dt_node_t *dnp, dtrace_stmtdesc_t *sdp)
-{
-	dtrace_actdesc_t *ap = dt_stmt_action(dtp, sdp);
-
-	dt_node_t *size = dnp->dn_args;
-	dt_node_t *addr = dnp->dn_args->dn_list;
-
-	char n[DT_TYPE_NAMELEN];
-
-	if (dt_node_is_posconst(size) == 0) {
-		dnerror(size, D_PRINTT_SIZE, "printt( ) argument #1 must "
-		    "be a non-zero positive integral constant expression\n");
-	}
-
-	if (addr == NULL || addr->dn_kind != DT_NODE_FUNC ||
-	    addr->dn_ident != dt_idhash_lookup(dtp->dt_globals, "typeref")) {
-		dnerror(addr, D_PRINTT_ADDR,
-		    "printt( ) argument #2 is incompatible with "
-		    "prototype:\n\tprototype: typeref()\n"
-		    "\t argument: %s\n",
-		    dt_node_type_name(addr, n, sizeof (n)));
-	}
-
-	dt_cg(yypcb, addr);
-	ap->dtad_difo = dt_as(yypcb);
-	ap->dtad_kind = DTRACEACT_PRINTT;
-
-	ap->dtad_difo->dtdo_rtype.dtdt_flags |= DIF_TF_BYREF;
-
-	/*
-	 * Allow additional buffer space for the data size, type size,
-	 * type string length and a stab in the dark (32 bytes) for the
-	 * type string. The type string is part of the typeref() that
-	 * this action references.
-	 */
-	ap->dtad_difo->dtdo_rtype.dtdt_size = size->dn_value + 3 * sizeof(uintptr_t) + 32;
-
-}
-
-static void
 dt_action_commit(dtrace_hdl_t *dtp, dt_node_t *dnp, dtrace_stmtdesc_t *sdp)
 {
 	dtrace_actdesc_t *ap = dt_stmt_action(dtp, sdp);
@@ -1169,9 +1129,6 @@ dt_compile_fun(dtrace_hdl_t *dtp, dt_nod
 	case DT_ACT_PRINTM:
 		dt_action_printm(dtp, dnp->dn_expr, sdp);
 		break;
-	case DT_ACT_PRINTT:
-		dt_action_printt(dtp, dnp->dn_expr, sdp);
-		break;
 	case DT_ACT_RAISE:
 		dt_action_raise(dtp, dnp->dn_expr, sdp);
 		break;
@@ -2435,7 +2392,7 @@ dt_compile(dtrace_hdl_t *dtp, int contex
 	dt_node_t *dnp;
 	dt_decl_t *ddp;
 	dt_pcb_t pcb;
-	void *rv = NULL;	// XXX: gcc
+	void *volatile rv;
 	int err;
 
 	if ((fp == NULL && s == NULL) || (cflags & ~DTRACE_C_MASK) != 0) {
@@ -2518,6 +2475,28 @@ dt_compile(dtrace_hdl_t *dtp, int contex
 	}
 
 	/*
+	 * Perform sugar transformations (for "if" / "else") and replace the
+	 * existing clause chain with the new one.
+	 */
+	if (context == DT_CTX_DPROG) {
+		dt_node_t *dnp, *next_dnp;
+		dt_node_t *new_list = NULL;
+
+		for (dnp = yypcb->pcb_root->dn_list;
+		    dnp != NULL; dnp = next_dnp) {
+			/* remove this node from the list */
+			next_dnp = dnp->dn_list;
+			dnp->dn_list = NULL;
+
+			if (dnp->dn_kind == DT_NODE_CLAUSE)
+				dnp = dt_compile_sugar(dtp, dnp);
+			/* append node to the new list */
+			new_list = dt_node_link(new_list, dnp);
+		}
+		yypcb->pcb_root->dn_list = new_list;
+	}
+
+	/*
 	 * If we have successfully created a parse tree for a D program, loop
 	 * over the clauses and actions and instantiate the corresponding
 	 * libdtrace program.  If we are parsing a D expression, then we
@@ -2537,6 +2516,8 @@ dt_compile(dtrace_hdl_t *dtp, int contex
 		for (; dnp != NULL; dnp = dnp->dn_list) {
 			switch (dnp->dn_kind) {
 			case DT_NODE_CLAUSE:
+				if (DT_TREEDUMP_PASS(dtp, 4))
+					dt_printd(dnp, stderr, 0);
 				dt_compile_clause(dtp, dnp);
 				break;
 			case DT_NODE_XLATOR:
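
The sugar pass added above rewrites the clause chain by detaching each
node, letting dt_compile_sugar() replace it, and relinking the result into
a fresh list. The same relink idiom on a generic singly linked list, as a
sketch; transform is assumed to return a non-NULL chain, as
dt_compile_sugar does:

#include <stddef.h>

struct node {
	struct node *next;
};

static struct node *
relink(struct node *head, struct node *(*transform)(struct node *))
{
	struct node *n, *next, *new_head = NULL, *tail = NULL;

	for (n = head; n != NULL; n = next) {
		next = n->next;
		n->next = NULL;			/* detach before transforming */
		n = transform(n);		/* may return a longer chain */
		if (new_head == NULL)
			new_head = n;
		else
			tail->next = n;
		for (tail = n; tail->next != NULL; tail = tail->next)
			continue;		/* advance to end of chain */
	}
	return (new_head);
}
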
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cg.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cg.c,v
retrieving revision 1.5
diff -u -p -r1.5 dt_cg.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cg.c	24 Sep 2015 14:25:29 -0000	1.5
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cg.c	12 Apr 2017 23:13:06 -0000
@@ -446,7 +446,6 @@ dt_cg_store(dt_node_t *src, dt_irlist_t 
 			instr = DIF_INSTR_STORE(DIF_OP_STX, reg, dst->dn_reg);
 			break;
 		default:
-			instr = 0;
 			xyerror(D_UNKNOWN, "internal error -- cg cannot store "
 			    "size %lu when passed by value\n", (ulong_t)size);
 		}
@@ -1354,40 +1353,6 @@ dt_cg_inline(dt_node_t *dnp, dt_irlist_t
 	}
 }
 
-static void
-dt_cg_func_typeref(dtrace_hdl_t *dtp, dt_node_t *dnp)
-{
-	dtrace_typeinfo_t dtt;
-	dt_node_t *addr = dnp->dn_args;
-	dt_node_t *nelm = addr->dn_list;
-	dt_node_t *strp = nelm->dn_list;
-	dt_node_t *typs = strp->dn_list;
-	char buf[DT_TYPE_NAMELEN];
-	char *p;
-
-	ctf_type_name(addr->dn_ctfp, addr->dn_type, buf, sizeof (buf));
-
-	/*
-	 * XXX Hack alert! XXX
-	 * The prototype has two dummy args that we munge to represent
-	 * the type string and the type size.
-	 *
-	 * Yes, I hear your grumble, but it works for now. We'll come
-	 * up with a more elegant implementation later. :-)
-	 */
-	free(strp->dn_string);
-
-	if ((p = strchr(buf, '*')) != NULL)
-		*p = '\0';
-
-	strp->dn_string = strdup(buf);
-
-	if (dtrace_lookup_by_type(dtp,  DTRACE_OBJ_EVERY, buf, &dtt) < 0)
-		return;
-
-	typs->dn_value = ctf_type_size(dtt.dtt_ctfp, dtt.dtt_type);
-}
-
 typedef struct dt_xlmemb {
 	dt_ident_t *dtxl_idp;		/* translated ident */
 	dt_irlist_t *dtxl_dlp;		/* instruction list */
@@ -2003,8 +1968,6 @@ dt_cg_node(dt_node_t *dnp, dt_irlist_t *
 
 		switch (dnp->dn_kind) {
 		case DT_NODE_FUNC: {
-			dtrace_hdl_t *dtp = yypcb->pcb_hdl;
-
 			if ((idp = dnp->dn_ident)->di_kind != DT_IDENT_FUNC) {
 				dnerror(dnp, D_CG_EXPR, "%s %s( ) may not be "
 				    "called from a D expression (D program "
@@ -2012,15 +1975,6 @@ dt_cg_node(dt_node_t *dnp, dt_irlist_t *
 				    dt_idkind_name(idp->di_kind), idp->di_name);
 			}
 
-			switch (idp->di_id) {
-			case DIF_SUBR_TYPEREF:
-				dt_cg_func_typeref(dtp, dnp);
-				break;
-
-			default:
-				break;
-			}
-
 			dt_cg_arglist(dnp->dn_ident, dnp->dn_args, dlp, drp);
 
 			dnp->dn_reg = dt_regset_alloc(drp);
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c,v
retrieving revision 1.9
diff -u -p -r1.9 dt_consume.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c	29 Sep 2015 14:31:22 -0000	1.9
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c	29 Apr 2017 03:36:39 -0000
@@ -1297,7 +1297,7 @@ dt_print_stack(dtrace_hdl_t *dtp, FILE *
 			if (pc > sym.st_value) {
 				(void) snprintf(c, sizeof (c), "%s`%s+0x%llx",
 				    dts.dts_object, dts.dts_name,
-				    (unsigned long long)(pc - sym.st_value));
+				    (u_longlong_t)(pc - sym.st_value));
 			} else {
 				(void) snprintf(c, sizeof (c), "%s`%s",
 				    dts.dts_object, dts.dts_name);
@@ -1310,10 +1310,10 @@ dt_print_stack(dtrace_hdl_t *dtp, FILE *
 			 */
 			if (dtrace_lookup_by_addr(dtp, pc, NULL, &dts) == 0) {
 				(void) snprintf(c, sizeof (c), "%s`0x%llx",
-				    dts.dts_object, (unsigned long long)pc);
+				    dts.dts_object, (u_longlong_t)pc);
 			} else {
 				(void) snprintf(c, sizeof (c), "0x%llx",
-				    (unsigned long long)pc);
+				    (u_longlong_t)pc);
 			}
 		}
 
@@ -1387,7 +1387,7 @@ dt_print_ustack(dtrace_hdl_t *dtp, FILE 
 			if (pc[i] > sym.st_value) {
 				(void) snprintf(c, sizeof (c),
 				    "%s`%s+0x%llx", dt_basename(objname), name,
-				    (unsigned long long)(pc[i] - sym.st_value));
+				    (u_longlong_t)(pc[i] - sym.st_value));
 			} else {
 				(void) snprintf(c, sizeof (c),
 				    "%s`%s", dt_basename(objname), name);
@@ -1413,10 +1413,10 @@ dt_print_ustack(dtrace_hdl_t *dtp, FILE 
 			if (P != NULL && Pobjname(P, pc[i], objname,
 			    sizeof (objname)) != 0) {
 				(void) snprintf(c, sizeof (c), "%s`0x%llx",
-				    dt_basename(objname), (unsigned long long)pc[i]);
+				    dt_basename(objname), (u_longlong_t)pc[i]);
 			} else {
 				(void) snprintf(c, sizeof (c), "0x%llx",
-				    (unsigned long long)pc[i]);
+				    (u_longlong_t)pc[i]);
 			}
 		}
 
@@ -1526,7 +1526,7 @@ dt_print_umod(dtrace_hdl_t *dtp, FILE *f
 	if (P != NULL && Pobjname(P, pc, objname, sizeof (objname)) != 0) {
 		(void) snprintf(c, sizeof (c), "%s", dt_basename(objname));
 	} else {
-		(void) snprintf(c, sizeof (c), "0x%llx", (unsigned long long)pc);
+		(void) snprintf(c, sizeof (c), "0x%llx", (u_longlong_t)pc);
 	}
 
 	err = dt_printf(dtp, fp, format, c);
@@ -1540,312 +1540,6 @@ dt_print_umod(dtrace_hdl_t *dtp, FILE *f
 }
 
 static int
-dt_print_memory(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr)
-{
-	int quiet = (dtp->dt_options[DTRACEOPT_QUIET] != DTRACEOPT_UNSET);
-	size_t nbytes = *((uintptr_t *) addr);
-
-	return (dt_print_bytes(dtp, fp, addr + sizeof(uintptr_t),
-	    nbytes, 50, quiet, 1));
-}
-
-typedef struct dt_type_cbdata {
-	dtrace_hdl_t		*dtp;
-	dtrace_typeinfo_t	dtt;
-	caddr_t			addr;
-	caddr_t			addrend;
-	const char		*name;
-	int			f_type;
-	int			indent;
-	int			type_width;
-	int			name_width;
-	FILE			*fp;
-} dt_type_cbdata_t;
-
-static int	dt_print_type_data(dt_type_cbdata_t *, ctf_id_t);
-
-static int
-dt_print_type_member(const char *name, ctf_id_t type, ulong_t off, void *arg)
-{
-	dt_type_cbdata_t cbdata;
-	dt_type_cbdata_t *cbdatap = arg;
-	ssize_t ssz;
-
-	if ((ssz = ctf_type_size(cbdatap->dtt.dtt_ctfp, type)) <= 0)
-		return (0);
-
-	off /= 8;
-
-	cbdata = *cbdatap;
-	cbdata.name = name;
-	cbdata.addr += off;
-	cbdata.addrend = cbdata.addr + ssz;
-
-	return (dt_print_type_data(&cbdata, type));
-}
-
-static int
-dt_print_type_width(const char *name, ctf_id_t type, ulong_t off, void *arg)
-{
-	char buf[DT_TYPE_NAMELEN];
-	char *p;
-	dt_type_cbdata_t *cbdatap = arg;
-	size_t sz = strlen(name);
-
-	ctf_type_name(cbdatap->dtt.dtt_ctfp, type, buf, sizeof (buf));
-
-	if ((p = strchr(buf, '[')) != NULL)
-		p[-1] = '\0';
-	else
-		p = __UNCONST("");
-
-	sz += strlen(p);
-
-	if (sz > cbdatap->name_width)
-		cbdatap->name_width = sz;
-
-	sz = strlen(buf);
-
-	if (sz > cbdatap->type_width)
-		cbdatap->type_width = sz;
-
-	return (0);
-}
-
-static int
-dt_print_type_data(dt_type_cbdata_t *cbdatap, ctf_id_t type)
-{
-	caddr_t addr = cbdatap->addr;
-	caddr_t addrend = cbdatap->addrend;
-	char buf[DT_TYPE_NAMELEN];
-	char *p;
-	int cnt = 0;
-	uint_t kind = ctf_type_kind(cbdatap->dtt.dtt_ctfp, type);
-	ssize_t ssz = ctf_type_size(cbdatap->dtt.dtt_ctfp, type);
-
-	ctf_type_name(cbdatap->dtt.dtt_ctfp, type, buf, sizeof (buf));
-
-	if ((p = strchr(buf, '[')) != NULL)
-		p[-1] = '\0';
-	else
-		p = __UNCONST("");
-
-	if (cbdatap->f_type) {
-		int type_width = roundup(cbdatap->type_width + 1, 4);
-		int name_width = roundup(cbdatap->name_width + 1, 4);
-
-		name_width -= strlen(cbdatap->name);
-
-		dt_printf(cbdatap->dtp, cbdatap->fp, "%*s%-*s%s%-*s	= ",cbdatap->indent * 4,"",type_width,buf,cbdatap->name,name_width,p);
-	}
-
-	while (addr < addrend) {
-		dt_type_cbdata_t cbdata;
-		ctf_arinfo_t arinfo;
-		ctf_encoding_t cte;
-		void *vp = addr;
-		cbdata = *cbdatap;
-		cbdata.name = "";
-		cbdata.addr = addr;
-		cbdata.addrend = addr + ssz;
-		cbdata.f_type = 0;
-		cbdata.indent++;
-		cbdata.type_width = 0;
-		cbdata.name_width = 0;
-
-		if (cnt > 0)
-			dt_printf(cbdatap->dtp, cbdatap->fp, "%*s", cbdatap->indent * 4,"");
-
-		switch (kind) {
-		case CTF_K_INTEGER:
-			if (ctf_type_encoding(cbdatap->dtt.dtt_ctfp, type, &cte) != 0)
-				return (-1);
-			if ((cte.cte_format & CTF_INT_SIGNED) != 0)
-				switch (cte.cte_bits) {
-				case 8:
-					if (isprint(*((unsigned char *) vp)))
-						dt_printf(cbdatap->dtp, cbdatap->fp, "'%c', ", *((char *) vp));
-					dt_printf(cbdatap->dtp, cbdatap->fp, "%d (0x%x);\n", *((char *) vp), *((char *) vp));
-					break;
-				case 16:
-					dt_printf(cbdatap->dtp, cbdatap->fp, "%hd (0x%hx);\n", *((short *) vp), *((u_short *) vp));
-					break;
-				case 32:
-					dt_printf(cbdatap->dtp, cbdatap->fp, "%d (0x%x);\n", *((int *) vp), *((u_int *) vp));
-					break;
-				case 64:
-					dt_printf(cbdatap->dtp, cbdatap->fp, "%jd (0x%jx);\n", *((long long *) vp), *((unsigned long long *) vp));
-					break;
-				default:
-					dt_printf(cbdatap->dtp, cbdatap->fp, "CTF_K_INTEGER: format %x offset %u bits %u\n",cte.cte_format,cte.cte_offset,cte.cte_bits);
-					break;
-				}
-			else
-				switch (cte.cte_bits) {
-				case 8:
-					dt_printf(cbdatap->dtp, cbdatap->fp, "%u (0x%x);\n", *((uint8_t *) vp) & 0xff, *((uint8_t *) vp) & 0xff);
-					break;
-				case 16:
-					dt_printf(cbdatap->dtp, cbdatap->fp, "%hu (0x%hx);\n", *((u_short *) vp), *((u_short *) vp));
-					break;
-				case 32:
-					dt_printf(cbdatap->dtp, cbdatap->fp, "%u (0x%x);\n", *((u_int *) vp), *((u_int *) vp));
-					break;
-				case 64:
-					dt_printf(cbdatap->dtp, cbdatap->fp, "%ju (0x%jx);\n", *((unsigned long long *) vp), *((unsigned long long *) vp));
-					break;
-				default:
-					dt_printf(cbdatap->dtp, cbdatap->fp, "CTF_K_INTEGER: format %x offset %u bits %u\n",cte.cte_format,cte.cte_offset,cte.cte_bits);
-					break;
-				}
-			break;
-		case CTF_K_FLOAT:
-			dt_printf(cbdatap->dtp, cbdatap->fp, "CTF_K_FLOAT: format %x offset %u bits %u\n",cte.cte_format,cte.cte_offset,cte.cte_bits);
-			break;
-		case CTF_K_POINTER:
-			dt_printf(cbdatap->dtp, cbdatap->fp, "%p;\n", *((void **) addr));
-			break;
-		case CTF_K_ARRAY:
-			if (ctf_array_info(cbdatap->dtt.dtt_ctfp, type, &arinfo) != 0)
-				return (-1);
-			dt_printf(cbdatap->dtp, cbdatap->fp, "{\n%*s",cbdata.indent * 4,"");
-			dt_print_type_data(&cbdata, arinfo.ctr_contents);
-			dt_printf(cbdatap->dtp, cbdatap->fp, "%*s};\n",cbdatap->indent * 4,"");
-			break;
-		case CTF_K_FUNCTION:
-			dt_printf(cbdatap->dtp, cbdatap->fp, "CTF_K_FUNCTION:\n");
-			break;
-		case CTF_K_STRUCT:
-			cbdata.f_type = 1;
-			if (ctf_member_iter(cbdatap->dtt.dtt_ctfp, type,
-			    dt_print_type_width, &cbdata) != 0)
-				return (-1);
-			dt_printf(cbdatap->dtp, cbdatap->fp, "{\n");
-			if (ctf_member_iter(cbdatap->dtt.dtt_ctfp, type,
-			    dt_print_type_member, &cbdata) != 0)
-				return (-1);
-			dt_printf(cbdatap->dtp, cbdatap->fp, "%*s};\n",cbdatap->indent * 4,"");
-			break;
-		case CTF_K_UNION:
-			cbdata.f_type = 1;
-			if (ctf_member_iter(cbdatap->dtt.dtt_ctfp, type,
-			    dt_print_type_width, &cbdata) != 0)
-				return (-1);
-			dt_printf(cbdatap->dtp, cbdatap->fp, "{\n");
-			if (ctf_member_iter(cbdatap->dtt.dtt_ctfp, type,
-			    dt_print_type_member, &cbdata) != 0)
-				return (-1);
-			dt_printf(cbdatap->dtp, cbdatap->fp, "%*s};\n",cbdatap->indent * 4,"");
-			break;
-		case CTF_K_ENUM:
-			dt_printf(cbdatap->dtp, cbdatap->fp, "%s;\n", ctf_enum_name(cbdatap->dtt.dtt_ctfp, type, *((int *) vp)));
-			break;
-		case CTF_K_TYPEDEF:
-			dt_print_type_data(&cbdata, ctf_type_reference(cbdatap->dtt.dtt_ctfp,type));
-			break;
-		case CTF_K_VOLATILE:
-			if (cbdatap->f_type)
-				dt_printf(cbdatap->dtp, cbdatap->fp, "volatile ");
-			dt_print_type_data(&cbdata, ctf_type_reference(cbdatap->dtt.dtt_ctfp,type));
-			break;
-		case CTF_K_CONST:
-			if (cbdatap->f_type)
-				dt_printf(cbdatap->dtp, cbdatap->fp, "const ");
-			dt_print_type_data(&cbdata, ctf_type_reference(cbdatap->dtt.dtt_ctfp,type));
-			break;
-		case CTF_K_RESTRICT:
-			if (cbdatap->f_type)
-				dt_printf(cbdatap->dtp, cbdatap->fp, "restrict ");
-			dt_print_type_data(&cbdata, ctf_type_reference(cbdatap->dtt.dtt_ctfp,type));
-			break;
-		default:
-			break;
-		}
-
-		addr += ssz;
-		cnt++;
-	}
-
-	return (0);
-}
-
-static int
-dt_print_type(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr)
-{
-	char *p;
-	dtrace_typeinfo_t dtt;
-	dt_type_cbdata_t cbdata;
-	int num = 0;
-	int quiet = (dtp->dt_options[DTRACEOPT_QUIET] != DTRACEOPT_UNSET);
-	ssize_t ssz;
-
-	if (!quiet)
-		dt_printf(dtp, fp, "\n");
-
-	/* Get the total number of bytes of data buffered. */
-	size_t nbytes = *((uintptr_t *) addr);
-	addr += sizeof(uintptr_t);
-
-	/*
-	 * Get the size of the type so that we can check that it matches
-	 * the CTF data we look up and so that we can figure out how many
-	 * type elements are buffered.
-	 */
-	size_t typs = *((uintptr_t *) addr);
-	addr += sizeof(uintptr_t);
-
-	/*
-	 * Point to the type string in the buffer. Get it's string
-	 * length and round it up to become the offset to the start
-	 * of the buffered type data which we would like to be aligned
-	 * for easy access.
-	 */
-	char *strp = (char *) addr;
-	int offset = roundup(strlen(strp) + 1, sizeof(uintptr_t));
-
-	/*
-	 * The type string might have a format such as 'int [20]'.
-	 * Check if there is an array dimension present.
-	 */
-	if ((p = strchr(strp, '[')) != NULL) {
-		/* Strip off the array dimension. */
-		*p++ = '\0';
-
-		for (; *p != '\0' && *p != ']'; p++)
-			num = num * 10 + *p - '0';
-	} else
-		/* No array dimension, so default. */
-		num = 1;
-
-	/* Lookup the CTF type from the type string. */
-	if (dtrace_lookup_by_type(dtp,  DTRACE_OBJ_EVERY, strp, &dtt) < 0)
-		return (-1);
-
-	/* Offset the buffer address to the start of the data... */
-	addr += offset;
-
-	ssz = ctf_type_size(dtt.dtt_ctfp, dtt.dtt_type);
-
-	if (typs != ssz) {
-		printf("Expected type size from buffer (%lu) to match type size looked up now (%ld)\n", (u_long) typs, (long) ssz);
-		return (-1);
-	}
-
-	cbdata.dtp = dtp;
-	cbdata.dtt = dtt;
-	cbdata.name = "";
-	cbdata.addr = addr;
-	cbdata.addrend = addr + nbytes;
-	cbdata.indent = 1;
-	cbdata.f_type = 1;
-	cbdata.type_width = 0;
-	cbdata.name_width = 0;
-	cbdata.fp = fp;
-
-	return (dt_print_type_data(&cbdata, dtt.dtt_type));
-}
-
-static int
 dt_print_sym(dtrace_hdl_t *dtp, FILE *fp, const char *format, caddr_t addr)
 {
 	/* LINTED - alignment */
@@ -1868,10 +1562,10 @@ dt_print_sym(dtrace_hdl_t *dtp, FILE *fp
 		 */
 		if (dtrace_lookup_by_addr(dtp, pc, NULL, &dts) == 0) {
 			(void) snprintf(c, sizeof (c), "%s`0x%llx",
-			    dts.dts_object, (unsigned long long)pc);
+			    dts.dts_object, (u_longlong_t)pc);
 		} else {
 			(void) snprintf(c, sizeof (c), "0x%llx",
-			    (unsigned long long)pc);
+			    (u_longlong_t)pc);
 		}
 	}
 
@@ -1895,7 +1589,7 @@ dt_print_mod(dtrace_hdl_t *dtp, FILE *fp
 	if (dtrace_lookup_by_addr(dtp, pc, NULL, &dts) == 0) {
 		(void) snprintf(c, sizeof (c), "%s", dts.dts_object);
 	} else {
-		(void) snprintf(c, sizeof (c), "0x%llx", (unsigned long long)pc);
+		(void) snprintf(c, sizeof (c), "0x%llx", (u_longlong_t)pc);
 	}
 
 	if (dt_printf(dtp, fp, format, c) < 0)
@@ -1904,6 +1598,16 @@ dt_print_mod(dtrace_hdl_t *dtp, FILE *fp
 	return (0);
 }
 
+static int
+dt_print_memory(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr)
+{
+	int quiet = (dtp->dt_options[DTRACEOPT_QUIET] != DTRACEOPT_UNSET);
+	size_t nbytes = *((uintptr_t *) addr);
+
+	return (dt_print_bytes(dtp, fp, addr + sizeof(uintptr_t),
+	    nbytes, 50, quiet, 1));
+}
+
 typedef struct dt_normal {
 	dtrace_aggvarid_t dtnd_id;
 	uint64_t dtnd_normal;
@@ -2644,12 +2348,6 @@ dt_consume_cpu(dtrace_hdl_t *dtp, FILE *
 				goto nextrec;
 			}
 
-			if (act == DTRACEACT_PRINTT) {
-				if (dt_print_type(dtp, fp, addr) < 0)
-					return (-1);
-				goto nextrec;
-			}
-
 			if (DTRACEACT_ISPRINTFLIKE(act)) {
 				void *fmtdata;
 				int (*func)(dtrace_hdl_t *, FILE *, void *,
@@ -2674,8 +2372,6 @@ dt_consume_cpu(dtrace_hdl_t *dtp, FILE *
 				case DTRACEACT_FREOPEN:
 					func = dtrace_freopen;
 					break;
-				default:
-					return (dt_set_errno(dtp, EDT_BADAGG));
 				}
 
 				n = (*func)(dtp, fp, fmtdata, &data,
@@ -3381,7 +3077,7 @@ dtrace_consume(dtrace_hdl_t *dtp, FILE *
 		 * Reduce memory usage by re-allocating smaller buffers
 		 * for the "remnants".
 		 */
-		while ((buf = dt_pq_walk(dtp->dt_bufq, &cookie)) != NULL)
+		while (buf = dt_pq_walk(dtp->dt_bufq, &cookie))
 			dt_realloc_buf(dtp, buf, buf->dtbd_size);
 	}
 
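
Note that dt_print_memory() is only moved by this change (it reappears
after dt_print_mod() below); printt()/dt_print_type() are what is
removed.  The retained function reads a leading uintptr_t byte count
and hands the payload to dt_print_bytes().  A sketch of a consumer for
that record layout, assuming the same framing:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* printm()-style record: uintptr_t byte count, then the bytes. */
	static void
	dump_printm_record(const char *rec)
	{
		uintptr_t nbytes, i;
		const unsigned char *data;

		memcpy(&nbytes, rec, sizeof (nbytes));	/* length word */
		data = (const unsigned char *)rec + sizeof (uintptr_t);
		for (i = 0; i < nbytes; i++)
			printf("%02x%c", data[i], (i % 16 == 15) ? '\n' : ' ');
		printf("\n");
	}
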
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dis.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dis.c,v
retrieving revision 1.4
diff -u -p -r1.4 dt_dis.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dis.c	24 Sep 2015 14:25:29 -0000	1.4
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dis.c	5 May 2017 16:56:53 -0000
@@ -176,7 +176,7 @@ dt_dis_setx(const dtrace_difo_t *dp, con
 
 	if (intptr < dp->dtdo_intlen) {
 		(void) fprintf(fp, "\t\t! 0x%llx",
-		    (unsigned long long)dp->dtdo_inttab[intptr]);
+		    (u_longlong_t)dp->dtdo_inttab[intptr]);
 	}
 }
 
@@ -334,8 +334,8 @@ dt_dis_rtab(const char *rtag, const dtra
 
 	for (; len != 0; len--, rp++) {
 		(void) fprintf(fp, "%-4u %-8llu %-8llu %s\n",
-		    rp->dofr_type, (unsigned long long)rp->dofr_offset,
-		    (unsigned long long)rp->dofr_data,
+		    rp->dofr_type, (u_longlong_t)rp->dofr_offset,
+		    (u_longlong_t)rp->dofr_data,
 		    &dp->dtdo_strtab[rp->dofr_name]);
 	}
 }
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dof.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dof.c,v
retrieving revision 1.4
diff -u -p -r1.4 dt_dof.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dof.c	24 Sep 2015 14:25:29 -0000	1.4
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dof.c	13 Apr 2017 01:18:39 -0000
@@ -462,18 +462,8 @@ dof_add_probe(dt_idhash_t *dhp, dt_ident
 		dt_buf_write(dtp, &ddo->ddo_enoffs, pip->pi_enoffs,
 		    pip->pi_nenoffs * sizeof (uint32_t), sizeof (uint32_t));
 
-		/*
-		 * If pi_rname isn't set, the relocation will be against the
-		 * function name. If it is, the relocation will be against
-		 * pi_rname. This will be used if the function is scoped
-		 * locally so an alternate symbol is added for the purpose
-		 * of this relocation.
-		 */
-		if (pip->pi_rname == NULL)
-			dofr.dofr_name = dofpr.dofpr_func;
-		else
-			dofr.dofr_name = dof_add_string(ddo, pip->pi_rname);
-		dofr.dofr_type = DOF_RELO_SETX;
+		dofr.dofr_name = dof_add_string(ddo, pip->pi_rname);
+		dofr.dofr_type = DOF_RELO_DOFREL;
 		dofr.dofr_offset = dt_buf_len(&ddo->ddo_probes);
 		dofr.dofr_data = 0;
 
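
The switch from DOF_RELO_SETX to DOF_RELO_DOFREL (paired with the
R_386_PC32/R_X86_64_PC64 changes in dt_link.c below) turns the
.SUNW_dof fixup from an absolute load-time relocation into a
PC-relative one that the static linker resolves.  What such a
relocation stores is the standard ELF "S + A - P"; a worked sketch:

	#include <stdint.h>

	/*
	 * S + A - P: the static linker stores the distance from the
	 * relocated place to the symbol, so the runtime recovers the
	 * symbol address as "place + stored value" with no load-time
	 * fixup of the DOF section.
	 */
	static int64_t
	resolve_pcrel(uint64_t sym, int64_t addend, uint64_t place)
	{
		return ((int64_t)(sym + addend - place));
	}
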
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_errtags.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_errtags.h,v
retrieving revision 1.3
diff -u -p -r1.3 dt_errtags.h
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_errtags.h	24 Sep 2015 14:25:29 -0000	1.3
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_errtags.h	13 Apr 2017 01:19:06 -0000
@@ -265,8 +265,6 @@ typedef enum {
 	D_NOREG,			/* no available internal registers */
 	D_PRINTM_ADDR,			/* printm() memref bad type */
 	D_PRINTM_SIZE,			/* printm() size bad type */
-	D_PRINTT_ADDR,			/* printt() typeref bad type */
-	D_PRINTT_SIZE			/* printt() size bad type */
 } dt_errtag_t;
 
 extern const char *dt_errtag(dt_errtag_t);
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_grammar.y
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_grammar.y,v
retrieving revision 1.3
diff -u -p -r1.3 dt_grammar.y
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_grammar.y	24 Sep 2015 14:25:29 -0000	1.3
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_grammar.y	10 Oct 2016 11:14:31 -0000
@@ -23,8 +23,9 @@
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
@@ -155,6 +156,8 @@
 %type	<l_node>	probe_specifier_list
 %type	<l_node>	probe_specifier
 %type	<l_node>	statement_list
+%type	<l_node>	statement_list_impl
+%type	<l_node>	statement_or_block
 %type	<l_node>	statement
 %type	<l_node>	declaration
 %type	<l_node>	init_declarator_list
@@ -319,9 +322,11 @@ probe_definition:
 				    "or actions following probe description\n");
 			}
 			$$ = dt_node_clause($1, NULL, NULL);
+			yybegin(YYS_CLAUSE);
 		}
 	|	probe_specifiers '{' statement_list '}' {
 			$$ = dt_node_clause($1, NULL, $3);
+			yybegin(YYS_CLAUSE);
 		}
 	|	probe_specifiers DT_TOK_DIV expression DT_TOK_EPRED {
 			dnerror($3, D_SYNTAX, "expected actions { } following "
@@ -330,6 +335,7 @@ probe_definition:
 	|	probe_specifiers DT_TOK_DIV expression DT_TOK_EPRED
 		    '{' statement_list '}' {
 			$$ = dt_node_clause($1, $3, $6);
+			yybegin(YYS_CLAUSE);
 		}
 	;
 
@@ -349,12 +355,30 @@ probe_specifier:
 	|	DT_TOK_INT   { $$ = dt_node_pdesc_by_id($1); }
 	;
 
-statement_list:	statement { $$ = $1; }
-	|	statement_list ';' statement { $$ = LINK($1, $3); }
+statement_list_impl: /* empty */ { $$ = NULL; }
+	|	statement_list_impl statement { $$ = LINK($1, $2); }
+	;
+
+statement_list:
+		statement_list_impl { $$ = $1; }
+	|	statement_list_impl expression {
+			$$ = LINK($1, dt_node_statement($2));
+		}
 	;
 
-statement:	/* empty */ { $$ = NULL; }
-	|	expression { $$ = dt_node_statement($1); }
+statement_or_block:
+		statement
+	|	'{' statement_list '}' { $$ = $2; }
+
+statement:	';' { $$ = NULL; }
+	|	expression ';' { $$ = dt_node_statement($1); }
+	|	DT_KEY_IF DT_TOK_LPAR expression DT_TOK_RPAR statement_or_block {
+			$$ = dt_node_if($3, $5, NULL);
+		}
+	|	DT_KEY_IF DT_TOK_LPAR expression DT_TOK_RPAR
+		statement_or_block DT_KEY_ELSE statement_or_block {
+			$$ = dt_node_if($3, $5, $7);
+		}
 	;
 
 argument_expression_list:
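
With these rules a clause may now contain semicolon-terminated
expression statements and C-style if/else.  For example, this
hypothetical script becomes valid D under the new grammar
(dt_node_if() in dt_parser.c below builds the DT_NODE_IF nodes, and
the desugared program can be inspected with -xtree=8):

	syscall::open:entry
	{
		if (execname == "sh") {
			self->traced = 1;
		} else {
			self->traced = 0;
		}
	}
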
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_handle.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_handle.c,v
retrieving revision 1.5
diff -u -p -r1.5 dt_handle.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_handle.c	24 Sep 2015 14:25:29 -0000	1.5
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_handle.c	29 Apr 2017 03:37:32 -0000
@@ -208,7 +208,7 @@ dt_handle_err(dtrace_hdl_t *dtp, dtrace_
 	case DTRACEFLT_BADALIGN:
 	case DTRACEFLT_BADSTACK:
 		(void) sprintf(details, " (0x%llx)",
-		    (unsigned long long)err.dteda_addr);
+		    (u_longlong_t)err.dteda_addr);
 		break;
 
 	default:
@@ -335,7 +335,7 @@ dt_handle_cpudrop(dtrace_hdl_t *dtp, pro
 	}
 
 	(void) snprintf(s, size, "%llu %sdrop%s on CPU %d\n",
-	    (unsigned long long)howmany,
+	    (u_longlong_t)howmany,
 	    what == DTRACEDROP_PRINCIPAL ? "" : "aggregation ",
 	    howmany > 1 ? "s" : "", (int)cpu);
 
@@ -429,7 +429,7 @@ dt_handle_status(dtrace_hdl_t *dtp, dtra
 		}
 
 		(void) snprintf(s, size, "%llu %s%s%s\n",
-		    (unsigned long long)(nval - oval),
+		    (u_longlong_t)(nval - oval),
 		    _dt_droptab[i].dtdrt_str, (nval - oval > 1) ? "s" : "",
 		    _dt_droptab[i].dtdrt_msg != NULL ?
 		    _dt_droptab[i].dtdrt_msg : "");
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_ident.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_ident.c,v
retrieving revision 1.5
diff -u -p -r1.5 dt_ident.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_ident.c	4 Feb 2016 17:27:32 -0000	1.5
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_ident.c	5 May 2017 16:57:32 -0000
@@ -358,7 +358,7 @@ dt_idcook_args(dt_node_t *dnp, dt_ident_
 
 	if (ap->dn_value >= prp->pr_argc) {
 		xyerror(D_ARGS_IDX, "index %lld is out of range for %s %s[ ]\n",
-		    (long long)ap->dn_value, dtrace_desc2str(yypcb->pcb_pdesc,
+		    (longlong_t)ap->dn_value, dtrace_desc2str(yypcb->pcb_pdesc,
 		    n1, sizeof (n1)), idp->di_name);
 	}
 
@@ -374,12 +374,12 @@ dt_idcook_args(dt_node_t *dnp, dt_ident_
 
 	if (xnp->dn_type == CTF_ERR) {
 		xyerror(D_ARGS_TYPE, "failed to resolve translated type for "
-		    "%s[%lld]\n", idp->di_name, (long long)ap->dn_value);
+		    "%s[%lld]\n", idp->di_name, (longlong_t)ap->dn_value);
 	}
 
 	if (nnp->dn_type == CTF_ERR) {
 		xyerror(D_ARGS_TYPE, "failed to resolve native type for "
-		    "%s[%lld]\n", idp->di_name, (long long)ap->dn_value);
+		    "%s[%lld]\n", idp->di_name, (longlong_t)ap->dn_value);
 	}
 
 	if (dtp->dt_xlatemode == DT_XL_STATIC && (
@@ -428,7 +428,7 @@ dt_idcook_args(dt_node_t *dnp, dt_ident_
 
 	} else {
 		xyerror(D_ARGS_XLATOR, "translator for %s[%lld] from %s to %s "
-		    "is not defined\n", idp->di_name, (long long)ap->dn_value,
+		    "is not defined\n", idp->di_name, (longlong_t)ap->dn_value,
 		    dt_node_type_name(nnp, n1, sizeof (n1)),
 		    dt_node_type_name(xnp, n2, sizeof (n2)));
 	}
@@ -459,7 +459,7 @@ dt_idcook_regs(dt_node_t *dnp, dt_ident_
 
 	if ((ap->dn_flags & DT_NF_SIGNED) && (int64_t)ap->dn_value < 0) {
 		xyerror(D_REGS_IDX, "index %lld is out of range for array %s\n",
-		    (long long)ap->dn_value, idp->di_name);
+		    (longlong_t)ap->dn_value, idp->di_name);
 	}
 
 	if (dt_type_lookup("uint64_t", &dtt) == -1) {
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h,v
retrieving revision 1.7
diff -u -p -r1.7 dt_impl.h
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h	28 Apr 2016 00:02:40 -0000	1.7
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h	29 Apr 2017 03:37:50 -0000
@@ -26,7 +26,7 @@
 
 /*
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  */
 
 #ifndef	_DT_IMPL_H
@@ -362,6 +362,7 @@ struct dtrace_hdl {
 	int dt_indent;		/* recommended flow indent */
 	dtrace_epid_t dt_last_epid;	/* most recently consumed EPID */
 	uint64_t dt_last_timestamp;	/* most recently consumed timestamp */
+	boolean_t dt_has_sugar;	/* syntactic sugar used? */
 };
 
 /*
@@ -487,7 +488,6 @@ struct dtrace_hdl {
 #define	DT_ACT_SETOPT		DT_ACT(28)	/* setopt() action */
 #define	DT_ACT_PRINT		DT_ACT(29)	/* print() action */
 #define	DT_ACT_PRINTM		DT_ACT(30)	/* printm() action */
-#define	DT_ACT_PRINTT		DT_ACT(31)	/* printt() action */
 
 /*
  * Sentinel to tell freopen() to restore the saved stdout.  This must not
@@ -744,15 +744,19 @@ extern int _dtrace_argmax;		/* default m
 extern const char *_dtrace_libdir;	/* default library directory */
 extern const char *_dtrace_moddir;	/* default kernel module directory */
 
-#ifndef illumos
-extern const char *dt_bootfile(char *, size_t);
-#endif
-
 #if defined(__FreeBSD__) || defined(__NetBSD__)
 extern int gmatch(const char *, const char *);
 extern int yylex(void);
 #endif
 
+#ifdef __NetBSD__
+extern const char *dt_bootfile(char *, size_t);
+
+#define longlong_t long long
+#define u_longlong_t unsigned long long
+#define __DECONST(a, b) __UNCONST(b)
+#endif
+
 #ifdef	__cplusplus
 }
 #endif
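
The new __NetBSD__ block supplies the longlong_t/u_longlong_t aliases
that the Solaris-derived sources cast through before passing 64-bit
values to %ll* conversions.  The cast is load-bearing: on LP64 NetBSD
uint64_t is plain unsigned long, and varargs functions like printf()
need the exact promoted type.  A sketch:

	#include <inttypes.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t pc = UINT64_C(0xdeadbeef);

		/*
		 * Passing pc straight to %llx is only portable after
		 * widening it; u_longlong_t now names that exact type.
		 */
		printf("0x%llx\n", (unsigned long long)pc);
		printf("0x%" PRIx64 "\n", pc);	/* the C99 alternative */
		return (0);
	}
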
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_link.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_link.c,v
retrieving revision 1.9
diff -u -p -r1.9 dt_link.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_link.c	12 Jan 2017 23:12:59 -0000	1.9
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_link.c	29 Apr 2017 03:14:25 -0000
@@ -247,7 +247,7 @@ printf("%s:%s(%d): DOODAD\n",__FUNCTION_
 			rel->r_offset = s->dofs_offset +
 			    dofr[j].dofr_offset;
 			rel->r_info = ELF32_R_INFO(count + dep->de_global,
-			    R_386_32);
+			    R_386_PC32);
 #elif defined(__mips__)
 /* XXX */
 printf("%s:%s(%d): DOODAD\n",__FUNCTION__,__FILE__,__LINE__);
@@ -269,6 +269,9 @@ printf("%s:%s(%d): DOODAD\n",__FUNCTION_
 			    dofr[j].dofr_offset + 4;
 			rel->r_info = ELF32_R_INFO(count + dep->de_global,
 			    R_SPARC_32);
+#elif defined(__riscv__)
+/* XXX */
+printf("%s:%s(%d): DOODAD\n",__FUNCTION__,__FILE__,__LINE__);
 #else
 #error unknown ISA
 #endif
@@ -294,11 +297,7 @@ printf("%s:%s(%d): DOODAD\n",__FUNCTION_
 	sym->st_value = 0;
 	sym->st_size = dof->dofh_filesz;
 	sym->st_info = ELF32_ST_INFO(STB_GLOBAL, STT_OBJECT);
-#ifdef illumos
-	sym->st_other = 0;
-#else
 	sym->st_other = ELF32_ST_VISIBILITY(STV_HIDDEN);
-#endif
 	sym->st_shndx = ESHDR_DOF;
 	sym++;
 
@@ -450,23 +449,16 @@ prepare_elf64(dtrace_hdl_t *dtp, const d
 			    dofr[j].dofr_offset;
 			rel->r_info = ELF64_R_INFO(count + dep->de_global,
 			    R_PPC64_REL64);
+#elif defined(__riscv__)
+/* XXX */
 #elif defined(__i386) || defined(__amd64)
-			rel->r_offset = s->dofs_offset +
-			    dofr[j].dofr_offset;
-#ifdef illumos
-			rel->r_info = ELF64_R_INFO(count + dep->de_global,
-			    R_AMD64_64);
-#else
-#if defined(__amd64)
-			rel->r_info = ELF64_R_INFO(count + dep->de_global,
-			    R_X86_64_RELATIVE);
-#endif
+#ifndef R_X86_64_PC64
+#define R_X86_64_PC64           24
 #endif
-#elif defined(__sparc)
 			rel->r_offset = s->dofs_offset +
 			    dofr[j].dofr_offset;
 			rel->r_info = ELF64_R_INFO(count + dep->de_global,
-			    R_SPARC_64);
+			    R_X86_64_PC64);
 #else
 #error unknown ISA
 #endif
@@ -492,11 +484,7 @@ prepare_elf64(dtrace_hdl_t *dtp, const d
 	sym->st_value = 0;
 	sym->st_size = dof->dofh_filesz;
 	sym->st_info = GELF_ST_INFO(STB_GLOBAL, STT_OBJECT);
-#ifdef illumos
-	sym->st_other = 0;
-#else
 	sym->st_other = ELF64_ST_VISIBILITY(STV_HIDDEN);
-#endif
 	sym->st_shndx = ESHDR_DOF;
 	sym++;
 
@@ -806,16 +794,15 @@ dump_elf64(dtrace_hdl_t *dtp, const dof_
 }
 
 static int
-dt_symtab_lookup(Elf_Data *data_sym, int nsym, uintptr_t addr, uint_t shn,
-    GElf_Sym *sym, int uses_funcdesc, Elf *elf)
+dt_symtab_lookup(Elf_Data *data_sym, int start, int end, uintptr_t addr,
+    uint_t shn, GElf_Sym *sym, int uses_funcdesc, Elf *elf)
 {
-	int i, ret = -1;
 	Elf64_Addr symval;
 	Elf_Scn *opd_scn;
 	Elf_Data *opd_desc;
-	GElf_Sym s;
+	int i;
 
-	for (i = 0; i < nsym && gelf_getsym(data_sym, i, sym) != NULL; i++) {
+	for (i = start; i < end && gelf_getsym(data_sym, i, sym) != NULL; i++) {
 		if (GELF_ST_TYPE(sym->st_info) == STT_FUNC) {
 			symval = sym->st_value;
 			if (uses_funcdesc) {
@@ -825,20 +812,12 @@ dt_symtab_lookup(Elf_Data *data_sym, int
 				    *(uint64_t*)((char *)opd_desc->d_buf + symval);
 			}
 			if ((uses_funcdesc || shn == sym->st_shndx) &&
-			    symval <= addr &&
-			    addr < symval + sym->st_size) {
-				if (GELF_ST_BIND(sym->st_info) == STB_GLOBAL)
-					return (0);
-
-				ret = 0;
-				s = *sym;
-			}
+			    symval <= addr && addr < symval + sym->st_size)
+				return (0);
 		}
 	}
 
-	if (ret == 0)
-		*sym = s;
-	return (ret);
+	return (-1);
 }
 
 #if defined(__aarch64__)
@@ -949,7 +928,15 @@ dt_modtext(dtrace_hdl_t *dtp, char *p, i
 
 	return (0);
 }
-
+#elif defined(__riscv__)
+/* XXX */
+static int
+dt_modtext(dtrace_hdl_t *dtp, char *p, int isenabled, GElf_Rela *rela,
+    uint32_t *off)
+{
+printf("%s:%s(%d): DOODAD\n",__FUNCTION__,__FILE__,__LINE__);
+	return (0);
+}
 #elif defined(__sparc)
 
 #define	DT_OP_RET		0x81c7e008
@@ -1224,6 +1211,8 @@ process_obj(dtrace_hdl_t *dtp, const cha
 	static const char dt_enabled[] = "enabled";
 	static const char dt_symprefix[] = "$dtrace";
 	static const char dt_symfmt[] = "%s%ld.%s";
+	static const char dt_weaksymfmt[] = "%s.%s";
+	char probename[DTRACE_NAMELEN];
 	int fd, i, ndx, eprobe, mod = 0;
 	Elf *elf = NULL;
 	GElf_Ehdr ehdr;
@@ -1237,10 +1226,11 @@ process_obj(dtrace_hdl_t *dtp, const cha
 	dt_provider_t *pvp;
 	dt_probe_t *prp;
 	uint32_t off, eclass, emachine1, emachine2;
-	size_t symsize, nsym = 0, isym, istr, len;
+	size_t symsize, osym, nsym, isym, istr, len;
 	key_t objkey;
 	dt_link_pair_t *pair, *bufs = NULL;
 	dt_strtab_t *strtab;
+	void *tmp;
 
 	if ((fd = open64(obj, O_RDWR)) == -1) {
 		return (dt_link_error(dtp, elf, fd, bufs,
@@ -1374,12 +1364,13 @@ process_obj(dtrace_hdl_t *dtp, const cha
 		 * target (text) section to replace the call instruction with
 		 * one or more nops.
 		 *
-		 * If the function containing the probe is locally scoped
-		 * (static), we create an alias used by the relocation in the
-		 * generated object. The alias, a new symbol, will be global
-		 * (so that the relocation from the generated object can be
-		 * resolved), and hidden (so that it is converted to a local
-		 * symbol at link time). Such aliases have this form:
+		 * To avoid runtime overhead, the relocations added to the
+		 * generated object should be resolved at static link time. We
+		 * therefore create aliases for the functions that contain
+		 * probes. An alias is global (so that the relocation from the
+		 * generated object can be resolved), and hidden (so that its
+		 * address is known at static link time). Such aliases have this
+		 * form:
 		 *
 		 *   $dtrace<key>.<function>
 		 *
@@ -1417,16 +1408,13 @@ process_obj(dtrace_hdl_t *dtp, const cha
 			if (strncmp(s, dt_prefix, sizeof (dt_prefix) - 1) != 0)
 				continue;
 
-			if (dt_symtab_lookup(data_sym, isym, rela.r_offset,
-			    shdr_rel.sh_info, &fsym,
-			    (emachine1 == EM_PPC64), elf) != 0) {
+			if (dt_symtab_lookup(data_sym, 0, isym, rela.r_offset,
+			    shdr_rel.sh_info, &fsym, (emachine1 == EM_PPC64),
+			    elf) != 0) {
 				dt_strtab_destroy(strtab);
 				goto err;
 			}
 
-			if (GELF_ST_BIND(fsym.st_info) != STB_LOCAL)
-				continue;
-
 			if (fsym.st_name > data_str->d_size) {
 				dt_strtab_destroy(strtab);
 				goto err;
@@ -1462,12 +1450,12 @@ process_obj(dtrace_hdl_t *dtp, const cha
 		}
 
 		/*
-		 * If needed, allocate the additional space for the symbol
-		 * table and string table copying the old data into the new
-		 * buffers, and marking the buffers as dirty. We inject those
-		 * newly allocated buffers into the libelf data structures, but
-		 * are still responsible for freeing them once we're done with
-		 * the elf handle.
+		 * If any probes were found, allocate the additional space for
+		 * the symbol table and string table, copying the old data into
+		 * the new buffers, and marking the buffers as dirty. We inject
+		 * those newly allocated buffers into the libelf data
+		 * structures, but are still responsible for freeing them once
+		 * we're done with the elf handle.
 		 */
 		if (nsym > 0) {
 			/*
@@ -1501,7 +1489,9 @@ process_obj(dtrace_hdl_t *dtp, const cha
 			bufs = pair;
 
 			bcopy(data_str->d_buf, pair->dlp_str, data_str->d_size);
+			tmp = data_str->d_buf;
 			data_str->d_buf = pair->dlp_str;
+			pair->dlp_str = tmp;
 			data_str->d_size += len;
 			(void) elf_flagdata(data_str, ELF_C_SET, ELF_F_DIRTY);
 
@@ -1509,16 +1499,20 @@ process_obj(dtrace_hdl_t *dtp, const cha
 			(void) gelf_update_shdr(scn_str, &shdr_str);
 
 			bcopy(data_sym->d_buf, pair->dlp_sym, data_sym->d_size);
+			tmp = data_sym->d_buf;
 			data_sym->d_buf = pair->dlp_sym;
+			pair->dlp_sym = tmp;
 			data_sym->d_size += nsym * symsize;
 			(void) elf_flagdata(data_sym, ELF_C_SET, ELF_F_DIRTY);
 
 			shdr_sym.sh_size += nsym * symsize;
 			(void) gelf_update_shdr(scn_sym, &shdr_sym);
 
+			osym = isym;
 			nsym += isym;
 		} else {
 			dt_strtab_destroy(strtab);
+			continue;
 		}
 
 		/*
@@ -1577,62 +1571,62 @@ process_obj(dtrace_hdl_t *dtp, const cha
 			bcopy(s, pname, p - s);
 			pname[p - s] = '\0';
 
-			p = strhyphenate(p + 3); /* strlen("___") */
-
-			if (dt_symtab_lookup(data_sym, isym, rela.r_offset,
-			    shdr_rel.sh_info, &fsym,
-			    (emachine1 == EM_PPC64), elf) != 0)
-				goto err;
-
-			if (fsym.st_name > data_str->d_size)
-				goto err;
-
-			assert(GELF_ST_TYPE(fsym.st_info) == STT_FUNC);
-
-			/*
-			 * If a NULL relocation name is passed to
-			 * dt_probe_define(), the function name is used for the
-			 * relocation. The relocation needs to use a mangled
-			 * name if the symbol is locally scoped; the function
-			 * name may need to change if we've found the global
-			 * alias for the locally scoped symbol (we prefer
-			 * global symbols to locals in dt_symtab_lookup()).
-			 */
-			s = (char *)data_str->d_buf + fsym.st_name;
-			r = NULL;
+			if (dt_symtab_lookup(data_sym, osym, isym,
+			    rela.r_offset, shdr_rel.sh_info, &fsym,
+			    (emachine1 == EM_PPC64), elf) == 0) {
+				if (fsym.st_name > data_str->d_size)
+					goto err;
 
-			if (GELF_ST_BIND(fsym.st_info) == STB_LOCAL) {
+				r = s = (char *) data_str->d_buf + fsym.st_name;
+				assert(strstr(s, dt_symprefix) == s);
+				s = strchr(s, '.') + 1;
+			} else if (dt_symtab_lookup(data_sym, 0, osym,
+			    rela.r_offset, shdr_rel.sh_info, &fsym,
+			    (emachine1 == EM_PPC64), elf) == 0) {
+				u_int bind;
+
+				bind = GELF_ST_BIND(fsym.st_info) == STB_WEAK ?
+				    STB_WEAK : STB_GLOBAL;
+
+				/*
+				 * Emit an alias for the symbol. It needs to be
+				 * non-preemptible so that .SUNW_dof relocations
+				 * may be resolved at static link time. Aliases
+				 * of weak symbols are given a non-unique name
+				 * so that they may be merged by the linker.
+				 */
 				dsym = fsym;
 				dsym.st_name = istr;
-				dsym.st_info = GELF_ST_INFO(STB_GLOBAL,
-				    STT_FUNC);
-				dsym.st_other =
-				    ELF64_ST_VISIBILITY(STV_ELIMINATE);
+				dsym.st_info = GELF_ST_INFO(bind, STT_FUNC);
+				dsym.st_other = GELF_ST_VISIBILITY(STV_HIDDEN);
 				(void) gelf_update_sym(data_sym, isym, &dsym);
-
-				r = (char *)data_str->d_buf + istr;
-				istr += 1 + sprintf(r, dt_symfmt,
-				    dt_symprefix, objkey, s);
+				r = (char *) data_str->d_buf + istr;
+				s = (char *) data_str->d_buf + fsym.st_name;
+				if (bind == STB_WEAK)
+					istr += sprintf(r, dt_weaksymfmt,
+					    dt_symprefix, s);
+				else
+					istr += sprintf(r, dt_symfmt,
+					    dt_symprefix, objkey, s);
+				istr++;
 				isym++;
 				assert(isym <= nsym);
-
-			} else if (strncmp(s, dt_symprefix,
-			    strlen(dt_symprefix)) == 0) {
-				r = s;
-				if ((s = strchr(s, '.')) == NULL)
-					goto err;
-				s++;
-			}
+			} else
+				goto err;
 
 			if ((pvp = dt_provider_lookup(dtp, pname)) == NULL) {
 				return (dt_link_error(dtp, elf, fd, bufs,
 				    "no such provider %s", pname));
 			}
 
-			if ((prp = dt_probe_lookup(pvp, p)) == NULL) {
+			if (strlcpy(probename, p + 3, sizeof (probename)) >=
+			    sizeof (probename))
 				return (dt_link_error(dtp, elf, fd, bufs,
-				    "no such probe %s", p));
-			}
+				    "invalid probe name %s", probename));
+			(void) strhyphenate(probename);
+			if ((prp = dt_probe_lookup(pvp, probename)) == NULL)
+				return (dt_link_error(dtp, elf, fd, bufs,
+				    "no such probe %s", probename));
 
 			assert(fsym.st_value <= rela.r_offset);
 
@@ -1695,9 +1689,6 @@ process_obj(dtrace_hdl_t *dtp, const cha
 	(void) elf_end(elf);
 	(void) close(fd);
 
-#ifndef illumos
-	if (nsym > 0)
-#endif
 	while ((pair = bufs) != NULL) {
 		bufs = pair->dlp_next;
 		dt_free(dtp, pair->dlp_str);
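
process_obj() now emits a hidden alias for every probe-bearing
function, not only static ones, so the .SUNW_dof relocations always
have a non-preemptible target.  Aliases of weak symbols drop the
per-object key on purpose, so identical aliases from different objects
can merge at link time.  A sketch of the two name forms, using the
dt_symfmt and dt_weaksymfmt format strings from above:

	#include <stdio.h>

	/*
	 * "$dtrace<key>.<func>" is unique per object;
	 * "$dtrace.<func>" is deliberately non-unique so that weak
	 * aliases from different objects are merged by the linker.
	 */
	static void
	alias_name(char *buf, size_t len, long objkey, const char *func,
	    int is_weak)
	{
		if (is_weak)
			(void) snprintf(buf, len, "%s.%s", "$dtrace", func);
		else
			(void) snprintf(buf, len, "%s%ld.%s", "$dtrace",
			    objkey, func);
	}
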
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c,v
retrieving revision 1.15
diff -u -p -r1.15 dt_module.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c	3 Aug 2016 16:37:02 -0000	1.15
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c	15 Jun 2017 23:20:19 -0000
@@ -24,6 +24,7 @@
  */
 /*
  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
+ * Copyright (c) 2016, Pedro Giffuni.  All rights reserved.
  */
 
 #include <sys/types.h>
@@ -36,7 +37,7 @@
 #include <sys/task.h>
 #else
 #include <sys/param.h>
-#include <sys/linker.h>
+/* #include <sys/linker.h> */
 #include <sys/module.h>
 #include <sys/stat.h>
 #endif
@@ -93,7 +94,7 @@ dt_module_syminit32(dt_module_t *dmp)
 	uint_t i, n = dmp->dm_nsymelems;
 	uint_t asrsv = 0;
 
-#if defined(__FreeBSD__) || defined(__NetBSD__)
+#if defined(__FreeBSD__)
 	GElf_Ehdr ehdr;
 	int is_elf_obj;
 
@@ -115,13 +116,16 @@ dt_module_syminit32(dt_module_t *dmp)
 		    (ELF32_ST_BIND(sym->st_info) != STB_LOCAL || sym->st_size)) {
 			asrsv++; /* reserve space in the address map */
 
-#if defined(__FreeBSD__) || defined(__NetBSD__)
+#if defined(__FreeBSD__)
 			sym->st_value += (Elf_Addr) dmp->dm_reloc_offset;
 			if (is_elf_obj && sym->st_shndx != SHN_UNDEF &&
 			    sym->st_shndx < ehdr.e_shnum)
 				sym->st_value +=
 				    dmp->dm_sec_offsets[sym->st_shndx];
 #endif
+#ifdef __NetBSD__
+			sym->st_value += (Elf_Addr) dmp->dm_reloc_offset;
+#endif
 		}
 
 		dt_module_symhash_insert(dmp, name, i);
@@ -143,7 +147,7 @@ dt_module_syminit64(dt_module_t *dmp)
 	uint_t i, n = dmp->dm_nsymelems;
 	uint_t asrsv = 0;
 
-#if defined(__FreeBSD__) || defined(__NetBSD__)
+#if defined(__FreeBSD__)
 	GElf_Ehdr ehdr;
 	int is_elf_obj;
 
@@ -164,13 +168,16 @@ dt_module_syminit64(dt_module_t *dmp)
 		if (sym->st_value != 0 &&
 		    (ELF64_ST_BIND(sym->st_info) != STB_LOCAL || sym->st_size)) {
 			asrsv++; /* reserve space in the address map */
-#if defined(__FreeBSD__) || defined(__NetBSD__)
+#if defined(__FreeBSD__)
 			sym->st_value += (Elf_Addr) dmp->dm_reloc_offset;
 			if (is_elf_obj && sym->st_shndx != SHN_UNDEF &&
 			    sym->st_shndx < ehdr.e_shnum)
 				sym->st_value +=
 				    dmp->dm_sec_offsets[sym->st_shndx];
 #endif
+#ifdef __NetBSD__
+			sym->st_value += (Elf_Addr) dmp->dm_reloc_offset;
+#endif
 		}
 
 		dt_module_symhash_insert(dmp, name, i);
@@ -725,22 +732,20 @@ dt_module_load_proc(dtrace_hdl_t *dtp, d
 		return (dt_set_errno(dtp, EDT_CANTLOAD));
 	}
 
-	dmp->dm_libctfp = malloc(sizeof (ctf_file_t *) * arg.dpa_count);
+	dmp->dm_libctfp = calloc(arg.dpa_count, sizeof (ctf_file_t *));
 	if (dmp->dm_libctfp == NULL) {
 		dt_proc_unlock(dtp, p);
 		dt_proc_release(dtp, p);
 		return (dt_set_errno(dtp, EDT_NOMEM));
 	}
-	bzero(dmp->dm_libctfp, sizeof (ctf_file_t *) * arg.dpa_count);
 
-	dmp->dm_libctfn = malloc(sizeof (char *) * arg.dpa_count);
+	dmp->dm_libctfn = calloc(arg.dpa_count, sizeof (char *));
 	if (dmp->dm_libctfn == NULL) {
 		free(dmp->dm_libctfp);
 		dt_proc_unlock(dtp, p);
 		dt_proc_release(dtp, p);
 		return (dt_set_errno(dtp, EDT_NOMEM));
 	}
-	bzero(dmp->dm_libctfn, sizeof (char *) * arg.dpa_count);
 
 	dmp->dm_nctflibs = arg.dpa_count;
 
@@ -821,17 +826,14 @@ dt_module_load(dtrace_hdl_t *dtp, dt_mod
 	dmp->dm_nsymbuckets = _dtrace_strbuckets;
 	dmp->dm_symfree = 1;		/* first free element is index 1 */
 
-	dmp->dm_symbuckets = malloc(sizeof (uint_t) * dmp->dm_nsymbuckets);
-	dmp->dm_symchains = malloc(sizeof (dt_sym_t) * dmp->dm_nsymelems + 1);
+	dmp->dm_symbuckets = calloc(dmp->dm_nsymbuckets, sizeof (uint_t));
+	dmp->dm_symchains = calloc(dmp->dm_nsymelems + 1, sizeof (dt_sym_t));
 
 	if (dmp->dm_symbuckets == NULL || dmp->dm_symchains == NULL) {
 		dt_module_unload(dtp, dmp);
 		return (dt_set_errno(dtp, EDT_NOMEM));
 	}
 
-	bzero(dmp->dm_symbuckets, sizeof (uint_t) * dmp->dm_nsymbuckets);
-	bzero(dmp->dm_symchains, sizeof (dt_sym_t) * dmp->dm_nsymelems + 1);
-
 	/*
 	 * Iterate over the symbol table data buffer and insert each symbol
 	 * name into the name hash if the name and type are valid.  Then
@@ -983,7 +985,7 @@ dt_module_unload(dtrace_hdl_t *dtp, dt_m
 		free(dmp->dm_asmap);
 		dmp->dm_asmap = NULL;
 	}
-#if defined(__FreeBSD__) || defined(__NetBSD__)
+#if defined(__FreeBSD__)
 	if (dmp->dm_sec_offsets != NULL) {
 		free(dmp->dm_sec_offsets);
 		dmp->dm_sec_offsets = NULL;
@@ -1145,7 +1147,7 @@ dt_module_update(dtrace_hdl_t *dtp, stru
 	char fname[MAXPATHLEN];
 	struct stat64 st;
 	int fd, err, bits;
-#if defined(__FreeBSD__) 
+#ifdef __FreeBSD__
 	struct module_stat ms;
 	dt_kmodule_t *dkmp;
 	uint_t h;
@@ -1178,11 +1180,15 @@ dt_module_update(dtrace_hdl_t *dtp, stru
 	char osrel[64];
 	char machine[64];
 	size_t len;
+	uintptr_t mapbase;
+	int i;
+	bool ismod;
 
 	if (strcmp("netbsd", name) == 0) {
 		/* want the kernel, but it is not absolute */
 		dt_bootfile(machine, sizeof(machine));
 		snprintf(fname, sizeof(fname), "/%s", machine);
+		ismod = false;
 	} else {
 
 		/* build stand module path from system */
@@ -1202,6 +1208,7 @@ dt_module_update(dtrace_hdl_t *dtp, stru
 
 		(void) snprintf(fname, sizeof (fname),
 		    "/stand/%s/%s/modules/%s/%s.kmod", machine, osrel, name, name);
+		ismod = true;
 	}
 #endif
 
@@ -1258,6 +1265,34 @@ dt_module_update(dtrace_hdl_t *dtp, stru
 		}
 	}
 #endif
+#ifdef __NetBSD__
+	mapbase = 0;
+	if (ismod) {
+		int maxmodules = 512;
+		modstat_t modstat_buf[maxmodules], *ms;
+		struct iovec iov = { modstat_buf, sizeof(modstat_buf) };
+
+		if (modctl(MODCTL_STAT, &iov) < 0) {
+			dt_dprintf("failed to get list of kernel modules: %s\n",
+				   strerror(errno));
+			return;
+		}
+
+		for (i = 0; i < maxmodules; i++) {
+			ms = &modstat_buf[i];
+			if (!strcmp(name, ms->ms_name)) {
+				mapbase = ms->ms_addr;
+				break;
+			}
+		}
+		if (i == maxmodules) {
+			dt_dprintf("module %s not found\n", name);
+			return;
+		}
+		dmp->dm_reloc_offset = (void *)mapbase;
+	}
+#endif
+
 	/*
 	 * Iterate over the section headers locating various sections of
 	 * interest and use their attributes to flesh out the dt_module_t.
@@ -1302,7 +1337,8 @@ dt_module_update(dtrace_hdl_t *dtp, stru
 	dmp->dm_flags |= DT_DM_KERNEL;
 #ifdef illumos
 	dmp->dm_modid = (int)OBJFS_MODID(st.st_ino);
-#else
+#endif /* illumos */
+#ifdef __FreeBSD__
 	/*
 	 * Include .rodata and special sections into .text.
 	 * This depends on default section layout produced by GNU ld
@@ -1310,7 +1346,7 @@ dt_module_update(dtrace_hdl_t *dtp, stru
 	 * [Text][R/O data][R/W data][Dynamic][BSS][Non loadable]
 	 */
 	dmp->dm_text_size = dmp->dm_data_va - dmp->dm_text_va;
-#if defined(__i386__) && !defined(__NetBSD__)
+#if defined(__i386__)
 	/*
 	 * Find the first load section and figure out the relocation
 	 * offset for the symbols. The kernel module will not need
@@ -1323,12 +1359,21 @@ dt_module_update(dtrace_hdl_t *dtp, stru
 		}
 	}
 #endif
-#endif /* illumos */
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
+	if (ismod) {
+		dmp->dm_text_va = mapbase;
+		dmp->dm_data_va = 0;
+		dmp->dm_data_size = 0;
+		dmp->dm_bss_va = 0;
+		dmp->dm_bss_size = 0;
+	}
+#endif
 
 	if (dmp->dm_info.objfs_info_primary)
 		dmp->dm_flags |= DT_DM_PRIMARY;
 
-#if defined(__FreeBSD__)
+#ifdef __FreeBSD__
 	ms.version = sizeof(ms);
 	for (modid = kldfirstmod(k_stat->id); modid > 0;
 	    modid = modnext(modid)) {
@@ -1405,8 +1450,31 @@ dtrace_update(dtrace_hdl_t *dtp)
 			dt_module_update(dtp, &k_stat);
 	}
 #elif defined(__NetBSD__)
-	/* XXX just the kernel for now */
+	size_t len;
+	struct iovec iov;
+	modstat_t *ms;
+
 	dt_module_update(dtp, "netbsd");
+	for (len = 8192;;) {
+		iov.iov_base = malloc(len);
+		iov.iov_len = len;
+		if (modctl(MODCTL_STAT, &iov)) {
+			free(iov.iov_base);
+			iov.iov_base = NULL;
+			break;
+		}
+		if (len >= iov.iov_len) {
+			break;
+		}
+		free(iov.iov_base);
+		len = iov.iov_len;
+	}
+	len = (iov.iov_base != NULL) ? iov.iov_len / sizeof(modstat_t) : 0;
+	for (ms = iov.iov_base; len != 0; ms++, len--) {
+		if (ms->ms_source == MODULE_SOURCE_FILESYS)
+			dt_module_update(dtp, ms->ms_name);
+	}
+	free(iov.iov_base);
 #endif
 
 	/*
@@ -1583,6 +1651,7 @@ dtrace_lookup_by_addr(dtrace_hdl_t *dtp,
 
 	for (dmp = dt_list_next(&dtp->dt_modlist); dmp != NULL;
 	    dmp = dt_list_next(dmp)) {
+
 		if (addr - dmp->dm_text_va < dmp->dm_text_size ||
 		    addr - dmp->dm_data_va < dmp->dm_data_size ||
 		    addr - dmp->dm_bss_va < dmp->dm_bss_size)
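
dtrace_update() now enumerates the loaded kernel modules with
modctl(MODCTL_STAT).  The loop above relies on the kernel reporting
the size it needs back through iov_len, growing the buffer until the
snapshot fits.  A self-contained sketch of that negotiation, under the
same assumed in/out contract:

	#include <sys/module.h>
	#include <sys/uio.h>
	#include <stdlib.h>

	/*
	 * Snapshot the module list; the kernel writes the size it
	 * needs into iov_len.  The caller frees the result.
	 */
	static modstat_t *
	module_snapshot(size_t *nmodp)
	{
		struct iovec iov;
		size_t len;

		for (len = 8192;; len = iov.iov_len) {
			if ((iov.iov_base = malloc(len)) == NULL)
				return (NULL);
			iov.iov_len = len;
			if (modctl(MODCTL_STAT, &iov) != 0) {
				free(iov.iov_base);
				return (NULL);
			}
			if (iov.iov_len <= len)
				break;
			free(iov.iov_base);	/* too small; grow and retry */
		}
		*nmodp = iov.iov_len / sizeof (modstat_t);
		return (iov.iov_base);
	}
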
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_open.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_open.c,v
retrieving revision 1.12
diff -u -p -r1.12 dt_open.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_open.c	28 Apr 2016 11:38:41 -0000	1.12
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_open.c	5 May 2017 16:59:27 -0000
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -130,8 +130,9 @@
 #define	DT_VERS_1_11	DT_VERSION_NUMBER(1, 11, 0)
 #define	DT_VERS_1_12	DT_VERSION_NUMBER(1, 12, 0)
 #define	DT_VERS_1_12_1	DT_VERSION_NUMBER(1, 12, 1)
-#define	DT_VERS_LATEST	DT_VERS_1_12_1
-#define	DT_VERS_STRING	"Sun D 1.12.1"
+#define	DT_VERS_1_13	DT_VERSION_NUMBER(1, 13, 0)
+#define	DT_VERS_LATEST	DT_VERS_1_13
+#define	DT_VERS_STRING	"Sun D 1.13"
 
 const dt_version_t _dtrace_versions[] = {
 	DT_VERS_1_0,	/* D API 1.0.0 (PSARC 2001/466) Solaris 10 FCS */
@@ -157,6 +158,7 @@ const dt_version_t _dtrace_versions[] = 
 	DT_VERS_1_11,	/* D API 1.11 */
 	DT_VERS_1_12,	/* D API 1.12 */
 	DT_VERS_1_12_1,	/* D API 1.12.1 */
+	DT_VERS_1_13,	/* D API 1.13 */
 	0
 };
 
@@ -390,8 +392,6 @@ static const dt_ident_t _dtrace_globals[
 	&dt_idops_func, "void(@, ...)" },
 { "printm", DT_IDENT_ACTFUNC, 0, DT_ACT_PRINTM, DT_ATTR_STABCMN, DT_VERS_1_0,
 	&dt_idops_func, "void(size_t, uintptr_t *)" },
-{ "printt", DT_IDENT_ACTFUNC, 0, DT_ACT_PRINTT, DT_ATTR_STABCMN, DT_VERS_1_0,
-	&dt_idops_func, "void(size_t, uintptr_t *)" },
 { "probefunc", DT_IDENT_SCALAR, 0, DIF_VAR_PROBEFUNC,
 	DT_ATTR_STABCMN, DT_VERS_1_0, &dt_idops_type, "string" },
 { "probemod", DT_IDENT_SCALAR, 0, DIF_VAR_PROBEMOD,
@@ -503,8 +503,6 @@ static const dt_ident_t _dtrace_globals[
 	&dt_idops_func, "void(@, size_t, ...)" },
 { "trunc", DT_IDENT_ACTFUNC, 0, DT_ACT_TRUNC, DT_ATTR_STABCMN,
 	DT_VERS_1_0, &dt_idops_func, "void(...)" },
-{ "typeref", DT_IDENT_FUNC, 0, DIF_SUBR_TYPEREF, DT_ATTR_STABCMN, DT_VERS_1_1,
-	&dt_idops_func, "uintptr_t *(void *, size_t, string, size_t)" },
 { "uaddr", DT_IDENT_ACTFUNC, 0, DT_ACT_UADDR, DT_ATTR_STABCMN,
 	DT_VERS_1_2, &dt_idops_func, "_usymaddr(uintptr_t)" },
 { "ucaller", DT_IDENT_SCALAR, 0, DIF_VAR_UCALLER, DT_ATTR_STABCMN,
@@ -941,9 +939,11 @@ dt_provmod_open(dt_provmod_t **provmod, 
 			 * reallocate it. We normally won't need to do this
 			 * because providers aren't being loaded all the time.
 			 */
-			if ((p = realloc(p_providers,len)) == NULL)
+			if ((p = realloc(p_providers,len)) == NULL) {
+				free(p_providers);
 				/* How do we report errors here? */
 				return;
+			}
 			p_providers = p;
 		} else
 			break;
@@ -1190,8 +1190,10 @@ dt_vopen(int version, int flags, int *er
 	(void) fcntl(ftfd, F_SETFD, FD_CLOEXEC);
 
 alloc:
-	if ((dtp = malloc(sizeof (dtrace_hdl_t))) == NULL)
+	if ((dtp = malloc(sizeof (dtrace_hdl_t))) == NULL) {
+		dt_provmod_destroy(&provmod);
 		return (set_open_errno(dtp, errp, EDT_NOMEM));
+	}
 
 	bzero(dtp, sizeof (dtrace_hdl_t));
 	dtp->dt_oflags = flags;
@@ -1339,8 +1341,8 @@ alloc:
 
 	/*
 	 * On FreeBSD the kernel module name can't be hard-coded. The
-	 * 'kern.bootfile' sysctl value tells us exactly which file is
-	 * being used as the kernel.
+	 * 'kern.bootfile' sysctl value tells us exactly which file is being
+	 * used as the kernel.
 	 */
 #ifndef illumos
 # ifdef __FreeBSD__
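
Both dt_open.c leak fixes above have the same shape: release what is
already held before bailing out of an allocation failure.  The
realloc() case is the classic one, since assigning the result straight
back to the only pointer would orphan the old block.  A minimal
sketch:

	#include <stdlib.h>

	/* Grow *bufp to len bytes without leaking the old block. */
	static int
	grow(char **bufp, size_t len)
	{
		char *np;

		if ((np = realloc(*bufp, len)) == NULL) {
			free(*bufp);	/* old block is still live */
			*bufp = NULL;
			return (-1);
		}
		*bufp = np;
		return (0);
	}
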
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_options.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_options.c,v
retrieving revision 1.7
diff -u -p -r1.7 dt_options.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_options.c	24 Sep 2015 14:25:29 -0000	1.7
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_options.c	29 Apr 2017 03:20:38 -0000
@@ -751,7 +751,7 @@ dt_opt_rate(dtrace_hdl_t *dtp, const cha
 			}
 		}
 
-		if ((suffix[i].name == NULL && *end != '\0') || val < 0)
+		if (suffix[i].name == NULL && *end != '\0' || val < 0)
 			return (dt_set_errno(dtp, EDT_BADOPTVAL));
 
 		if (mul == 0) {
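
The dt_opt_rate() hunk only drops redundant parentheses to resync the
line with the upstream source: && binds tighter than ||, so the
condition groups the same way either way.  A compilable check of that
equivalence:

	#include <assert.h>

	int
	main(void)
	{
		int name_is_null = 1, end_not_nul = 0, negative = 0;

		assert((name_is_null && end_not_nul || negative) ==
		    ((name_is_null && end_not_nul) || negative));
		return (0);
	}
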
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.c,v
retrieving revision 1.8
diff -u -p -r1.8 dt_parser.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.c	4 Feb 2016 17:27:32 -0000	1.8
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.c	5 May 2017 17:00:23 -0000
@@ -23,7 +23,7 @@
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Copyright (c) 2013, Joyent Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
@@ -424,7 +424,7 @@ dt_node_name(const dt_node_t *dnp, char 
 	switch (dnp->dn_kind) {
 	case DT_NODE_INT:
 		(void) snprintf(buf, len, "integer constant 0x%llx",
-		    (unsigned long long)dnp->dn_value);
+		    (u_longlong_t)dnp->dn_value);
 		break;
 	case DT_NODE_STRING:
 		s = strchr2esc(dnp->dn_string, strlen(dnp->dn_string));
@@ -1280,7 +1280,7 @@ dt_node_int(uintmax_t value)
 	}
 
 	xyerror(D_INT_OFLOW, "integer constant 0x%llx cannot be represented "
-	    "in any built-in integral type\n", (unsigned long long)value);
+	    "in any built-in integral type\n", (u_longlong_t)value);
 	/*NOTREACHED*/
 	return (NULL);		/* keep gcc happy */
 }
@@ -2143,6 +2143,17 @@ dt_node_statement(dt_node_t *expr)
 }
 
 dt_node_t *
+dt_node_if(dt_node_t *pred, dt_node_t *acts, dt_node_t *else_acts)
+{
+	dt_node_t *dnp = dt_node_alloc(DT_NODE_IF);
+	dnp->dn_conditional = pred;
+	dnp->dn_body = acts;
+	dnp->dn_alternate_body = else_acts;
+
+	return (dnp);
+}
+
+dt_node_t *
 dt_node_pdesc_by_name(char *spec)
 {
 	dtrace_hdl_t *dtp = yypcb->pcb_hdl;
@@ -2184,20 +2195,19 @@ dt_node_pdesc_by_id(uintmax_t id)
 		longjmp(yypcb->pcb_jmpbuf, EDT_NOMEM);
 
 	if (id > UINT_MAX) {
-		xyerror(D_PDESC_INVAL, "identifier %"PRIuMAX" exceeds maximum "
-		    "probe id\n", id);
+		xyerror(D_PDESC_INVAL, "identifier %llu exceeds maximum "
+		    "probe id\n", (u_longlong_t)id);
 	}
 
 	if (yypcb->pcb_pspec != DTRACE_PROBESPEC_NAME) {
-		xyerror(D_PDESC_INVAL, "probe identifier %"PRIuMAX
-		    " not permitted when specifying %s\n", id,
+		xyerror(D_PDESC_INVAL, "probe identifier %llu not permitted "
+		    "when specifying %s\n", (u_longlong_t)id,
 		    names[yypcb->pcb_pspec]);
 	}
 
 	if (dtrace_id2desc(dtp, (dtrace_id_t)id, dnp->dn_desc) != 0) {
-		xyerror(D_PDESC_INVAL, "invalid probe identifier %"PRIuMAX
-		    ": %s\n",
-		    id, dtrace_errmsg(dtp, dtrace_errno(dtp)));
+		xyerror(D_PDESC_INVAL, "invalid probe identifier %llu: %s\n",
+		    (u_longlong_t)id, dtrace_errmsg(dtp, dtrace_errno(dtp)));
 	}
 
 	return (dnp);
@@ -2212,7 +2222,6 @@ dt_node_clause(dt_node_t *pdescs, dt_nod
 	dnp->dn_pred = pred;
 	dnp->dn_acts = acts;
 
-	yybegin(YYS_CLAUSE);
 	return (dnp);
 }
 
@@ -3204,8 +3213,9 @@ dt_cook_op2(dt_node_t *dnp, uint_t idfla
 				dt_xcook_ident(lp, dhp, idkind, B_TRUE);
 			else
 				dt_xcook_ident(lp, dhp, idp->di_kind, B_FALSE);
-		} else
+		} else {
 			lp = dnp->dn_left = dt_node_cook(lp, 0);
+		}
 
 		/*
 		 * Switch op to '+' for *(E1 + E2) array mode in these cases:
@@ -3219,10 +3229,12 @@ dt_cook_op2(dt_node_t *dnp, uint_t idfla
 			if (lp->dn_ident->di_kind == DT_IDENT_ARRAY) {
 				if (lp->dn_args != NULL)
 					op = DT_TOK_ADD;
-			} else if (!dt_ident_unref(lp->dn_ident))
+			} else if (!dt_ident_unref(lp->dn_ident)) {
 				op = DT_TOK_ADD;
-		} else if (lp->dn_kind != DT_NODE_AGG)
+			}
+		} else if (lp->dn_kind != DT_NODE_AGG) {
 			op = DT_TOK_ADD;
+		}
 	}
 
 	switch (op) {
@@ -3646,45 +3658,34 @@ asgn_common:
 
 	case DT_TOK_PTR:
 		/*
-		 * If the left-hand side of operator -> is the name "self",
-		 * then we permit a TLS variable to be created or referenced.
+		 * If the left-hand side of operator -> is one of the scoping
+		 * keywords, permit a local or thread variable to be created or
+		 * referenced.
 		 */
-		if (lp->dn_kind == DT_NODE_IDENT &&
-		    strcmp(lp->dn_string, "self") == 0) {
-			if (rp->dn_kind != DT_NODE_VAR) {
-				dt_xcook_ident(rp, dtp->dt_tls,
-				    DT_IDENT_SCALAR, B_TRUE);
-			}
-
-			if (idflags != 0)
-				rp = dt_node_cook(rp, idflags);
-
-			dnp->dn_right = dnp->dn_left; /* avoid freeing rp */
-			dt_node_free(dnp);
-			return (rp);
-		}
+		if (lp->dn_kind == DT_NODE_IDENT) {
+			dt_idhash_t *dhp = NULL;
 
-		/*
-		 * If the left-hand side of operator -> is the name "this",
-		 * then we permit a local variable to be created or referenced.
-		 */
-		if (lp->dn_kind == DT_NODE_IDENT &&
-		    strcmp(lp->dn_string, "this") == 0) {
-			if (rp->dn_kind != DT_NODE_VAR) {
-				dt_xcook_ident(rp, yypcb->pcb_locals,
-				    DT_IDENT_SCALAR, B_TRUE);
+			if (strcmp(lp->dn_string, "self") == 0) {
+				dhp = dtp->dt_tls;
+			} else if (strcmp(lp->dn_string, "this") == 0) {
+				dhp = yypcb->pcb_locals;
 			}
+			if (dhp != NULL) {
+				if (rp->dn_kind != DT_NODE_VAR) {
+					dt_xcook_ident(rp, dhp,
+					    DT_IDENT_SCALAR, B_TRUE);
+				}
 
-			if (idflags != 0)
-				rp = dt_node_cook(rp, idflags);
+				if (idflags != 0)
+					rp = dt_node_cook(rp, idflags);
 
-			dnp->dn_right = dnp->dn_left; /* avoid freeing rp */
-			dt_node_free(dnp);
-			return (rp);
+				/* avoid freeing rp */
+				dnp->dn_right = dnp->dn_left;
+				dt_node_free(dnp);
+				return (rp);
+			}
 		}
-
 		/*FALLTHRU*/
-
 	case DT_TOK_DOT:
 		lp = dnp->dn_left = dt_node_cook(lp, DT_IDFLG_REF);
 
@@ -4503,7 +4504,8 @@ static dt_node_t *(*dt_cook_funcs[])(dt_
 	dt_cook_xlator,		/* DT_NODE_XLATOR */
 	dt_cook_none,		/* DT_NODE_PROBE */
 	dt_cook_provider,	/* DT_NODE_PROVIDER */
-	dt_cook_none		/* DT_NODE_PROG */
+	dt_cook_none,		/* DT_NODE_PROG */
+	dt_cook_none,		/* DT_NODE_IF */
 };
 
 /*
@@ -4518,6 +4520,8 @@ dt_node_cook(dt_node_t *dnp, uint_t idfl
 
 	yylineno = dnp->dn_line;
 
+	assert(dnp->dn_kind <
+	    sizeof (dt_cook_funcs) / sizeof (dt_cook_funcs[0]));
 	dnp = dt_cook_funcs[dnp->dn_kind](dnp, idflags);
 	dnp->dn_flags |= DT_NF_COOKED;
 
@@ -4620,6 +4624,181 @@ dt_node_diftype(dtrace_hdl_t *dtp, const
 	tp->dtdt_size = ctf_type_size(dnp->dn_ctfp, dnp->dn_type);
 }
 
+/*
+ * Output the parse tree as D.  The "-xtree=8" argument causes this
+ * function to be called, printing the program after any syntactic
+ * sugar transformations have been applied (e.g. to implement "if").
+ * The resulting output can be used to understand the transformations
+ * applied by these features, or to run such a script on a system that
+ * does not support these features.
+ *
+ * Note that the output does not express precisely the same program as
+ * the input.  In particular:
+ *  - Only the clauses are output.  #pragma options, variable
+ *    declarations, etc. are excluded.
+ *  - Command argument substitution has already been done, so the output
+ *    will not contain e.g. $$1, but rather the substituted string.
+ */
+void
+dt_printd(dt_node_t *dnp, FILE *fp, int depth)
+{
+	dt_node_t *arg;
+
+	switch (dnp->dn_kind) {
+	case DT_NODE_INT:
+		(void) fprintf(fp, "0x%llx", (u_longlong_t)dnp->dn_value);
+		if (!(dnp->dn_flags & DT_NF_SIGNED))
+			(void) fprintf(fp, "u");
+		break;
+
+	case DT_NODE_STRING: {
+		char *escd = strchr2esc(dnp->dn_string, strlen(dnp->dn_string));
+		(void) fprintf(fp, "\"%s\"", escd);
+		free(escd);
+		break;
+	}
+
+	case DT_NODE_IDENT:
+		(void) fprintf(fp, "%s", dnp->dn_string);
+		break;
+
+	case DT_NODE_VAR:
+		(void) fprintf(fp, "%s%s",
+		    (dnp->dn_ident->di_flags & DT_IDFLG_LOCAL) ? "this->" :
+		    (dnp->dn_ident->di_flags & DT_IDFLG_TLS) ? "self->" : "",
+		    dnp->dn_ident->di_name);
+
+		if (dnp->dn_args != NULL) {
+			(void) fprintf(fp, "[");
+
+			for (arg = dnp->dn_args; arg != NULL;
+			    arg = arg->dn_list) {
+				dt_printd(arg, fp, 0);
+				if (arg->dn_list != NULL)
+					(void) fprintf(fp, ", ");
+			}
+
+			(void) fprintf(fp, "]");
+		}
+		break;
+
+	case DT_NODE_SYM: {
+		const dtrace_syminfo_t *dts = dnp->dn_ident->di_data;
+		(void) fprintf(fp, "%s`%s", dts->dts_object, dts->dts_name);
+		break;
+	}
+	case DT_NODE_FUNC:
+		(void) fprintf(fp, "%s(", dnp->dn_ident->di_name);
+
+		for (arg = dnp->dn_args; arg != NULL; arg = arg->dn_list) {
+			dt_printd(arg, fp, 0);
+			if (arg->dn_list != NULL)
+				(void) fprintf(fp, ", ");
+		}
+		(void) fprintf(fp, ")");
+		break;
+
+	case DT_NODE_OP1:
+		(void) fprintf(fp, "%s(", opstr(dnp->dn_op));
+		dt_printd(dnp->dn_child, fp, 0);
+		(void) fprintf(fp, ")");
+		break;
+
+	case DT_NODE_OP2:
+		(void) fprintf(fp, "(");
+		dt_printd(dnp->dn_left, fp, 0);
+		if (dnp->dn_op == DT_TOK_LPAR) {
+			(void) fprintf(fp, ")");
+			dt_printd(dnp->dn_right, fp, 0);
+			break;
+		}
+		if (dnp->dn_op == DT_TOK_PTR || dnp->dn_op == DT_TOK_DOT ||
+		    dnp->dn_op == DT_TOK_LBRAC)
+			(void) fprintf(fp, "%s", opstr(dnp->dn_op));
+		else
+			(void) fprintf(fp, " %s ", opstr(dnp->dn_op));
+		dt_printd(dnp->dn_right, fp, 0);
+		if (dnp->dn_op == DT_TOK_LBRAC) {
+			dt_node_t *ln = dnp->dn_right;
+			while (ln->dn_list != NULL) {
+				(void) fprintf(fp, ", ");
+				dt_printd(ln->dn_list, fp, depth);
+				ln = ln->dn_list;
+			}
+			(void) fprintf(fp, "]");
+		}
+		(void) fprintf(fp, ")");
+		break;
+
+	case DT_NODE_OP3:
+		(void) fprintf(fp, "(");
+		dt_printd(dnp->dn_expr, fp, 0);
+		(void) fprintf(fp, " ? ");
+		dt_printd(dnp->dn_left, fp, 0);
+		(void) fprintf(fp, " : ");
+		dt_printd(dnp->dn_right, fp, 0);
+		(void) fprintf(fp, ")");
+		break;
+
+	case DT_NODE_DEXPR:
+	case DT_NODE_DFUNC:
+		(void) fprintf(fp, "%*s", depth * 8, "");
+		dt_printd(dnp->dn_expr, fp, depth + 1);
+		(void) fprintf(fp, ";\n");
+		break;
+
+	case DT_NODE_PDESC:
+		(void) fprintf(fp, "%s:%s:%s:%s",
+		    dnp->dn_desc->dtpd_provider, dnp->dn_desc->dtpd_mod,
+		    dnp->dn_desc->dtpd_func, dnp->dn_desc->dtpd_name);
+		break;
+
+	case DT_NODE_CLAUSE:
+		for (arg = dnp->dn_pdescs; arg != NULL; arg = arg->dn_list) {
+			dt_printd(arg, fp, 0);
+			if (arg->dn_list != NULL)
+				(void) fprintf(fp, ",");
+			(void) fprintf(fp, "\n");
+		}
+
+		if (dnp->dn_pred != NULL) {
+			(void) fprintf(fp, "/");
+			dt_printd(dnp->dn_pred, fp, 0);
+			(void) fprintf(fp, "/\n");
+		}
+		(void) fprintf(fp, "{\n");
+
+		for (arg = dnp->dn_acts; arg != NULL; arg = arg->dn_list)
+			dt_printd(arg, fp, depth + 1);
+		(void) fprintf(fp, "}\n");
+		(void) fprintf(fp, "\n");
+		break;
+
+	case DT_NODE_IF:
+		(void) fprintf(fp, "%*sif (", depth * 8, "");
+		dt_printd(dnp->dn_conditional, fp, 0);
+		(void) fprintf(fp, ") {\n");
+
+		for (arg = dnp->dn_body; arg != NULL; arg = arg->dn_list)
+			dt_printd(arg, fp, depth + 1);
+		if (dnp->dn_alternate_body == NULL) {
+			(void) fprintf(fp, "%*s}\n", depth * 8, "");
+		} else {
+			(void) fprintf(fp, "%*s} else {\n", depth * 8, "");
+			for (arg = dnp->dn_alternate_body; arg != NULL;
+			    arg = arg->dn_list)
+				dt_printd(arg, fp, depth + 1);
+			(void) fprintf(fp, "%*s}\n", depth * 8, "");
+		}
+
+		break;
+
+	default:
+		(void) fprintf(fp, "/* bad node %p, kind %d */\n",
+		    (void *)dnp, dnp->dn_kind);
+	}
+}
+
 void
 dt_node_printr(dt_node_t *dnp, FILE *fp, int depth)
 {
@@ -4666,7 +4845,7 @@ dt_node_printr(dt_node_t *dnp, FILE *fp,
 
 	case DT_NODE_INT:
 		(void) fprintf(fp, "INT 0x%llx (%s)\n",
-		    (unsigned long long)dnp->dn_value, buf);
+		    (u_longlong_t)dnp->dn_value, buf);
 		break;
 
 	case DT_NODE_STRING:
@@ -4730,6 +4909,13 @@ dt_node_printr(dt_node_t *dnp, FILE *fp,
 		(void) fprintf(fp, "OP2 %s (%s)\n", opstr(dnp->dn_op), buf);
 		dt_node_printr(dnp->dn_left, fp, depth + 1);
 		dt_node_printr(dnp->dn_right, fp, depth + 1);
+		if (dnp->dn_op == DT_TOK_LBRAC) {
+			dt_node_t *ln = dnp->dn_right;
+			while (ln->dn_list != NULL) {
+				dt_node_printr(ln->dn_list, fp, depth + 1);
+				ln = ln->dn_list;
+			}
+		}
 		break;
 
 	case DT_NODE_OP3:
@@ -4791,6 +4977,7 @@ dt_node_printr(dt_node_t *dnp, FILE *fp,
 
 		for (arg = dnp->dn_acts; arg != NULL; arg = arg->dn_list)
 			dt_node_printr(arg, fp, depth + 1);
+		(void) fprintf(fp, "\n");
 		break;
 
 	case DT_NODE_INLINE:
@@ -4841,6 +5028,24 @@ dt_node_printr(dt_node_t *dnp, FILE *fp,
 			dt_node_printr(arg, fp, depth + 1);
 		break;
 
+	case DT_NODE_IF:
+		(void) fprintf(fp, "IF attr=%s CONDITION:\n", a);
+
+		dt_node_printr(dnp->dn_conditional, fp, depth + 1);
+
+		(void) fprintf(fp, "%*sIF BODY: \n", depth * 2, "");
+		for (arg = dnp->dn_body; arg != NULL; arg = arg->dn_list)
+			dt_node_printr(arg, fp, depth + 1);
+
+		if (dnp->dn_alternate_body != NULL) {
+			(void) fprintf(fp, "%*sIF ELSE: \n", depth * 2, "");
+			for (arg = dnp->dn_alternate_body; arg != NULL;
+			    arg = arg->dn_list)
+				dt_node_printr(arg, fp, depth + 1);
+		}
+
+		break;
+
 	default:
 		(void) fprintf(fp, "<bad node %p, kind %d>\n",
 		    (void *)dnp, dnp->dn_kind);
@@ -4979,7 +5184,7 @@ yylabel(const char *label)
 	yypcb->pcb_region = label;
 }
 
-#if 0
+#ifndef __NetBSD__
 int
 yywrap(void)
 {
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.h,v
retrieving revision 1.3
diff -u -p -r1.3 dt_parser.h
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.h	4 Feb 2016 16:48:34 -0000	1.3
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.h	13 Apr 2017 01:56:30 -0000
@@ -23,7 +23,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2013 Joyent, Inc. All rights reserved.
  */
 
@@ -105,6 +105,12 @@ typedef struct dt_node {
 			struct dt_node *_probes;  /* list of probe nodes */
 			int _redecl;		/* provider redeclared */
 		} _provider;
+
+		struct {
+			struct dt_node *_conditional;
+			struct dt_node *_body;
+			struct dt_node *_alternate_body;
+		} _conditional;
 	} dn_u;
 
 	struct dt_node *dn_list; /* parse tree list link */
@@ -140,6 +146,11 @@ typedef struct dt_node {
 #define	dn_provred	dn_u._provider._redecl	/* DT_NODE_PROVIDER */
 #define	dn_probes	dn_u._provider._probes	/* DT_NODE_PROVIDER */
 
+/* DT_NODE_IF: */
+#define	dn_conditional		dn_u._conditional._conditional
+#define	dn_body			dn_u._conditional._body
+#define	dn_alternate_body	dn_u._conditional._alternate_body
+
 #define	DT_NODE_FREE	0	/* unused node (waiting to be freed) */
 #define	DT_NODE_INT	1	/* integer value */
 #define	DT_NODE_STRING	2	/* string value */
@@ -162,6 +173,7 @@ typedef struct dt_node {
 #define	DT_NODE_PROBE	19	/* probe definition */
 #define	DT_NODE_PROVIDER 20	/* provider definition */
 #define	DT_NODE_PROG	21	/* program translation unit */
+#define	DT_NODE_IF	22	/* if statement */
 
 #define	DT_NF_SIGNED	0x01	/* data is a signed quantity (else unsigned) */
 #define	DT_NF_COOKED	0x02	/* data is a known type (else still cooking) */
@@ -213,6 +225,7 @@ extern dt_node_t *dt_node_xlator(dt_decl
 extern dt_node_t *dt_node_probe(char *, int, dt_node_t *, dt_node_t *);
 extern dt_node_t *dt_node_provider(char *, dt_node_t *);
 extern dt_node_t *dt_node_program(dt_node_t *);
+extern dt_node_t *dt_node_if(dt_node_t *, dt_node_t *, dt_node_t *);
 
 extern dt_node_t *dt_node_link(dt_node_t *, dt_node_t *);
 extern dt_node_t *dt_node_cook(dt_node_t *, uint_t);
@@ -237,6 +250,7 @@ extern void dt_node_promote(dt_node_t *,
 extern void dt_node_diftype(dtrace_hdl_t *,
     const dt_node_t *, dtrace_diftype_t *);
 extern void dt_node_printr(dt_node_t *, FILE *, int);
+extern void dt_printd(dt_node_t *, FILE *, int);
 extern const char *dt_node_name(const dt_node_t *, char *, size_t);
 extern int dt_node_root(dt_node_t *);
 
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c,v
retrieving revision 1.7
diff -u -p -r1.7 dt_pid.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c	24 Sep 2015 14:25:29 -0000	1.7
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c	29 Apr 2017 03:26:56 -0000
@@ -572,12 +572,6 @@ dt_pid_usdt_mapping(void *data, const pr
 	prsyminfo_t sip;
 	dof_helper_t dh;
 	GElf_Half e_type;
-#if defined(__FreeBSD__) || defined(__NetBSD__)
-	dof_hdr_t hdr;
-	size_t sz;
-	uint64_t dofmax;
-	void *dof;
-#endif
 	const char *mname;
 	const char *syms[] = { "___SUNW_dof", "__SUNW_dof" };
 	int i, fd = -1;
@@ -607,61 +601,24 @@ dt_pid_usdt_mapping(void *data, const pr
 			continue;
 		}
 
-#if defined(__FreeBSD__) || defined(__NetBSD__)
+		dh.dofhp_dof = sym.st_value;
 		dh.dofhp_addr = (e_type == ET_EXEC) ? 0 : pmp->pr_vaddr;
-		if (Pread(P, &hdr, sizeof (hdr), sym.st_value) !=
-		    sizeof (hdr)) {
-			dt_dprintf("read of DOF header failed\n");
-			continue;
-		}
-
-		sz = sizeof(dofmax);
-		if (sysctlbyname("kern.dtrace.dof_maxsize", &dofmax, &sz,
-		    NULL, 0) != 0) {
-			dt_dprintf("failed to read dof_maxsize: %s\n",
-			    strerror(errno));
-			continue;
-		}
-		if (dofmax < hdr.dofh_loadsz) {
-			dt_dprintf("DOF load size exceeds maximum\n");
-			continue;
-		}
-
-		if ((dof = malloc(hdr.dofh_loadsz)) == NULL)
-			return (-1);
-
-		if (Pread(P, dof, hdr.dofh_loadsz, sym.st_value) !=
-		    hdr.dofh_loadsz) {
-			free(dof);
-			dt_dprintf("read of DOF section failed\n");
-			continue;
-		}
-
-		dh.dofhp_dof = (uintptr_t)dof;
-		dh.dofhp_pid = proc_getpid(P);
-
 		dt_pid_objname(dh.dofhp_mod, sizeof (dh.dofhp_mod),
 		    sip.prs_lmid, mname);
 
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+		dh.dofhp_pid = proc_getpid(P);
+
 		if (fd == -1 &&
 		    (fd = open("/dev/dtrace/helper", O_RDWR, 0)) < 0) {
 			dt_dprintf("open of helper device failed: %s\n",
 			    strerror(errno));
-			free(dof);
 			return (-1); /* errno is set for us */
 		}
 
 		if (ioctl(fd, DTRACEHIOC_ADDDOF, &dh, sizeof (dh)) < 0)
 			dt_dprintf("DOF was rejected for %s\n", dh.dofhp_mod);
-
-		free(dof);
 #else
-		dh.dofhp_dof = sym.st_value;
-		dh.dofhp_addr = (e_type == ET_EXEC) ? 0 : pmp->pr_vaddr;
-
-		dt_pid_objname(dh.dofhp_mod, sizeof (dh.dofhp_mod),
-		    sip.prs_lmid, mname);
-
 		if (fd == -1 &&
 		    (fd = pr_open(P, "/dev/dtrace/helper", O_RDWR, 0)) < 0) {
 			dt_dprintf("pr_open of helper device failed: %s\n",
@@ -772,8 +729,13 @@ dt_pid_create_probes(dtrace_probedesc_t 
 	(void) snprintf(provname, sizeof (provname), "pid%d", (int)pid);
 
 	if (gmatch(provname, pdp->dtpd_provider) != 0) {
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+		if ((P = dt_proc_grab(dtp, pid, 0, 1)) == NULL)
+#else
 		if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE,
-		    0)) == NULL) {
+		    0)) == NULL)
+#endif
+		{
 			(void) dt_pid_error(dtp, pcb, NULL, NULL, D_PROC_GRAB,
 			    "failed to grab process %d", (int)pid);
 			return (-1);
@@ -974,7 +936,6 @@ dt_pid_get_types(dtrace_hdl_t *dtp, cons
 		mptr = pdp->dtpd_mod;
 		lmid = 0;
 	}
-	__USE(lmid);
 
 	if (Pxlookup_by_name(p, lmid, mptr, pdp->dtpd_func,
 	    &sym, &si) != 0) {
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_print.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_print.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_print.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_print.c	24 Sep 2015 14:25:29 -0000	1.1
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_print.c	29 Apr 2017 03:27:15 -0000
@@ -190,7 +190,7 @@ print_bitfield(dt_printarg_t *pap, ulong
 		value >>= shift;
 	value &= mask;
 
-	(void) fprintf(fp, "%#llx", (unsigned long long)value);
+	(void) fprintf(fp, "%#llx", (u_longlong_t)value);
 }
 
 /*
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c,v
retrieving revision 1.10
diff -u -p -r1.10 dt_printf.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c	1 Oct 2015 20:51:19 -0000	1.10
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c	29 Apr 2017 03:28:59 -0000
@@ -25,11 +25,7 @@
  * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
-#ifdef illumos
 #include <sys/sysmacros.h>
-#else
-#define	ABS(a)		((a) < 0 ? -(a) : (a))
-#endif
 #include <string.h>
 #include <strings.h>
 #include <stdlib.h>
@@ -301,7 +297,8 @@ pfprint_fp(dtrace_hdl_t *dtp, FILE *fp, 
     const dt_pfargd_t *pfd, const void *addr, size_t size, uint64_t normal)
 {
 	double n = (double)normal;
-#if !defined(__arm__) && !defined(__powerpc__) && !defined(__mips__)
+#if !defined(__arm__) && !defined(__powerpc__) && \
+    !defined(__mips__) && !defined(__riscv__)
 	long double ldn = (long double)normal;
 #endif
 
@@ -312,7 +309,8 @@ pfprint_fp(dtrace_hdl_t *dtp, FILE *fp, 
 	case sizeof (double):
 		return (dt_printf(dtp, fp, format,
 		    *((double *)addr) / n));
-#if !defined(__arm__) && !defined(__powerpc__) && !defined(__mips__)
+#if !defined(__arm__) && !defined(__powerpc__) && \
+    !defined(__mips__) && !defined(__riscv__)
 	case sizeof (long double):
 		return (dt_printf(dtp, fp, format,
 		    *((long double *)addr) / ldn));
@@ -524,6 +522,7 @@ pfprint_port(dtrace_hdl_t *dtp, FILE *fp
 
 #ifdef illumos
 	if ((sv = getservbyport_r(port, NULL, &res, buf, sizeof (buf))) != NULL)
+		return (dt_printf(dtp, fp, format, sv->s_name));
 #elif defined(__FreeBSD__)
 	if (getservbyport_r(port, NULL, &res, buf, sizeof (buf), &sv) > 0)
 		return (dt_printf(dtp, fp, format, sv->s_name));
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c,v
retrieving revision 1.8
diff -u -p -r1.8 dt_proc.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c	26 Sep 2015 00:33:34 -0000	1.8
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c	29 Apr 2017 03:29:14 -0000
@@ -98,10 +98,6 @@
 #define	IS_SYS_EXEC(w)	(w == SYS_execve)
 #define	IS_SYS_FORK(w)	(w == SYS_vfork || w == SYS_forksys)
 
-#if !defined(__DECONST) && defined(__UNCONST)
-#define __DECONST(a, b) __UNCONST(b)
-#endif
-
 static dt_bkpt_t *
 dt_proc_bpcreate(dt_proc_t *dpr, uintptr_t addr, dt_bkpt_f *func, void *data)
 {
@@ -115,7 +111,11 @@ dt_proc_bpcreate(dt_proc_t *dpr, uintptr
 		dbp->dbp_data = data;
 		dbp->dbp_addr = addr;
 
+#ifdef __NetBSD__
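+		/*
+		 * On NetBSD, Psetbkpt() saves the displaced instruction in a
+		 * proc_breakpoint_t via a pointer; elsewhere dbp_instr is a
+		 * plain ulong_t passed by value (see dt_bkpt_t in dt_proc.h).
+		 */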
 		if (Psetbkpt(P, dbp->dbp_addr, &dbp->dbp_instr) == 0)
+#else
+		if (Psetbkpt(P, dbp->dbp_addr, dbp->dbp_instr) == 0)
+#endif
 			dbp->dbp_active = B_TRUE;
 
 		dt_list_append(&dpr->dpr_bps, dbp);
@@ -186,7 +186,11 @@ dt_proc_bpmatch(dtrace_hdl_t *dtp, dt_pr
 	    (int)dpr->dpr_pid, (ulong_t)dbp->dbp_addr, ++dbp->dbp_hits);
 
 	dbp->dbp_func(dtp, dpr, dbp->dbp_data);
+#ifdef __NetBSD__
 	(void) Pxecbkpt(dpr->dpr_proc, &dbp->dbp_instr);
+#else
+	(void) Pxecbkpt(dpr->dpr_proc, dbp->dbp_instr);
+#endif
 }
 
 static void
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.h,v
retrieving revision 1.4
diff -u -p -r1.4 dt_proc.h
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.h	26 Sep 2015 00:33:34 -0000	1.4
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.h	13 Apr 2017 17:05:26 -0000
@@ -80,7 +80,11 @@ typedef struct dt_bkpt {
 	dt_bkpt_f *dbp_func;		/* callback function to execute */
 	void *dbp_data;			/* callback function private data */
 	uintptr_t dbp_addr;		/* virtual address of breakpoint */
+#ifdef __NetBSD__
 	proc_breakpoint_t dbp_instr;	/* saved instruction from breakpoint */
+#else
+	ulong_t dbp_instr;		/* saved instruction from breakpoint */
+#endif
 	ulong_t dbp_hits;		/* count of breakpoint hits for debug */
 	int dbp_active;			/* flag indicating breakpoint is on */
 } dt_bkpt_t;
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_provider.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_provider.c,v
retrieving revision 1.3
diff -u -p -r1.3 dt_provider.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_provider.c	24 Sep 2015 14:25:29 -0000	1.3
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_provider.c	29 Apr 2017 16:01:03 -0000
@@ -545,9 +545,7 @@ dt_probe_define(dt_provider_t *pvp, dt_p
 
 	for (pip = prp->pr_inst; pip != NULL; pip = pip->pi_next) {
 		if (strcmp(pip->pi_fname, fname) == 0 &&
-		    ((rname == NULL && pip->pi_rname == NULL) ||
-		    (rname != NULL && pip->pi_rname != NULL &&
-		    strcmp(pip->pi_rname, rname) == 0)))
+		    strcmp(pip->pi_rname, rname) == 0)
 			break;
 	}
 
@@ -565,7 +563,7 @@ dt_probe_define(dt_provider_t *pvp, dt_p
 		if ((pip->pi_fname = strdup(fname)) == NULL)
 			goto nomem;
 
-		if (rname != NULL && (pip->pi_rname = strdup(rname)) == NULL)
+		if ((pip->pi_rname = strdup(rname)) == NULL)
 			goto nomem;
 
 		pip->pi_noffs = 0;
@@ -605,7 +603,7 @@ dt_probe_define(dt_provider_t *pvp, dt_p
 	dt_dprintf("defined probe %s %s:%s %s() +0x%x (%s)\n",
 	    isenabled ? "(is-enabled)" : "",
 	    pvp->pv_desc.dtvd_name, prp->pr_ident->di_name, fname, offset,
-	    rname != NULL ? rname : fname);
+	    rname);
 
 	assert(*noffs < *maxoffs);
 	(*offs)[(*noffs)++] = offset;
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_regset.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_regset.c,v
retrieving revision 1.2
diff -u -p -r1.2 dt_regset.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_regset.c	24 Sep 2015 14:25:29 -0000	1.2
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_regset.c	13 Apr 2017 17:08:43 -0000
@@ -27,6 +27,7 @@
 
 /*
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2016 Pedro Giffuni.  All rights reserved.
  */
 
 #include <sys/types.h>
@@ -47,15 +48,15 @@ dt_regset_create(ulong_t nregs)
 	if (drp == NULL)
 		return (NULL);
 
-	drp->dr_bitmap = malloc(sizeof (ulong_t) * n);
-	drp->dr_size = nregs;
+	drp->dr_bitmap = calloc(n, sizeof (ulong_t));
 
 	if (drp->dr_bitmap == NULL) {
 		dt_regset_destroy(drp);
 		return (NULL);
 	}
 
-	bzero(drp->dr_bitmap, sizeof (ulong_t) * n);
+	drp->dr_size = nregs;
+
 	return (drp);
 }
 
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_strtab.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_strtab.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 dt_strtab.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_strtab.c	20 Feb 2010 04:33:49 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_strtab.c	13 Apr 2017 17:09:37 -0000
@@ -24,6 +24,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Portions Copyright 2016 Pedro Giffuni.  All rights reserved.
+ */
+
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/types.h>
@@ -70,12 +74,11 @@ dt_strtab_create(size_t bufsz)
 		return (NULL);
 
 	bzero(sp, sizeof (dt_strtab_t));
-	sp->str_hash = malloc(nbuckets * sizeof (dt_strhash_t *));
+	sp->str_hash = calloc(nbuckets, sizeof (dt_strhash_t *));
 
 	if (sp->str_hash == NULL)
 		goto err;
 
-	bzero(sp->str_hash, nbuckets * sizeof (dt_strhash_t *));
 	sp->str_hashsz = nbuckets;
 	sp->str_bufs = NULL;
 	sp->str_ptr = NULL;
@@ -253,8 +256,10 @@ dt_strtab_insert(dt_strtab_t *sp, const 
 	 * Now copy the string data into our buffer list, and then update
 	 * the global counts of strings and bytes.  Return str's byte offset.
 	 */
-	if (dt_strtab_copyin(sp, str, len + 1) == -1)
+	if (dt_strtab_copyin(sp, str, len + 1) == -1) {
+		free(hp);
 		return (-1L);
+	}
 
 	sp->str_nstrs++;
 	sp->str_size += len + 1;
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c,v
retrieving revision 1.12
diff -u -p -r1.12 dt_subr.c
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c	24 Sep 2015 14:25:29 -0000	1.12
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c	29 Apr 2017 03:39:45 -0000
@@ -582,6 +582,7 @@ int
 dt_printf(dtrace_hdl_t *dtp, FILE *fp, const char *format, ...)
 {
 	va_list ap;
+	va_list ap2;
 	int n;
 
 #ifndef illumos
@@ -606,11 +607,13 @@ dt_printf(dtrace_hdl_t *dtp, FILE *fp, c
 		len = dtp->dt_sprintf_buflen - len;
 		assert(len >= 0);
 
-		if ((n = vsnprintf(buf, len, format, ap)) < 0)
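+		/*
+		 * vsnprintf() leaves its va_list indeterminate, so format
+		 * from a va_copy()'d duplicate; ap itself stays valid for
+		 * va_end() below.
+		 */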
+		va_copy(ap2, ap);
+		if ((n = vsnprintf(buf, len, format, ap2)) < 0)
 			n = dt_set_errno(dtp, errno);
 
+		va_end(ap2);
 		va_end(ap);
-
+
 		return (n);
 	}
 
@@ -641,11 +644,14 @@ dt_printf(dtrace_hdl_t *dtp, FILE *fp, c
 			dtp->dt_buffered_buf[0] = '\0';
 		}
 
-		if ((needed = vsnprintf(NULL, 0, format, ap)) < 0) {
+		va_copy(ap2, ap);
+		if ((needed = vsnprintf(NULL, 0, format, ap2)) < 0) {
 			rval = dt_set_errno(dtp, errno);
+			va_end(ap2);
 			va_end(ap);
 			return (rval);
 		}
+		va_end(ap2);
 
 		if (needed == 0) {
 			va_end(ap);
@@ -671,12 +677,15 @@ dt_printf(dtrace_hdl_t *dtp, FILE *fp, c
 			dtp->dt_buffered_size <<= 1;
 		}
 
+		va_copy(ap2, ap);
 		if (vsnprintf(&dtp->dt_buffered_buf[dtp->dt_buffered_offs],
-		    avail, format, ap) < 0) {
+		    avail, format, ap2) < 0) {
 			rval = dt_set_errno(dtp, errno);
+			va_end(ap2);
 			va_end(ap);
 			return (rval);
 		}
+		va_end(ap2);
 
 		dtp->dt_buffered_offs += needed;
 		assert(dtp->dt_buffered_buf[dtp->dt_buffered_offs] == '\0');
@@ -684,8 +693,10 @@ dt_printf(dtrace_hdl_t *dtp, FILE *fp, c
 		return (0);
 	}
 
-	n = vfprintf(fp, format, ap);
+	va_copy(ap2, ap);
+	n = vfprintf(fp, format, ap2);
 	fflush(fp);
+	va_end(ap2);
 	va_end(ap);
 
 	if (n < 0) {
@@ -920,7 +931,7 @@ dtrace_addr2str(dtrace_hdl_t *dtp, uint6
 
 	if (err == 0 && addr != sym.st_value) {
 		(void) snprintf(s, n, "%s`%s+0x%llx", dts.dts_object,
-		    dts.dts_name, (unsigned long long)addr - sym.st_value);
+		    dts.dts_name, (u_longlong_t)addr - sym.st_value);
 	} else if (err == 0) {
 		(void) snprintf(s, n, "%s`%s",
 		    dts.dts_object, dts.dts_name);
@@ -932,9 +943,9 @@ dtrace_addr2str(dtrace_hdl_t *dtp, uint6
 		 */
 		if (dtrace_lookup_by_addr(dtp, addr, NULL, &dts) == 0) {
 			(void) snprintf(s, n, "%s`0x%llx", dts.dts_object,
-			    (unsigned long long)addr);
+			    (u_longlong_t)addr);
 		} else {
-			(void) snprintf(s, n, "0x%llx", (unsigned long long)addr);
+			(void) snprintf(s, n, "0x%llx", (u_longlong_t)addr);
 		}
 	}
 
@@ -967,7 +978,7 @@ dtrace_uaddr2str(dtrace_hdl_t *dtp, pid_
 
 		if (addr > sym.st_value) {
 			(void) snprintf(c, sizeof (c), "%s`%s+0x%llx", obj,
-			    name, (unsigned long long)(addr - sym.st_value));
+			    name, (u_longlong_t)(addr - sym.st_value));
 		} else {
 			(void) snprintf(c, sizeof (c), "%s`%s", obj, name);
 		}
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_sugar.c
===================================================================
RCS file: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_sugar.c
diff -N src/external/cddl/osnet/dist/lib/libdtrace/common/dt_sugar.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_sugar.c	10 Oct 2016 11:14:31 -0000
@@ -0,0 +1,516 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * Syntactic sugar features are implemented by transforming the D parse tree
+ * such that it only uses the subset of D that is supported by the rest of the
+ * compiler / the kernel.  A clause containing these language features is
+ * referred to as a "super-clause", and its transformation typically entails
+ * creating several "sub-clauses" to implement it. For diagnosability, the
+ * sub-clauses will be printed if the "-xtree=8" flag is specified.
+ *
+ * Currently, the only syntactic sugar feature is "if/else" statements.  Each
+ * basic block (e.g. the body of the "if" and "else" statements, and the
+ * statements before and after) is turned into its own sub-clause, with a
+ * predicate that causes it to be executed only if the code flows to this point.
+ * Nested if/else statements are supported.
+ *
+ * This infrastructure is designed to accommodate other syntactic sugar features
+ * in the future.
+ */
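+/*
+ * Illustrative sketch (assumed shape; the real clauses are built by the
+ * helpers below): a super-clause such as
+ *
+ *	proc:::exec-success
+ *	{
+ *		if (execname == "sh")
+ *			trace(pid);
+ *		else
+ *			trace(ppid);
+ *	}
+ *
+ * becomes one sub-clause that evaluates the condition into
+ * this->%condition_1, one sub-clause predicated on that condition for
+ * the "if" body, one predicated on its negation for the "else" body,
+ * and a dtrace:::ERROR clause that records failures in self->%error.
+ */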
+
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/sysmacros.h>
+
+#include <assert.h>
+#include <strings.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <dt_module.h>
+#include <dt_program.h>
+#include <dt_provider.h>
+#include <dt_printf.h>
+#include <dt_pid.h>
+#include <dt_grammar.h>
+#include <dt_ident.h>
+#include <dt_string.h>
+#include <dt_impl.h>
+
+typedef struct dt_sugar_parse {
+	dtrace_hdl_t *dtsp_dtp;		/* dtrace handle */
+	dt_node_t *dtsp_pdescs;		/* probe descriptions */
+	int dtsp_num_conditions;	/* number of condition variables */
+	int dtsp_num_ifs;		/* number of "if" statements */
+	dt_node_t *dtsp_clause_list;	/* list of clauses */
+} dt_sugar_parse_t;
+
+static void dt_sugar_visit_stmts(dt_sugar_parse_t *, dt_node_t *, int);
+
+/*
+ * Return a node for "self->%error".
+ *
+ * Note that the "%" is part of the variable name, and is included so that
+ * this variable name cannot collide with any user-specified variable.
+ *
+ * This error variable keeps track of whether an error has occurred in any
+ * of the sub-clauses, and is used to prevent execution of subsequent
+ * sub-clauses following an error.
+ */
+static dt_node_t *
+dt_sugar_new_error_var(void)
+{
+	return (dt_node_op2(DT_TOK_PTR, dt_node_ident(strdup("self")),
+	    dt_node_ident(strdup("%error"))));
+}
+
+/*
+ * Append this clause to the clause list.
+ */
+static void
+dt_sugar_append_clause(dt_sugar_parse_t *dp, dt_node_t *clause)
+{
+	dp->dtsp_clause_list = dt_node_link(dp->dtsp_clause_list, clause);
+}
+
+/*
+ * Prepend this clause to the clause list.
+ */
+static void
+dt_sugar_prepend_clause(dt_sugar_parse_t *dp, dt_node_t *clause)
+{
+	dp->dtsp_clause_list = dt_node_link(clause, dp->dtsp_clause_list);
+}
+
+/*
+ * Return a node for "this->%condition_<condid>", or NULL if condid==0.
+ *
+ * Note that the "%" is part of the variable name, and is included so that
+ * this variable name cannot collide with any user-specified variable.
+ */
+static dt_node_t *
+dt_sugar_new_condition_var(int condid)
+{
+	char *str;
+
+	if (condid == 0)
+		return (NULL);
+	assert(condid > 0);
+
+	(void) asprintf(&str, "%%condition_%d", ABS(condid));
+	return (dt_node_op2(DT_TOK_PTR, dt_node_ident(strdup("this")),
+	    dt_node_ident(str)));
+}
+
+/*
+ * Return new clause to evaluate predicate and set newcond.  condid is
+ * the condition that we are already under, or 0 if none.
+ * The new clause will be of the form:
+ *
+ * dp_pdescs
+ * /!self->%error/
+ * {
+ *	this->%condition_<newcond> =
+ *	    (this->%condition_<condid> && pred);
+ * }
+ *
+ * Note: if condid==0, we will instead do "... = (1 && pred)", to effectively
+ * convert the pred to a boolean.
+ *
+ * Note: Unless an error has been encountered, we always set the condition
+ * variable (either to 0 or 1).  This lets us avoid resetting the condition
+ * variables back to 0 when the super-clause completes.
+ */
+static dt_node_t *
+dt_sugar_new_condition_impl(dt_sugar_parse_t *dp,
+    dt_node_t *pred, int condid, int newcond)
+{
+	dt_node_t *value, *body, *newpred;
+
+	/* predicate is !self->%error */
+	newpred = dt_node_op1(DT_TOK_LNEG, dt_sugar_new_error_var());
+
+	if (condid == 0) {
+		/*
+		 * value is (1 && pred)
+		 *
+		 * Note, D doesn't allow a probe-local "this" variable to
+		 * be reused as a different type, even from a different probe.
+		 * Therefore, value can't simply be <pred>, because then
+		 * its type could be different when we reuse this condid
+		 * in a different meta-clause.
+		 */
+		value = dt_node_op2(DT_TOK_LAND, dt_node_int(1), pred);
+	} else {
+		/* value is (this->%condition_<condid> && pred) */
+		value = dt_node_op2(DT_TOK_LAND,
+		    dt_sugar_new_condition_var(condid), pred);
+	}
+
+	/* body is "this->%condition_<retval> = <value>;" */
+	body = dt_node_statement(dt_node_op2(DT_TOK_ASGN,
+	    dt_sugar_new_condition_var(newcond), value));
+
+	return (dt_node_clause(dp->dtsp_pdescs, newpred, body));
+}
+
+/*
+ * Generate a new clause to evaluate predicate and set a new condition variable,
+ * whose ID will be returned.  The new clause will be appended to the
+ * parse state's clause list (dtsp_clause_list).
+ */
+static int
+dt_sugar_new_condition(dt_sugar_parse_t *dp, dt_node_t *pred, int condid)
+{
+	dp->dtsp_num_conditions++;
+	dt_sugar_append_clause(dp, dt_sugar_new_condition_impl(dp,
+	    pred, condid, dp->dtsp_num_conditions));
+	return (dp->dtsp_num_conditions);
+}
+
+/*
+ * Visit the specified node and all of its descendants.  Currently this is only
+ * used to count the number of "if" statements (dtsp_num_ifs).
+ */
+static void
+dt_sugar_visit_all(dt_sugar_parse_t *dp, dt_node_t *dnp)
+{
+	dt_node_t *arg;
+
+	switch (dnp->dn_kind) {
+	case DT_NODE_FREE:
+	case DT_NODE_INT:
+	case DT_NODE_STRING:
+	case DT_NODE_SYM:
+	case DT_NODE_TYPE:
+	case DT_NODE_PROBE:
+	case DT_NODE_PDESC:
+	case DT_NODE_IDENT:
+		break;
+
+	case DT_NODE_FUNC:
+		for (arg = dnp->dn_args; arg != NULL; arg = arg->dn_list)
+			dt_sugar_visit_all(dp, arg);
+		break;
+
+	case DT_NODE_OP1:
+		dt_sugar_visit_all(dp, dnp->dn_child);
+		break;
+
+	case DT_NODE_OP2:
+		dt_sugar_visit_all(dp, dnp->dn_left);
+		dt_sugar_visit_all(dp, dnp->dn_right);
+		if (dnp->dn_op == DT_TOK_LBRAC) {
+			dt_node_t *ln = dnp->dn_right;
+			while (ln->dn_list != NULL) {
+				dt_sugar_visit_all(dp, ln->dn_list);
+				ln = ln->dn_list;
+			}
+		}
+		break;
+
+	case DT_NODE_OP3:
+		dt_sugar_visit_all(dp, dnp->dn_expr);
+		dt_sugar_visit_all(dp, dnp->dn_left);
+		dt_sugar_visit_all(dp, dnp->dn_right);
+		break;
+
+	case DT_NODE_DEXPR:
+	case DT_NODE_DFUNC:
+		dt_sugar_visit_all(dp, dnp->dn_expr);
+		break;
+
+	case DT_NODE_AGG:
+		for (arg = dnp->dn_aggtup; arg != NULL; arg = arg->dn_list)
+			dt_sugar_visit_all(dp, arg);
+
+		if (dnp->dn_aggfun)
+			dt_sugar_visit_all(dp, dnp->dn_aggfun);
+		break;
+
+	case DT_NODE_CLAUSE:
+		for (arg = dnp->dn_pdescs; arg != NULL; arg = arg->dn_list)
+			dt_sugar_visit_all(dp, arg);
+
+		if (dnp->dn_pred != NULL)
+			dt_sugar_visit_all(dp, dnp->dn_pred);
+
+		for (arg = dnp->dn_acts; arg != NULL; arg = arg->dn_list)
+			dt_sugar_visit_all(dp, arg);
+		break;
+
+	case DT_NODE_INLINE: {
+		const dt_idnode_t *inp = dnp->dn_ident->di_iarg;
+
+		dt_sugar_visit_all(dp, inp->din_root);
+		break;
+	}
+	case DT_NODE_MEMBER:
+		if (dnp->dn_membexpr)
+			dt_sugar_visit_all(dp, dnp->dn_membexpr);
+		break;
+
+	case DT_NODE_XLATOR:
+		for (arg = dnp->dn_members; arg != NULL; arg = arg->dn_list)
+			dt_sugar_visit_all(dp, arg);
+		break;
+
+	case DT_NODE_PROVIDER:
+		for (arg = dnp->dn_probes; arg != NULL; arg = arg->dn_list)
+			dt_sugar_visit_all(dp, arg);
+		break;
+
+	case DT_NODE_PROG:
+		for (arg = dnp->dn_list; arg != NULL; arg = arg->dn_list)
+			dt_sugar_visit_all(dp, arg);
+		break;
+
+	case DT_NODE_IF:
+		dp->dtsp_num_ifs++;
+		dt_sugar_visit_all(dp, dnp->dn_conditional);
+
+		for (arg = dnp->dn_body; arg != NULL; arg = arg->dn_list)
+			dt_sugar_visit_all(dp, arg);
+		for (arg = dnp->dn_alternate_body; arg != NULL;
+		    arg = arg->dn_list)
+			dt_sugar_visit_all(dp, arg);
+
+		break;
+
+	default:
+		(void) dnerror(dnp, D_UNKNOWN, "bad node %p, kind %d\n",
+		    (void *)dnp, dnp->dn_kind);
+	}
+}
+
+/*
+ * Return a new clause which resets the error variable to zero:
+ *
+ *   dp_pdescs{ self->%error = 0; }
+ *
+ * This clause will be executed at the beginning of each meta-clause, to
+ * ensure the error variable is unset (in case the previous meta-clause
+ * failed).
+ */
+static dt_node_t *
+dt_sugar_new_clearerror_clause(dt_sugar_parse_t *dp)
+{
+	dt_node_t *stmt = dt_node_statement(dt_node_op2(DT_TOK_ASGN,
+	    dt_sugar_new_error_var(), dt_node_int(0)));
+	return (dt_node_clause(dp->dtsp_pdescs, NULL, stmt));
+}
+
+/*
+ * Evaluate the conditional, and recursively visit the body of the "if"
+ * statement (and the "else", if present).
+ */
+static void
+dt_sugar_do_if(dt_sugar_parse_t *dp, dt_node_t *if_stmt, int precondition)
+{
+	int newid;
+
+	assert(if_stmt->dn_kind == DT_NODE_IF);
+
+	/* condition */
+	newid = dt_sugar_new_condition(dp,
+	    if_stmt->dn_conditional, precondition);
+
+	/* body of if */
+	dt_sugar_visit_stmts(dp, if_stmt->dn_body, newid);
+
+	/*
+	 * Visit the body of the "else" statement, if present.  Note that we
+	 * generate a new condition which is the inverse of the previous
+	 * condition.
+	 */
+	if (if_stmt->dn_alternate_body != NULL) {
+		dt_node_t *pred =
+		    dt_node_op1(DT_TOK_LNEG, dt_sugar_new_condition_var(newid));
+		dt_sugar_visit_stmts(dp, if_stmt->dn_alternate_body,
+		    dt_sugar_new_condition(dp, pred, precondition));
+	}
+}
+
+/*
+ * Generate a new clause to evaluate the statements based on the condition.
+ * The new clause will be appended to the parse state's dtsp_clause_list.
+ *
+ * dp_pdescs
+ * /!self->%error && this->%condition_<condid>/
+ * {
+ *	stmts
+ * }
+ */
+static void
+dt_sugar_new_basic_block(dt_sugar_parse_t *dp, int condid, dt_node_t *stmts)
+{
+	dt_node_t *pred = NULL;
+
+	if (condid == 0) {
+		/*
+		 * Don't bother with !error on the first clause, because if
+		 * there is only one clause, we don't add the prelude to
+		 * zero out %error.
+		 */
+		if (dp->dtsp_num_conditions != 0) {
+			pred = dt_node_op1(DT_TOK_LNEG,
+			    dt_sugar_new_error_var());
+		}
+	} else {
+		pred = dt_node_op2(DT_TOK_LAND,
+		    dt_node_op1(DT_TOK_LNEG, dt_sugar_new_error_var()),
+		    dt_sugar_new_condition_var(condid));
+	}
+	dt_sugar_append_clause(dp,
+	    dt_node_clause(dp->dtsp_pdescs, pred, stmts));
+}
+
+/*
+ * Visit all the statements in this list, and break them into basic blocks,
+ * generating new clauses for "if" and "else" statements.
+ */
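+/*
+ * Sketch: for a statement list "A; if (c) { B; } C;" this emits one basic
+ * block for A under the current precondition, lets dt_sugar_do_if() emit
+ * the blocks for B (and any "else") under a new condition derived from c,
+ * and then emits a block for C, again under the original precondition.
+ */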
+static void
+dt_sugar_visit_stmts(dt_sugar_parse_t *dp, dt_node_t *stmts, int precondition)
+{
+	dt_node_t *stmt;
+	dt_node_t *prev_stmt = NULL;
+	dt_node_t *next_stmt;
+	dt_node_t *first_stmt_in_basic_block = NULL;
+
+	for (stmt = stmts; stmt != NULL; stmt = next_stmt) {
+		next_stmt = stmt->dn_list;
+
+		if (stmt->dn_kind != DT_NODE_IF) {
+			if (first_stmt_in_basic_block == NULL)
+				first_stmt_in_basic_block = stmt;
+			prev_stmt = stmt;
+			continue;
+		}
+
+		/*
+		 * Remove this and following statements from the previous
+		 * clause.
+		 */
+		if (prev_stmt != NULL)
+			prev_stmt->dn_list = NULL;
+
+		/*
+		 * Generate clause for statements preceding the "if"
+		 */
+		if (first_stmt_in_basic_block != NULL) {
+			dt_sugar_new_basic_block(dp, precondition,
+			    first_stmt_in_basic_block);
+		}
+
+		dt_sugar_do_if(dp, stmt, precondition);
+
+		first_stmt_in_basic_block = NULL;
+
+		prev_stmt = stmt;
+	}
+
+	/* generate clause for statements after last "if". */
+	if (first_stmt_in_basic_block != NULL) {
+		dt_sugar_new_basic_block(dp, precondition,
+		    first_stmt_in_basic_block);
+	}
+}
+
+/*
+ * Generate a new clause which will set the error variable when an error occurs.
+ * Only one of these clauses is created per program (e.g. script file).
+ * The clause is:
+ *
+ * dtrace:::ERROR{ self->%error = 1; }
+ */
+static dt_node_t *
+dt_sugar_makeerrorclause(void)
+{
+	dt_node_t *acts, *pdesc;
+
+	pdesc = dt_node_pdesc_by_name(strdup("dtrace:::ERROR"));
+
+	acts = dt_node_statement(dt_node_op2(DT_TOK_ASGN,
+	    dt_sugar_new_error_var(), dt_node_int(1)));
+
+	return (dt_node_clause(pdesc, NULL, acts));
+}
+
+/*
+ * Transform the super-clause into straight-D, returning the new list of
+ * sub-clauses.
+ */
+dt_node_t *
+dt_compile_sugar(dtrace_hdl_t *dtp, dt_node_t *clause)
+{
+	dt_sugar_parse_t dp = { 0 };
+	int condid = 0;
+
+	dp.dtsp_dtp = dtp;
+	dp.dtsp_pdescs = clause->dn_pdescs;
+
+	/* make dt_node_int() generate an "int"-typed integer */
+	yyintdecimal = B_TRUE;
+	yyintsuffix[0] = '\0';
+	yyintprefix = 0;
+
+	dt_sugar_visit_all(&dp, clause);
+
+	if (dp.dtsp_num_ifs == 0 && dp.dtsp_num_conditions == 0) {
+		/*
+		 * There is nothing that modifies the number of clauses.  Use
+		 * the existing clause as-is, with its predicate intact.  This
+		 * ensures that in the absence of D sugar, the body of the
+		 * clause can create a variable that is referenced in the
+		 * predicate.
+		 */
+		dt_sugar_append_clause(&dp, dt_node_clause(clause->dn_pdescs,
+		    clause->dn_pred, clause->dn_acts));
+	} else {
+		if (clause->dn_pred != NULL) {
+			condid = dt_sugar_new_condition(&dp,
+			    clause->dn_pred, condid);
+		}
+
+		if (clause->dn_acts == NULL) {
+			/*
+			 * dt_sugar_visit_stmts() does not emit a clause with
+			 * an empty body (e.g. if there's an empty "if" body),
+			 * but we need the empty body here so that we
+			 * continue to get the default tracing action.
+			 */
+			dt_sugar_new_basic_block(&dp, condid, NULL);
+		} else {
+			dt_sugar_visit_stmts(&dp, clause->dn_acts, condid);
+		}
+	}
+
+	if (dp.dtsp_num_conditions != 0) {
+		dt_sugar_prepend_clause(&dp,
+		    dt_sugar_new_clearerror_clause(&dp));
+	}
+
+	if (dp.dtsp_clause_list != NULL &&
+	    dp.dtsp_clause_list->dn_list != NULL && !dtp->dt_has_sugar) {
+		dtp->dt_has_sugar = B_TRUE;
+		dt_sugar_prepend_clause(&dp, dt_sugar_makeerrorclause());
+	}
+	return (dp.dtsp_clause_list);
+}
Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dtrace.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dtrace.h,v
retrieving revision 1.5
diff -u -p -r1.5 dtrace.h
--- src/external/cddl/osnet/dist/lib/libdtrace/common/dtrace.h	24 Sep 2015 14:25:29 -0000	1.5
+++ src/external/cddl/osnet/dist/lib/libdtrace/common/dtrace.h	13 Apr 2017 17:14:37 -0000
@@ -25,7 +25,7 @@
  */
 
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
@@ -59,6 +59,7 @@ extern "C" {
 #define	DTRACE_VERSION	3		/* library ABI interface version */
 
 struct ps_prochandle;
+struct dt_node;
 typedef struct dtrace_hdl dtrace_hdl_t;
 typedef struct dtrace_prog dtrace_prog_t;
 typedef struct dtrace_vector dtrace_vector_t;
@@ -115,7 +116,7 @@ typedef struct dtrace_proginfo {
 #define	DTRACE_C_CPP	0x0010	/* Preprocess input file with cpp(1) utility */
 #define	DTRACE_C_KNODEF	0x0020	/* Permit unresolved kernel symbols in DIFO */
 #define	DTRACE_C_UNODEF	0x0040	/* Permit unresolved user symbols in DIFO */
-#define	DTRACE_C_PSPEC	0x0080	/* Intepret ambiguous specifiers as probes */
+#define	DTRACE_C_PSPEC	0x0080	/* Interpret ambiguous specifiers as probes */
 #define	DTRACE_C_ETAGS	0x0100	/* Prefix error messages with error tags */
 #define	DTRACE_C_ARGREF	0x0200	/* Do not require all macro args to be used */
 #define	DTRACE_C_DEFARG	0x0800	/* Use 0/"" as value for unspecified args */
@@ -523,6 +524,10 @@ extern int dtrace_type_strcompile(dtrace
 extern int dtrace_type_fcompile(dtrace_hdl_t *,
     FILE *, dtrace_typeinfo_t *);
 
+extern struct dt_node *dt_compile_sugar(dtrace_hdl_t *,
+    struct dt_node *);
+
 /*
  * DTrace Probe Interface
  *
Index: src/external/cddl/osnet/dist/lib/libdtrace/i386/dt_isadep.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/i386/dt_isadep.c,v
retrieving revision 1.1
diff -u -p -r1.1 dt_isadep.c
--- src/external/cddl/osnet/dist/lib/libdtrace/i386/dt_isadep.c	24 Sep 2015 14:26:09 -0000	1.1
+++ src/external/cddl/osnet/dist/lib/libdtrace/i386/dt_isadep.c	7 May 2017 03:58:22 -0000
@@ -40,9 +40,8 @@
 
 #include <dis_tables.h>
 
-#ifndef illumos
-#define PR_MODEL_ILP32	1
-#define PR_MODEL_LP64	2
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+#include <libproc.h>
 #include <libproc_compat.h>
 #endif
 
@@ -94,11 +93,7 @@ dt_pid_has_jump_table(struct ps_prochand
 	char dmodel = Pstatus(P)->pr_dmodel;
 #else
 	pid_t pid = proc_getpid(P);
-#if __i386__
-	char dmodel = PR_MODEL_ILP32;
-#elif __amd64__
-	char dmodel = PR_MODEL_LP64;
-#endif
+	char dmodel = proc_getmodel(P);
 #endif
 
 	/*
@@ -150,11 +145,7 @@ dt_pid_create_return_probe(struct ps_pro
 	char dmodel = Pstatus(P)->pr_dmodel;
 #else
 	pid_t pid = proc_getpid(P);
-#if __i386__
-	char dmodel = PR_MODEL_ILP32;
-#elif __amd64__
-	char dmodel = PR_MODEL_LP64;
-#endif
+	char dmodel = proc_getmodel(P);
 #endif
 
 	/*
@@ -311,11 +302,7 @@ dt_pid_create_offset_probe(struct ps_pro
 		char dmodel = Pstatus(P)->pr_dmodel;
 #else
 		pid_t pid = proc_getpid(P);
-#if __i386__
-		char dmodel = PR_MODEL_ILP32;
-#elif __amd64__
-		char dmodel = PR_MODEL_LP64;
-#endif
+		char dmodel = proc_getmodel(P);
 #endif
 
 		if ((text = malloc(symp->st_size)) == NULL) {
@@ -394,11 +381,7 @@ dt_pid_create_glob_offset_probes(struct 
 	char dmodel = Pstatus(P)->pr_dmodel;
 #else
 	pid_t pid = proc_getpid(P);
-#if __i386__
-	char dmodel = PR_MODEL_ILP32;
-#elif __amd64__
-	char dmodel = PR_MODEL_LP64;
-#endif
+	char dmodel = proc_getmodel(P);
 #endif
 
 	ftp->ftps_type = DTFTP_OFFSETS;
@@ -522,13 +505,8 @@ dt_instr_size(uchar_t *instr, dtrace_hdl
 
 	cpu_mode = (dmodel == PR_MODEL_ILP32) ? SIZE32 : SIZE64;
 
-#ifdef notyet
-	// XXX: does not compile!
 	if (dtrace_disx86(&x86dis, cpu_mode) != 0)
 		return (-1);
-#else
-	__USE(cpu_mode);
-#endif
 
 	/*
 	 * If the instruction was a single-byte breakpoint, there may be
Index: src/external/cddl/osnet/dist/lib/libgen/common/gmatch.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libgen/common/gmatch.c,v
retrieving revision 1.4
diff -u -p -r1.4 gmatch.c
--- src/external/cddl/osnet/dist/lib/libgen/common/gmatch.c	7 Feb 2015 20:30:03 -0000	1.4
+++ src/external/cddl/osnet/dist/lib/libgen/common/gmatch.c	13 Apr 2017 17:22:08 -0000
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,34 +19,34 @@
  * CDDL HEADER END
  */
 
-/*	Copyright (c) 1988 AT&T	*/
-/*	  All Rights Reserved  	*/
-
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"	/* SVr4.0 1.1.5.2 */
+/*	Copyright (c) 1988 AT&T	*/
+/*	  All Rights Reserved  	*/
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 /*LINTLIBRARY*/
 
-#if defined(sun)
+#ifdef illumos
 #pragma weak gmatch = _gmatch
 #endif
 
-#if defined(sun)
+#ifdef illumos
 #include "gen_synonyms.h"
 #endif
 #include <sys/types.h>
 #include <libgen.h>
 #include <stdlib.h>
 #include <limits.h>
-#include <ctype.h>
-#if defined(sun)
+#ifdef illumos
 #include <widec.h>
 #include "_range.h"
 #else
+#include <ctype.h>
 /* DOODAD */ static int multibyte = 0;
 #define WCHAR_CSMASK    0x30000000
 #define valid_range(c1, c2) \
@@ -61,7 +60,7 @@
 	c = cl; \
 	if (n <= 0) \
 		return (0); \
-	p += n;
+	p += n
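+/*
+ * The macro body deliberately ends without a semicolon, so that every
+ * Popwchar(p, c); call site below supplies its own and reads as an
+ * ordinary statement.
+ */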
 
 int gmatch(const char *, const char *);
 int
@@ -103,13 +102,13 @@ gmatch(const char *s, const char *p)
 				notflag = 1;
 				p++;
 			}
-			Popwchar(p, c)
+			Popwchar(p, c);
 			do
 			{
 				if (c == '-' && lc && *p != ']') {
-					Popwchar(p, c)
+					Popwchar(p, c);
 					if (c == '\\') {
-						Popwchar(p, c)
+						Popwchar(p, c);
 					}
 					if (notflag) {
 						if (!multibyte ||
@@ -128,7 +127,7 @@ gmatch(const char *s, const char *p)
 					}
 				} else if (c == '\\') {
 					/* skip to quoted character */
-					Popwchar(p, c)
+					Popwchar(p, c);
 				}
 				lc = c;
 				if (notflag) {
@@ -142,14 +141,14 @@ gmatch(const char *s, const char *p)
 					if (scc == lc)
 						ok++;
 				}
-				Popwchar(p, c)
+				Popwchar(p, c);
 			} while (c != ']');
 			return (ok ? gmatch(s, p) : 0);
 		}
 
 	case '\\':
 		/* skip to quoted character and see if it matches */
-		Popwchar(p, c)
+		Popwchar(p, c);
 
 	default:
 		if (c != scc)
Index: src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c,v
retrieving revision 1.3
diff -u -p -r1.3 libnvpair.c
--- src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c	19 Oct 2013 23:07:39 -0000	1.3
+++ src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c	20 Apr 2017 17:38:21 -0000
@@ -19,15 +19,17 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
+#include <solaris.h>
+#include <inttypes.h>
 #include <unistd.h>
-#include <strings.h>
+#include <string.h>
 #include <libintl.h>
-#include <sys/types.h>
-#include <sys/inttypes.h>
+#include <stdarg.h>
+#include <note.h>
 #include "libnvpair.h"
 
 /*
@@ -38,21 +40,531 @@
  *	between kernel and userland, and possibly saving onto disk files.
  */
 
+/*
+ * Print control structure.
+ */
+
+#define	DEFINEOP(opname, vtype) \
+	struct { \
+		int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \
+		    const char *, vtype); \
+		void *arg; \
+	} opname
+
+#define	DEFINEARROP(opname, vtype) \
+	struct { \
+		int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \
+		    const char *, vtype, uint_t); \
+		void *arg; \
+	} opname
+
+struct nvlist_printops {
+	DEFINEOP(print_boolean, int);
+	DEFINEOP(print_boolean_value, boolean_t);
+	DEFINEOP(print_byte, uchar_t);
+	DEFINEOP(print_int8, int8_t);
+	DEFINEOP(print_uint8, uint8_t);
+	DEFINEOP(print_int16, int16_t);
+	DEFINEOP(print_uint16, uint16_t);
+	DEFINEOP(print_int32, int32_t);
+	DEFINEOP(print_uint32, uint32_t);
+	DEFINEOP(print_int64, int64_t);
+	DEFINEOP(print_uint64, uint64_t);
+	DEFINEOP(print_double, double);
+	DEFINEOP(print_string, char *);
+	DEFINEOP(print_hrtime, hrtime_t);
+	DEFINEOP(print_nvlist, nvlist_t *);
+	DEFINEARROP(print_boolean_array, boolean_t *);
+	DEFINEARROP(print_byte_array, uchar_t *);
+	DEFINEARROP(print_int8_array, int8_t *);
+	DEFINEARROP(print_uint8_array, uint8_t *);
+	DEFINEARROP(print_int16_array, int16_t *);
+	DEFINEARROP(print_uint16_array, uint16_t *);
+	DEFINEARROP(print_int32_array, int32_t *);
+	DEFINEARROP(print_uint32_array, uint32_t *);
+	DEFINEARROP(print_int64_array, int64_t *);
+	DEFINEARROP(print_uint64_array, uint64_t *);
+	DEFINEARROP(print_string_array, char **);
+	DEFINEARROP(print_nvlist_array, nvlist_t **);
+};
+
+struct nvlist_prtctl {
+	FILE *nvprt_fp;			/* output destination */
+	enum nvlist_indent_mode nvprt_indent_mode; /* see above */
+	int nvprt_indent;		/* absolute indent, or tab depth */
+	int nvprt_indentinc;		/* indent or tab increment */
+	const char *nvprt_nmfmt;	/* member name format, max one %s */
+	const char *nvprt_eomfmt;	/* after member format, e.g. "\n" */
+	const char *nvprt_btwnarrfmt;	/* between array members */
+	int nvprt_btwnarrfmt_nl;	/* nvprt_btwnarrfmt includes newline? */
+	struct nvlist_printops *nvprt_dfltops;
+	struct nvlist_printops *nvprt_custops;
+};
+
+#define	DFLTPRTOP(pctl, type) \
+	((pctl)->nvprt_dfltops->print_##type.op)
+
+#define	DFLTPRTOPARG(pctl, type) \
+	((pctl)->nvprt_dfltops->print_##type.arg)
+
+#define	CUSTPRTOP(pctl, type) \
+	((pctl)->nvprt_custops->print_##type.op)
+
+#define	CUSTPRTOPARG(pctl, type) \
+	((pctl)->nvprt_custops->print_##type.arg)
+
+#define	RENDER(pctl, type, nvl, name, val) \
+	{ \
+		int done = 0; \
+		if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \
+			done = CUSTPRTOP(pctl, type)(pctl, \
+			    CUSTPRTOPARG(pctl, type), nvl, name, val); \
+		} \
+		if (!done) { \
+			(void) DFLTPRTOP(pctl, type)(pctl, \
+			    DFLTPRTOPARG(pctl, type), nvl, name, val); \
+		} \
+		(void) fprintf(pctl->nvprt_fp, pctl->nvprt_eomfmt); \
+	}
+
+#define	ARENDER(pctl, type, nvl, name, arrp, count) \
+	{ \
+		int done = 0; \
+		if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \
+			done = CUSTPRTOP(pctl, type)(pctl, \
+			    CUSTPRTOPARG(pctl, type), nvl, name, arrp, count); \
+		} \
+		if (!done) { \
+			(void) DFLTPRTOP(pctl, type)(pctl, \
+			    DFLTPRTOPARG(pctl, type), nvl, name, arrp, count); \
+		} \
+		(void) fprintf(pctl->nvprt_fp, pctl->nvprt_eomfmt); \
+	}
+
+static void nvlist_print_with_indent(nvlist_t *, nvlist_prtctl_t);
+
+/*
+ * ======================================================================
+ * |									|
+ * | Indentation							|
+ * |									|
+ * ======================================================================
+ */
+
 static void
-indent(FILE *fp, int depth)
+indent(nvlist_prtctl_t pctl, int onemore)
 {
-	while (depth-- > 0)
-		(void) fprintf(fp, "\t");
+	int depth;
+
+	switch (pctl->nvprt_indent_mode) {
+	case NVLIST_INDENT_ABS:
+		(void) fprintf(pctl->nvprt_fp, "%*s",
+		    pctl->nvprt_indent + onemore * pctl->nvprt_indentinc, "");
+		break;
+
+	case NVLIST_INDENT_TABBED:
+		depth = pctl->nvprt_indent + onemore;
+		while (depth-- > 0)
+			(void) fprintf(pctl->nvprt_fp, "\t");
+	}
 }
 
 /*
- * nvlist_print - Prints elements in an event buffer
+ * ======================================================================
+ * |									|
+ * | Default nvlist member rendering functions.				|
+ * |									|
+ * ======================================================================
+ */
+
+/*
+ * Generate functions to print single-valued nvlist members.
+ *
+ * type_and_variant - suffix to form function name
+ * vtype - C type for the member value
+ * ptype - C type to cast value to for printing
+ * vfmt - format string for pair value, e.g. "%d" or "0x%llx"
+ */
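+/*
+ * For example, NVLIST_PRTFUNC(int8, int8_t, int, "%d") defines
+ * nvprint_int8(), which indents one level, prints the member name via
+ * nvprt_nmfmt, and renders the value cast to int with "%d".
+ */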
+
+#define	NVLIST_PRTFUNC(type_and_variant, vtype, ptype, vfmt) \
+static int \
+nvprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \
+    nvlist_t *nvl, const char *name, vtype value) \
+{ \
+	FILE *fp = pctl->nvprt_fp; \
+	NOTE(ARGUNUSED(private)) \
+	NOTE(ARGUNUSED(nvl)) \
+	indent(pctl, 1); \
+	(void) fprintf(fp, pctl->nvprt_nmfmt, name); \
+	(void) fprintf(fp, vfmt, (ptype)value); \
+	return (1); \
+}
+
+NVLIST_PRTFUNC(boolean, int, int, "%d")
+NVLIST_PRTFUNC(boolean_value, boolean_t, int, "%d")
+NVLIST_PRTFUNC(byte, uchar_t, uchar_t, "0x%2.2x")
+NVLIST_PRTFUNC(int8, int8_t, int, "%d")
+NVLIST_PRTFUNC(uint8, uint8_t, uint8_t, "0x%x")
+NVLIST_PRTFUNC(int16, int16_t, int16_t, "%d")
+NVLIST_PRTFUNC(uint16, uint16_t, uint16_t, "0x%x")
+NVLIST_PRTFUNC(int32, int32_t, int32_t, "%d")
+NVLIST_PRTFUNC(uint32, uint32_t, uint32_t, "0x%x")
+NVLIST_PRTFUNC(int64, int64_t, longlong_t, "%lld")
+NVLIST_PRTFUNC(uint64, uint64_t, u_longlong_t, "0x%llx")
+NVLIST_PRTFUNC(double, double, double, "0x%f")
+NVLIST_PRTFUNC(string, char *, char *, "%s")
+NVLIST_PRTFUNC(hrtime, hrtime_t, hrtime_t, "0x%llx")
+
+/*
+ * Generate functions to print array-valued nvlist members.
+ */
+
+#define	NVLIST_ARRPRTFUNC(type_and_variant, vtype, ptype, vfmt) \
+static int \
+nvaprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \
+    nvlist_t *nvl, const char *name, vtype *valuep, uint_t count) \
+{ \
+	FILE *fp = pctl->nvprt_fp; \
+	uint_t i; \
+	NOTE(ARGUNUSED(private)) \
+	NOTE(ARGUNUSED(nvl)) \
+	for (i = 0; i < count; i++) { \
+		if (i == 0 || pctl->nvprt_btwnarrfmt_nl) { \
+			indent(pctl, 1); \
+			(void) fprintf(fp, pctl->nvprt_nmfmt, name); \
+			if (pctl->nvprt_btwnarrfmt_nl) \
+				(void) fprintf(fp, "[%d]: ", i); \
+		} \
+		if (i != 0) \
+			(void) fprintf(fp, pctl->nvprt_btwnarrfmt); \
+		(void) fprintf(fp, vfmt, (ptype)valuep[i]); \
+	} \
+	return (1); \
+}
+
+NVLIST_ARRPRTFUNC(boolean_array, boolean_t, boolean_t, "%d")
+NVLIST_ARRPRTFUNC(byte_array, uchar_t, uchar_t, "0x%2.2x")
+NVLIST_ARRPRTFUNC(int8_array, int8_t, int8_t, "%d")
+NVLIST_ARRPRTFUNC(uint8_array, uint8_t, uint8_t, "0x%x")
+NVLIST_ARRPRTFUNC(int16_array, int16_t, int16_t, "%d")
+NVLIST_ARRPRTFUNC(uint16_array, uint16_t, uint16_t, "0x%x")
+NVLIST_ARRPRTFUNC(int32_array, int32_t, int32_t, "%d")
+NVLIST_ARRPRTFUNC(uint32_array, uint32_t, uint32_t, "0x%x")
+NVLIST_ARRPRTFUNC(int64_array, int64_t, longlong_t, "%lld")
+NVLIST_ARRPRTFUNC(uint64_array, uint64_t, u_longlong_t, "0x%llx")
+NVLIST_ARRPRTFUNC(string_array, char *, char *, "%s")
+
+/*ARGSUSED*/
+static int
+nvprint_nvlist(nvlist_prtctl_t pctl, void *private,
+    nvlist_t *nvl, const char *name, nvlist_t *value)
+{
+	FILE *fp = pctl->nvprt_fp;
+
+	indent(pctl, 1);
+	(void) fprintf(fp, "%s = (embedded nvlist)\n", name);
+
+	pctl->nvprt_indent += pctl->nvprt_indentinc;
+	nvlist_print_with_indent(value, pctl);
+	pctl->nvprt_indent -= pctl->nvprt_indentinc;
+
+	indent(pctl, 1);
+	(void) fprintf(fp, "(end %s)\n", name);
+
+	return (1);
+}
+
+/*ARGSUSED*/
+static int
+nvaprint_nvlist_array(nvlist_prtctl_t pctl, void *private,
+    nvlist_t *nvl, const char *name, nvlist_t **valuep, uint_t count)
+{
+	FILE *fp = pctl->nvprt_fp;
+	uint_t i;
+
+	indent(pctl, 1);
+	(void) fprintf(fp, "%s = (array of embedded nvlists)\n", name);
+
+	for (i = 0; i < count; i++) {
+		indent(pctl, 1);
+		(void) fprintf(fp, "(start %s[%d])\n", name, i);
+
+		pctl->nvprt_indent += pctl->nvprt_indentinc;
+		nvlist_print_with_indent(valuep[i], pctl);
+		pctl->nvprt_indent -= pctl->nvprt_indentinc;
+
+		indent(pctl, 1);
+		(void) fprintf(fp, "(end %s[%d])\n", name, i);
+	}
+
+	return (1);
+}
+
+/*
+ * ======================================================================
+ * |									|
+ * | Interfaces that allow control over formatting.			|
+ * |									|
+ * ======================================================================
+ */
+
+void
+nvlist_prtctl_setdest(nvlist_prtctl_t pctl, FILE *fp)
+{
+	pctl->nvprt_fp = fp;
+}
+
+FILE *
+nvlist_prtctl_getdest(nvlist_prtctl_t pctl)
+{
+	return (pctl->nvprt_fp);
+}
+
+
+void
+nvlist_prtctl_setindent(nvlist_prtctl_t pctl, enum nvlist_indent_mode mode,
+    int start, int inc)
+{
+	if (mode < NVLIST_INDENT_ABS || mode > NVLIST_INDENT_TABBED)
+		mode = NVLIST_INDENT_TABBED;
+
+	if (start < 0)
+		start = 0;
+
+	if (inc < 0)
+		inc = 1;
+
+	pctl->nvprt_indent_mode = mode;
+	pctl->nvprt_indent = start;
+	pctl->nvprt_indentinc = inc;
+}
+
+void
+nvlist_prtctl_doindent(nvlist_prtctl_t pctl, int onemore)
+{
+	indent(pctl, onemore);
+}
+
+
+void
+nvlist_prtctl_setfmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which,
+    const char *fmt)
+{
+	switch (which) {
+	case NVLIST_FMT_MEMBER_NAME:
+		if (fmt == NULL)
+			fmt = "%s = ";
+		pctl->nvprt_nmfmt = fmt;
+		break;
+
+	case NVLIST_FMT_MEMBER_POSTAMBLE:
+		if (fmt == NULL)
+			fmt = "\n";
+		pctl->nvprt_eomfmt = fmt;
+		break;
+
+	case NVLIST_FMT_BTWN_ARRAY:
+		if (fmt == NULL) {
+			pctl->nvprt_btwnarrfmt = " ";
+			pctl->nvprt_btwnarrfmt_nl = 0;
+		} else {
+			pctl->nvprt_btwnarrfmt = fmt;
+			pctl->nvprt_btwnarrfmt_nl = (strstr(fmt, "\n") != NULL);
+		}
+		break;
+
+	default:
+		break;
+	}
+}
+
+
+void
+nvlist_prtctl_dofmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, ...)
+{
+	FILE *fp = pctl->nvprt_fp;
+	va_list ap;
+	char *name;
+
+	va_start(ap, which);
+
+	switch (which) {
+	case NVLIST_FMT_MEMBER_NAME:
+		name = va_arg(ap, char *);
+		(void) fprintf(fp, pctl->nvprt_nmfmt, name);
+		break;
+
+	case NVLIST_FMT_MEMBER_POSTAMBLE:
+		(void) fprintf(fp, pctl->nvprt_eomfmt);
+		break;
+
+	case NVLIST_FMT_BTWN_ARRAY:
+		(void) fprintf(fp, pctl->nvprt_btwnarrfmt);
+		break;
+
+	default:
+		break;
+	}
+
+	va_end(ap);
+}
+
+/*
+ * ======================================================================
+ * |									|
+ * | Interfaces to allow appointment of replacement rendering functions.|
+ * |									|
+ * ======================================================================
+ */
+
+#define	NVLIST_PRINTCTL_REPLACE(type, vtype) \
+void \
+nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \
+    int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype), \
+    void *private) \
+{ \
+	CUSTPRTOP(pctl, type) = func; \
+	CUSTPRTOPARG(pctl, type) = private; \
+}
+
+NVLIST_PRINTCTL_REPLACE(boolean, int)
+NVLIST_PRINTCTL_REPLACE(boolean_value, boolean_t)
+NVLIST_PRINTCTL_REPLACE(byte, uchar_t)
+NVLIST_PRINTCTL_REPLACE(int8, int8_t)
+NVLIST_PRINTCTL_REPLACE(uint8, uint8_t)
+NVLIST_PRINTCTL_REPLACE(int16, int16_t)
+NVLIST_PRINTCTL_REPLACE(uint16, uint16_t)
+NVLIST_PRINTCTL_REPLACE(int32, int32_t)
+NVLIST_PRINTCTL_REPLACE(uint32, uint32_t)
+NVLIST_PRINTCTL_REPLACE(int64, int64_t)
+NVLIST_PRINTCTL_REPLACE(uint64, uint64_t)
+NVLIST_PRINTCTL_REPLACE(double, double)
+NVLIST_PRINTCTL_REPLACE(string, char *)
+NVLIST_PRINTCTL_REPLACE(hrtime, hrtime_t)
+NVLIST_PRINTCTL_REPLACE(nvlist, nvlist_t *)
+
+#define	NVLIST_PRINTCTL_AREPLACE(type, vtype) \
+void \
+nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \
+    int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, \
+    uint_t), void *private) \
+{ \
+	CUSTPRTOP(pctl, type) = func; \
+	CUSTPRTOPARG(pctl, type) = private; \
+}
+
+NVLIST_PRINTCTL_AREPLACE(boolean_array, boolean_t *)
+NVLIST_PRINTCTL_AREPLACE(byte_array, uchar_t *)
+NVLIST_PRINTCTL_AREPLACE(int8_array, int8_t *)
+NVLIST_PRINTCTL_AREPLACE(uint8_array, uint8_t *)
+NVLIST_PRINTCTL_AREPLACE(int16_array, int16_t *)
+NVLIST_PRINTCTL_AREPLACE(uint16_array, uint16_t *)
+NVLIST_PRINTCTL_AREPLACE(int32_array, int32_t *)
+NVLIST_PRINTCTL_AREPLACE(uint32_array, uint32_t *)
+NVLIST_PRINTCTL_AREPLACE(int64_array, int64_t *)
+NVLIST_PRINTCTL_AREPLACE(uint64_array, uint64_t *)
+NVLIST_PRINTCTL_AREPLACE(string_array, char **)
+NVLIST_PRINTCTL_AREPLACE(nvlist_array, nvlist_t **)
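
For reference, each of these one-line instantiations expands to a small
setter function; NVLIST_PRINTCTL_AREPLACE(uint64_array, uint64_t *), for
example, becomes (modulo line breaks):

	void
	nvlist_prtctlop_uint64_array(nvlist_prtctl_t pctl,
	    int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *,
	    uint64_t *, uint_t), void *private)
	{
		CUSTPRTOP(pctl, uint64_array) = func;
		CUSTPRTOPARG(pctl, uint64_array) = private;
	}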
+
+/*
+ * ======================================================================
+ * |									|
+ * | Interfaces to manage nvlist_prtctl_t cookies.			|
+ * |									|
+ * ======================================================================
  */
-static
+
+
+static const struct nvlist_printops defprtops = {
+	{ nvprint_boolean, NULL },
+	{ nvprint_boolean_value, NULL },
+	{ nvprint_byte, NULL },
+	{ nvprint_int8, NULL },
+	{ nvprint_uint8, NULL },
+	{ nvprint_int16, NULL },
+	{ nvprint_uint16, NULL },
+	{ nvprint_int32, NULL },
+	{ nvprint_uint32, NULL },
+	{ nvprint_int64, NULL },
+	{ nvprint_uint64, NULL },
+	{ nvprint_double, NULL },
+	{ nvprint_string, NULL },
+	{ nvprint_hrtime, NULL },
+	{ nvprint_nvlist, NULL },
+	{ nvaprint_boolean_array, NULL },
+	{ nvaprint_byte_array, NULL },
+	{ nvaprint_int8_array, NULL },
+	{ nvaprint_uint8_array, NULL },
+	{ nvaprint_int16_array, NULL },
+	{ nvaprint_uint16_array, NULL },
+	{ nvaprint_int32_array, NULL },
+	{ nvaprint_uint32_array, NULL },
+	{ nvaprint_int64_array, NULL },
+	{ nvaprint_uint64_array, NULL },
+	{ nvaprint_string_array, NULL },
+	{ nvaprint_nvlist_array, NULL },
+};
+
+static void
+prtctl_defaults(FILE *fp, struct nvlist_prtctl *pctl,
+    struct nvlist_printops *ops)
+{
+	pctl->nvprt_fp = fp;
+	pctl->nvprt_indent_mode = NVLIST_INDENT_TABBED;
+	pctl->nvprt_indent = 0;
+	pctl->nvprt_indentinc = 1;
+	pctl->nvprt_nmfmt = "%s = ";
+	pctl->nvprt_eomfmt = "\n";
+	pctl->nvprt_btwnarrfmt = " ";
+	pctl->nvprt_btwnarrfmt_nl = 0;
+
+	pctl->nvprt_dfltops = (struct nvlist_printops *)&defprtops;
+	pctl->nvprt_custops = ops;
+}
+
+nvlist_prtctl_t
+nvlist_prtctl_alloc(void)
+{
+	struct nvlist_prtctl *pctl;
+	struct nvlist_printops *ops;
+
+	if ((pctl = malloc(sizeof (*pctl))) == NULL)
+		return (NULL);
+
+	if ((ops = calloc(1, sizeof (*ops))) == NULL) {
+		free(pctl);
+		return (NULL);
+	}
+
+	prtctl_defaults(stdout, pctl, ops);
+
+	return (pctl);
+}
+
 void
-nvlist_print_with_indent(FILE *fp, nvlist_t *nvl, int depth)
+nvlist_prtctl_free(nvlist_prtctl_t pctl)
 {
-	int i;
+	if (pctl != NULL) {
+		free(pctl->nvprt_custops);
+		free(pctl);
+	}
+}
+
+/*
+ * ======================================================================
+ * |									|
+ * | Top-level print request interfaces.				|
+ * |									|
+ * ======================================================================
+ */
+
+/*
+ * nvlist_print - Prints elements in an event buffer
+ */
+static void
+nvlist_print_with_indent(nvlist_t *nvl, nvlist_prtctl_t pctl)
+{
+	FILE *fp = pctl->nvprt_fp;
 	char *name;
 	uint_t nelem;
 	nvpair_t *nvp;
@@ -60,7 +572,7 @@ nvlist_print_with_indent(FILE *fp, nvlis
 	if (nvl == NULL)
 		return;
 
-	indent(fp, depth);
+	indent(pctl, 0);
 	(void) fprintf(fp, "nvlist version: %d\n", NVL_VERSION(nvl));
 
 	nvp = nvlist_next_nvpair(nvl, NULL);
@@ -68,198 +580,174 @@ nvlist_print_with_indent(FILE *fp, nvlis
 	while (nvp) {
 		data_type_t type = nvpair_type(nvp);
 
-		indent(fp, depth);
 		name = nvpair_name(nvp);
-		(void) fprintf(fp, "\t%s =", name);
 		nelem = 0;
+
 		switch (type) {
 		case DATA_TYPE_BOOLEAN: {
-			(void) fprintf(fp, " 1");
+			RENDER(pctl, boolean, nvl, name, 1);
 			break;
 		}
 		case DATA_TYPE_BOOLEAN_VALUE: {
 			boolean_t val;
 			(void) nvpair_value_boolean_value(nvp, &val);
-			(void) fprintf(fp, " %d", val);
+			RENDER(pctl, boolean_value, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_BYTE: {
 			uchar_t val;
 			(void) nvpair_value_byte(nvp, &val);
-			(void) fprintf(fp, " 0x%2.2x", val);
+			RENDER(pctl, byte, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_INT8: {
 			int8_t val;
 			(void) nvpair_value_int8(nvp, &val);
-			(void) fprintf(fp, " %d", val);
+			RENDER(pctl, int8, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_UINT8: {
 			uint8_t val;
 			(void) nvpair_value_uint8(nvp, &val);
-			(void) fprintf(fp, " 0x%x", val);
+			RENDER(pctl, uint8, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_INT16: {
 			int16_t val;
 			(void) nvpair_value_int16(nvp, &val);
-			(void) fprintf(fp, " %d", val);
+			RENDER(pctl, int16, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_UINT16: {
 			uint16_t val;
 			(void) nvpair_value_uint16(nvp, &val);
-			(void) fprintf(fp, " 0x%x", val);
+			RENDER(pctl, uint16, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_INT32: {
 			int32_t val;
 			(void) nvpair_value_int32(nvp, &val);
-			(void) fprintf(fp, " %d", val);
+			RENDER(pctl, int32, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_UINT32: {
 			uint32_t val;
 			(void) nvpair_value_uint32(nvp, &val);
-			(void) fprintf(fp, " 0x%x", val);
+			RENDER(pctl, uint32, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_INT64: {
 			int64_t val;
 			(void) nvpair_value_int64(nvp, &val);
-			(void) fprintf(fp, " %" PRId64 , val);
+			RENDER(pctl, int64, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_UINT64: {
 			uint64_t val;
 			(void) nvpair_value_uint64(nvp, &val);
-			(void) fprintf(fp, " 0x%" PRIx64, val);
+			RENDER(pctl, uint64, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_DOUBLE: {
 			double val;
 			(void) nvpair_value_double(nvp, &val);
-			(void) fprintf(fp, " 0x%f", val);
+			RENDER(pctl, double, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_STRING: {
 			char *val;
 			(void) nvpair_value_string(nvp, &val);
-			(void) fprintf(fp, " %s", val);
+			RENDER(pctl, string, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_BOOLEAN_ARRAY: {
 			boolean_t *val;
 			(void) nvpair_value_boolean_array(nvp, &val, &nelem);
-			for (i = 0; i < nelem; i++)
-				(void) fprintf(fp, " %d", val[i]);
+			ARENDER(pctl, boolean_array, nvl, name, val, nelem);
 			break;
 		}
 		case DATA_TYPE_BYTE_ARRAY: {
 			uchar_t *val;
 			(void) nvpair_value_byte_array(nvp, &val, &nelem);
-			for (i = 0; i < nelem; i++)
-				(void) fprintf(fp, " 0x%2.2x", val[i]);
+			ARENDER(pctl, byte_array, nvl, name, val, nelem);
 			break;
 		}
 		case DATA_TYPE_INT8_ARRAY: {
 			int8_t *val;
 			(void) nvpair_value_int8_array(nvp, &val, &nelem);
-			for (i = 0; i < nelem; i++)
-				(void) fprintf(fp, " %d", val[i]);
+			ARENDER(pctl, int8_array, nvl, name, val, nelem);
 			break;
 		}
 		case DATA_TYPE_UINT8_ARRAY: {
 			uint8_t *val;
 			(void) nvpair_value_uint8_array(nvp, &val, &nelem);
-			for (i = 0; i < nelem; i++)
-				(void) fprintf(fp, " 0x%x", val[i]);
+			ARENDER(pctl, uint8_array, nvl, name, val, nelem);
 			break;
 		}
 		case DATA_TYPE_INT16_ARRAY: {
 			int16_t *val;
 			(void) nvpair_value_int16_array(nvp, &val, &nelem);
-			for (i = 0; i < nelem; i++)
-				(void) fprintf(fp, " %d", val[i]);
+			ARENDER(pctl, int16_array, nvl, name, val, nelem);
 			break;
 		}
 		case DATA_TYPE_UINT16_ARRAY: {
 			uint16_t *val;
 			(void) nvpair_value_uint16_array(nvp, &val, &nelem);
-			for (i = 0; i < nelem; i++)
-				(void) fprintf(fp, " 0x%x", val[i]);
+			ARENDER(pctl, uint16_array, nvl, name, val, nelem);
 			break;
 		}
 		case DATA_TYPE_INT32_ARRAY: {
 			int32_t *val;
 			(void) nvpair_value_int32_array(nvp, &val, &nelem);
-			for (i = 0; i < nelem; i++)
-				(void) fprintf(fp, " %d", val[i]);
+			ARENDER(pctl, int32_array, nvl, name, val, nelem);
 			break;
 		}
 		case DATA_TYPE_UINT32_ARRAY: {
 			uint32_t *val;
 			(void) nvpair_value_uint32_array(nvp, &val, &nelem);
-			for (i = 0; i < nelem; i++)
-				(void) fprintf(fp, " 0x%x", val[i]);
+			ARENDER(pctl, uint32_array, nvl, name, val, nelem);
 			break;
 		}
 		case DATA_TYPE_INT64_ARRAY: {
 			int64_t *val;
 			(void) nvpair_value_int64_array(nvp, &val, &nelem);
-			for (i = 0; i < nelem; i++)
-				(void) fprintf(fp, " %" PRId64, val[i]);
+			ARENDER(pctl, int64_array, nvl, name, val, nelem);
 			break;
 		}
 		case DATA_TYPE_UINT64_ARRAY: {
 			uint64_t *val;
 			(void) nvpair_value_uint64_array(nvp, &val, &nelem);
-			for (i = 0; i < nelem; i++)
-				(void) fprintf(fp, " 0x%" PRIx64, val[i]);
+			ARENDER(pctl, uint64_array, nvl, name, val, nelem);
 			break;
 		}
 		case DATA_TYPE_STRING_ARRAY: {
 			char **val;
 			(void) nvpair_value_string_array(nvp, &val, &nelem);
-			for (i = 0; i < nelem; i++)
-				(void) fprintf(fp, " %s", val[i]);
+			ARENDER(pctl, string_array, nvl, name, val, nelem);
 			break;
 		}
 		case DATA_TYPE_HRTIME: {
 			hrtime_t val;
 			(void) nvpair_value_hrtime(nvp, &val);
-			(void) fprintf(fp, " 0x%jx", (intmax_t)val);
+			RENDER(pctl, hrtime, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_NVLIST: {
 			nvlist_t *val;
 			(void) nvpair_value_nvlist(nvp, &val);
-			(void) fprintf(fp, " (embedded nvlist)\n");
-			nvlist_print_with_indent(fp, val, depth + 1);
-			indent(fp, depth + 1);
-			(void) fprintf(fp, "(end %s)\n", name);
+			RENDER(pctl, nvlist, nvl, name, val);
 			break;
 		}
 		case DATA_TYPE_NVLIST_ARRAY: {
 			nvlist_t **val;
 			(void) nvpair_value_nvlist_array(nvp, &val, &nelem);
-			(void) fprintf(fp, " (array of embedded nvlists)\n");
-			for (i = 0; i < nelem; i++) {
-				indent(fp, depth + 1);
-				(void) fprintf(fp,
-				    "(start %s[%d])\n", name, i);
-				nvlist_print_with_indent(fp, val[i], depth + 1);
-				indent(fp, depth + 1);
-				(void) fprintf(fp, "(end %s[%d])\n", name, i);
-			}
+			ARENDER(pctl, nvlist_array, nvl, name, val, nelem);
 			break;
 		}
 		default:
 			(void) fprintf(fp, " unknown data type (%d)", type);
 			break;
 		}
-		(void) fprintf(fp, "\n");
 		nvp = nvlist_next_nvpair(nvl, nvp);
 	}
 }
@@ -267,9 +755,17 @@ nvlist_print_with_indent(FILE *fp, nvlis
 void
 nvlist_print(FILE *fp, nvlist_t *nvl)
 {
-	nvlist_print_with_indent(fp, nvl, 0);
+	struct nvlist_prtctl pc;
+
+	prtctl_defaults(fp, &pc, NULL);
+	nvlist_print_with_indent(nvl, &pc);
 }
 
+void
+nvlist_prt(nvlist_t *nvl, nvlist_prtctl_t pctl)
+{
+	nvlist_print_with_indent(nvl, pctl);
+}
 
 #define	NVP(elem, type, vtype, ptype, format) { \
 	vtype	value; \
@@ -298,6 +794,7 @@ dump_nvlist(nvlist_t *list, int indent)
 {
 	nvpair_t	*elem = NULL;
 	boolean_t	bool_value;
+	boolean_t	*bool_array_value;
 	nvlist_t	*nvlist_value;
 	nvlist_t	**nvlist_array_value;
 	uint_t		i, count;
@@ -308,6 +805,10 @@ dump_nvlist(nvlist_t *list, int indent)
 
 	while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
 		switch (nvpair_type(elem)) {
+		case DATA_TYPE_BOOLEAN:
+			(void) printf("%*s%s\n", indent, "", nvpair_name(elem));
+			break;
+
 		case DATA_TYPE_BOOLEAN_VALUE:
 			(void) nvpair_value_boolean_value(elem, &bool_value);
 			(void) printf("%*s%s: %s\n", indent, "",
@@ -343,17 +844,27 @@ dump_nvlist(nvlist_t *list, int indent)
 			break;
 
 		case DATA_TYPE_INT64:
-			NVP(elem, int64, int64_t, int64_t, "%" PRIx64);
+			NVP(elem, int64, int64_t, longlong_t, "%lld");
 			break;
 
 		case DATA_TYPE_UINT64:
-			NVP(elem, uint64, uint64_t, uint64_t, "%" PRIu64);
+			NVP(elem, uint64, uint64_t, u_longlong_t, "%llu");
 			break;
 
 		case DATA_TYPE_STRING:
 			NVP(elem, string, char *, char *, "'%s'");
 			break;
 
+		case DATA_TYPE_BOOLEAN_ARRAY:
+			(void) nvpair_value_boolean_array(elem,
+			    &bool_array_value, &count);
+			for (i = 0; i < count; i++) {
+				(void) printf("%*s%s[%d]: %s\n", indent, "",
+				    nvpair_name(elem), i,
+				    bool_array_value[i] ? "true" : "false");
+			}
+			break;
+
 		case DATA_TYPE_BYTE_ARRAY:
 			NVPA(elem, byte_array, uchar_t, int, "%u");
 			break;
@@ -383,12 +894,12 @@ dump_nvlist(nvlist_t *list, int indent)
 			break;
 
 		case DATA_TYPE_INT64_ARRAY:
-			NVPA(elem, int64_array, int64_t, int64_t, "%" PRId64);
+			NVPA(elem, int64_array, int64_t, longlong_t, "%lld");
 			break;
 
 		case DATA_TYPE_UINT64_ARRAY:
-			NVPA(elem, uint64_array, uint64_t, uint64_t,
-			    "%" PRIu64);
+			NVPA(elem, uint64_array, uint64_t, u_longlong_t,
+			    "%llu");
 			break;
 
 		case DATA_TYPE_STRING_ARRAY:
@@ -421,6 +932,14 @@ dump_nvlist(nvlist_t *list, int indent)
 }
 
 /*
+ * ======================================================================
+ * |									|
+ * | Misc private interface.						|
+ * |									|
+ * ======================================================================
+ */
+
+/*
  * Determine if string 'value' matches 'nvp' value.  The 'value' string is
  * converted, depending on the type of 'nvp', prior to match.  For numeric
  * types, a radix independent sscanf conversion of 'value' is used. If 'nvp'
@@ -710,7 +1229,8 @@ nvpair_value_match_regex(nvpair_t *nvp, 
 		break;
 	}
 	case DATA_TYPE_BOOLEAN_VALUE: {
-		boolean_t val, val_arg;
+		int32_t val_arg;
+		boolean_t val;
 
 		/* scanf boolean_t from value and check for match */
 		sr = sscanf(value, "%"SCNi32, &val_arg);
@@ -721,7 +1241,8 @@ nvpair_value_match_regex(nvpair_t *nvp, 
 		break;
 	}
 	case DATA_TYPE_BOOLEAN_ARRAY: {
-		boolean_t *val_array, val_arg;
+		boolean_t *val_array;
+		int32_t val_arg;
 
 		/* check indexed value of array for match */
 		sr = sscanf(value, "%"SCNi32, &val_arg);
Index: src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 libnvpair.h
--- src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h	27 Feb 2010 22:30:14 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h	11 Dec 2014 11:54:49 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 #ifndef	_LIBNVPAIR_H
@@ -35,10 +35,159 @@
 extern "C" {
 #endif
 
-void nvlist_print(FILE *, nvlist_t *);
-int nvpair_value_match(nvpair_t *, int, char *, char **);
-int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, char **);
-void dump_nvlist(nvlist_t *, int);
+/*
+ * All interfaces described in this file are private to Solaris, and
+ * are subject to change at any time and without notice.  The public
+ * nvlist/nvpair interfaces, as documented in manpage sections 3NVPAIR,
+ * are all imported from <sys/nvpair.h> included above.
+ */
+
+extern int nvpair_value_match(nvpair_t *, int, char *, char **);
+extern int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *,
+    char **);
+
+extern void nvlist_print(FILE *, nvlist_t *);
+extern int nvlist_print_json(FILE *, nvlist_t *);
+extern void dump_nvlist(nvlist_t *, int);
+
+/*
+ * Private nvlist printing interface that allows the caller some control
+ * over output rendering (as opposed to nvlist_print and dump_nvlist).
+ *
+ * Obtain an opaque nvlist_prtctl_t cookie using nvlist_prtctl_alloc
+ * (NULL on failure);  on return the cookie is set up for default formatting
+ * and rendering.  Quote the cookie in subsequent customisation functions and
+ * then pass the cookie to nvlist_prt to render the nvlist.  Finally,
+ * use nvlist_prtctl_free to release the cookie.
+ *
+ * For all nvlist_lookup_xxx and nvlist_lookup_xxx_array functions
+ * we have a corresponding brace of functions that appoint replacement
+ * rendering functions:
+ *
+ *	extern void nvlist_prtctlop_xxx(nvlist_prtctl_t,
+ *	    int (*)(nvlist_prtctl_t ctl, void *private, nvlist_t *nvl,
+ *	    const char *name, xxxtype value), void *)
+ *
+ *	and
+ *
+ *	extern void nvlist_prtctlop_xxx_array(nvlist_prtctl_t,
+ *	    int (*)(nvlist_prtctl_t ctl, void *private, nvlist_t *nvl,
+ *	    const char *name, xxxtype value, uint_t count), void *)
+ *
+ * where xxxtype is the C datatype corresponding to xxx, eg int8_t for "int8"
+ * and char * for "string".  The appointed rendering function receives the
+ * cookie, the private argument registered with it, the full nvlist, the
+ * member name, the value of that member (or a pointer for array functions),
+ * and (for array rendering functions) a count of the number of elements.
+ */
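
The life cycle described above, as a minimal sketch (assuming an existing
nvlist_t *nvl):

	nvlist_prtctl_t pctl = nvlist_prtctl_alloc();

	if (pctl != NULL) {
		/* customisation calls (setdest, setindent, ...) go here */
		nvlist_prt(nvl, pctl);
		nvlist_prtctl_free(pctl);
	}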
+
+typedef struct nvlist_prtctl *nvlist_prtctl_t;	/* opaque */
+
+enum nvlist_indent_mode {
+	NVLIST_INDENT_ABS,	/* Absolute indentation */
+	NVLIST_INDENT_TABBED	/* Indent with tabstops */
+};
+
+extern nvlist_prtctl_t nvlist_prtctl_alloc(void);
+extern void nvlist_prtctl_free(nvlist_prtctl_t);
+extern void nvlist_prt(nvlist_t *, nvlist_prtctl_t);
+
+/* Output stream */
+extern void nvlist_prtctl_setdest(nvlist_prtctl_t, FILE *);
+extern FILE *nvlist_prtctl_getdest(nvlist_prtctl_t);
+
+/* Indentation mode, start indent, indent increment; default tabbed/0/1 */
+extern void nvlist_prtctl_setindent(nvlist_prtctl_t, enum nvlist_indent_mode,
+    int, int);
+extern void nvlist_prtctl_doindent(nvlist_prtctl_t, int);
+
+enum nvlist_prtctl_fmt {
+	NVLIST_FMT_MEMBER_NAME,		/* name fmt; default "%s = " */
+	NVLIST_FMT_MEMBER_POSTAMBLE,	/* after nvlist member; default "\n" */
+	NVLIST_FMT_BTWN_ARRAY		/* between array members; default " " */
+};
+
+extern void nvlist_prtctl_setfmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt,
+    const char *);
+extern void nvlist_prtctl_dofmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, ...);
+
+/*
+ * Function prototypes for interfaces that appoint a new rendering function
+ * for single-valued nvlist members.
+ *
+ * A replacement function receives arguments as follows:
+ *
+ *	nvlist_prtctl_t	Print control structure; do not change preferences
+ *			for this object from a print callback function.
+ *
+ *	void *		The function-private cookie argument registered
+ *			when the replacement function was appointed.
+ *
+ *	nvlist_t *	The full nvlist that is being processed.  The
+ *			rendering function is called to render a single
+ *			member (name and value passed as below) but it may
+ *			want to reference or incorporate other aspects of
+ *			the full nvlist.
+ *
+ *	const char *	Member name to render
+ *
+ *	valtype		Value of the member to render
+ *
+ * The function must return non-zero if it has rendered output for this
+ * member, or 0 if it wants to default to standard rendering for this
+ * one member.
+ */
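
A sketch of a conforming replacement function; the "password" member name is
purely illustrative.  It returns 0 for every other member, deferring to the
standard rendering:

	/*ARGSUSED*/
	static int
	censor_string(nvlist_prtctl_t pctl, void *private, nvlist_t *nvl,
	    const char *name, char *value)
	{
		if (strcmp(name, "password") != 0)
			return (0);	/* default rendering */

		nvlist_prtctl_doindent(pctl, 1);
		nvlist_prtctl_dofmt(pctl, NVLIST_FMT_MEMBER_NAME, name);
		(void) fprintf(nvlist_prtctl_getdest(pctl), "********");
		nvlist_prtctl_dofmt(pctl, NVLIST_FMT_MEMBER_POSTAMBLE);
		return (1);	/* rendered; suppress default output */
	}

It would be appointed with nvlist_prtctlop_string(pctl, censor_string, NULL).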
+
+#define	NVLIST_PRINTCTL_SVDECL(funcname, valtype) \
+    extern void funcname(nvlist_prtctl_t, \
+    int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, valtype), \
+    void *)
+
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean, int);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean_value, boolean_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_byte, uchar_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int8, int8_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint8, uint8_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int16, int16_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint16, uint16_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int32, int32_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint32, uint32_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int64, int64_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint64, uint64_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_double, double);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_string, char *);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_hrtime, hrtime_t);
+NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_nvlist, nvlist_t *);
+
+#undef	NVLIST_PRINTCTL_SVDECL	/* was just for "clarity" above */
+
+/*
+ * Function prototypes for interfaces that appoint a new rendering function
+ * for array-valued nvlist members.
+ *
+ * One additional argument is taken: uint_t for the number of array elements
+ *
+ * Return values as above.
+ */
+#define	NVLIST_PRINTCTL_AVDECL(funcname, vtype) \
+    extern void funcname(nvlist_prtctl_t, \
+    int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, uint_t), \
+    void *)
+
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_boolean_array, boolean_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_byte_array, uchar_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int8_array, int8_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint8_array, uint8_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int16_array, int16_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint16_array, uint16_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int32_array, int32_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint32_array, uint32_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int64_array, int64_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint64_array, uint64_t *);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_string_array, char **);
+NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_nvlist_array, nvlist_t **);
+
+#undef	NVLIST_PRINTCTL_AVDECL	/* was just for "clarity" above */
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dist/lib/libnvpair/nvpair_json.c
===================================================================
RCS file: src/external/cddl/osnet/dist/lib/libnvpair/nvpair_json.c
diff -N src/external/cddl/osnet/dist/lib/libnvpair/nvpair_json.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/lib/libnvpair/nvpair_json.c	11 Dec 2014 11:54:49 -0000
@@ -0,0 +1,403 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+/*
+ * Copyright (c) 2014, Joyent, Inc.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <wchar.h>
+#include <sys/debug.h>
+
+#include "libnvpair.h"
+
+#define	FPRINTF(fp, ...)				\
+	do {						\
+		if (fprintf(fp, __VA_ARGS__) < 0)	\
+			return (-1);			\
+	} while (0)
+
+/*
+ * When formatting a string for JSON output we must escape certain characters,
+ * as described in RFC4627.  This applies to both member names and
+ * DATA_TYPE_STRING values.
+ *
+ * This function will only operate correctly if the following conditions are
+ * met:
+ *
+ *       1. The input String is encoded in the current locale.
+ *
+ *       2. The current locale includes the Basic Multilingual Plane (plane 0)
+ *          as defined in the Unicode standard.
+ *
+ * The output will be entirely 7-bit ASCII (as a subset of UTF-8) with all
+ * representable Unicode characters included in their escaped numeric form.
+ */
+static int
+nvlist_print_json_string(FILE *fp, const char *input)
+{
+	mbstate_t mbr;
+	wchar_t c;
+	size_t sz;
+
+	bzero(&mbr, sizeof (mbr));
+
+	FPRINTF(fp, "\"");
+	while ((sz = mbrtowc(&c, input, MB_CUR_MAX, &mbr)) > 0 &&
+	    sz != (size_t)-1 && sz != (size_t)-2) {
+		switch (c) {
+		case '"':
+			FPRINTF(fp, "\\\"");
+			break;
+		case '\n':
+			FPRINTF(fp, "\\n");
+			break;
+		case '\r':
+			FPRINTF(fp, "\\r");
+			break;
+		case '\\':
+			FPRINTF(fp, "\\\\");
+			break;
+		case '\f':
+			FPRINTF(fp, "\\f");
+			break;
+		case '\t':
+			FPRINTF(fp, "\\t");
+			break;
+		case '\b':
+			FPRINTF(fp, "\\b");
+			break;
+		default:
+			if ((c >= 0x00 && c <= 0x1f) ||
+			    (c > 0x7f && c <= 0xffff)) {
+				/*
+				 * Render both Control Characters and Unicode
+				 * characters in the Basic Multilingual Plane
+				 * as JSON-escaped multibyte characters.
+				 */
+				FPRINTF(fp, "\\u%04x", (int)(0xffff & c));
+			} else if (c >= 0x20 && c <= 0x7f) {
+				/*
+				 * Render other 7-bit ASCII characters directly
+				 * and drop other, unrepresentable characters.
+				 */
+				FPRINTF(fp, "%c", (int)(0xff & c));
+			}
+			break;
+		}
+		input += sz;
+	}
+
+	if (sz == (size_t)-1 || sz == (size_t)-2) {
+		/*
+		 * We last read an invalid multibyte character sequence,
+		 * so return an error.
+		 */
+		return (-1);
+	}
+
+	FPRINTF(fp, "\"");
+	return (0);
+}
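
A worked example of the escaping rules above, assuming a UTF-8 locale
(U+2603 SNOWMAN is a BMP character outside 7-bit ASCII):

	input:   say "hi"<TAB>☃
	output:  "say \"hi\"\t\u2603"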
+
+/*
+ * Dump a JSON-formatted representation of an nvlist to the provided FILE *.
+ * This routine does not output any new-lines or additional whitespace other
+ * than that contained in strings, nor does it call fflush(3C).
+ */
+int
+nvlist_print_json(FILE *fp, nvlist_t *nvl)
+{
+	nvpair_t *curr;
+	boolean_t first = B_TRUE;
+
+	FPRINTF(fp, "{");
+
+	for (curr = nvlist_next_nvpair(nvl, NULL); curr;
+	    curr = nvlist_next_nvpair(nvl, curr)) {
+		data_type_t type = nvpair_type(curr);
+
+		if (!first)
+			FPRINTF(fp, ",");
+		else
+			first = B_FALSE;
+
+		if (nvlist_print_json_string(fp, nvpair_name(curr)) == -1)
+			return (-1);
+		FPRINTF(fp, ":");
+
+		switch (type) {
+		case DATA_TYPE_STRING: {
+			char *string = fnvpair_value_string(curr);
+			if (nvlist_print_json_string(fp, string) == -1)
+				return (-1);
+			break;
+		}
+
+		case DATA_TYPE_BOOLEAN: {
+			FPRINTF(fp, "true");
+			break;
+		}
+
+		case DATA_TYPE_BOOLEAN_VALUE: {
+			FPRINTF(fp, "%s", fnvpair_value_boolean_value(curr) ==
+			    B_TRUE ? "true" : "false");
+			break;
+		}
+
+		case DATA_TYPE_BYTE: {
+			FPRINTF(fp, "%hhu", fnvpair_value_byte(curr));
+			break;
+		}
+
+		case DATA_TYPE_INT8: {
+			FPRINTF(fp, "%hhd", fnvpair_value_int8(curr));
+			break;
+		}
+
+		case DATA_TYPE_UINT8: {
+			FPRINTF(fp, "%hhu", fnvpair_value_uint8_t(curr));
+			break;
+		}
+
+		case DATA_TYPE_INT16: {
+			FPRINTF(fp, "%hd", fnvpair_value_int16(curr));
+			break;
+		}
+
+		case DATA_TYPE_UINT16: {
+			FPRINTF(fp, "%hu", fnvpair_value_uint16(curr));
+			break;
+		}
+
+		case DATA_TYPE_INT32: {
+			FPRINTF(fp, "%d", fnvpair_value_int32(curr));
+			break;
+		}
+
+		case DATA_TYPE_UINT32: {
+			FPRINTF(fp, "%u", fnvpair_value_uint32(curr));
+			break;
+		}
+
+		case DATA_TYPE_INT64: {
+			FPRINTF(fp, "%lld",
+			    (long long)fnvpair_value_int64(curr));
+			break;
+		}
+
+		case DATA_TYPE_UINT64: {
+			FPRINTF(fp, "%llu",
+			    (unsigned long long)fnvpair_value_uint64(curr));
+			break;
+		}
+
+		case DATA_TYPE_HRTIME: {
+			hrtime_t val;
+			VERIFY0(nvpair_value_hrtime(curr, &val));
+			FPRINTF(fp, "%llu", (unsigned long long)val);
+			break;
+		}
+
+		case DATA_TYPE_DOUBLE: {
+			double val;
+			VERIFY0(nvpair_value_double(curr, &val));
+			FPRINTF(fp, "%f", val);
+			break;
+		}
+
+		case DATA_TYPE_NVLIST: {
+			if (nvlist_print_json(fp,
+			    fnvpair_value_nvlist(curr)) == -1)
+				return (-1);
+			break;
+		}
+
+		case DATA_TYPE_STRING_ARRAY: {
+			char **val;
+			uint_t valsz, i;
+			VERIFY0(nvpair_value_string_array(curr, &val, &valsz));
+			FPRINTF(fp, "[");
+			for (i = 0; i < valsz; i++) {
+				if (i > 0)
+					FPRINTF(fp, ",");
+				if (nvlist_print_json_string(fp, val[i]) == -1)
+					return (-1);
+			}
+			FPRINTF(fp, "]");
+			break;
+		}
+
+		case DATA_TYPE_NVLIST_ARRAY: {
+			nvlist_t **val;
+			uint_t valsz, i;
+			VERIFY0(nvpair_value_nvlist_array(curr, &val, &valsz));
+			FPRINTF(fp, "[");
+			for (i = 0; i < valsz; i++) {
+				if (i > 0)
+					FPRINTF(fp, ",");
+				if (nvlist_print_json(fp, val[i]) == -1)
+					return (-1);
+			}
+			FPRINTF(fp, "]");
+			break;
+		}
+
+		case DATA_TYPE_BOOLEAN_ARRAY: {
+			boolean_t *val;
+			uint_t valsz, i;
+			VERIFY0(nvpair_value_boolean_array(curr, &val, &valsz));
+			FPRINTF(fp, "[");
+			for (i = 0; i < valsz; i++) {
+				if (i > 0)
+					FPRINTF(fp, ",");
+				FPRINTF(fp, val[i] == B_TRUE ?
+				    "true" : "false");
+			}
+			FPRINTF(fp, "]");
+			break;
+		}
+
+		case DATA_TYPE_BYTE_ARRAY: {
+			uchar_t *val;
+			uint_t valsz, i;
+			VERIFY0(nvpair_value_byte_array(curr, &val, &valsz));
+			FPRINTF(fp, "[");
+			for (i = 0; i < valsz; i++) {
+				if (i > 0)
+					FPRINTF(fp, ",");
+				FPRINTF(fp, "%hhu", val[i]);
+			}
+			FPRINTF(fp, "]");
+			break;
+		}
+
+		case DATA_TYPE_UINT8_ARRAY: {
+			uint8_t *val;
+			uint_t valsz, i;
+			VERIFY0(nvpair_value_uint8_array(curr, &val, &valsz));
+			FPRINTF(fp, "[");
+			for (i = 0; i < valsz; i++) {
+				if (i > 0)
+					FPRINTF(fp, ",");
+				FPRINTF(fp, "%hhu", val[i]);
+			}
+			FPRINTF(fp, "]");
+			break;
+		}
+
+		case DATA_TYPE_INT8_ARRAY: {
+			int8_t *val;
+			uint_t valsz, i;
+			VERIFY0(nvpair_value_int8_array(curr, &val, &valsz));
+			FPRINTF(fp, "[");
+			for (i = 0; i < valsz; i++) {
+				if (i > 0)
+					FPRINTF(fp, ",");
+				FPRINTF(fp, "%hhd", val[i]);
+			}
+			FPRINTF(fp, "]");
+			break;
+		}
+
+		case DATA_TYPE_UINT16_ARRAY: {
+			uint16_t *val;
+			uint_t valsz, i;
+			VERIFY0(nvpair_value_uint16_array(curr, &val, &valsz));
+			FPRINTF(fp, "[");
+			for (i = 0; i < valsz; i++) {
+				if (i > 0)
+					FPRINTF(fp, ",");
+				FPRINTF(fp, "%hu", val[i]);
+			}
+			FPRINTF(fp, "]");
+			break;
+		}
+
+		case DATA_TYPE_INT16_ARRAY: {
+			int16_t *val;
+			uint_t valsz, i;
+			VERIFY0(nvpair_value_int16_array(curr, &val, &valsz));
+			FPRINTF(fp, "[");
+			for (i = 0; i < valsz; i++) {
+				if (i > 0)
+					FPRINTF(fp, ",");
+				FPRINTF(fp, "%hd", val[i]);
+			}
+			FPRINTF(fp, "]");
+			break;
+		}
+
+		case DATA_TYPE_UINT32_ARRAY: {
+			uint32_t *val;
+			uint_t valsz, i;
+			VERIFY0(nvpair_value_uint32_array(curr, &val, &valsz));
+			FPRINTF(fp, "[");
+			for (i = 0; i < valsz; i++) {
+				if (i > 0)
+					FPRINTF(fp, ",");
+				FPRINTF(fp, "%u", val[i]);
+			}
+			FPRINTF(fp, "]");
+			break;
+		}
+
+		case DATA_TYPE_INT32_ARRAY: {
+			int32_t *val;
+			uint_t valsz, i;
+			VERIFY0(nvpair_value_int32_array(curr, &val, &valsz));
+			FPRINTF(fp, "[");
+			for (i = 0; i < valsz; i++) {
+				if (i > 0)
+					FPRINTF(fp, ",");
+				FPRINTF(fp, "%d", val[i]);
+			}
+			FPRINTF(fp, "]");
+			break;
+		}
+
+		case DATA_TYPE_UINT64_ARRAY: {
+			uint64_t *val;
+			uint_t valsz, i;
+			VERIFY0(nvpair_value_uint64_array(curr, &val, &valsz));
+			FPRINTF(fp, "[");
+			for (i = 0; i < valsz; i++) {
+				if (i > 0)
+					FPRINTF(fp, ",");
+				FPRINTF(fp, "%llu",
+				    (unsigned long long)val[i]);
+			}
+			FPRINTF(fp, "]");
+			break;
+		}
+
+		case DATA_TYPE_INT64_ARRAY: {
+			int64_t *val;
+			uint_t valsz, i;
+			VERIFY0(nvpair_value_int64_array(curr, &val, &valsz));
+			FPRINTF(fp, "[");
+			for (i = 0; i < valsz; i++) {
+				if (i > 0)
+					FPRINTF(fp, ",");
+				FPRINTF(fp, "%lld", (long long)val[i]);
+			}
+			FPRINTF(fp, "]");
+			break;
+		}
+
+		case DATA_TYPE_UNKNOWN:
+			return (-1);
+		}
+	}
+
+	FPRINTF(fp, "}");
+	return (0);
+}
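
A small caller sketch using the core nvlist API; since the routine emits no
trailing newline, the caller supplies one:

	nvlist_t *nvl = NULL;

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) == 0) {
		(void) nvlist_add_string(nvl, "pool", "tank");
		(void) nvlist_add_uint64(nvl, "size", 1024);
		if (nvlist_print_json(stdout, nvl) == 0)
			(void) putchar('\n');
		/* prints: {"pool":"tank","size":1024} */
		nvlist_free(nvl);
	}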
Index: src/external/cddl/osnet/dist/lib/libuutil/common/libuutil.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libuutil/common/libuutil.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 libuutil.h
--- src/external/cddl/osnet/dist/lib/libuutil/common/libuutil.h	7 Aug 2009 18:32:34 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/lib/libuutil/common/libuutil.h	12 Jun 2012 05:55:36 -0000
@@ -19,15 +19,16 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_LIBUUTIL_H
 #define	_LIBUUTIL_H
 
+#include <solaris.h>
 #include <sys/types.h>
 #include <stdarg.h>
+#include <stdio.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -142,12 +143,21 @@ extern int uu_open_tmp(const char *dir, 
 /*
  * Convenience functions.
  */
+#define	UU_NELEM(a)	(sizeof (a) / sizeof ((a)[0]))
+
 /*PRINTFLIKE1*/
 extern char *uu_msprintf(const char *format, ...);
 extern void *uu_zalloc(size_t);
 extern char *uu_strdup(const char *);
 extern void uu_free(void *);
 
+extern boolean_t uu_strcaseeq(const char *a, const char *b);
+extern boolean_t uu_streq(const char *a, const char *b);
+extern char *uu_strndup(const char *s, size_t n);
+extern boolean_t uu_strbw(const char *a, const char *b);
+extern void *uu_memdup(const void *buf, size_t sz);
+extern void uu_dump(FILE *out, const char *prefix, const void *buf, size_t len);
+
 /*
  * Comparison function type definition.
  *   Developers should be careful in their use of the _private argument. If you
Index: src/external/cddl/osnet/dist/lib/libuutil/common/uu_alloc.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libuutil/common/uu_alloc.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 uu_alloc.c
--- src/external/cddl/osnet/dist/lib/libuutil/common/uu_alloc.c	7 Aug 2009 18:32:34 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/lib/libuutil/common/uu_alloc.c	12 Jun 2012 05:55:36 -0000
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include "libuutil_common.h"
@@ -67,6 +66,44 @@ uu_strdup(const char *str)
 	return (buf);
 }
 
+/*
+ * Duplicate up to n bytes of a string.  Kind of sort of like
+ * strdup(strlcpy(s, n)).
+ */
+char *
+uu_strndup(const char *s, size_t n)
+{
+	size_t len;
+	char *p;
+
+	len = strnlen(s, n);
+	p = uu_zalloc(len + 1);
+	if (p == NULL)
+		return (NULL);
+
+	if (len > 0)
+		(void) memcpy(p, s, len);
+	p[len] = '\0';
+
+	return (p);
+}
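
For example, uu_strndup("libuutil", 3) returns a freshly allocated "lib",
while a limit past the end of the string, as in uu_strndup("lib", 10),
simply duplicates the whole string; either result is released with uu_free().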
+
+/*
+ * Duplicate a block of memory.  Combines malloc with memcpy, much as
+ * strdup combines malloc, strlen, and strcpy.
+ */
+void *
+uu_memdup(const void *buf, size_t sz)
+{
+	void *p;
+
+	p = uu_zalloc(sz);
+	if (p == NULL)
+		return (NULL);
+	(void) memcpy(p, buf, sz);
+	return (p);
+}
+
 char *
 uu_msprintf(const char *format, ...)
 {
Index: src/external/cddl/osnet/dist/lib/libuutil/common/uu_dprintf.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libuutil/common/uu_dprintf.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 uu_dprintf.c
--- src/external/cddl/osnet/dist/lib/libuutil/common/uu_dprintf.c	7 Aug 2009 18:32:34 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/lib/libuutil/common/uu_dprintf.c	12 Jun 2012 05:55:36 -0000
@@ -33,7 +33,7 @@
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <strings.h>
+#include <string.h>
 
 #define	FACILITY_FMT	"%s (%s): "
 
Index: src/external/cddl/osnet/dist/lib/libuutil/common/uu_misc.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libuutil/common/uu_misc.c,v
retrieving revision 1.3
diff -u -p -r1.3 uu_misc.c
--- src/external/cddl/osnet/dist/lib/libuutil/common/uu_misc.c	26 May 2017 22:50:35 -0000	1.3
+++ src/external/cddl/osnet/dist/lib/libuutil/common/uu_misc.c	30 May 2017 10:49:07 -0000
@@ -20,14 +20,13 @@
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include "libuutil_common.h"
 
+#define HAVE_ASSFAIL	1
+
 #include <assert.h>
 #include <errno.h>
 #include <libintl.h>
@@ -39,6 +38,7 @@
 #include <sys/debug.h>
 #include <thread.h>
 #include <unistd.h>
+#include <ctype.h>
 
 #if !defined(TEXT_DOMAIN)
 #define	TEXT_DOMAIN "SYS_TEST"
@@ -74,10 +74,7 @@ static uint32_t		_uu_main_error;
 void
 uu_set_error(uint_t code)
 {
-	if (thr_main() != 0) {
-		_uu_main_error = code;
-		return;
-	}
+
 #if defined(PTHREAD_ONCE_KEY_NP)
 	if (pthread_key_create_once_np(&uu_error_key, NULL) != 0)
 		uu_error_key_setup = -1;
@@ -103,8 +100,6 @@ uu_set_error(uint_t code)
 uint32_t
 uu_error(void)
 {
-	if (thr_main() != 0)
-		return (_uu_main_error);
 
 	if (uu_error_key_setup < 0)	/* can't happen? */
 		return (UU_ERROR_UNKNOWN);
@@ -254,3 +249,30 @@ uu_init(void)
 {
 	(void) pthread_atfork(uu_lockup, uu_release, uu_release_child);
 }
+
+/*
+ * Dump a block of memory in hex+ascii, for debugging
+ */
+void
+uu_dump(FILE *out, const char *prefix, const void *buf, size_t len)
+{
+	const unsigned char *p = buf;
+	size_t i;
+
+	for (i = 0; i < len; i += 16) {
+		int j;
+
+		(void) fprintf(out, "%s", prefix);
+		for (j = 0; j < 16 && i + j < len; j++) {
+			(void) fprintf(out, "%2.2x ", p[i + j]);
+		}
+		for (; j < 16; j++) {
+			(void) fprintf(out, "   ");
+		}
+		for (j = 0; j < 16 && i + j < len; j++) {
+			(void) fprintf(out, "%c",
+			    isprint(p[i + j]) ? p[i + j] : '.');
+		}
+		(void) fprintf(out, "\n");
+	}
+}
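
Sample output, to illustrate the layout (20 bytes of an HTTP request with
prefix "pkt: "; spacing approximate):

	pkt: 47 45 54 20 2f 20 48 54 54 50 2f 31 2e 31 0d 0a GET / HTTP/1.1..
	pkt: 48 6f 73 74                                     Host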
Index: src/external/cddl/osnet/dist/lib/libuutil/common/uu_pname.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libuutil/common/uu_pname.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 uu_pname.c
--- src/external/cddl/osnet/dist/lib/libuutil/common/uu_pname.c	7 Aug 2009 18:32:35 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/lib/libuutil/common/uu_pname.c	13 Apr 2017 17:31:23 -0000
@@ -172,9 +172,7 @@ uu_setpname(char *arg0)
 	 * than in each of its consumers.
 	 */
 	if (arg0 == NULL) {
-		pname = getexecname();
-		if (pname == NULL)
-			pname = "unknown_command";
+		pname = "unknown_command";
 		return (pname);
 	}
 
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h,v
retrieving revision 1.2
diff -u -p -r1.2 libzfs.h
--- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h	10 Jan 2017 19:20:34 -0000	1.2
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h	22 Apr 2017 17:17:31 -0000
@@ -20,8 +20,14 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc.
  */
 
 #ifndef	_LIBZFS_H
@@ -35,7 +41,8 @@
 #include <sys/varargs.h>
 #include <sys/fs/zfs.h>
 #include <sys/avl.h>
-#include <ucred.h>
+#include <sys/zfs_ioctl.h>
+#include <libzfs_core.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -44,15 +51,14 @@ extern "C" {
 /*
  * Miscellaneous ZFS constants
  */
-#define	ZFS_MAXNAMELEN		MAXNAMELEN
-#define	ZPOOL_MAXNAMELEN	MAXNAMELEN
 #define	ZFS_MAXPROPLEN		MAXPATHLEN
 #define	ZPOOL_MAXPROPLEN	MAXPATHLEN
 
 /*
  * libzfs errors
  */
-enum {
+typedef enum zfs_error {
+	EZFS_SUCCESS = 0,	/* no error -- success */
 	EZFS_NOMEM = 2000,	/* out of memory */
 	EZFS_BADPROP,		/* invalid property value */
 	EZFS_PROPREADONLY,	/* cannot set readonly property */
@@ -86,14 +92,13 @@ enum {
 	EZFS_SHARENFSFAILED,	/* share(1M) failed */
 	EZFS_PERM,		/* permission denied */
 	EZFS_NOSPC,		/* out of space */
+	EZFS_FAULT,		/* bad address */
 	EZFS_IO,		/* I/O error */
 	EZFS_INTR,		/* signal received */
 	EZFS_ISSPARE,		/* device is a hot spare */
 	EZFS_INVALCONFIG,	/* invalid vdev configuration */
 	EZFS_RECURSIVE,		/* recursive dependency */
 	EZFS_NOHISTORY,		/* no history object */
-	EZFS_UNSHAREISCSIFAILED, /* iscsitgtd failed request to unshare */
-	EZFS_SHAREISCSIFAILED,	/* iscsitgtd failed request to share */
 	EZFS_POOLPROPS,		/* couldn't retrieve pool props */
 	EZFS_POOL_NOTSUP,	/* ops not supported for this type of pool */
 	EZFS_POOL_INVALARG,	/* invalid argument for this pool operation */
@@ -101,12 +106,10 @@ enum {
 	EZFS_OPENFAILED,	/* open of device failed */
 	EZFS_NOCAP,		/* couldn't get capacity */
 	EZFS_LABELFAILED,	/* write of label failed */
-	EZFS_ISCSISVCUNAVAIL,	/* iscsi service unavailable */
 	EZFS_BADWHO,		/* invalid permission who */
 	EZFS_BADPERM,		/* invalid permission */
 	EZFS_BADPERMSET,	/* invalid permission set name */
 	EZFS_NODELEGATION,	/* delegated administration is disabled */
-	EZFS_PERMRDONLY,	/* pemissions are readonly */
 	EZFS_UNSHARESMBFAILED,	/* failed to unshare over smb */
 	EZFS_SHARESMBFAILED,	/* failed to share over smb */
 	EZFS_BADCACHE,		/* bad cache file */
@@ -121,8 +124,13 @@ enum {
 	EZFS_PIPEFAILED,	/* pipe create failed */
 	EZFS_THREADCREATEFAILED, /* thread create failed */
 	EZFS_POSTSPLIT_ONLINE,	/* onlining a disk after splitting it */
+	EZFS_SCRUBBING,		/* currently scrubbing */
+	EZFS_NO_SCRUB,		/* no active scrub */
+	EZFS_DIFF,		/* general failure of zfs diff */
+	EZFS_DIFFDATA,		/* bad zfs diff data */
+	EZFS_POOLREADONLY,	/* pool is in read-only mode */
 	EZFS_UNKNOWN
-};
+} zfs_error_t;
 
 /*
  * The following data structures are all part
@@ -178,9 +186,13 @@ extern libzfs_handle_t *zfs_get_handle(z
 
 extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t);
 
+extern void zfs_save_arguments(int argc, char **, char *, int);
+extern int zpool_log_history(libzfs_handle_t *, const char *);
+
 extern int libzfs_errno(libzfs_handle_t *);
 extern const char *libzfs_error_action(libzfs_handle_t *);
 extern const char *libzfs_error_description(libzfs_handle_t *);
+extern int zfs_standard_error(libzfs_handle_t *, int, const char *);
 extern void libzfs_mnttab_init(libzfs_handle_t *);
 extern void libzfs_mnttab_fini(libzfs_handle_t *);
 extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t);
@@ -199,20 +211,23 @@ extern void zpool_close(zpool_handle_t *
 extern const char *zpool_get_name(zpool_handle_t *);
 extern int zpool_get_state(zpool_handle_t *);
 extern const char *zpool_state_to_name(vdev_state_t, vdev_aux_t);
+extern const char *zpool_pool_state_to_name(pool_state_t);
 extern void zpool_free_handles(libzfs_handle_t *);
+extern int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, const char *);
 
 /*
  * Iterate over all active pools in the system.
  */
 typedef int (*zpool_iter_f)(zpool_handle_t *, void *);
 extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *);
+extern boolean_t zpool_skip_pool(const char *);
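
A sketch of a zpool_iter_f callback; this assumes the iterator hands the
callback an open handle that the callback must close, and g_zfs stands in
for a previously opened libzfs_handle_t *:

	/*ARGSUSED*/
	static int
	show_pool(zpool_handle_t *zhp, void *arg)
	{
		(void) printf("%s\n", zpool_get_name(zhp));
		zpool_close(zhp);
		return (0);	/* non-zero stops the iteration */
	}

	...
	(void) zpool_iter(g_zfs, show_pool, NULL);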
 
 /*
  * Functions to create and destroy pools
  */
 extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
     nvlist_t *, nvlist_t *);
-extern int zpool_destroy(zpool_handle_t *);
+extern int zpool_destroy(zpool_handle_t *, const char *);
 extern int zpool_add(zpool_handle_t *, nvlist_t *);
 
 typedef struct splitflags {
@@ -226,8 +241,10 @@ typedef struct splitflags {
 /*
  * Functions to manipulate pool and vdev state
  */
-extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t);
+extern int zpool_scan(zpool_handle_t *, pool_scan_func_t);
 extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *);
+extern int zpool_reguid(zpool_handle_t *);
+extern int zpool_reopen(zpool_handle_t *);
 
 extern int zpool_vdev_online(zpool_handle_t *, const char *, int,
     vdev_state_t *);
@@ -247,14 +264,14 @@ extern nvlist_t *zpool_find_vdev(zpool_h
     boolean_t *, boolean_t *);
 extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *,
     boolean_t *, boolean_t *, boolean_t *);
-extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *);
+extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *);
 
 /*
  * Functions to manage pool properties
  */
 extern int zpool_set_prop(zpool_handle_t *, const char *, const char *);
 extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *,
-    size_t proplen, zprop_source_t *);
+    size_t proplen, zprop_source_t *, boolean_t);
 extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t,
     zprop_source_t *);
 
@@ -285,6 +302,15 @@ typedef enum {
 	ZPOOL_STATUS_BAD_LOG,		/* cannot read log chain(s) */
 
 	/*
+	 * If the pool has unsupported features but can still be opened in
+	 * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the
+	 * pool has unsupported features but cannot be opened at all, its
+	 * status is ZPOOL_STATUS_UNSUP_FEAT_READ.
+	 */
+	ZPOOL_STATUS_UNSUP_FEAT_READ,	/* unsupported features for read */
+	ZPOOL_STATUS_UNSUP_FEAT_WRITE,	/* unsupported features for write */
+
+	/*
 	 * These faults have no corresponding message ID.  At the time we are
 	 * checking the status, the original reason for the FMA fault (I/O or
 	 * checksum errors) has been lost.
@@ -297,10 +323,12 @@ typedef enum {
 	 * requiring administrative attention.  There is no corresponding
 	 * message ID.
 	 */
-	ZPOOL_STATUS_VERSION_OLDER,	/* older on-disk version */
+	ZPOOL_STATUS_VERSION_OLDER,	/* older legacy on-disk version */
+	ZPOOL_STATUS_FEAT_DISABLED,	/* supported features are disabled */
 	ZPOOL_STATUS_RESILVERING,	/* device being resilvered */
-	ZPOOL_STATUS_OFFLINE_DEV,	/* device online */
+	ZPOOL_STATUS_OFFLINE_DEV,	/* device offline */
 	ZPOOL_STATUS_REMOVED_DEV,	/* removed device */
+	ZPOOL_STATUS_NON_NATIVE_ASHIFT,	/* (e.g. 512e dev with ashift of 9) */
 
 	/*
 	 * Finally, the following indicates a healthy pool.
@@ -316,18 +344,20 @@ extern void zpool_dump_ddt(const ddt_sta
  * Statistics and configuration functions.
  */
 extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **);
+extern nvlist_t *zpool_get_features(zpool_handle_t *);
 extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *);
 extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **);
 
 /*
  * Import and export functions
  */
-extern int zpool_export(zpool_handle_t *, boolean_t);
-extern int zpool_export_force(zpool_handle_t *);
+extern int zpool_export(zpool_handle_t *, boolean_t, const char *);
+extern int zpool_export_force(zpool_handle_t *, const char *);
 extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *,
     char *altroot);
 extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *,
-    nvlist_t *, boolean_t);
+    nvlist_t *, int);
+extern void zpool_print_unsup_feat(nvlist_t *config);
 
 /*
  * Search for pools to import
@@ -356,7 +386,7 @@ extern nvlist_t *zpool_find_import_cache
  */
 struct zfs_cmd;
 
-extern const char *hist_event_table[LOG_END];
+extern const char *zfs_history_event_names[];
 
 extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *,
     boolean_t verbose);
@@ -364,12 +394,9 @@ extern int zpool_upgrade(zpool_handle_t 
 extern int zpool_get_history(zpool_handle_t *, nvlist_t **);
 extern int zpool_history_unpack(char *, uint64_t, uint64_t *,
     nvlist_t ***, uint_t *);
-extern void zpool_set_history_str(const char *subcommand, int argc,
-    char **argv, char *history_str);
-extern int zpool_stage_history(libzfs_handle_t *, const char *);
 extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *,
     size_t len);
-extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *);
+extern int zfs_ioctl(libzfs_handle_t *, int request, struct zfs_cmd *);
 extern int zpool_get_physpath(zpool_handle_t *, char *, size_t);
 extern void zpool_explain_recover(libzfs_handle_t *, const char *, int,
     nvlist_t *);
@@ -379,10 +406,12 @@ extern void zpool_explain_recover(libzfs
  * underlying datasets, only the references to them.
  */
 extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int);
+extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *);
 extern void zfs_close(zfs_handle_t *);
 extern zfs_type_t zfs_get_type(const zfs_handle_t *);
 extern const char *zfs_get_name(const zfs_handle_t *);
 extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *);
+extern const char *zfs_get_pool_name(const zfs_handle_t *);
 
 /*
  * Property management functions.  Some functions are shared with the kernel,
@@ -398,10 +427,11 @@ extern const char *zfs_prop_column_name(
 extern boolean_t zfs_prop_align_right(zfs_prop_t);
 
 extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t,
-    nvlist_t *, uint64_t, zfs_handle_t *, const char *);
+    nvlist_t *, uint64_t, zfs_handle_t *, zpool_handle_t *, const char *);
 
 extern const char *zfs_prop_to_name(zfs_prop_t);
 extern int zfs_prop_set(zfs_handle_t *, const char *, const char *);
+extern int zfs_prop_set_list(zfs_handle_t *, nvlist_t *);
 extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t,
     zprop_source_t *, char *, size_t, boolean_t);
 extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t,
@@ -412,12 +442,20 @@ extern int zfs_prop_get_userquota_int(zf
     uint64_t *propvalue);
 extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname,
     char *propbuf, int proplen, boolean_t literal);
+extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname,
+    uint64_t *propvalue);
+extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname,
+    char *propbuf, int proplen, boolean_t literal);
+extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname,
+    char *buf, size_t len);
 extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t);
 extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t);
 extern const char *zfs_prop_values(zfs_prop_t);
 extern int zfs_prop_is_string(zfs_prop_t prop);
 extern nvlist_t *zfs_get_user_props(zfs_handle_t *);
 extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *);
+extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *);
+
 
 typedef struct zprop_list {
 	int		pl_prop;
@@ -429,16 +467,26 @@ typedef struct zprop_list {
 	boolean_t	pl_fixed;
 } zprop_list_t;
 
-extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t);
+extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t,
+    boolean_t);
 extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *);
 
 #define	ZFS_MOUNTPOINT_NONE	"none"
 #define	ZFS_MOUNTPOINT_LEGACY	"legacy"
 
+#define	ZFS_FEATURE_DISABLED	"disabled"
+#define	ZFS_FEATURE_ENABLED	"enabled"
+#define	ZFS_FEATURE_ACTIVE	"active"
+
+#define	ZFS_UNSUPPORTED_INACTIVE	"inactive"
+#define	ZFS_UNSUPPORTED_READONLY	"readonly"
+
 /*
  * zpool property management
  */
 extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **);
+extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *,
+    size_t);
 extern const char *zpool_prop_default_string(zpool_prop_t);
 extern uint64_t zpool_prop_default_numeric(zpool_prop_t);
 extern const char *zpool_prop_column_name(zpool_prop_t);
@@ -490,8 +538,21 @@ extern int zfs_iter_root(libzfs_handle_t
 extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *);
 extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
 extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *);
-extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *);
+extern int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *);
 extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *);
+extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *);
+extern int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *);
+
+typedef struct get_all_cb {
+	zfs_handle_t	**cb_handles;
+	size_t		cb_alloc;
+	size_t		cb_used;
+	boolean_t	cb_verbose;
+	int		(*cb_getone)(zfs_handle_t *, void *);
+} get_all_cb_t;
+
+void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
+int libzfs_dataset_cmp(const void *, const void *);
 
 /*
  * Functions to create and destroy datasets.
@@ -501,83 +562,135 @@ extern int zfs_create(libzfs_handle_t *,
 extern int zfs_create_ancestors(libzfs_handle_t *, const char *);
 extern int zfs_destroy(zfs_handle_t *, boolean_t);
 extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t);
+extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t);
 extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *);
 extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *);
+extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps,
+    nvlist_t *props);
 extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t);
-extern int zfs_rename(zfs_handle_t *, const char *, boolean_t);
+
+typedef struct renameflags {
+	/* recursive rename */
+	int recurse : 1;
+
+	/* don't unmount file systems */
+	int nounmount : 1;
+
+	/* force unmount file systems */
+	int forceunmount : 1;
+} renameflags_t;
+
+extern int zfs_rename(zfs_handle_t *, const char *, const char *,
+    renameflags_t flags);
 
 typedef struct sendflags {
 	/* print informational messages (ie, -v was specified) */
-	int verbose : 1;
+	boolean_t verbose;
 
 	/* recursive send  (ie, -R) */
-	int replicate : 1;
+	boolean_t replicate;
 
 	/* for incrementals, do all intermediate snapshots */
-	int doall : 1; /* (ie, -I) */
+	boolean_t doall;
 
 	/* if dataset is a clone, do incremental from its origin */
-	int fromorigin : 1;
+	boolean_t fromorigin;
 
 	/* do deduplication */
-	int dedup : 1;
+	boolean_t dedup;
 
 	/* send properties (ie, -p) */
-	int props : 1;
+	boolean_t props;
+
+	/* do not send (no-op, ie. -n) */
+	boolean_t dryrun;
+
+	/* parsable verbose output (ie. -P) */
+	boolean_t parsable;
+
+	/* show progress (ie. -v) */
+	boolean_t progress;
+
+	/* large blocks (>128K) are permitted */
+	boolean_t largeblock;
+
+	/* WRITE_EMBEDDED records of type DATA are permitted */
+	boolean_t embed_data;
 } sendflags_t;
 
 typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
 
 extern int zfs_send(zfs_handle_t *, const char *, const char *,
-    sendflags_t, int, snapfilter_cb_t, void *);
+    sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **);
+extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags);
+extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd,
+    const char *);
+extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl,
+    const char *token);
 
 extern int zfs_promote(zfs_handle_t *);
-extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t,
-    boolean_t, boolean_t);
-extern int zfs_hold_range(zfs_handle_t *, const char *, const char *,
-    const char *, boolean_t, boolean_t);
+extern int zfs_hold(zfs_handle_t *, const char *, const char *,
+    boolean_t, int);
+extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *);
 extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
-extern int zfs_release_range(zfs_handle_t *, const char *, const char *,
-    const char *, boolean_t);
+extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
 extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
 
 typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
     uid_t rid, uint64_t space);
 
-extern int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
-    zfs_userspace_cb_t func, void *arg);
+extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t,
+    zfs_userspace_cb_t, void *);
+
+extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **);
+extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *);
 
 typedef struct recvflags {
 	/* print informational messages (ie, -v was specified) */
-	int verbose : 1;
+	boolean_t verbose;
 
 	/* the destination is a prefix, not the exact fs (ie, -d) */
-	int isprefix : 1;
+	boolean_t isprefix;
 
 	/*
 	 * Only the tail of the sent snapshot path is appended to the
 	 * destination to determine the received snapshot name (ie, -e).
 	 */
-	int istail : 1;
+	boolean_t istail;
 
 	/* do not actually do the recv, just check if it would work (ie, -n) */
-	int dryrun : 1;
+	boolean_t dryrun;
 
 	/* rollback/destroy filesystems as necessary (eg, -F) */
-	int force : 1;
+	boolean_t force;
 
 	/* set "canmount=off" on all modified filesystems */
-	int canmountoff : 1;
+	boolean_t canmountoff;
+
+	/*
+	 * Mark the file systems as "resumable" and do not destroy them if the
+	 * receive is interrupted
+	 */
+	boolean_t resumable;
 
 	/* byteswap flag is used internally; callers need not specify */
-	int byteswap : 1;
+	boolean_t byteswap;
 
 	/* do not mount file systems as they are extracted (private) */
-	int nomount : 1;
+	boolean_t nomount;
 } recvflags_t;
 
-extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t,
-    int, avl_tree_t *);
+extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *,
+    recvflags_t *, int, avl_tree_t *);
+
+typedef enum diff_flags {
+	ZFS_DIFF_PARSEABLE = 0x1,
+	ZFS_DIFF_TIMESTAMP = 0x2,
+	ZFS_DIFF_CLASSIFY = 0x4
+} diff_flags_t;
+
+extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *,
+    int);
 
 /*
  * Miscellaneous functions.
@@ -589,6 +702,7 @@ extern zfs_handle_t *zfs_path_to_zhandle
 extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *,
     zfs_type_t);
 extern int zfs_spa_version(zfs_handle_t *, int *);
+extern boolean_t zfs_bookmark_exists(const char *path);
 
 /*
  * Mount support functions.
@@ -620,21 +734,26 @@ extern int zfs_unshareall_nfs(zfs_handle
 extern int zfs_unshareall_smb(zfs_handle_t *);
 extern int zfs_unshareall_bypath(zfs_handle_t *, const char *);
 extern int zfs_unshareall(zfs_handle_t *);
-extern boolean_t zfs_is_shared_iscsi(zfs_handle_t *);
-extern int zfs_share_iscsi(zfs_handle_t *);
-extern int zfs_unshare_iscsi(zfs_handle_t *);
-extern int zfs_iscsi_perm_check(libzfs_handle_t *, char *, ucred_t *);
 extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *,
     void *, void *, int, zfs_share_op_t);
 
+#ifdef __FreeBSD__
+/*
+ * FreeBSD-specific jail support function.
+ */
+extern int zfs_jail(zfs_handle_t *, int, int);
+#endif
+
 /*
  * When dealing with nvlists, verify() is extremely useful
  */
+#ifndef verify
 #ifdef NDEBUG
 #define	verify(EX)	((void)(EX))
 #else
 #define	verify(EX)	assert(EX)
 #endif
+#endif
 
 /*
  * Utility function to convert a number to a human-readable form.
@@ -684,6 +803,11 @@ extern boolean_t libzfs_fru_compare(libz
 extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *);
 extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *);
 
+#ifndef illumos
+extern int zmount(const char *, const char *, int, char *, char *, int, char *,
+    int);
+#endif
+
 #ifdef	__cplusplus
 }
 #endif
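
[Note on the libzfs.h changes above: zfs_send() now takes sendflags_t by pointer and gained a trailing nvlist_t ** out-parameter, and the send flags grew dryrun/parsable/progress/largeblock/embed_data fields. A minimal caller sketch under the new signature; the handle "zhp", the snapshot names, and the output fd are hypothetical, not part of this patch.]

/*
 * Hedged sketch of the new zfs_send() calling convention; assumes
 * "zhp" came from zfs_open() and that both snapshots exist.
 */
sendflags_t flags = { 0 };
nvlist_t *dbg = NULL;

flags.dryrun = B_TRUE;		/* -n: estimate only, send nothing */
flags.parsable = B_TRUE;	/* -P: machine-readable output */

if (zfs_send(zhp, "snap1", "snap2", &flags, STDOUT_FILENO,
    NULL, NULL, &dbg) != 0)
	(void) fprintf(stderr, "send failed\n");
nvlist_free(dbg);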
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 libzfs_changelist.c
--- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c	27 Feb 2010 22:30:18 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c	10 Oct 2016 11:14:25 -0000
@@ -20,10 +20,14 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  *
  * Portions Copyright 2007 Ramprakash Jelari
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  */
 
 #include <libintl.h>
@@ -116,27 +120,14 @@ changelist_prefix(prop_changelist_t *clp
 		if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned)
 			continue;
 
-		if (ZFS_IS_VOLUME(cn->cn_handle)) {
-			switch (clp->cl_realprop) {
-			case ZFS_PROP_NAME:
-				/* If this was a rename, unshare the zvol */
-				(void) zfs_unshare_iscsi(cn->cn_handle);
-				break;
-
-			case ZFS_PROP_VOLSIZE:
-				/*
-				 * If this was a change to the volume size, we
-				 * need to unshare and reshare the volume.
-				 */
-				(void) zfs_unshare_iscsi(cn->cn_handle);
-				break;
-			}
-		} else {
+		if (!ZFS_IS_VOLUME(cn->cn_handle)) {
 			/*
 			 * Do the property specific processing.
 			 */
 			switch (clp->cl_prop) {
 			case ZFS_PROP_MOUNTPOINT:
+				if (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT)
+					break;
 				if (zfs_unmount(cn->cn_handle, NULL,
 				    clp->cl_mflags) != 0) {
 					ret = -1;
@@ -146,6 +137,9 @@ changelist_prefix(prop_changelist_t *clp
 			case ZFS_PROP_SHARESMB:
 				(void) zfs_unshare_smb(cn->cn_handle, NULL);
 				break;
+
+			default:
+				break;
 			}
 		}
 	}
@@ -183,8 +177,10 @@ changelist_postfix(prop_changelist_t *cl
 	if ((cn = uu_list_last(clp->cl_list)) == NULL)
 		return (0);
 
-	if (clp->cl_prop == ZFS_PROP_MOUNTPOINT)
+	if (clp->cl_prop == ZFS_PROP_MOUNTPOINT &&
+	    !(clp->cl_gflags & CL_GATHER_DONT_UNMOUNT)) {
 		remove_mountpoint(cn->cn_handle);
+	}
 
 	/*
 	 * It is possible that the changelist_prefix() used libshare
@@ -224,24 +220,8 @@ changelist_postfix(prop_changelist_t *cl
 
 		zfs_refresh_properties(cn->cn_handle);
 
-		if (ZFS_IS_VOLUME(cn->cn_handle)) {
-			if (cn->cn_shared ||
-			    clp->cl_prop == ZFS_PROP_SHAREISCSI) {
-				if (zfs_prop_get(cn->cn_handle,
-				    ZFS_PROP_SHAREISCSI, shareopts,
-				    sizeof (shareopts), NULL, NULL, 0,
-				    B_FALSE) == 0 &&
-				    strcmp(shareopts, "off") == 0) {
-					errors +=
-					    zfs_unshare_iscsi(cn->cn_handle);
-				} else {
-					errors +=
-					    zfs_share_iscsi(cn->cn_handle);
-				}
-			}
-
+		if (ZFS_IS_VOLUME(cn->cn_handle))
 			continue;
-		}
 
 		/*
 		 * Remount if previously mounted or mountpoint was legacy,
@@ -255,7 +235,8 @@ changelist_postfix(prop_changelist_t *cl
 		    shareopts, sizeof (shareopts), NULL, NULL, 0,
 		    B_FALSE) == 0) && (strcmp(shareopts, "off") != 0));
 
-		mounted = zfs_is_mounted(cn->cn_handle, NULL);
+		mounted = (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) ||
+		    zfs_is_mounted(cn->cn_handle, NULL);
 
 		if (!mounted && (cn->cn_mounted ||
 		    ((sharenfs || sharesmb || clp->cl_waslegacy) &&
@@ -316,7 +297,7 @@ void
 changelist_rename(prop_changelist_t *clp, const char *src, const char *dst)
 {
 	prop_changenode_t *cn;
-	char newname[ZFS_MAXNAMELEN];
+	char newname[ZFS_MAX_DATASET_NAME_LEN];
 
 	for (cn = uu_list_first(clp->cl_list); cn != NULL;
 	    cn = uu_list_next(clp->cl_list, cn)) {
@@ -498,7 +479,6 @@ change_one(zfs_handle_t *zhp, void *data
 			 * This is necessary when the original mountpoint
 			 * is legacy or none.
 			 */
-			ASSERT(!clp->cl_alldependents);
 			verify(uu_list_insert_before(clp->cl_list,
 			    uu_list_first(clp->cl_list), cn) == 0);
 		}
@@ -640,8 +620,7 @@ changelist_gather(zfs_handle_t *zhp, zfs
 
 	if (clp->cl_prop != ZFS_PROP_MOUNTPOINT &&
 	    clp->cl_prop != ZFS_PROP_SHARENFS &&
-	    clp->cl_prop != ZFS_PROP_SHARESMB &&
-	    clp->cl_prop != ZFS_PROP_SHAREISCSI)
+	    clp->cl_prop != ZFS_PROP_SHARESMB)
 		return (clp);
 
 	/*
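
[Note on the libzfs_changelist.c hunks above: besides dropping the iSCSI unshare/reshare paths, they make changelist_prefix() and changelist_postfix() leave mounts alone when the CL_GATHER_DONT_UNMOUNT gather flag is set. A hedged sketch of the intended pattern, assuming an open handle "zhp":]

/*
 * Sketch: gather a mountpoint changelist that must leave mounts
 * untouched; prefix/postfix then skip the unmount/remount work.
 */
prop_changelist_t *clp;

clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT,
    CL_GATHER_DONT_UNMOUNT, 0);
if (clp != NULL) {
	if (changelist_prefix(clp) == 0)
		(void) changelist_postfix(clp);
	changelist_free(clp);
}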
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.c
===================================================================
RCS file: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.c
diff -N src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.c	25 Apr 2017 23:51:09 -0000
@@ -0,0 +1,122 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ */
+
+#include <err.h>
+
+#include "libzfs_compat.h"
+
+int zfs_ioctl_version = ZFS_IOCVER_UNDEF;
+static int zfs_spa_version = -1;
+
+/*
+ * Get zfs_ioctl_version
+ */
+int
+get_zfs_ioctl_version(void)
+{
+	size_t ver_size;
+	int ver = ZFS_IOCVER_NONE;
+
+	ver_size = sizeof(ver);
+	if (sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0) < 0)
+		err(1, "sysctl vfs.zfs.version.ioctl failed");
+
+	return (ver);
+}
+
+/*
+ * Get the SPA version
+ */
+static int
+get_zfs_spa_version(void)
+{
+	size_t ver_size;
+	int ver = 0;
+
+	ver_size = sizeof(ver);
+	if (sysctlbyname("vfs.zfs.version.spa", &ver, &ver_size, NULL, 0) < 0)
+		err(1, "sysctl vfs.zfs.version.spa failed");
+
+	return (ver);
+}
+
+/*
+ * This is the FreeBSD version of ioctl().  Solaris' ioctl() updates
+ * zc_nvlist_dst_size even if an error is returned; on FreeBSD,
+ * zc_nvlist_dst_size is not updated when an error is returned.
+ */
+int
+zcmd_ioctl(int fd, int request, zfs_cmd_t *zc)
+{
+	size_t oldsize;
+	int ret, cflag = ZFS_CMD_COMPAT_NONE;
+
+	if (zfs_ioctl_version == ZFS_IOCVER_UNDEF)
+		zfs_ioctl_version = get_zfs_ioctl_version();
+
+	if (zfs_ioctl_version >= ZFS_IOCVER_DEADMAN) {
+		switch (zfs_ioctl_version) {
+		case ZFS_IOCVER_RESUME:
+			cflag = ZFS_CMD_COMPAT_RESUME;
+			break;
+		case ZFS_IOCVER_EDBP:
+			cflag = ZFS_CMD_COMPAT_EDBP;
+			break;
+		case ZFS_IOCVER_ZCMD:
+			cflag = ZFS_CMD_COMPAT_ZCMD;
+			break;
+		case ZFS_IOCVER_LZC:
+			cflag = ZFS_CMD_COMPAT_LZC;
+			break;
+		case ZFS_IOCVER_DEADMAN:
+			cflag = ZFS_CMD_COMPAT_DEADMAN;
+			break;
+		}
+	} else {
+		/*
+		 * If vfs.zfs.version.ioctl is not defined, assume we have
+		 * v28-compatible binaries and use vfs.zfs.version.spa to
+		 * test for v15.
+		 */
+		cflag = ZFS_CMD_COMPAT_V28;
+
+		if (zfs_spa_version < 0)
+			zfs_spa_version = get_zfs_spa_version();
+
+		if (zfs_spa_version == SPA_VERSION_15 ||
+		    zfs_spa_version == SPA_VERSION_14 ||
+		    zfs_spa_version == SPA_VERSION_13)
+			cflag = ZFS_CMD_COMPAT_V15;
+	}
+
+	oldsize = zc->zc_nvlist_dst_size;
+	ret = zcmd_ioctl_compat(fd, request, zc, cflag);
+
+	if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) {
+		ret = -1;
+		errno = ENOMEM;
+	}
+
+	return (ret);
+}
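
[Note on zcmd_ioctl() above: a destination nvlist that would have grown is reported as -1 with errno set to ENOMEM rather than by silently updating zc_nvlist_dst_size, so callers keep the usual grow-and-retry idiom used elsewhere in libzfs. A hedged sketch; the request and the surrounding handle are hypothetical:]

/* Sketch of the grow-and-retry idiom around zcmd_ioctl(). */
for (;;) {
	if (zcmd_ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0)
		break;			/* success */
	if (errno != ENOMEM)
		return (-1);		/* real failure */
	/* dst nvlist buffer was too small: grow it and retry */
	if (zcmd_expand_dst_nvlist(hdl, &zc) != 0)
		return (-1);
}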
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.h
===================================================================
RCS file: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.h
diff -N src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.h	20 Apr 2017 22:54:17 -0000
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ */
+
+#ifndef	_LIBZFS_COMPAT_H
+#define	_LIBZFS_COMPAT_H
+
+#include <zfs_ioctl_compat.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+int get_zfs_ioctl_version(void);
+int zcmd_ioctl(int fd, int request, zfs_cmd_t *zc);
+
+#define	ioctl(fd, ioc, zc)	zcmd_ioctl((fd), (ioc), (zc))
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LIBZFS_COMPAT_H */
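
[Note on the header above: the ioctl() macro means every plain ioctl() call in a libzfs source file that includes libzfs_compat.h is transparently routed through the versioned shim. A hypothetical call site:]

/*
 * With libzfs_compat.h in scope, this expands to
 * zcmd_ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT, &zc) and picks up the
 * ioctl-version negotiation in libzfs_compat.c.
 */
if (ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT, &zc) != 0)
	return (zfs_standard_error(hdl, errno, errbuf));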
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 libzfs_config.c
--- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c	27 Feb 2010 22:30:18 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c	10 Oct 2016 11:14:25 -0000
@@ -18,12 +18,19 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2015 by Syneto S.R.L. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc.
+ */
+
+/*
  * The pool configuration repository is stored in /etc/zfs/zpool.cache as a
  * single packed nvlist.  While it would be nice to just read in this
  * file from userland, this wouldn't work from a local zone.  So we have to have
@@ -218,6 +225,37 @@ zpool_get_config(zpool_handle_t *zhp, nv
 }
 
 /*
+ * Retrieves a list of enabled features and their refcounts and caches it in
+ * the pool handle.
+ */
+nvlist_t *
+zpool_get_features(zpool_handle_t *zhp)
+{
+	nvlist_t *config, *features;
+
+	config = zpool_get_config(zhp, NULL);
+
+	if (config == NULL || !nvlist_exists(config,
+	    ZPOOL_CONFIG_FEATURE_STATS)) {
+		int error;
+		boolean_t missing = B_FALSE;
+
+		error = zpool_refresh_stats(zhp, &missing);
+
+		if (error != 0 || missing)
+			return (NULL);
+
+		config = zpool_get_config(zhp, NULL);
+	}
+
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
+	    &features) != 0)
+		return (NULL);
+
+	return (features);
+}
+
+/*
  * Refresh the vdev statistics associated with the given pool.  This is used in
  * iostat to show configuration changes and determine the delta from the last
  * time the function was called.  This function can fail, in case the pool has
@@ -281,8 +319,7 @@ zpool_refresh_stats(zpool_handle_t *zhp,
 		verify(nvlist_lookup_uint64(config,
 		    ZPOOL_CONFIG_POOL_TXG, &newtxg) == 0);
 
-		if (zhp->zpool_old_config != NULL)
-			nvlist_free(zhp->zpool_old_config);
+		nvlist_free(zhp->zpool_old_config);
 
 		if (oldtxg != newtxg) {
 			nvlist_free(zhp->zpool_config);
@@ -302,6 +339,62 @@ zpool_refresh_stats(zpool_handle_t *zhp,
 }
 
 /*
+ * The following environment variables are undocumented
+ * and should be used for testing purposes only:
+ *
+ * __ZFS_POOL_EXCLUDE - don't iterate over the pools it lists
+ * __ZFS_POOL_RESTRICT - iterate only over the pools it lists
+ *
+ * This function returns B_TRUE if the pool should be skipped
+ * during iteration.
+ */
+boolean_t
+zpool_skip_pool(const char *poolname)
+{
+	static boolean_t initialized = B_FALSE;
+	static const char *exclude = NULL;
+	static const char *restricted = NULL;
+
+	const char *cur, *end;
+	int len;
+	int namelen = strlen(poolname);
+
+	if (!initialized) {
+		initialized = B_TRUE;
+		exclude = getenv("__ZFS_POOL_EXCLUDE");
+		restricted = getenv("__ZFS_POOL_RESTRICT");
+	}
+
+	if (exclude != NULL) {
+		cur = exclude;
+		do {
+			end = strchr(cur, ' ');
+			len = (NULL == end) ? strlen(cur) : (end - cur);
+			if (len == namelen && 0 == strncmp(cur, poolname, len))
+				return (B_TRUE);
+			cur += (len + 1);
+		} while (NULL != end);
+	}
+
+	if (NULL == restricted)
+		return (B_FALSE);
+
+	cur = restricted;
+	do {
+		end = strchr(cur, ' ');
+		len = (NULL == end) ? strlen(cur) : (end - cur);
+
+		if (len == namelen && 0 == strncmp(cur, poolname, len)) {
+			return (B_FALSE);
+		}
+
+		cur += (len + 1);
+	} while (NULL != end);
+
+	return (B_TRUE);
+}
+
+/*
  * Iterate over all pools in the system.
  */
 int
@@ -324,6 +417,9 @@ zpool_iter(libzfs_handle_t *hdl, zpool_i
 	for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL;
 	    cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) {
 
+		if (zpool_skip_pool(cn->cn_name))
+			continue;
+
 		if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0) {
 			hdl->libzfs_pool_iter--;
 			return (-1);
@@ -359,6 +455,9 @@ zfs_iter_root(libzfs_handle_t *hdl, zfs_
 	for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL;
 	    cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) {
 
+		if (zpool_skip_pool(cn->cn_name))
+			continue;
+
 		if ((zhp = make_dataset_handle(hdl, cn->cn_name)) == NULL)
 			continue;
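
[Note on the libzfs_config.c hunks above: zpool_skip_pool() is consulted by both zpool_iter() and zfs_iter_root(), so the two test-only environment variables filter every pool walk. A hedged sketch; the pool name "tank" and the callback are hypothetical:]

static int
print_pool(zpool_handle_t *php, void *arg)
{
	(void) printf("%s\n", zpool_get_name(php));
	zpool_close(php);
	return (0);
}

static void
list_only_tank(libzfs_handle_t *hdl)
{
	/* Must be set before the first walk: the env lists are cached. */
	(void) setenv("__ZFS_POOL_RESTRICT", "tank", 1);
	(void) zpool_iter(hdl, print_pool, NULL);
}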
 
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c,v
retrieving revision 1.3
diff -u -p -r1.3 libzfs_dataset.c
--- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c	3 Apr 2010 19:01:15 -0000	1.3
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c	22 Apr 2017 08:56:07 -0000
@@ -20,8 +20,16 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012 DEY Storage Systems, Inc.  All rights reserved.
+ * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  */
 
 #include <ctype.h>
@@ -41,14 +49,12 @@
 #include <pwd.h>
 #include <grp.h>
 #include <stddef.h>
-#include <ucred.h>
 #include <idmap.h>
-#include <aclutils.h>
-#include <directory.h>
 
 #include <sys/dnode.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
+#include <sys/misc.h>
 #include <libzfs.h>
 
 #include "zfs_namecheck.h"
@@ -73,66 +79,30 @@ zfs_type_to_name(zfs_type_t type)
 		return (dgettext(TEXT_DOMAIN, "snapshot"));
 	case ZFS_TYPE_VOLUME:
 		return (dgettext(TEXT_DOMAIN, "volume"));
+	case ZFS_TYPE_POOL:
+		return (dgettext(TEXT_DOMAIN, "pool"));
+	case ZFS_TYPE_BOOKMARK:
+		return (dgettext(TEXT_DOMAIN, "bookmark"));
+	default:
+		assert(!"unhandled zfs_type_t");
 	}
 
 	return (NULL);
 }
 
 /*
- * Given a path and mask of ZFS types, return a string describing this dataset.
- * This is used when we fail to open a dataset and we cannot get an exact type.
- * We guess what the type would have been based on the path and the mask of
- * acceptable types.
- */
-static const char *
-path_to_str(const char *path, int types)
-{
-	/*
-	 * When given a single type, always report the exact type.
-	 */
-	if (types == ZFS_TYPE_SNAPSHOT)
-		return (dgettext(TEXT_DOMAIN, "snapshot"));
-	if (types == ZFS_TYPE_FILESYSTEM)
-		return (dgettext(TEXT_DOMAIN, "filesystem"));
-	if (types == ZFS_TYPE_VOLUME)
-		return (dgettext(TEXT_DOMAIN, "volume"));
-
-	/*
-	 * The user is requesting more than one type of dataset.  If this is the
-	 * case, consult the path itself.  If we're looking for a snapshot, and
-	 * a '@' is found, then report it as "snapshot".  Otherwise, remove the
-	 * snapshot attribute and try again.
-	 */
-	if (types & ZFS_TYPE_SNAPSHOT) {
-		if (strchr(path, '@') != NULL)
-			return (dgettext(TEXT_DOMAIN, "snapshot"));
-		return (path_to_str(path, types & ~ZFS_TYPE_SNAPSHOT));
-	}
-
-	/*
-	 * The user has requested either filesystems or volumes.
-	 * We have no way of knowing a priori what type this would be, so always
-	 * report it as "filesystem" or "volume", our two primitive types.
-	 */
-	if (types & ZFS_TYPE_FILESYSTEM)
-		return (dgettext(TEXT_DOMAIN, "filesystem"));
-
-	assert(types & ZFS_TYPE_VOLUME);
-	return (dgettext(TEXT_DOMAIN, "volume"));
-}
-
-/*
  * Validate a ZFS path.  This is used even before trying to open the dataset, to
  * provide a more meaningful error message.  We call zfs_error_aux() to
  * explain exactly why the name was not valid.
  */
-static int
+int
 zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
     boolean_t modifying)
 {
 	namecheck_err_t why;
 	char what;
 
+	(void) zfs_prop_get_table();
 	if (dataset_namecheck(path, &why, &what) != 0) {
 		if (hdl != NULL) {
 			switch (why) {
@@ -181,6 +151,11 @@ zfs_validate_name(libzfs_handle_t *hdl, 
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "reserved disk name"));
 				break;
+
+			default:
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "(%d) not defined"), why);
+				break;
 			}
 		}
 
@@ -290,7 +265,7 @@ zpool_handle(zfs_handle_t *zhp)
 	int len;
 	zpool_handle_t *zph;
 
-	len = strcspn(zhp->zfs_name, "/@") + 1;
+	len = strcspn(zhp->zfs_name, "/@#") + 1;
 	pool_name = zfs_alloc(zhp->zfs_hdl, len);
 	(void) strlcpy(pool_name, zhp->zfs_name, len);
 
@@ -459,7 +434,9 @@ make_dataset_handle_common(zfs_handle_t 
 	else
 		abort();	/* we should never see any other types */
 
-	zhp->zpool_hdl = zpool_handle(zhp);
+	if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL)
+		return (-1);
+
 	return (0);
 }
 
@@ -492,7 +469,7 @@ make_dataset_handle(libzfs_handle_t *hdl
 	return (zhp);
 }
 
-static zfs_handle_t *
+zfs_handle_t *
 make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc)
 {
 	zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
@@ -509,6 +486,133 @@ make_dataset_handle_zc(libzfs_handle_t *
 	return (zhp);
 }
 
+zfs_handle_t *
+make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc)
+{
+	zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
+
+	if (zhp == NULL)
+		return (NULL);
+
+	zhp->zfs_hdl = pzhp->zfs_hdl;
+	(void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name));
+	zhp->zfs_head_type = pzhp->zfs_type;
+	zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
+	zhp->zpool_hdl = zpool_handle(zhp);
+	return (zhp);
+}
+
+zfs_handle_t *
+zfs_handle_dup(zfs_handle_t *zhp_orig)
+{
+	zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
+
+	if (zhp == NULL)
+		return (NULL);
+
+	zhp->zfs_hdl = zhp_orig->zfs_hdl;
+	zhp->zpool_hdl = zhp_orig->zpool_hdl;
+	(void) strlcpy(zhp->zfs_name, zhp_orig->zfs_name,
+	    sizeof (zhp->zfs_name));
+	zhp->zfs_type = zhp_orig->zfs_type;
+	zhp->zfs_head_type = zhp_orig->zfs_head_type;
+	zhp->zfs_dmustats = zhp_orig->zfs_dmustats;
+	if (zhp_orig->zfs_props != NULL) {
+		if (nvlist_dup(zhp_orig->zfs_props, &zhp->zfs_props, 0) != 0) {
+			(void) no_memory(zhp->zfs_hdl);
+			zfs_close(zhp);
+			return (NULL);
+		}
+	}
+	if (zhp_orig->zfs_user_props != NULL) {
+		if (nvlist_dup(zhp_orig->zfs_user_props,
+		    &zhp->zfs_user_props, 0) != 0) {
+			(void) no_memory(zhp->zfs_hdl);
+			zfs_close(zhp);
+			return (NULL);
+		}
+	}
+	if (zhp_orig->zfs_recvd_props != NULL) {
+		if (nvlist_dup(zhp_orig->zfs_recvd_props,
+		    &zhp->zfs_recvd_props, 0)) {
+			(void) no_memory(zhp->zfs_hdl);
+			zfs_close(zhp);
+			return (NULL);
+		}
+	}
+	zhp->zfs_mntcheck = zhp_orig->zfs_mntcheck;
+	if (zhp_orig->zfs_mntopts != NULL) {
+		zhp->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl,
+		    zhp_orig->zfs_mntopts);
+	}
+	zhp->zfs_props_table = zhp_orig->zfs_props_table;
+	return (zhp);
+}
+
+boolean_t
+zfs_bookmark_exists(const char *path)
+{
+	nvlist_t *bmarks;
+	nvlist_t *props;
+	char fsname[ZFS_MAX_DATASET_NAME_LEN];
+	char *bmark_name;
+	char *pound;
+	int err;
+	boolean_t rv;
+
+
+	(void) strlcpy(fsname, path, sizeof (fsname));
+	pound = strchr(fsname, '#');
+	if (pound == NULL)
+		return (B_FALSE);
+
+	*pound = '\0';
+	bmark_name = pound + 1;
+	props = fnvlist_alloc();
+	err = lzc_get_bookmarks(fsname, props, &bmarks);
+	nvlist_free(props);
+	if (err != 0) {
+		nvlist_free(bmarks);
+		return (B_FALSE);
+	}
+
+	rv = nvlist_exists(bmarks, bmark_name);
+	nvlist_free(bmarks);
+	return (rv);
+}
+
+zfs_handle_t *
+make_bookmark_handle(zfs_handle_t *parent, const char *path,
+    nvlist_t *bmark_props)
+{
+	zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1);
+
+	if (zhp == NULL)
+		return (NULL);
+
+	/* Fill in the name. */
+	zhp->zfs_hdl = parent->zfs_hdl;
+	(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
+
+	/* Set the property lists. */
+	if (nvlist_dup(bmark_props, &zhp->zfs_props, 0) != 0) {
+		free(zhp);
+		return (NULL);
+	}
+
+	/* Set the types. */
+	zhp->zfs_head_type = parent->zfs_head_type;
+	zhp->zfs_type = ZFS_TYPE_BOOKMARK;
+
+	if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) {
+		nvlist_free(zhp->zfs_props);
+		free(zhp);
+		return (NULL);
+	}
+
+	return (zhp);
+}
+
 /*
  * Opens the given snapshot, filesystem, or volume.   The 'types'
  * argument is a mask of acceptable types.  The function will print an
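
[Note on the hunk above: zfs_bookmark_exists() takes the full "fs#bookmark" spelling and quietly reports B_FALSE for names without a '#' or when lzc_get_bookmarks() fails. A hedged one-liner; the name is hypothetical:]

/* Sketch: probe for a bookmark; "tank/fs#b1" is hypothetical. */
if (zfs_bookmark_exists("tank/fs#b1"))
	(void) printf("bookmark exists\n");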
@@ -542,6 +646,22 @@ zfs_open(libzfs_handle_t *hdl, const cha
 		return (NULL);
 	}
 
+	if (zhp == NULL) {
+		char *at = strchr(path, '@');
+
+		if (at != NULL)
+			*at = '\0';
+		errno = 0;
+		if ((zhp = make_dataset_handle(hdl, path)) == NULL) {
+			(void) zfs_standard_error(hdl, errno, errbuf);
+			return (NULL);
+		}
+		if (at != NULL)
+			*at = '@';
+		(void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name));
+		zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
+	}
+
 	if (!(types & zhp->zfs_type)) {
 		(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
 		zfs_close(zhp);
@@ -592,7 +712,7 @@ libzfs_mnttab_init(libzfs_handle_t *hdl)
 	    sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));
 }
 
-void
+static void
 libzfs_mnttab_update(libzfs_handle_t *hdl)
 {
 	struct mnttab entry;
@@ -618,7 +738,8 @@ libzfs_mnttab_fini(libzfs_handle_t *hdl)
 	void *cookie = NULL;
 	mnttab_node_t *mtn;
 
-	while (mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) {
+	while ((mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie))
+	    != NULL) {
 		free(mtn->mtn_mt.mnt_special);
 		free(mtn->mtn_mt.mnt_mountp);
 		free(mtn->mtn_mt.mnt_fstype);
@@ -690,7 +811,8 @@ libzfs_mnttab_remove(libzfs_handle_t *hd
 	mnttab_node_t *ret;
 
 	find.mtn_mt.mnt_special = (char *)fsname;
-	if (ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) {
+	if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL))
+	    != NULL) {
 		avl_remove(&hdl->libzfs_mnttab_cache, ret);
 		free(ret->mtn_mt.mnt_special);
 		free(ret->mtn_mt.mnt_mountp);
@@ -739,7 +861,8 @@ zfs_which_resv_prop(zfs_handle_t *zhp, z
  */
 nvlist_t *
 zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl,
-    uint64_t zoned, zfs_handle_t *zhp, const char *errbuf)
+    uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl,
+    const char *errbuf)
 {
 	nvpair_t *elem;
 	uint64_t intval;
@@ -872,6 +995,12 @@ zfs_valid_proplist(libzfs_handle_t *hdl,
 				goto error;
 			}
 			continue;
+		} else if (prop == ZPROP_INVAL && zfs_prop_written(propname)) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "'%s' is readonly"),
+			    propname);
+			(void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf);
+			goto error;
 		}
 
 		if (prop == ZPROP_INVAL) {
@@ -923,36 +1052,39 @@ zfs_valid_proplist(libzfs_handle_t *hdl,
 			break;
 		}
 
-		case ZFS_PROP_RECORDSIZE:
 		case ZFS_PROP_VOLBLOCKSIZE:
-			/* must be power of two within SPA_{MIN,MAX}BLOCKSIZE */
-			if (intval < SPA_MINBLOCKSIZE ||
-			    intval > SPA_MAXBLOCKSIZE || !ISP2(intval)) {
-				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-				    "'%s' must be power of 2 from %u "
-				    "to %uk"), propname,
-				    (uint_t)SPA_MINBLOCKSIZE,
-				    (uint_t)SPA_MAXBLOCKSIZE >> 10);
-				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
-				goto error;
+		case ZFS_PROP_RECORDSIZE:
+		{
+			int maxbs = SPA_MAXBLOCKSIZE;
+			if (zpool_hdl != NULL) {
+				maxbs = zpool_get_prop_int(zpool_hdl,
+				    ZPOOL_PROP_MAXBLOCKSIZE, NULL);
 			}
-			break;
-
-		case ZFS_PROP_SHAREISCSI:
-			if (strcmp(strval, "off") != 0 &&
-			    strcmp(strval, "on") != 0 &&
-			    strcmp(strval, "type=disk") != 0) {
+			/*
+			 * Volumes are limited to a volblocksize of 128KB,
+			 * because they typically service workloads with
+			 * small random writes, which incur a large performance
+			 * penalty with large blocks.
+			 */
+			if (prop == ZFS_PROP_VOLBLOCKSIZE)
+				maxbs = SPA_OLD_MAXBLOCKSIZE;
+			/*
+			 * The value must be a power of two between
+			 * SPA_MINBLOCKSIZE and maxbs.
+			 */
+			if (intval < SPA_MINBLOCKSIZE ||
+			    intval > maxbs || !ISP2(intval)) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-				    "'%s' must be 'on', 'off', or 'type=disk'"),
-				    propname);
+				    "'%s' must be power of 2 from 512B "
+				    "to %uKB"), propname, maxbs >> 10);
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
 			}
-
 			break;
-#ifdef PORT_SOLARIS
+		}
 		case ZFS_PROP_MLSLABEL:
 		{
+#ifdef illumos
 			/*
 			 * Verify the mlslabel string and convert to
 			 * internal hex label string.
@@ -1001,10 +1133,15 @@ badlabel:
 			    "invalid mlslabel '%s'"), strval);
 			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 			m_label_free(new_sl);	/* OK if null */
+#else	/* !illumos */
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "mlslabel is not supported on FreeBSD"));
+			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+#endif	/* illumos */
 			goto error;
 
 		}
-#endif
+
 		case ZFS_PROP_MOUNTPOINT:
 		{
 			namecheck_err_t why;
@@ -1027,6 +1164,13 @@ badlabel:
 					    "component of '%s' is too long"),
 					    propname);
 					break;
+
+				default:
+					zfs_error_aux(hdl,
+					    dgettext(TEXT_DOMAIN,
+					    "(%d) not defined"),
+					    why);
+					break;
 				}
 				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 				goto error;
@@ -1145,12 +1289,17 @@ badlabel:
 			}
 
 			break;
+
 		case ZFS_PROP_UTF8ONLY:
 			chosen_utf = (int)intval;
 			break;
+
 		case ZFS_PROP_NORMALIZE:
 			chosen_normal = (int)intval;
 			break;
+
+		default:
+			break;
 		}
 
 		/*
@@ -1199,6 +1348,9 @@ badlabel:
 					goto error;
 				}
 				break;
+
+			default:
+				break;
 			}
 		}
 	}
@@ -1223,39 +1375,56 @@ badlabel:
 		(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 		goto error;
 	}
+	return (ret);
+
+error:
+	nvlist_free(ret);
+	return (NULL);
+}
+
+int
+zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
+{
+	uint64_t old_volsize;
+	uint64_t new_volsize;
+	uint64_t old_reservation;
+	uint64_t new_reservation;
+	zfs_prop_t resv_prop;
+	nvlist_t *props;
 
 	/*
 	 * If this is an existing volume, and someone is setting the volsize,
 	 * make sure that it matches the reservation, or add it if necessary.
 	 */
-	if (zhp != NULL && type == ZFS_TYPE_VOLUME &&
-	    nvlist_lookup_uint64(ret, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
-	    &intval) == 0) {
-		uint64_t old_volsize = zfs_prop_get_int(zhp,
-		    ZFS_PROP_VOLSIZE);
-		uint64_t old_reservation;
-		uint64_t new_reservation;
-		zfs_prop_t resv_prop;
-
-		if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
-			goto error;
-		old_reservation = zfs_prop_get_int(zhp, resv_prop);
+	old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
+	if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
+		return (-1);
+	old_reservation = zfs_prop_get_int(zhp, resv_prop);
 
-		if (old_volsize == old_reservation &&
-		    nvlist_lookup_uint64(ret, zfs_prop_to_name(resv_prop),
-		    &new_reservation) != 0) {
-			if (nvlist_add_uint64(ret,
-			    zfs_prop_to_name(resv_prop), intval) != 0) {
-				(void) no_memory(hdl);
-				goto error;
-			}
-		}
+	props = fnvlist_alloc();
+	fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+	    zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));
+
+	if ((zvol_volsize_to_reservation(old_volsize, props) !=
+	    old_reservation) || nvlist_exists(nvl,
+	    zfs_prop_to_name(resv_prop))) {
+		fnvlist_free(props);
+		return (0);
 	}
-	return (ret);
+	if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
+	    &new_volsize) != 0) {
+		fnvlist_free(props);
+		return (-1);
+	}
+	new_reservation = zvol_volsize_to_reservation(new_volsize, props);
+	fnvlist_free(props);
 
-error:
-	nvlist_free(ret);
-	return (NULL);
+	if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
+	    new_reservation) != 0) {
+		(void) no_memory(zhp->zfs_hdl);
+		return (-1);
+	}
+	return (1);
 }
 
 void
@@ -1300,6 +1469,12 @@ zfs_setprop_error(libzfs_handle_t *hdl, 
 		(void) zfs_error(hdl, EZFS_DSREADONLY, errbuf);
 		break;
 
+	case E2BIG:
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "property value too long"));
+		(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+		break;
+
 	case ENOTSUP:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "pool and or dataset must be upgraded to set this "
@@ -1308,11 +1483,27 @@ zfs_setprop_error(libzfs_handle_t *hdl, 
 		break;
 
 	case ERANGE:
-		if (prop == ZFS_PROP_COMPRESSION) {
+	case EDOM:
+		if (prop == ZFS_PROP_COMPRESSION ||
+		    prop == ZFS_PROP_RECORDSIZE) {
 			(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "property setting is not allowed on "
 			    "bootable datasets"));
 			(void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
+		} else if (prop == ZFS_PROP_CHECKSUM ||
+		    prop == ZFS_PROP_DEDUP) {
+			(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "property setting is not allowed on "
+			    "root pools"));
+			(void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
+		} else {
+			(void) zfs_standard_error(hdl, err, errbuf);
+		}
+		break;
+
+	case EINVAL:
+		if (prop == ZPROP_INVAL) {
+			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
 		} else {
 			(void) zfs_standard_error(hdl, err, errbuf);
 		}
@@ -1340,15 +1531,10 @@ zfs_setprop_error(libzfs_handle_t *hdl, 
 int
 zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval)
 {
-	zfs_cmd_t zc = { 0 };
 	int ret = -1;
-	prop_changelist_t *cl = NULL;
 	char errbuf[1024];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
-	nvlist_t *nvl = NULL, *realprops;
-	zfs_prop_t prop;
-	boolean_t do_prefix;
-	uint64_t idx;
+	nvlist_t *nvl = NULL;
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
@@ -1360,66 +1546,187 @@ zfs_prop_set(zfs_handle_t *zhp, const ch
 		goto error;
 	}
 
-	if ((realprops = zfs_valid_proplist(hdl, zhp->zfs_type, nvl,
-	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, errbuf)) == NULL)
-		goto error;
+	ret = zfs_prop_set_list(zhp, nvl);
 
+error:
 	nvlist_free(nvl);
-	nvl = realprops;
+	return (ret);
+}
 
-	prop = zfs_name_to_prop(propname);
 
-	if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL)
-		goto error;
 
-	if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "child dataset with inherited mountpoint is used "
-		    "in a non-global zone"));
-		ret = zfs_error(hdl, EZFS_ZONED, errbuf);
+/*
+ * Given an nvlist of property names and values, set the properties for the
+ * given dataset.
+ */
+int
+zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props)
+{
+	zfs_cmd_t zc = { 0 };
+	int ret = -1;
+	prop_changelist_t **cls = NULL;
+	int cl_idx;
+	char errbuf[1024];
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	nvlist_t *nvl;
+	int nvl_len;
+	int added_resv = 0;
+
+	(void) snprintf(errbuf, sizeof (errbuf),
+	    dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
+	    zhp->zfs_name);
+
+	if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props,
+	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl,
+	    errbuf)) == NULL)
 		goto error;
-	}
 
 	/*
-	 * If the dataset's canmount property is being set to noauto,
-	 * then we want to prevent unmounting & remounting it.
+	 * We have to check for any extra properties which need to be added
+	 * before computing the length of the nvlist.
 	 */
-	do_prefix = !((prop == ZFS_PROP_CANMOUNT) &&
-	    (zprop_string_to_index(prop, propval, &idx,
-	    ZFS_TYPE_DATASET) == 0) && (idx == ZFS_CANMOUNT_NOAUTO));
-
-	if (do_prefix && (ret = changelist_prefix(cl)) != 0)
+	for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
+	    elem != NULL;
+	    elem = nvlist_next_nvpair(nvl, elem)) {
+		if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE &&
+		    (added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) {
+			goto error;
+		}
+	}
+	/*
+	 * Check how many properties we're setting and allocate an array to
+	 * store changelist pointers for postfix().
+	 */
+	nvl_len = 0;
+	for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
+	    elem != NULL;
+	    elem = nvlist_next_nvpair(nvl, elem))
+		nvl_len++;
+	if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL)
 		goto error;
 
+	cl_idx = 0;
+	for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
+	    elem != NULL;
+	    elem = nvlist_next_nvpair(nvl, elem)) {
+
+		zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem));
+
+		assert(cl_idx < nvl_len);
+		/*
+		 * We don't want to unmount & remount the dataset when changing
+		 * its canmount property to 'on' or 'noauto'.  We only use
+		 * the changelist logic to unmount when setting canmount=off.
+		 */
+		if (prop != ZFS_PROP_CANMOUNT ||
+		    (fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF &&
+		    zfs_is_mounted(zhp, NULL))) {
+			cls[cl_idx] = changelist_gather(zhp, prop, 0, 0);
+			if (cls[cl_idx] == NULL)
+				goto error;
+		}
+
+		if (prop == ZFS_PROP_MOUNTPOINT &&
+		    changelist_haszonedchild(cls[cl_idx])) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "child dataset with inherited mountpoint is used "
+			    "in a non-global zone"));
+			ret = zfs_error(hdl, EZFS_ZONED, errbuf);
+			goto error;
+		}
+
+		/* We don't support those properties on FreeBSD. */
+		switch (prop) {
+		case ZFS_PROP_DEVICES:
+		case ZFS_PROP_ISCSIOPTIONS:
+		case ZFS_PROP_XATTR:
+		case ZFS_PROP_VSCAN:
+		case ZFS_PROP_NBMAND:
+		case ZFS_PROP_MLSLABEL:
+			(void) snprintf(errbuf, sizeof (errbuf),
+			    "property '%s' not supported on FreeBSD",
+			    nvpair_name(elem));
+			ret = zfs_error(hdl, EZFS_PERM, errbuf);
+			goto error;
+		}
+
+		if (cls[cl_idx] != NULL &&
+		    (ret = changelist_prefix(cls[cl_idx])) != 0)
+			goto error;
+
+		cl_idx++;
+	}
+	assert(cl_idx == nvl_len);
+
 	/*
-	 * Execute the corresponding ioctl() to set this property.
+	 * Execute the corresponding ioctl() to set this list of properties.
 	 */
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
-	if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0)
+	if ((ret = zcmd_write_src_nvlist(hdl, &zc, nvl)) != 0 ||
+	    (ret = zcmd_alloc_dst_nvlist(hdl, &zc, 0)) != 0)
 		goto error;
 
 	ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
 
 	if (ret != 0) {
-		zfs_setprop_error(hdl, prop, errno, errbuf);
-	} else {
-		if (do_prefix)
-			ret = changelist_postfix(cl);
+		/* Get the list of unset properties back and report them. */
+		nvlist_t *errorprops = NULL;
+		if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0)
+			goto error;
+		for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL);
+		    elem != NULL;
+		    elem = nvlist_next_nvpair(nvl, elem)) {
+			zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem));
+			zfs_setprop_error(hdl, prop, errno, errbuf);
+		}
+		nvlist_free(errorprops);
+
+		if (added_resv && errno == ENOSPC) {
+			/* clean up the volsize property we tried to set */
+			uint64_t old_volsize = zfs_prop_get_int(zhp,
+			    ZFS_PROP_VOLSIZE);
+			nvlist_free(nvl);
+			nvl = NULL;
+			zcmd_free_nvlists(&zc);
 
-		/*
-		 * Refresh the statistics so the new property value
-		 * is reflected.
-		 */
-		if (ret == 0)
+			if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
+				goto error;
+			if (nvlist_add_uint64(nvl,
+			    zfs_prop_to_name(ZFS_PROP_VOLSIZE),
+			    old_volsize) != 0)
+				goto error;
+			if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0)
+				goto error;
+			(void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc);
+		}
+	} else {
+		for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) {
+			if (cls[cl_idx] != NULL) {
+				int clp_err = changelist_postfix(cls[cl_idx]);
+				if (clp_err != 0)
+					ret = clp_err;
+			}
+		}
+
+		/*
+		 * Refresh the statistics so the new property value
+		 * is reflected.
+		 */
+		if (ret == 0)
 			(void) get_stats(zhp);
 	}
 
 error:
 	nvlist_free(nvl);
 	zcmd_free_nvlists(&zc);
-	if (cl)
-		changelist_free(cl);
+	if (cls != NULL) {
+		for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) {
+			if (cls[cl_idx] != NULL)
+				changelist_free(cls[cl_idx]);
+		}
+		free(cls);
+	}
 	return (ret);
 }
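
[Note on the hunk above: zfs_prop_set() is now a thin wrapper that puts its single name/value pair in an nvlist and defers to zfs_prop_set_list(), so several properties can be validated, change-listed, and pushed through one ZFS_IOC_SET_PROP call. A hedged sketch with hypothetical values:]

/* Sketch: batch two property updates into one ioctl. */
nvlist_t *props = fnvlist_alloc();

fnvlist_add_string(props,
    zfs_prop_to_name(ZFS_PROP_COMPRESSION), "on");
fnvlist_add_uint64(props,
    zfs_prop_to_name(ZFS_PROP_QUOTA), 10ULL << 30);	/* 10 GiB */

if (zfs_prop_set_list(zhp, props) != 0)
	(void) fprintf(stderr, "cannot set properties\n");
fnvlist_free(props);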
 
@@ -1477,7 +1784,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, cons
 		return (zfs_error(hdl, EZFS_PROPTYPE, errbuf));
 
 	/*
-	 * Normalize the name, to get rid of shorthand abbrevations.
+	 * Normalize the name, to get rid of shorthand abbreviations.
 	 */
 	propname = zfs_prop_to_name(prop);
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
@@ -1550,22 +1857,21 @@ getprop_uint64(zfs_handle_t *zhp, zfs_pr
 	return (value);
 }
 
-static char *
+static const char *
 getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
 {
 	nvlist_t *nv;
-	char *value;
+	const char *value;
 
 	*source = NULL;
 	if (nvlist_lookup_nvlist(zhp->zfs_props,
 	    zfs_prop_to_name(prop), &nv) == 0) {
-		verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0);
+		value = fnvlist_lookup_string(nv, ZPROP_VALUE);
 		(void) nvlist_lookup_string(nv, ZPROP_SOURCE, source);
 	} else {
 		verify(!zhp->zfs_props_table ||
 		    zhp->zfs_props_table[prop] == B_TRUE);
-		if ((value = (char *)zfs_prop_default_string(prop)) == NULL)
-			value = "";
+		value = zfs_prop_default_string(prop);
 		*source = "";
 	}
 
@@ -1649,6 +1955,9 @@ get_numeric_property(zfs_handle_t *zhp, 
 		mntopt_on = MNTOPT_NBMAND;
 		mntopt_off = MNTOPT_NONBMAND;
 		break;
+
+	default:
+		break;
 	}
 
 	/*
@@ -1706,6 +2015,10 @@ get_numeric_property(zfs_handle_t *zhp, 
 	case ZFS_PROP_REFQUOTA:
 	case ZFS_PROP_RESERVATION:
 	case ZFS_PROP_REFRESERVATION:
+	case ZFS_PROP_FILESYSTEM_LIMIT:
+	case ZFS_PROP_SNAPSHOT_LIMIT:
+	case ZFS_PROP_FILESYSTEM_COUNT:
+	case ZFS_PROP_SNAPSHOT_COUNT:
 		*val = getprop_uint64(zhp, prop, source);
 
 		if (*source == NULL) {
@@ -1740,11 +2053,14 @@ get_numeric_property(zfs_handle_t *zhp, 
 			zcmd_free_nvlists(&zc);
 			return (-1);
 		}
-		if (zplprops)
-			nvlist_free(zplprops);
+		nvlist_free(zplprops);
 		zcmd_free_nvlists(&zc);
 		break;
 
+	case ZFS_PROP_INCONSISTENT:
+		*val = zhp->zfs_dmustats.dds_inconsistent;
+		break;
+
 	default:
 		switch (zfs_prop_get_type(prop)) {
 		case PROP_TYPE_NUMBER:
@@ -1821,8 +2137,6 @@ zfs_prop_get_recvd(zfs_handle_t *zhp, co
 		err = zfs_prop_get(zhp, prop, propbuf, proplen,
 		    NULL, NULL, 0, literal);
 		zfs_unset_recvd_props_mode(zhp, &cookie);
-	} else if (zfs_prop_userquota(propname)) {
-		return (-1);
 	} else {
 		nvlist_t *propval;
 		char *recvdval;
@@ -1837,6 +2151,117 @@ zfs_prop_get_recvd(zfs_handle_t *zhp, co
 	return (err == 0 ? 0 : -1);
 }
 
+static int
+get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen)
+{
+	nvlist_t *value;
+	nvpair_t *pair;
+
+	value = zfs_get_clones_nvl(zhp);
+	if (value == NULL)
+		return (-1);
+
+	propbuf[0] = '\0';
+	for (pair = nvlist_next_nvpair(value, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(value, pair)) {
+		if (propbuf[0] != '\0')
+			(void) strlcat(propbuf, ",", proplen);
+		(void) strlcat(propbuf, nvpair_name(pair), proplen);
+	}
+
+	return (0);
+}
+
+struct get_clones_arg {
+	uint64_t numclones;
+	nvlist_t *value;
+	const char *origin;
+	char buf[ZFS_MAX_DATASET_NAME_LEN];
+};
+
+int
+get_clones_cb(zfs_handle_t *zhp, void *arg)
+{
+	struct get_clones_arg *gca = arg;
+
+	if (gca->numclones == 0) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf, sizeof (gca->buf),
+	    NULL, NULL, 0, B_TRUE) != 0)
+		goto out;
+	if (strcmp(gca->buf, gca->origin) == 0) {
+		fnvlist_add_boolean(gca->value, zfs_get_name(zhp));
+		gca->numclones--;
+	}
+
+out:
+	(void) zfs_iter_children(zhp, get_clones_cb, gca);
+	zfs_close(zhp);
+	return (0);
+}
+
+nvlist_t *
+zfs_get_clones_nvl(zfs_handle_t *zhp)
+{
+	nvlist_t *nv, *value;
+
+	if (nvlist_lookup_nvlist(zhp->zfs_props,
+	    zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) {
+		struct get_clones_arg gca;
+
+		/*
+		 * If this is a snapshot, then the kernel wasn't able
+		 * to get the clones.  Find them by iterating slowly.
+		 */
+		if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT)
+			return (NULL);
+		if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0)
+			return (NULL);
+		if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) {
+			nvlist_free(nv);
+			return (NULL);
+		}
+
+		gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES);
+		gca.value = value;
+		gca.origin = zhp->zfs_name;
+
+		if (gca.numclones != 0) {
+			zfs_handle_t *root;
+			char pool[ZFS_MAX_DATASET_NAME_LEN];
+			char *cp = pool;
+
+			/* get the pool name */
+			(void) strlcpy(pool, zhp->zfs_name, sizeof (pool));
+			(void) strsep(&cp, "/@");
+			root = zfs_open(zhp->zfs_hdl, pool,
+			    ZFS_TYPE_FILESYSTEM);
+
+			(void) get_clones_cb(root, &gca);
+		}
+
+		if (gca.numclones != 0 ||
+		    nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 ||
+		    nvlist_add_nvlist(zhp->zfs_props,
+		    zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) {
+			nvlist_free(nv);
+			nvlist_free(value);
+			return (NULL);
+		}
+		nvlist_free(nv);
+		nvlist_free(value);
+		verify(0 == nvlist_lookup_nvlist(zhp->zfs_props,
+		    zfs_prop_to_name(ZFS_PROP_CLONES), &nv));
+	}
+
+	verify(nvlist_lookup_nvlist(nv, ZPROP_VALUE, &value) == 0);
+
+	return (value);
+}
+
 /*
  * Retrieve a property from the given object.  If 'literal' is specified, then
  * numbers are left as exact values.  Otherwise, numbers are converted to a
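
[Note on the hunk above: zfs_get_clones_nvl() returns the nvlist that ends up cached in zhp's property list (note the final lookup back out of zhp->zfs_props), so callers must not free it. A hedged sketch of walking the result:]

/* Sketch: print each clone of the snapshot behind zhp. */
nvlist_t *clones = zfs_get_clones_nvl(zhp);
nvpair_t *pair;

if (clones != NULL) {
	for (pair = nvlist_next_nvpair(clones, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(clones, pair))
		(void) printf("%s\n", nvpair_name(pair));
}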
@@ -1850,7 +2275,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop
 {
 	char *source = NULL;
 	uint64_t val;
-	char *str;
+	const char *str;
 	const char *strval;
 	boolean_t received = zfs_is_recvd_props_mode(zhp);
 
@@ -1920,8 +2345,8 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop
 			}
 
 			if ((zpool_get_prop(zhp->zpool_hdl,
-			    ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL)) ||
-			    (strcmp(root, "-") == 0))
+			    ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL,
+			    B_FALSE)) || (strcmp(root, "-") == 0))
 				root[0] = '\0';
 			/*
 			 * Special case an alternate root of '/'. This will
@@ -1955,13 +2380,14 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop
 		break;
 
 	case ZFS_PROP_ORIGIN:
-		(void) strlcpy(propbuf, getprop_string(zhp, prop, &source),
-		    proplen);
-		/*
-		 * If there is no parent at all, return failure to indicate that
-		 * it doesn't apply to this dataset.
-		 */
-		if (propbuf[0] == '\0')
+		str = getprop_string(zhp, prop, &source);
+		if (str == NULL)
+			return (-1);
+		(void) strlcpy(propbuf, str, proplen);
+		break;
+
+	case ZFS_PROP_CLONES:
+		if (get_clones_string(zhp, propbuf, proplen) != 0)
 			return (-1);
 		break;
 
@@ -1993,6 +2419,31 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop
 		}
 		break;
 
+	case ZFS_PROP_FILESYSTEM_LIMIT:
+	case ZFS_PROP_SNAPSHOT_LIMIT:
+	case ZFS_PROP_FILESYSTEM_COUNT:
+	case ZFS_PROP_SNAPSHOT_COUNT:
+
+		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
+			return (-1);
+
+		/*
+		 * If limit is UINT64_MAX, we translate this into 'none' (unless
+		 * literal is set), and indicate that it's the default value.
+		 * Otherwise, we print the number nicely and indicate that it's
+		 * set locally.
+		 */
+		if (literal) {
+			(void) snprintf(propbuf, proplen, "%llu",
+			    (u_longlong_t)val);
+		} else if (val == UINT64_MAX) {
+			(void) strlcpy(propbuf, "none", proplen);
+		} else {
+			zfs_nicenum(val, propbuf, proplen);
+		}
+		break;
+
+	case ZFS_PROP_REFRATIO:
 	case ZFS_PROP_COMPRESSRATIO:
 		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
 			return (-1);
@@ -2012,6 +2463,9 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop
 		case ZFS_TYPE_SNAPSHOT:
 			str = "snapshot";
 			break;
+		case ZFS_TYPE_BOOKMARK:
+			str = "bookmark";
+			break;
 		default:
 			abort();
 		}
@@ -2042,9 +2496,10 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop
 		 */
 		(void) strlcpy(propbuf, zhp->zfs_name, proplen);
 		break;
-#ifdef PORT_SOLARIS
+
 	case ZFS_PROP_MLSLABEL:
 		{
+#ifdef illumos
 			m_label_t *new_sl = NULL;
 			char *ascii = NULL;	/* human readable label */
 
@@ -2078,9 +2533,23 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop
 
 			(void) strlcpy(propbuf, ascii, proplen);
 			free(ascii);
+#else	/* !illumos */
+			propbuf[0] = '\0';
+#endif	/* illumos */
 		}
 		break;
-#endif
+
+	case ZFS_PROP_GUID:
+		/*
+		 * GUIDs are stored as numbers, but they are identifiers.
+		 * We don't want them to be pretty printed, because pretty
+		 * printing mangles the ID into a truncated and useless value.
+		 */
+		if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
+			return (-1);
+		(void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val);
+		break;
+
 	default:
 		switch (zfs_prop_get_type(prop)) {
 		case PROP_TYPE_NUMBER:
@@ -2095,8 +2564,10 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop
 			break;
 
 		case PROP_TYPE_STRING:
-			(void) strlcpy(propbuf,
-			    getprop_string(zhp, prop, &source), proplen);
+			str = getprop_string(zhp, prop, &source);
+			if (str == NULL)
+				return (-1);
+			(void) strlcpy(propbuf, str, proplen);
 			break;
 
 		case PROP_TYPE_INDEX:
@@ -2172,19 +2643,16 @@ zfs_prop_get_numeric(zfs_handle_t *zhp, 
 	return (0);
 }
 
-#ifdef PORT_SOLARIS /* NetBSD zfs QUOTA support */
 static int
 idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser,
     char **domainp, idmap_rid_t *ridp)
 {
-	idmap_handle_t *idmap_hdl = NULL;
+#ifdef illumos
 	idmap_get_handle_t *get_hdl = NULL;
 	idmap_stat status;
 	int err = EINVAL;
 
-	if (idmap_init(&idmap_hdl) != IDMAP_SUCCESS)
-		goto out;
-	if (idmap_get_create(idmap_hdl, &get_hdl) != IDMAP_SUCCESS)
+	if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS)
 		goto out;
 
 	if (isuser) {
@@ -2203,9 +2671,11 @@ idmap_id_to_numeric_domain_rid(uid_t id,
 out:
 	if (get_hdl)
 		idmap_get_destroy(get_hdl);
-	if (idmap_hdl)
-		(void) idmap_fini(idmap_hdl);
 	return (err);
+#else	/* !illumos */
+	assert(!"invalid code path");
+	return (EINVAL);	/* silence compiler warning */
+#endif	/* illumos */
 }
 
 /*
@@ -2223,7 +2693,7 @@ userquota_propname_decode(const char *pr
 	boolean_t isuser;
 
 	domain[0] = '\0';
-
+	*ridp = 0;
 	/* Figure out the property type ({user|group}{quota|space}) */
 	for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) {
 		if (strncmp(propname, zfs_userquota_prop_prefixes[type],
@@ -2240,39 +2710,69 @@ userquota_propname_decode(const char *pr
 	cp = strchr(propname, '@') + 1;
 
 	if (strchr(cp, '@')) {
+#ifdef illumos
 		/*
 		 * It's a SID name (eg "user@domain") that needs to be
 		 * turned into S-1-domainID-RID.
 		 */
-		directory_error_t e;
+		int flag = 0;
+		idmap_stat stat, map_stat;
+		uid_t pid;
+		idmap_rid_t rid;
+		idmap_get_handle_t *gh = NULL;
+
+		stat = idmap_get_create(&gh);
+		if (stat != IDMAP_SUCCESS) {
+			idmap_get_destroy(gh);
+			return (ENOMEM);
+		}
 		if (zoned && getzoneid() == GLOBAL_ZONEID)
 			return (ENOENT);
 		if (isuser) {
-			e = directory_sid_from_user_name(NULL,
-			    cp, &numericsid);
+			stat = idmap_getuidbywinname(cp, NULL, flag, &pid);
+			if (stat < 0)
+				return (ENOENT);
+			stat = idmap_get_sidbyuid(gh, pid, flag, &numericsid,
+			    &rid, &map_stat);
 		} else {
-			e = directory_sid_from_group_name(NULL,
-			    cp, &numericsid);
+			stat = idmap_getgidbywinname(cp, NULL, flag, &pid);
+			if (stat < 0)
+				return (ENOENT);
+			stat = idmap_get_sidbygid(gh, pid, flag, &numericsid,
+			    &rid, &map_stat);
+		}
+		if (stat < 0) {
+			idmap_get_destroy(gh);
+			return (ENOENT);
 		}
-		if (e != NULL) {
-			directory_error_free(e);
+		stat = idmap_get_mappings(gh);
+		idmap_get_destroy(gh);
+
+		if (stat < 0) {
 			return (ENOENT);
 		}
 		if (numericsid == NULL)
 			return (ENOENT);
 		cp = numericsid;
+		*ridp = rid;
 		/* will be further decoded below */
+#else	/* !illumos */
+		return (ENOENT);
+#endif	/* illumos */
 	}
 
 	if (strncmp(cp, "S-1-", 4) == 0) {
 		/* It's a numeric SID (eg "S-1-234-567-89") */
 		(void) strlcpy(domain, cp, domainlen);
-		cp = strrchr(domain, '-');
-		*cp = '\0';
-		cp++;
-
 		errno = 0;
-		*ridp = strtoull(cp, &end, 10);
+		if (*ridp == 0) {
+			cp = strrchr(domain, '-');
+			*cp = '\0';
+			cp++;
+			*ridp = strtoull(cp, &end, 10);
+		} else {
+			end = "";
+		}
 		if (numericsid) {
 			free(numericsid);
 			numericsid = NULL;
@@ -2322,14 +2822,6 @@ userquota_propname_decode(const char *pr
 	ASSERT3P(numericsid, ==, NULL);
 	return (0);
 }
-#else
-static int
-userquota_propname_decode(const char *propname, boolean_t zoned,
-    zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp)
-{
-	return (ENOENT);
-}
-#endif /* PORT_SOLARIS */
 
 static int
 zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname,
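
[Note on the hunks above: userquota_propname_decode() now resolves SID names through idmap on illumos and returns ENOENT on the other platforms, while the public entry point remains zfs_prop_get_userquota() below. A hedged sketch; the principal name is hypothetical:]

/* Sketch: read a decoded userquota@<name> property. */
char buf[32];

if (zfs_prop_get_userquota(zhp, "userquota@alice", buf,
    sizeof (buf), B_FALSE) == 0)
	(void) printf("userquota@alice = %s\n", buf);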
@@ -2338,7 +2830,7 @@ zfs_prop_get_userquota_common(zfs_handle
 	int err;
 	zfs_cmd_t zc = { 0 };
 
-	(void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	err = userquota_propname_decode(propname,
 	    zfs_prop_get_int(zhp, ZFS_PROP_ZONED),
@@ -2390,144 +2882,85 @@ zfs_prop_get_userquota(zfs_handle_t *zhp
 	return (0);
 }
 
-/*
- * Returns the name of the given zfs handle.
- */
-const char *
-zfs_get_name(const zfs_handle_t *zhp)
-{
-	return (zhp->zfs_name);
-}
-
-/*
- * Returns the type of the given zfs handle.
- */
-zfs_type_t
-zfs_get_type(const zfs_handle_t *zhp)
+int
+zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname,
+    uint64_t *propvalue)
 {
-	return (zhp->zfs_type);
-}
+	int err;
+	zfs_cmd_t zc = { 0 };
+	const char *snapname;
 
-static int
-zfs_do_list_ioctl(zfs_handle_t *zhp, int arg, zfs_cmd_t *zc)
-{
-	int rc;
-	uint64_t	orig_cookie;
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
-	orig_cookie = zc->zc_cookie;
-top:
-	(void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name));
-	rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc);
+	snapname = strchr(propname, '@') + 1;
+	if (strchr(snapname, '@')) {
+		(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
+	} else {
+		/* snapname is the short name, append it to zhp's fsname */
+		char *cp;
 
-	if (rc == -1) {
-		switch (errno) {
-		case ENOMEM:
-			/* expand nvlist memory and try again */
-			if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) {
-				zcmd_free_nvlists(zc);
-				return (-1);
-			}
-			zc->zc_cookie = orig_cookie;
-			goto top;
-		/*
-		 * An errno value of ESRCH indicates normal completion.
-		 * If ENOENT is returned, then the underlying dataset
-		 * has been removed since we obtained the handle.
-		 */
-		case ESRCH:
-		case ENOENT:
-			rc = 1;
-			break;
-		default:
-			rc = zfs_standard_error(zhp->zfs_hdl, errno,
-			    dgettext(TEXT_DOMAIN,
-			    "cannot iterate filesystems"));
-			break;
-		}
+		(void) strlcpy(zc.zc_value, zhp->zfs_name,
+		    sizeof (zc.zc_value));
+		cp = strchr(zc.zc_value, '@');
+		if (cp != NULL)
+			*cp = '\0';
+		(void) strlcat(zc.zc_value, "@", sizeof (zc.zc_value));
+		(void) strlcat(zc.zc_value, snapname, sizeof (zc.zc_value));
 	}
-	return (rc);
+
+	err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_WRITTEN, &zc);
+	if (err)
+		return (err);
+
+	*propvalue = zc.zc_cookie;
+	return (0);
 }
 
-/*
- * Iterate over all child filesystems
- */
 int
-zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data)
+zfs_prop_get_written(zfs_handle_t *zhp, const char *propname,
+    char *propbuf, int proplen, boolean_t literal)
 {
-	zfs_cmd_t zc = { 0 };
-	zfs_handle_t *nzhp;
-	int ret;
-
-	if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM)
-		return (0);
+	int err;
+	uint64_t propvalue;
 
-	if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
-		return (-1);
+	err = zfs_prop_get_written_int(zhp, propname, &propvalue);
 
-	while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT,
-	    &zc)) == 0) {
-		/*
-		 * Silently ignore errors, as the only plausible explanation is
-		 * that the pool has since been removed.
-		 */
-		if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl,
-		    &zc)) == NULL) {
-			continue;
-		}
+	if (err)
+		return (err);
 
-		if ((ret = func(nzhp, data)) != 0) {
-			zcmd_free_nvlists(&zc);
-			return (ret);
-		}
+	if (literal) {
+		(void) snprintf(propbuf, proplen, "%llu",
+		    (u_longlong_t)propvalue);
+	} else {
+		zfs_nicenum(propvalue, propbuf, proplen);
 	}
-	zcmd_free_nvlists(&zc);
-	return ((ret < 0) ? ret : 0);
+	return (0);
 }
 
 /*
- * Iterate over all snapshots
+ * Returns the name of the given zfs handle.
  */
-int
-zfs_iter_snapshots(zfs_handle_t *zhp, zfs_iter_f func, void *data)
+const char *
+zfs_get_name(const zfs_handle_t *zhp)
 {
-	zfs_cmd_t zc = { 0 };
-	zfs_handle_t *nzhp;
-	int ret;
-
-	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT)
-		return (0);
-
-	if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
-		return (-1);
-	while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT,
-	    &zc)) == 0) {
-
-		if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl,
-		    &zc)) == NULL) {
-			continue;
-		}
-
-		if ((ret = func(nzhp, data)) != 0) {
-			zcmd_free_nvlists(&zc);
-			return (ret);
-		}
-	}
-	zcmd_free_nvlists(&zc);
-	return ((ret < 0) ? ret : 0);
+	return (zhp->zfs_name);
 }
 
 /*
- * Iterate over all children, snapshots and filesystems
+ * Returns the name of the parent pool for the given zfs handle.
  */
-int
-zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data)
+const char *
+zfs_get_pool_name(const zfs_handle_t *zhp)
 {
-	int ret;
-
-	if ((ret = zfs_iter_filesystems(zhp, func, data)) != 0)
-		return (ret);
+	return (zhp->zpool_hdl->zpool_name);
+}
 
-	return (zfs_iter_snapshots(zhp, func, data));
+/*
+ * Returns the type of the given zfs handle.
+ */
+zfs_type_t
+zfs_get_type(const zfs_handle_t *zhp)
+{
+	return (zhp->zfs_type);
 }
 
 /*
@@ -2553,18 +2986,19 @@ is_descendant(const char *ds1, const cha
 
 /*
  * Given a complete name, return just the portion that refers to the parent.
- * Can return NULL if this is a pool.
+ * Will return -1 if there is no parent (path is just the name of the
+ * pool).
  */
 static int
 parent_name(const char *path, char *buf, size_t buflen)
 {
-	char *loc;
+	char *slashp;
 
-	if ((loc = strrchr(path, '/')) == NULL)
-		return (-1);
+	(void) strlcpy(buf, path, buflen);
 
-	(void) strncpy(buf, path, MIN(buflen, loc - path));
-	buf[loc - path] = '\0';
+	if ((slashp = strrchr(buf, '/')) == NULL)
+		return (-1);
+	*slashp = '\0';
 
 	return (0);
 }
@@ -2582,7 +3016,7 @@ check_parents(libzfs_handle_t *hdl, cons
     boolean_t accept_ancestor, int *prefixlen)
 {
 	zfs_cmd_t zc = { 0 };
-	char parent[ZFS_MAXNAMELEN];
+	char parent[ZFS_MAX_DATASET_NAME_LEN];
 	char *slash;
 	zfs_handle_t *zhp;
 	char errbuf[1024];
@@ -2710,8 +3144,7 @@ create_parents(libzfs_handle_t *hdl, cha
 	 * up to the prefixlen-long one.
 	 */
 	for (cp = target + prefixlen + 1;
-	    cp = strchr(cp, '/'); *cp = '/', cp++) {
-		char *logstr;
+	    (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) {
 
 		*cp = '\0';
 
@@ -2722,16 +3155,12 @@ create_parents(libzfs_handle_t *hdl, cha
 			continue;
 		}
 
-		logstr = hdl->libzfs_log_str;
-		hdl->libzfs_log_str = NULL;
 		if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM,
 		    NULL) != 0) {
-			hdl->libzfs_log_str = logstr;
 			opname = dgettext(TEXT_DOMAIN, "create");
 			goto ancestorerr;
 		}
 
-		hdl->libzfs_log_str = logstr;
 		h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM);
 		if (h == NULL) {
 			opname = dgettext(TEXT_DOMAIN, "open");
@@ -2767,7 +3196,7 @@ zfs_create_ancestors(libzfs_handle_t *hd
 {
 	int prefix;
 	char *path_copy;
-	int rc;
+	int rc = 0;
 
 	if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0)
 		return (-1);
@@ -2789,12 +3218,12 @@ int
 zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
     nvlist_t *props)
 {
-	zfs_cmd_t zc = { 0 };
 	int ret;
 	uint64_t size = 0;
 	uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 	char errbuf[1024];
 	uint64_t zoned;
+	enum lzc_dataset_type ost;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create '%s'"), path);
@@ -2814,21 +3243,34 @@ zfs_create(libzfs_handle_t *hdl, const c
 	 * will return ENOENT, not EEXIST.  To prevent this from happening, we
 	 * first try to see if the dataset exists.
 	 */
-	(void) strlcpy(zc.zc_name, path, sizeof (zc.zc_name));
-	if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
+	if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset already exists"));
 		return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 	}
 
 	if (type == ZFS_TYPE_VOLUME)
-		zc.zc_objset_type = DMU_OST_ZVOL;
+		ost = LZC_DATSET_TYPE_ZVOL;
 	else
-		zc.zc_objset_type = DMU_OST_ZFS;
+		ost = LZC_DATSET_TYPE_ZFS;
+
+	/* open zpool handle for prop validation */
+	char pool_path[ZFS_MAX_DATASET_NAME_LEN];
+	(void) strlcpy(pool_path, path, sizeof (pool_path));
+
+	/* truncate pool_path at first slash */
+	char *p = strchr(pool_path, '/');
+	if (p != NULL)
+		*p = '\0';
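+	/* e.g. path "tank/home/user" leaves pool_path "tank" */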
+
+	zpool_handle_t *zpool_handle = zpool_open(hdl, pool_path);
 
 	if (props && (props = zfs_valid_proplist(hdl, type, props,
-	    zoned, NULL, errbuf)) == 0)
+	    zoned, NULL, zpool_handle, errbuf)) == 0) {
+		zpool_close(zpool_handle);
 		return (-1);
+	}
+	zpool_close(zpool_handle);
 
 	if (type == ZFS_TYPE_VOLUME) {
 		/*
@@ -2876,18 +3318,13 @@ zfs_create(libzfs_handle_t *hdl, const c
 		}
 	}
 
-	if (props && zcmd_write_src_nvlist(hdl, &zc, props) != 0)
-		return (-1);
-	nvlist_free(props);
-
 	/* create the dataset */
-	ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc);
-
-	zcmd_free_nvlists(&zc);
+	ret = lzc_create(path, ost, props);
+	nvlist_free(props);
 
 	/* check for failure */
 	if (ret != 0) {
-		char parent[ZFS_MAXNAMELEN];
+		char parent[ZFS_MAX_DATASET_NAME_LEN];
 		(void) parent_name(path, parent, sizeof (parent));
 
 		switch (errno) {
@@ -2901,15 +3338,6 @@ zfs_create(libzfs_handle_t *hdl, const c
 			    "parent '%s' is not a filesystem"), parent);
 			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 
-		case EDOM:
-			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-			    "volume block size must be power of 2 from "
-			    "%u to %uk"),
-			    (uint_t)SPA_MINBLOCKSIZE,
-			    (uint_t)SPA_MAXBLOCKSIZE >> 10);
-
-			return (zfs_error(hdl, EZFS_BADPROP, errbuf));
-
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded to set this "
@@ -2935,32 +3363,38 @@ zfs_create(libzfs_handle_t *hdl, const c
 
 /*
  * Destroys the given dataset.  The caller must make sure that the filesystem
- * isn't mounted, and that there are no active dependents.
+ * isn't mounted, and that there are no active dependents. If the file system
+ * does not exist, this function does nothing.
  */
 int
 zfs_destroy(zfs_handle_t *zhp, boolean_t defer)
 {
 	zfs_cmd_t zc = { 0 };
 
+	if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) {
+		nvlist_t *nv = fnvlist_alloc();
+		fnvlist_add_boolean(nv, zhp->zfs_name);
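+		/* the bookmark name (e.g. "pool/fs#mark") is the nvlist key */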
+		int error = lzc_destroy_bookmarks(nv, NULL);
+		fnvlist_free(nv);
+		if (error != 0) {
+			return (zfs_standard_error_fmt(zhp->zfs_hdl, errno,
+			    dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
+			    zhp->zfs_name));
+		}
+		return (0);
+	}
+
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	if (ZFS_IS_VOLUME(zhp)) {
-		/*
-		 * If user doesn't have permissions to unshare volume, then
-		 * abort the request.  This would only happen for a
-		 * non-privileged user.
-		 */
-		if (zfs_unshare_iscsi(zhp) != 0) {
-			return (-1);
-		}
-
 		zc.zc_objset_type = DMU_OST_ZVOL;
 	} else {
 		zc.zc_objset_type = DMU_OST_ZFS;
 	}
 
 	zc.zc_defer_destroy = defer;
-	if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0) {
+	if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0 &&
+	    errno != ENOENT) {
 		return (zfs_standard_error_fmt(zhp->zfs_hdl, errno,
 		    dgettext(TEXT_DOMAIN, "cannot destroy '%s'"),
 		    zhp->zfs_name));
@@ -2972,36 +3406,26 @@ zfs_destroy(zfs_handle_t *zhp, boolean_t
 }
 
 struct destroydata {
-	char *snapname;
-	boolean_t gotone;
-	boolean_t closezhp;
+	nvlist_t *nvl;
+	const char *snapname;
 };
 
 static int
 zfs_check_snap_cb(zfs_handle_t *zhp, void *arg)
 {
 	struct destroydata *dd = arg;
-	zfs_handle_t *szhp;
-	char name[ZFS_MAXNAMELEN];
-	boolean_t closezhp = dd->closezhp;
+	char name[ZFS_MAX_DATASET_NAME_LEN];
 	int rv = 0;
 
-	(void) strlcpy(name, zhp->zfs_name, sizeof (name));
-	(void) strlcat(name, "@", sizeof (name));
-	(void) strlcat(name, dd->snapname, sizeof (name));
-
-	szhp = make_dataset_handle(zhp->zfs_hdl, name);
-	if (szhp) {
-		dd->gotone = B_TRUE;
-		zfs_close(szhp);
-	}
+	(void) snprintf(name, sizeof (name),
+	    "%s@%s", zhp->zfs_name, dd->snapname);
 
-	dd->closezhp = B_TRUE;
-	if (!dd->gotone)
-		rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg);
-	if (closezhp)
-		zfs_close(zhp);
-	return (rv);
+	if (lzc_exists(name))
+		verify(nvlist_add_boolean(dd->nvl, name) == 0);
+
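+	/* recurse so matching snapshots of all descendants are collected */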
+	rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, dd);
+	zfs_close(zhp);
+	return (rv);
 }
 
 /*
@@ -3010,43 +3434,68 @@ zfs_check_snap_cb(zfs_handle_t *zhp, voi
 int
 zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer)
 {
-	zfs_cmd_t zc = { 0 };
 	int ret;
 	struct destroydata dd = { 0 };
 
 	dd.snapname = snapname;
-	(void) zfs_check_snap_cb(zhp, &dd);
+	verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0);
+	(void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd);
 
-	if (!dd.gotone) {
-		return (zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT,
+	if (nvlist_empty(dd.nvl)) {
+		ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT,
 		    dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"),
-		    zhp->zfs_name, snapname));
+		    zhp->zfs_name, snapname);
+	} else {
+		ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer);
 	}
+	nvlist_free(dd.nvl);
+	return (ret);
+}
 
-	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-	(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
-	zc.zc_defer_destroy = defer;
+/*
+ * Destroys all the snapshots named in the nvlist.
+ */
+int
+zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer)
+{
+	int ret;
+	nvlist_t *errlist = NULL;
 
-	ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS, &zc);
-	if (ret != 0) {
+	ret = lzc_destroy_snaps(snaps, defer, &errlist);
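+	/* on failure, errlist maps each failed snapshot name to an errno */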
+
+	if (ret == 0) {
+		nvlist_free(errlist);
+		return (0);
+	}
+
+	if (nvlist_empty(errlist)) {
 		char errbuf[1024];
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot destroy snapshots"));
 
-		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-		    "cannot destroy '%s@%s'"), zc.zc_name, snapname);
+		ret = zfs_standard_error(hdl, ret, errbuf);
+	}
+	for (nvpair_t *pair = nvlist_next_nvpair(errlist, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) {
+		char errbuf[1024];
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"),
+		    nvpair_name(pair));
 
-		switch (errno) {
+		switch (fnvpair_value_int32(pair)) {
 		case EEXIST:
-			zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
-			    "snapshot is cloned"));
-			return (zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf));
-
+			zfs_error_aux(hdl,
+			    dgettext(TEXT_DOMAIN, "snapshot is cloned"));
+			ret = zfs_error(hdl, EZFS_EXISTS, errbuf);
+			break;
 		default:
-			return (zfs_standard_error(zhp->zfs_hdl, errno,
-			    errbuf));
+			ret = zfs_standard_error(hdl,
+			    fnvpair_value_int32(pair), errbuf);
+			break;
 		}
 	}
 
-	return (0);
+	nvlist_free(errlist);
+	return (ret);
 }
 
 /*
@@ -3055,12 +3504,10 @@ zfs_destroy_snaps(zfs_handle_t *zhp, cha
 int
 zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props)
 {
-	zfs_cmd_t zc = { 0 };
-	char parent[ZFS_MAXNAMELEN];
+	char parent[ZFS_MAX_DATASET_NAME_LEN];
 	int ret;
 	char errbuf[1024];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
-	zfs_type_t type;
 	uint64_t zoned;
 
 	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
@@ -3068,7 +3515,7 @@ zfs_clone(zfs_handle_t *zhp, const char 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot create '%s'"), target);
 
-	/* validate the target name */
+	/* validate the target/clone name */
 	if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE))
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 
@@ -3079,32 +3526,21 @@ zfs_clone(zfs_handle_t *zhp, const char 
 	(void) parent_name(target, parent, sizeof (parent));
 
 	/* do the clone */
-	if (ZFS_IS_VOLUME(zhp)) {
-		zc.zc_objset_type = DMU_OST_ZVOL;
-		type = ZFS_TYPE_VOLUME;
-	} else {
-		zc.zc_objset_type = DMU_OST_ZFS;
-		type = ZFS_TYPE_FILESYSTEM;
-	}
 
 	if (props) {
+		zfs_type_t type;
+		if (ZFS_IS_VOLUME(zhp)) {
+			type = ZFS_TYPE_VOLUME;
+		} else {
+			type = ZFS_TYPE_FILESYSTEM;
+		}
 		if ((props = zfs_valid_proplist(hdl, type, props, zoned,
-		    zhp, errbuf)) == NULL)
+		    zhp, zhp->zpool_hdl, errbuf)) == NULL)
 			return (-1);
-
-		if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) {
-			nvlist_free(props);
-			return (-1);
-		}
-
-		nvlist_free(props);
 	}
 
-	(void) strlcpy(zc.zc_name, target, sizeof (zc.zc_name));
-	(void) strlcpy(zc.zc_value, zhp->zfs_name, sizeof (zc.zc_value));
-	ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_CREATE, &zc);
-
-	zcmd_free_nvlists(&zc);
+	ret = lzc_clone(target, zhp->zfs_name, props);
+	nvlist_free(props);
 
 	if (ret != 0) {
 		switch (errno) {
@@ -3189,74 +3625,149 @@ zfs_promote(zfs_handle_t *zhp)
 	return (ret);
 }
 
+typedef struct snapdata {
+	nvlist_t *sd_nvl;
+	const char *sd_snapname;
+} snapdata_t;
+
+static int
+zfs_snapshot_cb(zfs_handle_t *zhp, void *arg)
+{
+	snapdata_t *sd = arg;
+	char name[ZFS_MAX_DATASET_NAME_LEN];
+	int rv = 0;
+
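+	/* only snapshot consistent datasets; skip those mid-receive */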
+	if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) {
+		(void) snprintf(name, sizeof (name),
+		    "%s@%s", zfs_get_name(zhp), sd->sd_snapname);
+
+		fnvlist_add_boolean(sd->sd_nvl, name);
+
+		rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd);
+	}
+	zfs_close(zhp);
+
+	return (rv);
+}
+
 /*
- * Takes a snapshot of the given dataset.
+ * Creates snapshots.  The keys in the snaps nvlist are the snapshots to be
+ * created.
  */
 int
-zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive,
-    nvlist_t *props)
+zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props)
 {
-	const char *delim;
-	char parent[ZFS_MAXNAMELEN];
-	zfs_handle_t *zhp;
-	zfs_cmd_t zc = { 0 };
 	int ret;
 	char errbuf[1024];
+	nvpair_t *elem;
+	nvlist_t *errors;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-	    "cannot snapshot '%s'"), path);
+	    "cannot create snapshots "));
 
-	/* validate the target name */
-	if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE))
-		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
-
-	if (props) {
-		if ((props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT,
-		    props, B_FALSE, NULL, errbuf)) == NULL)
-			return (-1);
+	elem = NULL;
+	while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) {
+		const char *snapname = nvpair_name(elem);
 
-		if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) {
-			nvlist_free(props);
-			return (-1);
+		/* validate the target name */
+		if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT,
+		    B_TRUE)) {
+			(void) snprintf(errbuf, sizeof (errbuf),
+			    dgettext(TEXT_DOMAIN,
+			    "cannot create snapshot '%s'"), snapname);
+			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		}
+	}
 
-		nvlist_free(props);
+	/*
+	 * Get the pool handle for property validation.  This assumes all
+	 * snaps are in the same pool, as does lzc_snapshot() below.
+	 */
+	char pool[ZFS_MAX_DATASET_NAME_LEN];
+	elem = nvlist_next_nvpair(snaps, NULL);
+	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
+	pool[strcspn(pool, "/@")] = '\0';
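+	/* e.g. "tank/fs@snap" truncates to the pool name "tank" */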
+	zpool_handle_t *zpool_hdl = zpool_open(hdl, pool);
+
+	if (props != NULL &&
+	    (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT,
+	    props, B_FALSE, NULL, zpool_hdl, errbuf)) == NULL) {
+		zpool_close(zpool_hdl);
+		return (-1);
 	}
+	zpool_close(zpool_hdl);
 
-	/* make sure the parent exists and is of the appropriate type */
-	delim = strchr(path, '@');
-	(void) strncpy(parent, path, delim - path);
-	parent[delim - path] = '\0';
+	ret = lzc_snapshot(snaps, props, &errors);
 
-	if ((zhp = zfs_open(hdl, parent, ZFS_TYPE_FILESYSTEM |
-	    ZFS_TYPE_VOLUME)) == NULL) {
-		zcmd_free_nvlists(&zc);
-		return (-1);
+	if (ret != 0) {
+		boolean_t printed = B_FALSE;
+		for (elem = nvlist_next_nvpair(errors, NULL);
+		    elem != NULL;
+		    elem = nvlist_next_nvpair(errors, elem)) {
+			(void) snprintf(errbuf, sizeof (errbuf),
+			    dgettext(TEXT_DOMAIN,
+			    "cannot create snapshot '%s'"), nvpair_name(elem));
+			(void) zfs_standard_error(hdl,
+			    fnvpair_value_int32(elem), errbuf);
+			printed = B_TRUE;
+		}
+		if (!printed) {
+			switch (ret) {
+			case EXDEV:
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "multiple snapshots of same "
+				    "fs not allowed"));
+				(void) zfs_error(hdl, EZFS_EXISTS, errbuf);
+				break;
+			default:
+				(void) zfs_standard_error(hdl, ret, errbuf);
+			}
+		}
 	}
 
-	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-	(void) strlcpy(zc.zc_value, delim+1, sizeof (zc.zc_value));
-	if (ZFS_IS_VOLUME(zhp))
-		zc.zc_objset_type = DMU_OST_ZVOL;
-	else
-		zc.zc_objset_type = DMU_OST_ZFS;
-	zc.zc_cookie = recursive;
-	ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SNAPSHOT, &zc);
+	nvlist_free(props);
+	nvlist_free(errors);
+	return (ret);
+}
 
-	zcmd_free_nvlists(&zc);
+int
+zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive,
+    nvlist_t *props)
+{
+	int ret;
+	snapdata_t sd = { 0 };
+	char fsname[ZFS_MAX_DATASET_NAME_LEN];
+	char *cp;
+	zfs_handle_t *zhp;
+	char errbuf[1024];
 
-	/*
-	 * if it was recursive, the one that actually failed will be in
-	 * zc.zc_name.
-	 */
-	if (ret != 0) {
-		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-		    "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value);
-		(void) zfs_standard_error(hdl, errno, errbuf);
+	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+	    "cannot snapshot %s"), path);
+
+	if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE))
+		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
+
+	(void) strlcpy(fsname, path, sizeof (fsname));
+	cp = strchr(fsname, '@');
+	*cp = '\0';
+	sd.sd_snapname = cp + 1;
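+	/* e.g. "tank/fs@s1" splits into fsname "tank/fs", snapname "s1" */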
+
+	if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM |
+	    ZFS_TYPE_VOLUME)) == NULL) {
+		return (-1);
 	}
 
-	zfs_close(zhp);
+	verify(nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) == 0);
+	if (recursive) {
+		(void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd);
+	} else {
+		fnvlist_add_boolean(sd.sd_nvl, path);
+	}
 
+	ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props);
+	nvlist_free(sd.sd_nvl);
+	zfs_close(zhp);
 	return (ret);
 }
 
@@ -3270,49 +3781,44 @@ typedef struct rollback_data {
 	const char	*cb_target;		/* the snapshot */
 	uint64_t	cb_create;		/* creation time reference */
 	boolean_t	cb_error;
-	boolean_t	cb_dependent;
 	boolean_t	cb_force;
 } rollback_data_t;
 
 static int
+rollback_destroy_dependent(zfs_handle_t *zhp, void *data)
+{
+	rollback_data_t *cbp = data;
+	prop_changelist_t *clp;
+
+	/* We must destroy this clone; first unmount it */
+	clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
+	    cbp->cb_force ? MS_FORCE: 0);
+	if (clp == NULL || changelist_prefix(clp) != 0) {
+		cbp->cb_error = B_TRUE;
+		zfs_close(zhp);
+		return (0);
+	}
+	if (zfs_destroy(zhp, B_FALSE) != 0)
+		cbp->cb_error = B_TRUE;
+	else
+		changelist_remove(clp, zhp->zfs_name);
+	(void) changelist_postfix(clp);
+	changelist_free(clp);
+
+	zfs_close(zhp);
+	return (0);
+}
+
+static int
 rollback_destroy(zfs_handle_t *zhp, void *data)
 {
 	rollback_data_t *cbp = data;
 
-	if (!cbp->cb_dependent) {
-		if (strcmp(zhp->zfs_name, cbp->cb_target) != 0 &&
-		    zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
-		    zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) >
-		    cbp->cb_create) {
-			char *logstr;
-
-			cbp->cb_dependent = B_TRUE;
-			cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE,
-			    rollback_destroy, cbp);
-			cbp->cb_dependent = B_FALSE;
-
-			logstr = zhp->zfs_hdl->libzfs_log_str;
-			zhp->zfs_hdl->libzfs_log_str = NULL;
-			cbp->cb_error |= zfs_destroy(zhp, B_FALSE);
-			zhp->zfs_hdl->libzfs_log_str = logstr;
-		}
-	} else {
-		/* We must destroy this clone; first unmount it */
-		prop_changelist_t *clp;
+	if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) {
+		cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE,
+		    rollback_destroy_dependent, cbp);
 
-		clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
-		    cbp->cb_force ? MS_FORCE: 0);
-		if (clp == NULL || changelist_prefix(clp) != 0) {
-			cbp->cb_error = B_TRUE;
-			zfs_close(zhp);
-			return (0);
-		}
-		if (zfs_destroy(zhp, B_FALSE) != 0)
-			cbp->cb_error = B_TRUE;
-		else
-			changelist_remove(clp, zhp->zfs_name);
-		(void) changelist_postfix(clp);
-		changelist_free(clp);
+		cbp->cb_error |= zfs_destroy(zhp, B_FALSE);
 	}
 
 	zfs_close(zhp);
@@ -3323,29 +3829,29 @@ rollback_destroy(zfs_handle_t *zhp, void
  * Given a dataset, rollback to a specific snapshot, discarding any
  * data changes since then and making it the active dataset.
  *
- * Any snapshots more recent than the target are destroyed, along with
- * their dependents.
+ * Any snapshots and bookmarks more recent than the target are
+ * destroyed, along with their dependents (i.e. clones).
  */
 int
 zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force)
 {
 	rollback_data_t cb = { 0 };
 	int err;
-	zfs_cmd_t zc = { 0 };
 	boolean_t restore_resv = 0;
-	uint64_t old_volsize, new_volsize;
+	uint64_t old_volsize = 0, new_volsize;
 	zfs_prop_t resv_prop;
 
 	assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM ||
 	    zhp->zfs_type == ZFS_TYPE_VOLUME);
 
 	/*
-	 * Destroy all recent snapshots and its dependends.
+	 * Destroy all recent snapshots and their dependents.
 	 */
 	cb.cb_force = force;
 	cb.cb_target = snap->zfs_name;
 	cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG);
-	(void) zfs_iter_children(zhp, rollback_destroy, &cb);
+	(void) zfs_iter_snapshots(zhp, B_FALSE, rollback_destroy, &cb);
+	(void) zfs_iter_bookmarks(zhp, rollback_destroy, &cb);
 
 	if (cb.cb_error)
 		return (-1);
@@ -3363,22 +3869,15 @@ zfs_rollback(zfs_handle_t *zhp, zfs_hand
 		    (old_volsize == zfs_prop_get_int(zhp, resv_prop));
 	}
 
-	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-
-	if (ZFS_IS_VOLUME(zhp))
-		zc.zc_objset_type = DMU_OST_ZVOL;
-	else
-		zc.zc_objset_type = DMU_OST_ZFS;
-
 	/*
-	 * We rely on zfs_iter_children() to verify that there are no
-	 * newer snapshots for the given dataset.  Therefore, we can
-	 * simply pass the name on to the ioctl() call.  There is still
+	 * We rely on the zfs_iter_snapshots() pass above to verify that
+	 * there are no newer snapshots for the given dataset.  Therefore,
+	 * we can simply pass the name on to lzc_rollback().  There is still
 	 * an unlikely race condition where the user has taken a
 	 * snapshot since we verified that this was the most recent.
-	 *
 	 */
-	if ((err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_ROLLBACK, &zc)) != 0) {
+	err = lzc_rollback(zhp->zfs_name, NULL, 0);
+	if (err != 0) {
 		(void) zfs_standard_error_fmt(zhp->zfs_hdl, errno,
 		    dgettext(TEXT_DOMAIN, "cannot rollback '%s'"),
 		    zhp->zfs_name);
@@ -3405,54 +3904,20 @@ zfs_rollback(zfs_handle_t *zhp, zfs_hand
 }
 
 /*
- * Iterate over all dependents for a given dataset.  This includes both
- * hierarchical dependents (children) and data dependents (snapshots and
- * clones).  The bulk of the processing occurs in get_dependents() in
- * libzfs_graph.c.
- */
-int
-zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion,
-    zfs_iter_f func, void *data)
-{
-	char **dependents;
-	size_t count;
-	int i;
-	zfs_handle_t *child;
-	int ret = 0;
-
-	if (get_dependents(zhp->zfs_hdl, allowrecursion, zhp->zfs_name,
-	    &dependents, &count) != 0)
-		return (-1);
-
-	for (i = 0; i < count; i++) {
-		if ((child = make_dataset_handle(zhp->zfs_hdl,
-		    dependents[i])) == NULL)
-			continue;
-
-		if ((ret = func(child, data)) != 0)
-			break;
-	}
-
-	for (i = 0; i < count; i++)
-		free(dependents[i]);
-	free(dependents);
-
-	return (ret);
-}
-
-/*
  * Renames the given dataset.
  */
 int
-zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive)
+zfs_rename(zfs_handle_t *zhp, const char *source, const char *target,
+    renameflags_t flags)
 {
-	int ret;
+	int ret = 0;
 	zfs_cmd_t zc = { 0 };
 	char *delim;
 	prop_changelist_t *cl = NULL;
 	zfs_handle_t *zhrp = NULL;
 	char *parentname = NULL;
-	char parent[ZFS_MAXNAMELEN];
+	char parent[ZFS_MAX_DATASET_NAME_LEN];
+	char property[ZFS_MAXPROPLEN];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	char errbuf[1024];
 
@@ -3463,6 +3928,18 @@ zfs_rename(zfs_handle_t *zhp, const char
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot rename to '%s'"), target);
 
+	if (source != NULL) {
+		/*
+		 * This is a recursive snapshot rename; put the snapshot
+		 * name (which might not exist yet) into zfs_name.
+		 */
+		assert(flags.recurse);
+
+		(void) strlcat(zhp->zfs_name, "@", sizeof(zhp->zfs_name));
+		(void) strlcat(zhp->zfs_name, source, sizeof(zhp->zfs_name));
+		zhp->zfs_type = ZFS_TYPE_SNAPSHOT;
+	}
+
 	/*
 	 * Make sure the target name is valid
 	 */
@@ -3499,7 +3976,7 @@ zfs_rename(zfs_handle_t *zhp, const char
 		if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE))
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 	} else {
-		if (recursive) {
+		if (flags.recurse) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "recursive rename must be a snapshot"));
 			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
@@ -3540,7 +4017,19 @@ zfs_rename(zfs_handle_t *zhp, const char
 		return (zfs_error(hdl, EZFS_ZONED, errbuf));
 	}
 
-	if (recursive) {
+	/*
+	 * Avoid unmounting file systems with mountpoint property set to
+	 * 'legacy' or 'none' even if -u option is not given.
+	 */
+	if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM &&
+	    !flags.recurse && !flags.nounmount &&
+	    zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property,
+	    sizeof (property), NULL, NULL, 0, B_FALSE) == 0 &&
+	    (strcmp(property, "legacy") == 0 ||
+	     strcmp(property, "none") == 0)) {
+		flags.nounmount = B_TRUE;
+	}
+	if (flags.recurse) {
 
 		parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name);
 		if (parentname == NULL) {
@@ -3554,16 +4043,19 @@ zfs_rename(zfs_handle_t *zhp, const char
 			ret = -1;
 			goto error;
 		}
-
-	} else {
-		if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0)) == NULL)
+	} else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) {
+		if ((cl = changelist_gather(zhp, ZFS_PROP_NAME,
+		    flags.nounmount ? CL_GATHER_DONT_UNMOUNT : 0,
+		    flags.forceunmount ? MS_FORCE : 0)) == NULL) {
 			return (-1);
+		}
 
 		if (changelist_haszonedchild(cl)) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "child dataset with inherited mountpoint is used "
 			    "in a non-global zone"));
 			(void) zfs_error(hdl, EZFS_ZONED, errbuf);
+			ret = -1;
 			goto error;
 		}
 
@@ -3579,7 +4071,9 @@ zfs_rename(zfs_handle_t *zhp, const char
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 	(void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value));
 
-	zc.zc_cookie = recursive;
+	zc.zc_cookie = flags.recurse ? 1 : 0;
+	if (flags.nounmount)
+		zc.zc_cookie |= 2;
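+	/* zc_cookie: bit 0 = recursive rename, bit 1 = don't unmount */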
 
 	if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) {
 		/*
@@ -3589,7 +4083,7 @@ zfs_rename(zfs_handle_t *zhp, const char
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 		    "cannot rename '%s'"), zc.zc_name);
 
-		if (recursive && errno == EEXIST) {
+		if (flags.recurse && errno == EEXIST) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "a child dataset already has a snapshot "
 			    "with the new name"));
@@ -3602,23 +4096,23 @@ zfs_rename(zfs_handle_t *zhp, const char
 		 * On failure, we still want to remount any filesystems that
 		 * were previously mounted, so we don't alter the system state.
 		 */
-		if (!recursive)
+		if (cl != NULL)
 			(void) changelist_postfix(cl);
 	} else {
-		if (!recursive) {
+		if (cl != NULL) {
 			changelist_rename(cl, zfs_get_name(zhp), target);
 			ret = changelist_postfix(cl);
 		}
 	}
 
 error:
-	if (parentname) {
+	if (parentname != NULL) {
 		free(parentname);
 	}
-	if (zhrp) {
+	if (zhrp != NULL) {
 		zfs_close(zhrp);
 	}
-	if (cl) {
+	if (cl != NULL) {
 		changelist_free(cl);
 	}
 	return (ret);
@@ -3653,7 +4147,8 @@ zfs_get_recvd_props(zfs_handle_t *zhp)
  *        of the RECEIVED column.
  */
 int
-zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received)
+zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received,
+    boolean_t literal)
 {
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zprop_list_t *entry;
@@ -3715,18 +4210,18 @@ zfs_expand_proplist(zfs_handle_t *zhp, z
 	 * Now go through and check the width of any non-fixed columns
 	 */
 	for (entry = *plp; entry != NULL; entry = entry->pl_next) {
-		if (entry->pl_fixed)
+		if (entry->pl_fixed && !literal)
 			continue;
 
 		if (entry->pl_prop != ZPROP_INVAL) {
 			if (zfs_prop_get(zhp, entry->pl_prop,
-			    buf, sizeof (buf), NULL, NULL, 0, B_FALSE) == 0) {
+			    buf, sizeof (buf), NULL, NULL, 0, literal) == 0) {
 				if (strlen(buf) > entry->pl_width)
 					entry->pl_width = strlen(buf);
 			}
 			if (received && zfs_prop_get_recvd(zhp,
 			    zfs_prop_to_name(entry->pl_prop),
-			    buf, sizeof (buf), B_FALSE) == 0)
+			    buf, sizeof (buf), literal) == 0)
 				if (strlen(buf) > entry->pl_recvd_width)
 					entry->pl_recvd_width = strlen(buf);
 		} else {
@@ -3739,7 +4234,7 @@ zfs_expand_proplist(zfs_handle_t *zhp, z
 			}
 			if (received && zfs_prop_get_recvd(zhp,
 			    entry->pl_user_prop,
-			    buf, sizeof (buf), B_FALSE) == 0)
+			    buf, sizeof (buf), literal) == 0)
 				if (strlen(buf) > entry->pl_recvd_width)
 					entry->pl_recvd_width = strlen(buf);
 		}
@@ -3749,52 +4244,6 @@ zfs_expand_proplist(zfs_handle_t *zhp, z
 }
 
 int
-zfs_iscsi_perm_check(libzfs_handle_t *hdl, char *dataset, ucred_t *cred)
-{
-	zfs_cmd_t zc = { 0 };
-	nvlist_t *nvp;
-	gid_t gid;
-	uid_t uid;
-	const gid_t *groups;
-	int group_cnt;
-	int error;
-
-	if (nvlist_alloc(&nvp, NV_UNIQUE_NAME, 0) != 0)
-		return (no_memory(hdl));
-
-	uid = ucred_geteuid(cred);
-	gid = ucred_getegid(cred);
-	group_cnt = ucred_getgroups(cred, &groups);
-
-	if (uid == (uid_t)-1 || gid == (uid_t)-1 || group_cnt == (uid_t)-1)
-		return (1);
-
-	if (nvlist_add_uint32(nvp, ZFS_DELEG_PERM_UID, uid) != 0) {
-		nvlist_free(nvp);
-		return (1);
-	}
-
-	if (nvlist_add_uint32(nvp, ZFS_DELEG_PERM_GID, gid) != 0) {
-		nvlist_free(nvp);
-		return (1);
-	}
-
-	if (nvlist_add_uint32_array(nvp,
-	    ZFS_DELEG_PERM_GROUPS, (uint32_t *)groups, group_cnt) != 0) {
-		nvlist_free(nvp);
-		return (1);
-	}
-	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
-
-	if (zcmd_write_src_nvlist(hdl, &zc, nvp))
-		return (-1);
-
-	error = ioctl(hdl->libzfs_fd, ZFS_IOC_ISCSI_PERM_CHECK, &zc);
-	nvlist_free(nvp);
-	return (error);
-}
-
-int
 zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path,
     char *resource, void *export, void *sharetab,
     int sharemax, zfs_share_op_t operation)
@@ -3845,6 +4294,7 @@ zfs_prune_proplist(zfs_handle_t *zhp, ui
 	}
 }
 
+#ifdef illumos
 static int
 zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path,
     zfs_smb_acl_op_t cmd, char *resource1, char *resource2)
@@ -3860,7 +4310,7 @@ zfs_smb_acl_mgmt(libzfs_handle_t *hdl, c
 	if (cmd == ZFS_SMB_ACL_RENAME) {
 		if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) {
 			(void) no_memory(hdl);
-			return 0;
+			return (0);
 		}
 	}
 
@@ -3891,8 +4341,7 @@ zfs_smb_acl_mgmt(libzfs_handle_t *hdl, c
 		return (-1);
 	}
 	error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc);
-	if (nvlist)
-		nvlist_free(nvlist);
+	nvlist_free(nvlist);
 	return (error);
 }
 
@@ -3926,68 +4375,151 @@ zfs_smb_acl_rename(libzfs_handle_t *hdl,
 	return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME,
 	    oldname, newname));
 }
+#endif	/* illumos */
 
 int
 zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type,
     zfs_userspace_cb_t func, void *arg)
 {
 	zfs_cmd_t zc = { 0 };
-	int error;
 	zfs_useracct_t buf[100];
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	int ret;
 
-	(void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
 
 	zc.zc_objset_type = type;
 	zc.zc_nvlist_dst = (uintptr_t)buf;
 
-	/* CONSTCOND */
-	while (1) {
+	for (;;) {
 		zfs_useracct_t *zua = buf;
 
 		zc.zc_nvlist_dst_size = sizeof (buf);
-		error = ioctl(zhp->zfs_hdl->libzfs_fd,
-		    ZFS_IOC_USERSPACE_MANY, &zc);
-		if (error || zc.zc_nvlist_dst_size == 0)
+		if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) {
+			char errbuf[1024];
+
+			(void) snprintf(errbuf, sizeof (errbuf),
+			    dgettext(TEXT_DOMAIN,
+			    "cannot get used/quota for %s"), zc.zc_name);
+			return (zfs_standard_error_fmt(hdl, errno, errbuf));
+		}
+		if (zc.zc_nvlist_dst_size == 0)
 			break;
 
 		while (zc.zc_nvlist_dst_size > 0) {
-			error = func(arg, zua->zu_domain, zua->zu_rid,
-			    zua->zu_space);
-			if (error != 0)
-				return (error);
+			if ((ret = func(arg, zua->zu_domain, zua->zu_rid,
+			    zua->zu_space)) != 0)
+				return (ret);
 			zua++;
 			zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t);
 		}
 	}
 
-	return (error);
+	return (0);
+}
+
+struct holdarg {
+	nvlist_t *nvl;
+	const char *snapname;
+	const char *tag;
+	boolean_t recursive;
+	int error;
+};
+
+static int
+zfs_hold_one(zfs_handle_t *zhp, void *arg)
+{
+	struct holdarg *ha = arg;
+	char name[ZFS_MAX_DATASET_NAME_LEN];
+	int rv = 0;
+
+	(void) snprintf(name, sizeof (name),
+	    "%s@%s", zhp->zfs_name, ha->snapname);
+
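+	/* record only snapshots that exist; ha->nvl maps name -> tag */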
+	if (lzc_exists(name))
+		fnvlist_add_string(ha->nvl, name, ha->tag);
+
+	if (ha->recursive)
+		rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha);
+	zfs_close(zhp);
+	return (rv);
 }
 
 int
 zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag,
-    boolean_t recursive, boolean_t temphold, boolean_t enoent_ok)
+    boolean_t recursive, int cleanup_fd)
 {
-	zfs_cmd_t zc = { 0 };
+	int ret;
+	struct holdarg ha;
+
+	ha.nvl = fnvlist_alloc();
+	ha.snapname = snapname;
+	ha.tag = tag;
+	ha.recursive = recursive;
+	(void) zfs_hold_one(zfs_handle_dup(zhp), &ha);
+
+	if (nvlist_empty(ha.nvl)) {
+		char errbuf[1024];
+
+		fnvlist_free(ha.nvl);
+		ret = ENOENT;
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "cannot hold snapshot '%s@%s'"),
+		    zhp->zfs_name, snapname);
+		(void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf);
+		return (ret);
+	}
+
+	ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl);
+	fnvlist_free(ha.nvl);
+
+	return (ret);
+}
+
+int
+zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds)
+{
+	int ret;
+	nvlist_t *errors;
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	char errbuf[1024];
+	nvpair_t *elem;
 
-	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-	(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
-	if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string))
-	    >= sizeof (zc.zc_string))
-		return (zfs_error(hdl, EZFS_TAGTOOLONG, tag));
-	zc.zc_cookie = recursive;
-	zc.zc_temphold = temphold;
+	errors = NULL;
+	ret = lzc_hold(holds, cleanup_fd, &errors);
 
-	if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) {
-		char errbuf[ZFS_MAXNAMELEN+32];
+	if (ret == 0) {
+		/* There may be errors even in the success case. */
+		fnvlist_free(errors);
+		return (0);
+	}
 
-		/*
-		 * if it was recursive, the one that actually failed will be in
-		 * zc.zc_name.
-		 */
-		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-		    "cannot hold '%s@%s'"), zc.zc_name, snapname);
-		switch (errno) {
+	if (nvlist_empty(errors)) {
+		/* no hold-specific errors */
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot hold"));
+		switch (ret) {
+		case ENOTSUP:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "pool must be upgraded"));
+			(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
+			break;
+		case EINVAL:
+			(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
+			break;
+		default:
+			(void) zfs_standard_error(hdl, ret, errbuf);
+		}
+	}
+
+	for (elem = nvlist_next_nvpair(errors, NULL);
+	    elem != NULL;
+	    elem = nvlist_next_nvpair(errors, elem)) {
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "cannot hold snapshot '%s'"), nvpair_name(elem));
+		switch (fnvpair_value_int32(elem)) {
 		case E2BIG:
 			/*
 			 * Temporary tags wind up having the ds object id
@@ -3995,173 +4527,294 @@ zfs_hold(zfs_handle_t *zhp, const char *
 			 * above, it's still possible for the tag to wind
 			 * up being slightly too long.
 			 */
-			return (zfs_error(hdl, EZFS_TAGTOOLONG, errbuf));
-		case ENOTSUP:
-			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-			    "pool must be upgraded"));
-			return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
+			(void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf);
+			break;
 		case EINVAL:
-			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+			(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
+			break;
 		case EEXIST:
-			return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf));
-		case ENOENT:
-			if (enoent_ok)
-				return (0);
-			/* FALLTHROUGH */
+			(void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf);
+			break;
 		default:
-			return (zfs_standard_error_fmt(hdl, errno, errbuf));
+			(void) zfs_standard_error(hdl,
+			    fnvpair_value_int32(elem), errbuf);
 		}
 	}
 
-	return (0);
+	fnvlist_free(errors);
+	return (ret);
 }
 
-struct hold_range_arg {
-	zfs_handle_t	*origin;
-	const char	*fromsnap;
-	const char	*tosnap;
-	char		lastsnapheld[ZFS_MAXNAMELEN];
-	const char	*tag;
-	boolean_t	temphold;
-	boolean_t	seento;
-	boolean_t	seenfrom;
-	boolean_t	holding;
-	boolean_t	recursive;
-};
-
 static int
-zfs_hold_range_one(zfs_handle_t *zhp, void *arg)
+zfs_release_one(zfs_handle_t *zhp, void *arg)
 {
-	struct hold_range_arg *hra = arg;
-	const char *thissnap;
-	int error;
+	struct holdarg *ha = arg;
+	char name[ZFS_MAX_DATASET_NAME_LEN];
+	int rv = 0;
+	nvlist_t *existing_holds;
 
-	thissnap = strchr(zfs_get_name(zhp), '@') + 1;
+	(void) snprintf(name, sizeof (name),
+	    "%s@%s", zhp->zfs_name, ha->snapname);
 
-	if (hra->fromsnap && !hra->seenfrom &&
-	    strcmp(hra->fromsnap, thissnap) == 0)
-		hra->seenfrom = B_TRUE;
+	if (lzc_get_holds(name, &existing_holds) != 0) {
+		ha->error = ENOENT;
+	} else if (!nvlist_exists(existing_holds, ha->tag)) {
+		ha->error = ESRCH;
+	} else {
+		nvlist_t *torelease = fnvlist_alloc();
+		fnvlist_add_boolean(torelease, ha->tag);
+		fnvlist_add_nvlist(ha->nvl, name, torelease);
+		fnvlist_free(torelease);
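+		/* ha->nvl now maps the snapshot name -> { tag } to release */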
+	}
 
-	/* snap is older or newer than the desired range, ignore it */
-	if (hra->seento || !hra->seenfrom) {
-		zfs_close(zhp);
+	if (ha->recursive)
+		rv = zfs_iter_filesystems(zhp, zfs_release_one, ha);
+	zfs_close(zhp);
+	return (rv);
+}
+
+int
+zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
+    boolean_t recursive)
+{
+	int ret;
+	struct holdarg ha;
+	nvlist_t *errors = NULL;
+	nvpair_t *elem;
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	char errbuf[1024];
+
+	ha.nvl = fnvlist_alloc();
+	ha.snapname = snapname;
+	ha.tag = tag;
+	ha.recursive = recursive;
+	ha.error = 0;
+	(void) zfs_release_one(zfs_handle_dup(zhp), &ha);
+
+	if (nvlist_empty(ha.nvl)) {
+		fnvlist_free(ha.nvl);
+		ret = ha.error;
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "cannot release hold from snapshot '%s@%s'"),
+		    zhp->zfs_name, snapname);
+		if (ret == ESRCH) {
+			(void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
+		} else {
+			(void) zfs_standard_error(hdl, ret, errbuf);
+		}
+		return (ret);
+	}
+
+	ret = lzc_release(ha.nvl, &errors);
+	fnvlist_free(ha.nvl);
+
+	if (ret == 0) {
+		/* There may be errors even in the success case. */
+		fnvlist_free(errors);
 		return (0);
 	}
 
-	if (hra->holding) {
-		/* We could be racing with destroy, so ignore ENOENT. */
-		error = zfs_hold(hra->origin, thissnap, hra->tag,
-		    hra->recursive, hra->temphold, B_TRUE);
-		if (error == 0) {
-			(void) strlcpy(hra->lastsnapheld, zfs_get_name(zhp),
-			    sizeof (hra->lastsnapheld));
+	if (nvlist_empty(errors)) {
+		/* no hold-specific errors */
+		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+		    "cannot release"));
+		switch (ret) {
+		case ENOTSUP:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "pool must be upgraded"));
+			(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
+			break;
+		default:
+			(void) zfs_standard_error_fmt(hdl, ret, errbuf);
 		}
-	} else {
-		error = zfs_release(hra->origin, thissnap, hra->tag,
-		    hra->recursive);
 	}
 
-	if (!hra->seento && strcmp(hra->tosnap, thissnap) == 0)
-		hra->seento = B_TRUE;
+	for (elem = nvlist_next_nvpair(errors, NULL);
+	    elem != NULL;
+	    elem = nvlist_next_nvpair(errors, elem)) {
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "cannot release hold from snapshot '%s'"),
+		    nvpair_name(elem));
+		switch (fnvpair_value_int32(elem)) {
+		case ESRCH:
+			(void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf);
+			break;
+		case EINVAL:
+			(void) zfs_error(hdl, EZFS_BADTYPE, errbuf);
+			break;
+		default:
+			(void) zfs_standard_error_fmt(hdl,
+			    fnvpair_value_int32(elem), errbuf);
+		}
+	}
 
-	zfs_close(zhp);
-	return (error);
+	fnvlist_free(errors);
+	return (ret);
 }
 
-/*
- * Add a user hold on the set of snapshots starting with fromsnap up to
- * and including tosnap. If we're unable to to acquire a particular hold,
- * undo any holds up to that point.
- */
 int
-zfs_hold_range(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
-    const char *tag, boolean_t recursive, boolean_t temphold)
+zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl)
 {
-	struct hold_range_arg arg = { 0 };
-	int error;
+	zfs_cmd_t zc = { 0 };
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	int nvsz = 2048;
+	void *nvbuf;
+	int err = 0;
+	char errbuf[1024];
 
-	arg.origin = zhp;
-	arg.fromsnap = fromsnap;
-	arg.tosnap = tosnap;
-	arg.tag = tag;
-	arg.temphold = temphold;
-	arg.holding = B_TRUE;
-	arg.recursive = recursive;
-	arg.seenfrom = (fromsnap == NULL);
+	assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
+	    zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
 
-	error = zfs_iter_snapshots_sorted(zhp, zfs_hold_range_one, &arg);
+tryagain:
 
-	/*
-	 * Make sure we either hold the entire range or none.
-	 */
-	if (error && arg.lastsnapheld[0] != '\0') {
-		(void) zfs_release_range(zhp, fromsnap,
-		    (const char *)arg.lastsnapheld, tag, recursive);
+	nvbuf = malloc(nvsz);
+	if (nvbuf == NULL) {
+		err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno)));
+		goto out;
 	}
-	return (error);
+
+	zc.zc_nvlist_dst_size = nvsz;
+	zc.zc_nvlist_dst = (uintptr_t)nvbuf;
+
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_GET_FSACL, &zc) != 0) {
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"),
+		    zc.zc_name);
+		switch (errno) {
+		case ENOMEM:
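+			/* dst buffer too small; kernel reported needed size */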
+			free(nvbuf);
+			nvsz = zc.zc_nvlist_dst_size;
+			goto tryagain;
+
+		case ENOTSUP:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "pool must be upgraded"));
+			err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
+			break;
+		case EINVAL:
+			err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
+			break;
+		case ENOENT:
+			err = zfs_error(hdl, EZFS_NOENT, errbuf);
+			break;
+		default:
+			err = zfs_standard_error_fmt(hdl, errno, errbuf);
+			break;
+		}
+	} else {
+		/* success */
+		int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0);
+		if (rc) {
+			(void) snprintf(errbuf, sizeof (errbuf), dgettext(
+			    TEXT_DOMAIN, "cannot get permissions on '%s'"),
+			    zc.zc_name);
+			err = zfs_standard_error_fmt(hdl, rc, errbuf);
+		}
+	}
+
+	free(nvbuf);
+out:
+	return (err);
 }
 
 int
-zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag,
-    boolean_t recursive)
+zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl)
 {
 	zfs_cmd_t zc = { 0 };
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	char *nvbuf;
+	char errbuf[1024];
+	size_t nvsz;
+	int err;
 
-	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-	(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
-	if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string))
-	    >= sizeof (zc.zc_string))
-		return (zfs_error(hdl, EZFS_TAGTOOLONG, tag));
-	zc.zc_cookie = recursive;
+	assert(zhp->zfs_type == ZFS_TYPE_VOLUME ||
+	    zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
 
-	if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) {
-		char errbuf[ZFS_MAXNAMELEN+32];
+	err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE);
+	assert(err == 0);
 
-		/*
-		 * if it was recursive, the one that actually failed will be in
-		 * zc.zc_name.
-		 */
-		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-		    "cannot release '%s' from '%s@%s'"), tag, zc.zc_name,
-		    snapname);
+	nvbuf = malloc(nvsz);
+
+	err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0);
+	assert(err == 0);
+
+	zc.zc_nvlist_src_size = nvsz;
+	zc.zc_nvlist_src = (uintptr_t)nvbuf;
+	zc.zc_perm_action = un;
+
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+
+	if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) {
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"),
+		    zc.zc_name);
 		switch (errno) {
-		case ESRCH:
-			return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf));
 		case ENOTSUP:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "pool must be upgraded"));
-			return (zfs_error(hdl, EZFS_BADVERSION, errbuf));
+			err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
+			break;
 		case EINVAL:
-			return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+			err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
+			break;
+		case ENOENT:
+			err = zfs_error(hdl, EZFS_NOENT, errbuf);
+			break;
 		default:
-			return (zfs_standard_error_fmt(hdl, errno, errbuf));
+			err = zfs_standard_error_fmt(hdl, errno, errbuf);
+			break;
 		}
 	}
 
-	return (0);
+	free(nvbuf);
+
+	return (err);
 }
 
-/*
- * Release a user hold from the set of snapshots starting with fromsnap
- * up to and including tosnap.
- */
 int
-zfs_release_range(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
-    const char *tag, boolean_t recursive)
+zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
 {
-	struct hold_range_arg arg = { 0 };
+	int err;
+	char errbuf[1024];
+
+	err = lzc_get_holds(zhp->zfs_name, nvl);
 
-	arg.origin = zhp;
-	arg.fromsnap = fromsnap;
-	arg.tosnap = tosnap;
-	arg.tag = tag;
-	arg.recursive = recursive;
-	arg.seenfrom = (fromsnap == NULL);
+	if (err != 0) {
+		libzfs_handle_t *hdl = zhp->zfs_hdl;
 
-	return (zfs_iter_snapshots_sorted(zhp, zfs_hold_range_one, &arg));
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"),
+		    zhp->zfs_name);
+		switch (err) {
+		case ENOTSUP:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "pool must be upgraded"));
+			err = zfs_error(hdl, EZFS_BADVERSION, errbuf);
+			break;
+		case EINVAL:
+			err = zfs_error(hdl, EZFS_BADTYPE, errbuf);
+			break;
+		case ENOENT:
+			err = zfs_error(hdl, EZFS_NOENT, errbuf);
+			break;
+		default:
+			err = zfs_standard_error_fmt(hdl, errno, errbuf);
+			break;
+		}
+	}
+
+	return (err);
 }
 
+/*
+ * Convert the zvol's volume size to an appropriate reservation.
+ * Note: If this routine is updated, it is necessary to update the ZFS test
+ * suite's shell version in reservation.kshlib.
+ */
 uint64_t
 zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
 {
@@ -4199,3 +4852,48 @@ zvol_volsize_to_reservation(uint64_t vol
 	volsize += numdb;
 	return (volsize);
 }
+
+#ifdef __FreeBSD__
+/*
+ * Attach/detach the given filesystem to/from the given jail.
+ */
+int
+zfs_jail(zfs_handle_t *zhp, int jailid, int attach)
+{
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	zfs_cmd_t zc = { 0 };
+	char errbuf[1024];
+	unsigned long cmd;
+	int ret;
+
+	if (attach) {
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot jail '%s'"), zhp->zfs_name);
+	} else {
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "cannot unjail '%s'"), zhp->zfs_name);
+	}
+
+	switch (zhp->zfs_type) {
+	case ZFS_TYPE_VOLUME:
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "volumes can not be jailed"));
+		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+	case ZFS_TYPE_SNAPSHOT:
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "snapshots can not be jailed"));
+		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
+	}
+	assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM);
+
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+	zc.zc_objset_type = DMU_OST_ZFS;
+	zc.zc_jailid = jailid;
+
+	cmd = attach ? ZFS_IOC_JAIL : ZFS_IOC_UNJAIL;
+	if ((ret = ioctl(hdl->libzfs_fd, cmd, &zc)) != 0)
+		zfs_standard_error(hdl, errno, errbuf);
+
+	return (ret);
+}
+#endif	/* __FreeBSD__ */
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_diff.c
===================================================================
RCS file: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_diff.c
diff -N src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_diff.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_diff.c	10 Oct 2016 11:14:25 -0000
@@ -0,0 +1,842 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
+ */
+
+/*
+ * zfs diff support
+ */
+#include <ctype.h>
+#include <errno.h>
+#include <libintl.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <sys/zfs_ioctl.h>
+#include <libzfs.h>
+#include "libzfs_impl.h"
+
+#define	ZDIFF_SNAPDIR		"/.zfs/snapshot/"
+#define	ZDIFF_SHARESDIR		"/.zfs/shares/"
+#define	ZDIFF_PREFIX		"zfs-diff-%d"
+
+#define	ZDIFF_ADDED	'+'
+#define	ZDIFF_MODIFIED	'M'
+#define	ZDIFF_REMOVED	'-'
+#define	ZDIFF_RENAMED	'R'
+
+static boolean_t
+do_name_cmp(const char *fpath, const char *tpath)
+{
+	char *fname, *tname;
+	fname = strrchr(fpath, '/') + 1;
+	tname = strrchr(tpath, '/') + 1;
+	return (strcmp(fname, tname) == 0);
+}
+
+typedef struct differ_info {
+	zfs_handle_t *zhp;
+	char *fromsnap;
+	char *frommnt;
+	char *tosnap;
+	char *tomnt;
+	char *ds;
+	char *dsmnt;
+	char *tmpsnap;
+	char errbuf[1024];
+	boolean_t isclone;
+	boolean_t scripted;
+	boolean_t classify;
+	boolean_t timestamped;
+	uint64_t shares;
+	int zerr;
+	int cleanupfd;
+	int outputfd;
+	int datafd;
+} differ_info_t;
+
+/*
+ * Given a {dsname, object id}, get the object path
+ */
+static int
+get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj,
+    char *pn, int maxlen, zfs_stat_t *sb)
+{
+	zfs_cmd_t zc = { 0 };
+	int error;
+
+	(void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name));
+	zc.zc_obj = obj;
+
+	errno = 0;
+	error = ioctl(di->zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_STATS, &zc);
+	di->zerr = errno;
+
+	/* we can get stats even if we failed to get a path */
+	(void) memcpy(sb, &zc.zc_stat, sizeof (zfs_stat_t));
+	if (error == 0) {
+		ASSERT(di->zerr == 0);
+		(void) strlcpy(pn, zc.zc_value, maxlen);
+		return (0);
+	}
+
+	if (di->zerr == EPERM) {
+		(void) snprintf(di->errbuf, sizeof (di->errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "The sys_config privilege or diff delegated permission "
+		    "is needed\nto discover path names"));
+		return (-1);
+	} else {
+		(void) snprintf(di->errbuf, sizeof (di->errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "Unable to determine path or stats for "
+		    "object %lld in %s"), obj, dsname);
+		return (-1);
+	}
+}
+
+/*
+ * stream_bytes
+ *
+ * Prints a file name one character at a time.  If the character is
+ * not in the range of what we consider "printable" ASCII, display it
+ * as an escaped 3-digit octal value.  ASCII values less than a space
+ * are all control characters, and we declare DELete (the last 7-bit
+ * ASCII character) the upper end.  We choose to treat all 8-bit
+ * characters as not printable for this application.
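+ * For example, a newline in a file name is emitted as "\012".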
+ */
+static void
+stream_bytes(FILE *fp, const char *string)
+{
+	char c;
+
+	while ((c = *string++) != '\0') {
+		if (c > ' ' && c != '\\' && c < '\177') {
+			(void) fprintf(fp, "%c", c);
+		} else {
+			(void) fprintf(fp, "\\%03o", (uint8_t)c);
+		}
+	}
+}
+
+static void
+print_what(FILE *fp, mode_t what)
+{
+	char symbol;
+
+	switch (what & S_IFMT) {
+	case S_IFBLK:
+		symbol = 'B';
+		break;
+	case S_IFCHR:
+		symbol = 'C';
+		break;
+	case S_IFDIR:
+		symbol = '/';
+		break;
+#ifdef S_IFDOOR
+	case S_IFDOOR:
+		symbol = '>';
+		break;
+#endif
+	case S_IFIFO:
+		symbol = '|';
+		break;
+	case S_IFLNK:
+		symbol = '@';
+		break;
+#ifdef S_IFPORT
+	case S_IFPORT:
+		symbol = 'P';
+		break;
+#endif
+	case S_IFSOCK:
+		symbol = '=';
+		break;
+	case S_IFREG:
+		symbol = 'F';
+		break;
+	default:
+		symbol = '?';
+		break;
+	}
+	(void) fprintf(fp, "%c", symbol);
+}
+
+static void
+print_cmn(FILE *fp, differ_info_t *di, const char *file)
+{
+	stream_bytes(fp, di->dsmnt);
+	stream_bytes(fp, file);
+}
+
+static void
+print_rename(FILE *fp, differ_info_t *di, const char *old, const char *new,
+    zfs_stat_t *isb)
+{
+	if (di->timestamped)
+		(void) fprintf(fp, "%10lld.%09lld\t",
+		    (longlong_t)isb->zs_ctime[0],
+		    (longlong_t)isb->zs_ctime[1]);
+	(void) fprintf(fp, "%c\t", ZDIFF_RENAMED);
+	if (di->classify) {
+		print_what(fp, isb->zs_mode);
+		(void) fprintf(fp, "\t");
+	}
+	print_cmn(fp, di, old);
+	if (di->scripted)
+		(void) fprintf(fp, "\t");
+	else
+		(void) fprintf(fp, " -> ");
+	print_cmn(fp, di, new);
+	(void) fprintf(fp, "\n");
+}
+
+static void
+print_link_change(FILE *fp, differ_info_t *di, int delta, const char *file,
+    zfs_stat_t *isb)
+{
+	if (di->timestamped)
+		(void) fprintf(fp, "%10lld.%09lld\t",
+		    (longlong_t)isb->zs_ctime[0],
+		    (longlong_t)isb->zs_ctime[1]);
+	(void) fprintf(fp, "%c\t", ZDIFF_MODIFIED);
+	if (di->classify) {
+		print_what(fp, isb->zs_mode);
+		(void) fprintf(fp, "\t");
+	}
+	print_cmn(fp, di, file);
+	(void) fprintf(fp, "\t(%+d)", delta);
+	(void) fprintf(fp, "\n");
+}
+
+static void
+print_file(FILE *fp, differ_info_t *di, char type, const char *file,
+    zfs_stat_t *isb)
+{
+	if (di->timestamped)
+		(void) fprintf(fp, "%10lld.%09lld\t",
+		    (longlong_t)isb->zs_ctime[0],
+		    (longlong_t)isb->zs_ctime[1]);
+	(void) fprintf(fp, "%c\t", type);
+	if (di->classify) {
+		print_what(fp, isb->zs_mode);
+		(void) fprintf(fp, "\t");
+	}
+	print_cmn(fp, di, file);
+	(void) fprintf(fp, "\n");
+}
+
+static int
+write_inuse_diffs_one(FILE *fp, differ_info_t *di, uint64_t dobj)
+{
+	struct zfs_stat fsb, tsb;
+	boolean_t same_name;
+	mode_t fmode, tmode;
+	char fobjname[MAXPATHLEN], tobjname[MAXPATHLEN];
+	int fobjerr, tobjerr;
+	int change;
+
+	if (dobj == di->shares)
+		return (0);
+
+	/*
+	 * Check the from and to snapshots for info on the object. If
+	 * we get ENOENT, then the object just didn't exist in that
+	 * snapshot.  If we get ENOTSUP, then we tried to get
+	 * info on a non-ZPL object, which we don't care about anyway.
+	 */
+	fobjerr = get_stats_for_obj(di, di->fromsnap, dobj, fobjname,
+	    MAXPATHLEN, &fsb);
+	if (fobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP)
+		return (-1);
+
+	tobjerr = get_stats_for_obj(di, di->tosnap, dobj, tobjname,
+	    MAXPATHLEN, &tsb);
+	if (tobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP)
+		return (-1);
+
+	/*
+	 * Unallocated object sharing the same meta dnode block
+	 */
+	if (fobjerr && tobjerr) {
+		ASSERT(di->zerr == ENOENT || di->zerr == ENOTSUP);
+		di->zerr = 0;
+		return (0);
+	}
+
+	di->zerr = 0; /* negate get_stats_for_obj() from side that failed */
+	fmode = fsb.zs_mode & S_IFMT;
+	tmode = tsb.zs_mode & S_IFMT;
+	if (fmode == S_IFDIR || tmode == S_IFDIR || fsb.zs_links == 0 ||
+	    tsb.zs_links == 0)
+		change = 0;
+	else
+		change = tsb.zs_links - fsb.zs_links;
+
+	if (fobjerr) {
+		if (change) {
+			print_link_change(fp, di, change, tobjname, &tsb);
+			return (0);
+		}
+		print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb);
+		return (0);
+	} else if (tobjerr) {
+		if (change) {
+			print_link_change(fp, di, change, fobjname, &fsb);
+			return (0);
+		}
+		print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb);
+		return (0);
+	}
+
+	if (fmode != tmode && fsb.zs_gen == tsb.zs_gen)
+		tsb.zs_gen++;	/* Force a generational difference */
+	same_name = do_name_cmp(fobjname, tobjname);
+
+	/* Simple modification or no change */
+	if (fsb.zs_gen == tsb.zs_gen) {
+		/* No apparent changes.  Could we assert !this?  */
+		if (fsb.zs_ctime[0] == tsb.zs_ctime[0] &&
+		    fsb.zs_ctime[1] == tsb.zs_ctime[1])
+			return (0);
+		if (change) {
+			print_link_change(fp, di, change,
+			    change > 0 ? fobjname : tobjname, &tsb);
+		} else if (same_name) {
+			print_file(fp, di, ZDIFF_MODIFIED, fobjname, &tsb);
+		} else {
+			print_rename(fp, di, fobjname, tobjname, &tsb);
+		}
+		return (0);
+	} else {
+		/* file re-created or object re-used */
+		print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb);
+		print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb);
+		return (0);
+	}
+}
+
+static int
+write_inuse_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr)
+{
+	uint64_t o;
+	int err;
+
+	for (o = dr->ddr_first; o <= dr->ddr_last; o++) {
+		if ((err = write_inuse_diffs_one(fp, di, o)) != 0)
+			return (err);
+	}
+	return (0);
+}
+
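+/*
+ * Report a freed object as removed, using its name and stats from the
+ * from snapshot.  Objects that were already in the delete queue there
+ * (ENOENT with a zero link count) are silently skipped.
+ */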
+static int
+describe_free(FILE *fp, differ_info_t *di, uint64_t object, char *namebuf,
+    int maxlen)
+{
+	struct zfs_stat sb;
+
+	if (get_stats_for_obj(di, di->fromsnap, object, namebuf,
+	    maxlen, &sb) != 0) {
+		/* Let it slide, if in the delete queue on from side */
+		if (di->zerr == ENOENT && sb.zs_links == 0) {
+			di->zerr = 0;
+			return (0);
+		}
+		return (-1);
+	}
+
+	print_file(fp, di, ZDIFF_REMOVED, namebuf, &sb);
+	return (0);
+}
+
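+/*
+ * Process a DDR_FREE record: walk the objects that were allocated in
+ * the from snapshot within [ddr_first, ddr_last] using the
+ * ZFS_IOC_NEXT_OBJ ioctl and describe each one as removed.
+ */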
+static int
+write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr)
+{
+	zfs_cmd_t zc = { 0 };
+	libzfs_handle_t *lhdl = di->zhp->zfs_hdl;
+	char fobjname[MAXPATHLEN];
+
+	(void) strlcpy(zc.zc_name, di->fromsnap, sizeof (zc.zc_name));
+	zc.zc_obj = dr->ddr_first - 1;
+
+	ASSERT(di->zerr == 0);
+
+	while (zc.zc_obj < dr->ddr_last) {
+		int err;
+
+		err = ioctl(lhdl->libzfs_fd, ZFS_IOC_NEXT_OBJ, &zc);
+		if (err == 0) {
+			if (zc.zc_obj == di->shares) {
+				zc.zc_obj++;
+				continue;
+			}
+			if (zc.zc_obj > dr->ddr_last) {
+				break;
+			}
+			err = describe_free(fp, di, zc.zc_obj, fobjname,
+			    MAXPATHLEN);
+			if (err)
+				break;
+		} else if (errno == ESRCH) {
+			break;
+		} else {
+			(void) snprintf(di->errbuf, sizeof (di->errbuf),
+			    dgettext(TEXT_DOMAIN,
+			    "next allocated object (> %lld) find failure"),
+			    zc.zc_obj);
+			di->zerr = errno;
+			break;
+		}
+	}
+	if (di->zerr)
+		return (-1);
+	return (0);
+}
+
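+/*
+ * Thread body: read dmu_diff_record_t records from the pipe fed by
+ * the ZFS_IOC_DIFF ioctl and translate each DDR_FREE/DDR_INUSE range
+ * into formatted output on the caller's descriptor.
+ */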
+static void *
+differ(void *arg)
+{
+	differ_info_t *di = arg;
+	dmu_diff_record_t dr;
+	FILE *ofp;
+	int err = 0;
+
+	if ((ofp = fdopen(di->outputfd, "w")) == NULL) {
+		di->zerr = errno;
+		(void) strerror_r(errno, di->errbuf, sizeof (di->errbuf));
+		(void) close(di->datafd);
+		return ((void *)-1);
+	}
+
+	for (;;) {
+		char *cp = (char *)&dr;
+		int len = sizeof (dr);
+		int rv;
+
+		do {
+			rv = read(di->datafd, cp, len);
+			cp += rv;
+			len -= rv;
+		} while (len > 0 && rv > 0);
+
+		if (rv < 0 || (rv == 0 && len != sizeof (dr))) {
+			di->zerr = EPIPE;
+			break;
+		} else if (rv == 0) {
+			/* end of file at a natural breaking point */
+			break;
+		}
+
+		switch (dr.ddr_type) {
+		case DDR_FREE:
+			err = write_free_diffs(ofp, di, &dr);
+			break;
+		case DDR_INUSE:
+			err = write_inuse_diffs(ofp, di, &dr);
+			break;
+		default:
+			di->zerr = EPIPE;
+			break;
+		}
+
+		if (err || di->zerr)
+			break;
+	}
+
+	(void) fclose(ofp);
+	(void) close(di->datafd);
+	if (err)
+		return ((void *)-1);
+	if (di->zerr) {
+		ASSERT(di->zerr == EINVAL);
+		(void) snprintf(di->errbuf, sizeof (di->errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "Internal error: bad data from diff IOCTL"));
+		return ((void *)-1);
+	}
+	return ((void *)0);
+}
+
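+/*
+ * Record the inode number of the dataset's ZDIFF_SHARESDIR directory
+ * in di->shares so that the diff code can skip it.  On non-illumos
+ * platforms a missing share directory is not an error.
+ */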
+static int
+find_shares_object(differ_info_t *di)
+{
+	char fullpath[MAXPATHLEN];
+	struct stat64 sb = { 0 };
+
+	(void) strlcpy(fullpath, di->dsmnt, MAXPATHLEN);
+	(void) strlcat(fullpath, ZDIFF_SHARESDIR, MAXPATHLEN);
+
+	if (stat64(fullpath, &sb) != 0) {
+#ifdef illumos
+		(void) snprintf(di->errbuf, sizeof (di->errbuf),
+		    dgettext(TEXT_DOMAIN, "Cannot stat %s"), fullpath);
+		return (zfs_error(di->zhp->zfs_hdl, EZFS_DIFF, di->errbuf));
+#else
+		return (0);
+#endif
+	}
+
+	di->shares = (uint64_t)sb.st_ino;
+	return (0);
+}
+
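+/*
+ * Take a temporary "just-in-time" snapshot of the dataset head via
+ * ZFS_IOC_TMP_SNAPSHOT.  The snapshot is tied to di->cleanupfd, which
+ * arranges for it to be destroyed when that descriptor is closed.
+ */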
+static int
+make_temp_snapshot(differ_info_t *di)
+{
+	libzfs_handle_t *hdl = di->zhp->zfs_hdl;
+	zfs_cmd_t zc = { 0 };
+
+	(void) snprintf(zc.zc_value, sizeof (zc.zc_value),
+	    ZDIFF_PREFIX, getpid());
+	(void) strlcpy(zc.zc_name, di->ds, sizeof (zc.zc_name));
+	zc.zc_cleanup_fd = di->cleanupfd;
+
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_TMP_SNAPSHOT, &zc) != 0) {
+		int err = errno;
+		if (err == EPERM) {
+			(void) snprintf(di->errbuf, sizeof (di->errbuf),
+			    dgettext(TEXT_DOMAIN, "The diff delegated "
+			    "permission is needed in order\nto create a "
+			    "just-in-time snapshot for diffing\n"));
+			return (zfs_error(hdl, EZFS_DIFF, di->errbuf));
+		} else {
+			(void) snprintf(di->errbuf, sizeof (di->errbuf),
+			    dgettext(TEXT_DOMAIN, "Cannot create just-in-time "
+			    "snapshot of '%s'"), zc.zc_name);
+			return (zfs_standard_error(hdl, err, di->errbuf));
+		}
+	}
+
+	di->tmpsnap = zfs_strdup(hdl, zc.zc_value);
+	di->tosnap = zfs_asprintf(hdl, "%s@%s", di->ds, di->tmpsnap);
+	return (0);
+}
+
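+/*
+ * Release everything set up by setup_differ_info(): the dataset and
+ * snapshot names, the mountpoint strings, and the cleanup descriptor.
+ */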
+static void
+teardown_differ_info(differ_info_t *di)
+{
+	free(di->ds);
+	free(di->dsmnt);
+	free(di->fromsnap);
+	free(di->frommnt);
+	free(di->tosnap);
+	free(di->tmpsnap);
+	free(di->tomnt);
+	(void) close(di->cleanupfd);
+}
+
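+/*
+ * Parse the snapshot argument(s) into di->ds, di->fromsnap and
+ * di->tosnap, creating a temporary snapshot of the head when no "to"
+ * snapshot name is supplied.
+ */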
+static int
+get_snapshot_names(differ_info_t *di, const char *fromsnap,
+    const char *tosnap)
+{
+	libzfs_handle_t *hdl = di->zhp->zfs_hdl;
+	char *atptrf = NULL;
+	char *atptrt = NULL;
+	int fdslen, fsnlen;
+	int tdslen, tsnlen;
+
+	/*
+	 * Can accept
+	 *    dataset@snap1
+	 *    dataset@snap1 dataset@snap2
+	 *    dataset@snap1 @snap2
+	 *    dataset@snap1 dataset
+	 *    @snap1 dataset@snap2
+	 */
+	if (tosnap == NULL) {
+		/* only a from snapshot given, must be valid */
+		(void) snprintf(di->errbuf, sizeof (di->errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "Badly formed snapshot name %s"), fromsnap);
+
+		if (!zfs_validate_name(hdl, fromsnap, ZFS_TYPE_SNAPSHOT,
+		    B_FALSE)) {
+			return (zfs_error(hdl, EZFS_INVALIDNAME,
+			    di->errbuf));
+		}
+
+		atptrf = strchr(fromsnap, '@');
+		ASSERT(atptrf != NULL);
+		fdslen = atptrf - fromsnap;
+
+		di->fromsnap = zfs_strdup(hdl, fromsnap);
+		di->ds = zfs_strdup(hdl, fromsnap);
+		di->ds[fdslen] = '\0';
+
+		/* the to snap will be a just-in-time snap of the head */
+		return (make_temp_snapshot(di));
+	}
+
+	(void) snprintf(di->errbuf, sizeof (di->errbuf),
+	    dgettext(TEXT_DOMAIN,
+	    "Unable to determine which snapshots to compare"));
+
+	atptrf = strchr(fromsnap, '@');
+	atptrt = strchr(tosnap, '@');
+	fdslen = atptrf ? atptrf - fromsnap : strlen(fromsnap);
+	tdslen = atptrt ? atptrt - tosnap : strlen(tosnap);
+	fsnlen = strlen(fromsnap) - fdslen;	/* includes @ sign */
+	tsnlen = strlen(tosnap) - tdslen;	/* includes @ sign */
+
+	if (fsnlen <= 1 || tsnlen == 1 || (fdslen == 0 && tdslen == 0) ||
+	    (fsnlen == 0 && tsnlen == 0)) {
+		return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf));
+	} else if ((fdslen > 0 && tdslen > 0) &&
+	    ((tdslen != fdslen || strncmp(fromsnap, tosnap, fdslen) != 0))) {
+		/*
+		 * not the same dataset name, might be okay if
+		 * tosnap is a clone of a fromsnap descendant.
+		 */
+		char origin[ZFS_MAX_DATASET_NAME_LEN];
+		zprop_source_t src;
+		zfs_handle_t *zhp;
+
+		di->ds = zfs_alloc(di->zhp->zfs_hdl, tdslen + 1);
+		(void) strncpy(di->ds, tosnap, tdslen);
+		di->ds[tdslen] = '\0';
+
+		zhp = zfs_open(hdl, di->ds, ZFS_TYPE_FILESYSTEM);
+		while (zhp != NULL) {
+			if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin,
+			    sizeof (origin), &src, NULL, 0, B_FALSE) != 0) {
+				(void) zfs_close(zhp);
+				zhp = NULL;
+				break;
+			}
+			if (strncmp(origin, fromsnap, fsnlen) == 0)
+				break;
+
+			(void) zfs_close(zhp);
+			zhp = zfs_open(hdl, origin, ZFS_TYPE_FILESYSTEM);
+		}
+
+		if (zhp == NULL) {
+			(void) snprintf(di->errbuf, sizeof (di->errbuf),
+			    dgettext(TEXT_DOMAIN,
+			    "Not an earlier snapshot from the same fs"));
+			return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf));
+		} else {
+			(void) zfs_close(zhp);
+		}
+
+		di->isclone = B_TRUE;
+		di->fromsnap = zfs_strdup(hdl, fromsnap);
+		if (tsnlen) {
+			di->tosnap = zfs_strdup(hdl, tosnap);
+		} else {
+			return (make_temp_snapshot(di));
+		}
+	} else {
+		int dslen = fdslen ? fdslen : tdslen;
+
+		di->ds = zfs_alloc(hdl, dslen + 1);
+		(void) strncpy(di->ds, fdslen ? fromsnap : tosnap, dslen);
+		di->ds[dslen] = '\0';
+
+		di->fromsnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrf);
+		if (tsnlen) {
+			di->tosnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrt);
+		} else {
+			return (make_temp_snapshot(di));
+		}
+	}
+	return (0);
+}
+
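+/*
+ * Look up the mountpoint of a dataset, failing if it is not mounted.
+ */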
+static int
+get_mountpoint(differ_info_t *di, char *dsnm, char **mntpt)
+{
+	boolean_t mounted;
+
+	mounted = is_mounted(di->zhp->zfs_hdl, dsnm, mntpt);
+	if (mounted == B_FALSE) {
+		(void) snprintf(di->errbuf, sizeof (di->errbuf),
+		    dgettext(TEXT_DOMAIN,
+		    "Cannot diff an unmounted snapshot"));
+		return (zfs_error(di->zhp->zfs_hdl, EZFS_BADTYPE, di->errbuf));
+	}
+
+	/* Avoid a double slash at the beginning of root-mounted datasets */
+	if (**mntpt == '/' && *(*mntpt + 1) == '\0')
+		**mntpt = '\0';
+	return (0);
+}
+
+static int
+get_mountpoints(differ_info_t *di)
+{
+	char *strptr;
+	char *frommntpt;
+
+	/*
+	 * first get the mountpoint for the parent dataset
+	 */
+	if (get_mountpoint(di, di->ds, &di->dsmnt) != 0)
+		return (-1);
+
+	strptr = strchr(di->tosnap, '@');
+	ASSERT3P(strptr, !=, NULL);
+	di->tomnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", di->dsmnt,
+	    ZDIFF_SNAPDIR, ++strptr);
+
+	strptr = strchr(di->fromsnap, '@');
+	ASSERT3P(strptr, !=, NULL);
+
+	frommntpt = di->dsmnt;
+	if (di->isclone) {
+		char *mntpt;
+		int err;
+
+		*strptr = '\0';
+		err = get_mountpoint(di, di->fromsnap, &mntpt);
+		*strptr = '@';
+		if (err != 0)
+			return (-1);
+		frommntpt = mntpt;
+	}
+
+	di->frommnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", frommntpt,
+	    ZDIFF_SNAPDIR, ++strptr);
+
+	if (di->isclone)
+		free(frommntpt);
+
+	return (0);
+}
+
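+/*
+ * Fill in a differ_info_t: open the cleanup descriptor, resolve the
+ * snapshot names and mountpoints, and locate the shares object.
+ */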
+static int
+setup_differ_info(zfs_handle_t *zhp, const char *fromsnap,
+    const char *tosnap, differ_info_t *di)
+{
+	di->zhp = zhp;
+
+	di->cleanupfd = open(ZFS_DEV, O_RDWR|O_EXCL);
+	VERIFY(di->cleanupfd >= 0);
+
+	if (get_snapshot_names(di, fromsnap, tosnap) != 0)
+		return (-1);
+
+	if (get_mountpoints(di) != 0)
+		return (-1);
+
+	if (find_shares_object(di) != 0)
+		return (-1);
+
+	return (0);
+}
+
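+/*
+ * Entry point for "zfs diff": set up the differ state, create a pipe,
+ * start the differ thread on its read side, and drive the
+ * ZFS_IOC_DIFF ioctl with the write side so that the kernel streams
+ * diff records to the thread for formatting.
+ */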
+int
+zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap,
+    const char *tosnap, int flags)
+{
+	zfs_cmd_t zc = { 0 };
+	char errbuf[1024];
+	differ_info_t di = { 0 };
+	pthread_t tid;
+	int pipefd[2];
+	int iocerr;
+
+	(void) snprintf(errbuf, sizeof (errbuf),
+	    dgettext(TEXT_DOMAIN, "zfs diff failed"));
+
+	if (setup_differ_info(zhp, fromsnap, tosnap, &di)) {
+		teardown_differ_info(&di);
+		return (-1);
+	}
+
+	if (pipe(pipefd)) {
+		zfs_error_aux(zhp->zfs_hdl, strerror(errno));
+		teardown_differ_info(&di);
+		return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf));
+	}
+
+	di.scripted = (flags & ZFS_DIFF_PARSEABLE);
+	di.classify = (flags & ZFS_DIFF_CLASSIFY);
+	di.timestamped = (flags & ZFS_DIFF_TIMESTAMP);
+
+	di.outputfd = outfd;
+	di.datafd = pipefd[0];
+
+	if (pthread_create(&tid, NULL, differ, &di)) {
+		zfs_error_aux(zhp->zfs_hdl, strerror(errno));
+		(void) close(pipefd[0]);
+		(void) close(pipefd[1]);
+		teardown_differ_info(&di);
+		return (zfs_error(zhp->zfs_hdl,
+		    EZFS_THREADCREATEFAILED, errbuf));
+	}
+
+	/* do the ioctl() */
+	(void) strlcpy(zc.zc_value, di.fromsnap, strlen(di.fromsnap) + 1);
+	(void) strlcpy(zc.zc_name, di.tosnap, strlen(di.tosnap) + 1);
+	zc.zc_cookie = pipefd[1];
+
+	iocerr = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DIFF, &zc);
+	if (iocerr != 0) {
+		(void) snprintf(errbuf, sizeof (errbuf),
+		    dgettext(TEXT_DOMAIN, "Unable to obtain diffs"));
+		if (errno == EPERM) {
+			zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
+			    "\n   The sys_mount privilege or diff delegated "
+			    "permission is needed\n   to execute the "
+			    "diff ioctl"));
+		} else if (errno == EXDEV) {
+			zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
+			    "\n   Not an earlier snapshot from the same fs"));
+		} else if (errno != EPIPE || di.zerr == 0) {
+			zfs_error_aux(zhp->zfs_hdl, strerror(errno));
+		}
+		(void) close(pipefd[1]);
+		(void) pthread_cancel(tid);
+		(void) pthread_join(tid, NULL);
+		teardown_differ_info(&di);
+		if (di.zerr != 0 && di.zerr != EPIPE) {
+			zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr));
+			return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf));
+		} else {
+			return (zfs_error(zhp->zfs_hdl, EZFS_DIFFDATA, errbuf));
+		}
+	}
+
+	(void) close(pipefd[1]);
+	(void) pthread_join(tid, NULL);
+
+	if (di.zerr != 0) {
+		zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr));
+		return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf));
+	}
+	teardown_differ_info(&di);
+	return (0);
+}
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c
===================================================================
RCS file: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c
diff -N src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c
--- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c	27 Feb 2010 22:30:22 -0000	1.1.1.2
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,653 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Iterate over all children of the current object.  This includes the normal
- * dataset hierarchy, but also arbitrary hierarchies due to clones.  We want to
- * walk all datasets in the pool, and construct a directed graph of the form:
- *
- * 			home
- *                        |
- *                   +----+----+
- *                   |         |
- *                   v         v             ws
- *                  bar       baz             |
- *                             |              |
- *                             v              v
- *                          @yesterday ----> foo
- *
- * In order to construct this graph, we have to walk every dataset in the pool,
- * because the clone parent is stored as a property of the child, not the
- * parent.  The parent only keeps track of the number of clones.
- *
- * In the normal case (without clones) this would be rather expensive.  To avoid
- * unnecessary computation, we first try a walk of the subtree hierarchy
- * starting from the initial node.  At each dataset, we construct a node in the
- * graph and an edge leading from its parent.  If we don't see any snapshots
- * with a non-zero clone count, then we are finished.
- *
- * If we do find a cloned snapshot, then we finish the walk of the current
- * subtree, but indicate that we need to do a complete walk.  We then perform a
- * global walk of all datasets, avoiding the subtree we already processed.
- *
- * At the end of this, we'll end up with a directed graph of all relevant (and
- * possible some irrelevant) datasets in the system.  We need to both find our
- * limiting subgraph and determine a safe ordering in which to destroy the
- * datasets.  We do a topological ordering of our graph starting at our target
- * dataset, and then walk the results in reverse.
- *
- * It's possible for the graph to have cycles if, for example, the user renames
- * a clone to be the parent of its origin snapshot.  The user can request to
- * generate an error in this case, or ignore the cycle and continue.
- *
- * When removing datasets, we want to destroy the snapshots in chronological
- * order (because this is the most efficient method).  In order to accomplish
- * this, we store the creation transaction group with each vertex and keep each
- * vertex's edges sorted according to this value.  The topological sort will
- * automatically walk the snapshots in the correct order.
- */
-
-#include <assert.h>
-#include <libintl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <strings.h>
-#include <unistd.h>
-
-#include <libzfs.h>
-
-#include "libzfs_impl.h"
-#include "zfs_namecheck.h"
-
-#define	MIN_EDGECOUNT	4
-
-/*
- * Vertex structure.  Indexed by dataset name, this structure maintains a list
- * of edges to other vertices.
- */
-struct zfs_edge;
-typedef struct zfs_vertex {
-	char			zv_dataset[ZFS_MAXNAMELEN];
-	struct zfs_vertex	*zv_next;
-	int			zv_visited;
-	uint64_t		zv_txg;
-	struct zfs_edge		**zv_edges;
-	int			zv_edgecount;
-	int			zv_edgealloc;
-} zfs_vertex_t;
-
-enum {
-	VISIT_SEEN = 1,
-	VISIT_SORT_PRE,
-	VISIT_SORT_POST
-};
-
-/*
- * Edge structure.  Simply maintains a pointer to the destination vertex.  There
- * is no need to store the source vertex, since we only use edges in the context
- * of the source vertex.
- */
-typedef struct zfs_edge {
-	zfs_vertex_t		*ze_dest;
-	struct zfs_edge		*ze_next;
-} zfs_edge_t;
-
-#define	ZFS_GRAPH_SIZE		1027	/* this could be dynamic some day */
-
-/*
- * Graph structure.  Vertices are maintained in a hash indexed by dataset name.
- */
-typedef struct zfs_graph {
-	zfs_vertex_t		**zg_hash;
-	size_t			zg_size;
-	size_t			zg_nvertex;
-	const char		*zg_root;
-	int			zg_clone_count;
-} zfs_graph_t;
-
-/*
- * Allocate a new edge pointing to the target vertex.
- */
-static zfs_edge_t *
-zfs_edge_create(libzfs_handle_t *hdl, zfs_vertex_t *dest)
-{
-	zfs_edge_t *zep = zfs_alloc(hdl, sizeof (zfs_edge_t));
-
-	if (zep == NULL)
-		return (NULL);
-
-	zep->ze_dest = dest;
-
-	return (zep);
-}
-
-/*
- * Destroy an edge.
- */
-static void
-zfs_edge_destroy(zfs_edge_t *zep)
-{
-	free(zep);
-}
-
-/*
- * Allocate a new vertex with the given name.
- */
-static zfs_vertex_t *
-zfs_vertex_create(libzfs_handle_t *hdl, const char *dataset)
-{
-	zfs_vertex_t *zvp = zfs_alloc(hdl, sizeof (zfs_vertex_t));
-
-	if (zvp == NULL)
-		return (NULL);
-
-	assert(strlen(dataset) < ZFS_MAXNAMELEN);
-
-	(void) strlcpy(zvp->zv_dataset, dataset, sizeof (zvp->zv_dataset));
-
-	if ((zvp->zv_edges = zfs_alloc(hdl,
-	    MIN_EDGECOUNT * sizeof (void *))) == NULL) {
-		free(zvp);
-		return (NULL);
-	}
-
-	zvp->zv_edgealloc = MIN_EDGECOUNT;
-
-	return (zvp);
-}
-
-/*
- * Destroy a vertex.  Frees up any associated edges.
- */
-static void
-zfs_vertex_destroy(zfs_vertex_t *zvp)
-{
-	int i;
-
-	for (i = 0; i < zvp->zv_edgecount; i++)
-		zfs_edge_destroy(zvp->zv_edges[i]);
-
-	free(zvp->zv_edges);
-	free(zvp);
-}
-
-/*
- * Given a vertex, add an edge to the destination vertex.
- */
-static int
-zfs_vertex_add_edge(libzfs_handle_t *hdl, zfs_vertex_t *zvp,
-    zfs_vertex_t *dest)
-{
-	zfs_edge_t *zep = zfs_edge_create(hdl, dest);
-
-	if (zep == NULL)
-		return (-1);
-
-	if (zvp->zv_edgecount == zvp->zv_edgealloc) {
-		void *ptr;
-
-		if ((ptr = zfs_realloc(hdl, zvp->zv_edges,
-		    zvp->zv_edgealloc * sizeof (void *),
-		    zvp->zv_edgealloc * 2 * sizeof (void *))) == NULL)
-			return (-1);
-
-		zvp->zv_edges = ptr;
-		zvp->zv_edgealloc *= 2;
-	}
-
-	zvp->zv_edges[zvp->zv_edgecount++] = zep;
-
-	return (0);
-}
-
-static int
-zfs_edge_compare(const void *a, const void *b)
-{
-	const zfs_edge_t *ea = *((zfs_edge_t **)a);
-	const zfs_edge_t *eb = *((zfs_edge_t **)b);
-
-	if (ea->ze_dest->zv_txg < eb->ze_dest->zv_txg)
-		return (-1);
-	if (ea->ze_dest->zv_txg > eb->ze_dest->zv_txg)
-		return (1);
-	return (0);
-}
-
-/*
- * Sort the given vertex edges according to the creation txg of each vertex.
- */
-static void
-zfs_vertex_sort_edges(zfs_vertex_t *zvp)
-{
-	if (zvp->zv_edgecount == 0)
-		return;
-
-	qsort(zvp->zv_edges, zvp->zv_edgecount, sizeof (void *),
-	    zfs_edge_compare);
-}
-
-/*
- * Construct a new graph object.  We allow the size to be specified as a
- * parameter so in the future we can size the hash according to the number of
- * datasets in the pool.
- */
-static zfs_graph_t *
-zfs_graph_create(libzfs_handle_t *hdl, const char *dataset, size_t size)
-{
-	zfs_graph_t *zgp = zfs_alloc(hdl, sizeof (zfs_graph_t));
-
-	if (zgp == NULL)
-		return (NULL);
-
-	zgp->zg_size = size;
-	if ((zgp->zg_hash = zfs_alloc(hdl,
-	    size * sizeof (zfs_vertex_t *))) == NULL) {
-		free(zgp);
-		return (NULL);
-	}
-
-	zgp->zg_root = dataset;
-	zgp->zg_clone_count = 0;
-
-	return (zgp);
-}
-
-/*
- * Destroy a graph object.  We have to iterate over all the hash chains,
- * destroying each vertex in the process.
- */
-static void
-zfs_graph_destroy(zfs_graph_t *zgp)
-{
-	int i;
-	zfs_vertex_t *current, *next;
-
-	for (i = 0; i < zgp->zg_size; i++) {
-		current = zgp->zg_hash[i];
-		while (current != NULL) {
-			next = current->zv_next;
-			zfs_vertex_destroy(current);
-			current = next;
-		}
-	}
-
-	free(zgp->zg_hash);
-	free(zgp);
-}
-
-/*
- * Graph hash function.  Classic bernstein k=33 hash function, taken from
- * usr/src/cmd/sgs/tools/common/strhash.c
- */
-static size_t
-zfs_graph_hash(zfs_graph_t *zgp, const char *str)
-{
-	size_t hash = 5381;
-	int c;
-
-	while ((c = *str++) != 0)
-		hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
-
-	return (hash % zgp->zg_size);
-}
-
-/*
- * Given a dataset name, finds the associated vertex, creating it if necessary.
- */
-static zfs_vertex_t *
-zfs_graph_lookup(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset,
-    uint64_t txg)
-{
-	size_t idx = zfs_graph_hash(zgp, dataset);
-	zfs_vertex_t *zvp;
-
-	for (zvp = zgp->zg_hash[idx]; zvp != NULL; zvp = zvp->zv_next) {
-		if (strcmp(zvp->zv_dataset, dataset) == 0) {
-			if (zvp->zv_txg == 0)
-				zvp->zv_txg = txg;
-			return (zvp);
-		}
-	}
-
-	if ((zvp = zfs_vertex_create(hdl, dataset)) == NULL)
-		return (NULL);
-
-	zvp->zv_next = zgp->zg_hash[idx];
-	zvp->zv_txg = txg;
-	zgp->zg_hash[idx] = zvp;
-	zgp->zg_nvertex++;
-
-	return (zvp);
-}
-
-/*
- * Given two dataset names, create an edge between them.  For the source vertex,
- * mark 'zv_visited' to indicate that we have seen this vertex, and not simply
- * created it as a destination of another edge.  If 'dest' is NULL, then this
- * is an individual vertex (i.e. the starting vertex), so don't add an edge.
- */
-static int
-zfs_graph_add(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *source,
-    const char *dest, uint64_t txg)
-{
-	zfs_vertex_t *svp, *dvp;
-
-	if ((svp = zfs_graph_lookup(hdl, zgp, source, 0)) == NULL)
-		return (-1);
-	svp->zv_visited = VISIT_SEEN;
-	if (dest != NULL) {
-		dvp = zfs_graph_lookup(hdl, zgp, dest, txg);
-		if (dvp == NULL)
-			return (-1);
-		if (zfs_vertex_add_edge(hdl, svp, dvp) != 0)
-			return (-1);
-	}
-
-	return (0);
-}
-
-/*
- * Iterate over all children of the given dataset, adding any vertices
- * as necessary.  Returns -1 if there was an error, or 0 otherwise.
- * This is a simple recursive algorithm - the ZFS namespace typically
- * is very flat.  We manually invoke the necessary ioctl() calls to
- * avoid the overhead and additional semantics of zfs_open().
- */
-static int
-iterate_children(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset)
-{
-	zfs_cmd_t zc = { 0 };
-	zfs_vertex_t *zvp;
-
-	/*
-	 * Look up the source vertex, and avoid it if we've seen it before.
-	 */
-	zvp = zfs_graph_lookup(hdl, zgp, dataset, 0);
-	if (zvp == NULL)
-		return (-1);
-	if (zvp->zv_visited == VISIT_SEEN)
-		return (0);
-
-	/*
-	 * Iterate over all children
-	 */
-	for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
-	    ioctl(hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0;
-	    (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) {
-		/*
-		 * Get statistics for this dataset, to determine the type of the
-		 * dataset and clone statistics.  If this fails, the dataset has
-		 * since been removed, and we're pretty much screwed anyway.
-		 */
-		zc.zc_objset_stats.dds_origin[0] = '\0';
-		if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0)
-			continue;
-
-		if (zc.zc_objset_stats.dds_origin[0] != '\0') {
-			if (zfs_graph_add(hdl, zgp,
-			    zc.zc_objset_stats.dds_origin, zc.zc_name,
-			    zc.zc_objset_stats.dds_creation_txg) != 0)
-				return (-1);
-			/*
-			 * Count origins only if they are contained in the graph
-			 */
-			if (isa_child_of(zc.zc_objset_stats.dds_origin,
-			    zgp->zg_root))
-				zgp->zg_clone_count--;
-		}
-
-		/*
-		 * Add an edge between the parent and the child.
-		 */
-		if (zfs_graph_add(hdl, zgp, dataset, zc.zc_name,
-		    zc.zc_objset_stats.dds_creation_txg) != 0)
-			return (-1);
-
-		/*
-		 * Recursively visit child
-		 */
-		if (iterate_children(hdl, zgp, zc.zc_name))
-			return (-1);
-	}
-
-	/*
-	 * Now iterate over all snapshots.
-	 */
-	bzero(&zc, sizeof (zc));
-
-	for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
-	    ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0;
-	    (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) {
-
-		/*
-		 * Get statistics for this dataset, to determine the type of the
-		 * dataset and clone statistics.  If this fails, the dataset has
-		 * since been removed, and we're pretty much screwed anyway.
-		 */
-		if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0)
-			continue;
-
-		/*
-		 * Add an edge between the parent and the child.
-		 */
-		if (zfs_graph_add(hdl, zgp, dataset, zc.zc_name,
-		    zc.zc_objset_stats.dds_creation_txg) != 0)
-			return (-1);
-
-		zgp->zg_clone_count += zc.zc_objset_stats.dds_num_clones;
-	}
-
-	zvp->zv_visited = VISIT_SEEN;
-
-	return (0);
-}
-
-/*
- * Returns false if there are no snapshots with dependent clones in this
- * subtree or if all of those clones are also in this subtree.  Returns
- * true if there is an error or there are external dependents.
- */
-static boolean_t
-external_dependents(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset)
-{
-	zfs_cmd_t zc = { 0 };
-
-	/*
-	 * Check whether this dataset is a clone or has clones since
-	 * iterate_children() only checks the children.
-	 */
-	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
-	if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0)
-		return (B_TRUE);
-
-	if (zc.zc_objset_stats.dds_origin[0] != '\0') {
-		if (zfs_graph_add(hdl, zgp,
-		    zc.zc_objset_stats.dds_origin, zc.zc_name,
-		    zc.zc_objset_stats.dds_creation_txg) != 0)
-			return (B_TRUE);
-		if (isa_child_of(zc.zc_objset_stats.dds_origin, dataset))
-			zgp->zg_clone_count--;
-	}
-
-	if ((zc.zc_objset_stats.dds_num_clones) ||
-	    iterate_children(hdl, zgp, dataset))
-		return (B_TRUE);
-
-	return (zgp->zg_clone_count != 0);
-}
-
-/*
- * Construct a complete graph of all necessary vertices.  First, iterate over
- * only our object's children.  If no cloned snapshots are found, or all of
- * the cloned snapshots are in this subtree then return a graph of the subtree.
- * Otherwise, start at the root of the pool and iterate over all datasets.
- */
-static zfs_graph_t *
-construct_graph(libzfs_handle_t *hdl, const char *dataset)
-{
-	zfs_graph_t *zgp = zfs_graph_create(hdl, dataset, ZFS_GRAPH_SIZE);
-	int ret = 0;
-
-	if (zgp == NULL)
-		return (zgp);
-
-	if ((strchr(dataset, '/') == NULL) ||
-	    (external_dependents(hdl, zgp, dataset))) {
-		/*
-		 * Determine pool name and try again.
-		 */
-		int len = strcspn(dataset, "/@") + 1;
-		char *pool = zfs_alloc(hdl, len);
-
-		if (pool == NULL) {
-			zfs_graph_destroy(zgp);
-			return (NULL);
-		}
-		(void) strlcpy(pool, dataset, len);
-
-		if (iterate_children(hdl, zgp, pool) == -1 ||
-		    zfs_graph_add(hdl, zgp, pool, NULL, 0) != 0) {
-			free(pool);
-			zfs_graph_destroy(zgp);
-			return (NULL);
-		}
-		free(pool);
-	}
-
-	if (ret == -1 || zfs_graph_add(hdl, zgp, dataset, NULL, 0) != 0) {
-		zfs_graph_destroy(zgp);
-		return (NULL);
-	}
-
-	return (zgp);
-}
-
-/*
- * Given a graph, do a recursive topological sort into the given array.  This is
- * really just a depth first search, so that the deepest nodes appear first.
- * hijack the 'zv_visited' marker to avoid visiting the same vertex twice.
- */
-static int
-topo_sort(libzfs_handle_t *hdl, boolean_t allowrecursion, char **result,
-    size_t *idx, zfs_vertex_t *zgv)
-{
-	int i;
-
-	if (zgv->zv_visited == VISIT_SORT_PRE && !allowrecursion) {
-		/*
-		 * If we've already seen this vertex as part of our depth-first
-		 * search, then we have a cyclic dependency, and we must return
-		 * an error.
-		 */
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "recursive dependency at '%s'"),
-		    zgv->zv_dataset);
-		return (zfs_error(hdl, EZFS_RECURSIVE,
-		    dgettext(TEXT_DOMAIN,
-		    "cannot determine dependent datasets")));
-	} else if (zgv->zv_visited >= VISIT_SORT_PRE) {
-		/*
-		 * If we've already processed this as part of the topological
-		 * sort, then don't bother doing so again.
-		 */
-		return (0);
-	}
-
-	zgv->zv_visited = VISIT_SORT_PRE;
-
-	/* avoid doing a search if we don't have to */
-	zfs_vertex_sort_edges(zgv);
-	for (i = 0; i < zgv->zv_edgecount; i++) {
-		if (topo_sort(hdl, allowrecursion, result, idx,
-		    zgv->zv_edges[i]->ze_dest) != 0)
-			return (-1);
-	}
-
-	/* we may have visited this in the course of the above */
-	if (zgv->zv_visited == VISIT_SORT_POST)
-		return (0);
-
-	if ((result[*idx] = zfs_alloc(hdl,
-	    strlen(zgv->zv_dataset) + 1)) == NULL)
-		return (-1);
-
-	(void) strcpy(result[*idx], zgv->zv_dataset);
-	*idx += 1;
-	zgv->zv_visited = VISIT_SORT_POST;
-	return (0);
-}
-
-/*
- * The only public interface for this file.  Do the dirty work of constructing a
- * child list for the given object.  Construct the graph, do the toplogical
- * sort, and then return the array of strings to the caller.
- *
- * The 'allowrecursion' parameter controls behavior when cycles are found.  If
- * it is set, the the cycle is ignored and the results returned as if the cycle
- * did not exist.  If it is not set, then the routine will generate an error if
- * a cycle is found.
- */
-int
-get_dependents(libzfs_handle_t *hdl, boolean_t allowrecursion,
-    const char *dataset, char ***result, size_t *count)
-{
-	zfs_graph_t *zgp;
-	zfs_vertex_t *zvp;
-
-	if ((zgp = construct_graph(hdl, dataset)) == NULL)
-		return (-1);
-
-	if ((*result = zfs_alloc(hdl,
-	    zgp->zg_nvertex * sizeof (char *))) == NULL) {
-		zfs_graph_destroy(zgp);
-		return (-1);
-	}
-
-	if ((zvp = zfs_graph_lookup(hdl, zgp, dataset, 0)) == NULL) {
-		free(*result);
-		zfs_graph_destroy(zgp);
-		return (-1);
-	}
-
-	*count = 0;
-	if (topo_sort(hdl, allowrecursion, *result, count, zvp) != 0) {
-		free(*result);
-		zfs_graph_destroy(zgp);
-		return (-1);
-	}
-
-	/*
-	 * Get rid of the last entry, which is our starting vertex and not
-	 * strictly a dependent.
-	 */
-	assert(*count > 0);
-	free((*result)[*count - 1]);
-	(*count)--;
-
-	zfs_graph_destroy(zgp);
-
-	return (0);
-}
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 libzfs_impl.h
--- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h	27 Feb 2010 22:30:17 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h	20 Apr 2017 00:10:55 -0000
@@ -20,35 +20,31 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
  */
 
-#ifndef	_LIBFS_IMPL_H
-#define	_LIBFS_IMPL_H
+#ifndef	_LIBZFS_IMPL_H
+#define	_LIBZFS_IMPL_H
 
-#include <sys/dmu.h>
 #include <sys/fs/zfs.h>
-#include <sys/zfs_ioctl.h>
-#include <sys/zfs_acl.h>
 #include <sys/spa.h>
 #include <sys/nvpair.h>
+#include <sys/dmu.h>
+#include <sys/zfs_ioctl.h>
 
+#include <libshare.h>
 #include <libuutil.h>
 #include <libzfs.h>
-#include <libshare.h>
-
-#include <fm/libtopo.h>
+#include <libzfs_core.h>
+#include <libzfs_compat.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
-#ifdef	VERIFY
-#undef	VERIFY
-#endif
-#define	VERIFY	verify
-
 typedef struct libzfs_fru {
 	char *zf_device;
 	char *zf_fru;
@@ -68,14 +64,13 @@ struct libzfs_handle {
 	int libzfs_desc_active;
 	char libzfs_action[1024];
 	char libzfs_desc[1024];
-	char *libzfs_log_str;
 	int libzfs_printerr;
+	int libzfs_storeerr; /* stuff error messages into buffer */
 	void *libzfs_sharehdl; /* libshare handle */
 	uint_t libzfs_shareflags;
 	boolean_t libzfs_mnttab_enable;
 	avl_tree_t libzfs_mnttab_cache;
 	int libzfs_pool_iter;
-	topo_hdl_t *libzfs_topo_hdl;
 	libzfs_fru_t **libzfs_fru_hash;
 	libzfs_fru_t *libzfs_fru_list;
 	char libzfs_chassis_id[256];
@@ -86,7 +81,7 @@ struct libzfs_handle {
 struct zfs_handle {
 	libzfs_handle_t *zfs_hdl;
 	zpool_handle_t *zpool_hdl;
-	char zfs_name[ZFS_MAXNAMELEN];
+	char zfs_name[ZFS_MAX_DATASET_NAME_LEN];
 	zfs_type_t zfs_type; /* type including snapshot */
 	zfs_type_t zfs_head_type; /* type excluding snapshot */
 	dmu_objset_stats_t zfs_dmustats;
@@ -107,7 +102,7 @@ struct zfs_handle {
 struct zpool_handle {
 	libzfs_handle_t *zpool_hdl;
 	zpool_handle_t *zpool_next;
-	char zpool_name[ZPOOL_MAXNAMELEN];
+	char zpool_name[ZFS_MAX_DATASET_NAME_LEN];
 	int zpool_state;
 	size_t zpool_config_size;
 	nvlist_t *zpool_config;
@@ -116,7 +111,7 @@ struct zpool_handle {
 	diskaddr_t zpool_start_block;
 };
 
-typedef  enum {
+typedef enum {
 	PROTO_NFS = 0,
 	PROTO_SMB = 1,
 	PROTO_END = 2
@@ -128,7 +123,6 @@ typedef  enum {
  */
 typedef enum {
 	SHARED_NOT_SHARED = 0x0,
-	SHARED_ISCSI = 0x1,
 	SHARED_NFS = 0x2,
 	SHARED_SMB = 0x4
 } zfs_share_type_t;
@@ -138,6 +132,7 @@ int zfs_error_fmt(libzfs_handle_t *, int
 void zfs_error_aux(libzfs_handle_t *, const char *, ...);
 void *zfs_alloc(libzfs_handle_t *, size_t);
 void *zfs_realloc(libzfs_handle_t *, void *, size_t, size_t);
+char *zfs_asprintf(libzfs_handle_t *, const char *, ...);
 char *zfs_strdup(libzfs_handle_t *, const char *);
 int no_memory(libzfs_handle_t *);
 
@@ -148,7 +143,8 @@ int zpool_standard_error_fmt(libzfs_hand
 
 int get_dependents(libzfs_handle_t *, boolean_t, const char *, char ***,
     size_t *);
-
+zfs_handle_t *make_dataset_handle_zc(libzfs_handle_t *, zfs_cmd_t *);
+zfs_handle_t *make_dataset_simple_handle_zc(zfs_handle_t *, zfs_cmd_t *);
 
 int zprop_parse_value(libzfs_handle_t *, nvpair_t *, int, zfs_type_t,
     nvlist_t *, char **, uint64_t *, const char *);
@@ -160,7 +156,11 @@ int zprop_expand_list(libzfs_handle_t *h
  * on each change node regardless of whether or not it is currently
  * mounted.
  */
-#define	CL_GATHER_MOUNT_ALWAYS	1
+#define	CL_GATHER_MOUNT_ALWAYS	0x01
+/*
+ * Use this changelist_gather() flag to prevent unmounting of file systems.
+ */
+#define	CL_GATHER_DONT_UNMOUNT	0x02
 
 typedef struct prop_changelist prop_changelist_t;
 
@@ -185,11 +185,16 @@ int create_parents(libzfs_handle_t *, ch
 boolean_t isa_child_of(const char *dataset, const char *parent);
 
 zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *);
+zfs_handle_t *make_bookmark_handle(zfs_handle_t *, const char *,
+    nvlist_t *props);
 
 int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **);
 
 boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *);
 
+int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type,
+    boolean_t modifying);
+
 void namespace_clear(libzfs_handle_t *);
 
 /*
@@ -209,4 +214,4 @@ extern void libzfs_fru_clear(libzfs_hand
 }
 #endif
 
-#endif	/* _LIBFS_IMPL_H */
+#endif	/* _LIBZFS_IMPL_H */
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c,v
retrieving revision 1.3
diff -u -p -r1.3 libzfs_import.c
--- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c	27 Feb 2010 23:43:52 -0000	1.3
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c	5 May 2017 17:34:14 -0000
@@ -18,9 +18,12 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015 RackTop Systems.
+ * Copyright 2016 Nexenta Systems, Inc.
  */
 
 /*
@@ -39,15 +42,25 @@
  * using our derived config, and record the results.
  */
 
+#include <ctype.h>
 #include <devid.h>
 #include <dirent.h>
 #include <errno.h>
 #include <libintl.h>
+#include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <thread_pool.h>
+#ifdef __FreeBSD__
+#include <libgeom.h>
+#endif
+#ifdef __NetBSD__
+#include <util.h>
+static int native_ioctl(int fd, unsigned long cmd, void *arg);
+#endif
 
 #include <sys/vdev_impl.h>
 
@@ -89,6 +102,7 @@ typedef struct pool_list {
 static char *
 get_devid(const char *path)
 {
+#ifdef have_devid
 	int fd;
 	ddi_devid_t devid;
 	char *minor, *ret;
@@ -108,6 +122,9 @@ get_devid(const char *path)
 	(void) close(fd);
 
 	return (ret);
+#else
+	return (NULL);
+#endif
 }
 
 
@@ -189,8 +206,10 @@ fix_paths(nvlist_t *nv, name_entry_t *na
 	if ((devid = get_devid(best->ne_name)) == NULL) {
 		(void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID);
 	} else {
-		if (nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, devid) != 0)
+		if (nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, devid) != 0) {
+			devid_str_free(devid);
 			return (-1);
+		}
 		devid_str_free(devid);
 	}
 
@@ -427,13 +446,13 @@ get_configs(libzfs_handle_t *hdl, pool_l
 	pool_entry_t *pe;
 	vdev_entry_t *ve;
 	config_entry_t *ce;
-	nvlist_t *ret = NULL, *config = NULL, *tmp, *nvtop, *nvroot;
+	nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;
 	nvlist_t **spares, **l2cache;
 	uint_t i, nspares, nl2cache;
 	boolean_t config_seen;
 	uint64_t best_txg;
-	char *name, *hostname;
-	uint64_t version, guid;
+	char *name, *hostname = NULL;
+	uint64_t guid;
 	uint_t children = 0;
 	nvlist_t **child = NULL;
 	uint_t holes;
@@ -519,47 +538,48 @@ get_configs(libzfs_handle_t *hdl, pool_l
 				 * configuration:
 				 *
 				 *	version
-				 * 	pool guid
-				 * 	name
-				 * 	pool state
+				 *	pool guid
+				 *	name
+				 *	comment (if available)
+				 *	pool state
 				 *	hostid (if available)
 				 *	hostname (if available)
 				 */
-				uint64_t state;
+				uint64_t state, version;
+				char *comment = NULL;
+
+				version = fnvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_VERSION);
+				fnvlist_add_uint64(config,
+				    ZPOOL_CONFIG_VERSION, version);
+				guid = fnvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_POOL_GUID);
+				fnvlist_add_uint64(config,
+				    ZPOOL_CONFIG_POOL_GUID, guid);
+				name = fnvlist_lookup_string(tmp,
+				    ZPOOL_CONFIG_POOL_NAME);
+				fnvlist_add_string(config,
+				    ZPOOL_CONFIG_POOL_NAME, name);
+
+				if (nvlist_lookup_string(tmp,
+				    ZPOOL_CONFIG_COMMENT, &comment) == 0)
+					fnvlist_add_string(config,
+					    ZPOOL_CONFIG_COMMENT, comment);
+
+				state = fnvlist_lookup_uint64(tmp,
+				    ZPOOL_CONFIG_POOL_STATE);
+				fnvlist_add_uint64(config,
+				    ZPOOL_CONFIG_POOL_STATE, state);
 
-				verify(nvlist_lookup_uint64(tmp,
-				    ZPOOL_CONFIG_VERSION, &version) == 0);
-				if (nvlist_add_uint64(config,
-				    ZPOOL_CONFIG_VERSION, version) != 0)
-					goto nomem;
-				verify(nvlist_lookup_uint64(tmp,
-				    ZPOOL_CONFIG_POOL_GUID, &guid) == 0);
-				if (nvlist_add_uint64(config,
-				    ZPOOL_CONFIG_POOL_GUID, guid) != 0)
-					goto nomem;
-				verify(nvlist_lookup_string(tmp,
-				    ZPOOL_CONFIG_POOL_NAME, &name) == 0);
-				if (nvlist_add_string(config,
-				    ZPOOL_CONFIG_POOL_NAME, name) != 0)
-					goto nomem;
-				verify(nvlist_lookup_uint64(tmp,
-				    ZPOOL_CONFIG_POOL_STATE, &state) == 0);
-				if (nvlist_add_uint64(config,
-				    ZPOOL_CONFIG_POOL_STATE, state) != 0)
-					goto nomem;
 				hostid = 0;
 				if (nvlist_lookup_uint64(tmp,
 				    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
-					if (nvlist_add_uint64(config,
-					    ZPOOL_CONFIG_HOSTID, hostid) != 0)
-						goto nomem;
-					verify(nvlist_lookup_string(tmp,
-					    ZPOOL_CONFIG_HOSTNAME,
-					    &hostname) == 0);
-					if (nvlist_add_string(config,
-					    ZPOOL_CONFIG_HOSTNAME,
-					    hostname) != 0)
-						goto nomem;
+					fnvlist_add_uint64(config,
+					    ZPOOL_CONFIG_HOSTID, hostid);
+					hostname = fnvlist_lookup_string(tmp,
+					    ZPOOL_CONFIG_HOSTNAME);
+					fnvlist_add_string(config,
+					    ZPOOL_CONFIG_HOSTNAME, hostname);
 				}
 
 				config_seen = B_TRUE;
@@ -655,8 +675,10 @@ get_configs(libzfs_handle_t *hdl, pool_l
 				    nvlist_add_uint64(holey,
 				    ZPOOL_CONFIG_ID, c) != 0 ||
 				    nvlist_add_uint64(holey,
-				    ZPOOL_CONFIG_GUID, 0ULL) != 0)
+				    ZPOOL_CONFIG_GUID, 0ULL) != 0) {
+					nvlist_free(holey);
 					goto nomem;
+				}
 				child[c] = holey;
 			}
 		}
@@ -897,10 +919,203 @@ zpool_read_label(int fd, nvlist_t **conf
 	return (0);
 }
 
+typedef struct rdsk_node {
+	char *rn_name;
+	int rn_dfd;
+	libzfs_handle_t *rn_hdl;
+	nvlist_t *rn_config;
+	avl_tree_t *rn_avl;
+	avl_node_t rn_node;
+	boolean_t rn_nozpool;
+} rdsk_node_t;
+
+static int
+slice_cache_compare(const void *arg1, const void *arg2)
+{
+	const char  *nm1 = ((rdsk_node_t *)arg1)->rn_name;
+	const char  *nm2 = ((rdsk_node_t *)arg2)->rn_name;
+	char *nm1slice, *nm2slice;
+	int rv;
+
+	/*
+	 * slices zero and two are the most likely to provide results,
+	 * so put those first
+	 */
+	nm1slice = strstr(nm1, "s0");
+	nm2slice = strstr(nm2, "s0");
+	if (nm1slice && !nm2slice) {
+		return (-1);
+	}
+	if (!nm1slice && nm2slice) {
+		return (1);
+	}
+	nm1slice = strstr(nm1, "s2");
+	nm2slice = strstr(nm2, "s2");
+	if (nm1slice && !nm2slice) {
+		return (-1);
+	}
+	if (!nm1slice && nm2slice) {
+		return (1);
+	}
+
+	rv = strcmp(nm1, nm2);
+	if (rv == 0)
+		return (0);
+	return (rv > 0 ? 1 : -1);
+}
+
+#ifdef illumos
+static void
+check_one_slice(avl_tree_t *r, char *diskname, uint_t partno,
+    diskaddr_t size, uint_t blksz)
+{
+	rdsk_node_t tmpnode;
+	rdsk_node_t *node;
+	char sname[MAXNAMELEN];
+
+	tmpnode.rn_name = &sname[0];
+	(void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u",
+	    diskname, partno);
+	/*
+	 * protect against division by zero for disk labels that
+	 * contain a bogus sector size
+	 */
+	if (blksz == 0)
+		blksz = DEV_BSIZE;
+	/* too small to contain a zpool? */
+	if ((size < (SPA_MINDEVSIZE / blksz)) &&
+	    (node = avl_find(r, &tmpnode, NULL)))
+		node->rn_nozpool = B_TRUE;
+}
+#endif	/* illumos */
+
+static void
+nozpool_all_slices(avl_tree_t *r, const char *sname)
+{
+#ifdef illumos
+	char diskname[MAXNAMELEN];
+	char *ptr;
+	int i;
+
+	(void) strncpy(diskname, sname, MAXNAMELEN);
+	if (((ptr = strrchr(diskname, 's')) == NULL) &&
+	    ((ptr = strrchr(diskname, 'p')) == NULL))
+		return;
+	ptr[0] = 's';
+	ptr[1] = '\0';
+	for (i = 0; i < NDKMAP; i++)
+		check_one_slice(r, diskname, i, 0, 1);
+	ptr[0] = 'p';
+	for (i = 0; i <= FD_NUMPART; i++)
+		check_one_slice(r, diskname, i, 0, 1);
+#endif	/* illumos */
+}
+
+#ifdef illumos
+static void
+check_slices(avl_tree_t *r, int fd, const char *sname)
+{
+	struct extvtoc vtoc;
+	struct dk_gpt *gpt;
+	char diskname[MAXNAMELEN];
+	char *ptr;
+	int i;
+
+	(void) strncpy(diskname, sname, MAXNAMELEN);
+	if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1]))
+		return;
+	ptr[1] = '\0';
+
+	if (read_extvtoc(fd, &vtoc) >= 0) {
+		for (i = 0; i < NDKMAP; i++)
+			check_one_slice(r, diskname, i,
+			    vtoc.v_part[i].p_size, vtoc.v_sectorsz);
+	} else if (efi_alloc_and_read(fd, &gpt) >= 0) {
+		/*
+		 * on x86 we'll still have leftover links that point
+		 * to slices s[9-15], so use NDKMAP instead
+		 */
+		for (i = 0; i < NDKMAP; i++)
+			check_one_slice(r, diskname, i,
+			    gpt->efi_parts[i].p_size, gpt->efi_lbasize);
+		/* nodes p[1-4] are never used with EFI labels */
+		ptr[0] = 'p';
+		for (i = 1; i <= FD_NUMPART; i++)
+			check_one_slice(r, diskname, i, 0, 1);
+		efi_free(gpt);
+	}
+}
+#endif	/* illumos */
+
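+/*
+ * Thread pool worker: open one candidate device or file, check that
+ * it is large enough to hold a pool, and read its vdev label into
+ * rn_config.
+ */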
+static void
+zpool_open_func(void *arg)
+{
+	rdsk_node_t *rn = arg;
+	struct stat64 statbuf;
+	nvlist_t *config;
+	int fd;
+
+	if (rn->rn_nozpool)
+		return;
+	if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) {
+		/* symlink to a device that's no longer there */
+		if (errno == ENOENT)
+			nozpool_all_slices(rn->rn_avl, rn->rn_name);
+		return;
+	}
+	/*
+	 * Ignore failed stats.  We only want regular
+	 * files, character devs and block devs.
+	 */
+	if (fstat64(fd, &statbuf) != 0 ||
+	    (!S_ISREG(statbuf.st_mode) &&
+	    !S_ISCHR(statbuf.st_mode) &&
+	    !S_ISBLK(statbuf.st_mode))) {
+		(void) close(fd);
+		return;
+	}
+	/* this file is too small to hold a zpool */
+#ifdef illumos
+	if (S_ISREG(statbuf.st_mode) &&
+	    statbuf.st_size < SPA_MINDEVSIZE) {
+		(void) close(fd);
+		return;
+	} else if (!S_ISREG(statbuf.st_mode)) {
+		/*
+		 * Try to read the disk label first so we don't have to
+		 * open a bunch of minor nodes that can't have a zpool.
+		 */
+		check_slices(rn->rn_avl, fd, rn->rn_name);
+	}
+#endif /* illumos */
+#ifdef __FreeBSD__
+	if (statbuf.st_size < SPA_MINDEVSIZE) {
+		(void) close(fd);
+		return;
+	}
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
+	off_t size;
+
+	if (native_ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
+	    size < SPA_MINDEVSIZE) {
+		(void) close(fd);
+		return;
+	}
+#endif
+
+	if ((zpool_read_label(fd, &config)) != 0) {
+		(void) close(fd);
+		(void) no_memory(rn->rn_hdl);
+		return;
+	}
+	(void) close(fd);
+
+	rn->rn_config = config;
+}
+
 /*
- * Given a file descriptor, clear (zero) the label information.  This function
- * is currently only used in the appliance stack as part of the ZFS sysevent
- * module.
+ * Given a file descriptor, clear (zero) the label information.
  */
 int
 zpool_clear_label(int fd)
@@ -919,8 +1134,10 @@ zpool_clear_label(int fd)
 
 	for (l = 0; l < VDEV_LABELS; l++) {
 		if (pwrite64(fd, label, sizeof (vdev_label_t),
-		    label_offset(size, l)) != sizeof (vdev_label_t))
+		    label_offset(size, l)) != sizeof (vdev_label_t)) {
+			free(label);
 			return (-1);
+		}
 	}
 
 	free(label);
@@ -938,20 +1155,20 @@ static nvlist_t *
 zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg)
 {
 	int i, dirs = iarg->paths;
-	DIR *dirp = NULL;
 	struct dirent64 *dp;
 	char path[MAXPATHLEN];
 	char *end, **dir = iarg->path;
 	size_t pathleft;
-	struct stat64 statbuf;
-	nvlist_t *ret = NULL, *config;
-	static char *default_dir = "/dev/dsk";
-	int fd;
+	nvlist_t *ret = NULL;
+	static char *default_dir = "/dev";
 	pool_list_t pools = { 0 };
 	pool_entry_t *pe, *penext;
 	vdev_entry_t *ve, *venext;
 	config_entry_t *ce, *cenext;
 	name_entry_t *ne, *nenext;
+	avl_tree_t slice_cache;
+	rdsk_node_t *slice;
+	void *cookie;
 
 	if (dirs == 0) {
 		dirs = 1;
@@ -964,8 +1181,11 @@ zpool_find_import_impl(libzfs_handle_t *
 	 * and toplevel GUID.
 	 */
 	for (i = 0; i < dirs; i++) {
-		char *rdsk;
+		tpool_t *t;
+		char rdsk[MAXPATHLEN];
 		int dfd;
+		boolean_t config_failed = B_FALSE;
+		DIR *dirp;
 
 		/* use realpath to normalize the path */
 		if (realpath(dir[i], path) == 0) {
@@ -978,18 +1198,22 @@ zpool_find_import_impl(libzfs_handle_t *
 		*end = 0;
 		pathleft = &path[sizeof (path)] - end;
 
+#ifdef illumos
 		/*
 		 * Using raw devices instead of block devices when we're
 		 * reading the labels skips a bunch of slow operations during
 		 * close(2) processing, so we replace /dev/dsk with /dev/rdsk.
 		 */
-		if (strcmp(path, "/dev/dsk/") == 0)
-			rdsk = "/dev/rdsk/";
+		if (strcmp(path, ZFS_DISK_ROOTD) == 0)
+			(void) strlcpy(rdsk, ZFS_RDISK_ROOTD, sizeof (rdsk));
 		else
-			rdsk = path;
+#endif
+			(void) strlcpy(rdsk, path, sizeof (rdsk));
 
 		if ((dfd = open64(rdsk, O_RDONLY)) < 0 ||
 		    (dirp = fdopendir(dfd)) == NULL) {
+			if (dfd >= 0)
+				(void) close(dfd);
 			zfs_error_aux(hdl, strerror(errno));
 			(void) zfs_error_fmt(hdl, EZFS_BADPATH,
 			    dgettext(TEXT_DOMAIN, "cannot open '%s'"),
@@ -997,6 +1221,79 @@ zpool_find_import_impl(libzfs_handle_t *
 			goto error;
 		}
 
+		avl_create(&slice_cache, slice_cache_compare,
+		    sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node));
+
+#ifdef __FreeBSD__
+		if (strcmp(rdsk, "/dev/") == 0) {
+			struct gmesh mesh;
+			struct gclass *mp;
+			struct ggeom *gp;
+			struct gprovider *pp;
+
+			errno = geom_gettree(&mesh);
+			if (errno != 0) {
+				zfs_error_aux(hdl, strerror(errno));
+				(void) zfs_error_fmt(hdl, EZFS_BADPATH,
+				    dgettext(TEXT_DOMAIN, "cannot get GEOM tree"));
+				goto error;
+			}
+
+			LIST_FOREACH(mp, &mesh.lg_class, lg_class) {
+				LIST_FOREACH(gp, &mp->lg_geom, lg_geom) {
+					LIST_FOREACH(pp, &gp->lg_provider, lg_provider) {
+						slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+						slice->rn_name = zfs_strdup(hdl, pp->lg_name);
+						slice->rn_avl = &slice_cache;
+						slice->rn_dfd = dfd;
+						slice->rn_hdl = hdl;
+						slice->rn_nozpool = B_FALSE;
+						avl_add(&slice_cache, slice);
+					}
+				}
+			}
+
+			geom_deletetree(&mesh);
+			goto skipdir;
+		}
+#endif
+#ifdef __NetBSD__
+		if (strcmp(rdsk, "/dev/") == 0) {
+			static const char mib_name[] = "hw.disknames";
+			size_t len;
+			char *disknames, *last, *name;
+			char part;
+
+			part = getrawpartition();
+			if (sysctlbyname(mib_name, NULL, &len, NULL, 0) == -1) {
+				zfs_error_aux(hdl, strerror(errno));
+				(void) zfs_error_fmt(hdl, EZFS_BADPATH,
+				    dgettext(TEXT_DOMAIN, "cannot get hw.disknames list"));
+
+				avl_destroy(&slice_cache);
+				(void) closedir(dirp);
+				goto error;
+			}
+			disknames = zfs_alloc(hdl, len + 2);
+			(void) sysctlbyname(mib_name, disknames, &len, NULL, 0);
+
+			for ((name = strtok_r(disknames, " ", &last)); name;
+			    (name = strtok_r(NULL, " ", &last))) {
+				slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+				slice->rn_name = zfs_asprintf(hdl, "%s%c", name, 'a' + part);
+				slice->rn_avl = &slice_cache;
+				slice->rn_dfd = dfd;
+				slice->rn_hdl = hdl;
+				slice->rn_nozpool = B_FALSE;
+				avl_add(&slice_cache, slice);
+			}
+			free(disknames);
+
+			goto skipdir;
+		}
+#endif
+
 		/*
 		 * This is not MT-safe, but we have no MT consumers of libzfs
 		 */
@@ -1006,33 +1303,38 @@ zpool_find_import_impl(libzfs_handle_t *
 			    (name[1] == 0 || (name[1] == '.' && name[2] == 0)))
 				continue;
 
-			(void)snprintf(path, sizeof (path), "%s/%s",
-			    rdsk, dp->d_name);
-
-			if ((fd = open(path, O_RDONLY)) < 0)
-				continue;
-
-			/*
-			 * Ignore failed stats.  We only want regular
-			 * files, character devs and block devs.
-			 */
-			if (fstat64(fd, &statbuf) != 0 ||
-			    (!S_ISREG(statbuf.st_mode) &&
-			    !S_ISCHR(statbuf.st_mode) &&
-			    !S_ISBLK(statbuf.st_mode))) {
-				(void) close(fd);
-				continue;
-			}
-
-			if ((zpool_read_label(fd, &config)) != 0) {
-				(void) close(fd);
-				(void) no_memory(hdl);
-				goto error;
-			}
-
-			(void) close(fd);
-
-			if (config != NULL) {
+			slice = zfs_alloc(hdl, sizeof (rdsk_node_t));
+			slice->rn_name = zfs_strdup(hdl, name);
+			slice->rn_avl = &slice_cache;
+			slice->rn_dfd = dfd;
+			slice->rn_hdl = hdl;
+			slice->rn_nozpool = B_FALSE;
+			avl_add(&slice_cache, slice);
+		}
+skipdir:
+		/*
+		 * create a thread pool to do all of this in parallel;
+		 * rn_nozpool is not protected, so this is racy in that
+		 * multiple tasks could decide that the same slice can
+		 * not hold a zpool, which is benign.  Also choose
+		 * double the number of processors; we hold a lot of
+		 * locks in the kernel, so going beyond this doesn't
+		 * buy us much.
+		 */
+		t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN),
+		    0, NULL);
+		for (slice = avl_first(&slice_cache); slice;
+		    (slice = avl_walk(&slice_cache, slice,
+		    AVL_AFTER)))
+			(void) tpool_dispatch(t, zpool_open_func, slice);
+		tpool_wait(t);
+		tpool_destroy(t);
+
+		cookie = NULL;
+		while ((slice = avl_destroy_nodes(&slice_cache,
+		    &cookie)) != NULL) {
+			if (slice->rn_config != NULL && !config_failed) {
+				nvlist_t *config = slice->rn_config;
 				boolean_t matched = B_TRUE;
 
 				if (iarg->poolname != NULL) {
@@ -1052,18 +1354,26 @@ zpool_find_import_impl(libzfs_handle_t *
 				}
 				if (!matched) {
 					nvlist_free(config);
-					config = NULL;
-					continue;
+				} else {
+					/*
+					 * use the non-raw path for the config
+					 */
+					(void) strlcpy(end, slice->rn_name,
+					    pathleft);
+					if (add_config(hdl, &pools, path,
+					    config) != 0)
+						config_failed = B_TRUE;
 				}
-				/* use the non-raw path for the config */
-				(void) strlcpy(end, name, pathleft);
-				if (add_config(hdl, &pools, path, config) != 0)
-					goto error;
 			}
+			free(slice->rn_name);
+			free(slice);
 		}
+		avl_destroy(&slice_cache);
 
 		(void) closedir(dirp);
-		dirp = NULL;
+
+		if (config_failed)
+			goto error;
 	}
 
 	ret = get_configs(hdl, &pools, iarg->can_be_active);
@@ -1075,8 +1385,7 @@ error:
 			venext = ve->ve_next;
 			for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
 				cenext = ce->ce_next;
-				if (ce->ce_config)
-					nvlist_free(ce->ce_config);
+				nvlist_free(ce->ce_config);
 				free(ce);
 			}
 			free(ve);
@@ -1086,14 +1395,10 @@ error:
 
 	for (ne = pools.names; ne != NULL; ne = nenext) {
 		nenext = ne->ne_next;
-		if (ne->ne_name)
-			free(ne->ne_name);
+		free(ne->ne_name);
 		free(ne);
 	}
 
-	if (dirp)
-		(void) closedir(dirp);
-
 	return (ret);
 }
 
@@ -1182,21 +1487,15 @@ zpool_find_import_cached(libzfs_handle_t
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {
-		verify(nvpair_value_nvlist(elem, &src) == 0);
+		src = fnvpair_value_nvlist(elem);
 
-		verify(nvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME,
-		    &name) == 0);
+		name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);
 		if (poolname != NULL && strcmp(poolname, name) != 0)
 			continue;
 
-		verify(nvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID,
-		    &this_guid) == 0);
-		if (guid != 0) {
-			verify(nvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID,
-			    &this_guid) == 0);
-			if (guid != this_guid)
-				continue;
-		}
+		this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);
+		if (guid != 0 && guid != this_guid)
+			continue;
 
 		if (pool_active(hdl, name, this_guid, &active) != 0) {
 			nvlist_free(raw);
@@ -1366,6 +1665,24 @@ zpool_in_use(libzfs_handle_t *hdl, int f
 
 	switch (stateval) {
 	case POOL_STATE_EXPORTED:
+		/*
+		 * A pool with an exported state may in fact be imported
+		 * read-only, so check the in-core state to see if it's
+		 * active and imported read-only.  If it is, set
+		 * its state to active.
+		 */
+		if (pool_active(hdl, name, guid, &isactive) == 0 && isactive &&
+		    (zhp = zpool_open_canfail(hdl, name)) != NULL) {
+			if (zpool_get_prop_int(zhp, ZPOOL_PROP_READONLY, NULL))
+				stateval = POOL_STATE_ACTIVE;
+
+			/*
+			 * All we needed the zpool handle for is the
+			 * readonly prop check.
+			 */
+			zpool_close(zhp);
+		}
+
 		ret = B_TRUE;
 		break;
 
@@ -1439,9 +1756,9 @@ zpool_in_use(libzfs_handle_t *hdl, int f
 		cb.cb_type = ZPOOL_CONFIG_SPARES;
 		if (zpool_iter(hdl, find_aux, &cb) == 1) {
 			name = (char *)zpool_get_name(cb.cb_zhp);
-			ret = TRUE;
+			ret = B_TRUE;
 		} else {
-			ret = FALSE;
+			ret = B_FALSE;
 		}
 		break;
 
@@ -1455,9 +1772,9 @@ zpool_in_use(libzfs_handle_t *hdl, int f
 		cb.cb_type = ZPOOL_CONFIG_L2CACHE;
 		if (zpool_iter(hdl, find_aux, &cb) == 1) {
 			name = (char *)zpool_get_name(cb.cb_zhp);
-			ret = TRUE;
+			ret = B_TRUE;
 		} else {
-			ret = FALSE;
+			ret = B_FALSE;
 		}
 		break;
 
@@ -1483,3 +1800,18 @@ zpool_in_use(libzfs_handle_t *hdl, int f
 	*inuse = ret;
 	return (0);
 }
+
+#ifdef __NetBSD__
+/*
+ * This needs to be at the end of the file so that we can #undef ioctl
+ * without affecting anything else.
+ */
+#undef ioctl
+
+static int
+native_ioctl(int fd, unsigned long cmd, void *arg)
+{
+
+	return ioctl(fd, cmd, arg);
+}
+#endif
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_iter.c
===================================================================
RCS file: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_iter.c
diff -N src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_iter.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_iter.c	10 Oct 2016 11:14:25 -0000
@@ -0,0 +1,526 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+#include <stddef.h>
+#include <libintl.h>
+#include <libzfs.h>
+
+#include "libzfs_impl.h"
+
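+/*
+ * A brief note on what follows (editor's sketch, derived from the code
+ * itself): iterate over the clones of the given snapshot, as listed in
+ * its "clones" property.  Each clone that can be opened is passed to
+ * func; clones that fail to open are skipped.
+ */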
+int
+zfs_iter_clones(zfs_handle_t *zhp, zfs_iter_f func, void *data)
+{
+	nvlist_t *nvl = zfs_get_clones_nvl(zhp);
+	nvpair_t *pair;
+
+	if (nvl == NULL)
+		return (0);
+
+	for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(nvl, pair)) {
+		zfs_handle_t *clone = zfs_open(zhp->zfs_hdl, nvpair_name(pair),
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (clone != NULL) {
+			int err = func(clone, data);
+			if (err != 0)
+				return (err);
+		}
+	}
+	return (0);
+}
+
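+/*
+ * Helper summarized from the code below: issue one ZFS_IOC_*_LIST_NEXT
+ * ioctl, expanding the destination nvlist buffer and retrying on
+ * ENOMEM.  Returns 0 when an entry was fetched, 1 when iteration is
+ * complete (ESRCH/ENOENT), or -1 on error.
+ */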
+static int
+zfs_do_list_ioctl(zfs_handle_t *zhp, unsigned long arg, zfs_cmd_t *zc)
+{
+	int rc;
+	uint64_t	orig_cookie;
+
+	orig_cookie = zc->zc_cookie;
+top:
+	(void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name));
+	rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc);
+
+	if (rc == -1) {
+		switch (errno) {
+		case ENOMEM:
+			/* expand nvlist memory and try again */
+			if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) {
+				zcmd_free_nvlists(zc);
+				return (-1);
+			}
+			zc->zc_cookie = orig_cookie;
+			goto top;
+		/*
+		 * An errno value of ESRCH indicates normal completion.
+		 * If ENOENT is returned, then the underlying dataset
+		 * has been removed since we obtained the handle.
+		 */
+		case ESRCH:
+		case ENOENT:
+			rc = 1;
+			break;
+		default:
+			rc = zfs_standard_error(zhp->zfs_hdl, errno,
+			    dgettext(TEXT_DOMAIN,
+			    "cannot iterate filesystems"));
+			break;
+		}
+	}
+	return (rc);
+}
+
+/*
+ * Iterate over all child filesystems
+ */
+int
+zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data)
+{
+	zfs_cmd_t zc = { 0 };
+	zfs_handle_t *nzhp;
+	int ret;
+
+	if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM)
+		return (0);
+
+	if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
+		return (-1);
+
+	while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT,
+	    &zc)) == 0) {
+		/*
+		 * Silently ignore errors, as the only plausible explanation is
+		 * that the pool has since been removed.
+		 */
+		if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl,
+		    &zc)) == NULL) {
+			continue;
+		}
+
+		if ((ret = func(nzhp, data)) != 0) {
+			zcmd_free_nvlists(&zc);
+			return (ret);
+		}
+	}
+	zcmd_free_nvlists(&zc);
+	return ((ret < 0) ? ret : 0);
+}
+
+/*
+ * Iterate over all snapshots
+ */
+int
+zfs_iter_snapshots(zfs_handle_t *zhp, boolean_t simple, zfs_iter_f func,
+    void *data)
+{
+	zfs_cmd_t zc = { 0 };
+	zfs_handle_t *nzhp;
+	int ret;
+
+	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT ||
+	    zhp->zfs_type == ZFS_TYPE_BOOKMARK)
+		return (0);
+
+	zc.zc_simple = simple;
+
+	if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0)
+		return (-1);
+	while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT,
+	    &zc)) == 0) {
+
+		if (simple)
+			nzhp = make_dataset_simple_handle_zc(zhp, &zc);
+		else
+			nzhp = make_dataset_handle_zc(zhp->zfs_hdl, &zc);
+		if (nzhp == NULL)
+			continue;
+
+		if ((ret = func(nzhp, data)) != 0) {
+			zcmd_free_nvlists(&zc);
+			return (ret);
+		}
+	}
+	zcmd_free_nvlists(&zc);
+	return ((ret < 0) ? ret : 0);
+}
+
+/*
+ * Iterate over all bookmarks
+ */
+int
+zfs_iter_bookmarks(zfs_handle_t *zhp, zfs_iter_f func, void *data)
+{
+	zfs_handle_t *nzhp;
+	nvlist_t *props = NULL;
+	nvlist_t *bmarks = NULL;
+	int err;
+
+	if ((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK)) != 0)
+		return (0);
+
+	/* Set up the requested properties nvlist. */
+	props = fnvlist_alloc();
+	fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_GUID));
+	fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATETXG));
+	fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATION));
+
+	if ((err = lzc_get_bookmarks(zhp->zfs_name, props, &bmarks)) != 0)
+		goto out;
+
+	for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) {
+		char name[ZFS_MAX_DATASET_NAME_LEN];
+		char *bmark_name;
+		nvlist_t *bmark_props;
+
+		bmark_name = nvpair_name(pair);
+		bmark_props = fnvpair_value_nvlist(pair);
+
+		(void) snprintf(name, sizeof (name), "%s#%s", zhp->zfs_name,
+		    bmark_name);
+
+		nzhp = make_bookmark_handle(zhp, name, bmark_props);
+		if (nzhp == NULL)
+			continue;
+
+		if ((err = func(nzhp, data)) != 0)
+			goto out;
+	}
+
+out:
+	fnvlist_free(props);
+	fnvlist_free(bmarks);
+
+	return (err);
+}
+
+/*
+ * Routines for dealing with the sorted snapshot functionality
+ */
+typedef struct zfs_node {
+	zfs_handle_t	*zn_handle;
+	avl_node_t	zn_avlnode;
+} zfs_node_t;
+
+static int
+zfs_sort_snaps(zfs_handle_t *zhp, void *data)
+{
+	avl_tree_t *avl = data;
+	zfs_node_t *node;
+	zfs_node_t search;
+
+	search.zn_handle = zhp;
+	node = avl_find(avl, &search, NULL);
+	if (node) {
+		/*
+		 * If this snapshot was renamed while we were creating the
+		 * AVL tree, it's possible that we already inserted it under
+		 * its old name. Remove the old handle before adding the new
+		 * one.
+		 */
+		zfs_close(node->zn_handle);
+		avl_remove(avl, node);
+		free(node);
+	}
+
+	node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t));
+	node->zn_handle = zhp;
+	avl_add(avl, node);
+
+	return (0);
+}
+
+static int
+zfs_snapshot_compare(const void *larg, const void *rarg)
+{
+	zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle;
+	zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle;
+	uint64_t lcreate, rcreate;
+
+	/*
+	 * Sort them according to creation time.  We use the hidden
+	 * CREATETXG property to get an absolute ordering of snapshots.
+	 */
+	lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG);
+	rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG);
+
+	if (lcreate < rcreate)
+		return (-1);
+	else if (lcreate > rcreate)
+		return (+1);
+	else
+		return (0);
+}
+
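+/*
+ * Iterate snapshots in creation order: collect them into an AVL tree
+ * keyed on ZFS_PROP_CREATETXG, then run the callback over the sorted
+ * tree.
+ */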
+int
+zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data)
+{
+	int ret = 0;
+	zfs_node_t *node;
+	avl_tree_t avl;
+	void *cookie = NULL;
+
+	avl_create(&avl, zfs_snapshot_compare,
+	    sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode));
+
+	ret = zfs_iter_snapshots(zhp, B_FALSE, zfs_sort_snaps, &avl);
+
+	for (node = avl_first(&avl); node != NULL; node = AVL_NEXT(&avl, node))
+		ret |= callback(node->zn_handle, data);
+
+	while ((node = avl_destroy_nodes(&avl, &cookie)) != NULL)
+		free(node);
+
+	avl_destroy(&avl);
+
+	return (ret);
+}
+
+typedef struct {
+	char *ssa_first;
+	char *ssa_last;
+	boolean_t ssa_seenfirst;
+	boolean_t ssa_seenlast;
+	zfs_iter_f ssa_func;
+	void *ssa_arg;
+} snapspec_arg_t;
+
+static int
+snapspec_cb(zfs_handle_t *zhp, void *arg)
+{
+	snapspec_arg_t *ssa = arg;
+	char *shortsnapname;
+	int err = 0;
+
+	if (ssa->ssa_seenlast)
+		return (0);
+	shortsnapname = zfs_strdup(zhp->zfs_hdl,
+	    strchr(zfs_get_name(zhp), '@') + 1);
+
+	if (!ssa->ssa_seenfirst && strcmp(shortsnapname, ssa->ssa_first) == 0)
+		ssa->ssa_seenfirst = B_TRUE;
+
+	if (ssa->ssa_seenfirst) {
+		err = ssa->ssa_func(zhp, ssa->ssa_arg);
+	} else {
+		zfs_close(zhp);
+	}
+
+	if (strcmp(shortsnapname, ssa->ssa_last) == 0)
+		ssa->ssa_seenlast = B_TRUE;
+	free(shortsnapname);
+
+	return (err);
+}
+
+/*
+ * spec is a string like "A,B%C,D"
+ *
+ * <snaps>, where <snaps> can be:
+ *      <snap>          (single snapshot)
+ *      <snap>%<snap>   (range of snapshots, inclusive)
+ *      %<snap>         (range of snapshots, starting with earliest)
+ *      <snap>%         (range of snapshots, ending with last)
+ *      %               (all snapshots)
+ *      <snaps>[,...]   (comma separated list of the above)
+ *
+ * If a snapshot cannot be opened, continue trying to open the others, but
+ * return ENOENT at the end.
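+ *
+ * For example, with snapshots A, B, C, D taken in that order, the spec
+ * "A,B%C,D" invokes func on A, then on each of B through C, then on D.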
+ */
+int
+zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig,
+    zfs_iter_f func, void *arg)
+{
+	char *buf, *comma_separated, *cp;
+	int err = 0;
+	int ret = 0;
+
+	buf = zfs_strdup(fs_zhp->zfs_hdl, spec_orig);
+	cp = buf;
+
+	while ((comma_separated = strsep(&cp, ",")) != NULL) {
+		char *pct = strchr(comma_separated, '%');
+		if (pct != NULL) {
+			snapspec_arg_t ssa = { 0 };
+			ssa.ssa_func = func;
+			ssa.ssa_arg = arg;
+
+			if (pct == comma_separated)
+				ssa.ssa_seenfirst = B_TRUE;
+			else
+				ssa.ssa_first = comma_separated;
+			*pct = '\0';
+			ssa.ssa_last = pct + 1;
+
+			/*
+			 * If there is a lastname specified, make sure it
+			 * exists.
+			 */
+			if (ssa.ssa_last[0] != '\0') {
+				char snapname[ZFS_MAX_DATASET_NAME_LEN];
+				(void) snprintf(snapname, sizeof (snapname),
+				    "%s@%s", zfs_get_name(fs_zhp),
+				    ssa.ssa_last);
+				if (!zfs_dataset_exists(fs_zhp->zfs_hdl,
+				    snapname, ZFS_TYPE_SNAPSHOT)) {
+					ret = ENOENT;
+					continue;
+				}
+			}
+
+			err = zfs_iter_snapshots_sorted(fs_zhp,
+			    snapspec_cb, &ssa);
+			if (ret == 0)
+				ret = err;
+			if (ret == 0 && (!ssa.ssa_seenfirst ||
+			    (ssa.ssa_last[0] != '\0' && !ssa.ssa_seenlast))) {
+				ret = ENOENT;
+			}
+		} else {
+			char snapname[ZFS_MAX_DATASET_NAME_LEN];
+			zfs_handle_t *snap_zhp;
+			(void) snprintf(snapname, sizeof (snapname), "%s@%s",
+			    zfs_get_name(fs_zhp), comma_separated);
+			snap_zhp = make_dataset_handle(fs_zhp->zfs_hdl,
+			    snapname);
+			if (snap_zhp == NULL) {
+				ret = ENOENT;
+				continue;
+			}
+			err = func(snap_zhp, arg);
+			if (ret == 0)
+				ret = err;
+		}
+	}
+
+	free(buf);
+	return (ret);
+}
+
+/*
+ * Iterate over all children, snapshots and filesystems
+ */
+int
+zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data)
+{
+	int ret;
+
+	if ((ret = zfs_iter_filesystems(zhp, func, data)) != 0)
+		return (ret);
+
+	return (zfs_iter_snapshots(zhp, B_FALSE, func, data));
+}
+
+
+typedef struct iter_stack_frame {
+	struct iter_stack_frame *next;
+	zfs_handle_t *zhp;
+} iter_stack_frame_t;
+
+typedef struct iter_dependents_arg {
+	boolean_t first;
+	boolean_t allowrecursion;
+	iter_stack_frame_t *stack;
+	zfs_iter_f func;
+	void *data;
+} iter_dependents_arg_t;
+
+static int
+iter_dependents_cb(zfs_handle_t *zhp, void *arg)
+{
+	iter_dependents_arg_t *ida = arg;
+	int err = 0;
+	boolean_t first = ida->first;
+	ida->first = B_FALSE;
+
+	if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) {
+		err = zfs_iter_clones(zhp, iter_dependents_cb, ida);
+	} else if (zhp->zfs_type != ZFS_TYPE_BOOKMARK) {
+		iter_stack_frame_t isf;
+		iter_stack_frame_t *f;
+
+		/*
+		 * check if there is a cycle by seeing if this fs is already
+		 * on the stack.
+		 */
+		for (f = ida->stack; f != NULL; f = f->next) {
+			if (f->zhp->zfs_dmustats.dds_guid ==
+			    zhp->zfs_dmustats.dds_guid) {
+				if (ida->allowrecursion) {
+					zfs_close(zhp);
+					return (0);
+				} else {
+					zfs_error_aux(zhp->zfs_hdl,
+					    dgettext(TEXT_DOMAIN,
+					    "recursive dependency at '%s'"),
+					    zfs_get_name(zhp));
+					err = zfs_error(zhp->zfs_hdl,
+					    EZFS_RECURSIVE,
+					    dgettext(TEXT_DOMAIN,
+					    "cannot determine dependent "
+					    "datasets"));
+					zfs_close(zhp);
+					return (err);
+				}
+			}
+		}
+
+		isf.zhp = zhp;
+		isf.next = ida->stack;
+		ida->stack = &isf;
+		err = zfs_iter_filesystems(zhp, iter_dependents_cb, ida);
+		if (err == 0) {
+			err = zfs_iter_snapshots(zhp, B_FALSE,
+			    iter_dependents_cb, ida);
+		}
+		ida->stack = isf.next;
+	}
+
+	if (!first && err == 0)
+		err = ida->func(zhp, ida->data);
+	else
+		zfs_close(zhp);
+
+	return (err);
+}
+
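+/*
+ * Iterate over all dependents of zhp: child filesystems, snapshots,
+ * and the clones reachable through those snapshots.  func is not
+ * invoked on zhp itself; the duplicated handle is closed when
+ * iteration finishes.
+ */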
+int
+zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion,
+    zfs_iter_f func, void *data)
+{
+	iter_dependents_arg_t ida;
+	ida.allowrecursion = allowrecursion;
+	ida.stack = NULL;
+	ida.func = func;
+	ida.data = data;
+	ida.first = B_TRUE;
+	return (iter_dependents_cb(zfs_handle_dup(zhp), &ida));
+}
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c,v
retrieving revision 1.4
diff -u -p -r1.4 libzfs_mount.c
--- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c	14 Dec 2010 01:22:24 -0000	1.4
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c	20 Apr 2017 22:40:09 -0000
@@ -20,8 +20,9 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  */
 
 /*
@@ -44,17 +45,14 @@
  *
  * 	zfs_is_shared_nfs()
  * 	zfs_is_shared_smb()
- * 	zfs_is_shared_iscsi()
  * 	zfs_share_proto()
  * 	zfs_shareall();
- * 	zfs_share_iscsi()
  * 	zfs_unshare_nfs()
  * 	zfs_unshare_smb()
  * 	zfs_unshareall_nfs()
  *	zfs_unshareall_smb()
  *	zfs_unshareall()
  *	zfs_unshareall_bypath()
- * 	zfs_unshare_iscsi()
  *
  * The following functions are available for pool consumers, and will
  * mount/unmount and share/unshare all datasets within pool:
@@ -66,6 +64,7 @@
 #include <dirent.h>
 #include <dlfcn.h>
 #include <errno.h>
+#include <fcntl.h>
 #include <libgen.h>
 #include <libintl.h>
 #include <stdio.h>
@@ -76,24 +75,19 @@
 #include <sys/mntent.h>
 #include <sys/mount.h>
 #include <sys/stat.h>
+#include <sys/statvfs.h>
 
 #include <libzfs.h>
 
 #include "libzfs_impl.h"
 
 #include <libshare.h>
-#include <sys/systeminfo.h>
 #define	MAXISALEN	257	/* based on sysinfo(2) man page */
 
 static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
 zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
     zfs_share_proto_t);
 
-static int (*iscsitgt_zfs_share)(const char *);
-static int (*iscsitgt_zfs_unshare)(const char *);
-static int (*iscsitgt_zfs_is_shared)(const char *);
-static int (*iscsitgt_svc_online)();
-
 /*
  * The share protocols table must be in the same order as the zfs_share_prot_t
  * enum in libzfs_impl.h
@@ -125,29 +119,6 @@ zfs_share_proto_t share_all_proto[] = {
 	PROTO_END
 };
 
-#pragma init(zfs_iscsi_init)
-static void
-zfs_iscsi_init(void)
-{
-	void *libiscsitgt;
-
-	if ((libiscsitgt = dlopen("/lib/libiscsitgt.so.1",
-	    RTLD_LAZY | RTLD_GLOBAL)) == NULL ||
-	    (iscsitgt_zfs_share = (int (*)(const char *))dlsym(libiscsitgt,
-	    "iscsitgt_zfs_share")) == NULL ||
-	    (iscsitgt_zfs_unshare = (int (*)(const char *))dlsym(libiscsitgt,
-	    "iscsitgt_zfs_unshare")) == NULL ||
-	    (iscsitgt_zfs_is_shared = (int (*)(const char *))dlsym(libiscsitgt,
-	    "iscsitgt_zfs_is_shared")) == NULL ||
-	    (iscsitgt_svc_online = (int (*)(const char *))dlsym(libiscsitgt,
-	    "iscsitgt_svc_online")) == NULL) {
-		iscsitgt_zfs_share = NULL;
-		iscsitgt_zfs_unshare = NULL;
-		iscsitgt_zfs_is_shared = NULL;
-		iscsitgt_svc_online = NULL;
-	}
-}
-
 /*
  * Search the sharetab for the given mountpoint and protocol, returning
  * a zfs_share_type_t value.
@@ -168,9 +139,10 @@ is_shared(libzfs_handle_t *hdl, const ch
 		/* the mountpoint is the first entry on each line */
 		if ((tab = strchr(buf, '\t')) == NULL)
 			continue;
-#if defined(sun)
+
 		*tab = '\0';
 		if (strcmp(buf, mountpoint) == 0) {
+#ifdef illumos
 			/*
 			 * the protocol field is the third field
 			 * skip over second field
@@ -193,17 +165,17 @@ is_shared(libzfs_handle_t *hdl, const ch
 					return (0);
 				}
 			}
-		}
 #else
 			if (proto == PROTO_NFS)
 				return (SHARED_NFS);
 #endif
-
+		}
 	}
 
 	return (SHARED_NOT_SHARED);
 }
 
+#ifdef illumos
 /*
  * Returns true if the specified directory is empty.  If we can't open the
  * directory at all, return true so that the mount can fail with a more
@@ -231,6 +203,7 @@ dir_is_empty(const char *dirname)
 	(void) closedir(dirp);
 	return (B_TRUE);
 }
+#endif
 
 /*
  * Checks to see if the mount is active.  If the filesystem is mounted, we fill
@@ -265,7 +238,7 @@ static boolean_t
 zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen,
     zprop_source_t *source)
 {
-	char sourceloc[ZFS_MAXNAMELEN];
+	char sourceloc[MAXNAMELEN];
 	zprop_source_t sourcetype;
 
 	if (!zfs_prop_valid_for_type(ZFS_PROP_MOUNTPOINT, zhp->zfs_type))
@@ -307,6 +280,12 @@ zfs_mount(zfs_handle_t *zhp, const char 
 	else
 		(void) strlcpy(mntopts, options, sizeof (mntopts));
 
+	/*
+	 * If the pool is imported read-only then all mounts must be read-only
+	 */
+	if (zpool_get_prop_int(zhp->zpool_hdl, ZPOOL_PROP_READONLY, NULL))
+		flags |= MS_RDONLY;
+
 	if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL))
 		return (0);
 
@@ -321,6 +300,7 @@ zfs_mount(zfs_handle_t *zhp, const char 
 		}
 	}
 
+#ifdef illumos	/* FreeBSD: overlay mounts are not checked. */
 	/*
 	 * Determine if the mountpoint is empty.  If so, refuse to perform the
 	 * mount.  We don't perform this check if MS_OVERLAY is specified, which
@@ -335,9 +315,10 @@ zfs_mount(zfs_handle_t *zhp, const char 
 		return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED,
 		    dgettext(TEXT_DOMAIN, "cannot mount '%s'"), mountpoint));
 	}
+#endif
 
 	/* perform the mount */
-	if (mount(zfs_get_name(zhp), mountpoint, MS_OPTIONSTR | flags,
+	if (zmount(zfs_get_name(zhp), mountpoint, flags,
 	    MNTTYPE_ZFS, NULL, 0, mntopts, sizeof (mntopts)) != 0) {
 		/*
 		 * Generic errors are nasty, but there are just way too many
@@ -350,6 +331,18 @@ zfs_mount(zfs_handle_t *zhp, const char 
 		} else if (errno == EPERM) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "Insufficient privileges"));
+		} else if (errno == ENOTSUP) {
+			char buf[256];
+			int spa_version;
+
+			VERIFY(zfs_spa_version(zhp, &spa_version) == 0);
+			(void) snprintf(buf, sizeof (buf),
+			    dgettext(TEXT_DOMAIN, "Can't mount a version %lld "
+			    "file system on a version %d pool. Pool must be"
+			    " upgraded to mount this file system."),
+			    (u_longlong_t)zfs_prop_get_int(zhp,
+			    ZFS_PROP_VERSION), spa_version);
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, buf));
 		} else {
 			zfs_error_aux(hdl, strerror(errno));
 		}
@@ -450,7 +443,7 @@ zfs_is_shared(zfs_handle_t *zhp)
 	zfs_share_proto_t *curr_proto;
 
 	if (ZFS_IS_VOLUME(zhp))
-		return (zfs_is_shared_iscsi(zhp));
+		return (B_FALSE);
 
 	for (curr_proto = share_all_proto; *curr_proto != PROTO_END;
 	    curr_proto++)
@@ -462,18 +455,14 @@ zfs_is_shared(zfs_handle_t *zhp)
 int
 zfs_share(zfs_handle_t *zhp)
 {
-	if (ZFS_IS_VOLUME(zhp))
-		return (zfs_share_iscsi(zhp));
-
+	assert(!ZFS_IS_VOLUME(zhp));
 	return (zfs_share_proto(zhp, share_all_proto));
 }
 
 int
 zfs_unshare(zfs_handle_t *zhp)
 {
-	if (ZFS_IS_VOLUME(zhp))
-		return (zfs_unshare_iscsi(zhp));
-
+	assert(!ZFS_IS_VOLUME(zhp));
 	return (zfs_unshareall(zhp));
 }
 
@@ -489,7 +478,8 @@ zfs_is_shared_proto(zfs_handle_t *zhp, c
 	if (!zfs_is_mounted(zhp, &mountpoint))
 		return (SHARED_NOT_SHARED);
 
-	if (rc = is_shared(zhp->zfs_hdl, mountpoint, proto)) {
+	if ((rc = is_shared(zhp->zfs_hdl, mountpoint, proto))
+	    != SHARED_NOT_SHARED) {
 		if (where != NULL)
 			*where = mountpoint;
 		else
@@ -520,7 +510,8 @@ zfs_is_shared_smb(zfs_handle_t *zhp, cha
  * wrapper functions that check to see that the pointers to functions
  * initialized in _zfs_init_libshare() are actually present.
  */
-#ifdef PORT_SOLARIS
+
+#ifdef illumos
 static sa_handle_t (*_sa_init)(int);
 static void (*_sa_fini)(sa_handle_t);
 static sa_share_t (*_sa_find_share)(sa_handle_t, char *);
@@ -534,6 +525,7 @@ static int (*_sa_zfs_process_share)(sa_h
     char *, char *, zprop_source_t, char *, char *, char *);
 static void (*_sa_update_sharetab_ts)(sa_handle_t);
 #endif
+
 /*
  * _zfs_init_libshare()
  *
@@ -546,7 +538,7 @@ static void (*_sa_update_sharetab_ts)(sa
 static void
 _zfs_init_libshare(void)
 {
-#ifdef PORT_SOLARIS
+#ifdef illumos
 	void *libshare;
 	char path[MAXPATHLEN];
 	char isa[MAXISALEN];
@@ -616,7 +608,8 @@ int
 zfs_init_libshare(libzfs_handle_t *zhandle, int service)
 {
 	int ret = SA_OK;
-#ifdef PORT_SOLARIS
+
+#ifdef illumos
 	if (_sa_init == NULL)
 		ret = SA_CONFIG_ERR;
 
@@ -643,6 +636,7 @@ zfs_init_libshare(libzfs_handle_t *zhand
 	if (ret == SA_OK && zhandle->libzfs_sharehdl == NULL)
 		ret = SA_NO_MEMORY;
 #endif
+
 	return (ret);
 }
 
@@ -656,7 +650,7 @@ void
 zfs_uninit_libshare(libzfs_handle_t *zhandle)
 {
 	if (zhandle != NULL && zhandle->libzfs_sharehdl != NULL) {
-#ifdef PORT_SOLARIS
+#ifdef illumos
 		if (_sa_fini != NULL)
 			_sa_fini(zhandle->libzfs_sharehdl);
 #endif
@@ -673,7 +667,7 @@ zfs_uninit_libshare(libzfs_handle_t *zha
 int
 zfs_parse_options(char *options, zfs_share_proto_t proto)
 {
-#ifdef PORT_SOLARIS
+#ifdef illumos
 	if (_sa_parse_legacy_options != NULL) {
 		return (_sa_parse_legacy_options(NULL, options,
 		    proto_table[proto].p_name));
@@ -684,7 +678,7 @@ zfs_parse_options(char *options, zfs_sha
 #endif
 }
 
-#ifdef PORT_SOLARIS
+#ifdef illumos
 /*
  * zfs_sa_find_share(handle, path)
  *
@@ -726,7 +720,8 @@ zfs_sa_disable_share(sa_share_t share, c
 		return (_sa_disable_share(share, proto));
 	return (SA_CONFIG_ERR);
 }
-#endif
+#endif	/* illumos */
+
 /*
  * Share the given filesystem according to the options in the specified
  * protocol specific properties (sharenfs, sharesmb).  We rely
@@ -739,22 +734,13 @@ zfs_share_proto(zfs_handle_t *zhp, zfs_s
 	char shareopts[ZFS_MAXPROPLEN];
 	char sourcestr[ZFS_MAXPROPLEN];
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
-	sa_share_t share;
 	zfs_share_proto_t *curr_proto;
 	zprop_source_t sourcetype;
 	int error, ret;
 
 	if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL))
 		return (0);
-#ifdef PORT_SOLARIS
-	if ((ret = zfs_init_libshare(hdl, SA_INIT_SHARE_API)) != SA_OK) {
-		(void) zfs_error_fmt(hdl, EZFS_SHARENFSFAILED,
-		    dgettext(TEXT_DOMAIN, "cannot share '%s': %s"),
-		    zfs_get_name(zhp), _sa_errorstr != NULL ?
-		    _sa_errorstr(ret) : "");
-		return (-1);
-	}
-#endif
+
 	for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) {
 		/*
 		 * Return success if there are no share options.
@@ -765,6 +751,17 @@ zfs_share_proto(zfs_handle_t *zhp, zfs_s
 		    strcmp(shareopts, "off") == 0)
 			continue;
 
+#ifdef illumos
+		ret = zfs_init_libshare(hdl, SA_INIT_SHARE_API);
+		if (ret != SA_OK) {
+			(void) zfs_error_fmt(hdl, EZFS_SHARENFSFAILED,
+			    dgettext(TEXT_DOMAIN, "cannot share '%s': %s"),
+			    zfs_get_name(zhp), _sa_errorstr != NULL ?
+			    _sa_errorstr(ret) : "");
+			return (-1);
+		}
+#endif
+
 		/*
 		 * If the 'zoned' property is set, then zfs_is_mountable()
 		 * will have already bailed out if we are in the global zone.
@@ -774,7 +771,7 @@ zfs_share_proto(zfs_handle_t *zhp, zfs_s
 		if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED))
 			continue;
 
-#ifdef PORT_SOLARIS
+#ifdef illumos
 		share = zfs_sa_find_share(hdl->libzfs_sharehdl, mountpoint);
 		if (share == NULL) {
 			/*
@@ -811,14 +808,28 @@ zfs_share_proto(zfs_handle_t *zhp, zfs_s
 				    zfs_get_name(zhp));
 				return (-1);
 			}
-		} else {
+		} else
+#else
+		if (*curr_proto != PROTO_NFS) {
+			fprintf(stderr, "Unsupported share protocol: %d.\n",
+			    *curr_proto);
+			continue;
+		}
+
+		if (strcmp(shareopts, "on") == 0)
+			error = fsshare(ZFS_EXPORTS_PATH, mountpoint, "");
+		else
+			error = fsshare(ZFS_EXPORTS_PATH, mountpoint, shareopts);
+		if (error != 0)
+#endif
+		{
 			(void) zfs_error_fmt(hdl,
 			    proto_table[*curr_proto].p_share_err,
 			    dgettext(TEXT_DOMAIN, "cannot share '%s'"),
 			    zfs_get_name(zhp));
 			return (-1);
 		}
-#endif
+
 	}
 	return (0);
 }
@@ -849,7 +860,7 @@ static int
 unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint,
     zfs_share_proto_t proto)
 {
-#ifdef PORT_SOLARIS
+#ifdef illumos
 	sa_share_t share;
 	int err;
 	char *mntpt;
@@ -883,6 +894,23 @@ unshare_one(libzfs_handle_t *hdl, const 
 		    dgettext(TEXT_DOMAIN, "cannot unshare '%s': not found"),
 		    name));
 	}
+#else
+	char buf[MAXPATHLEN];
+	FILE *fp;
+	int err;
+
+	if (proto != PROTO_NFS) {
+		fprintf(stderr, "No SMB support in FreeBSD yet.\n");
+		return (EOPNOTSUPP);
+	}
+
+	err = fsunshare(ZFS_EXPORTS_PATH, mountpoint);
+	if (err != 0) {
+		zfs_error_aux(hdl, "%s", strerror(err));
+		return (zfs_error_fmt(hdl, EZFS_UNSHARENFSFAILED,
+		    dgettext(TEXT_DOMAIN,
+		    "cannot unshare '%s'"), name));
+	}
 #endif
 	return (0);
 }
@@ -1016,94 +1044,29 @@ remove_mountpoint(zfs_handle_t *zhp)
 	}
 }
 
-boolean_t
-zfs_is_shared_iscsi(zfs_handle_t *zhp)
-{
-
-	/*
-	 * If iscsi deamon isn't running then we aren't shared
-	 */
-	if (iscsitgt_svc_online && iscsitgt_svc_online() == 1)
-		return (B_FALSE);
-	else
-		return (iscsitgt_zfs_is_shared != NULL &&
-		    iscsitgt_zfs_is_shared(zhp->zfs_name) != 0);
-}
-
-int
-zfs_share_iscsi(zfs_handle_t *zhp)
-{
-	char shareopts[ZFS_MAXPROPLEN];
-	const char *dataset = zhp->zfs_name;
-	libzfs_handle_t *hdl = zhp->zfs_hdl;
-
-	/*
-	 * Return success if there are no share options.
-	 */
-	if (zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, shareopts,
-	    sizeof (shareopts), NULL, NULL, 0, B_FALSE) != 0 ||
-	    strcmp(shareopts, "off") == 0)
-		return (0);
-#ifdef PORT_ISCSI /* NetBSD do not support zfssharing with iscsi, yet */
-	if (iscsitgt_zfs_share == NULL || iscsitgt_zfs_share(dataset) != 0) {
-		int error = EZFS_SHAREISCSIFAILED;
-
-		/*
-		 * If service isn't availabele and EPERM was
-		 * returned then use special error.
-		 */
-		if (iscsitgt_svc_online && errno == EPERM &&
-		    (iscsitgt_svc_online() != 0))
-			error = EZFS_ISCSISVCUNAVAIL;
-
-		return (zfs_error_fmt(hdl, error,
-		    dgettext(TEXT_DOMAIN, "cannot share '%s'"), dataset));
-	}
-#endif
-	return (0);
-}
-
-int
-zfs_unshare_iscsi(zfs_handle_t *zhp)
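+/*
+ * Append a handle to the callback array, doubling the allocation
+ * (starting at 64 slots) whenever it fills up.
+ */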
+void
+libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
 {
-	const char *dataset = zfs_get_name(zhp);
-	libzfs_handle_t *hdl = zhp->zfs_hdl;
-
-#ifdef PORT_ISCSI /* NetBSD do not support zfssharing with iscsi, yet */
-	/*
-	 * Return if the volume is not shared
-	 */
-	if (zfs_is_shared_iscsi(zhp) != SHARED_ISCSI)
-		return (0);
+	if (cbp->cb_alloc == cbp->cb_used) {
+		size_t newsz;
+		void *ptr;
 
-	/*
-	 * If this fails with ENODEV it indicates that zvol wasn't shared so
-	 * we should return success in that case.
-	 */
-	if (iscsitgt_zfs_unshare == NULL ||
-	    (iscsitgt_zfs_unshare(dataset) != 0 && errno != ENODEV)) {
-		if (errno == EPERM)
-			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-			    "Insufficient privileges to unshare iscsi"));
-		return (zfs_error_fmt(hdl, EZFS_UNSHAREISCSIFAILED,
-		    dgettext(TEXT_DOMAIN, "cannot unshare '%s'"), dataset));
+		newsz = cbp->cb_alloc ? cbp->cb_alloc * 2 : 64;
+		ptr = zfs_realloc(zhp->zfs_hdl,
+		    cbp->cb_handles, cbp->cb_alloc * sizeof (void *),
+		    newsz * sizeof (void *));
+		cbp->cb_handles = ptr;
+		cbp->cb_alloc = newsz;
 	}
-#endif
-	return (0);
+	cbp->cb_handles[cbp->cb_used++] = zhp;
 }
 
-typedef struct mount_cbdata {
-	zfs_handle_t	**cb_datasets;
-	int 		cb_used;
-	int		cb_alloc;
-} mount_cbdata_t;
-
 static int
 mount_cb(zfs_handle_t *zhp, void *data)
 {
-	mount_cbdata_t *cbp = data;
+	get_all_cb_t *cbp = data;
 
-	if (!(zfs_get_type(zhp) & (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME))) {
+	if (!(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM)) {
 		zfs_close(zhp);
 		return (0);
 	}
@@ -1113,25 +1076,27 @@ mount_cb(zfs_handle_t *zhp, void *data)
 		return (0);
 	}
 
-	if (cbp->cb_alloc == cbp->cb_used) {
-		void *ptr;
-
-		if ((ptr = zfs_realloc(zhp->zfs_hdl,
-		    cbp->cb_datasets, cbp->cb_alloc * sizeof (void *),
-		    cbp->cb_alloc * 2 * sizeof (void *))) == NULL)
-			return (-1);
-		cbp->cb_datasets = ptr;
-
-		cbp->cb_alloc *= 2;
+	/*
+	 * If this filesystem is inconsistent and has a receive resume
+	 * token, we can not mount it.
+	 */
+	if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) &&
+	    zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+	    NULL, 0, NULL, NULL, 0, B_TRUE) == 0) {
+		zfs_close(zhp);
+		return (0);
 	}
 
-	cbp->cb_datasets[cbp->cb_used++] = zhp;
-
-	return (zfs_iter_filesystems(zhp, mount_cb, cbp));
+	libzfs_add_handle(cbp, zhp);
+	if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
+		zfs_close(zhp);
+		return (-1);
+	}
+	return (0);
 }
 
-static int
-dataset_cmp(const void *a, const void *b)
+int
+libzfs_dataset_cmp(const void *a, const void *b)
 {
 	zfs_handle_t **za = (zfs_handle_t **)a;
 	zfs_handle_t **zb = (zfs_handle_t **)b;
@@ -1169,7 +1134,7 @@ dataset_cmp(const void *a, const void *b
 int
 zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
 {
-	mount_cbdata_t cb = { 0 };
+	get_all_cb_t cb = { 0 };
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
 	zfs_handle_t *zfsp;
 	int i, ret = -1;
@@ -1178,23 +1143,17 @@ zpool_enable_datasets(zpool_handle_t *zh
 	/*
 	 * Gather all non-snap datasets within the pool.
 	 */
-	if ((cb.cb_datasets = zfs_alloc(hdl, 4 * sizeof (void *))) == NULL)
-		return (-1);
-	cb.cb_alloc = 4;
-
 	if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) == NULL)
 		goto out;
 
-	cb.cb_datasets[0] = zfsp;
-	cb.cb_used = 1;
-
+	libzfs_add_handle(&cb, zfsp);
 	if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0)
 		goto out;
-
 	/*
 	 * Sort the datasets by mountpoint.
 	 */
-	qsort(cb.cb_datasets, cb.cb_used, sizeof (void *), dataset_cmp);
+	qsort(cb.cb_handles, cb.cb_used, sizeof (void *),
+	    libzfs_dataset_cmp);
 
 	/*
 	 * And mount all the datasets, keeping track of which ones
@@ -1206,7 +1165,7 @@ zpool_enable_datasets(zpool_handle_t *zh
 
 	ret = 0;
 	for (i = 0; i < cb.cb_used; i++) {
-		if (zfs_mount(cb.cb_datasets[i], mntopts, flags) != 0)
+		if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0)
 			ret = -1;
 		else
 			good[i] = 1;
@@ -1219,7 +1178,7 @@ zpool_enable_datasets(zpool_handle_t *zh
 	 * zfs_alloc is supposed to exit if memory isn't available.
 	 */
 	for (i = 0; i < cb.cb_used; i++) {
-		if (good[i] && zfs_share(cb.cb_datasets[i]) != 0)
+		if (good[i] && zfs_share(cb.cb_handles[i]) != 0)
 			ret = -1;
 	}
 
@@ -1227,27 +1186,12 @@ zpool_enable_datasets(zpool_handle_t *zh
 
 out:
 	for (i = 0; i < cb.cb_used; i++)
-		zfs_close(cb.cb_datasets[i]);
-	free(cb.cb_datasets);
+		zfs_close(cb.cb_handles[i]);
+	free(cb.cb_handles);
 
 	return (ret);
 }
 
-/*ARGSUSED1*/
-static int
-zvol_cb(zfs_handle_t *zhp, void *unused)
-{
-	int error = 0;
-
-	if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM)
-		(void) zfs_iter_children(zhp, zvol_cb, NULL);
-	if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME)
-		error = zfs_unshare_iscsi(zhp);
-	zfs_close(zhp);
-
-	return (error);
-}
-
 static int
 mountpoint_compare(const void *a, const void *b)
 {
@@ -1270,48 +1214,34 @@ int
 zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
 {
 	int used, alloc;
-	struct statvfs *sfs;
-	int n;
+	struct mnttab entry;
 	size_t namelen;
 	char **mountpoints = NULL;
-	zfs_handle_t *zfp;
 	zfs_handle_t **datasets = NULL;
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
 	int i;
 	int ret = -1;
 	int flags = (force ? MS_FORCE : 0);
 
-	/*
-	 * First unshare all zvols.
-	 */
-	zfp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
-	    ZFS_TYPE_FILESYSTEM);
-	if (zfp != NULL) {
-		(void) zfs_iter_children(zfp, zvol_cb, NULL);
-		zfs_close(zfp);
-	}
-
 	namelen = strlen(zhp->zpool_name);
 
 	rewind(hdl->libzfs_mnttab);
 	used = alloc = 0;
-	if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) {
-		fprintf(stderr, "getmntinfo(): %s\n", strerror(errno));
-		return (-1);
-	}
-	for (i = 0; i < n; i++) {
+	while (getmntent(hdl->libzfs_mnttab, &entry) == 0) {
 		/*
 		 * Ignore non-ZFS entries.
 		 */
-		if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0)
+		if (entry.mnt_fstype == NULL ||
+		    strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0)
 			continue;
 
 		/*
 		 * Ignore filesystems not within this pool.
 		 */
-		if (strncmp(sfs[i].f_mntfromname, zhp->zpool_name, namelen) != 0 ||
-		    (sfs[i].f_mntfromname[namelen] != '/' &&
-		    sfs[i].f_mntfromname[namelen] != '\0'))
+		if (entry.mnt_mountp == NULL ||
+		    strncmp(entry.mnt_special, zhp->zpool_name, namelen) != 0 ||
+		    (entry.mnt_special[namelen] != '/' &&
+		    entry.mnt_special[namelen] != '\0'))
 			continue;
 
 		/*
@@ -1349,7 +1279,7 @@ zpool_disable_datasets(zpool_handle_t *z
 		}
 
 		if ((mountpoints[used] = zfs_strdup(hdl,
-		    sfs[i].f_mntonname)) == NULL)
+		    entry.mnt_mountp)) == NULL)
 			goto out;
 
 		/*
@@ -1357,7 +1287,7 @@ zpool_disable_datasets(zpool_handle_t *z
 		 * is only used to determine if we need to remove the underlying
 		 * mountpoint, so failure is not fatal.
 		 */
-		datasets[used] = make_dataset_handle(hdl, sfs[i].f_mntfromname);
+		datasets[used] = make_dataset_handle(hdl, entry.mnt_special);
 
 		used++;
 	}
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c,v
retrieving revision 1.4
diff -u -p -r1.4 libzfs_pool.c
--- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c	10 Jan 2017 19:20:34 -0000	1.4
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c	5 May 2017 17:36:25 -0000
@@ -20,10 +20,15 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  */
 
+#include <sys/types.h>
+#include <sys/stat.h>
 #include <ctype.h>
 #include <errno.h>
 #include <devid.h>
@@ -33,73 +38,25 @@
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
-#include <sys/efi_partition.h>
-#include <sys/vtoc.h>
+#include <libgen.h>
 #include <sys/zfs_ioctl.h>
-#include <sys/stat.h>
 #include <dlfcn.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "libzfs_impl.h"
 #include "zfs_comutil.h"
-
-const char *hist_event_table[LOG_END] = {
-	"invalid event",
-	"pool create",
-	"vdev add",
-	"pool remove",
-	"pool destroy",
-	"pool export",
-	"pool import",
-	"vdev attach",
-	"vdev replace",
-	"vdev detach",
-	"vdev online",
-	"vdev offline",
-	"vdev upgrade",
-	"pool clear",
-	"pool scrub",
-	"pool property set",
-	"create",
-	"clone",
-	"destroy",
-	"destroy_begin_sync",
-	"inherit",
-	"property set",
-	"quota set",
-	"permission update",
-	"permission remove",
-	"permission who remove",
-	"promote",
-	"receive",
-	"rename",
-	"reservation set",
-	"replay_inc_sync",
-	"replay_full_sync",
-	"rollback",
-	"snapshot",
-	"filesystem version upgrade",
-	"refquota set",
-	"refreservation set",
-	"pool scrub done",
-	"user hold",
-	"user release",
-	"pool split",
-};
+#include "zfeature_common.h"
 
 static int read_efi_label(nvlist_t *config, diskaddr_t *sb);
 
-#if defined(__i386) || defined(__amd64)
-#define	BOOTCMD	"installgrub(1M)"
-#else
-#define	BOOTCMD	"installboot(1M)"
-#endif
-
-#define	DISK_ROOT	"/dev/dsk"
-#define	RDISK_ROOT	"/dev/rdsk"
 #define	BACKUP_SLICE	"s2"
 
+typedef struct prop_flags {
+	int create:1;	/* Validate property on creation */
+	int import:1;	/* Validate property on import */
+} prop_flags_t;
+
 /*
  * ====================================================================
  *   zpool property functions
@@ -169,14 +126,14 @@ zpool_get_prop_string(zpool_handle_t *zh
 		verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0);
 	} else {
 		source = ZPROP_SRC_DEFAULT;
-		if ((value = __UNCONST(zpool_prop_default_string(prop))) == NULL)
-			value = __UNCONST("-");
+		if ((value = (char *)zpool_prop_default_string(prop)) == NULL)
+			value = "-";
 	}
 
 	if (src)
 		*src = source;
 
-	return value;
+	return (value);
 }
 
 uint64_t
@@ -243,10 +200,40 @@ zpool_state_to_name(vdev_state_t state, 
 		return (gettext("DEGRADED"));
 	case VDEV_STATE_HEALTHY:
 		return (gettext("ONLINE"));
-	case VDEV_STATE_UNKNOWN:
+
 	default:
-		return (gettext("UNKNOWN"));
+		break;
+	}
+
+	return (gettext("UNKNOWN"));
+}
+
+/*
+ * Map POOL STATE to printed strings.
+ */
+const char *
+zpool_pool_state_to_name(pool_state_t state)
+{
+	switch (state) {
+	case POOL_STATE_ACTIVE:
+		return (gettext("ACTIVE"));
+	case POOL_STATE_EXPORTED:
+		return (gettext("EXPORTED"));
+	case POOL_STATE_DESTROYED:
+		return (gettext("DESTROYED"));
+	case POOL_STATE_SPARE:
+		return (gettext("SPARE"));
+	case POOL_STATE_L2CACHE:
+		return (gettext("L2CACHE"));
+	case POOL_STATE_UNINITIALIZED:
+		return (gettext("UNINITIALIZED"));
+	case POOL_STATE_UNAVAIL:
+		return (gettext("UNAVAIL"));
+	case POOL_STATE_POTENTIALLY_ACTIVE:
+		return (gettext("POTENTIALLY_ACTIVE"));
 	}
+
+	return (gettext("UNKNOWN"));
 }
 
 /*
@@ -255,7 +242,7 @@ zpool_state_to_name(vdev_state_t state, 
  */
 int
 zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
-    zprop_source_t *srctype)
+    zprop_source_t *srctype, boolean_t literal)
 {
 	uint64_t intval;
 	const char *strval;
@@ -271,24 +258,24 @@ zpool_get_prop(zpool_handle_t *zhp, zpoo
 			break;
 
 		case ZPOOL_PROP_HEALTH:
-			(void) strlcpy(buf, "FAULTED", len);
+			(void) strlcpy(buf,
+			    zpool_pool_state_to_name(POOL_STATE_UNAVAIL), len);
 			break;
 
 		case ZPOOL_PROP_GUID:
 			intval = zpool_get_prop_int(zhp, prop, &src);
-			(void) snprintf(buf, len, "%" PRIu64, intval);
+			(void) snprintf(buf, len, "%llu", intval);
 			break;
 
 		case ZPOOL_PROP_ALTROOT:
 		case ZPOOL_PROP_CACHEFILE:
+		case ZPOOL_PROP_COMMENT:
 			if (zhp->zpool_props != NULL ||
 			    zpool_get_all_props(zhp) == 0) {
 				(void) strlcpy(buf,
 				    zpool_get_prop_string(zhp, prop, &src),
 				    len);
-				if (srctype != NULL)
-					*srctype = src;
-				return (0);
+				break;
 			}
 			/* FALLTHROUGH */
 		default:
@@ -318,31 +305,65 @@ zpool_get_prop(zpool_handle_t *zhp, zpoo
 		case ZPOOL_PROP_SIZE:
 		case ZPOOL_PROP_ALLOCATED:
 		case ZPOOL_PROP_FREE:
-			(void) zfs_nicenum(intval, buf, len);
+		case ZPOOL_PROP_FREEING:
+		case ZPOOL_PROP_LEAKED:
+			if (literal) {
+				(void) snprintf(buf, len, "%llu",
+				    (u_longlong_t)intval);
+			} else {
+				(void) zfs_nicenum(intval, buf, len);
+			}
+			break;
+		case ZPOOL_PROP_EXPANDSZ:
+			if (intval == 0) {
+				(void) strlcpy(buf, "-", len);
+			} else if (literal) {
+				(void) snprintf(buf, len, "%llu",
+				    (u_longlong_t)intval);
+			} else {
+				(void) zfs_nicenum(intval, buf, len);
+			}
 			break;
-
 		case ZPOOL_PROP_CAPACITY:
-			(void) snprintf(buf, len, "%ju%%",
-			    (uintmax_t)intval);
+			if (literal) {
+				(void) snprintf(buf, len, "%llu",
+				    (u_longlong_t)intval);
+			} else {
+				(void) snprintf(buf, len, "%llu%%",
+				    (u_longlong_t)intval);
+			}
+			break;
+		case ZPOOL_PROP_FRAGMENTATION:
+			if (intval == UINT64_MAX) {
+				(void) strlcpy(buf, "-", len);
+			} else {
+				(void) snprintf(buf, len, "%llu%%",
+				    (u_longlong_t)intval);
+			}
 			break;
-
 		case ZPOOL_PROP_DEDUPRATIO:
-			(void) snprintf(buf, len, "%ju.%02jux",
-			    (uintmax_t)(intval / 100),
-			    (uintmax_t)(intval % 100));
+			(void) snprintf(buf, len, "%llu.%02llux",
+			    (u_longlong_t)(intval / 100),
+			    (u_longlong_t)(intval % 100));
 			break;
-
 		case ZPOOL_PROP_HEALTH:
 			verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
 			    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
 			verify(nvlist_lookup_uint64_array(nvroot,
-			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
+			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+			    == 0);
 
 			(void) strlcpy(buf, zpool_state_to_name(intval,
 			    vs->vs_aux), len);
 			break;
+		case ZPOOL_PROP_VERSION:
+			if (intval >= SPA_VERSION_FEATURES) {
+				(void) snprintf(buf, len, "-");
+				break;
+			}
+			/* FALLTHROUGH */
 		default:
-			(void) snprintf(buf, len, "%ju", (uintmax_t)intval);
+			(void) snprintf(buf, len, "%llu", intval);
 		}
 		break;
 
@@ -383,34 +404,13 @@ bootfs_name_valid(const char *pool, char
 	return (B_FALSE);
 }
 
-/*
- * Inspect the configuration to determine if any of the devices contain
- * an EFI label.
- */
-static boolean_t
-pool_uses_efi(nvlist_t *config)
-{
-	nvlist_t **child;
-	uint_t c, children;
-
-	if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
-	    &child, &children) != 0)
-		return (read_efi_label(config, NULL) >= 0);
-
-	for (c = 0; c < children; c++) {
-		if (pool_uses_efi(child[c]))
-			return (B_TRUE);
-	}
-	return (B_FALSE);
-}
-
-static boolean_t
-pool_is_bootable(zpool_handle_t *zhp)
+boolean_t
+zpool_is_bootable(zpool_handle_t *zhp)
 {
-	char bootfs[ZPOOL_MAXNAMELEN];
+	char bootfs[ZFS_MAX_DATASET_NAME_LEN];
 
 	return (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs,
-	    sizeof (bootfs), NULL) == 0 && strncmp(bootfs, "-",
+	    sizeof (bootfs), NULL, B_FALSE) == 0 && strncmp(bootfs, "-",
 	    sizeof (bootfs)) != 0);
 }
 
@@ -422,17 +422,16 @@ pool_is_bootable(zpool_handle_t *zhp)
  */
 static nvlist_t *
 zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname,
-    nvlist_t *props, uint64_t version, boolean_t create_or_import, char *errbuf)
+    nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf)
 {
 	nvpair_t *elem;
 	nvlist_t *retprops;
 	zpool_prop_t prop;
 	char *strval;
 	uint64_t intval;
-	char *slash;
+	char *slash, *check;
 	struct stat64 statbuf;
 	zpool_handle_t *zhp;
-	nvlist_t *nvroot;
 
 	if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) {
 		(void) no_memory(hdl);
@@ -443,10 +442,47 @@ zpool_valid_proplist(libzfs_handle_t *hd
 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
 		const char *propname = nvpair_name(elem);
 
+		prop = zpool_name_to_prop(propname);
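+		/*
+		 * Pool feature properties ("feature@...") are not in
+		 * the zpool_prop_t table; validate the feature name
+		 * and require the value to be "enabled".
+		 */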
+		if (prop == ZPROP_INVAL && zpool_prop_feature(propname)) {
+			int err;
+			char *fname = strchr(propname, '@') + 1;
+
+			err = zfeature_lookup_name(fname, NULL);
+			if (err != 0) {
+				ASSERT3U(err, ==, ENOENT);
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "invalid feature '%s'"), fname);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+
+			if (nvpair_type(elem) != DATA_TYPE_STRING) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "'%s' must be a string"), propname);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+
+			(void) nvpair_value_string(elem, &strval);
+			if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "property '%s' can only be set to "
+				    "'enabled'"), propname);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+
+			if (nvlist_add_uint64(retprops, propname, 0) != 0) {
+				(void) no_memory(hdl);
+				goto error;
+			}
+			continue;
+		}
+
 		/*
 		 * Make sure this property is valid and applies to this type.
 		 */
-		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) {
+		if (prop == ZPROP_INVAL) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "invalid property '%s'"), propname);
 			(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
@@ -469,7 +505,8 @@ zpool_valid_proplist(libzfs_handle_t *hd
 		 */
 		switch (prop) {
 		case ZPOOL_PROP_VERSION:
-			if (intval < version || intval > SPA_VERSION) {
+			if (intval < version ||
+			    !SPA_VERSION_IS_SUPPORTED(intval)) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "property '%s' number %d is invalid."),
 				    propname, intval);
@@ -479,7 +516,7 @@ zpool_valid_proplist(libzfs_handle_t *hd
 			break;
 
 		case ZPOOL_PROP_BOOTFS:
-			if (create_or_import) {
+			if (flags.create || flags.import) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "property '%s' cannot be set at creation "
 				    "or import time"), propname);
@@ -513,26 +550,11 @@ zpool_valid_proplist(libzfs_handle_t *hd
 				(void) zfs_error(hdl, EZFS_OPENFAILED, errbuf);
 				goto error;
 			}
-			verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
-			    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
-
-			/*
-			 * bootfs property cannot be set on a disk which has
-			 * been EFI labeled.
-			 */
-			if (pool_uses_efi(nvroot)) {
-				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-				    "property '%s' not supported on "
-				    "EFI labeled devices"), propname);
-				(void) zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf);
-				zpool_close(zhp);
-				goto error;
-			}
 			zpool_close(zhp);
 			break;
 
 		case ZPOOL_PROP_ALTROOT:
-			if (!create_or_import) {
+			if (!flags.create && !flags.import) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "property '%s' can only be set during pool "
 				    "creation or import"), propname);
@@ -587,21 +609,40 @@ zpool_valid_proplist(libzfs_handle_t *hd
 
 			*slash = '/';
 			break;
-		case ZPOOL_PROP_FREE:
-		case ZPOOL_PROP_ALLOCATED:
-		case ZPOOL_NUM_PROPS:
-		case ZPOOL_PROP_AUTOEXPAND:
-		case ZPOOL_PROP_DEDUPDITTO:
-		case ZPOOL_PROP_SIZE:
-		case ZPOOL_PROP_CAPACITY:
-		case ZPOOL_PROP_HEALTH:
-		case ZPOOL_PROP_GUID:
-		case ZPOOL_PROP_DELEGATION:
-		case ZPOOL_PROP_AUTOREPLACE:
-		case ZPOOL_PROP_FAILUREMODE:
-		case ZPOOL_PROP_LISTSNAPS:
-		case ZPOOL_PROP_DEDUPRATIO:
-		case ZPOOL_PROP_NAME:
+
+		case ZPOOL_PROP_COMMENT:
+			for (check = strval; *check != '\0'; check++) {
+				if (!isprint(*check)) {
+					zfs_error_aux(hdl,
+					    dgettext(TEXT_DOMAIN,
+					    "comment may only have printable "
+					    "characters"));
+					(void) zfs_error(hdl, EZFS_BADPROP,
+					    errbuf);
+					goto error;
+				}
+			}
+			if (strlen(strval) > ZPROP_MAX_COMMENT) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "comment must not exceed %d characters"),
+				    ZPROP_MAX_COMMENT);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+			break;
+		case ZPOOL_PROP_READONLY:
+			if (!flags.import) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "property '%s' can only be set at "
+				    "import time"), propname);
+				(void) zfs_error(hdl, EZFS_BADPROP, errbuf);
+				goto error;
+			}
+			break;
+
+		default:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "property '%s'(%d) not defined"), propname, prop);
 			break;
 		}
 	}
@@ -624,6 +665,7 @@ zpool_set_prop(zpool_handle_t *zhp, cons
 	nvlist_t *nvl = NULL;
 	nvlist_t *realprops;
 	uint64_t version;
+	prop_flags_t flags = { 0 };
 
 	(void) snprintf(errbuf, sizeof (errbuf),
 	    dgettext(TEXT_DOMAIN, "cannot set property for '%s'"),
@@ -639,7 +681,7 @@ zpool_set_prop(zpool_handle_t *zhp, cons
 
 	version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL);
 	if ((realprops = zpool_valid_proplist(zhp->zpool_hdl,
-	    zhp->zpool_name, nvl, version, B_FALSE, errbuf)) == NULL) {
+	    zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) {
 		nvlist_free(nvl);
 		return (-1);
 	}
@@ -676,10 +718,77 @@ zpool_expand_proplist(zpool_handle_t *zh
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
 	zprop_list_t *entry;
 	char buf[ZFS_MAXPROPLEN];
+	nvlist_t *features = NULL;
+	zprop_list_t **last;
+	boolean_t firstexpand = (NULL == *plp);
 
 	if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0)
 		return (-1);
 
+	last = plp;
+	while (*last != NULL)
+		last = &(*last)->pl_next;
+
+	if ((*plp)->pl_all)
+		features = zpool_get_features(zhp);
+
+	if ((*plp)->pl_all && firstexpand) {
+		for (int i = 0; i < SPA_FEATURES; i++) {
+			zprop_list_t *entry = zfs_alloc(hdl,
+			    sizeof (zprop_list_t));
+			entry->pl_prop = ZPROP_INVAL;
+			entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s",
+			    spa_feature_table[i].fi_uname);
+			entry->pl_width = strlen(entry->pl_user_prop);
+			entry->pl_all = B_TRUE;
+
+			*last = entry;
+			last = &entry->pl_next;
+		}
+	}
+
+	/* add any unsupported features */
+	for (nvpair_t *nvp = nvlist_next_nvpair(features, NULL);
+	    nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) {
+		char *propname;
+		boolean_t found;
+		zprop_list_t *entry;
+
+		if (zfeature_is_supported(nvpair_name(nvp)))
+			continue;
+
+		propname = zfs_asprintf(hdl, "unsupported@%s",
+		    nvpair_name(nvp));
+
+		/*
+		 * Before adding the property to the list make sure that no
+		 * other pool already added the same property.
+		 */
+		found = B_FALSE;
+		entry = *plp;
+		while (entry != NULL) {
+			if (entry->pl_user_prop != NULL &&
+			    strcmp(propname, entry->pl_user_prop) == 0) {
+				found = B_TRUE;
+				break;
+			}
+			entry = entry->pl_next;
+		}
+		if (found) {
+			free(propname);
+			continue;
+		}
+
+		entry = zfs_alloc(hdl, sizeof (zprop_list_t));
+		entry->pl_prop = ZPROP_INVAL;
+		entry->pl_user_prop = propname;
+		entry->pl_width = strlen(entry->pl_user_prop);
+		entry->pl_all = B_TRUE;
+
+		*last = entry;
+		last = &entry->pl_next;
+	}
+
 	for (entry = *plp; entry != NULL; entry = entry->pl_next) {
 
 		if (entry->pl_fixed)
@@ -687,7 +796,7 @@ zpool_expand_proplist(zpool_handle_t *zh
 
 		if (entry->pl_prop != ZPROP_INVAL &&
 		    zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf),
-		    NULL) == 0) {
+		    NULL, B_FALSE) == 0) {
 			if (strlen(buf) > entry->pl_width)
 				entry->pl_width = strlen(buf);
 		}
@@ -696,6 +805,66 @@ zpool_expand_proplist(zpool_handle_t *zh
 	return (0);
 }
 
+/*
+ * Get the state for the given feature on the given ZFS pool.
+ */
+int
+zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf,
+    size_t len)
+{
+	uint64_t refcount;
+	boolean_t found = B_FALSE;
+	nvlist_t *features = zpool_get_features(zhp);
+	boolean_t supported;
+	const char *feature = strchr(propname, '@') + 1;
+
+	supported = zpool_prop_feature(propname);
+	ASSERT(supported || zpool_prop_unsupported(propname));
+
+	/*
+	 * Convert from feature name to feature guid. This conversion is
+	 * unnecessary for unsupported@... properties because they already
+	 * use guids.
+	 */
+	if (supported) {
+		int ret;
+		spa_feature_t fid;
+
+		ret = zfeature_lookup_name(feature, &fid);
+		if (ret != 0) {
+			(void) strlcpy(buf, "-", len);
+			return (ENOTSUP);
+		}
+		feature = spa_feature_table[fid].fi_guid;
+	}
+
+	if (nvlist_lookup_uint64(features, feature, &refcount) == 0)
+		found = B_TRUE;
+
+	if (supported) {
+		if (!found) {
+			(void) strlcpy(buf, ZFS_FEATURE_DISABLED, len);
+		} else {
+			if (refcount == 0)
+				(void) strlcpy(buf, ZFS_FEATURE_ENABLED, len);
+			else
+				(void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len);
+		}
+	} else {
+		if (found) {
+			if (refcount == 0) {
+				(void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE);
+			} else {
+				(void) strcpy(buf, ZFS_UNSUPPORTED_READONLY);
+			}
+		} else {
+			(void) strlcpy(buf, "-", len);
+			return (ENOTSUP);
+		}
+	}
+
+	return (0);
+}
 
 /*
  * Don't start the slice at the default block of 34; many storage
@@ -782,9 +951,10 @@ zpool_name_valid(libzfs_handle_t *hdl, b
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "multiple '@' delimiters in name"));
 				break;
-			case NAME_ERR_NO_AT:
+
+			default:
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-				    "no attribute in name"));
+				    "(%d) not defined"), why);
 				break;
 			}
 		}
@@ -895,12 +1065,9 @@ zpool_open(libzfs_handle_t *hdl, const c
 void
 zpool_close(zpool_handle_t *zhp)
 {
-	if (zhp->zpool_config)
-		nvlist_free(zhp->zpool_config);
-	if (zhp->zpool_old_config)
-		nvlist_free(zhp->zpool_old_config);
-	if (zhp->zpool_props)
-		nvlist_free(zhp->zpool_props);
+	nvlist_free(zhp->zpool_config);
+	nvlist_free(zhp->zpool_old_config);
+	nvlist_free(zhp->zpool_props);
 	free(zhp);
 }
 
@@ -936,7 +1103,6 @@ zpool_create(libzfs_handle_t *hdl, const
 	nvlist_t *zc_fsprops = NULL;
 	nvlist_t *zc_props = NULL;
 	char msg[1024];
-	char *altroot;
 	int ret = -1;
 
 	(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
@@ -949,8 +1115,10 @@ zpool_create(libzfs_handle_t *hdl, const
 		return (-1);
 
 	if (props) {
+		prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE };
+
 		if ((zc_props = zpool_valid_proplist(hdl, pool, props,
-		    SPA_VERSION_1, B_TRUE, msg)) == NULL) {
+		    SPA_VERSION_1, flags, msg)) == NULL) {
 			goto create_failed;
 		}
 	}
@@ -963,8 +1131,8 @@ zpool_create(libzfs_handle_t *hdl, const
 		    zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) &&
 		    strcmp(zonestr, "on") == 0);
 
-		if ((zc_fsprops = zfs_valid_proplist(hdl,
-		    ZFS_TYPE_FILESYSTEM, fsprops, zoned, NULL, msg)) == NULL) {
+		if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM,
+		    fsprops, zoned, NULL, NULL, msg)) == NULL) {
 			goto create_failed;
 		}
 		if (!zc_props &&
@@ -1000,6 +1168,21 @@ zpool_create(libzfs_handle_t *hdl, const
 			    "one or more vdevs refer to the same device"));
 			return (zfs_error(hdl, EZFS_BADDEV, msg));
 
+		case ERANGE:
+			/*
+			 * This happens if the record size is smaller or larger
+			 * than the allowed size range, or not a power of 2.
+			 *
+			 * NOTE: although zfs_valid_proplist is called earlier,
+			 * this case may have slipped through since the
+			 * pool does not exist yet and it is therefore
+			 * impossible to read properties e.g. max blocksize
+			 * from the pool.
+			 */
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "record size invalid"));
+			return (zfs_error(hdl, EZFS_BADPROP, msg));
+
 		case EOVERFLOW:
 			/*
 			 * This occurs when one of the devices is below
@@ -1033,21 +1216,6 @@ zpool_create(libzfs_handle_t *hdl, const
 		}
 	}
 
-	/*
-	 * If this is an alternate root pool, then we automatically set the
-	 * mountpoint of the root dataset to be '/'.
-	 */
-	if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT),
-	    &altroot) == 0) {
-		zfs_handle_t *zhp;
-
-		verify((zhp = zfs_open(hdl, pool, ZFS_TYPE_DATASET)) != NULL);
-		verify(zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT),
-		    "/") == 0);
-
-		zfs_close(zhp);
-	}
-
 create_failed:
 	zcmd_free_nvlists(&zc);
 	nvlist_free(zc_props);
@@ -1060,7 +1228,7 @@ create_failed:
  * datasets left in the pool.
  */
 int
-zpool_destroy(zpool_handle_t *zhp)
+zpool_destroy(zpool_handle_t *zhp, const char *log_str)
 {
 	zfs_cmd_t zc = { 0 };
 	zfs_handle_t *zfp = NULL;
@@ -1068,13 +1236,13 @@ zpool_destroy(zpool_handle_t *zhp)
 	char msg[1024];
 
 	if (zhp->zpool_state == POOL_STATE_ACTIVE &&
-	    (zfp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
-	    ZFS_TYPE_FILESYSTEM)) == NULL)
+	    (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL)
 		return (-1);
 
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
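+	/* Pass log_str down so the kernel can record it in the pool history. */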
+	zc.zc_history = (uint64_t)(uintptr_t)log_str;
 
-	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
+	if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
 		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
 		    "cannot destroy '%s'"), zhp->zpool_name);
 
@@ -1125,25 +1293,6 @@ zpool_add(zpool_handle_t *zhp, nvlist_t 
 		return (zfs_error(hdl, EZFS_BADVERSION, msg));
 	}
 
-	if (pool_is_bootable(zhp) && nvlist_lookup_nvlist_array(nvroot,
-	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) {
-		uint64_t s;
-
-		for (s = 0; s < nspares; s++) {
-			char *path;
-
-			if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH,
-			    &path) == 0 && pool_uses_efi(spares[s])) {
-				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-				    "device '%s' contains an EFI label and "
-				    "cannot be used on root pools."),
-				    zpool_vdev_name(hdl, NULL, spares[s],
-				    B_FALSE));
-				return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
-			}
-		}
-	}
-
 	if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
 	    SPA_VERSION_L2CACHE &&
 	    nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
@@ -1157,7 +1306,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t 
 		return (-1);
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 
-	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
 		switch (errno) {
 		case EBUSY:
 			/*
@@ -1228,7 +1377,8 @@ zpool_add(zpool_handle_t *zhp, nvlist_t 
  * mounted datasets in the pool.
  */
 static int
-zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce)
+zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce,
+    const char *log_str)
 {
 	zfs_cmd_t zc = { 0 };
 	char msg[1024];
@@ -1239,6 +1389,7 @@ zpool_export_common(zpool_handle_t *zhp,
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 	zc.zc_cookie = force;
 	zc.zc_guid = hardforce;
+	zc.zc_history = (uint64_t)(uintptr_t)log_str;
 
 	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) {
 		switch (errno) {
@@ -1260,35 +1411,41 @@ zpool_export_common(zpool_handle_t *zhp,
 }
 
 int
-zpool_export(zpool_handle_t *zhp, boolean_t force)
+zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str)
 {
-	return (zpool_export_common(zhp, force, B_FALSE));
+	return (zpool_export_common(zhp, force, B_FALSE, log_str));
 }
 
 int
-zpool_export_force(zpool_handle_t *zhp)
+zpool_export_force(zpool_handle_t *zhp, const char *log_str)
 {
-	return (zpool_export_common(zhp, B_TRUE, B_TRUE));
+	return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str));
 }
 
 static void
 zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun,
-    nvlist_t *rbi)
+    nvlist_t *config)
 {
+	nvlist_t *nv = NULL;
 	uint64_t rewindto;
 	int64_t loss = -1;
 	struct tm t;
 	char timestr[128];
 
-	if (!hdl->libzfs_printerr || rbi == NULL)
+	if (!hdl->libzfs_printerr || config == NULL)
 		return;
 
-	if (nvlist_lookup_uint64(rbi, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
+	    nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) {
+		return;
+	}
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
 		return;
-	(void) nvlist_lookup_int64(rbi, ZPOOL_CONFIG_REWIND_TIME, &loss);
+	(void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
 
 	if (localtime_r((time_t *)&rewindto, &t) != NULL &&
-	    strftime(timestr, 128, "", &t) != 0) {
+	    strftime(timestr, 128, 0, &t) != 0) {
 		if (dryrun) {
 			(void) printf(dgettext(TEXT_DOMAIN,
 			    "Would be able to return %s "
@@ -1301,16 +1458,15 @@ zpool_rewind_exclaim(libzfs_handle_t *hd
 		}
 		if (loss > 120) {
 			(void) printf(dgettext(TEXT_DOMAIN,
-			    "%s approximately %jd "),
+			    "%s approximately %lld "),
 			    dryrun ? "Would discard" : "Discarded",
-			    ((uintmax_t)(loss + 30) / 60));
+			    (loss + 30) / 60);
 			(void) printf(dgettext(TEXT_DOMAIN,
 			    "minutes of transactions.\n"));
 		} else if (loss > 0) {
 			(void) printf(dgettext(TEXT_DOMAIN,
-			    "%s approximately %jd "),
-			    dryrun ? "Would discard" : "Discarded",
-			    (uintmax_t)loss);
+			    "%s approximately %lld "),
+			    dryrun ? "Would discard" : "Discarded", loss);
 			(void) printf(dgettext(TEXT_DOMAIN,
 			    "seconds of transactions.\n"));
 		}
@@ -1321,6 +1477,7 @@ void
 zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason,
     nvlist_t *config)
 {
+	nvlist_t *nv = NULL;
 	int64_t loss = -1;
 	uint64_t edata = UINT64_MAX;
 	uint64_t rewindto;
@@ -1336,19 +1493,20 @@ zpool_explain_recover(libzfs_handle_t *h
 		(void) printf(dgettext(TEXT_DOMAIN, "\t"));
 
 	/* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */
-	if (nvlist_lookup_uint64(config,
-	    ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
+	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 ||
+	    nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 ||
+	    nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0)
 		goto no_info;
 
-	(void) nvlist_lookup_int64(config, ZPOOL_CONFIG_REWIND_TIME, &loss);
-	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
+	(void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss);
+	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
 	    &edata);
 
 	(void) printf(dgettext(TEXT_DOMAIN,
 	    "Recovery is possible, but will result in some data loss.\n"));
 
 	if (localtime_r((time_t *)&rewindto, &t) != NULL &&
-	    strftime(timestr, 128, "", &t) != 0) {
+	    strftime(timestr, 128, 0, &t) != 0) {
 		(void) printf(dgettext(TEXT_DOMAIN,
 		    "\tReturning the pool to its state as of %s\n"
 		    "\tshould correct the problem.  "),
@@ -1361,13 +1519,12 @@ zpool_explain_recover(libzfs_handle_t *h
 
 	if (loss > 120) {
 		(void) printf(dgettext(TEXT_DOMAIN,
-		    "Approximately %jd minutes of data\n"
-		    "\tmust be discarded, irreversibly.  "),
-		    (uintmax_t)((loss + 30) / 60));
+		    "Approximately %lld minutes of data\n"
+		    "\tmust be discarded, irreversibly.  "), (loss + 30) / 60);
 	} else if (loss > 0) {
 		(void) printf(dgettext(TEXT_DOMAIN,
-		    "Approximately %jd seconds of data\n"
-		    "\tmust be discarded, irreversibly.  "), (uintmax_t)loss);
+		    "Approximately %lld seconds of data\n"
+		    "\tmust be discarded, irreversibly.  "), loss);
 	}
 	if (edata != 0 && edata != UINT64_MAX) {
 		if (edata == 1) {
@@ -1426,12 +1583,63 @@ zpool_import(libzfs_handle_t *hdl, nvlis
 		}
 	}
 
-	ret = zpool_import_props(hdl, config, newname, props, B_FALSE);
-	if (props)
-		nvlist_free(props);
+	ret = zpool_import_props(hdl, config, newname, props,
+	    ZFS_IMPORT_NORMAL);
+	nvlist_free(props);
 	return (ret);
 }
 
+static void
+print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv,
+    int indent)
+{
+	nvlist_t **child;
+	uint_t c, children;
+	char *vname;
+	uint64_t is_log = 0;
+
+	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG,
+	    &is_log);
+
+	if (name != NULL)
+		(void) printf("\t%*s%s%s\n", indent, "", name,
+		    is_log ? " [log]" : "");
+
+	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) != 0)
+		return;
+
+	for (c = 0; c < children; c++) {
+		vname = zpool_vdev_name(hdl, NULL, child[c], B_TRUE);
+		print_vdev_tree(hdl, vname, child[c], indent + 2);
+		free(vname);
+	}
+}
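
Note: for a pool missing a two-way mirror and a mirrored log, the ENXIO
import path below hands print_vdev_tree() the missing-device subtree,
producing output along these lines (device names hypothetical):

	    mirror-0
	      da0
	      da1
	    mirror-1 [log]
	      da2
	      da3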
+
+void
+zpool_print_unsup_feat(nvlist_t *config)
+{
+	nvlist_t *nvinfo, *unsup_feat;
+
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) ==
+	    0);
+	verify(nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT,
+	    &unsup_feat) == 0);
+
+	for (nvpair_t *nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL;
+	    nvp = nvlist_next_nvpair(unsup_feat, nvp)) {
+		char *desc;
+
+		verify(nvpair_type(nvp) == DATA_TYPE_STRING);
+		verify(nvpair_value_string(nvp, &desc) == 0);
+
+		if (strlen(desc) > 0)
+			(void) printf("\t%s (%s)\n", nvpair_name(nvp), desc);
+		else
+			(void) printf("\t%s\n", nvpair_name(nvp));
+	}
+}
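
Note: zpool_print_unsup_feat() walks the ZPOOL_CONFIG_UNSUP_FEAT nvlist
and prints one line per feature, appending the kernel-supplied
description in parentheses when it is non-empty. Hypothetical output:

	com.delphix:async_destroy (Destroy filesystems asynchronously.)
	org.example:some_feature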
+
 /*
  * Import the given pool using the known configuration and a list of
  * properties to be set. The configuration should have come from
@@ -1440,15 +1648,17 @@ zpool_import(libzfs_handle_t *hdl, nvlis
  */
 int
 zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname,
-    nvlist_t *props, boolean_t importfaulted)
+    nvlist_t *props, int flags)
 {
 	zfs_cmd_t zc = { 0 };
 	zpool_rewind_policy_t policy;
-	nvlist_t *nvi = NULL;
+	nvlist_t *nv = NULL;
+	nvlist_t *nvinfo = NULL;
+	nvlist_t *missing = NULL;
 	char *thename;
 	char *origname;
-	uint64_t returned_size;
 	int ret;
+	int error = 0;
 	char errbuf[1024];
 
 	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
@@ -1462,24 +1672,26 @@ zpool_import_props(libzfs_handle_t *hdl,
 			return (zfs_error_fmt(hdl, EZFS_INVALIDNAME,
 			    dgettext(TEXT_DOMAIN, "cannot import '%s'"),
 			    newname));
-		thename = __UNCONST(newname);
+		thename = (char *)newname;
 	} else {
 		thename = origname;
 	}
 
-	if (props) {
+	if (props != NULL) {
 		uint64_t version;
+		prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
 
 		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 		    &version) == 0);
 
 		if ((props = zpool_valid_proplist(hdl, origname,
-		    props, version, B_TRUE, errbuf)) == NULL) {
+		    props, version, flags, errbuf)) == NULL)
 			return (-1);
-		} else if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) {
+		if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) {
 			nvlist_free(props);
 			return (-1);
 		}
+		nvlist_free(props);
 	}
 
 	(void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name));
@@ -1488,30 +1700,42 @@ zpool_import_props(libzfs_handle_t *hdl,
 	    &zc.zc_guid) == 0);
 
 	if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) {
-		nvlist_free(props);
+		zcmd_free_nvlists(&zc);
 		return (-1);
 	}
-	returned_size =  zc.zc_nvlist_conf_size + 512;
-	if (zcmd_alloc_dst_nvlist(hdl, &zc, returned_size) != 0) {
-		nvlist_free(props);
+	if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) {
+		zcmd_free_nvlists(&zc);
 		return (-1);
 	}
 
-	zc.zc_cookie = (uint64_t)importfaulted;
-	ret = 0;
-	if (zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc) != 0) {
+	zc.zc_cookie = flags;
+	while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 &&
+	    errno == ENOMEM) {
+		if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+			zcmd_free_nvlists(&zc);
+			return (-1);
+		}
+	}
+	if (ret != 0)
+		error = errno;
+
+	(void) zcmd_read_dst_nvlist(hdl, &zc, &nv);
+
+	zcmd_free_nvlists(&zc);
+
+	zpool_get_rewind_policy(config, &policy);
+
+	if (error) {
 		char desc[1024];
 
-		(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
-		zpool_get_rewind_policy(config, &policy);
 		/*
 		 * Dry-run failed, but we print out what success
 		 * looks like if we found a best txg
 		 */
-		if ((policy.zrp_request & ZPOOL_TRY_REWIND) && nvi) {
+		if (policy.zrp_request & ZPOOL_TRY_REWIND) {
 			zpool_rewind_exclaim(hdl, newname ? origname : thename,
-			    B_TRUE, nvi);
-			nvlist_free(nvi);
+			    B_TRUE, nv);
+			nvlist_free(nv);
 			return (-1);
 		}
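
Note: the import now sizes the reply buffer from the packed config and
doubles it via zcmd_expand_dst_nvlist() for as long as the ioctl fails
with ENOMEM; zpool_clear() below adopts the same helper, and
zpool_get_history() applies the same idea with malloc(). A self-contained
sketch of the pattern, independent of libzfs (the fill callback and its
ENOMEM convention are the assumptions here):

	#include <errno.h>
	#include <stdlib.h>

	/*
	 * Call fill(buf, len), which is expected to fail with errno set
	 * to ENOMEM while len is too small; double the buffer until the
	 * reply fits or allocation fails.
	 */
	static void *
	grow_until_fits(int (*fill)(void *, size_t), size_t len)
	{
		void *buf = malloc(len);

		while (buf != NULL && fill(buf, len) != 0 && errno == ENOMEM) {
			void *nbuf;

			len *= 2;
			if ((nbuf = realloc(buf, len)) == NULL) {
				free(buf);
				return (NULL);
			}
			buf = nbuf;
		}
		return (buf);
	}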
 
@@ -1524,8 +1748,24 @@ zpool_import_props(libzfs_handle_t *hdl,
 			    dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"),
 			    origname, thename);
 
-		switch (errno) {
+		switch (error) {
 		case ENOTSUP:
+			if (nv != NULL && nvlist_lookup_nvlist(nv,
+			    ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
+			    nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) {
+				(void) printf(dgettext(TEXT_DOMAIN, "This "
+				    "pool uses the following feature(s) not "
+				    "supported by this system:\n"));
+				zpool_print_unsup_feat(nv);
+				if (nvlist_exists(nvinfo,
+				    ZPOOL_CONFIG_CAN_RDONLY)) {
+					(void) printf(dgettext(TEXT_DOMAIN,
+					    "All unsupported features are only "
+					    "required for writing to the pool."
+					    "\nThe pool can be imported using "
+					    "'-o readonly=on'.\n"));
+				}
+			}
 			/*
 			 * Unsupported version.
 			 */
@@ -1536,15 +1776,43 @@ zpool_import_props(libzfs_handle_t *hdl,
 			(void) zfs_error(hdl, EZFS_INVALCONFIG, desc);
 			break;
 
+		case EROFS:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "one or more devices is read only"));
+			(void) zfs_error(hdl, EZFS_BADDEV, desc);
+			break;
+
+		case ENXIO:
+			if (nv && nvlist_lookup_nvlist(nv,
+			    ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 &&
+			    nvlist_lookup_nvlist(nvinfo,
+			    ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) {
+				(void) printf(dgettext(TEXT_DOMAIN,
+				    "The devices below are missing, use "
+				    "'-m' to import the pool anyway:\n"));
+				print_vdev_tree(hdl, NULL, missing, 2);
+				(void) printf("\n");
+			}
+			(void) zpool_standard_error(hdl, error, desc);
+			break;
+
+		case EEXIST:
+			(void) zpool_standard_error(hdl, error, desc);
+			break;
+		case ENAMETOOLONG:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "new name of at least one dataset is longer than "
+			    "the maximum allowable length"));
+			(void) zfs_error(hdl, EZFS_NAMETOOLONG, desc);
+			break;
 		default:
-			(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
-			(void) zpool_standard_error(hdl, errno, desc);
+			(void) zpool_standard_error(hdl, error, desc);
 			zpool_explain_recover(hdl,
-			    newname ? origname : thename, -errno, nvi);
-			nvlist_free(nvi);
+			    newname ? origname : thename, -error, nv);
 			break;
 		}
 
+		nvlist_free(nv);
 		ret = -1;
 	} else {
 		zpool_handle_t *zhp;
@@ -1556,49 +1824,101 @@ zpool_import_props(libzfs_handle_t *hdl,
 			ret = -1;
 		else if (zhp != NULL)
 			zpool_close(zhp);
-		(void) zcmd_read_dst_nvlist(hdl, &zc, &nvi);
-		zpool_get_rewind_policy(config, &policy);
 		if (policy.zrp_request &
 		    (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
 			zpool_rewind_exclaim(hdl, newname ? origname : thename,
-			    ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0),
-			    nvi);
+			    ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), nv);
 		}
-		nvlist_free(nvi);
+		nvlist_free(nv);
 		return (0);
 	}
 
-	zcmd_free_nvlists(&zc);
-	nvlist_free(props);
-
 	return (ret);
 }
 
 /*
- * Scrub the pool.
+ * Scan the pool.
  */
 int
-zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type)
+zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func)
 {
 	zfs_cmd_t zc = { 0 };
 	char msg[1024];
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
 
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
-	zc.zc_cookie = type;
+	zc.zc_cookie = func;
 
-	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SCRUB, &zc) == 0)
+	if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 ||
+	    (errno == ENOENT && func != POOL_SCAN_NONE))
 		return (0);
 
-	(void) snprintf(msg, sizeof (msg),
-	    dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name);
+	if (func == POOL_SCAN_SCRUB) {
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name);
+	} else if (func == POOL_SCAN_NONE) {
+		(void) snprintf(msg, sizeof (msg),
+		    dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"),
+		    zc.zc_name);
+	} else {
+		assert(!"unexpected result");
+	}
 
-	if (errno == EBUSY)
-		return (zfs_error(hdl, EZFS_RESILVERING, msg));
-	else
+	if (errno == EBUSY) {
+		nvlist_t *nvroot;
+		pool_scan_stat_t *ps = NULL;
+		uint_t psc;
+
+		verify(nvlist_lookup_nvlist(zhp->zpool_config,
+		    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+		(void) nvlist_lookup_uint64_array(nvroot,
+		    ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
+		if (ps && ps->pss_func == POOL_SCAN_SCRUB)
+			return (zfs_error(hdl, EZFS_SCRUBBING, msg));
+		else
+			return (zfs_error(hdl, EZFS_RESILVERING, msg));
+	} else if (errno == ENOENT) {
+		return (zfs_error(hdl, EZFS_NO_SCRUB, msg));
+	} else {
 		return (zpool_standard_error(hdl, errno, msg));
+	}
 }
 
+#ifdef illumos
+/*
+ * This provides a very minimal check whether a given string is likely a
+ * c#t#d# style string.  Users of this are expected to do their own
+ * verification of the s# part.
+ */
+#define	CTD_CHECK(str)  (str && str[0] == 'c' && isdigit(str[1]))
+
+/*
+ * More elaborate version for ones which may start with "/dev/dsk/"
+ * and the like.
+ */
+static int
+ctd_check_path(char *str)
+{
+	/*
+	 * If it starts with a slash, check the last component.
+	 */
+	if (str && str[0] == '/') {
+		char *tmp = strrchr(str, '/');
+
+		/*
+		 * If it ends in "/old", check the second-to-last
+		 * component of the string instead.
+		 */
+		if (tmp != str && strcmp(tmp, "/old") == 0) {
+			for (tmp--; *tmp != '/'; tmp--)
+				;
+		}
+		str = tmp + 1;
+	}
+	return (CTD_CHECK(str));
+}
+#endif
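
Note: worked examples for the helpers above (paths hypothetical):
CTD_CHECK accepts "c0t0d0s0" directly, ctd_check_path() reduces
"/dev/dsk/c0t0d0s0" to its last component before the check, and for
"/dev/dsk/c0t0d0s0/old" it steps back past the trailing "/old" and
checks "c0t0d0s0" as well. This is what lets the whole-disk matching in
vdev_to_nvlist_iter() below treat a path and its "/old" variant
uniformly:

	assert(ctd_check_path("/dev/dsk/c0t0d0s0"));		/* true */
	assert(ctd_check_path("/dev/dsk/c0t0d0s0/old"));	/* true */
	assert(!ctd_check_path("/dev/dsk/not-a-ctd-name"));	/* not c#... */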
+
 /*
  * Find a vdev that matches the search criteria specified. We use the
  * the nvpair name to determine how we should look for the device.
@@ -1624,26 +1944,17 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist
 	srchkey = nvpair_name(pair);
 
 	switch (nvpair_type(pair)) {
-	case DATA_TYPE_UINT64: {
-		uint64_t srchval, theguid, present;
-
-		verify(nvpair_value_uint64(pair, &srchval) == 0);
+	case DATA_TYPE_UINT64:
 		if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) {
-			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
-			    &present) == 0) {
-				/*
-				 * If the device has never been present since
-				 * import, the only reliable way to match the
-				 * vdev is by GUID.
-				 */
-				verify(nvlist_lookup_uint64(nv,
-				    ZPOOL_CONFIG_GUID, &theguid) == 0);
-				if (theguid == srchval)
-					return (nv);
-			}
+			uint64_t srchval, theguid;
+
+			verify(nvpair_value_uint64(pair, &srchval) == 0);
+			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+			    &theguid) == 0);
+			if (theguid == srchval)
+				return (nv);
 		}
 		break;
-	}
 
 	case DATA_TYPE_STRING: {
 		char *srchval, *val;
@@ -1653,27 +1964,58 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist
 			break;
 
 		/*
-		 * Search for the requested value. We special case the search
-		 * for ZPOOL_CONFIG_PATH when it's a wholedisk and when
-		 * Looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE).
+		 * Search for the requested value. Special cases:
+		 *
+		 * - ZPOOL_CONFIG_PATH for whole disk entries.  These end in
+		 *   "s0" or "s0/old".  The "s0" part is hidden from the user,
+		 *   but included in the string, so this matches around it.
+		 * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE).
+		 *
 		 * Otherwise, all other searches are simple string compares.
 		 */
-		if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 && val) {
+#ifdef illumos
+		if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 &&
+		    ctd_check_path(val)) {
 			uint64_t wholedisk = 0;
 
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 			    &wholedisk);
 			if (wholedisk) {
+				int slen = strlen(srchval);
+				int vlen = strlen(val);
+
+				if (slen != vlen - 2)
+					break;
+
 				/*
-				 * For whole disks, the internal path has 's0',
-				 * but the path passed in by the user doesn't.
+				 * make_leaf_vdev() should only set
+				 * wholedisk for ZPOOL_CONFIG_PATHs which
+				 * will include "/dev/dsk/", giving plenty of
+				 * room for the indices used next.
 				 */
-				if (strlen(srchval) == strlen(val) - 2 &&
-				    strncmp(srchval, val, strlen(srchval)) == 0)
+				ASSERT(vlen >= 6);
+
+				/*
+				 * strings identical except trailing "s0"
+				 */
+				if (strcmp(&val[vlen - 2], "s0") == 0 &&
+				    strncmp(srchval, val, slen) == 0)
 					return (nv);
+
+				/*
+				 * strings identical except trailing "s0/old"
+				 */
+				if (strcmp(&val[vlen - 6], "s0/old") == 0 &&
+				    strcmp(&srchval[slen - 4], "/old") == 0 &&
+				    strncmp(srchval, val, slen - 4) == 0)
+					return (nv);
+
 				break;
 			}
 		} else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
+#else
+		if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) {
+#endif
 			char *type, *idx, *end, *p;
 			uint64_t id, vdev_id;
 
@@ -1798,6 +2140,9 @@ zpool_find_vdev_by_physpath(zpool_handle
 	    &nvroot) == 0);
 
 	*avail_spare = B_FALSE;
+	*l2cache = B_FALSE;
+	if (log != NULL)
+		*log = B_FALSE;
 	ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
 	nvlist_free(search);
 
@@ -1807,7 +2152,7 @@ zpool_find_vdev_by_physpath(zpool_handle
 /*
  * Determine if we have an "interior" top-level vdev (i.e mirror/raidz).
  */
-static boolean_t
+boolean_t
 zpool_vdev_is_interior(const char *name)
 {
 	if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
@@ -1833,7 +2178,7 @@ zpool_find_vdev(zpool_handle_t *zhp, con
 	} else if (zpool_vdev_is_interior(path)) {
 		verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0);
 	} else if (path[0] != '/') {
-		(void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path);
+		(void) snprintf(buf, sizeof (buf), "%s%s", _PATH_DEV, path);
 		verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0);
 	} else {
 		verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0);
@@ -1931,17 +2276,17 @@ vdev_get_physpaths(nvlist_t *nv, char *p
 	    (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) {
 		nvlist_t **child;
 		uint_t count;
-		int i, rv;
+		int i, ret;
 
 		if (nvlist_lookup_nvlist_array(nv,
 		    ZPOOL_CONFIG_CHILDREN, &child, &count) != 0)
 			return (EZFS_INVALCONFIG);
 
 		for (i = 0; i < count; i++) {
-			rv = vdev_get_physpaths(child[i], physpath,
+			ret = vdev_get_physpaths(child[i], physpath,
 			    phypath_size, rsz, is_spare);
-			if (rv == EZFS_NOSPC)
-				return (rv);
+			if (ret == EZFS_NOSPC)
+				return (ret);
 		}
 	}
 
@@ -1973,11 +2318,9 @@ zpool_get_config_physpath(nvlist_t *conf
 		return (EZFS_INVALCONFIG);
 
 	/*
-	 * root pool can not have EFI labeled disks and can only have
-	 * a single top-level vdev.
+	 * root pool can only have a single top-level vdev.
 	 */
-	if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1 ||
-	    pool_uses_efi(vdev_root))
+	if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1)
 		return (EZFS_POOL_INVALARG);
 
 	(void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz,
@@ -2008,6 +2351,7 @@ zpool_get_physpath(zpool_handle_t *zhp, 
 static int
 zpool_relabel_disk(libzfs_handle_t *hdl, const char *name)
 {
+#ifdef illumos
 	char path[MAXPATHLEN];
 	char errbuf[1024];
 	int fd, error;
@@ -2017,7 +2361,7 @@ zpool_relabel_disk(libzfs_handle_t *hdl,
 	    "efi_use_whole_disk")) == NULL)
 		return (-1);
 
-	(void) snprintf(path, sizeof (path), "%s/%s", RDISK_ROOT, name);
+	(void) snprintf(path, sizeof (path), "%s/%s", ZFS_RDISK_ROOT, name);
 
 	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
@@ -2037,6 +2381,7 @@ zpool_relabel_disk(libzfs_handle_t *hdl,
 		    "relabel '%s': unable to read disk capacity"), name);
 		return (zfs_error(hdl, EZFS_NOCAP, errbuf));
 	}
+#endif	/* illumos */
 	return (0);
 }
 
@@ -2092,15 +2437,15 @@ zpool_vdev_online(zpool_handle_t *zhp, c
 		}
 
 		if (wholedisk) {
-			pathname += strlen(DISK_ROOT) + 1;
-			(void) zpool_relabel_disk(zhp->zpool_hdl, pathname);
+			pathname += strlen(ZFS_DISK_ROOT) + 1;
+			(void) zpool_relabel_disk(hdl, pathname);
 		}
 	}
 
 	zc.zc_cookie = VDEV_STATE_ONLINE;
 	zc.zc_obj = flags;
 
-	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) {
 		if (errno == EINVAL) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split "
 			    "from this pool into a new one.  Use '%s' "
@@ -2142,7 +2487,7 @@ zpool_vdev_offline(zpool_handle_t *zhp, 
 	zc.zc_cookie = VDEV_STATE_OFFLINE;
 	zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0;
 
-	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+	if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
 		return (0);
 
 	switch (errno) {
@@ -2175,14 +2520,14 @@ zpool_vdev_fault(zpool_handle_t *zhp, ui
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
 
 	(void) snprintf(msg, sizeof (msg),
-	    dgettext(TEXT_DOMAIN, "cannot fault %ju"), (uintmax_t)guid);
+	    dgettext(TEXT_DOMAIN, "cannot fault %llu"), guid);
 
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 	zc.zc_guid = guid;
 	zc.zc_cookie = VDEV_STATE_FAULTED;
 	zc.zc_obj = aux;
 
-	if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
 		return (0);
 
 	switch (errno) {
@@ -2210,14 +2555,14 @@ zpool_vdev_degrade(zpool_handle_t *zhp, 
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
 
 	(void) snprintf(msg, sizeof (msg),
-	    dgettext(TEXT_DOMAIN, "cannot degrade %ju"), (uintmax_t)guid);
+	    dgettext(TEXT_DOMAIN, "cannot degrade %llu"), guid);
 
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 	zc.zc_guid = guid;
 	zc.zc_cookie = VDEV_STATE_DEGRADED;
 	zc.zc_obj = aux;
 
-	if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
+	if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
 		return (0);
 
 	return (zpool_standard_error(hdl, errno, msg));
@@ -2265,12 +2610,12 @@ zpool_vdev_attach(zpool_handle_t *zhp,
 	nvlist_t *tgt;
 	boolean_t avail_spare, l2cache, islog;
 	uint64_t val;
-	char *path, *newname;
+	char *newname;
 	nvlist_t **child;
 	uint_t children;
 	nvlist_t *config_root;
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
-	boolean_t rootpool = pool_is_bootable(zhp);
+	boolean_t rootpool = zpool_is_bootable(zhp);
 
 	if (replacing)
 		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
@@ -2279,16 +2624,6 @@ zpool_vdev_attach(zpool_handle_t *zhp,
 		(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
 		    "cannot attach %s to %s"), new_disk, old_disk);
 
-	/*
-	 * If this is a root pool, make sure that we're not attaching an
-	 * EFI labeled device.
-	 */
-	if (rootpool && pool_uses_efi(nvroot)) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "EFI labeled devices are not supported on root pools."));
-		return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
-	}
-
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 	if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache,
 	    &islog)) == 0)
@@ -2331,48 +2666,34 @@ zpool_vdev_attach(zpool_handle_t *zhp,
 		return (zfs_error(hdl, EZFS_BADTARGET, msg));
 	}
 
-	/*
-	 * If we are attempting to replace a spare, it canot be applied to an
-	 * already spared device.
-	 */
-	if (replacing &&
-	    nvlist_lookup_string(child[0], ZPOOL_CONFIG_PATH, &path) == 0 &&
-	    zpool_find_vdev(zhp, newname, &avail_spare,
-	    &l2cache, NULL) != NULL && avail_spare &&
-	    is_replacing_spare(config_root, tgt, 0)) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "device has already been replaced with a spare"));
-		free(newname);
-		return (zfs_error(hdl, EZFS_BADTARGET, msg));
-	}
-
 	free(newname);
 
 	if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
 		return (-1);
 
-	ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_ATTACH, &zc);
+	ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc);
 
 	zcmd_free_nvlists(&zc);
 
 	if (ret == 0) {
 		if (rootpool) {
 			/*
-			 * XXX - This should be removed once we can
-			 * automatically install the bootblocks on the
-			 * newly attached disk.
-			 */
-			(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Please "
-			    "be sure to invoke %s to make '%s' bootable.\n"),
-			    BOOTCMD, new_disk);
-
-			/*
 			 * XXX need a better way to prevent user from
 			 * booting up a half-baked vdev.
 			 */
 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make "
 			    "sure to wait until resilver is done "
 			    "before rebooting.\n"));
+			(void) fprintf(stderr, "\n");
+			(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "If "
+			    "you boot from pool '%s', you may need to update\n"
+			    "boot code on newly attached disk '%s'.\n\n"
+			    "Assuming you use GPT partitioning and 'da0' is "
+			    "your new boot disk\n"
+			    "you may use the following command:\n\n"
+			    "\tgpart bootcode -b /boot/pmbr -p "
+			    "/boot/gptzfsboot -i 1 da0\n\n"),
+			    zhp->zpool_name, new_disk);
 		}
 		return (0);
 	}
@@ -2383,9 +2704,16 @@ zpool_vdev_attach(zpool_handle_t *zhp,
 		 * Can't attach to or replace this type of vdev.
 		 */
 		if (replacing) {
+			uint64_t version = zpool_get_prop_int(zhp,
+			    ZPOOL_PROP_VERSION, NULL);
+
 			if (islog)
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "cannot replace a log with a spare"));
+			else if (version >= SPA_VERSION_MULTI_REPLACE)
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "already in replacing/spare config; wait "
+				    "for completion or use 'zpool detach'"));
 			else
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "cannot replace a replacing device"));
@@ -2483,7 +2811,7 @@ zpool_vdev_detach(zpool_handle_t *zhp, c
 		 */
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only "
 		    "applicable to mirror and replacing vdevs"));
-		(void) zfs_error(zhp->zpool_hdl, EZFS_BADTARGET, msg);
+		(void) zfs_error(hdl, EZFS_BADTARGET, msg);
 		break;
 
 	case EBUSY:
@@ -2575,8 +2903,9 @@ zpool_vdev_split(zpool_handle_t *zhp, ch
 	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0);
 
 	if (props) {
+		prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE };
 		if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name,
-		    props, vers, B_TRUE, msg)) == NULL)
+		    props, vers, flags, msg)) == NULL)
 			return (-1);
 	}
 
@@ -2584,8 +2913,7 @@ zpool_vdev_split(zpool_handle_t *zhp, ch
 	    &children) != 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "Source pool is missing vdev tree"));
-		if (zc_props)
-			nvlist_free(zc_props);
+		nvlist_free(zc_props);
 		return (-1);
 	}
 
@@ -2733,10 +3061,8 @@ out:
 		free(varray);
 	}
 	zcmd_free_nvlists(&zc);
-	if (zc_props)
-		nvlist_free(zc_props);
-	if (newconfig)
-		nvlist_free(newconfig);
+	nvlist_free(zc_props);
+	nvlist_free(newconfig);
 	if (freelist) {
 		nvlist_free(*newroot);
 		*newroot = NULL;
@@ -2810,6 +3136,7 @@ zpool_clear(zpool_handle_t *zhp, const c
 	boolean_t avail_spare, l2cache;
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
 	nvlist_t *nvi = NULL;
+	int error;
 
 	if (path)
 		(void) snprintf(msg, sizeof (msg),
@@ -2840,14 +3167,21 @@ zpool_clear(zpool_handle_t *zhp, const c
 	zpool_get_rewind_policy(rewindnvl, &policy);
 	zc.zc_cookie = policy.zrp_request;
 
-	if (zcmd_alloc_dst_nvlist(hdl, &zc, 8192) != 0)
+	if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0)
 		return (-1);
 
-	if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, rewindnvl) != 0)
+	if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0)
 		return (-1);
 
-	if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0 ||
-	    ((policy.zrp_request & ZPOOL_TRY_REWIND) &&
+	while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 &&
+	    errno == ENOMEM) {
+		if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+			zcmd_free_nvlists(&zc);
+			return (-1);
+		}
+	}
+
+	if (!error || ((policy.zrp_request & ZPOOL_TRY_REWIND) &&
 	    errno != EPERM && errno != EACCES)) {
 		if (policy.zrp_request &
 		    (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) {
@@ -2876,11 +3210,12 @@ zpool_vdev_clear(zpool_handle_t *zhp, ui
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
 
 	(void) snprintf(msg, sizeof (msg),
-	    dgettext(TEXT_DOMAIN, "cannot clear errors for %x"),
-	    (uintmax_t)guid);
+	    dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"),
+	    guid);
 
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 	zc.zc_guid = guid;
+	zc.zc_cookie = ZPOOL_NO_REWIND;
 
 	if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0)
 		return (0);
@@ -2889,6 +3224,46 @@ zpool_vdev_clear(zpool_handle_t *zhp, ui
 }
 
 /*
+ * Change the GUID for a pool.
+ */
+int
+zpool_reguid(zpool_handle_t *zhp)
+{
+	char msg[1024];
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+	zfs_cmd_t zc = { 0 };
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0)
+		return (0);
+
+	return (zpool_standard_error(hdl, errno, msg));
+}
+
+/*
+ * Reopen the pool.
+ */
+int
+zpool_reopen(zpool_handle_t *zhp)
+{
+	zfs_cmd_t zc = { 0 };
+	char msg[1024];
+	libzfs_handle_t *hdl = zhp->zpool_hdl;
+
+	(void) snprintf(msg, sizeof (msg),
+	    dgettext(TEXT_DOMAIN, "cannot reopen '%s'"),
+	    zhp->zpool_name);
+
+	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
+	if (zfs_ioctl(hdl, ZFS_IOC_POOL_REOPEN, &zc) == 0)
+		return (0);
+	return (zpool_standard_error(hdl, errno, msg));
+}
+
+/*
  * Convert from a devid string to a path.
  */
 static char *
@@ -2911,8 +3286,10 @@ devid_to_path(char *devid_str)
 	if (ret != 0)
 		return (NULL);
 
-	if ((path = strdup(list[0].devname)) == NULL)
-		return (NULL);
+	/*
+	 * In case strdup() fails, we will just return NULL below.
+	 */
+	path = strdup(list[0].devname);
 
 	devid_free_nmlist(list);
 
@@ -2925,6 +3302,7 @@ devid_to_path(char *devid_str)
 static char *
 path_to_devid(const char *path)
 {
+#ifdef have_devid
 	int fd;
 	ddi_devid_t devid;
 	char *minor, *ret;
@@ -2944,6 +3322,9 @@ path_to_devid(const char *path)
 	(void) close(fd);
 
 	return (ret);
+#else
+	return (NULL);
+#endif
 }
 
 /*
@@ -2988,15 +3369,25 @@ zpool_vdev_name(libzfs_handle_t *hdl, zp
 	char buf[64];
 	vdev_stat_t *vs;
 	uint_t vsc;
+	int have_stats;
+	int have_path;
+
+	have_stats = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &vsc) == 0;
+	have_path = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0;
 
-	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
-	    &value) == 0) {
+	/*
+	 * If the device is not currently present, assume it will not
+	 * come back at the same device path.  Display the device by GUID.
+	 */
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0 ||
+	    have_path && have_stats && vs->vs_state <= VDEV_STATE_CANT_OPEN) {
 		verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
 		    &value) == 0);
-		(void) snprintf(buf, sizeof (buf), "%ju",
-		    (uintmax_t)value);
+		(void) snprintf(buf, sizeof (buf), "%llu",
+		    (u_longlong_t)value);
 		path = buf;
-	} else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
+	} else if (have_path) {
 
 		/*
 		 * If the device is dead (faulted, offline, etc) then don't
@@ -3004,8 +3395,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zp
 		 * open a misbehaving device, which can have undesirable
 		 * effects.
 		 */
-		if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
-		    (uint64_t **)&vs, &vsc) != 0 ||
+		if ((have_stats == 0 ||
 		    vs->vs_state >= VDEV_STATE_DEGRADED) &&
 		    zhp != NULL &&
 		    nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) {
@@ -3036,17 +3426,35 @@ zpool_vdev_name(libzfs_handle_t *hdl, zp
 				devid_str_free(newdevid);
 		}
 
-		if (strncmp(path, "/dev/dsk/", 9) == 0)
-			path += 9;
+#ifdef illumos
+		if (strncmp(path, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0)
+			path += strlen(ZFS_DISK_ROOTD);
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 		    &value) == 0 && value) {
+			int pathlen = strlen(path);
 			char *tmp = zfs_strdup(hdl, path);
-			if (tmp == NULL)
-				return (NULL);
-			tmp[strlen(path) - 2] = '\0';
+
+			/*
+			 * If it starts with c#, and ends with "s0", chop
+			 * the "s0" off, or if it ends with "s0/old", remove
+			 * the "s0" from the middle.
+			 */
+			if (CTD_CHECK(tmp)) {
+				if (strcmp(&tmp[pathlen - 2], "s0") == 0) {
+					tmp[pathlen - 2] = '\0';
+				} else if (pathlen > 6 &&
+				    strcmp(&tmp[pathlen - 6], "s0/old") == 0) {
+					(void) strcpy(&tmp[pathlen - 6],
+					    "/old");
+				}
+			}
 			return (tmp);
 		}
+#else	/* !illumos */
+		if (strncmp(path, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
+			path += sizeof(_PATH_DEV) - 1;
+#endif	/* illumos */
 	} else {
 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0);
 
@@ -3056,8 +3464,8 @@ zpool_vdev_name(libzfs_handle_t *hdl, zp
 		if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) {
 			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
 			    &value) == 0);
-			(void) snprintf(buf, sizeof (buf), "%s%ju", path,
-			    (uintmax_t)value);
+			(void) snprintf(buf, sizeof (buf), "%s%llu", path,
+			    (u_longlong_t)value);
 			path = buf;
 		}
 
@@ -3070,8 +3478,8 @@ zpool_vdev_name(libzfs_handle_t *hdl, zp
 
 			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
 			    &id) == 0);
-			(void) snprintf(buf, sizeof (buf), "%s-%ju", path,
-			    (uintmax_t)id);
+			(void) snprintf(buf, sizeof (buf), "%s-%llu", path,
+			    (u_longlong_t)id);
 			path = buf;
 		}
 	}
@@ -3080,9 +3488,9 @@ zpool_vdev_name(libzfs_handle_t *hdl, zp
 }
 
 static int
-zbookmark_compare(const void *a, const void *b)
+zbookmark_mem_compare(const void *a, const void *b)
 {
-	return (memcmp(a, b, sizeof (zbookmark_t)));
+	return (memcmp(a, b, sizeof (zbookmark_phys_t)));
 }
 
 /*
@@ -3094,7 +3502,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nv
 {
 	zfs_cmd_t zc = { 0 };
 	uint64_t count;
-	zbookmark_t *zb = NULL;
+	zbookmark_phys_t *zb = NULL;
 	int i;
 
 	/*
@@ -3107,7 +3515,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nv
 	if (count == 0)
 		return (0);
 	if ((zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
-	    count * sizeof (zbookmark_t))) == (uintptr_t)NULL)
+	    count * sizeof (zbookmark_phys_t))) == (uintptr_t)NULL)
 		return (-1);
 	zc.zc_nvlist_dst_size = count;
 	(void) strcpy(zc.zc_name, zhp->zpool_name);
@@ -3116,11 +3524,14 @@ zpool_get_errlog(zpool_handle_t *zhp, nv
 		    &zc) != 0) {
 			free((void *)(uintptr_t)zc.zc_nvlist_dst);
 			if (errno == ENOMEM) {
+				void *dst;
+
 				count = zc.zc_nvlist_dst_size;
-				if ((zc.zc_nvlist_dst = (uintptr_t)
-				    zfs_alloc(zhp->zpool_hdl, count *
-				    sizeof (zbookmark_t))) == (uintptr_t)NULL)
+				dst = zfs_alloc(zhp->zpool_hdl, count *
+				    sizeof (zbookmark_phys_t));
+				if (dst == NULL)
 					return (-1);
+				zc.zc_nvlist_dst = (uintptr_t)dst;
 			} else {
 				return (-1);
 			}
@@ -3136,11 +3547,11 @@ zpool_get_errlog(zpool_handle_t *zhp, nv
 	 * _not_ copied as part of the process.  So we point the start of our
 	 * array appropriately and decrement the total number of elements.
 	 */
-	zb = ((zbookmark_t *)(uintptr_t)zc.zc_nvlist_dst) +
+	zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) +
 	    zc.zc_nvlist_dst_size;
 	count -= zc.zc_nvlist_dst_size;
 
-	qsort(zb, count, sizeof (zbookmark_t), zbookmark_compare);
+	qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
 
 	verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
 
@@ -3202,40 +3613,30 @@ zpool_upgrade(zpool_handle_t *zhp, uint6
 }
 
 void
-zpool_set_history_str(const char *subcommand, int argc, char **argv,
-    char *history_str)
+zfs_save_arguments(int argc, char **argv, char *string, int len)
 {
-	int i;
-
-	(void) strlcpy(history_str, subcommand, HIS_MAX_RECORD_LEN);
-	for (i = 1; i < argc; i++) {
-		if (strlen(history_str) + 1 + strlen(argv[i]) >
-		    HIS_MAX_RECORD_LEN)
-			break;
-		(void) strlcat(history_str, " ", HIS_MAX_RECORD_LEN);
-		(void) strlcat(history_str, argv[i], HIS_MAX_RECORD_LEN);
+	(void) strlcpy(string, basename(argv[0]), len);
+	for (int i = 1; i < argc; i++) {
+		(void) strlcat(string, " ", len);
+		(void) strlcat(string, argv[i], len);
 	}
 }
 
-/*
- * Stage command history for logging.
- */
 int
-zpool_stage_history(libzfs_handle_t *hdl, const char *history_str)
+zpool_log_history(libzfs_handle_t *hdl, const char *message)
 {
-	if (history_str == NULL)
-		return (EINVAL);
-
-	if (strlen(history_str) > HIS_MAX_RECORD_LEN)
-		return (EINVAL);
-
-	if (hdl->libzfs_log_str != NULL)
-		free(hdl->libzfs_log_str);
-
-	if ((hdl->libzfs_log_str = strdup(history_str)) == NULL)
-		return (no_memory(hdl));
+	zfs_cmd_t zc = { 0 };
+	nvlist_t *args;
+	int err;
 
-	return (0);
+	args = fnvlist_alloc();
+	fnvlist_add_string(args, "message", message);
+	err = zcmd_write_src_nvlist(hdl, &zc, args);
+	if (err == 0)
+		err = ioctl(hdl->libzfs_fd, ZFS_IOC_LOG_HISTORY, &zc);
+	nvlist_free(args);
+	zcmd_free_nvlists(&zc);
+	return (err);
 }
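
Note: zfs_save_arguments() and zpool_log_history() replace the staged
history scheme being removed here: the caller flattens its argv once,
performs the operation, and then submits the string explicitly, with
strlcat()'s silent truncation standing in for the old HIS_MAX_RECORD_LEN
length check. A plausible caller, sketched:

	char history_str[HIS_MAX_RECORD_LEN];

	zfs_save_arguments(argc, argv, history_str, sizeof (history_str));
	/* ... perform the zpool/zfs operation ... */
	(void) zpool_log_history(hdl, history_str);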
 
 /*
@@ -3328,7 +3729,9 @@ zpool_history_unpack(char *buf, uint64_t
 	return (0);
 }
 
-#define	HIS_BUF_LEN	(128*1024)
+/* from spa_history.c: spa_history_create_obj() */
+#define	HIS_BUF_LEN_DEF	(128 << 10)
+#define	HIS_BUF_LEN_MAX	(1 << 30)
 
 /*
  * Retrieve the command history of a pool.
@@ -3336,31 +3739,51 @@ zpool_history_unpack(char *buf, uint64_t
 int
 zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp)
 {
-	char buf[HIS_BUF_LEN];
+	char *buf;
+	uint64_t buflen = HIS_BUF_LEN_DEF;
 	uint64_t off = 0;
 	nvlist_t **records = NULL;
 	uint_t numrecords = 0;
 	int err, i;
 
+	buf = malloc(buflen);
+	if (buf == NULL)
+		return (ENOMEM);
 	do {
-		uint64_t bytes_read = sizeof (buf);
+		uint64_t bytes_read = buflen;
 		uint64_t leftover;
 
 		if ((err = get_history(zhp, buf, &off, &bytes_read)) != 0)
 			break;
 
 		/* if nothing else was read in, we're at EOF, just return */
-		if (!bytes_read)
+		if (bytes_read == 0)
 			break;
 
 		if ((err = zpool_history_unpack(buf, bytes_read,
 		    &leftover, &records, &numrecords)) != 0)
 			break;
 		off -= leftover;
+		if (leftover == bytes_read) {
+			 * no progress was made because the buffer is not big
+			 * enough to hold this record; resize and retry.
+			 * to hold this record; resize and retry.
+			 */
+			buflen *= 2;
+			free(buf);
+			buf = NULL;
+			if ((buflen >= HIS_BUF_LEN_MAX) ||
+			    ((buf = malloc(buflen)) == NULL)) {
+				err = ENOMEM;
+				break;
+			}
+		}
 
 		/* CONSTCOND */
 	} while (1);
 
+	free(buf);
+
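
Note: the growth above is bounded: the buffer starts at HIS_BUF_LEN_DEF
(128 << 10, i.e. 128 KB = 2^17 bytes) and doubles whenever a record does
not fit, so at most 13 resizes (2^17 * 2^13 = 2^30) can occur before the
HIS_BUF_LEN_MAX cap makes the loop give up with ENOMEM. Because off is
rewound by leftover before the resize, the oversized record is simply
re-read into the larger buffer on the next pass.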
 	if (!err) {
 		verify(nvlist_alloc(nvhisp, NV_UNIQUE_NAME, 0) == 0);
 		verify(nvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD,
@@ -3380,12 +3803,11 @@ zpool_obj_to_path(zpool_handle_t *zhp, u
 	zfs_cmd_t zc = { 0 };
 	boolean_t mounted = B_FALSE;
 	char *mntpnt = NULL;
-	char dsname[MAXNAMELEN];
+	char dsname[ZFS_MAX_DATASET_NAME_LEN];
 
 	if (dsobj == 0) {
 		/* special case for the MOS */
-		(void) snprintf(pathname, len, "<metadata>:<%#jx>",
-		    (uintmax_t)obj);
+		(void) snprintf(pathname, len, "<metadata>:<0x%llx>", obj);
 		return;
 	}
 
@@ -3395,8 +3817,8 @@ zpool_obj_to_path(zpool_handle_t *zhp, u
 	if (ioctl(zhp->zpool_hdl->libzfs_fd,
 	    ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) {
 		/* just write out a path of two object numbers */
-		(void) snprintf(pathname, len, "<%#jx>:<%#jx>",
-		    (uintmax_t)dsobj, (uintmax_t)obj);
+		(void) snprintf(pathname, len, "<0x%llx>:<0x%llx>",
+		    dsobj, obj);
 		return;
 	}
 	(void) strlcpy(dsname, zc.zc_value, sizeof (dsname));
@@ -3417,12 +3839,12 @@ zpool_obj_to_path(zpool_handle_t *zhp, u
 			    dsname, zc.zc_value);
 		}
 	} else {
-		(void) snprintf(pathname, len, "%s:<%#jx>", dsname,
-		    (uintmax_t)obj);
+		(void) snprintf(pathname, len, "%s:<0x%llx>", dsname, obj);
 	}
 	free(mntpnt);
 }
 
+#ifdef illumos
 /*
  * Read the EFI label from the config, if a label does not exist then
  * pass back the error to the caller. If the caller has passed a non-NULL
@@ -3440,7 +3862,7 @@ read_efi_label(nvlist_t *config, diskadd
 	if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0)
 		return (err);
 
-	(void) snprintf(diskname, sizeof (diskname), "%s%s", RDISK_ROOT,
+	(void) snprintf(diskname, sizeof (diskname), "%s%s", ZFS_RDISK_ROOT,
 	    strrchr(path, '/'));
 	if ((fd = open(diskname, O_RDONLY|O_NDELAY)) >= 0) {
 		struct dk_gpt *vtoc;
@@ -3487,14 +3909,16 @@ find_start_block(nvlist_t *config)
 	}
 	return (MAXOFFSET_T);
 }
+#endif /* illumos */
 
 /*
  * Label an individual disk.  The name provided is the short name,
  * stripped of any leading /dev path.
  */
 int
-zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
+zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name)
 {
+#ifdef illumos
 	char path[MAXPATHLEN];
 	struct dk_gpt *vtoc;
 	int fd;
@@ -3510,13 +3934,6 @@ zpool_label_disk(libzfs_handle_t *hdl, z
 	if (zhp) {
 		nvlist_t *nvroot;
 
-		if (pool_is_bootable(zhp)) {
-			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-			    "EFI labeled devices are not supported on root "
-			    "pools."));
-			return (zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf));
-		}
-
 		verify(nvlist_lookup_nvlist(zhp->zpool_config,
 		    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
 
@@ -3530,7 +3947,7 @@ zpool_label_disk(libzfs_handle_t *hdl, z
 		start_block = NEW_START_BLOCK;
 	}
 
-	(void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name,
+	(void) snprintf(path, sizeof (path), "%s/%s%s", ZFS_RDISK_ROOT, name,
 	    BACKUP_SLICE);
 
 	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
@@ -3599,6 +4016,7 @@ zpool_label_disk(libzfs_handle_t *hdl, z
 
 	(void) close(fd);
 	efi_free(vtoc);
+#endif /* illumos */
 	return (0);
 }
 
@@ -3610,9 +4028,7 @@ supported_dump_vdev_type(libzfs_handle_t
 	uint_t children, c;
 
 	verify(nvlist_lookup_string(config, ZPOOL_CONFIG_TYPE, &type) == 0);
-	if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
-	    strcmp(type, VDEV_TYPE_FILE) == 0 ||
-	    strcmp(type, VDEV_TYPE_LOG) == 0 ||
+	if (strcmp(type, VDEV_TYPE_FILE) == 0 ||
 	    strcmp(type, VDEV_TYPE_HOLE) == 0 ||
 	    strcmp(type, VDEV_TYPE_MISSING) == 0) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
@@ -3631,8 +4047,12 @@ supported_dump_vdev_type(libzfs_handle_t
 }
 
 /*
- * check if this zvol is allowable for use as a dump device; zero if
- * it is, > 0 if it isn't, < 0 if it isn't a zvol
+ * Check if this zvol is allowable for use as a dump device; zero if
+ * it is, > 0 if it isn't, < 0 if it isn't a zvol.
+ *
+ * Allowable storage configurations include mirrors, all raidz variants, and
+ * pools with log, cache, and spare devices.  Pools which are backed by files or
+ * have missing/hole vdevs are not suitable.
  */
 int
 zvol_check_dump_config(char *arg)
@@ -3644,7 +4064,7 @@ zvol_check_dump_config(char *arg)
 	uint_t toplevels;
 	libzfs_handle_t *hdl;
 	char errbuf[1024];
-	char poolname[ZPOOL_MAXNAMELEN];
+	char poolname[ZFS_MAX_DATASET_NAME_LEN];
 	int pathlen = strlen(ZVOL_FULL_DEV_DIR);
 	int ret = 1;
 
@@ -3667,7 +4087,7 @@ zvol_check_dump_config(char *arg)
 		    "malformed dataset name"));
 		(void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf);
 		return (1);
-	} else if (p - volname >= ZFS_MAXNAMELEN) {
+	} else if (p - volname >= ZFS_MAX_DATASET_NAME_LEN) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "dataset name is too long"));
 		(void) zfs_error(hdl, EZFS_NAMETOOLONG, errbuf);
@@ -3694,12 +4114,6 @@ zvol_check_dump_config(char *arg)
 
 	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 	    &top, &toplevels) == 0);
-	if (toplevels != 1) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "'%s' has multiple top level vdevs"), poolname);
-		(void) zfs_error(hdl, EZFS_DEVOVERFLOW, errbuf);
-		goto out;
-	}
 
 	if (!supported_dump_vdev_type(hdl, top[0], errbuf)) {
 		goto out;
@@ -3712,3 +4126,25 @@ out:
 	libzfs_fini(hdl);
 	return (ret);
 }
+
+int
+zpool_nextboot(libzfs_handle_t *hdl, uint64_t pool_guid, uint64_t dev_guid,
+    const char *command)
+{
+	zfs_cmd_t zc = { 0 };
+	nvlist_t *args;
+	char *packed;
+	size_t size;
+	int error;
+
+	args = fnvlist_alloc();
+	fnvlist_add_uint64(args, ZPOOL_CONFIG_POOL_GUID, pool_guid);
+	fnvlist_add_uint64(args, ZPOOL_CONFIG_GUID, dev_guid);
+	fnvlist_add_string(args, "command", command);
+	error = zcmd_write_src_nvlist(hdl, &zc, args);
+	if (error == 0)
+		error = ioctl(hdl->libzfs_fd, ZFS_IOC_NEXTBOOT, &zc);
+	zcmd_free_nvlists(&zc);
+	nvlist_free(args);
+	return (error);
+}
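
Note: zpool_nextboot() and zpool_log_history() use the fnvlist_*()
convenience interfaces that arrived with the libzfs_core work: where the
older nvlist_alloc()/nvlist_add_*() calls return errors and are wrapped
in verify() throughout this file, the f-variants assert success
internally and cannot fail, keeping the packing code straight-line. The
equivalent old-style packing, for comparison:

	nvlist_t *args;

	verify(nvlist_alloc(&args, NV_UNIQUE_NAME, 0) == 0);
	verify(nvlist_add_string(args, "command", command) == 0);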
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c,v
retrieving revision 1.2
diff -u -p -r1.2 libzfs_sendrecv.c
--- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c	3 Jun 2012 10:53:51 -0000	1.2
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c	22 Apr 2017 17:03:30 -0000
@@ -20,8 +20,14 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  */
 
 #include <assert.h>
@@ -34,25 +40,38 @@
 #include <unistd.h>
 #include <stddef.h>
 #include <fcntl.h>
+#include <sys/param.h>
 #include <sys/mount.h>
 #include <pthread.h>
 #include <umem.h>
+#include <time.h>
 
 #include <libzfs.h>
+#include <libzfs_core.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "zfs_fletcher.h"
 #include "libzfs_impl.h"
+#include <zlib.h>
 #include <sha2.h>
 #include <sys/zio_checksum.h>
 #include <sys/ddt.h>
 
+#ifdef __FreeBSD__
+extern int zfs_ioctl_version;
+/* We need to use something for ENODATA. */
+#define	ENODATA	EIDRM
+#endif
+
 /* in libzfs_dataset.c */
 extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
 
-static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t,
-    int, avl_tree_t *, char **);
+static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *,
+    recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int,
+    uint64_t *, const char *);
+static int guid_to_name(libzfs_handle_t *, const char *,
+    uint64_t, boolean_t, char *);
 
 static const zio_cksum_t zero_cksum = { 0 };
 
@@ -62,6 +81,12 @@ typedef struct dedup_arg {
 	libzfs_handle_t  *dedup_hdl;
 } dedup_arg_t;
 
+typedef struct progress_arg {
+	zfs_handle_t *pa_zhp;
+	int pa_fd;
+	boolean_t pa_parsable;
+} progress_arg_t;
+
 typedef struct dataref {
 	uint64_t ref_guid;
 	uint64_t ref_object;
@@ -169,10 +194,28 @@ ddt_update(libzfs_handle_t *hdl, dedup_t
 }
 
 static int
-cksum_and_write(const void *buf, uint64_t len, zio_cksum_t *zc, int outfd)
+dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
+    zio_cksum_t *zc, int outfd)
 {
-	fletcher_4_incremental_native(buf, len, zc);
-	return (write(outfd, buf, len));
+	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+	fletcher_4_incremental_native(drr,
+	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
+	if (drr->drr_type != DRR_BEGIN) {
+		ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
+		    drr_checksum.drr_checksum));
+		drr->drr_u.drr_checksum.drr_checksum = *zc;
+	}
+	fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
+	    sizeof (zio_cksum_t), zc);
+	if (write(outfd, drr, sizeof (*drr)) == -1)
+		return (errno);
+	if (payload_len != 0) {
+		fletcher_4_incremental_native(payload, payload_len, zc);
+		if (write(outfd, payload, payload_len) == -1)
+			return (errno);
+	}
+	return (0);
 }
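
Note: dump_record() centralizes the stream framing: the running
fletcher-4 state is advanced over the header up to (but not including)
drr_checksum, the accumulated value is stored into every record except
DRR_BEGIN, and then the checksum field itself and any payload are folded
in before writing. A receiver can mirror this to verify records
incrementally; a sketch under the same layout assumptions (payload bytes
must be folded into *zc separately, after this check):

	static int
	check_record_cksum(dmu_replay_record_t *drr, zio_cksum_t *zc)
	{
		zio_cksum_t saved = drr->drr_u.drr_checksum.drr_checksum;

		fletcher_4_incremental_native(drr,
		    offsetof(dmu_replay_record_t,
		    drr_u.drr_checksum.drr_checksum), zc);
		if (drr->drr_type != DRR_BEGIN &&
		    !ZIO_CHECKSUM_EQUAL(saved, *zc))
			return (-1);	/* stream corrupt */
		fletcher_4_incremental_native(
		    &drr->drr_u.drr_checksum.drr_checksum,
		    sizeof (zio_cksum_t), zc);
		return (0);
	}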
 
 /*
@@ -196,27 +239,21 @@ static void *
 cksummer(void *arg)
 {
 	dedup_arg_t *dda = arg;
-	char *buf = malloc(1<<20);
+	char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE);
 	dmu_replay_record_t thedrr;
 	dmu_replay_record_t *drr = &thedrr;
-	struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
-	struct drr_end *drre = &thedrr.drr_u.drr_end;
-	struct drr_object *drro = &thedrr.drr_u.drr_object;
-	struct drr_write *drrw = &thedrr.drr_u.drr_write;
 	FILE *ofp;
 	int outfd;
-	dmu_replay_record_t wbr_drr = {0};
-	struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref;
 	dedup_table_t ddt;
 	zio_cksum_t stream_cksum;
 	uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
 	uint64_t numbuckets;
 
 	ddt.max_ddt_size =
-	    MAX((physmem * MAX_DDT_PHYSMEM_PERCENT)/100,
-	    SMALLEST_POSSIBLE_MAX_DDT_MB<<20);
+	    MAX((physmem * MAX_DDT_PHYSMEM_PERCENT) / 100,
+	    SMALLEST_POSSIBLE_MAX_DDT_MB << 20);
 
-	numbuckets = ddt.max_ddt_size/(sizeof (dedup_entry_t));
+	numbuckets = ddt.max_ddt_size / (sizeof (dedup_entry_t));
 
 	/*
 	 * numbuckets must be a power of 2.  Increase number to
@@ -232,86 +269,90 @@ cksummer(void *arg)
 	ddt.numhashbits = high_order_bit(numbuckets) - 1;
 	ddt.ddt_full = B_FALSE;
 
-	/* Initialize the write-by-reference block. */
-	wbr_drr.drr_type = DRR_WRITE_BYREF;
-	wbr_drr.drr_payloadlen = 0;
-
 	outfd = dda->outputfd;
 	ofp = fdopen(dda->inputfd, "r");
-	while (ssread(drr, sizeof (dmu_replay_record_t), ofp) != 0) {
+	while (ssread(drr, sizeof (*drr), ofp) != 0) {
 
 		switch (drr->drr_type) {
 		case DRR_BEGIN:
 		{
-			int	fflags;
+			struct drr_begin *drrb = &drr->drr_u.drr_begin;
+			int fflags;
+			int sz = 0;
 			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
 
+			ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+
 			/* set the DEDUP feature flag for this stream */
 			fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
 			fflags |= (DMU_BACKUP_FEATURE_DEDUP |
 			    DMU_BACKUP_FEATURE_DEDUPPROPS);
 			DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
 
-			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
-			    &stream_cksum, outfd) == -1)
-				goto out;
-			if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
-			    DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) {
-				int sz = drr->drr_payloadlen;
-
-				if (sz > 1<<20) {
-					free(buf);
-					buf = malloc(sz);
+			if (drr->drr_payloadlen != 0) {
+				sz = drr->drr_payloadlen;
+
+				if (sz > SPA_MAXBLOCKSIZE) {
+					buf = zfs_realloc(dda->dedup_hdl, buf,
+					    SPA_MAXBLOCKSIZE, sz);
 				}
 				(void) ssread(buf, sz, ofp);
 				if (ferror(stdin))
 					perror("fread");
-				if (cksum_and_write(buf, sz, &stream_cksum,
-				    outfd) == -1)
-					goto out;
 			}
+			if (dump_record(drr, buf, sz, &stream_cksum,
+			    outfd) != 0)
+				goto out;
 			break;
 		}
 
 		case DRR_END:
 		{
+			struct drr_end *drre = &drr->drr_u.drr_end;
 			/* use the recalculated checksum */
-			ZIO_SET_CHECKSUM(&drre->drr_checksum,
-			    stream_cksum.zc_word[0], stream_cksum.zc_word[1],
-			    stream_cksum.zc_word[2], stream_cksum.zc_word[3]);
-			if ((write(outfd, drr,
-			    sizeof (dmu_replay_record_t))) == -1)
+			drre->drr_checksum = stream_cksum;
+			if (dump_record(drr, NULL, 0, &stream_cksum,
+			    outfd) != 0)
 				goto out;
 			break;
 		}
 
 		case DRR_OBJECT:
 		{
-			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
-			    &stream_cksum, outfd) == -1)
-				goto out;
+			struct drr_object *drro = &drr->drr_u.drr_object;
 			if (drro->drr_bonuslen > 0) {
 				(void) ssread(buf,
 				    P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
 				    ofp);
-				if (cksum_and_write(buf,
-				    P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
-				    &stream_cksum, outfd) == -1)
-					goto out;
 			}
+			if (dump_record(drr, buf,
+			    P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
+			    &stream_cksum, outfd) != 0)
+				goto out;
+			break;
+		}
+
+		case DRR_SPILL:
+		{
+			struct drr_spill *drrs = &drr->drr_u.drr_spill;
+			(void) ssread(buf, drrs->drr_length, ofp);
+			if (dump_record(drr, buf, drrs->drr_length,
+			    &stream_cksum, outfd) != 0)
+				goto out;
 			break;
 		}
 
 		case DRR_FREEOBJECTS:
 		{
-			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
-			    &stream_cksum, outfd) == -1)
+			if (dump_record(drr, NULL, 0, &stream_cksum,
+			    outfd) != 0)
 				goto out;
 			break;
 		}
 
 		case DRR_WRITE:
 		{
+			struct drr_write *drrw = &drr->drr_u.drr_write;
 			dataref_t	dataref;
 
 			(void) ssread(buf, drrw->drr_length, ofp);
@@ -349,7 +390,13 @@ cksummer(void *arg)
 			if (ddt_update(dda->dedup_hdl, &ddt,
 			    &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop,
 			    &dataref)) {
+				dmu_replay_record_t wbr_drr = {0};
+				struct drr_write_byref *wbr_drrr =
+				    &wbr_drr.drr_u.drr_write_byref;
+
 				/* block already present in stream */
+				wbr_drr.drr_type = DRR_WRITE_BYREF;
+
 				wbr_drrr->drr_object = drrw->drr_object;
 				wbr_drrr->drr_offset = drrw->drr_offset;
 				wbr_drrr->drr_length = drrw->drr_length;
@@ -369,34 +416,41 @@ cksummer(void *arg)
 				wbr_drrr->drr_key.ddk_prop =
 				    drrw->drr_key.ddk_prop;
 
-				if (cksum_and_write(&wbr_drr,
-				    sizeof (dmu_replay_record_t), &stream_cksum,
-				    outfd) == -1)
+				if (dump_record(&wbr_drr, NULL, 0,
+				    &stream_cksum, outfd) != 0)
 					goto out;
 			} else {
 				/* block not previously seen */
-				if (cksum_and_write(drr,
-				    sizeof (dmu_replay_record_t), &stream_cksum,
-				    outfd) == -1)
-					goto out;
-				if (cksum_and_write(buf,
-				    drrw->drr_length,
-				    &stream_cksum, outfd) == -1)
+				if (dump_record(drr, buf, drrw->drr_length,
+				    &stream_cksum, outfd) != 0)
 					goto out;
 			}
 			break;
 		}
 
+		case DRR_WRITE_EMBEDDED:
+		{
+			struct drr_write_embedded *drrwe =
+			    &drr->drr_u.drr_write_embedded;
+			(void) ssread(buf,
+			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp);
+			if (dump_record(drr, buf,
+			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8),
+			    &stream_cksum, outfd) != 0)
+				goto out;
+			break;
+		}
+
 		case DRR_FREE:
 		{
-			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
-			    &stream_cksum, outfd) == -1)
+			if (dump_record(drr, NULL, 0, &stream_cksum,
+			    outfd) != 0)
 				goto out;
 			break;
 		}
 
 		default:
-			(void) printf("INVALID record type 0x%x\n",
+			(void) fprintf(stderr, "INVALID record type 0x%x\n",
 			    drr->drr_type);
 			/* should never happen, so assert */
 			assert(B_FALSE);
@@ -526,13 +580,30 @@ fsavl_create(nvlist_t *fss)
  * Routines for dealing with the giant nvlist of fs-nvlists, etc.
  */
 typedef struct send_data {
+	/*
+	 * assigned inside every recursive call,
+	 * restored from *_save on return:
+	 *
+	 * guid of fromsnap snapshot in parent dataset
+	 * txg of fromsnap snapshot in current dataset
+	 * txg of tosnap snapshot in current dataset
+	 */
+
 	uint64_t parent_fromsnap_guid;
+	uint64_t fromsnap_txg;
+	uint64_t tosnap_txg;
+
+	/* the nvlists get accumulated during depth-first traversal */
 	nvlist_t *parent_snaps;
 	nvlist_t *fss;
 	nvlist_t *snapprops;
+
+	/* send-receive configuration, does not change during traversal */
+	const char *fsname;
 	const char *fromsnap;
 	const char *tosnap;
 	boolean_t recursive;
+	boolean_t verbose;
 
 	/*
 	 * The header nvlist is of the following format:
@@ -565,11 +636,23 @@ send_iterate_snap(zfs_handle_t *zhp, voi
 {
 	send_data_t *sd = arg;
 	uint64_t guid = zhp->zfs_dmustats.dds_guid;
+	uint64_t txg = zhp->zfs_dmustats.dds_creation_txg;
 	char *snapname;
 	nvlist_t *nv;
 
 	snapname = strrchr(zhp->zfs_name, '@')+1;
 
+	if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) {
+		if (sd->verbose) {
+			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+			    "skipping snapshot %s because it was created "
+			    "after the destination snapshot (%s)\n"),
+			    zhp->zfs_name, sd->tosnap);
+		}
+		zfs_close(zhp);
+		return (0);
+	}
+
 	VERIFY(0 == nvlist_add_uint64(sd->parent_snaps, snapname, guid));
 	/*
 	 * NB: if there is no fromsnap here (it's a newly created fs in
@@ -663,6 +746,31 @@ send_iterate_prop(zfs_handle_t *zhp, nvl
 }
 
 /*
+ * Returns the creation txg of the given snapshot,
+ * or 0 if the snapshot does not exist.
+ */
+static uint64_t
+get_snap_txg(libzfs_handle_t *hdl, const char *fs, const char *snap)
+{
+	char name[ZFS_MAX_DATASET_NAME_LEN];
+	uint64_t txg = 0;
+
+	if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0')
+		return (txg);
+
+	(void) snprintf(name, sizeof (name), "%s@%s", fs, snap);
+	if (zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT)) {
+		zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT);
+		if (zhp != NULL) {
+			txg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG);
+			zfs_close(zhp);
+		}
+	}
+
+	return (txg);
+}
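+
A sketch (not part of this change) of the filtering rule that send_iterate_snap() and send_iterate_fs() build on top of get_snap_txg(): a txg of 0 means the snapshot was not found and places no bound; otherwise anything created after the tosnap txg is skipped.

	/*
	 * Sketch only: returns B_TRUE if a snapshot with creation txg
	 * 'snap_txg' postdates the destination snapshot and should be
	 * skipped; a tosnap_txg of 0 means "no bound".
	 */
	static boolean_t
	snap_is_too_new(uint64_t snap_txg, uint64_t tosnap_txg)
	{
		return (tosnap_txg != 0 && snap_txg > tosnap_txg);
	}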
+
+/*
  * recursively generate nvlists describing datasets.  See comment
  * for the data structure send_data_t above for description of contents
  * of the nvlist.
@@ -674,9 +782,48 @@ send_iterate_fs(zfs_handle_t *zhp, void 
 	nvlist_t *nvfs, *nv;
 	int rv = 0;
 	uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid;
+	uint64_t fromsnap_txg_save = sd->fromsnap_txg;
+	uint64_t tosnap_txg_save = sd->tosnap_txg;
+	uint64_t txg = zhp->zfs_dmustats.dds_creation_txg;
 	uint64_t guid = zhp->zfs_dmustats.dds_guid;
+	uint64_t fromsnap_txg, tosnap_txg;
 	char guidstring[64];
 
+	fromsnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->fromsnap);
+	if (fromsnap_txg != 0)
+		sd->fromsnap_txg = fromsnap_txg;
+
+	tosnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->tosnap);
+	if (tosnap_txg != 0)
+		sd->tosnap_txg = tosnap_txg;
+
+	/*
+	 * on the send side, if the current dataset does not have tosnap,
+	 * perform two additional checks:
+	 *
+	 * - skip sending the current dataset if it was created later than
+	 *   the parent tosnap
+	 * - return error if the current dataset was created earlier than
+	 *   the parent tosnap
+	 */
+	if (sd->tosnap != NULL && tosnap_txg == 0) {
+		if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) {
+			if (sd->verbose) {
+				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+				    "skipping dataset %s: snapshot %s does "
+				    "not exist\n"), zhp->zfs_name, sd->tosnap);
+			}
+		} else {
+			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+			    "cannot send %s@%s%s: snapshot %s@%s does not "
+			    "exist\n"), sd->fsname, sd->tosnap, sd->recursive ?
+			    dgettext(TEXT_DOMAIN, " recursively") : "",
+			    zhp->zfs_name, sd->tosnap);
+			rv = -1;
+		}
+		goto out;
+	}
+
 	VERIFY(0 == nvlist_alloc(&nvfs, NV_UNIQUE_NAME, 0));
 	VERIFY(0 == nvlist_add_string(nvfs, "name", zhp->zfs_name));
 	VERIFY(0 == nvlist_add_uint64(nvfs, "parentfromsnap",
@@ -685,8 +832,10 @@ send_iterate_fs(zfs_handle_t *zhp, void 
 	if (zhp->zfs_dmustats.dds_origin[0]) {
 		zfs_handle_t *origin = zfs_open(zhp->zfs_hdl,
 		    zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT);
-		if (origin == NULL)
-			return (-1);
+		if (origin == NULL) {
+			rv = -1;
+			goto out;
+		}
 		VERIFY(0 == nvlist_add_uint64(nvfs, "origin",
 		    origin->zfs_dmustats.dds_guid));
 	}
@@ -701,7 +850,7 @@ send_iterate_fs(zfs_handle_t *zhp, void 
 	sd->parent_fromsnap_guid = 0;
 	VERIFY(0 == nvlist_alloc(&sd->parent_snaps, NV_UNIQUE_NAME, 0));
 	VERIFY(0 == nvlist_alloc(&sd->snapprops, NV_UNIQUE_NAME, 0));
-	(void) zfs_iter_snapshots(zhp, send_iterate_snap, sd);
+	(void) zfs_iter_snapshots_sorted(zhp, send_iterate_snap, sd);
 	VERIFY(0 == nvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps));
 	VERIFY(0 == nvlist_add_nvlist(nvfs, "snapprops", sd->snapprops));
 	nvlist_free(sd->parent_snaps);
@@ -717,7 +866,10 @@ send_iterate_fs(zfs_handle_t *zhp, void 
 	if (sd->recursive)
 		rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd);
 
+out:
 	sd->parent_fromsnap_guid = parent_fromsnap_guid_save;
+	sd->fromsnap_txg = fromsnap_txg_save;
+	sd->tosnap_txg = tosnap_txg_save;
 
 	zfs_close(zhp);
 	return (rv);
@@ -725,7 +877,8 @@ send_iterate_fs(zfs_handle_t *zhp, void 
 
 static int
 gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap,
-    const char *tosnap, boolean_t recursive, nvlist_t **nvlp, avl_tree_t **avlp)
+    const char *tosnap, boolean_t recursive, boolean_t verbose,
+    nvlist_t **nvlp, avl_tree_t **avlp)
 {
 	zfs_handle_t *zhp;
 	send_data_t sd = { 0 };
@@ -736,9 +889,11 @@ gather_nvlist(libzfs_handle_t *hdl, cons
 		return (EZFS_BADTYPE);
 
 	VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0));
+	sd.fsname = fsname;
 	sd.fromsnap = fromsnap;
 	sd.tosnap = tosnap;
 	sd.recursive = recursive;
+	sd.verbose = verbose;
 
 	if ((error = send_iterate_fs(zhp, &sd)) != 0) {
 		nvlist_free(sd.fss);
@@ -759,128 +914,137 @@ gather_nvlist(libzfs_handle_t *hdl, cons
 }
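
For reference, a sketch of one per-filesystem entry in the "fss" nvlist that gather_nvlist() returns, reconstructed from the nvlist_add_* calls in send_iterate_fs(); the dataset name and guids are fabricated:

	static nvlist_t *
	example_fs_entry(void)
	{
		nvlist_t *nvfs, *snaps;

		VERIFY(0 == nvlist_alloc(&nvfs, NV_UNIQUE_NAME, 0));
		VERIFY(0 == nvlist_add_string(nvfs, "name", "tank/home"));
		VERIFY(0 == nvlist_add_uint64(nvfs, "parentfromsnap",
		    0x1234ULL));
		/* "snaps" maps short snapshot name -> guid */
		VERIFY(0 == nvlist_alloc(&snaps, NV_UNIQUE_NAME, 0));
		VERIFY(0 == nvlist_add_uint64(snaps, "monday", 0x5678ULL));
		VERIFY(0 == nvlist_add_nvlist(nvfs, "snaps", snaps));
		nvlist_free(snaps);
		return (nvfs);
	}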
 
 /*
- * Routines for dealing with the sorted snapshot functionality
- */
-typedef struct zfs_node {
-	zfs_handle_t	*zn_handle;
-	avl_node_t	zn_avlnode;
-} zfs_node_t;
-
-static int
-zfs_sort_snaps(zfs_handle_t *zhp, void *data)
-{
-	avl_tree_t *avl = data;
-	zfs_node_t *node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t));
-
-	node->zn_handle = zhp;
-	avl_add(avl, node);
-	return (0);
-}
-
-/* ARGSUSED */
-static int
-zfs_snapshot_compare(const void *larg, const void *rarg)
-{
-	zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle;
-	zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle;
-	uint64_t lcreate, rcreate;
-
-	/*
-	 * Sort them according to creation time.  We use the hidden
-	 * CREATETXG property to get an absolute ordering of snapshots.
-	 */
-	lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG);
-	rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG);
-
-	if (lcreate < rcreate)
-		return (-1);
-	else if (lcreate > rcreate)
-		return (+1);
-	else
-		return (0);
-}
-
-int
-zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data)
-{
-	int ret = 0;
-	zfs_node_t *node;
-	avl_tree_t avl;
-	void *cookie = NULL;
-
-	avl_create(&avl, zfs_snapshot_compare,
-	    sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode));
-
-	ret = zfs_iter_snapshots(zhp, zfs_sort_snaps, &avl);
-
-	for (node = avl_first(&avl); node != NULL; node = AVL_NEXT(&avl, node))
-		ret |= callback(node->zn_handle, data);
-
-	while ((node = avl_destroy_nodes(&avl, &cookie)) != NULL)
-		free(node);
-
-	avl_destroy(&avl);
-
-	return (ret);
-}
-
-/*
  * Routines specific to "zfs send"
  */
 typedef struct send_dump_data {
 	/* these are all just the short snapname (the part after the @) */
 	const char *fromsnap;
 	const char *tosnap;
-	char prevsnap[ZFS_MAXNAMELEN];
+	char prevsnap[ZFS_MAX_DATASET_NAME_LEN];
+	uint64_t prevsnap_obj;
 	boolean_t seenfrom, seento, replicate, doall, fromorigin;
-	boolean_t verbose;
+	boolean_t verbose, dryrun, parsable, progress, embed_data, std_out;
+	boolean_t large_block;
 	int outfd;
 	boolean_t err;
 	nvlist_t *fss;
+	nvlist_t *snapholds;
 	avl_tree_t *fsavl;
 	snapfilter_cb_t *filter_cb;
 	void *filter_cb_arg;
+	nvlist_t *debugnv;
+	char holdtag[ZFS_MAX_DATASET_NAME_LEN];
+	int cleanup_fd;
+	uint64_t size;
 } send_dump_data_t;
 
+static int
+estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
+    boolean_t fromorigin, uint64_t *sizep)
+{
+	zfs_cmd_t zc = { 0 };
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+
+	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
+	assert(fromsnap_obj == 0 || !fromorigin);
+
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+	zc.zc_obj = fromorigin;
+	zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
+	zc.zc_fromobj = fromsnap_obj;
+	zc.zc_guid = 1;  /* estimate flag */
+
+	if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) {
+		char errbuf[1024];
+		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+		    "warning: cannot estimate space for '%s'"), zhp->zfs_name);
+
+		switch (errno) {
+		case EXDEV:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "not an earlier snapshot from the same fs"));
+			return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
+
+		case ENOENT:
+			if (zfs_dataset_exists(hdl, zc.zc_name,
+			    ZFS_TYPE_SNAPSHOT)) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "incremental source (@%s) does not exist"),
+				    zc.zc_value);
+			}
+			return (zfs_error(hdl, EZFS_NOENT, errbuf));
+
+		case EDQUOT:
+		case EFBIG:
+		case EIO:
+		case ENOLINK:
+		case ENOSPC:
+		case ENXIO:
+		case EPIPE:
+		case ERANGE:
+		case EFAULT:
+		case EROFS:
+			zfs_error_aux(hdl, strerror(errno));
+			return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
+
+		default:
+			return (zfs_standard_error(hdl, errno, errbuf));
+		}
+	}
+
+	*sizep = zc.zc_objset_type;
+
+	return (0);
+}
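+
As a usage sketch, the estimate is requested by setting zc_guid to 1 and comes back in zc_objset_type; a caller can total per-snapshot estimates the way dump_snapshot() accumulates sdd->size below. The snapshot/from-object pairing here is hypothetical:

	/*
	 * Sketch only: 'snaps' pairs each snapshot handle with the object
	 * number of its predecessor (0 requests a full-send estimate).
	 */
	static uint64_t
	estimate_total(zfs_handle_t **snaps, uint64_t *fromobjs, int nsnaps)
	{
		uint64_t total = 0;

		for (int i = 0; i < nsnaps; i++) {
			uint64_t size = 0;

			if (estimate_ioctl(snaps[i], fromobjs[i],
			    B_FALSE, &size) == 0)
				total += size;
		}
		return (total);
	}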
+
 /*
  * Dumps a backup of the given snapshot (incremental from fromsnap if it's not
  * NULL) to the file descriptor specified by outfd.
  */
 static int
-dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
-    int outfd, boolean_t enoent_ok, boolean_t *got_enoent)
+dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
+    boolean_t fromorigin, int outfd, enum lzc_send_flags flags,
+    nvlist_t *debugnv)
 {
 	zfs_cmd_t zc = { 0 };
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	nvlist_t *thisdbg;
 
 	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
-	assert(fromsnap == NULL || fromsnap[0] == '\0' || !fromorigin);
+	assert(fromsnap_obj == 0 || !fromorigin);
 
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
-	if (fromsnap)
-		(void) strlcpy(zc.zc_value, fromsnap, sizeof (zc.zc_value));
 	zc.zc_cookie = outfd;
 	zc.zc_obj = fromorigin;
+	zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
+	zc.zc_fromobj = fromsnap_obj;
+	zc.zc_flags = flags;
 
-	*got_enoent = B_FALSE;
+	VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0));
+	if (fromsnap && fromsnap[0] != '\0') {
+		VERIFY(0 == nvlist_add_string(thisdbg,
+		    "fromsnap", fromsnap));
+	}
 
-	if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SEND, &zc) != 0) {
+	if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) {
 		char errbuf[1024];
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 		    "warning: cannot send '%s'"), zhp->zfs_name);
 
-		switch (errno) {
+		VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno));
+		if (debugnv) {
+			VERIFY(0 == nvlist_add_nvlist(debugnv,
+			    zhp->zfs_name, thisdbg));
+		}
+		nvlist_free(thisdbg);
 
+		switch (errno) {
 		case EXDEV:
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "not an earlier snapshot from the same fs"));
 			return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
 
 		case ENOENT:
-			if (enoent_ok) {
-				*got_enoent = B_TRUE;
-				return (0);
-			}
 			if (zfs_dataset_exists(hdl, zc.zc_name,
 			    ZFS_TYPE_SNAPSHOT)) {
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
@@ -894,7 +1058,9 @@ dump_ioctl(zfs_handle_t *zhp, const char
 		case EIO:
 		case ENOLINK:
 		case ENOSPC:
+#ifdef illumos
 		case ENOSTR:
+#endif
 		case ENXIO:
 		case EPIPE:
 		case ERANGE:
@@ -908,23 +1074,139 @@ dump_ioctl(zfs_handle_t *zhp, const char
 		}
 	}
 
+	if (debugnv)
+		VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg));
+	nvlist_free(thisdbg);
+
 	return (0);
 }
 
+static void
+gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd)
+{
+	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
+
+	/*
+	 * zfs_send() only sets snapholds for sends that need them,
+	 * e.g. replication and doall.
+	 */
+	if (sdd->snapholds == NULL)
+		return;
+
+	fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag);
+}
+
+static void *
+send_progress_thread(void *arg)
+{
+	progress_arg_t *pa = arg;
+	zfs_cmd_t zc = { 0 };
+	zfs_handle_t *zhp = pa->pa_zhp;
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	unsigned long long bytes;
+	char buf[16];
+	time_t t;
+	struct tm *tm;
+
+	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
+
+	if (!pa->pa_parsable)
+		(void) fprintf(stderr, "TIME        SENT   SNAPSHOT\n");
+
+	/*
+	 * Print the progress from ZFS_IOC_SEND_PROGRESS every second.
+	 */
+	for (;;) {
+		(void) sleep(1);
+
+		zc.zc_cookie = pa->pa_fd;
+		if (zfs_ioctl(hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0)
+			return ((void *)-1);
+
+		(void) time(&t);
+		tm = localtime(&t);
+		bytes = zc.zc_cookie;
+
+		if (pa->pa_parsable) {
+			(void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n",
+			    tm->tm_hour, tm->tm_min, tm->tm_sec,
+			    bytes, zhp->zfs_name);
+		} else {
+			zfs_nicenum(bytes, buf, sizeof (buf));
+			(void) fprintf(stderr, "%02d:%02d:%02d   %5s   %s\n",
+			    tm->tm_hour, tm->tm_min, tm->tm_sec,
+			    buf, zhp->zfs_name);
+		}
+	}
+}
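+
The thread above runs until it is cancelled; sleep() is a cancellation point, so pthread_cancel() followed by pthread_join() tears it down cleanly. A self-contained model of that lifecycle (ticker() is a stand-in, not the real reporter):

	#include <pthread.h>
	#include <stdio.h>
	#include <unistd.h>

	/* stand-in for send_progress_thread(): report once per second */
	static void *
	ticker(void *arg)
	{
		(void) arg;
		for (;;) {
			(void) sleep(1);	/* a cancellation point */
			(void) fprintf(stderr, "still sending...\n");
		}
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t tid;

		if (pthread_create(&tid, NULL, ticker, NULL) != 0)
			return (1);
		(void) sleep(3);		/* stand-in for the send */
		(void) pthread_cancel(tid);
		(void) pthread_join(tid, NULL);
		return (0);
	}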
+
+static void
+send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap,
+    uint64_t size, boolean_t parsable)
+{
+	if (parsable) {
+		if (fromsnap != NULL) {
+			(void) fprintf(fout, "incremental\t%s\t%s",
+			    fromsnap, tosnap);
+		} else {
+			(void) fprintf(fout, "full\t%s",
+			    tosnap);
+		}
+	} else {
+		if (fromsnap != NULL) {
+			if (strchr(fromsnap, '@') == NULL &&
+			    strchr(fromsnap, '#') == NULL) {
+				(void) fprintf(fout, dgettext(TEXT_DOMAIN,
+				    "send from @%s to %s"),
+				    fromsnap, tosnap);
+			} else {
+				(void) fprintf(fout, dgettext(TEXT_DOMAIN,
+				    "send from %s to %s"),
+				    fromsnap, tosnap);
+			}
+		} else {
+			(void) fprintf(fout, dgettext(TEXT_DOMAIN,
+			    "full send of %s"),
+			    tosnap);
+		}
+	}
+
+	if (size != 0) {
+		if (parsable) {
+			(void) fprintf(fout, "\t%llu",
+			    (longlong_t)size);
+		} else {
+			char buf[16];
+			zfs_nicenum(size, buf, sizeof (buf));
+			(void) fprintf(fout, dgettext(TEXT_DOMAIN,
+			    " estimated size is %s"), buf);
+		}
+	}
+	(void) fprintf(fout, "\n");
+}
+
 static int
 dump_snapshot(zfs_handle_t *zhp, void *arg)
 {
 	send_dump_data_t *sdd = arg;
-	const char *thissnap;
+	progress_arg_t pa = { 0 };
+	pthread_t tid;
+	char *thissnap;
 	int err;
-	boolean_t got_enoent;
+	boolean_t isfromsnap, istosnap, fromorigin;
+	boolean_t exclude = B_FALSE;
+	FILE *fout = sdd->std_out ? stdout : stderr;
 
+	err = 0;
 	thissnap = strchr(zhp->zfs_name, '@') + 1;
+	isfromsnap = (sdd->fromsnap != NULL &&
+	    strcmp(sdd->fromsnap, thissnap) == 0);
 
-	if (sdd->fromsnap && !sdd->seenfrom &&
-	    strcmp(sdd->fromsnap, thissnap) == 0) {
+	if (!sdd->seenfrom && isfromsnap) {
+		gather_holds(zhp, sdd);
 		sdd->seenfrom = B_TRUE;
 		(void) strcpy(sdd->prevsnap, thissnap);
+		sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
 		zfs_close(zhp);
 		return (0);
 	}
@@ -934,18 +1216,40 @@ dump_snapshot(zfs_handle_t *zhp, void *a
 		return (0);
 	}
 
-	if (strcmp(sdd->tosnap, thissnap) == 0)
+	istosnap = (strcmp(sdd->tosnap, thissnap) == 0);
+	if (istosnap)
 		sdd->seento = B_TRUE;
 
+	if (!sdd->doall && !isfromsnap && !istosnap) {
+		if (sdd->replicate) {
+			char *snapname;
+			nvlist_t *snapprops;
+			/*
+			 * Filter out all intermediate snapshots except origin
+			 * snapshots needed to replicate clones.
+			 */
+			nvlist_t *nvfs = fsavl_find(sdd->fsavl,
+			    zhp->zfs_dmustats.dds_guid, &snapname);
+
+			VERIFY(0 == nvlist_lookup_nvlist(nvfs,
+			    "snapprops", &snapprops));
+			VERIFY(0 == nvlist_lookup_nvlist(snapprops,
+			    thissnap, &snapprops));
+			exclude = !nvlist_exists(snapprops, "is_clone_origin");
+		} else {
+			exclude = B_TRUE;
+		}
+	}
+
 	/*
 	 * If a filter function exists, call it to determine whether
 	 * this snapshot will be sent.
 	 */
-	if (sdd->filter_cb != NULL &&
-	    sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE) {
+	if (exclude || (sdd->filter_cb != NULL &&
+	    sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) {
 		/*
 		 * This snapshot is filtered out.  Don't send it, and don't
-		 * set prevsnap, so it will be as if this snapshot didn't
+		 * set prevsnap_obj, so it will be as if this snapshot didn't
 		 * exist, and the next accepted snapshot will be sent as
 		 * an incremental from the last accepted one, or as the
 		 * first (and full) snapshot in the case of a replication,
@@ -955,20 +1259,55 @@ dump_snapshot(zfs_handle_t *zhp, void *a
 		return (0);
 	}
 
-	/* send it */
+	gather_holds(zhp, sdd);
+	fromorigin = sdd->prevsnap[0] == '\0' &&
+	    (sdd->fromorigin || sdd->replicate);
+
 	if (sdd->verbose) {
-		(void) fprintf(stderr, "sending from @%s to %s\n",
-		    sdd->prevsnap, zhp->zfs_name);
+		uint64_t size = 0;
+		(void) estimate_ioctl(zhp, sdd->prevsnap_obj,
+		    fromorigin, &size);
+
+		send_print_verbose(fout, zhp->zfs_name,
+		    sdd->prevsnap[0] ? sdd->prevsnap : NULL,
+		    size, sdd->parsable);
+		sdd->size += size;
 	}
 
-	err = dump_ioctl(zhp, sdd->prevsnap,
-	    sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate),
-	    sdd->outfd, B_TRUE, &got_enoent);
+	if (!sdd->dryrun) {
+		/*
+		 * If progress reporting is requested, spawn a new thread to
+		 * poll ZFS_IOC_SEND_PROGRESS at a regular interval.
+		 */
+		if (sdd->progress) {
+			pa.pa_zhp = zhp;
+			pa.pa_fd = sdd->outfd;
+			pa.pa_parsable = sdd->parsable;
 
-	if (got_enoent)
-		err = 0;
-	else
-		(void) strcpy(sdd->prevsnap, thissnap);
+			if ((err = pthread_create(&tid, NULL,
+			    send_progress_thread, &pa)) != 0) {
+				zfs_close(zhp);
+				return (err);
+			}
+		}
+
+		enum lzc_send_flags flags = 0;
+		if (sdd->large_block)
+			flags |= LZC_SEND_FLAG_LARGE_BLOCK;
+		if (sdd->embed_data)
+			flags |= LZC_SEND_FLAG_EMBED_DATA;
+
+		err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
+		    fromorigin, sdd->outfd, flags, sdd->debugnv);
+
+		if (sdd->progress) {
+			(void) pthread_cancel(tid);
+			(void) pthread_join(tid, NULL);
+		}
+	}
+
+	(void) strcpy(sdd->prevsnap, thissnap);
+	sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
 	zfs_close(zhp);
 	return (err);
 }
@@ -984,8 +1323,8 @@ dump_filesystem(zfs_handle_t *zhp, void 
 	(void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s",
 	    zhp->zfs_name, sdd->tosnap);
 	if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) {
-		(void) fprintf(stderr, "WARNING: "
-		    "could not send %s@%s: does not exist\n",
+		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+		    "WARNING: could not send %s@%s: does not exist\n"),
 		    zhp->zfs_name, sdd->tosnap);
 		sdd->err = B_TRUE;
 		return (0);
@@ -1007,57 +1346,34 @@ dump_filesystem(zfs_handle_t *zhp, void 
 		}
 	}
 
-	if (sdd->doall) {
-		sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0;
-		if (sdd->fromsnap == NULL || missingfrom)
-			sdd->seenfrom = B_TRUE;
-
-		rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg);
-		if (!sdd->seenfrom) {
-			(void) fprintf(stderr,
+	sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0;
+	sdd->prevsnap_obj = 0;
+	if (sdd->fromsnap == NULL || missingfrom)
+		sdd->seenfrom = B_TRUE;
+
+	rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg);
+	if (!sdd->seenfrom) {
+		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+		    "WARNING: could not send %s@%s:\n"
+		    "incremental source (%s@%s) does not exist\n"),
+		    zhp->zfs_name, sdd->tosnap,
+		    zhp->zfs_name, sdd->fromsnap);
+		sdd->err = B_TRUE;
+	} else if (!sdd->seento) {
+		if (sdd->fromsnap) {
+			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
 			    "WARNING: could not send %s@%s:\n"
-			    "incremental source (%s@%s) does not exist\n",
+			    "incremental source (%s@%s) "
+			    "is not earlier than it\n"),
 			    zhp->zfs_name, sdd->tosnap,
 			    zhp->zfs_name, sdd->fromsnap);
-			sdd->err = B_TRUE;
-		} else if (!sdd->seento) {
-			if (sdd->fromsnap) {
-				(void) fprintf(stderr,
-				    "WARNING: could not send %s@%s:\n"
-				    "incremental source (%s@%s) "
-				    "is not earlier than it\n",
-				    zhp->zfs_name, sdd->tosnap,
-				    zhp->zfs_name, sdd->fromsnap);
-			} else {
-				(void) fprintf(stderr, "WARNING: "
-				    "could not send %s@%s: does not exist\n",
-				    zhp->zfs_name, sdd->tosnap);
-			}
-			sdd->err = B_TRUE;
-		}
-	} else {
-		zfs_handle_t *snapzhp;
-		char snapname[ZFS_MAXNAMELEN];
-
-		(void) snprintf(snapname, sizeof (snapname), "%s@%s",
-		    zfs_get_name(zhp), sdd->tosnap);
-		snapzhp = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT);
-		if (snapzhp == NULL) {
-			rv = -1;
 		} else {
-			if (sdd->filter_cb == NULL ||
-			    sdd->filter_cb(snapzhp, sdd->filter_cb_arg) ==
-			    B_TRUE) {
-				boolean_t got_enoent;
-
-				rv = dump_ioctl(snapzhp,
-				    missingfrom ? NULL : sdd->fromsnap,
-				    sdd->fromorigin || missingfrom,
-				    sdd->outfd, B_FALSE, &got_enoent);
-			}
-			sdd->seento = B_TRUE;
-			zfs_close(snapzhp);
+			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+			    "WARNING: "
+			    "could not send %s@%s: does not exist\n"),
+			    zhp->zfs_name, sdd->tosnap);
 		}
+		sdd->err = B_TRUE;
 	}
 
 	return (rv);
@@ -1073,16 +1389,39 @@ dump_filesystems(zfs_handle_t *rzhp, voi
 	if (!sdd->replicate)
 		return (dump_filesystem(rzhp, sdd));
 
+	/* Mark the clone origin snapshots. */
+	for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
+	    fspair = nvlist_next_nvpair(sdd->fss, fspair)) {
+		nvlist_t *nvfs;
+		uint64_t origin_guid = 0;
+
+		VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs));
+		(void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid);
+		if (origin_guid != 0) {
+			char *snapname;
+			nvlist_t *origin_nv = fsavl_find(sdd->fsavl,
+			    origin_guid, &snapname);
+			if (origin_nv != NULL) {
+				nvlist_t *snapprops;
+				VERIFY(0 == nvlist_lookup_nvlist(origin_nv,
+				    "snapprops", &snapprops));
+				VERIFY(0 == nvlist_lookup_nvlist(snapprops,
+				    snapname, &snapprops));
+				VERIFY(0 == nvlist_add_boolean(
+				    snapprops, "is_clone_origin"));
+			}
+		}
+	}
 again:
 	needagain = progress = B_FALSE;
 	for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
 	    fspair = nvlist_next_nvpair(sdd->fss, fspair)) {
-		nvlist_t *fslist;
+		nvlist_t *fslist, *parent_nv;
 		char *fsname;
 		zfs_handle_t *zhp;
 		int err;
 		uint64_t origin_guid = 0;
-		nvlist_t *origin_nv;
+		uint64_t parent_guid = 0;
 
 		VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0);
 		if (nvlist_lookup_boolean(fslist, "sent") == 0)
@@ -1090,16 +1429,30 @@ again:
 
 		VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0);
 		(void) nvlist_lookup_uint64(fslist, "origin", &origin_guid);
+		(void) nvlist_lookup_uint64(fslist, "parentfromsnap",
+		    &parent_guid);
 
-		origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL);
-		if (origin_nv &&
-		    nvlist_lookup_boolean(origin_nv, "sent") == ENOENT) {
-			/*
-			 * origin has not been sent yet;
-			 * skip this clone.
-			 */
-			needagain = B_TRUE;
-			continue;
+		if (parent_guid != 0) {
+			parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL);
+			if (!nvlist_exists(parent_nv, "sent")) {
+				/* parent has not been sent; skip this one */
+				needagain = B_TRUE;
+				continue;
+			}
+		}
+
+		if (origin_guid != 0) {
+			nvlist_t *origin_nv = fsavl_find(sdd->fsavl,
+			    origin_guid, NULL);
+			if (origin_nv != NULL &&
+			    !nvlist_exists(origin_nv, "sent")) {
+				/*
+				 * origin has not been sent yet;
+				 * skip this clone.
+				 */
+				needagain = B_TRUE;
+				continue;
+			}
 		}
 
 		zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET);
@@ -1116,9 +1469,246 @@ again:
 		assert(progress);
 		goto again;
 	}
+
+	/* clean out the sent flags in case we reuse this fss */
+	for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
+	    fspair = nvlist_next_nvpair(sdd->fss, fspair)) {
+		nvlist_t *fslist;
+
+		VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0);
+		(void) nvlist_remove_all(fslist, "sent");
+	}
+
 	return (0);
 }
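
The again: loop above is a multi-pass topological drain: each pass sends every filesystem whose parent and origin have already been sent, and a pass that still needs another round but makes no progress would indicate a dependency cycle (hence the assert). A toy model with a fabricated parent table:

	#include <stdio.h>

	int
	main(void)
	{
		/* parent[i] must be "sent" before i; -1 means no parent */
		int parent[] = { 1, -1, 1, 0 };
		int sent[] = { 0, 0, 0, 0 };
		int needagain, progress;

		do {
			needagain = progress = 0;
			for (int i = 0; i < 4; i++) {
				if (sent[i])
					continue;
				if (parent[i] != -1 && !sent[parent[i]]) {
					needagain = 1;	/* defer to next pass */
					continue;
				}
				sent[i] = progress = 1;
				(void) printf("sent %d\n", i);
			}
		} while (needagain && progress);
		return (0);
	}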
 
+nvlist_t *
+zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token)
+{
+	unsigned int version;
+	int nread;
+	unsigned long long checksum, packed_len;
+
+	/*
+	 * Decode the token header, which is:
+	 *   <token version>-<checksum of payload>-<uncompressed payload length>
+	 * followed by the hex-encoded, compressed, packed payload.
+	 * Note that the only supported token version is 1.
+	 */
+	nread = sscanf(token, "%u-%llx-%llx-",
+	    &version, &checksum, &packed_len);
+	if (nread != 3) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "resume token is corrupt (invalid format)"));
+		return (NULL);
+	}
+
+	if (version != ZFS_SEND_RESUME_TOKEN_VERSION) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "resume token is corrupt (invalid version %u)"),
+		    version);
+		return (NULL);
+	}
+
+	/* convert hexadecimal representation to binary */
+	token = strrchr(token, '-') + 1;
+	int len = strlen(token) / 2;
+	unsigned char *compressed = zfs_alloc(hdl, len);
+	for (int i = 0; i < len; i++) {
+		nread = sscanf(token + i * 2, "%2hhx", compressed + i);
+		if (nread != 1) {
+			free(compressed);
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "resume token is corrupt "
+			    "(payload is not hex-encoded)"));
+			return (NULL);
+		}
+	}
+
+	/* verify checksum */
+	zio_cksum_t cksum;
+	fletcher_4_native(compressed, len, NULL, &cksum);
+	if (cksum.zc_word[0] != checksum) {
+		free(compressed);
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "resume token is corrupt (incorrect checksum)"));
+		return (NULL);
+	}
+
+	/* uncompress */
+	void *packed = zfs_alloc(hdl, packed_len);
+	uLongf packed_len_long = packed_len;
+	if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK ||
+	    packed_len_long != packed_len) {
+		free(packed);
+		free(compressed);
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "resume token is corrupt (decompression failed)"));
+		return (NULL);
+	}
+
+	/* unpack nvlist */
+	nvlist_t *nv;
+	int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP);
+	free(packed);
+	free(compressed);
+	if (error != 0) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "resume token is corrupt (nvlist_unpack failed)"));
+		return (NULL);
+	}
+	return (nv);
+}
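+
A standalone sketch of the header format decoded above; the token text is fabricated, and only the three dash-separated header fields are parsed (the hex payload would follow the final dash):

	#include <stdio.h>

	int
	main(void)
	{
		const char *token = "1-deadbeef-9c-0a0b0c";	/* fabricated */
		unsigned int version;
		unsigned long long checksum, packed_len;

		if (sscanf(token, "%u-%llx-%llx-",
		    &version, &checksum, &packed_len) == 3) {
			(void) printf("version %u, payload %llu bytes\n",
			    version, packed_len);
		}
		return (0);
	}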
+
+int
+zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd,
+    const char *resume_token)
+{
+	char errbuf[1024];
+	char *toname;
+	char *fromname = NULL;
+	uint64_t resumeobj, resumeoff, toguid, fromguid, bytes;
+	zfs_handle_t *zhp;
+	int error = 0;
+	char name[ZFS_MAX_DATASET_NAME_LEN];
+	enum lzc_send_flags lzc_flags = 0;
+
+	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+	    "cannot resume send"));
+
+	nvlist_t *resume_nvl =
+	    zfs_send_resume_token_to_nvlist(hdl, resume_token);
+	if (resume_nvl == NULL) {
+		/*
+		 * zfs_error_aux has already been set by
+		 * zfs_send_resume_token_to_nvlist
+		 */
+		return (zfs_error(hdl, EZFS_FAULT, errbuf));
+	}
+	if (flags->verbose) {
+		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+		    "resume token contents:\n"));
+		nvlist_print(stderr, resume_nvl);
+	}
+
+	if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 ||
+	    nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 ||
+	    nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 ||
+	    nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 ||
+	    nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "resume token is corrupt"));
+		return (zfs_error(hdl, EZFS_FAULT, errbuf));
+	}
+	fromguid = 0;
+	(void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid);
+
+	if (flags->embed_data || nvlist_exists(resume_nvl, "embedok"))
+		lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
+
+	if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) {
+		if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "'%s' is no longer the same snapshot used in "
+			    "the initial send"), toname);
+		} else {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "'%s' used in the initial send no longer exists"),
+			    toname);
+		}
+		return (zfs_error(hdl, EZFS_BADPATH, errbuf));
+	}
+	zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
+	if (zhp == NULL) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "unable to access '%s'"), name);
+		return (zfs_error(hdl, EZFS_BADPATH, errbuf));
+	}
+
+	if (fromguid != 0) {
+		if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "incremental source %#llx no longer exists"),
+			    (longlong_t)fromguid);
+			return (zfs_error(hdl, EZFS_BADPATH, errbuf));
+		}
+		fromname = name;
+	}
+
+	if (flags->verbose) {
+		uint64_t size = 0;
+		error = lzc_send_space(zhp->zfs_name, fromname, &size);
+		if (error == 0)
+			size = MAX(0, (int64_t)(size - bytes));
+		send_print_verbose(stderr, zhp->zfs_name, fromname,
+		    size, flags->parsable);
+	}
+
+	if (!flags->dryrun) {
+		progress_arg_t pa = { 0 };
+		pthread_t tid;
+		/*
+		 * If progress reporting is requested, spawn a new thread to
+		 * poll ZFS_IOC_SEND_PROGRESS at a regular interval.
+		 */
+		if (flags->progress) {
+			pa.pa_zhp = zhp;
+			pa.pa_fd = outfd;
+			pa.pa_parsable = flags->parsable;
+
+			error = pthread_create(&tid, NULL,
+			    send_progress_thread, &pa);
+			if (error != 0) {
+				zfs_close(zhp);
+				return (error);
+			}
+		}
+
+		error = lzc_send_resume(zhp->zfs_name, fromname, outfd,
+		    lzc_flags, resumeobj, resumeoff);
+
+		if (flags->progress) {
+			(void) pthread_cancel(tid);
+			(void) pthread_join(tid, NULL);
+		}
+
+		char errbuf[1024];
+		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+		    "warning: cannot send '%s'"), zhp->zfs_name);
+
+		zfs_close(zhp);
+
+		switch (error) {
+		case 0:
+			return (0);
+		case EXDEV:
+		case ENOENT:
+		case EDQUOT:
+		case EFBIG:
+		case EIO:
+		case ENOLINK:
+		case ENOSPC:
+#ifdef illumos
+		case ENOSTR:
+#endif
+		case ENXIO:
+		case EPIPE:
+		case ERANGE:
+		case EFAULT:
+		case EROFS:
+			zfs_error_aux(hdl, strerror(errno));
+			return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
+
+		default:
+			return (zfs_standard_error(hdl, errno, errbuf));
+		}
+	}
+
+	zfs_close(zhp);
+
+	return (error);
+}
+
 /*
  * Generate a send stream for the dataset identified by the argument zhp.
  *
@@ -1137,22 +1727,21 @@ again:
  */
 int
 zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
-    sendflags_t flags, int outfd, snapfilter_cb_t filter_func,
-    void *cb_arg)
+    sendflags_t *flags, int outfd, snapfilter_cb_t filter_func,
+    void *cb_arg, nvlist_t **debugnvp)
 {
 	char errbuf[1024];
 	send_dump_data_t sdd = { 0 };
-	int err;
+	int err = 0;
 	nvlist_t *fss = NULL;
 	avl_tree_t *fsavl = NULL;
-	char holdtag[128];
 	static uint64_t holdseq;
 	int spa_version;
-	boolean_t holdsnaps = B_FALSE;
-	pthread_t tid;
+	pthread_t tid = 0;
 	int pipefd[2];
 	dedup_arg_t dda = { 0 };
 	int featureflags = 0;
+	FILE *fout;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot send '%s'"), zhp->zfs_name);
@@ -1163,14 +1752,18 @@ zfs_send(zfs_handle_t *zhp, const char *
 		return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf));
 	}
 
-	if (zfs_spa_version(zhp, &spa_version) == 0 &&
-	    spa_version >= SPA_VERSION_USERREFS)
-		holdsnaps = B_TRUE;
+	if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) {
+		uint64_t version;
+		version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION);
+		if (version >= ZPL_VERSION_SA) {
+			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
+		}
+	}
 
-	if (flags.dedup) {
+	if (flags->dedup && !flags->dryrun) {
 		featureflags |= (DMU_BACKUP_FEATURE_DEDUP |
 		    DMU_BACKUP_FEATURE_DEDUPPROPS);
-		if (err = pipe(pipefd)) {
+		if ((err = pipe(pipefd)) != 0) {
 			zfs_error_aux(zhp->zfs_hdl, strerror(errno));
 			return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED,
 			    errbuf));
@@ -1178,7 +1771,7 @@ zfs_send(zfs_handle_t *zhp, const char *
 		dda.outputfd = outfd;
 		dda.inputfd = pipefd[1];
 		dda.dedup_hdl = zhp->zfs_hdl;
-		if (err = pthread_create(&tid, NULL, cksummer, &dda)) {
+		if ((err = pthread_create(&tid, NULL, cksummer, &dda)) != 0) {
 			(void) close(pipefd[0]);
 			(void) close(pipefd[1]);
 			zfs_error_aux(zhp->zfs_hdl, strerror(errno));
@@ -1187,23 +1780,13 @@ zfs_send(zfs_handle_t *zhp, const char *
 		}
 	}
 
-	if (flags.replicate || flags.doall || flags.props) {
+	if (flags->replicate || flags->doall || flags->props) {
 		dmu_replay_record_t drr = { 0 };
 		char *packbuf = NULL;
 		size_t buflen = 0;
 		zio_cksum_t zc = { 0 };
 
-		if (holdsnaps) {
-			(void) snprintf(holdtag, sizeof (holdtag),
-			    ".send-%d-%llu", getpid(), (u_longlong_t)holdseq);
-			++holdseq;
-			err = zfs_hold_range(zhp, fromsnap, tosnap,
-			    holdtag, flags.replicate, B_TRUE);
-			if (err)
-				goto err_out;
-		}
-
-		if (flags.replicate || flags.props) {
+		if (flags->replicate || flags->props) {
 			nvlist_t *hdrnv;
 
 			VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0));
@@ -1212,108 +1795,178 @@ zfs_send(zfs_handle_t *zhp, const char *
 				    "fromsnap", fromsnap));
 			}
 			VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap));
-			if (!flags.replicate) {
+			if (!flags->replicate) {
 				VERIFY(0 == nvlist_add_boolean(hdrnv,
 				    "not_recursive"));
 			}
 
 			err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name,
-			    fromsnap, tosnap, flags.replicate, &fss, &fsavl);
-			if (err) {
-				if (holdsnaps) {
-					(void) zfs_release_range(zhp, fromsnap,
-					    tosnap, holdtag, flags.replicate);
-				}
+			    fromsnap, tosnap, flags->replicate, flags->verbose,
+			    &fss, &fsavl);
+			if (err)
 				goto err_out;
-			}
 			VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss));
 			err = nvlist_pack(hdrnv, &packbuf, &buflen,
 			    NV_ENCODE_XDR, 0);
-			nvlist_free(hdrnv);
-			if (err) {
-				fsavl_destroy(fsavl);
-				nvlist_free(fss);
-				if (holdsnaps) {
-					(void) zfs_release_range(zhp, fromsnap,
-					    tosnap, holdtag, flags.replicate);
-				}
+			if (debugnvp)
+				*debugnvp = hdrnv;
+			else
+				nvlist_free(hdrnv);
+			if (err)
 				goto stderr_out;
-			}
 		}
 
-		/* write first begin record */
-		drr.drr_type = DRR_BEGIN;
-		drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
-		DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin.drr_versioninfo,
-		    DMU_COMPOUNDSTREAM);
-		DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin.drr_versioninfo,
-		    featureflags);
-		(void) snprintf(drr.drr_u.drr_begin.drr_toname,
-		    sizeof (drr.drr_u.drr_begin.drr_toname),
-		    "%s@%s", zhp->zfs_name, tosnap);
-		drr.drr_payloadlen = buflen;
-		err = cksum_and_write(&drr, sizeof (drr), &zc, outfd);
-
-		/* write header nvlist */
-		if (err != -1 && packbuf != NULL) {
-			err = cksum_and_write(packbuf, buflen, &zc, outfd);
-		}
-		free(packbuf);
-		if (err == -1) {
-			fsavl_destroy(fsavl);
-			nvlist_free(fss);
-			if (holdsnaps) {
-				(void) zfs_release_range(zhp, fromsnap, tosnap,
-				    holdtag, flags.replicate);
-			}
-			err = errno;
-			goto stderr_out;
-		}
+		if (!flags->dryrun) {
+			/* write first begin record */
+			drr.drr_type = DRR_BEGIN;
+			drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
+			DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin.
+			    drr_versioninfo, DMU_COMPOUNDSTREAM);
+			DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin.
+			    drr_versioninfo, featureflags);
+			(void) snprintf(drr.drr_u.drr_begin.drr_toname,
+			    sizeof (drr.drr_u.drr_begin.drr_toname),
+			    "%s@%s", zhp->zfs_name, tosnap);
+			drr.drr_payloadlen = buflen;
+
+			err = dump_record(&drr, packbuf, buflen, &zc, outfd);
+			free(packbuf);
+			if (err != 0)
+				goto stderr_out;
 
-		/* write end record */
-		if (err != -1) {
+			/* write end record */
 			bzero(&drr, sizeof (drr));
 			drr.drr_type = DRR_END;
 			drr.drr_u.drr_end.drr_checksum = zc;
 			err = write(outfd, &drr, sizeof (drr));
 			if (err == -1) {
-				fsavl_destroy(fsavl);
-				nvlist_free(fss);
 				err = errno;
-				if (holdsnaps) {
-					(void) zfs_release_range(zhp, fromsnap,
-					    tosnap, holdtag, flags.replicate);
-				}
 				goto stderr_out;
 			}
+
+			err = 0;
 		}
 	}
 
 	/* dump each stream */
 	sdd.fromsnap = fromsnap;
 	sdd.tosnap = tosnap;
-	if (flags.dedup)
+	if (tid != 0)
 		sdd.outfd = pipefd[0];
 	else
 		sdd.outfd = outfd;
-	sdd.replicate = flags.replicate;
-	sdd.doall = flags.doall;
-	sdd.fromorigin = flags.fromorigin;
+	sdd.replicate = flags->replicate;
+	sdd.doall = flags->doall;
+	sdd.fromorigin = flags->fromorigin;
 	sdd.fss = fss;
 	sdd.fsavl = fsavl;
-	sdd.verbose = flags.verbose;
+	sdd.verbose = flags->verbose;
+	sdd.parsable = flags->parsable;
+	sdd.progress = flags->progress;
+	sdd.dryrun = flags->dryrun;
+	sdd.large_block = flags->largeblock;
+	sdd.embed_data = flags->embed_data;
 	sdd.filter_cb = filter_func;
 	sdd.filter_cb_arg = cb_arg;
+	if (debugnvp)
+		sdd.debugnv = *debugnvp;
+	if (sdd.verbose && sdd.dryrun)
+		sdd.std_out = B_TRUE;
+	fout = sdd.std_out ? stdout : stderr;
+
+	/*
+	 * Some flags require that we place user holds on the datasets that are
+	 * being sent so they don't get destroyed during the send. We can skip
+	 * this step if the pool is imported read-only since the datasets cannot
+	 * be destroyed.
+	 */
+	if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp),
+	    ZPOOL_PROP_READONLY, NULL) &&
+	    zfs_spa_version(zhp, &spa_version) == 0 &&
+	    spa_version >= SPA_VERSION_USERREFS &&
+	    (flags->doall || flags->replicate)) {
+		++holdseq;
+		(void) snprintf(sdd.holdtag, sizeof (sdd.holdtag),
+		    ".send-%d-%llu", getpid(), (u_longlong_t)holdseq);
+		sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
+		if (sdd.cleanup_fd < 0) {
+			err = errno;
+			goto stderr_out;
+		}
+		sdd.snapholds = fnvlist_alloc();
+	} else {
+		sdd.cleanup_fd = -1;
+		sdd.snapholds = NULL;
+	}
+	if (flags->verbose || sdd.snapholds != NULL) {
+		/*
+		 * Do a verbose no-op dry run to get all the verbose output
+		 * or to gather snapshot holds before generating any data,
+		 * then do a non-verbose real run to generate the streams.
+		 */
+		sdd.dryrun = B_TRUE;
+		err = dump_filesystems(zhp, &sdd);
+
+		if (err != 0)
+			goto stderr_out;
+
+		if (flags->verbose) {
+			if (flags->parsable) {
+				(void) fprintf(fout, "size\t%llu\n",
+				    (longlong_t)sdd.size);
+			} else {
+				char buf[16];
+				zfs_nicenum(sdd.size, buf, sizeof (buf));
+				(void) fprintf(fout, dgettext(TEXT_DOMAIN,
+				    "total estimated size is %s\n"), buf);
+			}
+		}
+
+		/* Treat finding no snapshots to send as an error. */
+		if (!sdd.seento) {
+			err = ENOENT;
+			goto err_out;
+		}
+
+		/* Skip the second run if dryrun was requested. */
+		if (flags->dryrun)
+			goto err_out;
+
+		if (sdd.snapholds != NULL) {
+			err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds);
+			if (err != 0)
+				goto stderr_out;
+
+			fnvlist_free(sdd.snapholds);
+			sdd.snapholds = NULL;
+		}
+
+		sdd.dryrun = B_FALSE;
+		sdd.verbose = B_FALSE;
+	}
+
 	err = dump_filesystems(zhp, &sdd);
 	fsavl_destroy(fsavl);
 	nvlist_free(fss);
 
-	if (flags.dedup) {
+	/* Treat finding no snapshots to send as an error. */
+	if (err == 0 && !sdd.seento)
+		err = ENOENT;
+
+	if (tid != 0) {
+		if (err != 0)
+			(void) pthread_cancel(tid);
 		(void) close(pipefd[0]);
 		(void) pthread_join(tid, NULL);
 	}
 
-	if (flags.replicate || flags.doall || flags.props) {
+	if (sdd.cleanup_fd != -1) {
+		VERIFY(0 == close(sdd.cleanup_fd));
+		sdd.cleanup_fd = -1;
+	}
+
+	if (!flags->dryrun && (flags->replicate || flags->doall ||
+	    flags->props)) {
 		/*
 		 * write final end record.  NB: want to do this even if
 		 * there was some error, because it might not be totally
@@ -1321,10 +1974,6 @@ zfs_send(zfs_handle_t *zhp, const char *
 		 */
 		dmu_replay_record_t drr = { 0 };
 		drr.drr_type = DRR_END;
-		if (holdsnaps) {
-			(void) zfs_release_range(zhp, fromsnap, tosnap,
-			    holdtag, flags.replicate);
-		}
 		if (write(outfd, &drr, sizeof (drr)) == -1) {
 			return (zfs_standard_error(zhp->zfs_hdl,
 			    errno, errbuf));
@@ -1336,14 +1985,77 @@ zfs_send(zfs_handle_t *zhp, const char *
 stderr_out:
 	err = zfs_standard_error(zhp->zfs_hdl, err, errbuf);
 err_out:
-	if (flags.dedup) {
+	fsavl_destroy(fsavl);
+	nvlist_free(fss);
+	fnvlist_free(sdd.snapholds);
+
+	if (sdd.cleanup_fd != -1)
+		VERIFY(0 == close(sdd.cleanup_fd));
+	if (tid != 0) {
 		(void) pthread_cancel(tid);
-		(void) pthread_join(tid, NULL);
 		(void) close(pipefd[0]);
+		(void) pthread_join(tid, NULL);
 	}
 	return (err);
 }
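
A caller-side sketch of the revised interface: flags now arrive as a sendflags_t pointer, with an optional debug-nvlist out-parameter. The helper and its flag choices are hypothetical; the field names are the ones consumed above:

	static int
	send_replica_dryrun(zfs_handle_t *zhp, const char *fromsnap,
	    const char *tosnap, int outfd)
	{
		sendflags_t flags = { 0 };

		flags.replicate = B_TRUE;	/* like zfs send -R */
		flags.doall = B_TRUE;		/* all intermediate snapshots */
		flags.verbose = B_TRUE;
		flags.dryrun = B_TRUE;		/* estimate only, no stream */
		return (zfs_send(zhp, fromsnap, tosnap, &flags, outfd,
		    NULL, NULL, NULL));
	}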
 
+int
+zfs_send_one(zfs_handle_t *zhp, const char *from, int fd,
+    enum lzc_send_flags flags)
+{
+	int err;
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+
+	char errbuf[1024];
+	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+	    "warning: cannot send '%s'"), zhp->zfs_name);
+
+	err = lzc_send(zhp->zfs_name, from, fd, flags);
+	if (err != 0) {
+		switch (errno) {
+		case EXDEV:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "not an earlier snapshot from the same fs"));
+			return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
+
+		case ENOENT:
+		case ESRCH:
+			if (lzc_exists(zhp->zfs_name)) {
+				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+				    "incremental source (%s) does not exist"),
+				    from);
+			}
+			return (zfs_error(hdl, EZFS_NOENT, errbuf));
+
+		case EBUSY:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "target is busy; if a filesystem, "
+			    "it must not be mounted"));
+			return (zfs_error(hdl, EZFS_BUSY, errbuf));
+
+		case EDQUOT:
+		case EFBIG:
+		case EIO:
+		case ENOLINK:
+		case ENOSPC:
+#ifdef illumos
+		case ENOSTR:
+#endif
+		case ENXIO:
+		case EPIPE:
+		case ERANGE:
+		case EFAULT:
+		case EROFS:
+			zfs_error_aux(hdl, strerror(errno));
+			return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
+
+		default:
+			return (zfs_standard_error(hdl, errno, errbuf));
+		}
+	}
+	return (err != 0);
+}
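+
A usage sketch for the new single-snapshot path (the helper is hypothetical; passing a NULL incremental source requests a full send):

	static int
	send_one_embedded(zfs_handle_t *zhp, int outfd)
	{
		/* NULL 'from' means a full (non-incremental) send */
		return (zfs_send_one(zhp, NULL, outfd,
		    LZC_SEND_FLAG_EMBED_DATA));
	}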
+
 /*
  * Routines specific to "zfs recv"
  */
@@ -1356,6 +2068,8 @@ recv_read(libzfs_handle_t *hdl, int fd, 
 	int rv;
 	int len = ilen;
 
+	assert(ilen <= SPA_MAXBLOCKSIZE);
+
 	do {
 		rv = read(fd, cp, len);
 		cp += rv;
@@ -1407,7 +2121,7 @@ recv_read_nvlist(libzfs_handle_t *hdl, i
 
 static int
 recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname,
-    int baselen, char *newname, recvflags_t flags)
+    int baselen, char *newname, recvflags_t *flags)
 {
 	static int seq;
 	zfs_cmd_t zc = { 0 };
@@ -1419,7 +2133,7 @@ recv_rename(libzfs_handle_t *hdl, const 
 	if (zhp == NULL)
 		return (-1);
 	clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
-	    flags.force ? MS_FORCE : 0);
+	    flags->force ? MS_FORCE : 0);
 	zfs_close(zhp);
 	if (clp == NULL)
 		return (-1);
@@ -1435,7 +2149,7 @@ recv_rename(libzfs_handle_t *hdl, const 
 
 		(void) strlcpy(zc.zc_value, tryname, sizeof (zc.zc_value));
 
-		if (flags.verbose) {
+		if (flags->verbose) {
 			(void) printf("attempting rename %s to %s\n",
 			    zc.zc_name, zc.zc_value);
 		}
@@ -1446,27 +2160,26 @@ recv_rename(libzfs_handle_t *hdl, const 
 		err = ENOENT;
 	}
 
-	if (err != 0 && strncmp(name+baselen, "recv-", 5) != 0) {
+	if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) {
 		seq++;
 
-		(void) strncpy(newname, name, baselen);
-		(void) snprintf(newname+baselen, ZFS_MAXNAMELEN-baselen,
-		    "recv-%u-%u", getpid(), seq);
+		(void) snprintf(newname, ZFS_MAX_DATASET_NAME_LEN,
+		    "%.*srecv-%u-%u", baselen, name, getpid(), seq);
 		(void) strlcpy(zc.zc_value, newname, sizeof (zc.zc_value));
 
-		if (flags.verbose) {
+		if (flags->verbose) {
 			(void) printf("failed - trying rename %s to %s\n",
 			    zc.zc_name, zc.zc_value);
 		}
 		err = ioctl(hdl->libzfs_fd, ZFS_IOC_RENAME, &zc);
 		if (err == 0)
 			changelist_rename(clp, name, newname);
-		if (err && flags.verbose) {
+		if (err && flags->verbose) {
 			(void) printf("failed (%u) - "
 			    "will try again on next pass\n", errno);
 		}
 		err = EAGAIN;
-	} else if (flags.verbose) {
+	} else if (flags->verbose) {
 		if (err == 0)
 			(void) printf("success\n");
 		else
@@ -1481,7 +2194,7 @@ recv_rename(libzfs_handle_t *hdl, const 
 
 static int
 recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
-    char *newname, recvflags_t flags)
+    char *newname, recvflags_t *flags)
 {
 	zfs_cmd_t zc = { 0 };
 	int err = 0;
@@ -1494,7 +2207,7 @@ recv_destroy(libzfs_handle_t *hdl, const
 	if (zhp == NULL)
 		return (-1);
 	clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
-	    flags.force ? MS_FORCE : 0);
+	    flags->force ? MS_FORCE : 0);
 	if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
 	    zfs_spa_version(zhp, &spa_version) == 0 &&
 	    spa_version >= SPA_VERSION_USERREFS)
@@ -1510,11 +2223,11 @@ recv_destroy(libzfs_handle_t *hdl, const
 	zc.zc_defer_destroy = defer;
 	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
 
-	if (flags.verbose)
+	if (flags->verbose)
 		(void) printf("attempting destroy %s\n", zc.zc_name);
 	err = ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc);
 	if (err == 0) {
-		if (flags.verbose)
+		if (flags->verbose)
 			(void) printf("success\n");
 		changelist_remove(clp, zc.zc_name);
 	}
@@ -1536,66 +2249,97 @@ recv_destroy(libzfs_handle_t *hdl, const
 
 typedef struct guid_to_name_data {
 	uint64_t guid;
+	boolean_t bookmark_ok;
 	char *name;
+	char *skip;
 } guid_to_name_data_t;
 
 static int
 guid_to_name_cb(zfs_handle_t *zhp, void *arg)
 {
 	guid_to_name_data_t *gtnd = arg;
+	const char *slash;
 	int err;
 
-	if (zhp->zfs_dmustats.dds_guid == gtnd->guid) {
+	if (gtnd->skip != NULL &&
+	    (slash = strrchr(zhp->zfs_name, '/')) != NULL &&
+	    strcmp(slash + 1, gtnd->skip) == 0) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid) {
 		(void) strcpy(gtnd->name, zhp->zfs_name);
 		zfs_close(zhp);
 		return (EEXIST);
 	}
+
 	err = zfs_iter_children(zhp, guid_to_name_cb, gtnd);
+	if (err != EEXIST && gtnd->bookmark_ok)
+		err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd);
 	zfs_close(zhp);
 	return (err);
 }
 
+/*
+ * Attempt to find the local dataset associated with this guid.  In the case of
+ * multiple matches, we attempt to find the "best" match by searching
+ * progressively larger portions of the hierarchy.  This allows one to send a
+ * tree of datasets individually and guarantee that we will find the source
+ * guid within that hierarchy, even if there are multiple matches elsewhere.
+ */
 static int
 guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid,
-    char *name)
+    boolean_t bookmark_ok, char *name)
 {
-	/* exhaustive search all local snapshots */
+	char pname[ZFS_MAX_DATASET_NAME_LEN];
 	guid_to_name_data_t gtnd;
-	int err = 0;
-	zfs_handle_t *zhp;
-	char *cp;
 
 	gtnd.guid = guid;
+	gtnd.bookmark_ok = bookmark_ok;
 	gtnd.name = name;
+	gtnd.skip = NULL;
 
-	if (strchr(parent, '@') == NULL) {
-		zhp = make_dataset_handle(hdl, parent);
-		if (zhp != NULL) {
-			err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd);
-			zfs_close(zhp);
-			if (err == EEXIST)
-				return (0);
-		}
-	}
-
-	cp = strchr(parent, '/');
-	if (cp)
+	/*
+	 * Search progressively larger portions of the hierarchy, starting
+	 * with the filesystem specified by 'parent'.  This will
+	 * select the "most local" version of the origin snapshot in the case
+	 * that there are multiple matching snapshots in the system.
+	 */
+	(void) strlcpy(pname, parent, sizeof (pname));
+	char *cp = strrchr(pname, '@');
+	if (cp == NULL)
+		cp = strchr(pname, '\0');
+	for (; cp != NULL; cp = strrchr(pname, '/')) {
+		/* Chop off the last component and open the parent */
 		*cp = '\0';
-	zhp = make_dataset_handle(hdl, parent);
-	if (cp)
-		*cp = '/';
+		zfs_handle_t *zhp = make_dataset_handle(hdl, pname);
 
-	if (zhp) {
-		err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd);
+		if (zhp == NULL)
+			continue;
+		int err = guid_to_name_cb(zfs_handle_dup(zhp), &gtnd);
+		if (err != EEXIST)
+			err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd);
+		if (err != EEXIST && bookmark_ok)
+			err = zfs_iter_bookmarks(zhp, guid_to_name_cb, &gtnd);
 		zfs_close(zhp);
-	}
+		if (err == EEXIST)
+			return (0);
 
-	return (err == EEXIST ? 0 : ENOENT);
+		/*
+		 * Remember the last portion of the dataset so we skip it next
+		 * time through (as we've already searched that portion of the
+		 * hierarchy).
+		 */
+		gtnd.skip = strrchr(pname, '/') + 1;
+	}
 
+	return (ENOENT);
 }
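
A toy walk of the search order implemented above, using a fabricated dataset name: each pass chops one component off the name and remembers the component just searched so that the next, wider pass can skip it:

	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		char pname[64] = "tank/a/b";	/* fabricated dataset name */
		const char *skip = NULL;
		char *cp = strchr(pname, '\0');

		for (; cp != NULL; cp = strrchr(pname, '/')) {
			char *slash;

			*cp = '\0';
			(void) printf("search %s, skipping child %s\n",
			    pname, skip == NULL ? "(none)" : skip);
			if ((slash = strrchr(pname, '/')) == NULL)
				break;
			skip = slash + 1;
		}
		return (0);
	}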
 
 /*
- * Return true if dataset guid1 is created before guid2.
+ * Return -1 if guid1 was created before guid2, +1 if it was created
+ * after guid2, and 0 if they are the same.
  */
 static int
 created_before(libzfs_handle_t *hdl, avl_tree_t *avl,
@@ -1603,9 +2347,10 @@ created_before(libzfs_handle_t *hdl, avl
 {
 	nvlist_t *nvfs;
 	char *fsname, *snapname;
-	char buf[ZFS_MAXNAMELEN];
+	char buf[ZFS_MAX_DATASET_NAME_LEN];
 	int rv;
-	zfs_node_t zn1, zn2;
+	zfs_handle_t *guid1hdl, *guid2hdl;
+	uint64_t create1, create2;
 
 	if (guid2 == 0)
 		return (0);
@@ -1615,54 +2360,65 @@ created_before(libzfs_handle_t *hdl, avl
 	nvfs = fsavl_find(avl, guid1, &snapname);
 	VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname));
 	(void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname);
-	zn1.zn_handle = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT);
-	if (zn1.zn_handle == NULL)
+	guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT);
+	if (guid1hdl == NULL)
 		return (-1);
 
 	nvfs = fsavl_find(avl, guid2, &snapname);
 	VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname));
 	(void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname);
-	zn2.zn_handle = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT);
-	if (zn2.zn_handle == NULL) {
-		zfs_close(zn2.zn_handle);
+	guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT);
+	if (guid2hdl == NULL) {
+		zfs_close(guid1hdl);
 		return (-1);
 	}
 
-	rv = (zfs_snapshot_compare(&zn1, &zn2) == -1);
+	create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG);
+	create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG);
+
+	if (create1 < create2)
+		rv = -1;
+	else if (create1 > create2)
+		rv = +1;
+	else
+		rv = 0;
 
-	zfs_close(zn1.zn_handle);
-	zfs_close(zn2.zn_handle);
+	zfs_close(guid1hdl);
+	zfs_close(guid2hdl);
 
 	return (rv);
 }
 
 static int
 recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs,
-    recvflags_t flags, nvlist_t *stream_nv, avl_tree_t *stream_avl)
+    recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl,
+    nvlist_t *renamed)
 {
-	nvlist_t *local_nv;
+	nvlist_t *local_nv, *deleted = NULL;
 	avl_tree_t *local_avl;
 	nvpair_t *fselem, *nextfselem;
-	char *tosnap, *fromsnap;
-	char newname[ZFS_MAXNAMELEN];
+	char *fromsnap;
+	char newname[ZFS_MAX_DATASET_NAME_LEN];
+	char guidname[32];
 	int error;
 	boolean_t needagain, progress, recursive;
 	char *s1, *s2;
 
 	VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap));
-	VERIFY(0 == nvlist_lookup_string(stream_nv, "tosnap", &tosnap));
 
 	recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
 	    ENOENT);
 
-	if (flags.dryrun)
+	if (flags->dryrun)
 		return (0);
 
 again:
 	needagain = progress = B_FALSE;
 
+	VERIFY(0 == nvlist_alloc(&deleted, NV_UNIQUE_NAME, 0));
+
 	if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL,
-	    recursive, &local_nv, &local_avl)) != 0)
+	    recursive, B_FALSE, &local_nv, &local_avl)) != 0)
 		return (error);
 
 	/*
@@ -1715,7 +2471,7 @@ again:
 				nvlist_t *origin_nvfs;
 				char *origin_fsname;
 
-				if (flags.verbose)
+				if (flags->verbose)
 					(void) printf("promoting %s\n", fsname);
 
 				origin_nvfs = fsavl_find(local_avl, originguid,
@@ -1761,9 +2517,9 @@ again:
 
 			/* check for delete */
 			if (found == NULL) {
-				char name[ZFS_MAXNAMELEN];
+				char name[ZFS_MAX_DATASET_NAME_LEN];
 
-				if (!flags.force)
+				if (!flags->force)
 					continue;
 
 				(void) snprintf(name, sizeof (name), "%s@%s",
@@ -1775,6 +2531,8 @@ again:
 					needagain = B_TRUE;
 				else
 					progress = B_TRUE;
+				(void) snprintf(guidname, sizeof (guidname),
+				    "%llu", (u_longlong_t)thisguid);
+				(void) nvlist_add_boolean(deleted, guidname);
 				continue;
 			}
 
@@ -1799,8 +2557,8 @@ again:
 			/* check for different snapname */
 			if (strcmp(nvpair_name(snapelem),
 			    stream_snapname) != 0) {
-				char name[ZFS_MAXNAMELEN];
-				char tryname[ZFS_MAXNAMELEN];
+				char name[ZFS_MAX_DATASET_NAME_LEN];
+				char tryname[ZFS_MAX_DATASET_NAME_LEN];
 
 				(void) snprintf(name, sizeof (name), "%s@%s",
 				    fsname, nvpair_name(snapelem));
@@ -1821,7 +2579,7 @@ again:
 
 		/* check for delete */
 		if (stream_nvfs == NULL) {
-			if (!flags.force)
+			if (!flags->force)
 				continue;
 
 			error = recv_destroy(hdl, fsname, strlen(tofs)+1,
@@ -1830,13 +2588,18 @@ again:
 				needagain = B_TRUE;
 			else
 				progress = B_TRUE;
+			(void) snprintf(guidname, sizeof (guidname),
+			    "%llu", (u_longlong_t)parent_fromsnap_guid);
+			(void) nvlist_add_boolean(deleted, guidname);
 			continue;
 		}
 
-		if (fromguid == 0 && flags.verbose) {
-			(void) printf("local fs %s does not have fromsnap "
-			    "(%s in stream); must have been deleted locally; "
-			    "ignoring\n", fsname, fromsnap);
+		if (fromguid == 0) {
+			if (flags->verbose) {
+				(void) printf("local fs %s does not have "
+				    "fromsnap (%s in stream); must have "
+				    "been deleted locally; ignoring\n",
+				    fsname, fromsnap);
+			}
 			continue;
 		}
 
@@ -1848,12 +2611,36 @@ again:
 		s1 = strrchr(fsname, '/');
 		s2 = strrchr(stream_fsname, '/');
 
-		/* check for rename */
+		/*
+		 * Check whether we are about to rename this filesystem
+		 * because its parent guid changed, while the dataset that
+		 * carried the old parent guid has itself been deleted.  In
+		 * that case the rename would fail and is likely unneeded,
+		 * so skip it and force an early retry to determine the new
+		 * parent_fromsnap_guid.
+		 */
+		if (stream_parent_fromsnap_guid != 0 &&
+		    parent_fromsnap_guid != 0 &&
+		    stream_parent_fromsnap_guid != parent_fromsnap_guid) {
+			(void) snprintf(guidname, sizeof (guidname),
+			    "%llu", (u_longlong_t)parent_fromsnap_guid);
+			if (nvlist_exists(deleted, guidname)) {
+				progress = B_TRUE;
+				needagain = B_TRUE;
+				goto doagain;
+			}
+		}
+
+		/*
+		 * Check for rename. If the exact receive path is specified, it
+		 * does not count as a rename, but we still need to check the
+		 * datasets beneath it.
+		 */
 		if ((stream_parent_fromsnap_guid != 0 &&
+		    parent_fromsnap_guid != 0 &&
 		    stream_parent_fromsnap_guid != parent_fromsnap_guid) ||
-		    ((s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) {
+		    ((flags->isprefix || strcmp(tofs, fsname) != 0) &&
+		    (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) {
 			nvlist_t *parent;
-			char tryname[ZFS_MAXNAMELEN];
+			char tryname[ZFS_MAX_DATASET_NAME_LEN];
 
 			parent = fsavl_find(local_avl,
 			    stream_parent_fromsnap_guid, NULL);
@@ -1873,14 +2660,22 @@ again:
 				    "%s%s", pname, strrchr(stream_fsname, '/'));
 			} else {
 				tryname[0] = '\0';
-				if (flags.verbose) {
+				if (flags->verbose) {
 					(void) printf("local fs %s new parent "
 					    "not found\n", fsname);
 				}
 			}
 
+			newname[0] = '\0';
+
 			error = recv_rename(hdl, fsname, tryname,
 			    strlen(tofs)+1, newname, flags);
+
+			if (renamed != NULL && newname[0] != '\0') {
+				VERIFY(0 == nvlist_add_boolean(renamed,
+				    newname));
+			}
+
 			if (error)
 				needagain = B_TRUE;
 			else
@@ -1888,12 +2683,14 @@ again:
 		}
 	}
 
+doagain:
 	fsavl_destroy(local_avl);
 	nvlist_free(local_nv);
+	nvlist_free(deleted);
 
 	if (needagain && progress) {
 		/* do another pass to fix up temporary names */
-		if (flags.verbose)
+		if (flags->verbose)
 			(void) printf("another pass:\n");
 		goto again;
 	}
@@ -1903,28 +2700,26 @@ again:
 
 static int
 zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
-    recvflags_t flags, dmu_replay_record_t *drr, zio_cksum_t *zc,
-    char **top_zfs)
+    recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc,
+    char **top_zfs, int cleanup_fd, uint64_t *action_handlep)
 {
 	nvlist_t *stream_nv = NULL;
 	avl_tree_t *stream_avl = NULL;
 	char *fromsnap = NULL;
-	char tofs[ZFS_MAXNAMELEN];
+	char *sendsnap = NULL;
+	char *cp;
+	char tofs[ZFS_MAX_DATASET_NAME_LEN];
+	char sendfs[ZFS_MAX_DATASET_NAME_LEN];
 	char errbuf[1024];
 	dmu_replay_record_t drre;
 	int error;
 	boolean_t anyerr = B_FALSE;
 	boolean_t softerr = B_FALSE;
+	boolean_t recursive;
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot receive"));
 
-	if (strchr(destname, '@')) {
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "can not specify snapshot name for multi-snapshot stream"));
-		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
-	}
-
 	assert(drr->drr_type == DRR_BEGIN);
 	assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC);
 	assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) ==
@@ -1935,20 +2730,30 @@ zfs_receive_package(libzfs_handle_t *hdl
 	 */
 	if (drr->drr_payloadlen != 0) {
 		error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen,
-		    &stream_nv, flags.byteswap, zc);
+		    &stream_nv, flags->byteswap, zc);
 		if (error) {
 			error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 			goto out;
 		}
 	}
 
+	recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
+	    ENOENT);
+
+	if (recursive && strchr(destname, '@')) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "cannot specify snapshot name for multi-snapshot stream"));
+		error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
+		goto out;
+	}
+
 	/*
 	 * Read in the end record and verify checksum.
 	 */
 	if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre),
-	    flags.byteswap, NULL)))
+	    flags->byteswap, NULL)))
 		goto out;
-	if (flags.byteswap) {
+	if (flags->byteswap) {
 		drre.drr_type = BSWAP_32(drre.drr_type);
 		drre.drr_u.drr_end.drr_checksum.zc_word[0] =
 		    BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]);
@@ -1985,21 +2790,81 @@ zfs_receive_package(libzfs_handle_t *hdl
 		}
 
 		if (fromsnap != NULL) {
-			(void) strlcpy(tofs, destname, ZFS_MAXNAMELEN);
-			if (flags.isprefix) {
-				int i = strcspn(drr->drr_u.drr_begin.drr_toname,
-				    "/@");
+			nvlist_t *renamed = NULL;
+			nvpair_t *pair = NULL;
+
+			(void) strlcpy(tofs, destname, sizeof (tofs));
+			if (flags->isprefix) {
+				struct drr_begin *drrb = &drr->drr_u.drr_begin;
+				int i;
+
+				if (flags->istail) {
+					cp = strrchr(drrb->drr_toname, '/');
+					if (cp == NULL) {
+						(void) strlcat(tofs, "/",
+						    sizeof (tofs));
+						i = 0;
+					} else {
+						i = (cp - drrb->drr_toname);
+					}
+				} else {
+					i = strcspn(drrb->drr_toname, "/@");
+				}
 				/* zfs_receive_one() will create_parents() */
-				(void) strlcat(tofs,
-				    &drr->drr_u.drr_begin.drr_toname[i],
-				    ZFS_MAXNAMELEN);
+				(void) strlcat(tofs, &drrb->drr_toname[i],
+				    sizeof (tofs));
 				*strchr(tofs, '@') = '\0';
 			}
-			softerr = recv_incremental_replication(hdl, tofs,
-			    flags, stream_nv, stream_avl);
+
+			if (recursive && !flags->dryrun && !flags->nomount) {
+				VERIFY(0 == nvlist_alloc(&renamed,
+				    NV_UNIQUE_NAME, 0));
+			}
+
+			softerr = recv_incremental_replication(hdl, tofs, flags,
+			    stream_nv, stream_avl, renamed);
+
+			/* Unmount renamed filesystems before receiving. */
+			while ((pair = nvlist_next_nvpair(renamed,
+			    pair)) != NULL) {
+				zfs_handle_t *zhp;
+				prop_changelist_t *clp = NULL;
+
+				zhp = zfs_open(hdl, nvpair_name(pair),
+				    ZFS_TYPE_FILESYSTEM);
+				if (zhp != NULL) {
+					clp = changelist_gather(zhp,
+					    ZFS_PROP_MOUNTPOINT, 0, 0);
+					zfs_close(zhp);
+					if (clp != NULL) {
+						softerr |=
+						    changelist_prefix(clp);
+						changelist_free(clp);
+					}
+				}
+			}
+
+			nvlist_free(renamed);
 		}
 	}
 
+	/*
+	 * Get the fs specified by the first path in the stream (the top level
+	 * specified by 'zfs send') and pass it to each invocation of
+	 * zfs_receive_one().
+	 */
+	(void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname,
+	    sizeof (sendfs));
+	if ((cp = strchr(sendfs, '@')) != NULL) {
+		*cp = '\0';
+		/*
+		 * Find the "sendsnap", the final snapshot in a replication
+		 * stream.  zfs_receive_one() handles certain errors
+		 * differently depending on whether the contained stream is
+		 * the last one.
+		 */
+		sendsnap = (cp + 1);
+	}
 
 	/* Finally, receive each contained stream */
 	do {
@@ -2010,8 +2875,9 @@ zfs_receive_package(libzfs_handle_t *hdl
 		 * zfs_receive_one() will take care of it (ie,
 		 * recv_skip() and return 0).
 		 */
-		error = zfs_receive_impl(hdl, destname, flags, fd,
-		    stream_avl, top_zfs);
+		error = zfs_receive_impl(hdl, destname, NULL, flags, fd,
+		    sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd,
+		    action_handlep, sendsnap);
 		if (error == ENODATA) {
 			error = 0;
 			break;
@@ -2025,13 +2891,12 @@ zfs_receive_package(libzfs_handle_t *hdl
 		 * renames again.
 		 */
 		softerr = recv_incremental_replication(hdl, tofs, flags,
-		    stream_nv, stream_avl);
+		    stream_nv, stream_avl, NULL);
 	}
 
 out:
 	fsavl_destroy(stream_avl);
-	if (stream_nv)
-		nvlist_free(stream_nv);
+	nvlist_free(stream_nv);
 	if (softerr)
 		error = -2;
 	if (anyerr)
@@ -2056,7 +2921,7 @@ static int
 recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
 {
 	dmu_replay_record_t *drr;
-	void *buf = malloc(1<<20);
+	void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE);
 	char errbuf[1024];
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
@@ -2072,11 +2937,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, 
 
 		switch (drr->drr_type) {
 		case DRR_BEGIN:
-			/* NB: not to be used on v2 stream packages */
 			if (drr->drr_payloadlen != 0) {
-				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-				    "invalid substream header"));
-				return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
+				(void) recv_read(hdl, fd, buf,
+				    drr->drr_payloadlen, B_FALSE, NULL);
 			}
 			break;
 
@@ -2103,7 +2966,24 @@ recv_skip(libzfs_handle_t *hdl, int fd, 
 			(void) recv_read(hdl, fd, buf,
 			    drr->drr_u.drr_write.drr_length, B_FALSE, NULL);
 			break;
-
+		case DRR_SPILL:
+			if (byteswap) {
+				drr->drr_u.drr_spill.drr_length =
+				    BSWAP_64(drr->drr_u.drr_spill.drr_length);
+			}
+			(void) recv_read(hdl, fd, buf,
+			    drr->drr_u.drr_spill.drr_length, B_FALSE, NULL);
+			break;
+		case DRR_WRITE_EMBEDDED:
+			if (byteswap) {
+				drr->drr_u.drr_write_embedded.drr_psize =
+				    BSWAP_32(drr->drr_u.drr_write_embedded.
+				    drr_psize);
+			}
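+			/*
+			 * Embedded payloads are padded to an 8-byte
+			 * boundary in the stream, hence the P2ROUNDUP.
+			 */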
+			(void) recv_read(hdl, fd, buf,
+			    P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize,
+			    8), B_FALSE, NULL);
+			break;
 		case DRR_WRITE_BYREF:
 		case DRR_FREEOBJECTS:
 		case DRR_FREE:
@@ -2120,37 +3000,76 @@ recv_skip(libzfs_handle_t *hdl, int fd, 
 	return (-1);
 }
 
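+/*
+ * Augment a checksum-mismatch error.  For a resumable receive (-s), also
+ * report the dataset's receive_resume_token so a resuming stream can be
+ * generated on the sending side with "zfs send -t <token>".
+ */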
+static void
+recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap,
+    boolean_t resumable)
+{
+	char target_fs[ZFS_MAX_DATASET_NAME_LEN];
+
+	zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+	    "checksum mismatch or incomplete stream"));
+
+	if (!resumable)
+		return;
+	(void) strlcpy(target_fs, target_snap, sizeof (target_fs));
+	*strchr(target_fs, '@') = '\0';
+	zfs_handle_t *zhp = zfs_open(hdl, target_fs,
+	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+	if (zhp == NULL)
+		return;
+
+	char token_buf[ZFS_MAXPROPLEN];
+	int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+	    token_buf, sizeof (token_buf),
+	    NULL, NULL, 0, B_TRUE);
+	if (error == 0) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+		    "checksum mismatch or incomplete stream.\n"
+		    "Partially received snapshot is saved.\n"
+		    "A resuming stream can be generated on the sending "
+		    "system by running:\n"
+		    "    zfs send -t %s"),
+		    token_buf);
+	}
+	zfs_close(zhp);
+}
+
 /*
  * Restores a backup of tosnap from the file descriptor specified by infd.
  */
 static int
 zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
-    recvflags_t flags, dmu_replay_record_t *drr,
-    dmu_replay_record_t *drr_noswap, avl_tree_t *stream_avl,
-    char **top_zfs)
+    const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr,
+    dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv,
+    avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
+    uint64_t *action_handlep, const char *finalsnap)
 {
 	zfs_cmd_t zc = { 0 };
 	time_t begin_time;
-	int ioctl_err, ioctl_errno, err, choplen;
+	int ioctl_err, ioctl_errno, err;
 	char *cp;
 	struct drr_begin *drrb = &drr->drr_u.drr_begin;
 	char errbuf[1024];
 	char prop_errbuf[1024];
-	char chopprefix[ZFS_MAXNAMELEN];
+	const char *chopprefix;
 	boolean_t newfs = B_FALSE;
 	boolean_t stream_wantsnewfs;
 	uint64_t parent_snapguid = 0;
 	prop_changelist_t *clp = NULL;
 	nvlist_t *snapprops_nvlist = NULL;
 	zprop_errflags_t prop_errflags;
+	boolean_t recursive;
+	char *snapname = NULL;
 
 	begin_time = time(NULL);
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot receive"));
 
+	recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
+	    ENOENT);
+
 	if (stream_avl != NULL) {
-		char *snapname;
 		nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid,
 		    &snapname);
 		nvlist_t *props;
@@ -2162,7 +3081,7 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 		if (err)
 			VERIFY(0 == nvlist_alloc(&props, NV_UNIQUE_NAME, 0));
 
-		if (flags.canmountoff) {
+		if (flags->canmountoff) {
 			VERIFY(0 == nvlist_add_uint64(props,
 			    zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0));
 		}
@@ -2179,6 +3098,8 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 			return (-1);
 	}
 
+	cp = NULL;
+
 	/*
 	 * Determine how much of the snapshot name stored in the stream
 	 * we are going to tack on to the name they specified on the
@@ -2187,43 +3108,85 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 	 * If they specified a snapshot, chop the entire name stored in
 	 * the stream.
 	 */
-	(void) strcpy(chopprefix, drrb->drr_toname);
-	if (flags.isprefix) {
+	if (flags->istail) {
+		/*
+		 * A filesystem was specified with -e. We want to tack on only
+		 * the tail of the sent snapshot path.
+		 */
+		if (strchr(tosnap, '@')) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
+			    "argument - snapshot not allowed with -e"));
+			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
+		}
+
+		chopprefix = strrchr(sendfs, '/');
+
+		if (chopprefix == NULL) {
+			/*
+			 * The tail is the poolname, so we need to
+			 * prepend a path separator.
+			 */
+			int len = strlen(drrb->drr_toname);
+			cp = malloc(len + 2);
+			cp[0] = '/';
+			(void) strcpy(&cp[1], drrb->drr_toname);
+			chopprefix = cp;
+		} else {
+			chopprefix = drrb->drr_toname + (chopprefix - sendfs);
+		}
+	} else if (flags->isprefix) {
 		/*
-		 * They specified a fs with -d or -e. We want to tack on
+		 * A filesystem was specified with -d. We want to tack on
 		 * everything but the first element of the sent snapshot path
-		 * (all but the pool name) in the case of -d, or only the tail
-		 * of the sent snapshot path in the case of -e.
+		 * (all but the pool name).
 		 */
 		if (strchr(tosnap, '@')) {
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
-			    "argument - snapshot not allowed with %s"),
-			    (flags.istail ? "-e" : "-d"));
+			    "argument - snapshot not allowed with -d"));
 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
 		}
-		cp = (flags.istail ? strrchr(chopprefix, '/') :
-		    strchr(chopprefix, '/'));
-		if (cp == NULL)
-			cp = strchr(chopprefix, '@');
-		*cp = '\0';
+
+		chopprefix = strchr(drrb->drr_toname, '/');
+		if (chopprefix == NULL)
+			chopprefix = strchr(drrb->drr_toname, '@');
 	} else if (strchr(tosnap, '@') == NULL) {
 		/*
-		 * If they specified a filesystem without -d or -e, we want to
-		 * tack on everything after the fs specified in the first name
-		 * from the stream.
+		 * If a filesystem was specified without -d or -e, we want to
+		 * tack on everything after the fs specified by 'zfs send'.
 		 */
-		cp = strchr(chopprefix, '@');
-		*cp = '\0';
+		chopprefix = drrb->drr_toname + strlen(sendfs);
+	} else {
+		/* A snapshot was specified as an exact path (no -d or -e). */
+		if (recursive) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "cannot specify snapshot name for multi-snapshot "
+			    "stream"));
+			return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
+		}
+		chopprefix = drrb->drr_toname + strlen(drrb->drr_toname);
 	}
-	choplen = strlen(chopprefix);
+
+	ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname);
+	ASSERT(chopprefix > drrb->drr_toname);
+	ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname));
+	ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' ||
+	    chopprefix[0] == '\0');
 
 	/*
 	 * Determine name of destination snapshot, store in zc_value.
 	 */
-	(void) strcpy(zc.zc_top_ds, tosnap);
 	(void) strcpy(zc.zc_value, tosnap);
-	(void) strncat(zc.zc_value, drrb->drr_toname+choplen,
-	    sizeof (zc.zc_value));
+	(void) strncat(zc.zc_value, chopprefix, sizeof (zc.zc_value));
+#ifdef __FreeBSD__
+	if (zfs_ioctl_version == ZFS_IOCVER_UNDEF)
+		zfs_ioctl_version = get_zfs_ioctl_version();
+	/*
+	 * For forward compatibility hide tosnap in zc_value
+	 */
+	if (zfs_ioctl_version < ZFS_IOCVER_LZC)
+		(void) strcpy(zc.zc_value + strlen(zc.zc_value) + 1, tosnap);
+#endif
+	free(cp);
 	if (!zfs_name_valid(zc.zc_value, ZFS_TYPE_SNAPSHOT)) {
 		zcmd_free_nvlists(&zc);
 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
@@ -2233,20 +3196,27 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 	 * Determine the name of the origin snapshot, store in zc_string.
 	 */
 	if (drrb->drr_flags & DRR_FLAG_CLONE) {
-		if (guid_to_name(hdl, tosnap,
-		    drrb->drr_fromguid, zc.zc_string) != 0) {
+		if (guid_to_name(hdl, zc.zc_value,
+		    drrb->drr_fromguid, B_FALSE, zc.zc_string) != 0) {
 			zcmd_free_nvlists(&zc);
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 			    "local origin for clone %s does not exist"),
 			    zc.zc_value);
 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
 		}
-		if (flags.verbose)
+		if (flags->verbose)
 			(void) printf("found clone origin %s\n", zc.zc_string);
+	} else if (originsnap) {
+		(void) strncpy(zc.zc_string, originsnap, sizeof (zc.zc_string));
+		if (flags->verbose)
+			(void) printf("using provided clone origin %s\n",
+			    zc.zc_string);
 	}
 
+	boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+	    DMU_BACKUP_FEATURE_RESUMING;
 	stream_wantsnewfs = (drrb->drr_fromguid == 0 ||
-	    (drrb->drr_flags & DRR_FLAG_CLONE));
+	    (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming;
 
 	if (stream_wantsnewfs) {
 		/*
@@ -2262,10 +3232,10 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 			*cp = '\0';
 		if (cp &&
 		    !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
-			char suffix[ZFS_MAXNAMELEN];
+			char suffix[ZFS_MAX_DATASET_NAME_LEN];
 			(void) strcpy(suffix, strrchr(zc.zc_value, '/'));
-			if (guid_to_name(hdl, tosnap, parent_snapguid,
-			    zc.zc_value) == 0) {
+			if (guid_to_name(hdl, zc.zc_name, parent_snapguid,
+			    B_FALSE, zc.zc_value) == 0) {
 				*strchr(zc.zc_value, '@') = '\0';
 				(void) strcat(zc.zc_value, suffix);
 			}
@@ -2281,11 +3251,18 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 		(void) strcpy(zc.zc_name, zc.zc_value);
 		*strchr(zc.zc_name, '@') = '\0';
 
-		if (!zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
-			char snap[ZFS_MAXNAMELEN];
+		/*
+		 * If the exact receive path was specified and this is the
+		 * topmost path in the stream, then if the fs does not exist we
+		 * should look no further.
+		 */
+		if ((flags->isprefix || (*(chopprefix = drrb->drr_toname +
+		    strlen(sendfs)) != '\0' && *chopprefix != '@')) &&
+		    !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
+			char snap[ZFS_MAX_DATASET_NAME_LEN];
 			(void) strcpy(snap, strchr(zc.zc_value, '@'));
-			if (guid_to_name(hdl, tosnap, drrb->drr_fromguid,
-			    zc.zc_value) == 0) {
+			if (guid_to_name(hdl, zc.zc_name, drrb->drr_fromguid,
+			    B_FALSE, zc.zc_value) == 0) {
 				*strchr(zc.zc_value, '@') = '\0';
 				(void) strcat(zc.zc_value, snap);
 			}
@@ -2297,16 +3274,17 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 
 	if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
 		zfs_handle_t *zhp;
+
 		/*
-		 * Destination fs exists.  Therefore this should either
-		 * be an incremental, or the stream specifies a new fs
-		 * (full stream or clone) and they want us to blow it
-		 * away (and have therefore specified -F and removed any
-		 * snapshots).
+		 * Destination fs exists.  It must be one of these cases:
+		 *  - an incremental send stream
+		 *  - the stream specifies a new fs (full stream or clone)
+		 *    and they want us to blow away the existing fs (and
+		 *    have therefore specified -F and removed any snapshots)
+		 *  - we are resuming a failed receive.
 		 */
-
 		if (stream_wantsnewfs) {
-			if (!flags.force) {
+			if (!flags->force) {
 				zcmd_free_nvlists(&zc);
 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 				    "destination '%s' exists\n"
@@ -2342,7 +3320,7 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 			return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 		}
 
-		if (!flags.dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM &&
+		if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM &&
 		    stream_wantsnewfs) {
 			/* We can't do online recv in this case */
 			clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0);
@@ -2358,6 +3336,18 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 				return (-1);
 			}
 		}
+
+		/*
+		 * If we are resuming a newfs, set newfs here so that we will
+		 * mount it if the recv succeeds this time.  We can tell
+		 * that it was a newfs on the first recv because the fs
+		 * itself will be inconsistent (if the fs existed when we
+		 * did the first recv, we would have received it into
+		 * .../%recv).
+		 */
+		if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT))
+			newfs = B_TRUE;
+
 		zfs_close(zhp);
 	} else {
 		/*
@@ -2381,7 +3371,7 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 		 */
 		*cp = '\0';
 
-		if (flags.isprefix && !flags.dryrun &&
+		if (flags->isprefix && !flags->istail && !flags->dryrun &&
 		    create_parents(hdl, zc.zc_value, strlen(tosnap)) != 0) {
 			zcmd_free_nvlists(&zc);
 			return (zfs_error(hdl, EZFS_BADRESTORE, errbuf));
@@ -2390,24 +3380,27 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 		newfs = B_TRUE;
 	}
 
-	zc.zc_begin_record = drr_noswap->drr_u.drr_begin;
+	zc.zc_begin_record = *drr_noswap;
 	zc.zc_cookie = infd;
-	zc.zc_guid = flags.force;
-	if (flags.verbose) {
+	zc.zc_guid = flags->force;
+	zc.zc_resumable = flags->resumable;
+	if (flags->verbose) {
 		(void) printf("%s %s stream of %s into %s\n",
-		    flags.dryrun ? "would receive" : "receiving",
+		    flags->dryrun ? "would receive" : "receiving",
 		    drrb->drr_fromguid ? "incremental" : "full",
 		    drrb->drr_toname, zc.zc_value);
 		(void) fflush(stdout);
 	}
 
-	if (flags.dryrun) {
+	if (flags->dryrun) {
 		zcmd_free_nvlists(&zc);
-		return (recv_skip(hdl, infd, flags.byteswap));
+		return (recv_skip(hdl, infd, flags->byteswap));
 	}
 
 	zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf;
 	zc.zc_nvlist_dst_size = sizeof (prop_errbuf);
+	zc.zc_cleanup_fd = cleanup_fd;
+	zc.zc_action_handle = *action_handlep;
 
 	err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc);
 	ioctl_errno = errno;
@@ -2432,7 +3425,21 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 			    ZPROP_N_MORE_ERRORS) == 0) {
 				trunc_prop_errs(intval);
 				break;
-			} else {
+			} else if (snapname == NULL || finalsnap == NULL ||
+			    strcmp(finalsnap, snapname) == 0 ||
+			    strcmp(nvpair_name(prop_err),
+			    zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) {
+				/*
+				 * Skip errors on properties such as
+				 * "refquota" on intermediate snapshots
+				 * leading up to the final one; that is why
+				 * we have all of the checks above.
+				 *
+				 * See zfs_ioctl.c's extract_delay_props() for
+				 * a list of props which can fail on
+				 * intermediate snapshots, but shouldn't
+				 * affect the overall receive.
+				 */
 				(void) snprintf(tbuf, sizeof (tbuf),
 				    dgettext(TEXT_DOMAIN,
 				    "cannot receive %s property on %s"),
@@ -2458,7 +3465,7 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 		}
 	}
 
-	if (err && (ioctl_errno == ENOENT || ioctl_errno == ENODEV)) {
+	if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) {
 		/*
 		 * It may be that this snapshot already exists,
 		 * in which case we want to consume & ignore it
@@ -2466,7 +3473,7 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 		 */
 		avl_tree_t *local_avl;
 		nvlist_t *local_nv, *fs;
-		char *cp = strchr(zc.zc_value, '@');
+		cp = strchr(zc.zc_value, '@');
 
 		/*
 		 * XXX Do this faster by just iterating over snaps in
@@ -2475,19 +3482,19 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 		 */
 		*cp = '\0';
 		if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, B_FALSE,
-		    &local_nv, &local_avl) == 0) {
+		    B_FALSE, &local_nv, &local_avl) == 0) {
 			*cp = '@';
 			fs = fsavl_find(local_avl, drrb->drr_toguid, NULL);
 			fsavl_destroy(local_avl);
 			nvlist_free(local_nv);
 
 			if (fs != NULL) {
-				if (flags.verbose) {
+				if (flags->verbose) {
 					(void) printf("snap %s already exists; "
 					    "ignoring\n", zc.zc_value);
 				}
 				err = ioctl_err = recv_skip(hdl, infd,
-				    flags.byteswap);
+				    flags->byteswap);
 			}
 		}
 		*cp = '@';
@@ -2527,10 +3534,19 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 			(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 			break;
 		case ECKSUM:
-			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-			    "invalid stream (checksum mismatch)"));
+			recv_ecksum_set_aux(hdl, zc.zc_value, flags->resumable);
 			(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
 			break;
+		case ENOTSUP:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "pool must be upgraded to receive this stream."));
+			(void) zfs_error(hdl, EZFS_BADVERSION, errbuf);
+			break;
+		case EDQUOT:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "destination %s space quota exceeded"), zc.zc_name);
+			(void) zfs_error(hdl, EZFS_NOSPC, errbuf);
+			break;
 		default:
 			(void) zfs_standard_error(hdl, ioctl_errno, errbuf);
 		}
@@ -2565,7 +3581,8 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 	}
 
 	if (clp) {
-		err |= changelist_postfix(clp);
+		if (!flags->nomount)
+			err |= changelist_postfix(clp);
 		changelist_free(clp);
 	}
 
@@ -2585,7 +3602,9 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 	if (err || ioctl_err)
 		return (-1);
 
-	if (flags.verbose) {
+	*action_handlep = zc.zc_action_handle;
+
+	if (flags->verbose) {
 		char buf1[64];
 		char buf2[64];
 		uint64_t bytes = zc.zc_cookie;
@@ -2603,8 +3622,10 @@ zfs_receive_one(libzfs_handle_t *hdl, in
 }
 
 static int
-zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
-    int infd, avl_tree_t *stream_avl, char **top_zfs)
+zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
+    const char *originsnap, recvflags_t *flags, int infd, const char *sendfs,
+    nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd,
+    uint64_t *action_handlep, const char *finalsnap)
 {
 	int err;
 	dmu_replay_record_t drr, drr_noswap;
@@ -2617,12 +3638,18 @@ zfs_receive_impl(libzfs_handle_t *hdl, c
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 	    "cannot receive"));
 
-	if (flags.isprefix &&
+	if (flags->isprefix &&
 	    !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs "
 		    "(%s) does not exist"), tosnap);
 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
 	}
+	if (originsnap &&
+	    !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) {
+		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs "
+		    "(%s) does not exist"), originsnap);
+		return (zfs_error(hdl, EZFS_NOENT, errbuf));
+	}
 
 	/* read in the BEGIN record */
 	if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE,
@@ -2637,7 +3664,7 @@ zfs_receive_impl(libzfs_handle_t *hdl, c
 	/* the kernel needs the non-byteswapped begin record */
 	drr_noswap = drr;
 
-	flags.byteswap = B_FALSE;
+	flags->byteswap = B_FALSE;
 	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
 		/*
 		 * We computed the checksum in the wrong byteorder in
@@ -2645,7 +3672,7 @@ zfs_receive_impl(libzfs_handle_t *hdl, c
 		 */
 		bzero(&zcksum, sizeof (zio_cksum_t));
 		fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum);
-		flags.byteswap = B_TRUE;
+		flags->byteswap = B_TRUE;
 
 		drr.drr_type = BSWAP_32(drr.drr_type);
 		drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen);
@@ -2682,13 +3709,29 @@ zfs_receive_impl(libzfs_handle_t *hdl, c
 	}
 
 	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) {
-		return (zfs_receive_one(hdl, infd, tosnap, flags,
-		    &drr, &drr_noswap, stream_avl, top_zfs));
-	} else {  /* must be DMU_COMPOUNDSTREAM */
+		char nonpackage_sendfs[ZFS_MAX_DATASET_NAME_LEN];
+		if (sendfs == NULL) {
+			/*
+			 * We were not called from zfs_receive_package(). Get
+			 * the fs specified by 'zfs send'.
+			 */
+			char *cp;
+			(void) strlcpy(nonpackage_sendfs,
+			    drr.drr_u.drr_begin.drr_toname,
+			    sizeof (nonpackage_sendfs));
+			if ((cp = strchr(nonpackage_sendfs, '@')) != NULL)
+				*cp = '\0';
+			sendfs = nonpackage_sendfs;
+			VERIFY(finalsnap == NULL);
+		}
+		return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags,
+		    &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs,
+		    cleanup_fd, action_handlep, finalsnap));
+	} else {
 		assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
 		    DMU_COMPOUNDSTREAM);
-		return (zfs_receive_package(hdl, infd, tosnap, flags,
-		    &drr, &zcksum, top_zfs));
+		return (zfs_receive_package(hdl, infd, tosnap, flags, &drr,
+		    &zcksum, top_zfs, cleanup_fd, action_handlep));
 	}
 }
 
@@ -2696,18 +3739,33 @@ zfs_receive_impl(libzfs_handle_t *hdl, c
  * Restores a backup of tosnap from the file descriptor specified by infd.
  * Return 0 on total success, -2 if some things couldn't be
  * destroyed/renamed/promoted, -1 if some things couldn't be received.
- * (-1 will override -2).
+ * (-1 will override -2; if -1 is returned and the resumable flag was
+ * specified, the transfer can be resumed if the sending side supports it).
  */
 int
-zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
-    int infd, avl_tree_t *stream_avl)
+zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props,
+    recvflags_t *flags, int infd, avl_tree_t *stream_avl)
 {
 	char *top_zfs = NULL;
 	int err;
+	int cleanup_fd;
+	uint64_t action_handle = 0;
+	char *originsnap = NULL;
+	if (props) {
+		err = nvlist_lookup_string(props, "origin", &originsnap);
+		if (err && err != ENOENT)
+			return (err);
+	}
+
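+	/*
+	 * Keep /dev/zfs open for the duration of the receive; the kernel
+	 * ties temporary receive state to this descriptor (together with
+	 * the action handle) so partially received state can be cleaned up
+	 * if the process exits before the receive completes.
+	 */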
+	cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL);
+	VERIFY(cleanup_fd >= 0);
+
+	err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL,
+	    stream_avl, &top_zfs, cleanup_fd, &action_handle, NULL);
 
-	err = zfs_receive_impl(hdl, tosnap, flags, infd, stream_avl, &top_zfs);
+	VERIFY(0 == close(cleanup_fd));
 
-	if (err == 0 && !flags.nomount && top_zfs) {
+	if (err == 0 && !flags->nomount && top_zfs) {
 		zfs_handle_t *zhp;
 		prop_changelist_t *clp;
 
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 libzfs_status.c
--- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c	27 Feb 2010 22:30:28 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c	2 Sep 2013 11:36:30 -0000
@@ -18,9 +18,11 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
  */
 
 /*
@@ -43,6 +45,7 @@
 #include <string.h>
 #include <unistd.h>
 #include "libzfs_impl.h"
+#include "zfeature_common.h"
 
 /*
  * Message ID table.  This must be kept in sync with the ZPOOL_STATUS_* defines
@@ -70,57 +73,66 @@ static char *zfs_msgid_table[] = {
 
 /* ARGSUSED */
 static int
-vdev_missing(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_missing(vdev_stat_t *vs, uint_t vsc)
 {
-	return (state == VDEV_STATE_CANT_OPEN &&
-	    aux == VDEV_AUX_OPEN_FAILED);
+	return (vs->vs_state == VDEV_STATE_CANT_OPEN &&
+	    vs->vs_aux == VDEV_AUX_OPEN_FAILED);
 }
 
 /* ARGSUSED */
 static int
-vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_faulted(vdev_stat_t *vs, uint_t vsc)
 {
-	return (state == VDEV_STATE_FAULTED);
+	return (vs->vs_state == VDEV_STATE_FAULTED);
 }
 
 /* ARGSUSED */
 static int
-vdev_errors(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_errors(vdev_stat_t *vs, uint_t vsc)
 {
-	return (state == VDEV_STATE_DEGRADED || errs != 0);
+	return (vs->vs_state == VDEV_STATE_DEGRADED ||
+	    vs->vs_read_errors != 0 || vs->vs_write_errors != 0 ||
+	    vs->vs_checksum_errors != 0);
 }
 
 /* ARGSUSED */
 static int
-vdev_broken(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_broken(vdev_stat_t *vs, uint_t vsc)
 {
-	return (state == VDEV_STATE_CANT_OPEN);
+	return (vs->vs_state == VDEV_STATE_CANT_OPEN);
 }
 
 /* ARGSUSED */
 static int
-vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_offlined(vdev_stat_t *vs, uint_t vsc)
 {
-	return (state == VDEV_STATE_OFFLINE);
+	return (vs->vs_state == VDEV_STATE_OFFLINE);
 }
 
 /* ARGSUSED */
 static int
-vdev_removed(uint64_t state, uint64_t aux, uint64_t errs)
+vdev_removed(vdev_stat_t *vs, uint_t vsc)
+{
+	return (vs->vs_state == VDEV_STATE_REMOVED);
+}
+
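+/*
+ * Report vdevs whose configured ashift is smaller than the device's
+ * reported physical sector size (suboptimal, but usable).
+ */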
+static int
+vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc)
 {
-	return (state == VDEV_STATE_REMOVED);
+	return (VDEV_STAT_VALID(vs_physical_ashift, vsc) &&
+	    vs->vs_configured_ashift < vs->vs_physical_ashift);
 }
 
 /*
 * Detect whether any leaf devices have seen errors or could not be opened.
  */
 static boolean_t
-find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
+find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t),
+    boolean_t ignore_replacing)
 {
 	nvlist_t **child;
 	vdev_stat_t *vs;
-	uint_t c, children;
-	char *type;
+	uint_t c, vsc, children;
 
 	/*
 	 * Ignore problems within a 'replacing' vdev, since we're presumably in
@@ -128,26 +140,38 @@ find_vdev_problem(nvlist_t *vdev, int (*
 	 * out again.  We'll pick up the fact that a resilver is happening
 	 * later.
 	 */
-	verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0);
-	if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
-		return (B_FALSE);
+	if (ignore_replacing == B_TRUE) {
+		char *type;
+
+		verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE,
+		    &type) == 0);
+		if (strcmp(type, VDEV_TYPE_REPLACING) == 0)
+			return (B_FALSE);
+	}
 
 	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child,
 	    &children) == 0) {
 		for (c = 0; c < children; c++)
-			if (find_vdev_problem(child[c], func))
+			if (find_vdev_problem(child[c], func, ignore_replacing))
 				return (B_TRUE);
 	} else {
-		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS,
-		    (uint64_t **)&vs, &c) == 0);
+		verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS,
+		    (uint64_t **)&vs, &vsc) == 0);
 
-		if (func(vs->vs_state, vs->vs_aux,
-		    vs->vs_read_errors +
-		    vs->vs_write_errors +
-		    vs->vs_checksum_errors))
+		if (func(vs, vsc) != 0)
 			return (B_TRUE);
 	}
 
+	/*
+	 * Check any L2 cache devs
+	 */
+	if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child,
+	    &children) == 0) {
+		for (c = 0; c < children; c++)
+			if (find_vdev_problem(child[c], func, ignore_replacing))
+				return (B_TRUE);
+	}
+
 	return (B_FALSE);
 }
 
@@ -173,7 +197,8 @@ check_status(nvlist_t *config, boolean_t
 {
 	nvlist_t *nvroot;
 	vdev_stat_t *vs;
-	uint_t vsc;
+	pool_scan_stat_t *ps = NULL;
+	uint_t vsc, psc;
 	uint64_t nerr;
 	uint64_t version;
 	uint64_t stateval;
@@ -184,15 +209,24 @@ check_status(nvlist_t *config, boolean_t
 	    &version) == 0);
 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvroot) == 0);
-	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS,
+	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
 	    (uint64_t **)&vs, &vsc) == 0);
 	verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    &stateval) == 0);
-	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
+
+	/*
+	 * Currently resilvering a vdev
+	 */
+	(void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
+	    (uint64_t **)&ps, &psc);
+	if (ps && ps->pss_func == POOL_SCAN_RESILVER &&
+	    ps->pss_state == DSS_SCANNING)
+		return (ZPOOL_STATUS_RESILVERING);
 
 	/*
 	 * Pool last accessed by another system.
 	 */
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
 	if (hostid != 0 && (unsigned long)hostid != gethostid() &&
 	    stateval == POOL_STATE_ACTIVE)
 		return (ZPOOL_STATUS_HOSTID_MISMATCH);
@@ -205,6 +239,20 @@ check_status(nvlist_t *config, boolean_t
 		return (ZPOOL_STATUS_VERSION_NEWER);
 
 	/*
+	 * Unsupported feature(s).
+	 */
+	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
+	    vs->vs_aux == VDEV_AUX_UNSUP_FEAT) {
+		nvlist_t *nvinfo;
+
+		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+		    &nvinfo) == 0);
+		if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY))
+			return (ZPOOL_STATUS_UNSUP_FEAT_WRITE);
+		return (ZPOOL_STATUS_UNSUP_FEAT_READ);
+	}
+
+	/*
 	 * Check that the config is complete.
 	 */
 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
@@ -233,15 +281,15 @@ check_status(nvlist_t *config, boolean_t
 	 * Bad devices in non-replicated config.
 	 */
 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
-	    find_vdev_problem(nvroot, vdev_faulted))
+	    find_vdev_problem(nvroot, vdev_faulted, B_TRUE))
 		return (ZPOOL_STATUS_FAULTED_DEV_NR);
 
 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
-	    find_vdev_problem(nvroot, vdev_missing))
+	    find_vdev_problem(nvroot, vdev_missing, B_TRUE))
 		return (ZPOOL_STATUS_MISSING_DEV_NR);
 
 	if (vs->vs_state == VDEV_STATE_CANT_OPEN &&
-	    find_vdev_problem(nvroot, vdev_broken))
+	    find_vdev_problem(nvroot, vdev_broken, B_TRUE))
 		return (ZPOOL_STATUS_CORRUPT_LABEL_NR);
 
 	/*
@@ -263,43 +311,67 @@ check_status(nvlist_t *config, boolean_t
 	/*
 	 * Missing devices in a replicated config.
 	 */
-	if (find_vdev_problem(nvroot, vdev_faulted))
+	if (find_vdev_problem(nvroot, vdev_faulted, B_TRUE))
 		return (ZPOOL_STATUS_FAULTED_DEV_R);
-	if (find_vdev_problem(nvroot, vdev_missing))
+	if (find_vdev_problem(nvroot, vdev_missing, B_TRUE))
 		return (ZPOOL_STATUS_MISSING_DEV_R);
-	if (find_vdev_problem(nvroot, vdev_broken))
+	if (find_vdev_problem(nvroot, vdev_broken, B_TRUE))
 		return (ZPOOL_STATUS_CORRUPT_LABEL_R);
 
 	/*
 	 * Devices with errors
 	 */
-	if (!isimport && find_vdev_problem(nvroot, vdev_errors))
+	if (!isimport && find_vdev_problem(nvroot, vdev_errors, B_TRUE))
 		return (ZPOOL_STATUS_FAILING_DEV);
 
 	/*
 	 * Offlined devices
 	 */
-	if (find_vdev_problem(nvroot, vdev_offlined))
+	if (find_vdev_problem(nvroot, vdev_offlined, B_TRUE))
 		return (ZPOOL_STATUS_OFFLINE_DEV);
 
 	/*
 	 * Removed device
 	 */
-	if (find_vdev_problem(nvroot, vdev_removed))
+	if (find_vdev_problem(nvroot, vdev_removed, B_TRUE))
 		return (ZPOOL_STATUS_REMOVED_DEV);
 
 	/*
-	 * Currently resilvering
+	 * Suboptimal, but usable, ashift configuration.
 	 */
-	if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER)
-		return (ZPOOL_STATUS_RESILVERING);
+	if (find_vdev_problem(nvroot, vdev_non_native_ashift, B_FALSE))
+		return (ZPOOL_STATUS_NON_NATIVE_ASHIFT);
 
 	/*
 	 * Outdated, but usable, version
 	 */
-	if (version < SPA_VERSION)
+	if (SPA_VERSION_IS_SUPPORTED(version) && version != SPA_VERSION)
 		return (ZPOOL_STATUS_VERSION_OLDER);
 
+	/*
+	 * Usable pool with disabled features
+	 */
+	if (version >= SPA_VERSION_FEATURES) {
+		int i;
+		nvlist_t *feat;
+
+		if (isimport) {
+			feat = fnvlist_lookup_nvlist(config,
+			    ZPOOL_CONFIG_LOAD_INFO);
+			feat = fnvlist_lookup_nvlist(feat,
+			    ZPOOL_CONFIG_ENABLED_FEAT);
+		} else {
+			feat = fnvlist_lookup_nvlist(config,
+			    ZPOOL_CONFIG_FEATURE_STATS);
+		}
+
+		for (i = 0; i < SPA_FEATURES; i++) {
+			zfeature_info_t *fi = &spa_feature_table[i];
+			if (!nvlist_exists(feat, fi->fi_guid))
+				return (ZPOOL_STATUS_FEAT_DISABLED);
+		}
+	}
+
 	return (ZPOOL_STATUS_OK);
 }
 
Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c,v
retrieving revision 1.6
diff -u -p -r1.6 libzfs_util.c
--- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c	26 Jan 2013 20:15:50 -0000	1.6
+++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c	25 Apr 2017 17:20:07 -0000
@@ -18,15 +18,23 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
  */
 
 /*
  * Internal utility routines for the ZFS library.
  */
 
+#include <sys/param.h>
+#include <sys/linker.h>
+#include <sys/module.h>
+#include <sys/stat.h>
+
 #include <errno.h>
 #include <fcntl.h>
 #include <libintl.h>
@@ -42,10 +50,17 @@
 #include <sys/types.h>
 #include <sys/ioctl.h>
 
+#ifdef __NetBSD__
+#include <sys/statvfs.h>
+#endif
+
 #include <libzfs.h>
+#include <libzfs_core.h>
 
 #include "libzfs_impl.h"
 #include "zfs_prop.h"
+#include "zfeature_common.h"
+
 
 int
 libzfs_errno(libzfs_handle_t *hdl)
@@ -71,7 +86,7 @@ libzfs_error_description(libzfs_handle_t
 	case EZFS_BADPROP:
 		return (dgettext(TEXT_DOMAIN, "invalid property value"));
 	case EZFS_PROPREADONLY:
-		return (dgettext(TEXT_DOMAIN, "read only property"));
+		return (dgettext(TEXT_DOMAIN, "read-only property"));
 	case EZFS_PROPTYPE:
 		return (dgettext(TEXT_DOMAIN, "property doesn't apply to "
 		    "datasets of this type"));
@@ -91,7 +106,7 @@ libzfs_error_description(libzfs_handle_t
 	case EZFS_BADSTREAM:
 		return (dgettext(TEXT_DOMAIN, "invalid backup stream"));
 	case EZFS_DSREADONLY:
-		return (dgettext(TEXT_DOMAIN, "dataset is read only"));
+		return (dgettext(TEXT_DOMAIN, "dataset is read-only"));
 	case EZFS_VOLTOOBIG:
 		return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for "
 		    "this system"));
@@ -113,7 +128,8 @@ libzfs_error_description(libzfs_handle_t
 	case EZFS_RESILVERING:
 		return (dgettext(TEXT_DOMAIN, "currently resilvering"));
 	case EZFS_BADVERSION:
-		return (dgettext(TEXT_DOMAIN, "unsupported version"));
+		return (dgettext(TEXT_DOMAIN, "unsupported version or "
+		    "feature"));
 	case EZFS_POOLUNAVAIL:
 		return (dgettext(TEXT_DOMAIN, "pool is unavailable"));
 	case EZFS_DEVOVERFLOW:
@@ -137,14 +153,12 @@ libzfs_error_description(libzfs_handle_t
 		return (dgettext(TEXT_DOMAIN, "smb remove share failed"));
 	case EZFS_SHARESMBFAILED:
 		return (dgettext(TEXT_DOMAIN, "smb add share failed"));
-	case EZFS_ISCSISVCUNAVAIL:
-		return (dgettext(TEXT_DOMAIN,
-		    "iscsitgt service need to be enabled by "
-		    "a privileged user"));
 	case EZFS_PERM:
 		return (dgettext(TEXT_DOMAIN, "permission denied"));
 	case EZFS_NOSPC:
 		return (dgettext(TEXT_DOMAIN, "out of space"));
+	case EZFS_FAULT:
+		return (dgettext(TEXT_DOMAIN, "bad address"));
 	case EZFS_IO:
 		return (dgettext(TEXT_DOMAIN, "I/O error"));
 	case EZFS_INTR:
@@ -158,12 +172,6 @@ libzfs_error_description(libzfs_handle_t
 		return (dgettext(TEXT_DOMAIN, "recursive dataset dependency"));
 	case EZFS_NOHISTORY:
 		return (dgettext(TEXT_DOMAIN, "no history available"));
-	case EZFS_UNSHAREISCSIFAILED:
-		return (dgettext(TEXT_DOMAIN,
-		    "iscsitgtd failed request to unshare"));
-	case EZFS_SHAREISCSIFAILED:
-		return (dgettext(TEXT_DOMAIN,
-		    "iscsitgtd failed request to share"));
 	case EZFS_POOLPROPS:
 		return (dgettext(TEXT_DOMAIN, "failed to retrieve "
 		    "pool properties"));
@@ -191,9 +199,6 @@ libzfs_error_description(libzfs_handle_t
 	case EZFS_NODELEGATION:
 		return (dgettext(TEXT_DOMAIN, "delegated administration is "
 		    "disabled on pool"));
-	case EZFS_PERMRDONLY:
-		return (dgettext(TEXT_DOMAIN, "snapshot permissions cannot be"
-		    " modified"));
 	case EZFS_BADCACHE:
 		return (dgettext(TEXT_DOMAIN, "invalid or missing cache file"));
 	case EZFS_ISL2CACHE:
@@ -224,6 +229,17 @@ libzfs_error_description(libzfs_handle_t
 	case EZFS_POSTSPLIT_ONLINE:
 		return (dgettext(TEXT_DOMAIN, "disk was split from this pool "
 		    "into a new one"));
+	case EZFS_SCRUBBING:
+		return (dgettext(TEXT_DOMAIN, "currently scrubbing; "
+		    "use 'zpool scrub -s' to cancel current scrub"));
+	case EZFS_NO_SCRUB:
+		return (dgettext(TEXT_DOMAIN, "there is no active scrub"));
+	case EZFS_DIFF:
+		return (dgettext(TEXT_DOMAIN, "unable to generate diffs"));
+	case EZFS_DIFFDATA:
+		return (dgettext(TEXT_DOMAIN, "invalid diff data"));
+	case EZFS_POOLREADONLY:
+		return (dgettext(TEXT_DOMAIN, "pool is read-only"));
 	case EZFS_UNKNOWN:
 		return (dgettext(TEXT_DOMAIN, "unknown error"));
 	default:
@@ -312,6 +328,10 @@ zfs_common_error(libzfs_handle_t *hdl, i
 		zfs_verror(hdl, EZFS_IO, fmt, ap);
 		return (-1);
 
+	case EFAULT:
+		zfs_verror(hdl, EZFS_FAULT, fmt, ap);
+		return (-1);
+
 	case EINTR:
 		zfs_verror(hdl, EZFS_INTR, fmt, ap);
 		return (-1);
@@ -342,6 +362,7 @@ zfs_standard_error_fmt(libzfs_handle_t *
 	switch (error) {
 	case ENXIO:
 	case ENODEV:
+	case EPIPE:
 		zfs_verror(hdl, EZFS_IO, fmt, ap);
 		break;
 
@@ -354,6 +375,7 @@ zfs_standard_error_fmt(libzfs_handle_t *
 	case ENOSPC:
 	case EDQUOT:
 		zfs_verror(hdl, EZFS_NOSPC, fmt, ap);
+		va_end(ap);
 		return (-1);
 
 	case EEXIST:
@@ -368,9 +390,7 @@ zfs_standard_error_fmt(libzfs_handle_t *
 		zfs_verror(hdl, EZFS_BUSY, fmt, ap);
 		break;
 	case EROFS:
-		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-		    "snapshot permissions cannot be modified"));
-		zfs_verror(hdl, EZFS_PERMRDONLY, fmt, ap);
+		zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
 		break;
 	case ENAMETOOLONG:
 		zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap);
@@ -455,13 +475,19 @@ zpool_standard_error_fmt(libzfs_handle_t
 	case ENOSPC:
 	case EDQUOT:
 		zfs_verror(hdl, EZFS_NOSPC, fmt, ap);
+		va_end(ap);
 		return (-1);
+
 	case EAGAIN:
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "pool I/O is currently suspended"));
 		zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap);
 		break;
 
+	case EROFS:
+		zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap);
+		break;
+
 	default:
 		zfs_error_aux(hdl, strerror(error));
 		zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);
@@ -495,6 +521,29 @@ zfs_alloc(libzfs_handle_t *hdl, size_t s
 }
 
 /*
+ * A safe form of asprintf() which will die if the allocation fails.
+ */
+/*PRINTFLIKE2*/
+char *
+zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...)
+{
+	va_list ap;
+	char *ret;
+	int err;
+
+	va_start(ap, fmt);
+
+	err = vasprintf(&ret, fmt, ap);
+
+	va_end(ap);
+
+	if (err < 0)
+		(void) no_memory(hdl);
+
+	return (ret);
+}
+
+/*
  * A safe form of realloc(), which also zeroes newly allocated space.
  */
 void *
@@ -575,12 +624,43 @@ libzfs_print_on_error(libzfs_handle_t *h
 	hdl->libzfs_printerr = printerr;
 }
 
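+/*
+ * Make sure the zfs kernel module is loaded, loading it if necessary.
+ */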
+static int
+libzfs_load(void)
+{
+#ifdef __FreeBSD__
+	if (modfind("zfs") < 0) {
+		/* Not present in kernel, try loading it. */
+		if (kldload("zfs") < 0 || modfind("zfs") < 0) {
+			if (errno != EEXIST)
+				return (-1);
+		}
+	}
+#endif
+#ifdef __NetBSD__
+	modctl_load_t cmdargs;
+
+	cmdargs.ml_filename = "zfs";
+	cmdargs.ml_flags = MODCTL_NO_PROP;
+	cmdargs.ml_props = NULL;
+	cmdargs.ml_propslen = 0;
+
+	if (modctl(MODCTL_LOAD, &cmdargs) < 0 && errno != EEXIST)
+		return (-1);
+#endif
+	return (0);
+}
+
 libzfs_handle_t *
 libzfs_init(void)
 {
 	libzfs_handle_t *hdl;
 
-	if ((hdl = calloc(sizeof (libzfs_handle_t), 1)) == NULL) {
+	if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) {
+		return (NULL);
+	}
+
+	if (libzfs_load() < 0) {
+		free(hdl);
 		return (NULL);
 	}
 
@@ -595,10 +675,19 @@ libzfs_init(void)
 		return (NULL);
 	}
 
-	hdl->libzfs_sharetab = fopen("/etc/dfs/sharetab", "r");
+	hdl->libzfs_sharetab = fopen(ZFS_EXPORTS_PATH, "r");
+
+	if (libzfs_core_init() != 0) {
+		(void) close(hdl->libzfs_fd);
+		(void) fclose(hdl->libzfs_mnttab);
+		(void) fclose(hdl->libzfs_sharetab);
+		free(hdl);
+		return (NULL);
+	}
 
 	zfs_prop_init();
 	zpool_prop_init();
+	zpool_feature_init();
 	libzfs_mnttab_init(hdl);
 
 	return (hdl);
@@ -613,14 +702,13 @@ libzfs_fini(libzfs_handle_t *hdl)
 	if (hdl->libzfs_sharetab)
 		(void) fclose(hdl->libzfs_sharetab);
 	zfs_uninit_libshare(hdl);
-	if (hdl->libzfs_log_str)
-		(void) free(hdl->libzfs_log_str);
 	zpool_free_handles(hdl);
-#ifdef PORT_SOLARIS
+#ifdef illumos
 	libzfs_fru_clear(hdl, B_TRUE);
-#endif	
+#endif
 	namespace_clear(hdl);
 	libzfs_mnttab_fini(hdl);
+	libzfs_core_fini();
 	free(hdl);
 }
 
@@ -651,7 +739,8 @@ zfs_get_pool_handle(const zfs_handle_t *
 zfs_handle_t *
 zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype)
 {
-	struct statvfs statbuf;
+	struct stat64 statbuf;
+	struct extmnttab entry;
 	int ret;
 
 	if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) {
@@ -661,18 +750,57 @@ zfs_path_to_zhandle(libzfs_handle_t *hdl
 		return (zfs_open(hdl, path, argtype));
 	}
 
-	if (getstatfs(&statbuf, path) != 0) {
+	if (stat64(path, &statbuf) != 0) {
 		(void) fprintf(stderr, "%s: %s\n", path, strerror(errno));
 		return (NULL);
 	}
 
-	if (strcmp(statbuf.f_fstypename, MNTTYPE_ZFS) != 0) {
+#ifdef illumos
+	rewind(hdl->libzfs_mnttab);
+	while ((ret = getextmntent(hdl->libzfs_mnttab, &entry, 0)) == 0) {
+		if (makedevice(entry.mnt_major, entry.mnt_minor) ==
+		    statbuf.st_dev) {
+			break;
+		}
+	}
+#endif	/* illumos */
+#ifdef __FreeBSD__
+	{
+		struct statfs sfs;
+
+		ret = statfs(path, &sfs);
+		if (ret == 0)
+			statfs2mnttab(&sfs, &entry);
+		else {
+			(void) fprintf(stderr, "%s: %s\n", path,
+			    strerror(errno));
+		}
+	}
+#endif	/* __FreeBSD__ */
+#ifdef __NetBSD__
+	{
+		struct statvfs sfs;
+
+		ret = statvfs(path, &sfs);
+		if (ret == 0)
+			statvfs2mnttab(&sfs, &entry);
+		else {
+			(void) fprintf(stderr, "%s: %s\n", path,
+			    strerror(errno));
+		}
+	}
+#endif	/* __NetBSD__ */
+	if (ret != 0) {
+		return (NULL);
+	}
+
+	if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) {
 		(void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"),
 		    path);
 		return (NULL);
 	}
 
-	return (zfs_open(hdl, statbuf.f_mntfromname, ZFS_TYPE_FILESYSTEM));
+	return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM));
 }
 
 /*
@@ -683,10 +811,11 @@ int
 zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len)
 {
 	if (len == 0)
-		len = 4*1024;
+		len = 16 * 1024;
 	zc->zc_nvlist_dst_size = len;
-	if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t)
-	    zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == 0)
+	zc->zc_nvlist_dst =
+	    (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size);
+	if (zc->zc_nvlist_dst == 0)
 		return (-1);
 
 	return (0);
@@ -701,8 +830,9 @@ int
 zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc)
 {
 	free((void *)(uintptr_t)zc->zc_nvlist_dst);
-	if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t)
-	    zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == 0)
+	zc->zc_nvlist_dst =
+	    (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size);
+	if (zc->zc_nvlist_dst == 0)
 		return (-1);
 
 	return (0);
@@ -717,6 +847,9 @@ zcmd_free_nvlists(zfs_cmd_t *zc)
 	free((void *)(uintptr_t)zc->zc_nvlist_conf);
 	free((void *)(uintptr_t)zc->zc_nvlist_src);
 	free((void *)(uintptr_t)zc->zc_nvlist_dst);
+	zc->zc_nvlist_conf = NULL;
+	zc->zc_nvlist_src = NULL;
+	zc->zc_nvlist_dst = NULL;
 }
 
 static int
@@ -769,17 +902,7 @@ zcmd_read_dst_nvlist(libzfs_handle_t *hd
 int
 zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc)
 {
-	int error;
-
-	zc->zc_history = (uint64_t)(uintptr_t)hdl->libzfs_log_str;
-	error = ioctl(hdl->libzfs_fd, request, zc);
-	if (hdl->libzfs_log_str) {
-		free(hdl->libzfs_log_str);
-		hdl->libzfs_log_str = NULL;
-	}
-	zc->zc_history = 0;
-
-	return (error);
+	return (ioctl(hdl->libzfs_fd, request, zc));
 }
 
 /*
@@ -918,7 +1041,7 @@ zprop_print_one_property(const char *nam
     const char *source, const char *recvd_value)
 {
 	int i;
-	const char *str;
+	const char *str = NULL;
 	char buf[128];
 
 	/*
@@ -970,6 +1093,10 @@ zprop_print_one_property(const char *nam
 			case ZPROP_SRC_RECEIVED:
 				str = "received";
 				break;
+
+			default:
+				str = NULL;
+				assert(!"unhandled zprop_source_t");
 			}
 			break;
 
@@ -1180,6 +1307,16 @@ zprop_parse_value(libzfs_handle_t *hdl, 
 			    "use 'none' to disable quota/refquota"));
 			goto error;
 		}
+
+		/*
+		 * Special handling for "*_limit=none". In this case it's not
+		 * 0 but UINT64_MAX.
+		 */
+		if ((type & ZFS_TYPE_DATASET) && isnone &&
+		    (prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+		    prop == ZFS_PROP_SNAPSHOT_LIMIT)) {
+			*ivalp = UINT64_MAX;
+		}
 		break;
 
 	case PROP_TYPE_INDEX:
@@ -1241,8 +1378,11 @@ addlist(libzfs_handle_t *hdl, char *prop
 	 * this is a pool property or if this isn't a user-defined
 	 * dataset property,
 	 */
-	if (prop == ZPROP_INVAL && (type == ZFS_TYPE_POOL ||
-	    (!zfs_prop_user(propname) && !zfs_prop_userquota(propname)))) {
+	if (prop == ZPROP_INVAL && ((type == ZFS_TYPE_POOL &&
+	    !zpool_prop_feature(propname) &&
+	    !zpool_prop_unsupported(propname)) ||
+	    (type == ZFS_TYPE_DATASET && !zfs_prop_user(propname) &&
+	    !zfs_prop_userquota(propname) && !zfs_prop_written(propname)))) {
 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
 		    "invalid property '%s'"), propname);
 		return (zfs_error(hdl, EZFS_BADPROP,
@@ -1254,7 +1394,8 @@ addlist(libzfs_handle_t *hdl, char *prop
 
 	entry->pl_prop = prop;
 	if (prop == ZPROP_INVAL) {
-		if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == NULL) {
+		if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) ==
+		    NULL) {
 			free(entry);
 			return (-1);
 		}
Index: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.c
===================================================================
RCS file: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.c
diff -N src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.c	3 Dec 2016 17:05:32 -0000
@@ -0,0 +1,848 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/*
+ * LibZFS_Core (lzc) is intended to replace most functionality in libzfs.
+ * It has the following characteristics:
+ *
+ *  - Thread Safe.  libzfs_core is accessible concurrently from multiple
+ *  threads.  This is accomplished primarily by avoiding global data
+ *  (e.g. caching).  Since it's thread-safe, there is no reason for a
+ *  process to have multiple libzfs "instances".  Therefore, we store
+ *  our few pieces of data (e.g. the file descriptor) in global
+ *  variables.  The fd is reference-counted so that the libzfs_core
+ *  library can be "initialized" multiple times (e.g. by different
+ *  consumers within the same process).
+ *
+ *  - Committed Interface.  The libzfs_core interface will be committed;
+ *  therefore consumers can compile against it and be confident that
+ *  their code will continue to work on future releases of this code.
+ *  Currently, the interface is Evolving (not Committed), but we intend
+ *  to commit to it once it is more complete and we determine that it
+ *  meets the needs of all consumers.
+ *
+ *  - Programmatic Error Handling.  libzfs_core communicates errors with
+ *  defined error numbers, and doesn't print anything to stdout/stderr.
+ *
+ *  - Thin Layer.  libzfs_core is a thin layer, marshaling arguments
+ *  to/from the kernel ioctls.  There is generally a 1:1 correspondence
+ *  between libzfs_core functions and ioctls to /dev/zfs.
+ *
+ *  - Clear Atomicity.  Because libzfs_core functions are generally 1:1
+ *  with kernel ioctls, and kernel ioctls are generally atomic, each
+ *  libzfs_core function is atomic.  For example, creating multiple
+ *  snapshots with a single call to lzc_snapshot() is atomic -- it
+ *  can't fail with only some of the requested snapshots created, even
+ *  in the event of power loss or system crash.
+ *
+ *  - Continued libzfs Support.  Some higher-level operations (e.g.
+ *  support for "zfs send -R") are too complicated to fit the scope of
+ *  libzfs_core.  This functionality will continue to live in libzfs.
+ *  Where appropriate, libzfs will use the underlying atomic operations
+ *  of libzfs_core.  For example, libzfs may implement "zfs send -R |
+ *  zfs receive" by using individual "send one snapshot", rename,
+ *  destroy, and "receive one snapshot" operations in libzfs_core.
+ *  /sbin/zfs and /sbin/zpool will link with both libzfs and
+ *  libzfs_core.  Other consumers should aim to use only libzfs_core,
+ *  since that will be the supported, stable interface going forward.
+ */
+
+#define _IN_LIBZFS_CORE_
+
+#include <libzfs_core.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sys/nvpair.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include "libzfs_core_compat.h"
+#include "libzfs_compat.h"
+
+#ifdef __FreeBSD__
+extern int zfs_ioctl_version;
+#endif
+
+static int g_fd;
+static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
+static int g_refcount;
+
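+/*
+ * Open /dev/zfs on first use and reference-count later initializations;
+ * each successful libzfs_core_init() must be paired with a call to
+ * libzfs_core_fini().
+ */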
+int
+libzfs_core_init(void)
+{
+	(void) pthread_mutex_lock(&g_lock);
+	if (g_refcount == 0) {
+		g_fd = open("/dev/zfs", O_RDWR);
+		if (g_fd < 0) {
+			(void) pthread_mutex_unlock(&g_lock);
+			return (errno);
+		}
+	}
+	g_refcount++;
+	(void) pthread_mutex_unlock(&g_lock);
+
+	return (0);
+}
+
+void
+libzfs_core_fini(void)
+{
+	(void) pthread_mutex_lock(&g_lock);
+	ASSERT3S(g_refcount, >, 0);
+	g_refcount--;
+	if (g_refcount == 0)
+		(void) close(g_fd);
+	(void) pthread_mutex_unlock(&g_lock);
+}
+
+static int
+lzc_ioctl(zfs_ioc_t ioc, const char *name,
+    nvlist_t *source, nvlist_t **resultp)
+{
+	zfs_cmd_t zc = { 0 };
+	int error = 0;
+	char *packed;
+#ifdef __FreeBSD__
+	nvlist_t *oldsource;
+#endif
+	size_t size;
+
+	ASSERT3S(g_refcount, >, 0);
+
+	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
+
+#ifdef __FreeBSD__
+	if (zfs_ioctl_version == ZFS_IOCVER_UNDEF)
+		zfs_ioctl_version = get_zfs_ioctl_version();
+
+	if (zfs_ioctl_version < ZFS_IOCVER_LZC) {
+		oldsource = source;
+		error = lzc_compat_pre(&zc, &ioc, &source);
+		if (error)
+			return (error);
+	}
+#endif
+
+	packed = fnvlist_pack(source, &size);
+	zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
+	zc.zc_nvlist_src_size = size;
+
+	if (resultp != NULL) {
+		*resultp = NULL;
+		zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024);
+		zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
+		    malloc(zc.zc_nvlist_dst_size);
+#ifdef illumos
+		if (zc.zc_nvlist_dst == NULL) {
+#else
+		if (zc.zc_nvlist_dst == 0) {
+#endif
+			error = ENOMEM;
+			goto out;
+		}
+	}
+
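+	/*
+	 * Retry the ioctl, doubling the result buffer, until the packed
+	 * output nvlist fits (the kernel signals a too-small buffer with
+	 * ENOMEM).
+	 */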
+	while (ioctl(g_fd, ioc, &zc) != 0) {
+		if (errno == ENOMEM && resultp != NULL) {
+			free((void *)(uintptr_t)zc.zc_nvlist_dst);
+			zc.zc_nvlist_dst_size *= 2;
+			zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
+			    malloc(zc.zc_nvlist_dst_size);
+#ifdef illumos
+			if (zc.zc_nvlist_dst == NULL) {
+#else
+			if (zc.zc_nvlist_dst == 0) {
+#endif
+				error = ENOMEM;
+				goto out;
+			}
+		} else {
+			error = errno;
+			break;
+		}
+	}
+
+#ifdef __FreeBSD__
+	if (zfs_ioctl_version < ZFS_IOCVER_LZC)
+		lzc_compat_post(&zc, ioc);
+#endif
+	if (zc.zc_nvlist_dst_filled) {
+		*resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst,
+		    zc.zc_nvlist_dst_size);
+	}
+#ifdef __FreeBSD__
+	if (zfs_ioctl_version < ZFS_IOCVER_LZC)
+		lzc_compat_outnvl(&zc, ioc, resultp);
+#endif
+out:
+#ifdef __FreeBSD__
+	if (zfs_ioctl_version < ZFS_IOCVER_LZC) {
+		if (source != oldsource)
+			nvlist_free(source);
+		source = oldsource;
+	}
+#endif
+	fnvlist_pack_free(packed, size);
+	free((void *)(uintptr_t)zc.zc_nvlist_dst);
+	return (error);
+}
+
+int
+lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props)
+{
+	int error;
+	nvlist_t *args = fnvlist_alloc();
+	fnvlist_add_int32(args, "type", (dmu_objset_type_t)type);
+	if (props != NULL)
+		fnvlist_add_nvlist(args, "props", props);
+	error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL);
+	nvlist_free(args);
+	return (error);
+}
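
A hedged sketch of a caller (the dataset name "tank/newfs" and the chosen
property are hypothetical); note the enum spelling LZC_DATSET_TYPE_ZFS
matches the header added below:

	#include <libzfs_core.h>
	#include <libnvpair.h>

	static int
	create_example(void)
	{
		nvlist_t *props = fnvlist_alloc();
		int error;

		fnvlist_add_string(props, "compression", "on");
		error = lzc_create("tank/newfs", LZC_DATSET_TYPE_ZFS, props);
		nvlist_free(props);
		return (error);	/* 0 on success, else an errno */
	}
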
+
+int
+lzc_clone(const char *fsname, const char *origin,
+    nvlist_t *props)
+{
+	int error;
+	nvlist_t *args = fnvlist_alloc();
+	fnvlist_add_string(args, "origin", origin);
+	if (props != NULL)
+		fnvlist_add_nvlist(args, "props", props);
+	error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL);
+	nvlist_free(args);
+	return (error);
+}
+
+/*
+ * Creates snapshots.
+ *
+ * The keys in the snaps nvlist are the snapshots to be created.
+ * They must all be in the same pool.
+ *
+ * The props nvlist contains the properties to set.  Currently only user
+ * properties are supported: { user:prop_name -> string value }
+ *
+ * The returned results nvlist will have an entry for each snapshot that failed.
+ * The value will be the (int32) error code.
+ *
+ * The return value will be 0 if all snapshots were created; otherwise it will
+ * be the errno of an (unspecified) snapshot that failed.
+ */
+int
+lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist)
+{
+	nvpair_t *elem;
+	nvlist_t *args;
+	int error;
+	char pool[ZFS_MAX_DATASET_NAME_LEN];
+
+	*errlist = NULL;
+
+	/* determine the pool name */
+	elem = nvlist_next_nvpair(snaps, NULL);
+	if (elem == NULL)
+		return (0);
+	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
+	pool[strcspn(pool, "/@")] = '\0';
+
+	args = fnvlist_alloc();
+	fnvlist_add_nvlist(args, "snaps", snaps);
+	if (props != NULL)
+		fnvlist_add_nvlist(args, "props", props);
+
+	error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist);
+	nvlist_free(args);
+
+	return (error);
+}
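
To illustrate the errlist contract described above, a minimal sketch (pool
and snapshot names are hypothetical) that creates two snapshots atomically
and reports per-snapshot failures:

	#include <libzfs_core.h>
	#include <libnvpair.h>
	#include <stdio.h>

	static int
	snapshot_example(void)
	{
		nvlist_t *snaps = fnvlist_alloc();
		nvlist_t *errlist = NULL;
		nvpair_t *elem = NULL;
		int error;

		/* Keys are full snapshot names; all in the same pool. */
		fnvlist_add_boolean(snaps, "tank/home@backup");
		fnvlist_add_boolean(snaps, "tank/src@backup");

		error = lzc_snapshot(snaps, NULL, &errlist);
		while (errlist != NULL &&
		    (elem = nvlist_next_nvpair(errlist, elem)) != NULL) {
			(void) fprintf(stderr, "%s: error %d\n",
			    nvpair_name(elem),
			    fnvlist_lookup_int32(errlist, nvpair_name(elem)));
		}
		nvlist_free(errlist);
		nvlist_free(snaps);
		return (error);
	}
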
+
+/*
+ * Destroys snapshots.
+ *
+ * The keys in the snaps nvlist are the snapshots to be destroyed.
+ * They must all be in the same pool.
+ *
+ * Snapshots that do not exist will be silently ignored.
+ *
+ * If 'defer' is not set, and a snapshot has user holds or clones, the
+ * destroy operation will fail and none of the snapshots will be
+ * destroyed.
+ *
+ * If 'defer' is set, and a snapshot has user holds or clones, it will be
+ * marked for deferred destruction, and will be destroyed when the last hold
+ * or clone is removed/destroyed.
+ *
+ * The return value will be 0 if all snapshots were destroyed (or marked for
+ * later destruction if 'defer' is set) or didn't exist to begin with.
+ *
+ * Otherwise the return value will be the errno of an (unspecified) snapshot
+ * that failed, no snapshots will be destroyed, and the errlist will have an
+ * entry for each snapshot that failed.  The value in the errlist will be
+ * the (int32) error code.
+ */
+int
+lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist)
+{
+	nvpair_t *elem;
+	nvlist_t *args;
+	int error;
+	char pool[ZFS_MAX_DATASET_NAME_LEN];
+
+	/* determine the pool name */
+	elem = nvlist_next_nvpair(snaps, NULL);
+	if (elem == NULL)
+		return (0);
+	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
+	pool[strcspn(pool, "/@")] = '\0';
+
+	args = fnvlist_alloc();
+	fnvlist_add_nvlist(args, "snaps", snaps);
+	if (defer)
+		fnvlist_add_boolean(args, "defer");
+
+	error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist);
+	nvlist_free(args);
+
+	return (error);
+}
+
+int
+lzc_snaprange_space(const char *firstsnap, const char *lastsnap,
+    uint64_t *usedp)
+{
+	nvlist_t *args;
+	nvlist_t *result;
+	int err;
+	char fs[ZFS_MAX_DATASET_NAME_LEN];
+	char *atp;
+
+	/* determine the fs name */
+	(void) strlcpy(fs, firstsnap, sizeof (fs));
+	atp = strchr(fs, '@');
+	if (atp == NULL)
+		return (EINVAL);
+	*atp = '\0';
+
+	args = fnvlist_alloc();
+	fnvlist_add_string(args, "firstsnap", firstsnap);
+
+	err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result);
+	nvlist_free(args);
+	if (err == 0)
+		*usedp = fnvlist_lookup_uint64(result, "used");
+	fnvlist_free(result);
+
+	return (err);
+}
+
+boolean_t
+lzc_exists(const char *dataset)
+{
+	/*
+	 * The objset_stats ioctl is still legacy, so we need to construct our
+	 * own zfs_cmd_t rather than using lzc_ioctl().
+	 */
+	zfs_cmd_t zc = { 0 };
+
+	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
+	return (ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0);
+}
+
+/*
+ * Create "user holds" on snapshots.  If there is a hold on a snapshot,
+ * the snapshot can not be destroyed.  (However, it can be marked for deletion
+ * by lzc_destroy_snaps(defer=B_TRUE).)
+ *
+ * The keys in the nvlist are snapshot names.
+ * The snapshots must all be in the same pool.
+ * The value is the name of the hold (string type).
+ *
+ * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL).
+ * In this case, when the cleanup_fd is closed (including on process
+ * termination), the holds will be released.  If the system is shut down
+ * uncleanly, the holds will be released when the pool is next opened
+ * or imported.
+ *
+ * Holds for snapshots which don't exist will be skipped and have an entry
+ * added to errlist, but will not cause an overall failure.
+ *
+ * The return value will be 0 if all holds, for snapshots that existed,
+ * were successfully created.
+ *
+ * Otherwise the return value will be the errno of an (unspecified) hold that
+ * failed and no holds will be created.
+ *
+ * In all cases the errlist will have an entry for each hold that failed
+ * (name = snapshot), with its value being the error code (int32).
+ */
+int
+lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist)
+{
+	char pool[ZFS_MAX_DATASET_NAME_LEN];
+	nvlist_t *args;
+	nvpair_t *elem;
+	int error;
+
+	/* determine the pool name */
+	elem = nvlist_next_nvpair(holds, NULL);
+	if (elem == NULL)
+		return (0);
+	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
+	pool[strcspn(pool, "/@")] = '\0';
+
+	args = fnvlist_alloc();
+	fnvlist_add_nvlist(args, "holds", holds);
+	if (cleanup_fd != -1)
+		fnvlist_add_int32(args, "cleanup_fd", cleanup_fd);
+
+	error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist);
+	nvlist_free(args);
+	return (error);
+}
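
A hedged sketch of the cleanup_fd pattern described above (all names
hypothetical).  A real consumer would keep cleanup_fd open for as long as
the hold should survive, since closing it releases the hold:

	#include <libzfs_core.h>
	#include <libnvpair.h>
	#include <errno.h>
	#include <fcntl.h>
	#include <unistd.h>

	static int
	hold_example(void)
	{
		nvlist_t *holds = fnvlist_alloc();
		nvlist_t *errlist = NULL;
		int cleanup_fd, error;

		/* Holds tied to this fd go away when it is closed. */
		cleanup_fd = open("/dev/zfs", O_RDWR | O_EXCL);
		if (cleanup_fd < 0) {
			nvlist_free(holds);
			return (errno);
		}

		/* Key is the snapshot name; value is the hold name. */
		fnvlist_add_string(holds, "tank/home@backup", "my-hold");
		error = lzc_hold(holds, cleanup_fd, &errlist);

		nvlist_free(errlist);
		nvlist_free(holds);
		(void) close(cleanup_fd);	/* releases the hold */
		return (error);
	}
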
+
+/*
+ * Release "user holds" on snapshots.  If the snapshot has been marked for
+ * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have
+ * any clones, and all the user holds are removed, then the snapshot will be
+ * destroyed.
+ *
+ * The keys in the nvlist are snapshot names.
+ * The snapshots must all be in the same pool.
+ * The value is a nvlist whose keys are the holds to remove.
+ *
+ * Holds which failed to release because they didn't exist will have an entry
+ * added to errlist, but will not cause an overall failure.
+ *
+ * The return value will be 0 if the 'holds' nvlist was empty, or if all
+ * holds that existed were successfully removed.
+ *
+ * Otherwise the return value will be the errno of an (unspecified) hold that
+ * failed to release and no holds will be released.
+ *
+ * In all cases the errlist will have an entry for each hold that failed
+ * to release.
+ */
+int
+lzc_release(nvlist_t *holds, nvlist_t **errlist)
+{
+	char pool[ZFS_MAX_DATASET_NAME_LEN];
+	nvpair_t *elem;
+
+	/* determine the pool name */
+	elem = nvlist_next_nvpair(holds, NULL);
+	if (elem == NULL)
+		return (0);
+	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
+	pool[strcspn(pool, "/@")] = '\0';
+
+	return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist));
+}
+
+/*
+ * Retrieve list of user holds on the specified snapshot.
+ *
+ * On success, *holdsp will be set to a nvlist which the caller must free.
+ * The keys are the names of the holds, and the value is the creation time
+ * of the hold (uint64) in seconds since the epoch.
+ */
+int
+lzc_get_holds(const char *snapname, nvlist_t **holdsp)
+{
+	int error;
+	nvlist_t *innvl = fnvlist_alloc();
+	error = lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, innvl, holdsp);
+	fnvlist_free(innvl);
+	return (error);
+}
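
A short sketch of walking the returned nvlist (per the comment above, each
value is the hold's creation time as a uint64; the snapshot name comes from
the caller):

	#include <libzfs_core.h>
	#include <libnvpair.h>
	#include <sys/types.h>
	#include <stdio.h>

	static int
	list_holds_example(const char *snapname)
	{
		nvlist_t *holds = NULL;
		nvpair_t *elem = NULL;
		int error;

		if ((error = lzc_get_holds(snapname, &holds)) != 0)
			return (error);
		while ((elem = nvlist_next_nvpair(holds, elem)) != NULL) {
			(void) printf("%s\t%llu\n", nvpair_name(elem),
			    (u_longlong_t)fnvlist_lookup_uint64(holds,
			    nvpair_name(elem)));
		}
		nvlist_free(holds);
		return (0);
	}
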
+
+/*
+ * Generate a zfs send stream for the specified snapshot and write it to
+ * the specified file descriptor.
+ *
+ * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap")
+ *
+ * If "from" is NULL, a full (non-incremental) stream will be sent.
+ * If "from" is non-NULL, it must be the full name of a snapshot or
+ * bookmark to send an incremental from (e.g. "pool/fs@earlier_snap" or
+ * "pool/fs#earlier_bmark").  If non-NULL, the specified snapshot or
+ * bookmark must represent an earlier point in the history of "snapname".
+ * It can be an earlier snapshot in the same filesystem or zvol as "snapname",
+ * or it can be the origin of "snapname"'s filesystem, or an earlier
+ * snapshot in the origin, etc.
+ *
+ * "fd" is the file descriptor to write the send stream to.
+ *
+ * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted
+ * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT
+ * records with drr_blksz > 128K.
+ *
+ * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
+ * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
+ * which the receiving system must support (as indicated by support
+ * for the "embedded_data" feature).
+ */
+int
+lzc_send(const char *snapname, const char *from, int fd,
+    enum lzc_send_flags flags)
+{
+	return (lzc_send_resume(snapname, from, fd, flags, 0, 0));
+}
+
+int
+lzc_send_resume(const char *snapname, const char *from, int fd,
+    enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff)
+{
+	nvlist_t *args;
+	int err;
+
+	args = fnvlist_alloc();
+	fnvlist_add_int32(args, "fd", fd);
+	if (from != NULL)
+		fnvlist_add_string(args, "fromsnap", from);
+	if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
+		fnvlist_add_boolean(args, "largeblockok");
+	if (flags & LZC_SEND_FLAG_EMBED_DATA)
+		fnvlist_add_boolean(args, "embedok");
+	if (resumeobj != 0 || resumeoff != 0) {
+		fnvlist_add_uint64(args, "resume_object", resumeobj);
+		fnvlist_add_uint64(args, "resume_offset", resumeoff);
+	}
+	err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
+	nvlist_free(args);
+	return (err);
+}
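
A hedged sketch of an incremental send into a plain file (all names are
hypothetical; a pipe to another process works the same way):

	#include <libzfs_core.h>
	#include <errno.h>
	#include <fcntl.h>
	#include <unistd.h>

	static int
	send_example(void)
	{
		int fd, error;

		fd = open("/tmp/stream", O_WRONLY | O_CREAT | O_TRUNC, 0600);
		if (fd < 0)
			return (errno);
		/* Incremental stream from @monday up to @tuesday. */
		error = lzc_send("tank/home@tuesday", "tank/home@monday",
		    fd, 0);
		(void) close(fd);
		return (error);
	}
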
+
+/*
+ * "from" can be NULL, a snapshot, or a bookmark.
+ *
+ * If from is NULL, a full (non-incremental) stream will be estimated.  This
+ * is calculated very efficiently.
+ *
+ * If from is a snapshot, lzc_send_space uses the deadlists attached to
+ * each snapshot to efficiently estimate the stream size.
+ *
+ * If from is a bookmark, the indirect blocks in the destination snapshot
+ * are traversed, looking for blocks with a birth time since the creation TXG of
+ * the snapshot this bookmark was created from.  This will result in
+ * significantly more I/O and be less efficient than a send space estimation on
+ * an equivalent snapshot.
+ */
+int
+lzc_send_space(const char *snapname, const char *from, uint64_t *spacep)
+{
+	nvlist_t *args;
+	nvlist_t *result;
+	int err;
+
+	args = fnvlist_alloc();
+	if (from != NULL)
+		fnvlist_add_string(args, "from", from);
+	err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result);
+	nvlist_free(args);
+	if (err == 0)
+		*spacep = fnvlist_lookup_uint64(result, "space");
+	nvlist_free(result);
+	return (err);
+}
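
For instance (hypothetical names), estimating an incremental stream before
sending it:

	#include <libzfs_core.h>
	#include <sys/types.h>
	#include <stdio.h>

	static void
	send_space_example(void)
	{
		uint64_t space;

		if (lzc_send_space("tank/home@tuesday", "tank/home@monday",
		    &space) == 0)
			(void) printf("estimated stream size: %llu bytes\n",
			    (u_longlong_t)space);
	}
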
+
+static int
+recv_read(int fd, void *buf, int ilen)
+{
+	char *cp = buf;
+	int rv;
+	int len = ilen;
+
+	/* read() can return short counts; loop until done, EOF, or error. */
+	do {
+		rv = read(fd, cp, len);
+		if (rv <= 0)
+			break;
+		cp += rv;
+		len -= rv;
+	} while (len > 0);
+
+	if (rv < 0 || len != 0)
+		return (EIO);
+
+	return (0);
+}
+
+static int
+recv_impl(const char *snapname, nvlist_t *props, const char *origin,
+    boolean_t force, boolean_t resumable, int fd,
+    const dmu_replay_record_t *begin_record)
+{
+	/*
+	 * The receive ioctl is still legacy, so we need to construct our own
+	 * zfs_cmd_t rather than using lzc_ioctl().
+	 */
+	zfs_cmd_t zc = { 0 };
+	char *atp;
+	char *packed = NULL;
+	size_t size;
+	int error;
+
+	ASSERT3S(g_refcount, >, 0);
+
+	/* zc_name is name of containing filesystem */
+	(void) strlcpy(zc.zc_name, snapname, sizeof (zc.zc_name));
+	atp = strchr(zc.zc_name, '@');
+	if (atp == NULL)
+		return (EINVAL);
+	*atp = '\0';
+
+	/* if the fs does not exist, try its parent. */
+	if (!lzc_exists(zc.zc_name)) {
+		char *slashp = strrchr(zc.zc_name, '/');
+		if (slashp == NULL)
+			return (ENOENT);
+		*slashp = '\0';
+	}
+
+	/* zc_value is full name of the snapshot to create */
+	(void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
+
+	if (props != NULL) {
+		/* zc_nvlist_src is props to set */
+		packed = fnvlist_pack(props, &size);
+		zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
+		zc.zc_nvlist_src_size = size;
+	}
+
+	/* zc_string is name of clone origin (if DRR_FLAG_CLONE) */
+	if (origin != NULL)
+		(void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string));
+
+	/* zc_begin_record is non-byteswapped BEGIN record */
+	if (begin_record == NULL) {
+		error = recv_read(fd, &zc.zc_begin_record,
+		    sizeof (zc.zc_begin_record));
+		if (error != 0)
+			goto out;
+	} else {
+		zc.zc_begin_record = *begin_record;
+	}
+
+	/* zc_cookie is fd to read from */
+	zc.zc_cookie = fd;
+
+	/* zc guid is force flag */
+	zc.zc_guid = force;
+
+	zc.zc_resumable = resumable;
+
+	/* zc_cleanup_fd is unused */
+	zc.zc_cleanup_fd = -1;
+
+	error = ioctl(g_fd, ZFS_IOC_RECV, &zc);
+	if (error != 0)
+		error = errno;
+
+out:
+	if (packed != NULL)
+		fnvlist_pack_free(packed, size);
+	free((void*)(uintptr_t)zc.zc_nvlist_dst);
+	return (error);
+}
+
+/*
+ * The simplest receive case: receive from the specified fd, creating the
+ * specified snapshot.  Apply the specified properties as "received" properties
+ * (which can be overridden by locally-set properties).  If the stream is a
+ * clone, its origin snapshot must be specified by 'origin'.  The 'force'
+ * flag will cause the target filesystem to be rolled back or destroyed if
+ * necessary to receive.
+ *
+ * Return 0 on success or an errno on failure.
+ *
+ * Note: this interface does not work on dedup'd streams
+ * (those with DMU_BACKUP_FEATURE_DEDUP).
+ */
+int
+lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
+    boolean_t force, int fd)
+{
+	return (recv_impl(snapname, props, origin, force, B_FALSE, fd, NULL));
+}
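
A matching hedged sketch for the receive side (names hypothetical): read a
previously saved stream from a file and create the named snapshot:

	#include <libzfs_core.h>
	#include <errno.h>
	#include <fcntl.h>
	#include <unistd.h>

	static int
	receive_example(void)
	{
		int fd, error;

		fd = open("/tmp/stream", O_RDONLY);
		if (fd < 0)
			return (errno);
		/* No received props, no clone origin, no forced rollback. */
		error = lzc_receive("tank/restored@backup", NULL, NULL,
		    B_FALSE, fd);
		(void) close(fd);
		return (error);
	}
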
+
+/*
+ * Like lzc_receive, but if the receive fails due to premature stream
+ * termination, the intermediate state will be preserved on disk.  In this
+ * case, ECKSUM will be returned.  The receive may subsequently be resumed
+ * with a resuming send stream generated by lzc_send_resume().
+ */
+int
+lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin,
+    boolean_t force, int fd)
+{
+	return (recv_impl(snapname, props, origin, force, B_TRUE, fd, NULL));
+}
+
+/*
+ * Like lzc_receive, but allows the caller to read the begin record and then
+ * pass it in.  That can be useful if the caller wants to derive, for example,
+ * the snapname or the origin parameters based on the information contained in
+ * the begin record.
+ * The begin record must be in its original form as read from the stream,
+ * in other words, it should not be byteswapped.
+ *
+ * The 'resumable' parameter allows the caller to obtain the same behavior as
+ * with lzc_receive_resumable.
+ */
+int
+lzc_receive_with_header(const char *snapname, nvlist_t *props,
+    const char *origin, boolean_t force, boolean_t resumable, int fd,
+    const dmu_replay_record_t *begin_record)
+{
+	if (begin_record == NULL)
+		return (EINVAL);
+	return (recv_impl(snapname, props, origin, force, resumable, fd,
+	    begin_record));
+}
+
+/*
+ * Roll back this filesystem or volume to its most recent snapshot.
+ * If snapnamebuf is not NULL, it will be filled in with the name
+ * of the most recent snapshot.
+ *
+ * Return 0 on success or an errno on failure.
+ */
+int
+lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen)
+{
+	nvlist_t *args;
+	nvlist_t *result;
+	int err;
+
+	args = fnvlist_alloc();
+	err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result);
+	nvlist_free(args);
+	if (err == 0 && snapnamebuf != NULL) {
+		const char *snapname = fnvlist_lookup_string(result, "target");
+		(void) strlcpy(snapnamebuf, snapname, snapnamelen);
+	}
+	return (err);
+}
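
A small sketch (hypothetical dataset; ZFS_MAX_DATASET_NAME_LEN is assumed to
come from <sys/fs/zfs.h>, as in the library code above):

	#include <libzfs_core.h>
	#include <sys/fs/zfs.h>
	#include <stdio.h>

	static void
	rollback_example(void)
	{
		char snap[ZFS_MAX_DATASET_NAME_LEN];

		if (lzc_rollback("tank/home", snap, sizeof (snap)) == 0)
			(void) printf("rolled back to %s\n", snap);
	}
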
+
+/*
+ * Creates bookmarks.
+ *
+ * The bookmarks nvlist maps from name of the bookmark (e.g. "pool/fs#bmark") to
+ * the name of the snapshot (e.g. "pool/fs@snap").  All the bookmarks and
+ * snapshots must be in the same pool.
+ *
+ * The returned results nvlist will have an entry for each bookmark that failed.
+ * The value will be the (int32) error code.
+ *
+ * The return value will be 0 if all bookmarks were created; otherwise it will
+ * be the errno of an (undetermined) bookmark that failed.
+ */
+int
+lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist)
+{
+	nvpair_t *elem;
+	int error;
+	char pool[ZFS_MAX_DATASET_NAME_LEN];
+
+	/* determine the pool name */
+	elem = nvlist_next_nvpair(bookmarks, NULL);
+	if (elem == NULL)
+		return (0);
+	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
+	pool[strcspn(pool, "/#")] = '\0';
+
+	error = lzc_ioctl(ZFS_IOC_BOOKMARK, pool, bookmarks, errlist);
+
+	return (error);
+}
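
A hedged sketch (names hypothetical) showing the bookmark-to-snapshot mapping
the comment above describes:

	#include <libzfs_core.h>
	#include <libnvpair.h>

	static int
	bookmark_example(void)
	{
		nvlist_t *bmarks = fnvlist_alloc();
		nvlist_t *errlist = NULL;
		int error;

		/* Map new bookmark name -> existing snapshot name. */
		fnvlist_add_string(bmarks, "tank/home#monday",
		    "tank/home@monday");
		error = lzc_bookmark(bmarks, &errlist);
		nvlist_free(errlist);
		nvlist_free(bmarks);
		return (error);
	}
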
+
+/*
+ * Retrieve bookmarks.
+ *
+ * Retrieve the list of bookmarks for the given file system. The props
+ * parameter is an nvlist of property names (with no values) that will be
+ * returned for each bookmark.
+ *
+ * The following are valid properties on bookmarks, all of which are numbers
+ * (represented as uint64 in the nvlist):
+ *
+ * "guid" - globally unique identifier of the snapshot it refers to
+ * "createtxg" - txg when the snapshot it refers to was created
+ * "creation" - timestamp when the snapshot it refers to was created
+ *
+ * The format of the returned nvlist is as follows:
+ * <short name of bookmark> -> {
+ *     <name of property> -> {
+ *         "value" -> uint64
+ *     }
+ *  }
+ */
+int
+lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks)
+{
+	return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks));
+}
+
+/*
+ * Destroys bookmarks.
+ *
+ * The keys in the bmarks nvlist are the bookmarks to be destroyed.
+ * They must all be in the same pool.  Bookmarks are specified as
+ * <fs>#<bmark>.
+ *
+ * Bookmarks that do not exist will be silently ignored.
+ *
+ * The return value will be 0 if all bookmarks that existed were destroyed.
+ *
+ * Otherwise the return value will be the errno of an (undetermined) bookmark
+ * that failed, no bookmarks will be destroyed, and the errlist will have an
+ * entry for each bookmark that failed.  The value in the errlist will be
+ * the (int32) error code.
+ */
+int
+lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist)
+{
+	nvpair_t *elem;
+	int error;
+	char pool[ZFS_MAX_DATASET_NAME_LEN];
+
+	/* determine the pool name */
+	elem = nvlist_next_nvpair(bmarks, NULL);
+	if (elem == NULL)
+		return (0);
+	(void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
+	pool[strcspn(pool, "/#")] = '\0';
+
+	error = lzc_ioctl(ZFS_IOC_DESTROY_BOOKMARKS, pool, bmarks, errlist);
+
+	return (error);
+}
Index: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.h
===================================================================
RCS file: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.h
diff -N src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.h	3 Dec 2016 17:05:32 -0000
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ */
+
+#ifndef	_LIBZFS_CORE_H
+#define	_LIBZFS_CORE_H
+
+#include <libnvpair.h>
+#include <sys/param.h>
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+int libzfs_core_init(void);
+void libzfs_core_fini(void);
+
+/*
+ * NB: this type should be kept binary compatible with dmu_objset_type_t.
+ */
+enum lzc_dataset_type {
+	LZC_DATSET_TYPE_ZFS = 2,
+	LZC_DATSET_TYPE_ZVOL
+};
+
+int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **);
+int lzc_create(const char *, enum lzc_dataset_type, nvlist_t *);
+int lzc_clone(const char *, const char *, nvlist_t *);
+int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **);
+int lzc_bookmark(nvlist_t *, nvlist_t **);
+int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **);
+int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **);
+
+int lzc_snaprange_space(const char *, const char *, uint64_t *);
+
+int lzc_hold(nvlist_t *, int, nvlist_t **);
+int lzc_release(nvlist_t *, nvlist_t **);
+int lzc_get_holds(const char *, nvlist_t **);
+
+enum lzc_send_flags {
+	LZC_SEND_FLAG_EMBED_DATA = 1 << 0,
+	LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1
+};
+
+int lzc_send(const char *, const char *, int, enum lzc_send_flags);
+int lzc_send_resume(const char *, const char *, int,
+    enum lzc_send_flags, uint64_t, uint64_t);
+int lzc_send_space(const char *, const char *, uint64_t *);
+
+struct dmu_replay_record;
+
+int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int);
+int lzc_receive_resumable(const char *, nvlist_t *, const char *,
+    boolean_t, int);
+int lzc_receive_with_header(const char *, nvlist_t *, const char *, boolean_t,
+    boolean_t, int, const struct dmu_replay_record *);
+
+boolean_t lzc_exists(const char *);
+
+int lzc_rollback(const char *, char *, int);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LIBZFS_CORE_H */
Index: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.c
===================================================================
RCS file: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.c
diff -N src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.c	23 Mar 2013 15:31:39 -0000
@@ -0,0 +1,189 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ */
+
+#include <sys/zfs_ioctl.h>
+#include <zfs_ioctl_compat.h>
+#include "libzfs_core_compat.h"
+
+extern int zfs_ioctl_version;
+
+int
+lzc_compat_pre(zfs_cmd_t *zc, zfs_ioc_t *ioc, nvlist_t **source)
+{
+	nvlist_t *nvl = NULL;
+	nvpair_t *pair, *hpair;
+	char *buf, *val;
+	zfs_ioc_t vecnum;
+	uint32_t type32;
+	int32_t cleanup_fd;
+	int error = 0;
+	int pos;
+
+	if (zfs_ioctl_version >= ZFS_IOCVER_LZC)
+		return (0);
+
+	vecnum = *ioc;
+
+	switch (vecnum) {
+	case ZFS_IOC_CREATE:
+		type32 = fnvlist_lookup_int32(*source, "type");
+		zc->zc_objset_type = (uint64_t)type32;
+		nvlist_lookup_nvlist(*source, "props", &nvl);
+		*source = nvl;
+	break;
+	case ZFS_IOC_CLONE:
+		buf = fnvlist_lookup_string(*source, "origin");
+		strlcpy(zc->zc_value, buf, MAXPATHLEN);
+		nvlist_lookup_nvlist(*source, "props", &nvl);
+		*ioc = ZFS_IOC_CREATE;
+		*source = nvl;
+	break;
+	case ZFS_IOC_SNAPSHOT:
+		nvl = fnvlist_lookup_nvlist(*source, "snaps");
+		pair = nvlist_next_nvpair(nvl, NULL);
+		if (pair != NULL) {
+			buf = nvpair_name(pair);
+			pos = strcspn(buf, "@");
+			strlcpy(zc->zc_name, buf, pos + 1);
+			strlcpy(zc->zc_value, buf + pos + 1, MAXPATHLEN);
+		} else
+			error = EINVAL;
+		/* old kernel cannot create multiple snapshots */
+		if (!error && nvlist_next_nvpair(nvl, pair) != NULL)
+			error = EOPNOTSUPP;
+		nvlist_free(nvl);
+		nvl = NULL;
+		nvlist_lookup_nvlist(*source, "props", &nvl);
+		*source = nvl;
+	break;
+	case ZFS_IOC_SPACE_SNAPS:
+		buf = fnvlist_lookup_string(*source, "firstsnap");
+		strlcpy(zc->zc_value, buf, MAXPATHLEN);
+	break;
+	case ZFS_IOC_DESTROY_SNAPS:
+		nvl = fnvlist_lookup_nvlist(*source, "snaps");
+		pair = nvlist_next_nvpair(nvl, NULL);
+		if (pair != NULL) {
+			buf = nvpair_name(pair);
+			pos = strcspn(buf, "@");
+			strlcpy(zc->zc_name, buf, pos + 1);
+		} else
+			error = EINVAL;
+		/* old kernel cannot atomically destroy multiple snaps */
+		if (!error && nvlist_next_nvpair(nvl, pair) != NULL)
+			error = EOPNOTSUPP;
+		*source = nvl;
+	break;
+	case ZFS_IOC_HOLD:
+		nvl = fnvlist_lookup_nvlist(*source, "holds");
+		pair = nvlist_next_nvpair(nvl, NULL);
+		if (pair != NULL) {
+			buf = nvpair_name(pair);
+			pos = strcspn(buf, "@");
+			strlcpy(zc->zc_name, buf, pos + 1);
+			strlcpy(zc->zc_value, buf + pos + 1, MAXPATHLEN);
+			if (nvpair_value_string(pair, &val) == 0)
+				strlcpy(zc->zc_string, val, MAXNAMELEN);
+			else
+				error = EINVAL;
+		} else
+			error = EINVAL;
+		/* old kernel cannot atomically create multiple holds */
+		if (!error && nvlist_next_nvpair(nvl, pair) != NULL)
+			error = EOPNOTSUPP;
+		nvlist_free(nvl);
+		if (nvlist_lookup_int32(*source, "cleanup_fd",
+		    &cleanup_fd) == 0)
+			zc->zc_cleanup_fd = cleanup_fd;
+		else
+			zc->zc_cleanup_fd = -1;
+	break;
+	case ZFS_IOC_RELEASE:
+		pair = nvlist_next_nvpair(*source, NULL);
+		if (pair != NULL) {
+			buf = nvpair_name(pair);
+			pos = strcspn(buf, "@");
+			strlcpy(zc->zc_name, buf, pos + 1);
+			strlcpy(zc->zc_value, buf + pos + 1, MAXPATHLEN);
+			if (nvpair_value_nvlist(pair, &nvl) == 0) {
+				hpair = nvlist_next_nvpair(nvl, NULL);
+				if (hpair != NULL)
+					strlcpy(zc->zc_string,
+					    nvpair_name(hpair), MAXNAMELEN);
+				else
+					error = EINVAL;
+				if (!error && nvlist_next_nvpair(nvl,
+				    hpair) != NULL)
+					error = EOPNOTSUPP;
+			} else
+				error = EINVAL;
+		} else
+			error = EINVAL;
+		/* old kernel cannot atomically release multiple holds */
+		if (!error && nvlist_next_nvpair(nvl, pair) != NULL)
+			error = EOPNOTSUPP;
+	break;
+	}
+
+	return (error);
+}
+
+void
+lzc_compat_post(zfs_cmd_t *zc, const zfs_ioc_t ioc)
+{
+	if (zfs_ioctl_version >= ZFS_IOCVER_LZC)
+		return;
+
+	switch (ioc) {
+	case ZFS_IOC_CREATE:
+	case ZFS_IOC_CLONE:
+	case ZFS_IOC_SNAPSHOT:
+	case ZFS_IOC_SPACE_SNAPS:
+	case ZFS_IOC_DESTROY_SNAPS:
+		zc->zc_nvlist_dst_filled = B_FALSE;
+	break;
+	}
+}
+
+int
+lzc_compat_outnvl(zfs_cmd_t *zc, const zfs_ioc_t ioc, nvlist_t **outnvl)
+{
+	nvlist_t *nvl;
+
+	if (zfs_ioctl_version >= ZFS_IOCVER_LZC)
+		return (0);
+
+	switch (ioc) {
+	case ZFS_IOC_SPACE_SNAPS:
+		nvl = fnvlist_alloc();
+		fnvlist_add_uint64(nvl, "used", zc->zc_cookie);
+		fnvlist_add_uint64(nvl, "compressed", zc->zc_objset_type);
+		fnvlist_add_uint64(nvl, "uncompressed", zc->zc_perm_action);
+		*outnvl = nvl;
+	break;
+	}
+
+	return (0);
+}
Index: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.h
===================================================================
RCS file: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.h
diff -N src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.h	23 Mar 2013 15:31:39 -0000
@@ -0,0 +1,47 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 by Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ */
+
+#ifndef	_LIBZFS_CORE_COMPAT_H
+#define	_LIBZFS_CORE_COMPAT_H
+
+#include <libnvpair.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ioctl.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+int lzc_compat_pre(zfs_cmd_t *, zfs_ioc_t *, nvlist_t **);
+void lzc_compat_post(zfs_cmd_t *, const zfs_ioc_t);
+int lzc_compat_outnvl(zfs_cmd_t *, const zfs_ioc_t, nvlist_t **);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LIBZFS_CORE_COMPAT_H */
Index: src/external/cddl/osnet/dist/lib/libzpool/common/kernel.c
===================================================================
RCS file: src/external/cddl/osnet/dist/lib/libzpool/common/kernel.c
diff -N src/external/cddl/osnet/dist/lib/libzpool/common/kernel.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/lib/libzpool/common/kernel.c	30 Apr 2017 04:17:20 -0000
@@ -0,0 +1,1210 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
+ */
+
+#include <assert.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <libgen.h>
+#include <sys/spa.h>
+#include <sys/stat.h>
+#include <sys/processor.h>
+#include <sys/zfs_context.h>
+#include <sys/rrwlock.h>
+#include <sys/zmod.h>
+#include <sys/utsname.h>
+#include <sys/systeminfo.h>
+
+/*
+ * Emulation of kernel services in userland.
+ */
+
+#ifndef __FreeBSD__
+int aok;
+#endif
+uint64_t physmem;
+vnode_t *rootdir = (vnode_t *)0xabcd1234;
+char hw_serial[HW_HOSTID_LEN];
+#ifdef illumos
+kmutex_t cpu_lock;
+#endif
+
+/* If set, all blocks read will be copied to the specified directory. */
+char *vn_dumpdir = NULL;
+
+struct utsname utsname = {
+	"userland", "libzpool", "1", "1", "na"
+};
+
+/* this only exists to have its address taken */
+struct proc p0;
+
+/*
+ * =========================================================================
+ * threads
+ * =========================================================================
+ */
+/*ARGSUSED*/
+kthread_t *
+zk_thread_create(void (*func)(), void *arg)
+{
+	thread_t tid;
+
+	VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED,
+	    &tid) == 0);
+
+	return ((void *)(uintptr_t)tid);
+}
+
+/*
+ * =========================================================================
+ * kstats
+ * =========================================================================
+ */
+/*ARGSUSED*/
+kstat_t *
+kstat_create(char *module, int instance, char *name, char *class,
+    uchar_t type, ulong_t ndata, uchar_t ks_flag)
+{
+	return (NULL);
+}
+
+/*ARGSUSED*/
+void
+kstat_named_init(kstat_named_t *knp, const char *name, uchar_t type)
+{}
+
+/*ARGSUSED*/
+void
+kstat_install(kstat_t *ksp)
+{}
+
+/*ARGSUSED*/
+void
+kstat_delete(kstat_t *ksp)
+{}
+
+/*
+ * =========================================================================
+ * mutexes
+ * =========================================================================
+ */
+void
+zmutex_init(kmutex_t *mp)
+{
+	mp->m_owner = NULL;
+	mp->initialized = B_TRUE;
+	(void) _mutex_init(&mp->m_lock, USYNC_THREAD, NULL);
+}
+
+void
+zmutex_destroy(kmutex_t *mp)
+{
+	ASSERT(mp->initialized == B_TRUE);
+	ASSERT(mp->m_owner == NULL);
+	(void) _mutex_destroy(&(mp)->m_lock);
+	mp->m_owner = (void *)-1UL;
+	mp->initialized = B_FALSE;
+}
+
+int
+zmutex_owned(kmutex_t *mp)
+{
+	ASSERT(mp->initialized == B_TRUE);
+
+	return (mp->m_owner == curthread);
+}
+
+void
+mutex_enter(kmutex_t *mp)
+{
+	ASSERT(mp->initialized == B_TRUE);
+	ASSERT(mp->m_owner != (void *)-1UL);
+	ASSERT(mp->m_owner != curthread);
+	VERIFY(mutex_lock(&mp->m_lock) == 0);
+	ASSERT(mp->m_owner == NULL);
+	mp->m_owner = curthread;
+}
+
+int
+mutex_tryenter(kmutex_t *mp)
+{
+	ASSERT(mp->initialized == B_TRUE);
+	ASSERT(mp->m_owner != (void *)-1UL);
+	if (0 == mutex_trylock(&mp->m_lock)) {
+		ASSERT(mp->m_owner == NULL);
+		mp->m_owner = curthread;
+		return (1);
+	} else {
+		return (0);
+	}
+}
+
+void
+mutex_exit(kmutex_t *mp)
+{
+	ASSERT(mp->initialized == B_TRUE);
+	ASSERT(mutex_owner(mp) == curthread);
+	mp->m_owner = NULL;
+	VERIFY(mutex_unlock(&mp->m_lock) == 0);
+}
+
+void *
+mutex_owner(kmutex_t *mp)
+{
+	ASSERT(mp->initialized == B_TRUE);
+	return (mp->m_owner);
+}
+
+/*
+ * =========================================================================
+ * rwlocks
+ * =========================================================================
+ */
+/*ARGSUSED*/
+void
+rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
+{
+	rwlock_init(&rwlp->rw_lock, USYNC_THREAD, NULL);
+	rwlp->rw_owner = NULL;
+	rwlp->initialized = B_TRUE;
+	rwlp->rw_count = 0;
+}
+
+void
+rw_destroy(krwlock_t *rwlp)
+{
+	ASSERT(rwlp->rw_count == 0);
+	rwlock_destroy(&rwlp->rw_lock);
+	rwlp->rw_owner = (void *)-1UL;
+	rwlp->initialized = B_FALSE;
+}
+
+void
+rw_enter(krwlock_t *rwlp, krw_t rw)
+{
+	//ASSERT(!RW_LOCK_HELD(rwlp));
+	ASSERT(rwlp->initialized == B_TRUE);
+	ASSERT(rwlp->rw_owner != (void *)-1UL);
+	ASSERT(rwlp->rw_owner != curthread);
+
+	if (rw == RW_READER) {
+		VERIFY(rw_rdlock(&rwlp->rw_lock) == 0);
+		ASSERT(rwlp->rw_count >= 0);
+		atomic_add_int(&rwlp->rw_count, 1);
+	} else {
+		VERIFY(rw_wrlock(&rwlp->rw_lock) == 0);
+		ASSERT(rwlp->rw_count == 0);
+		rwlp->rw_count = -1;
+		rwlp->rw_owner = curthread;
+	}
+}
+
+void
+rw_exit(krwlock_t *rwlp)
+{
+	ASSERT(rwlp->initialized == B_TRUE);
+	ASSERT(rwlp->rw_owner != (void *)-1UL);
+
+	if (rwlp->rw_owner == curthread) {
+		/* Write locked. */
+		ASSERT(rwlp->rw_count == -1);
+		rwlp->rw_count = 0;
+		rwlp->rw_owner = NULL;
+	} else {
+		/* Read locked. */
+		ASSERT(rwlp->rw_count > 0);
+		atomic_add_int(&rwlp->rw_count, -1);
+	}
+	VERIFY(rw_unlock(&rwlp->rw_lock) == 0);
+}
+
+int
+rw_tryenter(krwlock_t *rwlp, krw_t rw)
+{
+	int rv;
+
+	ASSERT(rwlp->initialized == B_TRUE);
+	ASSERT(rwlp->rw_owner != (void *)-1UL);
+	ASSERT(rwlp->rw_owner != curthread);
+
+	if (rw == RW_READER)
+		rv = rw_tryrdlock(&rwlp->rw_lock);
+	else
+		rv = rw_trywrlock(&rwlp->rw_lock);
+
+	if (rv == 0) {
+		ASSERT(rwlp->rw_owner == NULL);
+		if (rw == RW_READER) {
+			ASSERT(rwlp->rw_count >= 0);
+			atomic_add_int(&rwlp->rw_count, 1);
+		} else {
+			ASSERT(rwlp->rw_count == 0);
+			rwlp->rw_count = -1;
+			rwlp->rw_owner = curthread;
+		}
+		return (1);
+	}
+
+	return (0);
+}
+
+/*ARGSUSED*/
+int
+rw_tryupgrade(krwlock_t *rwlp)
+{
+	ASSERT(rwlp->initialized == B_TRUE);
+	ASSERT(rwlp->rw_owner != (void *)-1UL);
+
+	return (0);
+}
+
+int
+rw_lock_held(krwlock_t *rwlp)
+{
+
+	return (rwlp->rw_count != 0);
+}
+
+/*
+ * =========================================================================
+ * condition variables
+ * =========================================================================
+ */
+/*ARGSUSED*/
+void
+cv_init(kcondvar_t *cv, char *name, int type, void *arg)
+{
+	VERIFY(cond_init(cv, name, NULL) == 0);
+}
+
+void
+cv_destroy(kcondvar_t *cv)
+{
+	VERIFY(cond_destroy(cv) == 0);
+}
+
+void
+cv_wait(kcondvar_t *cv, kmutex_t *mp)
+{
+	ASSERT(mutex_owner(mp) == curthread);
+	mp->m_owner = NULL;
+	int ret = cond_wait(cv, &mp->m_lock);
+	VERIFY(ret == 0 || ret == EINTR);
+	mp->m_owner = curthread;
+}
+
+clock_t
+cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
+{
+	int error;
+	struct timespec ts;
+	struct timeval tv;
+	clock_t delta;
+
+	abstime += ddi_get_lbolt();
+top:
+	delta = abstime - ddi_get_lbolt();
+	if (delta <= 0)
+		return (-1);
+
+	if (gettimeofday(&tv, NULL) != 0)
+		assert(!"gettimeofday() failed");
+
+	ts.tv_sec = tv.tv_sec + delta / hz;
+	ts.tv_nsec = tv.tv_usec * 1000 + (delta % hz) * (NANOSEC / hz);
+	ASSERT(ts.tv_nsec >= 0);
+
+	if (ts.tv_nsec >= NANOSEC) {
+		ts.tv_sec++;
+		ts.tv_nsec -= NANOSEC;
+	}
+
+	ASSERT(mutex_owner(mp) == curthread);
+	mp->m_owner = NULL;
+	error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
+	mp->m_owner = curthread;
+
+	if (error == EINTR)
+		goto top;
+
+	if (error == ETIMEDOUT)
+		return (-1);
+
+	ASSERT(error == 0);
+
+	return (1);
+}
+
+/*ARGSUSED*/
+clock_t
+cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+    int flag)
+{
+	int error;
+	timestruc_t ts;
+	hrtime_t delta;
+
+	ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE);
+
+top:
+	delta = tim;
+	if (flag & CALLOUT_FLAG_ABSOLUTE)
+		delta -= gethrtime();
+
+	if (delta <= 0)
+		return (-1);
+
+	ts.tv_sec = delta / NANOSEC;
+	ts.tv_nsec = delta % NANOSEC;
+
+	ASSERT(mutex_owner(mp) == curthread);
+	mp->m_owner = NULL;
+	error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
+	mp->m_owner = curthread;
+
+	if (error == ETIMEDOUT)
+		return (-1);
+
+	if (error == EINTR)
+		goto top;
+
+	ASSERT(error == 0);
+
+	return (1);
+}
+
+void
+cv_signal(kcondvar_t *cv)
+{
+	VERIFY(cond_signal(cv) == 0);
+}
+
+void
+cv_broadcast(kcondvar_t *cv)
+{
+	VERIFY(cond_broadcast(cv) == 0);
+}
+
+/*
+ * =========================================================================
+ * vnode operations
+ * =========================================================================
+ */
+/*
+ * Note: for the xxxat() versions of these functions, we assume that the
+ * starting vp is always rootdir (which is true for spa_directory.c, the only
+ * ZFS consumer of these interfaces).  We assert this is true, and then emulate
+ * them by adding '/' in front of the path.
+ */
+
+/*ARGSUSED*/
+int
+vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
+{
+	int fd;
+	int dump_fd;
+	vnode_t *vp;
+	int old_umask;
+	char realpath[MAXPATHLEN];
+	struct stat64 st;
+
+	/*
+	 * If we're accessing a real disk from userland, we need to use
+	 * the character interface to avoid caching.  This is particularly
+	 * important if we're trying to look at a real in-kernel storage
+	 * pool from userland, e.g. via zdb, because otherwise we won't
+	 * see the changes occurring under the segmap cache.
+	 * On the other hand, the stupid character device returns zero
+	 * for its size.  So -- gag -- we open the block device to get
+	 * its size, and remember it for subsequent VOP_GETATTR().
+	 */
+	if (strncmp(path, "/dev/", 5) == 0) {
+		char *dsk;
+		fd = open64(path, O_RDONLY);
+		if (fd == -1)
+			return (errno);
+		if (fstat64(fd, &st) == -1) {
+			close(fd);
+			return (errno);
+		}
+		close(fd);
+		(void) sprintf(realpath, "%s", path);
+		dsk = strstr(path, "/dsk/");
+		if (dsk != NULL)
+			(void) sprintf(realpath + (dsk - path) + 1, "r%s",
+			    dsk + 1);
+	} else {
+		(void) sprintf(realpath, "%s", path);
+		if (!(flags & FCREAT) && stat64(realpath, &st) == -1)
+			return (errno);
+	}
+
+	if (flags & FCREAT)
+		old_umask = umask(0);
+
+	/*
+	 * The construct 'flags - FREAD' conveniently maps combinations of
+	 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
+	 */
+	fd = open64(realpath, flags - FREAD, mode);
+
+	if (flags & FCREAT)
+		(void) umask(old_umask);
+
+	if (vn_dumpdir != NULL) {
+		char dumppath[MAXPATHLEN];
+		(void) snprintf(dumppath, sizeof (dumppath),
+		    "%s/%s", vn_dumpdir, basename(realpath));
+		dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666);
+		if (dump_fd == -1)
+			return (errno);
+	} else {
+		dump_fd = -1;
+	}
+
+	if (fd == -1)
+		return (errno);
+
+	if (fstat64(fd, &st) == -1) {
+		close(fd);
+		return (errno);
+	}
+
+	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
+
+	*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
+
+	vp->v_fd = fd;
+	vp->v_size = st.st_size;
+	vp->v_path = spa_strdup(path);
+	vp->v_dump_fd = dump_fd;
+
+	return (0);
+}
+
+/*ARGSUSED*/
+int
+vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
+    int x3, vnode_t *startvp, int fd)
+{
+	char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL);
+	int ret;
+
+	ASSERT(startvp == rootdir);
+	(void) sprintf(realpath, "/%s", path);
+
+	/* fd ignored for now, need if want to simulate nbmand support */
+	ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
+
+	umem_free(realpath, strlen(path) + 2);
+
+	return (ret);
+}
+
+/*ARGSUSED*/
+int
+vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
+    int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
+{
+	ssize_t iolen, split;
+
+	if (uio == UIO_READ) {
+		iolen = pread64(vp->v_fd, addr, len, offset);
+		if (vp->v_dump_fd != -1) {
+			int status =
+			    pwrite64(vp->v_dump_fd, addr, iolen, offset);
+			ASSERT(status != -1);
+		}
+	} else {
+		/*
+		 * To simulate partial disk writes, we split writes into two
+		 * system calls so that the process can be killed in between.
+		 */
+		int sectors = len >> SPA_MINBLOCKSHIFT;
+		split = (sectors > 0 ? rand() % sectors : 0) <<
+		    SPA_MINBLOCKSHIFT;
+		iolen = pwrite64(vp->v_fd, addr, split, offset);
+		iolen += pwrite64(vp->v_fd, (char *)addr + split,
+		    len - split, offset + split);
+	}
+
+	if (iolen == -1)
+		return (errno);
+	if (residp)
+		*residp = len - iolen;
+	else if (iolen != len)
+		return (EIO);
+	return (0);
+}
+
+void
+vn_close(vnode_t *vp, int openflag, cred_t *cr, kthread_t *td)
+{
+	close(vp->v_fd);
+	if (vp->v_dump_fd != -1)
+		close(vp->v_dump_fd);
+	spa_strfree(vp->v_path);
+	umem_free(vp, sizeof (vnode_t));
+}
+
+/*
+ * At a minimum we need to update the size since vdev_reopen()
+ * will no longer call vn_openat().
+ */
+int
+fop_getattr(vnode_t *vp, vattr_t *vap)
+{
+	struct stat64 st;
+
+	if (fstat64(vp->v_fd, &st) == -1) {
+		close(vp->v_fd);
+		return (errno);
+	}
+
+	vap->va_size = st.st_size;
+	return (0);
+}
+
+#ifdef ZFS_DEBUG
+
+/*
+ * =========================================================================
+ * Figure out which debugging statements to print
+ * =========================================================================
+ */
+
+static char *dprintf_string;
+static int dprintf_print_all;
+
+int
+dprintf_find_string(const char *string)
+{
+	char *tmp_str = dprintf_string;
+	int len = strlen(string);
+
+	/*
+	 * Find out if this is a string we want to print.
+	 * String format: file1.c,function_name1,file2.c,file3.c
+	 */
+
+	while (tmp_str != NULL) {
+		if (strncmp(tmp_str, string, len) == 0 &&
+		    (tmp_str[len] == ',' || tmp_str[len] == '\0'))
+			return (1);
+		tmp_str = strchr(tmp_str, ',');
+		if (tmp_str != NULL)
+			tmp_str++; /* Get rid of , */
+	}
+	return (0);
+}
+
+void
+dprintf_setup(int *argc, char **argv)
+{
+	int i, j;
+
+	/*
+	 * Debugging can be specified two ways: by setting the
+	 * environment variable ZFS_DEBUG, or by including a
+	 * "debug=..."  argument on the command line.  The command
+	 * line setting overrides the environment variable.
+	 */
+
+	for (i = 1; i < *argc; i++) {
+		int len = strlen("debug=");
+		/* First look for a command line argument */
+		if (strncmp("debug=", argv[i], len) == 0) {
+			dprintf_string = argv[i] + len;
+			/* Remove from args */
+			for (j = i; j < *argc; j++)
+				argv[j] = argv[j+1];
+			argv[j] = NULL;
+			(*argc)--;
+		}
+	}
+
+	if (dprintf_string == NULL) {
+		/* Look for ZFS_DEBUG environment variable */
+		dprintf_string = getenv("ZFS_DEBUG");
+	}
+
+	/*
+	 * Are we just turning on all debugging?
+	 */
+	if (dprintf_find_string("on"))
+		dprintf_print_all = 1;
+
+	if (dprintf_string != NULL)
+		zfs_flags |= ZFS_DEBUG_DPRINTF;
+}
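
For example (a hypothetical selection): running a libzpool consumer such as
zdb with ZFS_DEBUG=arc.c,dbuf_read in the environment, or debug=arc.c,dbuf_read
on its command line, prints only dprintf() output originating in arc.c or in
the function dbuf_read; ZFS_DEBUG=on prints everything, and the "pid", "tid",
"time", and "long" tokens recognized by __dprintf() below add per-line
decoration.
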
+
+int
+sysctl_handle_64(SYSCTL_HANDLER_ARGS)
+{
+	return (0);
+}
+
+/*
+ * =========================================================================
+ * debug printfs
+ * =========================================================================
+ */
+void
+__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
+{
+	const char *newfile;
+	va_list adx;
+
+	/*
+	 * Get rid of annoying "../common/" prefix to filename.
+	 */
+	newfile = strrchr(file, '/');
+	if (newfile != NULL) {
+		newfile = newfile + 1; /* Get rid of leading / */
+	} else {
+		newfile = file;
+	}
+
+	if (dprintf_print_all ||
+	    dprintf_find_string(newfile) ||
+	    dprintf_find_string(func)) {
+		/* Print out just the function name if requested */
+		flockfile(stdout);
+		if (dprintf_find_string("pid"))
+			(void) printf("%d ", getpid());
+		if (dprintf_find_string("tid"))
+			(void) printf("%lu ", thr_self());
+#if 0
+		if (dprintf_find_string("cpu"))
+			(void) printf("%u ", getcpuid());
+#endif
+		if (dprintf_find_string("time"))
+			(void) printf("%llu ", gethrtime());
+		if (dprintf_find_string("long"))
+			(void) printf("%s, line %d: ", newfile, line);
+		(void) printf("%s: ", func);
+		va_start(adx, fmt);
+		(void) vprintf(fmt, adx);
+		va_end(adx);
+		funlockfile(stdout);
+	}
+}
+
+#endif /* ZFS_DEBUG */
+
+/*
+ * =========================================================================
+ * cmn_err() and panic()
+ * =========================================================================
+ */
+static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
+static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
+
+void
+vpanic(const char *fmt, va_list adx)
+{
+	(void) fprintf(stderr, "error: ");
+	(void) vfprintf(stderr, fmt, adx);
+	(void) fprintf(stderr, "\n");
+
+	abort();	/* think of it as a "user-level crash dump" */
+}
+
+void
+panic(const char *fmt, ...)
+{
+	va_list adx;
+
+	va_start(adx, fmt);
+	vpanic(fmt, adx);
+	va_end(adx);
+}
+
+void
+vcmn_err(int ce, const char *fmt, va_list adx)
+{
+	if (ce == CE_PANIC)
+		vpanic(fmt, adx);
+	if (ce != CE_NOTE) {	/* suppress noise in userland stress testing */
+		(void) fprintf(stderr, "%s", ce_prefix[ce]);
+		(void) vfprintf(stderr, fmt, adx);
+		(void) fprintf(stderr, "%s", ce_suffix[ce]);
+	}
+}
+
+/*PRINTFLIKE2*/
+void
+cmn_err(int ce, const char *fmt, ...)
+{
+	va_list adx;
+
+	va_start(adx, fmt);
+	vcmn_err(ce, fmt, adx);
+	va_end(adx);
+}
+
+/*
+ * =========================================================================
+ * kobj interfaces
+ * =========================================================================
+ */
+struct _buf *
+kobj_open_file(char *name)
+{
+	struct _buf *file;
+	vnode_t *vp;
+
+	/* set vp as the _fd field of the file */
+	if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir,
+	    -1) != 0)
+		return ((void *)-1UL);
+
+	file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
+	file->_fd = (intptr_t)vp;
+	return (file);
+}
+
+int
+kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
+{
+	ssize_t resid;
+
+	vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
+	    UIO_SYSSPACE, 0, 0, 0, &resid);
+
+	return (size - resid);
+}
+
+void
+kobj_close_file(struct _buf *file)
+{
+	vn_close((vnode_t *)file->_fd, 0, NULL, NULL);
+	umem_free(file, sizeof (struct _buf));
+}
+
+int
+kobj_get_filesize(struct _buf *file, uint64_t *size)
+{
+	struct stat64 st;
+	vnode_t *vp = (vnode_t *)file->_fd;
+
+	if (fstat64(vp->v_fd, &st) == -1) {
+		vn_close(vp, 0, NULL, NULL);
+		return (errno);
+	}
+	*size = st.st_size;
+	return (0);
+}
+
+/*
+ * =========================================================================
+ * misc routines
+ * =========================================================================
+ */
+
+void
+delay(clock_t ticks)
+{
+	poll(0, 0, ticks * (1000 / hz));
+}
+
+#if 0
+/*
+ * Find highest one bit set.
+ *	Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ */
+int
+highbit64(uint64_t i)
+{
+	int h = 1;
+
+	if (i == 0)
+		return (0);
+	if (i & 0xffffffff00000000ULL) {
+		h += 32; i >>= 32;
+	}
+	if (i & 0xffff0000) {
+		h += 16; i >>= 16;
+	}
+	if (i & 0xff00) {
+		h += 8; i >>= 8;
+	}
+	if (i & 0xf0) {
+		h += 4; i >>= 4;
+	}
+	if (i & 0xc) {
+		h += 2; i >>= 2;
+	}
+	if (i & 0x2) {
+		h += 1;
+	}
+	return (h);
+}
+#endif
+
+static int random_fd = -1, urandom_fd = -1;
+
+static int
+random_get_bytes_common(uint8_t *ptr, size_t len, int fd)
+{
+	size_t resid = len;
+	ssize_t bytes;
+
+	ASSERT(fd != -1);
+
+	while (resid != 0) {
+		bytes = read(fd, ptr, resid);
+		ASSERT3S(bytes, >=, 0);
+		ptr += bytes;
+		resid -= bytes;
+	}
+
+	return (0);
+}
+
+int
+random_get_bytes(uint8_t *ptr, size_t len)
+{
+	return (random_get_bytes_common(ptr, len, random_fd));
+}
+
+int
+random_get_pseudo_bytes(uint8_t *ptr, size_t len)
+{
+	return (random_get_bytes_common(ptr, len, urandom_fd));
+}
+
+int
+ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
+{
+	char *end;
+
+	*result = strtoul(hw_serial, &end, base);
+	if (*result == 0)
+		return (errno);
+	return (0);
+}
+
+int
+ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
+{
+	char *end;
+
+	*result = strtoull(str, &end, base);
+	if (*result == 0)
+		return (errno);
+	return (0);
+}
+
+#ifndef __FreeBSD__
+/* ARGSUSED */
+cyclic_id_t
+cyclic_add(cyc_handler_t *hdlr, cyc_time_t *when)
+{
+	return (1);
+}
+
+/* ARGSUSED */
+void
+cyclic_remove(cyclic_id_t id)
+{
+}
+
+/* ARGSUSED */
+int
+cyclic_reprogram(cyclic_id_t id, hrtime_t expiration)
+{
+	return (1);
+}
+#endif
+
+/*
+ * =========================================================================
+ * kernel emulation setup & teardown
+ * =========================================================================
+ */
+static int
+umem_out_of_memory(void)
+{
+	char errmsg[] = "out of memory -- generating core dump\n";
+
+	write(fileno(stderr), errmsg, sizeof (errmsg));
+	abort();
+	return (0);
+}
+
+void
+kernel_init(int mode)
+{
+	extern uint_t rrw_tsd_key;
+
+	umem_nofail_callback(umem_out_of_memory);
+
+	physmem = sysconf(_SC_PHYS_PAGES);
+
+	dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
+	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
+
+	(void) snprintf(hw_serial, sizeof (hw_serial), "%lu",
+	    (mode & FWRITE) ? (unsigned long)gethostid() : 0);
+
+	VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1);
+	VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1);
+
+	system_taskq_init();
+
+#ifdef illumos
+	mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL);
+#endif
+
+	spa_init(mode);
+
+	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
+}
+
+void
+kernel_fini(void)
+{
+	spa_fini();
+
+	system_taskq_fini();
+
+	close(random_fd);
+	close(urandom_fd);
+
+	random_fd = -1;
+	urandom_fd = -1;
+}
+
+int
+z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
+{
+	int ret;
+	uLongf len = *dstlen;
+
+	if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK)
+		*dstlen = (size_t)len;
+
+	return (ret);
+}
+
+int
+z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen,
+    int level)
+{
+	int ret;
+	uLongf len = *dstlen;
+
+	if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK)
+		*dstlen = (size_t)len;
+
+	return (ret);
+}
+
+uid_t
+crgetuid(cred_t *cr)
+{
+	return (0);
+}
+
+uid_t
+crgetruid(cred_t *cr)
+{
+	return (0);
+}
+
+gid_t
+crgetgid(cred_t *cr)
+{
+	return (0);
+}
+
+int
+crgetngroups(cred_t *cr)
+{
+	return (0);
+}
+
+gid_t *
+crgetgroups(cred_t *cr)
+{
+	return (NULL);
+}
+
+int
+zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
+{
+	return (0);
+}
+
+int
+zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
+{
+	return (0);
+}
+
+int
+zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
+{
+	return (0);
+}
+
+ksiddomain_t *
+ksid_lookupdomain(const char *dom)
+{
+	ksiddomain_t *kd;
+
+	kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL);
+	kd->kd_name = spa_strdup(dom);
+	return (kd);
+}
+
+void
+ksiddomain_rele(ksiddomain_t *ksid)
+{
+	spa_strfree(ksid->kd_name);
+	umem_free(ksid, sizeof (ksiddomain_t));
+}
+
+/*
+ * Do not change the length of the returned string; it must be freed
+ * with strfree().
+ */
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+	int size;
+	va_list adx;
+	char *buf;
+
+	va_start(adx, fmt);
+	size = vsnprintf(NULL, 0, fmt, adx) + 1;
+	va_end(adx);
+
+	buf = kmem_alloc(size, KM_SLEEP);
+
+	va_start(adx, fmt);
+	size = vsnprintf(buf, size, fmt, adx);
+	va_end(adx);
+
+	return (buf);
+}
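
A one-line usage sketch (names illustrative; assumes the libzpool
zfs_context environment, where strfree() frees strlen() + 1 bytes).  Because
strfree() recomputes the size with strlen(), callers must not shorten or
extend the returned string, per the comment above:

	static void
	asprintf_example(void)
	{
		char *snapname = kmem_asprintf("%s@%s", "tank/home", "backup");

		/* ... use snapname ... */
		strfree(snapname);	/* size is derived via strlen() */
	}
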
+
+/* ARGSUSED */
+int
+zfs_onexit_fd_hold(int fd, minor_t *minorp)
+{
+	*minorp = 0;
+	return (0);
+}
+
+/* ARGSUSED */
+void
+zfs_onexit_fd_rele(int fd)
+{
+}
+
+/* ARGSUSED */
+int
+zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+    uint64_t *action_handle)
+{
+	return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
+{
+	return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
+{
+	return (0);
+}
+
+#ifdef __FreeBSD__
+/* ARGSUSED */
+int
+zvol_create_minors(const char *name)
+{
+	return (0);
+}
+#endif
+
+#ifdef illumos
+void
+bioinit(buf_t *bp)
+{
+	bzero(bp, sizeof (buf_t));
+}
+
+void
+biodone(buf_t *bp)
+{
+	if (bp->b_iodone != NULL) {
+		(*(bp->b_iodone))(bp);
+		return;
+	}
+	ASSERT((bp->b_flags & B_DONE) == 0);
+	bp->b_flags |= B_DONE;
+}
+
+void
+bioerror(buf_t *bp, int error)
+{
+	ASSERT(bp != NULL);
+	ASSERT(error >= 0);
+
+	if (error != 0) {
+		bp->b_flags |= B_ERROR;
+	} else {
+		bp->b_flags &= ~B_ERROR;
+	}
+	bp->b_error = error;
+}
+
+
+int
+geterror(struct buf *bp)
+{
+	int error = 0;
+
+	if (bp->b_flags & B_ERROR) {
+		error = bp->b_error;
+		if (!error)
+			error = EIO;
+	}
+	return (error);
+}
+#endif
Index: src/external/cddl/osnet/dist/lib/libzpool/common/taskq.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzpool/common/taskq.c,v
retrieving revision 1.4
diff -u -p -r1.4 taskq.c
--- src/external/cddl/osnet/dist/lib/libzpool/common/taskq.c	21 Jun 2013 16:22:45 -0000	1.4
+++ src/external/cddl/osnet/dist/lib/libzpool/common/taskq.c	19 Nov 2014 12:24:40 -0000
@@ -19,25 +19,25 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
 
 #include <sys/zfs_context.h>
 
 int taskq_now;
 taskq_t *system_taskq;
 
-typedef struct task {
-	struct task	*task_next;
-	struct task	*task_prev;
-	task_func_t	*task_func;
-	void		*task_arg;
-} task_t;
-
 #define	TASKQ_ACTIVE	0x00010000
+#define	TASKQ_NAMELEN	31
 
 struct taskq {
+	char		tq_name[TASKQ_NAMELEN + 1];
 	kmutex_t	tq_lock;
 	krwlock_t	tq_threadlock;
 	kcondvar_t	tq_dispatch_cv;
@@ -49,37 +49,46 @@ struct taskq {
 	int		tq_nalloc;
 	int		tq_minalloc;
 	int		tq_maxalloc;
-	task_t		*tq_freelist;
-	task_t		tq_task;
+	kcondvar_t	tq_maxalloc_cv;
+	int		tq_maxalloc_wait;
+	taskq_ent_t	*tq_freelist;
+	taskq_ent_t	tq_task;
 };
 
-static task_t *
+static taskq_ent_t *
 task_alloc(taskq_t *tq, int tqflags)
 {
-	task_t *t;
+	taskq_ent_t *t;
+	int rv;
 
-	if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
-		tq->tq_freelist = t->task_next;
+again:	if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
+		tq->tq_freelist = t->tqent_next;
 	} else {
-		mutex_exit(&tq->tq_lock);
 		if (tq->tq_nalloc >= tq->tq_maxalloc) {
-			if (!(tqflags & KM_SLEEP)) {
-				mutex_enter(&tq->tq_lock);
+			if (!(tqflags & KM_SLEEP))
 				return (NULL);
-			}
+
 			/*
 			 * We don't want to exceed tq_maxalloc, but we can't
 			 * wait for other tasks to complete (and thus free up
 			 * task structures) without risking deadlock with
 			 * the caller.  So, we just delay for one second
-			 * to throttle the allocation rate.
+			 * to throttle the allocation rate. If we have tasks
+			 * complete before one second timeout expires then
+			 * task_free will signal us and we will
+			 * immediately retry the allocation.
 			 */
-			xdelay(hz);
+			tq->tq_maxalloc_wait++;
+			rv = cv_timedwait(&tq->tq_maxalloc_cv,
+			    &tq->tq_lock, ddi_get_lbolt() + hz);
+			tq->tq_maxalloc_wait--;
+			if (rv > 0)
+				goto again;		/* signaled */
 		}
+		mutex_exit(&tq->tq_lock);
+
+		t = kmem_alloc(sizeof (taskq_ent_t), tqflags & KM_SLEEP);
 
-		/* Clean up TQ_FRONT from tqflags before passing it to kmem */
-		t = kmem_alloc(sizeof (task_t),
-		    tqflags & (KM_SLEEP | KM_NOSLEEP));
 		mutex_enter(&tq->tq_lock);
 		if (t != NULL)
 			tq->tq_nalloc++;
@@ -88,23 +97,26 @@ task_alloc(taskq_t *tq, int tqflags)
 }
 
 static void
-task_free(taskq_t *tq, task_t *t)
+task_free(taskq_t *tq, taskq_ent_t *t)
 {
 	if (tq->tq_nalloc <= tq->tq_minalloc) {
-		t->task_next = tq->tq_freelist;
+		t->tqent_next = tq->tq_freelist;
 		tq->tq_freelist = t;
 	} else {
 		tq->tq_nalloc--;
 		mutex_exit(&tq->tq_lock);
-		kmem_free(t, sizeof (task_t));
+		kmem_free(t, sizeof (taskq_ent_t));
 		mutex_enter(&tq->tq_lock);
 	}
+
+	if (tq->tq_maxalloc_wait)
+		cv_signal(&tq->tq_maxalloc_cv);
 }
 
 taskqid_t
 taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
 {
-	task_t *t;
+	taskq_ent_t *t;
 
 	if (taskq_now) {
 		func(arg);
@@ -118,26 +130,59 @@ taskq_dispatch(taskq_t *tq, task_func_t 
 		return (0);
 	}
 	if (tqflags & TQ_FRONT) {
-		t->task_next = tq->tq_task.task_next;
-		t->task_prev = &tq->tq_task;
+		t->tqent_next = tq->tq_task.tqent_next;
+		t->tqent_prev = &tq->tq_task;
 	} else {
-		t->task_next = &tq->tq_task;
-		t->task_prev = tq->tq_task.task_prev;
+		t->tqent_next = &tq->tq_task;
+		t->tqent_prev = tq->tq_task.tqent_prev;
 	}
-	t->task_next->task_prev = t;
-	t->task_prev->task_next = t;
-	t->task_func = func;
-	t->task_arg = arg;
+	t->tqent_next->tqent_prev = t;
+	t->tqent_prev->tqent_next = t;
+	t->tqent_func = func;
+	t->tqent_arg = arg;
+	t->tqent_flags = 0;
 	cv_signal(&tq->tq_dispatch_cv);
 	mutex_exit(&tq->tq_lock);
 	return (1);
 }
 
 void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+    taskq_ent_t *t)
+{
+	ASSERT(func != NULL);
+	ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC));
+
+	/*
+	 * Mark it as a prealloc'd task.  This is important
+	 * to ensure that we don't free it later.
+	 */
+	t->tqent_flags |= TQENT_FLAG_PREALLOC;
+	/*
+	 * Enqueue the task to the underlying queue.
+	 */
+	mutex_enter(&tq->tq_lock);
+
+	if (flags & TQ_FRONT) {
+		t->tqent_next = tq->tq_task.tqent_next;
+		t->tqent_prev = &tq->tq_task;
+	} else {
+		t->tqent_next = &tq->tq_task;
+		t->tqent_prev = tq->tq_task.tqent_prev;
+	}
+	t->tqent_next->tqent_prev = t;
+	t->tqent_prev->tqent_next = t;
+	t->tqent_func = func;
+	t->tqent_arg = arg;
+	cv_signal(&tq->tq_dispatch_cv);
+	mutex_exit(&tq->tq_lock);
+}
+
+void
 taskq_wait(taskq_t *tq)
 {
 	mutex_enter(&tq->tq_lock);
-	while (tq->tq_task.task_next != &tq->tq_task || tq->tq_active != 0)
+	while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0)
 		cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
 	mutex_exit(&tq->tq_lock);
 }
@@ -146,27 +191,32 @@ static void *
 taskq_thread(void *arg)
 {
 	taskq_t *tq = arg;
-	task_t *t;
+	taskq_ent_t *t;
+	boolean_t prealloc;
 
 	mutex_enter(&tq->tq_lock);
 	while (tq->tq_flags & TASKQ_ACTIVE) {
-		if ((t = tq->tq_task.task_next) == &tq->tq_task) {
+		if ((t = tq->tq_task.tqent_next) == &tq->tq_task) {
 			if (--tq->tq_active == 0)
 				cv_broadcast(&tq->tq_wait_cv);
 			cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
 			tq->tq_active++;
 			continue;
 		}
-		t->task_prev->task_next = t->task_next;
-		t->task_next->task_prev = t->task_prev;
+		t->tqent_prev->tqent_next = t->tqent_next;
+		t->tqent_next->tqent_prev = t->tqent_prev;
+		t->tqent_next = NULL;
+		t->tqent_prev = NULL;
+		prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC;
 		mutex_exit(&tq->tq_lock);
 
 		rw_enter(&tq->tq_threadlock, RW_READER);
-		t->task_func(t->task_arg);
+		t->tqent_func(t->tqent_arg);
 		rw_exit(&tq->tq_threadlock);
 
 		mutex_enter(&tq->tq_lock);
-		task_free(tq, t);
+		if (!prealloc)
+			task_free(tq, t);
 	}
 	tq->tq_nthreads--;
 	cv_broadcast(&tq->tq_wait_cv);
@@ -199,13 +249,15 @@ taskq_create(const char *name, int nthre
 	mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL);
+	(void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1);
 	tq->tq_flags = flags | TASKQ_ACTIVE;
 	tq->tq_active = nthreads;
 	tq->tq_nthreads = nthreads;
 	tq->tq_minalloc = minalloc;
 	tq->tq_maxalloc = maxalloc;
-	tq->tq_task.task_next = &tq->tq_task;
-	tq->tq_task.task_prev = &tq->tq_task;
+	tq->tq_task.tqent_next = &tq->tq_task;
+	tq->tq_task.tqent_prev = &tq->tq_task;
 	tq->tq_threadlist = kmem_alloc(nthreads * sizeof (thread_t), KM_SLEEP);
 
 	if (flags & TASKQ_PREPOPULATE) {
@@ -255,6 +307,7 @@ taskq_destroy(taskq_t *tq)
 	mutex_destroy(&tq->tq_lock);
 	cv_destroy(&tq->tq_dispatch_cv);
 	cv_destroy(&tq->tq_wait_cv);
+	cv_destroy(&tq->tq_maxalloc_cv);
 
 	kmem_free(tq, sizeof (taskq_t));
 }
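
The task_alloc() change above replaces the old unconditional one-second
xdelay() with a cv_timedwait() on the new tq_maxalloc_cv, so a thread
throttled at tq_maxalloc can be woken early by task_free() instead of always
sleeping out the full second.  A pthreads sketch of that throttle, assuming
an allocation counter guarded by a mutex (all names are illustrative; unlike
task_alloc(), which falls through to kmem_alloc() after a timeout, this
sketch simply rechecks the cap):

	#include <pthread.h>
	#include <time.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t maxalloc_cv = PTHREAD_COND_INITIALIZER;
	static int nalloc, maxalloc = 128, maxalloc_wait;

	static void
	throttle_alloc(void)
	{
		struct timespec ts;

		pthread_mutex_lock(&lock);
		while (nalloc >= maxalloc) {
			/* Analog of ddi_get_lbolt() + hz: wait at most
			 * one second, but wake early if signaled. */
			clock_gettime(CLOCK_REALTIME, &ts);
			ts.tv_sec += 1;
			maxalloc_wait++;
			(void) pthread_cond_timedwait(&maxalloc_cv,
			    &lock, &ts);
			maxalloc_wait--;
		}
		nalloc++;
		pthread_mutex_unlock(&lock);
	}

	static void
	throttle_free(void)
	{
		pthread_mutex_lock(&lock);
		nalloc--;
		/* Analog of the task_free() hunk: only signal when a
		 * thread is actually throttled. */
		if (maxalloc_wait)
			pthread_cond_signal(&maxalloc_cv);
		pthread_mutex_unlock(&lock);
	}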
Index: src/external/cddl/osnet/dist/lib/libzpool/common/util.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzpool/common/util.c,v
retrieving revision 1.3
diff -u -p -r1.3 util.c
--- src/external/cddl/osnet/dist/lib/libzpool/common/util.c	28 Mar 2014 02:50:18 -0000	1.3
+++ src/external/cddl/osnet/dist/lib/libzpool/common/util.c	10 Oct 2016 11:14:31 -0000
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <assert.h>
@@ -38,7 +37,7 @@
  */
 
 void
-nicenum(uint64_t num, char *buf, size_t buflen)
+nicenum(uint64_t num, char *buf)
 {
 	uint64_t n = num;
 	int index = 0;
@@ -52,15 +51,15 @@ nicenum(uint64_t num, char *buf, size_t 
 	u = " KMGTPE"[index];
 
 	if (index == 0) {
-		(void) snprintf(buf, buflen, "%llu", (u_longlong_t)n);
+		(void) sprintf(buf, "%llu", (u_longlong_t)n);
 	} else if (n < 10 && (num & (num - 1)) != 0) {
-		(void) snprintf(buf, buflen, "%.2f%c",
+		(void) sprintf(buf, "%.2f%c",
 		    (double)num / (1ULL << 10 * index), u);
 	} else if (n < 100 && (num & (num - 1)) != 0) {
-		(void) snprintf(buf, buflen, "%.1f%c",
+		(void) sprintf(buf, "%.1f%c",
 		    (double)num / (1ULL << 10 * index), u);
 	} else {
-		(void) snprintf(buf, buflen, "%llu%c", (u_longlong_t)n, u);
+		(void) sprintf(buf, "%llu%c", (u_longlong_t)n, u);
 	}
 }
 
@@ -90,26 +89,26 @@ show_vdev_stats(const char *desc, const 
 		if (is_log)
 			prefix = "log ";
 
-		if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS,
+		if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
 		    (uint64_t **)&vs, &c) != 0)
 			vs = &v0;
 
 		sec = MAX(1, vs->vs_timestamp / NANOSEC);
 
-		nicenum(vs->vs_alloc, used, sizeof(used));
-		nicenum(vs->vs_space - vs->vs_alloc, avail, sizeof(avail));
-		nicenum(vs->vs_ops[ZIO_TYPE_READ] / sec, rops, sizeof(rops));
-		nicenum(vs->vs_ops[ZIO_TYPE_WRITE] / sec, wops, sizeof(wops));
-		nicenum(vs->vs_bytes[ZIO_TYPE_READ] / sec, rbytes, sizeof(rbytes));
-		nicenum(vs->vs_bytes[ZIO_TYPE_WRITE] / sec, wbytes, sizeof(wbytes));
-		nicenum(vs->vs_read_errors, rerr, sizeof(rerr));
-		nicenum(vs->vs_write_errors, werr, sizeof(werr));
-		nicenum(vs->vs_checksum_errors, cerr, sizeof(cerr));
+		nicenum(vs->vs_alloc, used);
+		nicenum(vs->vs_space - vs->vs_alloc, avail);
+		nicenum(vs->vs_ops[ZIO_TYPE_READ] / sec, rops);
+		nicenum(vs->vs_ops[ZIO_TYPE_WRITE] / sec, wops);
+		nicenum(vs->vs_bytes[ZIO_TYPE_READ] / sec, rbytes);
+		nicenum(vs->vs_bytes[ZIO_TYPE_WRITE] / sec, wbytes);
+		nicenum(vs->vs_read_errors, rerr);
+		nicenum(vs->vs_write_errors, werr);
+		nicenum(vs->vs_checksum_errors, cerr);
 
 		(void) printf("%*s%s%*s%*s%*s %5s %5s %5s %5s %5s %5s %5s\n",
 		    indent, "",
 		    prefix,
-		    indent + strlen(prefix) - 25 - (vs->vs_space ? 0 : 12),
+		    (int)(indent + strlen(prefix) - 25 - (vs->vs_space ? 0 : 12)),
 		    desc,
 		    vs->vs_space ? 6 : 0, vs->vs_space ? used : "",
 		    vs->vs_space ? 6 : 0, vs->vs_space ? avail : "",
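
For reference, the complete scaling rule nicenum() implements (the hunk above
only restores its upstream fixed-buffer signature) is: divide by 1024 until
the value is below 1024, pick the matching " KMGTPE" suffix, and print extra
decimal places only for small values that are not exact powers of two.  A
standalone sketch of that rule -- the leading while loop is assumed from the
elided context, and the bounded snprintf() is used purely to keep the sketch
self-contained:

	#include <stdio.h>
	#include <stdint.h>

	static void
	nicenum_demo(uint64_t num, char *buf, size_t buflen)
	{
		uint64_t n = num;
		int index = 0;
		char u;

		while (n >= 1024) {		/* scale to < 1024 */
			n /= 1024;
			index++;
		}
		u = " KMGTPE"[index];

		if (index == 0)
			(void) snprintf(buf, buflen, "%llu",
			    (unsigned long long)n);
		else if (n < 10 && (num & (num - 1)) != 0)
			(void) snprintf(buf, buflen, "%.2f%c",
			    (double)num / (1ULL << 10 * index), u);
		else if (n < 100 && (num & (num - 1)) != 0)
			(void) snprintf(buf, buflen, "%.1f%c",
			    (double)num / (1ULL << 10 * index), u);
		else
			(void) snprintf(buf, buflen, "%llu%c",
			    (unsigned long long)n, u);
	}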
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.c,v
retrieving revision 1.3
diff -u -p -r1.3 barrier.c
--- src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.c	13 Feb 2016 21:37:12 -0000	1.3
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.c	4 Feb 2015 07:20:42 -0000
@@ -38,13 +38,8 @@
  */
 
 #include <pthread.h>
-#if defined(sun)
+#ifdef illumos
 #include <synch.h>
-#else
-#define USYNC_THREAD 1
-#define sema_init(a, b, c, d)	sem_init((a), (c) != USYNC_THREAD, (b))
-#define sema_wait(a)		sem_wait(a)
-#define sema_post(a)		sem_post(a)
 #endif
 #include <stdio.h>
 
@@ -54,7 +49,12 @@ void
 barrier_init(barrier_t *bar, int nthreads)
 {
 	pthread_mutex_init(&bar->bar_lock, NULL);
+#ifdef illumos
 	sema_init(&bar->bar_sem, 0, USYNC_THREAD, NULL);
+#else
+	sem_init(&bar->bar_sem, 0, 0);
+#endif
+
 	bar->bar_numin = 0;
 	bar->bar_nthr = nthreads;
 }
@@ -66,7 +66,12 @@ barrier_wait(barrier_t *bar)
 
 	if (++bar->bar_numin < bar->bar_nthr) {
 		pthread_mutex_unlock(&bar->bar_lock);
+#ifdef illumos
 		sema_wait(&bar->bar_sem);
+#else
+		sem_wait(&bar->bar_sem);
+#endif
+
 		return (0);
 
 	} else {
@@ -75,7 +80,11 @@ barrier_wait(barrier_t *bar)
 		/* reset for next use */
 		bar->bar_numin = 0;
 		for (i = 1; i < bar->bar_nthr; i++)
+#ifdef illumos
 			sema_post(&bar->bar_sem);
+#else
+			sem_post(&bar->bar_sem);
+#endif
 		pthread_mutex_unlock(&bar->bar_lock);
 
 		return (1);
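
With the compatibility macros gone, the non-illumos side of barrier.c now
calls the POSIX unnamed-semaphore API directly.  The whole turnstile can be
written self-contained in a few lines; this sketch mirrors the patched logic
above (pshared=0 makes the semaphore process-private, matching the old
USYNC_THREAD behavior):

	#include <pthread.h>
	#include <semaphore.h>

	typedef struct barrier {
		pthread_mutex_t	bar_lock;
		sem_t		bar_sem;
		int		bar_numin;	/* threads arrived so far */
		int		bar_nthr;	/* threads per cycle */
	} barrier_t;

	void
	barrier_init(barrier_t *bar, int nthreads)
	{
		pthread_mutex_init(&bar->bar_lock, NULL);
		(void) sem_init(&bar->bar_sem, 0, 0);
		bar->bar_numin = 0;
		bar->bar_nthr = nthreads;
	}

	/* Returns 1 in exactly one thread per cycle (the last arrival)
	 * and 0 in all the others, the same contract as above. */
	int
	barrier_wait(barrier_t *bar)
	{
		int i;

		pthread_mutex_lock(&bar->bar_lock);
		if (++bar->bar_numin < bar->bar_nthr) {
			pthread_mutex_unlock(&bar->bar_lock);
			(void) sem_wait(&bar->bar_sem);
			return (0);
		}
		bar->bar_numin = 0;	/* reset for next use */
		for (i = 1; i < bar->bar_nthr; i++)
			(void) sem_post(&bar->bar_sem);
		pthread_mutex_unlock(&bar->bar_lock);
		return (1);
	}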
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.h,v
retrieving revision 1.2
diff -u -p -r1.2 barrier.h
--- src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.h	21 Feb 2010 00:49:55 -0000	1.2
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.h	4 Feb 2015 07:20:42 -0000
@@ -33,7 +33,7 @@
  * APIs for the barrier synchronization primitive.
  */
 
-#if defined(sun)
+#ifdef illumos
 #include <synch.h>
 #else
 #include <semaphore.h>
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c,v
retrieving revision 1.13
diff -u -p -r1.13 ctf.c
--- src/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c	18 Mar 2016 17:11:04 -0000	1.13
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c	13 Apr 2017 19:16:42 -0000
@@ -56,8 +56,6 @@ static char *curfile;
 #define	CTF_BUF_CHUNK_SIZE	(64 * 1024)
 #define	RES_BUF_CHUNK_SIZE	(64 * 1024)
 
-static int ntypes = 0;		/* The number of types. */
-
 struct ctf_buf {
 	strtab_t ctb_strtab;	/* string table */
 	caddr_t ctb_base;	/* pointer to base of buffer */
@@ -1157,10 +1155,6 @@ resurrect_types(ctf_header_t *h, tdata_t
 					(*mpp)->ml_type = tdarr[ctm->ctm_type];
 					(*mpp)->ml_offset = ctm->ctm_offset;
 					(*mpp)->ml_size = 0;
-					if (ctm->ctm_type > ntypes) {
-						parseterminate("Invalid member type ctm_type=%d",
-						    ctm->ctm_type);
-					}
 				}
 			} else {
 				for (i = 0, mpp = &tdp->t_members; i < vlen;
@@ -1177,10 +1171,6 @@ resurrect_types(ctf_header_t *h, tdata_t
 					(*mpp)->ml_offset =
 					    (int)CTF_LMEM_OFFSET(ctlm);
 					(*mpp)->ml_size = 0;
-					if (ctlm->ctlm_type > ntypes) {
-						parseterminate("Invalid lmember type ctlm_type=%d",
-						    ctlm->ctlm_type);
-					}
 				}
 			}
 
@@ -1294,10 +1284,9 @@ ctf_parse(ctf_header_t *h, caddr_t buf, 
 {
 	tdata_t *td = tdata_new();
 	tdesc_t **tdarr;
+	int ntypes = count_types(h, buf);
 	int idx, i;
 
-	ntypes = count_types(h, buf);
-
 	/* shudder */
 	tdarr = xcalloc(sizeof (tdesc_t *) * (ntypes + 1));
 	tdarr[0] = NULL;
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/ctfconvert.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/ctfconvert.c,v
retrieving revision 1.5
diff -u -p -r1.5 ctfconvert.c
--- src/external/cddl/osnet/dist/tools/ctf/cvt/ctfconvert.c	20 Feb 2016 22:08:44 -0000	1.5
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/ctfconvert.c	13 Apr 2017 19:21:22 -0000
@@ -68,7 +68,7 @@ usage(void)
 static void
 terminate_cleanup(void)
 {
-#if 0
+#if !defined(__FreeBSD__) && !defined(__NetBSD__)
 	if (!outfile) {
 		fprintf(stderr, "Removing %s\n", infile);
 		unlink(infile);
@@ -159,7 +159,7 @@ main(int argc, char **argv)
 	int keep_stabs = 0;
 	int c;
 
-#if defined(sun)
+#ifdef illumos
 	sighold(SIGINT);
 	sighold(SIGQUIT);
 	sighold(SIGTERM);
@@ -225,7 +225,7 @@ main(int argc, char **argv)
 	 */
 	set_terminate_cleanup(terminate_cleanup);
 
-#if defined(sun)
+#ifdef illumos
 	sigset(SIGINT, handle_sig);
 	sigset(SIGQUIT, handle_sig);
 	sigset(SIGTERM, handle_sig);
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/ctfmerge.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/ctfmerge.c,v
retrieving revision 1.15
diff -u -p -r1.15 ctfmerge.c
--- src/external/cddl/osnet/dist/tools/ctf/cvt/ctfmerge.c	20 Feb 2016 22:08:44 -0000	1.15
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/ctfmerge.c	13 Apr 2017 19:25:33 -0000
@@ -186,20 +186,20 @@
 #endif
 #include <pthread.h>
 #include <assert.h>
-#if defined(sun)
+#ifdef illumos
 #include <synch.h>
 #endif
 #include <signal.h>
 #include <libgen.h>
 #include <string.h>
 #include <errno.h>
-#if defined(sun)
+#ifdef illumos
 #include <alloca.h>
 #endif
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/mman.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/sysconf.h>
 #endif
 
@@ -222,10 +222,9 @@ static char *outfile = NULL;
 static char *tmpname = NULL;
 static int dynsym;
 int debug_level = DEBUG_LEVEL;
-#if 0
+#ifdef illumos
 static size_t maxpgsize = 0x400000;
 #endif
-static int maxslots = MERGE_PHASE1_MAX_SLOTS;
 
 
 static void
@@ -245,7 +244,7 @@ usage(void)
 	    progname, progname);
 }
 
-#if defined(sun)
+#ifdef illumos
 static void
 bigheap(void)
 {
@@ -624,7 +623,7 @@ terminate_cleanup(void)
 	if (outfile == NULL)
 		return;
 
-#if 0
+#if !defined(__FreeBSD__) && !defined(__NetBSD__)
 	if (dounlink) {
 		fprintf(stderr, "Removing %s\n", outfile);
 		unlink(outfile);
@@ -658,7 +657,7 @@ wq_init(workqueue_t *wq, int nfiles)
 	if (getenv("CTFMERGE_MAX_SLOTS"))
 		nslots = atoi(getenv("CTFMERGE_MAX_SLOTS"));
 	else
-		nslots = maxslots;
+		nslots = MERGE_PHASE1_MAX_SLOTS;
 
 	if (getenv("CTFMERGE_PHASE1_BATCH_SIZE"))
 		wq->wq_maxbatchsz = atoi(getenv("CTFMERGE_PHASE1_BATCH_SIZE"));
@@ -733,7 +732,7 @@ start_threads(workqueue_t *wq)
 		    (void *(*)(void *))worker_thread, wq);
 	}
 
-#if defined(sun)
+#ifdef illumos
 	sigset(SIGINT, handle_sig);
 	sigset(SIGQUIT, handle_sig);
 	sigset(SIGTERM, handle_sig);
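
The wq_init() hunk above repeats the getenv()-then-atoi() pattern for each
tunable; the same logic reads more clearly factored into a helper.  A
hypothetical sketch (env_int() is not part of the tree):

	#include <stdlib.h>

	/* Integer value of the named environment variable, or dflt if
	 * it is unset -- the CTFMERGE_MAX_SLOTS handling in miniature. */
	static int
	env_int(const char *name, int dflt)
	{
		const char *val = getenv(name);

		return (val != NULL ? atoi(val) : dflt);
	}

With that, the slot setup collapses to a single line such as
nslots = env_int("CTFMERGE_MAX_SLOTS", MERGE_PHASE1_MAX_SLOTS);.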
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/ctftools.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/ctftools.h,v
retrieving revision 1.7
diff -u -p -r1.7 ctftools.h
--- src/external/cddl/osnet/dist/tools/ctf/cvt/ctftools.h	18 Mar 2016 17:07:23 -0000	1.7
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/ctftools.h	13 Apr 2017 19:26:29 -0000
@@ -26,8 +26,6 @@
 #ifndef _CTFTOOLS_H
 #define	_CTFTOOLS_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Functions and data structures used in the manipulation of stabs and CTF data
  */
@@ -39,6 +37,8 @@
 #include <gelf.h>
 #include <pthread.h>
 
+#include <sys/ccompile.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/dwarf.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/dwarf.c,v
retrieving revision 1.23
diff -u -p -r1.23 dwarf.c
--- src/external/cddl/osnet/dist/tools/ctf/cvt/dwarf.c	8 Jun 2016 21:32:27 -0000	1.23
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/dwarf.c	22 Apr 2017 17:06:56 -0000
@@ -429,7 +429,7 @@ die_string(dwarf_t *dw, Dwarf_Die die, D
 static Dwarf_Off
 die_attr_ref(dwarf_t *dw, Dwarf_Die die, Dwarf_Half name)
 {
-	Dwarf_Unsigned off;
+	Dwarf_Off off;
 
 	if (dwarf_attrval_unsigned(die, name, &off, &dw->dw_err) != DW_DLV_OK) {
 		terminate("die %ju: failed to get ref: %s\n",
@@ -672,8 +672,6 @@ tdesc_array_create(dwarf_t *dw, Dwarf_Di
 
 	if ((dim2 = die_sibling(dw, dim)) == NULL) {
 		ctdp = arrtdp;
-		debug(3, "die %ju: sibling type %#x for dimension\n",
-		    (uintmax_t)die_off(dw, dim), ctdp->t_id);
 	} else if (die_tag(dw, dim2) == DW_TAG_subrange_type) {
 		ctdp = xcalloc(sizeof (tdesc_t));
 		ctdp->t_id = mfgtid_next(dw);
@@ -762,13 +760,6 @@ die_array_create(dwarf_t *dw, Dwarf_Die 
 		tdesc_t *dimtdp;
 		int flags;
 
-		/* Check for bogus gcc DW_AT_byte_size attribute */
-		if (uval == (unsigned)-1) {
-			printf("dwarf.c:%s() working around bogus -1 DW_AT_byte_size\n",
-			    __func__);
-			uval = 0;
-		}
-		
 		tdp->t_size = uval;
 
 		/*
@@ -852,19 +843,16 @@ die_enum_create(dwarf_t *dw, Dwarf_Die d
 	Dwarf_Unsigned uval;
 	Dwarf_Signed sval;
 
+	if (die_isdecl(dw, die)) {
+		tdp->t_type = FORWARD;
+		return;
+	}
+
 	debug(3, "die %ju: creating enum\n", (uintmax_t)off);
 
-	tdp->t_type = (die_isdecl(dw, die) ? FORWARD : ENUM);
-	if (tdp->t_type != ENUM)
-		return;
+	tdp->t_type = ENUM;
 
 	(void) die_unsigned(dw, die, DW_AT_byte_size, &uval, DW_ATTR_REQ);
-	/* Check for bogus gcc DW_AT_byte_size attribute */
-	if (uval == (unsigned)-1) {
-		printf("dwarf.c:%s() working around bogus -1 DW_AT_byte_size\n",
-		    __func__); 
-		uval = 0;
-	}
 	tdp->t_size = uval;
 
 	if ((mem = die_child(dw, die)) != NULL) {
@@ -980,7 +968,7 @@ static void
 die_sou_create(dwarf_t *dw, Dwarf_Die str, Dwarf_Off off, tdesc_t *tdp,
     int type, const char *typename)
 {
-	Dwarf_Unsigned sz, bitsz, bitoff, maxsz=0;
+	Dwarf_Unsigned sz, bitsz, bitoff;
 #if BYTE_ORDER == LITTLE_ENDIAN
 	Dwarf_Unsigned bysz;
 #endif
@@ -1040,8 +1028,6 @@ die_sou_create(dwarf_t *dw, Dwarf_Die st
 			ml->ml_name = NULL;
 
 		ml->ml_type = die_lookup_pass1(dw, mem, DW_AT_type);
-		debug(3, "die_sou_create(): ml_type = %p t_id = %#x\n",
-		    ml->ml_type, ml->ml_type->t_id);
 
 		if (die_mem_offset(dw, mem, DW_AT_data_member_location,
 		    &mloff, 0)) {
@@ -1088,24 +1074,8 @@ die_sou_create(dwarf_t *dw, Dwarf_Die st
 
 		*mlastp = ml;
 		mlastp = &ml->ml_next;
-
-		/* Find the size of the largest member to work around a gcc
-		 * bug.  See GCC Bugzilla 35998.
-		 */
-		if (maxsz < ml->ml_size)
-			maxsz = ml->ml_size;
-
 	} while ((mem = die_sibling(dw, mem)) != NULL);
 
-	/* See if we got a bogus DW_AT_byte_size.  GCC will sometimes
-	 * emit this.
-	 */
-	if (sz == (unsigned)-1) {
-		 printf("dwarf.c:%s() working around bogus -1 DW_AT_byte_size\n",
-		     __func__);
-		 tdp->t_size = maxsz / 8;  /* maxsz is in bits, t_size is bytes */
-	}
-
 	/*
 	 * GCC will attempt to eliminate unused types, thus decreasing the
 	 * size of the emitted dwarf.  That is, if you declare a foo_t in your
@@ -1246,7 +1216,7 @@ die_sou_resolve(tdesc_t *tdp, tdesc_t **
 		}
 
 		if (ml->ml_size != 0 && mt->t_type == INTRINSIC &&
-		    mt->t_intr->intr_nbits != (int)ml->ml_size) {
+		    mt->t_intr->intr_nbits != ml->ml_size) {
 			/*
 			 * This member is a bitfield, and needs to reference
 			 * an intrinsic type with the same width.  If the
@@ -1564,13 +1534,6 @@ die_base_create(dwarf_t *dw, Dwarf_Die b
 	 */
 	(void) die_unsigned(dw, base, DW_AT_byte_size, &sz, DW_ATTR_REQ);
 
-	/* Check for bogus gcc DW_AT_byte_size attribute */
-	if (sz == (unsigned)-1) {
-		printf("dwarf.c:%s() working around bogus -1 DW_AT_byte_size\n",
-		    __func__);
-		sz = 0;
-	}
-
 	if (tdp->t_name == NULL)
 		terminate("die %ju: base type without name\n", (uintmax_t)off);
 
@@ -2065,7 +2028,6 @@ dw_read(tdata_t *td, Elf *elf, char *fil
 			errno = ENOENT;
 			return (-1);
 		} else {
-
 			return (0);
 		}
 	} else if (rc != DW_DLV_OK) {
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/merge.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/merge.c,v
retrieving revision 1.7
diff -u -p -r1.7 merge.c
--- src/external/cddl/osnet/dist/tools/ctf/cvt/merge.c	10 Apr 2016 23:37:10 -0000	1.7
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/merge.c	13 Apr 2017 19:51:37 -0000
@@ -287,24 +287,14 @@ static int
 equiv_su(tdesc_t *stdp, tdesc_t *ttdp, equiv_data_t *ed)
 {
 	mlist_t *ml1 = stdp->t_members, *ml2 = ttdp->t_members;
-	mlist_t *olm1 = NULL;
 
 	while (ml1 && ml2) {
 		if (ml1->ml_offset != ml2->ml_offset ||
-		    strcmp(ml1->ml_name, ml2->ml_name) != 0)
+		    strcmp(ml1->ml_name, ml2->ml_name) != 0 ||
+		    ml1->ml_size != ml2->ml_size ||
+		    !equiv_node(ml1->ml_type, ml2->ml_type, ed))
 			return (0);
 
-		/*
-		 * Don't do the recursive equivalency checking more than
-		 * we have to.
-		 */
-		if (olm1 == NULL || olm1->ml_type->t_id != ml1->ml_type->t_id) {
-			if (ml1->ml_size != ml2->ml_size ||
-			    !equiv_node(ml1->ml_type, ml2->ml_type, ed))
-				return (0);
-		}
-
-		olm1 = ml1;
 		ml1 = ml1->ml_next;
 		ml2 = ml2->ml_next;
 	}
@@ -352,7 +342,8 @@ fwd_equiv(tdesc_t *ctdp, tdesc_t *mtdp)
 {
 	tdesc_t *defn = (ctdp->t_type == FORWARD ? mtdp : ctdp);
 
-	return (defn->t_type == STRUCT || defn->t_type == UNION);
+	return (defn->t_type == STRUCT || defn->t_type == UNION ||
+	    defn->t_type == ENUM);
 }
 
 static int
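
After the equiv_su() change above, every member pair is compared on offset,
name, size, and recursive type equivalence, with no memoization shortcut.
The shape of that walk, reduced to a self-contained sketch over a simplified
member list (the struct and equiv_type() are hypothetical stand-ins for the
tdesc_t member list and equiv_node()):

	#include <stddef.h>
	#include <string.h>

	struct member {
		struct member	*ml_next;
		const char	*ml_name;
		int		ml_offset;
		int		ml_size;
		int		ml_type;
	};

	/* Stand-in for equiv_node(): plain equality is enough here. */
	static int
	equiv_type(int t1, int t2)
	{
		return (t1 == t2);
	}

	/* Two bodies are equivalent iff the lists have the same length
	 * and every member pair matches on all four criteria. */
	static int
	equiv_members(struct member *ml1, struct member *ml2)
	{
		while (ml1 != NULL && ml2 != NULL) {
			if (ml1->ml_offset != ml2->ml_offset ||
			    strcmp(ml1->ml_name, ml2->ml_name) != 0 ||
			    ml1->ml_size != ml2->ml_size ||
			    !equiv_type(ml1->ml_type, ml2->ml_type))
				return (0);
			ml1 = ml1->ml_next;
			ml2 = ml2->ml_next;
		}
		return (ml1 == NULL && ml2 == NULL);
	}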
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/output.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/output.c,v
retrieving revision 1.8
diff -u -p -r1.8 output.c
--- src/external/cddl/osnet/dist/tools/ctf/cvt/output.c	18 Mar 2016 17:07:23 -0000	1.8
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/output.c	22 Apr 2017 20:40:20 -0000
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Routines for preparing tdata trees for conversion into CTF data, and
  * for placing the resulting data into an output file.
@@ -582,7 +580,7 @@ write_file(Elf *src, const char *srcname
 			    shdr.sh_name);
 		}
 
-#if !defined(sun)
+#ifndef illumos
 		if (gelf_update_shdr(dscn, &shdr) == 0)
 			elfterminate(dstname, "Cannot update sect %s", sname);
 #endif
@@ -591,7 +589,7 @@ write_file(Elf *src, const char *srcname
 			elfterminate(srcname, "Cannot get sect %s data", sname);
 		if ((ddata = elf_newdata(dscn)) == NULL)
 			elfterminate(dstname, "Can't make sect %s data", sname);
-#if defined(sun)
+#ifdef illumos
 		bcopy(sdata, ddata, sizeof (Elf_Data));
 #else
 		/*
@@ -651,7 +649,7 @@ write_file(Elf *src, const char *srcname
 			}
 		}
 
-#if !defined(sun)
+#ifndef illumos
 		if (ddata->d_buf == NULL && sdata->d_buf != NULL) {
 			ddata->d_buf = xmalloc(shdr.sh_size);
 			bcopy(sdata->d_buf, ddata->d_buf, shdr.sh_size);
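
The output.c hunks copy section data by hand on non-illumos systems because
their libelf's elf_newdata() hands back a zeroed descriptor rather than one
sharing the source buffer, so a flat bcopy() of the Elf_Data struct is not
enough.  A sketch of the deep copy under that assumption (copy_data() is a
hypothetical helper; the real code above sizes the copy from shdr.sh_size):

	#include <libelf.h>
	#include <stdlib.h>
	#include <string.h>

	/* Duplicate the payload so the output ELF does not alias the
	 * source file's buffer. */
	static void
	copy_data(Elf_Data *ddata, const Elf_Data *sdata)
	{
		ddata->d_type = sdata->d_type;
		ddata->d_size = sdata->d_size;
		ddata->d_off = sdata->d_off;
		ddata->d_align = sdata->d_align;
		ddata->d_version = sdata->d_version;

		if (sdata->d_buf != NULL &&
		    (ddata->d_buf = malloc(sdata->d_size)) != NULL)
			memcpy(ddata->d_buf, sdata->d_buf, sdata->d_size);
	}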
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/st_parse.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/st_parse.c,v
retrieving revision 1.7
diff -u -p -r1.7 st_parse.c
--- src/external/cddl/osnet/dist/tools/ctf/cvt/st_parse.c	17 Mar 2016 03:05:55 -0000	1.7
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/st_parse.c	13 Apr 2017 19:53:16 -0000
@@ -956,7 +956,7 @@ soudef(char *cp, stabtype_t type, tdesc_
 
 		itdp = find_intrinsic(tdp);
 		if (itdp->t_type == INTRINSIC) {
-			if ((int)mlp->ml_size != itdp->t_intr->intr_nbits) {
+			if (mlp->ml_size != itdp->t_intr->intr_nbits) {
 				parse_debug(4, cp, "making %d bit intrinsic "
 				    "from %s", mlp->ml_size, tdesc_name(itdp));
 				mlp->ml_type = bitintrinsic(itdp, mlp->ml_size);
@@ -1177,7 +1177,7 @@ resolve_typed_bitfields_cb(void *arg, vo
 	while (tdp) {
 		switch (tdp->t_type) {
 		case INTRINSIC:
-			if ((int)ml->ml_size != tdp->t_intr->intr_nbits) {
+			if (ml->ml_size != tdp->t_intr->intr_nbits) {
 				debug(3, "making %d bit intrinsic from %s",
 				    ml->ml_size, tdesc_name(tdp));
 				ml->ml_type = bitintrinsic(tdp, ml->ml_size);
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/stabs.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/stabs.c,v
retrieving revision 1.5
diff -u -p -r1.5 stabs.c
--- src/external/cddl/osnet/dist/tools/ctf/cvt/stabs.c	7 Feb 2015 20:30:03 -0000	1.5
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/stabs.c	13 Apr 2017 19:54:32 -0000
@@ -190,7 +190,7 @@ stabs_read(tdata_t *td, Elf *elf, char *
 	char *curfile = NULL;
 	char *str;
 	char *fstr = NULL, *ofstr = NULL;
-	int stabidx, stabstridx = 0;
+	int stabidx, stabstridx;
 	int nstabs, rc, i;
 	int scope = 0;
 
@@ -198,8 +198,6 @@ stabs_read(tdata_t *td, Elf *elf, char *
 	    (stabstridx = findelfsecidx(elf, file, ".stab.exclstr")) >= 0) &&
 	    !((stabidx = findelfsecidx(elf, file, ".stab")) >= 0 &&
 	    (stabstridx = findelfsecidx(elf, file, ".stabstr")) >= 0)) {
-		debug(1, "NO stabs: .stab=%d, .stabstr=%d\n", stabidx,
-		    stabstridx);
 		errno = ENOENT;
 		return (-1);
 	}
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/strtab.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/strtab.c,v
retrieving revision 1.3
diff -u -p -r1.3 strtab.c
--- src/external/cddl/osnet/dist/tools/ctf/cvt/strtab.c	20 Feb 2016 21:50:02 -0000	1.3
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/strtab.c	13 Apr 2017 19:54:50 -0000
@@ -28,8 +28,8 @@
 
 #include <sys/types.h>
 #include <sys/sysmacros.h>
-#include <stdlib.h>
 #include <strings.h>
+#include <stdlib.h>
 #include <stdio.h>
 
 #include "strtab.h"
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c,v
retrieving revision 1.8
diff -u -p -r1.8 tdata.c
--- src/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c	18 Mar 2016 17:07:23 -0000	1.8
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c	13 Apr 2017 19:55:09 -0000
@@ -177,7 +177,7 @@ tdesc_namecmp(void *arg1, void *arg2)
 	return (!streq(tdp1->t_name, tdp2->t_name));
 }
 
-#if defined(sun)
+#ifdef illumos
 /*ARGSUSED1*/
 static int
 tdesc_print(void *data, void *private __unused)
Index: src/external/cddl/osnet/dist/tools/ctf/cvt/util.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/util.c,v
retrieving revision 1.5
diff -u -p -r1.5 util.c
--- src/external/cddl/osnet/dist/tools/ctf/cvt/util.c	7 Feb 2015 20:30:03 -0000	1.5
+++ src/external/cddl/osnet/dist/tools/ctf/cvt/util.c	13 Apr 2017 19:55:46 -0000
@@ -175,7 +175,7 @@ aborterr(const char *format, ...)
 	whine("ERROR", format, ap);
 	va_end(ap);
 
-#if defined(sun)
+#ifdef illumos
 	abort();
 #else
 	exit(0);
Index: src/external/cddl/osnet/dist/uts/common/Makefile.files
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/Makefile.files,v
retrieving revision 1.2
diff -u -p -r1.2 Makefile.files
--- src/external/cddl/osnet/dist/uts/common/Makefile.files	27 Feb 2010 23:43:53 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/Makefile.files	7 Jul 2017 17:50:49 -0000
@@ -20,1300 +20,30 @@
 #
 
 #
-# Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
+# Copyright (c) 2012 Joyent, Inc.  All rights reserved.
+# Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+# Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 #
-# This Makefile defines all file modules for the directory uts/common
-# and its children. These are the source files which may be considered
-# common to all SunOS systems.
-
-i386_CORE_OBJS += \
-		atomic.o	\
-		avintr.o	\
-		pic.o
-
-sparc_CORE_OBJS +=
-
-COMMON_CORE_OBJS +=		\
-		beep.o		\
-		bitset.o	\
-		bp_map.o	\
-		brand.o		\
-		cpucaps.o	\
-		cmt.o		\
-		cmt_policy.o	\
-		cpu.o		\
-		cpu_event.o	\
-		cpu_intr.o	\
-		cpu_pm.o	\
-		cpupart.o	\
-		cap_util.o	\
-		disp.o		\
-		group.o		\
-		kstat_fr.o	\
-		iscsiboot_prop.o	\
-		lgrp.o		\
-		lgrp_topo.o	\
-		mmapobj.o	\
-		mutex.o		\
-		page_lock.o	\
-		page_retire.o	\
-		panic.o		\
-		param.o		\
-		pg.o		\
-		pghw.o		\
-		putnext.o	\
-		rctl_proc.o	\
-		rwlock.o	\
-		seg_kmem.o	\
-		softint.o	\
-		string.o	\
-		strtol.o	\
-		strtoul.o	\
-		strtoll.o	\
-		strtoull.o	\
-		thread_intr.o	\
-		vm_page.o	\
-		vm_pagelist.o	\
-		zlib_obj.o	\
-		clock_tick.o
-
-CORE_OBJS +=	$(COMMON_CORE_OBJS) $($(MACH)_CORE_OBJS)
-
-ZLIB_OBJS =	zutil.o zmod.o zmod_subr.o \
-		adler32.o crc32.o deflate.o inffast.o \
-		inflate.o inftrees.o trees.o
-
-GENUNIX_OBJS +=	\
-		access.o	\
-		acl.o		\
-		acl_common.o	\
-		adjtime.o	\
-		alarm.o		\
-		aio_subr.o	\
-		auditsys.o	\
-		autoconf.o	\
-		avl.o		\
-		bdev_dsort.o	\
-		bio.o		\
-		bitmap.o	\
-		blabel.o	\
-		brandsys.o	\
-		bz2blocksort.o	\
-		bz2compress.o	\
-		bz2decompress.o	\
-		bz2randtable.o	\
-		bz2bzlib.o	\
-		bz2crctable.o	\
-		bz2huffman.o	\
-		callb.o		\
-		callout.o	\
-		chdir.o		\
-		chmod.o		\
-		chown.o		\
-		cladm.o		\
-		class.o		\
-		clock.o		\
-		clock_highres.o	\
-		clock_realtime.o\
-		close.o		\
-		compress.o	\
-		condvar.o	\
-		conf.o		\
-		console.o	\
-		contract.o	\
-		copyops.o	\
-		core.o		\
-		corectl.o	\
-		cred.o		\
-		cs_stubs.o	\
-		dacf.o		\
-		dacf_clnt.o	\
-		damap.o	\
-		cyclic.o	\
-		ddi.o		\
-		ddifm.o		\
-		ddi_hp_impl.o	\
-		ddi_hp_ndi.o	\
-		ddi_intr.o	\
-		ddi_intr_impl.o	\
-		ddi_intr_irm.o	\
-		ddi_nodeid.o	\
-		ddi_timer.o	\
-		devcfg.o	\
-		devcache.o	\
-		device.o	\
-		devid.o		\
-		devid_cache.o	\
-		devid_scsi.o	\
-		devid_smp.o	\
-		devpolicy.o	\
-		disp_lock.o	\
-		dnlc.o		\
-		driver.o	\
-		dumpsubr.o	\
-		driver_lyr.o	\
-		dtrace_subr.o	\
-		errorq.o	\
-		etheraddr.o	\
-		evchannels.o	\
-		exacct.o	\
-		exacct_core.o	\
-		exec.o		\
-		exit.o		\
-		fbio.o		\
-		fcntl.o		\
-		fdbuffer.o	\
-		fdsync.o	\
-		fem.o		\
-		ffs.o		\
-		fio.o		\
-		flock.o		\
-		fm.o		\
-		fork.o		\
-		vpm.o		\
-		fsat.o		\
-		fs_reparse.o	\
-		fs_subr.o	\
-		fsflush.o	\
-		ftrace.o	\
-		getcwd.o	\
-		getdents.o	\
-		getloadavg.o	\
-		getpagesizes.o	\
-		getpid.o	\
-		gfs.o		\
-		rusagesys.o	\
-		gid.o		\
-		groups.o	\
-		grow.o		\
-		hat.o		\
-		hat_refmod.o	\
-		id32.o		\
-		id_space.o	\
-		inet_ntop.o	\
-		instance.o	\
-		ioctl.o		\
-		ip_cksum.o	\
-		issetugid.o	\
-		ippconf.o	\
-		kcpc.o		\
-		kdi.o		\
-		kiconv.o	\
-		klpd.o		\
-		kmem.o		\
-		ksyms_snapshot.o	\
-		l_strplumb.o	\
-		labelsys.o	\
-		link.o		\
-		list.o		\
-		lockstat_subr.o	\
-		log_sysevent.o	\
-		logsubr.o	\
-		lookup.o	\
-		lseek.o		\
-		ltos.o		\
-		lwp.o		\
-		lwp_create.o	\
-		lwp_info.o	\
-		lwp_self.o	\
-		lwp_sobj.o	\
-		lwp_timer.o	\
-		lwpsys.o	\
-		main.o		\
-		mmapobjsys.o	\
-		memcntl.o	\
-		memstr.o	\
-		lgrpsys.o	\
-		mkdir.o		\
-		mknod.o		\
-		mount.o		\
-		move.o		\
-		msacct.o	\
-		multidata.o	\
-		nbmlock.o	\
-		ndifm.o		\
-		nice.o		\
-		netstack.o	\
-		ntptime.o	\
-		nvpair.o	\
-		nvpair_alloc_system.o	\
-		nvpair_alloc_fixed.o	\
-		octet.o		\
-		open.o		\
-		p_online.o	\
-		pathconf.o	\
-		pathname.o	\
-		pause.o		\
-		serializer.o	\
-		pci_intr_lib.o	\
-		pci_cap.o	\
-		pcifm.o		\
-		pgrp.o		\
-		pgrpsys.o	\
-		pid.o		\
-		policy.o	\
-		poll.o		\
-		pool.o		\
-		pool_pset.o	\
-		port_subr.o	\
-		ppriv.o		\
-		printf.o	\
-		priocntl.o	\
-		priv.o		\
-		priv_const.o	\
-		proc.o		\
-		procset.o	\
-		processor_bind.o	\
-		processor_info.o	\
-		profil.o	\
-		project.o	\
-		qsort.o		\
-		rctl.o		\
-		rctlsys.o	\
-		readlink.o	\
-		refstr.o	\
-		rename.o	\
-		resolvepath.o	\
-		retire_store.o	\
-		process.o	\
-		rlimit.o	\
-		rmap.o		\
-		rmdir.o		\
-		rw.o		\
-		rwstlock.o	\
-		sad_conf.o	\
-		sid.o		\
-		sidsys.o	\
-		sched.o		\
-		schedctl.o	\
-		sctp_crc32.o	\
-		seg_dev.o	\
-		seg_kp.o	\
-		seg_kpm.o	\
-		seg_map.o	\
-		seg_vn.o	\
-		seg_spt.o	\
-		semaphore.o	\
-		sendfile.o	\
-		session.o	\
-		share.o		\
-		shuttle.o	\
-		sig.o		\
-		sigaction.o	\
-		sigaltstack.o	\
-		signotify.o	\
-		sigpending.o	\
-		sigprocmask.o	\
-		sigqueue.o	\
-		sigsendset.o	\
-		sigsuspend.o	\
-		sigtimedwait.o	\
-		sleepq.o	\
-		sock_conf.o	\
-		space.o		\
-		sscanf.o	\
-		stat.o		\
-		statfs.o	\
-		statvfs.o	\
-		stol.o		\
-		str_conf.o	\
-		strcalls.o	\
-		stream.o	\
-		streamio.o	\
-		strext.o	\
-		strsubr.o	\
-		strsun.o	\
-		subr.o		\
-		sunddi.o	\
-		sunmdi.o	\
-		sunndi.o	\
-		sunpci.o	\
-		sunpm.o		\
-		sundlpi.o	\
-		suntpi.o	\
-		swap_subr.o	\
-		swap_vnops.o	\
-		symlink.o	\
-		sync.o		\
-		sysclass.o	\
-		sysconfig.o	\
-		sysent.o	\
-		sysfs.o		\
-		systeminfo.o	\
-		task.o		\
-		taskq.o		\
-		tasksys.o	\
-		time.o		\
-		timer.o		\
-		times.o		\
-		timers.o	\
-		thread.o	\
-		tlabel.o	\
-		tnf_res.o	\
-		turnstile.o	\
-		tty_common.o	\
-		u8_textprep.o	\
-		uadmin.o	\
-		uconv.o		\
-		ucredsys.o	\
-		uid.o		\
-		umask.o		\
-		umount.o	\
-		uname.o		\
-		unix_bb.o	\
-		unlink.o	\
-		urw.o		\
-		utime.o		\
-		utssys.o	\
-		uucopy.o	\
-		vfs.o		\
-		vfs_conf.o	\
-		vmem.o		\
-		vm_anon.o	\
-		vm_as.o		\
-		vm_meter.o	\
-		vm_pageout.o	\
-		vm_pvn.o	\
-		vm_rm.o		\
-		vm_seg.o	\
-		vm_subr.o	\
-		vm_swap.o	\
-		vm_usage.o	\
-		vnode.o		\
-		vuid_queue.o	\
-		vuid_store.o	\
-		waitq.o		\
-		watchpoint.o	\
-		yield.o		\
-		scsi_confdata.o	\
-		xattr.o		\
-		xattr_common.o	\
-		xdr_mblk.o	\
-		xdr_mem.o	\
-		xdr.o		\
-		xdr_array.o	\
-		xdr_refer.o	\
-		xhat.o		\
-		zone.o
-
-#
-#	Stubs for the stand-alone linker/loader
-#
-sparc_GENSTUBS_OBJS =	\
-	kobj_stubs.o
-
-i386_GENSTUBS_OBJS =
-
-COMMON_GENSTUBS_OBJS =
-
-GENSTUBS_OBJS += $(COMMON_GENSTUBS_OBJS) $($(MACH)_GENSTUBS_OBJS)
-
-#
-#	DTrace and DTrace Providers
-#
-DTRACE_OBJS += dtrace.o dtrace_isa.o dtrace_asm.o
-
-SDT_OBJS += sdt_subr.o
-
-PROFILE_OBJS += profile.o
-
-SYSTRACE_OBJS += systrace.o
-
-LX_SYSTRACE_OBJS += lx_systrace.o
-
-LOCKSTAT_OBJS += lockstat.o
-
-FASTTRAP_OBJS += fasttrap.o fasttrap_isa.o
-
-DCPC_OBJS += dcpc.o
-
-#
-#	Driver (pseudo-driver) Modules
-#
-IPP_OBJS +=	ippctl.o
-
-AUDIO_OBJS += audio_client.o audio_ddi.o audio_engine.o \
-	   audio_fltdata.o audio_format.o audio_ctrl.o \
-	   audio_grc3.o audio_output.o audio_input.o \
-	   audio_oss.o audio_sun.o
-
-AUDIOEMU10K_OBJS += audioemu10k.o
-
-AUDIOENS_OBJS += audioens.o
-
-AUDIOVIA823X_OBJS += audiovia823x.o
-
-AUDIOVIA97_OBJS += audiovia97.o
-
-AUDIO1575_OBJS += audio1575.o
-
-AUDIO810_OBJS += audio810.o
-
-AUDIOCMI_OBJS += audiocmi.o
-
-AUDIOHD_OBJS +=	audiohd.o
-
-AUDIOIXP_OBJS += audioixp.o
-
-AUDIOLS_OBJS += audiols.o
-
-AUDIOP16X_OBJS += audiop16x.o
-
-AUDIOPCI_OBJS += audiopci.o
-
-AUDIOSOLO_OBJS += audiosolo.o
-
-AUDIOTS_OBJS +=	audiots.o
-
-AC97_OBJS += ac97.o ac97_ad.o ac97_alc.o ac97_cmi.o
-
-CARDBUS_OBJS += cardbus.o cardbus_hp.o cardbus_cfg.o
-
-CONSKBD_OBJS += conskbd.o
-
-CONSMS_OBJS +=	consms.o
-
-OLDPTY_OBJS +=	tty_ptyconf.o
-
-PTC_OBJS +=	tty_pty.o
-
-PTSL_OBJS +=	tty_pts.o
-
-PTM_OBJS +=	ptm.o
-
-LX_PTM_OBJS +=	lx_ptm.o
-
-LX_AUDIO_OBJS += lx_audio.o
-
-MII_OBJS +=	mii.o mii_cicada.o mii_natsemi.o mii_intel.o mii_qualsemi.o \
-		mii_marvell.o mii_realtek.o mii_other.o
-
-PTS_OBJS +=	pts.o
-
-PTY_OBJS +=	ptms_conf.o
-
-SAD_OBJS +=	sad.o
-
-MD4_OBJS +=	md4.o md4_mod.o
-
-MD5_OBJS +=	md5.o md5_mod.o
-
-SHA1_OBJS +=	sha1.o sha1_mod.o fips_sha1_util.o
-
-SHA2_OBJS +=	sha2.o sha2_mod.o fips_sha2_util.o
-
-IPGPC_OBJS +=   classifierddi.o classifier.o filters.o trie.o table.o \
-		ba_table.o
-
-DSCPMK_OBJS +=	dscpmk.o dscpmkddi.o
-
-DLCOSMK_OBJS +=	dlcosmk.o dlcosmkddi.o
-
-FLOWACCT_OBJS +=	flowacctddi.o flowacct.o
-
-TOKENMT_OBJS +=	tokenmt.o tokenmtddi.o
-
-TSWTCL_OBJS +=	tswtcl.o tswtclddi.o
-
-ARP_OBJS +=	arpddi.o
-
-ICMP_OBJS +=	icmpddi.o
-
-ICMP6_OBJS +=	icmp6ddi.o
-
-RTS_OBJS +=	rtsddi.o
-
-IP_ICMP_OBJS =	icmp.o icmp_opt_data.o
-IP_RTS_OBJS =	rts.o rts_opt_data.o
-IP_TCP_OBJS =	tcp.o tcp_fusion.o tcp_kssl.o tcp_opt_data.o tcp_sack.o
-IP_UDP_OBJS =	udp.o udp_opt_data.o
-IP_SCTP_OBJS =	sctp.o sctp_opt_data.o sctp_output.o \
-		sctp_init.o sctp_input.o sctp_cookie.o \
-		sctp_conn.o sctp_error.o sctp_snmp.o \
-		sctp_param.o sctp_shutdown.o sctp_common.o \
-		sctp_timer.o sctp_heartbeat.o sctp_hash.o \
-		sctp_ioc.o sctp_bind.o sctp_notify.o sctp_asconf.o \
-		sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o
-IP_ILB_OBJS =	ilb.o ilb_nat.o ilb_conn.o ilb_alg_hash.o ilb_alg_rr.o
-
-IP_OBJS +=	igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \
-		ip6_rts.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
-		ip_multi.o ip2mac.o ip_ndp.o ip_rts.o ip_srcid.o \
-		ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \
-		spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \
-		ip_sadb.o ip_ftable.o proto_set.o radix.o ip_dummy.o \
-		ip_helper_stream.o \
-		ip_output.o ip_input.o ip6_input.o ip6_output.o ip_arp.o \
-		conn_opt.o ip_attr.o ip_dce.o \
-		$(IP_ICMP_OBJS) \
-		$(IP_RTS_OBJS) \
-		$(IP_TCP_OBJS) \
-		$(IP_UDP_OBJS) \
-		$(IP_SCTP_OBJS) \
-		$(IP_ILB_OBJS)
-
-IP6_OBJS +=	ip6ddi.o
-
-HOOK_OBJS +=	hook.o
-
-NETI_OBJS +=	neti_impl.o neti_mod.o neti_stack.o
-
-KEYSOCK_OBJS +=	keysockddi.o keysock.o keysock_opt_data.o
-
-IPNET_OBJS +=	ipnet.o ipnet_bpf.o
-
-SPDSOCK_OBJS += spdsockddi.o spdsock.o spdsock_opt_data.o
-
-IPSECESP_OBJS += ipsecespddi.o ipsecesp.o
-
-IPSECAH_OBJS +=	ipsecahddi.o ipsecah.o sadb.o
-
-SPPP_OBJS +=	sppp.o sppp_dlpi.o sppp_mod.o s_common.o
-
-SPPPTUN_OBJS +=	sppptun.o sppptun_mod.o
-
-SPPPASYN_OBJS += spppasyn.o spppasyn_mod.o
-
-SPPPCOMP_OBJS += spppcomp.o spppcomp_mod.o deflate.o bsd-comp.o vjcompress.o \
-		zlib.o
-
-TCP_OBJS +=	tcpddi.o
-
-TCP6_OBJS +=	tcp6ddi.o
-
-SCTP_OBJS +=	sctpddi.o
-
-SCTP6_OBJS +=	sctp6ddi.o
-
-NCA_OBJS +=	ncaddi.o
-
-SDP_SOCK_MOD_OBJS += sockmod_sdp.o socksdp.o socksdpsubr.o
-
-SCTP_SOCK_MOD_OBJS += sockmod_sctp.o socksctp.o socksctpsubr.o
-
-PFP_SOCK_MOD_OBJS += sockmod_pfp.o
-
-RDS_OBJS +=	rdsddi.o rdssubr.o rds_opt.o rds_ioctl.o
-
-RDSIB_OBJS +=	rdsib.o rdsib_ib.o rdsib_cm.o rdsib_ep.o rdsib_buf.o \
-		rdsib_debug.o rdsib_sc.o
-
-ISER_OBJS +=	iser.o iser_cm.o iser_cq.o iser_ib.o iser_idm.o \
-		iser_resource.o iser_xfer.o
-
-UDP_OBJS +=	udpddi.o
-
-UDP6_OBJS +=	udp6ddi.o
-
-SY_OBJS +=	gentty.o
-
-TCO_OBJS +=	ticots.o
-
-TCOO_OBJS +=	ticotsord.o
-
-TCL_OBJS +=	ticlts.o
-
-TL_OBJS +=	tl.o
-
-DUMP_OBJS +=	dump.o
-
-BPF_OBJS +=	bpf.o bpf_filter.o bpf_mod.o bpf_dlt.o bpf_mac.o
-
-CLONE_OBJS +=	clone.o
-
-CN_OBJS +=	cons.o
-
-DLD_OBJS +=	dld_drv.o dld_proto.o dld_str.o dld_flow.o
-
-DLS_OBJS +=	dls.o dls_link.o dls_mod.o dls_stat.o dls_mgmt.o
-
-GLD_OBJS +=     gld.o gldutil.o
-
-MAC_OBJS +=     mac.o  mac_bcast.o mac_client.o mac_datapath_setup.o mac_flow.o \
-		mac_hio.o mac_mod.o mac_ndd.o mac_provider.o mac_sched.o \
-		mac_protect.o mac_soft_ring.o mac_stat.o mac_util.o
-
-MAC_6TO4_OBJS +=	mac_6to4.o
-
-MAC_ETHER_OBJS +=	mac_ether.o
-
-MAC_IPV4_OBJS +=	mac_ipv4.o
-
-MAC_IPV6_OBJS +=	mac_ipv6.o
-
-MAC_WIFI_OBJS +=	mac_wifi.o
-
-MAC_IB_OBJS +=		mac_ib.o
-
-IPTUN_OBJS +=	iptun_dev.o iptun_ctl.o iptun.o
-
-AGGR_OBJS +=	aggr_dev.o aggr_ctl.o aggr_grp.o aggr_port.o \
-		aggr_send.o aggr_recv.o aggr_lacp.o
-
-SOFTMAC_OBJS += softmac_main.o softmac_ctl.o softmac_capab.o \
-		softmac_dev.o softmac_stat.o softmac_pkt.o softmac_fp.o
-
-NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \
-		 net80211_output.o net80211_node.o net80211_crypto.o \
-		 net80211_crypto_none.o net80211_crypto_wep.o net80211_ioctl.o \
-		 net80211_crypto_tkip.o net80211_crypto_ccmp.o	\
-		 net80211_ht.o
-
-VNIC_OBJS +=	vnic_ctl.o vnic_dev.o
-
-SIMNET_OBJS +=	simnet.o
-
-IB_OBJS +=	ibnex.o ibnex_ioctl.o
-
-IBCM_OBJS +=	ibcm_impl.o ibcm_sm.o ibcm_ti.o ibcm_utils.o ibcm_path.o \
-		ibcm_arp.o ibcm_arp_link.o
-
-IBDM_OBJS +=	ibdm.o
-
-IBDMA_OBJS +=	ibdma.o
-
-IBMF_OBJS +=	ibmf.o ibmf_impl.o ibmf_dr.o ibmf_wqe.o ibmf_ud_dest.o ibmf_mod.o \
-		ibmf_send.o ibmf_recv.o ibmf_handlers.o ibmf_trans.o \
-		ibmf_timers.o ibmf_msg.o ibmf_utils.o ibmf_rmpp.o \
-		ibmf_saa.o ibmf_saa_impl.o ibmf_saa_utils.o ibmf_saa_events.o
-
-IBTL_OBJS +=	ibtl_impl.o ibtl_util.o ibtl_mem.o ibtl_handlers.o ibtl_qp.o \
-		ibtl_cq.o ibtl_wr.o ibtl_hca.o ibtl_chan.o ibtl_cm.o \
-		ibtl_mcg.o ibtl_ibnex.o ibtl_srq.o
-
-TAVOR_OBJS +=	tavor.o tavor_agents.o tavor_cfg.o tavor_ci.o tavor_cmd.o \
-		tavor_cq.o tavor_event.o tavor_ioctl.o tavor_misc.o \
-		tavor_mr.o tavor_qp.o tavor_qpmod.o tavor_rsrc.o \
-		tavor_srq.o tavor_stats.o tavor_umap.o tavor_wr.o
-
-HERMON_OBJS +=	hermon.o hermon_agents.o hermon_cfg.o hermon_ci.o hermon_cmd.o \
-		hermon_cq.o hermon_event.o hermon_ioctl.o hermon_misc.o \
-		hermon_mr.o hermon_qp.o hermon_qpmod.o hermon_rsrc.o \
-		hermon_srq.o hermon_stats.o hermon_umap.o hermon_wr.o \
-		hermon_fm.o
-
-DAPLT_OBJS +=	daplt.o
-
-KSTAT_OBJS +=	kstat.o
-
-KSYMS_OBJS +=	ksyms.o
-
-INSTANCE_OBJS += inst_sync.o
-
-IWSCN_OBJS +=	iwscons.o
-
-LOFI_OBJS +=	lofi.o LzmaDec.o
-
-FSSNAP_OBJS +=	fssnap.o
-
-FSSNAPIF_OBJS += fssnap_if.o
-
-MM_OBJS +=	mem.o
-
-PHYSMEM_OBJS +=	physmem.o
-
-OPTIONS_OBJS += options.o
-
-WINLOCK_OBJS +=	winlockio.o
-
-PM_OBJS +=	pm.o
-SRN_OBJS +=	srn.o
-
-PSEUDO_OBJS +=	pseudonex.o
-
-RAMDISK_OBJS +=	ramdisk.o
-
-LLC1_OBJS += llc1.o
-
-USBKBM_OBJS += usbkbm.o
-
-USBWCM_OBJS += usbwcm.o
-
-BOFI_OBJS += bofi.o
-
-HID_OBJS += hid.o
-
-HWA_RC_OBJS += hwarc.o
-
-USBSKEL_OBJS += usbskel.o
-
-USBVC_OBJS += usbvc.o usbvc_v4l2.o
-
-HIDPARSER_OBJS += hidparser.o
-
-USB_AC_OBJS += usb_ac.o
-
-USB_AS_OBJS += usb_as.o
-
-USB_AH_OBJS += usb_ah.o
-
-USBMS_OBJS += usbms.o
-
-USBPRN_OBJS += usbprn.o
-
-UGEN_OBJS += ugen.o
-
-USBSER_OBJS += usbser.o usbser_rseq.o
-
-USBSACM_OBJS += usbsacm.o
-
-USBSER_KEYSPAN_OBJS += usbser_keyspan.o keyspan_dsd.o keyspan_pipe.o
-
-USBS49_FW_OBJS += keyspan_49fw.o
-
-USBSPRL_OBJS += usbser_pl2303.o pl2303_dsd.o
-
-WUSB_CA_OBJS += wusb_ca.o
-
-USBFTDI_OBJS += usbser_uftdi.o uftdi_dsd.o
-
-WC_OBJS += wscons.o vcons.o
-
-VCONS_CONF_OBJS += vcons_conf.o
-
-SCSI_OBJS +=	scsi_capabilities.o scsi_confsubr.o scsi_control.o \
-		scsi_data.o scsi_fm.o scsi_hba.o scsi_reset_notify.o \
-		scsi_resource.o scsi_subr.o scsi_transport.o scsi_watch.o \
-		smp_transport.o
-
-SCSI_VHCI_OBJS +=		scsi_vhci.o mpapi_impl.o scsi_vhci_tpgs.o
-
-SCSI_VHCI_F_SYM_OBJS +=		sym.o
-
-SCSI_VHCI_F_TPGS_OBJS +=	tpgs.o
-
-SCSI_VHCI_F_ASYM_SUN_OBJS +=	asym_sun.o
-
-SCSI_VHCI_F_SYM_HDS_OBJS += 	sym_hds.o
-
-SCSI_VHCI_F_TAPE_OBJS +=	tape.o
-
-SCSI_VHCI_F_TPGS_TAPE_OBJS +=	tpgs_tape.o
-
-SGEN_OBJS +=	sgen.o
-
-SMP_OBJS +=	smp.o
-
-SATA_OBJS +=	sata.o
-
-USBA_OBJS +=	hcdi.o	usba.o	usbai.o hubdi.o parser.o genconsole.o \
-		usbai_pipe_mgmt.o usbai_req.o usbai_util.o usbai_register.o \
-		usba_devdb.o usba10_calls.o usba_ugen.o whcdi.o wa.o
-USBA_WITHOUT_WUSB_OBJS +=	hcdi.o	usba.o	usbai.o hubdi.o parser.o genconsole.o \
-		usbai_pipe_mgmt.o usbai_req.o usbai_util.o usbai_register.o \
-		usba_devdb.o usba10_calls.o usba_ugen.o
-
-USBA10_OBJS +=	usba10.o
-
-RSM_OBJS +=	rsm.o	rsmka_pathmanager.o	rsmka_util.o
-
-RSMOPS_OBJS +=	rsmops.o
-
-S1394_OBJS +=	t1394.o t1394_errmsg.o s1394.o s1394_addr.o s1394_asynch.o \
-		s1394_bus_reset.o s1394_cmp.o s1394_csr.o s1394_dev_disc.o \
-		s1394_fa.o s1394_fcp.o \
-		s1394_hotplug.o s1394_isoch.o s1394_misc.o h1394.o nx1394.o
-
-HCI1394_OBJS +=	hci1394.o hci1394_async.o hci1394_attach.o hci1394_buf.o \
-		hci1394_csr.o hci1394_detach.o hci1394_extern.o \
-		hci1394_ioctl.o hci1394_isoch.o hci1394_isr.o \
-		hci1394_ixl_comp.o hci1394_ixl_isr.o hci1394_ixl_misc.o \
-		hci1394_ixl_update.o hci1394_misc.o hci1394_ohci.o \
-		hci1394_q.o hci1394_s1394if.o hci1394_tlabel.o \
-		hci1394_tlist.o hci1394_vendor.o
-
-AV1394_OBJS +=	av1394.o av1394_as.o av1394_async.o av1394_cfgrom.o \
-		av1394_cmp.o av1394_fcp.o av1394_isoch.o av1394_isoch_chan.o \
-		av1394_isoch_recv.o av1394_isoch_xmit.o av1394_list.o \
-		av1394_queue.o
-
-DCAM1394_OBJS += dcam.o dcam_frame.o dcam_param.o dcam_reg.o \
-		dcam_ring_buff.o
-
-SCSA1394_OBJS += hba.o sbp2_driver.o sbp2_bus.o
-
-SBP2_OBJS += cfgrom.o sbp2.o
-
-PMODEM_OBJS += pmodem.o pmodem_cis.o cis.o cis_callout.o cis_handlers.o cis_params.o
-
-DSW_OBJS +=	dsw.o dsw_dev.o ii_tree.o
-
-NCALL_OBJS +=	ncall.o \
-		ncall_stub.o
-
-RDC_OBJS +=	rdc.o \
-		rdc_dev.o \
-		rdc_io.o \
-		rdc_clnt.o \
-		rdc_prot_xdr.o \
-		rdc_svc.o \
-		rdc_bitmap.o \
-		rdc_health.o \
-		rdc_subr.o \
-		rdc_diskq.o
-
-RDCSRV_OBJS +=	rdcsrv.o
-
-RDCSTUB_OBJS += rdc_stub.o
-
-SDBC_OBJS +=	sd_bcache.o \
-		sd_bio.o \
-		sd_conf.o \
-		sd_ft.o \
-		sd_hash.o \
-		sd_io.o \
-		sd_misc.o \
-		sd_pcu.o \
-		sd_tdaemon.o \
-		sd_trace.o \
-		sd_iob_impl0.o \
-		sd_iob_impl1.o \
-		sd_iob_impl2.o \
-		sd_iob_impl3.o \
-		sd_iob_impl4.o \
-		sd_iob_impl5.o \
-		sd_iob_impl6.o \
-		sd_iob_impl7.o \
-		safestore.o \
-		safestore_ram.o
-
-NSCTL_OBJS +=	nsctl.o \
-		nsc_cache.o \
-		nsc_disk.o \
-		nsc_dev.o \
-		nsc_freeze.o \
-		nsc_gen.o \
-		nsc_mem.o \
-		nsc_ncallio.o \
-		nsc_power.o \
-		nsc_resv.o \
-		nsc_rmspin.o \
-		nsc_solaris.o \
-		nsc_trap.o \
-		nsc_list.o
-UNISTAT_OBJS +=	spuni.o \
-		spcs_s_k.o
-
-NSKERN_OBJS +=	nsc_ddi.o \
-		nsc_proc.o \
-		nsc_raw.o \
-		nsc_thread.o \
-		nskernd.o
-
-SV_OBJS +=	sv.o
-
-PMCS_OBJS += pmcs_attach.o pmcs_ds.o pmcs_intr.o pmcs_nvram.o pmcs_sata.o \
-		pmcs_scsa.o pmcs_smhba.o pmcs_subr.o pmcs_fwlog.o
-
-PMCS8001FW_C_OBJS +=	pmcs_fw_hdr.o
-PMCS8001FW_OBJS +=		$(PMCS8001FW_C_OBJS) SPCBoot.o ila.o firmware.o
-
-#
-#	Build up defines and paths.
-
-ST_OBJS +=	st.o	st_conf.o
-
-EMLXS_OBJS +=	emlxs_clock.o emlxs_dfc.o emlxs_dhchap.o emlxs_diag.o \
-		emlxs_download.o emlxs_dump.o emlxs_els.o emlxs_event.o \
-		emlxs_fcp.o emlxs_fct.o emlxs_hba.o emlxs_ip.o \
-		emlxs_mbox.o emlxs_mem.o emlxs_msg.o emlxs_node.o \
-		emlxs_pkt.o emlxs_sli3.o emlxs_sli4.o emlxs_solaris.o \
-		emlxs_thread.o
-
-EMLXS_FW_OBJS +=	emlxs_fw.o
-
-OCE_OBJS +=	oce_buf.o oce_fm.o oce_gld.o oce_hw.o oce_intr.o oce_main.o \
-		oce_mbx.o oce_mq.o oce_queue.o oce_rx.o oce_stat.o oce_tx.o \
-		oce_utils.o
-
-FCT_OBJS += discovery.o fct.o
-
-QLT_OBJS += 2400.o 2500.o 8100.o qlt.o qlt_dma.o
-
-SRPT_OBJS += srpt_mod.o srpt_ch.o srpt_cm.o srpt_ioc.o srpt_stp.o
-
-FCOE_OBJS += fcoe.o fcoe_eth.o fcoe_fc.o
-
-FCOET_OBJS += fcoet.o fcoet_eth.o fcoet_fc.o
-
-FCOEI_OBJS += fcoei.o fcoei_eth.o fcoei_lv.o
-
-ISCSIT_SHARED_OBJS += \
-		iscsit_common.o
-
-ISCSIT_OBJS +=	$(ISCSIT_SHARED_OBJS) \
-		iscsit.o iscsit_tgt.o iscsit_sess.o iscsit_login.o \
-		iscsit_text.o iscsit_isns.o iscsit_radiusauth.o \
-		iscsit_radiuspacket.o iscsit_auth.o iscsit_authclient.o
-
-PPPT_OBJS +=	alua_ic_if.o pppt.o pppt_msg.o pppt_tgt.o
-
-STMF_OBJS += lun_map.o stmf.o
-
-STMF_SBD_OBJS += sbd.o sbd_scsi.o sbd_pgr.o
-
-SYSMSG_OBJS +=	sysmsg.o
-
-SES_OBJS +=	ses.o ses_sen.o ses_safte.o ses_ses.o
-
-TNF_OBJS +=	tnf_buf.o	tnf_trace.o	tnf_writer.o	trace_init.o \
-		trace_funcs.o	tnf_probe.o	tnf.o
-
-LOGINDMUX_OBJS += logindmux.o
-
-DEVINFO_OBJS += devinfo.o
-
-DEVPOLL_OBJS += devpoll.o
-
-DEVPOOL_OBJS += devpool.o
-
-I8042_OBJS +=	i8042.o
-
-KB8042_OBJS +=	\
-		at_keyprocess.o	\
-		kb8042.o	\
-		kb8042_keytables.o
-
-MOUSE8042_OBJS += mouse8042.o
-
-FDC_OBJS +=	fdc.o
-
-ASY_OBJS +=	asy.o
-
-ECPP_OBJS +=	ecpp.o
-
-VUIDM3P_OBJS += vuidmice.o vuidm3p.o
-
-VUIDM4P_OBJS += vuidmice.o vuidm4p.o
-
-VUIDM5P_OBJS += vuidmice.o vuidm5p.o
-
-VUIDPS2_OBJS += vuidmice.o vuidps2.o
-
-HPCSVC_OBJS += hpcsvc.o
-
-PCIE_MISC_OBJS += pcie.o pcie_fault.o pcie_hp.o pciehpc.o pcishpc.o pcie_pwr.o pciev.o
-
-PCIHPNEXUS_OBJS += pcihp.o
-
-OPENEEPR_OBJS += openprom.o
-
-RANDOM_OBJS += random.o
-
-PSHOT_OBJS += pshot.o
-
-GEN_DRV_OBJS += gen_drv.o
-
-TCLIENT_OBJS +=	tclient.o
-
-TPHCI_OBJS += tphci.o
-
-TVHCI_OBJS += tvhci.o
-
-EMUL64_OBJS += emul64.o emul64_bsd.o
-
-FCP_OBJS += fcp.o
-
-FCIP_OBJS += fcip.o
-
-FCSM_OBJS += fcsm.o
-
-FCTL_OBJS += fctl.o
-
-FP_OBJS += fp.o
-
-QLC_OBJS += ql_api.o ql_debug.o ql_hba_fru.o ql_init.o ql_iocb.o ql_ioctl.o \
-	ql_isr.o ql_mbx.o ql_xioctl.o ql_fw_table.o
-
-QLC_FW_2200_OBJS += ql_fw_2200.o
-
-QLC_FW_2300_OBJS += ql_fw_2300.o
-
-QLC_FW_2400_OBJS += ql_fw_2400.o
-
-QLC_FW_2500_OBJS += ql_fw_2500.o
-
-QLC_FW_6322_OBJS += ql_fw_6322.o
-
-QLC_FW_8100_OBJS += ql_fw_8100.o
-
-QLGE_OBJS += qlge.o qlge_dbg.o qlge_flash.o qlge_gld.o qlge_mpi.o
-
-ZCONS_OBJS += zcons.o
-
-NV_SATA_OBJS += nv_sata.o
-
-SI3124_OBJS += si3124.o
-
-AHCI_OBJS += ahci.o
-
-PCIIDE_OBJS += pci-ide.o
-
-PCEPP_OBJS += pcepp.o
-
-CPC_OBJS += cpc.o
-
-CPUID_OBJS += cpuid_drv.o
-
-SYSEVENT_OBJS += sysevent.o
-
-BL_OBJS += bl.o
-
-DRM_OBJS += drm_sunmod.o drm_kstat.o drm_agpsupport.o \
-	    drm_auth.o drm_bufs.o drm_context.o drm_dma.o \
-	    drm_drawable.o drm_drv.o drm_fops.o drm_ioctl.o drm_irq.o \
-	    drm_lock.o drm_memory.o drm_msg.o drm_pci.o drm_scatter.o \
-	    drm_cache.o drm_gem.o drm_mm.o ati_pcigart.o 
-
-FM_OBJS += devfm.o devfm_machdep.o
-
-RTLS_OBJS +=	rtls.o
-
-#
-#			exec modules
-#
-AOUTEXEC_OBJS +=aout.o
-
-ELFEXEC_OBJS +=	elf.o elf_notes.o old_notes.o
-
-INTPEXEC_OBJS +=intp.o
-
-SHBINEXEC_OBJS +=shbin.o
-
-JAVAEXEC_OBJS +=java.o
-
-#
-#			file system modules
-#
-AUTOFS_OBJS +=	auto_vfsops.o auto_vnops.o auto_subr.o auto_xdr.o auto_sys.o
-
-CACHEFS_OBJS +=	cachefs_cnode.o		cachefs_cod.o \
-		cachefs_dir.o		cachefs_dlog.o	cachefs_filegrp.o \
-		cachefs_fscache.o	cachefs_ioctl.o	cachefs_log.o \
-		cachefs_module.o \
-		cachefs_noopc.o		cachefs_resource.o \
-		cachefs_strict.o \
-		cachefs_subr.o		cachefs_vfsops.o \
-		cachefs_vnops.o
-
-DCFS_OBJS +=	dc_vnops.o
-
-DEVFS_OBJS +=	devfs_subr.o	devfs_vfsops.o	devfs_vnops.o
-
-DEV_OBJS  +=	sdev_subr.o	sdev_vfsops.o	sdev_vnops.o	\
-		sdev_ptsops.o	sdev_zvolops.o	sdev_comm.o	\
-		sdev_profile.o	sdev_ncache.o	sdev_netops.o	\
-		sdev_ipnetops.o	\
-		sdev_vtops.o
-
-CTFS_OBJS +=	ctfs_all.o ctfs_cdir.o ctfs_ctl.o ctfs_event.o \
-		ctfs_latest.o ctfs_root.o ctfs_sym.o ctfs_tdir.o ctfs_tmpl.o
-
-OBJFS_OBJS +=	objfs_vfs.o	objfs_root.o	objfs_common.o \
-		objfs_odir.o	objfs_data.o
-
-FDFS_OBJS +=	fdops.o
-
-FIFO_OBJS +=	fifosubr.o	fifovnops.o
-
-PIPE_OBJS +=	pipe.o
-
-HSFS_OBJS +=	hsfs_node.o	hsfs_subr.o	hsfs_vfsops.o	hsfs_vnops.o \
-		hsfs_susp.o	hsfs_rrip.o	hsfs_susp_subr.o
-
-LOFS_OBJS +=	lofs_subr.o	lofs_vfsops.o	lofs_vnops.o
-
-NAMEFS_OBJS +=	namevfs.o	namevno.o
-
-NFS_OBJS +=	nfs_client.o	nfs_common.o	nfs_dump.o \
-		nfs_subr.o	nfs_vfsops.o	nfs_vnops.o \
-		nfs_xdr.o	nfs_sys.o	nfs_strerror.o \
-		nfs3_vfsops.o	nfs3_vnops.o	nfs3_xdr.o \
-		nfs_acl_vnops.o	nfs_acl_xdr.o	nfs4_vfsops.o \
-		nfs4_vnops.o	nfs4_xdr.o 	nfs4_idmap.o \
-		nfs4_shadow.o	nfs4_subr.o \
-		nfs4_attr.o	nfs4_rnode.o	nfs4_client.o \
-		nfs4_acache.o	nfs4_common.o	nfs4_client_state.o \
-		nfs4_callback.o	nfs4_recovery.o nfs4_client_secinfo.o \
-		nfs4_client_debug.o	nfs_stats.o \
-		nfs4_acl.o	nfs4_stub_vnops.o	nfs_cmd.o
-
-NFSSRV_OBJS +=	nfs_server.o	nfs_srv.o	nfs3_srv.o \
-		nfs_acl_srv.o	nfs_auth.o	nfs_auth_xdr.o \
-		nfs_export.o	nfs_log.o	nfs_log_xdr.o \
-		nfs4_srv.o	nfs4_state.o	nfs4_srv_attr.o \
-		nfs4_srv_ns.o	nfs4_db.o	nfs4_srv_deleg.o \
-		nfs4_deleg_ops.o nfs4_srv_readdir.o nfs4_dispatch.o
-
-SMBSRV_SHARED_OBJS += \
-		smb_inet.o \
-		smb_match.o \
-		smb_msgbuf.o \
-		smb_oem.o \
-		smb_string.o \
-		smb_utf8.o \
-		smb_common_door_decode.o \
-		smb_xdr_utils.o \
-		smb_token.o \
-		smb_token_xdr.o \
-		smb_sid.o \
-		smb_status_xlat.o \
-		smb_native.o \
-		smb_netbios_util.o \
-		smb_share_door_decode.o
-
-SMBSRV_OBJS +=	$(SMBSRV_SHARED_OBJS)			\
-		smb_acl.o				\
-		smb_alloc.o				\
-		smb_close.o				\
-		smb_common_open.o			\
-		smb_common_transact.o			\
-		smb_create.o				\
-		smb_delete.o				\
-		smb_directory.o				\
-		smb_dispatch.o				\
-		smb_echo.o				\
-		smb_fem.o				\
-		smb_find.o				\
-		smb_flush.o				\
-		smb_fsops.o				\
-		smb_init.o				\
-		smb_kdoor_encdec.o			\
-		smb_kdoor_clnt.o			\
-		smb_kshare.o				\
-		smb_lock.o				\
-		smb_lock_byte_range.o			\
-		smb_locking_andx.o			\
-		smb_logoff_andx.o			\
-		smb_mangle_name.o			\
-		smb_mbuf_marshaling.o			\
-		smb_mbuf_util.o				\
-		smb_negotiate.o				\
-		smb_net.o				\
-		smb_node.o				\
-                smb_nt_cancel.o				\
-		smb_nt_create_andx.o			\
-                smb_nt_transact_create.o		\
-		smb_nt_transact_ioctl.o			\
-		smb_nt_transact_notify_change.o		\
-		smb_nt_transact_security.o		\
-                smb_odir.o				\
-		smb_ofile.o				\
-		smb_open_andx.o				\
-		smb_opipe.o				\
-		smb_oplock.o				\
-		smb_pathname.o				\
-		smb_print.o				\
-		smb_process_exit.o			\
-		smb_query_fileinfo.o			\
-		smb_query_information_disk.o		\
-		smb_read.o				\
-		smb_rename.o				\
-		smb_sd.o				\
-		smb_seek.o				\
-		smb_server.o				\
-		smb_session.o				\
-		smb_session_setup_andx.o		\
-		smb_set_fileinfo.o			\
-		smb_signing.o				\
-                smb_tree.o				\
-		smb_trans2_create_directory.o		\
-		smb_trans2_dfs.o			\
-		smb_trans2_find.o			\
-		smb_trans2_query_fs_information.o	\
-		smb_tree_connect.o			\
-		smb_unlock_byte_range.o			\
-		smb_upcalls.o				\
-		smb_user.o				\
-                smb_util.o				\
-		smb_vfs.o				\
-		smb_vops.o				\
-		smb_vss.o				\
-		smb_write.o				\
-		smb_write_raw.o				\
-		smb_xlate.o
-
-PCFS_OBJS +=	pc_alloc.o	pc_dir.o	pc_node.o	pc_subr.o \
-		pc_vfsops.o	pc_vnops.o
-
-PROC_OBJS +=	prcontrol.o	prioctl.o	prsubr.o	prusrio.o \
-		prvfsops.o	prvnops.o
-
-MNTFS_OBJS +=	mntvfsops.o	mntvnops.o
-
-SHAREFS_OBJS +=	sharetab.o	sharefs_vfsops.o	sharefs_vnops.o
-
-SPEC_OBJS +=	specsubr.o	specvfsops.o	specvnops.o
-
-SOCK_OBJS +=	socksubr.o	sockvfsops.o	sockparams.o	\
-		socksyscalls.o	socktpi.o	sockstr.o	sockssl.o \
-		sockcommon_vnops.o	sockcommon_subr.o \
-		sockcommon_sops.o	sockcommon.o	\
-		sock_notsupp.o	socknotify.o \
-		nl7c.o		nl7curi.o	nl7chttp.o	nl7clogd.o \
-		nl7cnca.o	sodirect.o
-
-TMPFS_OBJS +=	tmp_dir.o	tmp_subr.o	tmp_tnode.o	tmp_vfsops.o \
-		tmp_vnops.o
-
-UDFS_OBJS +=	udf_alloc.o	udf_bmap.o	udf_dir.o	\
-		udf_inode.o	udf_subr.o	udf_vfsops.o	\
-		udf_vnops.o
-
-UFS_OBJS +=	ufs_alloc.o	ufs_bmap.o	ufs_dir.o 	ufs_xattr.o \
-		ufs_inode.o	ufs_subr.o	ufs_tables.o	ufs_vfsops.o \
-		ufs_vnops.o	quota.o		quotacalls.o	quota_ufs.o \
-		ufs_filio.o	ufs_lockfs.o	ufs_thread.o	ufs_trans.o \
-		ufs_acl.o	ufs_panic.o	ufs_directio.o	ufs_log.o \
-		ufs_extvnops.o	ufs_snap.o	lufs.o		lufs_thread.o \
-		lufs_log.o	lufs_map.o	lufs_top.o	lufs_debug.o
-VSCAN_OBJS +=	vscan_drv.o	vscan_svc.o vscan_door.o
-
-NSMB_OBJS +=	smb_conn.o	smb_dev.o	smb_iod.o	smb_pass.o \
-		smb_rq.o	smb_sign.o	smb_smb.o	smb_subrs.o \
-		smb_time.o	smb_tran.o	smb_trantcp.o	smb_usr.o \
-		subr_mchain.o
-
-SMBFS_COMMON_OBJS += smbfs_ntacl.o
-SMBFS_OBJS +=	smbfs_vfsops.o	smbfs_vnops.o	smbfs_node.o	\
-		smbfs_acl.o	smbfs_client.o	smbfs_smb.o	\
-		smbfs_subr.o	smbfs_subr2.o	\
-		smbfs_rwlock.o	smbfs_xattr.o	\
-		$(SMBFS_COMMON_OBJS)
-
-
-#
-#			LVM modules
-#
-MD_OBJS	+= md.o md_error.o md_ioctl.o md_mddb.o md_names.o \
-	md_med.o md_rename.o md_subr.o
-
-MD_COMMON_OBJS = md_convert.o md_crc.o md_revchk.o
-
-MD_DERIVED_OBJS = metamed_xdr.o meta_basic_xdr.o
-
-SOFTPART_OBJS += sp.o sp_ioctl.o
-
-STRIPE_OBJS += stripe.o stripe_ioctl.o
-
-HOTSPARES_OBJS += hotspares.o
-
-RAID_OBJS += raid.o raid_ioctl.o raid_replay.o raid_resync.o raid_hotspare.o
-
-MIRROR_OBJS += mirror.o mirror_ioctl.o mirror_resync.o
-
-NOTIFY_OBJS += md_notify.o
-
-TRANS_OBJS += mdtrans.o trans_ioctl.o trans_log.o
+#
+# This Makefile defines all file modules for the directory uts/common
+# and its children. These are the source files which may be considered
+# common to all SunOS systems.
 
 ZFS_COMMON_OBJS +=		\
 	arc.o			\
 	bplist.o		\
+	blkptr.o		\
+	bpobj.o			\
+	bptree.o		\
+	bqueue.o		\
 	dbuf.o			\
 	ddt.o			\
 	ddt_zap.o		\
 	dmu.o			\
+	dmu_diff.o		\
 	dmu_send.o		\
 	dmu_object.o		\
 	dmu_objset.o		\
@@ -1321,18 +51,28 @@ ZFS_COMMON_OBJS +=		\
 	dmu_tx.o		\
 	dnode.o			\
 	dnode_sync.o		\
+	dsl_bookmark.o		\
 	dsl_dir.o		\
 	dsl_dataset.o		\
+	dsl_deadlist.o		\
+	dsl_destroy.o		\
 	dsl_pool.o		\
 	dsl_synctask.o		\
+	dsl_userhold.o		\
 	dmu_zfetch.o		\
 	dsl_deleg.o		\
 	dsl_prop.o		\
-	dsl_scrub.o		\
+	dsl_scan.o		\
+	zfeature.o		\
 	gzip.o			\
+	lz4.o			\
 	lzjb.o			\
 	metaslab.o		\
+	multilist.o		\
+	range_tree.o		\
 	refcount.o		\
+	rrwlock.o		\
+	sa.o			\
 	sha256.o		\
 	spa.o			\
 	spa_config.o		\
@@ -1340,6 +80,7 @@ ZFS_COMMON_OBJS +=		\
 	spa_history.o		\
 	spa_misc.o		\
 	space_map.o		\
+	space_reftree.o		\
 	txg.o			\
 	uberblock.o		\
 	unique.o		\
@@ -1356,626 +97,41 @@ ZFS_COMMON_OBJS +=		\
 	zap_leaf.o		\
 	zap_micro.o		\
 	zfs_byteswap.o		\
+	zfs_debug.o		\
 	zfs_fm.o		\
 	zfs_fuid.o		\
+	zfs_sa.o		\
 	zfs_znode.o		\
 	zil.o			\
 	zio.o			\
 	zio_checksum.o		\
 	zio_compress.o		\
 	zio_inject.o		\
-	zle.o
+	zle.o			\
+	zrlock.o
 
 ZFS_SHARED_OBJS +=		\
-	zfs_namecheck.o		\
-	zfs_deleg.o		\
-	zfs_prop.o		\
+	zfeature_common.o	\
 	zfs_comutil.o		\
+	zfs_deleg.o		\
 	zfs_fletcher.o		\
+	zfs_namecheck.o		\
+	zfs_prop.o		\
 	zpool_prop.o		\
 	zprop_common.o
 
 ZFS_OBJS +=			\
 	$(ZFS_COMMON_OBJS)	\
 	$(ZFS_SHARED_OBJS)	\
-	vdev_disk.o		\
 	zfs_acl.o		\
 	zfs_ctldir.o		\
 	zfs_dir.o		\
 	zfs_ioctl.o		\
+	zfs_ioctl_compat.o	\
 	zfs_log.o		\
+	zfs_onexit.o		\
 	zfs_replay.o		\
 	zfs_rlock.o		\
-	rrwlock.o		\
 	zfs_vfsops.o		\
 	zfs_vnops.o		\
 	zvol.o
-
-ZUT_OBJS +=			\
-	zut.o
-
-#
-#			streams modules
-#
-BUFMOD_OBJS	+=	bufmod.o
-
-CONNLD_OBJS +=	connld.o
-
-DEDUMP_OBJS +=	dedump.o
-
-DRCOMPAT_OBJS +=	drcompat.o
-
-LDLINUX_OBJS +=	ldlinux.o
-
-LDTERM_OBJS +=	ldterm.o uwidth.o
-
-PCKT_OBJS +=	pckt.o
-
-PFMOD_OBJS +=	pfmod.o
-
-PTEM_OBJS +=	ptem.o
-
-REDIRMOD_OBJS += strredirm.o
-
-TIMOD_OBJS +=	timod.o
-
-TIRDWR_OBJS +=	tirdwr.o
-
-TTCOMPAT_OBJS +=ttcompat.o
-
-LOG_OBJS +=	log.o
-
-PIPEMOD_OBJS +=	pipemod.o
-
-RPCMOD_OBJS +=	rpcmod.o	clnt_cots.o	clnt_clts.o \
-		clnt_gen.o	clnt_perr.o	mt_rpcinit.o	rpc_calmsg.o \
-		rpc_prot.o	rpc_sztypes.o	rpc_subr.o	rpcb_prot.o \
-		svc.o		svc_clts.o	svc_gen.o	svc_cots.o \
-		rpcsys.o	xdr_sizeof.o	clnt_rdma.o	svc_rdma.o \
-		xdr_rdma.o	rdma_subr.o	xdrrdma_sizeof.o
-
-TLIMOD_OBJS +=	tlimod.o	t_kalloc.o	t_kbind.o	t_kclose.o \
-		t_kconnect.o	t_kfree.o	t_kgtstate.o	t_kopen.o \
-		t_krcvudat.o	t_ksndudat.o	t_kspoll.o	t_kunbind.o \
-		t_kutil.o
-
-RLMOD_OBJS += rlmod.o
-
-TELMOD_OBJS += telmod.o
-
-CRYPTMOD_OBJS += cryptmod.o
-
-KB_OBJS +=	kbd.o		keytables.o
-
-#
-#			ID mapping module
-#
-IDMAP_OBJS +=	idmap_mod.o	idmap_kapi.o	idmap_xdr.o	idmap_cache.o
-
-#
-#			scheduling class modules
-#
-SDC_OBJS +=		sysdc.o
-
-RT_OBJS +=		rt.o
-RT_DPTBL_OBJS +=	rt_dptbl.o
-
-TS_OBJS +=		ts.o
-TS_DPTBL_OBJS +=	ts_dptbl.o
-
-IA_OBJS +=		ia.o
-
-FSS_OBJS +=		fss.o
-
-FX_OBJS +=		fx.o
-FX_DPTBL_OBJS +=	fx_dptbl.o
-
-#
-#			Inter-Process Communication (IPC) modules
-#
-IPC_OBJS +=	ipc.o
-
-IPCMSG_OBJS +=	msg.o
-
-IPCSEM_OBJS +=	sem.o
-
-IPCSHM_OBJS +=	shm.o
-
-#
-#			bignum module
-#
-COMMON_BIGNUM_OBJS += bignum_mod.o bignumimpl.o
-
-BIGNUM_OBJS += $(COMMON_BIGNUM_OBJS) $(BIGNUM_PSR_OBJS)
-
-#
-#			kernel cryptographic framework
-#
-KCF_OBJS +=	kcf.o kcf_callprov.o kcf_cbufcall.o kcf_cipher.o kcf_crypto.o \
-		kcf_cryptoadm.o kcf_ctxops.o kcf_digest.o kcf_dual.o \
-		kcf_keys.o kcf_mac.o kcf_mech_tabs.o kcf_miscapi.o \
-		kcf_object.o kcf_policy.o kcf_prov_lib.o kcf_prov_tabs.o \
-		kcf_sched.o kcf_session.o kcf_sign.o kcf_spi.o kcf_verify.o \
-		kcf_random.o modes.o ecb.o cbc.o ctr.o ccm.o gcm.o fips_random.o
-
-CRYPTOADM_OBJS += cryptoadm.o
-
-CRYPTO_OBJS +=	crypto.o
-
-DPROV_OBJS +=	dprov.o
-
-DCA_OBJS +=	dca.o dca_3des.o dca_debug.o dca_dsa.o dca_kstat.o dca_rng.o \
-		dca_rsa.o
-
-AESPROV_OBJS +=	aes.o aes_impl.o aes_modes.o fips_aes_util.o
-
-ARCFOURPROV_OBJS += arcfour.o arcfour_crypt.o
-
-BLOWFISHPROV_OBJS += blowfish.o blowfish_impl.o
-
-ECCPROV_OBJS += ecc.o ec.o ec2_163.o ec2_mont.o ecdecode.o ecl_mult.o \
-		ecp_384.o ecp_jac.o ec2_193.o ecl.o ecp_192.o ecp_521.o \
-		ecp_jm.o ec2_233.o ecl_curve.o ecp_224.o ecp_aff.o \
-		ecp_mont.o ec2_aff.o ec_naf.o ecl_gf.o ecp_256.o mp_gf2m.o \
-		mpi.o mplogic.o mpmontg.o mpprime.o oid.o \
-		secitem.o ec2_test.o ecp_test.o fips_ecc_util.o
-
-RSAPROV_OBJS += rsa.o rsa_impl.o fips_rsa_util.o
-
-SWRANDPROV_OBJS += swrand.o fips_random_util.o
-
-#
-#			kernel SSL
-#
-KSSL_OBJS +=	kssl.o ksslioctl.o ksslapi.o ksslrec.o
-
-#
-#			misc. modules
-#
-
-C2AUDIT_OBJS +=	adr.o audit.o audit_event.o audit_io.o \
-		audit_path.o audit_start.o audit_syscalls.o audit_token.o \
-		audit_mem.o audit_zone.o
-
-PCIC_OBJS +=	pcic.o
-
-RPCSEC_OBJS +=	secmod.o	sec_clnt.o	sec_svc.o	sec_gen.o \
-		auth_des.o	auth_kern.o	auth_none.o	auth_loopb.o\
-		authdesprt.o	authdesubr.o	authu_prot.o \
-		key_call.o	key_prot.o	svc_authu.o	svcauthdes.o
-
-RPCSEC_GSS_OBJS +=	rpcsec_gssmod.o rpcsec_gss.o rpcsec_gss_misc.o \
-		rpcsec_gss_utils.o svc_rpcsec_gss.o
-
-CONSCONFIG_OBJS += consconfig.o
-
-CONSCONFIG_DACF_OBJS  += consconfig_dacf.o consplat.o
-
-TEM_OBJS += tem.o tem_safe.o 6x10.o 7x14.o 12x22.o
-
-KBTRANS_OBJS +=				\
-		kbtrans.o		\
-		kbtrans_keytables.o	\
-		kbtrans_polled.o	\
-		kbtrans_streams.o	\
-		usb_keytables.o
-
-KGSSD_OBJS +=	gssd_clnt_stubs.o gssd_handle.o gssd_prot.o \
-		gss_display_name.o gss_release_name.o gss_import_name.o \
-		gss_release_buffer.o gss_release_oid_set.o gen_oids.o gssdmod.o
-
-KGSSD_DERIVED_OBJS = gssd_xdr.o
-
-KGSS_DUMMY_OBJS += dmech.o
-
-KSOCKET_OBJS +=	ksocket.o ksocket_mod.o
-
-CRYPTO= cksumtypes.o decrypt.o encrypt.o encrypt_length.o etypes.o \
-	nfold.o verify_checksum.o prng.o block_size.o make_checksum.o\
-	checksum_length.o hmac.o default_state.o mandatory_sumtype.o
-
-# crypto/des
-CRYPTO_DES= f_cbc.o f_cksum.o f_parity.o weak_key.o d3_cbc.o ef_crypto.o
-
-CRYPTO_DK= checksum.o derive.o dk_decrypt.o dk_encrypt.o
-
-CRYPTO_ARCFOUR= k5_arcfour.o
-
-# crypto/enc_provider
-CRYPTO_ENC= des.o des3.o arcfour_provider.o aes_provider.o
-
-# crypto/hash_provider
-CRYPTO_HASH= hash_kef_generic.o hash_kmd5.o hash_crc32.o hash_ksha1.o
-
-# crypto/keyhash_provider
-CRYPTO_KEYHASH= descbc.o k5_kmd5des.o k_hmac_md5.o
-
-# crypto/crc32
-CRYPTO_CRC32= crc32.o
-
-# crypto/old
-CRYPTO_OLD= old_decrypt.o old_encrypt.o
-
-# crypto/raw
-CRYPTO_RAW= raw_decrypt.o raw_encrypt.o
-
-K5_KRB= kfree.o copy_key.o \
-	parse.o init_ctx.o \
-	ser_adata.o ser_addr.o \
-	ser_auth.o ser_cksum.o \
-	ser_key.o ser_princ.o \
-	serialize.o unparse.o \
-	ser_actx.o
-
-K5_OS=  timeofday.o toffset.o \
-	init_os_ctx.o c_ustime.o
-
-SEAL=
-# EXPORT DELETE START
-SEAL=	seal.o unseal.o
-# EXPORT DELETE END
-
-MECH=	delete_sec_context.o \
-	import_sec_context.o \
-	gssapi_krb5.o \
-	k5seal.o k5unseal.o k5sealv3.o \
-	ser_sctx.o \
-	sign.o \
-	util_crypt.o  \
-	util_validate.o  util_ordering.o  \
-	util_seqnum.o util_set.o util_seed.o \
-	wrap_size_limit.o verify.o
-
-
-
-MECH_GEN= util_token.o
-
-
-KGSS_KRB5_OBJS += krb5mech.o \
-	$(MECH) $(SEAL) $(MECH_GEN) \
-	$(CRYPTO) $(CRYPTO_DES) $(CRYPTO_DK) $(CRYPTO_ARCFOUR) \
-	$(CRYPTO_ENC) $(CRYPTO_HASH) \
-	$(CRYPTO_KEYHASH) $(CRYPTO_CRC32) \
-	$(CRYPTO_OLD) \
-	$(CRYPTO_RAW) $(K5_KRB) $(K5_OS)
-
-DES_OBJS +=	des_crypt.o des_impl.o des_ks.o des_soft.o fips_des_util.o
-
-DLBOOT_OBJS +=	bootparam_xdr.o nfs_dlinet.o scan.o
-
-KRTLD_OBJS +=	kobj_bootflags.o getoptstr.o \
-		kobj.o kobj_kdi.o kobj_lm.o kobj_subr.o
-
-MOD_OBJS +=	modctl.o modsubr.o modsysfile.o modconf.o modhash.o
-
-STRPLUMB_OBJS += strplumb.o
-
-CPR_OBJS +=	cpr_driver.o cpr_dump.o \
-		cpr_main.o cpr_misc.o cpr_mod.o cpr_stat.o \
-		cpr_uthread.o
-
-PROF_OBJS +=	prf.o
-
-SE_OBJS += se_driver.o
-
-SYSACCT_OBJS +=	acct.o
-
-ACCTCTL_OBJS +=	acctctl.o
-
-EXACCTSYS_OBJS += exacctsys.o
-
-KAIO_OBJS += aio.o
-
-PCMCIA_OBJS += pcmcia.o cs.o cis.o cis_callout.o cis_handlers.o cis_params.o
-
-BUSRA_OBJS += busra.o
-
-PCS_OBJS += pcs.o
-
-PCAN_OBJS += pcan.o
-
-PCATA_OBJS += pcide.o pcdisk.o pclabel.o pcata.o
-
-PCSER_OBJS += pcser.o pcser_cis.o
-
-PCWL_OBJS += pcwl.o
-
-PSET_OBJS +=	pset.o
-
-OHCI_OBJS += ohci.o ohci_hub.o ohci_polled.o
-
-UHCI_OBJS += uhci.o uhciutil.o uhcitgt.o uhcihub.o uhcipolled.o
-
-EHCI_OBJS += ehci.o ehci_hub.o ehci_xfer.o ehci_intr.o ehci_util.o ehci_polled.o ehci_isoch.o ehci_isoch_util.o
-
-HUBD_OBJS += hubd.o
-
-USB_MID_OBJS += usb_mid.o
-
-USB_IA_OBJS += usb_ia.o
-
-UWBA_OBJS += uwba.o uwbai.o
-
-SCSA2USB_OBJS += scsa2usb.o usb_ms_bulkonly.o usb_ms_cbi.o
-
-HWAHC_OBJS += hwahc.o hwahc_util.o
-
-WUSB_DF_OBJS += wusb_df.o
-WUSB_FWMOD_OBJS += wusb_fwmod.o
-
-IPF_OBJS += ip_fil_solaris.o fil.o solaris.o ip_state.o ip_frag.o ip_nat.o \
-	    ip_proxy.o ip_auth.o ip_pool.o ip_htable.o ip_lookup.o \
-	    ip_log.o misc.o ip_compat.o ip_nat6.o drand48.o
-
-IBD_OBJS +=	ibd.o ibd_cm.o
-
-DLPISTUB_OBJS += dlpistub.o
-
-SDP_OBJS +=	sdpddi.o
-
-TRILL_OBJS +=   trill.o
-
-CTF_OBJS += ctf_create.o ctf_decl.o ctf_error.o ctf_hash.o ctf_labels.o \
-	ctf_lookup.o ctf_open.o ctf_types.o ctf_util.o ctf_subr.o ctf_mod.o
-
-SMBIOS_OBJS += smb_error.o smb_info.o smb_open.o smb_subr.o smb_dev.o
-
-RPCIB_OBJS += rpcib.o
-
-KMDB_OBJS += kdrv.o
-
-AFE_OBJS += afe.o
-
-BGE_OBJS += bge_main2.o bge_chip2.o bge_kstats.o bge_log.o bge_ndd.o \
-		bge_atomic.o bge_mii.o bge_send.o bge_recv2.o bge_mii_5906.o
-
-DMFE_OBJS += dmfe_log.o dmfe_main.o dmfe_mii.o
-
-ELXL_OBJS += elxl.o
-
-HME_OBJS += hme.o
-
-IXGB_OBJS += ixgb.o ixgb_atomic.o ixgb_chip.o ixgb_gld.o ixgb_kstats.o \
-		ixgb_log.o ixgb_ndd.o ixgb_rx.o ixgb_tx.o ixgb_xmii.o
-
-NGE_OBJS += nge_main.o nge_atomic.o nge_chip.o nge_ndd.o nge_kstats.o \
-		nge_log.o nge_rx.o nge_tx.o nge_xmii.o
-
-RGE_OBJS += rge_main.o rge_chip.o rge_ndd.o rge_kstats.o rge_log.o rge_rxtx.o
-
-URTW_OBJS += urtw.o
-
-ARN_OBJS += arn_hw.o arn_eeprom.o arn_mac.o arn_calib.o arn_ani.o arn_phy.o arn_regd.o arn_beacon.o \
-		arn_main.o arn_recv.o arn_xmit.o arn_rc.o
-
-ATH_OBJS += ath_aux.o ath_main.o ath_osdep.o ath_rate.o
-
-ATU_OBJS += atu.o
-
-IPW_OBJS += ipw2100_hw.o ipw2100.o
-
-IWI_OBJS += ipw2200_hw.o ipw2200.o
-
-IWH_OBJS += iwh.o
-
-IWK_OBJS += iwk2.o
-
-IWP_OBJS += iwp.o
-
-MWL_OBJS += mwl.o
-
-MWLFW_OBJS += mwlfw_mode.o
-
-WPI_OBJS += wpi.o
-
-RAL_OBJS += rt2560.o ral_rate.o
-
-RUM_OBJS += rum.o
-
-RWD_OBJS += rt2661.o
-
-RWN_OBJS += rt2860.o
-
-UATH_OBJS += uath.o
-
-UATHFW_OBJS += uathfw_mod.o
-
-URAL_OBJS += ural.o
-
-RTW_OBJS += rtw.o smc93cx6.o rtwphy.o rtwphyio.o
-
-ZYD_OBJS += zyd.o zyd_usb.o zyd_hw.o zyd_fw.o
-
-MXFE_OBJS += mxfe.o
-
-MPTSAS_OBJS += mptsas.o mptsas_impl.o mptsas_init.o mptsas_raid.o
-
-SFE_OBJS += sfe.o sfe_util.o
-
-BFE_OBJS += bfe.o
-
-BRIDGE_OBJS += bridge.o
-
-DDA_OBJS += dda.o
-
-DMD_OBJS += dmd.o
-
-IDM_SHARED_OBJS += base64.o
-
-IDM_OBJS +=	$(IDM_SHARED_OBJS) \
-		idm.o idm_impl.o idm_text.o idm_conn_sm.o idm_so.o
-
-VR_OBJS += vr.o
-
-ATGE_OBJS += atge_main.o atge_l1e.o atge_mii.o atge_l1.o
-
-YGE_OBJS = yge.o
-
-#
-#	Build up defines and paths.
-#
-LINT_DEFS	+= -Dunix
-
-#
-#	This duality can be removed when the native and target compilers
-#	are the same (or at least recognize the same command line syntax!)
-#	It is a bug in the current compilation system that the assember
-#	can't process the -Y I, flag.
-#
-NATIVE_INC_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common
-AS_INC_PATH	+= $(INC_PATH) -I$(UTSBASE)/common
-INCLUDE_PATH    += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common
-
-PCIEB_OBJS += pcieb.o
-
-#	Chelsio N110 10G NIC driver module
-#
-CH_OBJS = ch.o glue.o pe.o sge.o
-
-CH_COM_OBJS =	ch_mac.o ch_subr.o cspi.o espi.o ixf1010.o mc3.o mc4.o mc5.o \
-		mv88e1xxx.o mv88x201x.o my3126.o pm3393.o tp.o ulp.o \
-		vsc7321.o vsc7326.o xpak.o
-
-#
-#	PCI strings file
-#
-PCI_STRING_OBJS = pci_strings.o
-
-NET_DACF_OBJS += net_dacf.o
-
-#
-#	Xframe 10G NIC driver module
-#
-XGE_OBJS = xge.o xgell.o
-
-XGE_HAL_OBJS =  xgehal-channel.o xgehal-fifo.o xgehal-ring.o  xgehal-config.o \
-		xgehal-driver.o  xgehal-mm.o xgehal-stats.o  xgehal-device.o \
-		xge-queue.o  xgehal-mgmt.o xgehal-mgmtaux.o
-
-#
-#	e1000g module
-#
-E1000G_OBJS +=	e1000_80003es2lan.o e1000_82540.o e1000_82541.o e1000_82542.o \
-		e1000_82543.o e1000_82571.o e1000_api.o e1000_ich8lan.o \
-		e1000_mac.o e1000_manage.o e1000_nvm.o e1000_osdep.o \
-		e1000_phy.o e1000g_debug.o e1000g_main.o e1000g_alloc.o \
-		e1000g_tx.o e1000g_rx.o e1000g_stat.o
-
-#
-#	Intel 82575 1G NIC driver module
-#
-IGB_OBJS =	igb_82575.o igb_api.o igb_mac.o igb_manage.o \
-		igb_nvm.o igb_osdep.o igb_phy.o igb_buf.o \
-		igb_debug.o igb_gld.o igb_log.o igb_main.o \
-		igb_rx.o igb_stat.o igb_tx.o
-
-#
-#       Intel 10GbE PCIE NIC driver module
-#
-IXGBE_OBJS =    ixgbe_82598.o ixgbe_82599.o ixgbe_api.o		\
-                ixgbe_common.o ixgbe_phy.o			\
-                ixgbe_buf.o ixgbe_debug.o ixgbe_gld.o           \
-                ixgbe_log.o ixgbe_main.o 	                \
-                ixgbe_osdep.o ixgbe_rx.o ixgbe_stat.o           \
-                ixgbe_tx.o
-
-#
-#	NIU 10G/1G driver module
-#
-NXGE_OBJS =	nxge_mac.o nxge_ipp.o nxge_rxdma.o		\
-		nxge_txdma.o nxge_txc.o nxge_main.o		\
-		nxge_hw.o nxge_fzc.o nxge_virtual.o		\
-		nxge_send.o nxge_classify.o nxge_fflp.o		\
-		nxge_fflp_hash.o nxge_ndd.o nxge_kstats.o	\
-		nxge_zcp.o nxge_fm.o nxge_espc.o nxge_hv.o	\
-		nxge_hio.o nxge_hio_guest.o nxge_intr.o
-
-NXGE_NPI_OBJS =	\
-		npi.o npi_mac.o npi_ipp.o			\
-		npi_txdma.o npi_rxdma.o npi_txc.o		\
-		npi_zcp.o npi_espc.o npi_fflp.o			\
-		npi_vir.o
-
-NXGE_HCALL_OBJS =	\
-		nxge_hcall.o
-
-#
-#	kiconv modules
-#
-KICONV_EMEA_OBJS += kiconv_emea.o
-
-#
-#	blk2scsa
-#
-BLK2SCSA_OBJS = blk2scsa.o
-
-KICONV_JA_OBJS += kiconv_ja.o
-
-KICONV_KO_OBJS += kiconv_cck_common.o kiconv_ko.o
-
-KICONV_SC_OBJS += kiconv_cck_common.o kiconv_sc.o
-
-KICONV_TC_OBJS += kiconv_cck_common.o kiconv_tc.o
-
-#
-#	AAC module
-#
-AAC_OBJS = aac.o aac_ioctl.o
-
-#
-#	sdcard modules
-#
-SDA_OBJS =	sda_cmd.o sda_host.o sda_init.o sda_mem.o sda_mod.o \
-		sda_nexus.o sda_slot.o
-SDCARD_OBJS =	sdcard.o
-SDHOST_OBJS =	sdhost.o
-
-#
-#	hxge 10G driver module
-#
-HXGE_OBJS =	hxge_main.o hxge_vmac.o hxge_send.o		\
-		hxge_txdma.o hxge_rxdma.o hxge_virtual.o	\
-		hxge_fm.o hxge_fzc.o hxge_hw.o hxge_kstats.o	\
-		hxge_ndd.o hxge_pfc.o				\
-		hpi.o hpi_vmac.o hpi_rxdma.o hpi_txdma.o	\
-		hpi_vir.o hpi_pfc.o
-
-#
-#	MEGARAID_SAS module
-#
-MEGA_SAS_OBJS = megaraid_sas.o
-
-#
-#	MR_SAS module
-#
-MR_SAS_OBJS = mr_sas.o
-
-#
-#	ISCSI_INITIATOR module
-#
-ISCSI_INITIATOR_OBJS =	chap.o iscsi_io.o iscsi_thread.o	\
-			iscsi_ioctl.o iscsid.o iscsi.o		\
-			iscsi_login.o isns_client.o iscsiAuthClient.o	\
-			iscsi_lun.o iscsiAuthClientGlue.o	\
-			iscsi_net.o nvfile.o iscsi_cmd.o	\
-			iscsi_queue.o persistent.o iscsi_conn.o	\
-			iscsi_sess.o radius_auth.o iscsi_crc.o	\
-			iscsi_stats.o radius_packet.o iscsi_doorclt.o	\
-			iscsi_targetparam.o utils.o kifconf.o
-
-#
-#	ntxn 10Gb/1Gb NIC driver module
-#
-NTXN_OBJS =	unm_nic_init.o unm_gem.o unm_nic_hw.o unm_ndd.o	\
-			unm_nic_main.o unm_nic_isr.o unm_nic_ctx.o niu.o
-
-#
-#	Myricom 10Gb NIC driver module
-#
-MYRI10GE_OBJS =	myri10ge.o myri10ge_lro.o
-
-#	nulldriver module
-#
-NULLDRIVER_OBJS =	nulldriver.o
-
-TPM_OBJS =	tpm.o tpm_hcall.o
Index: src/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c,v
retrieving revision 1.35
diff -u -p -r1.35 dtrace.c
--- src/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c	7 Jan 2017 21:39:53 -0000	1.35
+++ src/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c	11 Jun 2017 11:48:36 -0000
@@ -18,16 +18,15 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c,v 1.10.2.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 313266 2017-02-05 02:47:34Z markj $
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
-/* #pragma ident	"%Z%%M%	%I%	%E% SMI" */
-
 /*
  * DTrace - Dynamic Tracing for Solaris
  *
@@ -67,82 +66,119 @@
  * [Group] Functions", allowing one to find each block by searching forward
  * on capital-f functions.
  */
-#if !defined(sun)
-/* we need internal access to mutexes for state inspection */
+#ifdef __NetBSD__
 #define	__MUTEX_PRIVATE
 #define __RWLOCK_PRIVATE
+#include <sys/proc.h>
 #endif
 
 #include <sys/errno.h>
-#if !defined(sun)
+#ifndef illumos
 #include <sys/time.h>
 #endif
 #include <sys/stat.h>
+#include <sys/modctl.h>
 #include <sys/conf.h>
 #include <sys/systm.h>
-#if defined(sun)
-#include <sys/modctl.h>
+#ifdef illumos
 #include <sys/ddi.h>
 #include <sys/sunddi.h>
 #endif
 #include <sys/cpuvar.h>
 #include <sys/kmem.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/strsubr.h>
 #endif
 #include <sys/sysmacros.h>
 #include <sys/dtrace_impl.h>
 #include <sys/atomic.h>
 #include <sys/cmn_err.h>
+#ifdef illumos
 #include <sys/mutex_impl.h>
 #include <sys/rwlock_impl.h>
+#endif
 #include <sys/ctf_api.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/panic.h>
 #include <sys/priv_impl.h>
 #endif
 #include <sys/policy.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/cred_impl.h>
 #include <sys/procfs_isa.h>
 #endif
 #include <sys/taskq.h>
-#if defined(sun)
+#ifdef illumos
 #include <sys/mkdev.h>
 #include <sys/kdi.h>
 #endif
 #include <sys/zone.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
+#include "strtolctype.h"
 
 /* FreeBSD includes: */
-#if !defined(sun)
-
+#ifdef __FreeBSD__
+#include <sys/callout.h>
 #include <sys/ctype.h>
-#include <sys/syslimits.h>
-//#include <sys/kdb.h>
+#include <sys/eventhandler.h>
+#include <sys/limits.h>
+#include <sys/linker.h>
+#include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/ptrace.h>
+#include <sys/random.h>
+#include <sys/rwlock.h>
+#include <sys/sx.h>
 #include <sys/sysctl.h>
+#endif
+
+#ifdef __NetBSD__
+#include <sys/cred.h>
+#include <sys/callout.h>
+#include <sys/ctype.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
+#include <sys/random.h>
 #include <sys/rwlock.h>
-//#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/mutex_impl.h>
+#include <sys/rwlock_impl.h>
+#include <sys/mkdev.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
-#include <sys/dtrace_bsd.h>
 #include <sys/vmem.h>
 #include <sys/module.h>
 #include <sys/cpu.h>
+#endif
+
+#ifndef illumos
+
+#include <sys/dtrace_bsd.h>
+
+#include "dtrace_xoroshiro128_plus.h"
+
 #include <netinet/in.h>
+
 #include "dtrace_cddl.h"
 #include "dtrace_debug.c"
-#endif
 
-#if !defined(sun)
-/* fake module entry for netbsd */
-module_t *mod_nbsd = NULL;
-#endif
+#ifdef __NetBSD__
+struct dtrace_state_worker *dtrace_state_worker_add(void (*fn)(dtrace_state_t *),
+    dtrace_state_t *state, hrtime_t interval);
+void dtrace_state_worker_remove(struct dtrace_state_worker *w);
+
+modctl_t *mod_nbsd;
+
+#endif /* __NetBSD__ */
+
+#endif /* !illumos */
+
 
 /*
  * DTrace Tunable Variables
@@ -165,17 +201,21 @@ module_t *mod_nbsd = NULL;
  * /etc/system.
  */
 int		dtrace_destructive_disallow = 0;
+#ifndef illumos
+/* Positive logic version of dtrace_destructive_disallow for loader tunable */
+int		dtrace_allow_destructive = 1;
+#endif
 dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
 size_t		dtrace_difo_maxsize = (256 * 1024);
-dtrace_optval_t	dtrace_dof_maxsize = (256 * 1024);
-size_t		dtrace_global_maxsize = (16 * 1024);
+dtrace_optval_t	dtrace_dof_maxsize = (8 * 1024 * 1024);
+size_t		dtrace_statvar_maxsize = (16 * 1024);
 size_t		dtrace_actions_max = (16 * 1024);
 size_t		dtrace_retain_max = 1024;
-dtrace_optval_t	dtrace_helper_actions_max = 32;
+dtrace_optval_t	dtrace_helper_actions_max = 128;
 dtrace_optval_t	dtrace_helper_providers_max = 32;
 dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
 size_t		dtrace_strsize_default = 256;
-dtrace_optval_t	dtrace_cleanrate_default = 99009900;		/* 101 hz */
+dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
 dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
 dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
 dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
@@ -189,13 +229,17 @@ dtrace_optval_t dtrace_ustackframes_defa
 dtrace_optval_t dtrace_jstackframes_default = 50;
 dtrace_optval_t dtrace_jstackstrsize_default = 512;
 int		dtrace_msgdsize_max = 128;
-hrtime_t	dtrace_chill_max = 500 * (NANOSEC / MILLISEC);	/* 500 ms */
+hrtime_t	dtrace_chill_max = MSEC2NSEC(500);		/* 500 ms */
 hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
 int		dtrace_devdepth_max = 32;
 int		dtrace_err_verbose;
 hrtime_t	dtrace_deadman_interval = NANOSEC;
 hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
 hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
+hrtime_t	dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
+#ifndef illumos
+int		dtrace_memstr_max = 4096;
+#endif
 
 /*
  * DTrace External Variables
@@ -211,22 +255,28 @@ const char	dtrace_zero[256] = { 0 };	/* 
 /*
  * DTrace Internal Variables
  */
-#if defined(sun)
+#ifdef illumos
 static dev_info_t	*dtrace_devi;		/* device info */
 #endif
+#ifdef illumos
 static vmem_t		*dtrace_arena;		/* probe ID arena */
-#if defined(sun)
 static vmem_t		*dtrace_minor;		/* minor number arena */
+#else
 static taskq_t		*dtrace_taskq;		/* task queue */
+#ifdef __NetBSD__
+static vmem_t		*dtrace_arena;		/* probe ID arena */
+#else
+static struct unrhdr	*dtrace_arena;		/* Probe ID number.     */
+#endif
 #endif
 static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
-int			dtrace_probes_size=0;	/* size for kmem_free */
 static int		dtrace_nprobes;		/* number of probes */
 static dtrace_provider_t *dtrace_provider;	/* provider list */
 static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
 static int		dtrace_opens;		/* number of opens */
 static int		dtrace_helpers;		/* number of helpers */
-#if defined(sun)
+static int		dtrace_getf;		/* number of unpriv getf()s */
+#ifdef illumos
 static void		*dtrace_softstate;	/* softstate pointer */
 #endif
 static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
@@ -243,12 +293,14 @@ static dtrace_ecb_t	*dtrace_ecb_create_c
 static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
 static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
 static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
+static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
 static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
-#if !defined(sun)
-int		dtrace_in_probe;	/* non-zero if executing a probe */
-#if defined(__i386__) || defined(__amd64__)
-uintptr_t	dtrace_in_probe_addr;	/* Address of invop when already in probe */
-#endif
+static int		dtrace_dynvar_failclean; /* dynvars failed to clean */
+#ifdef __FreeBSD__
+static struct mtx	dtrace_unr_mtx;
+MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
+static eventhandler_tag	dtrace_kld_load_tag;
+static eventhandler_tag	dtrace_kld_unload_try_tag;
 #endif
 
 /*
@@ -285,19 +337,23 @@ static kmutex_t		dtrace_lock;		/* probe 
 static kmutex_t		dtrace_provider_lock;	/* provider state lock */
 static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */
 
-#if !defined(sun)
+#ifndef illumos
 /* XXX FreeBSD hacks. */
+#ifdef __FreeBSD__
 static kmutex_t		mod_lock;
+#endif
 
 #define cr_suid		cr_svuid
 #define cr_sgid		cr_svgid
 #define	ipaddr_t	in_addr_t
 #define mod_modname	pathname
 #define vuprintf	vprintf
+#ifdef __NetBSD__
 #define ttoproc(_a)	((_a)->l_proc)
+#else
+#define ttoproc(_a)	((_a)->td_proc)
+#endif
 #define crgetzoneid(_a)	0
-//#define	NCPU		MAXCPUS
-#define	NCPU		ncpu
 #define SNOCD		0
 #define CPU_ON_INTR(_a)	0
 
@@ -309,12 +365,18 @@ static kmutex_t		mod_lock;
 #define PRIV_PROC_ZONE		(1 << 5)
 #define PRIV_ALL		~0
 
-//SYSCTL_NODE(_debug, OID_AUTO, dtrace, CTLFLAG_RD, 0, "DTrace Information");
+SYSCTL_NODE(_debug, OID_AUTO, dtrace, CTLFLAG_RD, 0, "DTrace Information");
+SYSCTL_DECL(_debug_dtrace);
+SYSCTL_DECL(_kern_dtrace);
 #endif
 
-#if defined(sun)
+#ifdef illumos
 #define curcpu_id	CPU->cpu_id
-#else
+#endif
+#ifdef __FreeBSD__
+#define curcpu_id	curcpu
+#endif
+#ifdef __NetBSD__
 #define curcpu_id	cpu_number()
 #endif
 
@@ -337,20 +399,10 @@ static void
 dtrace_nullop(void)
 {}
 
-static int
-dtrace_enable_nullop(void)
-{
-	return (0);
-}
-
 static dtrace_pops_t	dtrace_provider_ops = {
-	(void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
-#if defined(sun)
-	(void (*)(void *, modctl_t *))dtrace_nullop,
-#else
-	(void (*)(void *, dtrace_modctl_t *))dtrace_nullop,
-#endif
-	(int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
+	(void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,
+	(void (*)(void *, modctl_t *))dtrace_nullop,
+	(int (*)(void *, dtrace_id_t, void *))dtrace_nullop,
 	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
 	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
 	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
@@ -366,17 +418,22 @@ dtrace_id_t		dtrace_probeid_error;	/* sp
 
 /*
  * DTrace Helper Tracing Variables
- */
-uint32_t dtrace_helptrace_next = 0;
-uint32_t dtrace_helptrace_nlocals;
-char	*dtrace_helptrace_buffer;
-int	dtrace_helptrace_bufsize = 512 * 1024;
-
-#ifdef DEBUG
-int	dtrace_helptrace_enabled = 1;
-#else
-int	dtrace_helptrace_enabled = 0;
-#endif
+ *
+ * These variables should be set dynamically to enable helper tracing.  The
+ * only variables that should be set are dtrace_helptrace_enable (which should
+ * be set to a non-zero value to allocate helper tracing buffers on the next
+ * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
+ * non-zero value to deallocate helper tracing buffers on the next close of
+ * /dev/dtrace).  When (and only when) helper tracing is disabled, the
+ * buffer size may also be set via dtrace_helptrace_bufsize.
+ */
+int			dtrace_helptrace_enable = 0;
+int			dtrace_helptrace_disable = 0;
+int			dtrace_helptrace_bufsize = 16 * 1024 * 1024;
+uint32_t		dtrace_helptrace_nlocals;
+static dtrace_helptrace_t *dtrace_helptrace_buffer;
+static uint32_t		dtrace_helptrace_next = 0;
+static int		dtrace_helptrace_wrapped = 0;
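+
+/*
+ * For example (illustrative only -- the mechanism used to poke these
+ * variables, e.g. a kernel debugger, is outside the scope of this file):
+ *
+ *	dtrace_helptrace_enable = 1;	(buffers allocated on next open)
+ *	... run with helpers enabled; dtrace_helptrace_buffer then holds
+ *	dtrace_helptrace_t records, with dtrace_helptrace_next and
+ *	dtrace_helptrace_wrapped locating the oldest entry ...
+ *	dtrace_helptrace_disable = 1;	(buffers freed on next close)
+ */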
 
 /*
  * DTrace Error Hashing
@@ -434,7 +491,7 @@ static kmutex_t dtrace_errlock;
  * no way for a global variable key signature to match a thread-local key
  * signature.
  */
-#if defined(sun)
+#ifdef illumos
 #define	DTRACE_TLS_THRKEY(where) { \
 	uint_t intr = 0; \
 	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
@@ -444,27 +501,27 @@ static kmutex_t dtrace_errlock;
 	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
 	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
 }
-#else
-#define	DTRACE_TLS_THRKEY(where) { \
-	uint_t intr = 0; \
-	(where) = ((curthread->l_lid + (curthread->l_proc->p_pid << 16) + \
-		    DIF_VARIABLE_MAX) & \
-		    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
-}
-#if 0
+#endif
+#ifdef __FreeBSD__
 #define	DTRACE_TLS_THRKEY(where) { \
-	solaris_cpu_t *_c = &solaris_cpu[curcpu_id]; \
+	solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
 	uint_t intr = 0; \
 	uint_t actv = _c->cpu_intr_actv; \
 	for (; actv; actv >>= 1) \
 		intr++; \
 	ASSERT(intr < (1 << 3)); \
+	(where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
+	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
+}
+#endif
+#ifdef __NetBSD__
+#define	DTRACE_TLS_THRKEY(where) { \
+	uint_t intr = 0; \
 	(where) = ((curthread->l_lid + (curthread->l_proc->p_pid << 16) + \
 		    DIF_VARIABLE_MAX) & \
 		    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
 }
 #endif
-#endif
 
 #define	DT_BSWAP_8(x)	((x) & 0xff)
 #define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
@@ -476,7 +533,7 @@ static kmutex_t dtrace_errlock;
 #define	DTRACE_STORE(type, tomax, offset, what) \
 	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
 
-#ifndef __i386
+#ifndef __x86
 #define	DTRACE_ALIGNCHECK(addr, size, flags)				\
 	if (addr & (size - 1)) {					\
 		*flags |= CPU_DTRACE_BADALIGN;				\
@@ -494,10 +551,18 @@ static kmutex_t dtrace_errlock;
  * disallow all negative sizes.  Ranges of size 0 are allowed.
  */
 #define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
-	((testaddr) - (baseaddr) < (basesz) && \
-	(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
+	((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
+	(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
 	(testaddr) + (testsz) >= (testaddr))
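+
+/*
+ * A worked example of the unsigned arithmetic above (illustrative values):
+ * with baseaddr = 0x1000 and basesz = 0x100, a testaddr of 0xff0 makes
+ * (testaddr - baseaddr) wrap to a huge unsigned value that is >= basesz,
+ * so ranges starting below the base fail the first clause without any
+ * signed comparison; the final clause rejects testsz values large enough
+ * to wrap (testaddr + testsz) around the address space.
+ */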
 
+#define	DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz)		\
+do {									\
+	if ((remp) != NULL) {						\
+		*(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr);	\
+	}								\
+_NOTE(CONSTCOND) } while (0)
+
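+/*
+ * Continuing that example: once a range check has succeeded, the macro
+ * above reports how much of the containing region remains.  With
+ * baseaddr = 0x1000, basesz = 0x100 and addr = 0x1080, *remp becomes
+ * 0x80 -- the number of valid bytes from addr to the end of the region.
+ */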
+
 /*
  * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
  * alloc_sz on the righthand side of the comparison in order to avoid overflow
@@ -584,21 +649,19 @@ static dtrace_probe_t *dtrace_probe_look
 static void dtrace_enabling_provide(dtrace_provider_t *);
 static int dtrace_enabling_match(dtrace_enabling_t *, int *);
 static void dtrace_enabling_matchall(void);
+static void dtrace_enabling_reap(void);
 static dtrace_state_t *dtrace_anon_grab(void);
 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
     dtrace_state_t *, uint64_t, uint64_t);
-#if defined(sun)
 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
-#endif
 static void dtrace_buffer_drop(dtrace_buffer_t *);
+static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
     dtrace_state_t *, dtrace_mstate_t *);
 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
     dtrace_optval_t);
 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
-#if defined(sun)
 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
-#endif
 uint16_t dtrace_load16(uintptr_t);
 uint32_t dtrace_load32(uintptr_t);
 uint64_t dtrace_load64(uintptr_t);
@@ -607,6 +670,12 @@ void dtrace_dynvar_clean(dtrace_dstate_t
 dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
     size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
 uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
+static int dtrace_priv_proc(dtrace_state_t *);
+static void dtrace_getf_barrier(void);
+static int dtrace_canload_remains(uint64_t, size_t, size_t *,
+    dtrace_mstate_t *, dtrace_vstate_t *);
+static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
+    dtrace_mstate_t *, dtrace_vstate_t *);
 
 /*
  * DTrace Probe Context Functions
@@ -631,7 +700,7 @@ dtrace_panic(const char *format, ...)
 	va_list alist;
 
 	va_start(alist, format);
-#ifdef __NetBSD__
+#ifndef illumos
 	vpanic(format, alist);
 #else
 	dtrace_vpanic(format, alist);
@@ -694,10 +763,12 @@ dtrace_error(uint32_t *counter)
  * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
  * uint8_t, a uint16_t, a uint32_t and a uint64_t.
  */
+/* BEGIN CSTYLED */
 DTRACE_LOADFUNC(8)
 DTRACE_LOADFUNC(16)
 DTRACE_LOADFUNC(32)
 DTRACE_LOADFUNC(64)
+/* END CSTYLED */
 
 static int
 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
@@ -715,19 +786,43 @@ dtrace_inscratch(uintptr_t dest, size_t 
 }
 
 static int
-dtrace_canstore_statvar(uint64_t addr, size_t sz,
+dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
     dtrace_statvar_t **svars, int nsvars)
 {
 	int i;
+	size_t maxglobalsize, maxlocalsize;
+
+	if (nsvars == 0)
+		return (0);
+
+	maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
+	maxlocalsize = maxglobalsize * NCPU;
 
 	for (i = 0; i < nsvars; i++) {
 		dtrace_statvar_t *svar = svars[i];
+		uint8_t scope;
+		size_t size;
 
-		if (svar == NULL || svar->dtsv_size == 0)
+		if (svar == NULL || (size = svar->dtsv_size) == 0)
 			continue;
 
-		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
+		scope = svar->dtsv_var.dtdv_scope;
+
+		/*
+		 * We verify that our size is valid in the spirit of providing
+		 * defense in depth:  we want to prevent attackers from using
+		 * DTrace to escalate an orthogonal kernel heap corruption bug
+		 * into the ability to store to arbitrary locations in memory.
+		 */
+		VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
+		    (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
+
+		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data,
+		    svar->dtsv_size)) {
+			DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
+			    svar->dtsv_size);
 			return (1);
+		}
 	}
 
 	return (0);
@@ -743,12 +838,26 @@ static int
 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
     dtrace_vstate_t *vstate)
 {
+	return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
+}
+
+/*
+ * Implementation of dtrace_canstore which communicates the upper bound of the
+ * allowed memory region.
+ */
+static int
+dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
+    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
+{
 	/*
 	 * First, check to see if the address is in scratch space...
 	 */
 	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
-	    mstate->dtms_scratch_size))
+	    mstate->dtms_scratch_size)) {
+		DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
+		    mstate->dtms_scratch_size);
 		return (1);
+	}
 
 	/*
 	 * Now check to see if it's a dynamic variable.  This check will pick
@@ -761,6 +870,7 @@ dtrace_canstore(uint64_t addr, size_t sz
 		uintptr_t base = (uintptr_t)dstate->dtds_base +
 		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
 		uintptr_t chunkoffs;
+		dtrace_dynvar_t *dvar;
 
 		/*
 		 * Before we assume that we can store here, we need to make
@@ -777,6 +887,8 @@ dtrace_canstore(uint64_t addr, size_t sz
 		 *
 		 *	(3) Not span a chunk boundary
 		 *
+		 *	(4) Not be in the tuple space of a dynamic variable
+		 *
 		 */
 		if (addr < base)
 			return (0);
@@ -789,6 +901,16 @@ dtrace_canstore(uint64_t addr, size_t sz
 		if (chunkoffs + sz > dstate->dtds_chunksize)
 			return (0);
 
+		dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
+
+		if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
+			return (0);
+
+		if (chunkoffs < sizeof (dtrace_dynvar_t) +
+		    ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
+			return (0);
+
+		DTRACE_RANGE_REMAIN(remain, addr, dvar, dstate->dtds_chunksize);
 		return (1);
 	}
 
@@ -796,11 +918,11 @@ dtrace_canstore(uint64_t addr, size_t sz
 	 * Finally, check the static local and global variables.  These checks
 	 * take the longest, so we perform them last.
 	 */
-	if (dtrace_canstore_statvar(addr, sz,
+	if (dtrace_canstore_statvar(addr, sz, remain,
 	    vstate->dtvs_locals, vstate->dtvs_nlocals))
 		return (1);
 
-	if (dtrace_canstore_statvar(addr, sz,
+	if (dtrace_canstore_statvar(addr, sz, remain,
 	    vstate->dtvs_globals, vstate->dtvs_nglobals))
 		return (1);
 
@@ -821,27 +943,167 @@ static int
 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
     dtrace_vstate_t *vstate)
 {
+	return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
+}
+
+/*
+ * Implementation of dtrace_canload which communicates the upper bound of the
+ * allowed memory region.
+ */
+static int
+dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
+    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
+{
 	volatile uintptr_t *illval = &cpu_core[curcpu_id].cpuc_dtrace_illval;
+	file_t *fp;
 
 	/*
 	 * If we hold the privilege to read from kernel memory, then
 	 * everything is readable.
 	 */
-	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
+	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
+		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
 		return (1);
+	}
 
 	/*
 	 * You can obviously read that which you can store.
 	 */
-	if (dtrace_canstore(addr, sz, mstate, vstate))
+	if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
 		return (1);
 
 	/*
 	 * We're allowed to read from our own string table.
 	 */
-	if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
-	    mstate->dtms_difo->dtdo_strlen))
+	if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
+	    mstate->dtms_difo->dtdo_strlen)) {
+		DTRACE_RANGE_REMAIN(remain, addr,
+		    mstate->dtms_difo->dtdo_strtab,
+		    mstate->dtms_difo->dtdo_strlen);
 		return (1);
+	}
+
+	if (vstate->dtvs_state != NULL &&
+	    dtrace_priv_proc(vstate->dtvs_state)) {
+		proc_t *p;
+
+		/*
+		 * When we have privileges to the current process, there are
+		 * several context-related kernel structures that are safe to
+		 * read, even absent the privilege to read from kernel memory.
+		 * These reads are safe because these structures contain only
+		 * state that (1) we're permitted to read, (2) is harmless or
+		 * (3) contains pointers to additional kernel state that we're
+		 * not permitted to read (and as such, do not present an
+		 * opportunity for privilege escalation).  Finally (and
+		 * critically), because of the nature of their relation with
+		 * the current thread context, the memory associated with these
+		 * structures cannot change over the duration of probe context,
+		 * and it is therefore impossible for this memory to be
+		 * deallocated and reallocated as something else while it's
+		 * being operated upon.
+		 */
+		if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) {
+			DTRACE_RANGE_REMAIN(remain, addr, curthread,
+			    sizeof (kthread_t));
+			return (1);
+		}
+
+		if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
+		    sz, curthread->t_procp, sizeof (proc_t))) {
+			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_procp,
+			    sizeof (proc_t));
+			return (1);
+		}
+
+#ifndef __NetBSD__
+		if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
+		    curthread->t_cred, sizeof (cred_t))) {
+			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cred,
+			    sizeof (cred_t));
+			return (1);
+		}
+#endif
+
+#ifdef illumos
+		if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
+		    &(p->p_pidp->pid_id), sizeof (pid_t))) {
+			DTRACE_RANGE_REMAIN(remain, addr, &(p->p_pidp->pid_id),
+			    sizeof (pid_t));
+			return (1);
+		}
+
+		if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
+		    curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
+			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cpu,
+			    offsetof(cpu_t, cpu_pause_thread));
+			return (1);
+		}
+#endif
+	}
+
+	if ((fp = mstate->dtms_getf) != NULL) {
+		uintptr_t psz = sizeof (void *);
+		vnode_t *vp;
+		vnodeops_t *op;
+
+		/*
+		 * When getf() returns a file_t, the enabling is implicitly
+		 * granted the (transient) right to read the returned file_t
+		 * as well as the v_path and v_op->vnop_name of the underlying
+		 * vnode.  These accesses are allowed after a successful
+		 * getf() because the members that they refer to cannot change
+		 * once set -- and the barrier logic in the kernel's closef()
+		 * path assures that the file_t and its referenced vnode_t
+		 * cannot themselves be stale (that is, it is impossible for
+		 * either dtms_getf itself or its f_vnode member to reference
+		 * freed memory).
+		 */
+		if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) {
+			DTRACE_RANGE_REMAIN(remain, addr, fp, sizeof (file_t));
+			return (1);
+		}
+
+		if ((vp = fp->f_vnode) != NULL) {
+			size_t slen;
+#ifdef illumos
+			if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) {
+				DTRACE_RANGE_REMAIN(remain, addr, &vp->v_path,
+				    psz);
+				return (1);
+			}
+			slen = strlen(vp->v_path) + 1;
+			if (DTRACE_INRANGE(addr, sz, vp->v_path, slen)) {
+				DTRACE_RANGE_REMAIN(remain, addr, vp->v_path,
+				    slen);
+				return (1);
+			}
+#endif
+
+			if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) {
+				DTRACE_RANGE_REMAIN(remain, addr, &vp->v_op,
+				    psz);
+				return (1);
+			}
+
+#ifdef illumos
+			if ((op = vp->v_op) != NULL &&
+			    DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
+				DTRACE_RANGE_REMAIN(remain, addr,
+				    &op->vnop_name, psz);
+				return (1);
+			}
+
+			if (op != NULL && op->vnop_name != NULL &&
+			    DTRACE_INRANGE(addr, sz, op->vnop_name,
+			    (slen = strlen(op->vnop_name) + 1))) {
+				DTRACE_RANGE_REMAIN(remain, addr,
+				    op->vnop_name, slen);
+				return (1);
+			}
+#endif
+		}
+	}
 
 	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
 	*illval = addr;
@@ -855,21 +1117,42 @@ dtrace_canload(uint64_t addr, size_t sz,
  * calls in the event that the user has all privileges.
  */
 static int
-dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
-    dtrace_vstate_t *vstate)
+dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
+    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
 {
-	size_t strsz;
+	size_t rsize;
 
 	/*
 	 * If we hold the privilege to read from kernel memory, then
 	 * everything is readable.
 	 */
-	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
-		return (1);
 
-	strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
-	if (dtrace_canload(addr, strsz, mstate, vstate))
+	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
+		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
 		return (1);
+	}
+
+	/*
+	 * Even if the caller is uninterested in querying the remaining valid
+	 * range, it is required to ensure that the access is allowed.
+	 */
+	if (remain == NULL) {
+		remain = &rsize;
+	}
+	if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
+		size_t strsz;
+		/*
+		 * Perform the strlen after determining the length of the
+		 * memory region which is accessible.  This prevents timing
+		 * information from being used to find NULs in memory which is
+		 * not accessible to the caller.
+		 */
+		strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
+		    MIN(sz, *remain));
+		if (strsz <= *remain) {
+			return (1);
+		}
+	}
 
 	return (0);
 }
@@ -879,26 +1162,101 @@ dtrace_strcanload(uint64_t addr, size_t 
  * region in which a load may be issued given the user's privilege level.
  */
 static int
-dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
-    dtrace_vstate_t *vstate)
+dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
+    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
 {
 	size_t sz;
 	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
 
 	/*
+	 * Calculate the max size before performing any checks since even
+	 * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
+	 * return the max length via 'remain'.
+	 */
+	if (type->dtdt_kind == DIF_TYPE_STRING) {
+		dtrace_state_t *state = vstate->dtvs_state;
+
+		if (state != NULL) {
+			sz = state->dts_options[DTRACEOPT_STRSIZE];
+		} else {
+			/*
+			 * In helper context, we have a NULL state; fall back
+			 * to using the system-wide default for the string size
+			 * in this case.
+			 */
+			sz = dtrace_strsize_default;
+		}
+	} else {
+		sz = type->dtdt_size;
+	}
+
+	/*
 	 * If we hold the privilege to read from kernel memory, then
 	 * everything is readable.
 	 */
-	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
+	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
+		DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
 		return (1);
+	}
 
-	if (type->dtdt_kind == DIF_TYPE_STRING)
-		sz = dtrace_strlen(src,
-		    vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
-	else
-		sz = type->dtdt_size;
+	if (type->dtdt_kind == DIF_TYPE_STRING) {
+		return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
+		    vstate));
+	}
+	return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
+	    vstate));
+}
+
+/*
+ * Convert a string to a signed integer using safe loads.
+ *
+ * NOTE: This function uses various macros from strtolctype.h to manipulate
+ * digit values, etc -- these have all been checked to ensure they make
+ * no additional function calls.
+ */
+static int64_t
+dtrace_strtoll(char *input, int base, size_t limit)
+{
+	uintptr_t pos = (uintptr_t)input;
+	int64_t val = 0;
+	int x;
+	boolean_t neg = B_FALSE;
+	char c, cc, ccc;
+	uintptr_t end = pos + limit;
+
+	/*
+	 * Consume any whitespace preceding digits.
+	 */
+	while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
+		pos++;
+
+	/*
+	 * Handle an explicit sign if one is present.
+	 */
+	if (c == '-' || c == '+') {
+		if (c == '-')
+			neg = B_TRUE;
+		c = dtrace_load8(++pos);
+	}
 
-	return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
+	/*
+	 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
+	 * if present.
+	 */
+	if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
+	    cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
+		pos += 2;
+		c = ccc;
+	}
+
+	/*
+	 * Read in contiguous digits until the first non-digit character.
+	 */
+	for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
+	    c = dtrace_load8(++pos))
+		val = val * base + x;
+
+	return (neg ? -val : val);
 }
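+
+/*
+ * For instance, with base 16 and a sufficient limit, the input "  -0x1a"
+ * is consumed as leading whitespace, the '-' sign, the "0x" prefix, and
+ * then the digits '1' and 'a' (DIGIT('a') == 10), yielding -26.  The scan
+ * stops at the first non-digit, at a NUL, or once 'limit' bytes have been
+ * examined; all loads go through the fault-safe dtrace_load8().
+ */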
 
 /*
@@ -1036,14 +1394,14 @@ dtrace_strcpy(const void *src, void *dst
  * specified type; we assume that we can store to directly.
  */
 static void
-dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
+dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
 {
 	ASSERT(type->dtdt_flags & DIF_TF_BYREF);
 
 	if (type->dtdt_kind == DIF_TYPE_STRING) {
-		dtrace_strcpy(src, dst, type->dtdt_size);
+		dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
 	} else {
-		dtrace_bcopy(src, dst, type->dtdt_size);
+		dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
 	}
 }
 
@@ -1193,16 +1551,7 @@ dtrace_priv_proc_common_user(dtrace_stat
 	 */
 	ASSERT(s_cr != NULL);
 
-#if defined(sun)
-	if ((cr = CRED()) != NULL &&
-	    s_cr->cr_uid == cr->cr_uid &&
-	    s_cr->cr_uid == cr->cr_ruid &&
-	    s_cr->cr_uid == cr->cr_suid &&
-	    s_cr->cr_gid == cr->cr_gid &&
-	    s_cr->cr_gid == cr->cr_rgid &&
-	    s_cr->cr_gid == cr->cr_sgid)
-		return (1);
-#else
+#ifdef __NetBSD__
 	if ((cr = CRED()) != NULL) {
 	    uid_t uid;
 	    gid_t gid;
@@ -1215,10 +1564,18 @@ dtrace_priv_proc_common_user(dtrace_stat
 		    uid == kauth_cred_getsvuid(cr) &&
 		    gid == kauth_cred_getgid(cr) &&
 		    gid == kauth_cred_getegid(cr) &&
-		    gid == kauth_cred_getsvgid(cr)) {
+		    gid == kauth_cred_getsvgid(cr))
 			return 1;
-		}
 	}
+#else
+	if ((cr = CRED()) != NULL &&
+	    s_cr->cr_uid == cr->cr_uid &&
+	    s_cr->cr_uid == cr->cr_ruid &&
+	    s_cr->cr_uid == cr->cr_suid &&
+	    s_cr->cr_gid == cr->cr_gid &&
+	    s_cr->cr_gid == cr->cr_rgid &&
+	    s_cr->cr_gid == cr->cr_sgid)
+		return (1);
 #endif
 
 	return (0);
@@ -1232,7 +1589,7 @@ dtrace_priv_proc_common_user(dtrace_stat
 static int
 dtrace_priv_proc_common_zone(dtrace_state_t *state)
 {
-#if defined(sun)
+#ifdef illumos
 	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
 
 	/*
@@ -1241,7 +1598,7 @@ dtrace_priv_proc_common_zone(dtrace_stat
 	 */
 	ASSERT(s_cr != NULL);
 
-	if ((cr = CRED()) != NULL &&
-	    s_cr->cr_zone == cr->cr_zone)
+	if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
 		return (1);
 
@@ -1342,38 +1699,214 @@ dtrace_priv_kernel_destructive(dtrace_st
 }
 
 /*
- * Note:  not called from probe context.  This function is called
- * asynchronously (and at a regular interval) from outside of probe context to
- * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
- * cleaning is explained in detail in <sys/dtrace_impl.h>.
+ * Determine if the dte_cond of the specified ECB allows for processing of
+ * the current probe to continue.  Note that this routine may allow continued
+ * processing, but with access(es) stripped from the mstate's dtms_access
+ * field.
  */
-void
-dtrace_dynvar_clean(dtrace_dstate_t *dstate)
+static int
+dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
+    dtrace_ecb_t *ecb)
 {
-	dtrace_dynvar_t *dirty;
-	dtrace_dstate_percpu_t *dcpu;
-	int i, work = 0;
+	dtrace_probe_t *probe = ecb->dte_probe;
+	dtrace_provider_t *prov = probe->dtpr_provider;
+	dtrace_pops_t *pops = &prov->dtpv_pops;
+	int mode = DTRACE_MODE_NOPRIV_DROP;
 
-	for (i = 0; i < NCPU; i++) {
-		dcpu = &dstate->dtds_percpu[i];
+	ASSERT(ecb->dte_cond);
 
-		ASSERT(dcpu->dtdsc_rinsing == NULL);
+#ifdef illumos
+	if (pops->dtps_mode != NULL) {
+		mode = pops->dtps_mode(prov->dtpv_arg,
+		    probe->dtpr_id, probe->dtpr_arg);
 
-		/*
-		 * If the dirty list is NULL, there is no dirty work to do.
-		 */
-		if (dcpu->dtdsc_dirty == NULL)
-			continue;
+		ASSERT((mode & DTRACE_MODE_USER) ||
+		    (mode & DTRACE_MODE_KERNEL));
+		ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
+		    (mode & DTRACE_MODE_NOPRIV_DROP));
+	}
+
+	/*
+	 * If the dte_cond bits indicate that this consumer is only allowed to
+	 * see user-mode firings of this probe, call the provider's dtps_mode()
+	 * entry point to check that the probe was fired while in a user
+	 * context.  If that's not the case, use the policy specified by the
+	 * provider to determine if we drop the probe or merely restrict
+	 * operation.
+	 */
+	if (ecb->dte_cond & DTRACE_COND_USERMODE) {
+		ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
+
+		if (!(mode & DTRACE_MODE_USER)) {
+			if (mode & DTRACE_MODE_NOPRIV_DROP)
+				return (0);
+
+			mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
+		}
+	}
+#endif
+
+	/*
+	 * This is more subtle than it looks. We have to be absolutely certain
+	 * that CRED() isn't going to change out from under us so it's only
+	 * legit to examine that structure if we're in constrained situations.
+	 * Currently, the only time we'll perform this check is if a non-super-user
+	 * has enabled the profile or syscall providers -- providers that
+	 * allow visibility of all processes. For the profile case, the check
+	 * above will ensure that we're examining a user context.
+	 */
+	if (ecb->dte_cond & DTRACE_COND_OWNER) {
+		cred_t *cr;
+		cred_t *s_cr = state->dts_cred.dcr_cred;
+		proc_t *proc;
+
+		ASSERT(s_cr != NULL);
+
+#ifdef __NetBSD__
+		uid_t uid = kauth_cred_getuid(s_cr);
+		gid_t gid = kauth_cred_getgid(s_cr);
+
+		if ((cr = CRED()) == NULL ||
+		    uid != kauth_cred_geteuid(cr) ||
+		    uid != kauth_cred_getuid(cr) ||
+		    uid != kauth_cred_getsvuid(cr) ||
+		    gid != kauth_cred_getegid(cr) ||
+		    gid != kauth_cred_getgid(cr) ||
+		    gid != kauth_cred_getsvgid(cr) ||
+		    (proc = ttoproc(curthread)) == NULL ||
+		    (proc->p_flag & SNOCD)) {
+			if (mode & DTRACE_MODE_NOPRIV_DROP)
+				return (0);
+		}
+#else /* __NetBSD__ */
+		if ((cr = CRED()) == NULL ||
+		    s_cr->cr_uid != cr->cr_uid ||
+		    s_cr->cr_uid != cr->cr_ruid ||
+		    s_cr->cr_uid != cr->cr_suid ||
+		    s_cr->cr_gid != cr->cr_gid ||
+		    s_cr->cr_gid != cr->cr_rgid ||
+		    s_cr->cr_gid != cr->cr_sgid ||
+		    (proc = ttoproc(curthread)) == NULL ||
+		    (proc->p_flag & SNOCD)) {
+			if (mode & DTRACE_MODE_NOPRIV_DROP)
+				return (0);
+
+#ifdef illumos
+			mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
+#endif
+		}
+#endif /* __NetBSD__ */
+	}
+
+#ifdef illumos
+	/*
+	 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
+	 * in our zone, check to see if our mode policy is to restrict rather
+	 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
+	 * and DTRACE_ACCESS_ARGS
+	 */
+	if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
+		cred_t *cr;
+		cred_t *s_cr = state->dts_cred.dcr_cred;
+
+		ASSERT(s_cr != NULL);
+
+		if ((cr = CRED()) == NULL ||
+		    s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
+			if (mode & DTRACE_MODE_NOPRIV_DROP)
+				return (0);
+
+			mstate->dtms_access &=
+			    ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
+		}
+	}
+#endif
+
+	return (1);
+}
+
+/*
+ * Note:  not called from probe context.  This function is called
+ * asynchronously (and at a regular interval) from outside of probe context to
+ * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
+ * cleaning is explained in detail in <sys/dtrace_impl.h>.
+ */
+void
+dtrace_dynvar_clean(dtrace_dstate_t *dstate)
+{
+	dtrace_dynvar_t *dirty;
+	dtrace_dstate_percpu_t *dcpu;
+	dtrace_dynvar_t **rinsep;
+	int i, j, work = 0;
+
+	for (i = 0; i < NCPU; i++) {
+		dcpu = &dstate->dtds_percpu[i];
+
+		rinsep = &dcpu->dtdsc_rinsing;
 
 		/*
-		 * If the clean list is non-NULL, then we're not going to do
-		 * any work for this CPU -- it means that there has not been
-		 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
-		 * since the last time we cleaned house.
+		 * If the dirty list is NULL, there is no dirty work to do.
 		 */
-		if (dcpu->dtdsc_clean != NULL)
+		if (dcpu->dtdsc_dirty == NULL)
 			continue;
 
+		if (dcpu->dtdsc_rinsing != NULL) {
+			/*
+			 * If the rinsing list is non-NULL, then it is because
+			 * this CPU was selected to accept another CPU's
+			 * dirty list -- and since that time, dirty buffers
+			 * have accumulated.  This is a highly unlikely
+			 * condition, but we choose to ignore the dirty
+			 * buffers -- they'll be picked up in a future cleanse.
+			 */
+			continue;
+		}
+
+		if (dcpu->dtdsc_clean != NULL) {
+			/*
+			 * If the clean list is non-NULL, then we're in a
+			 * situation where a CPU has done deallocations (we
+			 * have a non-NULL dirty list) but no allocations (we
+			 * also have a non-NULL clean list).  We can't simply
+			 * move the dirty list into the clean list on this
+			 * CPU, yet we also don't want to allow this condition
+			 * to persist, lest a short clean list prevent a
+			 * massive dirty list from being cleaned (which in
+			 * turn could lead to otherwise avoidable dynamic
+			 * drops).  To deal with this, we look for some CPU
+			 * with a NULL clean list, NULL dirty list, and NULL
+			 * rinsing list -- and then we borrow this CPU to
+			 * rinse our dirty list.
+			 */
+			for (j = 0; j < NCPU; j++) {
+				dtrace_dstate_percpu_t *rinser;
+
+				rinser = &dstate->dtds_percpu[j];
+
+				if (rinser->dtdsc_rinsing != NULL)
+					continue;
+
+				if (rinser->dtdsc_dirty != NULL)
+					continue;
+
+				if (rinser->dtdsc_clean != NULL)
+					continue;
+
+				rinsep = &rinser->dtdsc_rinsing;
+				break;
+			}
+
+			if (j == NCPU) {
+				/*
+				 * We were unable to find another CPU that
+				 * could accept this dirty list -- we are
+				 * therefore unable to clean it now.
+				 */
+				dtrace_dynvar_failclean++;
+				continue;
+			}
+		}
+
 		work = 1;
 
 		/*
@@ -1389,7 +1922,7 @@ dtrace_dynvar_clean(dtrace_dstate_t *dst
 			 * on a hash chain, either the dirty list or the
 			 * rinsing list for some CPU must be non-NULL.)
 			 */
-			dcpu->dtdsc_rinsing = dirty;
+			*rinsep = dirty;
 			dtrace_membar_producer();
 		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
 		    dirty, NULL) != dirty);
@@ -1797,7 +2330,7 @@ retry:
 
 			/*
 			 * The clean list appears to be non-empty.  We want to
-			 * move the clean list to the free list; we start by
+			 * move the clean list to our free list; we start by
 			 * moving the clean pointer aside.
 			 */
 			if (dtrace_casptr(&dcpu->dtdsc_clean,
@@ -1833,6 +2366,7 @@ retry:
 			 * owners of the clean lists out before resetting
 			 * the clean lists.
 			 */
+			dcpu = &dstate->dtds_percpu[me];
 			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
 			ASSERT(rval == NULL);
 			goto retry;
@@ -1881,7 +2415,7 @@ retry:
 	 * this hash chain, or another CPU is deleting an element from this
 	 * hash chain.  The simplest way to deal with both of these cases
 	 * (though not necessarily the most efficient) is to free our
-	 * allocated block and tail-call ourselves.  Note that the free is
+	 * allocated block and re-attempt it all.  Note that the free is
 	 * to the dirty list and _not_ to the free list.  This is to prevent
 	 * races with allocators, above.
 	 */
@@ -1894,7 +2428,7 @@ retry:
 		dvar->dtdv_next = free;
 	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
 
-	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
+	goto top;
 }
 
 /*ARGSUSED*/
@@ -1974,6 +2508,75 @@ dtrace_aggregate_lquantize(uint64_t *lqu
 	lquanta[levels + 1] += incr;
 }
 
+static int
+dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
+    uint16_t high, uint16_t nsteps, int64_t value)
+{
+	int64_t this = 1, last, next;
+	int base = 1, order;
+
+	ASSERT(factor <= nsteps);
+	ASSERT(nsteps % factor == 0);
+
+	for (order = 0; order < low; order++)
+		this *= factor;
+
+	/*
+	 * If our value is less than our factor taken to the power of the
+	 * low order of magnitude, it goes into the zeroth bucket.
+	 */
+	if (value < (last = this))
+		return (0);
+
+	for (this *= factor; order <= high; order++) {
+		int nbuckets = this > nsteps ? nsteps : this;
+
+		if ((next = this * factor) < this) {
+			/*
+			 * We should not generally get log/linear quantizations
+			 * with a high magnitude that allows 64-bits to
+			 * overflow, but we nonetheless protect against this
+			 * by explicitly checking for overflow, and clamping
+			 * our value accordingly.
+			 */
+			value = this - 1;
+		}
+
+		if (value < this) {
+			/*
+			 * If our value lies within this order of magnitude,
+			 * determine its position by taking the offset within
+			 * the order of magnitude, dividing by the bucket
+			 * width, and adding to our (accumulated) base.
+			 */
+			return (base + (value - last) / (this / nbuckets));
+		}
+
+		base += nbuckets - (nbuckets / factor);
+		last = this;
+		this = next;
+	}
+
+	/*
+	 * Our value is greater than or equal to our factor taken to the
+	 * power of one plus the high magnitude -- return the top bucket.
+	 */
+	return (base);
+}
+
+static void
+dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
+{
+	uint64_t arg = *llquanta++;
+	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
+	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
+	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
+	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
+
+	llquanta[dtrace_aggregate_llquantize_bucket(factor,
+	    low, high, nsteps, nval)] += incr;
+}
+
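	/*
	 * Worked example (illustrative; the parameters are arbitrary):
	 * with factor = 10, low = 0, high = 6, nsteps = 20, the value 42
	 * is bucketed as follows.  The zeroth bucket ends at 10^low = 1,
	 * so we scan upward: order 0 covers [1, 10) with min(10, 20) = 10
	 * buckets, contributing 10 - 10/10 = 9 buckets (base becomes
	 * 1 + 9 = 10); order 1 covers [10, 100) with 20 buckets of width
	 * 100/20 = 5, so 42 lands in bucket 10 + (42 - 10)/5 = 16.
	 */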
 /*ARGSUSED*/
 static void
 dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
@@ -2334,9 +2937,10 @@ dtrace_speculation_commit(dtrace_state_t
 {
 	dtrace_speculation_t *spec;
 	dtrace_buffer_t *src, *dest;
-	uintptr_t daddr, saddr, dlimit;
+	uintptr_t daddr, saddr, dlimit, slimit;
 	dtrace_speculation_state_t current, new = 0;
 	intptr_t offs;
+	uint64_t timestamp;
 
 	if (which == 0)
 		return;
@@ -2412,7 +3016,37 @@ dtrace_speculation_commit(dtrace_state_t
 	}
 
 	/*
-	 * We have the space; copy the buffer across.  (Note that this is a
+	 * We have sufficient space to copy the speculative buffer into the
+	 * primary buffer.  First, modify the speculative buffer, filling
+	 * in the timestamp of all entries with the current time.  The data
+	 * must have the commit() time rather than the time it was traced,
+	 * so that all entries in the primary buffer are in timestamp order.
+	 */
+	timestamp = dtrace_gethrtime();
+	saddr = (uintptr_t)src->dtb_tomax;
+	slimit = saddr + src->dtb_offset;
+	while (saddr < slimit) {
+		size_t size;
+		dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
+
+		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
+			saddr += sizeof (dtrace_epid_t);
+			continue;
+		}
+		ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
+		size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
+
+		ASSERT3U(saddr + size, <=, slimit);
+		ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
+		ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
+
+		DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
+
+		saddr += size;
+	}
+
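/*
 * A minimal userland sketch of the record walk above.  The types and the
 * size lookup are hypothetical stand-ins for dtrace_rechdr_t and the
 * ECB-defined record size; only the walk pattern itself is the point.
 */
#include <stddef.h>
#include <stdint.h>

struct rec_hdr { uint32_t epid; };	/* hypothetical record header */
#define	EPID_NONE	0
size_t rec_size(uint32_t epid);		/* hypothetical per-EPID size lookup */

static void
walk_records(char *buf, size_t len, void (*visit)(struct rec_hdr *))
{
	char *p = buf, *limit = buf + len;

	while (p < limit) {
		struct rec_hdr *h = (struct rec_hdr *)(void *)p;

		if (h->epid == EPID_NONE) {
			/* A bare EPID word is alignment padding; skip it. */
			p += sizeof (uint32_t);
			continue;
		}
		visit(h);		/* e.g. stamp the commit() time */
		p += rec_size(h->epid);
	}
}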
+	/*
+	 * Copy the buffer across.  (Note that this is a
 	 * highly suboptimal bcopy(); in the unlikely event that this becomes
 	 * a serious performance issue, a high-performance DTrace-specific
 	 * bcopy() should obviously be invented.)
@@ -2736,7 +3370,6 @@ dtrace_dif_varstr(uintptr_t addr, dtrace
 	return (ret);
 }
 
-#ifdef notyet
 /*
 * Return a string from a memory address which is known to have one or
 * more concatenated, individually zero-terminated sub-strings.
@@ -2774,7 +3407,6 @@ dtrace_dif_varstrz(uintptr_t addr, size_
 	mstate->dtms_scratch_ptr += strsz;
 	return (ret);
 }
-#endif
 
 /*
  * This function implements the DIF emulator's variable lookups.  The emulator
@@ -2827,7 +3459,7 @@ dtrace_dif_variable(dtrace_mstate_t *mst
 
 		return (mstate->dtms_arg[ndx]);
 
-#if defined(sun)
+#ifdef illumos
 	case DIF_VAR_UREGS: {
 		klwp_t *lwp;
 
@@ -2844,9 +3476,41 @@ dtrace_dif_variable(dtrace_mstate_t *mst
 		return (0);
 	}
 #endif
+#ifdef __FreeBSD__
+	case DIF_VAR_UREGS: {
+		struct trapframe *tframe;
+
+		if (!dtrace_priv_proc(state))
+			return (0);
+
+		if ((tframe = curthread->td_frame) == NULL) {
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
+			cpu_core[curcpu].cpuc_dtrace_illval = 0;
+			return (0);
+		}
+
+		return (dtrace_getreg(tframe, ndx));
+	}
+#endif
+#ifdef __NetBSD__
+	case DIF_VAR_UREGS: {
+		struct trapframe *tframe;
+
+		if (!dtrace_priv_proc(state))
+			return (0);
+
+		if ((tframe = lwp_trapframe(curlwp)) == NULL) {
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
+			cpu_core[curcpu_id].cpuc_dtrace_illval = 0;
+			return (0);
+		}
+
+		return (dtrace_getreg(tframe, ndx));
+	}
+#endif
 
 	case DIF_VAR_CURTHREAD:
-		if (!dtrace_priv_kernel(state))
+		if (!dtrace_priv_proc(state))
 			return (0);
 		return ((uint64_t)(uintptr_t)curthread);
 
@@ -2868,7 +3532,7 @@ dtrace_dif_variable(dtrace_mstate_t *mst
 		}
 		return (mstate->dtms_walltimestamp);
 
-#if defined(sun)
+#ifdef illumos
 	case DIF_VAR_IPL:
 		if (!dtrace_priv_kernel(state))
 			return (0);
@@ -3005,7 +3669,7 @@ dtrace_dif_variable(dtrace_mstate_t *mst
 		if (!dtrace_priv_proc(state))
 			return (0);
 
-#if defined(sun)
+#ifdef illumos
 		/*
 		 * Note that we are assuming that an unanchored probe is
 		 * always due to a high-level interrupt.  (And we're assuming
@@ -3031,7 +3695,7 @@ dtrace_dif_variable(dtrace_mstate_t *mst
 		if (!dtrace_priv_proc(state))
 			return (0);
 
-#if defined(sun)
+#ifdef illumos
 		/*
 		 * See comment in DIF_VAR_PID.
 		 */
@@ -3046,11 +3710,14 @@ dtrace_dif_variable(dtrace_mstate_t *mst
 		 */
 		return ((uint64_t)curthread->t_procp->p_ppid);
 #else
-		return ((uint64_t)curproc->p_pptr->p_pid);
+		if (curproc->p_pid == proc0.p_pid)
+			return (curproc->p_pid);
+		else
+			return (curproc->p_pptr->p_pid);
 #endif
 
 	case DIF_VAR_TID:
-#if defined(sun)
+#ifdef illumos
 		/*
 		 * See comment in DIF_VAR_PID.
 		 */
@@ -3061,7 +3728,7 @@ dtrace_dif_variable(dtrace_mstate_t *mst
 		return ((uint64_t)curthread->t_tid);
 
 	case DIF_VAR_EXECARGS: {
-#if 0
+#ifdef __FreeBSD__
 		struct pargs *p_args = curthread->td_proc->p_args;
 
 		if (p_args == NULL)
@@ -3069,13 +3736,13 @@ dtrace_dif_variable(dtrace_mstate_t *mst
 
 		return (dtrace_dif_varstrz(
 		    (uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));
-#endif
-		/* XXX FreeBSD extension */
+#else
 		return 0;
+#endif
 	}
 
 	case DIF_VAR_EXECNAME:
-#if defined(sun)
+#ifdef illumos
 		if (!dtrace_priv_proc(state))
 			return (0);
 
@@ -3096,11 +3763,11 @@ dtrace_dif_variable(dtrace_mstate_t *mst
 		    state, mstate));
 #else
 		return (dtrace_dif_varstr(
-		    (uintptr_t) curthread->l_proc->p_comm, state, mstate));
+		    (uintptr_t) curproc->p_comm, state, mstate));
 #endif
 
 	case DIF_VAR_ZONENAME:
-#if defined(sun)
+#ifdef illumos
 		if (!dtrace_priv_proc(state))
 			return (0);
 
@@ -3127,7 +3794,7 @@ dtrace_dif_variable(dtrace_mstate_t *mst
 		if (!dtrace_priv_proc(state))
 			return (0);
 
-#if defined(sun)
+#ifdef illumos
 		/*
 		 * See comment in DIF_VAR_PID.
 		 */
@@ -3144,15 +3811,19 @@ dtrace_dif_variable(dtrace_mstate_t *mst
 		 * credential, since this is never NULL after process birth.
 		 */
 		return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
-#else
-		return (uint64_t)kauth_cred_getuid(curthread->t_procp->p_cred);
+#endif
+#ifdef __FreeBSD__
+		return ((uint64_t)curthread->td_ucred->cr_uid);
+#endif
+#ifdef __NetBSD__
+		return ((uint64_t)kauth_cred_getuid(curthread->t_procp->p_cred));
 #endif
 
 	case DIF_VAR_GID:
 		if (!dtrace_priv_proc(state))
 			return (0);
 
-#if defined(sun)
+#ifdef illumos
 		/*
 		 * See comment in DIF_VAR_PID.
 		 */
@@ -3169,12 +3840,16 @@ dtrace_dif_variable(dtrace_mstate_t *mst
 		 * credential, since this is never NULL after process birth.
 		 */
 		return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
-#else
-		return (uint64_t)kauth_cred_getgid(curthread->t_procp->p_cred);
+#endif
+#ifdef __FreeBSD__
+		return ((uint64_t)curthread->td_ucred->cr_gid);
+#endif
+#ifdef __NetBSD__
+		return ((uint64_t)kauth_cred_getgid(curthread->t_procp->p_cred));
 #endif
 
 	case DIF_VAR_ERRNO: {
-#if defined(sun)
+#ifdef illumos
 		klwp_t *lwp;
 		if (!dtrace_priv_proc(state))
 			return (0);
@@ -3195,20 +3870,482 @@ dtrace_dif_variable(dtrace_mstate_t *mst
 			return (0);
 
 		return ((uint64_t)lwp->lwp_errno);
-#else
-#if 0
-		return (curthread->l_errno);
-#else
-		return 0;	/* XXX TBD errno support at lwp level? */
 #endif
+#ifdef __FreeBSD__
+		return (curthread->td_errno);
+#endif
+#ifdef __NetBSD__
+		return 0;	/* XXX TBD errno support at lwp level? */
 #endif
 	}
+#ifndef illumos
+	case DIF_VAR_CPU: {
+		return curcpu_id;
+	}
+#endif
 	default:
 		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
 		return (0);
 	}
 }
 
+
+typedef enum dtrace_json_state {
+	DTRACE_JSON_REST = 1,
+	DTRACE_JSON_OBJECT,
+	DTRACE_JSON_STRING,
+	DTRACE_JSON_STRING_ESCAPE,
+	DTRACE_JSON_STRING_ESCAPE_UNICODE,
+	DTRACE_JSON_COLON,
+	DTRACE_JSON_COMMA,
+	DTRACE_JSON_VALUE,
+	DTRACE_JSON_IDENTIFIER,
+	DTRACE_JSON_NUMBER,
+	DTRACE_JSON_NUMBER_FRAC,
+	DTRACE_JSON_NUMBER_EXP,
+	DTRACE_JSON_COLLECT_OBJECT
+} dtrace_json_state_t;
+
+/*
+ * This function possesses just enough knowledge about JSON to extract a single
+ * value from a JSON string and store it in the scratch buffer.  It is able
+ * to extract nested object values, and members of arrays by index.
+ *
+ * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
+ * be looked up as we descend into the object tree.  e.g.
+ *
+ *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
+ *       with nelems = 5.
+ *
+ * The run time of this function must be bounded above by strsize to limit the
+ * amount of work done in probe context.  As such, it is implemented as a
+ * simple state machine, reading one character at a time using safe loads
+ * until we find the requested element, hit a parsing error or run off the
+ * end of the object or string.
+ *
+ * As there is no way for a subroutine to return an error without interrupting
+ * clause execution, we simply return NULL in the event of a missing key or any
+ * other error condition.  Each NULL return in this function is commented with
+ * the error condition it represents -- parsing or otherwise.
+ *
+ * The set of states for the state machine closely matches the JSON
+ * specification (http://json.org/).  Briefly:
+ *
+ *   DTRACE_JSON_REST:
+ *     Skip whitespace until we find either a top-level Object, moving
+ *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
+ *
+ *   DTRACE_JSON_OBJECT:
+ *     Locate the next key String in an Object.  Sets a flag to denote
+ *     the next String as a key string and moves to DTRACE_JSON_STRING.
+ *
+ *   DTRACE_JSON_COLON:
+ *     Skip whitespace until we find the colon that separates key Strings
+ *     from their values.  Once found, move to DTRACE_JSON_VALUE.
+ *
+ *   DTRACE_JSON_VALUE:
+ *     Detects the type of the next value (String, Number, Identifier, Object
+ *     or Array) and routes to the states that process that type.  Here we also
+ *     deal with the element selector list if we are requested to traverse down
+ *     into the object tree.
+ *
+ *   DTRACE_JSON_COMMA:
+ *     Skip whitespace until we find the comma that separates key-value pairs
+ *     in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
+ *     (similarly DTRACE_JSON_VALUE).  All following literal value processing
+ *     states return to this state at the end of their value, unless otherwise
+ *     noted.
+ *
+ *   DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
+ *     Processes a Number literal from the JSON, including any exponent
+ *     component that may be present.  Numbers are returned as strings, which
+ *     may be passed to strtoll() if an integer is required.
+ *
+ *   DTRACE_JSON_IDENTIFIER:
+ *     Processes a "true", "false" or "null" literal in the JSON.
+ *
+ *   DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
+ *   DTRACE_JSON_STRING_ESCAPE_UNICODE:
+ *     Processes a String literal from the JSON, whether the String denotes
+ *     a key, a value or part of a larger Object.  Handles all escape sequences
+ *     present in the specification, including four-digit unicode characters,
+ *     but merely includes the escape sequence without converting it to the
+ *     actual escaped character.  If the String is flagged as a key, we
+ *     move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
+ *
+ *   DTRACE_JSON_COLLECT_OBJECT:
+ *     This state collects an entire Object (or Array), correctly handling
+ *     embedded strings.  If the full element selector list matches this nested
+ *     object, we return the Object in full as a string.  If not, we use this
+ *     state to skip to the next value at this level and continue processing.
+ *
+ * NOTE: This function uses various macros from strtolctype.h to manipulate
+ * digit values, etc -- these have all been checked to ensure they make
+ * no additional function calls.
+ */
+static char *
+dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
+    char *dest)
+{
+	dtrace_json_state_t state = DTRACE_JSON_REST;
+	int64_t array_elem = INT64_MIN;
+	int64_t array_pos = 0;
+	uint8_t escape_unicount = 0;
+	boolean_t string_is_key = B_FALSE;
+	boolean_t collect_object = B_FALSE;
+	boolean_t found_key = B_FALSE;
+	boolean_t in_array = B_FALSE;
+	uint32_t braces = 0, brackets = 0;
+	char *elem = elemlist;
+	char *dd = dest;
+	uintptr_t cur;
+
+	for (cur = json; cur < json + size; cur++) {
+		char cc = dtrace_load8(cur);
+		if (cc == '\0')
+			return (NULL);
+
+		switch (state) {
+		case DTRACE_JSON_REST:
+			if (isspace(cc))
+				break;
+
+			if (cc == '{') {
+				state = DTRACE_JSON_OBJECT;
+				break;
+			}
+
+			if (cc == '[') {
+				in_array = B_TRUE;
+				array_pos = 0;
+				array_elem = dtrace_strtoll(elem, 10, size);
+				found_key = array_elem == 0 ? B_TRUE : B_FALSE;
+				state = DTRACE_JSON_VALUE;
+				break;
+			}
+
+			/*
+			 * ERROR: expected to find a top-level object or array.
+			 */
+			return (NULL);
+		case DTRACE_JSON_OBJECT:
+			if (isspace(cc))
+				break;
+
+			if (cc == '"') {
+				state = DTRACE_JSON_STRING;
+				string_is_key = B_TRUE;
+				break;
+			}
+
+			/*
+			 * ERROR: either the object did not start with a key
+			 * string, or we've run off the end of the object
+			 * without finding the requested key.
+			 */
+			return (NULL);
+		case DTRACE_JSON_STRING:
+			if (cc == '\\') {
+				*dd++ = '\\';
+				state = DTRACE_JSON_STRING_ESCAPE;
+				break;
+			}
+
+			if (cc == '"') {
+				if (collect_object) {
+					/*
+					 * We don't reset the dest here, as
+					 * the string is part of a larger
+					 * object being collected.
+					 */
+					*dd++ = cc;
+					collect_object = B_FALSE;
+					state = DTRACE_JSON_COLLECT_OBJECT;
+					break;
+				}
+				*dd = '\0';
+				dd = dest; /* reset string buffer */
+				if (string_is_key) {
+					if (dtrace_strncmp(dest, elem,
+					    size) == 0)
+						found_key = B_TRUE;
+				} else if (found_key) {
+					if (nelems > 1) {
+						/*
+						 * We expected an object, not
+						 * this string.
+						 */
+						return (NULL);
+					}
+					return (dest);
+				}
+				state = string_is_key ? DTRACE_JSON_COLON :
+				    DTRACE_JSON_COMMA;
+				string_is_key = B_FALSE;
+				break;
+			}
+
+			*dd++ = cc;
+			break;
+		case DTRACE_JSON_STRING_ESCAPE:
+			*dd++ = cc;
+			if (cc == 'u') {
+				escape_unicount = 0;
+				state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
+			} else {
+				state = DTRACE_JSON_STRING;
+			}
+			break;
+		case DTRACE_JSON_STRING_ESCAPE_UNICODE:
+			if (!isxdigit(cc)) {
+				/*
+				 * ERROR: invalid unicode escape, expected
+				 * four valid hexadecimal digits.
+				 */
+				return (NULL);
+			}
+
+			*dd++ = cc;
+			if (++escape_unicount == 4)
+				state = DTRACE_JSON_STRING;
+			break;
+		case DTRACE_JSON_COLON:
+			if (isspace(cc))
+				break;
+
+			if (cc == ':') {
+				state = DTRACE_JSON_VALUE;
+				break;
+			}
+
+			/*
+			 * ERROR: expected a colon.
+			 */
+			return (NULL);
+		case DTRACE_JSON_COMMA:
+			if (isspace(cc))
+				break;
+
+			if (cc == ',') {
+				if (in_array) {
+					state = DTRACE_JSON_VALUE;
+					if (++array_pos == array_elem)
+						found_key = B_TRUE;
+				} else {
+					state = DTRACE_JSON_OBJECT;
+				}
+				break;
+			}
+
+			/*
+			 * ERROR: either we hit an unexpected character, or
+			 * we reached the end of the object or array without
+			 * finding the requested key.
+			 */
+			return (NULL);
+		case DTRACE_JSON_IDENTIFIER:
+			if (islower(cc)) {
+				*dd++ = cc;
+				break;
+			}
+
+			*dd = '\0';
+			dd = dest; /* reset string buffer */
+
+			if (dtrace_strncmp(dest, "true", 5) == 0 ||
+			    dtrace_strncmp(dest, "false", 6) == 0 ||
+			    dtrace_strncmp(dest, "null", 5) == 0) {
+				if (found_key) {
+					if (nelems > 1) {
+						/*
+						 * ERROR: We expected an object,
+						 * not this identifier.
+						 */
+						return (NULL);
+					}
+					return (dest);
+				} else {
+					cur--;
+					state = DTRACE_JSON_COMMA;
+					break;
+				}
+			}
+
+			/*
+			 * ERROR: we did not recognise the identifier as one
+			 * of those in the JSON specification.
+			 */
+			return (NULL);
+		case DTRACE_JSON_NUMBER:
+			if (cc == '.') {
+				*dd++ = cc;
+				state = DTRACE_JSON_NUMBER_FRAC;
+				break;
+			}
+
+			if (cc == 'x' || cc == 'X') {
+				/*
+				 * ERROR: specification explicitly excludes
+				 * hexadecimal or octal numbers.
+				 */
+				return (NULL);
+			}
+
+			/* FALLTHRU */
+		case DTRACE_JSON_NUMBER_FRAC:
+			if (cc == 'e' || cc == 'E') {
+				*dd++ = cc;
+				state = DTRACE_JSON_NUMBER_EXP;
+				break;
+			}
+
+			if (cc == '+' || cc == '-') {
+				/*
+				 * ERROR: expect sign as part of exponent only.
+				 */
+				return (NULL);
+			}
+			/* FALLTHRU */
+		case DTRACE_JSON_NUMBER_EXP:
+			if (isdigit(cc) || cc == '+' || cc == '-') {
+				*dd++ = cc;
+				break;
+			}
+
+			*dd = '\0';
+			dd = dest; /* reset string buffer */
+			if (found_key) {
+				if (nelems > 1) {
+					/*
+					 * ERROR: We expected an object, not
+					 * this number.
+					 */
+					return (NULL);
+				}
+				return (dest);
+			}
+
+			cur--;
+			state = DTRACE_JSON_COMMA;
+			break;
+		case DTRACE_JSON_VALUE:
+			if (isspace(cc))
+				break;
+
+			if (cc == '{' || cc == '[') {
+				if (nelems > 1 && found_key) {
+					in_array = cc == '[' ? B_TRUE : B_FALSE;
+					/*
+					 * If our element selector directs us
+					 * to descend into this nested object,
+					 * then move to the next selector
+					 * element in the list and restart the
+					 * state machine.
+					 */
+					while (*elem != '\0')
+						elem++;
+					elem++; /* skip the inter-element NUL */
+					nelems--;
+					dd = dest;
+					if (in_array) {
+						state = DTRACE_JSON_VALUE;
+						array_pos = 0;
+						array_elem = dtrace_strtoll(
+						    elem, 10, size);
+						found_key = array_elem == 0 ?
+						    B_TRUE : B_FALSE;
+					} else {
+						found_key = B_FALSE;
+						state = DTRACE_JSON_OBJECT;
+					}
+					break;
+				}
+
+				/*
+				 * Otherwise, we wish to either skip this
+				 * nested object or return it in full.
+				 */
+				if (cc == '[')
+					brackets = 1;
+				else
+					braces = 1;
+				*dd++ = cc;
+				state = DTRACE_JSON_COLLECT_OBJECT;
+				break;
+			}
+
+			if (cc == '"') {
+				state = DTRACE_JSON_STRING;
+				break;
+			}
+
+			if (islower(cc)) {
+				/*
+				 * Here we deal with true, false and null.
+				 */
+				*dd++ = cc;
+				state = DTRACE_JSON_IDENTIFIER;
+				break;
+			}
+
+			if (cc == '-' || isdigit(cc)) {
+				*dd++ = cc;
+				state = DTRACE_JSON_NUMBER;
+				break;
+			}
+
+			/*
+			 * ERROR: unexpected character at start of value.
+			 */
+			return (NULL);
+		case DTRACE_JSON_COLLECT_OBJECT:
+			if (cc == '\0')
+				/*
+				 * ERROR: unexpected end of input.
+				 */
+				return (NULL);
+
+			*dd++ = cc;
+			if (cc == '"') {
+				collect_object = B_TRUE;
+				state = DTRACE_JSON_STRING;
+				break;
+			}
+
+			if (cc == ']') {
+				if (brackets-- == 0) {
+					/*
+					 * ERROR: unbalanced brackets.
+					 */
+					return (NULL);
+				}
+			} else if (cc == '}') {
+				if (braces-- == 0) {
+					/*
+					 * ERROR: unbalanced braces.
+					 */
+					return (NULL);
+				}
+			} else if (cc == '{') {
+				braces++;
+			} else if (cc == '[') {
+				brackets++;
+			}
+
+			if (brackets == 0 && braces == 0) {
+				if (found_key) {
+					*dd = '\0';
+					return (dest);
+				}
+				dd = dest; /* reset string buffer */
+				state = DTRACE_JSON_COMMA;
+			}
+			break;
+		}
+	}
+	return (NULL);
+}
+
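/*
 * Illustrative use (assumed input, mirroring the selector splitter in
 * DIF_SUBR_JSON below): the selector "foo[0].bar" is packed as
 * NUL-separated keys, and dtrace_json() walks the document with it.
 */
static const char elemlist[] = "foo\0" "0\0" "bar";	/* trailing NUL implicit */

/*
 * Against the document {"foo":[{"bar":42}]}, a call such as
 *
 *	dtrace_json(strsize, (uintptr_t)json, (char *)(uintptr_t)elemlist,
 *	    3, dest);
 *
 * copies "42" into dest and returns dest; a missing key or malformed
 * input returns NULL, as noted in each error comment above.
 */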
 /*
  * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
  * Notice that we don't bother validating the proper number of arguments or
@@ -3225,7 +4362,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 	volatile uintptr_t *illval = &cpu_core[curcpu_id].cpuc_dtrace_illval;
 	dtrace_vstate_t *vstate = &state->dts_vstate;
 
-#if defined(sun)
+#ifdef illumos
 	union {
 		mutex_impl_t mi;
 		uint64_t mx;
@@ -3235,7 +4372,15 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 		krwlock_t ri;
 		uintptr_t rw;
 	} r;
-#else
+#endif
+#ifdef __FreeBSD__
+	struct thread *lowner;
+	union {
+		struct lock_object *li;
+		uintptr_t lx;
+	} l;
+#endif
+#ifdef __NetBSD__
 	union {
 		kmutex_t mi;
 		uint64_t mx;
@@ -3249,10 +4394,11 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 
 	switch (subr) {
 	case DIF_SUBR_RAND:
-		regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
+		regs[rd] = dtrace_xoroshiro128_plus_next(
+		    state->dts_rstate[curcpu_id]);
 		break;
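/*
 * The rand() action now draws from per-CPU xoroshiro128+ state.  A minimal
 * userland sketch of the published generator follows (seeding of dts_rstate
 * is not shown, and this is not the kernel's exact implementation file):
 */
#include <stdint.h>

static inline uint64_t
rotl(uint64_t x, int k)
{
	return ((x << k) | (x >> (64 - k)));
}

/* xoroshiro128+; the 128-bit state must not be all zeroes. */
static uint64_t
xoroshiro128_plus_next(uint64_t s[2])
{
	uint64_t s0 = s[0], s1 = s[1];
	uint64_t result = s0 + s1;

	s1 ^= s0;
	s[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14);
	s[1] = rotl(s1, 36);
	return (result);
}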
 
-#if defined(sun)
+#ifdef illumos
 	case DIF_SUBR_MUTEX_OWNED:
 		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
 		    mstate, vstate)) {
@@ -3340,7 +4486,101 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 		regs[rd] = _RW_ISWRITER(&r.ri);
 		break;
 
-#else
+#endif /* illumos */
+#ifdef __FreeBSD__
+	case DIF_SUBR_MUTEX_OWNED:
+		if (!dtrace_canload(tupregs[0].dttk_value,
+			sizeof (struct lock_object), mstate, vstate)) {
+			regs[rd] = 0;
+			break;
+		}
+		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
+		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+		break;
+
+	case DIF_SUBR_MUTEX_OWNER:
+		if (!dtrace_canload(tupregs[0].dttk_value,
+			sizeof (struct lock_object), mstate, vstate)) {
+			regs[rd] = 0;
+			break;
+		}
+		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+		LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
+		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+		regs[rd] = (uintptr_t)lowner;
+		break;
+
+	case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
+		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
+		    mstate, vstate)) {
+			regs[rd] = 0;
+			break;
+		}
+		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+		regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SLEEPLOCK) != 0;
+		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+		break;
+
+	case DIF_SUBR_MUTEX_TYPE_SPIN:
+		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
+		    mstate, vstate)) {
+			regs[rd] = 0;
+			break;
+		}
+		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+		regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
+		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+		break;
+
+	case DIF_SUBR_RW_READ_HELD:
+	case DIF_SUBR_SX_SHARED_HELD:
+		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
+		    mstate, vstate)) {
+			regs[rd] = 0;
+			break;
+		}
+		l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
+		    lowner == NULL;
+		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+		break;
+
+	case DIF_SUBR_RW_WRITE_HELD:
+	case DIF_SUBR_SX_EXCLUSIVE_HELD:
+		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
+		    mstate, vstate)) {
+			regs[rd] = 0;
+			break;
+		}
+		l.lx = dtrace_loadptr(tupregs[0].dttk_value);
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+		regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
+		    lowner != NULL;
+		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+		break;
+
+	case DIF_SUBR_RW_ISWRITER:
+	case DIF_SUBR_SX_ISEXCLUSIVE:
+		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
+		    mstate, vstate)) {
+			regs[rd] = 0;
+			break;
+		}
+		l.lx = dtrace_loadptr(tupregs[0].dttk_value);
+		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+		LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
+		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+		regs[rd] = (lowner == curthread);
+		break;
+
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
 	case DIF_SUBR_MUTEX_OWNED:
 		if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
 		    mstate, vstate)) {
@@ -3426,7 +4666,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 		regs[rd] = _RW_ISWRITER(&r.ri);
 		break;
 
-#endif /* ! defined(sun) */
+#endif /* __NetBSD__ */
 
 	case DIF_SUBR_BCOPY: {
 		/*
@@ -3536,7 +4776,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 		break;
 	}
 
-#if defined(sun)
+#ifdef illumos
 	case DIF_SUBR_MSGSIZE:
 	case DIF_SUBR_MSGDSIZE: {
 		uintptr_t baddr = tupregs[0].dttk_value, daddr;
@@ -3604,7 +4844,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 
 		for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
-#if defined(sun)
+#ifdef illumos
 			if (p->p_pidp->pid_id == pid) {
 #else
 			if (p->p_pid == pid) {
@@ -3643,30 +4883,30 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 		uintptr_t kaddr = tupregs[0].dttk_value;
 		uintptr_t uaddr = tupregs[1].dttk_value;
 		uint64_t size = tupregs[2].dttk_value;
+		size_t lim;
 
 		if (!dtrace_destructive_disallow &&
 		    dtrace_priv_proc_control(state) &&
-		    !dtrace_istoxic(kaddr, size)) {
+		    !dtrace_istoxic(kaddr, size) &&
+		    dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
 			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
-			dtrace_copyoutstr(kaddr, uaddr, size, flags);
+			dtrace_copyoutstr(kaddr, uaddr, lim, flags);
 			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 		}
 		break;
 	}
 
 	case DIF_SUBR_STRLEN: {
-		size_t sz;
+		size_t size = state->dts_options[DTRACEOPT_STRSIZE];
 		uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
-		sz = dtrace_strlen((char *)addr,
-		    state->dts_options[DTRACEOPT_STRSIZE]);
+		size_t lim;
 
-		if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
+		if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
 			regs[rd] = 0;
 			break;
 		}
 
-		regs[rd] = sz;
-
+		regs[rd] = dtrace_strlen((char *)addr, lim);
 		break;
 	}
 
@@ -3679,12 +4919,19 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 		 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
 		 * of the specified character instead of the first.
 		 */
-		uintptr_t saddr = tupregs[0].dttk_value;
 		uintptr_t addr = tupregs[0].dttk_value;
-		uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
+		uintptr_t addr_limit;
+		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+		size_t lim;
 		char c, target = (char)tupregs[1].dttk_value;
 
-		for (regs[rd] = 0; addr < limit; addr++) {
+		if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
+			regs[rd] = 0;
+			break;
+		}
+		addr_limit = addr + lim;
+
+		for (regs[rd] = 0; addr < addr_limit; addr++) {
 			if ((c = dtrace_load8(addr)) == target) {
 				regs[rd] = addr;
 
@@ -3695,12 +4942,6 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 			if (c == '\0')
 				break;
 		}
-
-		if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
-			regs[rd] = 0;
-			break;
-		}
-
 		break;
 	}
 
@@ -3858,7 +5099,8 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 		uintptr_t addr = tupregs[0].dttk_value;
 		uintptr_t tokaddr = tupregs[1].dttk_value;
 		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
-		uintptr_t limit, toklimit = tokaddr + size;
+		uintptr_t limit, toklimit;
+		size_t clim;
 		uint8_t c = 0, tokmap[32];	 /* 256 / 8 */
 		char *dest = (char *)mstate->dtms_scratch_ptr;
 		int i;
@@ -3867,10 +5109,11 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 		 * Check both the token buffer and (later) the input buffer,
 		 * since both could be non-scratch addresses.
 		 */
-		if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
+		if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
 			regs[rd] = 0;
 			break;
 		}
+		toklimit = tokaddr + clim;
 
 		if (!DTRACE_INSCRATCH(mstate, size)) {
 			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
@@ -3887,6 +5130,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 			 * it behaves like an implicit clause-local variable.
 			 */
 			addr = mstate->dtms_strtok;
+			limit = mstate->dtms_strtok_limit;
 		} else {
 			/*
 			 * If the user-specified address is non-NULL we must
@@ -3896,10 +5140,12 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 			 * (when we fetch addr from mstate->dtms_strtok)
 			 * would fail this access check.
 			 */
-			if (!dtrace_strcanload(addr, size, mstate, vstate)) {
+			if (!dtrace_strcanload(addr, size, &clim, mstate,
+			    vstate)) {
 				regs[rd] = 0;
 				break;
 			}
+			limit = addr + clim;
 		}
 
 		/*
@@ -3918,10 +5164,10 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 			tokmap[c >> 3] |= (1 << (c & 0x7));
 		}
 
-		for (limit = addr + size; addr < limit; addr++) {
+		for (; addr < limit; addr++) {
 			/*
-			 * We're looking for a character that is _not_ contained
-			 * in the token string.
+			 * We're looking for a character that is _not_
+			 * contained in the token string.
 			 */
 			if ((c = dtrace_load8(addr)) == '\0')
 				break;
@@ -3939,6 +5185,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 			 */
 			regs[rd] = 0;
 			mstate->dtms_strtok = 0;
+			mstate->dtms_strtok_limit = 0;
 			break;
 		}
 
@@ -3949,29 +5196,151 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 			if ((c = dtrace_load8(addr)) == '\0')
 				break;
 
-			if (tokmap[c >> 3] & (1 << (c & 0x7)))
-				break;
+			if (tokmap[c >> 3] & (1 << (c & 0x7)))
+				break;
+
+			ASSERT(i < size);
+			dest[i++] = c;
+		}
+
+		ASSERT(i < size);
+		dest[i] = '\0';
+		regs[rd] = (uintptr_t)dest;
+		mstate->dtms_scratch_ptr += size;
+		mstate->dtms_strtok = addr;
+		mstate->dtms_strtok_limit = limit;
+		break;
+	}
+
+	case DIF_SUBR_SUBSTR: {
+		uintptr_t s = tupregs[0].dttk_value;
+		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+		char *d = (char *)mstate->dtms_scratch_ptr;
+		int64_t index = (int64_t)tupregs[1].dttk_value;
+		int64_t remaining = (int64_t)tupregs[2].dttk_value;
+		size_t len = dtrace_strlen((char *)s, size);
+		int64_t i;
+
+		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
+			regs[rd] = 0;
+			break;
+		}
+
+		if (!DTRACE_INSCRATCH(mstate, size)) {
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+			regs[rd] = 0;
+			break;
+		}
+
+		if (nargs <= 2)
+			remaining = (int64_t)size;
+
+		if (index < 0) {
+			index += len;
+
+			if (index < 0 && index + remaining > 0) {
+				remaining += index;
+				index = 0;
+			}
+		}
+
+		if (index >= len || index < 0) {
+			remaining = 0;
+		} else if (remaining < 0) {
+			remaining += len - index;
+		} else if (index + remaining > size) {
+			remaining = size - index;
+		}
+
+		for (i = 0; i < remaining; i++) {
+			if ((d[i] = dtrace_load8(s + index + i)) == '\0')
+				break;
+		}
+
+		d[i] = '\0';
+
+		mstate->dtms_scratch_ptr += size;
+		regs[rd] = (uintptr_t)d;
+		break;
+	}
+
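/*
 * A minimal userland sketch of the index/remaining normalization above
 * (illustrative; no scratch-space or safe-load handling).  For example,
 * substr_norm("hello", -3, 2, d, 256) yields "ll" and
 * substr_norm("hello", 1, -1, d, 256) yields "ell".
 */
#include <string.h>

static void
substr_norm(const char *s, long long index, long long remaining,
    char *d, long long size)
{
	long long len = (long long)strlen(s), i;

	if (index < 0) {
		index += len;
		if (index < 0 && index + remaining > 0) {
			remaining += index;
			index = 0;
		}
	}

	if (index >= len || index < 0)
		remaining = 0;
	else if (remaining < 0)
		remaining += len - index;
	else if (index + remaining > size)
		remaining = size - index;

	for (i = 0; i < remaining; i++)
		if ((d[i] = s[index + i]) == '\0')
			break;
	d[i] = '\0';
}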
+	case DIF_SUBR_JSON: {
+		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+		uintptr_t json = tupregs[0].dttk_value;
+		size_t jsonlen = dtrace_strlen((char *)json, size);
+		uintptr_t elem = tupregs[1].dttk_value;
+		size_t elemlen = dtrace_strlen((char *)elem, size);
+
+		char *dest = (char *)mstate->dtms_scratch_ptr;
+		char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
+		char *ee = elemlist;
+		int nelems = 1;
+		uintptr_t cur;
+
+		if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
+		    !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
+			regs[rd] = 0;
+			break;
+		}
+
+		if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+			regs[rd] = 0;
+			break;
+		}
+
+		/*
+		 * Read the element selector and split it up into a packed list
+		 * of strings.
+		 */
+		for (cur = elem; cur < elem + elemlen; cur++) {
+			char cc = dtrace_load8(cur);
+
+			if (cur == elem && cc == '[') {
+				/*
+				 * If the first element selector key is
+				 * actually an array index then ignore the
+				 * bracket.
+				 */
+				continue;
+			}
+
+			if (cc == ']')
+				continue;
 
-			ASSERT(i < size);
-			dest[i++] = c;
+			if (cc == '.' || cc == '[') {
+				nelems++;
+				cc = '\0';
+			}
+
+			*ee++ = cc;
 		}
+		*ee++ = '\0';
 
-		ASSERT(i < size);
-		dest[i] = '\0';
-		regs[rd] = (uintptr_t)dest;
-		mstate->dtms_scratch_ptr += size;
-		mstate->dtms_strtok = addr;
+		if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
+		    nelems, dest)) != 0)
+			mstate->dtms_scratch_ptr += jsonlen + 1;
 		break;
 	}
 
-	case DIF_SUBR_SUBSTR: {
+	case DIF_SUBR_TOUPPER:
+	case DIF_SUBR_TOLOWER: {
 		uintptr_t s = tupregs[0].dttk_value;
 		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
-		char *d = (char *)mstate->dtms_scratch_ptr;
-		int64_t index = (int64_t)tupregs[1].dttk_value;
-		int64_t remaining = (int64_t)tupregs[2].dttk_value;
+		char *dest = (char *)mstate->dtms_scratch_ptr, c;
 		size_t len = dtrace_strlen((char *)s, size);
-		int64_t i = 0;
+		char lower, upper, convert;
+		int64_t i;
+
+		if (subr == DIF_SUBR_TOUPPER) {
+			lower = 'a';
+			upper = 'z';
+			convert = 'A';
+		} else {
+			lower = 'A';
+			upper = 'Z';
+			convert = 'a';
+		}
 
 		if (!dtrace_canload(s, len + 1, mstate, vstate)) {
 			regs[rd] = 0;
@@ -3984,39 +5353,24 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 			break;
 		}
 
-		if (nargs <= 2)
-			remaining = (int64_t)size;
-
-		if (index < 0) {
-			index += len;
-
-			if (index < 0 && index + remaining > 0) {
-				remaining += index;
-				index = 0;
-			}
-		}
+		for (i = 0; i < size - 1; i++) {
+			if ((c = dtrace_load8(s + i)) == '\0')
+				break;
 
-		if (index >= len || index < 0) {
-			remaining = 0;
-		} else if (remaining < 0) {
-			remaining += len - index;
-		} else if (index + remaining > size) {
-			remaining = size - index;
-		}
+			if (c >= lower && c <= upper)
+				c = convert + (c - lower);
 
-		for (i = 0; i < remaining; i++) {
-			if ((d[i] = dtrace_load8(s + index + i)) == '\0')
-				break;
+			dest[i] = c;
 		}
 
-		d[i] = '\0';
-
+		ASSERT(i < size);
+		dest[i] = '\0';
+		regs[rd] = (uintptr_t)dest;
 		mstate->dtms_scratch_ptr += size;
-		regs[rd] = (uintptr_t)d;
 		break;
 	}
 
-#if defined(sun)
+#ifdef illumos
 	case DIF_SUBR_GETMAJOR:
 #ifdef _LP64
 		regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
@@ -4232,10 +5586,12 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 		uintptr_t s1 = tupregs[0].dttk_value;
 		uintptr_t s2 = tupregs[1].dttk_value;
-		int i = 0;
+		int i = 0, j = 0;
+		size_t lim1, lim2;
+		char c;
 
-		if (!dtrace_strcanload(s1, size, mstate, vstate) ||
-		    !dtrace_strcanload(s2, size, mstate, vstate)) {
+		if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
+		    !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
 			regs[rd] = 0;
 			break;
 		}
@@ -4253,7 +5609,8 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 				break;
 			}
 
-			if ((d[i++] = dtrace_load8(s1++)) == '\0') {
+			c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
+			if ((d[i++] = c) == '\0') {
 				i--;
 				break;
 			}
@@ -4266,7 +5623,8 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 				break;
 			}
 
-			if ((d[i++] = dtrace_load8(s2++)) == '\0')
+			c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
+			if ((d[i++] = c) == '\0')
 				break;
 		}
 
@@ -4278,11 +5636,45 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 		break;
 	}
 
+	case DIF_SUBR_STRTOLL: {
+		uintptr_t s = tupregs[0].dttk_value;
+		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+		size_t lim;
+		int base = 10;
+
+		if (nargs > 1) {
+			if ((base = tupregs[1].dttk_value) <= 1 ||
+			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
+				*flags |= CPU_DTRACE_ILLOP;
+				break;
+			}
+		}
+
+		if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) {
+			regs[rd] = INT64_MIN;
+			break;
+		}
+
+		regs[rd] = dtrace_strtoll((char *)s, base, lim);
+		break;
+	}
+
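	/*
	 * Note: the base check above admits bases 2 through 36 (digits
	 * '0'-'9' plus 'a'-'z'); the same bound guards lltostr() below.
	 */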
 	case DIF_SUBR_LLTOSTR: {
 		int64_t i = (int64_t)tupregs[0].dttk_value;
-		int64_t val = i < 0 ? i * -1 : i;
-		uint64_t size = 22;	/* enough room for 2^64 in decimal */
+		uint64_t val, digit;
+		uint64_t size = 65;	/* enough room for 2^64 in binary */
 		char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
+		int base = 10;
+
+		if (nargs > 1) {
+			if ((base = tupregs[1].dttk_value) <= 1 ||
+			    base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
+				*flags |= CPU_DTRACE_ILLOP;
+				break;
+			}
+		}
+
+		val = (base == 10 && i < 0) ? i * -1 : i;
 
 		if (!DTRACE_INSCRATCH(mstate, size)) {
 			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
@@ -4290,13 +5682,24 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 			break;
 		}
 
-		for (*end-- = '\0'; val; val /= 10)
-			*end-- = '0' + (val % 10);
+		for (*end-- = '\0'; val; val /= base) {
+			if ((digit = val % base) <= '9' - '0') {
+				*end-- = '0' + digit;
+			} else {
+				*end-- = 'a' + (digit - ('9' - '0') - 1);
+			}
+		}
+
+		if (i == 0 && base == 16)
+			*end-- = '0';
+
+		if (base == 16)
+			*end-- = 'x';
 
-		if (i == 0)
+		if (i == 0 || base == 8 || base == 16)
 			*end-- = '0';
 
-		if (i < 0)
+		if (i < 0 && base == 10)
 			*end-- = '-';
 
 		regs[rd] = (uintptr_t)end + 1;
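	/*
	 * Worked examples (illustrative): lltostr(255, 16) emits "ff",
	 * then the 'x' and a leading '0', returning "0xff"; lltostr(8, 8)
	 * returns "010"; lltostr(-255) returns "-255", the sign being
	 * applied in base 10 only.
	 */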
@@ -4465,13 +5868,39 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 		break;
 	}
 
+	case DIF_SUBR_GETF: {
+		uintptr_t fd = tupregs[0].dttk_value;
+#ifdef __FreeBSD__
+		struct filedesc *fdp;
+		file_t *fp;
+#endif
+
+		if (!dtrace_priv_proc(state)) {
+			regs[rd] = 0;
+			break;
+		}
+#ifdef __FreeBSD__
+		fdp = curproc->p_fd;
+		FILEDESC_SLOCK(fdp);
+		fp = fget_locked(fdp, fd);
+		mstate->dtms_getf = fp;
+		regs[rd] = (uintptr_t)fp;
+		FILEDESC_SUNLOCK(fdp);
+#endif
+#ifdef __NetBSD__
+		regs[rd] = 0;
+#endif
+		break;
+	}
 	case DIF_SUBR_CLEANPATH: {
 		char *dest = (char *)mstate->dtms_scratch_ptr, c;
 		uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 		uintptr_t src = tupregs[0].dttk_value;
+		size_t lim;
 		int i = 0, j = 0;
+#ifdef illumos
+		zone_t *z;
+#endif
 
-		if (!dtrace_strcanload(src, size, mstate, vstate)) {
+		if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
 			regs[rd] = 0;
 			break;
 		}
@@ -4486,7 +5915,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, 
 		 * Move forward, loading each character.
 		 */
 		do {
-			c = dtrace_load8(src + i++);
+			c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
 next:
 			if (j + 5 >= size)	/* 5 = strlen("/..c\0") */
 				break;
@@ -4496,7 +5925,7 @@ next:
 				continue;
 			}
 
-			c = dtrace_load8(src + i++);
+			c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
 
 			if (c == '/') {
 				/*
@@ -4517,7 +5946,7 @@ next:
 				continue;
 			}
 
-			c = dtrace_load8(src + i++);
+			c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
 
 			if (c == '/') {
 				/*
@@ -4540,7 +5969,7 @@ next:
 				continue;
 			}
 
-			c = dtrace_load8(src + i++);
+			c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
 
 			if (c != '/' && c != '\0') {
 				/*
@@ -4568,6 +5997,25 @@ next:
 		} while (c != '\0');
 
 		dest[j] = '\0';
+
+#ifdef illumos
+		if (mstate->dtms_getf != NULL &&
+		    !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
+		    (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
+			/*
+			 * If we've done a getf() as a part of this ECB and we
+			 * don't have kernel access (and we're not in the global
+			 * zone), check if the path we cleaned up begins with
+			 * the zone's root path, and trim it off if so.  Note
+			 * that this is an output cleanliness issue, not a
+			 * security issue: knowing one's zone root path does
+			 * not enable privilege escalation.
+			 */
+			if (strstr(dest, z->zone_rootpath) == dest)
+				dest += strlen(z->zone_rootpath) - 1;
+		}
+#endif
+
 		regs[rd] = (uintptr_t)dest;
 		mstate->dtms_scratch_ptr += size;
 		break;
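	/*
	 * Example (illustrative): with a zone_rootpath of "/zones/z1/root/",
	 * a consumer that has done a getf() without kernel access sees
	 * "/zones/z1/root/etc/passwd" reported as "/etc/passwd" -- the
	 * "- 1" above retains the root path's trailing slash as the new
	 * leading '/'.
	 */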
@@ -4592,6 +6040,12 @@ next:
 			ipaddr_t ip4;
 			uint8_t *ptr8, val;
 
+			if (!dtrace_canload(tupregs[argi].dttk_value,
+			    sizeof (ipaddr_t), mstate, vstate)) {
+				regs[rd] = 0;
+				break;
+			}
+
 			/*
 			 * Safely load the IPv4 address.
 			 */
@@ -4645,6 +6099,12 @@ next:
 			 * just the IPv4 string is returned for inet_ntoa6.
 			 */
 
+			if (!dtrace_canload(tupregs[argi].dttk_value,
+			    sizeof (struct in6_addr), mstate, vstate)) {
+				regs[rd] = 0;
+				break;
+			}
+
 			/*
 			 * Safely load the IPv6 address.
 			 */
@@ -4673,7 +6133,7 @@ next:
 			tryzero = -1;
 			numzero = 1;
 			for (i = 0; i < sizeof (struct in6_addr); i++) {
-#if defined(sun)
+#ifdef illumos
 				if (ip6._S6_un._S6_u8[i] == 0 &&
 #else
 				if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
@@ -4684,7 +6144,7 @@ next:
 				}
 
 				if (tryzero != -1 &&
-#if defined(sun)
+#ifdef illumos
 				    (ip6._S6_un._S6_u8[i] != 0 ||
 #else
 				    (ip6.__u6_addr.__u6_addr8[i] != 0 ||
@@ -4700,7 +6160,7 @@ next:
 					numzero = i - i % 2 - tryzero;
 					tryzero = -1;
 
-#if defined(sun)
+#ifdef illumos
 					if (ip6._S6_un._S6_u8[i] == 0 &&
 #else
 					if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
@@ -4721,7 +6181,7 @@ next:
 				    i >= DTRACE_V4MAPPED_OFFSET; i--) {
 					ASSERT(end >= base);
 
-#if defined(sun)
+#ifdef illumos
 					val = ip6._S6_un._S6_u8[i];
 #else
 					val = ip6.__u6_addr.__u6_addr8[i];
@@ -4766,7 +6226,7 @@ next:
 				if (i < 14 && i != firstzero - 2)
 					*end-- = ':';
 
-#if defined(sun)
+#ifdef illumos
 				val = (ip6._S6_un._S6_u8[i] << 8) +
 				    ip6._S6_un._S6_u8[i + 1];
 #else
@@ -4812,21 +6272,44 @@ inetout:	regs[rd] = (uintptr_t)end + 1;
 		break;
 	}
 
-	case DIF_SUBR_TYPEREF: {
-		uintptr_t size = 4 * sizeof(uintptr_t);
-		uintptr_t *typeref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
-		size_t scratch_size = ((uintptr_t) typeref - mstate->dtms_scratch_ptr) + size;
-
-		/* address, num_elements, type_str, type_len */
-		typeref[0] = tupregs[0].dttk_value;
-		typeref[1] = tupregs[1].dttk_value;
-		typeref[2] = tupregs[2].dttk_value;
-		typeref[3] = tupregs[3].dttk_value;
+#ifndef illumos
+	case DIF_SUBR_MEMSTR: {
+		char *str = (char *)mstate->dtms_scratch_ptr;
+		uintptr_t mem = tupregs[0].dttk_value;
+		char c = tupregs[1].dttk_value;
+		size_t size = tupregs[2].dttk_value;
+		uint8_t n;
+		int i;
+
+		regs[rd] = 0;
 
-		regs[rd] = (uintptr_t) typeref;
-		mstate->dtms_scratch_ptr += scratch_size;
+		if (size == 0)
+			break;
+
+		if (!dtrace_canload(mem, size - 1, mstate, vstate))
+			break;
+
+		if (!DTRACE_INSCRATCH(mstate, size)) {
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+			break;
+		}
+
+		if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) {
+			*flags |= CPU_DTRACE_ILLOP;
+			break;
+		}
+
+		for (i = 0; i < size - 1; i++) {
+			n = dtrace_load8(mem++);
+			str[i] = (n == 0) ? c : n;
+		}
+		str[size - 1] = 0;
+
+		regs[rd] = (uintptr_t)str;
+		mstate->dtms_scratch_ptr += size;
 		break;
 	}
+#endif
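	/*
	 * Worked example (illustrative): memstr() over the six bytes
	 * "ps\0-x\0" with c = ' ' produces "ps -x" -- embedded NULs become
	 * the separator and the final byte is forced to NUL, which is how
	 * packed argument strings (e.g. execargs) can be rendered.
	 */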
 	}
 }
 
@@ -5002,102 +6485,95 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 				pc = DIF_INSTR_LABEL(instr);
 			break;
 		case DIF_OP_RLDSB:
-			if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDSB:
 			regs[rd] = (int8_t)dtrace_load8(regs[r1]);
 			break;
 		case DIF_OP_RLDSH:
-			if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDSH:
 			regs[rd] = (int16_t)dtrace_load16(regs[r1]);
 			break;
 		case DIF_OP_RLDSW:
-			if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDSW:
 			regs[rd] = (int32_t)dtrace_load32(regs[r1]);
 			break;
 		case DIF_OP_RLDUB:
-			if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 1, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDUB:
 			regs[rd] = dtrace_load8(regs[r1]);
 			break;
 		case DIF_OP_RLDUH:
-			if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 2, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDUH:
 			regs[rd] = dtrace_load16(regs[r1]);
 			break;
 		case DIF_OP_RLDUW:
-			if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 4, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDUW:
 			regs[rd] = dtrace_load32(regs[r1]);
 			break;
 		case DIF_OP_RLDX:
-			if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
-				*flags |= CPU_DTRACE_KPRIV;
-				*illval = regs[r1];
+			if (!dtrace_canload(regs[r1], 8, mstate, vstate))
 				break;
-			}
 			/*FALLTHROUGH*/
 		case DIF_OP_LDX:
 			regs[rd] = dtrace_load64(regs[r1]);
 			break;
 		case DIF_OP_ULDSB:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] = (int8_t)
 			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_ULDSH:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] = (int16_t)
 			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_ULDSW:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] = (int32_t)
 			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_ULDUB:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] =
 			    dtrace_fuword8((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_ULDUH:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] =
 			    dtrace_fuword16((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_ULDUW:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] =
 			    dtrace_fuword32((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_ULDX:
+			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 			regs[rd] =
 			    dtrace_fuword64((void *)(uintptr_t)regs[r1]);
+			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 			break;
 		case DIF_OP_RET:
 			rval = regs[rd];
@@ -5116,15 +6592,17 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 			size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
 			uintptr_t s1 = regs[r1];
 			uintptr_t s2 = regs[r2];
+			size_t lim1, lim2;
 
 			if (s1 != 0 &&
-			    !dtrace_strcanload(s1, sz, mstate, vstate))
+			    !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
 				break;
 			if (s2 != 0 &&
-			    !dtrace_strcanload(s2, sz, mstate, vstate))
+			    !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
 				break;
 
-			cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
+			cc_r = dtrace_strncmp((char *)s1, (char *)s2,
+			    MIN(lim1, lim2));
 
 			cc_n = cc_r < 0;
 			cc_z = cc_r == 0;
@@ -5176,12 +6654,14 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 			ASSERT(id >= DIF_VAR_OTHER_UBASE);
 			id -= DIF_VAR_OTHER_UBASE;
 
+			VERIFY(id < vstate->dtvs_nglobals);
 			svar = vstate->dtvs_globals[id];
 			ASSERT(svar != NULL);
 			v = &svar->dtsv_var;
 
 			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
 				uintptr_t a = (uintptr_t)svar->dtsv_data;
+				size_t lim;
 
 				ASSERT(a != 0);
 				ASSERT(svar->dtsv_size != 0);
@@ -5195,11 +6675,11 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 				}
 				if (!dtrace_vcanload(
 				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
-				    mstate, vstate))
+				    &lim, mstate, vstate))
 					break;
 
 				dtrace_vcopy((void *)(uintptr_t)regs[rd],
-				    (void *)a, &v->dtdv_type);
+				    (void *)a, &v->dtdv_type, lim);
 				break;
 			}
 
@@ -5238,6 +6718,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
 				uintptr_t a = (uintptr_t)svar->dtsv_data;
 				size_t sz = v->dtdv_type.dtdt_size;
+				size_t lim;
 
 				sz += sizeof (uint64_t);
 				ASSERT(svar->dtsv_size == NCPU * sz);
@@ -5267,7 +6748,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 
 			ASSERT(id >= DIF_VAR_OTHER_UBASE);
 			id -= DIF_VAR_OTHER_UBASE;
-			ASSERT(id < vstate->dtvs_nlocals);
+			VERIFY(id < vstate->dtvs_nlocals);
 
 			ASSERT(vstate->dtvs_locals != NULL);
 			svar = vstate->dtvs_locals[id];
@@ -5277,6 +6758,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
 				uintptr_t a = (uintptr_t)svar->dtsv_data;
 				size_t sz = v->dtdv_type.dtdt_size;
+				size_t lim;
 
 				sz += sizeof (uint64_t);
 				ASSERT(svar->dtsv_size == NCPU * sz);
@@ -5292,11 +6774,11 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 
 				if (!dtrace_vcanload(
 				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
-				    mstate, vstate))
+				    &lim, mstate, vstate))
 					break;
 
 				dtrace_vcopy((void *)(uintptr_t)regs[rd],
-				    (void *)a, &v->dtdv_type);
+				    (void *)a, &v->dtdv_type, lim);
 				break;
 			}
 
@@ -5345,6 +6827,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 			id = DIF_INSTR_VAR(instr);
 			ASSERT(id >= DIF_VAR_OTHER_UBASE);
 			id -= DIF_VAR_OTHER_UBASE;
+			VERIFY(id < vstate->dtvs_ntlocals);
 
 			key = &tupregs[DIF_DTR_NREGS];
 			key[0].dttk_value = (uint64_t)id;
@@ -5369,13 +6852,15 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 				break;
 
 			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
+				size_t lim;
+
 				if (!dtrace_vcanload(
 				    (void *)(uintptr_t)regs[rd],
-				    &v->dtdv_type, mstate, vstate))
+				    &v->dtdv_type, &lim, mstate, vstate))
 					break;
 
 				dtrace_vcopy((void *)(uintptr_t)regs[rd],
-				    dvar->dtdv_data, &v->dtdv_type);
+				    dvar->dtdv_data, &v->dtdv_type, lim);
 			} else {
 				*((uint64_t *)dvar->dtdv_data) = regs[rd];
 			}
@@ -5412,6 +6897,11 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 				    regs[r2] ? regs[r2] :
 				    dtrace_strsize_default) + 1;
 			} else {
+				if (regs[r2] > LONG_MAX) {
+					*flags |= CPU_DTRACE_ILLOP;
+					break;
+				}
+
 				tupregs[ttop].dttk_size = regs[r2];
 			}
 
@@ -5453,8 +6943,10 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 			if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
 				DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
 				key[nkeys++].dttk_size = 0;
+				VERIFY(id < vstate->dtvs_ntlocals);
 				v = &vstate->dtvs_tlocals[id];
 			} else {
+				VERIFY(id < vstate->dtvs_nglobals);
 				v = &vstate->dtvs_globals[id]->dtsv_var;
 			}
 
@@ -5508,13 +7000,15 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 				break;
 
 			if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
+				size_t lim;
+
 				if (!dtrace_vcanload(
 				    (void *)(uintptr_t)regs[rd], &v->dtdv_type,
-				    mstate, vstate))
+				    &lim, mstate, vstate))
 					break;
 
 				dtrace_vcopy((void *)(uintptr_t)regs[rd],
-				    dvar->dtdv_data, &v->dtdv_type);
+				    dvar->dtdv_data, &v->dtdv_type, lim);
 			} else {
 				*((uint64_t *)dvar->dtdv_data) = regs[rd];
 			}
@@ -5551,6 +7045,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, 
 				*illval = regs[rd];
 				break;
 			}
+
 			if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
 				break;
 
@@ -5679,9 +7174,15 @@ dtrace_action_breakpoint(dtrace_ecb_t *e
 	c[i++] = ')';
 	c[i] = '\0';
 
-#if defined(sun)
+#ifdef illumos
 	debug_enter(c);
-#else
+#endif
+
+#ifdef __FreeBSD__
+	kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
+#endif
+
+#ifdef __NetBSD__
 #ifdef DDB
 	db_printf("%s\n", c);
 	Debugger();
@@ -5731,7 +7232,7 @@ dtrace_action_raise(uint64_t sig)
 		return;
 	}
 
-#if defined(sun)
+#ifdef illumos
 	/*
 	 * raise() has a queue depth of 1 -- we ignore all subsequent
 	 * invocations of the raise() action.
@@ -5741,7 +7242,15 @@ dtrace_action_raise(uint64_t sig)
 
 	curthread->t_sig_check = 1;
 	aston(curthread);
-#else
+#endif
+
+#ifdef __FreeBSD__
+	struct proc *p = curproc;
+
+	PROC_LOCK(p);
+	kern_psignal(p, sig);
+	PROC_UNLOCK(p);
+#endif
+
+#ifdef __NetBSD__
 	struct proc *p = curproc;
 	mutex_enter(proc_lock);
 	psignal(p, sig);
@@ -5755,13 +7264,21 @@ dtrace_action_stop(void)
 	if (dtrace_destructive_disallow)
 		return;
 
-#if defined(sun)
+#ifdef illumos
 	if (!curthread->t_dtrace_stop) {
 		curthread->t_dtrace_stop = 1;
 		curthread->t_sig_check = 1;
 		aston(curthread);
 	}
-#else
+#endif
+
+#ifdef __FreeBSD__
+	struct proc *p = curproc;
+
+	PROC_LOCK(p);
+	kern_psignal(p, SIGSTOP);
+	PROC_UNLOCK(p);
+#endif
+
+#ifdef __NetBSD__
 	struct proc *p = curproc;
 	mutex_enter(proc_lock);
 	psignal(p, SIGSTOP);
@@ -5772,10 +7289,9 @@ dtrace_action_stop(void)
 static void
 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
 {
-#if 0	/* XXX TBD - needs solaris_cpu */
 	hrtime_t now;
 	volatile uint16_t *flags;
-#if defined(sun)
+#ifdef illumos
 	cpu_t *cpu = CPU;
 #else
 	cpu_t *cpu = &solaris_cpu[curcpu_id];
@@ -5817,7 +7333,6 @@ dtrace_action_chill(dtrace_mstate_t *mst
 	 */
 	mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
 	cpu->cpu_dtrace_chilled += val;
-#endif
 }
 
 static void
@@ -5829,6 +7344,7 @@ dtrace_action_ustack(dtrace_mstate_t *ms
 	uint64_t *pcs = &buf[1], *fps;
 	char *str = (char *)&pcs[nframes];
 	int size, offs = 0, i, j;
+	size_t rem;
 	uintptr_t old = mstate->dtms_scratch_ptr, saved;
 	uint16_t *flags = &cpu_core[curcpu_id].cpuc_dtrace_flags;
 	char *sym;
@@ -5900,12 +7416,18 @@ dtrace_action_ustack(dtrace_mstate_t *ms
 			continue;
 		}
 
+		if (!dtrace_strcanload((uintptr_t)sym, strsize, &rem, mstate,
+		    &(state->dts_vstate))) {
+			str[offs++] = '\0';
+			continue;
+		}
+
 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 
 		/*
 		 * Now copy in the string that the helper returned to us.
 		 */
-		for (j = 0; offs + j < strsize; j++) {
+		for (j = 0; offs + j < strsize && j < rem; j++) {
 			if ((str[offs + j] = sym[j]) == '\0')
 				break;
 		}
@@ -5933,6 +7455,63 @@ out:
 	mstate->dtms_scratch_ptr = old;
 }
 
+static void
+dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
+    size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
+{
+	volatile uint16_t *flags;
+	uint64_t val = *valp;
+	size_t valoffs = *valoffsp;
+
+	flags = (volatile uint16_t *)&cpu_core[curcpu_id].cpuc_dtrace_flags;
+	ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
+
+	/*
+	 * If this is a string, we're going to only load until we find the zero
+	 * byte -- after which we'll store zero bytes.
+	 */
+	if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
+		char c = '\0' + 1;
+		size_t s;
+
+		for (s = 0; s < size; s++) {
+			if (c != '\0' && dtkind == DIF_TF_BYREF) {
+				c = dtrace_load8(val++);
+			} else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
+				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+				c = dtrace_fuword8((void *)(uintptr_t)val++);
+				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+				if (*flags & CPU_DTRACE_FAULT)
+					break;
+			}
+
+			DTRACE_STORE(uint8_t, tomax, valoffs++, c);
+
+			if (c == '\0' && intuple)
+				break;
+		}
+	} else {
+		uint8_t c;
+		while (valoffs < end) {
+			if (dtkind == DIF_TF_BYREF) {
+				c = dtrace_load8(val++);
+			} else if (dtkind == DIF_TF_BYUREF) {
+				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+				c = dtrace_fuword8((void *)(uintptr_t)val++);
+				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+				if (*flags & CPU_DTRACE_FAULT)
+					break;
+			}
+
+			DTRACE_STORE(uint8_t, tomax,
+			    valoffs++, c);
+		}
+	}
+
+	*valp = val;
+	*valoffsp = valoffs;
+}
+
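/*
 * Note (illustrative): in the string case above, once the terminating NUL
 * has been seen the loop keeps storing zero bytes out to the full record
 * size, so by-ref string records are deterministically zero-padded; inside
 * a tuple the copy stops at the NUL instead.
 */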
 /*
  * If you're looking for the epicenter of DTrace, you just found it.  This
  * is the function called by the provider to fire a probe -- from which all
@@ -5954,7 +7533,10 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 	volatile uint16_t *flags;
 	hrtime_t now;
 
-#if defined(sun)
+	if (panicstr != NULL)
+		return;
+
+#ifdef illumos
 	/*
 	 * Kick out immediately if this CPU is still being born (in which case
 	 * curthread will be set to -1) or the current thread can't allow
@@ -5979,7 +7561,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 		return;
 	}
 
-#if defined(sun)
+#ifdef illumos
 	if (panic_quiesce) {
 #else
 	if (panicstr != NULL) {
@@ -5991,7 +7573,8 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 		return;
 	}
 
-	now = dtrace_gethrtime();
+	now = mstate.dtms_timestamp = dtrace_gethrtime();
+	mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
 	vtime = dtrace_vtime_references != 0;
 
 	if (vtime && curthread->t_dtrace_start)
@@ -6015,6 +7598,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 		dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
 		dtrace_vstate_t *vstate = &state->dts_vstate;
 		dtrace_provider_t *prov = probe->dtpr_provider;
+		uint64_t tracememsize = 0;
 		int committed = 0;
 		caddr_t tomax;
 
@@ -6031,6 +7615,8 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 		uint64_t val = 0;
 
 		mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
+		mstate.dtms_getf = NULL;
+
 		*flags &= ~CPU_DTRACE_ERROR;
 
 		if (prov == dtrace_provider) {
@@ -6082,7 +7668,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 			    probe->dtpr_id, probe->dtpr_arg) == 0)
 				continue;
 
-#if defined(sun)
+#ifdef illumos
 			/*
 			 * This is more subtle than it looks. We have to be
 			 * absolutely certain that CRED() isn't going to
@@ -6133,7 +7719,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 		if (now - state->dts_alive > dtrace_deadman_timeout) {
 			/*
 			 * We seem to be dead.  Unless we (a) have kernel
-			 * destructive permissions (b) have expicitly enabled
+			 * destructive permissions (b) have explicitly enabled
 			 * destructive actions and (c) destructive actions have
 			 * not been disabled, we're going to transition into
 			 * the KILLED state, from which no further processing
@@ -6161,8 +7747,18 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 		tomax = buf->dtb_tomax;
 		ASSERT(tomax != NULL);
 
-		if (ecb->dte_size != 0)
-			DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
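+		/*
+		 * Each record now begins with a dtrace_rechdr_t carrying
+		 * both the EPID and the probe-firing timestamp.
+		 */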
+		if (ecb->dte_size != 0) {
+			dtrace_rechdr_t dtrh;
+			if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
+				mstate.dtms_timestamp = dtrace_gethrtime();
+				mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
+			}
+			ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
+			dtrh.dtrh_epid = ecb->dte_epid;
+			DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
+			    mstate.dtms_timestamp);
+			*((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
+		}
 
 		mstate.dtms_epid = ecb->dte_epid;
 		mstate.dtms_present |= DTRACE_MSTATE_EPID;
@@ -6174,7 +7770,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 
 		if (pred != NULL) {
 			dtrace_difo_t *dp = pred->dtp_difo;
-			int rval;
+			uint64_t rval;
 
 			rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
 
@@ -6309,7 +7905,9 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 				continue;
 
 			switch (act->dta_kind) {
-			case DTRACEACT_SPECULATE:
+			case DTRACEACT_SPECULATE: {
+				dtrace_rechdr_t *dtrh;
+
 				ASSERT(buf == &state->dts_buffer[cpuid]);
 				buf = dtrace_speculation_buffer(state,
 				    cpuid, val);
@@ -6331,10 +7929,23 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 				tomax = buf->dtb_tomax;
 				ASSERT(tomax != NULL);
 
-				if (ecb->dte_size != 0)
-					DTRACE_STORE(uint32_t, tomax, offs,
-					    ecb->dte_epid);
+				if (ecb->dte_size == 0)
+					continue;
+
+				ASSERT3U(ecb->dte_size, >=,
+				    sizeof (dtrace_rechdr_t));
+				dtrh = ((void *)(tomax + offs));
+				dtrh->dtrh_epid = ecb->dte_epid;
+				/*
+				 * When the speculation is committed, all of
+				 * the records in the speculative buffer will
+				 * have their timestamps set to the commit
+				 * time.  Until then, it is set to a sentinel
+				 * value, for debuggability.
+				 */
+				DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
 				continue;
+			}
 
 			case DTRACEACT_PRINTM: {
 				/* The DIF returns a 'memref'. */
@@ -6353,83 +7964,23 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 					continue;
 				}
 
-				/* Store the size in the buffer first. */
-				DTRACE_STORE(uintptr_t, tomax,
-				    valoffs, size);
-
-				/*
-				 * Offset the buffer address to the start
-				 * of the data.
-				 */
-				valoffs += sizeof(uintptr_t);
-
-				/*
-				 * Reset to the memory address rather than
-				 * the memref array, then let the BYREF
-				 * code below do the work to store the 
-				 * memory data in the buffer.
-				 */
-				val = memref[0];
-				break;
-			}
-
-			case DTRACEACT_PRINTT: {
-				/* The DIF returns a 'typeref'. */
-				uintptr_t *typeref = (uintptr_t *)(uintptr_t) val;
-				char c = '\0' + 1;
-				size_t s;
-
-				/*
-				 * Get the type string length and round it
-				 * up so that the data that follows is
-				 * aligned for easy access.
-				 */
-				size_t typs = strlen((char *) typeref[2]) + 1;
-				typs = roundup(typs,  sizeof(uintptr_t));
-
-				/*
-				 *Get the size from the typeref using the
-				 * number of elements and the type size.
-				 */
-				size = typeref[1] * typeref[3];
-
-				/*
-				 * Check if the size exceeds the allocated
-				 * buffer size.
-				 */
-				if (size + typs + 2 * sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
-					/* Flag a drop! */
-					*flags |= CPU_DTRACE_DROP;
-				
-				}
-
-				/* Store the size in the buffer first. */
-				DTRACE_STORE(uintptr_t, tomax,
-				    valoffs, size);
-				valoffs += sizeof(uintptr_t);
-
-				/* Store the type size in the buffer. */
-				DTRACE_STORE(uintptr_t, tomax,
-				    valoffs, typeref[3]);
-				valoffs += sizeof(uintptr_t);
-
-				val = typeref[2];
-
-				for (s = 0; s < typs; s++) {
-					if (c != '\0')
-						c = dtrace_load8(val++);
+				/* Store the size in the buffer first. */
+				DTRACE_STORE(uintptr_t, tomax,
+				    valoffs, size);
 
-					DTRACE_STORE(uint8_t, tomax,
-					    valoffs++, c);
-				}
+				/*
+				 * Offset the buffer address to the start
+				 * of the data.
+				 */
+				valoffs += sizeof(uintptr_t);
 
 				/*
 				 * Reset to the memory address rather than
-				 * the typeref array, then let the BYREF
+				 * the memref array, then let the BYREF
 				 * code below do the work to store the 
 				 * memory data in the buffer.
 				 */
-				val = typeref[0];
+				val = memref[0];
 				break;
 			}
 
@@ -6466,6 +8017,11 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 			case DTRACEACT_PRINTA:
 			case DTRACEACT_SYSTEM:
 			case DTRACEACT_FREOPEN:
+			case DTRACEACT_TRACEMEM:
+				break;
+
+			case DTRACEACT_TRACEMEM_DYNSIZE:
+				tracememsize = val;
 				break;
 
 			case DTRACEACT_SYM:
@@ -6477,14 +8033,15 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 			case DTRACEACT_USYM:
 			case DTRACEACT_UMOD:
 			case DTRACEACT_UADDR: {
-#if defined(sun)
+#ifdef illumos
 				struct pid *pid = curthread->t_procp->p_pidp;
 #endif
+
 				if (!dtrace_priv_proc(state))
 					continue;
 
 				DTRACE_STORE(uint64_t, tomax,
-#if defined(sun)
+#ifdef illumos
 				    valoffs, (uint64_t)pid->pid_id);
 #else
 				    valoffs, (uint64_t) curproc->p_pid);
@@ -6533,43 +8090,25 @@ dtrace_probe(dtrace_id_t id, uintptr_t a
 				ASSERT(0);
 			}
 
-			if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
+			if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||
+			    dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {
 				uintptr_t end = valoffs + size;
 
-				if (!dtrace_vcanload((void *)(uintptr_t)val,
-				    &dp->dtdo_rtype, &mstate, vstate))
-					continue;
-
-				/*
-				 * If this is a string, we're going to only
-				 * load until we find the zero byte -- after
-				 * which we'll store zero bytes.
-				 */
-				if (dp->dtdo_rtype.dtdt_kind ==
-				    DIF_TYPE_STRING) {
-					char c = '\0' + 1;
-					int intuple = act->dta_intuple;
-					size_t s;
-
-					for (s = 0; s < size; s++) {
-						if (c != '\0')
-							c = dtrace_load8(val++);
-
-						DTRACE_STORE(uint8_t, tomax,
-						    valoffs++, c);
-
-						if (c == '\0' && intuple)
-							break;
-					}
-
-					continue;
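+				/*
+				 * A preceding DTRACEACT_TRACEMEM_DYNSIZE
+				 * action may have captured a dynamic length;
+				 * if so, clamp the record to that length.
+				 */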
+				if (tracememsize != 0 &&
+				    valoffs + tracememsize < end) {
+					end = valoffs + tracememsize;
+					tracememsize = 0;
 				}
 
-				while (valoffs < end) {
-					DTRACE_STORE(uint8_t, tomax, valoffs++,
-					    dtrace_load8(val++));
-				}
+				if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
+				    !dtrace_vcanload((void *)(uintptr_t)val,
+				    &dp->dtdo_rtype, NULL, &mstate, vstate))
+					continue;
 
+				dtrace_store_by_ref(dp, tomax, size, &valoffs,
+				    &val, end, act->dta_intuple,
+				    dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
+				    DIF_TF_BYREF: DIF_TF_BYUREF);
 				continue;
 			}
 
@@ -6927,15 +8466,11 @@ dtrace_cred2priv(cred_t *cr, uint32_t *p
 {
 	uint32_t priv;
 
-#if defined(sun)
+#ifdef illumos
 	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
 		/*
-		 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter,
-		 * but for GCC they do.
+		 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
 		 */
-		*uidp = 0;
-		*zoneidp = 0;
-
 		priv = DTRACE_PRIV_ALL;
 	} else {
 		*uidp = crgetuid(cr);
@@ -7447,18 +8982,13 @@ dtrace_register(const char *name, const 
 	if (pops->dtps_provide == NULL) {
 		ASSERT(pops->dtps_provide_module != NULL);
 		provider->dtpv_pops.dtps_provide =
-		    (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
+		    (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
 	}
 
 	if (pops->dtps_provide_module == NULL) {
 		ASSERT(pops->dtps_provide != NULL);
-#if defined(sun)
 		provider->dtpv_pops.dtps_provide_module =
 		    (void (*)(void *, modctl_t *))dtrace_nullop;
-#else
-		provider->dtpv_pops.dtps_provide_module =
-		    (void (*)(void *, dtrace_modctl_t *))dtrace_nullop;
-#endif
 	}
 
 	if (pops->dtps_suspend == NULL) {
@@ -7530,17 +9060,17 @@ dtrace_unregister(dtrace_provider_id_t i
 {
 	dtrace_provider_t *old = (dtrace_provider_t *)id;
 	dtrace_provider_t *prev = NULL;
-	int i, self = 0;
+	int i, self = 0, noreap = 0;
 	dtrace_probe_t *probe, *first = NULL;
 
 	if (old->dtpv_pops.dtps_enable ==
-	    (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
+	    (int (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
 		/*
 		 * If DTrace itself is the provider, we're called with locks
 		 * already held.
 		 */
 		ASSERT(old == dtrace_provider);
-#if defined(sun)
+#ifdef illumos
 		ASSERT(dtrace_devi != NULL);
 #endif
 		ASSERT(MUTEX_HELD(&dtrace_provider_lock));
@@ -7555,7 +9085,9 @@ dtrace_unregister(dtrace_provider_id_t i
 		}
 	} else {
 		mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
 		mutex_enter(&mod_lock);
+#endif
 		mutex_enter(&dtrace_lock);
 	}
 
@@ -7569,7 +9101,9 @@ dtrace_unregister(dtrace_provider_id_t i
 	    dtrace_anon.dta_state->dts_necbs > 0))) {
 		if (!self) {
 			mutex_exit(&dtrace_lock);
+#ifdef illumos
 			mutex_exit(&mod_lock);
+#endif
 			mutex_exit(&dtrace_provider_lock);
 		}
 		return (EBUSY);
@@ -7589,14 +9123,36 @@ dtrace_unregister(dtrace_provider_id_t i
 			continue;
 
 		/*
+		 * If we are trying to unregister a defunct provider, and the
+		 * provider was made defunct within the interval dictated by
+		 * dtrace_unregister_defunct_reap, we'll (asynchronously)
+		 * attempt to reap our enablings.  To denote that the provider
+		 * should reattempt to unregister itself at some point in the
+		 * future, we will return a differentiable error code (EAGAIN
+		 * instead of EBUSY) in this case.
+		 */
+		if (dtrace_gethrtime() - old->dtpv_defunct >
+		    dtrace_unregister_defunct_reap)
+			noreap = 1;
+
+		/*
 		 * We have at least one ECB; we can't remove this provider.
 		 */
 		if (!self) {
 			mutex_exit(&dtrace_lock);
+#ifdef illumos
 			mutex_exit(&mod_lock);
+#endif
 			mutex_exit(&dtrace_provider_lock);
 		}
-		return (EBUSY);
+
+		if (noreap)
+			return (EBUSY);
+
+		(void) taskq_dispatch(dtrace_taskq,
+		    (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
+
+		return (EAGAIN);
 	}
 
 	/*
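The EAGAIN path above gives a provider a way to tell "busy, but a reap of the
defunct enablings has been dispatched" apart from a plain EBUSY.  A minimal
caller-side sketch of the retry protocol, assuming an illumos-style delay()
and a hypothetical provider detach routine (neither is part of this change):

	/*
	 * Illustrative only: poll dtrace_unregister() while the framework
	 * reaps ECBs attached to this provider's defunct probes.
	 */
	static int
	example_provider_detach(dtrace_provider_id_t id)
	{
		int rv;

		while ((rv = dtrace_unregister(id)) == EAGAIN)
			delay(hz);	/* a reap was dispatched; back off */

		return (rv);	/* 0 on success, EBUSY otherwise */
	}

A provider that sees EBUSY instead should conclude that live enablings still
reference its probes and that no reap will help.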
@@ -7640,16 +9196,20 @@ dtrace_unregister(dtrace_provider_id_t i
 		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
 		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
 		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
-#if defined(sun)
+#ifdef illumos
 		vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
-#else
+#endif
+#ifdef __FreeBSD__
+		free_unr(dtrace_arena, probe->dtpr_id);
+#endif
+#ifdef __NetBSD__
 		vmem_free(dtrace_arena, (uintptr_t)(probe->dtpr_id), 1);
 #endif
 		kmem_free(probe, sizeof (dtrace_probe_t));
 	}
 
 	if ((prev = dtrace_provider) == old) {
-#if defined(sun)
+#ifdef illumos
 		ASSERT(self || dtrace_devi == NULL);
 		ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
 #endif
@@ -7668,7 +9228,9 @@ dtrace_unregister(dtrace_provider_id_t i
 
 	if (!self) {
 		mutex_exit(&dtrace_lock);
+#ifdef illumos
 		mutex_exit(&mod_lock);
+#endif
 		mutex_exit(&dtrace_provider_lock);
 	}
 
@@ -7688,12 +9250,12 @@ dtrace_invalidate(dtrace_provider_id_t i
 	dtrace_provider_t *pvp = (dtrace_provider_t *)id;
 
 	ASSERT(pvp->dtpv_pops.dtps_enable !=
-	    (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
+	    (int (*)(void *, dtrace_id_t, void *))dtrace_nullop);
 
 	mutex_enter(&dtrace_provider_lock);
 	mutex_enter(&dtrace_lock);
 
-	pvp->dtpv_defunct = 1;
+	pvp->dtpv_defunct = dtrace_gethrtime();
 
 	mutex_exit(&dtrace_lock);
 	mutex_exit(&dtrace_provider_lock);
@@ -7729,7 +9291,7 @@ dtrace_condense(dtrace_provider_id_t id)
 	 * Make sure this isn't the dtrace provider itself.
 	 */
 	ASSERT(prov->dtpv_pops.dtps_enable !=
-	    (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
+	    (int (*)(void *, dtrace_id_t, void *))dtrace_nullop);
 
 	mutex_enter(&dtrace_provider_lock);
 	mutex_enter(&dtrace_lock);
@@ -7759,9 +9321,13 @@ dtrace_condense(dtrace_provider_id_t id)
 		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
 		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
 		kmem_free(probe, sizeof (dtrace_probe_t));
-#if defined(sun)
+#ifdef illumos
 		vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
-#else
+#endif
+#ifdef __FreeBSD__
+		free_unr(dtrace_arena, i + 1);
+#endif
+#ifdef __NetBSD__
 		vmem_free(dtrace_arena, ((uintptr_t)i + 1), 1);
 #endif
 	}
@@ -7792,7 +9358,6 @@ dtrace_probe_create(dtrace_provider_id_t
 	dtrace_probe_t *probe, **probes;
 	dtrace_provider_t *provider = (dtrace_provider_t *)prov;
 	dtrace_id_t id;
-	vmem_addr_t offset;
 
 	if (provider == dtrace_provider) {
 		ASSERT(MUTEX_HELD(&dtrace_lock));
@@ -7800,9 +9365,19 @@ dtrace_probe_create(dtrace_provider_id_t
 		mutex_enter(&dtrace_lock);
 	}
 
+#ifdef illumos
+	id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
+	    VM_BESTFIT | VM_SLEEP);
+#endif
+#ifdef __FreeBSD__
+	id = alloc_unr(dtrace_arena);
+#endif
+#ifdef __NetBSD__
+	vmem_addr_t offset;
 	if (vmem_alloc(dtrace_arena, 1, VM_BESTFIT | VM_SLEEP, &offset) != 0)
 		ASSERT(0);
 	id = (dtrace_id_t)(uintptr_t)offset;
+#endif
 	probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
 
 	probe->dtpr_id = id;
@@ -7829,7 +9404,6 @@ dtrace_probe_create(dtrace_provider_id_t
 		}
 
 		probes = kmem_zalloc(nsize, KM_SLEEP);
-		dtrace_probes_size = nsize;
 
 		if (dtrace_probes == NULL) {
 			ASSERT(osize == 0);
@@ -7888,8 +9462,8 @@ dtrace_probe_lookup_match(dtrace_probe_t
  * name and probe name.
  */
 dtrace_id_t
-dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
-    const char *func, const char *name)
+dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
+    char *func, char *name)
 {
 	dtrace_probekey_t pkey;
 	dtrace_id_t id;
@@ -7897,11 +9471,11 @@ dtrace_probe_lookup(dtrace_provider_id_t
 
 	pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
 	pkey.dtpk_pmatch = &dtrace_match_string;
-	pkey.dtpk_mod = __UNCONST(mod);
+	pkey.dtpk_mod = mod;
 	pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
-	pkey.dtpk_func = __UNCONST(func);
+	pkey.dtpk_func = func;
 	pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
-	pkey.dtpk_name = __UNCONST(name);
+	pkey.dtpk_name = name;
 	pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
 	pkey.dtpk_id = DTRACE_IDNONE;
 
@@ -7951,21 +9525,6 @@ dtrace_probe_description(const dtrace_pr
 	(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
 }
 
-#ifdef notyet	/* XXX TBD */
-#if !defined(sun)
-static int
-dtrace_probe_provide_cb(linker_file_t lf, void *arg)
-{
-	dtrace_provider_t *prv = (dtrace_provider_t *) arg;
-
-	prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, lf);
-
-	return(0);
-}
-#endif
-#endif /* notyet */
-
-
 /*
  * Called to indicate that a probe -- or probes -- should be provided by a
 * specified provider.  If the specified description is NULL, the provider will
@@ -7984,9 +9543,10 @@ dtrace_probe_provide_cb(linker_file_t lf
 static void
 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
 {
-#if defined(sun)
+#ifdef illumos
 	modctl_t *ctl;
-#else
+#endif
+#ifdef __NetBSD__
 	module_t *mod;
 #endif
 	int all = 0;
@@ -8004,6 +9564,7 @@ dtrace_probe_provide(dtrace_probedesc_t 
 		 */
 		prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
 
+#ifdef illumos
 		/*
 		 * Now call the per-module provide operation.  We will grab
 		 * mod_lock to prevent the list from being modified.  Note
@@ -8012,7 +9573,6 @@ dtrace_probe_provide(dtrace_probedesc_t 
 		 */
 		mutex_enter(&mod_lock);
 
-#if defined(sun)
 		ctl = &modules;
 		do {
 			if (ctl->mod_busy || ctl->mod_mp == NULL)
@@ -8021,29 +9581,25 @@ dtrace_probe_provide(dtrace_probedesc_t 
 			prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
 
 		} while ((ctl = ctl->mod_next) != &modules);
-#else
+
+		mutex_exit(&mod_lock);
+#endif
+#ifdef __NetBSD__
+		kernconfig_lock();
 
 		/* Provide the kernel itself first */
-		if (mod_nbsd == NULL) {
-		    mod_nbsd = kmem_zalloc(sizeof(*mod_nbsd), KM_SLEEP);
-		    mod_nbsd->mod_info = kmem_zalloc(sizeof(modinfo_t), KM_SLEEP);
-		    mod_nbsd->mod_refcnt = 1;
-		    *((char **)(intptr_t)&mod_nbsd->mod_info->mi_name) = __UNCONST("netbsd");
-		}
+		prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, module_kernel());
 
-		kernconfig_lock();
-		prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, mod_nbsd);
 		TAILQ_FOREACH(mod, &module_list, mod_chain) {
-			prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, mod);
+			if (module_source(mod) != MODULE_SOURCE_KERNEL)
+				prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, mod);
 		}
 		kernconfig_unlock();
 #endif
-
-		mutex_exit(&mod_lock);
 	} while (all && (prv = prv->dtpv_next) != NULL);
 }
 
-#if defined(sun)
+#ifdef illumos
 /*
  * Iterate over each probe, and call the Framework-to-Provider API function
  * denoted by offs.
@@ -8204,6 +9760,10 @@ dtrace_helper_provide_one(dof_helper_t *
 		probe = (dof_probe_t *)(uintptr_t)(daddr +
 		    prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
 
+		/* See the check in dtrace_helper_provider_validate(). */
+		if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN)
+			continue;
+
 		dhpb.dthpb_mod = dhp->dofhp_mod;
 		dhpb.dthpb_func = strtab + probe->dofpr_func;
 		dhpb.dthpb_name = strtab + probe->dofpr_name;
@@ -8256,7 +9816,6 @@ dtrace_helper_provide(dof_helper_t *dhp,
 	dtrace_enabling_matchall();
 }
 
-#if defined(sun)
 static void
 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
 {
@@ -8304,7 +9863,6 @@ dtrace_helper_provider_remove(dof_helper
 		dtrace_helper_provider_remove_one(dhp, sec, pid);
 	}
 }
-#endif
 
 /*
  * DTrace Meta Provider-to-Framework API Functions
@@ -8464,6 +10022,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, 
 	int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
 	int kcheckload;
 	uint_t pc;
+	int maxglobal = -1, maxlocal = -1, maxtlocal = -1;
 
 	kcheckload = cr == NULL ||
 	    (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
@@ -8695,6 +10254,19 @@ dtrace_difo_validate(dtrace_difo_t *dp, 
 			    subr == DIF_SUBR_COPYOUTSTR) {
 				dp->dtdo_destructive = 1;
 			}
+			if (subr == DIF_SUBR_GETF) {
+				/*
+				 * If we have a getf() we need to record that
+				 * in our state.  Note that our state can be
+				 * NULL if this is a helper -- but in that
+				 * case, the call to getf() is itself illegal,
+				 * and will be caught (slightly later) when
+				 * the helper is validated.
+				 */
+				if (vstate->dtvs_state != NULL)
+					vstate->dtvs_state->dts_getf++;
+			}
+
 			break;
 		case DIF_OP_PUSHTR:
 			if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
@@ -8724,7 +10296,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, 
 		    "expected 'ret' as last DIF instruction\n");
 	}
 
-	if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
+	if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
 		/*
 		 * If we're not returning by reference, the size must be either
 		 * 0 or the size of one of the base types.
@@ -8779,6 +10351,9 @@ dtrace_difo_validate(dtrace_difo_t *dp, 
 
 		switch (v->dtdv_scope) {
 		case DIFV_SCOPE_GLOBAL:
+			if (maxglobal == -1 || ndx > maxglobal)
+				maxglobal = ndx;
+
 			if (ndx < vstate->dtvs_nglobals) {
 				dtrace_statvar_t *svar;
 
@@ -8789,11 +10364,17 @@ dtrace_difo_validate(dtrace_difo_t *dp, 
 			break;
 
 		case DIFV_SCOPE_THREAD:
+			if (maxtlocal == -1 || ndx > maxtlocal)
+				maxtlocal = ndx;
+
 			if (ndx < vstate->dtvs_ntlocals)
 				existing = &vstate->dtvs_tlocals[ndx];
 			break;
 
 		case DIFV_SCOPE_LOCAL:
+			if (maxlocal == -1 || ndx > maxlocal)
+				maxlocal = ndx;
+
 			if (ndx < vstate->dtvs_nlocals) {
 				dtrace_statvar_t *svar;
 
@@ -8812,9 +10393,10 @@ dtrace_difo_validate(dtrace_difo_t *dp, 
 				break;
 			}
 
-			if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
-			    vt->dtdt_size > dtrace_global_maxsize) {
-				err += efunc(i, "oversized by-ref global\n");
+			if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
+			    v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
+			    vt->dtdt_size > dtrace_statvar_maxsize) {
+				err += efunc(i, "oversized by-ref static\n");
 				break;
 			}
 		}
@@ -8841,10 +10423,40 @@ dtrace_difo_validate(dtrace_difo_t *dp, 
 		}
 	}
 
+	for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
+		dif_instr_t instr = dp->dtdo_buf[pc];
+
+		uint_t v = DIF_INSTR_VAR(instr);
+		uint_t op = DIF_INSTR_OP(instr);
+
+		switch (op) {
+		case DIF_OP_LDGS:
+		case DIF_OP_LDGAA:
+		case DIF_OP_STGS:
+		case DIF_OP_STGAA:
+			if (v > DIF_VAR_OTHER_UBASE + maxglobal)
+				err += efunc(pc, "invalid variable %u\n", v);
+			break;
+		case DIF_OP_LDTS:
+		case DIF_OP_LDTAA:
+		case DIF_OP_STTS:
+		case DIF_OP_STTAA:
+			if (v > DIF_VAR_OTHER_UBASE + maxtlocal)
+				err += efunc(pc, "invalid variable %u\n", v);
+			break;
+		case DIF_OP_LDLS:
+		case DIF_OP_STLS:
+			if (v > DIF_VAR_OTHER_UBASE + maxlocal)
+				err += efunc(pc, "invalid variable %u\n", v);
+			break;
+		default:
+			break;
+		}
+	}
+
 	return (err);
 }
 
-#if defined(sun)
 /*
 * Validate a DTrace DIF object that is to be used as a helper.  Helpers
  * are much more constrained than normal DIFOs.  Specifically, they may
@@ -8975,7 +10587,9 @@ dtrace_difo_validate_helper(dtrace_difo_
 			    subr == DIF_SUBR_INET_NTOA ||
 			    subr == DIF_SUBR_INET_NTOA6 ||
 			    subr == DIF_SUBR_INET_NTOP ||
+			    subr == DIF_SUBR_JSON ||
 			    subr == DIF_SUBR_LLTOSTR ||
+			    subr == DIF_SUBR_STRTOLL ||
 			    subr == DIF_SUBR_RINDEX ||
 			    subr == DIF_SUBR_STRCHR ||
 			    subr == DIF_SUBR_STRJOIN ||
@@ -8987,9 +10601,13 @@ dtrace_difo_validate_helper(dtrace_difo_
 			    subr == DIF_SUBR_NTOHS ||
 			    subr == DIF_SUBR_NTOHL ||
 			    subr == DIF_SUBR_NTOHLL ||
-			    subr == DIF_SUBR_MEMREF ||
-			    subr == DIF_SUBR_TYPEREF)
+			    subr == DIF_SUBR_MEMREF)
+				break;
+
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+			if (subr == DIF_SUBR_MEMSTR)
 				break;
+#endif
 
 			err += efunc(pc, "invalid subr %u\n", subr);
 			break;
@@ -9002,7 +10620,6 @@ dtrace_difo_validate_helper(dtrace_difo_
 
 	return (err);
 }
-#endif
 
 /*
  * Returns 1 if the expression in the DIF object can be cached on a per-thread
@@ -9155,6 +10772,9 @@ dtrace_difo_chunksize(dtrace_difo_t *dp,
 				if (srd == 0)
 					return;
 
+				if (sval > LONG_MAX)
+					return;
+
 				tupregs[ttop++].dttk_size = sval;
 			}
 
@@ -9216,6 +10836,19 @@ dtrace_difo_chunksize(dtrace_difo_t *dp,
 		 */
 		size = P2ROUNDUP(size, sizeof (uint64_t));
 
+		/*
+		 * Before setting the chunk size, check that we're not going
+		 * to set it to a negative value...
+		 */
+		if (size > LONG_MAX)
+			return;
+
+		/*
+		 * ...and make certain that we didn't badly overflow.
+		 */
+		if (size < ksize || size < sizeof (dtrace_dynvar_t))
+			return;
+
 		if (size > vstate->dtvs_dynvars.dtds_chunksize)
 			vstate->dtvs_dynvars.dtds_chunksize = size;
 	}
@@ -9334,7 +10967,6 @@ dtrace_difo_init(dtrace_difo_t *dp, dtra
 	dtrace_difo_hold(dp);
 }
 
-#if defined(sun)
 static dtrace_difo_t *
 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
 {
@@ -9378,7 +11010,6 @@ dtrace_difo_duplicate(dtrace_difo_t *dp,
 	dtrace_difo_init(new, vstate);
 	return (new);
 }
-#endif
 
 static void
 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
@@ -9626,7 +11257,7 @@ dtrace_actdesc_create(dtrace_actkind_t k
 {
 	dtrace_actdesc_t *act;
 
-#if defined(sun)
+#ifdef illumos
 	ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
 	    arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
 #endif
@@ -9665,7 +11296,7 @@ dtrace_actdesc_release(dtrace_actdesc_t 
 	if (DTRACEACT_ISPRINTFLIKE(kind)) {
 		char *str = (char *)(uintptr_t)act->dtad_arg;
 
-#if defined(sun)
+#ifdef illumos
 		ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
 		    (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
 #endif
@@ -9694,9 +11325,9 @@ dtrace_ecb_add(dtrace_state_t *state, dt
 
 	/*
 	 * The default size is the size of the default action: recording
-	 * the epid.
+	 * the header.
 	 */
-	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
+	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
 	ecb->dte_alignment = sizeof (dtrace_epid_t);
 
 	epid = state->dts_epid++;
@@ -9792,125 +11423,99 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb)
 	}
 }
 
-static void
+static int
 dtrace_ecb_resize(dtrace_ecb_t *ecb)
 {
-	uint32_t maxalign = sizeof (dtrace_epid_t);
-	uint32_t align = sizeof (uint8_t), offs, diff;
 	dtrace_action_t *act;
-	int wastuple = 0;
+	uint32_t curneeded = UINT32_MAX;
 	uint32_t aggbase = UINT32_MAX;
-	dtrace_state_t *state = ecb->dte_state;
 
 	/*
-	 * If we record anything, we always record the epid.  (And we always
-	 * record it first.)
+	 * If we record anything, we always record the dtrace_rechdr_t.  (And
+	 * we always record it first.)
 	 */
-	offs = sizeof (dtrace_epid_t);
-	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
+	ecb->dte_size = sizeof (dtrace_rechdr_t);
+	ecb->dte_alignment = sizeof (dtrace_epid_t);
 
 	for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
 		dtrace_recdesc_t *rec = &act->dta_rec;
+		ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
 
-		if ((align = rec->dtrd_alignment) > maxalign)
-			maxalign = align;
-
-		if (!wastuple && act->dta_intuple) {
-			/*
-			 * This is the first record in a tuple.  Align the
-			 * offset to be at offset 4 in an 8-byte aligned
-			 * block.
-			 */
-			diff = offs + sizeof (dtrace_aggid_t);
-
-			if ((diff = (diff & (sizeof (uint64_t) - 1))))
-				offs += sizeof (uint64_t) - diff;
-
-			aggbase = offs - sizeof (dtrace_aggid_t);
-			ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
-		}
-
-		/*LINTED*/
-		if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
-			/*
-			 * The current offset is not properly aligned; align it.
-			 */
-			offs += align - diff;
-		}
-
-		rec->dtrd_offset = offs;
-
-		if (offs + rec->dtrd_size > ecb->dte_needed) {
-			ecb->dte_needed = offs + rec->dtrd_size;
-
-			if (ecb->dte_needed > state->dts_needed)
-				state->dts_needed = ecb->dte_needed;
-		}
+		ecb->dte_alignment = MAX(ecb->dte_alignment,
+		    rec->dtrd_alignment);
 
 		if (DTRACEACT_ISAGG(act->dta_kind)) {
 			dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
-			dtrace_action_t *first = agg->dtag_first, *prev;
 
-			ASSERT(rec->dtrd_size != 0 && first != NULL);
-			ASSERT(wastuple);
+			ASSERT(rec->dtrd_size != 0);
+			ASSERT(agg->dtag_first != NULL);
+			ASSERT(act->dta_prev->dta_intuple);
 			ASSERT(aggbase != UINT32_MAX);
+			ASSERT(curneeded != UINT32_MAX);
 
 			agg->dtag_base = aggbase;
 
-			while ((prev = first->dta_prev) != NULL &&
-			    DTRACEACT_ISAGG(prev->dta_kind)) {
-				agg = (dtrace_aggregation_t *)prev;
-				first = agg->dtag_first;
-			}
+			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+			rec->dtrd_offset = curneeded;
+			if (curneeded + rec->dtrd_size < curneeded)
+				return (EINVAL);
+			curneeded += rec->dtrd_size;
+			ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
 
-			if (prev != NULL) {
-				offs = prev->dta_rec.dtrd_offset +
-				    prev->dta_rec.dtrd_size;
-			} else {
-				offs = sizeof (dtrace_epid_t);
-			}
-			wastuple = 0;
+			aggbase = UINT32_MAX;
+			curneeded = UINT32_MAX;
+		} else if (act->dta_intuple) {
+			if (curneeded == UINT32_MAX) {
+				/*
+				 * This is the first record in a tuple.  Align
+				 * curneeded to be at offset 4 in an 8-byte
+				 * aligned block.
+				 */
+				ASSERT(act->dta_prev == NULL ||
+				    !act->dta_prev->dta_intuple);
+				ASSERT3U(aggbase, ==, UINT32_MAX);
+				curneeded = P2PHASEUP(ecb->dte_size,
+				    sizeof (uint64_t), sizeof (dtrace_aggid_t));
+
+				aggbase = curneeded - sizeof (dtrace_aggid_t);
+				ASSERT(IS_P2ALIGNED(aggbase,
+				    sizeof (uint64_t)));
+			}
+			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+			rec->dtrd_offset = curneeded;
+			if (curneeded + rec->dtrd_size < curneeded)
+				return (EINVAL);
+			curneeded += rec->dtrd_size;
 		} else {
-			if (!act->dta_intuple)
-				ecb->dte_size = offs + rec->dtrd_size;
-
-			offs += rec->dtrd_size;
+			/* tuples must be followed by an aggregation */
+			ASSERT(act->dta_prev == NULL ||
+			    !act->dta_prev->dta_intuple);
+
+			ecb->dte_size = P2ROUNDUP(ecb->dte_size,
+			    rec->dtrd_alignment);
+			rec->dtrd_offset = ecb->dte_size;
+			if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
+				return (EINVAL);
+			ecb->dte_size += rec->dtrd_size;
+			ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
 		}
-
-		wastuple = act->dta_intuple;
 	}
 
 	if ((act = ecb->dte_action) != NULL &&
 	    !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
-	    ecb->dte_size == sizeof (dtrace_epid_t)) {
+	    ecb->dte_size == sizeof (dtrace_rechdr_t)) {
 		/*
-		 * If the size is still sizeof (dtrace_epid_t), then all
+		 * If the size is still sizeof (dtrace_rechdr_t), then all
 		 * actions store no data; set the size to 0.
 		 */
-		ecb->dte_alignment = maxalign;
 		ecb->dte_size = 0;
-
-		/*
-		 * If the needed space is still sizeof (dtrace_epid_t), then
-		 * all actions need no additional space; set the needed
-		 * size to 0.
-		 */
-		if (ecb->dte_needed == sizeof (dtrace_epid_t))
-			ecb->dte_needed = 0;
-
-		return;
 	}
 
-	/*
-	 * Set our alignment, and make sure that the dte_size and dte_needed
-	 * are aligned to the size of an EPID.
-	 */
-	ecb->dte_alignment = maxalign;
-	ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
-	    ~(sizeof (dtrace_epid_t) - 1);
-	ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
-	    ~(sizeof (dtrace_epid_t) - 1);
-	ASSERT(ecb->dte_size <= ecb->dte_needed);
+	ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
+	ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
+	ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
+	    ecb->dte_needed);
+	return (0);
 }
 
 static dtrace_action_t *
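dtrace_ecb_resize() above now computes tuple offsets with
P2PHASEUP(x, align, phase), which rounds x up to the next value congruent to
phase modulo align.  A standalone sketch of the arithmetic, with the macro
transcribed from the illumos <sys/sysmacros.h> definition (the test harness
itself is hypothetical):

	#include <assert.h>
	#include <stdint.h>

	/* Round x up to the next value equal to phase modulo align. */
	#define P2PHASEUP(x, align, phase) \
		((phase) - (((phase) - (x)) & -(align)))

	int
	main(void)
	{
		/* Values at offset 4 in an 8-byte block: 4, 12, 20, 28, ... */
		assert(P2PHASEUP(20, sizeof (uint64_t), 4) == 20);
		assert(P2PHASEUP(24, sizeof (uint64_t), 4) == 28);

		/* The 4-byte aggregation ID then lands 8-byte aligned. */
		assert((P2PHASEUP(24, sizeof (uint64_t), 4) - 4) % 8 == 0);
		return (0);
	}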
@@ -9923,7 +11528,6 @@ dtrace_ecb_aggregation_create(dtrace_ecb
 	dtrace_recdesc_t *frec;
 	dtrace_aggid_t aggid;
 	dtrace_state_t *state = ecb->dte_state;
-	vmem_addr_t offset;
 
 	agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
 	agg->dtag_ecb = ecb;
@@ -9965,6 +11569,35 @@ dtrace_ecb_aggregation_create(dtrace_ecb
 		break;
 	}
 
+	case DTRACEAGG_LLQUANTIZE: {
+		uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
+		uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
+		uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
+		uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
+		int64_t v;
+
+		agg->dtag_initial = desc->dtad_arg;
+		agg->dtag_aggregate = dtrace_aggregate_llquantize;
+
+		if (factor < 2 || low >= high || nsteps < factor)
+			goto err;
+
+		/*
+		 * Now check that the number of steps evenly divides a power
+		 * of the factor.  (This assures both integer bucket size and
+		 * linearity within each magnitude.)
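+		 *
+		 * For example, factor 10 with nsteps 20 passes: 20 evenly
+		 * divides 100, and 10 evenly divides 20.  nsteps 30 would
+		 * be rejected, since no power of 10 is divisible by 30.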
+		 */
+		for (v = factor; v < nsteps; v *= factor)
+			continue;
+
+		if ((v % nsteps) || (nsteps % factor))
+			goto err;
+
+		size = (dtrace_aggregate_llquantize_bucket(factor,
+		    low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
+		break;
+	}
+
 	case DTRACEAGG_AVG:
 		agg->dtag_aggregate = dtrace_aggregate_avg;
 		size = sizeof (uint64_t) * 2;
@@ -10030,11 +11663,22 @@ success:
 	/*
 	 * We need to allocate an id for this aggregation.
 	 */
+
+#ifdef illumos
+	aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
+	    VM_BESTFIT | VM_SLEEP);
+#endif
+#ifdef __FreeBSD__
+	aggid = alloc_unr(state->dts_aggid_arena);
+#endif
+#ifdef __NetBSD__
+	vmem_addr_t offset;
+
 	if (vmem_alloc(state->dts_aggid_arena, 1, VM_BESTFIT | VM_SLEEP,
 	    &offset) != 0)
 		ASSERT(0);
 	aggid = (dtrace_aggid_t)(uintptr_t)offset;
-
+#endif
 
 	if (aggid - 1 >= state->dts_naggregations) {
 		dtrace_aggregation_t **oaggs = state->dts_aggregations;
@@ -10083,9 +11727,13 @@ dtrace_ecb_aggregation_destroy(dtrace_ec
 	dtrace_aggid_t aggid = agg->dtag_id;
 
 	ASSERT(DTRACEACT_ISAGG(act->dta_kind));
-#if defined(sun)
+#ifdef illumos
 	vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
-#else
+#endif
+#ifdef __FreeBSD__
+	free_unr(state->dts_aggid_arena, aggid);
+#endif
+#ifdef __NetBSD__
 	vmem_free(state->dts_aggid_arena, (uintptr_t)aggid, 1);
 #endif
 
@@ -10141,16 +11789,18 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb,
 		case DTRACEACT_PRINTA:
 		case DTRACEACT_SYSTEM:
 		case DTRACEACT_FREOPEN:
+		case DTRACEACT_DIFEXPR:
 			/*
 			 * We know that our arg is a string -- turn it into a
 			 * format.
 			 */
 			if (arg == 0) {
-				ASSERT(desc->dtad_kind == DTRACEACT_PRINTA);
+				ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
+				    desc->dtad_kind == DTRACEACT_DIFEXPR);
 				format = 0;
 			} else {
 				ASSERT(arg != 0);
-#if defined(sun)
+#ifdef illumos
 				ASSERT(arg > KERNELBASE);
 #endif
 				format = dtrace_format_add(state,
@@ -10159,7 +11809,8 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb,
 
 			/*FALLTHROUGH*/
 		case DTRACEACT_LIBACT:
-		case DTRACEACT_DIFEXPR:
+		case DTRACEACT_TRACEMEM:
+		case DTRACEACT_TRACEMEM_DYNSIZE:
 			if (dp == NULL)
 				return (EINVAL);
 
@@ -10258,7 +11909,7 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb,
 			break;
 
 		case DTRACEACT_SPECULATE:
-			if (ecb->dte_size > sizeof (dtrace_epid_t))
+			if (ecb->dte_size > sizeof (dtrace_rechdr_t))
 				return (EINVAL);
 
 			if (dp == NULL)
@@ -10271,10 +11922,6 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb,
 		    	size = dp->dtdo_rtype.dtdt_size;
 			break;
 
-		case DTRACEACT_PRINTT:
-		    	size = dp->dtdo_rtype.dtdt_size;
-			break;
-
 		case DTRACEACT_COMMIT: {
 			dtrace_action_t *act = ecb->dte_action;
 
@@ -10379,7 +12026,7 @@ dtrace_ecb_action_remove(dtrace_ecb_t *e
 
 	ecb->dte_action = NULL;
 	ecb->dte_action_last = NULL;
-	ecb->dte_size = sizeof (dtrace_epid_t);
+	ecb->dte_size = 0;
 }
 
 static void
@@ -10542,12 +12189,12 @@ dtrace_ecb_create(dtrace_state_t *state,
 		 * of creating our own (saving both time and space).
 		 */
 		dtrace_ecb_t *cached = dtrace_ecb_create_cache;
-		dtrace_action_t *xact = cached->dte_action;
+		dtrace_action_t *act = cached->dte_action;
 
-		if (xact != NULL) {
-			ASSERT(xact->dta_refcnt > 0);
-			xact->dta_refcnt++;
-			ecb->dte_action = xact;
+		if (act != NULL) {
+			ASSERT(act->dta_refcnt > 0);
+			act->dta_refcnt++;
+			ecb->dte_action = act;
 			ecb->dte_action_last = cached->dte_action_last;
 			ecb->dte_needed = cached->dte_needed;
 			ecb->dte_size = cached->dte_size;
@@ -10564,7 +12211,10 @@ dtrace_ecb_create(dtrace_state_t *state,
 		}
 	}
 
-	dtrace_ecb_resize(ecb);
+	if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {
+		dtrace_ecb_destroy(ecb);
+		return (NULL);
+	}
 
 	return (dtrace_ecb_create_cache = ecb);
 }
@@ -10650,11 +12300,13 @@ dtrace_buffer_switch(dtrace_buffer_t *bu
 	caddr_t tomax = buf->dtb_tomax;
 	caddr_t xamot = buf->dtb_xamot;
 	dtrace_icookie_t cookie;
+	hrtime_t now;
 
 	ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
 	ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
 
 	cookie = dtrace_interrupt_disable();
+	now = dtrace_gethrtime();
 	buf->dtb_tomax = xamot;
 	buf->dtb_xamot = tomax;
 	buf->dtb_xamot_drops = buf->dtb_drops;
@@ -10665,6 +12317,8 @@ dtrace_buffer_switch(dtrace_buffer_t *bu
 	buf->dtb_drops = 0;
 	buf->dtb_errors = 0;
 	buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
+	buf->dtb_interval = now - buf->dtb_switched;
+	buf->dtb_switched = now;
 	dtrace_interrupt_enable(cookie);
 }
 
@@ -10695,22 +12349,41 @@ dtrace_buffer_activate(dtrace_state_t *s
 	dtrace_interrupt_enable(cookie);
 }
 
+#ifdef __FreeBSD__
+/*
+ * Activate the specified per-CPU buffer.  This is used instead of
+ * dtrace_buffer_activate() when APs have not yet started, i.e. when
+ * activating anonymous state.
+ */
+static void
+dtrace_buffer_activate_cpu(dtrace_state_t *state, int cpu)
+{
+
+	if (state->dts_buffer[cpu].dtb_tomax != NULL)
+		state->dts_buffer[cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
+}
+#endif
+
 static int
 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
-    processorid_t cpu)
+    processorid_t cpu, int *factor)
 {
-#if defined(sun)
+#ifdef illumos
 	cpu_t *cp;
-#else
+#endif
+#ifdef __NetBSD__
 	CPU_INFO_ITERATOR cpuind;
 	struct cpu_info *cinfo;
 #endif
 	dtrace_buffer_t *buf;
+	int allocated = 0, desired = 0;
 
-#if defined(sun)
+#ifdef illumos
 	ASSERT(MUTEX_HELD(&cpu_lock));
 	ASSERT(MUTEX_HELD(&dtrace_lock));
 
+	*factor = 1;
+
 	if (size > dtrace_nonroot_maxsize &&
 	    !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
 		return (EFBIG);
@@ -10734,7 +12407,8 @@ dtrace_buffer_alloc(dtrace_buffer_t *buf
 
 		ASSERT(buf->dtb_xamot == NULL);
 
-		if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+		if ((buf->dtb_tomax = kmem_zalloc(size,
+		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
 			goto err;
 
 		buf->dtb_size = size;
@@ -10745,7 +12419,8 @@ dtrace_buffer_alloc(dtrace_buffer_t *buf
 		if (flags & DTRACEBUF_NOSWITCH)
 			continue;
 
-		if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+		if ((buf->dtb_xamot = kmem_zalloc(size,
+		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
 			goto err;
 	} while ((cp = cp->cpu_next) != cpu_list);
 
@@ -10759,42 +12434,53 @@ err:
 			continue;
 
 		buf = &bufs[cp->cpu_id];
+		desired += 2;
 
 		if (buf->dtb_xamot != NULL) {
 			ASSERT(buf->dtb_tomax != NULL);
 			ASSERT(buf->dtb_size == size);
 			kmem_free(buf->dtb_xamot, size);
+			allocated++;
 		}
 
 		if (buf->dtb_tomax != NULL) {
 			ASSERT(buf->dtb_size == size);
 			kmem_free(buf->dtb_tomax, size);
+			allocated++;
 		}
 
 		buf->dtb_tomax = NULL;
 		buf->dtb_xamot = NULL;
 		buf->dtb_size = 0;
 	} while ((cp = cp->cpu_next) != cpu_list);
-
-	return (ENOMEM);
 #else
 
-#if defined(__amd64__)
+	*factor = 1;
+#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
+    defined(__mips__) || defined(__powerpc__) || defined(__riscv__)
 	/*
 	 * FreeBSD isn't good at limiting the amount of memory we
 	 * ask to malloc, so let's place a limit here before trying
 	 * to do something that might well end in tears at bedtime.
 	 */
 	if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
-		return(ENOMEM);
+		return (ENOMEM);
 #endif
 
 	ASSERT(MUTEX_HELD(&dtrace_lock));
-	for (CPU_INFO_FOREACH(cpuind, cinfo)) {
-		if (cpu != DTRACE_CPUALL && cpu != cpu_index(cinfo))
+#ifdef __NetBSD__
+	for (CPU_INFO_FOREACH(cpuind, cinfo))
+#else
+	CPU_FOREACH(i)
+#endif
+	{
+#ifdef __NetBSD__
+		int i = cpu_index(cinfo);
+#endif
+		if (cpu != DTRACE_CPUALL && cpu != i)
 			continue;
 
-		buf = &bufs[cpu_index(cinfo)];
+		buf = &bufs[i];
 
 		/*
 		 * If there is already a buffer allocated for this CPU, it
@@ -10808,7 +12494,8 @@ err:
 
 		ASSERT(buf->dtb_xamot == NULL);
 
-		if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+		if ((buf->dtb_tomax = kmem_zalloc(size,
+		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
 			goto err;
 
 		buf->dtb_size = size;
@@ -10819,7 +12506,8 @@ err:
 		if (flags & DTRACEBUF_NOSWITCH)
 			continue;
 
-		if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+		if ((buf->dtb_xamot = kmem_zalloc(size,
+		    KM_NOSLEEP | KM_NORMALPRI)) == NULL)
 			goto err;
 	}
 
@@ -10830,21 +12518,32 @@ err:
 	 * Error allocating memory, so free the buffers that were
 	 * allocated before the failed allocation.
 	 */
-	for (CPU_INFO_FOREACH(cpuind, cinfo)) {
+#ifdef __NetBSD__
+	for (CPU_INFO_FOREACH(cpuind, cinfo))
+#else
+	CPU_FOREACH(i)
+#endif
+	{
+#ifdef __NetBSD__
+		int i = cpu_index(cinfo);
+#endif
 		if (cpu != DTRACE_CPUALL && cpu != i)
 			continue;
 
-		buf = &bufs[cpu_index(cinfo)];
+		buf = &bufs[i];
+		desired += 2;
 
 		if (buf->dtb_xamot != NULL) {
 			ASSERT(buf->dtb_tomax != NULL);
 			ASSERT(buf->dtb_size == size);
 			kmem_free(buf->dtb_xamot, size);
+			allocated++;
 		}
 
 		if (buf->dtb_tomax != NULL) {
 			ASSERT(buf->dtb_size == size);
 			kmem_free(buf->dtb_tomax, size);
+			allocated++;
 		}
 
 		buf->dtb_tomax = NULL;
@@ -10852,9 +12551,10 @@ err:
 		buf->dtb_size = 0;
 
 	}
+#endif
+	*factor = desired / (allocated > 0 ? allocated : 1);
 
 	return (ENOMEM);
-#endif
 }
 
 /*
@@ -11020,7 +12720,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *b
 			if (epid == DTRACE_EPIDNONE) {
 				size = sizeof (uint32_t);
 			} else {
-				ASSERT(epid <= state->dts_necbs);
+				ASSERT3U(epid, <=, state->dts_necbs);
 				ASSERT(state->dts_ecbs[epid - 1] != NULL);
 
 				size = state->dts_ecbs[epid - 1]->dte_size;
@@ -11148,11 +12848,41 @@ dtrace_buffer_polish(dtrace_buffer_t *bu
 		    buf->dtb_xamot_offset - buf->dtb_offset);
 	}
 
-	if (buf->dtb_offset > buf->dtb_xamot_offset) {
-		bzero(buf->dtb_tomax + buf->dtb_offset,
-		    buf->dtb_size - buf->dtb_offset);
-		bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
-	}
+	if (buf->dtb_offset > buf->dtb_xamot_offset) {
+		bzero(buf->dtb_tomax + buf->dtb_offset,
+		    buf->dtb_size - buf->dtb_offset);
+		bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
+	}
+}
+
+/*
+ * This routine determines if data generated at the specified time has likely
+ * been entirely consumed at user-level.  This routine is called to determine
+ * if an ECB on a defunct probe (but for an active enabling) can be safely
+ * disabled and destroyed.
+ */
+static int
+dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
+{
+	int i;
+
+	for (i = 0; i < NCPU; i++) {
+		dtrace_buffer_t *buf = &bufs[i];
+
+		if (buf->dtb_size == 0)
+			continue;
+
+		if (buf->dtb_flags & DTRACEBUF_RING)
+			return (0);
+
+		if (!buf->dtb_switched && buf->dtb_offset != 0)
+			return (0);
+
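+		/*
+		 * dtb_switched - dtb_interval is the time of the previous
+		 * switch.  Data generated at 'when' can only have been
+		 * consumed if that previous switch happened after 'when'.
+		 */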
+		if (buf->dtb_switched - buf->dtb_interval < when)
+			return (0);
+	}
+
+	return (1);
 }
 
 static void
@@ -11269,9 +12999,15 @@ dtrace_enabling_dump(dtrace_enabling_t *
 	for (i = 0; i < enab->dten_ndesc; i++) {
 		dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
 
+#ifdef __FreeBSD__
+		printf("dtrace: enabling probe %d (%s:%s:%s:%s)\n", i,
+		    desc->dtpd_provider, desc->dtpd_mod,
+		    desc->dtpd_func, desc->dtpd_name);
+#else
 		cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
 		    desc->dtpd_provider, desc->dtpd_mod,
 		    desc->dtpd_func, desc->dtpd_name);
+#endif
 	}
 }
 
@@ -11314,6 +13050,7 @@ dtrace_enabling_destroy(dtrace_enabling_
 		ASSERT(enab->dten_vstate->dtvs_state != NULL);
 		ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
 		enab->dten_vstate->dtvs_state->dts_nretained--;
+		dtrace_retained_gen++;
 	}
 
 	if (enab->dten_prev == NULL) {
@@ -11356,6 +13093,7 @@ dtrace_enabling_retain(dtrace_enabling_t
 		return (ENOSPC);
 
 	state->dts_nretained++;
+	dtrace_retained_gen++;
 
 	if (dtrace_retained == NULL) {
 		dtrace_retained = enab;
@@ -11540,10 +13278,11 @@ dtrace_enabling_matchall(void)
 	 * block pending our completion.
 	 */
 	for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
-#if defined(sun)
+#ifdef illumos
 		cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
 
-		if (INGLOBALZONE(curproc) || getzoneid() == crgetzoneid(cr))
+		if (INGLOBALZONE(curproc) ||
+		    cr != NULL && getzoneid() == crgetzoneid(cr))
 #endif
 			(void) dtrace_enabling_match(enab, NULL);
 	}
@@ -11604,6 +13343,7 @@ dtrace_enabling_provide(dtrace_provider_
 {
 	int i, all = 0;
 	dtrace_probedesc_t desc;
+	dtrace_genid_t gen;
 
 	ASSERT(MUTEX_HELD(&dtrace_lock));
 	ASSERT(MUTEX_HELD(&dtrace_provider_lock));
@@ -11614,15 +13354,25 @@ dtrace_enabling_provide(dtrace_provider_
 	}
 
 	do {
-		dtrace_enabling_t *enab = dtrace_retained;
+		dtrace_enabling_t *enab;
 		void *parg = prv->dtpv_arg;
 
-		for (; enab != NULL; enab = enab->dten_next) {
+retry:
+		gen = dtrace_retained_gen;
+		for (enab = dtrace_retained; enab != NULL;
+		    enab = enab->dten_next) {
 			for (i = 0; i < enab->dten_ndesc; i++) {
 				desc = enab->dten_desc[i]->dted_probe;
 				mutex_exit(&dtrace_lock);
 				prv->dtpv_pops.dtps_provide(parg, &desc);
 				mutex_enter(&dtrace_lock);
+				/*
+				 * Process the retained enablings again if
+				 * they have changed while we weren't holding
+				 * dtrace_lock.
+				 */
+				if (gen != dtrace_retained_gen)
+					goto retry;
 			}
 		}
 	} while (all && (prv = prv->dtpv_next) != NULL);
@@ -11633,6 +13383,84 @@ dtrace_enabling_provide(dtrace_provider_
 }
 
 /*
+ * Called to reap ECBs that are attached to probes from defunct providers.
+ */
+static void
+dtrace_enabling_reap(void)
+{
+	dtrace_provider_t *prov;
+	dtrace_probe_t *probe;
+	dtrace_ecb_t *ecb;
+	hrtime_t when;
+	int i;
+
+	mutex_enter(&cpu_lock);
+	mutex_enter(&dtrace_lock);
+
+	for (i = 0; i < dtrace_nprobes; i++) {
+		if ((probe = dtrace_probes[i]) == NULL)
+			continue;
+
+		if (probe->dtpr_ecb == NULL)
+			continue;
+
+		prov = probe->dtpr_provider;
+
+		if ((when = prov->dtpv_defunct) == 0)
+			continue;
+
+		/*
+		 * We have ECBs on a defunct provider:  we want to reap these
+		 * ECBs to allow the provider to unregister.  The destruction
+		 * of these ECBs must be done carefully:  if we destroy the ECB
+		 * and the consumer later wishes to consume an EPID that
+		 * corresponds to the destroyed ECB (and if the EPID metadata
+		 * has not been previously consumed), the consumer will abort
+		 * processing on the unknown EPID.  To reduce (but not, sadly,
+		 * eliminate) the possibility of this, we will only destroy an
+		 * ECB for a defunct provider if, for the state that
+		 * corresponds to the ECB:
+		 *
+		 *  (a)	There is no speculative tracing (which can effectively
+		 *	cache an EPID for an arbitrary amount of time).
+		 *
+		 *  (b)	The principal buffers have been switched twice since the
+		 *	provider became defunct.
+		 *
+		 *  (c)	The aggregation buffers are of zero size or have been
+		 *	switched twice since the provider became defunct.
+		 *
+		 * We use dts_speculates to determine (a) and call a function
+		 * (dtrace_buffer_consumed()) to determine (b) and (c).  Note
+		 * that as soon as we've been unable to destroy one of the ECBs
+		 * associated with the probe, we quit trying -- reaping is only
+		 * fruitful in as much as we can destroy all ECBs associated
+		 * with the defunct provider's probes.
+		 */
+		while ((ecb = probe->dtpr_ecb) != NULL) {
+			dtrace_state_t *state = ecb->dte_state;
+			dtrace_buffer_t *buf = state->dts_buffer;
+			dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
+
+			if (state->dts_speculates)
+				break;
+
+			if (!dtrace_buffer_consumed(buf, when))
+				break;
+
+			if (!dtrace_buffer_consumed(aggbuf, when))
+				break;
+
+			dtrace_ecb_disable(ecb);
+			ASSERT(probe->dtpr_ecb != ecb);
+			dtrace_ecb_destroy(ecb);
+		}
+	}
+
+	mutex_exit(&dtrace_lock);
+	mutex_exit(&cpu_lock);
+}
+
+/*
  * DTrace DOF Functions
  */
 /*ARGSUSED*/
@@ -11754,10 +13582,107 @@ dtrace_dof_copyin(uintptr_t uarg, int *e
 	return (dof);
 }
 
-#if 0
-#if !defined(sun)
+#ifdef __FreeBSD__
+static dof_hdr_t *
+dtrace_dof_copyin_proc(struct proc *p, uintptr_t uarg, int *errp)
+{
+	dof_hdr_t hdr, *dof;
+	struct thread *td;
+	size_t loadsz;
+
+	ASSERT(!MUTEX_HELD(&dtrace_lock));
+
+	td = curthread;
+
+	/*
+	 * First, we're going to copyin() the sizeof (dof_hdr_t).
+	 */
+	if (proc_readmem(td, p, uarg, &hdr, sizeof(hdr)) != sizeof(hdr)) {
+		dtrace_dof_error(NULL, "failed to copyin DOF header");
+		*errp = EFAULT;
+		return (NULL);
+	}
+
+	/*
+	 * Now we'll allocate the entire DOF and copy it in -- provided
+	 * that the length isn't outrageous.
+	 */
+	if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
+		dtrace_dof_error(&hdr, "load size exceeds maximum");
+		*errp = E2BIG;
+		return (NULL);
+	}
+	loadsz = (size_t)hdr.dofh_loadsz;
+
+	if (loadsz < sizeof (hdr)) {
+		dtrace_dof_error(&hdr, "invalid load size");
+		*errp = EINVAL;
+		return (NULL);
+	}
+
+	dof = kmem_alloc(loadsz, KM_SLEEP);
+
+	if (proc_readmem(td, p, uarg, dof, loadsz) != loadsz ||
+	    dof->dofh_loadsz != loadsz) {
+		kmem_free(dof, hdr.dofh_loadsz);
+		*errp = EFAULT;
+		return (NULL);
+	}
+
+	return (dof);
+}
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
+static dof_hdr_t *
+dtrace_dof_copyin_pid(pid_t pid, const void *uarg, int *errp)
+{
+	dof_hdr_t hdr, *dof;
+	size_t loadsz;
+	int err;
+
+	err = copyin_pid(pid, uarg, &hdr, sizeof(hdr));
+	if (err != 0) {
+		*errp = err;
+		return (NULL);
+	}
+
+	/*
+	 * Now we'll allocate the entire DOF and copy it in -- provided
+	 * that the length isn't outrageous.
+	 */
+	if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
+		dtrace_dof_error(&hdr, "load size exceeds maximum");
+		*errp = E2BIG;
+		return (NULL);
+	}
+	loadsz = (size_t)hdr.dofh_loadsz;
+
+	if (loadsz < sizeof (hdr)) {
+		dtrace_dof_error(&hdr, "invalid load size");
+		*errp = EINVAL;
+		return (NULL);
+	}
+
+	dof = kmem_alloc(loadsz, KM_SLEEP);
+
+	err = copyin_pid(pid, uarg, dof, loadsz);
+	if (err == 0 && dof->dofh_loadsz != loadsz)
+		err = EFAULT;
+	if (err != 0) {
+		kmem_free(dof, loadsz);
+		*errp = err;
+		return (NULL);
+	}
+
+	return (dof);
+}
+#endif
+
+#ifdef __FreeBSD__
 static __inline uchar_t
-dtrace_dof_char(char c) {
+dtrace_dof_char(char c)
+{
+
 	switch (c) {
 	case '0':
 	case '1':
@@ -11786,19 +13711,18 @@ dtrace_dof_char(char c) {
 		return (c - 'a' + 10);
 	}
 	/* Should not reach here. */
-	return (0);
+	return (UCHAR_MAX);
 }
-#endif
-#endif
+#endif /* __FreeBSD__ */
 
 static dof_hdr_t *
 dtrace_dof_property(const char *name)
 {
-	dof_hdr_t *dof = NULL;
-#if defined(sun)
+#ifdef illumos
 	uchar_t *buf;
 	uint64_t loadsz;
 	unsigned int len, i;
+	dof_hdr_t *dof = NULL;
 
 	/*
 	 * Unfortunately, array of values in .conf files are always (and
@@ -11833,52 +13757,89 @@ dtrace_dof_property(const char *name)
 	dof = kmem_alloc(loadsz, KM_SLEEP);
 	bcopy(buf, dof, loadsz);
 	ddi_prop_free(buf);
-#else
-	printf("dtrace: XXX %s not implemented (name=%s)\n", __func__, name);
-#if 0	/* XXX TBD dtrace_dof_provide */
-	char *p;
-	char *p_env;
 
-	if ((p_env = getenv(name)) == NULL)
-		return (NULL);
+	return (dof);
+#endif /* illumos */
+#ifdef __FreeBSD__
+	uint8_t *dofbuf;
+	u_char *data, *eol;
+	caddr_t doffile;
+	size_t bytes, len, i;
+	dof_hdr_t *dof;
+	u_char c1, c2;
 
-	len = strlen(p_env) / 2;
+	dof = NULL;
 
-	buf = kmem_alloc(len, KM_SLEEP);
+	doffile = preload_search_by_type("dtrace_dof");
+	if (doffile == NULL)
+		return (NULL);
 
-	dof = (dof_hdr_t *) buf;
+	data = preload_fetch_addr(doffile);
+	len = preload_fetch_size(doffile);
+	for (;;) {
+		/* Look for the end of the line. All lines end in a newline. */
+		eol = memchr(data, '\n', len);
+		if (eol == NULL)
+			return (NULL);
 
-	p = p_env;
+		if (strncmp(name, data, strlen(name)) == 0)
+			break;
 
-	for (i = 0; i < len; i++) {
-		buf[i] = (dtrace_dof_char(p[0]) << 4) |
-		     dtrace_dof_char(p[1]);
-		p += 2;
+		eol++; /* skip past the newline */
+		len -= eol - data;
+		data = eol;
 	}
 
-	freeenv(p_env);
+	/* We've found the data corresponding to the specified key. */
 
-	if (len < sizeof (dof_hdr_t)) {
-		kmem_free(buf, len);
+	data += strlen(name) + 1; /* skip past the '=' */
+	len = eol - data;
+	if (len % 2 != 0) {
+		dtrace_dof_error(NULL, "invalid DOF encoding length");
+		goto doferr;
+	}
+	bytes = len / 2;
+	if (bytes < sizeof(dof_hdr_t)) {
 		dtrace_dof_error(NULL, "truncated header");
-		return (NULL);
+		goto doferr;
+	}
+
+	/*
+	 * Each byte is represented by the two ASCII characters in its hex
+	 * representation.
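+	 * For example, "4d" decodes to 0x4d: dtrace_dof_char('4') is 4,
+	 * dtrace_dof_char('d') is 13, and 4 * 16 + 13 is 77 (0x4d).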
+	 */
+	dofbuf = malloc(bytes, M_SOLARIS, M_WAITOK);
+	dof = (dof_hdr_t *)dofbuf;	/* assign early so doferr can free it */
+	for (i = 0; i < bytes; i++) {
+		c1 = dtrace_dof_char(data[i * 2]);
+		c2 = dtrace_dof_char(data[i * 2 + 1]);
+		if (c1 == UCHAR_MAX || c2 == UCHAR_MAX) {
+			dtrace_dof_error(NULL, "invalid hex char in DOF");
+			goto doferr;
+		}
+		dofbuf[i] = c1 * 16 + c2;
 	}
 
-	if (len < (loadsz = dof->dofh_loadsz)) {
-		kmem_free(buf, len);
+	if (bytes < dof->dofh_loadsz) {
 		dtrace_dof_error(NULL, "truncated DOF");
-		return (NULL);
+		goto doferr;
 	}
 
-	if (loadsz >= dtrace_dof_maxsize) {
-		kmem_free(buf, len);
+	if (dof->dofh_loadsz >= dtrace_dof_maxsize) {
 		dtrace_dof_error(NULL, "oversized DOF");
-		return (NULL);
+		goto doferr;
 	}
-#endif
-#endif
 
 	return (dof);
+
+doferr:
+	free(dof, M_SOLARIS);
+	return (NULL);
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
+	printf("dtrace: XXX %s not implemented (name=%s)\n", __func__, name);
+	return (NULL);
+#endif /* __NetBSD__ */
 }
 
 static void
@@ -11994,7 +13955,7 @@ dtrace_dof_difo(dof_hdr_t *dof, dof_sec_
 	size_t ttl = 0;
 	dof_difohdr_t *dofd;
 	uintptr_t daddr = (uintptr_t)dof;
-	size_t maxx = dtrace_difo_maxsize;
+	size_t max = dtrace_difo_maxsize;
 	int i, l, n;
 
 	static const struct {
@@ -12055,7 +14016,7 @@ dtrace_dof_difo(dof_hdr_t *dof, dof_sec_
 		    dofd->dofd_links[l])) == NULL)
 			goto err; /* invalid section link */
 
-		if (ttl + subsec->dofs_size > maxx) {
+		if (ttl + subsec->dofs_size > max) {
 			dtrace_dof_error(dof, "exceeds maximum size");
 			goto err;
 		}
@@ -12219,15 +14180,20 @@ dtrace_dof_actdesc(dof_hdr_t *dof, dof_s
 		    (uintptr_t)sec->dofs_offset + offs);
 		kind = (dtrace_actkind_t)desc->dofa_kind;
 
-		if (DTRACEACT_ISPRINTFLIKE(kind) &&
+		if ((DTRACEACT_ISPRINTFLIKE(kind) &&
 		    (kind != DTRACEACT_PRINTA ||
+		    desc->dofa_strtab != DOF_SECIDX_NONE)) ||
+		    (kind == DTRACEACT_DIFEXPR &&
 		    desc->dofa_strtab != DOF_SECIDX_NONE)) {
 			dof_sec_t *strtab;
 			char *str, *fmt;
 			uint64_t i;
 
 			/*
-			 * printf()-like actions must have a format string.
+			 * The argument to these actions is an index into the
+			 * DOF string table.  For printf()-like actions, this
+			 * is the format string.  For print(), this is the
+			 * CTF type of the expression result.
 			 */
 			if ((strtab = dtrace_dof_sect(dof,
 			    DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
@@ -12365,12 +14331,13 @@ err:
 
 /*
  * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
- * specified DOF.  At present, this amounts to simply adding 'ubase' to the
- * site of any user SETX relocations to account for load object base address.
- * In the future, if we need other relocations, this function can be extended.
+ * specified DOF.  SETX relocations are computed using 'ubase', the base load
+ * address of the object containing the DOF, and DOFREL relocations are relative
+ * to the relocation offset within the DOF.
  */
 static int
-dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
+dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase,
+    uint64_t udaddr)
 {
 	uintptr_t daddr = (uintptr_t)dof;
 	dof_relohdr_t *dofr =
@@ -12408,6 +14375,7 @@ dtrace_dof_relocate(dof_hdr_t *dof, dof_
 		case DOF_RELO_NONE:
 			break;
 		case DOF_RELO_SETX:
+		case DOF_RELO_DOFREL:
 			if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
 			    sizeof (uint64_t) > ts->dofs_size) {
 				dtrace_dof_error(dof, "bad relocation offset");
@@ -12419,7 +14387,11 @@ dtrace_dof_relocate(dof_hdr_t *dof, dof_
 				return (-1);
 			}
 
-			*(uint64_t *)taddr += ubase;
+			if (r->dofr_type == DOF_RELO_SETX)
+				*(uint64_t *)taddr += ubase;
+			else
+				*(uint64_t *)taddr +=
+				    udaddr + ts->dofs_offset + r->dofr_offset;
 			break;
 		default:
 			dtrace_dof_error(dof, "invalid relocation type");
@@ -12440,7 +14412,7 @@ dtrace_dof_relocate(dof_hdr_t *dof, dof_
  */
 static int
 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
-    dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
+    dtrace_enabling_t **enabp, uint64_t ubase, uint64_t udaddr, int noprobes)
 {
 	uint64_t len = dof->dofh_loadsz, seclen;
 	uintptr_t daddr = (uintptr_t)dof;
@@ -12565,7 +14537,7 @@ dtrace_dof_slurp(dof_hdr_t *dof, dtrace_
 		if (!(sec->dofs_flags & DOF_SECF_LOAD))
 			continue; /* just ignore non-loadable sections */
 
-		if (sec->dofs_align & (sec->dofs_align - 1)) {
+		if (!ISP2(sec->dofs_align)) {
 			dtrace_dof_error(dof, "bad section alignment");
 			return (-1);
 		}
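
ISP2() expresses the power-of-two test the old code open-coded as
x & (x - 1).  A one-file check of that predicate, with IS_P2 as a local
stand-in for the kernel macro:

	#include <assert.h>

	/* Stand-in for ISP2(): true when x has at most one bit set. */
	#define	IS_P2(x)	(((x) & ((x) - 1)) == 0)

	int
	main(void)
	{
		assert(IS_P2(1) && IS_P2(8) && IS_P2(4096));
		assert(!IS_P2(3) && !IS_P2(12) && !IS_P2(4097));
		return (0);
	}
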
@@ -12602,7 +14574,7 @@ dtrace_dof_slurp(dof_hdr_t *dof, dtrace_
 
 		switch (sec->dofs_type) {
 		case DOF_SECT_URELHDR:
-			if (dtrace_dof_relocate(dof, sec, ubase) != 0)
+			if (dtrace_dof_relocate(dof, sec, ubase, udaddr) != 0)
 				return (-1);
 			break;
 		}
@@ -12696,7 +14668,7 @@ dtrace_dof_options(dof_hdr_t *dof, dtrac
 static int
 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
 {
-	size_t hashsize, maxper, minn, chunksize = dstate->dtds_chunksize;
+	size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
 	void *base;
 	uintptr_t limit;
 	dtrace_dynvar_t *dvar, *next, *start;
@@ -12710,10 +14682,12 @@ dtrace_dstate_init(dtrace_dstate_t *dsta
 	if ((dstate->dtds_chunksize = chunksize) == 0)
 		dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
 
-	if (size < (minn = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
-		size = minn;
+	VERIFY(dstate->dtds_chunksize < LONG_MAX);
 
-	if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+	if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
+		size = min;
+
+	if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
 		return (ENOMEM);
 
 	dstate->dtds_size = size;
@@ -12750,10 +14724,22 @@ dtrace_dstate_init(dtrace_dstate_t *dsta
 	    ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
 	limit = (uintptr_t)base + size;
 
+	VERIFY((uintptr_t)start < limit);
+	VERIFY((uintptr_t)start >= (uintptr_t)base);
+
 	maxper = (limit - (uintptr_t)start) / NCPU;
 	maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
 
-	for (i = 0; i < NCPU; i++) {
+#ifdef illumos
+	for (i = 0; i < NCPU; i++)
+#endif
+#ifdef __FreeBSD__
+	CPU_FOREACH(i)
+#endif
+#ifdef __NetBSD__
+	for (i = 0; i < NCPU; i++)
+#endif
+	{
 		dstate->dtds_percpu[i].dtdsc_free = dvar = start;
 
 		/*
@@ -12771,7 +14757,7 @@ dtrace_dstate_init(dtrace_dstate_t *dsta
 			start = (dtrace_dynvar_t *)limit;
 		}
 
-		ASSERT(limit <= (uintptr_t)base + size);
+		VERIFY(limit <= (uintptr_t)base + size);
 
 		for (;;) {
 			next = (dtrace_dynvar_t *)((uintptr_t)dvar +
@@ -12780,6 +14766,8 @@ dtrace_dstate_init(dtrace_dstate_t *dsta
 			if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
 				break;
 
+			VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
+			    (uintptr_t)dvar <= (uintptr_t)base + size);
 			dvar->dtdv_next = next;
 			dvar = next;
 		}
@@ -12829,6 +14817,56 @@ dtrace_vstate_fini(dtrace_vstate_t *vsta
 	}
 }
 
+#ifdef __FreeBSD__
+static void
+dtrace_state_clean(void *arg)
+{
+	dtrace_state_t *state = arg;
+	dtrace_optval_t *opt = state->dts_options;
+
+	if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
+		return;
+
+	dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
+	dtrace_speculation_clean(state);
+
+	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
+	    dtrace_state_clean, state);
+}
+
+static void
+dtrace_state_deadman(void *arg)
+{
+	dtrace_state_t *state = arg;
+	hrtime_t now;
+
+	dtrace_sync();
+
+	dtrace_debug_output();
+
+	now = dtrace_gethrtime();
+
+	if (state != dtrace_anon.dta_state &&
+	    now - state->dts_laststatus >= dtrace_deadman_user)
+		return;
+
+	/*
+	 * We must be sure that dts_alive never appears to be less than the
+	 * value upon entry to dtrace_state_deadman(), and because we lack a
+	 * dtrace_cas64(), we cannot store to it atomically.  We thus instead
+	 * store INT64_MAX to it, followed by a memory barrier, followed by
+	 * the new value.  This assures that dts_alive never appears to be
+	 * less than its true value, regardless of the order in which the
+	 * stores to the underlying storage are issued.
+	 */
+	state->dts_alive = INT64_MAX;
+	dtrace_membar_producer();
+	state->dts_alive = now;
+
+	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
+	    dtrace_state_deadman, state);
+}
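
The store/barrier/store sequence in dtrace_state_deadman() is a lock-free
way to publish a 64-bit value that must never appear to decrease on hardware
without a 64-bit compare-and-swap.  A stand-alone sketch of the same
pattern, modelling dtrace_membar_producer() with a C11 release fence;
publish_alive() is an illustrative name:

	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	static volatile int64_t alive;	/* stand-in for dts_alive */

	/*
	 * Park the field at INT64_MAX, fence, then store the real value,
	 * so a racing reader can never observe a value lower than the one
	 * present on entry, regardless of store ordering.
	 */
	static void
	publish_alive(int64_t now)
	{
		alive = INT64_MAX;
		atomic_thread_fence(memory_order_release);
		alive = now;
	}

	int
	main(void)
	{
		publish_alive(1234567890);
		printf("%lld\n", (long long)alive);
		return (0);
	}
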
+#else
 static void
 dtrace_state_clean(dtrace_state_t *state)
 {
@@ -12866,25 +14904,29 @@ dtrace_state_deadman(dtrace_state_t *sta
 	state->dts_alive = now;
 }
 
-#if !defined(sun)
-struct dtrace_state_worker *dtrace_state_worker_add(void (*)(dtrace_state_t *),
-    dtrace_state_t *, hrtime_t);
-void dtrace_state_worker_remove(struct dtrace_state_worker *);
-#endif
+#endif	/* illumos */
 
 static dtrace_state_t *
-#if defined(sun)
+#ifdef illumos
+dtrace_state_create(dev_t *devp, cred_t *cr)
+#endif
+#ifdef __FreeBSD__
+dtrace_state_create(struct cdev *dev, struct ucred *cred __unused)
+#endif
+#ifdef __NetBSD__
 dtrace_state_create(dev_t *devp, cred_t *cr)
-#else
-dtrace_state_create(dev_t dev, cred_t *cr)
 #endif
 {
-#if defined(sun)
+#ifdef illumos
 	minor_t minor;
 	major_t major;
 #else
 	int m = 0;
 #endif
+#ifdef __FreeBSD__
+	cred_t *cr = NULL;
+#endif
+	int cpu_it;
 	char c[30];
 	dtrace_state_t *state;
 	dtrace_optval_t *opt;
@@ -12893,7 +14935,7 @@ dtrace_state_create(dev_t dev, cred_t *c
 	ASSERT(MUTEX_HELD(&dtrace_lock));
 	ASSERT(MUTEX_HELD(&cpu_lock));
 
-#if defined(sun)
+#ifdef illumos
 	minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
 	    VM_BESTFIT | VM_SLEEP);
 
@@ -12903,17 +14945,25 @@ dtrace_state_create(dev_t dev, cred_t *c
 	}
 
 	state = ddi_get_soft_state(dtrace_softstate, minor);
-#else
-	m = minor(dev) & 0x0F;
+#endif
+#ifdef __FreeBSD__
+	if (dev != NULL) {
+		cr = dev->si_cred;
+		m = dev2unit(dev);
+	}
+#endif
+#ifdef __NetBSD__
+	m = minor(*devp) & 0x0F;
 
 	/* Allocate memory for the state. */
 	state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
 #endif
 
 	state->dts_epid = DTRACE_EPIDNONE + 1;
 
 	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
-#if defined(sun)
+#ifdef illumos
 	state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
 	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
 
@@ -12927,10 +14977,15 @@ dtrace_state_create(dev_t dev, cred_t *c
 
 	if (devp != NULL)
 		*devp = state->dts_dev;
-#else
+#endif
+#ifdef __FreeBSD__
+	state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
+	state->dts_dev = dev;
+#endif
+#ifdef __NetBSD__
 	state->dts_aggid_arena = vmem_create(c, 1, INT_MAX, 1,
 	    NULL, NULL, NULL, 0, VM_SLEEP, IPL_NONE);
-	state->dts_dev = dev;
+	state->dts_dev = *devp;
 #endif
 
 	/*
@@ -12942,10 +14997,31 @@ dtrace_state_create(dev_t dev, cred_t *c
 	state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
 	state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
 
-#if defined(sun)
+	/*
+	 * Allocate and initialise the per-process per-CPU random state.
+	 * SI_SUB_RANDOM < SI_SUB_DTRACE_ANON, therefore the entropy device
+	 * is assumed to be seeded at this point (if from Fortuna seed file).
+	 */
+	(void) read_random(&state->dts_rstate[0], 2 * sizeof(uint64_t));
+	for (cpu_it = 1; cpu_it < NCPU; cpu_it++) {
+		/*
+		 * Each CPU is assigned a 2^64 period, non-overlapping
+		 * subsequence.
+		 */
+		dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it - 1],
+		    state->dts_rstate[cpu_it]);
+	}
+
+#ifdef illumos
 	state->dts_cleaner = CYCLIC_NONE;
 	state->dts_deadman = CYCLIC_NONE;
-#else
+#endif
+#ifdef __FreeBSD__
+	callout_init(&state->dts_cleaner, 1);
+	callout_init(&state->dts_deadman, 1);
+#endif
+#ifdef __NetBSD__
 	state->dts_cleaner = NULL;
 	state->dts_deadman = NULL;
 #endif
@@ -12992,11 +15068,7 @@ dtrace_state_create(dev_t dev, cred_t *c
 		 * credential from disappearing.  This means that we can
 		 * examine the credential and the zone from probe context.
 		 */
-#if defined(sun)
 		crhold(cr);
-#else
-		kauth_cred_hold(cr);
-#endif
 		state->dts_cred.dcr_cred = cr;
 
 		/*
@@ -13036,7 +15108,7 @@ dtrace_state_create(dev_t dev, cred_t *c
 			 * we can do destructive things to processes which
 			 * have altered credentials.
 			 */
-#if defined(sun)
+#ifdef illumos
 			if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
 			    cr->cr_zone->zone_privset)) {
 				state->dts_cred.dcr_action |=
@@ -13078,7 +15150,7 @@ dtrace_state_create(dev_t dev, cred_t *c
 				state->dts_cred.dcr_action |=
 				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
 
-#if defined(sun)
+#ifdef illumos
 			/*
 			 * If we have all privs in whatever zone this is,
 			 * we can do destructive things to processes which
@@ -13117,7 +15189,7 @@ dtrace_state_buffer(dtrace_state_t *stat
 {
 	dtrace_optval_t *opt = state->dts_options, size;
 	processorid_t cpu = 0;
-	int flags = 0, rval;
+	int flags = 0, rval, factor, divisor = 1;
 
 	ASSERT(MUTEX_HELD(&dtrace_lock));
 	ASSERT(MUTEX_HELD(&cpu_lock));
@@ -13147,7 +15219,7 @@ dtrace_state_buffer(dtrace_state_t *stat
 			flags |= DTRACEBUF_INACTIVE;
 	}
 
-	for (size = opt[which]; size >= sizeof (uint64_t); size >>= 1) {
+	for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
 		/*
 		 * The size must be 8-byte aligned.  If the size is not 8-byte
 		 * aligned, drop it down by the difference.
@@ -13165,7 +15237,7 @@ dtrace_state_buffer(dtrace_state_t *stat
 			return (E2BIG);
 		}
 
-		rval = dtrace_buffer_alloc(buf, size, flags, cpu);
+		rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
 
 		if (rval != ENOMEM) {
 			opt[which] = size;
@@ -13174,6 +15246,9 @@ dtrace_state_buffer(dtrace_state_t *stat
 
 		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
 			return (rval);
+
+		for (divisor = 2; divisor < factor; divisor <<= 1)
+			continue;
 	}
 
 	return (ENOMEM);
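
The allocation loop above no longer halves blindly: dtrace_buffer_alloc()
reports through 'factor' how far an attempt overshot, and the divisor is
bumped to the next power of two at or above it, so a single retry shrinks
far enough.  That back-off arithmetic in isolation, as a sketch;
next_divisor() is an illustrative name:

	#include <stddef.h>
	#include <stdio.h>

	/* Smallest power-of-two divisor >= factor (minimum 2). */
	static size_t
	next_divisor(size_t factor)
	{
		size_t divisor;

		for (divisor = 2; divisor < factor; divisor <<= 1)
			continue;
		return (divisor);
	}

	int
	main(void)
	{
		/* A reported factor of 5 shrinks the next attempt 8x. */
		printf("%zu\n", next_divisor(5));	/* prints 8 */
		return (0);
	}
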
@@ -13234,7 +15309,7 @@ dtrace_state_go(dtrace_state_t *state, p
 	dtrace_optval_t *opt = state->dts_options, sz, nspec;
 	dtrace_speculation_t *spec;
 	dtrace_buffer_t *buf;
-#if defined(sun)
+#ifdef illumos
 	cyc_handler_t hdlr;
 	cyc_time_t when;
 #endif
@@ -13275,7 +15350,8 @@ dtrace_state_go(dtrace_state_t *state, p
 		goto out;
 	}
 
-	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
+	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
+	    KM_NOSLEEP | KM_NORMALPRI);
 
 	if (spec == NULL) {
 		rval = ENOMEM;
@@ -13286,7 +15362,8 @@ dtrace_state_go(dtrace_state_t *state, p
 	state->dts_nspeculations = (int)nspec;
 
 	for (i = 0; i < nspec; i++) {
-		if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
+		if ((buf = kmem_zalloc(bufsize,
+		    KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
 			rval = ENOMEM;
 			goto err;
 		}
@@ -13416,7 +15493,7 @@ dtrace_state_go(dtrace_state_t *state, p
 		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
 
 	state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
-#if defined(sun)
+#ifdef illumos
 	hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
 	hdlr.cyh_arg = state;
 	hdlr.cyh_level = CY_LOW_LEVEL;
@@ -13434,7 +15511,14 @@ dtrace_state_go(dtrace_state_t *state, p
 	when.cyt_interval = dtrace_deadman_interval;
 
 	state->dts_deadman = cyclic_add(&hdlr, &when);
-#else
+#endif
+#ifdef __FreeBSD__
+	callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
+	    dtrace_state_clean, state);
+	callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
+	    dtrace_state_deadman, state);
+#endif
+#ifdef __NetBSD__
 	state->dts_cleaner = dtrace_state_worker_add(
 	    dtrace_state_clean, state, opt[DTRACEOPT_CLEANRATE]);
 	state->dts_deadman = dtrace_state_worker_add(
@@ -13443,6 +15527,24 @@ dtrace_state_go(dtrace_state_t *state, p
 
 	state->dts_activity = DTRACE_ACTIVITY_WARMUP;
 
+#ifdef illumos
+	if (state->dts_getf != 0 &&
+	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
+		/*
+		 * We don't have kernel privs but we have at least one call
+		 * to getf(); we need to bump our zone's count, and (if
+		 * this is the first enabling to have an unprivileged call
+		 * to getf()) we need to hook into closef().
+		 */
+		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
+
+		if (dtrace_getf++ == 0) {
+			ASSERT(dtrace_closef == NULL);
+			dtrace_closef = dtrace_getf_barrier;
+		}
+	}
+#endif
+
 	/*
 	 * Now it's time to actually fire the BEGIN probe.  We need to disable
 	 * interrupts here both to record the CPU on which we fired the BEGIN
@@ -13467,6 +15569,19 @@ dtrace_state_go(dtrace_state_t *state, p
 	if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
 		state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
 
+#ifdef __FreeBSD__
+	/*
+	 * We enable anonymous tracing before APs are started, so we must
+	 * activate buffers using the current CPU.
+	 */
+	if (state == dtrace_anon.dta_state)
+		for (int i = 0; i < NCPU; i++)
+			dtrace_buffer_activate_cpu(state, i);
+	else
+		dtrace_xcall(DTRACE_CPUALL,
+		    (dtrace_xcall_t)dtrace_buffer_activate, state);
+#else
+
 	/*
 	 * Regardless of whether or not now we're in ACTIVE or DRAINING, we
 	 * want each CPU to transition its principal buffer out of the
@@ -13477,6 +15592,7 @@ dtrace_state_go(dtrace_state_t *state, p
 	 */
 	dtrace_xcall(DTRACE_CPUALL,
 	    (dtrace_xcall_t)dtrace_buffer_activate, state);
+#endif
 	goto out;
 
 err:
@@ -13559,6 +15675,26 @@ dtrace_state_stop(dtrace_state_t *state,
 	state->dts_activity = DTRACE_ACTIVITY_STOPPED;
 	dtrace_sync();
 
+#ifdef illumos
+	if (state->dts_getf != 0 &&
+	    !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
+		/*
+		 * We don't have kernel privs but we have at least one call
+		 * to getf(); we need to lower our zone's count, and (if
+		 * this is the last enabling to have an unprivileged call
+		 * to getf()) we need to clear the closef() hook.
+		 */
+		ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
+		ASSERT(dtrace_closef == dtrace_getf_barrier);
+		ASSERT(dtrace_getf > 0);
+
+		state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
+
+		if (--dtrace_getf == 0)
+			dtrace_closef = NULL;
+	}
+#endif
+
 	return (0);
 }
 
@@ -13622,7 +15758,7 @@ dtrace_state_destroy(dtrace_state_t *sta
 {
 	dtrace_ecb_t *ecb;
 	dtrace_vstate_t *vstate = &state->dts_vstate;
-#if defined(sun)
+#ifdef illumos
 	minor_t minor = getminor(state->dts_dev);
 #endif
 	int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
@@ -13657,13 +15793,8 @@ dtrace_state_destroy(dtrace_state_t *sta
 	/*
 	 * Release the credential hold we took in dtrace_state_create().
 	 */
-	if (state->dts_cred.dcr_cred != NULL) {
-#if defined(sun)
+	if (state->dts_cred.dcr_cred != NULL)
 		crfree(state->dts_cred.dcr_cred);
-#else
-		kauth_cred_free(state->dts_cred.dcr_cred);
-#endif
-	}
 
 	/*
 	 * Now we can safely disable and destroy any enabled probes.  Because
@@ -13705,13 +15836,20 @@ dtrace_state_destroy(dtrace_state_t *sta
 	for (i = 0; i < nspec; i++)
 		dtrace_buffer_free(spec[i].dtsp_buffer);
 
-#if defined(sun)
+#ifdef illumos
 	if (state->dts_cleaner != CYCLIC_NONE)
 		cyclic_remove(state->dts_cleaner);
 
 	if (state->dts_deadman != CYCLIC_NONE)
 		cyclic_remove(state->dts_deadman);
-#else
+#endif
+#ifdef __FreeBSD__
+	callout_stop(&state->dts_cleaner);
+	callout_drain(&state->dts_cleaner);
+	callout_stop(&state->dts_deadman);
+	callout_drain(&state->dts_deadman);
+#endif
+#ifdef __NetBSD__
 	if (state->dts_cleaner != NULL)
 		dtrace_state_worker_remove(state->dts_cleaner);
 
@@ -13746,13 +15884,18 @@ dtrace_state_destroy(dtrace_state_t *sta
 	dtrace_format_destroy(state);
 
 	if (state->dts_aggid_arena != NULL) {
+#if defined(illumos) || defined(__NetBSD__)
 		vmem_destroy(state->dts_aggid_arena);
+#else
+		delete_unrhdr(state->dts_aggid_arena);
+#endif
 		state->dts_aggid_arena = NULL;
 	}
-#if defined(sun)
+#ifdef illumos
 	ddi_soft_state_free(dtrace_softstate, minor);
 	vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
-#else
+#endif
+#ifdef __NetBSD__
 	kmem_free(state, sizeof(dtrace_state_t));
 #endif
 }
@@ -13803,7 +15946,7 @@ dtrace_anon_property(void)
 			break;
 		}
 
-#if defined(sun)
+#ifdef illumos
 		/*
 		 * We want to create anonymous state, so we need to transition
 		 * the kernel debugger to indicate that DTrace is active.  If
@@ -13822,9 +15965,7 @@ dtrace_anon_property(void)
 		 * If we haven't allocated an anonymous state, we'll do so now.
 		 */
 		if ((state = dtrace_anon.dta_state) == NULL) {
-#if defined(sun)
 			state = dtrace_state_create(NULL, NULL);
-#endif
 			dtrace_anon.dta_state = state;
 
 			if (state == NULL) {
@@ -13846,7 +15987,7 @@ dtrace_anon_property(void)
 		}
 
 		rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
-		    &dtrace_anon.dta_enabling, 0, B_TRUE);
+		    &dtrace_anon.dta_enabling, 0, 0, B_TRUE);
 
 		if (rv == 0)
 			rv = dtrace_dof_options(dof, state);
@@ -13893,10 +16034,10 @@ dtrace_helper_trace(dtrace_helper_action
     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
 {
 	uint32_t size, next, nnext, i;
-	dtrace_helptrace_t *ent;
+	dtrace_helptrace_t *ent, *buffer;
 	uint16_t flags = cpu_core[curcpu_id].cpuc_dtrace_flags;
 
-	if (!dtrace_helptrace_enabled)
+	if ((buffer = dtrace_helptrace_buffer) == NULL)
 		return;
 
 	ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
@@ -13924,10 +16065,12 @@ dtrace_helper_trace(dtrace_helper_action
 	/*
 	 * We have our slot; fill it in.
 	 */
-	if (nnext == size)
+	if (nnext == size) {
+		dtrace_helptrace_wrapped++;
 		next = 0;
+	}
 
-	ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
+	ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next);
 	ent->dtht_helper = helper;
 	ent->dtht_where = where;
 	ent->dtht_nlocals = vstate->dtvs_nlocals;
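
The helper-trace buffer is a byte ring: a record that would spill past the
end is placed back at offset 0 and dtrace_helptrace_wrapped is bumped.  A
single-producer sketch of that reservation step (the kernel claims the slot
with a compare-and-swap; ring_reserve() is an illustrative name):

	#include <stdint.h>
	#include <stdio.h>

	static uint32_t ring_next;	/* cf. dtrace_helptrace_next */
	static uint32_t ring_wrapped;	/* cf. dtrace_helptrace_wrapped */

	/*
	 * Reserve 'len' bytes in a 'size'-byte ring and return the record
	 * offset, wrapping to 0 when the record would not fit at the tail.
	 */
	static uint32_t
	ring_reserve(uint32_t size, uint32_t len)
	{
		uint32_t next = ring_next, nnext;

		if (next + len <= size) {
			nnext = next + len;
		} else {
			nnext = len;	/* wrap: record goes at offset 0 */
			next = 0;
			ring_wrapped++;
		}
		ring_next = nnext;
		return (next);
	}

	int
	main(void)
	{
		uint32_t off;

		off = ring_reserve(100, 40);	/* 0 */
		off = ring_reserve(100, 40);	/* 40 */
		off = ring_reserve(100, 40);	/* wraps back to 0 */
		printf("%u %u\n", off, ring_wrapped);	/* 0 1 */
		return (0);
	}
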
@@ -13961,7 +16104,7 @@ dtrace_helper(int which, dtrace_mstate_t
 	dtrace_helper_action_t *helper;
 	dtrace_vstate_t *vstate;
 	dtrace_difo_t *pred;
-	int i, trace = dtrace_helptrace_enabled;
+	int i, trace = dtrace_helptrace_buffer != NULL;
 
 	ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
 
@@ -14040,7 +16183,6 @@ err:
 	return (0);
 }
 
-#if defined(sun)
 static void
 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
     dtrace_vstate_t *vstate)
@@ -14061,13 +16203,15 @@ dtrace_helper_action_destroy(dtrace_help
 }
 
 static int
-dtrace_helper_destroygen(int gen)
+dtrace_helper_destroygen(dtrace_helpers_t *help, int gen)
 {
 	proc_t *p = curproc;
-	dtrace_helpers_t *help = p->p_dtrace_helpers;
 	dtrace_vstate_t *vstate;
 	int i;
 
+	if (help == NULL)
+		help = p->p_dtrace_helpers;
+
 	ASSERT(MUTEX_HELD(&dtrace_lock));
 
 	if (help == NULL || gen > help->dthps_generation)
@@ -14165,9 +16309,9 @@ dtrace_helper_validate(dtrace_helper_act
 }
 
 static int
-dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
+dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep,
+    dtrace_helpers_t *help)
 {
-	dtrace_helpers_t *help;
 	dtrace_helper_action_t *helper, *last;
 	dtrace_actdesc_t *act;
 	dtrace_vstate_t *vstate;
@@ -14177,7 +16321,6 @@ dtrace_helper_action_add(int which, dtra
 	if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
 		return (EINVAL);
 
-	help = curproc->p_dtrace_helpers;
 	last = help->dthps_actions[which];
 	vstate = &help->dthps_vstate;
 
@@ -14301,15 +16444,12 @@ dtrace_helper_provider_register(proc_t *
 }
 
 static int
-dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
+dtrace_helper_provider_add(dof_helper_t *dofhp, dtrace_helpers_t *help, int gen)
 {
-	dtrace_helpers_t *help;
 	dtrace_helper_provider_t *hprov, **tmp_provs;
 	uint_t tmp_maxprovs, i;
 
 	ASSERT(MUTEX_HELD(&dtrace_lock));
-
-	help = curproc->p_dtrace_helpers;
 	ASSERT(help != NULL);
 
 	/*
@@ -14484,7 +16624,13 @@ dtrace_helper_provider_validate(dof_hdr_
 
 		if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
 			dtrace_dof_error(dof, "function name too long");
-			return (-1);
+			/*
+			 * Keep going if the function name is too long.
+			 * Unlike provider and probe names, we cannot reasonably
+			 * impose restrictions on function names, since they're
+			 * a property of the code being instrumented. We will
+			 * skip this probe in dtrace_helper_provide_one().
+			 */
 		}
 
 		if (probe->dofpr_name >= str_sec->dofs_size ||
@@ -14596,7 +16742,7 @@ dtrace_helper_provider_validate(dof_hdr_
 }
 
 static int
-dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
+dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp, struct proc *p)
 {
 	dtrace_helpers_t *help;
 	dtrace_vstate_t *vstate;
@@ -14606,13 +16752,12 @@ dtrace_helper_slurp(dof_hdr_t *dof, dof_
 
 	ASSERT(MUTEX_HELD(&dtrace_lock));
 
-	if ((help = curproc->p_dtrace_helpers) == NULL)
-		help = dtrace_helpers_create(curproc);
+	if ((help = p->p_dtrace_helpers) == NULL)
+		help = dtrace_helpers_create(p);
 
 	vstate = &help->dthps_vstate;
-
-	if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
-	    dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
+	if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab, dhp->dofhp_addr,
+	    dhp->dofhp_dof, B_FALSE)) != 0) {
 		dtrace_dof_destroy(dof);
 		return (rv);
 	}
@@ -14620,22 +16765,20 @@ dtrace_helper_slurp(dof_hdr_t *dof, dof_
 	/*
 	 * Look for helper providers and validate their descriptions.
 	 */
-	if (dhp != NULL) {
-		for (i = 0; i < dof->dofh_secnum; i++) {
-			dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
-			    dof->dofh_secoff + i * dof->dofh_secsize);
-
-			if (sec->dofs_type != DOF_SECT_PROVIDER)
-				continue;
+	for (i = 0; i < dof->dofh_secnum; i++) {
+		dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
+		    dof->dofh_secoff + i * dof->dofh_secsize);
 
-			if (dtrace_helper_provider_validate(dof, sec) != 0) {
-				dtrace_enabling_destroy(enab);
-				dtrace_dof_destroy(dof);
-				return (-1);
-			}
+		if (sec->dofs_type != DOF_SECT_PROVIDER)
+			continue;
 
-			nprovs++;
+		if (dtrace_helper_provider_validate(dof, sec) != 0) {
+			dtrace_enabling_destroy(enab);
+			dtrace_dof_destroy(dof);
+			return (-1);
 		}
+
+		nprovs++;
 	}
 
 	/*
@@ -14655,12 +16798,13 @@ dtrace_helper_slurp(dof_hdr_t *dof, dof_
 			continue;
 
 		if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
-		    ep)) != 0) {
+		    ep, help)) != 0) {
 			/*
 			 * Adding this helper action failed -- we are now going
 			 * to rip out the entire generation and return failure.
 			 */
-			(void) dtrace_helper_destroygen(help->dthps_generation);
+			(void) dtrace_helper_destroygen(help,
+			    help->dthps_generation);
 			dtrace_enabling_destroy(enab);
 			dtrace_dof_destroy(dof);
 			return (-1);
@@ -14675,11 +16819,18 @@ dtrace_helper_slurp(dof_hdr_t *dof, dof_
 	gen = help->dthps_generation++;
 	dtrace_enabling_destroy(enab);
 
-	if (dhp != NULL && nprovs > 0) {
+	if (nprovs > 0) {
+		/*
+		 * Now that this is in-kernel, we change the sense of the
+		 * members:  dofhp_dof denotes the in-kernel copy of the DOF
+		 * and dofhp_addr denotes the address at user-level.
+		 */
+		dhp->dofhp_addr = dhp->dofhp_dof;
 		dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
-		if (dtrace_helper_provider_add(dhp, gen) == 0) {
+
+		if (dtrace_helper_provider_add(dhp, help, gen) == 0) {
 			mutex_exit(&dtrace_lock);
-			dtrace_helper_provider_register(curproc, help, dhp);
+			dtrace_helper_provider_register(p, help, dhp);
 			mutex_enter(&dtrace_lock);
 
 			destroy = 0;
@@ -14710,12 +16861,17 @@ dtrace_helpers_create(proc_t *p)
 	return (help);
 }
 
-static void
-dtrace_helpers_destroy(void)
+#ifdef illumos
+static
+#endif
+void
+dtrace_helpers_destroy(proc_t *p)
 {
 	dtrace_helpers_t *help;
 	dtrace_vstate_t *vstate;
+#ifdef illumos
 	proc_t *p = curproc;
+#endif
 	int i;
 
 	mutex_enter(&dtrace_lock);
@@ -14802,7 +16958,10 @@ dtrace_helpers_destroy(void)
 	mutex_exit(&dtrace_lock);
 }
 
-static void
+#ifdef illumos
+static
+#endif
+void
 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
 {
 	dtrace_helpers_t *help, *newhelp;
@@ -14844,7 +17003,7 @@ dtrace_helpers_duplicate(proc_t *from, p
 			new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
 
 			for (j = 0; j < new->dtha_nactions; j++) {
-				dp = helper->dtha_actions[j];
+				dtrace_difo_t *dp = helper->dtha_actions[j];
 
 				ASSERT(dp != NULL);
 				dp = dtrace_difo_duplicate(dp, vstate);
@@ -14892,10 +17051,27 @@ dtrace_module_loaded(modctl_t *ctl)
 {
 	dtrace_provider_t *prv;
 
+#ifdef __NetBSD__
+	/*
+	 * We have just one symbol table and CTF table for the entire
+	 * base kernel, so ignore any other built-in module entries.
+	 * This means that the module name for a given symbol will change
+	 * depending on whether the module is built-in or loaded separately.
+	 */
+	if (module_source(ctl) == MODULE_SOURCE_KERNEL &&
+	    strcmp(module_name(ctl), "netbsd")) {
+		return;
+	}
+#endif
+
 	mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
 	mutex_enter(&mod_lock);
+#endif
 
+#ifdef illumos
 	ASSERT(ctl->mod_busy);
+#endif
 
 	/*
 	 * We're going to call each providers per-module provide operation
@@ -14904,7 +17080,9 @@ dtrace_module_loaded(modctl_t *ctl)
 	for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
 		prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
 
+#ifdef illumos
 	mutex_exit(&mod_lock);
+#endif
 	mutex_exit(&dtrace_provider_lock);
 
 	/*
@@ -14937,28 +17115,76 @@ dtrace_module_loaded(modctl_t *ctl)
 	 * not a serious problem -- it just means that the module that we
 	 * just loaded may not be immediately instrumentable.
 	 */
-	xdelay(1);
+	delay(1);
 }
 
 static void
+#ifndef __FreeBSD__
 dtrace_module_unloaded(modctl_t *ctl)
+#else
+dtrace_module_unloaded(modctl_t *ctl, int *error)
+#endif
 {
 	dtrace_probe_t template, *probe, *first, *next;
 	dtrace_provider_t *prov;
+#ifndef illumos
+	char modname[DTRACE_MODNAMELEN];
+	size_t len;
+#endif
 
+#ifdef illumos
 	template.dtpr_mod = ctl->mod_modname;
+#endif
+#ifdef __FreeBSD__
+	/* Handle the fact that ctl->filename may end in ".ko". */
+	strlcpy(modname, ctl->filename, sizeof(modname));
+	len = strlen(ctl->filename);
+	if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
+		modname[len - 3] = '\0';
+	template.dtpr_mod = modname;
+#endif
+#ifdef __NetBSD__
+	if (module_source(ctl) == MODULE_SOURCE_KERNEL &&
+	    strcmp(module_name(ctl), "netbsd")) {
+		return;
+	}
+
+	/* Handle the fact that ctl->filename may end in ".kmod". */
+	strlcpy(modname, module_name(ctl), sizeof(modname));
+	len = strlen(modname);
+	if (len > 5 && strcmp(modname + len - 5, ".kmod") == 0)
+		modname[len - 5] = '\0';
+	template.dtpr_mod = modname;
+#endif
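
Both new branches trim a loader suffix (".ko" on FreeBSD, ".kmod" on NetBSD)
before using the file name as the probe module name.  The same trim as a
tiny stand-alone helper; strip_suffix() is a hypothetical name:

	#include <stdio.h>
	#include <string.h>

	/* Remove 'suffix' from the end of 'name' in place, if present. */
	static void
	strip_suffix(char *name, const char *suffix)
	{
		size_t len = strlen(name), slen = strlen(suffix);

		if (len > slen && strcmp(name + len - slen, suffix) == 0)
			name[len - slen] = '\0';
	}

	int
	main(void)
	{
		char fbsd[32] = "dtrace.ko", nbsd[32] = "solaris.kmod";

		strip_suffix(fbsd, ".ko");
		strip_suffix(nbsd, ".kmod");
		printf("%s %s\n", fbsd, nbsd);	/* dtrace solaris */
		return (0);
	}
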
 
 	mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
 	mutex_enter(&mod_lock);
+#endif
 	mutex_enter(&dtrace_lock);
 
+#ifdef __FreeBSD__
+	if (ctl->nenabled > 0) {
+		/* Don't allow unloads if a probe is enabled. */
+		mutex_exit(&dtrace_provider_lock);
+		mutex_exit(&dtrace_lock);
+		*error = -1;
+		printf("kldunload: attempt to unload module that has "
+		    "DTrace probes enabled\n");
+		return;
+	}
+#endif
+
 	if (dtrace_bymod == NULL) {
 		/*
 		 * The DTrace module is loaded (obviously) but not attached;
 		 * we don't have any work to do.
 		 */
 		mutex_exit(&dtrace_provider_lock);
+#ifdef illumos
 		mutex_exit(&mod_lock);
+#endif
 		mutex_exit(&dtrace_lock);
 		return;
 	}
@@ -14967,7 +17193,9 @@ dtrace_module_unloaded(modctl_t *ctl)
 	    probe != NULL; probe = probe->dtpr_nextmod) {
 		if (probe->dtpr_ecb != NULL) {
 			mutex_exit(&dtrace_provider_lock);
+#ifdef illumos
 			mutex_exit(&mod_lock);
+#endif
 			mutex_exit(&dtrace_lock);
 
 			/*
@@ -14981,8 +17209,13 @@ dtrace_module_unloaded(modctl_t *ctl)
 			 * probe, either.
 			 */
 			if (dtrace_err_verbose) {
+#ifdef illumos
 				cmn_err(CE_WARN, "unloaded module '%s' had "
 				    "enabled probes", ctl->mod_modname);
+#else
+				cmn_err(CE_WARN, "unloaded module '%s' had "
+				    "enabled probes", modname);
+#endif
 			}
 
 			return;
@@ -15025,15 +17258,45 @@ dtrace_module_unloaded(modctl_t *ctl)
 		kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
 		kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
 		kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
+#ifdef illumos
 		vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
+#endif
+#ifdef __FreeBSD__
+		free_unr(dtrace_arena, probe->dtpr_id);
+#endif
+#ifdef __NetBSD__
+		vmem_free(dtrace_arena, (uintptr_t)probe->dtpr_id, 1);
+#endif
 		kmem_free(probe, sizeof (dtrace_probe_t));
 	}
 
 	mutex_exit(&dtrace_lock);
+#ifdef illumos
 	mutex_exit(&mod_lock);
+#endif
 	mutex_exit(&dtrace_provider_lock);
 }
 
+#ifdef __FreeBSD__
+static void
+dtrace_kld_load(void *arg __unused, linker_file_t lf)
+{
+
+	dtrace_module_loaded(lf);
+}
+
+static void
+dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
+{
+
+	if (*error != 0)
+		/* We already have an error, so don't do anything. */
+		return;
+	dtrace_module_unloaded(lf, error);
+}
+#endif
+
+#ifdef illumos
 static void
 dtrace_suspend(void)
 {
@@ -15106,7 +17369,7 @@ dtrace_cpu_setup(cpu_setup_t what, proce
 	return (0);
 }
 
-#if defined(sun)
+#ifdef illumos
 static void
 dtrace_cpu_setup_initial(processorid_t cpu)
 {
@@ -15151,10 +17414,29 @@ dtrace_toxrange_add(uintptr_t base, uint
 	dtrace_toxranges++;
 }
 
+static void
+dtrace_getf_barrier()
+{
+#ifdef illumos
+	/*
+	 * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
+	 * that contain calls to getf(), this routine will be called on every
+	 * closef() before either the underlying vnode is released or the
+	 * file_t itself is freed.  By the time we are here, it is essential
+	 * that the file_t can no longer be accessed from a call to getf()
+	 * in probe context -- that assures that a dtrace_sync() can be used
+	 * to clear out any enablings referring to the old structures.
+	 */
+	if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
+	    kcred->cr_zone->zone_dtrace_getf != 0)
+		dtrace_sync();
+#endif
+}
+
 /*
  * DTrace Driver Cookbook Functions
  */
-#if defined(sun)
+#ifdef illumos
 /*ARGSUSED*/
 static int
 dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
@@ -15266,17 +17548,6 @@ dtrace_attach(dev_info_t *devi, ddi_atta
 	mutex_exit(&cpu_lock);
 
 	/*
-	 * If DTrace helper tracing is enabled, we need to allocate the
-	 * trace buffer and initialize the values.
-	 */
-	if (dtrace_helptrace_enabled) {
-		ASSERT(dtrace_helptrace_buffer == NULL);
-		dtrace_helptrace_buffer =
-		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
-		dtrace_helptrace_next = 0;
-	}
-
-	/*
 	 * If there are already providers, we must ask them to provide their
 	 * probes, and then match any anonymous enabling against them.  Note
 	 * that there should be no other retained enablings at this time:
@@ -15323,16 +17594,7 @@ dtrace_attach(dev_info_t *devi, ddi_atta
 }
 #endif
 
-#if !defined(sun)
-#if __FreeBSD_version >= 800039
-static void
-dtrace_dtr(void *data __unused)
-{
-}
-#endif
-#endif
-
-#if !defined(sun)
+#ifdef __NetBSD__
 static dev_type_open(dtrace_open);
 
 /* Pseudo Device Entry points */
@@ -15367,11 +17629,19 @@ static const struct fileops dtrace_fileo
 };
 #endif
 
+#ifndef illumos
+static void dtrace_dtr(void *);
+#endif
+
 /*ARGSUSED*/
 static int
-#if defined(sun)
+#ifdef illumos
 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
-#else
+#endif
+#ifdef __FreeBSD__
+dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+#endif
+#ifdef __NetBSD__
 dtrace_open(dev_t dev, int flags, int mode, struct lwp *l)
 #endif
 {
@@ -15380,7 +17650,7 @@ dtrace_open(dev_t dev, int flags, int mo
 	uid_t uid;
 	zoneid_t zoneid;
 
-#if defined(sun)
+#ifdef illumos
 	if (getminor(*devp) == DTRACEMNRN_HELPER)
 		return (0);
 
@@ -15388,8 +17658,16 @@ dtrace_open(dev_t dev, int flags, int mo
 	 * If this wasn't an open with the "helper" minor, then it must be
 	 * the "dtrace" minor.
 	 */
-	ASSERT(getminor(*devp) == DTRACEMNRN_DTRACE);
-#else
+	if (getminor(*devp) == DTRACEMNRN_DTRACE)
+		return (ENXIO);
+#endif
+#ifdef __FreeBSD__
+	cred_t *cred_p = dev->si_cred;
+#endif
+#ifdef __NetBSD__
 	cred_t *cred_p = NULL;
 	struct file *fp;
 	int fd;
@@ -15397,27 +17675,6 @@ dtrace_open(dev_t dev, int flags, int mo
 
 	if ((res = fd_allocfile(&fp, &fd)) != 0)
 		return res;
-#if 0
-#if __FreeBSD_version < 800039
-	/*
-	 * The first minor device is the one that is cloned so there is
-	 * nothing more to do here.
-	 */
-	if (dev2unit(dev) == 0)
-		return 0;
-
-	/*
-	 * Devices are cloned, so if the DTrace state has already
-	 * been allocated, that means this device belongs to a
-	 * different client. Each client should open '/dev/dtrace'
-	 * to get a cloned device.
-	 */
-	if (dev->si_drv1 != NULL)
-		return (EBUSY);
-#endif
-
-	cred_p = dev->si_cred;
-#endif
 	cred_p = l->l_cred;
 #endif
 
@@ -15442,7 +17699,7 @@ dtrace_open(dev_t dev, int flags, int mo
 	dtrace_opens++;
 	dtrace_membar_producer();
 
-#if defined(sun)
+#ifdef illumos
 	/*
 	 * If the kernel debugger is active (that is, if the kernel debugger
 	 * modified text in some way), we won't allow the open.
@@ -15454,15 +17711,31 @@ dtrace_open(dev_t dev, int flags, int mo
 		return (EBUSY);
 	}
 
+	if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) {
+		/*
+		 * If DTrace helper tracing is enabled, we need to allocate the
+		 * trace buffer and initialize the values.
+		 */
+		dtrace_helptrace_buffer =
+		    kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
+		dtrace_helptrace_next = 0;
+		dtrace_helptrace_wrapped = 0;
+		dtrace_helptrace_enable = 0;
+	}
 	state = dtrace_state_create(devp, cred_p);
-#else
-	state = dtrace_state_create(dev, cred_p);
+#endif
+#ifdef __FreeBSD__
+	state = dtrace_state_create(dev, NULL);
+	devfs_set_cdevpriv(state, dtrace_dtr);
+#endif
+#ifdef __NetBSD__
+	state = dtrace_state_create(&dev, cred_p);
 #endif
 
 	mutex_exit(&cpu_lock);
 
 	if (state == NULL) {
-#if defined(sun)
+#ifdef illumos
 		if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
 			(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
 #else
@@ -15474,54 +17747,86 @@ dtrace_open(dev_t dev, int flags, int mo
 
 	mutex_exit(&dtrace_lock);
 
-#if defined(sun)
-	return (0);
-#else
+#ifdef __NetBSD__
 	return fd_clone(fp, fd, flags, &dtrace_fileops, state);
+#else
+	return (0);
 #endif
 }
 
 /*ARGSUSED*/
+#ifdef illumos
 static int
-#if defined(sun)
 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
-#else
+#endif
+#ifdef __FreeBSD__
+static void
+dtrace_dtr(void *data)
+#endif
+#ifdef __NetBSD__
+static int
 dtrace_close(struct file *fp)
 #endif
 {
-#if defined(sun)
+#ifdef illumos
 	minor_t minor = getminor(dev);
 	dtrace_state_t *state;
-
+#endif
+	dtrace_helptrace_t *buf = NULL;
+
+#ifdef illumos
 	if (minor == DTRACEMNRN_HELPER)
 		return (0);
 
 	state = ddi_get_soft_state(dtrace_softstate, minor);
-#else
+#endif
+#ifdef __FreeBSD__
+	dtrace_state_t *state = data;
+#endif
+#ifdef __NetBSD__
 	dtrace_state_t *state = (dtrace_state_t *)fp->f_data;
 #endif
 
 	mutex_enter(&cpu_lock);
 	mutex_enter(&dtrace_lock);
 
-	if (state != NULL) {
-		if (state->dts_anon) {
-			/*
-			 * There is anonymous state. Destroy that first.
-			 */
-			ASSERT(dtrace_anon.dta_state == NULL);
-			dtrace_state_destroy(state->dts_anon);
-		}
+#if defined(illumos) || defined(__NetBSD__)
+	if (state->dts_anon)
+#else
+	if (state != NULL && state->dts_anon)
+#endif
+	{
+		/*
+		 * There is anonymous state. Destroy that first.
+		 */
+		ASSERT(dtrace_anon.dta_state == NULL);
+		dtrace_state_destroy(state->dts_anon);
+	}
 
-		dtrace_state_destroy(state);
+	if (dtrace_helptrace_disable) {
+		/*
+		 * If we have been told to disable helper tracing, set the
+		 * buffer to NULL before calling into dtrace_state_destroy();
+		 * we take advantage of its dtrace_sync() to know that no
+		 * CPU is in probe context with enabled helper tracing
+		 * after it returns.
+		 */
+		buf = dtrace_helptrace_buffer;
+		dtrace_helptrace_buffer = NULL;
+	}
 
-#if !defined(sun)
-		fp->f_data = NULL;
-#endif
+#if defined(illumos) || defined(__NetBSD__)
+	dtrace_state_destroy(state);
+#else
+	if (state != NULL) {
+		dtrace_state_destroy(state);
+		kmem_free(state, 0);
 	}
+#endif
 
 	ASSERT(dtrace_opens > 0);
-#if defined(sun)
+
+#ifdef illumos
 	/*
 	 * Only relinquish control of the kernel debugger interface when there
 	 * are no consumers and no anonymous enablings.
@@ -15532,13 +17837,20 @@ dtrace_close(struct file *fp)
 	--dtrace_opens;
 #endif
 
+	if (buf != NULL) {
+		kmem_free(buf, dtrace_helptrace_bufsize);
+		dtrace_helptrace_disable = 0;
+	}
+
 	mutex_exit(&dtrace_lock);
 	mutex_exit(&cpu_lock);
 
+#if defined(illumos) || defined(__NetBSD__)
 	return (0);
+#endif
 }
 
-#if defined(sun)
+#ifdef illumos
 /*ARGSUSED*/
 static int
 dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
@@ -15582,7 +17894,7 @@ dtrace_ioctl_helper(int cmd, intptr_t ar
 
 	case DTRACEHIOC_REMOVE: {
 		mutex_enter(&dtrace_lock);
-		rval = dtrace_helper_destroygen(arg);
+		rval = dtrace_helper_destroygen(NULL, arg);
 		mutex_exit(&dtrace_lock);
 
 		return (rval);
@@ -16153,6 +18465,7 @@ dtrace_ioctl(dev_t dev, int cmd, intptr_
 			desc.dtbd_drops = buf->dtb_drops;
 			desc.dtbd_errors = buf->dtb_errors;
 			desc.dtbd_oldest = buf->dtb_xamot_offset;
+			desc.dtbd_timestamp = dtrace_gethrtime();
 
 			mutex_exit(&dtrace_lock);
 
@@ -16205,6 +18518,7 @@ dtrace_ioctl(dev_t dev, int cmd, intptr_
 		desc.dtbd_drops = buf->dtb_xamot_drops;
 		desc.dtbd_errors = buf->dtb_xamot_errors;
 		desc.dtbd_oldest = 0;
+		desc.dtbd_timestamp = buf->dtb_switched;
 
 		mutex_exit(&dtrace_lock);
 
@@ -16420,12 +18734,10 @@ dtrace_detach(dev_info_t *dip, ddi_detac
 	dtrace_modload = NULL;
 	dtrace_modunload = NULL;
 
-	mutex_exit(&cpu_lock);
+	ASSERT(dtrace_getf == 0);
+	ASSERT(dtrace_closef == NULL);
 
-	if (dtrace_helptrace_enabled) {
-		kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
-		dtrace_helptrace_buffer = NULL;
-	}
+	mutex_exit(&cpu_lock);
 
 	kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
 	dtrace_probes = NULL;
@@ -16477,7 +18789,7 @@ dtrace_detach(dev_info_t *dip, ddi_detac
 }
 #endif
 
-#if defined(sun)
+#ifdef illumos
 /*ARGSUSED*/
 static int
 dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
@@ -16500,7 +18812,7 @@ dtrace_info(dev_info_t *dip, ddi_info_cm
 }
 #endif
 
-#if defined(sun)
+#ifdef illumos
 static struct cb_ops dtrace_cb_ops = {
 	dtrace_open,		/* open */
 	dtrace_close,		/* close */
@@ -16562,62 +18874,36 @@ _fini(void)
 {
 	return (mod_remove(&modlinkage));
 }
-#else
+#endif
 
-#if 0
+#ifdef __FreeBSD__
 static d_ioctl_t	dtrace_ioctl;
+static d_ioctl_t	dtrace_ioctl_helper;
 static void		dtrace_load(void *);
 static int		dtrace_unload(void);
-#if __FreeBSD_version < 800039
-static void		dtrace_clone(void *, struct ucred *, char *, int , struct cdev **);
-static struct clonedevs	*dtrace_clones;		/* Ptr to the array of cloned devices. */
-static eventhandler_tag	eh_tag;			/* Event handler tag. */
-#else
 static struct cdev	*dtrace_dev;
-#endif
+static struct cdev	*helper_dev;
 
 void dtrace_invop_init(void);
 void dtrace_invop_uninit(void);
 
 static struct cdevsw dtrace_cdevsw = {
 	.d_version	= D_VERSION,
-	.d_flags	= D_TRACKCLOSE | D_NEEDMINOR,
-	.d_close	= dtrace_close,
 	.d_ioctl	= dtrace_ioctl,
 	.d_open		= dtrace_open,
 	.d_name		= "dtrace",
 };
-#endif
-void dtrace_invop_init(void);
-void dtrace_invop_uninit(void);
-
-static void		dtrace_load(void *);
-static int		dtrace_unload(void);
-
-#include <dtrace_anon.c>
-#include <dtrace_ioctl.c>
-#include <dtrace_load.c>
-#include <dtrace_modevent.c>
-#include <dtrace_sysctl.c>
-#include <dtrace_unload.c>
-#include <dtrace_vtime.c>
-#include <dtrace_hacks.c>
-#if defined(__i386__) || defined(__x86_64__) || defined(__arm__)
-#include <dtrace_isa.c>
-#endif
-
-MODULE(MODULE_CLASS_DRIVER, dtrace, "solaris");
 
-#if 0
-DEV_MODULE(dtrace, dtrace_modevent, NULL);
-MODULE_VERSION(dtrace, 1);
-MODULE_DEPEND(dtrace, cyclic, 1, 1, 1);
-MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
-#endif
-#endif
+static struct cdevsw helper_cdevsw = {
+	.d_version	= D_VERSION,
+	.d_ioctl	= dtrace_ioctl_helper,
+	.d_name		= "helper",
+};
+#endif /* __FreeBSD__ */
 
-#if !defined(sun)
-#undef mutex_init
+#ifdef __NetBSD__
+void dtrace_invop_init(void);
+void dtrace_invop_uninit(void);
 
 struct dtrace_state_worker {
 	kmutex_t lock;
@@ -16657,7 +18943,7 @@ dtrace_state_worker_add(void (*fn)(dtrac
 	int error __diagused;
 
 	w = kmem_alloc(sizeof(*w), KM_SLEEP);
-	mutex_init(&w->lock, MUTEX_DEFAULT, IPL_NONE);
+	mutex_init(&w->lock, "dtrace", MUTEX_DEFAULT, NULL);
 	cv_init(&w->cv, "dtrace");
 	w->interval = ((uintmax_t)hz * interval) / NANOSEC;
 	w->fn = fn;
@@ -16685,4 +18971,28 @@ dtrace_state_worker_remove(struct dtrace
 	mutex_destroy(&w->lock);
 	kmem_free(w, sizeof(*w));
 }
-#endif
+
+#endif /* __NetBSD__ */
+
+static void		dtrace_load(void *);
+static int		dtrace_unload(void);
+
+#include <dtrace_anon.c>
+#include <dtrace_ioctl.c>
+#include <dtrace_load.c>
+#include <dtrace_modevent.c>
+#include <dtrace_sysctl.c>
+#include <dtrace_unload.c>
+#include <dtrace_vtime.c>
+#include <dtrace_hacks.c>
+#include <dtrace_isa.c>
+
+#ifdef __FreeBSD__
+DEV_MODULE(dtrace, dtrace_modevent, NULL);
+MODULE_VERSION(dtrace, 1);
+MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
+#endif /* __FreeBSD__ */
+
+#ifdef __NetBSD__
+MODULE(MODULE_CLASS_DRIVER, dtrace, "solaris");
+#endif /* __NetBSD__ */
Index: src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.c
diff -N src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.c	2 Mar 2017 10:54:24 -0000
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 2016 (Graeme Jenkinson)
+ * All rights reserved.
+ *
+ * This software was developed by BAE Systems, the University of Cambridge
+ * Computer Laboratory, and Memorial University under DARPA/AFRL contract
+ * FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing
+ * (TC) research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/types.h>
+
+#include "dtrace_xoroshiro128_plus.h"
+
+static __inline uint64_t
+rotl(const uint64_t x, int k)
+{
+	return (x << k) | (x >> (64 - k));
+}
+
+/*
+ * This is the jump function for the generator. It is equivalent to 2^64 calls
+ * to next(); it can be used to generate 2^64 non-overlapping subsequences for
+ * parallel computations.
+ */
+void
+dtrace_xoroshiro128_plus_jump(uint64_t * const state,
+	uint64_t * const jump_state)
+{
+	static const uint64_t JUMP[] = { 0xbeac0467eba5facb,
+		0xd86b048b86aa9922 };
+
+	uint64_t s0 = 0;
+	uint64_t s1 = 0;
+	int i = 0;
+	int b = 0;
+	for (i = 0; i < sizeof JUMP / sizeof *JUMP; i++) {
+		for (b = 0; b < 64; b++) {
+			if (JUMP[i] & 1ULL << b) {
+				s0 ^= state[0];
+				s1 ^= state[1];
+			}
+			dtrace_xoroshiro128_plus_next(state);
+		}
+	}
+	jump_state[0] = s0;
+	jump_state[1] = s1;
+}
+
+/*
+ * xoroshiro128+ - XOR/rotate/shift/rotate
+ * xorshift.di.unimi.it
+ */
+uint64_t
+dtrace_xoroshiro128_plus_next(uint64_t * const state)
+{
+	const uint64_t s0 = state[0];
+	uint64_t s1 = state[1];
+	uint64_t result;
+	result = s0 + s1;
+
+	s1 ^= s0;
+	state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14);
+	state[1] = rotl(s1, 36);
+
+	return result;
+}
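
A hedged usage sketch of the two entry points above, mirroring how
dtrace_state_create() seeds per-CPU streams: seed state 0, then derive each
subsequent state with the jump function so the 2^64-step subsequences cannot
overlap.  The kernel seeds from read_random(); the fixed constants here are
illustrative and must simply not be all zero:

	#include <stdint.h>
	#include <stdio.h>

	#include "dtrace_xoroshiro128_plus.h"

	#define	NCPU_DEMO	4

	int
	main(void)
	{
		uint64_t rstate[NCPU_DEMO][2];
		int cpu;

		rstate[0][0] = 0x0123456789abcdefULL;	/* kernel: read_random() */
		rstate[0][1] = 0xfedcba9876543210ULL;

		/* Each jump advances 2^64 steps: per-CPU streams are disjoint. */
		for (cpu = 1; cpu < NCPU_DEMO; cpu++)
			dtrace_xoroshiro128_plus_jump(rstate[cpu - 1],
			    rstate[cpu]);

		for (cpu = 0; cpu < NCPU_DEMO; cpu++)
			printf("cpu%d: %016jx\n", cpu,
			    (uintmax_t)dtrace_xoroshiro128_plus_next(rstate[cpu]));
		return (0);
	}
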
Index: src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.h
diff -N src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.h	2 Mar 2017 10:54:24 -0000
@@ -0,0 +1,41 @@
+/*-
+ * Copyright (c) 2016 (Graeme Jenkinson)
+ * All rights reserved.
+ *
+ * This software was developed by BAE Systems, the University of Cambridge
+ * Computer Laboratory, and Memorial University under DARPA/AFRL contract
+ * FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing
+ * (TC) research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#ifndef _DTRACE_XOROSHIRO128_PLUS_H
+#define _DTRACE_XOROSHIRO128_PLUS_H
+
+#include <sys/types.h>
+
+void dtrace_xoroshiro128_plus_jump(uint64_t * const, uint64_t * const);
+uint64_t dtrace_xoroshiro128_plus_next(uint64_t * const);
+
+#endif
Index: src/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c,v
retrieving revision 1.4
diff -u -p -r1.4 fasttrap.c
--- src/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c	27 Mar 2014 15:50:48 -0000	1.4
+++ src/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c	2 Mar 2017 10:54:24 -0000
@@ -17,14 +17,20 @@
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
+ *
+ * Portions Copyright 2010 The FreeBSD Foundation
+ *
+ * $FreeBSD: head/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c 313758 2017-02-15 06:07:01Z markj $
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+/*
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ */
 
 #include <sys/atomic.h>
 #include <sys/errno.h>
@@ -32,11 +38,15 @@
 #include <sys/modctl.h>
 #include <sys/conf.h>
 #include <sys/systm.h>
+#ifdef illumos
 #include <sys/ddi.h>
+#endif
 #include <sys/sunddi.h>
 #include <sys/cpuvar.h>
 #include <sys/kmem.h>
+#ifdef illumos
 #include <sys/strsubr.h>
+#endif
 #include <sys/fasttrap.h>
 #include <sys/fasttrap_impl.h>
 #include <sys/fasttrap_isa.h>
@@ -44,9 +54,28 @@
 #include <sys/dtrace_impl.h>
 #include <sys/sysmacros.h>
 #include <sys/proc.h>
-#include <sys/priv.h>
 #include <sys/policy.h>
+#ifdef illumos
 #include <util/qsort.h>
+#endif
+#include <sys/mutex.h>
+#include <sys/kernel.h>
+#ifndef illumos
+#include <sys/dtrace_bsd.h>
+#include <sys/eventhandler.h>
+#include <sys/rmlock.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/u8_textprep.h>
+#include <sys/user.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_param.h>
+
+#include <cddl/dev/dtrace/dtrace_cddl.h>
+#endif
 
 /*
  * User-Land Trap-Based Tracing
@@ -125,12 +154,21 @@
  *	never hold the provider lock and creation lock simultaneously
  */
 
-static dev_info_t *fasttrap_devi;
+static d_open_t fasttrap_open;
+static d_ioctl_t fasttrap_ioctl;
+
+static struct cdevsw fasttrap_cdevsw = {
+	.d_version	= D_VERSION,
+	.d_open		= fasttrap_open,
+	.d_ioctl	= fasttrap_ioctl,
+	.d_name		= "fasttrap",
+};
+static struct cdev *fasttrap_cdev;
 static dtrace_meta_provider_id_t fasttrap_meta_id;
 
-static timeout_id_t fasttrap_timeout;
-static kmutex_t fasttrap_cleanup_mtx;
-static uint_t fasttrap_cleanup_work;
+static struct proc *fasttrap_cleanup_proc;
+static struct mtx fasttrap_cleanup_mtx;
+static uint_t fasttrap_cleanup_work, fasttrap_cleanup_drain, fasttrap_cleanup_cv;
 
 /*
  * Generation count on modifications to the global tracepoint lookup table.
@@ -139,15 +177,19 @@ static volatile uint64_t fasttrap_mod_ge
 
 /*
  * When the fasttrap provider is loaded, fasttrap_max is set to either
- * FASTTRAP_MAX_DEFAULT or the value for fasttrap-max-probes in the
- * fasttrap.conf file. Each time a probe is created, fasttrap_total is
- * incremented by the number of tracepoints that may be associated with that
- * probe; fasttrap_total is capped at fasttrap_max.
+ * FASTTRAP_MAX_DEFAULT, or the value for fasttrap-max-probes in the
+ * fasttrap.conf file (illumos), or the value provided in loader.conf
+ * (FreeBSD).  Each time a probe is created, fasttrap_total is incremented
+ * by the number of tracepoints that may be associated with that probe;
+ * fasttrap_total is capped at fasttrap_max.
  */
 #define	FASTTRAP_MAX_DEFAULT		250000
-static uint32_t fasttrap_max;
+static uint32_t fasttrap_max = FASTTRAP_MAX_DEFAULT;
 static uint32_t fasttrap_total;
 
+/*
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ */
 
 #define	FASTTRAP_TPOINTS_DEFAULT_SIZE	0x4000
 #define	FASTTRAP_PROVIDERS_DEFAULT_SIZE	0x100
@@ -176,11 +218,31 @@ static void fasttrap_provider_free(fastt
 static fasttrap_proc_t *fasttrap_proc_lookup(pid_t);
 static void fasttrap_proc_release(fasttrap_proc_t *);
 
+#ifndef illumos
+static void fasttrap_thread_dtor(void *, struct thread *);
+#endif
+
 #define	FASTTRAP_PROVS_INDEX(pid, name) \
 	((fasttrap_hash_str(name) + (pid)) & fasttrap_provs.fth_mask)
 
 #define	FASTTRAP_PROCS_INDEX(pid) ((pid) & fasttrap_procs.fth_mask)
 
+#ifndef illumos
+struct rmlock fasttrap_tp_lock;
+static eventhandler_tag fasttrap_thread_dtor_tag;
+#endif
+
+static unsigned long tpoints_hash_size = FASTTRAP_TPOINTS_DEFAULT_SIZE;
+
+#ifdef __FreeBSD__
+SYSCTL_DECL(_kern_dtrace);
+SYSCTL_NODE(_kern_dtrace, OID_AUTO, fasttrap, CTLFLAG_RD, 0, "DTrace fasttrap parameters");
+SYSCTL_UINT(_kern_dtrace_fasttrap, OID_AUTO, max_probes, CTLFLAG_RWTUN, &fasttrap_max,
+    FASTTRAP_MAX_DEFAULT, "Maximum number of fasttrap probes");
+SYSCTL_ULONG(_kern_dtrace_fasttrap, OID_AUTO, tpoints_hash_size, CTLFLAG_RDTUN, &tpoints_hash_size,
+    FASTTRAP_TPOINTS_DEFAULT_SIZE, "Size of the tracepoint hash table");
+#endif
+
 static int
 fasttrap_highbit(ulong_t i)
 {
@@ -229,6 +291,7 @@ fasttrap_hash_str(const char *p)
 void
 fasttrap_sigtrap(proc_t *p, kthread_t *t, uintptr_t pc)
 {
+#ifdef illumos
 	sigqueue_t *sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
 
 	sqp->sq_info.si_signo = SIGTRAP;
@@ -241,7 +304,130 @@ fasttrap_sigtrap(proc_t *p, kthread_t *t
 
 	if (t != NULL)
 		aston(t);
+#else
+	ksiginfo_t *ksi = kmem_zalloc(sizeof (ksiginfo_t), KM_SLEEP);
+
+	ksiginfo_init(ksi);
+	ksi->ksi_signo = SIGTRAP;
+	ksi->ksi_code = TRAP_DTRACE;
+	ksi->ksi_addr = (caddr_t)pc;
+	PROC_LOCK(p);
+	(void) tdsendsignal(p, t, SIGTRAP, ksi);
+	PROC_UNLOCK(p);
+#endif
+}
+
+#ifndef illumos
+/*
+ * Obtain a chunk of scratch space in the address space of the target process.
+ */
+fasttrap_scrspace_t *
+fasttrap_scraddr(struct thread *td, fasttrap_proc_t *fprc)
+{
+	fasttrap_scrblock_t *scrblk;
+	fasttrap_scrspace_t *scrspc;
+	struct proc *p;
+	vm_offset_t addr;
+	int error, i;
+
+	scrspc = NULL;
+	if (td->t_dtrace_sscr != NULL) {
+		/* If the thread already has scratch space, we're done. */
+		scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr;
+		return (scrspc);
+	}
+
+	p = td->td_proc;
+
+	mutex_enter(&fprc->ftpc_mtx);
+	if (LIST_EMPTY(&fprc->ftpc_fscr)) {
+		/*
+		 * No scratch space is available, so we'll map a new scratch
+		 * space block into the traced process' address space.
+		 */
+		addr = 0;
+		error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr,
+		    FASTTRAP_SCRBLOCK_SIZE, 0, VMFS_ANY_SPACE, VM_PROT_ALL,
+		    VM_PROT_ALL, 0);
+		if (error != KERN_SUCCESS)
+			goto done;
+
+		scrblk = malloc(sizeof(*scrblk), M_SOLARIS, M_WAITOK);
+		scrblk->ftsb_addr = addr;
+		LIST_INSERT_HEAD(&fprc->ftpc_scrblks, scrblk, ftsb_next);
+
+		/*
+		 * Carve the block up into chunks and put them on the free list.
+		 */
+		for (i = 0;
+		    i < FASTTRAP_SCRBLOCK_SIZE / FASTTRAP_SCRSPACE_SIZE; i++) {
+			scrspc = malloc(sizeof(*scrspc), M_SOLARIS, M_WAITOK);
+			scrspc->ftss_addr = addr +
+			    i * FASTTRAP_SCRSPACE_SIZE;
+			LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc,
+			    ftss_next);
+		}
+	}
+
+	/*
+	 * Take the first scratch chunk off the free list, put it on the
+	 * allocated list, and return its address.
+	 */
+	scrspc = LIST_FIRST(&fprc->ftpc_fscr);
+	LIST_REMOVE(scrspc, ftss_next);
+	LIST_INSERT_HEAD(&fprc->ftpc_ascr, scrspc, ftss_next);
+
+	/*
+	 * This scratch space is reserved for use by td until the thread exits.
+	 */
+	td->t_dtrace_sscr = scrspc;
+
+done:
+	mutex_exit(&fprc->ftpc_mtx);
+
+	return (scrspc);
+}
+
+/*
+ * Return any allocated per-thread scratch space chunks back to the process'
+ * free list.
+ */
+static void
+fasttrap_thread_dtor(void *arg __unused, struct thread *td)
+{
+	fasttrap_bucket_t *bucket;
+	fasttrap_proc_t *fprc;
+	fasttrap_scrspace_t *scrspc;
+	pid_t pid;
+
+	if (td->t_dtrace_sscr == NULL)
+		return;
+
+	pid = td->td_proc->p_pid;
+	bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
+	fprc = NULL;
+
+	/* Look up the fasttrap process handle for this process. */
+	mutex_enter(&bucket->ftb_mtx);
+	for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) {
+		if (fprc->ftpc_pid == pid) {
+			mutex_enter(&fprc->ftpc_mtx);
+			mutex_exit(&bucket->ftb_mtx);
+			break;
+		}
+	}
+	if (fprc == NULL) {
+		mutex_exit(&bucket->ftb_mtx);
+		return;
+	}
+
+	scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr;
+	LIST_REMOVE(scrspc, ftss_next);
+	LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc, ftss_next);
+
+	mutex_exit(&fprc->ftpc_mtx);
 }
+#endif
 
 /*
  * This function ensures that no threads are actively using the memory
@@ -257,15 +443,23 @@ fasttrap_mod_barrier(uint64_t gen)
 
 	fasttrap_mod_gen++;
 
-	for (i = 0; i < NCPU; i++) {
-		mutex_enter(&cpu_core[i].cpuc_pid_lock);
-		mutex_exit(&cpu_core[i].cpuc_pid_lock);
+#ifdef illumos
+	CPU_FOREACH(i) {
+		mutex_enter(&fasttrap_cpuc_pid_lock[i]);
+		mutex_exit(&fasttrap_cpuc_pid_lock[i]);
 	}
+#else
+	rm_wlock(&fasttrap_tp_lock);
+	rm_wunlock(&fasttrap_tp_lock);
+#endif
 }
 
 /*
- * This is the timeout's callback for cleaning up the providers and their
- * probes.
+ * This function performs asynchronous cleanup of fasttrap providers. The
+ * Solaris implementation of this mechanism uses a timeout that's activated in
+ * fasttrap_pid_cleanup(), but this doesn't work in FreeBSD: one may sleep while
+ * holding the DTrace mutexes, but it is unsafe to sleep in a callout handler.
+ * Thus we use a dedicated process to perform the cleanup when requested.
  */
 /*ARGSUSED*/
 static void
@@ -274,16 +468,12 @@ fasttrap_pid_cleanup_cb(void *data)
 	fasttrap_provider_t **fpp, *fp;
 	fasttrap_bucket_t *bucket;
 	dtrace_provider_id_t provid;
-	int i, later;
-
-	static volatile int in = 0;
-	ASSERT(in == 0);
-	in = 1;
+	int i, later = 0, rval;
 
-	mutex_enter(&fasttrap_cleanup_mtx);
-	while (fasttrap_cleanup_work) {
+	mtx_lock(&fasttrap_cleanup_mtx);
+	while (!fasttrap_cleanup_drain || later > 0) {
 		fasttrap_cleanup_work = 0;
-		mutex_exit(&fasttrap_cleanup_mtx);
+		mtx_unlock(&fasttrap_cleanup_mtx);
 
 		later = 0;
 
@@ -336,9 +526,13 @@ fasttrap_pid_cleanup_cb(void *data)
 				 * clean out the unenabled probes.
 				 */
 				provid = fp->ftp_provid;
-				if (dtrace_unregister(provid) != 0) {
+				if ((rval = dtrace_unregister(provid)) != 0) {
 					if (fasttrap_total > fasttrap_max / 2)
 						(void) dtrace_condense(provid);
+
+					if (rval == EAGAIN)
+						fp->ftp_marked = 1;
+
 					later += fp->ftp_marked;
 					fpp = &fp->ftp_next;
 				} else {
@@ -348,31 +542,32 @@ fasttrap_pid_cleanup_cb(void *data)
 			}
 			mutex_exit(&bucket->ftb_mtx);
 		}
+		mtx_lock(&fasttrap_cleanup_mtx);
 
-		mutex_enter(&fasttrap_cleanup_mtx);
+		/*
+		 * If we were unable to retire a provider, try again after a
+		 * second. This situation can occur in certain circumstances
+		 * where providers cannot be unregistered even though they have
+		 * no probes enabled because of an execution of dtrace -l or
+		 * something similar.
+		 */
+		if (later > 0 || fasttrap_cleanup_work ||
+		    fasttrap_cleanup_drain) {
+			mtx_unlock(&fasttrap_cleanup_mtx);
+			pause("ftclean", hz);
+			mtx_lock(&fasttrap_cleanup_mtx);
+		} else
+			mtx_sleep(&fasttrap_cleanup_cv, &fasttrap_cleanup_mtx,
+			    0, "ftcl", 0);
 	}
 
-	ASSERT(fasttrap_timeout != 0);
-
 	/*
-	 * If we were unable to remove a retired provider, try again after
-	 * a second. This situation can occur in certain circumstances where
-	 * providers cannot be unregistered even though they have no probes
-	 * enabled because of an execution of dtrace -l or something similar.
-	 * If the timeout has been disabled (set to 1 because we're trying
-	 * to detach), we set fasttrap_cleanup_work to ensure that we'll
-	 * get a chance to do that work if and when the timeout is reenabled
-	 * (if detach fails).
+	 * Wake up the thread in fasttrap_unload() now that we're done.
 	 */
-	if (later > 0 && fasttrap_timeout != (timeout_id_t)1)
-		fasttrap_timeout = timeout(&fasttrap_pid_cleanup_cb, NULL, hz);
-	else if (later > 0)
-		fasttrap_cleanup_work = 1;
-	else
-		fasttrap_timeout = 0;
+	wakeup(&fasttrap_cleanup_drain);
+	mtx_unlock(&fasttrap_cleanup_mtx);
 
-	mutex_exit(&fasttrap_cleanup_mtx);
-	in = 0;
+	kthread_exit();
 }
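/*
 * The cleanup kproc above is a standard worker-thread shape: sleep until
 * there is work, loop while work or retries remain, and exit only after a
 * drain flag is raised.  A self-contained pthread analogy (names are
 * illustrative; the kernel version uses mtx_sleep/wakeup and pause() for
 * the retry delay):
 */
#include <pthread.h>

static pthread_mutex_t clean_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t clean_cv = PTHREAD_COND_INITIALIZER;
static int clean_work, clean_drain;

static void *
cleaner(void *arg)
{
	pthread_mutex_lock(&clean_mtx);
	while (!clean_drain) {
		while (!clean_work && !clean_drain)
			pthread_cond_wait(&clean_cv, &clean_mtx);
		clean_work = 0;
		pthread_mutex_unlock(&clean_mtx);
		/* ... retire providers; safe to sleep here ... */
		pthread_mutex_lock(&clean_mtx);
	}
	pthread_mutex_unlock(&clean_mtx);
	return (NULL);
}

/* Request asynchronous cleanup, as fasttrap_pid_cleanup() does below. */
static void
request_cleanup(void)
{
	pthread_mutex_lock(&clean_mtx);
	if (!clean_work) {
		clean_work = 1;
		pthread_cond_signal(&clean_cv);
	}
	pthread_mutex_unlock(&clean_mtx);
}

/* Drain on unload: raise the flag and wait for the worker to exit. */
static void
drain_cleanup(pthread_t worker)
{
	pthread_mutex_lock(&clean_mtx);
	clean_drain = 1;
	pthread_cond_signal(&clean_cv);
	pthread_mutex_unlock(&clean_mtx);
	pthread_join(worker, NULL);
}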
 
 /*
@@ -381,11 +576,13 @@ fasttrap_pid_cleanup_cb(void *data)
 static void
 fasttrap_pid_cleanup(void)
 {
-	mutex_enter(&fasttrap_cleanup_mtx);
-	fasttrap_cleanup_work = 1;
-	if (fasttrap_timeout == 0)
-		fasttrap_timeout = timeout(&fasttrap_pid_cleanup_cb, NULL, 1);
-	mutex_exit(&fasttrap_cleanup_mtx);
+
+	mtx_lock(&fasttrap_cleanup_mtx);
+	if (!fasttrap_cleanup_work) {
+		fasttrap_cleanup_work = 1;
+		wakeup(&fasttrap_cleanup_cv);
+	}
+	mtx_unlock(&fasttrap_cleanup_mtx);
 }
 
 /*
@@ -397,12 +594,42 @@ fasttrap_pid_cleanup(void)
 static void
 fasttrap_fork(proc_t *p, proc_t *cp)
 {
+#ifndef illumos
+	fasttrap_scrblock_t *scrblk;
+	fasttrap_proc_t *fprc = NULL;
+#endif
 	pid_t ppid = p->p_pid;
 	int i;
 
+#ifdef illumos
 	ASSERT(curproc == p);
 	ASSERT(p->p_proc_flag & P_PR_LOCK);
+#else
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+#endif
+#ifdef illumos
 	ASSERT(p->p_dtrace_count > 0);
+#else
+	if (p->p_dtrace_helpers) {
+		/*
+		 * dtrace_helpers_duplicate() allocates memory.
+		 */
+		_PHOLD(cp);
+		PROC_UNLOCK(p);
+		PROC_UNLOCK(cp);
+		dtrace_helpers_duplicate(p, cp);
+		PROC_LOCK(cp);
+		PROC_LOCK(p);
+		_PRELE(cp);
+	}
+	/*
+	 * This check is purposely here instead of in kern_fork.c because,
+	 * for legal reasons, we cannot include the dtrace_cddl.h header
+	 * inside kern_fork.c and insert the if-clause there.
+	 */
+	if (p->p_dtrace_count == 0)
+		return;
+#endif
 	ASSERT(cp->p_dtrace_count == 0);
 
 	/*
@@ -419,9 +646,19 @@ fasttrap_fork(proc_t *p, proc_t *cp)
 	 * We don't have to worry about the child process disappearing
 	 * because we're in fork().
 	 */
-	mutex_enter(&cp->p_lock);
+#ifdef illumos
+	mtx_lock_spin(&cp->p_slock);
 	sprlock_proc(cp);
-	mutex_exit(&cp->p_lock);
+	mtx_unlock_spin(&cp->p_slock);
+#else
+	/*
+	 * fasttrap_tracepoint_remove() expects the child process to be
+	 * unlocked and the VM then expects curproc to be unlocked.
+	 */
+	_PHOLD(cp);
+	PROC_UNLOCK(cp);
+	PROC_UNLOCK(p);
+#endif
 
 	/*
 	 * Iterate over every tracepoint looking for ones that belong to the
@@ -446,13 +683,38 @@ fasttrap_fork(proc_t *p, proc_t *cp)
 				 * mid-fork.
 				 */
 				ASSERT(tp->ftt_proc->ftpc_acount != 0);
+#ifndef illumos
+				fprc = tp->ftt_proc;
+#endif
 			}
 		}
 		mutex_exit(&bucket->ftb_mtx);
+
+#ifndef illumos
+		/*
+		 * Unmap any scratch space inherited from the parent's address
+		 * space.
+		 */
+		if (fprc != NULL) {
+			mutex_enter(&fprc->ftpc_mtx);
+			LIST_FOREACH(scrblk, &fprc->ftpc_scrblks, ftsb_next) {
+				vm_map_remove(&cp->p_vmspace->vm_map,
+				    scrblk->ftsb_addr,
+				    scrblk->ftsb_addr + FASTTRAP_SCRBLOCK_SIZE);
+			}
+			mutex_exit(&fprc->ftpc_mtx);
+		}
+#endif
 	}
 
+#ifdef illumos
 	mutex_enter(&cp->p_lock);
 	sprunlock(cp);
+#else
+	PROC_LOCK(p);
+	PROC_LOCK(cp);
+	_PRELE(cp);
+#endif
 }
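/*
 * The _PHOLD/PROC_UNLOCK sequence above is the canonical way to call code
 * that may sleep while still guaranteeing the process cannot be reaped:
 * take a hold (reference) under the lock, drop the lock for the sleeping
 * work, then reacquire the lock and release the hold.  Generic sketch with
 * an assumed refcounted object (not a kernel API):
 */
#include <pthread.h>

struct obj {
	pthread_mutex_t lock;	/* assumed initialized elsewhere */
	int holds;		/* object is not freed while holds > 0 */
};

static void
do_sleeping_work_on(struct obj *o)
{
	pthread_mutex_lock(&o->lock);
	o->holds++;				/* _PHOLD */
	pthread_mutex_unlock(&o->lock);		/* PROC_UNLOCK */

	/* ... work that may sleep or acquire other locks ... */

	pthread_mutex_lock(&o->lock);		/* PROC_LOCK */
	o->holds--;				/* _PRELE */
	pthread_mutex_unlock(&o->lock);
}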
 
 /*
@@ -463,24 +725,42 @@ fasttrap_fork(proc_t *p, proc_t *cp)
 static void
 fasttrap_exec_exit(proc_t *p)
 {
-	ASSERT(p == curproc);
-	ASSERT(MUTEX_HELD(&p->p_lock));
+#ifndef illumos
+	struct thread *td;
+#endif
 
-	mutex_exit(&p->p_lock);
+#ifdef illumos
+	ASSERT(p == curproc);
+#else
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	_PHOLD(p);
+	/*
+	 * Since struct threads may be recycled, we cannot rely on the
+	 * t_dtrace_sscr field being zeroed by kdtrace_thread_ctor. Thus we
+	 * must zero it ourselves when a process exits.
+	 */
+	FOREACH_THREAD_IN_PROC(p, td)
+		td->t_dtrace_sscr = NULL;
+	PROC_UNLOCK(p);
+#endif
 
 	/*
 	 * We clean up the pid provider for this process here; user-land
 	 * static probes are handled by the meta-provider remove entry point.
 	 */
 	fasttrap_provider_retire(p->p_pid, FASTTRAP_PID_NAME, 0);
-
-	mutex_enter(&p->p_lock);
+#ifndef illumos
+	if (p->p_dtrace_helpers)
+		dtrace_helpers_destroy(p);
+	PROC_LOCK(p);
+	_PRELE(p);
+#endif
 }
 
 
 /*ARGSUSED*/
 static void
-fasttrap_pid_provide(void *arg, const dtrace_probedesc_t *desc)
+fasttrap_pid_provide(void *arg, dtrace_probedesc_t *desc)
 {
 	/*
 	 * There are no "default" pid probes.
@@ -504,7 +784,9 @@ fasttrap_tracepoint_enable(proc_t *p, fa
 
 	ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid);
 
+#ifdef illumos
 	ASSERT(!(p->p_flag & SVFORK));
+#endif
 
 	/*
 	 * Before we make any modifications, make sure we've imposed a barrier
@@ -610,7 +892,9 @@ again:
 		 * Increment the count of the number of tracepoints active in
 		 * the victim process.
 		 */
+#ifdef illumos
 		ASSERT(p->p_proc_flag & P_PR_LOCK);
+#endif
 		p->p_dtrace_count++;
 
 		return (rc);
@@ -647,6 +931,13 @@ again:
 		ASSERT(0);
 	}
 
+#ifdef __FreeBSD__
+	if (SV_PROC_FLAG(p, SV_LP64))
+		p->p_model = DATAMODEL_LP64;
+	else
+		p->p_model = DATAMODEL_ILP32;
+#endif
+
 	/*
 	 * If the ISA-dependent initialization goes to plan, go back to the
 	 * beginning and try to install this freshly made tracepoint.
@@ -666,7 +957,7 @@ fasttrap_tracepoint_disable(proc_t *p, f
 	fasttrap_bucket_t *bucket;
 	fasttrap_provider_t *provider = probe->ftp_prov;
 	fasttrap_tracepoint_t **pp, *tp;
-	fasttrap_id_t *id, **idp;
+	fasttrap_id_t *id, **idp = NULL;
 	pid_t pid;
 	uintptr_t pc;
 
@@ -800,7 +1091,9 @@ fasttrap_tracepoint_disable(proc_t *p, f
 		 * Decrement the count of the number of tracepoints active
 		 * in the victim process.
 		 */
+#ifdef illumos
 		ASSERT(p->p_proc_flag & P_PR_LOCK);
+#endif
 		p->p_dtrace_count--;
 	}
 
@@ -851,42 +1144,49 @@ fasttrap_enable_callbacks(void)
 static void
 fasttrap_disable_callbacks(void)
 {
+#ifdef illumos
 	ASSERT(MUTEX_HELD(&cpu_lock));
+#endif
+
 
 	mutex_enter(&fasttrap_count_mtx);
 	ASSERT(fasttrap_pid_count > 0);
 	fasttrap_pid_count--;
 	if (fasttrap_pid_count == 0) {
+#ifdef illumos
 		cpu_t *cur, *cpu = CPU;
 
 		for (cur = cpu->cpu_next_onln; cur != cpu;
 		    cur = cur->cpu_next_onln) {
 			rw_enter(&cur->cpu_ft_lock, RW_WRITER);
 		}
-
+#endif
 		dtrace_pid_probe_ptr = NULL;
 		dtrace_return_probe_ptr = NULL;
-
+#ifdef illumos
 		for (cur = cpu->cpu_next_onln; cur != cpu;
 		    cur = cur->cpu_next_onln) {
 			rw_exit(&cur->cpu_ft_lock);
 		}
+#endif
 	}
 	mutex_exit(&fasttrap_count_mtx);
 }
 
 /*ARGSUSED*/
-static int
+static void
 fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
 {
 	fasttrap_probe_t *probe = parg;
-	proc_t *p;
+	proc_t *p = NULL;
 	int i, rc;
 
 	ASSERT(probe != NULL);
 	ASSERT(!probe->ftp_enabled);
 	ASSERT(id == probe->ftp_id);
+#ifdef illumos
 	ASSERT(MUTEX_HELD(&cpu_lock));
+#endif
 
 	/*
 	 * Increment the count of enabled probes on this probe's provider;
@@ -904,25 +1204,36 @@ fasttrap_pid_enable(void *arg, dtrace_id
 	 * provider can't go away while we're in this code path.
 	 */
 	if (probe->ftp_prov->ftp_retired)
-		return (0);
+		return;
 
 	/*
 	 * If we can't find the process, it may be that we're in the context of
 	 * a fork in which the traced process is being born and we're copying
 	 * USDT probes. Otherwise, the process is gone so bail.
 	 */
+#ifdef illumos
 	if ((p = sprlock(probe->ftp_pid)) == NULL) {
 		if ((curproc->p_flag & SFORKING) == 0)
-			return (0);
+			return;
 
 		mutex_enter(&pidlock);
 		p = prfind(probe->ftp_pid);
 
+		if (p == NULL) {
+			/*
+			 * So it's not that the target process is being born,
+			 * it's that it isn't there at all (and we simply
+			 * happen to be forking).  Anyway, we know that the
+			 * target is definitely gone, so bail out.
+			 */
+			mutex_exit(&pidlock);
+			return;
+		}
+
 		/*
 		 * Confirm that curproc is indeed forking the process in which
 		 * we're trying to enable probes.
 		 */
-		ASSERT(p != NULL);
 		ASSERT(p->p_parent == curproc);
 		ASSERT(p->p_stat == SIDL);
 
@@ -934,6 +1245,10 @@ fasttrap_pid_enable(void *arg, dtrace_id
 
 	ASSERT(!(p->p_flag & SVFORK));
 	mutex_exit(&p->p_lock);
+#else
+	if (pget(probe->ftp_pid, PGET_HOLD | PGET_NOTWEXIT, &p) != 0)
+		return;
+#endif
 
 	/*
 	 * We have to enable the trap entry point before any user threads have
@@ -967,23 +1282,29 @@ fasttrap_pid_enable(void *arg, dtrace_id
 				i--;
 			}
 
+#ifdef illumos
 			mutex_enter(&p->p_lock);
 			sprunlock(p);
+#else
+			PRELE(p);
+#endif
 
 			/*
 			 * Since we're not actually enabling this probe,
 			 * drop our reference on the trap table entry.
 			 */
 			fasttrap_disable_callbacks();
-			return (0);
+			return;
 		}
 	}
-
+#ifdef illumos
 	mutex_enter(&p->p_lock);
 	sprunlock(p);
+#else
+	PRELE(p);
+#endif
 
 	probe->ftp_enabled = 1;
-	return (0);
 }
 
 /*ARGSUSED*/
@@ -997,18 +1318,16 @@ fasttrap_pid_disable(void *arg, dtrace_i
 
 	ASSERT(id == probe->ftp_id);
 
+	mutex_enter(&provider->ftp_mtx);
+
 	/*
 	 * We won't be able to acquire a /proc-esque lock on the process
 	 * iff the process is dead and gone. In this case, we rely on the
 	 * provider lock as a point of mutual exclusion to prevent other
 	 * DTrace consumers from disabling this probe.
 	 */
-	if ((p = sprlock(probe->ftp_pid)) != NULL) {
-		ASSERT(!(p->p_flag & SVFORK));
-		mutex_exit(&p->p_lock);
-	}
-
-	mutex_enter(&provider->ftp_mtx);
+	if (pget(probe->ftp_pid, PGET_HOLD | PGET_NOTWEXIT, &p) != 0)
+		p = NULL;
 
 	/*
 	 * Disable all the associated tracepoints (for fully enabled probes).
@@ -1031,9 +1350,6 @@ fasttrap_pid_disable(void *arg, dtrace_i
 		if (provider->ftp_retired && !provider->ftp_marked)
 			whack = provider->ftp_marked = 1;
 		mutex_exit(&provider->ftp_mtx);
-
-		mutex_enter(&p->p_lock);
-		sprunlock(p);
 	} else {
 		/*
 		 * If the process is dead, we're just waiting for the
@@ -1047,12 +1363,18 @@ fasttrap_pid_disable(void *arg, dtrace_i
 	if (whack)
 		fasttrap_pid_cleanup();
 
+#ifdef __FreeBSD__
+	if (p != NULL)
+		PRELE(p);
+#endif
 	if (!probe->ftp_enabled)
 		return;
 
 	probe->ftp_enabled = 0;
 
+#ifdef illumos
 	ASSERT(MUTEX_HELD(&cpu_lock));
+#endif
 	fasttrap_disable_callbacks();
 }
 
@@ -1164,6 +1486,7 @@ fasttrap_proc_lookup(pid_t pid)
 	fasttrap_bucket_t *bucket;
 	fasttrap_proc_t *fprc, *new_fprc;
 
+
 	bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
 	mutex_enter(&bucket->ftb_mtx);
 
@@ -1172,7 +1495,7 @@ fasttrap_proc_lookup(pid_t pid)
 			mutex_enter(&fprc->ftpc_mtx);
 			mutex_exit(&bucket->ftb_mtx);
 			fprc->ftpc_rcount++;
-			atomic_add_64(&fprc->ftpc_acount, 1);
+			atomic_inc_64(&fprc->ftpc_acount);
 			ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
 			mutex_exit(&fprc->ftpc_mtx);
 
@@ -1190,6 +1513,10 @@ fasttrap_proc_lookup(pid_t pid)
 	new_fprc->ftpc_pid = pid;
 	new_fprc->ftpc_rcount = 1;
 	new_fprc->ftpc_acount = 1;
+#ifndef illumos
+	mutex_init(&new_fprc->ftpc_mtx, "fasttrap proc mtx", MUTEX_DEFAULT,
+	    NULL);
+#endif
 
 	mutex_enter(&bucket->ftb_mtx);
 
@@ -1202,7 +1529,7 @@ fasttrap_proc_lookup(pid_t pid)
 			mutex_enter(&fprc->ftpc_mtx);
 			mutex_exit(&bucket->ftb_mtx);
 			fprc->ftpc_rcount++;
-			atomic_add_64(&fprc->ftpc_acount, 1);
+			atomic_inc_64(&fprc->ftpc_acount);
 			ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
 			mutex_exit(&fprc->ftpc_mtx);
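/*
 * Both lookup paths above use the classic optimistic-insert pattern: scan
 * the bucket under its lock, allocate a new entry with no locks held (the
 * allocation may sleep), then re-scan under the lock and discard the new
 * entry if another thread won the race.  Stand-alone sketch; the types and
 * the single global bucket are illustrative:
 */
#include <pthread.h>
#include <stdlib.h>

struct entry {
	int key;
	struct entry *next;
};
static struct entry *bucket_head;
static pthread_mutex_t bucket_mtx = PTHREAD_MUTEX_INITIALIZER;

static struct entry *
lookup_or_insert(int key)
{
	struct entry *e, *new_e;

	/* First pass: the entry usually exists already. */
	pthread_mutex_lock(&bucket_mtx);
	for (e = bucket_head; e != NULL; e = e->next) {
		if (e->key == key) {
			pthread_mutex_unlock(&bucket_mtx);
			return (e);
		}
	}
	pthread_mutex_unlock(&bucket_mtx);

	/* Allocate with the lock dropped, since allocation may block. */
	if ((new_e = calloc(1, sizeof(*new_e))) == NULL)
		return (NULL);
	new_e->key = key;

	/* Second pass: we raced while unlocked, so check again. */
	pthread_mutex_lock(&bucket_mtx);
	for (e = bucket_head; e != NULL; e = e->next) {
		if (e->key == key) {
			pthread_mutex_unlock(&bucket_mtx);
			free(new_e);		/* lost the race */
			return (e);
		}
	}
	new_e->next = bucket_head;
	bucket_head = new_e;
	pthread_mutex_unlock(&bucket_mtx);
	return (new_e);
}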
 
@@ -1226,6 +1553,12 @@ fasttrap_proc_release(fasttrap_proc_t *p
 	fasttrap_bucket_t *bucket;
 	fasttrap_proc_t *fprc, **fprcp;
 	pid_t pid = proc->ftpc_pid;
+#ifndef illumos
+	fasttrap_scrblock_t *scrblk, *scrblktmp;
+	fasttrap_scrspace_t *scrspc, *scrspctmp;
+	struct proc *p;
+	struct thread *td;
+#endif
 
 	mutex_enter(&proc->ftpc_mtx);
 
@@ -1237,6 +1570,31 @@ fasttrap_proc_release(fasttrap_proc_t *p
 		return;
 	}
 
+#ifndef illumos
+	/*
+	 * Free all structures used to manage per-thread scratch space.
+	 */
+	LIST_FOREACH_SAFE(scrblk, &proc->ftpc_scrblks, ftsb_next,
+	    scrblktmp) {
+		LIST_REMOVE(scrblk, ftsb_next);
+		free(scrblk, M_SOLARIS);
+	}
+	LIST_FOREACH_SAFE(scrspc, &proc->ftpc_fscr, ftss_next, scrspctmp) {
+		LIST_REMOVE(scrspc, ftss_next);
+		free(scrspc, M_SOLARIS);
+	}
+	LIST_FOREACH_SAFE(scrspc, &proc->ftpc_ascr, ftss_next, scrspctmp) {
+		LIST_REMOVE(scrspc, ftss_next);
+		free(scrspc, M_SOLARIS);
+	}
+
+	if ((p = pfind(pid)) != NULL) {
+		FOREACH_THREAD_IN_PROC(p, td)
+			td->t_dtrace_sscr = NULL;
+		PROC_UNLOCK(p);
+	}
+#endif
+
 	mutex_exit(&proc->ftpc_mtx);
 
 	/*
@@ -1312,17 +1670,8 @@ fasttrap_provider_lookup(pid_t pid, cons
 	 * Make sure the process exists, isn't a child created as the result
 	 * of a vfork(2), and isn't a zombie (but may be in fork).
 	 */
-	mutex_enter(&pidlock);
-	if ((p = prfind(pid)) == NULL) {
-		mutex_exit(&pidlock);
+	if ((p = pfind(pid)) == NULL)
 		return (NULL);
-	}
-	mutex_enter(&p->p_lock);
-	mutex_exit(&pidlock);
-	if (p->p_flag & (SVFORK | SEXITING)) {
-		mutex_exit(&p->p_lock);
-		return (NULL);
-	}
 
 	/*
 	 * Increment p_dtrace_probes so that the process knows to inform us
@@ -1335,15 +1684,18 @@ fasttrap_provider_lookup(pid_t pid, cons
 	 * Grab the credentials for this process so we have
 	 * something to pass to dtrace_register().
 	 */
-	mutex_enter(&p->p_crlock);
-	crhold(p->p_cred);
-	cred = p->p_cred;
-	mutex_exit(&p->p_crlock);
-	mutex_exit(&p->p_lock);
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	crhold(p->p_ucred);
+	cred = p->p_ucred;
+	PROC_UNLOCK(p);
 
 	new_fp = kmem_zalloc(sizeof (fasttrap_provider_t), KM_SLEEP);
 	new_fp->ftp_pid = pid;
 	new_fp->ftp_proc = fasttrap_proc_lookup(pid);
+#ifndef illumos
+	mutex_init(&new_fp->ftp_mtx, "provider mtx", MUTEX_DEFAULT, NULL);
+	mutex_init(&new_fp->ftp_cmtx, "lock on creating", MUTEX_DEFAULT, NULL);
+#endif
 
 	ASSERT(new_fp->ftp_proc != NULL);
 
@@ -1414,13 +1766,17 @@ fasttrap_provider_free(fasttrap_provider
 	 * count of active providers on the associated process structure.
 	 */
 	if (!provider->ftp_retired) {
-		atomic_add_64(&provider->ftp_proc->ftpc_acount, -1);
+		atomic_dec_64(&provider->ftp_proc->ftpc_acount);
 		ASSERT(provider->ftp_proc->ftpc_acount <
 		    provider->ftp_proc->ftpc_rcount);
 	}
 
 	fasttrap_proc_release(provider->ftp_proc);
 
+#ifndef illumos
+	mutex_destroy(&provider->ftp_mtx);
+	mutex_destroy(&provider->ftp_cmtx);
+#endif
 	kmem_free(provider, sizeof (fasttrap_provider_t));
 
 	/*
@@ -1430,17 +1786,14 @@ fasttrap_provider_free(fasttrap_provider
 	 * corresponds to this process's hash chain in the provider hash
 	 * table. Don't sweat it if we can't find the process.
 	 */
-	mutex_enter(&pidlock);
-	if ((p = prfind(pid)) == NULL) {
-		mutex_exit(&pidlock);
+	if ((p = pfind(pid)) == NULL) {
 		return;
 	}
 
-	mutex_enter(&p->p_lock);
-	mutex_exit(&pidlock);
-
 	p->p_dtrace_probes--;
-	mutex_exit(&p->p_lock);
+#ifndef illumos
+	PROC_UNLOCK(p);
+#endif
 }
 
 static void
@@ -1489,7 +1842,7 @@ fasttrap_provider_retire(pid_t pid, cons
 	 * bucket lock therefore protects the integrity of the provider hash
 	 * table.
 	 */
-	atomic_add_64(&fp->ftp_proc->ftpc_acount, -1);
+	atomic_dec_64(&fp->ftp_proc->ftpc_acount);
 	ASSERT(fp->ftp_proc->ftpc_acount < fp->ftp_proc->ftpc_rcount);
 
 	fp->ftp_retired = 1;
@@ -1528,7 +1881,7 @@ fasttrap_add_probe(fasttrap_probe_spec_t
 	fasttrap_probe_t *pp;
 	fasttrap_tracepoint_t *tp;
 	char *name;
-	int i, aframes, whack;
+	int i, aframes = 0, whack;
 
 	/*
 	 * There needs to be at least one desired trace point.
@@ -1578,17 +1931,17 @@ fasttrap_add_probe(fasttrap_probe_spec_t
 		for (i = 0; i < pdata->ftps_noffs; i++) {
 			char name_str[17];
 
-			(void) snprintf(name_str, sizeof(name_str), "%llx",
+			(void) sprintf(name_str, "%llx",
 			    (unsigned long long)pdata->ftps_offs[i]);
 
 			if (dtrace_probe_lookup(provider->ftp_provid,
 			    pdata->ftps_mod, pdata->ftps_func, name_str) != 0)
 				continue;
 
-			atomic_add_32(&fasttrap_total, 1);
+			atomic_inc_32(&fasttrap_total);
 
 			if (fasttrap_total > fasttrap_max) {
-				atomic_add_32(&fasttrap_total, -1);
+				atomic_dec_32(&fasttrap_total);
 				goto no_mem;
 			}
 
@@ -1716,7 +2069,7 @@ fasttrap_meta_provide(void *arg, dtrace_
 	 */
 	if (strlen(dhpv->dthpv_provname) + 10 >=
 	    sizeof (provider->ftp_name)) {
-		cmn_err(CE_WARN, "failed to instantiate provider %s: "
+		printf("failed to instantiate provider %s: "
 		    "name too long to accomodate pid", dhpv->dthpv_provname);
 		return (NULL);
 	}
@@ -1725,7 +2078,7 @@ fasttrap_meta_provide(void *arg, dtrace_
 	 * Don't let folks spoof the true pid provider.
 	 */
 	if (strcmp(dhpv->dthpv_provname, FASTTRAP_PID_NAME) == 0) {
-		cmn_err(CE_WARN, "failed to instantiate provider %s: "
+		printf("failed to instantiate provider %s: "
 		    "%s is an invalid name", dhpv->dthpv_provname,
 		    FASTTRAP_PID_NAME);
 		return (NULL);
@@ -1748,7 +2101,7 @@ fasttrap_meta_provide(void *arg, dtrace_
 
 	if ((provider = fasttrap_provider_lookup(pid, dhpv->dthpv_provname,
 	    &dhpv->dthpv_pattr)) == NULL) {
-		cmn_err(CE_WARN, "failed to instantiate provider %s for "
+		printf("failed to instantiate provider %s for "
 		    "process %u",  dhpv->dthpv_provname, (uint_t)pid);
 		return (NULL);
 	}
@@ -1764,6 +2117,18 @@ fasttrap_meta_provide(void *arg, dtrace_
 	return (provider);
 }
 
+/*
+ * We know a few things about our context here:  we know that the probe being
+ * created doesn't already exist (DTrace won't load DOF at the same address
+ * twice, even if explicitly told to do so) and we know that we are
+ * single-threaded with respect to the meta provider machinery. Knowing that
+ * this is a new probe and that there is no way for us to race with another
+ * operation on this provider allows us an important optimization: we need not
+ * look up a probe before adding it.  Saving this lookup is important because
+ * this code is in the fork path for processes with USDT probes, and lookups
+ * here are potentially very expensive because of long hash conflicts on
+ * module, function and name (DTrace doesn't hash on provider name).
+ */
 /*ARGSUSED*/
 static void
 fasttrap_meta_create_probe(void *arg, void *parg,
@@ -1800,19 +2165,6 @@ fasttrap_meta_create_probe(void *arg, vo
 			return;
 	}
 
-	/*
-	 * Grab the creation lock to ensure consistency between calls to
-	 * dtrace_probe_lookup() and dtrace_probe_create() in the face of
-	 * other threads creating probes.
-	 */
-	mutex_enter(&provider->ftp_cmtx);
-
-	if (dtrace_probe_lookup(provider->ftp_provid, dhpb->dthpb_mod,
-	    dhpb->dthpb_func, dhpb->dthpb_name) != 0) {
-		mutex_exit(&provider->ftp_cmtx);
-		return;
-	}
-
 	ntps = dhpb->dthpb_noffs + dhpb->dthpb_nenoffs;
 	ASSERT(ntps > 0);
 
@@ -1820,7 +2172,6 @@ fasttrap_meta_create_probe(void *arg, vo
 
 	if (fasttrap_total > fasttrap_max) {
 		atomic_add_32(&fasttrap_total, -ntps);
-		mutex_exit(&provider->ftp_cmtx);
 		return;
 	}
 
@@ -1884,8 +2235,6 @@ fasttrap_meta_create_probe(void *arg, vo
 	 */
 	pp->ftp_id = dtrace_probe_create(provider->ftp_provid, dhpb->dthpb_mod,
 	    dhpb->dthpb_func, dhpb->dthpb_name, FASTTRAP_OFFSET_AFRAMES, pp);
-
-	mutex_exit(&provider->ftp_cmtx);
 }
 
 /*ARGSUSED*/
@@ -1909,25 +2258,30 @@ static dtrace_mops_t fasttrap_mops = {
 
 /*ARGSUSED*/
 static int
-fasttrap_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
+fasttrap_open(struct cdev *dev __unused, int oflags __unused,
+    int devtype __unused, struct thread *td __unused)
 {
 	return (0);
 }
 
 /*ARGSUSED*/
 static int
-fasttrap_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
+fasttrap_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int fflag,
+    struct thread *td)
 {
+#ifdef notyet
+	struct kinfo_proc kp;
+	const cred_t *cr = td->td_ucred;
+#endif
 	if (!dtrace_attached())
 		return (EAGAIN);
 
 	if (cmd == FASTTRAPIOC_MAKEPROBE) {
-		fasttrap_probe_spec_t *uprobe = (void *)arg;
+		fasttrap_probe_spec_t *uprobe = *(fasttrap_probe_spec_t **)arg;
 		fasttrap_probe_spec_t *probe;
 		uint64_t noffs;
 		size_t size;
-		int ret;
-		char *c;
+		int ret, err;
 
 		if (copyin(&uprobe->ftps_noffs, &noffs,
 		    sizeof (uprobe->ftps_noffs)))
@@ -1957,44 +2311,66 @@ fasttrap_ioctl(dev_t dev, int cmd, intpt
 		 * Verify that the function and module strings contain no
 		 * funny characters.
 		 */
-		for (c = &probe->ftps_func[0]; *c != '\0'; c++) {
-			if (*c < 0x20 || 0x7f <= *c) {
-				ret = EINVAL;
-				goto err;
-			}
+		if (u8_validate(probe->ftps_func, strlen(probe->ftps_func),
+		    NULL, U8_VALIDATE_ENTIRE, &err) < 0) {
+			ret = EINVAL;
+			goto err;
 		}
 
-		for (c = &probe->ftps_mod[0]; *c != '\0'; c++) {
-			if (*c < 0x20 || 0x7f <= *c) {
-				ret = EINVAL;
-				goto err;
-			}
+		if (u8_validate(probe->ftps_mod, strlen(probe->ftps_mod),
+		    NULL, U8_VALIDATE_ENTIRE, &err) < 0) {
+			ret = EINVAL;
+			goto err;
 		}
 
+#ifdef notyet
 		if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) {
 			proc_t *p;
 			pid_t pid = probe->ftps_pid;
 
+#ifdef illumos
 			mutex_enter(&pidlock);
+#endif
 			/*
 			 * Report an error if the process doesn't exist
 			 * or is actively being birthed.
 			 */
-			if ((p = prfind(pid)) == NULL || p->p_stat == SIDL) {
+			sx_slock(&proctree_lock);
+			p = pfind(pid);
+			if (p)
+				fill_kinfo_proc(p, &kp);
+			sx_sunlock(&proctree_lock);
+			if (p == NULL || kp.ki_stat == SIDL) {
+#ifdef illumos
 				mutex_exit(&pidlock);
+#endif
 				return (ESRCH);
 			}
+#ifdef illumos
 			mutex_enter(&p->p_lock);
 			mutex_exit(&pidlock);
+#else
+			PROC_LOCK_ASSERT(p, MA_OWNED);
+#endif
 
+#ifdef notyet
 			if ((ret = priv_proc_cred_perm(cr, p, NULL,
 			    VREAD | VWRITE)) != 0) {
+#ifdef illumos
 				mutex_exit(&p->p_lock);
+#else
+				PROC_UNLOCK(p);
+#endif
 				return (ret);
 			}
-
+#endif /* notyet */
+#ifdef illumos
 			mutex_exit(&p->p_lock);
+#else
+			PROC_UNLOCK(p);
+#endif
 		}
+#endif /* notyet */
 
 		ret = fasttrap_add_probe(probe);
 err:
@@ -2006,35 +2382,64 @@ err:
 		fasttrap_instr_query_t instr;
 		fasttrap_tracepoint_t *tp;
 		uint_t index;
+#ifdef illumos
 		int ret;
+#endif
 
+#ifdef illumos
 		if (copyin((void *)arg, &instr, sizeof (instr)) != 0)
 			return (EFAULT);
+#endif
 
+#ifdef notyet
 		if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) {
 			proc_t *p;
 			pid_t pid = instr.ftiq_pid;
 
+#ifdef illumos
 			mutex_enter(&pidlock);
+#endif
 			/*
 			 * Report an error if the process doesn't exist
 			 * or is actively being birthed.
 			 */
-			if ((p = prfind(pid)) == NULL || p->p_stat == SIDL) {
+			sx_slock(&proctree_lock);
+			p = pfind(pid);
+			if (p)
+				fill_kinfo_proc(p, &kp);
+			sx_sunlock(&proctree_lock);
+			if (p == NULL || kp.ki_stat == SIDL) {
+#ifdef illumos
 				mutex_exit(&pidlock);
+#endif
 				return (ESRCH);
 			}
+#ifdef illumos
 			mutex_enter(&p->p_lock);
 			mutex_exit(&pidlock);
+#else
+			PROC_LOCK_ASSERT(p, MA_OWNED);
+#endif
 
+#ifdef notyet
 			if ((ret = priv_proc_cred_perm(cr, p, NULL,
 			    VREAD)) != 0) {
+#ifdef illumos
 				mutex_exit(&p->p_lock);
+#else
+				PROC_UNLOCK(p);
+#endif
 				return (ret);
 			}
+#endif /* notyet */
 
+#ifdef illumos
 			mutex_exit(&p->p_lock);
+#else
+			PROC_UNLOCK(p);
+#endif
 		}
+#endif /* notyet */
 
 		index = FASTTRAP_TPOINTS_INDEX(instr.ftiq_pid, instr.ftiq_pc);
 
@@ -2067,89 +2472,42 @@ err:
 	return (EINVAL);
 }
 
-static struct cb_ops fasttrap_cb_ops = {
-	fasttrap_open,		/* open */
-	nodev,			/* close */
-	nulldev,		/* strategy */
-	nulldev,		/* print */
-	nodev,			/* dump */
-	nodev,			/* read */
-	nodev,			/* write */
-	fasttrap_ioctl,		/* ioctl */
-	nodev,			/* devmap */
-	nodev,			/* mmap */
-	nodev,			/* segmap */
-	nochpoll,		/* poll */
-	ddi_prop_op,		/* cb_prop_op */
-	0,			/* streamtab  */
-	D_NEW | D_MP		/* Driver compatibility flag */
-};
-
-/*ARGSUSED*/
-static int
-fasttrap_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
-{
-	int error;
-
-	switch (infocmd) {
-	case DDI_INFO_DEVT2DEVINFO:
-		*result = (void *)fasttrap_devi;
-		error = DDI_SUCCESS;
-		break;
-	case DDI_INFO_DEVT2INSTANCE:
-		*result = (void *)0;
-		error = DDI_SUCCESS;
-		break;
-	default:
-		error = DDI_FAILURE;
-	}
-	return (error);
-}
-
 static int
-fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+fasttrap_load(void)
 {
 	ulong_t nent;
+	int i, ret;
 
-	switch (cmd) {
-	case DDI_ATTACH:
-		break;
-	case DDI_RESUME:
-		return (DDI_SUCCESS);
-	default:
-		return (DDI_FAILURE);
-	}
-
-	if (ddi_create_minor_node(devi, "fasttrap", S_IFCHR, 0,
-	    DDI_PSEUDO, NULL) == DDI_FAILURE) {
-		ddi_remove_minor_node(devi, NULL);
-		return (DDI_FAILURE);
-	}
-
-	ddi_report_dev(devi);
-	fasttrap_devi = devi;
+	/* Create the /dev/dtrace/fasttrap entry. */
+	fasttrap_cdev = make_dev(&fasttrap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
+	    "dtrace/fasttrap");
 
-	/*
-	 * Install our hooks into fork(2), exec(2), and exit(2).
-	 */
-	dtrace_fasttrap_fork_ptr = &fasttrap_fork;
-	dtrace_fasttrap_exit_ptr = &fasttrap_exec_exit;
-	dtrace_fasttrap_exec_ptr = &fasttrap_exec_exit;
+	mtx_init(&fasttrap_cleanup_mtx, "fasttrap clean", "dtrace", MTX_DEF);
+	mutex_init(&fasttrap_count_mtx, "fasttrap count mtx", MUTEX_DEFAULT,
+	    NULL);
 
+#ifdef illumos
 	fasttrap_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
 	    "fasttrap-max-probes", FASTTRAP_MAX_DEFAULT);
+#endif
 	fasttrap_total = 0;
 
 	/*
 	 * Conjure up the tracepoints hashtable...
 	 */
+#ifdef illumos
 	nent = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
 	    "fasttrap-hash-size", FASTTRAP_TPOINTS_DEFAULT_SIZE);
+#else
+	nent = tpoints_hash_size;
+#endif
 
 	if (nent == 0 || nent > 0x1000000)
 		nent = FASTTRAP_TPOINTS_DEFAULT_SIZE;
 
-	if ((nent & (nent - 1)) == 0)
+	tpoints_hash_size = nent;
+
+	if (ISP2(nent))
 		fasttrap_tpoints.fth_nent = nent;
 	else
 		fasttrap_tpoints.fth_nent = 1 << fasttrap_highbit(nent);
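/*
 * All three hash tables are sized to a power of two so a bucket index can
 * be computed with "hash & (nent - 1)" instead of a modulo.  Sketch of the
 * rounding logic above; ISP2() and highbit() are assumed to behave like
 * their OpenSolaris compat counterparts (highbit() returns the 1-based
 * position of the highest set bit, so the shift rounds up):
 */
#include <stddef.h>

#define ISP2(x)	(((x) & ((x) - 1)) == 0)

static int
highbit(unsigned long v)
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

/* Round a requested (nonzero) size to a power of two; mask is nent - 1. */
static size_t
hash_size(size_t requested)
{
	return (ISP2(requested) ? requested : 1UL << highbit(requested));
}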
@@ -2157,12 +2515,17 @@ fasttrap_attach(dev_info_t *devi, ddi_at
 	fasttrap_tpoints.fth_mask = fasttrap_tpoints.fth_nent - 1;
 	fasttrap_tpoints.fth_table = kmem_zalloc(fasttrap_tpoints.fth_nent *
 	    sizeof (fasttrap_bucket_t), KM_SLEEP);
+#ifndef illumos
+	for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
+		mutex_init(&fasttrap_tpoints.fth_table[i].ftb_mtx,
+		    "tracepoints bucket mtx", MUTEX_DEFAULT, NULL);
+#endif
 
 	/*
 	 * ... and the providers hash table...
 	 */
 	nent = FASTTRAP_PROVIDERS_DEFAULT_SIZE;
-	if ((nent & (nent - 1)) == 0)
+	if (ISP2(nent))
 		fasttrap_provs.fth_nent = nent;
 	else
 		fasttrap_provs.fth_nent = 1 << fasttrap_highbit(nent);
@@ -2170,12 +2533,35 @@ fasttrap_attach(dev_info_t *devi, ddi_at
 	fasttrap_provs.fth_mask = fasttrap_provs.fth_nent - 1;
 	fasttrap_provs.fth_table = kmem_zalloc(fasttrap_provs.fth_nent *
 	    sizeof (fasttrap_bucket_t), KM_SLEEP);
+#ifndef illumos
+	for (i = 0; i < fasttrap_provs.fth_nent; i++)
+		mutex_init(&fasttrap_provs.fth_table[i].ftb_mtx,
+		    "providers bucket mtx", MUTEX_DEFAULT, NULL);
+#endif
+
+	ret = kproc_create(fasttrap_pid_cleanup_cb, NULL,
+	    &fasttrap_cleanup_proc, 0, 0, "ftcleanup");
+	if (ret != 0) {
+		destroy_dev(fasttrap_cdev);
+#ifndef illumos
+		for (i = 0; i < fasttrap_provs.fth_nent; i++)
+			mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx);
+		for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
+			mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx);
+#endif
+		kmem_free(fasttrap_provs.fth_table, fasttrap_provs.fth_nent *
+		    sizeof (fasttrap_bucket_t));
+		kmem_free(fasttrap_tpoints.fth_table,
+		    fasttrap_tpoints.fth_nent * sizeof (fasttrap_bucket_t));
+		mtx_destroy(&fasttrap_cleanup_mtx);
+		mutex_destroy(&fasttrap_count_mtx);
+		return (ret);
+	}
+
 
 	/*
 	 * ... and the procs hash table.
 	 */
 	nent = FASTTRAP_PROCS_DEFAULT_SIZE;
-	if ((nent & (nent - 1)) == 0)
+	if (ISP2(nent))
 		fasttrap_procs.fth_nent = nent;
 	else
 		fasttrap_procs.fth_nent = 1 << fasttrap_highbit(nent);
@@ -2183,27 +2569,38 @@ fasttrap_attach(dev_info_t *devi, ddi_at
 	fasttrap_procs.fth_mask = fasttrap_procs.fth_nent - 1;
 	fasttrap_procs.fth_table = kmem_zalloc(fasttrap_procs.fth_nent *
 	    sizeof (fasttrap_bucket_t), KM_SLEEP);
+#ifndef illumos
+	for (i = 0; i < fasttrap_procs.fth_nent; i++)
+		mutex_init(&fasttrap_procs.fth_table[i].ftb_mtx,
+		    "processes bucket mtx", MUTEX_DEFAULT, NULL);
+
+	rm_init(&fasttrap_tp_lock, "fasttrap tracepoint");
+
+	/*
+	 * This event handler must run before kdtrace_thread_dtor() since it
+	 * accesses the thread's struct kdtrace_thread.
+	 */
+	fasttrap_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
+	    fasttrap_thread_dtor, NULL, EVENTHANDLER_PRI_FIRST);
+#endif
+
+	/*
+	 * Install our hooks into fork(2), exec(2), and exit(2).
+	 */
+	dtrace_fasttrap_fork = &fasttrap_fork;
+	dtrace_fasttrap_exit = &fasttrap_exec_exit;
+	dtrace_fasttrap_exec = &fasttrap_exec_exit;
 
 	(void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL,
 	    &fasttrap_meta_id);
 
-	return (DDI_SUCCESS);
+	return (0);
 }
 
 static int
-fasttrap_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
+fasttrap_unload(void)
 {
 	int i, fail = 0;
-	timeout_id_t tmp;
-
-	switch (cmd) {
-	case DDI_DETACH:
-		break;
-	case DDI_SUSPEND:
-		return (DDI_SUCCESS);
-	default:
-		return (DDI_FAILURE);
-	}
 
 	/*
 	 * Unregister the meta-provider to make sure no new fasttrap-
@@ -2214,28 +2611,7 @@ fasttrap_detach(dev_info_t *devi, ddi_de
 	 */
 	if (fasttrap_meta_id != DTRACE_METAPROVNONE &&
 	    dtrace_meta_unregister(fasttrap_meta_id) != 0)
-		return (DDI_FAILURE);
-
-	/*
-	 * Prevent any new timeouts from running by setting fasttrap_timeout
-	 * to a non-zero value, and wait for the current timeout to complete.
-	 */
-	mutex_enter(&fasttrap_cleanup_mtx);
-	fasttrap_cleanup_work = 0;
-
-	while (fasttrap_timeout != (timeout_id_t)1) {
-		tmp = fasttrap_timeout;
-		fasttrap_timeout = (timeout_id_t)1;
-
-		if (tmp != 0) {
-			mutex_exit(&fasttrap_cleanup_mtx);
-			(void) untimeout(tmp);
-			mutex_enter(&fasttrap_cleanup_mtx);
-		}
-	}
-
-	fasttrap_cleanup_work = 0;
-	mutex_exit(&fasttrap_cleanup_mtx);
+		return (-1);
 
 	/*
 	 * Iterate over all of our providers. If there's still a process
@@ -2271,32 +2647,51 @@ fasttrap_detach(dev_info_t *devi, ddi_de
 	}
 
 	if (fail) {
-		uint_t work;
-		/*
-		 * If we're failing to detach, we need to unblock timeouts
-		 * and start a new timeout if any work has accumulated while
-		 * we've been unsuccessfully trying to detach.
-		 */
-		mutex_enter(&fasttrap_cleanup_mtx);
-		fasttrap_timeout = 0;
-		work = fasttrap_cleanup_work;
-		mutex_exit(&fasttrap_cleanup_mtx);
-
-		if (work)
-			fasttrap_pid_cleanup();
-
 		(void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL,
 		    &fasttrap_meta_id);
 
-		return (DDI_FAILURE);
+		return (-1);
 	}
 
+	/*
+	 * Stop new processes from entering these hooks now, before the
+	 * fasttrap_cleanup thread runs.  That way all processes will hopefully
+	 * be out of these hooks before we free fasttrap_provs.fth_table.
+	 */
+	ASSERT(dtrace_fasttrap_fork == &fasttrap_fork);
+	dtrace_fasttrap_fork = NULL;
+
+	ASSERT(dtrace_fasttrap_exec == &fasttrap_exec_exit);
+	dtrace_fasttrap_exec = NULL;
+
+	ASSERT(dtrace_fasttrap_exit == &fasttrap_exec_exit);
+	dtrace_fasttrap_exit = NULL;
+
+	mtx_lock(&fasttrap_cleanup_mtx);
+	fasttrap_cleanup_drain = 1;
+	/* Wait for the cleanup thread to finish up and signal us. */
+	wakeup(&fasttrap_cleanup_cv);
+	mtx_sleep(&fasttrap_cleanup_drain, &fasttrap_cleanup_mtx, 0, "ftcld",
+	    0);
+	fasttrap_cleanup_proc = NULL;
+	mtx_destroy(&fasttrap_cleanup_mtx);
+
 #ifdef DEBUG
 	mutex_enter(&fasttrap_count_mtx);
 	ASSERT(fasttrap_pid_count == 0);
 	mutex_exit(&fasttrap_count_mtx);
 #endif
 
+#ifndef illumos
+	EVENTHANDLER_DEREGISTER(thread_dtor, fasttrap_thread_dtor_tag);
+
+	for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
+		mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx);
+	for (i = 0; i < fasttrap_provs.fth_nent; i++)
+		mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx);
+	for (i = 0; i < fasttrap_procs.fth_nent; i++)
+		mutex_destroy(&fasttrap_procs.fth_table[i].ftb_mtx);
+#endif
 	kmem_free(fasttrap_tpoints.fth_table,
 	    fasttrap_tpoints.fth_nent * sizeof (fasttrap_bucket_t));
 	fasttrap_tpoints.fth_nent = 0;
@@ -2309,70 +2704,44 @@ fasttrap_detach(dev_info_t *devi, ddi_de
 	    fasttrap_procs.fth_nent * sizeof (fasttrap_bucket_t));
 	fasttrap_procs.fth_nent = 0;
 
-	/*
-	 * We know there are no tracepoints in any process anywhere in
-	 * the system so there is no process which has its p_dtrace_count
-	 * greater than zero, therefore we know that no thread can actively
-	 * be executing code in fasttrap_fork(). Similarly for p_dtrace_probes
-	 * and fasttrap_exec() and fasttrap_exit().
-	 */
-	ASSERT(dtrace_fasttrap_fork_ptr == &fasttrap_fork);
-	dtrace_fasttrap_fork_ptr = NULL;
-
-	ASSERT(dtrace_fasttrap_exec_ptr == &fasttrap_exec_exit);
-	dtrace_fasttrap_exec_ptr = NULL;
-
-	ASSERT(dtrace_fasttrap_exit_ptr == &fasttrap_exec_exit);
-	dtrace_fasttrap_exit_ptr = NULL;
-
-	ddi_remove_minor_node(devi, NULL);
-
-	return (DDI_SUCCESS);
-}
-
-static struct dev_ops fasttrap_ops = {
-	DEVO_REV,		/* devo_rev */
-	0,			/* refcnt */
-	fasttrap_info,		/* get_dev_info */
-	nulldev,		/* identify */
-	nulldev,		/* probe */
-	fasttrap_attach,	/* attach */
-	fasttrap_detach,	/* detach */
-	nodev,			/* reset */
-	&fasttrap_cb_ops,	/* driver operations */
-	NULL,			/* bus operations */
-	nodev			/* dev power */
-};
-
-/*
- * Module linkage information for the kernel.
- */
-static struct modldrv modldrv = {
-	&mod_driverops,		/* module type (this is a pseudo driver) */
-	"Fasttrap Tracing",	/* name of module */
-	&fasttrap_ops,		/* driver ops */
-};
-
-static struct modlinkage modlinkage = {
-	MODREV_1,
-	(void *)&modldrv,
-	NULL
-};
+#ifndef illumos
+	destroy_dev(fasttrap_cdev);
+	mutex_destroy(&fasttrap_count_mtx);
+	rm_destroy(&fasttrap_tp_lock);
+#endif
 
-int
-_init(void)
-{
-	return (mod_install(&modlinkage));
+	return (0);
 }
 
-int
-_info(struct modinfo *modinfop)
+/* ARGSUSED */
+static int
+fasttrap_modevent(module_t mod __unused, int type, void *data __unused)
 {
-	return (mod_info(&modlinkage, modinfop));
-}
+	int error = 0;
 
-int
-_fini(void)
-{
-	return (mod_remove(&modlinkage));
+	switch (type) {
+	case MOD_LOAD:
+		break;
+
+	case MOD_UNLOAD:
+		break;
+
+	case MOD_SHUTDOWN:
+		break;
+
+	default:
+		error = EOPNOTSUPP;
+		break;
+	}
+	return (error);
 }
+
+SYSINIT(fasttrap_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, fasttrap_load,
+    NULL);
+SYSUNINIT(fasttrap_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
+    fasttrap_unload, NULL);
+
+DEV_MODULE(fasttrap, fasttrap_modevent, NULL);
+MODULE_VERSION(fasttrap, 1);
+MODULE_DEPEND(fasttrap, dtrace, 1, 1, 1);
+MODULE_DEPEND(fasttrap, opensolaris, 1, 1, 1);
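/*
 * Note that all the real work happens in fasttrap_load()/fasttrap_unload()
 * via SYSINIT/SYSUNINIT; the modevent handler only acknowledges events so
 * the module can load and unload cleanly.  Skeleton of the same wiring for
 * a hypothetical provider (every name below is a placeholder):
 */
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/module.h>

static int
example_load(void)
{
	/* allocate tables, create the device node, install hooks */
	return (0);
}

static int
example_unload(void)
{
	/* refuse to unload while probes are live, then tear down */
	return (0);
}

static int
example_modevent(module_t mod __unused, int type, void *data __unused)
{
	switch (type) {
	case MOD_LOAD:
	case MOD_UNLOAD:
	case MOD_SHUTDOWN:
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

SYSINIT(example_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
    example_load, NULL);
SYSUNINIT(example_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
    example_unload, NULL);
DEV_MODULE(example, example_modevent, NULL);
MODULE_VERSION(example, 1);
MODULE_DEPEND(example, dtrace, 1, 1, 1);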
Index: src/external/cddl/osnet/dist/uts/common/fs/gfs.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/gfs.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/gfs.c
--- src/external/cddl/osnet/dist/uts/common/fs/gfs.c	14 Dec 2010 01:28:18 -0000	1.2
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,1191 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/* Portions Copyright 2007 Shivakumar GN */
-/*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
-#include <sys/types.h>
-#include <sys/cmn_err.h>
-#include <sys/debug.h>
-#include <sys/dirent.h>
-#include <sys/kmem.h>
-#include <sys/mman.h>
-#include <sys/mutex.h>
-#include <sys/sysmacros.h>
-#include <sys/systm.h>
-#include <sys/sunddi.h>
-#include <sys/uio.h>
-#include <sys/vmsystm.h>
-#include <sys/vfs.h>
-#include <sys/vnode.h>
-
-#include <vm/as.h>
-#include <vm/seg_vn.h>
-
-#include <sys/gfs.h>
-
-/*
- * Generic pseudo-filesystem routines.
- *
- * There are significant similarities between the implementation of certain file
- * system entry points across different filesystems.  While one could attempt to
- * "choke up on the bat" and incorporate common functionality into a VOP
- * preamble or postamble, such an approach is limited in the benefit it can
- * provide.  In this file we instead define a toolkit of routines which can be
- * called from a filesystem (with in-kernel pseudo-filesystems being the focus
- * of the exercise) in a more component-like fashion.
- *
- * There are three basic classes of routines:
- *
- * 1) Lowlevel support routines
- *
- *    These routines are designed to play a support role for existing
- *    pseudo-filesystems (such as procfs).  They simplify common tasks,
- *    without forcing the filesystem to hand over management to GFS.  The
- *    routines covered are:
- *
- *	gfs_readdir_init()
- *	gfs_readdir_emit()
- *	gfs_readdir_emitn()
- *	gfs_readdir_pred()
- *	gfs_readdir_fini()
- *	gfs_lookup_dot()
- *
- * 2) Complete GFS management
- *
- *    These routines take a more active role in management of the
- *    pseudo-filesystem.  They handle the relationship between vnode private
- *    data and VFS data, as well as the relationship between vnodes in the
- *    directory hierarchy.
- *
- *    In order to use these interfaces, the first member of every private
- *    v_data must be a gfs_file_t or a gfs_dir_t.  This hands over all control
- *    to GFS.
- *
- * 	gfs_file_create()
- * 	gfs_dir_create()
- * 	gfs_root_create()
- *
- *	gfs_file_inactive()
- *	gfs_dir_inactive()
- *	gfs_dir_lookup()
- *	gfs_dir_readdir()
- *
- * 	gfs_vop_inactive()
- * 	gfs_vop_lookup()
- * 	gfs_vop_readdir()
- * 	gfs_vop_map()
- *
- * 3) Single File pseudo-filesystems
- *
- *    This routine creates a rooted file to be overlayed ontop of another
- *    file in the physical filespace.
- *
- *    Note that the parent is NULL (actually the vfs), but there is nothing
- *    technically keeping such a file from utilizing the "Complete GFS
- *    management" set of routines.
- *
- * 	gfs_root_create_file()
- */
-
-/*
- * gfs_make_opsvec: take an array of vnode type definitions and create
- * their vnodeops_t structures
- *
- * This routine takes an array of gfs_opsvec_t's.  It could
- * alternatively take an array of gfs_opsvec_t*'s, which would allow
- * vnode types to be completely defined in files external to the caller
- * of gfs_make_opsvec().  As it stands, much more sharing takes place --
- * both the caller and the vnode type provider need to access gfsv_ops
- * and gfsv_template, and the caller also needs to know gfsv_name.
- */
-#ifdef PORT_SOLARIS
-int
-gfs_make_opsvec(gfs_opsvec_t *vec)
-{
-	int error, i;
-
-	for (i = 0; ; i++) {
-		if (vec[i].gfsv_name == NULL)
-			return (0);
-		error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template,
-		    vec[i].gfsv_ops);
-		if (error)
-			break;
-	}
-
-	cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'",
-	    vec[i].gfsv_name);
-	for (i--; i >= 0; i--) {
-		vn_freevnodeops(*vec[i].gfsv_ops);
-		*vec[i].gfsv_ops = NULL;
-	}
-	return (error);
-}
-#endif
-
-/*
- * Low level directory routines
- *
- * These routines provide some simple abstractions for reading directories.
- * They are designed to be used by existing pseudo filesystems (namely procfs)
- * that already have a complicated management infrastructure.
- */
-
-/*
- * gfs_get_parent_ino: used to obtain a parent inode number and the
- * inode number of the given vnode in preparation for calling gfs_readdir_init.
- */
-int
-gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
-    ino64_t *pino, ino64_t *ino)
-{
-	vnode_t *parent;
-	gfs_dir_t *dp = dvp->v_data;
-	int error;
-
-	*ino = dp->gfsd_file.gfs_ino;
-	parent = dp->gfsd_file.gfs_parent;
-
-	if (parent == NULL) {
-		*pino = *ino;		/* root of filesystem */
-	} else if (dvp->v_flag & V_XATTRDIR) {
-		vattr_t va;
-
-		va.va_mask = AT_NODEID;
-		error = VOP_GETATTR(parent, &va, 0, cr, ct);
-		if (error)
-			return (error);
-		*pino = va.va_nodeid;
-	} else {
-		*pino = ((gfs_file_t *)(parent->v_data))->gfs_ino;
-	}
-
-	return (0);
-}
-
-/*
- * gfs_readdir_init: initiate a generic readdir
- *   st		- a pointer to an uninitialized gfs_readdir_state_t structure
- *   name_max	- the directory's maximum file name length
- *   ureclen	- the exported file-space record length (1 for non-legacy FSs)
- *   uiop	- the uiop passed to readdir
- *   parent	- the parent directory's inode
- *   self	- this directory's inode
- *   flags	- flags from VOP_READDIR
- *
- * Returns 0 or a non-zero errno.
- *
- * Typical VOP_READDIR usage of gfs_readdir_*:
- *
- *	if ((error = gfs_readdir_init(...)) != 0)
- *		return (error);
- *	eof = 0;
- *	while ((error = gfs_readdir_pred(..., &voffset)) != 0) {
- *		if (!consumer_entry_at(voffset))
- *			voffset = consumer_next_entry(voffset);
- *		if (consumer_eof(voffset)) {
- *			eof = 1
- *			break;
- *		}
- *		if ((error = gfs_readdir_emit(..., voffset,
- *		    consumer_ino(voffset), consumer_name(voffset))) != 0)
- *			break;
- *	}
- *	return (gfs_readdir_fini(..., error, eofp, eof));
- *
- * As you can see, a zero result from gfs_readdir_pred() or
- * gfs_readdir_emit() indicates that processing should continue,
- * whereas a non-zero result indicates that the loop should terminate.
- * Most consumers need do nothing more than let gfs_readdir_fini()
- * determine what the cause of failure was and return the appropriate
- * value.
- */
-int
-gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
-    uio_t *uiop, ino64_t parent, ino64_t self, int flags)
-{
-	size_t dirent_size;
-
-	if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
-	    (uiop->uio_loffset % ureclen) != 0)
-		return (EINVAL);
-
-	st->grd_ureclen = ureclen;
-	st->grd_oresid = uiop->uio_resid;
-	st->grd_namlen = name_max;
-	if (flags & V_RDDIR_ENTFLAGS)
-		dirent_size = EDIRENT_RECLEN(st->grd_namlen);
-	else
-		dirent_size = DIRENT64_RECLEN(st->grd_namlen);
-	st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP);
-	st->grd_parent = parent;
-	st->grd_self = self;
-	st->grd_flags = flags;
-
-	return (0);
-}
-
-/*
- * gfs_readdir_emit_int: internal routine to emit directory entry
- *
- *   st		- the current readdir state, which must have d_ino/ed_ino
- *		  and d_name/ed_name set
- *   uiop	- caller-supplied uio pointer
- *   next	- the offset of the next entry
- */
-static int
-gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next)
-{
-	int reclen, namelen;
-	dirent64_t *dp;
-	edirent_t *edp;
-
-	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
-		edp = st->grd_dirent;
-		namelen = strlen(edp->ed_name);
-		reclen = EDIRENT_RECLEN(strlen(edp->ed_name));
-	} else {
-		dp = st->grd_dirent;
-		namelen = strlen(dp->d_name);
-		reclen = DIRENT64_RECLEN(strlen(dp->d_name));
-	}
-
-	if (reclen > uiop->uio_resid) {
-		/*
-		 * Error if no entries were returned yet
-		 */
-		if (uiop->uio_resid == st->grd_oresid)
-			return (EINVAL);
-		return (-1);
-	}
-
-	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
-		edp->ed_off = next;
-		edp->ed_reclen = (ushort_t)reclen;
-	} else {
-		dp->d_reclen = (ushort_t)reclen;
-		dp->d_type = DT_DIR;
-		dp->d_namlen = namelen;
-	}
-
-	if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
-		return (EFAULT);
-
-	uiop->uio_loffset = next;
-
-	return (0);
-}
-
-/*
- * gfs_readdir_emit: emit a directory entry
- *   voff       - the virtual offset (obtained from gfs_readdir_pred)
- *   ino        - the entry's inode
- *   name       - the entry's name
- *   eflags	- value for ed_eflags (if processing edirent_t)
- *
- * Returns a 0 on success, a non-zero errno on failure, or -1 if the
- * readdir loop should terminate.  A non-zero result (either errno or
- * -1) from this function is typically passed directly to
- * gfs_readdir_fini().
- */
-int
-gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
-    ino64_t ino, const char *name, int eflags)
-{
-	offset_t off = (voff + 2) * st->grd_ureclen;
-
-	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
-		edirent_t *edp = st->grd_dirent;
-
-		edp->ed_ino = ino;
-		(void) strncpy(edp->ed_name, name, st->grd_namlen);
-		edp->ed_eflags = eflags;
-	} else {
-		dirent64_t *dp = st->grd_dirent;
-
-		dp->d_ino = ino;
-		(void) strncpy(dp->d_name, name, st->grd_namlen);
-	}
-
-	/*
-	 * Inter-entry offsets are invalid, so we assume a record size of
-	 * grd_ureclen and explicitly set the offset appropriately.
-	 */
-	return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen));
-}
-
-/*
- * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer
- * instead of a string for the entry's name.
- */
-int
-gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
-    ino64_t ino, unsigned long num)
-{
-	char buf[40];
-
-	numtos(num, buf);
-	return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0));
-}
-
-/*
- * gfs_readdir_pred: readdir loop predicate
- *   voffp - a pointer in which the next virtual offset should be stored
- *
- * Returns a 0 on success, a non-zero errno on failure, or -1 if the
- * readdir loop should terminate.  A non-zero result (either errno or
- * -1) from this function is typically passed directly to
- * gfs_readdir_fini().
- */
-int
-gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp)
-{
-	offset_t off, voff;
-	int error;
-
-top:
-	if (uiop->uio_resid <= 0)
-		return (-1);
-
-	off = uiop->uio_loffset / st->grd_ureclen;
-	voff = off - 2;
-	if (off == 0) {
-		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
-		    ".", 0)) == 0)
-			goto top;
-	} else if (off == 1) {
-		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
-		    "..", 0)) == 0)
-			goto top;
-	} else {
-		*voffp = voff;
-		return (0);
-	}
-
-	return (error);
-}
-
-/*
- * gfs_readdir_fini: generic readdir cleanup
- *   error	- if positive, an error to return
- *   eofp	- the eofp passed to readdir
- *   eof	- the eof value
- *
- * Returns a 0 on success, a non-zero errno on failure.  This result
- * should be returned from readdir.
- */
-int
-gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
-{
-	size_t dirent_size;
-
-	if (st->grd_flags & V_RDDIR_ENTFLAGS)
-		dirent_size = EDIRENT_RECLEN(st->grd_namlen);
-	else
-		dirent_size = DIRENT64_RECLEN(st->grd_namlen);
-	kmem_free(st->grd_dirent, dirent_size);
-	if (error > 0)
-		return (error);
-	if (eofp)
-		*eofp = eof;
-	return (0);
-}
-
-/*
- * gfs_lookup_dot
- *
- * Performs a basic check for "." and ".." directory entries.
- */
-int
-gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
-{
-	if (*nm == '\0' || strcmp(nm, ".") == 0) {
-		VN_HOLD(dvp);
-		*vpp = dvp;
-		return (0);
-	} else if (strcmp(nm, "..") == 0) {
-		if (pvp == NULL) {
-			ASSERT(dvp->v_flag & VROOT);
-			VN_HOLD(dvp);
-			*vpp = dvp;
-		} else {
-			VN_HOLD(pvp);
-			*vpp = pvp;
-		}
-		return (0);
-	}
-
-	return (-1);
-}
-
-/*
- * gfs_file_create(): create a new GFS file
- *
- *   size	- size of private data structure (v_data)
- *   pvp	- parent vnode (GFS directory)
- *   ops	- vnode operations vector
- *
- * In order to use this interface, the parent vnode must have been created by
- * gfs_dir_create(), and the private data stored in v_data must have a
- * 'gfs_file_t' as its first field.
- *
- * Given these constraints, this routine will automatically:
- *
- * 	- Allocate v_data for the vnode
- * 	- Initialize necessary fields in the vnode
- * 	- Hold the parent
- */
-vnode_t *
-gfs_file_create(size_t size, vnode_t *pvp, vnodeops_t *ops)
-{
-	gfs_file_t *fp;
-	vnode_t *vp;
-	int error;
-
-	/*
-	 * Allocate vnode and internal data structure
-	 */
-	fp = kmem_zalloc(size, KM_SLEEP);
-	/* XXX FreeBSD adds vfs_t * as parameter to gfs_file_create and
-	   gfs_dir_create */
-	error = getnewvnode(VT_ZFS, pvp->v_vfsp, ops, &vp);
-	ASSERT(error == 0);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-
-	/*
-	 * Set up various pointers
-	 */
-	fp->gfs_vnode = vp;
-	fp->gfs_parent = pvp;
-	vp->v_data = fp;
-	fp->gfs_size = size;
-	fp->gfs_type = GFS_FILE;
-
-	/*
-	 * Initialize vnode and hold parent.
-	 */
-	vn_setops(vp, ops);
-	if (pvp) {
-		VN_SET_VFS_TYPE_DEV(vp, pvp->v_vfsp, VREG, 0);
-		VN_HOLD(pvp);
-	}
-
-	return (vp);
-}
-
-/*
- * gfs_dir_create: creates a new directory in the parent
- *
- *   size	- size of private data structure (v_data)
- *   pvp	- parent vnode (GFS directory)
- *   ops	- vnode operations vector
- *   entries	- NULL-terminated list of static entries (if any)
- *   maxlen	- maximum length of a directory entry
- *   readdir_cb	- readdir callback (see gfs_dir_readdir)
- *   inode_cb	- inode callback (see gfs_dir_readdir)
- *   lookup_cb	- lookup callback (see gfs_dir_lookup)
- *
- * In order to use this function, the first member of the private vnode
- * structure (v_data) must be a gfs_dir_t.  For each directory, there are
- * static entries, defined when the structure is initialized, and dynamic
- * entries, retrieved through callbacks.
- *
- * If a directory has static entries, then it must supply a inode callback,
- * which will compute the inode number based on the parent and the index.
- * For a directory with dynamic entries, the caller must supply a readdir
- * callback and a lookup callback.  If a static lookup fails, we fall back to
- * the supplied lookup callback, if any.
- *
- * This function also performs the same initialization as gfs_file_create().
- */
-vnode_t *
-gfs_dir_create(size_t struct_size, vnode_t *pvp, vnodeops_t *ops,
-    gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
-    gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
-{
-	vnode_t *vp;
-	gfs_dir_t *dp;
-	gfs_dirent_t *de;
-
-	vp = gfs_file_create(struct_size, pvp, ops);
-	vp->v_type = VDIR;
-
-	dp = vp->v_data;
-	dp->gfsd_file.gfs_type = GFS_DIR;
-	dp->gfsd_maxlen = maxlen;
-
-	if (entries != NULL) {
-		for (de = entries; de->gfse_name != NULL; de++)
-			dp->gfsd_nstatic++;
-
-		dp->gfsd_static = kmem_alloc(
-		    dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP);
-		bcopy(entries, dp->gfsd_static,
-		    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
-	}
-
-	dp->gfsd_readdir = readdir_cb;
-	dp->gfsd_lookup = lookup_cb;
-	dp->gfsd_inode = inode_cb;
-
-	mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL);
-
-	return (vp);
-}
-
-/*
- * gfs_root_create(): create a root vnode for a GFS filesystem
- *
- * Similar to gfs_dir_create(), this creates a root vnode for a filesystem.  The
- * only difference is that it takes a vfs_t instead of a vnode_t as its parent.
- */
-vnode_t *
-gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
-    gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
-    gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
-{
-	vnode_t *vp = gfs_dir_create(size, NULL, ops, entries, inode_cb,
-	    maxlen, readdir_cb, lookup_cb);
-
-	/* Manually set the inode */
-	((gfs_file_t *)vp->v_data)->gfs_ino = ino;
-
-	VFS_HOLD(vfsp);
-	VN_SET_VFS_TYPE_DEV(vp, vfsp, VDIR, 0);
-	vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;
-
-	return (vp);
-}
-
-/*
- * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem
- *
- * Similar to gfs_root_create(), this creates a root vnode for a file to
- * be the pseudo-filesystem.
- */
-vnode_t *
-gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino)
-{
-	vnode_t	*vp = gfs_file_create(size, NULL, ops);
-
-	((gfs_file_t *)vp->v_data)->gfs_ino = ino;
-
-	VFS_HOLD(vfsp);
-	VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0);
-	vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;
-
-	return (vp);
-}
-
-/*
- * gfs_file_inactive()
- *
- * Called from the VOP_INACTIVE() routine.  If necessary, this routine will
- * remove the given vnode from the parent directory and clean up any references
- * in the VFS layer.
- *
- * If the vnode was not removed (due to a race with vget), then NULL is
- * returned.  Otherwise, a pointer to the private data is returned.
- */
-void *
-gfs_file_inactive(vnode_t *vp)
-{
-	int i;
-	gfs_dirent_t *ge = NULL;
-	gfs_file_t *fp = vp->v_data;
-	gfs_dir_t *dp = NULL;
-	void *data;
-
-	if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR))
-		goto found;
-
-	dp = fp->gfs_parent->v_data;
-
-	/*
-	 * First, see if this vnode is cached in the parent.
-	 */
-	gfs_dir_lock(dp);
-
-	/*
-	 * Find it in the set of static entries.
-	 */
-	for (i = 0; i < dp->gfsd_nstatic; i++)  {
-		ge = &dp->gfsd_static[i];
-
-		if (ge->gfse_vnode == vp)
-			goto found;
-	}
-
-	/*
-	 * If 'ge' is NULL, then it is a dynamic entry.
-	 */
-	ge = NULL;
-
-found:
-	if (vp->v_flag & V_XATTRDIR) {
-		mutex_enter(&fp->gfs_parent->v_lock);
-	}
-	mutex_enter(&vp->v_lock);
-#ifdef PORT_SOLARIS
-	if (vp->v_count == 1) {
-		/*
-		 * Really remove this vnode
-		 */
-		data = vp->v_data;
-		if (ge != NULL) {
-			/*
-			 * If this was a statically cached entry, simply set the
-			 * cached vnode to NULL.
-			 */
-			ge->gfse_vnode = NULL;
-		}
-		if (vp->v_flag & V_XATTRDIR) {
-			fp->gfs_parent->v_xattrdir = NULL;
-			mutex_exit(&fp->gfs_parent->v_lock);
-		}
-		mutex_exit(&vp->v_lock);
-
-		/*
-		 * Free vnode and release parent
-		 */
-		if (fp->gfs_parent) {
-			if (dp) {
-				gfs_dir_unlock(dp);
-			}
-			VN_RELE(fp->gfs_parent);
-		} else {
-			ASSERT(vp->v_vfsp != NULL);
-			VFS_RELE(vp->v_vfsp);
-		}
-		vn_free(vp);
-	} else {
-		vp->v_count--;
-		data = NULL;
-		mutex_exit(&vp->v_lock);
-		if (vp->v_flag & V_XATTRDIR) {
-			mutex_exit(&fp->gfs_parent->v_lock);
-		}
-		if (dp)
-			gfs_dir_unlock(dp);
-	}
-#endif
-
-	return (data);
-}
-
-/*
- * gfs_dir_inactive()
- *
- * Same as above, but for directories.
- */
-void *
-gfs_dir_inactive(vnode_t *vp)
-{
-	gfs_dir_t *dp;
-
-	ASSERT(vp->v_type == VDIR);
-
-	if ((dp = gfs_file_inactive(vp)) != NULL) {
-		mutex_destroy(&dp->gfsd_lock);
-		if (dp->gfsd_nstatic)
-			kmem_free(dp->gfsd_static,
-			    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
-	}
-
-	return (dp);
-}
-
-/*
- * gfs_dir_lookup_dynamic()
- *
- * This routine looks up the provided name amongst the dynamic entries
- * in the gfs directory and returns the corresponding vnode, if found.
- *
- * The gfs directory is expected to be locked by the caller prior to
- * calling this function.  The directory will be unlocked during the
- * execution of this function, but will be locked upon return from the
- * function.  This function returns 0 on success, non-zero on error.
- *
- * The dynamic lookups are performed by invoking the lookup
- * callback, which is passed to this function as the first argument.
- * The arguments to the callback are:
- *
- * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr,
- *     int flags, int *deflgs, pathname_t *rpnp);
- *
- *	pvp	- parent vnode
- *	nm	- name of entry
- *	vpp	- pointer to resulting vnode
- *	cr	- pointer to cred
- *	flags	- flags value from lookup request
- *		ignored here; currently only used to request
- *		insensitive lookups
- *	direntflgs - output parameter, directory entry flags
- *		ignored here; currently only used to indicate a lookup
- *		has more than one possible match when case is not considered
- *	realpnp	- output parameter, real pathname
- *		ignored here; when lookup was performed case-insensitively,
- *		this field contains the "real" name of the file.
- *
- * 	Returns 0 on success, non-zero on error.
- */
-static int
-gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp,
-    const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags,
-    int *direntflags, pathname_t *realpnp)
-{
-	gfs_file_t *fp;
-	ino64_t ino;
-	int ret;
-
-	ASSERT(GFS_DIR_LOCKED(dp));
-
-	/*
-	 * Drop the directory lock, as the lookup routine
-	 * will need to allocate memory, or otherwise deadlock on this
-	 * directory.
-	 */
-	gfs_dir_unlock(dp);
-	ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp);
-	gfs_dir_lock(dp);
-
-	/*
-	 * The callback for extended attributes returns a vnode
-	 * with v_data from an underlying fs.
-	 */
-	if (ret == 0 && !IS_XATTRDIR(dvp)) {
-		fp = (gfs_file_t *)((*vpp)->v_data);
-		fp->gfs_index = -1;
-		fp->gfs_ino = ino;
-	}
-
-	return (ret);
-}
-
-/*
- * gfs_dir_lookup_static()
- *
- * This routine looks up the provided name amongst the static entries
- * in the gfs directory and returns the corresponding vnode, if found.
- * The first argument to the function is a pointer to the comparison
- * function this function should use to decide if names are a match.
- *
- * If a match is found, and GFS_CACHE_VNODE is set and the vnode
- * exists, we simply return the existing vnode.  Otherwise, we call
- * the static entry's callback routine, caching the result if
- * necessary.  If the idx pointer argument is non-NULL, we use it to
- * return the index of the matching static entry.
- *
- * The gfs directory is expected to be locked by the caller prior to calling
- * this function.  The directory may be unlocked during the execution of
- * this function, but will be locked upon return from the function.
- *
- * This function returns 0 if a match is found, ENOENT if not.
- */
-static int
-gfs_dir_lookup_static(int (*compare)(const char *, const char *),
-    gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx,
-    vnode_t **vpp, pathname_t *rpnp)
-{
-	gfs_dirent_t *ge;
-	vnode_t *vp = NULL;
-	int i;
-
-	ASSERT(GFS_DIR_LOCKED(dp));
-
-	/*
-	 * Search static entries.
-	 */
-	for (i = 0; i < dp->gfsd_nstatic; i++) {
-		ge = &dp->gfsd_static[i];
-
-		if (compare(ge->gfse_name, nm) == 0) {
-			if (rpnp)
-				(void) strlcpy(rpnp->pn_buf, ge->gfse_name,
-				    rpnp->pn_bufsize);
-
-			if (ge->gfse_vnode) {
-				ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
-				vp = ge->gfse_vnode;
-				VN_HOLD(vp);
-				break;
-			}
-
-			/*
-			 * We drop the directory lock, as the constructor will
-			 * need to do KM_SLEEP allocations.  If we return from
-			 * the constructor only to find that a parallel
-			 * operation has completed, and GFS_CACHE_VNODE is set
-			 * for this entry, we discard the result in favor of
-			 * the cached vnode.
-			 */
-			gfs_dir_unlock(dp);
-			vp = ge->gfse_ctor(dvp);
-			gfs_dir_lock(dp);
-
-			((gfs_file_t *)vp->v_data)->gfs_index = i;
-
-			/* Set the inode according to the callback. */
-			((gfs_file_t *)vp->v_data)->gfs_ino =
-			    dp->gfsd_inode(dvp, i);
-
-			if (ge->gfse_flags & GFS_CACHE_VNODE) {
-				if (ge->gfse_vnode == NULL) {
-					ge->gfse_vnode = vp;
-				} else {
-					/*
-					 * A parallel constructor beat us to it;
-					 * return existing vnode.  We have to be
-					 * careful because we can't release the
-					 * current vnode while holding the
-					 * directory lock; its inactive routine
-					 * will try to lock this directory.
-					 */
-					vnode_t *oldvp = vp;
-					vp = ge->gfse_vnode;
-					VN_HOLD(vp);
-
-					gfs_dir_unlock(dp);
-					VN_RELE(oldvp);
-					gfs_dir_lock(dp);
-				}
-			}
-			break;
-		}
-	}
-
-	if (vp == NULL)
-		return (ENOENT);
-	else if (idx)
-		*idx = i;
-	*vpp = vp;
-	return (0);
-}
-
-/*
- * gfs_dir_lookup()
- *
- * Looks up the given name in the directory and returns the corresponding
- * vnode, if found.
- *
- * First, we search statically defined entries, if any, with a call to
- * gfs_dir_lookup_static().  If no static entry is found, and we have
- * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic().
- *
- * This function returns 0 on success, non-zero on error.
- */
-int
-gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr,
-    int flags, int *direntflags, pathname_t *realpnp)
-{
-	gfs_dir_t *dp = dvp->v_data;
-	boolean_t casecheck;
-	vnode_t *dynvp = NULL;
-	vnode_t *vp = NULL;
-	int (*compare)(const char *, const char *);
-	int error, idx;
-
-	ASSERT(dvp->v_type == VDIR);
-
-	if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
-		return (0);
-
-	casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL;
-	if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) ||
-	    (flags & FIGNORECASE))
-		compare = strcasecmp;
-	else
-		compare = strcmp;
-
-	gfs_dir_lock(dp);
-
-	error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp);
-
-	if (vp && casecheck) {
-		gfs_dirent_t *ge;
-		int i;
-
-		for (i = idx + 1; i < dp->gfsd_nstatic; i++) {
-			ge = &dp->gfsd_static[i];
-
-			if (strcasecmp(ge->gfse_name, nm) == 0) {
-				*direntflags |= ED_CASE_CONFLICT;
-				goto out;
-			}
-		}
-	}
-
-	if ((error || casecheck) && dp->gfsd_lookup)
-		error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp,
-		    &dynvp, cr, flags, direntflags, vp ? NULL : realpnp);
-
-	if (vp && dynvp) {
-		/* static and dynamic entries are case-insensitive conflict */
-		ASSERT(casecheck);
-		*direntflags |= ED_CASE_CONFLICT;
-		VN_RELE(dynvp);
-	} else if (vp == NULL) {
-		vp = dynvp;
-	} else if (error == ENOENT) {
-		error = 0;
-	} else if (error) {
-		VN_RELE(vp);
-		vp = NULL;
-	}
-
-out:
-	gfs_dir_unlock(dp);
-
-	*vpp = vp;
-	return (error);
-}
-
-/*
- * gfs_dir_readdir: does a readdir() on the given directory
- *
- *    dvp	- directory vnode
- *    uiop	- uio structure
- *    eofp	- eof pointer
- *    data	- arbitrary data passed to readdir callback
- *
- * This routine does all the readdir() dirty work.  Even so, the caller must
- * supply two callbacks in order to get full compatibility.
- *
- * If the directory contains static entries, an inode callback must be
- * specified.  This avoids having to create every vnode and call VOP_GETATTR()
- * when reading the directory.  This function has the following arguments:
- *
- *	ino_t gfs_inode_cb(vnode_t *vp, int index);
- *
- * 	vp	- vnode for the directory
- * 	index	- index in original gfs_dirent_t array
- *
- * 	Returns the inode number for the given entry.
- *
- * For directories with dynamic entries, a readdir callback must be provided.
- * This is significantly more complex, thanks to the particulars of
- * VOP_READDIR().
- *
- *	int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
- *	    offset_t *off, offset_t *nextoff, void *data, int flags)
- *
- *	vp	- directory vnode
- *	dp	- directory entry, sized according to maxlen given to
- *		  gfs_dir_create().  callback must fill in d_name and
- *		  d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags
- *		  (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS
- *		  is set in 'flags'.
- *	eofp	- callback must set to 1 when EOF has been reached
- *	off	- on entry, the last offset read from the directory.  Callback
- *		  must set to the offset of the current entry, typically left
- *		  untouched.
- *	nextoff	- callback must set to offset of next entry.  Typically
- *		  (off + 1)
- *	data	- caller-supplied data
- *	flags	- VOP_READDIR flags
- *
- *	Return 0 on success, or error on failure.
- */
-int
-gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, void *data, cred_t *cr,
-    caller_context_t *ct, int flags)
-{
-	gfs_readdir_state_t gstate;
-	int error, eof = 0;
-	ino64_t ino, pino;
-	offset_t off, next;
-	gfs_dir_t *dp = dvp->v_data;
-
-	error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino);
-	if (error)
-		return (error);
-
-	if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
-	    pino, ino, flags)) != 0)
-		return (error);
-
-	while ((error = gfs_readdir_pred(&gstate, uiop, &off)) == 0 &&
-	    !eof) {
-
-		if (off >= 0 && off < dp->gfsd_nstatic) {
-			ino = dp->gfsd_inode(dvp, off);
-
-			if ((error = gfs_readdir_emit(&gstate, uiop,
-			    off, ino, dp->gfsd_static[off].gfse_name, 0))
-			    != 0)
-				break;
-
-		} else if (dp->gfsd_readdir) {
-			off -= dp->gfsd_nstatic;
-
-			if ((error = dp->gfsd_readdir(dvp,
-			    gstate.grd_dirent, &eof, &off, &next,
-			    data, flags)) != 0 || eof)
-				break;
-
-			off += dp->gfsd_nstatic + 2;
-			next += dp->gfsd_nstatic + 2;
-
-			if ((error = gfs_readdir_emit_int(&gstate, uiop,
-			    next)) != 0)
-				break;
-		} else {
-			/*
-			 * Offset is beyond the end of the static entries, and
-			 * we have no dynamic entries.  Set EOF.
-			 */
-			eof = 1;
-		}
-	}
-
-	return (gfs_readdir_fini(&gstate, error, eofp, eof));
-}
-
-
-/*
- * gfs_vop_lookup: VOP_LOOKUP() entry point
- *
- * For use directly in vnode ops table.  Given a GFS directory, calls
- * gfs_dir_lookup() as necessary.
- */
-/* ARGSUSED */
-int
-gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp)
-{
-	return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp));
-}
-
-/*
- * gfs_vop_readdir: VOP_READDIR() entry point
- *
- * For use directly in vnode ops table.  Given a GFS directory, calls
- * gfs_dir_readdir() as necessary.
- */
-/* ARGSUSED */
-int
-gfs_vop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
-    caller_context_t *ct, int flags)
-{
-	return (gfs_dir_readdir(vp, uiop, eofp, NULL, cr, ct, flags));
-}
-
-
-/*
- * gfs_vop_map: VOP_MAP() entry point
- *
- * Convenient routine for handling pseudo-files that wish to allow mmap() calls.
- * This function only works for readonly files, and uses the read function for
- * the vnode to fill in the data.  The mapped data is immediately faulted in and
- * filled with the necessary data during this call; there are no getpage() or
- * putpage() routines.
- */
-/* ARGSUSED */
-#ifdef PORT_SOLARIS
-int
-gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
-    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred,
-    caller_context_t *ct)
-{
-	int rv;
-	ssize_t resid = len;
-
-	/*
-	 * Check for bad parameters
-	 */
-#ifdef _ILP32
-	if (len > MAXOFF_T)
-		return (ENOMEM);
-#endif
-	if (vp->v_flag & VNOMAP)
-		return (ENOTSUP);
-	if (off > MAXOFF_T)
-		return (EFBIG);
-	if ((long)off < 0 || (long)(off + len) < 0)
-		return (EINVAL);
-	if (vp->v_type != VREG)
-		return (ENODEV);
-	if ((prot & (PROT_EXEC | PROT_WRITE)) != 0)
-		return (EACCES);
-
-	/*
-	 * Find appropriate address if needed, otherwise clear address range.
-	 */
-	as_rangelock(as);
-	rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
-	if (rv != 0) {
-		as_rangeunlock(as);
-		return (rv);
-	}
-
-	/*
-	 * Create mapping
-	 */
-	rv = as_map(as, *addrp, len, segvn_create, zfod_argsp);
-	as_rangeunlock(as);
-	if (rv != 0)
-		return (rv);
-
-	/*
-	 * Fill with data from read()
-	 */
-	rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE,
-	    0, (rlim64_t)0, cred, &resid);
-
-	if (rv == 0 && resid != 0)
-		rv = ENXIO;
-
-	if (rv != 0) {
-		as_rangelock(as);
-		(void) as_unmap(as, *addrp, len);
-		as_rangeunlock(as);
-	}
-
-	return (rv);
-}
-#endif
-/*
- * gfs_vop_inactive: VOP_INACTIVE() entry point
- *
- * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or
- * gfs_dir_inactive() as necessary, and kmem_free()s associated private data.
- */
-/* ARGSUSED */
-void
-gfs_vop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
-{
-	gfs_file_t *fp = vp->v_data;
-	void *data;
-
-	if (fp->gfs_type == GFS_DIR)
-		data = gfs_dir_inactive(vp);
-	else
-		data = gfs_file_inactive(vp);
-
-	if (data != NULL)
-		kmem_free(data, fp->gfs_size);
-}
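For context on what the deletion above removes: the GFS helpers let a
pseudo-filesystem describe its namespace as a NULL-terminated table of static
entries plus optional dynamic callbacks.  A minimal consumer-side sketch,
patterned loosely on zfs_ctldir.c -- the entry name, constructor, inode
callback, and argument order (taken from the upstream illumos prototype) are
illustrative, not part of this patch:

	/* one static entry whose vnode is cached after first construction */
	static gfs_dirent_t mydir_entries[] = {
		{ "snapshot", my_snapdir_ctor, GFS_CACHE_VNODE },
		{ NULL }
	};

	/* root vnode of the pseudo-filesystem; no dynamic callbacks */
	vnode_t *vp = gfs_root_create(sizeof (gfs_dir_t), vfsp, my_dir_ops,
	    MYFS_INO_ROOT, mydir_entries, my_inode_cb, MAXNAMELEN,
	    NULL /* readdir_cb */, NULL /* lookup_cb */);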
Index: src/external/cddl/osnet/dist/uts/common/fs/vnode.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/vnode.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/vnode.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/vnode.c	11 May 2017 20:07:48 -0000
@@ -0,0 +1,106 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
+/*	  All Rights Reserved  	*/
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/taskq.h>
+#include <sys/vnode.h>
+
+/* Extensible attribute (xva) routines. */
+
+/*
+ * Zero out the structure, set the size of the requested/returned bitmaps,
+ * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
+ * to the returned attributes array.
+ */
+void
+xva_init(xvattr_t *xvap)
+{
+	bzero(xvap, sizeof (xvattr_t));
+	xvap->xva_mapsize = XVA_MAPSIZE;
+	xvap->xva_magic = XVA_MAGIC;
+	xvap->xva_vattr.va_mask = AT_XVATTR;
+	xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
+}
+
+/*
+ * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
+ * structure.  Otherwise, returns NULL.
+ */
+xoptattr_t *
+xva_getxoptattr(xvattr_t *xvap)
+{
+	xoptattr_t *xoap = NULL;
+	if (xvap->xva_vattr.va_mask & AT_XVATTR)
+		xoap = &xvap->xva_xoptattrs;
+	return (xoap);
+}
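A caller-side sketch of these two helpers (XVA_SET_REQ and XVA_ISSET_RTN are
the stock sys/vnode.h macros; use() stands in for arbitrary consumer code):

	xvattr_t xva;
	xoptattr_t *xoap;

	xva_init(&xva);				/* zeroes xva, sets AT_XVATTR */
	XVA_SET_REQ(&xva, XAT_READONLY);	/* request one optional attribute */

	/* ... VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, NULL) ... */

	if ((xoap = xva_getxoptattr(&xva)) != NULL &&
	    XVA_ISSET_RTN(&xva, XAT_READONLY))
		use(xoap->xoa_readonly);	/* attribute returned and valid */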
+
+#ifdef __FreeBSD__
+static void
+vn_rele_inactive(vnode_t *vp)
+{
+	vrele(vp);
+}
+
+/*
+ * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
+ * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
+ * the file system as a result of releasing the vnode. Note, file systems
+ * already have to handle the race where the vnode is incremented before the
+ * inactive routine is called and does its locking.
+ *
+ * Warning: Excessive use of this routine can lead to performance problems.
+ * This is because taskqs throttle back allocation if too many are created.
+ */
+void
+vn_rele_async(vnode_t *vp, taskq_t *taskq)
+{
+	VERIFY(vp->v_count > 0);
+	VI_LOCK(vp);
+	if (vp->v_count == 1 && !(vp->v_iflag & VI_DOINGINACT)) {
+		VI_UNLOCK(vp);
+		VERIFY(taskq_dispatch((taskq_t *)taskq,
+		    (task_func_t *)vn_rele_inactive, vp, TQ_SLEEP) != 0);
+		return;
+	}
+	refcount_release(&vp->v_usecount);
+	vdropl(vp);
+}
+#endif /* __FreeBSD__ */
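A typical call site is sketched below; dsl_pool_vnrele_taskq() is the per-pool
taskq the FreeBSD port dedicates to deferred releases (cited from memory, so
treat the name as illustrative).  Punting the final release to a taskq thread
means VOP_INACTIVE() never runs in the caller's context, which is what breaks
the re-entry deadlock described above:

	/* drop what may be the last reference without re-entering the FS */
	vn_rele_async(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os)));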
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4	16 Feb 2013 16:40:57 -0000
@@ -0,0 +1,30 @@
+LZ4 - Fast LZ compression algorithm
+Copyright (C) 2011-2013, Yann Collet.
+BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+- LZ4 source repository : http://code.google.com/p/lz4/
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip	16 Feb 2013 16:40:56 -0000
@@ -0,0 +1 @@
+LZ4 COMPRESSION FUNCTIONALITY IN ZFS
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c,v
retrieving revision 1.11
diff -u -p -r1.11 arc.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c	27 Jan 2012 19:48:38 -0000	1.11
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c	16 May 2017 00:54:12 -0000
@@ -19,8 +19,11 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  */
 
 /*
@@ -56,11 +59,11 @@
  * tight.
  *
  * 3. The Megiddo and Modha model assumes a fixed page size. All
- * elements of the cache are therefor exactly the same size.  So
+ * elements of the cache are therefore exactly the same size.  So
  * when adjusting the cache size following a cache miss, its simply
  * a matter of choosing a single page to evict.  In our model, we
  * have variable sized cache blocks (rangeing from 512 bytes to
- * 128K bytes).  We therefor choose a set of blocks to evict to make
+ * 128K bytes).  We therefore choose a set of blocks to evict to make
  * space for a cache miss that approximates as closely as possible
  * the space used by the new block.
  *
@@ -75,13 +78,13 @@
  * ways: 1) via a hash table lookup using the DVA as a key,
  * or 2) via one of the ARC lists.  The arc_read() interface
  * uses method 1, while the internal arc algorithms for
- * adjusting the cache use method 2.  We therefor provide two
+ * adjusting the cache use method 2.  We therefore provide two
  * types of locks: 1) the hash table lock array, and 2) the
  * arc list locks.
  *
- * Buffers do not have their own mutexs, rather they rely on the
- * hash table mutexs for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexs).
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
  *
  * buf_hash_find() returns the appropriate mutex (held) when it
  * locates the requested buffer in the hash table.  It returns
@@ -102,13 +105,13 @@
  * with the buffer may be evicted prior to the callback.  The callback
  * must be made with *no locks held* (to prevent deadlock).  Additionally,
  * the users of callbacks must ensure that their private data is
- * protected from simultaneous callbacks from arc_buf_evict()
+ * protected from simultaneous callbacks from arc_clear_callback()
  * and arc_do_user_evicts().
  *
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  *
- * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ * The L2ARC uses the l2ad_mtx on each vdev for the following:
  *
  *	- L2ARC buflist creation
  *	- L2ARC buflist eviction
@@ -117,29 +120,167 @@
  *	- ARC header release, as it removes from L2ARC buflists
  */
 
+/*
+ * ARC operation:
+ *
+ * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
+ * This structure can point either to a block that is still in the cache or to
+ * one that is only accessible in an L2 ARC device, or it can provide
+ * information about a block that was recently evicted. If a block is
+ * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
+ * information to retrieve it from the L2ARC device. This information is
+ * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. The data
+ * of a block in this state cannot be accessed directly.
+ *
+ * Blocks that are actively being referenced or have not been evicted
+ * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
+ * the arc_buf_hdr_t that will point to the data block in memory. A block can
+ * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
+ * caches data in two ways -- in a list of arc buffers (arc_buf_t) and
+ * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
+ * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC
+ * consumer, and always contains uncompressed data. The ARC will provide
+ * references to this data and will keep it cached until it is no longer in
+ * use. Typically, the arc will try to cache only the L1ARC's physical data
+ * block and will aggressively evict any arc_buf_t that is no longer referenced.
+ * The amount of memory consumed by the arc_buf_t's can be seen via the
+ * "overhead_size" kstat.
+ *
+ *
+ *                arc_buf_hdr_t
+ *                +-----------+
+ *                |           |
+ *                |           |
+ *                |           |
+ *                +-----------+
+ * l2arc_buf_hdr_t|           |
+ *                |           |
+ *                +-----------+
+ * l1arc_buf_hdr_t|           |
+ *                |           |                 arc_buf_t
+ *                |    b_buf  +------------>+---------+      arc_buf_t
+ *                |           |             |b_next   +---->+---------+
+ *                |  b_pdata  +-+           |---------|     |b_next   +-->NULL
+ *                +-----------+ |           |         |     +---------+
+ *                              |           |b_data   +-+   |         |
+ *                              |           +---------+ |   |b_data   +-+
+ *                              +->+------+             |   +---------+ |
+ *                   (potentially) |      |             |               |
+ *                     compressed  |      |             |               |
+ *                        data     +------+             |               v
+ *                                                      +->+------+     +------+
+ *                                            uncompressed |      |     |      |
+ *                                                data     |      |     |      |
+ *                                                         +------+     +------+
+ *
+ * The L1ARC's data pointer, however, may or may not be uncompressed. The
+ * ARC has the ability to store the physical data (b_pdata) associated with
+ * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk
+ * physical block, it will match its on-disk compression characteristics.
+ * If the block on-disk is compressed, then the physical data block
+ * in the cache will also be compressed and vice-versa. This behavior
+ * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * compressed ARC functionality is disabled, the b_pdata will point to an
+ * uncompressed version of the on-disk data.
+ *
+ * When a consumer reads a block, the ARC must first look to see if the
+ * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t,
+ * then an additional arc_buf_t is allocated and the uncompressed data is
+ * bcopied from the existing arc_buf_t. If the hdr is cached but does not
+ * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses
+ * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's
+ * b_pdata is not compressed, then the block is shared with the newly
+ * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t
+ * in the arc buffer chain. Sharing the block reduces the memory overhead
+ * required when the hdr is caching uncompressed blocks or the compressed
+ * arc functionality has been disabled via 'zfs_compressed_arc_enabled'.
+ *
+ * The diagram below shows an example of an uncompressed ARC hdr that is
+ * sharing its data with an arc_buf_t:
+ *
+ *                arc_buf_hdr_t
+ *                +-----------+
+ *                |           |
+ *                |           |
+ *                |           |
+ *                +-----------+
+ * l2arc_buf_hdr_t|           |
+ *                |           |
+ *                +-----------+
+ * l1arc_buf_hdr_t|           |
+ *                |           |                 arc_buf_t    (shared)
+ *                |    b_buf  +------------>+---------+      arc_buf_t
+ *                |           |             |b_next   +---->+---------+
+ *                |  b_pdata  +-+           |---------|     |b_next   +-->NULL
+ *                +-----------+ |           |         |     +---------+
+ *                              |           |b_data   +-+   |         |
+ *                              |           +---------+ |   |b_data   +-+
+ *                              +->+------+             |   +---------+ |
+ *                                 |      |             |               |
+ *                   uncompressed  |      |             |               |
+ *                        data     +------+             |               |
+ *                                    ^                 +->+------+     |
+ *                                    |       uncompressed |      |     |
+ *                                    |           data     |      |     |
+ *                                    |                    +------+     |
+ *                                    +---------------------------------+
+ *
+ * Writing to the arc requires that the ARC first discard the b_pdata
+ * since the physical block is about to be rewritten. The new data contents
+ * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline
+ * performs the write, it may compress the data before writing it to disk.
+ * The ARC will be called with the transformed data and will bcopy the
+ * transformed on-disk block into a newly allocated b_pdata.
+ *
+ * When the L2ARC is in use, it will also take advantage of the b_pdata. The
+ * L2ARC will always write the contents of b_pdata to the L2ARC. This means
+ * that when compressed arc is enabled, the L2ARC blocks are identical
+ * to the on-disk block in the main data pool. This provides a significant
+ * advantage since the ARC can leverage the bp's checksum when reading from the
+ * L2ARC to determine if the contents are valid. However, if the compressed
+ * arc is disabled, then the L2ARC's block must be transformed to look
+ * like the physical block in the main data pool before comparing the
+ * checksum and determining its validity.
+ */
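The read-side policy in the comment above condenses to the following pseudo-C;
every helper name is invented for illustration, and the real logic lives in
the arc read/allocate path:

	static arc_buf_t *
	arc_sketch_get_buf(arc_buf_hdr_t *hdr)
	{
		if (hdr_has_buf(hdr))		/* an arc_buf_t already exists */
			return (copy_existing_buf(hdr));	/* bcopy path */
		if (hdr_is_compressed(hdr))	/* inflate b_pdata into new buf */
			return (decompress_pdata(hdr));
		return (share_pdata(hdr));	/* lend b_pdata to the new buf */
	}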
+
 #include <sys/spa.h>
 #include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/refcount.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/multilist.h>
 #ifdef _KERNEL
-#include <sys/vmsystm.h>
-#include <vm/anon.h>
-#include <sys/fs/swapnode.h>
 #include <sys/dnlc.h>
+#include <sys/racct.h>
 #endif
 #include <sys/callb.h>
 #include <sys/kstat.h>
+#include <sys/trim_map.h>
 #include <zfs_fletcher.h>
+#include <sys/sdt.h>
+
+#include <machine/vmparam.h>
+
+#ifdef illumos
+#ifndef _KERNEL
+/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
+boolean_t arc_watch = B_FALSE;
+int arc_procfd;
+#endif
+#endif /* illumos */
 
 #ifdef __NetBSD__
 #include <uvm/uvm.h>
 #ifndef btop
 #define	btop(x)		((x) / PAGE_SIZE)
 #endif
-#define	needfree	(uvmexp.free < uvmexp.freetarg ? uvmexp.freetarg : 0)
+//#define	needfree	(uvmexp.free < uvmexp.freetarg ? uvmexp.freetarg : 0)
 #define	buf_init	arc_buf_init
 #define	freemem		uvmexp.free
 #define	minfree		uvmexp.freemin
@@ -160,30 +301,58 @@ static struct callback_entry arc_kva_rec
 
 #endif	/* __NetBSD__ */
 
-static kmutex_t		arc_reclaim_thr_lock;
-static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
-static uint8_t		arc_thread_exit;
-
-extern int zfs_write_limit_shift;
-extern uint64_t zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
-
-#define	ARC_REDUCE_DNLC_PERCENT	3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
-
-typedef enum arc_reclaim_strategy {
-	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
-	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
-} arc_reclaim_strategy_t;
+static kmutex_t		arc_reclaim_lock;
+static kcondvar_t	arc_reclaim_thread_cv;
+static boolean_t	arc_reclaim_thread_exit;
+static kcondvar_t	arc_reclaim_waiters_cv;
+
+#ifdef __FreeBSD__
+static kmutex_t		arc_dnlc_evicts_lock;
+static kcondvar_t	arc_dnlc_evicts_cv;
+static boolean_t	arc_dnlc_evicts_thread_exit;
+
+uint_t arc_reduce_dnlc_percent = 3;
+#endif
+
+/*
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
+ */
+int zfs_arc_evict_batch_limit = 10;
+
+/*
+ * The number of sublists used for each of the arc state lists. If this
+ * is not set to a suitable value by the user, it will be configured to
+ * the number of CPUs on the system in arc_init().
+ */
+int zfs_arc_num_sublists_per_state = 0;
 
 /* number of seconds before growing cache again */
 static int		arc_grow_retry = 60;
 
+/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+int		zfs_arc_overflow_shift = 8;
+
 /* shift of arc_c for calculating both min and max arc_p */
 static int		arc_p_min_shift = 4;
 
 /* log2(fraction of arc to reclaim) */
-static int		arc_shrink_shift = 5;
+static int		arc_shrink_shift = 7;
+
+/*
+ * log2(fraction of ARC which must be free to allow growing).
+ * I.e., if there is less than arc_c >> arc_no_grow_shift free memory,
+ * when reading a new block into the ARC, we will evict an equal-sized block
+ * from the ARC.
+ *
+ * This must be less than arc_shrink_shift, so that when we shrink the ARC,
+ * we will still not allow it to grow.
+ */
+int			arc_no_grow_shift = 5;
+
 
 /*
  * minimum lifespan of a prefetch block in clock ticks
@@ -191,7 +360,13 @@ static int		arc_shrink_shift = 5;
  */
 static int		arc_min_prefetch_lifespan;
 
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
+
 static int arc_dead;
+extern boolean_t zfs_prefetch_disable;
 
 /*
  * The arc has filled available memory and has now warmed up.
@@ -204,9 +379,89 @@ static boolean_t arc_warm;
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
+uint64_t zfs_arc_meta_min = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
+uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+u_int zfs_arc_free_target = 0;
+
+/* Absolute min for arc min / max is 16MB. */
+static uint64_t arc_abs_min = 16 << 20;
+
+boolean_t zfs_compressed_arc_enabled = B_TRUE;
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
+
+static void
+arc_free_target_init(void *unused __unused)
+{
+
+	zfs_arc_free_target = vm_pageout_wakeup_thresh;
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+    arc_free_target_init, NULL);
+
+TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
+TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
+TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
+    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
+    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
+    &zfs_arc_average_blocksize, 0,
+    "ARC average blocksize");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
+    &arc_shrink_shift, 0,
+    "log2(fraction of arc to reclaim)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
+    &zfs_compressed_arc_enabled, 0, "Enable compressed ARC");
+
+/*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
+    sysctl_vfs_zfs_arc_free_target, "IU",
+    "Desired number of free pages below which ARC triggers reclaim");
+
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+	u_int val;
+	int err;
+
+	val = zfs_arc_free_target;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < minfree)
+		return (EINVAL);
+	if (val > vm_cnt.v_page_count)
+		return (EINVAL);
+
+	zfs_arc_free_target = val;
+
+	return (0);
+}
+
+/*
+ * Must be declared here, before the definition of corresponding kstat
+ * macro which uses the same names will confuse the compiler.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
+    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+    sysctl_vfs_zfs_arc_meta_limit, "QU",
+    "ARC metadata limit");
+#endif
 
 /*
  * Note that buffers can be in one of 6 states:
@@ -241,10 +496,19 @@ int zfs_arc_p_min_shift = 0;
  */
 
 typedef struct arc_state {
-	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
-	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
-	uint64_t arcs_size;	/* total amount of data in this state */
-	kmutex_t arcs_mtx;
+	/*
+	 * list of evictable buffers
+	 */
+	multilist_t arcs_list[ARC_BUFC_NUMTYPES];
+	/*
+	 * total amount of evictable data in this state
+	 */
+	refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
+	/*
+	 * total amount of data in this state; this includes: evictable,
+	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+	 */
+	refcount_t arcs_size;
 } arc_state_t;
 
 /* The 6 states: */
@@ -270,13 +534,30 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_mru_ghost_hits;
 	kstat_named_t arcstat_mfu_hits;
 	kstat_named_t arcstat_mfu_ghost_hits;
+	kstat_named_t arcstat_allocated;
 	kstat_named_t arcstat_deleted;
-	kstat_named_t arcstat_recycle_miss;
+	/*
+	 * Number of buffers that could not be evicted because the hash lock
+	 * was held by another thread.  The lock may not necessarily be held
+	 * by something using the same buffer, since hash locks are shared
+	 * by multiple buffers.
+	 */
 	kstat_named_t arcstat_mutex_miss;
+	/*
+	 * Number of buffers skipped because they have I/O in progress, are
+	 * indirect prefetch buffers that have not lived long enough, or are
+	 * not from the spa we're trying to evict from.
+	 */
 	kstat_named_t arcstat_evict_skip;
+	/*
+	 * Number of times arc_evict_state() was unable to evict enough
+	 * buffers to reach it's target amount.
+	 */
+	kstat_named_t arcstat_evict_not_enough;
 	kstat_named_t arcstat_evict_l2_cached;
 	kstat_named_t arcstat_evict_l2_eligible;
 	kstat_named_t arcstat_evict_l2_ineligible;
+	kstat_named_t arcstat_evict_l2_skip;
 	kstat_named_t arcstat_hash_elements;
 	kstat_named_t arcstat_hash_elements_max;
 	kstat_named_t arcstat_hash_collisions;
@@ -287,9 +568,157 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_c_min;
 	kstat_named_t arcstat_c_max;
 	kstat_named_t arcstat_size;
+	/*
+	 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata.
+	 * Note that the compressed bytes may match the uncompressed bytes
+	 * if the block is either not compressed or compressed arc is disabled.
+	 */
+	kstat_named_t arcstat_compressed_size;
+	/*
+	 * Uncompressed size of the data stored in b_pdata. If compressed
+	 * arc is disabled then this value will be identical to the stat
+	 * above.
+	 */
+	kstat_named_t arcstat_uncompressed_size;
+	/*
+	 * Number of bytes stored in all the arc_buf_t's. This is classified
+	 * as "overhead" since this data is typically short-lived and will
+	 * be evicted from the arc when it becomes unreferenced unless the
+	 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
+	 * values have been set (see comment in dbuf.c for more information).
+	 */
+	kstat_named_t arcstat_overhead_size;
+	/*
+	 * Number of bytes consumed by internal ARC structures necessary
+	 * for tracking purposes; these structures are not actually
+	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
+	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
+	 * caches), and arc_buf_t structures (allocated via arc_buf_t
+	 * cache).
+	 */
 	kstat_named_t arcstat_hdr_size;
+	/*
+	 * Number of bytes consumed by ARC buffers of type equal to
+	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
+	 * on disk user data (e.g. plain file contents).
+	 */
 	kstat_named_t arcstat_data_size;
+	/*
+	 * Number of bytes consumed by ARC buffers of type equal to
+	 * ARC_BUFC_METADATA. This is generally consumed by buffers
+	 * backing on disk data that is used for internal ZFS
+	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
+	 */
+	kstat_named_t arcstat_metadata_size;
+	/*
+	 * Number of bytes consumed by various buffers and structures
+	 * not actually backed with ARC buffers. This includes bonus
+	 * buffers (allocated directly via zio_buf_* functions),
+	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
+	 * cache), and dnode_t structures (allocated via dnode_t cache).
+	 */
 	kstat_named_t arcstat_other_size;
+	/*
+	 * Total number of bytes consumed by ARC buffers residing in the
+	 * arc_anon state. This includes *all* buffers in the arc_anon
+	 * state; e.g. data, metadata, evictable, and unevictable buffers
+	 * are all included in this value.
+	 */
+	kstat_named_t arcstat_anon_size;
+	/*
+	 * Number of bytes consumed by ARC buffers that meet the
+	 * following criteria: backing buffers of type ARC_BUFC_DATA,
+	 * residing in the arc_anon state, and are eligible for eviction
+	 * (e.g. have no outstanding holds on the buffer).
+	 */
+	kstat_named_t arcstat_anon_evictable_data;
+	/*
+	 * Number of bytes consumed by ARC buffers that meet the
+	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
+	 * residing in the arc_anon state, and are eligible for eviction
+	 * (e.g. have no outstanding holds on the buffer).
+	 */
+	kstat_named_t arcstat_anon_evictable_metadata;
+	/*
+	 * Total number of bytes consumed by ARC buffers residing in the
+	 * arc_mru state. This includes *all* buffers in the arc_mru
+	 * state; e.g. data, metadata, evictable, and unevictable buffers
+	 * are all included in this value.
+	 */
+	kstat_named_t arcstat_mru_size;
+	/*
+	 * Number of bytes consumed by ARC buffers that meet the
+	 * following criteria: backing buffers of type ARC_BUFC_DATA,
+	 * residing in the arc_mru state, and are eligible for eviction
+	 * (e.g. have no outstanding holds on the buffer).
+	 */
+	kstat_named_t arcstat_mru_evictable_data;
+	/*
+	 * Number of bytes consumed by ARC buffers that meet the
+	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
+	 * residing in the arc_mru state, and are eligible for eviction
+	 * (e.g. have no outstanding holds on the buffer).
+	 */
+	kstat_named_t arcstat_mru_evictable_metadata;
+	/*
+	 * Total number of bytes that *would have been* consumed by ARC
+	 * buffers in the arc_mru_ghost state. The key thing to note
+	 * here is that this size doesn't actually indicate
+	 * RAM consumption. The ghost lists only consist of headers and
+	 * don't actually have ARC buffers linked off of these headers.
+	 * Thus, *if* the headers had associated ARC buffers, these
+	 * buffers *would have* consumed this number of bytes.
+	 */
+	kstat_named_t arcstat_mru_ghost_size;
+	/*
+	 * Number of bytes that *would have been* consumed by ARC
+	 * buffers that are eligible for eviction, of type
+	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
+	 */
+	kstat_named_t arcstat_mru_ghost_evictable_data;
+	/*
+	 * Number of bytes that *would have been* consumed by ARC
+	 * buffers that are eligible for eviction, of type
+	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+	 */
+	kstat_named_t arcstat_mru_ghost_evictable_metadata;
+	/*
+	 * Total number of bytes consumed by ARC buffers residing in the
+	 * arc_mfu state. This includes *all* buffers in the arc_mfu
+	 * state; e.g. data, metadata, evictable, and unevictable buffers
+	 * are all included in this value.
+	 */
+	kstat_named_t arcstat_mfu_size;
+	/*
+	 * Number of bytes consumed by ARC buffers that are eligible for
+	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
+	 * state.
+	 */
+	kstat_named_t arcstat_mfu_evictable_data;
+	/*
+	 * Number of bytes consumed by ARC buffers that are eligible for
+	 * eviction, of type ARC_BUFC_METADATA, and reside in the
+	 * arc_mfu state.
+	 */
+	kstat_named_t arcstat_mfu_evictable_metadata;
+	/*
+	 * Total number of bytes that *would have been* consumed by ARC
+	 * buffers in the arc_mfu_ghost state. See the comment above
+	 * arcstat_mru_ghost_size for more details.
+	 */
+	kstat_named_t arcstat_mfu_ghost_size;
+	/*
+	 * Number of bytes that *would have been* consumed by ARC
+	 * buffers that are eligible for eviction, of type
+	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
+	 */
+	kstat_named_t arcstat_mfu_ghost_evictable_data;
+	/*
+	 * Number of bytes that *would have been* consumed by ARC
+	 * buffers that are eligible for eviction, of type
+	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
+	 */
+	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
 	kstat_named_t arcstat_l2_hits;
 	kstat_named_t arcstat_l2_misses;
 	kstat_named_t arcstat_l2_feeds;
@@ -299,16 +728,36 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_l2_writes_sent;
 	kstat_named_t arcstat_l2_writes_done;
 	kstat_named_t arcstat_l2_writes_error;
-	kstat_named_t arcstat_l2_writes_hdr_miss;
+	kstat_named_t arcstat_l2_writes_lock_retry;
 	kstat_named_t arcstat_l2_evict_lock_retry;
 	kstat_named_t arcstat_l2_evict_reading;
+	kstat_named_t arcstat_l2_evict_l1cached;
 	kstat_named_t arcstat_l2_free_on_write;
 	kstat_named_t arcstat_l2_abort_lowmem;
 	kstat_named_t arcstat_l2_cksum_bad;
 	kstat_named_t arcstat_l2_io_error;
 	kstat_named_t arcstat_l2_size;
+	kstat_named_t arcstat_l2_asize;
 	kstat_named_t arcstat_l2_hdr_size;
+	kstat_named_t arcstat_l2_write_trylock_fail;
+	kstat_named_t arcstat_l2_write_passed_headroom;
+	kstat_named_t arcstat_l2_write_spa_mismatch;
+	kstat_named_t arcstat_l2_write_in_l2;
+	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
+	kstat_named_t arcstat_l2_write_not_cacheable;
+	kstat_named_t arcstat_l2_write_full;
+	kstat_named_t arcstat_l2_write_buffer_iter;
+	kstat_named_t arcstat_l2_write_pios;
+	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
+	kstat_named_t arcstat_l2_write_buffer_list_iter;
+	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
 	kstat_named_t arcstat_memory_throttle_count;
+	kstat_named_t arcstat_meta_used;
+	kstat_named_t arcstat_meta_limit;
+	kstat_named_t arcstat_meta_max;
+	kstat_named_t arcstat_meta_min;
+	kstat_named_t arcstat_sync_wait_for_async;
+	kstat_named_t arcstat_demand_hit_predictive_prefetch;
 } arc_stats_t;
 
 static arc_stats_t arc_stats = {
@@ -326,13 +775,15 @@ static arc_stats_t arc_stats = {
 	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "mfu_hits",			KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
+	{ "allocated",			KSTAT_DATA_UINT64 },
 	{ "deleted",			KSTAT_DATA_UINT64 },
-	{ "recycle_miss",		KSTAT_DATA_UINT64 },
 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
 	{ "evict_skip",			KSTAT_DATA_UINT64 },
+	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
+	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
 	{ "hash_elements",		KSTAT_DATA_UINT64 },
 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
@@ -343,9 +794,28 @@ static arc_stats_t arc_stats = {
 	{ "c_min",			KSTAT_DATA_UINT64 },
 	{ "c_max",			KSTAT_DATA_UINT64 },
 	{ "size",			KSTAT_DATA_UINT64 },
+	{ "compressed_size",		KSTAT_DATA_UINT64 },
+	{ "uncompressed_size",		KSTAT_DATA_UINT64 },
+	{ "overhead_size",		KSTAT_DATA_UINT64 },
 	{ "hdr_size",			KSTAT_DATA_UINT64 },
 	{ "data_size",			KSTAT_DATA_UINT64 },
+	{ "metadata_size",		KSTAT_DATA_UINT64 },
 	{ "other_size",			KSTAT_DATA_UINT64 },
+	{ "anon_size",			KSTAT_DATA_UINT64 },
+	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
+	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
+	{ "mru_size",			KSTAT_DATA_UINT64 },
+	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
+	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
+	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
+	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
+	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
+	{ "mfu_size",			KSTAT_DATA_UINT64 },
+	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
+	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
+	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
+	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
+	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 	{ "l2_hits",			KSTAT_DATA_UINT64 },
 	{ "l2_misses",			KSTAT_DATA_UINT64 },
 	{ "l2_feeds",			KSTAT_DATA_UINT64 },
@@ -355,22 +825,42 @@ static arc_stats_t arc_stats = {
 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
-	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
+	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
+	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
 	{ "l2_size",			KSTAT_DATA_UINT64 },
+	{ "l2_asize",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
-	{ "memory_throttle_count",	KSTAT_DATA_UINT64 }
+	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
+	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
+	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
+	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
+	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
+	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
+	{ "l2_write_full",		KSTAT_DATA_UINT64 },
+	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
+	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
+	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
+	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
+	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
+	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
+	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
+	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
+	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
+	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
+	{ "sync_wait_for_async",	KSTAT_DATA_UINT64 },
+	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
 };
 
 #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
 
 #define	ARCSTAT_INCR(stat, val) \
-	atomic_add_64(&arc_stats.stat.value.ui64, (val));
+	atomic_add_64(&arc_stats.stat.value.ui64, (val))
 
 #define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
 #define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
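The semicolon dropped from ARCSTAT_INCR above is a genuine fix rather than
churn: the old expansion ended in ';;', so an unbraced if/else around a bump
failed to compile, e.g.:

	if (hit)
		ARCSTAT_BUMP(arcstat_hits);	/* old macro left a stray ';' here... */
	else					/* ...orphaning this 'else' */
		ARCSTAT_BUMP(arcstat_misses);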
@@ -426,15 +916,21 @@ static arc_state_t	*arc_l2c_only;
 #define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
 #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
+#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
+#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
+#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
+
+/* compressed size of entire arc */
+#define	arc_compressed_size	ARCSTAT(arcstat_compressed_size)
+/* uncompressed size of entire arc */
+#define	arc_uncompressed_size	ARCSTAT(arcstat_uncompressed_size)
+/* number of bytes in the arc from arc_buf_t's */
+#define	arc_overhead_size	ARCSTAT(arcstat_overhead_size)
 
 static int		arc_no_grow;	/* Don't try to grow cache size */
 static uint64_t		arc_tempreserve;
 static uint64_t		arc_loaned_bytes;
-static uint64_t		arc_meta_used;
-static uint64_t		arc_meta_limit;
-static uint64_t		arc_meta_max = 0;
-
-typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 
 typedef struct arc_callback arc_callback_t;
 
@@ -451,35 +947,64 @@ typedef struct arc_write_callback arc_wr
 struct arc_write_callback {
 	void		*awcb_private;
 	arc_done_func_t	*awcb_ready;
+	arc_done_func_t	*awcb_children_ready;
+	arc_done_func_t	*awcb_physdone;
 	arc_done_func_t	*awcb_done;
 	arc_buf_t	*awcb_buf;
 };
 
-struct arc_buf_hdr {
-	/* protected by hash lock */
-	dva_t			b_dva;
-	uint64_t		b_birth;
-	uint64_t		b_cksum0;
-
+/*
+ * ARC buffers are separated into multiple structs as a memory saving measure:
+ *   - Common fields struct, always defined, and embedded within it:
+ *       - L2-only fields, always allocated but undefined when not in L2ARC
+ *       - L1-only fields, only allocated when in L1ARC
+ *
+ *           Buffer in L1                     Buffer only in L2
+ *    +------------------------+          +------------------------+
+ *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
+ *    |                        |          |                        |
+ *    |                        |          |                        |
+ *    |                        |          |                        |
+ *    +------------------------+          +------------------------+
+ *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
+ *    | (undefined if L1-only) |          |                        |
+ *    +------------------------+          +------------------------+
+ *    | l1arc_buf_hdr_t        |
+ *    |                        |
+ *    |                        |
+ *    |                        |
+ *    |                        |
+ *    +------------------------+
+ *
+ * Because it's possible for the L2ARC to become extremely large, we can wind
+ * up eating a lot of memory in L2ARC buffer headers, so the size of a header
+ * is minimized by only allocating the fields necessary for an L1-cached buffer
+ * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
+ * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
+ * words in pointers. arc_hdr_realloc() is used to switch a header between
+ * these two allocation states.
+ */
+typedef struct l1arc_buf_hdr {
 	kmutex_t		b_freeze_lock;
 	zio_cksum_t		*b_freeze_cksum;
+#ifdef ZFS_DEBUG
+	/*
+	 * used for debugging with kmem_flags - by allocating and freeing
+	 * b_thawed when the buffer is thawed, we get a record of the stack
+	 * trace that thawed it.
+	 */
+	void			*b_thawed;
+#endif
 
-	arc_buf_hdr_t		*b_hash_next;
 	arc_buf_t		*b_buf;
-	uint32_t		b_flags;
-	uint32_t		b_datacnt;
-
-	arc_callback_t		*b_acb;
+	uint32_t		b_bufcnt;
+	/* for waiting on writes to complete */
 	kcondvar_t		b_cv;
-
-	/* immutable */
-	arc_buf_contents_t	b_type;
-	uint64_t		b_size;
-	uint64_t		b_spa;
+	uint8_t			b_byteswap;
 
 	/* protected by arc state mutex */
 	arc_state_t		*b_state;
-	list_node_t		b_arc_node;
+	multilist_node_t	b_arc_node;
 
 	/* updated atomically */
 	clock_t			b_arc_access;
@@ -487,69 +1012,207 @@ struct arc_buf_hdr {
 	/* self protecting */
 	refcount_t		b_refcnt;
 
-	l2arc_buf_hdr_t		*b_l2hdr;
+	arc_callback_t		*b_acb;
+	void			*b_pdata;
+} l1arc_buf_hdr_t;
+
+typedef struct l2arc_dev l2arc_dev_t;
+
+typedef struct l2arc_buf_hdr {
+	/* protected by arc_buf_hdr mutex */
+	l2arc_dev_t		*b_dev;		/* L2ARC device */
+	uint64_t		b_daddr;	/* disk address, offset byte */
+
 	list_node_t		b_l2node;
+} l2arc_buf_hdr_t;
+
+struct arc_buf_hdr {
+	/* protected by hash lock */
+	dva_t			b_dva;
+	uint64_t		b_birth;
+
+	arc_buf_contents_t	b_type;
+	arc_buf_hdr_t		*b_hash_next;
+	arc_flags_t		b_flags;
+
+	/*
+	 * This field stores the size of the data buffer after
+	 * compression, and is set in the arc's zio completion handlers.
+	 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
+	 *
+	 * While the block pointers can store up to 32MB in their psize
+	 * field, we can only store up to 32MB minus 512B. This is due
+	 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
+	 * a field of zeros represents 512B in the bp). We can't use a
+	 * bias of 1 since we need to reserve a psize of zero, here, to
+	 * represent holes and embedded blocks.
+	 *
+	 * This isn't a problem in practice, since the maximum size of a
+	 * buffer is limited to 16MB, so we never need to store 32MB in
+	 * this field. Even in the upstream illumos code base, the
+	 * maximum size of a buffer is limited to 16MB.
+	 */
+	uint16_t		b_psize;
+
+	/*
+	 * This field stores the size of the data buffer before
+	 * compression, and cannot change once set. It is in units
+	 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
+	 */
+	uint16_t		b_lsize;	/* immutable */
+	uint64_t		b_spa;		/* immutable */
+
+	/* L2ARC fields. Undefined when not in L2ARC. */
+	l2arc_buf_hdr_t		b_l2hdr;
+	/* L1ARC fields. Undefined when in l2arc_only state */
+	l1arc_buf_hdr_t		b_l1hdr;
 };
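A worked example of the encoding above (numbers illustrative): a 128 KB
logical block that compresses to 4 KB on disk is stored as b_lsize = 256 and
b_psize = 8, both counting 512-byte SPA_MINBLOCKSIZE units; a hole or embedded
block keeps b_psize = 0, which is exactly why the bp's bias-of-1 encoding
cannot be reused here.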
 
-static arc_buf_t *arc_eviction_list;
-static kmutex_t arc_eviction_mtx;
-static arc_buf_hdr_t arc_eviction_hdr;
-static void arc_get_data_buf(arc_buf_t *buf);
-static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
-static int arc_evict_needed(arc_buf_contents_t type);
-static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
+#if defined(__FreeBSD__) && defined(_KERNEL)
+static int
+sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = arc_meta_limit;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val <= 0 || val > arc_c_max)
+		return (EINVAL);
+
+	arc_meta_limit = val;
+	return (0);
+}
+
+static int
+sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = zfs_arc_max;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (zfs_arc_max == 0) {
+		/* Loader tunable so blindly set */
+		zfs_arc_max = val;
+		return (0);
+	}
+
+	if (val < arc_abs_min || val > kmem_size())
+		return (EINVAL);
+	if (val < arc_c_min)
+		return (EINVAL);
+	if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
+		return (EINVAL);
+
+	arc_c_max = val;
+
+	arc_c = arc_c_max;
+	arc_p = (arc_c >> 1);
+
+	if (zfs_arc_meta_limit == 0) {
+		/* limit meta-data to 1/4 of the arc capacity */
+		arc_meta_limit = arc_c_max / 4;
+	}
+
+	/* if kmem_flags are set, let's try to use less memory */
+	if (kmem_debugging())
+		arc_c = arc_c / 2;
+
+	zfs_arc_max = arc_c;
+
+	return (0);
+}
+
+static int
+sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = zfs_arc_min;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (zfs_arc_min == 0) {
+		/* Loader tunable so blindly set */
+		zfs_arc_min = val;
+		return (0);
+	}
+
+	if (val < arc_abs_min || val > arc_c_max)
+		return (EINVAL);
+
+	arc_c_min = val;
+
+	if (zfs_arc_meta_min == 0)
+		arc_meta_min = arc_c_min / 2;
 
-static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
+	if (arc_c < arc_c_min)
+		arc_c = arc_c_min;
+
+	zfs_arc_min = arc_c_min;
+
+	return (0);
+}
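+
+/*
+ * The handlers above are attached to sysctl nodes via SYSCTL_PROC.  A
+ * representative registration, for illustration only (the exact flags
+ * used by the port may differ):
+ *
+ *	SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
+ *	    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+ *	    sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
+ */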
+#endif
 
 #define	GHOST_STATE(state)	\
 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
 	(state) == arc_l2c_only)
 
-/*
- * Private ARC flags.  These flags are private ARC only flags that will show up
- * in b_flags in the arc_hdr_buf_t.  Some flags are publicly declared, and can
- * be passed in as arc_flags in things like arc_read.  However, these flags
- * should never be passed and should only be set by ARC code.  When adding new
- * public flags, make sure not to smash the private ones.
- */
-
-#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
-#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
-#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
-#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
-#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
-#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
-#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
-#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
-#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
-#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
-
-#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
-#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
-#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
-#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
-#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
-#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
-#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
-#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
-#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
-				    (hdr)->b_l2hdr != NULL)
-#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
-#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
-#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
+#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
+#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
+#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
+#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define	HDR_COMPRESSION_ENABLED(hdr)	\
+	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
+
+#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
+#define	HDR_L2_READING(hdr)	\
+	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
+	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
+#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
+#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
+#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+#define	HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
+
+#define	HDR_ISTYPE_METADATA(hdr)	\
+	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
+#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
+
+#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
+#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
+
+/* For storing compression mode in b_flags */
+#define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)
+
+#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
+	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
+#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
+	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
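+/* e.g. HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4) records LZ4 in b_flags */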
+
+#define	ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
 
 /*
  * Other sizes
  */
 
-#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
-#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
+#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 
 /*
  * Hash table routines
  */
 
-#define	HT_LOCK_PAD	64
+#define	HT_LOCK_PAD	CACHE_LINE_SIZE
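+
+/*
+ * Pad each hash-chain lock out to CACHE_LINE_SIZE so that the BUF_LOCKS
+ * locks in buf_hash_table sit on separate cache lines and do not
+ * false-share under concurrent lookups.
+ */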
 
 struct ht_lock {
 	kmutex_t	ht_lock;
@@ -562,7 +1225,7 @@ struct ht_lock {
 typedef struct buf_hash_table {
 	uint64_t ht_mask;
 	arc_buf_hdr_t **ht_table;
-	struct ht_lock ht_locks[BUF_LOCKS];
+	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
 } buf_hash_table_t;
 
 static buf_hash_table_t buf_hash_table;
@@ -571,8 +1234,8 @@ static buf_hash_table_t buf_hash_table;
 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
-#define	HDR_LOCK(buf) \
-	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
+#define	HDR_LOCK(hdr) \
+	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 
 uint64_t zfs_crc64_table[256];
 
@@ -581,59 +1244,127 @@ uint64_t zfs_crc64_table[256];
  */
 
 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
-#define	L2ARC_HEADROOM		2		/* num of writes */
+#define	L2ARC_HEADROOM		2			/* num of writes */
+/*
+ * If we discover during ARC scan any buffers to be compressed, we boost
+ * our headroom for the next scanning cycle by this percentage multiple.
+ */
+#define	L2ARC_HEADROOM_BOOST	200
 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
 #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
 #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
 
-/*
- * L2ARC Performance Tunables
- */
+/* L2ARC Performance Tunables */
 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
+uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
 boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
 boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
 boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
 
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
+    &l2arc_write_max, 0, "max write size");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
+    &l2arc_write_boost, 0, "extra write during warmup");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
+    &l2arc_headroom, 0, "number of dev writes");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
+    &l2arc_feed_secs, 0, "interval seconds");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
+    &l2arc_feed_min_ms, 0, "min interval milliseconds");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
+    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
+    &l2arc_feed_again, 0, "turbo warmup");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
+    &l2arc_norw, 0, "no reads during writes");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
+    &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
+    &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
+    &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in anonymous state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
+    &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
+    &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
+    &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mru state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mru ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
+    &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
+    &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
+    &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mfu state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mfu ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
+    &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state");
+
 /*
  * L2ARC Internals
  */
-typedef struct l2arc_dev {
+struct l2arc_dev {
 	vdev_t			*l2ad_vdev;	/* vdev */
 	spa_t			*l2ad_spa;	/* spa */
 	uint64_t		l2ad_hand;	/* next write location */
-	uint64_t		l2ad_write;	/* desired write size, bytes */
-	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
 	uint64_t		l2ad_start;	/* first addr on device */
 	uint64_t		l2ad_end;	/* last addr on device */
-	uint64_t		l2ad_evict;	/* last addr eviction reached */
 	boolean_t		l2ad_first;	/* first sweep through */
 	boolean_t		l2ad_writing;	/* currently writing */
-	list_t			*l2ad_buflist;	/* buffer list */
+	kmutex_t		l2ad_mtx;	/* lock for buffer list */
+	list_t			l2ad_buflist;	/* buffer list */
 	list_node_t		l2ad_node;	/* device list node */
-} l2arc_dev_t;
+	refcount_t		l2ad_alloc;	/* allocated bytes */
+};
 
 static list_t L2ARC_dev_list;			/* device list */
 static list_t *l2arc_dev_list;			/* device list pointer */
 static kmutex_t l2arc_dev_mtx;			/* device list mutex */
 static l2arc_dev_t *l2arc_dev_last;		/* last device used */
-static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
 static list_t L2ARC_free_on_write;		/* free after write buf list */
 static list_t *l2arc_free_on_write;		/* free after write list ptr */
 static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
 static uint64_t l2arc_ndev;			/* number of devices */
 
 typedef struct l2arc_read_callback {
-	arc_buf_t	*l2rcb_buf;		/* read buffer */
-	spa_t		*l2rcb_spa;		/* spa */
-	blkptr_t	l2rcb_bp;		/* original blkptr */
-	zbookmark_t	l2rcb_zb;		/* original bookmark */
-	int		l2rcb_flags;		/* original flags */
+	arc_buf_hdr_t		*l2rcb_hdr;		/* read buffer */
+	blkptr_t		l2rcb_bp;		/* original blkptr */
+	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
+	int			l2rcb_flags;		/* original flags */
+	void			*l2rcb_data;		/* temporary buffer */
 } l2arc_read_callback_t;
 
 typedef struct l2arc_write_callback {
@@ -641,17 +1372,11 @@ typedef struct l2arc_write_callback {
 	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
 } l2arc_write_callback_t;
 
-struct l2arc_buf_hdr {
-	/* protected by arc_buf_hdr  mutex */
-	l2arc_dev_t	*b_dev;			/* L2ARC device */
-	uint64_t	b_daddr;		/* disk address, offset byte */
-};
-
 typedef struct l2arc_data_free {
 	/* protected by l2arc_free_on_write_mtx */
 	void		*l2df_data;
 	size_t		l2df_size;
-	void		(*l2df_func)(void *, size_t);
+	arc_buf_contents_t l2df_type;
 	list_node_t	l2df_list_node;
 } l2arc_data_free_t;
 
@@ -659,9 +1384,35 @@ static kmutex_t l2arc_feed_thr_lock;
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
 
-static void l2arc_read_done(zio_t *zio);
-static void l2arc_hdr_stat_add(void);
-static void l2arc_hdr_stat_remove(void);
+static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
+static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr);
+static void arc_hdr_alloc_pdata(arc_buf_hdr_t *);
+static void arc_access(arc_buf_hdr_t *, kmutex_t *);
+static boolean_t arc_is_overflowing();
+static void arc_buf_watch(arc_buf_t *);
+
+static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
+static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
+static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+
+static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
+static void l2arc_read_done(zio_t *);
+
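+/*
+ * Issue a TRIM for the device space occupied by an L2ARC buffer.  The
+ * caller must hold the owning device's l2ad_mtx; headers with a psize
+ * of zero occupy no space on the device and are skipped.
+ */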
+static void
+l2arc_trim(const arc_buf_hdr_t *hdr)
+{
+	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+	ASSERT(HDR_HAS_L2HDR(hdr));
+	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+
+	if (HDR_GET_PSIZE(hdr) != 0) {
+		trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
+		    HDR_GET_PSIZE(hdr), 0);
+	}
+}
 
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
@@ -680,29 +1431,38 @@ buf_hash(uint64_t spa, const dva_t *dva,
 	return (crc);
 }
 
-#define	BUF_EMPTY(buf)						\
-	((buf)->b_dva.dva_word[0] == 0 &&			\
-	(buf)->b_dva.dva_word[1] == 0 &&			\
-	(buf)->b_birth == 0)
-
-#define	BUF_EQUAL(spa, dva, birth, buf)				\
-	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
-	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
-	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+#define	HDR_EMPTY(hdr)						\
+	((hdr)->b_dva.dva_word[0] == 0 &&			\
+	(hdr)->b_dva.dva_word[1] == 0)
+
+#define	HDR_EQUAL(spa, dva, birth, hdr)				\
+	((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
+	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
+	((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
+
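+/*
+ * Clear the on-disk identity (DVA and birth txg) of a header, making
+ * HDR_EMPTY() true for it.
+ */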
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+	hdr->b_dva.dva_word[0] = 0;
+	hdr->b_dva.dva_word[1] = 0;
+	hdr->b_birth = 0;
+}
 
 static arc_buf_hdr_t *
-buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
+buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 {
+	const dva_t *dva = BP_IDENTITY(bp);
+	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
-	arc_buf_hdr_t *buf;
+	arc_buf_hdr_t *hdr;
 
 	mutex_enter(hash_lock);
-	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
-	    buf = buf->b_hash_next) {
-		if (BUF_EQUAL(spa, dva, birth, buf)) {
+	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
+	    hdr = hdr->b_hash_next) {
+		if (HDR_EQUAL(spa, dva, birth, hdr)) {
 			*lockp = hash_lock;
-			return (buf);
+			return (hdr);
 		}
 	}
 	mutex_exit(hash_lock);
@@ -715,27 +1475,36 @@ buf_hash_find(uint64_t spa, const dva_t 
  * equal to elem in the hash table, then the already existing element
  * will be returned and the new element will not be inserted.
  * Otherwise returns NULL.
+ * If lockp == NULL, the caller is assumed to already hold the hash lock.
  */
 static arc_buf_hdr_t *
-buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
+buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
 {
-	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
-	arc_buf_hdr_t *fbuf;
+	arc_buf_hdr_t *fhdr;
 	uint32_t i;
 
-	ASSERT(!HDR_IN_HASH_TABLE(buf));
-	*lockp = hash_lock;
-	mutex_enter(hash_lock);
-	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
-	    fbuf = fbuf->b_hash_next, i++) {
-		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
-			return (fbuf);
+	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
+	ASSERT(hdr->b_birth != 0);
+	ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+	if (lockp != NULL) {
+		*lockp = hash_lock;
+		mutex_enter(hash_lock);
+	} else {
+		ASSERT(MUTEX_HELD(hash_lock));
+	}
+
+	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
+	    fhdr = fhdr->b_hash_next, i++) {
+		if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
+			return (fhdr);
 	}
 
-	buf->b_hash_next = buf_hash_table.ht_table[idx];
-	buf_hash_table.ht_table[idx] = buf;
-	buf->b_flags |= ARC_IN_HASH_TABLE;
+	hdr->b_hash_next = buf_hash_table.ht_table[idx];
+	buf_hash_table.ht_table[idx] = hdr;
+	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
 	/* collect some hash table performance data */
 	if (i > 0) {
@@ -753,22 +1522,22 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmut
 }
 
 static void
-buf_hash_remove(arc_buf_hdr_t *buf)
+buf_hash_remove(arc_buf_hdr_t *hdr)
 {
-	arc_buf_hdr_t *fbuf, **bufp;
-	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+	arc_buf_hdr_t *fhdr, **hdrp;
+	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 
 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
-	ASSERT(HDR_IN_HASH_TABLE(buf));
+	ASSERT(HDR_IN_HASH_TABLE(hdr));
 
-	bufp = &buf_hash_table.ht_table[idx];
-	while ((fbuf = *bufp) != buf) {
-		ASSERT(fbuf != NULL);
-		bufp = &fbuf->b_hash_next;
-	}
-	*bufp = buf->b_hash_next;
-	buf->b_hash_next = NULL;
-	buf->b_flags &= ~ARC_IN_HASH_TABLE;
+	hdrp = &buf_hash_table.ht_table[idx];
+	while ((fhdr = *hdrp) != hdr) {
+		ASSERT3P(fhdr, !=, NULL);
+		hdrp = &fhdr->b_hash_next;
+	}
+	*hdrp = hdr->b_hash_next;
+	hdr->b_hash_next = NULL;
+	arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
 	/* collect some hash table performance data */
 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
@@ -781,7 +1550,8 @@ buf_hash_remove(arc_buf_hdr_t *buf)
 /*
  * Global data structures and functions for the buf kmem cache.
  */
-static kmem_cache_t *hdr_cache;
+static kmem_cache_t *hdr_full_cache;
+static kmem_cache_t *hdr_l2only_cache;
 static kmem_cache_t *buf_cache;
 
 static void
@@ -793,7 +1563,8 @@ buf_fini(void)
 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
 	for (i = 0; i < BUF_LOCKS; i++)
 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
-	kmem_cache_destroy(hdr_cache);
+	kmem_cache_destroy(hdr_full_cache);
+	kmem_cache_destroy(hdr_l2only_cache);
 	kmem_cache_destroy(buf_cache);
 }
 
@@ -803,15 +1574,34 @@ buf_fini(void)
  */
 /* ARGSUSED */
 static int
-hdr_cons(void *vbuf, void *unused, int kmflag)
+hdr_full_cons(void *vbuf, void *unused, int kmflag)
+{
+	arc_buf_hdr_t *hdr = vbuf;
+
+#ifdef __NetBSD__
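+	/* On NetBSD the object pointer arrives in the second argument. */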
+	hdr = unused;
+#endif
+	bzero(hdr, HDR_FULL_SIZE);
+	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
+	refcount_create(&hdr->b_l1hdr.b_refcnt);
+	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
+	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
 {
-	arc_buf_hdr_t *buf = unused;
+	arc_buf_hdr_t *hdr = vbuf;
 
-	bzero(buf, sizeof (arc_buf_hdr_t));
-	refcount_create(&buf->b_refcnt);
-	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
-	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
-	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+#ifdef __NetBSD__
+	hdr = unused;
+#endif
+	bzero(hdr, HDR_L2ONLY_SIZE);
+	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 
 	return (0);
 }
@@ -820,10 +1610,13 @@ hdr_cons(void *vbuf, void *unused, int k
 static int
 buf_cons(void *vbuf, void *unused, int kmflag)
 {
-	arc_buf_t *buf = unused;
+	arc_buf_t *buf = vbuf;
 
+#ifdef __NetBSD__
+	buf = unused;
+#endif
 	bzero(buf, sizeof (arc_buf_t));
-	rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
+	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 
 	return (0);
@@ -835,24 +1628,44 @@ buf_cons(void *vbuf, void *unused, int k
  */
 /* ARGSUSED */
 static void
-hdr_dest(void *vbuf, void *unused)
+hdr_full_dest(void *vbuf, void *unused)
+{
+	arc_buf_hdr_t *hdr = vbuf;
+
+#ifdef __NetBSD__
+	hdr = unused;
+#endif
+	ASSERT(HDR_EMPTY(hdr));
+	cv_destroy(&hdr->b_l1hdr.b_cv);
+	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
+	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+}
+
+/* ARGSUSED */
+static void
+hdr_l2only_dest(void *vbuf, void *unused)
 {
-	arc_buf_hdr_t *buf = unused;
+	arc_buf_hdr_t *hdr = vbuf;
 
-	ASSERT(BUF_EMPTY(buf));
-	refcount_destroy(&buf->b_refcnt);
-	cv_destroy(&buf->b_cv);
-	mutex_destroy(&buf->b_freeze_lock);
-	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+#ifdef __NetBSD__
+	hdr = unused;
+#endif
+	ASSERT(HDR_EMPTY(hdr));
+	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 }
 
 /* ARGSUSED */
 static void
 buf_dest(void *vbuf, void *unused)
 {
-	arc_buf_t *buf = unused;
+	arc_buf_t *buf = vbuf;
 
-	rw_destroy(&buf->b_lock);
+#ifdef __NetBSD__
+	buf = unused;
+#endif
+	mutex_destroy(&buf->b_evict_lock);
 	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
@@ -869,7 +1682,7 @@ hdr_recl(void *unused)
 	 * which is after we do arc_fini().
 	 */
 	if (!arc_dead)
-		cv_signal(&arc_reclaim_thr_cv);
+		cv_signal(&arc_reclaim_thread_cv);
 }
 
 static void
@@ -881,10 +1694,11 @@ buf_init(void)
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
-	 * with an average 64K block size.  The table will take up
-	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
+	 * with an average block size of zfs_arc_average_blocksize (default 8K).
+	 * By default, the table will take up
+	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
 	 */
-	while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
+	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
 		hsize <<= 1;
 retry:
 	buf_hash_table.ht_mask = hsize - 1;
@@ -896,8 +1710,11 @@ retry:
 		goto retry;
 	}
 
-	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
-	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
+	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
+	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
+	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
+	    NULL, NULL, 0);
 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 
@@ -913,232 +1730,709 @@ retry:
 
 #define	ARC_MINTIME	(hz>>4) /* 62 ms */
 
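+/*
+ * An arc_buf_t is "shared" when its b_data aliases the hdr's b_pdata
+ * rather than owning a private copy; a shared buf must be the last
+ * buffer on the hdr's list.
+ */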
+static inline boolean_t
+arc_buf_is_shared(arc_buf_t *buf)
+{
+	boolean_t shared = (buf->b_data != NULL &&
+	    buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
+	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
+	return (shared);
+}
+
+static inline void
+arc_cksum_free(arc_buf_hdr_t *hdr)
+{
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+		kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
+		hdr->b_l1hdr.b_freeze_cksum = NULL;
+	}
+	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+}
+
 static void
 arc_cksum_verify(arc_buf_t *buf)
 {
+	arc_buf_hdr_t *hdr = buf->b_hdr;
 	zio_cksum_t zc;
 
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
-	mutex_enter(&buf->b_hdr->b_freeze_lock);
-	if (buf->b_hdr->b_freeze_cksum == NULL ||
-	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
-		mutex_exit(&buf->b_hdr->b_freeze_lock);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+
+	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+	if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
+		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 		return;
 	}
-	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
-	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
+	fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc);
+	if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
 		panic("buffer modified while frozen!");
-	mutex_exit(&buf->b_hdr->b_freeze_lock);
+	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 }
 
-static int
-arc_cksum_equal(arc_buf_t *buf)
+static boolean_t
+arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
 {
-	zio_cksum_t zc;
-	int equal;
+	enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
+	boolean_t valid_cksum;
 
-	mutex_enter(&buf->b_hdr->b_freeze_lock);
-	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
-	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
-	mutex_exit(&buf->b_hdr->b_freeze_lock);
+	ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
+	VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
+
+	/*
+	 * We rely on the blkptr's checksum to determine if the block
+	 * is valid or not. When compressed arc is enabled, the l2arc
+	 * writes the block to the l2arc just as it appears in the pool.
+	 * This allows us to use the blkptr's checksum to validate the
+	 * data that we just read off of the l2arc without having to store
+	 * a separate checksum in the arc_buf_hdr_t. However, if compressed
+	 * arc is disabled, then the data written to the l2arc is always
+	 * uncompressed and won't match the block as it exists in the main
+	 * pool. When this is the case, we must first compress it if it is
+	 * compressed on the main pool before we can validate the checksum.
+	 */
+	if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+		uint64_t lsize = HDR_GET_LSIZE(hdr);
+		uint64_t csize;
+
+		void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr));
+		csize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+		ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
+		if (csize < HDR_GET_PSIZE(hdr)) {
+			/*
+			 * Compressed blocks are always a multiple of the
+			 * smallest ashift in the pool. Ideally, we would
+			 * like to round up the csize to the next
+			 * spa_min_ashift but that value may have changed
+			 * since the block was last written. Instead,
+			 * we rely on the fact that the hdr's psize
+			 * was set to the psize of the block when it was
+			 * last written. We set the csize to that value
+			 * and zero out any part that should not contain
+			 * data.
+			 */
+			bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize);
+			csize = HDR_GET_PSIZE(hdr);
+		}
+		zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL);
+	}
 
-	return (equal);
+	/*
+	 * Block pointers always store the checksum for the logical data.
+	 * If the block pointer has the gang bit set, then the checksum
+	 * it represents is for the reconstituted data and not for an
+	 * individual gang member. The zio pipeline, however, must be able to
+	 * determine the checksum of each of the gang constituents so it
+	 * treats the checksum comparison differently than what we need
+	 * for l2arc blocks. This prevents us from using the
+	 * zio_checksum_error() interface directly. Instead we must call the
+	 * zio_checksum_error_impl() so that we can ensure the checksum is
+	 * generated using the correct checksum algorithm and accounts for the
+	 * logical I/O size and not just a gang fragment.
+	 */
+	valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
+	    BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size,
+	    zio->io_offset, NULL) == 0);
+	zio_pop_transforms(zio);
+	return (valid_cksum);
 }
 
 static void
-arc_cksum_compute(arc_buf_t *buf, boolean_t force)
+arc_cksum_compute(arc_buf_t *buf)
 {
-	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+
+	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
-	mutex_enter(&buf->b_hdr->b_freeze_lock);
-	if (buf->b_hdr->b_freeze_cksum != NULL) {
-		mutex_exit(&buf->b_hdr->b_freeze_lock);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 		return;
 	}
-	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
-	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
-	    buf->b_hdr->b_freeze_cksum);
-	mutex_exit(&buf->b_hdr->b_freeze_lock);
+	hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+	    KM_SLEEP);
+	fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL,
+	    hdr->b_l1hdr.b_freeze_cksum);
+	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+#ifdef illumos
+	arc_buf_watch(buf);
+#endif
 }
 
-void
-arc_buf_thaw(arc_buf_t *buf)
+#ifdef illumos
+#ifndef _KERNEL
+typedef struct procctl {
+	long cmd;
+	prwatch_t prwatch;
+} procctl_t;
+#endif
+
+/* ARGSUSED */
+static void
+arc_buf_unwatch(arc_buf_t *buf)
 {
-	if (zfs_flags & ZFS_DEBUG_MODIFY) {
-		if (buf->b_hdr->b_state != arc_anon)
+#ifndef _KERNEL
+	if (arc_watch) {
+		int result;
+		procctl_t ctl;
+		ctl.cmd = PCWATCH;
+		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
+		ctl.prwatch.pr_size = 0;
+		ctl.prwatch.pr_wflags = 0;
+		result = write(arc_procfd, &ctl, sizeof (ctl));
+		ASSERT3U(result, ==, sizeof (ctl));
+	}
+#endif
+}
+
+/* ARGSUSED */
+static void
+arc_buf_watch(arc_buf_t *buf)
+{
+#ifndef _KERNEL
+	if (arc_watch) {
+		int result;
+		procctl_t ctl;
+		ctl.cmd = PCWATCH;
+		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
+		ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr);
+		ctl.prwatch.pr_wflags = WA_WRITE;
+		result = write(arc_procfd, &ctl, sizeof (ctl));
+		ASSERT3U(result, ==, sizeof (ctl));
+	}
+#endif
+}
+#endif /* illumos */
+
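+/*
+ * Derive the buffer content type (data vs. metadata) from the hdr's
+ * flags, verifying that it agrees with the cached b_type field.
+ */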
+static arc_buf_contents_t
+arc_buf_type(arc_buf_hdr_t *hdr)
+{
+	arc_buf_contents_t type;
+	if (HDR_ISTYPE_METADATA(hdr)) {
+		type = ARC_BUFC_METADATA;
+	} else {
+		type = ARC_BUFC_DATA;
+	}
+	VERIFY3U(hdr->b_type, ==, type);
+	return (type);
+}
+
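+/*
+ * Map a buffer content type to the ARC_FLAG_* bit used to encode it in
+ * b_flags; the inverse of arc_buf_type() above.
+ */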
+static uint32_t
+arc_bufc_to_flags(arc_buf_contents_t type)
+{
+	switch (type) {
+	case ARC_BUFC_DATA:
+		/* metadata field is 0 if buffer contains normal data */
+		return (0);
+	case ARC_BUFC_METADATA:
+		return (ARC_FLAG_BUFC_METADATA);
+	default:
+		break;
+	}
+	panic("undefined ARC buffer type!");
+	return ((uint32_t)-1);
+}
+
+void
+arc_buf_thaw(arc_buf_t *buf)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+
+	if (zfs_flags & ZFS_DEBUG_MODIFY) {
+		if (hdr->b_l1hdr.b_state != arc_anon)
 			panic("modifying non-anon buffer!");
-		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
+		if (HDR_IO_IN_PROGRESS(hdr))
 			panic("modifying buffer while i/o in progress!");
 		arc_cksum_verify(buf);
 	}
 
-	mutex_enter(&buf->b_hdr->b_freeze_lock);
-	if (buf->b_hdr->b_freeze_cksum != NULL) {
-		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
-		buf->b_hdr->b_freeze_cksum = NULL;
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	arc_cksum_free(hdr);
+
+	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+#ifdef ZFS_DEBUG
+	if (zfs_flags & ZFS_DEBUG_MODIFY) {
+		if (hdr->b_l1hdr.b_thawed != NULL)
+			kmem_free(hdr->b_l1hdr.b_thawed, 1);
+		hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
 	}
-	mutex_exit(&buf->b_hdr->b_freeze_lock);
+#endif
+
+	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+
+#ifdef illumos
+	arc_buf_unwatch(buf);
+#endif
 }
 
 void
 arc_buf_freeze(arc_buf_t *buf)
 {
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	kmutex_t *hash_lock;
+
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
-	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
-	    buf->b_hdr->b_state == arc_anon);
-	arc_cksum_compute(buf, B_FALSE);
+	hash_lock = HDR_LOCK(hdr);
+	mutex_enter(hash_lock);
+
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
+	    hdr->b_l1hdr.b_state == arc_anon);
+	arc_cksum_compute(buf);
+	mutex_exit(hash_lock);
+}
+
+/*
+ * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
+ * the following functions should be used to ensure that the flags are
+ * updated in a thread-safe way. When manipulating the flags either
+ * the hash_lock must be held or the hdr must be undiscoverable. This
+ * ensures that we're not racing with any other threads when updating
+ * the flags.
+ */
+static inline void
+arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+	hdr->b_flags |= flags;
+}
+
+static inline void
+arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+	hdr->b_flags &= ~flags;
 }
 
+/*
+ * Setting the compression bits in the arc_buf_hdr_t's b_flags is
+ * done in a special way since we have to clear and set bits
+ * at the same time. Consumers that wish to set the compression bits
+ * must use this function to ensure that the flags are updated in
+ * thread-safe manner.
+ */
 static void
-add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
 {
-	ASSERT(MUTEX_HELD(hash_lock));
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+	/*
+	 * Holes and embedded blocks will always have a psize = 0 so
+	 * we ignore the compression of the blkptr and set the
+	 * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
+	 * Holes and embedded blocks remain anonymous so we don't
+	 * want to uncompress them. Mark them as uncompressed.
+	 */
+	if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
+		arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
+		ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+	} else {
+		arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+		HDR_SET_COMPRESS(hdr, cmp);
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
+		ASSERT(HDR_COMPRESSION_ENABLED(hdr));
+	}
+}
+
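+/*
+ * Fill buf->b_data with the uncompressed contents of the hdr's b_pdata:
+ * shared buffers already alias the data, uncompressed blocks are
+ * copied, and compressed blocks are decompressed into b_data.  Any
+ * required byteswap is applied and the debug checksum recomputed.
+ * Returns 0, or EIO if decompression fails.
+ */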
+static int
+arc_decompress(arc_buf_t *buf)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
+	int error;
+
+	if (arc_buf_is_shared(buf)) {
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+	} else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
+		/*
+		 * The arc_buf_hdr_t is either not compressed or is
+		 * associated with an embedded block or a hole in which
+		 * case they remain anonymous.
+		 */
+		IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 ||
+		    HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr));
+		ASSERT(!HDR_SHARED_DATA(hdr));
+		bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr));
+	} else {
+		ASSERT(!HDR_SHARED_DATA(hdr));
+		ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
+		error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+		    hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr),
+		    HDR_GET_LSIZE(hdr));
+		if (error != 0) {
+			zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d",
+			    hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr),
+			    HDR_GET_LSIZE(hdr));
+			return (SET_ERROR(EIO));
+		}
+	}
+	if (bswap != DMU_BSWAP_NUMFUNCS) {
+		ASSERT(!HDR_SHARED_DATA(hdr));
+		ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
+		dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
+	}
+	arc_cksum_compute(buf);
+	return (0);
+}
+
+/*
+ * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t.
+ */
+static uint64_t
+arc_hdr_size(arc_buf_hdr_t *hdr)
+{
+	uint64_t size;
+
+	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+	    HDR_GET_PSIZE(hdr) > 0) {
+		size = HDR_GET_PSIZE(hdr);
+	} else {
+		ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
+		size = HDR_GET_LSIZE(hdr);
+	}
+	return (size);
+}
+
+/*
+ * Increment the amount of evictable space in the arc_state_t's refcount.
+ * We account for the space used by the hdr and the arc buf individually
+ * so that we can add and remove them from the refcount individually.
+ */
+static void
+arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
+{
+	arc_buf_contents_t type = arc_buf_type(hdr);
+	uint64_t lsize = HDR_GET_LSIZE(hdr);
+
+	ASSERT(HDR_HAS_L1HDR(hdr));
+
+	if (GHOST_STATE(state)) {
+		ASSERT0(hdr->b_l1hdr.b_bufcnt);
+		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+		ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		(void) refcount_add_many(&state->arcs_esize[type], lsize, hdr);
+		return;
+	}
+
+	ASSERT(!GHOST_STATE(state));
+	if (hdr->b_l1hdr.b_pdata != NULL) {
+		(void) refcount_add_many(&state->arcs_esize[type],
+		    arc_hdr_size(hdr), hdr);
+	}
+	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+	    buf = buf->b_next) {
+		if (arc_buf_is_shared(buf)) {
+			ASSERT(ARC_BUF_LAST(buf));
+			continue;
+		}
+		(void) refcount_add_many(&state->arcs_esize[type], lsize, buf);
+	}
+}
+
+/*
+ * Decrement the amount of evictable space in the arc_state_t's refcount.
+ * We account for the space used by the hdr and the arc buf individually
+ * so that we can add and remove them from the refcount individually.
+ */
+static void
+arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
+{
+	arc_buf_contents_t type = arc_buf_type(hdr);
+	uint64_t lsize = HDR_GET_LSIZE(hdr);
+
+	ASSERT(HDR_HAS_L1HDR(hdr));
+
+	if (GHOST_STATE(state)) {
+		ASSERT0(hdr->b_l1hdr.b_bufcnt);
+		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+		ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		(void) refcount_remove_many(&state->arcs_esize[type],
+		    lsize, hdr);
+		return;
+	}
+
+	ASSERT(!GHOST_STATE(state));
+	if (hdr->b_l1hdr.b_pdata != NULL) {
+		(void) refcount_remove_many(&state->arcs_esize[type],
+		    arc_hdr_size(hdr), hdr);
+	}
+	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+	    buf = buf->b_next) {
+		if (arc_buf_is_shared(buf)) {
+			ASSERT(ARC_BUF_LAST(buf));
+			continue;
+		}
+		(void) refcount_remove_many(&state->arcs_esize[type],
+		    lsize, buf);
+	}
+}
+
+/*
+ * Add a reference to this hdr indicating that someone is actively
+ * referencing that memory. When the refcount transitions from 0 to 1,
+ * we remove it from the respective arc_state_t list to indicate that
+ * it is not evictable.
+ */
+static void
+add_reference(arc_buf_hdr_t *hdr, void *tag)
+{
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	if (!MUTEX_HELD(HDR_LOCK(hdr))) {
+		ASSERT(hdr->b_l1hdr.b_state == arc_anon);
+		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+	}
+
+	arc_state_t *state = hdr->b_l1hdr.b_state;
 
-	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
-	    (ab->b_state != arc_anon)) {
-		uint64_t delta = ab->b_size * ab->b_datacnt;
-		list_t *list = &ab->b_state->arcs_list[ab->b_type];
-		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
-
-		ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
-		mutex_enter(&ab->b_state->arcs_mtx);
-		ASSERT(list_link_active(&ab->b_arc_node));
-		list_remove(list, ab);
-		if (GHOST_STATE(ab->b_state)) {
-			ASSERT3U(ab->b_datacnt, ==, 0);
-			ASSERT3P(ab->b_buf, ==, NULL);
-			delta = ab->b_size;
-		}
-		ASSERT(delta > 0);
-		ASSERT3U(*size, >=, delta);
-		atomic_add_64(size, -delta);
-		mutex_exit(&ab->b_state->arcs_mtx);
+	if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
+	    (state != arc_anon)) {
+		/* We don't use the L2-only state list. */
+		if (state != arc_l2c_only) {
+			multilist_remove(&state->arcs_list[arc_buf_type(hdr)],
+			    hdr);
+			arc_evitable_space_decrement(hdr, state);
+		}
 		/* remove the prefetch flag if we get a reference */
-		if (ab->b_flags & ARC_PREFETCH)
-			ab->b_flags &= ~ARC_PREFETCH;
+		arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
 	}
 }
 
+/*
+ * Remove a reference from this hdr. When the reference transitions from
+ * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
+ * list making it eligible for eviction.
+ */
 static int
-remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
 {
 	int cnt;
-	arc_state_t *state = ab->b_state;
+	arc_state_t *state = hdr->b_l1hdr.b_state;
 
+	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
 	ASSERT(!GHOST_STATE(state));
 
-	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
+	/*
+	 * arc_l2c_only counts as a ghost state so we don't need to explicitly
+	 * check to prevent usage of the arc_l2c_only list.
+	 */
+	if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
 	    (state != arc_anon)) {
-		uint64_t *size = &state->arcs_lsize[ab->b_type];
-
-		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
-		mutex_enter(&state->arcs_mtx);
-		ASSERT(!list_link_active(&ab->b_arc_node));
-		list_insert_head(&state->arcs_list[ab->b_type], ab);
-		ASSERT(ab->b_datacnt > 0);
-		atomic_add_64(size, ab->b_size * ab->b_datacnt);
-		mutex_exit(&state->arcs_mtx);
+		multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
+		ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
+		arc_evictable_space_increment(hdr, state);
 	}
 	return (cnt);
 }
 
 /*
- * Move the supplied buffer to the indicated state.  The mutex
+ * Move the supplied buffer to the indicated state. The hash lock
  * for the buffer must be held by the caller.
  */
 static void
-arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
+    kmutex_t *hash_lock)
 {
-	arc_state_t *old_state = ab->b_state;
-	int64_t refcnt = refcount_count(&ab->b_refcnt);
-	uint64_t from_delta, to_delta;
+	arc_state_t *old_state;
+	int64_t refcnt;
+	uint32_t bufcnt;
+	boolean_t update_old, update_new;
+	arc_buf_contents_t buftype = arc_buf_type(hdr);
+
+	/*
+	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
+	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
+	 * L1 hdr doesn't always exist when we change state to arc_anon before
+	 * destroying a header, in which case reallocating to add the L1 hdr is
+	 * pointless.
+	 */
+	if (HDR_HAS_L1HDR(hdr)) {
+		old_state = hdr->b_l1hdr.b_state;
+		refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
+		bufcnt = hdr->b_l1hdr.b_bufcnt;
+		update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL);
+	} else {
+		old_state = arc_l2c_only;
+		refcnt = 0;
+		bufcnt = 0;
+		update_old = B_FALSE;
+	}
+	update_new = update_old;
 
 	ASSERT(MUTEX_HELD(hash_lock));
-	ASSERT(new_state != old_state);
-	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
-	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
-	ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon);
-	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
-
-	from_delta = to_delta = ab->b_datacnt * ab->b_size;
+	ASSERT3P(new_state, !=, old_state);
+	ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
+	ASSERT(old_state != arc_anon || bufcnt <= 1);
 
 	/*
 	 * If this buffer is evictable, transfer it from the
 	 * old state list to the new state list.
 	 */
 	if (refcnt == 0) {
-		if (old_state != arc_anon) {
-			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
-			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
-
-			if (use_mutex)
-				mutex_enter(&old_state->arcs_mtx);
-
-			ASSERT(list_link_active(&ab->b_arc_node));
-			list_remove(&old_state->arcs_list[ab->b_type], ab);
+		if (old_state != arc_anon && old_state != arc_l2c_only) {
+			ASSERT(HDR_HAS_L1HDR(hdr));
+			multilist_remove(&old_state->arcs_list[buftype], hdr);
+
+			if (GHOST_STATE(old_state)) {
+				ASSERT0(bufcnt);
+				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+				update_old = B_TRUE;
+			}
+			arc_evitable_space_decrement(hdr, old_state);
+		}
+		if (new_state != arc_anon && new_state != arc_l2c_only) {
 
 			/*
-			 * If prefetching out of the ghost cache,
-			 * we will have a non-null datacnt.
+			 * An L1 header always exists here, since if we're
+			 * moving to some L1-cached state (i.e. not l2c_only or
+			 * anonymous), we realloc the header to add an L1hdr
+			 * beforehand.
 			 */
-			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
-				/* ghost elements have a ghost size */
-				ASSERT(ab->b_buf == NULL);
-				from_delta = ab->b_size;
-			}
-			ASSERT3U(*size, >=, from_delta);
-			atomic_add_64(size, -from_delta);
+			ASSERT(HDR_HAS_L1HDR(hdr));
+			multilist_insert(&new_state->arcs_list[buftype], hdr);
 
-			if (use_mutex)
-				mutex_exit(&old_state->arcs_mtx);
+			if (GHOST_STATE(new_state)) {
+				ASSERT0(bufcnt);
+				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+				update_new = B_TRUE;
+			}
+			arc_evictable_space_increment(hdr, new_state);
 		}
-		if (new_state != arc_anon) {
-			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
-			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
+	}
 
-			if (use_mutex)
-				mutex_enter(&new_state->arcs_mtx);
+	ASSERT(!HDR_EMPTY(hdr));
+	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
+		buf_hash_remove(hdr);
 
-			list_insert_head(&new_state->arcs_list[ab->b_type], ab);
+	/* adjust state sizes (ignore arc_l2c_only) */
 
-			/* ghost elements have a ghost size */
-			if (GHOST_STATE(new_state)) {
-				ASSERT(ab->b_datacnt == 0);
-				ASSERT(ab->b_buf == NULL);
-				to_delta = ab->b_size;
+	if (update_new && new_state != arc_l2c_only) {
+		ASSERT(HDR_HAS_L1HDR(hdr));
+		if (GHOST_STATE(new_state)) {
+			ASSERT0(bufcnt);
+
+			/*
+			 * When moving a header to a ghost state, we first
+			 * remove all arc buffers. Thus, we'll have a
+			 * bufcnt of zero, and no arc buffer to use for
+			 * the reference. As a result, we use the arc
+			 * header pointer for the reference.
+			 */
+			(void) refcount_add_many(&new_state->arcs_size,
+			    HDR_GET_LSIZE(hdr), hdr);
+			ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		} else {
+			uint32_t buffers = 0;
+
+			/*
+			 * Each individual buffer holds a unique reference,
+			 * thus we must remove each of these references one
+			 * at a time.
+			 */
+			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+			    buf = buf->b_next) {
+				ASSERT3U(bufcnt, !=, 0);
+				buffers++;
+
+				/*
+				 * When the arc_buf_t is sharing the data
+				 * block with the hdr, the owner of the
+				 * reference belongs to the hdr. Only
+				 * add to the refcount if the arc_buf_t is
+				 * not shared.
+				 */
+				if (arc_buf_is_shared(buf)) {
+					ASSERT(ARC_BUF_LAST(buf));
+					continue;
+				}
+
+				(void) refcount_add_many(&new_state->arcs_size,
+				    HDR_GET_LSIZE(hdr), buf);
 			}
-			atomic_add_64(size, to_delta);
+			ASSERT3U(bufcnt, ==, buffers);
 
-			if (use_mutex)
-				mutex_exit(&new_state->arcs_mtx);
+			if (hdr->b_l1hdr.b_pdata != NULL) {
+				(void) refcount_add_many(&new_state->arcs_size,
+				    arc_hdr_size(hdr), hdr);
+			} else {
+				ASSERT(GHOST_STATE(old_state));
+			}
 		}
 	}
 
-	ASSERT(!BUF_EMPTY(ab));
-	if (new_state == arc_anon) {
-		buf_hash_remove(ab);
-	}
+	if (update_old && old_state != arc_l2c_only) {
+		ASSERT(HDR_HAS_L1HDR(hdr));
+		if (GHOST_STATE(old_state)) {
+			ASSERT0(bufcnt);
+
+			/*
+			 * When moving a header off of a ghost state,
+			 * the header will not contain any arc buffers.
+			 * We use the arc header pointer for the reference
+			 * which is exactly what we did when we put the
+			 * header on the ghost state.
+			 */
+
+			(void) refcount_remove_many(&old_state->arcs_size,
+			    HDR_GET_LSIZE(hdr), hdr);
+			ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		} else {
+			uint32_t buffers = 0;
+
+			/*
+			 * Each individual buffer holds a unique reference,
+			 * thus we must remove each of these references one
+			 * at a time.
+			 */
+			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+			    buf = buf->b_next) {
+				ASSERT3U(bufcnt, !=, 0);
+				buffers++;
+
+				/*
+				 * When the arc_buf_t is sharing the data
+				 * block with the hdr, the owner of the
+				 * reference belongs to the hdr. Only
+				 * add to the refcount if the arc_buf_t is
+				 * not shared.
+				 */
+				if (arc_buf_is_shared(buf)) {
+					ASSERT(ARC_BUF_LAST(buf));
+					continue;
+				}
 
-	/* adjust state sizes */
-	if (to_delta)
-		atomic_add_64(&new_state->arcs_size, to_delta);
-	if (from_delta) {
-		ASSERT3U(old_state->arcs_size, >=, from_delta);
-		atomic_add_64(&old_state->arcs_size, -from_delta);
+				(void) refcount_remove_many(
+				    &old_state->arcs_size, HDR_GET_LSIZE(hdr),
+				    buf);
+			}
+			ASSERT3U(bufcnt, ==, buffers);
+			ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+			(void) refcount_remove_many(
+			    &old_state->arcs_size, arc_hdr_size(hdr), hdr);
+		}
 	}
-	ab->b_state = new_state;
 
-	/* adjust l2arc hdr stats */
-	if (new_state == arc_l2c_only)
-		l2arc_hdr_stat_add();
-	else if (old_state == arc_l2c_only)
-		l2arc_hdr_stat_remove();
+	if (HDR_HAS_L1HDR(hdr))
+		hdr->b_l1hdr.b_state = new_state;
+
+	/*
+	 * L2 headers should never be on the L2 state list since they don't
+	 * have L1 headers allocated.
+	 */
+	ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+	    multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
 }
 
 void
@@ -1150,6 +2444,9 @@ arc_space_consume(uint64_t space, arc_sp
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, space);
 		break;
+	case ARC_SPACE_META:
+		ARCSTAT_INCR(arcstat_metadata_size, space);
+		break;
 	case ARC_SPACE_OTHER:
 		ARCSTAT_INCR(arcstat_other_size, space);
 		break;
@@ -1161,7 +2458,9 @@ arc_space_consume(uint64_t space, arc_sp
 		break;
 	}
 
-	atomic_add_64(&arc_meta_used, space);
+	if (type != ARC_SPACE_DATA)
+		ARCSTAT_INCR(arcstat_meta_used, space);
+
 	atomic_add_64(&arc_size, space);
 }
 
@@ -1174,6 +2473,9 @@ arc_space_return(uint64_t space, arc_spa
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, -space);
 		break;
+	case ARC_SPACE_META:
+		ARCSTAT_INCR(arcstat_metadata_size, -space);
+		break;
 	case ARC_SPACE_OTHER:
 		ARCSTAT_INCR(arcstat_other_size, -space);
 		break;
@@ -1185,58 +2487,96 @@ arc_space_return(uint64_t space, arc_spa
 		break;
 	}
 
-	ASSERT(arc_meta_used >= space);
-	if (arc_meta_max < arc_meta_used)
-		arc_meta_max = arc_meta_used;
-	atomic_add_64(&arc_meta_used, -space);
+	if (type != ARC_SPACE_DATA) {
+		ASSERT(arc_meta_used >= space);
+		if (arc_meta_max < arc_meta_used)
+			arc_meta_max = arc_meta_used;
+		ARCSTAT_INCR(arcstat_meta_used, -space);
+	}
+
 	ASSERT(arc_size >= space);
 	atomic_add_64(&arc_size, -space);
 }
 
-void *
-arc_data_buf_alloc(uint64_t size)
+/*
+ * Allocate an initial buffer for this hdr, subsequent buffers will
+ * use arc_buf_clone().
+ */
+static arc_buf_t *
+arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag)
 {
-	if (arc_evict_needed(ARC_BUFC_DATA))
-		cv_signal(&arc_reclaim_thr_cv);
-	atomic_add_64(&arc_size, size);
-	return (zio_data_buf_alloc(size));
-}
+	arc_buf_t *buf;
 
-void
-arc_data_buf_free(void *buf, uint64_t size)
-{
-	zio_data_buf_free(buf, size);
-	ASSERT(arc_size >= size);
-	atomic_add_64(&arc_size, -size);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
+	    hdr->b_type == ARC_BUFC_METADATA);
+
+	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+	ASSERT0(hdr->b_l1hdr.b_bufcnt);
+
+	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+	buf->b_hdr = hdr;
+	buf->b_data = NULL;
+	buf->b_next = NULL;
+
+	add_reference(hdr, tag);
+
+	/*
+	 * We're about to change the hdr's b_flags. We must either
+	 * hold the hash_lock or be undiscoverable.
+	 */
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+	/*
+	 * If the hdr's data can be shared (no byteswapping, hdr is
+	 * uncompressed, hdr's data is not currently being written to the
+	 * L2ARC) then we share the data buffer and set the appropriate
+	 * bit in the hdr's b_flags to indicate the hdr is sharing its
+	 * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to
+	 * store the buf's data.
+	 */
+	if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
+	    HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) {
+		buf->b_data = hdr->b_l1hdr.b_pdata;
+		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+	} else {
+		buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+		ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
+		arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+	}
+	VERIFY3P(buf->b_data, !=, NULL);
+
+	hdr->b_l1hdr.b_buf = buf;
+	hdr->b_l1hdr.b_bufcnt += 1;
+
+	return (buf);
 }
 
-arc_buf_t *
-arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
+/*
+ * Used when allocating additional buffers.
+ */
+static arc_buf_t *
+arc_buf_clone(arc_buf_t *from)
 {
-	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf;
+	arc_buf_hdr_t *hdr = from->b_hdr;
+	uint64_t size = HDR_GET_LSIZE(hdr);
+
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
 
-	ASSERT3U(size, >, 0);
-	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
-	ASSERT(BUF_EMPTY(hdr));
-	hdr->b_size = size;
-	hdr->b_type = type;
-	hdr->b_spa = spa_guid(spa);
-	hdr->b_state = arc_anon;
-	hdr->b_arc_access = 0;
 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 	buf->b_hdr = hdr;
 	buf->b_data = NULL;
-	buf->b_efunc = NULL;
-	buf->b_private = NULL;
-	buf->b_next = NULL;
-	hdr->b_buf = buf;
-	arc_get_data_buf(buf);
-	hdr->b_datacnt = 1;
-	hdr->b_flags = 0;
-	ASSERT(refcount_is_zero(&hdr->b_refcnt));
-	(void) refcount_add(&hdr->b_refcnt, tag);
+	buf->b_next = hdr->b_l1hdr.b_buf;
+	hdr->b_l1hdr.b_buf = buf;
+	buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+	bcopy(from->b_data, buf->b_data, size);
+	hdr->b_l1hdr.b_bufcnt += 1;
 
+	ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
 	return (buf);
 }
 
@@ -1253,7 +2593,7 @@ arc_loan_buf(spa_t *spa, int size)
 {
 	arc_buf_t *buf;
 
-	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
+	buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
 
 	atomic_add_64(&arc_loaned_bytes, size);
 	return (buf);
@@ -1267,161 +2607,224 @@ arc_return_buf(arc_buf_t *buf, void *tag
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
-	ASSERT(buf->b_data != NULL);
-	(void) refcount_add(&hdr->b_refcnt, tag);
-	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
+	ASSERT3P(buf->b_data, !=, NULL);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
+	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
 
-	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
+	atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr));
 }
 
 /* Detach an arc_buf from a dbuf (tag) */
 void
 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
 {
-	arc_buf_hdr_t *hdr;
+	arc_buf_hdr_t *hdr = buf->b_hdr;
 
-	rw_enter(&buf->b_lock, RW_WRITER);
-	ASSERT(buf->b_data != NULL);
-	hdr = buf->b_hdr;
-	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
-	(void) refcount_remove(&hdr->b_refcnt, tag);
-	buf->b_efunc = NULL;
-	buf->b_private = NULL;
+	ASSERT3P(buf->b_data, !=, NULL);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
 
-	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
-	rw_exit(&buf->b_lock);
+	atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr));
 }
 
-static arc_buf_t *
-arc_buf_clone(arc_buf_t *from)
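+/*
+ * Queue a data buffer for deferred freeing; it is still referenced by
+ * an in-flight L2ARC write and is released from the free_on_write list
+ * once that write completes.
+ */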
+static void
+l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type)
 {
-	arc_buf_t *buf;
-	arc_buf_hdr_t *hdr = from->b_hdr;
-	uint64_t size = hdr->b_size;
-
-	ASSERT(hdr->b_state != arc_anon);
+	l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
 
-	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
-	buf->b_hdr = hdr;
-	buf->b_data = NULL;
-	buf->b_efunc = NULL;
-	buf->b_private = NULL;
-	buf->b_next = hdr->b_buf;
-	hdr->b_buf = buf;
-	arc_get_data_buf(buf);
-	bcopy(from->b_data, buf->b_data, size);
-	hdr->b_datacnt += 1;
-	return (buf);
+	df->l2df_data = data;
+	df->l2df_size = size;
+	df->l2df_type = type;
+	mutex_enter(&l2arc_free_on_write_mtx);
+	list_insert_head(l2arc_free_on_write, df);
+	mutex_exit(&l2arc_free_on_write_mtx);
 }
 
-void
-arc_buf_add_ref(arc_buf_t *buf, void* tag)
+static void
+arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
 {
-	arc_buf_hdr_t *hdr;
-	kmutex_t *hash_lock;
-
-	/*
-	 * Check to see if this buffer is evicted.  Callers
-	 * must verify b_data != NULL to know if the add_ref
-	 * was successful.
-	 */
-	rw_enter(&buf->b_lock, RW_READER);
-	if (buf->b_data == NULL) {
-		rw_exit(&buf->b_lock);
-		return;
+	arc_state_t *state = hdr->b_l1hdr.b_state;
+	arc_buf_contents_t type = arc_buf_type(hdr);
+	uint64_t size = arc_hdr_size(hdr);
+
+	/* protected by hash lock, if in the hash table */
+	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+		ASSERT(state != arc_anon && state != arc_l2c_only);
+
+		(void) refcount_remove_many(&state->arcs_esize[type],
+		    size, hdr);
+	}
+	(void) refcount_remove_many(&state->arcs_size, size, hdr);
+	if (type == ARC_BUFC_METADATA) {
+		arc_space_return(size, ARC_SPACE_META);
+	} else {
+		ASSERT(type == ARC_BUFC_DATA);
+		arc_space_return(size, ARC_SPACE_DATA);
 	}
-	hdr = buf->b_hdr;
-	ASSERT(hdr != NULL);
-	hash_lock = HDR_LOCK(hdr);
-	mutex_enter(hash_lock);
-	rw_exit(&buf->b_lock);
 
-	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
-	add_reference(hdr, hash_lock, tag);
-	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
-	arc_access(hdr, hash_lock);
-	mutex_exit(hash_lock);
-	ARCSTAT_BUMP(arcstat_hits);
-	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
-	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
-	    data, metadata, hits);
+	l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type);
 }
 
 /*
- * Free the arc data buffer.  If it is an l2arc write in progress,
- * the buffer is placed on l2arc_free_on_write to be freed later.
+ * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
+ * data buffer, we transfer the refcount ownership to the hdr and update
+ * the appropriate kstats.
  */
 static void
-arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
-    void *data, size_t size)
+arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
-	if (HDR_L2_WRITING(hdr)) {
-		l2arc_data_free_t *df;
-		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
-		df->l2df_data = data;
-		df->l2df_size = size;
-		df->l2df_func = free_func;
-		mutex_enter(&l2arc_free_on_write_mtx);
-		list_insert_head(l2arc_free_on_write, df);
-		mutex_exit(&l2arc_free_on_write_mtx);
-		ARCSTAT_BUMP(arcstat_l2_free_on_write);
-	} else {
-		free_func(data, size);
-	}
+	arc_state_t *state = hdr->b_l1hdr.b_state;
+
+	ASSERT(!HDR_SHARED_DATA(hdr));
+	ASSERT(!arc_buf_is_shared(buf));
+	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+	/*
+	 * Start sharing the data buffer. We transfer the
+	 * refcount ownership to the hdr since it always owns
+	 * the refcount whenever an arc_buf_t is shared.
+	 */
+	refcount_transfer_ownership(&state->arcs_size, buf, hdr);
+	hdr->b_l1hdr.b_pdata = buf->b_data;
+	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+
+	/*
+	 * Since we've transferred ownership to the hdr we need
+	 * to increment its compressed and uncompressed kstats and
+	 * decrement the overhead size.
+	 */
+	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+	ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr));
 }
 
 static void
-arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
+arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
-	arc_buf_t **bufp;
+	arc_state_t *state = hdr->b_l1hdr.b_state;
 
-	/* free up data associated with the buf */
-	if (buf->b_data) {
-		arc_state_t *state = buf->b_hdr->b_state;
-		uint64_t size = buf->b_hdr->b_size;
-		arc_buf_contents_t type = buf->b_hdr->b_type;
+	ASSERT(HDR_SHARED_DATA(hdr));
+	ASSERT(arc_buf_is_shared(buf));
+	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
 
-		arc_cksum_verify(buf);
+	/*
+	 * We are no longer sharing this buffer so we need
+	 * to transfer its ownership to the rightful owner.
+	 */
+	refcount_transfer_ownership(&state->arcs_size, hdr, buf);
+	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+	hdr->b_l1hdr.b_pdata = NULL;
 
-		if (!recycle) {
-			if (type == ARC_BUFC_METADATA) {
-				arc_buf_data_free(buf->b_hdr, zio_buf_free,
-				    buf->b_data, size);
-				arc_space_return(size, ARC_SPACE_DATA);
-			} else {
-				ASSERT(type == ARC_BUFC_DATA);
-				arc_buf_data_free(buf->b_hdr,
-				    zio_data_buf_free, buf->b_data, size);
-				ARCSTAT_INCR(arcstat_data_size, -size);
-				atomic_add_64(&arc_size, -size);
-			}
-		}
-		if (list_link_active(&buf->b_hdr->b_arc_node)) {
-			uint64_t *cnt = &state->arcs_lsize[type];
+	/*
+	 * Since the buffer is no longer shared between
+	 * the arc buf and the hdr, count it as overhead.
+	 */
+	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
+	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+	ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
+}
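
For reference, the sharing predicate used throughout this file,
arc_buf_is_shared(), reduces to roughly the following (a sketch; the real
function may carry additional assertions):

	static inline boolean_t
	arc_buf_is_shared(arc_buf_t *buf)
	{
		return (buf->b_data != NULL &&
		    buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
	}
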
+
+/*
+ * Free up buf->b_data and if 'remove' is set, then pull the
+ * arc_buf_t off of the arc_buf_hdr_t's list and free it.
+ */
+static void
+arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove)
+{
+	arc_buf_t **bufp;
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	uint64_t size = HDR_GET_LSIZE(hdr);
+	boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf);
 
-			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
-			ASSERT(state != arc_anon);
+	/*
+	 * Free up the data associated with the buf but only
+	 * if we're not sharing this with the hdr. If we are sharing
+	 * it with the hdr, then hdr will have performed the allocation
+	 * so allow it to do the free.
+	 */
+	if (buf->b_data != NULL) {
+		/*
+		 * We're about to change the hdr's b_flags. We must either
+		 * hold the hash_lock or be undiscoverable.
+		 */
+		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
 
-			ASSERT3U(*cnt, >=, size);
-			atomic_add_64(cnt, -size);
+		arc_cksum_verify(buf);
+#ifdef illumos
+		arc_buf_unwatch(buf);
+#endif
+
+		if (destroyed_buf_is_shared) {
+			ASSERT(ARC_BUF_LAST(buf));
+			ASSERT(HDR_SHARED_DATA(hdr));
+			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+		} else {
+			arc_free_data_buf(hdr, buf->b_data, size, buf);
+			ARCSTAT_INCR(arcstat_overhead_size, -size);
 		}
-		ASSERT3U(state->arcs_size, >=, size);
-		atomic_add_64(&state->arcs_size, -size);
 		buf->b_data = NULL;
-		ASSERT(buf->b_hdr->b_datacnt > 0);
-		buf->b_hdr->b_datacnt -= 1;
+
+		ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+		hdr->b_l1hdr.b_bufcnt -= 1;
 	}
 
 	/* only remove the buf if requested */
-	if (!all)
+	if (!remove)
 		return;
 
 	/* remove the buf from the hdr list */
-	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
-		continue;
-	*bufp = buf->b_next;
+	arc_buf_t *lastbuf = NULL;
+	bufp = &hdr->b_l1hdr.b_buf;
+	while (*bufp != NULL) {
+		if (*bufp == buf)
+			*bufp = buf->b_next;
+
+		/*
+		 * If we've removed a buffer in the middle of
+		 * the list, then update lastbuf and advance
+		 * bufp.
+		 */
+		if (*bufp != NULL) {
+			lastbuf = *bufp;
+			bufp = &(*bufp)->b_next;
+		}
+	}
+	buf->b_next = NULL;
+	ASSERT3P(lastbuf, !=, buf);
+
+	/*
+	 * If the current arc_buf_t is sharing its data
+	 * buffer with the hdr, then reassign the hdr's
+	 * b_pdata to share it with the new buffer at the end
+	 * of the list. The shared buffer is always the last one
+	 * on the hdr's buffer list.
+	 */
+	if (destroyed_buf_is_shared && lastbuf != NULL) {
+		ASSERT(ARC_BUF_LAST(buf));
+		ASSERT(ARC_BUF_LAST(lastbuf));
+		VERIFY(!arc_buf_is_shared(lastbuf));
+
+		ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+		arc_hdr_free_pdata(hdr);
 
-	ASSERT(buf->b_efunc == NULL);
+		/*
+		 * We must set up a new shared block between the
+		 * last buffer and the hdr. The data would have
+		 * been allocated by the arc buf so we need to transfer
+		 * ownership to the hdr since it's now being shared.
+		 */
+		arc_share_buf(hdr, lastbuf);
+	} else if (HDR_SHARED_DATA(hdr)) {
+		ASSERT(arc_buf_is_shared(lastbuf));
+	}
+
+	if (hdr->b_l1hdr.b_bufcnt == 0)
+		arc_cksum_free(hdr);
 
 	/* clean up the buf */
 	buf->b_hdr = NULL;
@@ -1429,527 +2832,1026 @@ arc_buf_destroy(arc_buf_t *buf, boolean_
 }
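
The re-share logic above preserves the invariant that a shared buffer is
always the tail of the hdr's buffer list. Illustrated (editorial sketch,
buffer names assumed):

	before destroy:  hdr->b_l1hdr.b_buf -> A -> B -> C   (C shares b_pdata)
	destroy C:       arc_hdr_free_pdata(hdr); arc_share_buf(hdr, B)
	after destroy:   hdr->b_l1hdr.b_buf -> A -> B        (B shares b_pdata)
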
 
 static void
-arc_hdr_destroy(arc_buf_hdr_t *hdr)
+arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr)
 {
-	ASSERT(refcount_is_zero(&hdr->b_refcnt));
-	ASSERT3P(hdr->b_state, ==, arc_anon);
-	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT(!HDR_SHARED_DATA(hdr));
+
+	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+	hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr);
+	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+
+	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+}
+
+static void
+arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
+{
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+
+	/*
+	 * If the hdr is currently being written to the l2arc then
+	 * we defer freeing the data by adding it to the l2arc_free_on_write
+	 * list. The l2arc will free the data once it's finished
+	 * writing it to the l2arc device.
+	 */
+	if (HDR_L2_WRITING(hdr)) {
+		arc_hdr_free_on_write(hdr);
+		ARCSTAT_BUMP(arcstat_l2_free_on_write);
+	} else {
+		arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata,
+		    arc_hdr_size(hdr), hdr);
+	}
+	hdr->b_l1hdr.b_pdata = NULL;
+	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+
+	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
+	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+}
+
+static arc_buf_hdr_t *
+arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
+    enum zio_compress compress, arc_buf_contents_t type)
+{
+	arc_buf_hdr_t *hdr;
+
+	ASSERT3U(lsize, >, 0);
+	VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
+
+	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
+	ASSERT(HDR_EMPTY(hdr));
+	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+	ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
+	HDR_SET_PSIZE(hdr, psize);
+	HDR_SET_LSIZE(hdr, lsize);
+	hdr->b_spa = spa;
+	hdr->b_type = type;
+	hdr->b_flags = 0;
+	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
+	arc_hdr_set_compress(hdr, compress);
+
+	hdr->b_l1hdr.b_state = arc_anon;
+	hdr->b_l1hdr.b_arc_access = 0;
+	hdr->b_l1hdr.b_bufcnt = 0;
+	hdr->b_l1hdr.b_buf = NULL;
+
+	/*
+	 * Allocate the hdr's buffer. This will contain either
+	 * the compressed or uncompressed data depending on the block
+	 * it references and compressed arc enablement.
+	 */
+	arc_hdr_alloc_pdata(hdr);
+	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+
+	return (hdr);
+}
+
+/*
+ * Transition between the two allocation states for the arc_buf_hdr struct.
+ * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
+ * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
+ * version is used when a cache buffer is only in the L2ARC in order to reduce
+ * memory usage.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
+{
+	ASSERT(HDR_HAS_L2HDR(hdr));
+
+	arc_buf_hdr_t *nhdr;
+	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
+	    (old == hdr_l2only_cache && new == hdr_full_cache));
+
+	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
+
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+	buf_hash_remove(hdr);
 
-	if (l2hdr != NULL) {
-		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
+	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+
+	if (new == hdr_full_cache) {
+		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
 		/*
-		 * To prevent arc_free() and l2arc_evict() from
-		 * attempting to free the same buffer at the same time,
-		 * a FREE_IN_PROGRESS flag is given to arc_free() to
-		 * give it priority.  l2arc_evict() can't destroy this
-		 * header while we are waiting on l2arc_buflist_mtx.
-		 *
-		 * The hdr may be removed from l2ad_buflist before we
-		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+		 * arc_access and arc_change_state need to be aware that a
+		 * header has just come out of L2ARC, so we set its state to
+		 * l2c_only even though it's about to change.
+		 */
+		nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+		/* Verify previous threads set to NULL before freeing */
+		ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL);
+	} else {
+		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+		ASSERT0(hdr->b_l1hdr.b_bufcnt);
+		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+		/*
+		 * If we've reached here, we must have been called from
+		 * arc_evict_hdr(), as such we should have already been
+		 * removed from any ghost list we were previously on
+		 * (which protects us from racing with arc_evict_state),
+		 * thus no locking is needed during this check.
+		 */
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+		/*
+		 * A buffer must not be moved into the arc_l2c_only
+		 * state if it's not finished being written out to the
+		 * l2arc device. Otherwise, the b_l1hdr.b_pdata field
+		 * might try to be accessed, even though it was removed.
 		 */
-		if (!buflist_held) {
-			mutex_enter(&l2arc_buflist_mtx);
-			l2hdr = hdr->b_l2hdr;
+		VERIFY(!HDR_L2_WRITING(hdr));
+		VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+
+#ifdef ZFS_DEBUG
+		if (hdr->b_l1hdr.b_thawed != NULL) {
+			kmem_free(hdr->b_l1hdr.b_thawed, 1);
+			hdr->b_l1hdr.b_thawed = NULL;
 		}
+#endif
+
+		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
+	}
+	/*
+	 * The header has been reallocated so we need to re-insert it into any
+	 * lists it was on.
+	 */
+	(void) buf_hash_insert(nhdr, NULL);
+
+	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
+
+	mutex_enter(&dev->l2ad_mtx);
+
+	/*
+	 * We must place the realloc'ed header back into the list at
+	 * the same spot. Otherwise, if it's placed earlier in the list,
+	 * l2arc_write_buffers() could find it during the function's
+	 * write phase, and try to write it out to the l2arc.
+	 */
+	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
+	list_remove(&dev->l2ad_buflist, hdr);
+
+	mutex_exit(&dev->l2ad_mtx);
+
+	/*
+	 * Since we're using the pointer address as the tag when
+	 * incrementing and decrementing the l2ad_alloc refcount, we
+	 * must remove the old pointer (that we're about to destroy) and
+	 * add the new pointer to the refcount. Otherwise we'd remove
+	 * the wrong pointer address when calling arc_hdr_destroy() later.
+	 */
+
+	(void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
+	(void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr);
+
+	buf_discard_identity(hdr);
+	kmem_cache_free(old, hdr);
+
+	return (nhdr);
+}
+
+/*
+ * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
+ * The buf is returned thawed since we expect the consumer to modify it.
+ */
+arc_buf_t *
+arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
+{
+	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
+	    ZIO_COMPRESS_OFF, type);
+	ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
+	arc_buf_t *buf = arc_buf_alloc_impl(hdr, tag);
+	arc_buf_thaw(buf);
+	return (buf);
+}
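
A hedged usage sketch (FTAG and the 4K size are assumed, not taken from this
diff): allocate an anonymous buffer, fill it, and release the tag's
reference:

	arc_buf_t *buf = arc_alloc_buf(spa, 4096, FTAG, ARC_BUFC_DATA);
	bzero(buf->b_data, arc_buf_size(buf));	/* buf is thawed, safe to modify */
	/* ... use the buffer ... */
	arc_buf_destroy(buf, FTAG);		/* anon + last ref: hdr destroyed too */
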
+
+static void
+arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
+{
+	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+	l2arc_dev_t *dev = l2hdr->b_dev;
+	uint64_t asize = arc_hdr_size(hdr);
+
+	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+	ASSERT(HDR_HAS_L2HDR(hdr));
+
+	list_remove(&dev->l2ad_buflist, hdr);
+
+	ARCSTAT_INCR(arcstat_l2_asize, -asize);
+	ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr));
+
+	vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
+
+	(void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr);
+	arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+}
+
+static void
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
+{
+	if (HDR_HAS_L1HDR(hdr)) {
+		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
+		    hdr->b_l1hdr.b_bufcnt > 0);
+		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+	}
+	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+	ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+	if (!HDR_EMPTY(hdr))
+		buf_discard_identity(hdr);
+
+	if (HDR_HAS_L2HDR(hdr)) {
+		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
 
-		if (l2hdr != NULL) {
-			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
-			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
-			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
-			if (hdr->b_state == arc_l2c_only)
-				l2arc_hdr_stat_remove();
-			hdr->b_l2hdr = NULL;
+		if (!buflist_held)
+			mutex_enter(&dev->l2ad_mtx);
+
+		/*
+		 * Even though we checked this conditional above, we
+		 * need to check this again now that we have the
+		 * l2ad_mtx. This is because we could be racing with
+		 * another thread calling l2arc_evict() which might have
+		 * destroyed this header's L2 portion as we were waiting
+		 * to acquire the l2ad_mtx. If that happens, we don't
+		 * want to re-destroy the header's L2 portion.
+		 */
+		if (HDR_HAS_L2HDR(hdr)) {
+			l2arc_trim(hdr);
+			arc_hdr_l2hdr_destroy(hdr);
 		}
 
 		if (!buflist_held)
-			mutex_exit(&l2arc_buflist_mtx);
+			mutex_exit(&dev->l2ad_mtx);
 	}
 
-	if (!BUF_EMPTY(hdr)) {
-		ASSERT(!HDR_IN_HASH_TABLE(hdr));
-		bzero(&hdr->b_dva, sizeof (dva_t));
-		hdr->b_birth = 0;
-		hdr->b_cksum0 = 0;
-	}
-	while (hdr->b_buf) {
-		arc_buf_t *buf = hdr->b_buf;
-
-		if (buf->b_efunc) {
-			mutex_enter(&arc_eviction_mtx);
-			rw_enter(&buf->b_lock, RW_WRITER);
-			ASSERT(buf->b_hdr != NULL);
-			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
-			hdr->b_buf = buf->b_next;
-			buf->b_hdr = &arc_eviction_hdr;
-			buf->b_next = arc_eviction_list;
-			arc_eviction_list = buf;
-			rw_exit(&buf->b_lock);
-			mutex_exit(&arc_eviction_mtx);
-		} else {
-			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
+	if (HDR_HAS_L1HDR(hdr)) {
+		arc_cksum_free(hdr);
+
+		while (hdr->b_l1hdr.b_buf != NULL)
+			arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE);
+
+#ifdef ZFS_DEBUG
+		if (hdr->b_l1hdr.b_thawed != NULL) {
+			kmem_free(hdr->b_l1hdr.b_thawed, 1);
+			hdr->b_l1hdr.b_thawed = NULL;
+		}
+#endif
+
+		if (hdr->b_l1hdr.b_pdata != NULL) {
+			arc_hdr_free_pdata(hdr);
 		}
-	}
-	if (hdr->b_freeze_cksum != NULL) {
-		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
-		hdr->b_freeze_cksum = NULL;
 	}
 
-	ASSERT(!list_link_active(&hdr->b_arc_node));
 	ASSERT3P(hdr->b_hash_next, ==, NULL);
-	ASSERT3P(hdr->b_acb, ==, NULL);
-	kmem_cache_free(hdr_cache, hdr);
+	if (HDR_HAS_L1HDR(hdr)) {
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+		kmem_cache_free(hdr_full_cache, hdr);
+	} else {
+		kmem_cache_free(hdr_l2only_cache, hdr);
+	}
 }
 
 void
-arc_buf_free(arc_buf_t *buf, void *tag)
+arc_buf_destroy(arc_buf_t *buf, void* tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
-	int hashed = hdr->b_state != arc_anon;
+	kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+	if (hdr->b_l1hdr.b_state == arc_anon) {
+		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+		VERIFY0(remove_reference(hdr, NULL, tag));
+		arc_hdr_destroy(hdr);
+		return;
+	}
 
-	ASSERT(buf->b_efunc == NULL);
-	ASSERT(buf->b_data != NULL);
+	mutex_enter(hash_lock);
+	ASSERT3P(hdr, ==, buf->b_hdr);
+	ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+	ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
+	ASSERT3P(buf->b_data, !=, NULL);
 
-	if (hashed) {
-		kmutex_t *hash_lock = HDR_LOCK(hdr);
+	(void) remove_reference(hdr, hash_lock, tag);
+	arc_buf_destroy_impl(buf, B_TRUE);
+	mutex_exit(hash_lock);
+}
+
+int32_t
+arc_buf_size(arc_buf_t *buf)
+{
+	return (HDR_GET_LSIZE(buf->b_hdr));
+}
+
+/*
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
+ *
+ *    - arc_mru -> arc_mru_ghost
+ *    - arc_mfu -> arc_mfu_ghost
+ *    - arc_mru_ghost -> arc_l2c_only
+ *    - arc_mru_ghost -> deleted
+ *    - arc_mfu_ghost -> arc_l2c_only
+ *    - arc_mfu_ghost -> deleted
+ */
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
+{
+	arc_state_t *evicted_state, *state;
+	int64_t bytes_evicted = 0;
+
+	ASSERT(MUTEX_HELD(hash_lock));
+	ASSERT(HDR_HAS_L1HDR(hdr));
+
+	state = hdr->b_l1hdr.b_state;
+	if (GHOST_STATE(state)) {
+		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 
-		mutex_enter(hash_lock);
-		(void) remove_reference(hdr, hash_lock, tag);
-		if (hdr->b_datacnt > 1) {
-			arc_buf_destroy(buf, FALSE, TRUE);
-		} else {
-			ASSERT(buf == hdr->b_buf);
-			ASSERT(buf->b_efunc == NULL);
-			hdr->b_flags |= ARC_BUF_AVAILABLE;
-		}
-		mutex_exit(hash_lock);
-	} else if (HDR_IO_IN_PROGRESS(hdr)) {
-		int destroy_hdr;
 		/*
-		 * We are in the middle of an async write.  Don't destroy
-		 * this buffer unless the write completes before we finish
-		 * decrementing the reference count.
-		 */
-		mutex_enter(&arc_eviction_mtx);
-		(void) remove_reference(hdr, NULL, tag);
-		ASSERT(refcount_is_zero(&hdr->b_refcnt));
-		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
-		mutex_exit(&arc_eviction_mtx);
-		if (destroy_hdr)
+		 * l2arc_write_buffers() relies on a header's L1 portion
+		 * (i.e. its b_pdata field) during its write phase.
+		 * Thus, we cannot push a header onto the arc_l2c_only
+		 * state (removing its L1 piece) until the header is
+		 * done being written to the l2arc.
+		 */
+		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+			ARCSTAT_BUMP(arcstat_evict_l2_skip);
+			return (bytes_evicted);
+		}
+
+		ARCSTAT_BUMP(arcstat_deleted);
+		bytes_evicted += HDR_GET_LSIZE(hdr);
+
+		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
+
+		ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+		if (HDR_HAS_L2HDR(hdr)) {
+			ASSERT(hdr->b_l1hdr.b_pdata == NULL);
+			/*
+			 * This buffer is cached on the 2nd Level ARC;
+			 * don't destroy the header.
+			 */
+			arc_change_state(arc_l2c_only, hdr, hash_lock);
+			/*
+			 * dropping from L1+L2 cached to L2-only,
+			 * realloc to remove the L1 header.
+			 */
+			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+			    hdr_l2only_cache);
+		} else {
+			ASSERT(hdr->b_l1hdr.b_pdata == NULL);
+			arc_change_state(arc_anon, hdr, hash_lock);
 			arc_hdr_destroy(hdr);
+		}
+		return (bytes_evicted);
+	}
+
+	ASSERT(state == arc_mru || state == arc_mfu);
+	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+	/* prefetch buffers have a minimum lifespan */
+	if (HDR_IO_IN_PROGRESS(hdr) ||
+	    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
+	    arc_min_prefetch_lifespan)) {
+		ARCSTAT_BUMP(arcstat_evict_skip);
+		return (bytes_evicted);
+	}
+
+	ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+	while (hdr->b_l1hdr.b_buf) {
+		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+		if (!mutex_tryenter(&buf->b_evict_lock)) {
+			ARCSTAT_BUMP(arcstat_mutex_miss);
+			break;
+		}
+		if (buf->b_data != NULL)
+			bytes_evicted += HDR_GET_LSIZE(hdr);
+		mutex_exit(&buf->b_evict_lock);
+		arc_buf_destroy_impl(buf, B_TRUE);
+	}
+
+	if (HDR_HAS_L2HDR(hdr)) {
+		ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
 	} else {
-		if (remove_reference(hdr, NULL, tag) > 0) {
-			ASSERT(HDR_IO_ERROR(hdr));
-			arc_buf_destroy(buf, FALSE, TRUE);
+		if (l2arc_write_eligible(hdr->b_spa, hdr)) {
+			ARCSTAT_INCR(arcstat_evict_l2_eligible,
+			    HDR_GET_LSIZE(hdr));
 		} else {
-			arc_hdr_destroy(hdr);
+			ARCSTAT_INCR(arcstat_evict_l2_ineligible,
+			    HDR_GET_LSIZE(hdr));
 		}
 	}
+
+	if (hdr->b_l1hdr.b_bufcnt == 0) {
+		arc_cksum_free(hdr);
+
+		bytes_evicted += arc_hdr_size(hdr);
+
+		/*
+		 * If this hdr is being evicted and has a compressed
+		 * buffer then we discard it here before we change states.
+		 * This ensures that the accounting is updated correctly
+		 * in arc_free_data_buf().
+		 */
+		arc_hdr_free_pdata(hdr);
+
+		arc_change_state(evicted_state, hdr, hash_lock);
+		ASSERT(HDR_IN_HASH_TABLE(hdr));
+		arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+		DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
+	}
+
+	return (bytes_evicted);
 }
 
-int
-arc_buf_remove_ref(arc_buf_t *buf, void* tag)
+static uint64_t
+arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
+    uint64_t spa, int64_t bytes)
 {
-	arc_buf_hdr_t *hdr = buf->b_hdr;
-	kmutex_t *hash_lock = HDR_LOCK(hdr);
-	int no_callback = (buf->b_efunc == NULL);
+	multilist_sublist_t *mls;
+	uint64_t bytes_evicted = 0;
+	arc_buf_hdr_t *hdr;
+	kmutex_t *hash_lock;
+	int evict_count = 0;
 
-	if (hdr->b_state == arc_anon) {
-		ASSERT(hdr->b_datacnt == 1);
-		arc_buf_free(buf, tag);
-		return (no_callback);
-	}
+	ASSERT3P(marker, !=, NULL);
+	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
 
-	mutex_enter(hash_lock);
-	ASSERT(hdr->b_state != arc_anon);
-	ASSERT(buf->b_data != NULL);
+	mls = multilist_sublist_lock(ml, idx);
 
-	(void) remove_reference(hdr, hash_lock, tag);
-	if (hdr->b_datacnt > 1) {
-		if (no_callback)
-			arc_buf_destroy(buf, FALSE, TRUE);
-	} else if (no_callback) {
-		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
-		ASSERT(buf->b_efunc == NULL);
-		hdr->b_flags |= ARC_BUF_AVAILABLE;
+	for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+	    hdr = multilist_sublist_prev(mls, marker)) {
+		if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
+		    (evict_count >= zfs_arc_evict_batch_limit))
+			break;
+
+		/*
+		 * To keep our iteration location, move the marker
+		 * forward. Since we're not holding hdr's hash lock, we
+		 * must be very careful and not remove 'hdr' from the
+		 * sublist. Otherwise, other consumers might mistake the
+		 * 'hdr' as not being on a sublist when they call the
+		 * multilist_link_active() function (they all rely on
+		 * the hash lock protecting concurrent insertions and
+		 * removals). multilist_sublist_move_forward() was
+		 * specifically implemented to ensure this is the case
+		 * (only 'marker' will be removed and re-inserted).
+		 */
+		multilist_sublist_move_forward(mls, marker);
+
+		/*
+		 * The only case where the b_spa field should ever be
+		 * zero, is the marker headers inserted by
+		 * arc_evict_state(). It's possible for multiple threads
+		 * to be calling arc_evict_state() concurrently (e.g.
+		 * dsl_pool_close() and zio_inject_fault()), so we must
+		 * skip any markers we see from these other threads.
+		 */
+		if (hdr->b_spa == 0)
+			continue;
+
+		/* we're only interested in evicting buffers of a certain spa */
+		if (spa != 0 && hdr->b_spa != spa) {
+			ARCSTAT_BUMP(arcstat_evict_skip);
+			continue;
+		}
+
+		hash_lock = HDR_LOCK(hdr);
+
+		/*
+		 * We aren't calling this function from any code path
+		 * that would already be holding a hash lock, so we're
+		 * asserting on this assumption to be defensive in case
+		 * this ever changes. Without this check, it would be
+		 * possible to incorrectly increment arcstat_mutex_miss
+		 * below (e.g. if the code changed such that we called
+		 * this function with a hash lock held).
+		 */
+		ASSERT(!MUTEX_HELD(hash_lock));
+
+		if (mutex_tryenter(hash_lock)) {
+			uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
+			mutex_exit(hash_lock);
+
+			bytes_evicted += evicted;
+
+			/*
+			 * If evicted is zero, arc_evict_hdr() must have
+			 * decided to skip this header, don't increment
+			 * evict_count in this case.
+			 */
+			if (evicted != 0)
+				evict_count++;
+
+			/*
+			 * If arc_size isn't overflowing, signal any
+			 * threads that might happen to be waiting.
+			 *
+			 * For each header evicted, we wake up a single
+			 * thread. If we used cv_broadcast, we could
+			 * wake up "too many" threads causing arc_size
+			 * to significantly overflow arc_c; since
+			 * arc_get_data_buf() doesn't check for overflow
+			 * when it's woken up (it doesn't because it's
+			 * possible for the ARC to be overflowing while
+			 * full of un-evictable buffers, and the
+			 * function should proceed in this case).
+			 *
+			 * If threads are left sleeping, due to not
+			 * using cv_broadcast, they will be woken up
+			 * just before arc_reclaim_thread() sleeps.
+			 */
+			mutex_enter(&arc_reclaim_lock);
+			if (!arc_is_overflowing())
+				cv_signal(&arc_reclaim_waiters_cv);
+			mutex_exit(&arc_reclaim_lock);
+		} else {
+			ARCSTAT_BUMP(arcstat_mutex_miss);
+		}
 	}
-	ASSERT(no_callback || hdr->b_datacnt > 1 ||
-	    refcount_is_zero(&hdr->b_refcnt));
-	mutex_exit(hash_lock);
-	return (no_callback);
+
+	multilist_sublist_unlock(mls);
+
+	return (bytes_evicted);
 }
 
-int
-arc_buf_size(arc_buf_t *buf)
+/*
+ * Evict buffers from the given arc state, until we've removed the
+ * specified number of bytes. Move the removed buffers to the
+ * appropriate evict state.
+ *
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so, may not catch all candidates.
+ * It may also return without evicting as much space as requested.
+ *
+ * If bytes is specified using the special value ARC_EVICT_ALL, this
+ * will evict all available (i.e. unlocked and evictable) buffers from
+ * the given arc state; which is used by arc_flush().
+ */
+static uint64_t
+arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type)
 {
-	return (buf->b_hdr->b_size);
+	uint64_t total_evicted = 0;
+	multilist_t *ml = &state->arcs_list[type];
+	int num_sublists;
+	arc_buf_hdr_t **markers;
+
+	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+	num_sublists = multilist_get_num_sublists(ml);
+
+	/*
+	 * If we've tried to evict from each sublist, made some
+	 * progress, but still have not hit the target number of bytes
+	 * to evict, we want to keep trying. The markers allow us to
+	 * pick up where we left off for each individual sublist, rather
+	 * than starting from the tail each time.
+	 */
+	markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
+	for (int i = 0; i < num_sublists; i++) {
+		markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+		/*
+		 * A b_spa of 0 is used to indicate that this header is
+		 * a marker. This fact is used in arc_adjust_type() and
+		 * arc_evict_state_impl().
+		 */
+		markers[i]->b_spa = 0;
+
+		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		multilist_sublist_insert_tail(mls, markers[i]);
+		multilist_sublist_unlock(mls);
+	}
+
+	/*
+	 * While we haven't hit our target number of bytes to evict, or
+	 * we're evicting all available buffers.
+	 */
+	while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
+		/*
+		 * Start eviction using a randomly selected sublist,
+		 * this is to try and evenly balance eviction across all
+		 * sublists. Always starting at the same sublist
+		 * (e.g. index 0) would cause evictions to favor certain
+		 * sublists over others.
+		 */
+		int sublist_idx = multilist_get_random_index(ml);
+		uint64_t scan_evicted = 0;
+
+		for (int i = 0; i < num_sublists; i++) {
+			uint64_t bytes_remaining;
+			uint64_t bytes_evicted;
+
+			if (bytes == ARC_EVICT_ALL)
+				bytes_remaining = ARC_EVICT_ALL;
+			else if (total_evicted < bytes)
+				bytes_remaining = bytes - total_evicted;
+			else
+				break;
+
+			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
+			    markers[sublist_idx], spa, bytes_remaining);
+
+			scan_evicted += bytes_evicted;
+			total_evicted += bytes_evicted;
+
+			/* we've reached the end, wrap to the beginning */
+			if (++sublist_idx >= num_sublists)
+				sublist_idx = 0;
+		}
+
+		/*
+		 * If we didn't evict anything during this scan, we have
+		 * no reason to believe we'll evict more during another
+		 * scan, so break the loop.
+		 */
+		if (scan_evicted == 0) {
+			/* This isn't possible, let's make that obvious */
+			ASSERT3S(bytes, !=, 0);
+
+			/*
+			 * When bytes is ARC_EVICT_ALL, the only way to
+			 * break the loop is when scan_evicted is zero.
+			 * In that case, we actually have evicted enough,
+			 * so we don't want to increment the kstat.
+			 */
+			if (bytes != ARC_EVICT_ALL) {
+				ASSERT3S(total_evicted, <, bytes);
+				ARCSTAT_BUMP(arcstat_evict_not_enough);
+			}
+
+			break;
+		}
+	}
+
+	for (int i = 0; i < num_sublists; i++) {
+		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		multilist_sublist_remove(mls, markers[i]);
+		multilist_sublist_unlock(mls);
+
+		kmem_cache_free(hdr_full_cache, markers[i]);
+	}
+	kmem_free(markers, sizeof (*markers) * num_sublists);
+
+	return (total_evicted);
 }
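
The marker pattern used by the two functions above, boiled down (editorial
sketch; hash-lock handling and statistics elided):

	multilist_sublist_insert_tail(mls, marker);
	while ((hdr = multilist_sublist_prev(mls, marker)) != NULL) {
		multilist_sublist_move_forward(mls, marker);	/* keep our place */
		/* try the hash lock; evict hdr only if we get it */
	}
	multilist_sublist_remove(mls, marker);

Only the marker is ever removed and re-inserted, so a concurrent walker never
sees a live header leave the sublist without its hash lock held.
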
 
 /*
- * Evict buffers from list until we've removed the specified number of
- * bytes.  Move the removed buffers to the appropriate evict state.
- * If the recycle flag is set, then attempt to "recycle" a buffer:
- * - look for a buffer to evict that is `bytes' long.
- * - return the data block from this buffer rather than freeing it.
- * This flag is used by callers that are trying to make space for a
- * new buffer in a full arc cache.
+ * Flush all "evictable" data of the given type from the arc state
+ * specified. This will not evict any "active" buffers (i.e. referenced).
  *
- * This function makes a "best effort".  It skips over any buffers
- * it can't get a hash_lock on, and so may not catch all candidates.
- * It may also return without evicting as much space as requested.
+ * When 'retry' is set to B_FALSE, the function will make a single pass
+ * over the state and evict any buffers that it can. Since it doesn't
+ * continually retry the eviction, it might end up leaving some buffers
+ * in the ARC due to lock misses.
+ *
+ * When 'retry' is set to B_TRUE, the function will continually retry the
+ * eviction until *all* evictable buffers have been removed from the
+ * state. As a result, if concurrent insertions into the state are
+ * allowed (e.g. if the ARC isn't shutting down), this function might
+ * wind up in an infinite loop, continually trying to evict buffers.
  */
-static void *
-arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
-    arc_buf_contents_t type)
+static uint64_t
+arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
+    boolean_t retry)
 {
-	arc_state_t *evicted_state;
-	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
-	arc_buf_hdr_t *ab, *ab_prev = NULL;
-	list_t *list = &state->arcs_list[type];
-	kmutex_t *hash_lock;
-	boolean_t have_lock;
-	void *stolen = NULL;
-
-	ASSERT(state == arc_mru || state == arc_mfu);
+	uint64_t evicted = 0;
 
-	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+	while (refcount_count(&state->arcs_esize[type]) != 0) {
+		evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
 
-	mutex_enter(&state->arcs_mtx);
-	mutex_enter(&evicted_state->arcs_mtx);
+		if (!retry)
+			break;
+	}
 
-	for (ab = list_tail(list); ab; ab = ab_prev) {
-		ab_prev = list_prev(list, ab);
-		/* prefetch buffers have a minimum lifespan */
-		if (HDR_IO_IN_PROGRESS(ab) ||
-		    (spa && ab->b_spa != spa) ||
-		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
-		    ddi_get_lbolt() - ab->b_arc_access <
-		    arc_min_prefetch_lifespan)) {
-			skipped++;
-			continue;
-		}
-		/* "lookahead" for better eviction candidate */
-		if (recycle && ab->b_size != bytes &&
-		    ab_prev && ab_prev->b_size == bytes)
-			continue;
-		hash_lock = HDR_LOCK(ab);
-		have_lock = MUTEX_HELD(hash_lock);
-		if (have_lock || mutex_tryenter(hash_lock)) {
-			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
-			ASSERT(ab->b_datacnt > 0);
-			while (ab->b_buf) {
-				arc_buf_t *buf = ab->b_buf;
-				if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
-					missed += 1;
-					break;
-				}
-				if (buf->b_data) {
-					bytes_evicted += ab->b_size;
-					if (recycle && ab->b_type == type &&
-					    ab->b_size == bytes &&
-					    !HDR_L2_WRITING(ab)) {
-						stolen = buf->b_data;
-						recycle = FALSE;
-					}
-				}
-				if (buf->b_efunc) {
-					mutex_enter(&arc_eviction_mtx);
-					arc_buf_destroy(buf,
-					    buf->b_data == stolen, FALSE);
-					ab->b_buf = buf->b_next;
-					buf->b_hdr = &arc_eviction_hdr;
-					buf->b_next = arc_eviction_list;
-					arc_eviction_list = buf;
-					mutex_exit(&arc_eviction_mtx);
-					rw_exit(&buf->b_lock);
-				} else {
-					rw_exit(&buf->b_lock);
-					arc_buf_destroy(buf,
-					    buf->b_data == stolen, TRUE);
-				}
-			}
+	return (evicted);
+}
 
-			if (ab->b_l2hdr) {
-				ARCSTAT_INCR(arcstat_evict_l2_cached,
-				    ab->b_size);
-			} else {
-				if (l2arc_write_eligible(ab->b_spa, ab)) {
-					ARCSTAT_INCR(arcstat_evict_l2_eligible,
-					    ab->b_size);
-				} else {
-					ARCSTAT_INCR(
-					    arcstat_evict_l2_ineligible,
-					    ab->b_size);
-				}
-			}
+/*
+ * Evict the specified number of bytes from the state specified,
+ * restricting eviction to the spa and type given. This function
+ * prevents us from trying to evict more from a state's list than
+ * is "evictable", and to skip evicting altogether when passed a
+ * negative value for "bytes". In contrast, arc_evict_state() will
+ * evict everything it can, when passed a negative value for "bytes".
+ */
+static uint64_t
+arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type)
+{
+	int64_t delta;
 
-			if (ab->b_datacnt == 0) {
-				arc_change_state(evicted_state, ab, hash_lock);
-				ASSERT(HDR_IN_HASH_TABLE(ab));
-				ab->b_flags |= ARC_IN_HASH_TABLE;
-				ab->b_flags &= ~ARC_BUF_AVAILABLE;
-				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
-			}
-			if (!have_lock)
-				mutex_exit(hash_lock);
-			if (bytes >= 0 && bytes_evicted >= bytes)
-				break;
-		} else {
-			missed += 1;
-		}
+	if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) {
+		delta = MIN(refcount_count(&state->arcs_esize[type]), bytes);
+		return (arc_evict_state(state, spa, delta, type));
 	}
 
-	mutex_exit(&evicted_state->arcs_mtx);
-	mutex_exit(&state->arcs_mtx);
+	return (0);
+}
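
For example (numbers assumed): if a caller requests bytes = 100 MB but only
30 MB of the state's buffers of that type are evictable, then
delta = MIN(30 MB, 100 MB) = 30 MB is passed to arc_evict_state(); a zero or
negative 'bytes' skips eviction entirely and returns 0.
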
 
-	if (bytes_evicted < bytes)
-		dprintf("only evicted %lld bytes from %x",
-		    (longlong_t)bytes_evicted, state);
+/*
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
+ */
+static uint64_t
+arc_adjust_meta(void)
+{
+	uint64_t total_evicted = 0;
+	int64_t target;
 
-	if (skipped)
-		ARCSTAT_INCR(arcstat_evict_skip, skipped);
+	/*
+	 * If we're over the meta limit, we want to evict enough
+	 * metadata to get back under the meta limit. We don't want to
+	 * evict so much that we drop the MRU below arc_p, though. If
+	 * we're over the meta limit more than we're over arc_p, we
+	 * evict some from the MRU here, and some from the MFU below.
+	 */
+	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+	    (int64_t)(refcount_count(&arc_anon->arcs_size) +
+	    refcount_count(&arc_mru->arcs_size) - arc_p));
 
-	if (missed)
-		ARCSTAT_INCR(arcstat_mutex_miss, missed);
+	total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
 
 	/*
-	 * We have just evicted some date into the ghost state, make
-	 * sure we also adjust the ghost state size if necessary.
+	 * Similar to the above, we want to evict enough bytes to get us
+	 * below the meta limit, but not so much as to drop us below the
+	 * space allotted to the MFU (which is defined as arc_c - arc_p).
 	 */
-	if (arc_no_grow &&
-	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
-		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
-		    arc_mru_ghost->arcs_size - arc_c;
+	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+	    (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
 
-		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
-			int64_t todelete =
-			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
-			arc_evict_ghost(arc_mru_ghost, 0, todelete);
-		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
-			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
-			    arc_mru_ghost->arcs_size +
-			    arc_mfu_ghost->arcs_size - arc_c);
-			arc_evict_ghost(arc_mfu_ghost, 0, todelete);
-		}
-	}
+	total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
 
-	return (stolen);
+	return (total_evicted);
 }
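
A worked example with assumed numbers: if arc_meta_used = 600 MB against
arc_meta_limit = 500 MB, there are 100 MB to shed. If anon + MRU exceed
arc_p by only 40 MB, the first target is MIN(100 MB, 40 MB) = 40 MB of MRU
metadata; the second pass then takes its own MIN() against how far the MFU
exceeds arc_c - arc_p, aiming to evict the remaining 60 MB from MFU
metadata.
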
 
 /*
- * Remove buffers from list until we've removed the specified number of
- * bytes.  Destroy the buffers that are removed.
+ * Return the type of the oldest buffer in the given arc state
+ *
+ * This function will select a random sublist of type ARC_BUFC_DATA and
+ * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
+ * is compared, and the type which contains the "older" buffer will be
+ * returned.
  */
-static void
-arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
+static arc_buf_contents_t
+arc_adjust_type(arc_state_t *state)
 {
-	arc_buf_hdr_t *ab, *ab_prev;
-	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
-	kmutex_t *hash_lock;
-	uint64_t bytes_deleted = 0;
-	uint64_t bufs_skipped = 0;
-	boolean_t have_lock;
+	multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
+	multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
+	int data_idx = multilist_get_random_index(data_ml);
+	int meta_idx = multilist_get_random_index(meta_ml);
+	multilist_sublist_t *data_mls;
+	multilist_sublist_t *meta_mls;
+	arc_buf_contents_t type;
+	arc_buf_hdr_t *data_hdr;
+	arc_buf_hdr_t *meta_hdr;
 
-	ASSERT(GHOST_STATE(state));
-top:
-	mutex_enter(&state->arcs_mtx);
-	for (ab = list_tail(list); ab; ab = ab_prev) {
-		ab_prev = list_prev(list, ab);
-		if (spa && ab->b_spa != spa)
-			continue;
-		hash_lock = HDR_LOCK(ab);
-		have_lock = MUTEX_HELD(hash_lock);
-		if (have_lock || mutex_tryenter(hash_lock)) {
-			ASSERT(!HDR_IO_IN_PROGRESS(ab));
-			ASSERT(ab->b_buf == NULL);
-			ARCSTAT_BUMP(arcstat_deleted);
-			bytes_deleted += ab->b_size;
+	/*
+	 * We keep the sublist lock until we're finished, to prevent
+	 * the headers from being destroyed via arc_evict_state().
+	 */
+	data_mls = multilist_sublist_lock(data_ml, data_idx);
+	meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
 
-			if (ab->b_l2hdr != NULL) {
-				/*
-				 * This buffer is cached on the 2nd Level ARC;
-				 * don't destroy the header.
-				 */
-				arc_change_state(arc_l2c_only, ab, hash_lock);
-				if (!have_lock)
-					mutex_exit(hash_lock);
-			} else {
-				arc_change_state(arc_anon, ab, hash_lock);
-				if (!have_lock)
-					mutex_exit(hash_lock);
-				arc_hdr_destroy(ab);
-			}
+	/*
+	 * These two loops are to ensure we skip any markers that
+	 * might be at the tail of the lists due to arc_evict_state().
+	 */
 
-			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
-			if (bytes >= 0 && bytes_deleted >= bytes)
-				break;
-		} else {
-			if (bytes < 0) {
-				mutex_exit(&state->arcs_mtx);
-				mutex_enter(hash_lock);
-				mutex_exit(hash_lock);
-				goto top;
-			}
-			bufs_skipped += 1;
-		}
+	for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
+	    data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
+		if (data_hdr->b_spa != 0)
+			break;
 	}
-	mutex_exit(&state->arcs_mtx);
 
-	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
-	    (bytes < 0 || bytes_deleted < bytes)) {
-		list = &state->arcs_list[ARC_BUFC_METADATA];
-		goto top;
+	for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
+	    meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
+		if (meta_hdr->b_spa != 0)
+			break;
 	}
 
-	if (bufs_skipped) {
-		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
-		ASSERT(bytes >= 0);
+	if (data_hdr == NULL && meta_hdr == NULL) {
+		type = ARC_BUFC_DATA;
+	} else if (data_hdr == NULL) {
+		ASSERT3P(meta_hdr, !=, NULL);
+		type = ARC_BUFC_METADATA;
+	} else if (meta_hdr == NULL) {
+		ASSERT3P(data_hdr, !=, NULL);
+		type = ARC_BUFC_DATA;
+	} else {
+		ASSERT3P(data_hdr, !=, NULL);
+		ASSERT3P(meta_hdr, !=, NULL);
+
+		/* The headers can't be on the sublist without an L1 header */
+		ASSERT(HDR_HAS_L1HDR(data_hdr));
+		ASSERT(HDR_HAS_L1HDR(meta_hdr));
+
+		if (data_hdr->b_l1hdr.b_arc_access <
+		    meta_hdr->b_l1hdr.b_arc_access) {
+			type = ARC_BUFC_DATA;
+		} else {
+			type = ARC_BUFC_METADATA;
+		}
 	}
 
-	if (bytes_deleted < bytes)
-		dprintf("only deleted %lld bytes from %p",
-		    (longlong_t)bytes_deleted, state);
+	multilist_sublist_unlock(meta_mls);
+	multilist_sublist_unlock(data_mls);
+
+	return (type);
 }
 
-static void
+/*
+ * Evict buffers from the cache, such that arc_size is capped by arc_c.
+ */
+static uint64_t
 arc_adjust(void)
 {
-	int64_t adjustment, delta;
+	uint64_t total_evicted = 0;
+	uint64_t bytes;
+	int64_t target;
 
 	/*
-	 * Adjust MRU size
+	 * If we're over arc_meta_limit, we want to correct that before
+	 * potentially evicting data buffers below.
 	 */
+	total_evicted += arc_adjust_meta();
 
-	adjustment = MIN(arc_size - arc_c,
-	    arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p);
+	/*
+	 * Adjust MRU size
+	 *
+	 * If we're over the target cache size, we want to evict enough
+	 * from the list to get back to our target size. We don't want
+	 * to evict too much from the MRU, such that it drops below
+	 * arc_p. So, if we're over our target cache size more than
+	 * the MRU is over arc_p, we'll evict enough to get back to
+	 * arc_p here, and then evict more from the MFU below.
+	 */
+	target = MIN((int64_t)(arc_size - arc_c),
+	    (int64_t)(refcount_count(&arc_anon->arcs_size) +
+	    refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
+
+	/*
+	 * If we're below arc_meta_min, always prefer to evict data.
+	 * Otherwise, try to satisfy the requested number of bytes to
+	 * evict from the type which contains older buffers; in an
+	 * effort to keep newer buffers in the cache regardless of their
+	 * type. If we cannot satisfy the number of bytes from this
+	 * type, spill over into the next type.
+	 */
+	if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
+	    arc_meta_used > arc_meta_min) {
+		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+		total_evicted += bytes;
 
-	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
-		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
-		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
-		adjustment -= delta;
-	}
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * metadata, we try to get the rest from data.
+		 */
+		target -= bytes;
 
-	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
-		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
-		(void) arc_evict(arc_mru, 0, delta, FALSE,
-		    ARC_BUFC_METADATA);
+		total_evicted +=
+		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+	} else {
+		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+		total_evicted += bytes;
+
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * data, we try to get the rest from metadata.
+		 */
+		target -= bytes;
+
+		total_evicted +=
+		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
 	}
 
 	/*
 	 * Adjust MFU size
-	 */
+	 *
+	 * Now that we've tried to evict enough from the MRU to get its
+	 * size back to arc_p, if we're still above the target cache
+	 * size, we evict the rest from the MFU.
+	 */
+	target = arc_size - arc_c;
+
+	if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
+	    arc_meta_used > arc_meta_min) {
+		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+		total_evicted += bytes;
+
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * metadata, we try to get the rest from data.
+		 */
+		target -= bytes;
 
-	adjustment = arc_size - arc_c;
+		total_evicted +=
+		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+	} else {
+		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+		total_evicted += bytes;
 
-	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
-		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
-		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
-		adjustment -= delta;
-	}
+		/*
+		 * If we couldn't evict our target number of bytes from
+		 * data, we try to get the rest from metadata.
+		 */
+		target -= bytes;
 
-	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
-		int64_t delta = MIN(adjustment,
-		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
-		(void) arc_evict(arc_mfu, 0, delta, FALSE,
-		    ARC_BUFC_METADATA);
+		total_evicted +=
+		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
 	}
 
 	/*
 	 * Adjust ghost lists
+	 *
+	 * In addition to the above, the ARC also defines target values
+	 * for the ghost lists. The sum of the mru list and mru ghost
+	 * list should never exceed the target size of the cache, and
+	 * the sum of the mru list, mfu list, mru ghost list, and mfu
+	 * ghost list should never exceed twice the target size of the
+	 * cache. The following logic enforces these limits on the ghost
+	 * caches, and evicts from them as needed.
 	 */
+	target = refcount_count(&arc_mru->arcs_size) +
+	    refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
 
-	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
+	bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
+	total_evicted += bytes;
 
-	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
-		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
-		arc_evict_ghost(arc_mru_ghost, 0, delta);
-	}
+	target -= bytes;
 
-	adjustment =
-	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
+	total_evicted +=
+	    arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
 
-	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
-		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
-		arc_evict_ghost(arc_mfu_ghost, 0, delta);
-	}
-}
+	/*
+	 * We assume the sum of the mru list and mfu list is less than
+	 * or equal to arc_c (we enforced this above), which means we
+	 * can use the simpler of the two equations below:
+	 *
+	 *	mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
+	 *		    mru ghost + mfu ghost <= arc_c
+	 */
+	target = refcount_count(&arc_mru_ghost->arcs_size) +
+	    refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
 
-static void
-arc_do_user_evicts(void)
-{
-	mutex_enter(&arc_eviction_mtx);
-	while (arc_eviction_list != NULL) {
-		arc_buf_t *buf = arc_eviction_list;
-		arc_eviction_list = buf->b_next;
-		rw_enter(&buf->b_lock, RW_WRITER);
-		buf->b_hdr = NULL;
-		rw_exit(&buf->b_lock);
-		mutex_exit(&arc_eviction_mtx);
+	bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
+	total_evicted += bytes;
 
-		if (buf->b_efunc != NULL)
-			VERIFY(buf->b_efunc(buf) == 0);
+	target -= bytes;
 
-		buf->b_efunc = NULL;
-		buf->b_private = NULL;
-		kmem_cache_free(buf_cache, buf);
-		mutex_enter(&arc_eviction_mtx);
-	}
-	mutex_exit(&arc_eviction_mtx);
+	total_evicted +=
+	    arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
+
+	return (total_evicted);
 }
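
A worked example of the ghost-list clamps (numbers assumed): with
arc_c = 1024 MB, MRU = 700 MB, and MRU ghost = 500 MB, the first target is
700 + 500 - 1024 = 176 MB to evict from the MRU ghost list. Afterwards,
MRU ghost (324 MB) + MFU ghost (say 600 MB) - 1024 MB is negative, so the
second clamp evicts nothing from the MFU ghost list.
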
 
-/*
- * Flush all *evictable* data from the cache for the given spa.
- * NOTE: this will not touch "active" (i.e. referenced) data.
- */
 void
-arc_flush(spa_t *spa)
+arc_flush(spa_t *spa, boolean_t retry)
 {
 	uint64_t guid = 0;
 
-	if (spa)
-		guid = spa_guid(spa);
+	/*
+	 * If retry is B_TRUE, a spa must not be specified since we have
+	 * no good way to determine if all of a spa's buffers have been
+	 * evicted from an arc state.
+	 */
+	ASSERT(!retry || spa == 0);
 
-	while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
-		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
-		if (spa)
-			break;
-	}
-	while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
-		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
-		if (spa)
-			break;
-	}
-	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
-		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
-		if (spa)
-			break;
-	}
-	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
-		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
-		if (spa)
-			break;
-	}
+	if (spa != NULL)
+		guid = spa_load_guid(spa);
+
+	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
+	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
+
+	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
+	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
 
-	arc_evict_ghost(arc_mru_ghost, guid, -1);
-	arc_evict_ghost(arc_mfu_ghost, guid, -1);
+	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
+	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
 
-	mutex_enter(&arc_reclaim_thr_lock);
-	arc_do_user_evicts();
-	mutex_exit(&arc_reclaim_thr_lock);
-	ASSERT(spa || arc_eviction_list == NULL);
+	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
+	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
 }
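
Hedged call-site sketch (the contexts are assumed; they are not part of this
diff):

	arc_flush(spa, B_FALSE);	/* pool going away: one best-effort pass */
	arc_flush(NULL, B_TRUE);	/* ARC teardown: retry until truly empty */

Per the ASSERT above, the retry == B_TRUE form must not name a spa.
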
 
 void
-arc_shrink(void)
+arc_shrink(int64_t to_free)
 {
 	if (arc_c > arc_c_min) {
-		uint64_t to_free;
-
-#ifdef _KERNEL
-		to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
-#else
-		to_free = arc_c >> arc_shrink_shift;
-#endif
+		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
+			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
 		if (arc_c > arc_c_min + to_free)
 			atomic_add_64(&arc_c, -to_free);
 		else
@@ -1960,29 +3862,79 @@ arc_shrink(void)
 			arc_c = MAX(arc_size, arc_c_min);
 		if (arc_p > arc_c)
 			arc_p = (arc_c >> 1);
+
+		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
+			arc_p);
+
 		ASSERT(arc_c >= arc_c_min);
 		ASSERT((int64_t)arc_p >= 0);
 	}
 
-	if (arc_size > arc_c)
-		arc_adjust();
+	if (arc_size > arc_c) {
+		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
+			uint64_t, arc_c);
+		(void) arc_adjust();
+	}
 }
 
-static int
-arc_reclaim_needed(void)
+static long needfree = 0;
+
+typedef enum free_memory_reason_t {
+	FMR_UNKNOWN,
+	FMR_NEEDFREE,
+	FMR_LOTSFREE,
+	FMR_SWAPFS_MINFREE,
+	FMR_PAGES_PP_MAXIMUM,
+	FMR_HEAP_ARENA,
+	FMR_ZIO_ARENA,
+	FMR_ZIO_FRAG,
+} free_memory_reason_t;
+
+int64_t last_free_memory;
+free_memory_reason_t last_free_reason;
+
+/*
+ * Additional reserve of pages for pp_reserve.
+ */
+int64_t arc_pages_pp_reserve = 64;
+
+/*
+ * Additional reserve of pages for swapfs.
+ */
+int64_t arc_swapfs_reserve = 64;
+
+/*
+ * Return the amount of memory that can be consumed before reclaim will be
+ * needed.  Positive if there is sufficient free memory, negative indicates
+ * the amount of memory that needs to be freed up.
+ */
+static int64_t
+arc_available_memory(void)
 {
-	uint64_t extra;
+	int64_t lowest = INT64_MAX;
+	int64_t n;
+	free_memory_reason_t r = FMR_UNKNOWN;
 
 #ifdef _KERNEL
-
-	if (needfree)
-		return (1);
+	if (needfree > 0) {
+		n = PAGESIZE * (-needfree);
+		if (n < lowest) {
+			lowest = n;
+			r = FMR_NEEDFREE;
+		}
+	}
 
 	/*
-	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
+	 * Cooperate with pagedaemon when it's time for it to scan
+	 * and reclaim some pages.
 	 */
-	extra = desfree;
+	n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_LOTSFREE;
+	}
 
+#ifdef illumos
 	/*
 	 * check that we're out of range of the pageout scanner.  It starts to
 	 * schedule paging if freemem is less than lotsfree and needfree.
@@ -1990,8 +3942,11 @@ arc_reclaim_needed(void)
 	 * number of needed free pages.  We add extra pages here to make sure
 	 * the scanner doesn't start up while we're freeing memory.
 	 */
-	if (freemem < lotsfree + needfree + extra)
-		return (1);
+	n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_LOTSFREE;
+	}
 
 	/*
 	 * check to make sure that swapfs has enough space so that anon
@@ -2000,14 +3955,34 @@ arc_reclaim_needed(void)
 	 * swap pages.  We also add a bit of extra here just to prevent
 	 * circumstances from getting really dire.
 	 */
-	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
-		return (1);
+	n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
+	    desfree - arc_swapfs_reserve);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_SWAPFS_MINFREE;
+	}
+
 
-#if defined(__i386)
+	/*
+	 * Check that we have enough availrmem that memory locking (e.g., via
+	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
+	 * stores the number of pages that cannot be locked; when availrmem
+	 * drops below pages_pp_maximum, page locking mechanisms such as
+	 * page_pp_lock() will fail.)
+	 */
+	n = PAGESIZE * (availrmem - pages_pp_maximum -
+	    arc_pages_pp_reserve);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_PAGES_PP_MAXIMUM;
+	}
+
+#endif	/* illumos */
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
 	/*
 	 * On i386, or any platform lacking UMA_MD_SMALL_ALLOC, we may exhaust the
 	 * kernel heap space before we ever run out of available physical
-	 * memory.  Most checks of the size of the kmem_area compare against
+	 * memory.  Most checks of the size of the heap_area compare against
 	 * tune.t_minarmem, which is the minimum available real memory that we
 	 * can have in the system.  However, this is generally fixed at 25 pages
 	 * which is so low that it's useless.  In this comparison, we seek to
@@ -2015,27 +3990,87 @@ arc_reclaim_needed(void)
 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
 	 * free)
 	 */
-	if (btop(vmem_size(kmem_arena, VMEM_FREE)) <
-	    (btop(vmem_size(kmem_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
-		return (1);
+	n = (int64_t)vmem_size(heap_arena, VMEM_FREE) -
+	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_HEAP_ARENA;
+	}
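+/*
+ * No separate zio arena exists in this port; define zio_arena so the
+ * fragmentation check below is either skipped (NULL) or applied to the
+ * heap arena.
+ */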
+#define	zio_arena	NULL
+#else
+#define	zio_arena	heap_arena
 #endif
 
-#else
-	if (spa_get_random(100) == 0)
-		return (1);
+	/*
+	 * If zio data pages are being allocated out of a separate heap segment,
+	 * then enforce that the amount of free vmem in this arena remains
+	 * above about 1/16th of its allocated size.
+	 *
+	 * Note: The 1/16th arena free requirement was put in place
+	 * to aggressively evict memory from the arc in order to avoid
+	 * memory fragmentation issues.
+	 */
+	if (zio_arena != NULL) {
+		n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
+		    (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
+		if (n < lowest) {
+			lowest = n;
+			r = FMR_ZIO_ARENA;
+		}
+	}
+
+#if __FreeBSD__
+	/*
+	 * The above limits know nothing about the real level of KVA
+	 * fragmentation.  Start aggressive reclamation if too little
+	 * sequential KVA is left.
+	 */
+	if (lowest > 0) {
+		n = (vmem_size(heap_arena, VMEM_MAXFREE) < SPA_MAXBLOCKSIZE) ?
+		    -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) :
+		    INT64_MAX;
+		if (n < lowest) {
+			lowest = n;
+			r = FMR_ZIO_FRAG;
+		}
+	}
 #endif
-	return (0);
+
+#else	/* _KERNEL */
+	/* Every 100 calls, free a small amount */
+	if (spa_get_random(100) == 0)
+		lowest = -1024;
+#endif	/* _KERNEL */
+
+	last_free_memory = lowest;
+	last_free_reason = r;
+	DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
+	return (lowest);
 }
 
-static void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+
+/*
+ * Determine if the system is under memory pressure and is asking
+ * to reclaim memory. A return value of B_TRUE indicates that the system
+ * is under memory pressure and that the arc should adjust accordingly.
+ */
+static boolean_t
+arc_reclaim_needed(void)
+{
+	return (arc_available_memory() < 0);
+}
+
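+/* zio and range_seg kmem caches reaped by arc_kmem_reap_now() below. */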
+extern kmem_cache_t	*zio_buf_cache[];
+extern kmem_cache_t	*zio_data_buf_cache[];
+extern kmem_cache_t	*range_seg_cache;
+
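+/*
+ * Reap unused memory from the zio buffer caches and the ARC header caches
+ * (and, on illumos, from the zio arena's quantum caches).
+ */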
+static __noinline void
+arc_kmem_reap_now(void)
 {
 	size_t			i;
 	kmem_cache_t		*prev_cache = NULL;
 	kmem_cache_t		*prev_data_cache = NULL;
-	extern kmem_cache_t	*zio_buf_cache[];
-	extern kmem_cache_t	*zio_data_buf_cache[];
 
+	DTRACE_PROBE(arc__kmem_reap_start);
 #ifdef _KERNEL
 	if (arc_meta_used >= arc_meta_limit) {
 		/*
@@ -2052,13 +4087,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t
 #endif
 #endif
 
-	/*
-	 * An aggressive reclamation will shrink the cache size as well as
-	 * reap free buffers from the arc kmem caches.
-	 */
-	if (strat == ARC_RECLAIM_AGGR)
-		arc_shrink();
-
 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
 		if (zio_buf_cache[i] != prev_cache) {
 			prev_cache = zio_buf_cache[i];
@@ -2070,64 +4098,202 @@ arc_kmem_reap_now(arc_reclaim_strategy_t
 		}
 	}
 	kmem_cache_reap_now(buf_cache);
-	kmem_cache_reap_now(hdr_cache);
+	kmem_cache_reap_now(hdr_full_cache);
+	kmem_cache_reap_now(hdr_l2only_cache);
+	kmem_cache_reap_now(range_seg_cache);
+
+#ifdef illumos
+	if (zio_arena != NULL) {
+		/*
+		 * Ask the vmem arena to reclaim unused memory from its
+		 * quantum caches.
+		 */
+		vmem_qcache_reap(zio_arena);
+	}
+#endif
+	DTRACE_PROBE(arc__kmem_reap_end);
 }
 
+/*
+ * Threads can block in arc_get_data_buf() waiting for this thread to evict
+ * enough data and signal them to proceed. When this happens, the threads in
+ * arc_get_data_buf() are sleeping while holding the hash lock for their
+ * particular arc header. Thus, we must be careful to never sleep on a
+ * hash lock in this thread. This is to prevent the following deadlock:
+ *
+ *  - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
+ *    waiting for the reclaim thread to signal it.
+ *
+ *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
+ *    fails, and goes to sleep forever.
+ *
+ * This possible deadlock is avoided by always acquiring a hash lock
+ * using mutex_tryenter() from arc_reclaim_thread().
+ */
 static void
-arc_reclaim_thread(void *unused __unused)
+arc_reclaim_thread(void *dummy __unused)
 {
-	clock_t			growtime = 0;
-	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
+	hrtime_t		growtime = 0;
 	callb_cpr_t		cpr;
 
-	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+	CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
 
-	mutex_enter(&arc_reclaim_thr_lock);
-	while (arc_thread_exit == 0) {
-		if (arc_reclaim_needed()) {
+	mutex_enter(&arc_reclaim_lock);
+	while (!arc_reclaim_thread_exit) {
+		uint64_t evicted = 0;
 
-			if (arc_no_grow) {
-				if (last_reclaim == ARC_RECLAIM_CONS) {
-					last_reclaim = ARC_RECLAIM_AGGR;
-				} else {
-					last_reclaim = ARC_RECLAIM_CONS;
-				}
-			} else {
-				arc_no_grow = TRUE;
-				last_reclaim = ARC_RECLAIM_AGGR;
-				membar_producer();
-			}
+		/*
+		 * This is necessary in order for the mdb ::arc dcmd to
+		 * show up to date information. Since the ::arc command
+		 * show up-to-date information. Since the ::arc command
+		 * this call, the command may show stale stats for the
+		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+		 * with this change, the data might be up to 1 second
+		 * out of date; but that should suffice. The arc_state_t
+		 * structures can be queried directly if more accurate
+		 * information is needed.
+		 */
+		if (arc_ksp != NULL)
+			arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+
+		mutex_exit(&arc_reclaim_lock);
+
+		/*
+		 * We call arc_adjust() before (possibly) calling
+		 * arc_kmem_reap_now(), so that we can wake up
+		 * arc_get_data_buf() sooner.
+		 */
+		evicted = arc_adjust();
 
-			/* reset the growth delay for every reclaim */
-			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
+		int64_t free_memory = arc_available_memory();
+		if (free_memory < 0) {
 
-			arc_kmem_reap_now(last_reclaim);
+			arc_no_grow = B_TRUE;
 			arc_warm = B_TRUE;
 
-		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
-			arc_no_grow = FALSE;
+			/*
+			 * Wait at least arc_grow_retry (default 60) seconds
+			 * before considering growing.
+			 */
+			growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+
+			arc_kmem_reap_now();
+
+			/*
+			 * If we are still low on memory, shrink the ARC
+			 * so that we have about (arc_c >> arc_shrink_shift)
+			 * bytes of free space.
+			 */
+			free_memory = arc_available_memory();
+
+			int64_t to_free =
+			    (arc_c >> arc_shrink_shift) - free_memory;
+			if (to_free > 0) {
+#ifdef _KERNEL
+				to_free = MAX(to_free, ptob(needfree));
+#endif
+				arc_shrink(to_free);
+			}
+		} else if (free_memory < arc_c >> arc_no_grow_shift) {
+			arc_no_grow = B_TRUE;
+		} else if (gethrtime() >= growtime) {
+			arc_no_grow = B_FALSE;
 		}
 
-		if (2 * arc_c < arc_size +
-		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
-			arc_adjust();
+		mutex_enter(&arc_reclaim_lock);
 
-		if (arc_eviction_list != NULL)
-			arc_do_user_evicts();
+		/*
+		 * If evicted is zero, we couldn't evict anything via
+		 * arc_adjust(). This could be due to hash lock
+		 * collisions, but more likely due to the majority of
+		 * arc buffers being unevictable. Therefore, even if
+		 * arc_size is above arc_c, another pass is unlikely to
+		 * be helpful and could potentially cause us to enter an
+		 * infinite loop.
+		 */
+		if (arc_size <= arc_c || evicted == 0) {
+#ifdef _KERNEL
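+			/* Memory pressure has been handled; retire the request. */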
+			needfree = 0;
+#endif
+			/*
+			 * We're either no longer overflowing, or we
+			 * can't evict anything more, so we should wake
+			 * up any threads before we go to sleep.
+			 */
+			cv_broadcast(&arc_reclaim_waiters_cv);
 
-		/* block until needed, or one second, whichever is shorter */
-		CALLB_CPR_SAFE_BEGIN(&cpr);
-		(void) cv_timedwait(&arc_reclaim_thr_cv,
-		    &arc_reclaim_thr_lock, (hz));
-		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+			/*
+			 * Block until signaled, or after one second (we
+			 * might need to perform arc_kmem_reap_now()
+			 * even if we aren't being signaled)
+			 */
+			CALLB_CPR_SAFE_BEGIN(&cpr);
+			(void) cv_timedwait_hires(&arc_reclaim_thread_cv,
+			    &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
+			CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
+		}
 	}
 
-	arc_thread_exit = 0;
-	cv_broadcast(&arc_reclaim_thr_cv);
-	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
+	arc_reclaim_thread_exit = B_FALSE;
+	cv_broadcast(&arc_reclaim_thread_cv);
+	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_lock */
+	thread_exit();
+}
+
+#ifdef __FreeBSD__
+
+static u_int arc_dnlc_evicts_arg;
+extern struct vfsops zfs_vfsops;
+
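+/*
+ * Kernel thread that trims the vnode cache on request: dnlc_reduce_cache()
+ * records a percentage of desiredvnodes and wakes us up, and the
+ * vnlru_free() pass runs here so that the requester never blocks on it.
+ */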
+static void
+arc_dnlc_evicts_thread(void *dummy __unused)
+{
+	callb_cpr_t cpr;
+	u_int percent;
+
+	CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG);
+
+	mutex_enter(&arc_dnlc_evicts_lock);
+	while (!arc_dnlc_evicts_thread_exit) {
+		CALLB_CPR_SAFE_BEGIN(&cpr);
+		(void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
+		CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock);
+		if (arc_dnlc_evicts_arg != 0) {
+			percent = arc_dnlc_evicts_arg;
+			mutex_exit(&arc_dnlc_evicts_lock);
+#ifdef _KERNEL
+			vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops);
+#endif
+			mutex_enter(&arc_dnlc_evicts_lock);
+			/*
+			 * Clear our token only after vnlru_free()
+			 * pass is done, to avoid false queueing of
+			 * the requests.
+			 */
+			arc_dnlc_evicts_arg = 0;
+		}
+	}
+	arc_dnlc_evicts_thread_exit = B_FALSE;
+	cv_broadcast(&arc_dnlc_evicts_cv);
+	CALLB_CPR_EXIT(&cpr);
 	thread_exit();
 }
 
+void
+dnlc_reduce_cache(void *arg)
+{
+	u_int percent;
+
+	percent = (u_int)(uintptr_t)arg;
+	mutex_enter(&arc_dnlc_evicts_lock);
+	if (arc_dnlc_evicts_arg == 0) {
+		arc_dnlc_evicts_arg = percent;
+		cv_broadcast(&arc_dnlc_evicts_cv);
+	}
+	mutex_exit(&arc_dnlc_evicts_lock);
+}
+
+#endif
+
 /*
  * Adapt arc info given the number of bytes we are trying to add and
  * the state that we are coming from.  This function is only called
@@ -2138,6 +4304,8 @@ arc_adapt(int bytes, arc_state_t *state)
 {
 	int mult;
 	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+	int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
+	int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
 
 	if (state == arc_l2c_only)
 		return;
@@ -2152,15 +4320,15 @@ arc_adapt(int bytes, arc_state_t *state)
 	 *	  target size of the MRU list.
 	 */
 	if (state == arc_mru_ghost) {
-		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
-		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+		mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
+		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
 
 		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
 	} else if (state == arc_mfu_ghost) {
 		uint64_t delta;
 
-		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
-		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+		mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
+		mult = MIN(mult, 10);
 
 		delta = MIN(bytes * mult, arc_p);
 		arc_p = MAX(arc_p_min, arc_p - delta);
@@ -2168,7 +4336,7 @@ arc_adapt(int bytes, arc_state_t *state)
 	ASSERT((int64_t)arc_p >= 0);
 
 	if (arc_reclaim_needed()) {
-		cv_signal(&arc_reclaim_thr_cv);
+		cv_signal(&arc_reclaim_thread_cv);
 		return;
 	}
 
@@ -2183,6 +4351,7 @@ arc_adapt(int bytes, arc_state_t *state)
 	 * cache size, increment the target cache size
 	 */
 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
 		atomic_add_64(&arc_c, (int64_t)bytes);
 		if (arc_c > arc_c_max)
 			arc_c = arc_c_max;
@@ -2195,135 +4364,144 @@ arc_adapt(int bytes, arc_state_t *state)
 }
 
 /*
- * Check if the cache has reached its limits and eviction is required
- * prior to insert.
+ * Check if arc_size has grown past our upper threshold, determined by
+ * zfs_arc_overflow_shift.
  */
-static int
-arc_evict_needed(arc_buf_contents_t type)
+static boolean_t
+arc_is_overflowing(void)
 {
-	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
-		return (1);
-
-#ifdef _KERNEL
-	/*
-	 * If zio data pages are being allocated out of a separate heap segment,
-	 * then enforce that the size of available vmem for this area remains
-	 * above about 1/32nd free.
-	 */
-	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
-	    vmem_size(zio_arena, VMEM_FREE) <
-	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
-		return (1);
-#endif
+	/* Always allow at least one block of overflow */
+	uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
+	    arc_c >> zfs_arc_overflow_shift);
 
-	if (arc_reclaim_needed())
-		return (1);
-
-	return (arc_size > arc_c);
+	return (arc_size >= arc_c + overflow);
 }
 
-/*
- * The buffer, supplied as the first argument, needs a data block.
- * So, if we are at cache max, determine which cache should be victimized.
- * We have the following cases:
- *
- * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
- * In this situation if we're out of space, but the resident size of the MFU is
- * under the limit, victimize the MFU cache to satisfy this insertion request.
- *
- * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
- * Here, we've used up all of the available space for the MRU, so we need to
- * evict from our own cache instead.  Evict from the set of resident MRU
- * entries.
- *
- * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
- * c minus p represents the MFU space in the cache, since p is the size of the
- * cache that is dedicated to the MRU.  In this situation there's still space on
- * the MFU side, so the MRU side needs to be victimized.
- *
- * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
- * MFU's resident set is consuming more space than it has been allotted.  In
- * this situation, we must victimize our own cache, the MFU, for this insertion.
+/*
+ * Allocate a block and return it to the caller. If we are hitting the
+ * hard limit for the cache size, we must sleep, waiting for the eviction
+ * thread to catch up. If we're past the target size but below the hard
+ * limit, we'll only signal the reclaim thread and continue on.
  */
-static void
-arc_get_data_buf(arc_buf_t *buf)
+static void *
+arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
 {
-	arc_state_t		*state = buf->b_hdr->b_state;
-	uint64_t		size = buf->b_hdr->b_size;
-	arc_buf_contents_t	type = buf->b_hdr->b_type;
+	void *datap = NULL;
+	arc_state_t		*state = hdr->b_l1hdr.b_state;
+	arc_buf_contents_t	type = arc_buf_type(hdr);
 
 	arc_adapt(size, state);
 
 	/*
-	 * We have not yet reached cache maximum size,
-	 * just allocate a new buffer.
+	 * If arc_size is currently overflowing, and has grown past our
+	 * upper limit, we must be adding data faster than the evict
+	 * thread can evict. Thus, to ensure we don't compound the
+	 * problem by adding more data and forcing arc_size to grow even
+	 * further past its target size, we halt and wait for the
+	 * eviction thread to catch up.
+	 *
+	 * It's also possible that the reclaim thread is unable to evict
+	 * enough buffers to get arc_size below the overflow limit (e.g.
+	 * due to buffers being un-evictable, or hash lock collisions).
+	 * In this case, we want to proceed regardless of whether we're
+	 * overflowing; thus we don't use a while loop here.
 	 */
-	if (!arc_evict_needed(type)) {
-		if (type == ARC_BUFC_METADATA) {
-			buf->b_data = zio_buf_alloc(size);
-			arc_space_consume(size, ARC_SPACE_DATA);
-		} else {
-			ASSERT(type == ARC_BUFC_DATA);
-			buf->b_data = zio_data_buf_alloc(size);
-			ARCSTAT_INCR(arcstat_data_size, size);
-			atomic_add_64(&arc_size, size);
+	if (arc_is_overflowing()) {
+		mutex_enter(&arc_reclaim_lock);
+
+		/*
+		 * Now that we've acquired the lock, we may no longer be
+		 * over the overflow limit; let's check.
+		 *
+		 * We're ignoring the case of spurious wake ups. If that
+		 * were to happen, it'd let this thread consume an ARC
+		 * buffer before it should have (i.e. before we're under
+		 * the overflow limit and were signalled by the reclaim
+		 * thread). As long as that is a rare occurrence, it
+		 * shouldn't cause any harm.
+		 */
+		if (arc_is_overflowing()) {
+			cv_signal(&arc_reclaim_thread_cv);
+			cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
 		}
-		goto out;
+
+		mutex_exit(&arc_reclaim_lock);
 	}
 
-	/*
-	 * If we are prefetching from the mfu ghost list, this buffer
-	 * will end up on the mru list; so steal space from there.
-	 */
-	if (state == arc_mfu_ghost)
-		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
-	else if (state == arc_mru_ghost)
-		state = arc_mru;
-
-	if (state == arc_mru || state == arc_anon) {
-		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
-		state = (arc_mfu->arcs_lsize[type] >= size &&
-		    arc_p > mru_used) ? arc_mfu : arc_mru;
+	VERIFY3U(hdr->b_type, ==, type);
+	if (type == ARC_BUFC_METADATA) {
+		datap = zio_buf_alloc(size);
+		arc_space_consume(size, ARC_SPACE_META);
 	} else {
-		/* MFU cases */
-		uint64_t mfu_space = arc_c - arc_p;
-		state =  (arc_mru->arcs_lsize[type] >= size &&
-		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
-	}
-	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
-		if (type == ARC_BUFC_METADATA) {
-			buf->b_data = zio_buf_alloc(size);
-			arc_space_consume(size, ARC_SPACE_DATA);
-		} else {
-			ASSERT(type == ARC_BUFC_DATA);
-			buf->b_data = zio_data_buf_alloc(size);
-			ARCSTAT_INCR(arcstat_data_size, size);
-			atomic_add_64(&arc_size, size);
-		}
-		ARCSTAT_BUMP(arcstat_recycle_miss);
+		ASSERT(type == ARC_BUFC_DATA);
+		datap = zio_data_buf_alloc(size);
+		arc_space_consume(size, ARC_SPACE_DATA);
 	}
-	ASSERT(buf->b_data != NULL);
-out:
+
 	/*
 	 * Update the state size.  Note that ghost states have a
 	 * "ghost size" and so don't need to be updated.
 	 */
-	if (!GHOST_STATE(buf->b_hdr->b_state)) {
-		arc_buf_hdr_t *hdr = buf->b_hdr;
+	if (!GHOST_STATE(state)) {
 
-		atomic_add_64(&hdr->b_state->arcs_size, size);
-		if (list_link_active(&hdr->b_arc_node)) {
-			ASSERT(refcount_is_zero(&hdr->b_refcnt));
-			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
+		(void) refcount_add_many(&state->arcs_size, size, tag);
+
+		/*
+		 * If this is reached via arc_read, the link is
+		 * protected by the hash lock. If reached via
+		 * arc_buf_alloc, the header should not be accessed by
+		 * any other thread. And, if reached via arc_read_done,
+		 * the hash lock will protect it if it's found in the
+		 * hash table; otherwise no other thread should be
+		 * trying to [add|remove]_reference it.
+		 */
+		if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+			(void) refcount_add_many(&state->arcs_esize[type],
+			    size, tag);
 		}
+
 		/*
 		 * If we are growing the cache, and we are adding anonymous
 		 * data, and we have outgrown arc_p, update arc_p
 		 */
-		if (arc_size < arc_c && hdr->b_state == arc_anon &&
-		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
+		if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
+		    (refcount_count(&arc_anon->arcs_size) +
+		    refcount_count(&arc_mru->arcs_size) > arc_p))
 			arc_p = MIN(arc_c, arc_p + size);
 	}
+	ARCSTAT_BUMP(arcstat_allocated);
+	return (datap);
+}
+
+/*
+ * Free the arc data buffer and undo the space/state accounting performed
+ * by arc_get_data_buf().
+ */
+static void
+arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag)
+{
+	arc_state_t *state = hdr->b_l1hdr.b_state;
+	arc_buf_contents_t type = arc_buf_type(hdr);
+
+	/* protected by hash lock, if in the hash table */
+	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+		ASSERT(state != arc_anon && state != arc_l2c_only);
+
+		(void) refcount_remove_many(&state->arcs_esize[type],
+		    size, tag);
+	}
+	(void) refcount_remove_many(&state->arcs_size, size, tag);
+
+	VERIFY3U(hdr->b_type, ==, type);
+	if (type == ARC_BUFC_METADATA) {
+		zio_buf_free(data, size);
+		arc_space_return(size, ARC_SPACE_META);
+	} else {
+		ASSERT(type == ARC_BUFC_DATA);
+		zio_data_buf_free(data, size);
+		arc_space_return(size, ARC_SPACE_DATA);
+	}
 }
 
 /*
@@ -2331,25 +4509,26 @@ out:
  * NOTE: the hash lock is dropped in this function.
  */
 static void
-arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
 	clock_t now;
 
 	ASSERT(MUTEX_HELD(hash_lock));
+	ASSERT(HDR_HAS_L1HDR(hdr));
 
-	if (buf->b_state == arc_anon) {
+	if (hdr->b_l1hdr.b_state == arc_anon) {
 		/*
 		 * This buffer is not in the cache, and does not
 		 * appear in our "ghost" list.  Add the new buffer
 		 * to the MRU state.
 		 */
 
-		ASSERT(buf->b_arc_access == 0);
-		buf->b_arc_access = ddi_get_lbolt();
-		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
-		arc_change_state(arc_mru, buf, hash_lock);
+		ASSERT0(hdr->b_l1hdr.b_arc_access);
+		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+		arc_change_state(arc_mru, hdr, hash_lock);
 
-	} else if (buf->b_state == arc_mru) {
+	} else if (hdr->b_l1hdr.b_state == arc_mru) {
 		now = ddi_get_lbolt();
 
 		/*
@@ -2360,14 +4539,16 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t 
 		 * - move the buffer to the head of the list if this is
 		 *   another prefetch (to make it less likely to be evicted).
 		 */
-		if ((buf->b_flags & ARC_PREFETCH) != 0) {
-			if (refcount_count(&buf->b_refcnt) == 0) {
-				ASSERT(list_link_active(&buf->b_arc_node));
+		if (HDR_PREFETCH(hdr)) {
+			if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+				/* link protected by hash lock */
+				ASSERT(multilist_link_active(
+				    &hdr->b_l1hdr.b_arc_node));
 			} else {
-				buf->b_flags &= ~ARC_PREFETCH;
+				arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
 				ARCSTAT_BUMP(arcstat_mru_hits);
 			}
-			buf->b_arc_access = now;
+			hdr->b_l1hdr.b_arc_access = now;
 			return;
 		}
 
@@ -2376,18 +4557,18 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t 
 		 * but it is still in the cache. Move it to the MFU
 		 * state.
 		 */
-		if (now > buf->b_arc_access + ARC_MINTIME) {
+		if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
 			/*
 			 * More than 125ms have passed since we
 			 * instantiated this buffer.  Move it to the
 			 * most frequently used state.
 			 */
-			buf->b_arc_access = now;
-			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
-			arc_change_state(arc_mfu, buf, hash_lock);
+			hdr->b_l1hdr.b_arc_access = now;
+			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+			arc_change_state(arc_mfu, hdr, hash_lock);
 		}
 		ARCSTAT_BUMP(arcstat_mru_hits);
-	} else if (buf->b_state == arc_mru_ghost) {
+	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
 		arc_state_t	*new_state;
 		/*
 		 * This buffer has been "accessed" recently, but
@@ -2395,21 +4576,21 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t 
 		 * MFU state.
 		 */
 
-		if (buf->b_flags & ARC_PREFETCH) {
+		if (HDR_PREFETCH(hdr)) {
 			new_state = arc_mru;
-			if (refcount_count(&buf->b_refcnt) > 0)
-				buf->b_flags &= ~ARC_PREFETCH;
-			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
+				arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		} else {
 			new_state = arc_mfu;
-			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		}
 
-		buf->b_arc_access = ddi_get_lbolt();
-		arc_change_state(new_state, buf, hash_lock);
+		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+		arc_change_state(new_state, hdr, hash_lock);
 
 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
-	} else if (buf->b_state == arc_mfu) {
+	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
 		/*
 		 * This buffer has been accessed more than once and is
 		 * still in the cache.  Keep it in the MFU state.
@@ -2419,13 +4600,14 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t 
 		 * If it was a prefetch, we will explicitly move it to
 		 * the head of the list now.
 		 */
-		if ((buf->b_flags & ARC_PREFETCH) != 0) {
-			ASSERT(refcount_count(&buf->b_refcnt) == 0);
-			ASSERT(list_link_active(&buf->b_arc_node));
+		if (HDR_PREFETCH(hdr)) {
+			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+			/* link protected by hash_lock */
+			ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		}
 		ARCSTAT_BUMP(arcstat_mfu_hits);
-		buf->b_arc_access = ddi_get_lbolt();
-	} else if (buf->b_state == arc_mfu_ghost) {
+		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
 		arc_state_t	*new_state = arc_mfu;
 		/*
 		 * This buffer has been accessed more than once but has
@@ -2433,28 +4615,28 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t 
 		 * MFU state.
 		 */
 
-		if (buf->b_flags & ARC_PREFETCH) {
+		if (HDR_PREFETCH(hdr)) {
 			/*
 			 * This is a prefetch access...
 			 * move this block back to the MRU state.
 			 */
-			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
+			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
 			new_state = arc_mru;
 		}
 
-		buf->b_arc_access = ddi_get_lbolt();
-		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
-		arc_change_state(new_state, buf, hash_lock);
+		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+		arc_change_state(new_state, hdr, hash_lock);
 
 		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
-	} else if (buf->b_state == arc_l2c_only) {
+	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
 		/*
 		 * This buffer is on the 2nd Level ARC.
 		 */
 
-		buf->b_arc_access = ddi_get_lbolt();
-		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
-		arc_change_state(arc_mfu, buf, hash_lock);
+		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+		arc_change_state(arc_mfu, hdr, hash_lock);
 	} else {
 		ASSERT(!"invalid arc state");
 	}
@@ -2465,8 +4647,9 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t 
 void
 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
-	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
-	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+	if (zio == NULL || zio->io_error == 0)
+		bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr));
+	arc_buf_destroy(buf, arg);
 }
 
 /* a generic arc_done_func_t */
@@ -2475,25 +4658,38 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *b
 {
 	arc_buf_t **bufp = arg;
 	if (zio && zio->io_error) {
-		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
+		arc_buf_destroy(buf, arg);
 		*bufp = NULL;
 	} else {
 		*bufp = buf;
+		ASSERT(buf->b_data);
+	}
+}
+
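+/*
+ * Sanity-check a header against the block pointer it was filled from:
+ * holes and embedded BPs carry no physical size or compression, and
+ * otherwise the sizes (and the compression function, if enabled on the
+ * hdr) must match the BP.
+ */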
+static void
+arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
+{
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
+		ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+	} else {
+		if (HDR_COMPRESSION_ENABLED(hdr)) {
+			ASSERT3U(HDR_GET_COMPRESS(hdr), ==,
+			    BP_GET_COMPRESS(bp));
+		}
+		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
+		ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
 	}
 }
 
 static void
 arc_read_done(zio_t *zio)
 {
-	arc_buf_hdr_t	*hdr, *found;
-	arc_buf_t	*buf;
-	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
-	kmutex_t	*hash_lock;
+	arc_buf_hdr_t	*hdr = zio->io_private;
+	arc_buf_t	*abuf = NULL;	/* buffer we're assigning to callback */
+	kmutex_t	*hash_lock = NULL;
 	arc_callback_t	*callback_list, *acb;
-	int		freeable = FALSE;
-
-	buf = zio->io_private;
-	hdr = buf->b_hdr;
+	int		freeable = B_FALSE;
 
 	/*
 	 * The hdr was inserted into hash-table and removed from lists
@@ -2503,30 +4699,45 @@ arc_read_done(zio_t *zio)
 	 * reason for it not to be found is if we were freed during the
 	 * read.
 	 */
-	found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
-	    &hash_lock);
+	if (HDR_IN_HASH_TABLE(hdr)) {
+		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
+		ASSERT3U(hdr->b_dva.dva_word[0], ==,
+		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
+		ASSERT3U(hdr->b_dva.dva_word[1], ==,
+		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
+
+		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
+		    &hash_lock);
+
+		ASSERT((found == hdr &&
+		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+		    (found == hdr && HDR_L2_READING(hdr)));
+		ASSERT3P(hash_lock, !=, NULL);
+	}
 
-	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
-	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
-	    (found == hdr && HDR_L2_READING(hdr)));
-
-	hdr->b_flags &= ~ARC_L2_EVICTED;
-	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
-		hdr->b_flags &= ~ARC_L2CACHE;
-
-	/* byteswap if necessary */
-	callback_list = hdr->b_acb;
-	ASSERT(callback_list != NULL);
-	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
-		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
-		    byteswap_uint64_array :
-		    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
-		func(buf->b_data, hdr->b_size);
+	if (zio->io_error == 0) {
+		/* byteswap if necessary */
+		if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+			if (BP_GET_LEVEL(zio->io_bp) > 0) {
+				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
+			} else {
+				hdr->b_l1hdr.b_byteswap =
+				    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
+			}
+		} else {
+			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+		}
 	}
 
-	arc_cksum_compute(buf, B_FALSE);
+	arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
+	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
+		arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
 
-	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+	callback_list = hdr->b_l1hdr.b_acb;
+	ASSERT3P(callback_list, !=, NULL);
+
+	if (hash_lock && zio->io_error == 0 &&
+	    hdr->b_l1hdr.b_state == arc_anon) {
 		/*
 		 * Only call arc_access on anonymous buffers.  This is because
 		 * if we've issued an I/O for an evicted buffer, we've already
@@ -2537,33 +4748,55 @@ arc_read_done(zio_t *zio)
 	}
 
 	/* create copies of the data buffer for the callers */
-	abuf = buf;
 	for (acb = callback_list; acb; acb = acb->acb_next) {
-		if (acb->acb_done) {
-			if (abuf == NULL)
-				abuf = arc_buf_clone(buf);
-			acb->acb_buf = abuf;
-			abuf = NULL;
-		}
-	}
-	hdr->b_acb = NULL;
-	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
-	ASSERT(!HDR_BUF_AVAILABLE(hdr));
-	if (abuf == buf) {
-		ASSERT(buf->b_efunc == NULL);
-		ASSERT(hdr->b_datacnt == 1);
-		hdr->b_flags |= ARC_BUF_AVAILABLE;
-	}
-
-	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
-
-	if (zio->io_error != 0) {
-		hdr->b_flags |= ARC_IO_ERROR;
-		if (hdr->b_state != arc_anon)
+		if (acb->acb_done != NULL) {
+			/*
+			 * If we're here, then this must be a demand read
+			 * since prefetch requests don't have callbacks.
+			 * If a read request has a callback (i.e. acb_done is
+			 * not NULL), then we decompress the data for the
+			 * first request and clone the rest. This avoids
+			 * having to waste cpu resources decompressing data
+			 * that nobody is explicitly waiting to read.
+			 */
+			if (abuf == NULL) {
+				acb->acb_buf = arc_buf_alloc_impl(hdr,
+				    acb->acb_private);
+				if (zio->io_error == 0) {
+					zio->io_error =
+					    arc_decompress(acb->acb_buf);
+				}
+				abuf = acb->acb_buf;
+			} else {
+				add_reference(hdr, acb->acb_private);
+				acb->acb_buf = arc_buf_clone(abuf);
+			}
+		}
+	}
+	hdr->b_l1hdr.b_acb = NULL;
+	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+	if (abuf == NULL) {
+		/*
+		 * This buffer didn't have a callback so it must
+		 * be a prefetch.
+		 */
+		ASSERT(HDR_PREFETCH(hdr));
+		ASSERT0(hdr->b_l1hdr.b_bufcnt);
+		ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+	}
+
+	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
+	    callback_list != NULL);
+
+	if (zio->io_error == 0) {
+		arc_hdr_verify(hdr, zio->io_bp);
+	} else {
+		arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
+		if (hdr->b_l1hdr.b_state != arc_anon)
 			arc_change_state(arc_anon, hdr, hash_lock);
 		if (HDR_IN_HASH_TABLE(hdr))
 			buf_hash_remove(hdr);
-		freeable = refcount_is_zero(&hdr->b_refcnt);
+		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
 	}
 
 	/*
@@ -2571,9 +4804,9 @@ arc_read_done(zio_t *zio)
 	 * that the hdr (and hence the cv) might be freed before we get to
 	 * the cv_broadcast().
 	 */
-	cv_broadcast(&hdr->b_cv);
+	cv_broadcast(&hdr->b_l1hdr.b_cv);
 
-	if (hash_lock) {
+	if (hash_lock != NULL) {
 		mutex_exit(hash_lock);
 	} else {
 		/*
@@ -2582,8 +4815,8 @@ arc_read_done(zio_t *zio)
 		 * moved to the anonymous state (so that it won't show up
 		 * in the cache).
 		 */
-		ASSERT3P(hdr->b_state, ==, arc_anon);
-		freeable = refcount_is_zero(&hdr->b_refcnt);
+		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
 	}
 
 	/* execute each callback and free its structure */
@@ -2605,7 +4838,7 @@ arc_read_done(zio_t *zio)
 }
 
 /*
- * "Read" the block block at the specified DVA (in bp) via the
+ * "Read" the block at the specified DVA (in bp) via the
  * cache.  If the block is found in the cache, invoke the provided
  * callback immediately and return.  Note that the `zio' parameter
  * in the callback will be NULL in this case, since no IO was
@@ -2621,58 +4854,75 @@ arc_read_done(zio_t *zio)
  *
  * arc_read_done() will invoke all the requested "done" functions
  * for readers of this block.
- *
- * Normal callers should use arc_read and pass the arc buffer and offset
- * for the bp.  But if you know you don't need locking, you can use
- * arc_read_bp.
  */
 int
-arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
-    arc_done_func_t *done, void *private, int priority, int zio_flags,
-    uint32_t *arc_flags, const zbookmark_t *zb)
-{
-	int err;
-
-	ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
-	ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
-	rw_enter(&pbuf->b_lock, RW_READER);
-
-	err = arc_read_nolock(pio, spa, bp, done, private, priority,
-	    zio_flags, arc_flags, zb);
-	rw_exit(&pbuf->b_lock);
-
-	return (err);
-}
-
-int
-arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
-    arc_done_func_t *done, void *private, int priority, int zio_flags,
-    uint32_t *arc_flags, const zbookmark_t *zb)
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
+    void *private, zio_priority_t priority, int zio_flags,
+    arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
 {
-	arc_buf_hdr_t *hdr;
-	arc_buf_t *buf;
-	kmutex_t *hash_lock;
+	arc_buf_hdr_t *hdr = NULL;
+	kmutex_t *hash_lock = NULL;
 	zio_t *rzio;
-	uint64_t guid = spa_guid(spa);
+	uint64_t guid = spa_load_guid(spa);
+
+	ASSERT(!BP_IS_EMBEDDED(bp) ||
+	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
 
 top:
-	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
-	    &hash_lock);
-	if (hdr && hdr->b_datacnt > 0) {
+	if (!BP_IS_EMBEDDED(bp)) {
+		/*
+		 * Embedded BPs have no DVA and require no I/O to "read";
+		 * an anonymous arc buf is created below to back them, so
+		 * skip the hash-table lookup.
+		 */
+		hdr = buf_hash_find(guid, bp, &hash_lock);
+	}
 
-		*arc_flags |= ARC_CACHED;
+	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) {
+		arc_buf_t *buf = NULL;
+		*arc_flags |= ARC_FLAG_CACHED;
 
 		if (HDR_IO_IN_PROGRESS(hdr)) {
 
-			if (*arc_flags & ARC_WAIT) {
-				cv_wait(&hdr->b_cv, hash_lock);
+			if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
+			    priority == ZIO_PRIORITY_SYNC_READ) {
+				/*
+				 * This sync read must wait for an
+				 * in-progress async read (e.g. a predictive
+				 * prefetch).  Async reads are queued
+				 * separately at the vdev_queue layer, so
+				 * this is a form of priority inversion.
+				 * Ideally, we would "inherit" the demand
+				 * i/o's priority by moving the i/o from
+				 * the async queue to the synchronous queue,
+				 * but there is currently no mechanism to do
+				 * so.  Track this so that we can evaluate
+				 * the magnitude of this potential performance
+				 * problem.
+				 *
+				 * Note that if the prefetch i/o is already
+				 * active (has been issued to the device),
+				 * the prefetch improved performance, because
+				 * we issued it sooner than we would have
+				 * without the prefetch.
+				 */
+				DTRACE_PROBE1(arc__sync__wait__for__async,
+				    arc_buf_hdr_t *, hdr);
+				ARCSTAT_BUMP(arcstat_sync_wait_for_async);
+			}
+			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+				arc_hdr_clear_flags(hdr,
+				    ARC_FLAG_PREDICTIVE_PREFETCH);
+			}
+
+			if (*arc_flags & ARC_FLAG_WAIT) {
+				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
 				mutex_exit(hash_lock);
 				goto top;
 			}
-			ASSERT(*arc_flags & ARC_NOWAIT);
+			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
 
 			if (done) {
-				arc_callback_t	*acb = NULL;
+				arc_callback_t *acb = NULL;
 
 				acb = kmem_zalloc(sizeof (arc_callback_t),
 				    KM_SLEEP);
@@ -2682,10 +4932,9 @@ top:
 					acb->acb_zio_dummy = zio_null(pio,
 					    spa, NULL, NULL, NULL, zio_flags);
 
-				ASSERT(acb->acb_done != NULL);
-				acb->acb_next = hdr->b_acb;
-				hdr->b_acb = acb;
-				add_reference(hdr, hash_lock, private);
+				ASSERT3P(acb->acb_done, !=, NULL);
+				acb->acb_next = hdr->b_l1hdr.b_acb;
+				hdr->b_l1hdr.b_acb = acb;
 				mutex_exit(hash_lock);
 				return (0);
 			}
@@ -2693,126 +4942,151 @@ top:
 			return (0);
 		}
 
-		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+		    hdr->b_l1hdr.b_state == arc_mfu);
 
 		if (done) {
-			add_reference(hdr, hash_lock, private);
+			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+				/*
+				 * This is a demand read which does not have to
+				 * wait for i/o because we did a predictive
+				 * prefetch i/o for it, which has completed.
+				 */
+				DTRACE_PROBE1(
+				    arc__demand__hit__predictive__prefetch,
+				    arc_buf_hdr_t *, hdr);
+				ARCSTAT_BUMP(
+				    arcstat_demand_hit_predictive_prefetch);
+				arc_hdr_clear_flags(hdr,
+				    ARC_FLAG_PREDICTIVE_PREFETCH);
+			}
+			ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
+
 			/*
 			 * If this block is already in use, create a new
 			 * copy of the data so that we will be guaranteed
 			 * that arc_release() will always succeed.
 			 */
-			buf = hdr->b_buf;
-			ASSERT(buf);
-			ASSERT(buf->b_data);
-			if (HDR_BUF_AVAILABLE(hdr)) {
-				ASSERT(buf->b_efunc == NULL);
-				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+			buf = hdr->b_l1hdr.b_buf;
+			if (buf == NULL) {
+				ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+				ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+				buf = arc_buf_alloc_impl(hdr, private);
+				VERIFY0(arc_decompress(buf));
 			} else {
+				add_reference(hdr, private);
 				buf = arc_buf_clone(buf);
 			}
+			ASSERT3P(buf->b_data, !=, NULL);
 
-		} else if (*arc_flags & ARC_PREFETCH &&
-		    refcount_count(&hdr->b_refcnt) == 0) {
-			hdr->b_flags |= ARC_PREFETCH;
+		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
+		    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
 		}
 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 		arc_access(hdr, hash_lock);
-		if (*arc_flags & ARC_L2CACHE)
-			hdr->b_flags |= ARC_L2CACHE;
+		if (*arc_flags & ARC_FLAG_L2CACHE)
+			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 		mutex_exit(hash_lock);
 		ARCSTAT_BUMP(arcstat_hits);
-		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
-		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
 		    data, metadata, hits);
 
 		if (done)
 			done(NULL, buf, private);
 	} else {
-		uint64_t size = BP_GET_LSIZE(bp);
-		arc_callback_t	*acb;
+		uint64_t lsize = BP_GET_LSIZE(bp);
+		uint64_t psize = BP_GET_PSIZE(bp);
+		arc_callback_t *acb;
 		vdev_t *vd = NULL;
-		uint64_t addr;
+		uint64_t addr = 0;
 		boolean_t devw = B_FALSE;
+		uint64_t size;
 
 		if (hdr == NULL) {
 			/* this block is not in the cache */
-			arc_buf_hdr_t	*exists;
+			arc_buf_hdr_t *exists = NULL;
 			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
-			buf = arc_buf_alloc(spa, size, private, type);
-			hdr = buf->b_hdr;
-			hdr->b_dva = *BP_IDENTITY(bp);
-			hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
-			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
-			exists = buf_hash_insert(hdr, &hash_lock);
-			if (exists) {
+			hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+			    BP_GET_COMPRESS(bp), type);
+
+			if (!BP_IS_EMBEDDED(bp)) {
+				hdr->b_dva = *BP_IDENTITY(bp);
+				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+				exists = buf_hash_insert(hdr, &hash_lock);
+			}
+			if (exists != NULL) {
 				/* somebody beat us to the hash insert */
 				mutex_exit(hash_lock);
-				bzero(&hdr->b_dva, sizeof (dva_t));
-				hdr->b_birth = 0;
-				hdr->b_cksum0 = 0;
-				(void) arc_buf_remove_ref(buf, private);
+				buf_discard_identity(hdr);
+				arc_hdr_destroy(hdr);
 				goto top; /* restart the IO request */
 			}
-			/* if this is a prefetch, we don't have a reference */
-			if (*arc_flags & ARC_PREFETCH) {
-				(void) remove_reference(hdr, hash_lock,
-				    private);
-				hdr->b_flags |= ARC_PREFETCH;
-			}
-			if (*arc_flags & ARC_L2CACHE)
-				hdr->b_flags |= ARC_L2CACHE;
-			if (BP_GET_LEVEL(bp) > 0)
-				hdr->b_flags |= ARC_INDIRECT;
 		} else {
-			/* this block is in the ghost cache */
-			ASSERT(GHOST_STATE(hdr->b_state));
+			/*
+			 * This block is in the ghost cache. If it was L2-only
+			 * (and thus didn't have an L1 hdr), we realloc the
+			 * header to add an L1 hdr.
+			 */
+			if (!HDR_HAS_L1HDR(hdr)) {
+				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
+				    hdr_full_cache);
+			}
+			ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+			ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
-			ASSERT(hdr->b_buf == NULL);
+			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+			ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 
-			/* if this is a prefetch, we don't have a reference */
-			if (*arc_flags & ARC_PREFETCH)
-				hdr->b_flags |= ARC_PREFETCH;
-			else
-				add_reference(hdr, hash_lock, private);
-			if (*arc_flags & ARC_L2CACHE)
-				hdr->b_flags |= ARC_L2CACHE;
-			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
-			buf->b_hdr = hdr;
-			buf->b_data = NULL;
-			buf->b_efunc = NULL;
-			buf->b_private = NULL;
-			buf->b_next = NULL;
-			hdr->b_buf = buf;
-			arc_get_data_buf(buf);
-			ASSERT(hdr->b_datacnt == 0);
-			hdr->b_datacnt = 1;
+			/*
+			 * This is a delicate dance that we play here.
+			 * This hdr is in the ghost list so we access it
+			 * to move it out of the ghost list before we
+			 * initiate the read. If it's a prefetch then
+			 * it won't have a callback so we'll remove the
+			 * reference that arc_buf_alloc_impl() created. We
+			 * do this after we've called arc_access() to
+			 * avoid hitting an assert in remove_reference().
+			 */
+			arc_access(hdr, hash_lock);
+			arc_hdr_alloc_pdata(hdr);
 		}
-
-		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
-		acb->acb_done = done;
-		acb->acb_private = private;
-
-		ASSERT(hdr->b_acb == NULL);
-		hdr->b_acb = acb;
-		hdr->b_flags |= ARC_IO_IN_PROGRESS;
+		ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+		size = arc_hdr_size(hdr);
 
 		/*
-		 * If the buffer has been evicted, migrate it to a present state
-		 * before issuing the I/O.  Once we drop the hash-table lock,
-		 * the header will be marked as I/O in progress and have an
-		 * attached buffer.  At this point, anybody who finds this
-		 * buffer ought to notice that it's legit but has a pending I/O.
+		 * If compression is enabled on the hdr, then we will do
+		 * RAW I/O and will store the compressed data in the hdr's
+		 * data block. Otherwise, the hdr's data block will contain
+		 * the uncompressed data.
 		 */
+		if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
+			zio_flags |= ZIO_FLAG_RAW;
+		}
 
-		if (GHOST_STATE(hdr->b_state))
-			arc_access(hdr, hash_lock);
+		if (*arc_flags & ARC_FLAG_PREFETCH)
+			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+		if (*arc_flags & ARC_FLAG_L2CACHE)
+			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+		if (BP_GET_LEVEL(bp) > 0)
+			arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
+		if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
+			arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
+		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
+
+		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+		acb->acb_done = done;
+		acb->acb_private = private;
 
-		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
-		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
-			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
-			addr = hdr->b_l2hdr->b_daddr;
+		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+		hdr->b_l1hdr.b_acb = acb;
+		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+
+		if (HDR_HAS_L2HDR(hdr) &&
+		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
+			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
+			addr = hdr->b_l2hdr.b_daddr;
 			/*
 			 * Lock out device removal.
 			 */
@@ -2821,15 +5095,39 @@ top:
 				vd = NULL;
 		}
 
-		mutex_exit(hash_lock);
+		if (priority == ZIO_PRIORITY_ASYNC_READ)
+			arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
+		else
+			arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
+
+		if (hash_lock != NULL)
+			mutex_exit(hash_lock);
+
+		/*
+		 * At this point, we have a level 1 cache miss.  Try again in
+		 * L2ARC if possible.
+		 */
+		ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
 
-		ASSERT3U(hdr->b_size, ==, size);
 		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
-		    uint64_t, size, zbookmark_t *, zb);
+		    uint64_t, lsize, zbookmark_phys_t *, zb);
 		ARCSTAT_BUMP(arcstat_misses);
-		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
-		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
 		    data, metadata, misses);
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+#ifdef RACCT
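+		/*
+		 * Charge this read to the current process' resource
+		 * accounting.
+		 */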
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_force(curproc, RACCT_READBPS, size);
+			racct_add_force(curproc, RACCT_READIOPS, 1);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
+		curthread->td_ru.ru_inblock++;
+#endif
+#endif
 
 		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
 			/*
@@ -2841,42 +5139,59 @@ top:
 			 *    also have invalidated the vdev.
 			 * 5. This isn't prefetch and l2arc_noprefetch is set.
 			 */
-			if (hdr->b_l2hdr != NULL &&
+			if (HDR_HAS_L2HDR(hdr) &&
 			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
 			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
 				l2arc_read_callback_t *cb;
+				void *b_data;
 
 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_hits);
 
 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
 				    KM_SLEEP);
-				cb->l2rcb_buf = buf;
-				cb->l2rcb_spa = spa;
+				cb->l2rcb_hdr = hdr;
 				cb->l2rcb_bp = *bp;
 				cb->l2rcb_zb = *zb;
 				cb->l2rcb_flags = zio_flags;
+				uint64_t asize = vdev_psize_to_asize(vd, size);
+				if (asize != size) {
+					b_data = zio_data_buf_alloc(asize);
+					cb->l2rcb_data = b_data;
+				} else {
+					b_data = hdr->b_l1hdr.b_pdata;
+				}
+
+				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
+				    addr + asize < vd->vdev_psize -
+				    VDEV_LABEL_END_SIZE);
 
 				/*
 				 * l2arc read.  The SCL_L2ARC lock will be
 				 * released by l2arc_read_done().
+				 * The underlying buffer must not have been
+				 * squashed to zero size by compression
+				 * (asserted below via ZIO_COMPRESS_EMPTY).
 				 */
-				rzio = zio_read_phys(pio, vd, addr, size,
-				    buf->b_data, ZIO_CHECKSUM_OFF,
-				    l2arc_read_done, cb, priority, zio_flags |
-				    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+				ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
+				    ZIO_COMPRESS_EMPTY);
+				rzio = zio_read_phys(pio, vd, addr,
+				    asize, b_data,
+				    ZIO_CHECKSUM_OFF,
+				    l2arc_read_done, cb, priority,
+				    zio_flags | ZIO_FLAG_DONT_CACHE |
+				    ZIO_FLAG_CANFAIL |
 				    ZIO_FLAG_DONT_PROPAGATE |
 				    ZIO_FLAG_DONT_RETRY, B_FALSE);
 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
 				    zio_t *, rzio);
 				ARCSTAT_INCR(arcstat_l2_read_bytes, size);
 
-				if (*arc_flags & ARC_NOWAIT) {
+				if (*arc_flags & ARC_FLAG_NOWAIT) {
 					zio_nowait(rzio);
 					return (0);
 				}
 
-				ASSERT(*arc_flags & ARC_WAIT);
+				ASSERT(*arc_flags & ARC_FLAG_WAIT);
 				if (zio_wait(rzio) == 0)
 					return (0);
 
@@ -2899,230 +5214,272 @@ top:
 			}
 		}
 
-		rzio = zio_read(pio, spa, bp, buf->b_data, size,
-		    arc_read_done, buf, priority, zio_flags, zb);
+		rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size,
+		    arc_read_done, hdr, priority, zio_flags, zb);
 
-		if (*arc_flags & ARC_WAIT)
+		if (*arc_flags & ARC_FLAG_WAIT)
 			return (zio_wait(rzio));
 
-		ASSERT(*arc_flags & ARC_NOWAIT);
+		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
 		zio_nowait(rzio);
 	}
 	return (0);
 }
 
-void
-arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
-{
-	ASSERT(buf->b_hdr != NULL);
-	ASSERT(buf->b_hdr->b_state != arc_anon);
-	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
-	ASSERT(buf->b_efunc == NULL);
-	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
-
-	buf->b_efunc = func;
-	buf->b_private = private;
-}
-
 /*
- * This is used by the DMU to let the ARC know that a buffer is
- * being evicted, so the ARC should clean up.  If this arc buf
- * is not yet in the evicted state, it will be put there.
+ * Notify the arc that a block was freed, and thus will never be used again.
  */
-int
-arc_buf_evict(arc_buf_t *buf)
+void
+arc_freed(spa_t *spa, const blkptr_t *bp)
 {
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
-	arc_buf_t **bufp;
+	uint64_t guid = spa_load_guid(spa);
 
-	rw_enter(&buf->b_lock, RW_WRITER);
-	hdr = buf->b_hdr;
-	if (hdr == NULL) {
-		/*
-		 * We are in arc_do_user_evicts().
-		 */
-		ASSERT(buf->b_data == NULL);
-		rw_exit(&buf->b_lock);
-		return (0);
-	} else if (buf->b_data == NULL) {
-		arc_buf_t copy = *buf; /* structure assignment */
-		/*
-		 * We are on the eviction list; process this buffer now
-		 * but let arc_do_user_evicts() do the reaping.
-		 */
-		buf->b_efunc = NULL;
-		rw_exit(&buf->b_lock);
-		VERIFY(copy.b_efunc(&copy) == 0);
-		return (1);
-	}
-	hash_lock = HDR_LOCK(hdr);
-	mutex_enter(hash_lock);
+	ASSERT(!BP_IS_EMBEDDED(bp));
 
-	ASSERT(buf->b_hdr == hdr);
-	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
-	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+	hdr = buf_hash_find(guid, bp, &hash_lock);
+	if (hdr == NULL)
+		return;
 
 	/*
-	 * Pull this buffer off of the hdr
+	 * We might be trying to free a block that is still doing I/O
+	 * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
+	 * dmu_sync-ed block). If this block is being prefetched, then it
+	 * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
+	 * until the I/O completes. A block may also have a reference if it is
+	 * part of a dedup-ed, dmu_sync-ed write. The dmu_sync() function would
+	 * have written the new block to its final resting place on disk but
+	 * without the dedup flag set. This would have left the hdr in the MRU
+	 * state and discoverable. When the txg finally syncs it detects that
+	 * the block was overridden in open context and issues an override I/O.
+	 * Since this is a dedup block, the override I/O will determine if the
+	 * block is already in the DDT. If so, then it will replace the io_bp
+	 * with the bp from the DDT and allow the I/O to finish. When the I/O
+	 * reaches the done callback, dbuf_write_override_done, it will
+	 * check to see if the io_bp and io_bp_override are identical.
+	 * If they are not, then it indicates that the bp was replaced with
+	 * the bp in the DDT and the override bp is freed. This allows
+	 * us to arrive here with a reference on a block that is being
+	 * freed. So if we have an I/O in progress, or a reference to
+	 * this hdr, then we don't destroy the hdr.
 	 */
-	bufp = &hdr->b_buf;
-	while (*bufp != buf)
-		bufp = &(*bufp)->b_next;
-	*bufp = buf->b_next;
-
-	ASSERT(buf->b_data != NULL);
-	arc_buf_destroy(buf, FALSE, FALSE);
-
-	if (hdr->b_datacnt == 0) {
-		arc_state_t *old_state = hdr->b_state;
-		arc_state_t *evicted_state;
-
-		ASSERT(refcount_is_zero(&hdr->b_refcnt));
-
-		evicted_state =
-		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
-
-		mutex_enter(&old_state->arcs_mtx);
-		mutex_enter(&evicted_state->arcs_mtx);
-
-		arc_change_state(evicted_state, hdr, hash_lock);
-		ASSERT(HDR_IN_HASH_TABLE(hdr));
-		hdr->b_flags |= ARC_IN_HASH_TABLE;
-		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
-
-		mutex_exit(&evicted_state->arcs_mtx);
-		mutex_exit(&old_state->arcs_mtx);
+	if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
+	    refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
+		arc_change_state(arc_anon, hdr, hash_lock);
+		arc_hdr_destroy(hdr);
+		mutex_exit(hash_lock);
+	} else {
+		mutex_exit(hash_lock);
 	}
-	mutex_exit(hash_lock);
-	rw_exit(&buf->b_lock);
 
-	VERIFY(buf->b_efunc(buf) == 0);
-	buf->b_efunc = NULL;
-	buf->b_private = NULL;
-	buf->b_hdr = NULL;
-	kmem_cache_free(buf_cache, buf);
-	return (1);
 }
 
 /*
- * Release this buffer from the cache.  This must be done
- * after a read and prior to modifying the buffer contents.
+ * Release this buffer from the cache, making it an anonymous buffer.  This
+ * must be done after a read and prior to modifying the buffer contents.
  * If the buffer has more than one reference, we must make
  * a new hdr for the buffer.
  */
 void
 arc_release(arc_buf_t *buf, void *tag)
 {
-	arc_buf_hdr_t *hdr;
-	kmutex_t *hash_lock;
-	l2arc_buf_hdr_t *l2hdr;
-	uint64_t buf_size;
-	boolean_t released = B_FALSE;
+	arc_buf_hdr_t *hdr = buf->b_hdr;
 
-	rw_enter(&buf->b_lock, RW_WRITER);
-	hdr = buf->b_hdr;
+	/*
+	 * It would be nice to assert that if it's DMU metadata (level >
+	 * 0 || it's the dnode file), then it must be syncing context.
+	 * But we don't know that information at this level.
+	 */
 
-	/* this buffer is not on any list */
-	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+	mutex_enter(&buf->b_evict_lock);
+
+	ASSERT(HDR_HAS_L1HDR(hdr));
+
+	/*
+	 * We don't grab the hash lock prior to this check, because if
+	 * the buffer's header is in the arc_anon state, it won't be
+	 * linked into the hash table.
+	 */
+	if (hdr->b_l1hdr.b_state == arc_anon) {
+		mutex_exit(&buf->b_evict_lock);
+		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+		ASSERT(!HDR_IN_HASH_TABLE(hdr));
+		ASSERT(!HDR_HAS_L2HDR(hdr));
+		ASSERT(HDR_EMPTY(hdr));
+		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+		ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
+		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
+		hdr->b_l1hdr.b_arc_access = 0;
 
-	if (hdr->b_state == arc_anon) {
-		/* this buffer is already released */
-		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
-		ASSERT(BUF_EMPTY(hdr));
-		ASSERT(buf->b_efunc == NULL);
+		/*
+		 * If the buf is being overridden then it may already
+		 * have a hdr that is not empty.
+		 */
+		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
-		rw_exit(&buf->b_lock);
-		released = B_TRUE;
-	} else {
-		hash_lock = HDR_LOCK(hdr);
-		mutex_enter(hash_lock);
-	}
 
-	l2hdr = hdr->b_l2hdr;
-	if (l2hdr) {
-		mutex_enter(&l2arc_buflist_mtx);
-		hdr->b_l2hdr = NULL;
-		buf_size = hdr->b_size;
+		return;
 	}
 
-	if (released)
-		goto out;
+	kmutex_t *hash_lock = HDR_LOCK(hdr);
+	mutex_enter(hash_lock);
+
+	/*
+	 * This assignment is only valid as long as the hash_lock is
+	 * held, we must be careful not to reference state or the
+	 * b_state field after dropping the lock.
+	 */
+	arc_state_t *state = hdr->b_l1hdr.b_state;
+	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+	ASSERT3P(state, !=, arc_anon);
+
+	/* this buffer is not on any list */
+	ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
+
+	if (HDR_HAS_L2HDR(hdr)) {
+		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+
+		/*
+		 * We have to recheck this conditional again now that
+		 * we're holding the l2ad_mtx to prevent a race with
+		 * another thread which might be concurrently calling
+		 * l2arc_evict(). In that case, l2arc_evict() might have
+		 * destroyed the header's L2 portion as we were waiting
+		 * to acquire the l2ad_mtx.
+		 */
+		if (HDR_HAS_L2HDR(hdr)) {
+			l2arc_trim(hdr);
+			arc_hdr_l2hdr_destroy(hdr);
+		}
+
+		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+	}
 
 	/*
 	 * Do we have more than one buf?
 	 */
-	if (hdr->b_datacnt > 1) {
+	if (hdr->b_l1hdr.b_bufcnt > 1) {
 		arc_buf_hdr_t *nhdr;
 		arc_buf_t **bufp;
-		uint64_t blksz = hdr->b_size;
 		uint64_t spa = hdr->b_spa;
-		arc_buf_contents_t type = hdr->b_type;
-		uint32_t flags = hdr->b_flags;
+		uint64_t psize = HDR_GET_PSIZE(hdr);
+		uint64_t lsize = HDR_GET_LSIZE(hdr);
+		enum zio_compress compress = HDR_GET_COMPRESS(hdr);
+		arc_buf_contents_t type = arc_buf_type(hdr);
+		VERIFY3U(hdr->b_type, ==, type);
+
+		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
+		(void) remove_reference(hdr, hash_lock, tag);
+
+		if (arc_buf_is_shared(buf)) {
+			ASSERT(HDR_SHARED_DATA(hdr));
+			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
+			ASSERT(ARC_BUF_LAST(buf));
+		}
 
-		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
 		/*
-		 * Pull the data off of this buf and attach it to
-		 * a new anonymous buf.
+		 * Pull the data off of this hdr and attach it to
+		 * a new anonymous hdr. Also find the last buffer
+		 * in the hdr's buffer list.
 		 */
-		(void) remove_reference(hdr, hash_lock, tag);
-		bufp = &hdr->b_buf;
-		while (*bufp != buf)
-			bufp = &(*bufp)->b_next;
-		*bufp = (*bufp)->b_next;
+		arc_buf_t *lastbuf = NULL;
+		bufp = &hdr->b_l1hdr.b_buf;
+		while (*bufp != NULL) {
+			if (*bufp == buf) {
+				*bufp = buf->b_next;
+			}
+
+			/*
+			 * If we've removed a buffer from the middle of
+			 * the list, update lastbuf and advance bufp.
+			 */
+			if (*bufp != NULL) {
+				lastbuf = *bufp;
+				bufp = &(*bufp)->b_next;
+			}
+		}
 		buf->b_next = NULL;
+		ASSERT3P(lastbuf, !=, buf);
+		ASSERT3P(lastbuf, !=, NULL);
+
+		/*
+		 * If the current arc_buf_t and the hdr are sharing their data
+		 * buffer, then we must stop sharing that block, transfer
+		 * ownership, and set up sharing with a new arc_buf_t at the
+		 * end of the hdr's b_buf list.
+		 */
+		if (arc_buf_is_shared(buf)) {
+			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
+			ASSERT(ARC_BUF_LAST(lastbuf));
+			VERIFY(!arc_buf_is_shared(lastbuf));
+
+			/*
+			 * First, sever the block sharing relationship between
+			 * buf and the arc_buf_hdr_t. Then, set up a new
+			 * block sharing relationship with the last buffer
+			 * on the arc_buf_t list.
+			 */
+			arc_unshare_buf(hdr, buf);
+			arc_share_buf(hdr, lastbuf);
+			VERIFY3P(lastbuf->b_data, !=, NULL);
+		} else if (HDR_SHARED_DATA(hdr)) {
+			ASSERT(arc_buf_is_shared(lastbuf));
+		}
+		ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+		ASSERT3P(state, !=, arc_l2c_only);
 
-		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
-		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
-		if (refcount_is_zero(&hdr->b_refcnt)) {
-			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
-			ASSERT3U(*size, >=, hdr->b_size);
-			atomic_add_64(size, -hdr->b_size);
+		(void) refcount_remove_many(&state->arcs_size,
+		    HDR_GET_LSIZE(hdr), buf);
+
+		if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+			ASSERT3P(state, !=, arc_l2c_only);
+			(void) refcount_remove_many(&state->arcs_esize[type],
+			    HDR_GET_LSIZE(hdr), buf);
 		}
-		hdr->b_datacnt -= 1;
+
+		hdr->b_l1hdr.b_bufcnt -= 1;
 		arc_cksum_verify(buf);
+#ifdef illumos
+		arc_buf_unwatch(buf);
+#endif
 
 		mutex_exit(hash_lock);
 
-		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
-		nhdr->b_size = blksz;
-		nhdr->b_spa = spa;
-		nhdr->b_type = type;
-		nhdr->b_buf = buf;
-		nhdr->b_state = arc_anon;
-		nhdr->b_arc_access = 0;
-		nhdr->b_flags = flags & ARC_L2_WRITING;
-		nhdr->b_l2hdr = NULL;
-		nhdr->b_datacnt = 1;
-		nhdr->b_freeze_cksum = NULL;
-		(void) refcount_add(&nhdr->b_refcnt, tag);
+		/*
+		 * Allocate a new hdr. The new hdr will contain a b_pdata
+		 * buffer which will be freed in arc_write().
+		 */
+		nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
+		ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
+		ASSERT0(nhdr->b_l1hdr.b_bufcnt);
+		ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt));
+		VERIFY3U(nhdr->b_type, ==, type);
+		ASSERT(!HDR_SHARED_DATA(nhdr));
+
+		nhdr->b_l1hdr.b_buf = buf;
+		nhdr->b_l1hdr.b_bufcnt = 1;
+		(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
 		buf->b_hdr = nhdr;
-		rw_exit(&buf->b_lock);
-		atomic_add_64(&arc_anon->arcs_size, blksz);
+
+		mutex_exit(&buf->b_evict_lock);
+		(void) refcount_add_many(&arc_anon->arcs_size,
+		    HDR_GET_LSIZE(nhdr), buf);
 	} else {
-		rw_exit(&buf->b_lock);
-		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
-		ASSERT(!list_link_active(&hdr->b_arc_node));
+		mutex_exit(&buf->b_evict_lock);
+		ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
+		/* protected by hash lock, or hdr is on arc_anon */
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		arc_change_state(arc_anon, hdr, hash_lock);
-		hdr->b_arc_access = 0;
+		hdr->b_l1hdr.b_arc_access = 0;
 		mutex_exit(hash_lock);
 
-		bzero(&hdr->b_dva, sizeof (dva_t));
-		hdr->b_birth = 0;
-		hdr->b_cksum0 = 0;
+		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 	}
-	buf->b_efunc = NULL;
-	buf->b_private = NULL;
-
-out:
-	if (l2hdr) {
-		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
-		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
-		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
-		mutex_exit(&l2arc_buflist_mtx);
-	}
 }
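
The single-pass loop above, which unlinks buf from the hdr's b_buf chain while remembering the last surviving buffer, is easy to get wrong. A minimal userland model of the same pattern, using a hypothetical node_t instead of the real arc_buf_t, might look like this:

#include <assert.h>
#include <stddef.h>

typedef struct node {
	struct node *next;
} node_t;

/*
 * Unlink `target` from the list at *headp and return the last surviving
 * node, mirroring the remove-and-find-lastbuf loop in arc_release().
 * Assumes `target` is present and is not the only node.
 */
static node_t *
unlink_and_find_last(node_t **headp, node_t *target)
{
	node_t *last = NULL;
	node_t **pp = headp;

	while (*pp != NULL) {
		if (*pp == target)
			*pp = target->next;	/* splice target out */
		if (*pp != NULL) {		/* may have just changed */
			last = *pp;
			pp = &(*pp)->next;
		}
	}
	target->next = NULL;
	return (last);
}

int
main(void)
{
	node_t c = { NULL }, b = { &c }, a = { &b };
	node_t *head = &a;

	assert(unlink_and_find_last(&head, &b) == &c);
	assert(a.next == &c && b.next == NULL);
	return (0);
}
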
 
 int
@@ -3130,62 +5487,132 @@ arc_released(arc_buf_t *buf)
 {
 	int released;
 
-	rw_enter(&buf->b_lock, RW_READER);
-	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
-	rw_exit(&buf->b_lock);
+	mutex_enter(&buf->b_evict_lock);
+	released = (buf->b_data != NULL &&
+	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
+	mutex_exit(&buf->b_evict_lock);
 	return (released);
 }
 
-int
-arc_has_callback(arc_buf_t *buf)
-{
-	int callback;
-
-	rw_enter(&buf->b_lock, RW_READER);
-	callback = (buf->b_efunc != NULL);
-	rw_exit(&buf->b_lock);
-	return (callback);
-}
-
 #ifdef ZFS_DEBUG
 int
 arc_referenced(arc_buf_t *buf)
 {
 	int referenced;
 
-	rw_enter(&buf->b_lock, RW_READER);
-	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
-	rw_exit(&buf->b_lock);
+	mutex_enter(&buf->b_evict_lock);
+	referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
+	mutex_exit(&buf->b_evict_lock);
 	return (referenced);
 }
-#endif
+#endif
+
+static void
+arc_write_ready(zio_t *zio)
+{
+	arc_write_callback_t *callback = zio->io_private;
+	arc_buf_t *buf = callback->awcb_buf;
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);
+
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
+	ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+
+	/*
+	 * If we're re-executing this zio because the pool suspended, then
+	 * clean up any state that was previously set the first time the
+	 * callback was invoked.
+	 */
+	if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
+		arc_cksum_free(hdr);
+#ifdef illumos
+		arc_buf_unwatch(buf);
+#endif
+		if (hdr->b_l1hdr.b_pdata != NULL) {
+			if (arc_buf_is_shared(buf)) {
+				ASSERT(HDR_SHARED_DATA(hdr));
+
+				arc_unshare_buf(hdr, buf);
+			} else {
+				arc_hdr_free_pdata(hdr);
+			}
+		}
+	}
+	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+	ASSERT(!HDR_SHARED_DATA(hdr));
+	ASSERT(!arc_buf_is_shared(buf));
+
+	callback->awcb_ready(zio, buf, callback->awcb_private);
+
+	if (HDR_IO_IN_PROGRESS(hdr))
+		ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
+
+	arc_cksum_compute(buf);
+	arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+
+	enum zio_compress compress;
+	if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
+		compress = ZIO_COMPRESS_OFF;
+	} else {
+		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp));
+		compress = BP_GET_COMPRESS(zio->io_bp);
+	}
+	HDR_SET_PSIZE(hdr, psize);
+	arc_hdr_set_compress(hdr, compress);
+
+	/*
+	 * If the hdr is compressed, then copy the compressed
+	 * zio contents into the arc_buf_hdr_t. Otherwise, share the original
+	 * data buf with the hdr. Ideally, we would like to always copy the
+	 * io_data into b_pdata, but the user may have disabled compressed
+	 * ARC, in which case the on-disk block may not match what we maintain
+	 * in the hdr's b_pdata field.
+	 */
+	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
+		ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF);
+		ASSERT3U(psize, >, 0);
+		arc_hdr_alloc_pdata(hdr);
+		bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize);
+	} else {
+		ASSERT3P(buf->b_data, ==, zio->io_orig_data);
+		ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr));
+		ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS);
+		ASSERT(!HDR_SHARED_DATA(hdr));
+		ASSERT(!arc_buf_is_shared(buf));
+		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+		ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+
+		/*
+		 * This hdr is not compressed so we're able to share
+		 * the arc_buf_t data buffer with the hdr.
+		 */
+		arc_share_buf(hdr, buf);
+		VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata,
+		    HDR_GET_LSIZE(hdr)));
+	}
+	arc_hdr_verify(hdr, zio->io_bp);
+}
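
The copy-versus-share decision above is the core of the compressed-ARC write path. As a rough userland model (stand-in types, no locking, error handling elided; not the real ZFS API):

#include <stddef.h>
#include <stdlib.h>
#include <string.h>

enum model_compress { COMPRESS_OFF, COMPRESS_ON };

struct model_hdr { enum model_compress compress; void *pdata; size_t psize; };
struct model_buf { void *data; size_t lsize; };

/*
 * Model of arc_write_ready()'s data placement: a compressed block gets a
 * private b_pdata copy of the on-disk bytes, while an uncompressed block
 * lets the hdr share the arc_buf_t's buffer instead of copying.
 */
static void
place_write_data(struct model_hdr *h, struct model_buf *b,
    const void *io_data, size_t psize)
{
	if (h->compress != COMPRESS_OFF) {
		h->pdata = malloc(psize);		/* private copy */
		h->psize = psize;
		memcpy(h->pdata, io_data, psize);	/* keep compressed bytes */
	} else {
		h->pdata = b->data;			/* share, no copy */
		h->psize = b->lsize;
	}
}
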
 
 static void
-arc_write_ready(zio_t *zio)
+arc_write_children_ready(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
-	arc_buf_hdr_t *hdr = buf->b_hdr;
 
-	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
-	callback->awcb_ready(zio, buf, callback->awcb_private);
+	callback->awcb_children_ready(zio, buf, callback->awcb_private);
+}
 
-	/*
-	 * If the IO is already in progress, then this is a re-write
-	 * attempt, so we need to thaw and re-compute the cksum.
-	 * It is the responsibility of the callback to handle the
-	 * accounting for any re-write attempt.
-	 */
-	if (HDR_IO_IN_PROGRESS(hdr)) {
-		mutex_enter(&hdr->b_freeze_lock);
-		if (hdr->b_freeze_cksum != NULL) {
-			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
-			hdr->b_freeze_cksum = NULL;
-		}
-		mutex_exit(&hdr->b_freeze_lock);
-	}
-	arc_cksum_compute(buf, B_FALSE);
-	hdr->b_flags |= ARC_IO_IN_PROGRESS;
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write.  See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+	arc_write_callback_t *cb = zio->io_private;
+	if (cb->awcb_physdone != NULL)
+		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
 }
 
 static void
@@ -3195,23 +5622,28 @@ arc_write_done(zio_t *zio)
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
-	ASSERT(hdr->b_acb == NULL);
+	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 
 	if (zio->io_error == 0) {
-		hdr->b_dva = *BP_IDENTITY(zio->io_bp);
-		hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
-		hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+		arc_hdr_verify(hdr, zio->io_bp);
+
+		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
+			buf_discard_identity(hdr);
+		} else {
+			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+		}
 	} else {
-		ASSERT(BUF_EMPTY(hdr));
+		ASSERT(HDR_EMPTY(hdr));
 	}
 
 	/*
-	 * If the block to be written was all-zero, we may have
-	 * compressed it away.  In this case no write was performed
-	 * so there will be no dva/birth-date/checksum.  The buffer
-	 * must therefor remain anonymous (and uncached).
+	 * If the block to be written was all-zero or compressed enough to be
+	 * embedded in the BP, no write was performed so there will be no
+	 * dva/birth/checksum.  The buffer must therefore remain anonymous
+	 * (and uncached).
 	 */
-	if (!BUF_EMPTY(hdr)) {
+	if (!HDR_EMPTY(hdr)) {
 		arc_buf_hdr_t *exists;
 		kmutex_t *hash_lock;
 
@@ -3220,7 +5652,7 @@ arc_write_done(zio_t *zio)
 		arc_cksum_verify(buf);
 
 		exists = buf_hash_insert(hdr, &hash_lock);
-		if (exists) {
+		if (exists != NULL) {
 			/*
 			 * This can only happen if we overwrite for
 			 * sync-to-convergence, because we remove
@@ -3230,116 +5662,115 @@ arc_write_done(zio_t *zio)
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad overwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
-				ASSERT(refcount_is_zero(&exists->b_refcnt));
+				ASSERT(refcount_is_zero(
+				    &exists->b_l1hdr.b_refcnt));
 				arc_change_state(arc_anon, exists, hash_lock);
 				mutex_exit(hash_lock);
 				arc_hdr_destroy(exists);
 				exists = buf_hash_insert(hdr, &hash_lock);
 				ASSERT3P(exists, ==, NULL);
+			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+				/* nopwrite */
+				ASSERT(zio->io_prop.zp_nopwrite);
+				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+					panic("bad nopwrite, hdr=%p exists=%p",
+					    (void *)hdr, (void *)exists);
 			} else {
 				/* Dedup */
-				ASSERT(hdr->b_datacnt == 1);
-				ASSERT(hdr->b_state == arc_anon);
+				ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
+				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
 				ASSERT(BP_GET_DEDUP(zio->io_bp));
 				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
 			}
 		}
-		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		/* if it's not anon, we are doing a scrub */
-		if (!exists && hdr->b_state == arc_anon)
+		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
 			arc_access(hdr, hash_lock);
 		mutex_exit(hash_lock);
 	} else {
-		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 	}
 
-	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+	ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 	callback->awcb_done(zio, buf, callback->awcb_private);
 
 	kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
 zio_t *
-arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
-    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
-    arc_done_func_t *ready, arc_done_func_t *done, void *private,
-    int priority, int zio_flags, const zbookmark_t *zb)
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+    boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready,
+    arc_done_func_t *children_ready, arc_done_func_t *physdone,
+    arc_done_func_t *done, void *private, zio_priority_t priority,
+    int zio_flags, const zbookmark_phys_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
 	zio_t *zio;
 
-	ASSERT(ready != NULL);
-	ASSERT(done != NULL);
+	ASSERT3P(ready, !=, NULL);
+	ASSERT3P(done, !=, NULL);
 	ASSERT(!HDR_IO_ERROR(hdr));
-	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
-	ASSERT(hdr->b_acb == NULL);
+	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+	ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
 	if (l2arc)
-		hdr->b_flags |= ARC_L2CACHE;
+		arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
 	callback->awcb_ready = ready;
+	callback->awcb_children_ready = children_ready;
+	callback->awcb_physdone = physdone;
 	callback->awcb_done = done;
 	callback->awcb_private = private;
 	callback->awcb_buf = buf;
 
-	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
-	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
-
-	return (zio);
-}
-
-void
-arc_free(spa_t *spa, const blkptr_t *bp)
-{
-	arc_buf_hdr_t *ab;
-	kmutex_t *hash_lock;
-	uint64_t guid = spa_guid(spa);
-
 	/*
-	 * If this buffer is in the cache, release it, so it can be re-used.
+	 * The hdr's b_pdata is now stale; free it now. A new data block
+	 * will be allocated when the zio pipeline calls arc_write_ready().
 	 */
-	ab = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
-	    &hash_lock);
-	if (ab != NULL) {
-		if (ab->b_state != arc_anon)
-			arc_change_state(arc_anon, ab, hash_lock);
-		if (HDR_IO_IN_PROGRESS(ab)) {
-			/*
-			 * This should only happen when we prefetch.
-			 */
-			ASSERT(ab->b_flags & ARC_PREFETCH);
-			ASSERT3U(ab->b_datacnt, ==, 1);
-			ab->b_flags |= ARC_FREED_IN_READ;
-			if (HDR_IN_HASH_TABLE(ab))
-				buf_hash_remove(ab);
-			ab->b_arc_access = 0;
-			bzero(&ab->b_dva, sizeof (dva_t));
-			ab->b_birth = 0;
-			ab->b_cksum0 = 0;
-			ab->b_buf->b_efunc = NULL;
-			ab->b_buf->b_private = NULL;
-			mutex_exit(hash_lock);
+	if (hdr->b_l1hdr.b_pdata != NULL) {
+		/*
+		 * If the buf is currently sharing the data block with
+		 * the hdr then we need to break that relationship here.
+		 * The hdr will remain with a NULL data pointer and the
+		 * buf will take sole ownership of the block.
+		 */
+		if (arc_buf_is_shared(buf)) {
+			ASSERT(ARC_BUF_LAST(buf));
+			arc_unshare_buf(hdr, buf);
 		} else {
-			ASSERT(refcount_is_zero(&ab->b_refcnt));
-			ab->b_flags |= ARC_FREE_IN_PROGRESS;
-			mutex_exit(hash_lock);
-			arc_hdr_destroy(ab);
-			ARCSTAT_BUMP(arcstat_deleted);
+			arc_hdr_free_pdata(hdr);
 		}
+		VERIFY3P(buf->b_data, !=, NULL);
+		arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
 	}
+	ASSERT(!arc_buf_is_shared(buf));
+	ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+
+	zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp,
+	    arc_write_ready,
+	    (children_ready != NULL) ? arc_write_children_ready : NULL,
+	    arc_write_physdone, arc_write_done, callback,
+	    priority, zio_flags, zb);
+
+	return (zio);
 }
 
 static int
-arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
 {
 #ifdef _KERNEL
 	uint64_t available_memory = ptob(freemem);
 	static uint64_t page_load = 0;
 	static uint64_t last_txg = 0;
 
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
 	available_memory =
-	    MIN(available_memory, vmem_size(kmem_arena, VMEM_FREE));
-	if (available_memory >= zfs_write_limit_max)
+	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
+#endif
+
+	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
 		return (0);
 
 	if (txg > last_txg) {
@@ -3351,32 +5782,18 @@ arc_memory_throttle(uint64_t reserve, ui
 	 * the arc is already going to be evicting, so we just want to
 	 * continue to let page writes occur as quickly as possible.
 	 */
-	if (curproc == proc_pageout) {
+	if (curlwp == uvm.pagedaemon_lwp) {
 		if (page_load > MAX(ptob(minfree), available_memory) / 4)
-			return (ERESTART);
+			return (SET_ERROR(ERESTART));
 		/* Note: reserve is inflated, so we deflate */
 		page_load += reserve / 8;
 		return (0);
 	} else if (page_load > 0 && arc_reclaim_needed()) {
 		/* memory is low, delay before restarting */
 		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
-		return (EAGAIN);
+		return (SET_ERROR(EAGAIN));
 	}
 	page_load = 0;
-
-	if (arc_size > arc_c_min) {
-		uint64_t evictable_memory =
-		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
-		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
-		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
-		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
-		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
-	}
-
-	if (inflight_data > available_memory / 4) {
-		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
-		return (ERESTART);
-	}
 #endif
 	return (0);
 }
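
A standalone model of the pagedaemon branch above, with illustrative numbers (the real code pulls freemem and minfree from the VM layer):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t page_load;	/* persists across calls, as in the ARC */

/*
 * Model of arc_memory_throttle()'s pagedaemon branch: deflate the
 * inflated reserve by 8, accumulate it into page_load, and refuse
 * (ERESTART in the real code) once page_load exceeds a quarter of
 * MAX(minfree bytes, available memory).
 */
static int
throttle(uint64_t reserve, uint64_t available, uint64_t minfree_bytes)
{
	uint64_t limit =
	    ((available > minfree_bytes) ? available : minfree_bytes) / 4;

	if (page_load > limit)
		return (1);		/* would be SET_ERROR(ERESTART) */
	page_load += reserve / 8;	/* reserve is inflated, so deflate */
	return (0);
}

int
main(void)
{
	/* 64 MiB available, 1 MiB minfree, repeated 16 MiB reserves. */
	for (int i = 0; i < 12; i++)
		printf("try %d: %s (page_load=%" PRIu64 " MiB)\n", i,
		    throttle(16 << 20, 64 << 20, 1 << 20) ? "throttled" : "ok",
		    page_load >> 20);
	return (0);
}
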
@@ -3394,33 +5811,28 @@ arc_tempreserve_space(uint64_t reserve, 
 	int error;
 	uint64_t anon_size;
 
-#ifdef ZFS_DEBUG
-	/*
-	 * Once in a while, fail for no reason.  Everything should cope.
-	 */
-	if (spa_get_random(10000) == 0) {
-		dprintf("forcing random failure\n");
-		return (ERESTART);
-	}
-#endif
-	if (reserve > arc_c/4 && !arc_no_grow)
+	if (reserve > arc_c/4 && !arc_no_grow) {
 		arc_c = MIN(arc_c_max, reserve * 4);
+		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
+	}
 	if (reserve > arc_c)
-		return (ENOMEM);
+		return (SET_ERROR(ENOMEM));
 
 	/*
 	 * Don't count loaned bufs as in flight dirty data to prevent long
 	 * network delays from blocking transactions that are ready to be
 	 * assigned to a txg.
 	 */
-	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
+	anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
+	    arc_loaned_bytes), 0);
 
 	/*
 	 * Writes will, almost always, require additional memory allocations
-	 * in order to compress/encrypt/etc the data.  We therefor need to
+	 * in order to compress/encrypt/etc the data.  We therefore need to
 	 * make sure that there is sufficient available memory for this.
 	 */
-	if (error = arc_memory_throttle(reserve, anon_size, txg))
+	error = arc_memory_throttle(reserve, txg);
+	if (error != 0)
 		return (error);
 
 	/*
@@ -3433,86 +5845,301 @@ arc_tempreserve_space(uint64_t reserve, 
 
 	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
 	    anon_size > arc_c / 4) {
+		uint64_t meta_esize =
+		    refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+		uint64_t data_esize =
+		    refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
 		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
-		    arc_tempreserve>>10,
-		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
-		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
-		    reserve>>10, arc_c>>10);
-		return (ERESTART);
+		    arc_tempreserve >> 10, meta_esize >> 10,
+		    data_esize >> 10, reserve >> 10, arc_c >> 10);
+		return (SET_ERROR(ERESTART));
 	}
 	atomic_add_64(&arc_tempreserve, reserve);
 	return (0);
 }
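
To put numbers on the two-part condition above (illustrative, with arc_c = 1 GiB): a transaction is pushed back only when reserve + arc_tempreserve + anon_size exceeds 512 MiB and anonymous data alone exceeds 256 MiB.

#include <stdint.h>
#include <stdio.h>

/* Model of the dirty-data throttle at the end of arc_tempreserve_space(). */
static int
would_restart(uint64_t reserve, uint64_t tempreserve, uint64_t anon_size,
    uint64_t arc_c)
{
	return (reserve + tempreserve + anon_size > arc_c / 2 &&
	    anon_size > arc_c / 4);
}

int
main(void)
{
	const uint64_t MiB = 1ULL << 20, GiB = 1ULL << 30;

	/* 250 MiB reserve on top of 300 MiB anon data: throttled. */
	printf("%d\n", would_restart(250 * MiB, 0, 300 * MiB, GiB));	/* 1 */
	/* Same total, but little of it anonymous: allowed. */
	printf("%d\n", would_restart(450 * MiB, 0, 100 * MiB, GiB));	/* 0 */
	return (0);
}
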
 
-#if defined(__NetBSD__) && defined(_KERNEL)
-/* Reclaim hook registered to uvm for reclaiming KVM and memory */
 static void
-arc_uvm_reclaim_hook(void)
+arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
+    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
 {
-
-	if (mutex_tryenter(&arc_reclaim_thr_lock)) {
-		cv_broadcast(&arc_reclaim_thr_cv);
-		mutex_exit(&arc_reclaim_thr_lock);
-	}
+	size->value.ui64 = refcount_count(&state->arcs_size);
+	evict_data->value.ui64 =
+	    refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
+	evict_metadata->value.ui64 =
+	    refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
 }
 
 static int
-arc_kva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg)
+arc_kstat_update(kstat_t *ksp, int rw)
 {
+	arc_stats_t *as = ksp->ks_data;
 
-	
-	if (mutex_tryenter(&arc_reclaim_thr_lock)) {
-		cv_broadcast(&arc_reclaim_thr_cv);
-		mutex_exit(&arc_reclaim_thr_lock);
+	if (rw == KSTAT_WRITE) {
+		return (EACCES);
+	} else {
+		arc_kstat_update_state(arc_anon,
+		    &as->arcstat_anon_size,
+		    &as->arcstat_anon_evictable_data,
+		    &as->arcstat_anon_evictable_metadata);
+		arc_kstat_update_state(arc_mru,
+		    &as->arcstat_mru_size,
+		    &as->arcstat_mru_evictable_data,
+		    &as->arcstat_mru_evictable_metadata);
+		arc_kstat_update_state(arc_mru_ghost,
+		    &as->arcstat_mru_ghost_size,
+		    &as->arcstat_mru_ghost_evictable_data,
+		    &as->arcstat_mru_ghost_evictable_metadata);
+		arc_kstat_update_state(arc_mfu,
+		    &as->arcstat_mfu_size,
+		    &as->arcstat_mfu_evictable_data,
+		    &as->arcstat_mfu_evictable_metadata);
+		arc_kstat_update_state(arc_mfu_ghost,
+		    &as->arcstat_mfu_ghost_size,
+		    &as->arcstat_mfu_ghost_evictable_data,
+		    &as->arcstat_mfu_ghost_evictable_metadata);
 	}
-	
-	return CALLBACK_CHAIN_CONTINUE;
+
+	return (0);
+}
+
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the ARC eviction
+ * code is laid out; arc_evict_state() assumes ARC buffers are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+arc_state_multilist_index_func(multilist_t *ml, void *obj)
+{
+	arc_buf_hdr_t *hdr = obj;
+
+	/*
+	 * We rely on b_dva to generate evenly distributed index
+	 * numbers using buf_hash below. So, as an added precaution,
+	 * let's make sure we never add empty buffers to the arc lists.
+	 */
+	ASSERT(!HDR_EMPTY(hdr));
+
+	/*
+	 * The assumption here is that the hash value for a given
+	 * arc_buf_hdr_t will remain constant throughout its lifetime
+	 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
+	 * Thus, we don't need to store the header's sublist index
+	 * on insertion, as this index can be recalculated on removal.
+	 *
+	 * Also, the low order bits of the hash value are thought to be
+	 * distributed evenly. Otherwise, in the case that the multilist
+	 * has a power-of-two number of sublists, each sublist's usage
+	 * would not be evenly distributed.
+	 */
+	return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
+	    multilist_get_num_sublists(ml));
+}
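
The uniformity assumption is easy to sanity-check in isolation: any well-mixed 64-bit hash taken modulo the sublist count spreads keys nearly evenly, even for a power-of-two sublist count. A quick userland sketch (a stand-in mixer, not the real buf_hash()):

#include <stdint.h>
#include <stdio.h>

#define	NSUBLISTS	8

/* Stand-in 64-bit finalizer; the real code hashes (spa, dva, birth). */
static uint64_t
mix(uint64_t x)
{
	x ^= x >> 33; x *= 0xff51afd7ed558ccdULL;
	x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL;
	return (x ^ (x >> 33));
}

int
main(void)
{
	unsigned counts[NSUBLISTS] = { 0 };

	for (uint64_t i = 0; i < 100000; i++)
		counts[mix(i) % NSUBLISTS]++;	/* index as in the ARC */
	for (int s = 0; s < NSUBLISTS; s++)
		printf("sublist %d: %u\n", s, counts[s]);	/* ~12500 each */
	return (0);
}
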
+
+#ifdef _KERNEL
+#ifdef __FreeBSD__
+static eventhandler_tag arc_event_lowmem = NULL;
+#endif
+
+static void
+arc_lowmem(void *arg __unused, int howto __unused)
+{
+
+	mutex_enter(&arc_reclaim_lock);
+	/* XXX: Memory deficit should be passed as argument. */
+	needfree = btoc(arc_c >> arc_shrink_shift);
+	DTRACE_PROBE(arc__needfree);
+	cv_signal(&arc_reclaim_thread_cv);
+
+	/*
+	 * It is unsafe to block here in arbitrary threads, because we can come
+	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
+	 * with the ARC reclaim thread.
+	 */
+	if (curlwp == uvm.pagedaemon_lwp)
+		(void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
+	mutex_exit(&arc_reclaim_lock);
+}
+#endif
+
+static void
+arc_state_init(void)
+{
+	arc_anon = &ARC_anon;
+	arc_mru = &ARC_mru;
+	arc_mru_ghost = &ARC_mru_ghost;
+	arc_mfu = &ARC_mfu;
+	arc_mfu_ghost = &ARC_mfu_ghost;
+	arc_l2c_only = &ARC_l2c_only;
+
+	multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+	    sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+
+	refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+	refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
+	refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
+	refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
+	refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
+	refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+
+	refcount_create(&arc_anon->arcs_size);
+	refcount_create(&arc_mru->arcs_size);
+	refcount_create(&arc_mru_ghost->arcs_size);
+	refcount_create(&arc_mfu->arcs_size);
+	refcount_create(&arc_mfu_ghost->arcs_size);
+	refcount_create(&arc_l2c_only->arcs_size);
+}
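
The ten multilist_create() calls above differ only in the state and the buffer-content type, so they could in principle be folded into a loop. A hedged sketch of such a tightening (not part of this patch; assumes ARC_BUFC_DATA and ARC_BUFC_METADATA are the two consecutive enum values, as upstream defines them):

	arc_state_t *lists[] = { arc_mru, arc_mru_ghost, arc_mfu,
	    arc_mfu_ghost, arc_l2c_only };

	for (int i = 0; i < 5; i++) {
		for (int t = ARC_BUFC_DATA; t <= ARC_BUFC_METADATA; t++) {
			multilist_create(&lists[i]->arcs_list[t],
			    sizeof (arc_buf_hdr_t),
			    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
			    zfs_arc_num_sublists_per_state,
			    arc_state_multilist_index_func);
		}
	}
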
+
+static void
+arc_state_fini(void)
+{
+	refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+	refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
+	refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
+	refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
+	refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
+	refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
+	refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+
+	refcount_destroy(&arc_anon->arcs_size);
+	refcount_destroy(&arc_mru->arcs_size);
+	refcount_destroy(&arc_mru_ghost->arcs_size);
+	refcount_destroy(&arc_mfu->arcs_size);
+	refcount_destroy(&arc_mfu_ghost->arcs_size);
+	refcount_destroy(&arc_l2c_only->arcs_size);
+
+	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
+	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
+	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
+	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
 }
 
-#endif /* __NetBSD__ */
+uint64_t
+arc_max_bytes(void)
+{
+	return (arc_c_max);
+}
 
 void
 arc_init(void)
 {
-	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+	int i, prefetch_tunable_set = 0;
+
+	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
+
+#ifdef __FreeBSD__
+	mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
+#endif
 
 	/* Convert seconds to clock ticks */
 	arc_min_prefetch_lifespan = 1 * hz;
 
 	/* Start out with 1/8 of all memory */
-	arc_c = physmem * PAGESIZE / 8;
+	arc_c = kmem_size() / 8;
 
+#ifdef illumos
 #ifdef _KERNEL
 	/*
 	 * On architectures where the physical memory can be larger
 	 * than the addressable space (intel in 32-bit mode), we may
 	 * need to limit the cache to 1/8 of VM size.
 	 */
-	arc_c = MIN(arc_c, vmem_size(kmem_arena, VMEM_ALLOC | VMEM_FREE) / 8);
+	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
 #endif
-
-	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
-	arc_c_min = MAX(arc_c / 4, 64<<20);
-	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
-	if (arc_c * 8 >= 1<<30)
-		arc_c_max = (arc_c * 8) - (1<<30);
+#endif	/* illumos */
+	/* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
+	arc_c_min = MAX(arc_c / 4, arc_abs_min);
+	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
+	if (arc_c * 8 >= 1 << 30)
+		arc_c_max = (arc_c * 8) - (1 << 30);
 	else
 		arc_c_max = arc_c_min;
-	arc_c_max = MAX(arc_c * 6, arc_c_max);
+	arc_c_max = MAX(arc_c * 5, arc_c_max);
+
+	/*
+	 * In userland, there's only the memory pressure that we artificially
+	 * create (see arc_available_memory()).  Don't let arc_c get too
+	 * small, because it can cause transactions to be larger than
+	 * arc_c, causing arc_tempreserve_space() to fail.
+	 */
+#ifndef _KERNEL
+	arc_c_min = arc_c_max / 2;
+#endif
 
+#ifdef _KERNEL
 	/*
 	 * Allow the tunables to override our calculations if they are
-	 * reasonable (ie. over 64MB)
+	 * reasonable.
 	 */
-	if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
+	if (zfs_arc_max > arc_abs_min && zfs_arc_max < kmem_size()) {
 		arc_c_max = zfs_arc_max;
-	if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
+		arc_c_min = MIN(arc_c_min, arc_c_max);
+	}
+	if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max)
 		arc_c_min = zfs_arc_min;
+#endif
 
 	arc_c = arc_c_max;
 	arc_p = (arc_c >> 1);
+	arc_size = 0;
 
 	/* limit meta-data to 1/4 of the arc capacity */
 	arc_meta_limit = arc_c_max / 4;
@@ -3524,143 +6151,182 @@ arc_init(void)
 	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
 		arc_c_min = arc_meta_limit / 2;
 
+	if (zfs_arc_meta_min > 0) {
+		arc_meta_min = zfs_arc_meta_min;
+	} else {
+		arc_meta_min = arc_c_min / 2;
+	}
+
 	if (zfs_arc_grow_retry > 0)
 		arc_grow_retry = zfs_arc_grow_retry;
 
 	if (zfs_arc_shrink_shift > 0)
 		arc_shrink_shift = zfs_arc_shrink_shift;
 
+	/*
+	 * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
+	 */
+	if (arc_no_grow_shift >= arc_shrink_shift)
+		arc_no_grow_shift = arc_shrink_shift - 1;
+
 	if (zfs_arc_p_min_shift > 0)
 		arc_p_min_shift = zfs_arc_p_min_shift;
 
+	if (zfs_arc_num_sublists_per_state < 1)
+		zfs_arc_num_sublists_per_state = MAX(max_ncpus, 1);
+
 	/* if kmem_flags are set, lets try to use less memory */
 	if (kmem_debugging())
 		arc_c = arc_c / 2;
 	if (arc_c < arc_c_min)
 		arc_c = arc_c_min;
 
-	arc_anon = &ARC_anon;
-	arc_mru = &ARC_mru;
-	arc_mru_ghost = &ARC_mru_ghost;
-	arc_mfu = &ARC_mfu;
-	arc_mfu_ghost = &ARC_mfu_ghost;
-	arc_l2c_only = &ARC_l2c_only;
-	arc_size = 0;
-
-	mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-
-	list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+	zfs_arc_min = arc_c_min;
+	zfs_arc_max = arc_c_max;
 
+	arc_state_init();
 	buf_init();
 
-	arc_thread_exit = 0;
-	arc_eviction_list = NULL;
-	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
-	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
+	arc_reclaim_thread_exit = B_FALSE;
+#ifdef  __FreeBSD__
+	arc_dnlc_evicts_thread_exit = FALSE;
+#endif
 
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 
 	if (arc_ksp != NULL) {
 		arc_ksp->ks_data = &arc_stats;
+		arc_ksp->ks_update = arc_kstat_update;
 		kstat_install(arc_ksp);
 	}
 
 	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
-	    TS_RUN, maxclsyspri);
-
-#if defined(__NetBSD__) && defined(_KERNEL)
-/* 	arc_hook.uvm_reclaim_hook = &arc_uvm_reclaim_hook;
+	    TS_RUN, minclsyspri);
 
-	uvm_reclaim_hook_add(&arc_hook);
-	callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback,
-	    &arc_kva_reclaim_entry, NULL, arc_kva_reclaim_callback); */
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
+	    EVENTHANDLER_PRI_FIRST);
+#endif
 
+	(void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0,
+	    TS_RUN, minclsyspri);
 #endif
 
-	arc_dead = FALSE;
+	arc_dead = B_FALSE;
 	arc_warm = B_FALSE;
 
-	if (zfs_write_limit_max == 0)
-		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
-	else
-		zfs_write_limit_shift = 0;
-	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
+	/*
+	 * Calculate maximum amount of dirty data per pool.
+	 *
+	 * If it has been set by /etc/system, take that.
+	 * Otherwise, use a percentage of physical memory defined by
+	 * zfs_dirty_data_max_percent (default 10%) with a cap at
+	 * zfs_dirty_data_max_max (default 4GB).
+	 */
+	if (zfs_dirty_data_max == 0) {
+		zfs_dirty_data_max = ptob(physmem) *
+		    zfs_dirty_data_max_percent / 100;
+		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+		    zfs_dirty_data_max_max);
+	}
+
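
For example (illustrative numbers only): on a machine with 8 GiB of physical memory and the defaults named in the comment above, this works out to min(8 GiB * 10%, 4 GiB), i.e. roughly 819 MiB of dirty data per pool:

	/* 8 GiB of RAM, default 10% and 4 GiB cap. */
	uint64_t physbytes = 8ULL << 30;
	uint64_t ddm = physbytes * 10 / 100;	/* ~819 MiB */
	uint64_t cap = 4ULL << 30;		/* zfs_dirty_data_max_max */
	if (ddm > cap)
		ddm = cap;			/* cap not reached here */
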
+#ifdef _KERNEL
+	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
+		prefetch_tunable_set = 1;
+
+#ifdef __i386__
+	if (prefetch_tunable_set == 0) {
+		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
+		    "-- to enable,\n");
+		printf("            add \"vfs.zfs.prefetch_disable=0\" "
+		    "to /boot/loader.conf.\n");
+		zfs_prefetch_disable = 1;
+	}
+#else
+	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
+	    prefetch_tunable_set == 0) {
+		printf("ZFS NOTICE: Prefetch is disabled by default if less "
+		    "than 4GB of RAM is present;\n"
+		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
+		    "to /boot/loader.conf.\n");
+		zfs_prefetch_disable = 1;
+	}
+#endif
+	/* Warn about ZFS memory and address space requirements. */
+	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
+		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
+		    "expect unstable behavior.\n");
+	}
+	if (kmem_size() < 512 * (1 << 20)) {
+		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
+		    "expect unstable behavior.\n");
+		printf("             Consider tuning vm.kmem_size and "
+		    "vm.kmem_size_max\n");
+		printf("             in /boot/loader.conf.\n");
+	}
+#endif
 }
 
 void
 arc_fini(void)
 {
-	mutex_enter(&arc_reclaim_thr_lock);
-	arc_thread_exit = 1;
-	while (arc_thread_exit != 0)
-		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
-	mutex_exit(&arc_reclaim_thr_lock);
+	mutex_enter(&arc_reclaim_lock);
+	arc_reclaim_thread_exit = B_TRUE;
+	/*
+	 * The reclaim thread will set arc_reclaim_thread_exit back to
+	 * B_FALSE when it is finished exiting; we're waiting for that.
+	 */
+	while (arc_reclaim_thread_exit) {
+		cv_signal(&arc_reclaim_thread_cv);
+		cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
+	}
+	mutex_exit(&arc_reclaim_lock);
 
-	arc_flush(NULL);
+	/* Use B_TRUE to ensure *all* buffers are evicted */
+	arc_flush(NULL, B_TRUE);
 
-	arc_dead = TRUE;
+#ifdef __FreeBSD__
+	mutex_enter(&arc_dnlc_evicts_lock);
+	arc_dnlc_evicts_thread_exit = TRUE;
+
+	/*
+	 * The dnlc evicts thread will set arc_dnlc_evicts_thread_exit
+	 * to FALSE when it is finished exiting; we're waiting for that.
+	 */
+	while (arc_dnlc_evicts_thread_exit) {
+		cv_signal(&arc_dnlc_evicts_cv);
+		cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
+	}
+	mutex_exit(&arc_dnlc_evicts_lock);
+
+	mutex_destroy(&arc_dnlc_evicts_lock);
+	cv_destroy(&arc_dnlc_evicts_cv);
+#endif
+
+	arc_dead = B_TRUE;
 
 	if (arc_ksp != NULL) {
 		kstat_delete(arc_ksp);
 		arc_ksp = NULL;
 	}
 
-	mutex_destroy(&arc_eviction_mtx);
-	mutex_destroy(&arc_reclaim_thr_lock);
-	cv_destroy(&arc_reclaim_thr_cv);
-
-	list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
-	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
-	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
-	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
-	list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
-	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
-	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
-	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
-
-	mutex_destroy(&arc_anon->arcs_mtx);
-	mutex_destroy(&arc_mru->arcs_mtx);
-	mutex_destroy(&arc_mru_ghost->arcs_mtx);
-	mutex_destroy(&arc_mfu->arcs_mtx);
-	mutex_destroy(&arc_mfu_ghost->arcs_mtx);
-	mutex_destroy(&arc_l2c_only->arcs_mtx);
-
-	mutex_destroy(&zfs_write_limit_lock);
-
-#if defined(__NetBSD__) && defined(_KERNEL)
-/*	uvm_reclaim_hook_del(&arc_hook);
-	callback_unregister(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback,
-	    &arc_kva_reclaim_entry); */
-#endif 	
-	
+	mutex_destroy(&arc_reclaim_lock);
+	cv_destroy(&arc_reclaim_thread_cv);
+	cv_destroy(&arc_reclaim_waiters_cv);
+
+	arc_state_fini();
 	buf_fini();
 
-	ASSERT(arc_loaned_bytes == 0);
+	ASSERT0(arc_loaned_bytes);
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	if (arc_event_lowmem != NULL)
+		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+#endif
+#endif
 }
 
 /*
@@ -3718,8 +6384,12 @@ arc_fini(void)
  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
  * It does this by periodically scanning buffers from the eviction-end of
  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
- * not already there.  It scans until a headroom of buffers is satisfied,
- * which itself is a buffer for ARC eviction.  The thread that does this is
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. If a compressible buffer is
+ * found during scanning and selected for writing to an L2ARC device, we
+ * temporarily boost scanning headroom during the next scan cycle to make
+ * sure we adapt to compression effects (which might significantly reduce
+ * the data volume we write to L2ARC). The thread that does this is
  * l2arc_feed_thread(), illustrated below; example sizes are included to
  * provide a better sense of ratio than this diagram:
  *
@@ -3784,6 +6454,11 @@ arc_fini(void)
  *	l2arc_write_boost	extra write bytes during device warmup
  *	l2arc_noprefetch	skip caching prefetched buffers
  *	l2arc_headroom		number of max device writes to precache
+ *	l2arc_headroom_boost	when we find compressed buffers during ARC
+ *				scanning, we multiply headroom by this
+ *				percentage factor for the next scan cycle,
+ *				since more compressed buffers are likely to
+ *				be present
  *	l2arc_feed_secs		seconds between L2ARC writing
  *
  * Tunables may be removed or added as future performance improvements are
@@ -3800,7 +6475,7 @@ arc_fini(void)
  */
 
 static boolean_t
-l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
+l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
 {
 	/*
 	 * A buffer is *not* eligible for the L2ARC if it:
@@ -3809,22 +6484,45 @@ l2arc_write_eligible(uint64_t spa_guid, 
 	 * 3. has an I/O in progress (it may be an incomplete read).
 	 * 4. is flagged not eligible (zfs property).
 	 */
-	if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
-	    HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
+	if (hdr->b_spa != spa_guid) {
+		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
+		return (B_FALSE);
+	}
+	if (HDR_HAS_L2HDR(hdr)) {
+		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
+		return (B_FALSE);
+	}
+	if (HDR_IO_IN_PROGRESS(hdr)) {
+		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
+		return (B_FALSE);
+	}
+	if (!HDR_L2CACHE(hdr)) {
+		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
 		return (B_FALSE);
+	}
 
 	return (B_TRUE);
 }
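
Rewriting the old compound conditional as early returns lets each rejection reason bump its own arcstat, so a cold L2ARC can be diagnosed from the counters alone. The pattern in miniature (model types and counters, not the real arcstat machinery):

#include <stdbool.h>
#include <stdint.h>

static uint64_t rej_spa, rej_in_l2, rej_io, rej_not_cacheable;

struct model_hdr {
	uint64_t spa;
	bool in_l2, io_in_progress, l2cache;
};

/* Model of l2arc_write_eligible(): one counter per rejection reason. */
static bool
write_eligible(const struct model_hdr *h, uint64_t spa_guid)
{
	if (h->spa != spa_guid) {
		rej_spa++;		/* wrong pool */
		return (false);
	}
	if (h->in_l2) {
		rej_in_l2++;		/* already cached in L2ARC */
		return (false);
	}
	if (h->io_in_progress) {
		rej_io++;		/* possibly incomplete read */
		return (false);
	}
	if (!h->l2cache) {
		rej_not_cacheable++;	/* flagged not eligible */
		return (false);
	}
	return (true);
}
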
 
 static uint64_t
-l2arc_write_size(l2arc_dev_t *dev)
+l2arc_write_size(void)
 {
 	uint64_t size;
 
-	size = dev->l2ad_write;
+	/*
+	 * Make sure our globals have meaningful values in case the user
+	 * altered them.
+	 */
+	size = l2arc_write_max;
+	if (size == 0) {
+		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
+		    "be greater than zero, resetting it to the default (%d)",
+		    L2ARC_WRITE_SIZE);
+		size = l2arc_write_max = L2ARC_WRITE_SIZE;
+	}
 
 	if (arc_warm == B_FALSE)
-		size += dev->l2ad_boost;
+		size += l2arc_write_boost;
 
 	return (size);
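
Assuming the stock defaults (upstream defines L2ARC_WRITE_SIZE as 8 MiB and uses the same value for l2arc_write_boost), a feed cycle before arc_warm is set therefore targets 8 MiB + 8 MiB = 16 MiB:

	/* Illustrative: 8 MiB defaults, device not yet warm. */
	uint64_t write_max = 8ULL << 20, write_boost = 8ULL << 20;
	int warm = 0;
	uint64_t target = write_max + (warm ? 0 : write_boost);	/* 16 MiB */
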
 
@@ -3852,20 +6550,6 @@ l2arc_write_interval(clock_t began, uint
 	return (next);
 }
 
-static void
-l2arc_hdr_stat_add(void)
-{
-	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
-	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
-}
-
-static void
-l2arc_hdr_stat_remove(void)
-{
-	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
-	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
-}
-
 /*
  * Cycle through L2ARC devices.  This is how L2ARC load balances.
  * If a device is returned, this also returns holding the spa config lock.
@@ -3941,9 +6625,13 @@ l2arc_do_free_on_write()
 
 	for (df = list_tail(buflist); df; df = df_prev) {
 		df_prev = list_prev(buflist, df);
-		ASSERT(df->l2df_data != NULL);
-		ASSERT(df->l2df_func != NULL);
-		df->l2df_func(df->l2df_data, df->l2df_size);
+		ASSERT3P(df->l2df_data, !=, NULL);
+		if (df->l2df_type == ARC_BUFC_METADATA) {
+			zio_buf_free(df->l2df_data, df->l2df_size);
+		} else {
+			ASSERT(df->l2df_type == ARC_BUFC_DATA);
+			zio_data_buf_free(df->l2df_data, df->l2df_size);
+		}
 		list_remove(buflist, df);
 		kmem_free(df, sizeof (l2arc_data_free_t));
 	}
@@ -3961,66 +6649,107 @@ l2arc_write_done(zio_t *zio)
 	l2arc_write_callback_t *cb;
 	l2arc_dev_t *dev;
 	list_t *buflist;
-	arc_buf_hdr_t *head, *ab, *ab_prev;
-	l2arc_buf_hdr_t *abl2;
+	arc_buf_hdr_t *head, *hdr, *hdr_prev;
 	kmutex_t *hash_lock;
+	int64_t bytes_dropped = 0;
 
 	cb = zio->io_private;
-	ASSERT(cb != NULL);
+	ASSERT3P(cb, !=, NULL);
 	dev = cb->l2wcb_dev;
-	ASSERT(dev != NULL);
+	ASSERT3P(dev, !=, NULL);
 	head = cb->l2wcb_head;
-	ASSERT(head != NULL);
-	buflist = dev->l2ad_buflist;
-	ASSERT(buflist != NULL);
+	ASSERT3P(head, !=, NULL);
+	buflist = &dev->l2ad_buflist;
+	ASSERT3P(buflist, !=, NULL);
 	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
 	    l2arc_write_callback_t *, cb);
 
 	if (zio->io_error != 0)
 		ARCSTAT_BUMP(arcstat_l2_writes_error);
 
-	mutex_enter(&l2arc_buflist_mtx);
-
 	/*
 	 * All writes completed, or an error was hit.
 	 */
-	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
-		ab_prev = list_prev(buflist, ab);
+top:
+	mutex_enter(&dev->l2ad_mtx);
+	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
+		hdr_prev = list_prev(buflist, hdr);
+
+		hash_lock = HDR_LOCK(hdr);
 
-		hash_lock = HDR_LOCK(ab);
+		/*
+		 * We cannot use mutex_enter or else we can deadlock
+		 * with l2arc_write_buffers() (due to swapping the order
+		 * in which the hash lock and l2ad_mtx are taken).
+		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
-			 * This buffer misses out.  It may be in a stage
-			 * of eviction.  Its ARC_L2_WRITING flag will be
-			 * left set, denying reads to this buffer.
+			 * Missed the hash lock. We must retry so we
+			 * don't leave the ARC_FLAG_L2_WRITING bit set.
 			 */
-			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
-			continue;
+			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
+
+			/*
+			 * We don't want to rescan the headers we've
+			 * already marked as having been written out, so
+			 * we reinsert the head node so we can pick up
+			 * where we left off.
+			 */
+			list_remove(buflist, head);
+			list_insert_after(buflist, hdr, head);
+
+			mutex_exit(&dev->l2ad_mtx);
+
+			/*
+			 * We wait for the hash lock to become available
+			 * to try to prevent busy waiting, and to increase
+			 * the chance we'll be able to acquire the lock
+			 * the next time around.
+			 */
+			mutex_enter(hash_lock);
+			mutex_exit(hash_lock);
+			goto top;
 		}
 
+		/*
+		 * We could not have been moved into the arc_l2c_only
+		 * state while in-flight due to our ARC_FLAG_L2_WRITING
+		 * bit being set. Let's just ensure that's being enforced.
+		 */
+		ASSERT(HDR_HAS_L1HDR(hdr));
+
 		if (zio->io_error != 0) {
 			/*
 			 * Error - drop L2ARC entry.
 			 */
-			list_remove(buflist, ab);
-			abl2 = ab->b_l2hdr;
-			ab->b_l2hdr = NULL;
-			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
-			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+			list_remove(buflist, hdr);
+			l2arc_trim(hdr);
+			arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+
+			ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr));
+			ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr));
+
+			bytes_dropped += arc_hdr_size(hdr);
+			(void) refcount_remove_many(&dev->l2ad_alloc,
+			    arc_hdr_size(hdr), hdr);
 		}
 
 		/*
-		 * Allow ARC to begin reads to this L2ARC entry.
+		 * Allow ARC to begin reads and ghost list evictions to
+		 * this L2ARC entry.
 		 */
-		ab->b_flags &= ~ARC_L2_WRITING;
+		arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
 
 		mutex_exit(hash_lock);
 	}
 
 	atomic_inc_64(&l2arc_writes_done);
 	list_remove(buflist, head);
-	kmem_cache_free(hdr_cache, head);
-	mutex_exit(&l2arc_buflist_mtx);
+	ASSERT(!HDR_HAS_L1HDR(head));
+	kmem_cache_free(hdr_l2only_cache, head);
+	mutex_exit(&dev->l2ad_mtx);
+
+	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
 
 	l2arc_do_free_on_write();
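
The try-lock-and-retry dance above is the standard escape from a lock-order inversion: l2arc_write_buffers() takes a hash lock and then l2ad_mtx, so this path must never block on a hash lock while holding l2ad_mtx. A stripped-down pthreads model of the pattern:

#include <pthread.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Never block on hash_lock while holding list_lock, since other threads
 * take them in the opposite order. On a failed try-lock, drop list_lock,
 * wait for hash_lock to become free, and restart the scan.
 */
static void
locked_work(void (*work)(void))
{
top:
	pthread_mutex_lock(&list_lock);
	if (pthread_mutex_trylock(&hash_lock) != 0) {
		pthread_mutex_unlock(&list_lock);
		/* Take and release hash_lock briefly to avoid spinning. */
		pthread_mutex_lock(&hash_lock);
		pthread_mutex_unlock(&hash_lock);
		goto top;
	}
	work();
	pthread_mutex_unlock(&hash_lock);
	pthread_mutex_unlock(&list_lock);
}
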
 
@@ -4036,34 +6765,63 @@ l2arc_read_done(zio_t *zio)
 {
 	l2arc_read_callback_t *cb;
 	arc_buf_hdr_t *hdr;
-	arc_buf_t *buf;
 	kmutex_t *hash_lock;
-	int equal;
+	boolean_t valid_cksum;
 
-	ASSERT(zio->io_vd != NULL);
+	ASSERT3P(zio->io_vd, !=, NULL);
 	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
 
 	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
 
 	cb = zio->io_private;
-	ASSERT(cb != NULL);
-	buf = cb->l2rcb_buf;
-	ASSERT(buf != NULL);
-	hdr = buf->b_hdr;
-	ASSERT(hdr != NULL);
+	ASSERT3P(cb, !=, NULL);
+	hdr = cb->l2rcb_hdr;
+	ASSERT3P(hdr, !=, NULL);
 
 	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
+	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
+	/*
+	 * If the data was read into a temporary buffer,
+	 * move it and free the buffer.
+	 */
+	if (cb->l2rcb_data != NULL) {
+		ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
+		if (zio->io_error == 0) {
+			bcopy(cb->l2rcb_data, hdr->b_l1hdr.b_pdata,
+			    arc_hdr_size(hdr));
+		}
+
+		/*
+		 * The following must be done regardless of whether
+		 * there was an error:
+		 * - free the temporary buffer
+		 * - point zio to the real ARC buffer
+		 * - set zio size accordingly
+		 * These are required because the zio is either re-used
+		 * to re-issue the I/O for the block on error, or passed
+		 * to arc_read_done(), which needs the real data.
+		 */
+		zio_data_buf_free(cb->l2rcb_data, zio->io_size);
+		zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
+		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_pdata;
+	}
+
+	ASSERT3P(zio->io_data, !=, NULL);
 
 	/*
 	 * Check this survived the L2ARC journey.
 	 */
-	equal = arc_cksum_equal(buf);
-	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
+	ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata);
+	zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
+	zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
+
+	valid_cksum = arc_cksum_is_equal(hdr, zio);
+	if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
 		mutex_exit(hash_lock);
-		zio->io_private = buf;
-		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
-		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
+		zio->io_private = hdr;
 		arc_read_done(zio);
 	} else {
 		mutex_exit(hash_lock);
@@ -4074,9 +6832,9 @@ l2arc_read_done(zio_t *zio)
 		if (zio->io_error != 0) {
 			ARCSTAT_BUMP(arcstat_l2_io_error);
 		} else {
-			zio->io_error = EIO;
+			zio->io_error = SET_ERROR(EIO);
 		}
-		if (!equal)
+		if (!valid_cksum)
 			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
 
 		/*
@@ -4089,9 +6847,10 @@ l2arc_read_done(zio_t *zio)
 
 			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
 
-			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
-			    buf->b_data, zio->io_size, arc_read_done, buf,
-			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
+			zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
+			    hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done,
+			    hdr, zio->io_priority, cb->l2rcb_flags,
+			    &cb->l2rcb_zb));
 		}
 	}
 
@@ -4108,35 +6867,37 @@ l2arc_read_done(zio_t *zio)
  * the data lists.  This function returns a locked list, and also returns
  * the lock pointer.
  */
-static list_t *
-l2arc_list_locked(int list_num, kmutex_t **lock)
+static multilist_sublist_t *
+l2arc_sublist_lock(int list_num)
 {
-	list_t *list;
+	multilist_t *ml = NULL;
+	unsigned int idx;
 
 	ASSERT(list_num >= 0 && list_num <= 3);
 
 	switch (list_num) {
 	case 0:
-		list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
-		*lock = &arc_mfu->arcs_mtx;
+		ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
 		break;
 	case 1:
-		list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
-		*lock = &arc_mru->arcs_mtx;
+		ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
 		break;
 	case 2:
-		list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
-		*lock = &arc_mfu->arcs_mtx;
+		ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
 		break;
 	case 3:
-		list = &arc_mru->arcs_list[ARC_BUFC_DATA];
-		*lock = &arc_mru->arcs_mtx;
+		ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
 		break;
 	}
 
-	ASSERT(!(MUTEX_HELD(*lock)));
-	mutex_enter(*lock);
-	return (list);
+	/*
+	 * Return a randomly-selected sublist. This is acceptable
+	 * because the caller feeds only a little bit of data for each
+	 * call (8MB). Subsequent calls will result in different
+	 * sublists being selected.
+	 */
+	idx = multilist_get_random_index(ml);
+	return (multilist_sublist_lock(ml, idx));
 }
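
A caller holds the returned sublist's lock until it is done iterating; schematically (assuming multilist_sublist_prev() and multilist_sublist_unlock() from the upstream multilist API, alongside the head/tail accessors used later in this patch):

	multilist_sublist_t *mls = l2arc_sublist_lock(try);
	for (hdr = multilist_sublist_tail(mls); hdr != NULL;
	    hdr = multilist_sublist_prev(mls, hdr)) {
		/* consider hdr for an L2ARC write ... */
	}
	multilist_sublist_unlock(mls);
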
 
 /*
@@ -4149,15 +6910,11 @@ static void
 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
 {
 	list_t *buflist;
-	l2arc_buf_hdr_t *abl2;
-	arc_buf_hdr_t *ab, *ab_prev;
+	arc_buf_hdr_t *hdr, *hdr_prev;
 	kmutex_t *hash_lock;
 	uint64_t taddr;
 
-	buflist = dev->l2ad_buflist;
-
-	if (buflist == NULL)
-		return;
+	buflist = &dev->l2ad_buflist;
 
 	if (!all && dev->l2ad_first) {
 		/*
@@ -4180,35 +6937,41 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t d
 	    uint64_t, taddr, boolean_t, all);
 
 top:
-	mutex_enter(&l2arc_buflist_mtx);
-	for (ab = list_tail(buflist); ab; ab = ab_prev) {
-		ab_prev = list_prev(buflist, ab);
+	mutex_enter(&dev->l2ad_mtx);
+	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
+		hdr_prev = list_prev(buflist, hdr);
+
+		hash_lock = HDR_LOCK(hdr);
 
-		hash_lock = HDR_LOCK(ab);
+		/*
+		 * We cannot use mutex_enter or else we can deadlock
+		 * with l2arc_write_buffers() (due to swapping the order
+		 * in which the hash lock and l2ad_mtx are taken).
+		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
 			 * Missed the hash lock.  Retry.
 			 */
 			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
-			mutex_exit(&l2arc_buflist_mtx);
+			mutex_exit(&dev->l2ad_mtx);
 			mutex_enter(hash_lock);
 			mutex_exit(hash_lock);
 			goto top;
 		}
 
-		if (HDR_L2_WRITE_HEAD(ab)) {
+		if (HDR_L2_WRITE_HEAD(hdr)) {
 			/*
 			 * We hit a write head node.  Leave it for
 			 * l2arc_write_done().
 			 */
-			list_remove(buflist, ab);
+			list_remove(buflist, hdr);
 			mutex_exit(hash_lock);
 			continue;
 		}
 
-		if (!all && ab->b_l2hdr != NULL &&
-		    (ab->b_l2hdr->b_daddr > taddr ||
-		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
+		if (!all && HDR_HAS_L2HDR(hdr) &&
+		    (hdr->b_l2hdr.b_daddr >= taddr ||
+		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
 			/*
 			 * We've evicted to the target address,
 			 * or the end of the device.
@@ -4217,94 +6980,78 @@ top:
 			break;
 		}
 
-		if (HDR_FREE_IN_PROGRESS(ab)) {
-			/*
-			 * Already on the path to destruction.
-			 */
-			mutex_exit(hash_lock);
-			continue;
-		}
-
-		if (ab->b_state == arc_l2c_only) {
-			ASSERT(!HDR_L2_READING(ab));
+		ASSERT(HDR_HAS_L2HDR(hdr));
+		if (!HDR_HAS_L1HDR(hdr)) {
+			ASSERT(!HDR_L2_READING(hdr));
 			/*
 			 * This doesn't exist in the ARC.  Destroy.
 			 * arc_hdr_destroy() will call list_remove()
 			 * and decrement arcstat_l2_size.
 			 */
-			arc_change_state(arc_anon, ab, hash_lock);
-			arc_hdr_destroy(ab);
+			arc_change_state(arc_anon, hdr, hash_lock);
+			arc_hdr_destroy(hdr);
 		} else {
+			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
+			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
 			/*
 			 * Invalidate issued or about to be issued
 			 * reads, since we may be about to write
 			 * over this location.
 			 */
-			if (HDR_L2_READING(ab)) {
+			if (HDR_L2_READING(hdr)) {
 				ARCSTAT_BUMP(arcstat_l2_evict_reading);
-				ab->b_flags |= ARC_L2_EVICTED;
+				arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
 			}
 
-			/*
-			 * Tell ARC this no longer exists in L2ARC.
-			 */
-			if (ab->b_l2hdr != NULL) {
-				abl2 = ab->b_l2hdr;
-				ab->b_l2hdr = NULL;
-				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
-				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
-			}
-			list_remove(buflist, ab);
+			/* Ensure this header has finished being written */
+			ASSERT(!HDR_L2_WRITING(hdr));
 
-			/*
-			 * This may have been leftover after a
-			 * failed write.
-			 */
-			ab->b_flags &= ~ARC_L2_WRITING;
+			arc_hdr_l2hdr_destroy(hdr);
 		}
 		mutex_exit(hash_lock);
 	}
-	mutex_exit(&l2arc_buflist_mtx);
-
-	vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
-	dev->l2ad_evict = taddr;
+	mutex_exit(&dev->l2ad_mtx);
 }
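
[editor's note] The mutex_tryenter() above is the crux of the new locking scheme: l2arc_write_buffers() takes a hash lock before l2ad_mtx, so this function, which already holds l2ad_mtx, must never block on a hash lock. A condensed, hedged sketch of the idiom (a fragment of the loop above, not standalone code):

	top:
		mutex_enter(&dev->l2ad_mtx);
		/* ... walk dev->l2ad_buflist from the tail ... */
		if (!mutex_tryenter(hash_lock)) {
			mutex_exit(&dev->l2ad_mtx);	/* drop our lock to break the cycle */
			mutex_enter(hash_lock);		/* wait out the current holder */
			mutex_exit(hash_lock);
			goto top;			/* rescan under l2ad_mtx */
		}
		/* reversed lock order is safe: hash_lock was taken non-blocking */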
 
 /*
  * Find and write ARC buffers to the L2ARC device.
  *
- * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
  * for reading until they have completed writing.
+ *
+ * Returns the number of bytes actually written (which may be smaller than
+ * the delta by which the device hand has changed due to alignment).
  */
 static uint64_t
 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 {
-	arc_buf_hdr_t *ab, *ab_prev, *head;
-	l2arc_buf_hdr_t *hdrl2;
-	list_t *list;
-	uint64_t passed_sz, write_sz, buf_sz, headroom;
-	void *buf_data;
-	kmutex_t *hash_lock, *list_lock;
-	boolean_t have_lock, full;
+	arc_buf_hdr_t *hdr, *hdr_prev, *head;
+	uint64_t write_asize, write_psize, write_sz, headroom;
+	boolean_t full;
 	l2arc_write_callback_t *cb;
 	zio_t *pio, *wzio;
-	uint64_t guid = spa_guid(spa);
+	uint64_t guid = spa_load_guid(spa);
+	int try;
 
-	ASSERT(dev->l2ad_vdev != NULL);
+	ASSERT3P(dev->l2ad_vdev, !=, NULL);
 
 	pio = NULL;
-	write_sz = 0;
+	write_sz = write_asize = write_psize = 0;
 	full = B_FALSE;
-	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
-	head->b_flags |= ARC_L2_WRITE_HEAD;
+	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
+	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
 
+	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
 	/*
 	 * Copy buffers for L2ARC writing.
 	 */
-	mutex_enter(&l2arc_buflist_mtx);
-	for (int try = 0; try <= 3; try++) {
-		list = l2arc_list_locked(try, &list_lock);
-		passed_sz = 0;
+	for (try = 0; try <= 3; try++) {
+		multilist_sublist_t *mls = l2arc_sublist_lock(try);
+		uint64_t passed_sz = 0;
+
+		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
 
 		/*
 		 * L2ARC fast warmup.
@@ -4312,44 +7059,70 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 		 * Until the ARC is warm and starts to evict, read from the
 		 * head of the ARC lists rather than the tail.
 		 */
-		headroom = target_sz * l2arc_headroom;
 		if (arc_warm == B_FALSE)
-			ab = list_head(list);
+			hdr = multilist_sublist_head(mls);
 		else
-			ab = list_tail(list);
+			hdr = multilist_sublist_tail(mls);
+		if (hdr == NULL)
+			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
+
+		headroom = target_sz * l2arc_headroom;
+		if (zfs_compressed_arc_enabled)
+			headroom = (headroom * l2arc_headroom_boost) / 100;
+
+		for (; hdr; hdr = hdr_prev) {
+			kmutex_t *hash_lock;
 
-		for (; ab; ab = ab_prev) {
 			if (arc_warm == B_FALSE)
-				ab_prev = list_next(list, ab);
+				hdr_prev = multilist_sublist_next(mls, hdr);
 			else
-				ab_prev = list_prev(list, ab);
-
-			hash_lock = HDR_LOCK(ab);
-			have_lock = MUTEX_HELD(hash_lock);
-			if (!have_lock && !mutex_tryenter(hash_lock)) {
+				hdr_prev = multilist_sublist_prev(mls, hdr);
+			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
+			    HDR_GET_LSIZE(hdr));
+
+			hash_lock = HDR_LOCK(hdr);
+			if (!mutex_tryenter(hash_lock)) {
+				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
 				/*
 				 * Skip this buffer rather than waiting.
 				 */
 				continue;
 			}
 
-			passed_sz += ab->b_size;
+			passed_sz += HDR_GET_LSIZE(hdr);
 			if (passed_sz > headroom) {
 				/*
 				 * Searched too far.
 				 */
 				mutex_exit(hash_lock);
+				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
 				break;
 			}
 
-			if (!l2arc_write_eligible(guid, ab)) {
+			if (!l2arc_write_eligible(guid, hdr)) {
 				mutex_exit(hash_lock);
 				continue;
 			}
 
-			if ((write_sz + ab->b_size) > target_sz) {
+			/*
+			 * We rely on the L1 portion of the header below, so
+			 * it's invalid for this header to have been evicted out
+			 * of the ghost cache, prior to being written out. The
+			 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+			 */
+			ASSERT(HDR_HAS_L1HDR(hdr));
+
+			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
+			ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+			ASSERT3U(arc_hdr_size(hdr), >, 0);
+			uint64_t size = arc_hdr_size(hdr);
+			uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
+			    size);
+
+			if ((write_psize + asize) > target_sz) {
 				full = B_TRUE;
 				mutex_exit(hash_lock);
+				ARCSTAT_BUMP(arcstat_l2_write_full);
 				break;
 			}
 
@@ -4359,7 +7132,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 				 * l2arc_write_done() can find where the
 				 * write buffers begin without searching.
 				 */
-				list_insert_head(dev->l2ad_buflist, head);
+				mutex_enter(&dev->l2ad_mtx);
+				list_insert_head(&dev->l2ad_buflist, head);
+				mutex_exit(&dev->l2ad_mtx);
 
 				cb = kmem_alloc(
 				    sizeof (l2arc_write_callback_t), KM_SLEEP);
@@ -4367,76 +7142,92 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 				cb->l2wcb_head = head;
 				pio = zio_root(spa, l2arc_write_done, cb,
 				    ZIO_FLAG_CANFAIL);
+				ARCSTAT_BUMP(arcstat_l2_write_pios);
 			}
 
-			/*
-			 * Create and add a new L2ARC header.
-			 */
-			hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
-			hdrl2->b_dev = dev;
-			hdrl2->b_daddr = dev->l2ad_hand;
-
-			ab->b_flags |= ARC_L2_WRITING;
-			ab->b_l2hdr = hdrl2;
-			list_insert_head(dev->l2ad_buflist, ab);
-			buf_data = ab->b_buf->b_data;
-			buf_sz = ab->b_size;
+			hdr->b_l2hdr.b_dev = dev;
+			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+			arc_hdr_set_flags(hdr,
+			    ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);
+
+			mutex_enter(&dev->l2ad_mtx);
+			list_insert_head(&dev->l2ad_buflist, hdr);
+			mutex_exit(&dev->l2ad_mtx);
+
+			(void) refcount_add_many(&dev->l2ad_alloc, size, hdr);
 
 			/*
-			 * Compute and store the buffer cksum before
-			 * writing.  On debug the cksum is verified first.
+			 * Normally the L2ARC can use the hdr's data, but if
+			 * we're sharing data between the hdr and one of its
+			 * bufs, L2ARC needs its own copy of the data so that
+			 * the ZIO below can't race with the buf consumer. To
+			 * ensure that this copy will be available for the
+			 * lifetime of the ZIO and be cleaned up afterwards, we
+			 * add it to the l2arc_free_on_write queue.
 			 */
-			arc_cksum_verify(ab->b_buf);
-			arc_cksum_compute(ab->b_buf, B_TRUE);
-
-			mutex_exit(hash_lock);
+			void *to_write;
+			if (!HDR_SHARED_DATA(hdr) && size == asize) {
+				to_write = hdr->b_l1hdr.b_pdata;
+			} else {
+				arc_buf_contents_t type = arc_buf_type(hdr);
+				if (type == ARC_BUFC_METADATA) {
+					to_write = zio_buf_alloc(asize);
+				} else {
+					ASSERT3U(type, ==, ARC_BUFC_DATA);
+					to_write = zio_data_buf_alloc(asize);
+				}
 
+				bcopy(hdr->b_l1hdr.b_pdata, to_write, size);
+				if (asize != size)
+					bzero(to_write + size, asize - size);
+				l2arc_free_data_on_write(to_write, asize, type);
+			}
 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
-			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
-			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
+			    hdr->b_l2hdr.b_daddr, asize, to_write,
+			    ZIO_CHECKSUM_OFF, NULL, hdr,
+			    ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_CANFAIL, B_FALSE);
 
+			write_sz += HDR_GET_LSIZE(hdr);
 			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
 			    zio_t *, wzio);
-			(void) zio_nowait(wzio);
 
-			/*
-			 * Keep the clock hand suitably device-aligned.
-			 */
-			buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+			write_asize += size;
+			write_psize += asize;
+			dev->l2ad_hand += asize;
+
+			mutex_exit(hash_lock);
 
-			write_sz += buf_sz;
-			dev->l2ad_hand += buf_sz;
+			(void) zio_nowait(wzio);
 		}
 
-		mutex_exit(list_lock);
+		multilist_sublist_unlock(mls);
 
 		if (full == B_TRUE)
 			break;
 	}
-	mutex_exit(&l2arc_buflist_mtx);
 
+	/* No buffers selected for writing? */
 	if (pio == NULL) {
-		ASSERT3U(write_sz, ==, 0);
-		kmem_cache_free(hdr_cache, head);
+		ASSERT0(write_sz);
+		ASSERT(!HDR_HAS_L1HDR(head));
+		kmem_cache_free(hdr_l2only_cache, head);
 		return (0);
 	}
 
-	ASSERT3U(write_sz, <=, target_sz);
+	ASSERT3U(write_psize, <=, target_sz);
 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
-	ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
+	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
 	ARCSTAT_INCR(arcstat_l2_size, write_sz);
-	vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
+	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
+	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
 
 	/*
 	 * Bump device hand to the device start if it is approaching the end.
 	 * l2arc_evict() will already have evicted ahead for this case.
 	 */
 	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
-		vdev_space_update(dev->l2ad_vdev,
-		    dev->l2ad_end - dev->l2ad_hand, 0, 0);
 		dev->l2ad_hand = dev->l2ad_start;
-		dev->l2ad_evict = dev->l2ad_start;
 		dev->l2ad_first = B_FALSE;
 	}
 
@@ -4444,7 +7235,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 	(void) zio_wait(pio);
 	dev->l2ad_writing = B_FALSE;
 
-	return (write_sz);
+	return (write_asize);
 }
 
 /*
@@ -4452,13 +7243,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
  * heart of the L2ARC.
  */
 static void
-l2arc_feed_thread(void *unused __unused)
+l2arc_feed_thread(void *dummy __unused)
 {
 	callb_cpr_t cpr;
 	l2arc_dev_t *dev;
 	spa_t *spa;
 	uint64_t size, wrote;
-	clock_t begin, next = ddi_get_lbolt();
+	clock_t begin, next = ddi_get_lbolt() + hz;
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
@@ -4467,9 +7258,9 @@ l2arc_feed_thread(void *unused __unused)
 	while (l2arc_thread_exit == 0) {
 		CALLB_CPR_SAFE_BEGIN(&cpr);
 		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
-		    (hz * l2arc_feed_secs));
+		    next - ddi_get_lbolt());
 		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
-		next = ddi_get_lbolt();
+		next = ddi_get_lbolt() + hz;
 
 		/*
 		 * Quick check for L2ARC devices.
@@ -4496,7 +7287,17 @@ l2arc_feed_thread(void *unused __unused)
 			continue;
 
 		spa = dev->l2ad_spa;
-		ASSERT(spa != NULL);
+		ASSERT3P(spa, !=, NULL);
+
+		/*
+		 * If the pool is read-only then force the feed thread to
+		 * sleep a little longer.
+		 */
+		if (!spa_writeable(spa)) {
+			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
+			spa_config_exit(spa, SCL_L2ARC, dev);
+			continue;
+		}
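
[editor's note] Two scheduling changes land in this function: the thread now sleeps toward an absolute deadline (next = lbolt + hz) instead of a relative hz * l2arc_feed_secs timeout, and a read-only pool pushes that deadline out fivefold. A condensed, hedged sketch of the deadline-style wait (cv and lock names match the code above; the feed pass is elided):

	clock_t next = ddi_get_lbolt() + hz;

	for (;;) {
		/* sleep only for whatever remains until the deadline */
		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
		    next - ddi_get_lbolt());
		next = ddi_get_lbolt() + hz;	/* rearm for the next cycle */
		/* ... one feed pass ... */
	}

Because the wait is computed from the remaining ticks, an early wakeup does not restart the full interval the way a relative timeout would.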
 
 		/*
 		 * Avoid contributing to memory pressure.
@@ -4509,7 +7310,7 @@ l2arc_feed_thread(void *unused __unused)
 
 		ARCSTAT_BUMP(arcstat_l2_feeds);
 
-		size = l2arc_write_size(dev);
+		size = l2arc_write_size();
 
 		/*
 		 * Evict L2ARC buffers that will be overwritten.
@@ -4561,31 +7362,30 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 
 	ASSERT(!l2arc_vdev_present(vd));
 
+	vdev_ashift_optimize(vd);
+
 	/*
 	 * Create a new l2arc device entry.
 	 */
 	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
 	adddev->l2ad_spa = spa;
 	adddev->l2ad_vdev = vd;
-	adddev->l2ad_write = l2arc_write_max;
-	adddev->l2ad_boost = l2arc_write_boost;
 	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
 	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
 	adddev->l2ad_hand = adddev->l2ad_start;
-	adddev->l2ad_evict = adddev->l2ad_start;
 	adddev->l2ad_first = B_TRUE;
 	adddev->l2ad_writing = B_FALSE;
-	ASSERT3U(adddev->l2ad_write, >, 0);
 
+	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
 	/*
 	 * This is a list of all ARC buffers that are still valid on the
 	 * device.
 	 */
-	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
-	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
-	    offsetof(arc_buf_hdr_t, b_l2node));
+	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
 
 	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
+	refcount_create(&adddev->l2ad_alloc);
 
 	/*
 	 * Add device to global list
@@ -4615,7 +7415,7 @@ l2arc_remove_vdev(vdev_t *vd)
 			break;
 		}
 	}
-	ASSERT(remdev != NULL);
+	ASSERT3P(remdev, !=, NULL);
 
 	/*
 	 * Remove device from global list
@@ -4629,8 +7429,9 @@ l2arc_remove_vdev(vdev_t *vd)
 	 * Clear all buflists and ARC references.  L2ARC device flush.
 	 */
 	l2arc_evict(remdev, 0, B_TRUE);
-	list_destroy(remdev->l2ad_buflist);
-	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
+	list_destroy(&remdev->l2ad_buflist);
+	mutex_destroy(&remdev->l2ad_mtx);
+	refcount_destroy(&remdev->l2ad_alloc);
 	kmem_free(remdev, sizeof (l2arc_dev_t));
 }
 
@@ -4645,7 +7446,6 @@ l2arc_init(void)
 	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	l2arc_dev_list = &L2ARC_dev_list;
@@ -4670,7 +7470,6 @@ l2arc_fini(void)
 	mutex_destroy(&l2arc_feed_thr_lock);
 	cv_destroy(&l2arc_feed_thr_cv);
 	mutex_destroy(&l2arc_dev_mtx);
-	mutex_destroy(&l2arc_buflist_mtx);
 	mutex_destroy(&l2arc_free_on_write_mtx);
 
 	list_destroy(l2arc_dev_list);
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/blkptr.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/blkptr.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/blkptr.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/blkptr.c	17 Jul 2014 16:23:18 -0000
@@ -0,0 +1,119 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Embedded-data Block Pointers
+ *
+ * Normally, block pointers point (via their DVAs) to a block which holds data.
+ * If the data that we need to store is very small, this is an inefficient
+ * use of space, because a block must be at minimum 1 sector (typically 512
+ * bytes or 4KB).  Additionally, reading these small blocks tends to generate
+ * more random reads.
+ *
+ * Embedded-data Block Pointers allow small pieces of data (the "payload",
+ * up to 112 bytes) to be stored in the block pointer itself, instead of
+ * being pointed to.  The "Pointer" part of this name is a bit of a
+ * misnomer, as nothing is pointed to.
+ *
+ * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to
+ * be embedded in the block pointer.  The logic for this is handled in
+ * the SPA, by the zio pipeline.  Therefore most code outside the zio
+ * pipeline doesn't need special-cases to handle these block pointers.
+ *
+ * See spa.h for details on the exact layout of embedded block pointers.
+ */
+
+void
+encode_embedded_bp_compressed(blkptr_t *bp, void *data,
+    enum zio_compress comp, int uncompressed_size, int compressed_size)
+{
+	uint64_t *bp64 = (uint64_t *)bp;
+	uint64_t w = 0;
+	uint8_t *data8 = data;
+
+	ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE);
+	ASSERT(uncompressed_size == compressed_size ||
+	    comp != ZIO_COMPRESS_OFF);
+	ASSERT3U(comp, >=, ZIO_COMPRESS_OFF);
+	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+
+	bzero(bp, sizeof (*bp));
+	BP_SET_EMBEDDED(bp, B_TRUE);
+	BP_SET_COMPRESS(bp, comp);
+	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+	BPE_SET_LSIZE(bp, uncompressed_size);
+	BPE_SET_PSIZE(bp, compressed_size);
+
+	/*
+	 * Encode the byte array into the words of the block pointer.
+	 * First byte goes into low bits of first word (little endian).
+	 */
+	for (int i = 0; i < compressed_size; i++) {
+		BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]);
+		if (i % sizeof (w) == sizeof (w) - 1) {
+			/* we've reached the end of a word */
+			ASSERT3P(bp64, <, bp + 1);
+			*bp64 = w;
+			bp64++;
+			if (!BPE_IS_PAYLOADWORD(bp, bp64))
+				bp64++;
+			w = 0;
+		}
+	}
+	/* write last partial word */
+	if (bp64 < (uint64_t *)(bp + 1))
+		*bp64 = w;
+}
+
+/*
+ * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
+ * more than BPE_PAYLOAD_SIZE bytes).
+ */
+void
+decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
+{
+	int psize;
+	uint8_t *buf8 = buf;
+	uint64_t w = 0;
+	const uint64_t *bp64 = (const uint64_t *)bp;
+
+	ASSERT(BP_IS_EMBEDDED(bp));
+
+	psize = BPE_GET_PSIZE(bp);
+
+	/*
+	 * Decode the words of the block pointer into the byte array.
+	 * Low bits of first word are the first byte (little endian).
+	 */
+	for (int i = 0; i < psize; i++) {
+		if (i % sizeof (w) == 0) {
+			/* beginning of a word */
+			ASSERT3P(bp64, <, bp + 1);
+			w = *bp64;
+			bp64++;
+			if (!BPE_IS_PAYLOADWORD(bp, bp64))
+				bp64++;
+		}
+		buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
+	}
+}
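
[editor's note] Both loops are easier to follow once BF64_SET/BF64_GET are unfolded: payload byte i lives in bits (i % 8) * NBBY of word i / 8, little-endian first. A standalone round-trip sketch of that arithmetic (it omits the encode loop's hop over non-payload words for brevity):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	#define	NBBY	8	/* bits per byte, as in the system headers */

	int
	main(void)
	{
		uint8_t in[16], out[16];
		uint64_t words[2] = { 0, 0 };

		for (int i = 0; i < 16; i++)
			in[i] = (uint8_t)(3 * i + 1);	/* arbitrary payload */

		/* encode: byte i -> bits (i % 8) * NBBY of word i / 8 */
		for (int i = 0; i < 16; i++)
			words[i / 8] |= (uint64_t)in[i] << ((i % 8) * NBBY);

		/* decode: pull the same bit ranges back out */
		for (int i = 0; i < 16; i++)
			out[i] = (uint8_t)(words[i / 8] >> ((i % 8) * NBBY));

		printf("round trip %s\n", memcmp(in, out, 16) ? "FAILED" : "ok");
		return (0);
	}
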
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 bplist.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c	27 Feb 2010 22:30:41 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c	23 Mar 2013 15:29:23 -0000
@@ -19,351 +19,59 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #include <sys/bplist.h>
 #include <sys/zfs_context.h>
 
+
 void
-bplist_init(bplist_t *bpl)
+bplist_create(bplist_t *bpl)
 {
-	bzero(bpl, sizeof (*bpl));
 	mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&bpl->bpl_list, sizeof (bplist_entry_t),
+	    offsetof(bplist_entry_t, bpe_node));
 }
 
 void
-bplist_fini(bplist_t *bpl)
+bplist_destroy(bplist_t *bpl)
 {
-	ASSERT(bpl->bpl_queue == NULL);
+	list_destroy(&bpl->bpl_list);
 	mutex_destroy(&bpl->bpl_lock);
 }
 
-static int
-bplist_hold(bplist_t *bpl)
-{
-	ASSERT(MUTEX_HELD(&bpl->bpl_lock));
-	if (bpl->bpl_dbuf == NULL) {
-		int err = dmu_bonus_hold(bpl->bpl_mos,
-		    bpl->bpl_object, bpl, &bpl->bpl_dbuf);
-		if (err)
-			return (err);
-		bpl->bpl_phys = bpl->bpl_dbuf->db_data;
-	}
-	return (0);
-}
-
-uint64_t
-bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
-{
-	int size;
-
-	size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ?
-	    BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
-
-	return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
-	    DMU_OT_BPLIST_HDR, size, tx));
-}
-
 void
-bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+bplist_append(bplist_t *bpl, const blkptr_t *bp)
 {
-	VERIFY(dmu_object_free(mos, object, tx) == 0);
-}
-
-int
-bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
-{
-	dmu_object_info_t doi;
-	int err;
-
-	err = dmu_object_info(mos, object, &doi);
-	if (err)
-		return (err);
-
-	mutex_enter(&bpl->bpl_lock);
-
-	ASSERT(bpl->bpl_dbuf == NULL);
-	ASSERT(bpl->bpl_phys == NULL);
-	ASSERT(bpl->bpl_cached_dbuf == NULL);
-	ASSERT(bpl->bpl_queue == NULL);
-	ASSERT(object != 0);
-	ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
-	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
-
-	bpl->bpl_mos = mos;
-	bpl->bpl_object = object;
-	bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
-	bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
-	bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
-
-	mutex_exit(&bpl->bpl_lock);
-	return (0);
-}
+	bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP);
 
-void
-bplist_close(bplist_t *bpl)
-{
 	mutex_enter(&bpl->bpl_lock);
-
-	ASSERT(bpl->bpl_queue == NULL);
-
-	if (bpl->bpl_cached_dbuf) {
-		dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
-		bpl->bpl_cached_dbuf = NULL;
-	}
-	if (bpl->bpl_dbuf) {
-		dmu_buf_rele(bpl->bpl_dbuf, bpl);
-		bpl->bpl_dbuf = NULL;
-		bpl->bpl_phys = NULL;
-	}
-
+	bpe->bpe_blk = *bp;
+	list_insert_tail(&bpl->bpl_list, bpe);
 	mutex_exit(&bpl->bpl_lock);
 }
 
-boolean_t
-bplist_empty(bplist_t *bpl)
-{
-	boolean_t rv;
-
-	if (bpl->bpl_object == 0)
-		return (B_TRUE);
-
-	mutex_enter(&bpl->bpl_lock);
-	VERIFY(0 == bplist_hold(bpl)); /* XXX */
-	rv = (bpl->bpl_phys->bpl_entries == 0);
-	mutex_exit(&bpl->bpl_lock);
-
-	return (rv);
-}
-
-static int
-bplist_cache(bplist_t *bpl, uint64_t blkid)
-{
-	int err = 0;
-
-	if (bpl->bpl_cached_dbuf == NULL ||
-	    bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
-		if (bpl->bpl_cached_dbuf != NULL)
-			dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
-		err = dmu_buf_hold(bpl->bpl_mos,
-		    bpl->bpl_object, blkid << bpl->bpl_blockshift,
-		    bpl, &bpl->bpl_cached_dbuf);
-		ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
-		    1ULL << bpl->bpl_blockshift);
-	}
-	return (err);
-}
-
-int
-bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
-{
-	uint64_t blk, off;
-	blkptr_t *bparray;
-	int err;
-
-	mutex_enter(&bpl->bpl_lock);
-
-	err = bplist_hold(bpl);
-	if (err) {
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	if (*itorp >= bpl->bpl_phys->bpl_entries) {
-		mutex_exit(&bpl->bpl_lock);
-		return (ENOENT);
-	}
-
-	blk = *itorp >> bpl->bpl_bpshift;
-	off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
-
-	err = bplist_cache(bpl, blk);
-	if (err) {
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	bparray = bpl->bpl_cached_dbuf->db_data;
-	*bp = bparray[off];
-	(*itorp)++;
-	mutex_exit(&bpl->bpl_lock);
-	return (0);
-}
-
-int
-bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
-{
-	uint64_t blk, off;
-	blkptr_t *bparray;
-	int err;
-
-	ASSERT(!BP_IS_HOLE(bp));
-	mutex_enter(&bpl->bpl_lock);
-	err = bplist_hold(bpl);
-	if (err)
-		return (err);
-
-	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
-	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
-
-	err = bplist_cache(bpl, blk);
-	if (err) {
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
-	bparray = bpl->bpl_cached_dbuf->db_data;
-	bparray[off] = *bp;
-
-	/* We never need the fill count. */
-	bparray[off].blk_fill = 0;
-
-	/* The bplist will compress better if we can leave off the checksum */
-	if (!BP_GET_DEDUP(&bparray[off]))
-		bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
-
-	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
-	bpl->bpl_phys->bpl_entries++;
-	bpl->bpl_phys->bpl_bytes +=
-	    bp_get_dsize_sync(dmu_objset_spa(bpl->bpl_mos), bp);
-	if (bpl->bpl_havecomp) {
-		bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
-		bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
-	}
-	mutex_exit(&bpl->bpl_lock);
-
-	return (0);
-}
-
-void
-bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx)
-{
-	VERIFY(bplist_enqueue(bpl, bp, tx) == 0);
-}
-
 /*
- * Deferred entry; will be processed later by bplist_sync().
+ * To aid debugging, we keep the most recently removed entry.  This way if
+ * we are in the callback, we can easily locate the entry.
  */
-void
-bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
-{
-	bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
-
-	ASSERT(!BP_IS_HOLE(bp));
-	mutex_enter(&bpl->bpl_lock);
-	bpq->bpq_blk = *bp;
-	bpq->bpq_next = bpl->bpl_queue;
-	bpl->bpl_queue = bpq;
-	mutex_exit(&bpl->bpl_lock);
-}
+static bplist_entry_t *bplist_iterate_last_removed;
 
 void
-bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func, void *arg, dmu_tx_t *tx)
+bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
 {
-	bplist_q_t *bpq;
+	bplist_entry_t *bpe;
 
 	mutex_enter(&bpl->bpl_lock);
-	while ((bpq = bpl->bpl_queue) != NULL) {
-		bpl->bpl_queue = bpq->bpq_next;
+	while ((bpe = list_head(&bpl->bpl_list)) != NULL) {
+		bplist_iterate_last_removed = bpe;
+		list_remove(&bpl->bpl_list, bpe);
 		mutex_exit(&bpl->bpl_lock);
-		func(arg, &bpq->bpq_blk, tx);
-		kmem_free(bpq, sizeof (*bpq));
+		func(arg, &bpe->bpe_blk, tx);
+		kmem_free(bpe, sizeof (*bpe));
 		mutex_enter(&bpl->bpl_lock);
 	}
 	mutex_exit(&bpl->bpl_lock);
 }
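
[editor's note] The rewritten bplist is now a purely in-memory deferral queue: append from any context, drain in syncing context. A hedged usage sketch (the callback and wrapper names are hypothetical; the (arg, bp, tx) shape matches the func() call above, and bplist_itor_t is assumed to return int):

	/* Hypothetical bplist_itor_t callback: dispose of one deferred bp. */
	static int
	free_deferred_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		/* e.g. hand the bp to the pool's free path */
		return (0);
	}

	static void
	drain_deferred(bplist_t *bpl, dmu_tx_t *tx)
	{
		/* drops bpl_lock around each callback, as the loop above shows */
		bplist_iterate(bpl, free_deferred_cb, NULL, tx);
	}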
-
-void
-bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
-{
-	mutex_enter(&bpl->bpl_lock);
-	ASSERT3P(bpl->bpl_queue, ==, NULL);
-	VERIFY(0 == bplist_hold(bpl));
-	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
-	VERIFY(0 == dmu_free_range(bpl->bpl_mos,
-	    bpl->bpl_object, 0, -1ULL, tx));
-	bpl->bpl_phys->bpl_entries = 0;
-	bpl->bpl_phys->bpl_bytes = 0;
-	if (bpl->bpl_havecomp) {
-		bpl->bpl_phys->bpl_comp = 0;
-		bpl->bpl_phys->bpl_uncomp = 0;
-	}
-	mutex_exit(&bpl->bpl_lock);
-}
-
-int
-bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
-	int err;
-
-	mutex_enter(&bpl->bpl_lock);
-
-	err = bplist_hold(bpl);
-	if (err) {
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	*usedp = bpl->bpl_phys->bpl_bytes;
-	if (bpl->bpl_havecomp) {
-		*compp = bpl->bpl_phys->bpl_comp;
-		*uncompp = bpl->bpl_phys->bpl_uncomp;
-	}
-	mutex_exit(&bpl->bpl_lock);
-
-	if (!bpl->bpl_havecomp) {
-		uint64_t itor = 0, comp = 0, uncomp = 0;
-		blkptr_t bp;
-
-		while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
-			comp += BP_GET_PSIZE(&bp);
-			uncomp += BP_GET_UCSIZE(&bp);
-		}
-		if (err == ENOENT)
-			err = 0;
-		*compp = comp;
-		*uncompp = uncomp;
-	}
-
-	return (err);
-}
-
-/*
- * Return (in *dsizep) the amount of space on the deadlist which is:
- * mintxg < blk_birth <= maxtxg
- */
-int
-bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
-    uint64_t *dsizep)
-{
-	uint64_t size = 0;
-	uint64_t itor = 0;
-	blkptr_t bp;
-	int err;
-
-	/*
-	 * As an optimization, if they want the whole txg range, just
-	 * get bpl_bytes rather than iterating over the bps.
-	 */
-	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) {
-		mutex_enter(&bpl->bpl_lock);
-		err = bplist_hold(bpl);
-		if (err == 0)
-			*dsizep = bpl->bpl_phys->bpl_bytes;
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
-		if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
-			size += bp_get_dsize(dmu_objset_spa(bpl->bpl_mos), &bp);
-		}
-	}
-	if (err == ENOENT)
-		err = 0;
-	*dsizep = size;
-	return (err);
-}
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/bpobj.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/bpobj.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/bpobj.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/bpobj.c	27 Mar 2016 02:52:19 -0000
@@ -0,0 +1,592 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfeature.h>
+#include <sys/zap.h>
+
+/*
+ * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
+ */
+uint64_t
+bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_objset_spa(os);
+	dsl_pool_t *dp = dmu_objset_pool(os);
+
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
+		if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
+			ASSERT0(dp->dp_empty_bpobj);
+			dp->dp_empty_bpobj =
+			    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
+			VERIFY(zap_add(os,
+			    DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+			    &dp->dp_empty_bpobj, tx) == 0);
+		}
+		spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
+		ASSERT(dp->dp_empty_bpobj != 0);
+		return (dp->dp_empty_bpobj);
+	} else {
+		return (bpobj_alloc(os, blocksize, tx));
+	}
+}
+
+void
+bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = dmu_objset_pool(os);
+
+	spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
+	if (!spa_feature_is_active(dmu_objset_spa(os),
+	    SPA_FEATURE_EMPTY_BPOBJ)) {
+		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_EMPTY_BPOBJ, tx));
+		VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
+		dp->dp_empty_bpobj = 0;
+	}
+}
+
+uint64_t
+bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+	int size;
+
+	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
+		size = BPOBJ_SIZE_V0;
+	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+		size = BPOBJ_SIZE_V1;
+	else
+		size = sizeof (bpobj_phys_t);
+
+	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
+	    DMU_OT_BPOBJ_HDR, size, tx));
+}
+
+void
+bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+	int64_t i;
+	bpobj_t bpo;
+	dmu_object_info_t doi;
+	int epb;
+	dmu_buf_t *dbuf = NULL;
+
+	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
+	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
+
+	mutex_enter(&bpo.bpo_lock);
+
+	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
+		goto out;
+
+	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
+	epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+		uint64_t *objarray;
+		uint64_t offset, blkoff;
+
+		offset = i * sizeof (uint64_t);
+		blkoff = P2PHASE(i, epb);
+
+		if (dbuf == NULL || dbuf->db_offset > offset) {
+			if (dbuf)
+				dmu_buf_rele(dbuf, FTAG);
+			VERIFY3U(0, ==, dmu_buf_hold(os,
+			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
+		}
+
+		ASSERT3U(offset, >=, dbuf->db_offset);
+		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+		objarray = dbuf->db_data;
+		bpobj_free(os, objarray[blkoff], tx);
+	}
+	if (dbuf) {
+		dmu_buf_rele(dbuf, FTAG);
+		dbuf = NULL;
+	}
+	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
+
+out:
+	mutex_exit(&bpo.bpo_lock);
+	bpobj_close(&bpo);
+
+	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
+}
+
+int
+bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
+{
+	dmu_object_info_t doi;
+	int err;
+
+	err = dmu_object_info(os, object, &doi);
+	if (err)
+		return (err);
+
+	bzero(bpo, sizeof (*bpo));
+	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	ASSERT(bpo->bpo_dbuf == NULL);
+	ASSERT(bpo->bpo_phys == NULL);
+	ASSERT(object != 0);
+	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
+	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
+
+	err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
+	if (err)
+		return (err);
+
+	bpo->bpo_os = os;
+	bpo->bpo_object = object;
+	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
+	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
+	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
+	return (0);
+}
+
+void
+bpobj_close(bpobj_t *bpo)
+{
+	/* Lame workaround for closing a bpobj that was never opened. */
+	if (bpo->bpo_object == 0)
+		return;
+
+	dmu_buf_rele(bpo->bpo_dbuf, bpo);
+	if (bpo->bpo_cached_dbuf != NULL)
+		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+	bpo->bpo_dbuf = NULL;
+	bpo->bpo_phys = NULL;
+	bpo->bpo_cached_dbuf = NULL;
+	bpo->bpo_object = 0;
+
+	mutex_destroy(&bpo->bpo_lock);
+}
+
+static boolean_t
+bpobj_hasentries(bpobj_t *bpo)
+{
+	return (bpo->bpo_phys->bpo_num_blkptrs != 0 ||
+	    (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0));
+}
+
+static int
+bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
+    boolean_t free)
+{
+	dmu_object_info_t doi;
+	int epb;
+	int64_t i;
+	int err = 0;
+	dmu_buf_t *dbuf = NULL;
+
+	mutex_enter(&bpo->bpo_lock);
+
+	if (free)
+		dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+	for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+		blkptr_t *bparray;
+		blkptr_t *bp;
+		uint64_t offset, blkoff;
+
+		offset = i * sizeof (blkptr_t);
+		blkoff = P2PHASE(i, bpo->bpo_epb);
+
+		if (dbuf == NULL || dbuf->db_offset > offset) {
+			if (dbuf)
+				dmu_buf_rele(dbuf, FTAG);
+			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
+			    FTAG, &dbuf, 0);
+			if (err)
+				break;
+		}
+
+		ASSERT3U(offset, >=, dbuf->db_offset);
+		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+		bparray = dbuf->db_data;
+		bp = &bparray[blkoff];
+		err = func(arg, bp, tx);
+		if (err)
+			break;
+		if (free) {
+			bpo->bpo_phys->bpo_bytes -=
+			    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+			if (bpo->bpo_havecomp) {
+				bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
+				bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
+			}
+			bpo->bpo_phys->bpo_num_blkptrs--;
+			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
+		}
+	}
+	if (dbuf) {
+		dmu_buf_rele(dbuf, FTAG);
+		dbuf = NULL;
+	}
+	if (free) {
+		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
+		    (i + 1) * sizeof (blkptr_t), -1ULL, tx));
+	}
+	if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
+		goto out;
+
+	ASSERT(bpo->bpo_havecomp);
+	err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
+	if (err) {
+		mutex_exit(&bpo->bpo_lock);
+		return (err);
+	}
+	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
+	epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+	for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+		uint64_t *objarray;
+		uint64_t offset, blkoff;
+		bpobj_t sublist;
+		uint64_t used_before, comp_before, uncomp_before;
+		uint64_t used_after, comp_after, uncomp_after;
+
+		offset = i * sizeof (uint64_t);
+		blkoff = P2PHASE(i, epb);
+
+		if (dbuf == NULL || dbuf->db_offset > offset) {
+			if (dbuf)
+				dmu_buf_rele(dbuf, FTAG);
+			err = dmu_buf_hold(bpo->bpo_os,
+			    bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
+			if (err)
+				break;
+		}
+
+		ASSERT3U(offset, >=, dbuf->db_offset);
+		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+		objarray = dbuf->db_data;
+		err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
+		if (err)
+			break;
+		if (free) {
+			err = bpobj_space(&sublist,
+			    &used_before, &comp_before, &uncomp_before);
+			if (err != 0) {
+				bpobj_close(&sublist);
+				break;
+			}
+		}
+		err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
+		if (free) {
+			VERIFY3U(0, ==, bpobj_space(&sublist,
+			    &used_after, &comp_after, &uncomp_after));
+			bpo->bpo_phys->bpo_bytes -= used_before - used_after;
+			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+			bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
+			bpo->bpo_phys->bpo_uncomp -=
+			    uncomp_before - uncomp_after;
+		}
+
+		bpobj_close(&sublist);
+		if (err)
+			break;
+		if (free) {
+			err = dmu_object_free(bpo->bpo_os,
+			    objarray[blkoff], tx);
+			if (err)
+				break;
+			bpo->bpo_phys->bpo_num_subobjs--;
+			ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
+		}
+	}
+	if (dbuf) {
+		dmu_buf_rele(dbuf, FTAG);
+		dbuf = NULL;
+	}
+	if (free) {
+		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
+		    bpo->bpo_phys->bpo_subobjs,
+		    (i + 1) * sizeof (uint64_t), -1ULL, tx));
+	}
+
+out:
+	/* If there are no entries, there should be no bytes. */
+	if (!bpobj_hasentries(bpo)) {
+		ASSERT0(bpo->bpo_phys->bpo_bytes);
+		ASSERT0(bpo->bpo_phys->bpo_comp);
+		ASSERT0(bpo->bpo_phys->bpo_uncomp);
+	}
+
+	mutex_exit(&bpo->bpo_lock);
+	return (err);
+}
+
+/*
+ * Iterate and remove the entries.  If func returns nonzero, iteration
+ * will stop and that entry will not be removed.
+ */
+int
+bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
+}
+
+/*
+ * Iterate the entries.  If func returns nonzero, iteration will stop.
+ */
+int
+bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+	return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
+}
+
+void
+bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
+{
+	bpobj_t subbpo;
+	uint64_t used, comp, uncomp, subsubobjs;
+
+	ASSERT(bpo->bpo_havesubobj);
+	ASSERT(bpo->bpo_havecomp);
+	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+
+	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
+		bpobj_decr_empty(bpo->bpo_os, tx);
+		return;
+	}
+
+	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
+	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
+
+	if (!bpobj_hasentries(&subbpo)) {
+		/* No point in having an empty subobj. */
+		bpobj_close(&subbpo);
+		bpobj_free(bpo->bpo_os, subobj, tx);
+		return;
+	}
+
+	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+	if (bpo->bpo_phys->bpo_subobjs == 0) {
+		bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
+		    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+		    DMU_OT_NONE, 0, tx);
+	}
+
+	dmu_object_info_t doi;
+	ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
+	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
+
+	mutex_enter(&bpo->bpo_lock);
+	dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+	    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+	    sizeof (subobj), &subobj, tx);
+	bpo->bpo_phys->bpo_num_subobjs++;
+
+	/*
+	 * If subobj has only one block of subobjs, then move subobj's
+	 * subobjs to bpo's subobj list directly.  This reduces
+	 * recursion in bpobj_iterate due to nested subobjs.
+	 */
+	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
+	if (subsubobjs != 0) {
+		dmu_object_info_t doi;
+
+		VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
+		if (doi.doi_max_offset == doi.doi_data_block_size) {
+			dmu_buf_t *subdb;
+			uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
+
+			VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
+			    0, FTAG, &subdb, 0));
+			/*
+			 * Make sure that we are not asking dmu_write()
+			 * to write more data than we have in our buffer.
+			 */
+			VERIFY3U(subdb->db_size, >=,
+			    numsubsub * sizeof (subobj));
+			dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+			    numsubsub * sizeof (subobj), subdb->db_data, tx);
+			dmu_buf_rele(subdb, FTAG);
+			bpo->bpo_phys->bpo_num_subobjs += numsubsub;
+
+			dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
+			subbpo.bpo_phys->bpo_subobjs = 0;
+			VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
+			    subsubobjs, tx));
+		}
+	}
+	bpo->bpo_phys->bpo_bytes += used;
+	bpo->bpo_phys->bpo_comp += comp;
+	bpo->bpo_phys->bpo_uncomp += uncomp;
+	mutex_exit(&bpo->bpo_lock);
+
+	bpobj_close(&subbpo);
+}
+
+void
+bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	blkptr_t stored_bp = *bp;
+	uint64_t offset;
+	int blkoff;
+	blkptr_t *bparray;
+
+	ASSERT(!BP_IS_HOLE(bp));
+	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+
+	if (BP_IS_EMBEDDED(bp)) {
+		/*
+		 * The bpobj will compress better without the payload.
+		 *
+		 * Note that we store EMBEDDED bp's because they have an
+		 * uncompressed size, which must be accounted for.  An
+		 * alternative would be to add their size to bpo_uncomp
+		 * without storing the bp, but that would create additional
+		 * complications: bpo_uncomp would be inconsistent with the
+		 * set of BP's stored, and bpobj_iterate() wouldn't visit
+		 * all the space accounted for in the bpobj.
+		 */
+		bzero(&stored_bp, sizeof (stored_bp));
+		stored_bp.blk_prop = bp->blk_prop;
+		stored_bp.blk_birth = bp->blk_birth;
+	} else if (!BP_GET_DEDUP(bp)) {
+		/* The bpobj will compress better without the checksum */
+		bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+	}
+
+	/* We never need the fill count. */
+	stored_bp.blk_fill = 0;
+
+	mutex_enter(&bpo->bpo_lock);
+
+	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
+	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
+
+	if (bpo->bpo_cached_dbuf == NULL ||
+	    offset < bpo->bpo_cached_dbuf->db_offset ||
+	    offset >= bpo->bpo_cached_dbuf->db_offset +
+	    bpo->bpo_cached_dbuf->db_size) {
+		if (bpo->bpo_cached_dbuf)
+			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
+	}
+
+	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
+	bparray = bpo->bpo_cached_dbuf->db_data;
+	bparray[blkoff] = stored_bp;
+
+	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+	bpo->bpo_phys->bpo_num_blkptrs++;
+	bpo->bpo_phys->bpo_bytes +=
+	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+	if (bpo->bpo_havecomp) {
+		bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
+		bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
+	}
+	mutex_exit(&bpo->bpo_lock);
+}
+
+struct space_range_arg {
+	spa_t *spa;
+	uint64_t mintxg;
+	uint64_t maxtxg;
+	uint64_t used;
+	uint64_t comp;
+	uint64_t uncomp;
+};
+
+/* ARGSUSED */
+static int
+space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	struct space_range_arg *sra = arg;
+
+	if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
+		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
+			sra->used += bp_get_dsize_sync(sra->spa, bp);
+		else
+			sra->used += bp_get_dsize(sra->spa, bp);
+		sra->comp += BP_GET_PSIZE(bp);
+		sra->uncomp += BP_GET_UCSIZE(bp);
+	}
+	return (0);
+}
+
+int
+bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	mutex_enter(&bpo->bpo_lock);
+
+	*usedp = bpo->bpo_phys->bpo_bytes;
+	if (bpo->bpo_havecomp) {
+		*compp = bpo->bpo_phys->bpo_comp;
+		*uncompp = bpo->bpo_phys->bpo_uncomp;
+		mutex_exit(&bpo->bpo_lock);
+		return (0);
+	} else {
+		mutex_exit(&bpo->bpo_lock);
+		return (bpobj_space_range(bpo, 0, UINT64_MAX,
+		    usedp, compp, uncompp));
+	}
+}
+
+/*
+ * Return the amount of space in the bpobj which is:
+ * mintxg < blk_birth <= maxtxg
+ */
+int
+bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	struct space_range_arg sra = { 0 };
+	int err;
+
+	/*
+	 * As an optimization, if they want the whole txg range, just
+	 * get bpo_bytes rather than iterating over the bps.
+	 */
+	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
+		return (bpobj_space(bpo, usedp, compp, uncompp));
+
+	sra.spa = dmu_objset_spa(bpo->bpo_os);
+	sra.mintxg = mintxg;
+	sra.maxtxg = maxtxg;
+
+	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
+	*usedp = sra.used;
+	*compp = sra.comp;
+	*uncompp = sra.uncomp;
+	return (err);
+}
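
[editor's note] A hedged sketch of calling the range query (variable names are hypothetical); note the fast path above means a full-range query on a bpobj with comp stats never iterates at all:

	uint64_t used, comp, uncomp;
	int err;

	/* space born in (snap_txg, head_txg], e.g. for deadlist accounting */
	err = bpobj_space_range(bpo, snap_txg, head_txg, &used, &comp, &uncomp);
	if (err == 0)
		zfs_dbgmsg("%lld bytes used, %lld comp, %lld uncomp",
		    (longlong_t)used, (longlong_t)comp, (longlong_t)uncomp);
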
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/bptree.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/bptree.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/bptree.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/bptree.c	10 Oct 2016 11:09:56 -0000
@@ -0,0 +1,301 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/arc.h>
+#include <sys/bptree.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/refcount.h>
+#include <sys/spa.h>
+
+/*
+ * A bptree is a queue of root block pointers from destroyed datasets. When a
+ * dataset is destroyed its root block pointer is put on the end of the pool's
+ * bptree queue so the dataset's blocks can be freed asynchronously by
+ * dsl_scan_sync. This allows the delete operation to finish without traversing
+ * all the dataset's blocks.
+ *
+ * Note that while bt_begin and bt_end are only ever incremented in this code,
+ * they are effectively reset to 0 every time the entire bptree is freed because
+ * the bptree's object is destroyed and re-created.
+ */
+
+typedef struct bptree_args {
+	bptree_phys_t *ba_phys;	/* data in bonus buffer, dirtied if freeing */
+	boolean_t ba_free;	/* true if freeing during traversal */
+
+	bptree_itor_t *ba_func;	/* function to call for each blockpointer */
+	void *ba_arg;		/* caller supplied argument to ba_func */
+	dmu_tx_t *ba_tx;	/* caller supplied tx, NULL if not freeing */
+} bptree_args_t;
+
+uint64_t
+bptree_alloc(objset_t *os, dmu_tx_t *tx)
+{
+	uint64_t obj;
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+
+	obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
+	    SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+	    sizeof (bptree_phys_t), tx);
+
+	/*
+	 * Bonus buffer contents are already initialized to 0, but for
+	 * readability we make it explicit.
+	 */
+	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+	dmu_buf_will_dirty(db, tx);
+	bt = db->db_data;
+	bt->bt_begin = 0;
+	bt->bt_end = 0;
+	bt->bt_bytes = 0;
+	bt->bt_comp = 0;
+	bt->bt_uncomp = 0;
+	dmu_buf_rele(db, FTAG);
+
+	return (obj);
+}
+
+int
+bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+	ASSERT3U(bt->bt_begin, ==, bt->bt_end);
+	ASSERT0(bt->bt_bytes);
+	ASSERT0(bt->bt_comp);
+	ASSERT0(bt->bt_uncomp);
+	dmu_buf_rele(db, FTAG);
+
+	return (dmu_object_free(os, obj, tx));
+}
+
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+	boolean_t rv;
+
+	VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+	rv = (bt->bt_begin == bt->bt_end);
+	dmu_buf_rele(db, FTAG);
+	return (rv);
+}
+
+void
+bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
+    uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+	bptree_phys_t *bt;
+	bptree_entry_phys_t bte = { 0 };
+
+	/*
+	 * bptree objects are in the pool mos, therefore they can only be
+	 * modified in syncing context. Furthermore, this is only modified
+	 * by the sync thread, so no locking is necessary.
+	 */
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+	bt = db->db_data;
+
+	bte.be_birth_txg = birth_txg;
+	bte.be_bp = *bp;
+	dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
+
+	dmu_buf_will_dirty(db, tx);
+	bt->bt_end++;
+	bt->bt_bytes += bytes;
+	bt->bt_comp += comp;
+	bt->bt_uncomp += uncomp;
+	dmu_buf_rele(db, FTAG);
+}
+
+/* ARGSUSED */
+static int
+bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	int err;
+	struct bptree_args *ba = arg;
+
+	if (bp == NULL || BP_IS_HOLE(bp))
+		return (0);
+
+	err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
+	if (err == 0 && ba->ba_free) {
+		ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
+		ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
+		ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
+	}
+	return (err);
+}
+
+/*
+ * If "free" is set:
+ *  - It is assumed that "func" will be freeing the block pointers.
+ *  - If "func" returns nonzero, the bookmark will be remembered and
+ *    iteration will be restarted from this point on next invocation.
+ *  - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ *    bptree_iterate will remember the bookmark, continue traversing
+ *    any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
+ *
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
+ * traverse_dataset_destroyed()).
+ */
+int
+bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
+    void *arg, dmu_tx_t *tx)
+{
+	boolean_t ioerr = B_FALSE;
+	int err;
+	uint64_t i;
+	dmu_buf_t *db;
+	struct bptree_args ba;
+
+	ASSERT(!free || dmu_tx_is_syncing(tx));
+
+	err = dmu_bonus_hold(os, obj, FTAG, &db);
+	if (err != 0)
+		return (err);
+
+	if (free)
+		dmu_buf_will_dirty(db, tx);
+
+	ba.ba_phys = db->db_data;
+	ba.ba_free = free;
+	ba.ba_func = func;
+	ba.ba_arg = arg;
+	ba.ba_tx = tx;
+
+	err = 0;
+	for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
+		bptree_entry_phys_t bte;
+		int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
+
+		err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
+		    &bte, DMU_READ_NO_PREFETCH);
+		if (err != 0)
+			break;
+
+		if (zfs_free_leak_on_eio)
+			flags |= TRAVERSE_HARD;
+		zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
+		    "bookmark %lld/%lld/%lld/%lld",
+		    (longlong_t)i,
+		    (longlong_t)bte.be_birth_txg,
+		    (longlong_t)bte.be_zb.zb_objset,
+		    (longlong_t)bte.be_zb.zb_object,
+		    (longlong_t)bte.be_zb.zb_level,
+		    (longlong_t)bte.be_zb.zb_blkid);
+		err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
+		    bte.be_birth_txg, &bte.be_zb, flags,
+		    bptree_visit_cb, &ba);
+		if (free) {
+			/*
+			 * The callback has freed the visited block pointers.
+			 * Record our traversal progress on disk, either by
+			 * updating this record's bookmark, or by logically
+			 * removing this record by advancing bt_begin.
+			 */
+			if (err != 0) {
+				/* save bookmark for future resume */
+				ASSERT3U(bte.be_zb.zb_objset, ==,
+				    ZB_DESTROYED_OBJSET);
+				ASSERT0(bte.be_zb.zb_level);
+				dmu_write(os, obj, i * sizeof (bte),
+				    sizeof (bte), &bte, tx);
+				if (err == EIO || err == ECKSUM ||
+				    err == ENXIO) {
+					/*
+					 * Skip the rest of this tree and
+					 * continue on to the next entry.
+					 */
+					err = 0;
+					ioerr = B_TRUE;
+				} else {
+					break;
+				}
+			} else if (ioerr) {
+				/*
+				 * This entry is finished, but there were
+				 * i/o errors on previous entries, so we
+				 * can't adjust bt_begin.  Set this entry's
+				 * be_birth_txg such that it will be
+				 * treated as a no-op in future traversals.
+				 */
+				bte.be_birth_txg = UINT64_MAX;
+				dmu_write(os, obj, i * sizeof (bte),
+				    sizeof (bte), &bte, tx);
+			}
+
+			if (!ioerr) {
+				ba.ba_phys->bt_begin++;
+				(void) dmu_free_range(os, obj,
+				    i * sizeof (bte), sizeof (bte), tx);
+			}
+		} else if (err != 0) {
+			break;
+		}
+	}
+
+	ASSERT(!free || err != 0 || ioerr ||
+	    ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+
+	/* if all blocks are free there should be no used space */
+	if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+		if (zfs_free_leak_on_eio) {
+			ba.ba_phys->bt_bytes = 0;
+			ba.ba_phys->bt_comp = 0;
+			ba.ba_phys->bt_uncomp = 0;
+		}
+
+		ASSERT0(ba.ba_phys->bt_bytes);
+		ASSERT0(ba.ba_phys->bt_comp);
+		ASSERT0(ba.ba_phys->bt_uncomp);
+	}
+
+	dmu_buf_rele(db, FTAG);
+
+	return (err);
+}
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/bqueue.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/bqueue.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/bqueue.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/bqueue.c	30 Aug 2015 02:18:11 -0000
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#include	<sys/bqueue.h>
+#include	<sys/zfs_context.h>
+
+static inline bqueue_node_t *
+obj2node(bqueue_t *q, void *data)
+{
+	return ((bqueue_node_t *)((char *)data + q->bq_node_offset));
+}
+
+/*
+ * Initialize a blocking queue.  The maximum capacity of the queue is set to
+ * size.  Types that want to be stored in a bqueue must contain a bqueue_node_t,
+ * and node_offset should give its offset from the start of the struct.  Return
+ * 0 on success, or -1 on failure.
+ */
+int
+bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset)
+{
+	list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t),
+	    node_offset + offsetof(bqueue_node_t, bqn_node));
+	cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);
+	q->bq_node_offset = node_offset;
+	q->bq_size = 0;
+	q->bq_maxsize = size;
+	return (0);
+}
+
+/*
+ * Destroy a blocking queue.  This function asserts that there are no
+ * elements in the queue, and no one is blocked on the condition
+ * variables.
+ */
+void
+bqueue_destroy(bqueue_t *q)
+{
+	ASSERT0(q->bq_size);
+	cv_destroy(&q->bq_add_cv);
+	cv_destroy(&q->bq_pop_cv);
+	mutex_destroy(&q->bq_lock);
+	list_destroy(&q->bq_list);
+}
+
+/*
+ * Add data to q, consuming item_size units of capacity.  If there is
+ * insufficient capacity to consume item_size units, block until capacity
+ * exists.  Asserts item_size is > 0.
+ */
+void
+bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
+{
+	ASSERT3U(item_size, >, 0);
+	ASSERT3U(item_size, <, q->bq_maxsize);
+	mutex_enter(&q->bq_lock);
+	obj2node(q, data)->bqn_size = item_size;
+	while (q->bq_size + item_size > q->bq_maxsize) {
+		cv_wait(&q->bq_add_cv, &q->bq_lock);
+	}
+	q->bq_size += item_size;
+	list_insert_tail(&q->bq_list, data);
+	cv_signal(&q->bq_pop_cv);
+	mutex_exit(&q->bq_lock);
+}
+
+/*
+ * Take the first element off of q.  If there are no elements on the queue, wait
+ * until one is put there.  Return the removed element.
+ */
+void *
+bqueue_dequeue(bqueue_t *q)
+{
+	void *ret;
+	uint64_t item_size;
+	mutex_enter(&q->bq_lock);
+	while (q->bq_size == 0) {
+		cv_wait(&q->bq_pop_cv, &q->bq_lock);
+	}
+	ret = list_remove_head(&q->bq_list);
+	item_size = obj2node(q, ret)->bqn_size;
+	q->bq_size -= item_size;
+	mutex_exit(&q->bq_lock);
+	cv_signal(&q->bq_add_cv);
+	return (ret);
+}
+
+/*
+ * Returns true if the space used is 0.
+ */
+boolean_t
+bqueue_empty(bqueue_t *q)
+{
+	return (q->bq_size == 0);
+}
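
[editor's note] A hedged end-to-end sketch of the contract this file establishes: the stored type embeds a bqueue_node_t, bqueue_init() is told its offset, and enqueue/dequeue block on capacity and emptiness respectively. The record type and the 16MB cap are hypothetical:

	typedef struct send_record {
		bqueue_node_t	sr_node;	/* embedded, per the contract above */
		uint64_t	sr_weight;	/* capacity units this record costs */
		/* payload ... */
	} send_record_t;

	static void
	queue_setup(bqueue_t *q)
	{
		VERIFY0(bqueue_init(q, 16 * 1024 * 1024,
		    offsetof(send_record_t, sr_node)));
	}

	static void
	producer(bqueue_t *q, send_record_t *rec)
	{
		bqueue_enqueue(q, rec, rec->sr_weight); /* blocks at 16MB in flight */
	}

	static send_record_t *
	consumer(bqueue_t *q)
	{
		return (bqueue_dequeue(q));	/* blocks until a record arrives */
	}
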
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c,v
retrieving revision 1.5
diff -u -p -r1.5 dbuf.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c	19 Feb 2016 19:25:59 -0000	1.5
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c	26 Apr 2017 00:40:35 -0000
@@ -19,12 +19,18 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
+#include <sys/dmu_send.h>
 #include <sys/dmu_impl.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_objset.h>
@@ -34,26 +40,113 @@
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/blkptr.h>
+#include <sys/range_tree.h>
+#include <sys/callb.h>
 
-static void dbuf_destroy(dmu_buf_impl_t *db);
-static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+uint_t zfs_dbuf_evict_key;
+
+static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 
+#ifndef __lint
+extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
+    dmu_buf_evict_func_t *evict_func_sync,
+    dmu_buf_evict_func_t *evict_func_async,
+    dmu_buf_t **clear_on_evict_dbufp);
+#endif /* ! __lint */
+
 /*
  * Global data structures and functions for the dbuf cache.
  */
-static kmem_cache_t *dbuf_cache;
+static kmem_cache_t *dbuf_kmem_cache;
+static taskq_t *dbu_evict_taskq;
+
+static kthread_t *dbuf_cache_evict_thread;
+static kmutex_t dbuf_evict_lock;
+static kcondvar_t dbuf_evict_cv;
+static boolean_t dbuf_evict_thread_exit;
+
+/*
+ * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
+ * are not currently held but have been recently released. These dbufs
+ * are not eligible for arc eviction until they are aged out of the cache.
+ * Dbufs are added to the dbuf cache once the last hold is released. If a
+ * dbuf is later accessed and still exists in the dbuf cache, then it will
+ * be removed from the cache and later re-added to the head of the cache.
+ * Dbufs that are aged out of the cache will be immediately destroyed and
+ * become eligible for arc eviction.
+ */
+static multilist_t dbuf_cache;
+static refcount_t dbuf_cache_size;
+uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024;
+
+/* Cap the dbuf cache to 1/2^dbuf_cache_max_shift of the arc size. */
+int dbuf_cache_max_shift = 5;
+
+/*
+ * The dbuf cache uses a three-stage eviction policy:
+ *	- A low water marker designates when the dbuf eviction thread
+ *	should stop evicting from the dbuf cache.
+ *	- When we reach the maximum size (aka mid water mark), we
+ *	signal the eviction thread to run.
+ *	- The high water mark indicates when the eviction thread
+ *	is unable to keep up with the incoming load and eviction must
+ *	happen in the context of the calling thread.
+ *
+ * The dbuf cache:
+ *                                                 (max size)
+ *                                      low water   mid water   hi water
+ * +----------------------------------------+----------+----------+
+ * |                                        |          |          |
+ * |                                        |          |          |
+ * |                                        |          |          |
+ * |                                        |          |          |
+ * +----------------------------------------+----------+----------+
+ *                                        stop        signal     evict
+ *                                      evicting     eviction   directly
+ *                                                    thread
+ *
+ * The high and low water marks indicate the operating range for the eviction
+ * thread. The low water mark is, by default, 90% of the total size of the
+ * cache and the high water mark is at 110% (both of these percentages can be
+ * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
+ * respectively). The eviction thread will try to ensure that the cache remains
+ * within this range by waking up every second and checking if the cache is
+ * above the low water mark. The thread can also be woken up by callers adding
+ * elements into the cache if the cache is larger than the mid water (i.e. max
+ * cache size). Once the eviction thread is woken up and eviction is required,
+ * it will continue evicting buffers until it's able to reduce the cache size
+ * to the low water mark. If the cache size continues to grow and hits the high
+ * water mark, then callers adding elements to the cache will begin to evict
+ * directly from the cache until the cache is no longer above the high water
+ * mark.
+ */
+
+/*
+ * The percentage above and below the maximum cache size.
+ */
+uint_t dbuf_cache_hiwater_pct = 10;
+uint_t dbuf_cache_lowater_pct = 10;
 
 /* ARGSUSED */
 static int
 dbuf_cons(void *vdb, void *unused, int kmflag)
 {
-	dmu_buf_impl_t *db = unused;
-	bzero(db, sizeof (dmu_buf_impl_t));
+	dmu_buf_impl_t *db = vdb;
 
+#ifdef __NetBSD__
+	db = unused;
+#endif
+	bzero(db, sizeof (dmu_buf_impl_t));
 	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+	multilist_link_init(&db->db_cache_link);
 	refcount_create(&db->db_holds);
+
 	return (0);
 }
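
With the defaults above (a 100 MB cap and hiwater/lowater percentages of 10),
the three thresholds described in the eviction-policy comment work out as in
this sketch.  It mirrors the dbuf_cache_above_hiwater() and
dbuf_cache_above_lowater() helpers that appear later in this file, and assumes
the 100 MB figure survives the ARC-based clamp applied in dbuf_init().

	uint64_t max = 100ULL << 20;	/* dbuf_cache_max_bytes, assumed */
	uint64_t hi = max + (max * dbuf_cache_hiwater_pct) / 100; /* 110 MB */
	uint64_t lo = max - (max * dbuf_cache_lowater_pct) / 100; /*  90 MB */

	/*
	 * size <= lo       : eviction thread sleeps
	 * lo < size <= max : eviction thread, once woken, keeps evicting
	 * size > max       : dbuf_evict_notify() signals the thread
	 * size > hi        : callers run dbuf_evict_one() themselves
	 */
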
 
@@ -61,9 +154,14 @@ dbuf_cons(void *vdb, void *unused, int k
 static void
 dbuf_dest(void *vdb, void *unused)
 {
-	dmu_buf_impl_t *db = unused;
+	dmu_buf_impl_t *db = vdb;
+
+#ifdef __NetBSD__
+	db = unused;
+#endif
 	mutex_destroy(&db->db_mtx);
 	cv_destroy(&db->db_changed);
+	ASSERT(!multilist_link_active(&db->db_cache_link));
 	refcount_destroy(&db->db_holds);
 }
 
@@ -93,8 +191,6 @@ dbuf_hash(void *os, uint64_t obj, uint8_
 	return (crc);
 }
 
-#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
-
 #define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
 	((dbuf)->db.db_object == (obj) &&		\
 	(dbuf)->db_objset == (os) &&			\
@@ -102,12 +198,10 @@ dbuf_hash(void *os, uint64_t obj, uint8_
 	(dbuf)->db_blkid == (blkid))
 
 dmu_buf_impl_t *
-dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
-	objset_t *os = dn->dn_objset;
-	uint64_t obj = dn->dn_object;
-	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+	uint64_t hv = dbuf_hash(os, obj, level, blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *db;
 
@@ -126,6 +220,24 @@ dbuf_find(dnode_t *dn, uint8_t level, ui
 	return (NULL);
 }
 
+static dmu_buf_impl_t *
+dbuf_find_bonus(objset_t *os, uint64_t object)
+{
+	dnode_t *dn;
+	dmu_buf_impl_t *db = NULL;
+
+	if (dnode_hold(os, object, FTAG, &dn) == 0) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		if (dn->dn_bonus != NULL) {
+			db = dn->dn_bonus;
+			mutex_enter(&db->db_mtx);
+		}
+		rw_exit(&dn->dn_struct_rwlock);
+		dnode_rele(dn, FTAG);
+	}
+	return (db);
+}
+
 /*
  * Insert an entry into the hash table.  If there is already an element
  * equal to elem in the hash table, then the already existing element
@@ -140,7 +252,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
 	uint64_t obj = db->db.db_object;
 	int level = db->db_level;
 	uint64_t blkid = db->db_blkid;
-	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+	uint64_t hv = dbuf_hash(os, obj, level, blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *dbf;
 
@@ -160,26 +272,25 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
 	db->db_hash_next = h->hash_table[idx];
 	h->hash_table[idx] = db;
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
-	atomic_add_64(&dbuf_hash_count, 1);
+	atomic_inc_64(&dbuf_hash_count);
 
 	return (NULL);
 }
 
 /*
- * Remove an entry from the hash table.  This operation will
- * fail if there are any existing holds on the db.
+ * Remove an entry from the hash table.  It must be in the EVICTING state.
  */
 static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
-	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
+	uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object,
 	    db->db_level, db->db_blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *dbf, **dbp;
 
 	/*
-	 * We musn't hold db_mtx to maintin lock ordering:
+	 * We mustn't hold db_mtx to maintain lock ordering:
 	 * DBUF_HASH_MUTEX > db_mtx.
 	 */
 	ASSERT(refcount_is_zero(&db->db_holds));
@@ -195,36 +306,281 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
 	*dbp = db->db_hash_next;
 	db->db_hash_next = NULL;
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
-	atomic_add_64(&dbuf_hash_count, -1);
+	atomic_dec_64(&dbuf_hash_count);
 }
 
-static arc_evict_func_t dbuf_do_evict;
+typedef enum {
+	DBVU_EVICTING,
+	DBVU_NOT_EVICTING
+} dbvu_verify_type_t;
+
+static void
+dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
+{
+#ifdef ZFS_DEBUG
+	int64_t holds;
+
+	if (db->db_user == NULL)
+		return;
+
+	/* Only data blocks support the attachment of user data. */
+	ASSERT(db->db_level == 0);
+
+	/* Clients must resolve a dbuf before attaching user data. */
+	ASSERT(db->db.db_data != NULL);
+	ASSERT3U(db->db_state, ==, DB_CACHED);
+
+	holds = refcount_count(&db->db_holds);
+	if (verify_type == DBVU_EVICTING) {
+		/*
+		 * Immediate eviction occurs when holds == dirtycnt.
+		 * For normal eviction buffers, holds is zero on
+		 * eviction, except when dbuf_fix_old_data() calls
+		 * dbuf_clear_data().  However, the hold count can grow
+		 * during eviction even though db_mtx is held (see
+		 * dmu_bonus_hold() for an example), so we can only
+		 * test the generic invariant that holds >= dirtycnt.
+		 */
+		ASSERT3U(holds, >=, db->db_dirtycnt);
+	} else {
+		if (db->db_user_immediate_evict == TRUE)
+			ASSERT3U(holds, >=, db->db_dirtycnt);
+		else
+			ASSERT3U(holds, >, 0);
+	}
+#endif
+}
 
 static void
 dbuf_evict_user(dmu_buf_impl_t *db)
 {
+	dmu_buf_user_t *dbu = db->db_user;
+
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
-	if (db->db_level != 0 || db->db_evict_func == NULL)
+	if (dbu == NULL)
 		return;
 
-	if (db->db_user_data_ptr_ptr)
-		*db->db_user_data_ptr_ptr = db->db.db_data;
-	db->db_evict_func(&db->db, db->db_user_ptr);
-	db->db_user_ptr = NULL;
-	db->db_user_data_ptr_ptr = NULL;
-	db->db_evict_func = NULL;
+	dbuf_verify_user(db, DBVU_EVICTING);
+	db->db_user = NULL;
+
+#ifdef ZFS_DEBUG
+	if (dbu->dbu_clear_on_evict_dbufp != NULL)
+		*dbu->dbu_clear_on_evict_dbufp = NULL;
+#endif
+
+	/*
+	 * There are two eviction callbacks - one that we call synchronously
+	 * and one that we invoke via a taskq.  The async one is useful for
+	 * avoiding lock order reversals and limiting stack depth.
+	 *
+	 * Note that if we have a sync callback but no async callback,
+	 * it's likely that the sync callback will free the structure
+	 * containing the dbu.  In that case we need to take care to not
+	 * dereference dbu after calling the sync evict func.
+	 */
+	boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
+
+	if (dbu->dbu_evict_func_sync != NULL)
+		dbu->dbu_evict_func_sync(dbu);
+
+	if (has_async) {
+		taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
+		    dbu, 0, &dbu->dbu_tqent);
+	}
 }
 
-void
-dbuf_evict(dmu_buf_impl_t *db)
+boolean_t
+dbuf_is_metadata(dmu_buf_impl_t *db)
 {
-	ASSERT(MUTEX_HELD(&db->db_mtx));
-	ASSERT(db->db_buf == NULL);
-	ASSERT(db->db_data_pending == NULL);
+	if (db->db_level > 0) {
+		return (B_TRUE);
+	} else {
+		boolean_t is_metadata;
+
+		DB_DNODE_ENTER(db);
+		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
+		DB_DNODE_EXIT(db);
+
+		return (is_metadata);
+	}
+}
+
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the dbuf eviction
+ * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
+{
+	dmu_buf_impl_t *db = obj;
+
+	/*
+	 * The assumption here is that the hash value for a given
+	 * dmu_buf_impl_t will remain constant throughout its lifetime
+	 * (i.e. its objset, object, level and blkid fields don't change).
+	 * Thus, we don't need to store the dbuf's sublist index
+	 * on insertion, as this index can be recalculated on removal.
+	 *
+	 * Also, the low order bits of the hash value are thought to be
+	 * distributed evenly. Otherwise, in the case that the multilist
+	 * has a power-of-two number of sublists, each sublist's usage
+	 * would not be evenly distributed.
+	 */
+	return (dbuf_hash(db->db_objset, db->db.db_object,
+	    db->db_level, db->db_blkid) %
+	    multilist_get_num_sublists(ml));
+}
+
+static inline boolean_t
+dbuf_cache_above_hiwater(void)
+{
+	uint64_t dbuf_cache_hiwater_bytes =
+	    (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
+
+	return (refcount_count(&dbuf_cache_size) >
+	    dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
+}
+
+static inline boolean_t
+dbuf_cache_above_lowater(void)
+{
+	uint64_t dbuf_cache_lowater_bytes =
+	    (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
+
+	return (refcount_count(&dbuf_cache_size) >
+	    dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
+}
+
+/*
+ * Evict the oldest eligible dbuf from the dbuf cache.
+ */
+static void
+dbuf_evict_one(void)
+{
+	int idx = multilist_get_random_index(&dbuf_cache);
+	multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx);
+
+	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
+
+	/*
+	 * Set the thread's tsd to indicate that it's processing evictions.
+	 * Once a thread stops evicting from the dbuf cache it will
+	 * reset its tsd to NULL.
+	 */
+	ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
+	(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
+
+	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
+	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
+		db = multilist_sublist_prev(mls, db);
+	}
+
+	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
+	    multilist_sublist_t *, mls);
+
+	if (db != NULL) {
+		multilist_sublist_remove(mls, db);
+		multilist_sublist_unlock(mls);
+		(void) refcount_remove_many(&dbuf_cache_size,
+		    db->db.db_size, db);
+		dbuf_destroy(db);
+	} else {
+		multilist_sublist_unlock(mls);
+	}
+	(void) tsd_set(zfs_dbuf_evict_key, NULL);
+}
+
+/*
+ * The dbuf evict thread is responsible for aging out dbufs from the
+ * cache. Once the cache has reached its maximum size, dbufs are removed
+ * and destroyed. The eviction thread will continue running until the size
+ * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
+ * out of the cache it is destroyed and becomes eligible for arc eviction.
+ */
+static void
+dbuf_evict_thread(void *dummy __unused)
+{
+	callb_cpr_t cpr;
+
+	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
+
+	mutex_enter(&dbuf_evict_lock);
+	while (!dbuf_evict_thread_exit) {
+		while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+			CALLB_CPR_SAFE_BEGIN(&cpr);
+			(void) cv_timedwait_hires(&dbuf_evict_cv,
+			    &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
+			CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
+		}
+		mutex_exit(&dbuf_evict_lock);
+
+		/*
+		 * Keep evicting as long as we're above the low water mark
+		 * for the cache. We do this without holding the locks to
+		 * minimize lock contention.
+		 */
+		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+			dbuf_evict_one();
+		}
+
+		mutex_enter(&dbuf_evict_lock);
+	}
+
+	dbuf_evict_thread_exit = B_FALSE;
+	cv_broadcast(&dbuf_evict_cv);
+	CALLB_CPR_EXIT(&cpr);	/* drops dbuf_evict_lock */
+	thread_exit();
+}
+
+/*
+ * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
+ * If the dbuf cache is at its high water mark, then evict a dbuf from the
+ * dbuf cache using the caller's context.
+ */
+static void
+dbuf_evict_notify(void)
+{
+
+	/*
+	 * We use thread specific data to track when a thread has
+	 * started processing evictions. This allows us to avoid deeply
+	 * nested stacks that would have a call flow similar to this:
+	 *
+	 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
+	 *	^						|
+	 *	|						|
+	 *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
+	 *
+	 * The dbuf_evict_thread will always have its tsd set until
+	 * that thread exits. All other threads will only set their tsd
+	 * if they are participating in the eviction process. This only
+	 * happens if the eviction thread is unable to process evictions
+	 * fast enough. To keep the dbuf cache size in check, other threads
+	 * can evict from the dbuf cache directly. Those threads will set
+	 * their tsd values so that we ensure that they only evict one dbuf
+	 * from the dbuf cache.
+	 */
+	if (tsd_get(zfs_dbuf_evict_key) != NULL)
+		return;
+
+	if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
+		boolean_t evict_now = B_FALSE;
 
-	dbuf_clear(db);
-	dbuf_destroy(db);
+		mutex_enter(&dbuf_evict_lock);
+		if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
+			evict_now = dbuf_cache_above_hiwater();
+			cv_signal(&dbuf_evict_cv);
+		}
+		mutex_exit(&dbuf_evict_lock);
+
+		if (evict_now) {
+			dbuf_evict_one();
+		}
+	}
 }
 
 void
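
Stripped of the dbuf specifics, the tsd handshake spread across
dbuf_evict_one() and dbuf_evict_notify() above is a per-thread re-entry
latch.  A minimal sketch of the pattern, in which only zfs_dbuf_evict_key
and the tsd_*() calls are taken from the code above:

	if (tsd_get(zfs_dbuf_evict_key) != NULL)
		return;		/* this thread is already evicting */
	(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
	/*
	 * Do work that may drop a hold and re-enter this path on the
	 * same thread; the latch bounds the recursion to one level
	 * instead of dbuf_rele() -> dbuf_evict_one() -> dbuf_rele() ...
	 */
	(void) tsd_set(zfs_dbuf_evict_key, NULL);
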
@@ -233,7 +589,7 @@ dbuf_init(void)
 	uint64_t hsize = 1ULL << 16;
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	int i;
-	
+
 	/*
 	 * The hash table is big enough to fill all of physical memory
 	 * with an average 4K block size.  The table will take up
@@ -252,12 +608,38 @@ retry:
 		goto retry;
 	}
 
-	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
+	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t),
 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 
 	for (i = 0; i < DBUF_MUTEXES; i++)
 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+
+	/*
+	 * Set up the parameters for the dbuf cache. We cap the size of the
+	 * dbuf cache to 1/32nd (default) of the size of the ARC.
+	 */
+	dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes,
+	    arc_max_bytes() >> dbuf_cache_max_shift);
+
+	/*
+	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
+	 * configuration is not required.
+	 */
+	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
+
+	multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t),
+	    offsetof(dmu_buf_impl_t, db_cache_link),
+	    zfs_arc_num_sublists_per_state,
+	    dbuf_cache_multilist_index_func);
+	refcount_create(&dbuf_cache_size);
+
+	tsd_create(&zfs_dbuf_evict_key, NULL);
+	dbuf_evict_thread_exit = B_FALSE;
+	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
+	dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
+	    NULL, 0, &p0, TS_RUN, minclsyspri);
 }
 
 void
@@ -269,7 +651,23 @@ dbuf_fini(void)
 	for (i = 0; i < DBUF_MUTEXES; i++)
 		mutex_destroy(&h->hash_mutexes[i]);
 	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
-	kmem_cache_destroy(dbuf_cache);
+	kmem_cache_destroy(dbuf_kmem_cache);
+	taskq_destroy(dbu_evict_taskq);
+
+	mutex_enter(&dbuf_evict_lock);
+	dbuf_evict_thread_exit = B_TRUE;
+	while (dbuf_evict_thread_exit) {
+		cv_signal(&dbuf_evict_cv);
+		cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
+	}
+	mutex_exit(&dbuf_evict_lock);
+	tsd_destroy(&zfs_dbuf_evict_key);
+
+	mutex_destroy(&dbuf_evict_lock);
+	cv_destroy(&dbuf_evict_cv);
+
+	refcount_destroy(&dbuf_cache_size);
+	multilist_destroy(&dbuf_cache);
 }
 
 /*
@@ -280,7 +678,7 @@ dbuf_fini(void)
 static void
 dbuf_verify(dmu_buf_impl_t *db)
 {
-	dnode_t *dn = db->db_dnode;
+	dnode_t *dn;
 	dbuf_dirty_record_t *dr;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -289,6 +687,8 @@ dbuf_verify(dmu_buf_impl_t *db)
 		return;
 
 	ASSERT(db->db_objset != NULL);
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	if (dn == NULL) {
 		ASSERT(db->db_parent == NULL);
 		ASSERT(db->db_blkptr == NULL);
@@ -296,13 +696,18 @@ dbuf_verify(dmu_buf_impl_t *db)
 		ASSERT3U(db->db.db_object, ==, dn->dn_object);
 		ASSERT3P(db->db_objset, ==, dn->dn_objset);
 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
-		ASSERT(db->db_blkid == DB_BONUS_BLKID ||
-		    list_head(&dn->dn_dbufs));
+		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+		    db->db_blkid == DMU_SPILL_BLKID ||
+		    !avl_is_empty(&dn->dn_dbufs));
 	}
-	if (db->db_blkid == DB_BONUS_BLKID) {
+	if (db->db_blkid == DMU_BONUS_BLKID) {
+		ASSERT(dn != NULL);
+		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
+	} else if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn != NULL);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
-		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
+		ASSERT0(db->db.db_offset);
 	} else {
 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
 	}
@@ -336,8 +741,9 @@ dbuf_verify(dmu_buf_impl_t *db)
 				ASSERT(db->db_parent == NULL);
 			else
 				ASSERT(db->db_parent != NULL);
-			ASSERT3P(db->db_blkptr, ==,
-			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
+			if (db->db_blkid != DMU_SPILL_BLKID)
+				ASSERT3P(db->db_blkptr, ==,
+				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		} else {
 			/* db is pointed to by an indirect block */
 			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
@@ -349,7 +755,7 @@ dbuf_verify(dmu_buf_impl_t *db)
 			 * have the struct_rwlock.  XXX indblksz no longer
 			 * grows.  safe to do this now?
 			 */
-			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
+			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 				ASSERT3P(db->db_blkptr, ==,
 				    ((blkptr_t *)db->db_parent->db.db_data +
 				    db->db_blkid % epb));
@@ -357,53 +763,83 @@ dbuf_verify(dmu_buf_impl_t *db)
 		}
 	}
 	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
-	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
+	    (db->db_buf == NULL || db->db_buf->b_data) &&
+	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_state != DB_FILL && !dn->dn_free_txg) {
 		/*
 		 * If the blkptr isn't set but they have nonzero data,
 		 * it had better be dirty, otherwise we'll lose that
 		 * data when we evict this buffer.
+		 *
+		 * There is an exception to this rule for indirect blocks; in
+		 * this case, if the indirect block is a hole, we fill in a few
+		 * fields on each of the child blocks (importantly, birth time)
+		 * to prevent hole birth times from being lost when you
+		 * partially fill in a hole.
 		 */
 		if (db->db_dirtycnt == 0) {
-			uint64_t *buf = db->db.db_data;
-			int i;
-
-			for (i = 0; i < db->db.db_size >> 3; i++) {
-				ASSERT(buf[i] == 0);
+			if (db->db_level == 0) {
+				uint64_t *buf = db->db.db_data;
+				int i;
+
+				for (i = 0; i < db->db.db_size >> 3; i++) {
+					ASSERT(buf[i] == 0);
+				}
+			} else {
+				blkptr_t *bps = db->db.db_data;
+				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
+				    db->db.db_size);
+				/*
+				 * We want to verify that all the blkptrs in the
+				 * indirect block are holes, but we may have
+				 * automatically set up a few fields for them.
+				 * We iterate through each blkptr and verify
+				 * they only have those fields set.
+				 */
+				for (int i = 0;
+				    i < db->db.db_size / sizeof (blkptr_t);
+				    i++) {
+					blkptr_t *bp = &bps[i];
+					ASSERT(ZIO_CHECKSUM_IS_ZERO(
+					    &bp->blk_cksum));
+					ASSERT(
+					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
+					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
+					    DVA_IS_EMPTY(&bp->blk_dva[2]));
+					ASSERT0(bp->blk_fill);
+					ASSERT0(bp->blk_pad[0]);
+					ASSERT0(bp->blk_pad[1]);
+					ASSERT(!BP_IS_EMBEDDED(bp));
+					ASSERT(BP_IS_HOLE(bp));
+					ASSERT0(bp->blk_phys_birth);
+				}
 			}
 		}
 	}
+	DB_DNODE_EXIT(db);
 }
 #endif
 
 static void
-dbuf_update_data(dmu_buf_impl_t *db)
+dbuf_clear_data(dmu_buf_impl_t *db)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
-		ASSERT(!refcount_is_zero(&db->db_holds));
-		*db->db_user_data_ptr_ptr = db->db.db_data;
-	}
+	dbuf_evict_user(db);
+	ASSERT3P(db->db_buf, ==, NULL);
+	db->db.db_data = NULL;
+	if (db->db_state != DB_NOFILL)
+		db->db_state = DB_UNCACHED;
 }
 
 static void
 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
+	ASSERT(buf != NULL);
+
 	db->db_buf = buf;
-	if (buf != NULL) {
-		ASSERT(buf->b_data != NULL);
-		db->db.db_data = buf->b_data;
-		if (!arc_released(buf))
-			arc_set_callback(buf, dbuf_do_evict, db);
-		dbuf_update_data(db);
-	} else {
-		dbuf_evict_user(db);
-		db->db.db_data = NULL;
-		if (db->db_state != DB_NOFILL)
-			db->db_state = DB_UNCACHED;
-	}
+	ASSERT(buf->b_data != NULL);
+	db->db.db_data = buf->b_data;
 }
 
 /*
@@ -414,26 +850,54 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 {
 	arc_buf_t *abuf;
 
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	mutex_enter(&db->db_mtx);
 	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
 		int blksz = db->db.db_size;
+		spa_t *spa = db->db_objset->os_spa;
+
 		mutex_exit(&db->db_mtx);
-		abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz);
+		abuf = arc_loan_buf(spa, blksz);
 		bcopy(db->db.db_data, abuf->b_data, blksz);
 	} else {
 		abuf = db->db_buf;
 		arc_loan_inuse_buf(abuf, db);
-		dbuf_set_data(db, NULL);
+		db->db_buf = NULL;
+		dbuf_clear_data(db);
 		mutex_exit(&db->db_mtx);
 	}
 	return (abuf);
 }
 
+/*
+ * Calculate which level n block references the data at the level 0 offset
+ * provided.
+ */
 uint64_t
-dbuf_whichblock(dnode_t *dn, uint64_t offset)
+dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
 {
-	if (dn->dn_datablkshift) {
-		return (offset >> dn->dn_datablkshift);
+	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
+		/*
+		 * The level n blkid is equal to the level 0 blkid divided by
+		 * the number of level 0s in a level n block.
+		 *
+		 * The level 0 blkid is offset >> datablkshift =
+		 * offset / 2^datablkshift.
+		 *
+		 * The number of level 0s in a level n is the number of block
+		 * pointers in an indirect block, raised to the power of level.
+		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
+		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
+		 *
+		 * Thus, the level n blkid is: offset /
+		 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
+		 * = offset / 2^(datablkshift + level *
+		 *   (indblkshift - SPA_BLKPTRSHIFT))
+		 * = offset >> (datablkshift + level *
+		 *   (indblkshift - SPA_BLKPTRSHIFT))
+		 */
+		return (offset >> (dn->dn_datablkshift + level *
+		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
 	} else {
 		ASSERT3U(offset, <, dn->dn_datablksz);
 		return (0);
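
A worked instance of the derivation above, using illustrative values rather
than anything taken from this file: a 128K data block gives
datablkshift = 17, and indblkshift = 14 with SPA_BLKPTRSHIFT = 7 gives
2^(14 - 7) = 128 block pointers per indirect block.

	uint64_t l0 = offset >> 17;			/* level-0 blkid */
	uint64_t l1 = offset >> (17 + 1 * (14 - 7));	/* level-1 blkid */
	/* Each level-1 block covers 128 level-0 blocks, so l1 == l0 >> 7. */
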
@@ -465,24 +929,24 @@ dbuf_read_done(zio_t *zio, arc_buf_t *bu
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 	} else {
-		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT3P(db->db_buf, ==, NULL);
-		VERIFY(arc_buf_remove_ref(buf, db) == 1);
+		arc_buf_destroy(buf, db);
 		db->db_state = DB_UNCACHED;
 	}
 	cv_broadcast(&db->db_changed);
-	mutex_exit(&db->db_mtx);
-	dbuf_rele(db, NULL);
+	dbuf_rele_and_unlock(db, NULL);
 }
 
 static void
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
-	dnode_t *dn = db->db_dnode;
-	zbookmark_t zb;
-	uint32_t aflags = ARC_NOWAIT;
-	arc_buf_t *pbuf;
+	dnode_t *dn;
+	zbookmark_phys_t zb;
+	arc_flags_t aflags = ARC_FLAG_NOWAIT;
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
@@ -490,7 +954,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t
 	ASSERT(db->db_state == DB_UNCACHED);
 	ASSERT(db->db_buf == NULL);
 
-	if (db->db_blkid == DB_BONUS_BLKID) {
+	if (db->db_blkid == DMU_BONUS_BLKID) {
 		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 
 		ASSERT3U(bonuslen, <=, db->db.db_size);
@@ -500,7 +964,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t
 			bzero(db->db.db_data, DN_MAX_BONUSLEN);
 		if (bonuslen)
 			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
-		dbuf_update_data(db);
+		DB_DNODE_EXIT(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
@@ -516,47 +980,63 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t
 	    BP_IS_HOLE(db->db_blkptr)))) {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
-		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
+		dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa,
 		    db->db.db_size, db, type));
 		bzero(db->db.db_data, db->db.db_size);
+
+		if (db->db_blkptr != NULL && db->db_level > 0 &&
+		    BP_IS_HOLE(db->db_blkptr) &&
+		    db->db_blkptr->blk_birth != 0) {
+			blkptr_t *bps = db->db.db_data;
+			for (int i = 0; i < ((1 <<
+			    DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
+			    i++) {
+				blkptr_t *bp = &bps[i];
+				ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+				    1 << dn->dn_indblkshift);
+				BP_SET_LSIZE(bp,
+				    BP_GET_LEVEL(db->db_blkptr) == 1 ?
+				    dn->dn_datablksz :
+				    BP_GET_LSIZE(db->db_blkptr));
+				BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
+				BP_SET_LEVEL(bp,
+				    BP_GET_LEVEL(db->db_blkptr) - 1);
+				BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+			}
+		}
+		DB_DNODE_EXIT(db);
 		db->db_state = DB_CACHED;
-		*flags |= DB_RF_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
 	}
 
+	DB_DNODE_EXIT(db);
+
 	db->db_state = DB_READ;
 	mutex_exit(&db->db_mtx);
 
 	if (DBUF_IS_L2CACHEABLE(db))
-		aflags |= ARC_L2CACHE;
+		aflags |= ARC_FLAG_L2CACHE;
 
 	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
 	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	dbuf_add_ref(db, NULL);
-	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
-
-	if (db->db_parent)
-		pbuf = db->db_parent->db_buf;
-	else
-		pbuf = db->db_objset->os_phys_buf;
 
-	(void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
+	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
-	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
 	    &aflags, &zb);
-	if (aflags & ARC_CACHED)
-		*flags |= DB_RF_CACHED;
 }
 
 int
 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
 	int err = 0;
-	int havepzio = (zio != NULL);
-	int prefetch;
+	boolean_t havepzio = (zio != NULL);
+	boolean_t prefetch;
+	dnode_t *dn;
 
 	/*
 	 * We don't have to hold the mutex to check db_state because it
@@ -565,59 +1045,72 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
 	if (db->db_state == DB_NOFILL)
-		return (EIO);
+		return (SET_ERROR(EIO));
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
-		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
-	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
-	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
+	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
 	    DBUF_IS_CACHEABLE(db);
 
 	mutex_enter(&db->db_mtx);
 	if (db->db_state == DB_CACHED) {
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
-			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
-			    db->db.db_size, TRUE);
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
-			rw_exit(&db->db_dnode->dn_struct_rwlock);
+			rw_exit(&dn->dn_struct_rwlock);
+		DB_DNODE_EXIT(db);
 	} else if (db->db_state == DB_UNCACHED) {
-		if (zio == NULL) {
-			zio = zio_root(db->db_dnode->dn_objset->os_spa,
-			    NULL, NULL, ZIO_FLAG_CANFAIL);
-		}
-		dbuf_read_impl(db, zio, &flags);
+		spa_t *spa = dn->dn_objset->os_spa;
+
+		if (zio == NULL)
+			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+		dbuf_read_impl(db, zio, flags);
 
 		/* dbuf_read_impl has dropped db_mtx for us */
 
 		if (prefetch)
-			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
-			    db->db.db_size, flags & DB_RF_CACHED);
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
 
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
-			rw_exit(&db->db_dnode->dn_struct_rwlock);
+			rw_exit(&dn->dn_struct_rwlock);
+		DB_DNODE_EXIT(db);
 
 		if (!havepzio)
 			err = zio_wait(zio);
 	} else {
+		/*
+		 * Another reader came in while the dbuf was in flight
+		 * between UNCACHED and CACHED.  Either a writer will finish
+		 * writing the buffer (sending the dbuf to CACHED) or the
+		 * first reader's request will reach the read_done callback
+		 * and send the dbuf to CACHED.  Otherwise, a failure
+		 * occurred and the dbuf went to UNCACHED.
+		 */
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
-			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
-			    db->db.db_size, TRUE);
+			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
-			rw_exit(&db->db_dnode->dn_struct_rwlock);
+			rw_exit(&dn->dn_struct_rwlock);
+		DB_DNODE_EXIT(db);
 
+		/* Skip the wait per the caller's request. */
 		mutex_enter(&db->db_mtx);
 		if ((flags & DB_RF_NEVERWAIT) == 0) {
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL) {
 				ASSERT(db->db_state == DB_READ ||
 				    (flags & DB_RF_HAVESTRUCT) == 0);
+				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
+				    db, zio_t *, zio);
 				cv_wait(&db->db_changed, &db->db_mtx);
 			}
 			if (db->db_state == DB_UNCACHED)
-				err = EIO;
+				err = SET_ERROR(EIO);
 		}
 		mutex_exit(&db->db_mtx);
 	}
@@ -630,20 +1123,20 @@ static void
 dbuf_noread(dmu_buf_impl_t *db)
 {
 	ASSERT(!refcount_is_zero(&db->db_holds));
-	ASSERT(db->db_blkid != DB_BONUS_BLKID);
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	mutex_enter(&db->db_mtx);
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 	if (db->db_state == DB_UNCACHED) {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+		spa_t *spa = db->db_objset->os_spa;
 
 		ASSERT(db->db_buf == NULL);
 		ASSERT(db->db.db_data == NULL);
-		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
-		    db->db.db_size, db, type));
+		dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type));
 		db->db_state = DB_FILL;
 	} else if (db->db_state == DB_NOFILL) {
-		dbuf_set_data(db, NULL);
+		dbuf_clear_data(db);
 	} else {
 		ASSERT3U(db->db_state, ==, DB_CACHED);
 	}
@@ -675,18 +1168,18 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, ui
 
 	if (dr == NULL ||
 	    (dr->dt.dl.dr_data !=
-	    ((db->db_blkid  == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
 		return;
 
 	/*
 	 * If the last dirty record for this dbuf has not yet synced
 	 * and its referencing the dbuf data, either:
-	 * 	reset the reference to point to a new copy,
+	 *	reset the reference to point to a new copy,
 	 * or (if there a no active holders)
 	 *	just null out the current db_data pointer.
 	 */
 	ASSERT(dr->dr_txg >= txg - 2);
-	if (db->db_blkid == DB_BONUS_BLKID) {
+	if (db->db_blkid == DMU_BONUS_BLKID) {
 		/* Note that the data bufs here are zio_bufs */
 		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
@@ -694,11 +1187,13 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, ui
 	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 		int size = db->db.db_size;
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-		dr->dt.dl.dr_data = arc_buf_alloc(
-		    db->db_dnode->dn_objset->os_spa, size, db, type);
+		spa_t *spa = db->db_objset->os_spa;
+
+		dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type);
 		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 	} else {
-		dbuf_set_data(db, NULL);
+		db->db_buf = NULL;
+		dbuf_clear_data(db);
 	}
 }
 
@@ -713,17 +1208,19 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
 	ASSERT(db->db_level == 0);
 
-	if (db->db_blkid == DB_BONUS_BLKID ||
+	if (db->db_blkid == DMU_BONUS_BLKID ||
 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
 		return;
 
 	ASSERT(db->db_data_pending != dr);
 
 	/* free this block */
-	if (!BP_IS_HOLE(bp))
-		dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp);
+	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
+		zio_free(db->db_objset->os_spa, txg, bp);
 
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+	dr->dt.dl.dr_nopwrite = B_FALSE;
+
 	/*
 	 * Release the already-written buffer, so we leave it in
 	 * a consistent dirty state.  Note that all callers are
@@ -738,54 +1235,48 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
 /*
  * Evict (if its unreferenced) or clear (if its referenced) any level-0
  * data blocks in the free range, so that any future readers will find
- * empty blocks.  Also, if we happen accross any level-1 dbufs in the
- * range that have not already been marked dirty, mark them dirty so
- * they stay in memory.
+ * empty blocks.
  */
 void
-dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
+dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+    dmu_tx_t *tx)
 {
+	dmu_buf_impl_t db_search;
 	dmu_buf_impl_t *db, *db_next;
 	uint64_t txg = tx->tx_txg;
-	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-	uint64_t first_l1 = start >> epbs;
-	uint64_t last_l1 = end >> epbs;
-
-	if (end > dn->dn_maxblkid) {
-		end = dn->dn_maxblkid;
-		last_l1 = end >> epbs;
-	}
-	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
+	avl_index_t where;
+
+	if (end_blkid > dn->dn_maxblkid &&
+	    !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
+		end_blkid = dn->dn_maxblkid;
+	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
+
+	db_search.db_level = 0;
+	db_search.db_blkid = start_blkid;
+	db_search.db_state = DB_SEARCH;
+
 	mutex_enter(&dn->dn_dbufs_mtx);
-	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
-		db_next = list_next(&dn->dn_dbufs, db);
-		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+	db = avl_find(&dn->dn_dbufs, &db_search, &where);
+	ASSERT3P(db, ==, NULL);
 
-		if (db->db_level == 1 &&
-		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
-			mutex_enter(&db->db_mtx);
-			if (db->db_last_dirty &&
-			    db->db_last_dirty->dr_txg < txg) {
-				dbuf_add_ref(db, FTAG);
-				mutex_exit(&db->db_mtx);
-				dbuf_will_dirty(db, tx);
-				dbuf_rele(db, FTAG);
-			} else {
-				mutex_exit(&db->db_mtx);
-			}
-		}
+	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
 
-		if (db->db_level != 0)
-			continue;
-		dprintf_dbuf(db, "found buf %s\n", "");
-		if (db->db_blkid < start || db->db_blkid > end)
-			continue;
+	for (; db != NULL; db = db_next) {
+		db_next = AVL_NEXT(&dn->dn_dbufs, db);
+		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+		if (db->db_level != 0 || db->db_blkid > end_blkid) {
+			break;
+		}
+		ASSERT3U(db->db_blkid, >=, start_blkid);
 
 		/* found a level 0 buffer in the range */
-		if (dbuf_undirty(db, tx))
+		mutex_enter(&db->db_mtx);
+		if (dbuf_undirty(db, tx)) {
+			/* mutex has been dropped and dbuf destroyed */
 			continue;
+		}
 
-		mutex_enter(&db->db_mtx);
 		if (db->db_state == DB_UNCACHED ||
 		    db->db_state == DB_NOFILL ||
 		    db->db_state == DB_EVICTING) {
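
The search at the top of dbuf_free_range() above is the stock avl(9) idiom
for "first element >= key": probe with avl_find(), which records an
insertion point even on a miss, then step to the in-order neighbor with
avl_nearest().  A generic sketch of that idiom, assuming the same dn_dbufs
tree and a caller-supplied start_blkid:

	dmu_buf_impl_t key, *db;
	avl_index_t where;

	key.db_level = 0;
	key.db_blkid = start_blkid;
	key.db_state = DB_SEARCH;	/* sentinel; matches no real dbuf */

	VERIFY3P(avl_find(&dn->dn_dbufs, &key, &where), ==, NULL);
	for (db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
	    db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
		/* visits dbufs with db_blkid >= start_blkid, in order */
	}
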
@@ -801,7 +1292,7 @@ dbuf_free_range(dnode_t *dn, uint64_t st
 		}
 		if (refcount_count(&db->db_holds) == 0) {
 			ASSERT(db->db_buf);
-			dbuf_clear(db);
+			dbuf_destroy(db);
 			continue;
 		}
 		/* The dbuf is referenced */
@@ -815,7 +1306,8 @@ dbuf_free_range(dnode_t *dn, uint64_t st
 				 * size to reflect that this buffer may
 				 * contain new data when we sync.
 				 */
-				if (db->db_blkid > dn->dn_maxblkid)
+				if (db->db_blkid != DMU_SPILL_BLKID &&
+				    db->db_blkid > dn->dn_maxblkid)
 					dn->dn_maxblkid = db->db_blkid;
 				dbuf_unoverride(dr);
 			} else {
@@ -851,19 +1343,29 @@ dbuf_block_freeable(dmu_buf_impl_t *db)
 	 * We don't need any locking to protect db_blkptr:
 	 * If it's syncing, then db_last_dirty will be set
 	 * so we'll ignore db_blkptr.
+	 *
+	 * This logic ensures that only block births for
+	 * filled blocks are considered.
 	 */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	if (db->db_last_dirty)
+	if (db->db_last_dirty && (db->db_blkptr == NULL ||
+	    !BP_IS_HOLE(db->db_blkptr))) {
 		birth_txg = db->db_last_dirty->dr_txg;
-	else if (db->db_blkptr)
+	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
 		birth_txg = db->db_blkptr->blk_birth;
+	}
 
-	/* If we don't exist or are in a snapshot, we can't be freed */
-	if (birth_txg)
+	/*
+	 * If this block doesn't exist or is in a snapshot, it can't be freed.
+	 * Don't pass the bp to dsl_dataset_block_freeable() since we
+	 * are holding the db_mtx lock and might deadlock if we are
+	 * prefetching a dedup-ed block.
+	 */
+	if (birth_txg != 0)
 		return (ds == NULL ||
-		    dsl_dataset_block_freeable(ds, birth_txg));
+		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
 	else
-		return (FALSE);
+		return (B_FALSE);
 }
 
 void
@@ -872,14 +1374,18 @@ dbuf_new_size(dmu_buf_impl_t *db, int si
 	arc_buf_t *buf, *obuf;
 	int osize = db->db.db_size;
 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+	dnode_t *dn;
+
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
-	ASSERT(db->db_blkid != DB_BONUS_BLKID);
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 
 	/* XXX does *this* func really need the lock? */
-	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	/*
-	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
+	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
 	 * is OK, because there can be no other references to the db
 	 * when we are changing its size, so no concurrent DB_FILL can
 	 * be happening.
@@ -888,10 +1394,10 @@ dbuf_new_size(dmu_buf_impl_t *db, int si
 	 * XXX we should be doing a dbuf_read, checking the return
 	 * value and returning that up to our callers
 	 */
-	dbuf_will_dirty(db, tx);
+	dmu_buf_will_dirty(&db->db, tx);
 
 	/* create the data buffer for the new block */
-	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);
+	buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type);
 
 	/* copy old block data to the new block */
 	obuf = db->db_buf;
@@ -902,7 +1408,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int si
 
 	mutex_enter(&db->db_mtx);
 	dbuf_set_data(db, buf);
-	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
+	arc_buf_destroy(obuf, db);
 	db->db.db_size = size;
 
 	if (db->db_level == 0) {
@@ -911,14 +1417,54 @@ dbuf_new_size(dmu_buf_impl_t *db, int si
 	}
 	mutex_exit(&db->db_mtx);
 
-	dnode_willuse_space(db->db_dnode, size-osize, tx);
+	dnode_willuse_space(dn, size-osize, tx);
+	DB_DNODE_EXIT(db);
+}
+
+void
+dbuf_release_bp(dmu_buf_impl_t *db)
+{
+	objset_t *os = db->db_objset;
+
+	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+	ASSERT(arc_released(os->os_phys_buf) ||
+	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
+	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+	(void) arc_release(db->db_buf, db);
+}
+
+/*
+ * We already have a dirty record for this TXG, and we are being
+ * dirtied again.
+ */
+static void
+dbuf_redirty(dbuf_dirty_record_t *dr)
+{
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
+	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
+		/*
+		 * If this buffer has already been written out,
+		 * we now need to reset its state.
+		 */
+		dbuf_unoverride(dr);
+		if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+		    db->db_state != DB_NOFILL) {
+			/* Already released on initial dirty, so just thaw. */
+			ASSERT(arc_released(db->db_buf));
+			arc_buf_thaw(db->db_buf);
+		}
+	}
 }
 
 dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
-	dnode_t *dn = db->db_dnode;
-	objset_t *os = dn->dn_objset;
+	dnode_t *dn;
+	objset_t *os;
 	dbuf_dirty_record_t **drp, *dr;
 	int drop_struct_lock = FALSE;
 	boolean_t do_free_accounting = B_FALSE;
@@ -928,15 +1474,25 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	DMU_TX_DIRTY_BUF(tx, db);
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	/*
 	 * Shouldn't dirty a regular buffer in syncing context.  Private
 	 * objects may be dirtied in syncing context, but only if they
 	 * were already pre-dirtied in open context.
 	 */
+#ifdef DEBUG
+	if (dn->dn_objset->os_dsl_dataset != NULL) {
+		rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+		    RW_READER, FTAG);
+	}
 	ASSERT(!dmu_tx_is_syncing(tx) ||
 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    dn->dn_objset->os_dsl_dataset == NULL);
+	if (dn->dn_objset->os_dsl_dataset != NULL)
+		rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
+#endif
 	/*
 	 * We make this assert for private objects as well, but after we
 	 * check if we're already dirty.  They are allowed to re-dirty
@@ -961,15 +1517,27 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
 	 * initialize the objset.
 	 */
-	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
-	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
-		dn->dn_dirtyctx =
-		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
-		ASSERT(dn->dn_dirtyctx_firstset == NULL);
-		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+	if (dn->dn_dirtyctx == DN_UNDIRTIED) {
+		if (dn->dn_objset->os_dsl_dataset != NULL) {
+			rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+			    RW_READER, FTAG);
+		}
+		if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
+			dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ?
+			    DN_DIRTY_SYNC : DN_DIRTY_OPEN);
+			ASSERT(dn->dn_dirtyctx_firstset == NULL);
+			dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+		}
+		if (dn->dn_objset->os_dsl_dataset != NULL) {
+			rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+			    FTAG);
+		}
 	}
 	mutex_exit(&dn->dn_mtx);
 
+	if (db->db_blkid == DMU_SPILL_BLKID)
+		dn->dn_have_spill = B_TRUE;
+
 	/*
 	 * If this buffer is already dirty, we're done.
 	 */
@@ -979,16 +1547,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
 		drp = &dr->dr_next;
 	if (dr && dr->dr_txg == tx->tx_txg) {
-		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
-			/*
-			 * If this buffer has already been written out,
-			 * we now need to reset its state.
-			 */
-			dbuf_unoverride(dr);
-			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
-			    db->db_state != DB_NOFILL)
-				arc_buf_thaw(db->db_buf);
-		}
+		DB_DNODE_EXIT(db);
+
+		dbuf_redirty(dr);
 		mutex_exit(&db->db_mtx);
 		return (dr);
 	}
@@ -1014,13 +1575,20 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 	 * we already dirtied it in open context.  Hence we must make
 	 * this assertion only if we're not already dirty.
 	 */
+	os = dn->dn_objset;
+#ifdef DEBUG
+	if (dn->dn_objset->os_dsl_dataset != NULL)
+		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
+	if (dn->dn_objset->os_dsl_dataset != NULL)
+		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
+#endif
 	ASSERT(db->db.db_size != 0);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
-	if (db->db_blkid != DB_BONUS_BLKID) {
+	if (db->db_blkid != DMU_BONUS_BLKID) {
 		/*
 		 * Update the accounting.
 		 * Note: we delay "free accounting" until after we drop
@@ -1042,7 +1610,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 		void *data_old = db->db_buf;
 
 		if (db->db_state != DB_NOFILL) {
-			if (db->db_blkid == DB_BONUS_BLKID) {
+			if (db->db_blkid == DMU_BONUS_BLKID) {
 				dbuf_fix_old_data(db, tx->tx_txg);
 				data_old = db->db.db_data;
 			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
@@ -1068,6 +1636,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
+	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
+		dr->dr_accounted = db->db.db_size;
 	dr->dr_dbuf = db;
 	dr->dr_txg = tx->tx_txg;
 	dr->dr_next = *drp;
@@ -1078,9 +1648,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
 	 * happened after the free.
 	 */
-	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+	    db->db_blkid != DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
-		dnode_clear_range(dn, db->db_blkid, 1, tx);
+		if (dn->dn_free_ranges[txgoff] != NULL) {
+			range_tree_clear(dn->dn_free_ranges[txgoff],
+			    db->db_blkid, 1);
+		}
 		mutex_exit(&dn->dn_mtx);
 		db->db_freed_in_flight = FALSE;
 	}
@@ -1094,14 +1668,29 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 
 	mutex_exit(&db->db_mtx);
 
-	if (db->db_blkid == DB_BONUS_BLKID) {
+	if (db->db_blkid == DMU_BONUS_BLKID ||
+	    db->db_blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		dnode_setdirty(dn, tx);
+		DB_DNODE_EXIT(db);
 		return (dr);
-	} else if (do_free_accounting) {
+	}
+
+	/*
+	 * The dn_struct_rwlock prevents db_blkptr from changing
+	 * due to a write from syncing context completing
+	 * while we are running, so we want to acquire it before
+	 * looking at db_blkptr.
+	 */
+	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		drop_struct_lock = TRUE;
+	}
+
+	if (do_free_accounting) {
 		blkptr_t *bp = db->db_blkptr;
 		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
 		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
@@ -1113,14 +1702,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 		 * db_blkptr, but since this is just a guess,
 		 * it's OK if we get an odd answer.
 		 */
+		ddt_prefetch(os->os_spa, bp);
 		dnode_willuse_space(dn, -willfree, tx);
 	}
 
-	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
-		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		drop_struct_lock = TRUE;
-	}
-
 	if (db->db_level == 0) {
 		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
@@ -1136,6 +1721,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 
 			parent = dbuf_hold_level(dn, db->db_level+1,
 			    db->db_blkid >> epbs, FTAG);
+			ASSERT(parent != NULL);
 			parent_held = TRUE;
 		}
 		if (drop_struct_lock)
@@ -1146,7 +1732,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 			dbuf_rele(parent, FTAG);
 
 		mutex_enter(&db->db_mtx);
-		/*  possible race with dbuf_undirty() */
+		/*
+		 * Since we've dropped the mutex, it's possible that
+		 * dbuf_undirty() might have changed this out from under us.
+		 */
 		if (db->db_last_dirty == dr ||
 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
 			mutex_enter(&di->dt.di.dr_mtx);
@@ -1160,8 +1749,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 	} else {
 		ASSERT(db->db_level+1 == dn->dn_nlevels);
 		ASSERT(db->db_blkid < dn->dn_nblkptr);
-		ASSERT(db->db_parent == NULL ||
-		    db->db_parent == db->db_dnode->dn_dbuf);
+		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
@@ -1171,119 +1759,138 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 	}
 
 	dnode_setdirty(dn, tx);
+	DB_DNODE_EXIT(db);
 	return (dr);
 }
 
-static int
+/*
+ * Undirty a buffer in the transaction group referenced by the given
+ * transaction.  Return whether this evicted the dbuf.
+ */
+static boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
-	dnode_t *dn = db->db_dnode;
+	dnode_t *dn;
 	uint64_t txg = tx->tx_txg;
 	dbuf_dirty_record_t *dr, **drp;
 
 	ASSERT(txg != 0);
-	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 
-	mutex_enter(&db->db_mtx);
+	/*
+	 * Due to our use of dn_nlevels below, this can only be called
+	 * in open context, unless we are operating on the MOS.
+	 * From syncing context, dn_nlevels may be different from the
+	 * dn_nlevels used when dbuf was dirtied.
+	 */
+	ASSERT(db->db_objset ==
+	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+	ASSERT0(db->db_level);
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
 	/*
 	 * If this buffer is not dirty, we're done.
 	 */
 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
 		if (dr->dr_txg <= txg)
 			break;
-	if (dr == NULL || dr->dr_txg < txg) {
-		mutex_exit(&db->db_mtx);
-		return (0);
-	}
+	if (dr == NULL || dr->dr_txg < txg)
+		return (B_FALSE);
 	ASSERT(dr->dr_txg == txg);
 	ASSERT(dr->dr_dbuf == db);
 
-	/*
-	 * If this buffer is currently held, we cannot undirty
-	 * it, since one of the current holders may be in the
-	 * middle of an update.  Note that users of dbuf_undirty()
-	 * should not place a hold on the dbuf before the call.
-	 */
-	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
-		mutex_exit(&db->db_mtx);
-		/* Make sure we don't toss this buffer at sync phase */
-		mutex_enter(&dn->dn_mtx);
-		dnode_clear_range(dn, db->db_blkid, 1, tx);
-		mutex_exit(&dn->dn_mtx);
-		return (0);
-	}
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
 	ASSERT(db->db.db_size != 0);
 
-	/* XXX would be nice to fix up dn_towrite_space[] */
+	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
+	    dr->dr_accounted, txg);
 
 	*drp = dr->dr_next;
 
+	/*
+	 * Note that there are three places in dbuf_dirty()
+	 * where this dirty record may be put on a list.
+	 * Make sure to do a list_remove corresponding to
+	 * every one of those list_insert calls.
+	 */
 	if (dr->dr_parent) {
 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
-	} else if (db->db_level+1 == dn->dn_nlevels) {
+	} else if (db->db_blkid == DMU_SPILL_BLKID ||
+	    db->db_level + 1 == dn->dn_nlevels) {
 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
 		mutex_exit(&dn->dn_mtx);
 	}
+	DB_DNODE_EXIT(db);
 
-	if (db->db_level == 0) {
-		if (db->db_state != DB_NOFILL) {
-			dbuf_unoverride(dr);
+	if (db->db_state != DB_NOFILL) {
+		dbuf_unoverride(dr);
 
-			ASSERT(db->db_buf != NULL);
-			ASSERT(dr->dt.dl.dr_data != NULL);
-			if (dr->dt.dl.dr_data != db->db_buf)
-				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
-				    db) == 1);
-		}
-	} else {
 		ASSERT(db->db_buf != NULL);
-		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
-		mutex_destroy(&dr->dt.di.dr_mtx);
-		list_destroy(&dr->dt.di.dr_children);
+		ASSERT(dr->dt.dl.dr_data != NULL);
+		if (dr->dt.dl.dr_data != db->db_buf)
+			arc_buf_destroy(dr->dt.dl.dr_data, db);
 	}
+
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 
 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
-		arc_buf_t *buf = db->db_buf;
-
-		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
-		dbuf_set_data(db, NULL);
-		VERIFY(arc_buf_remove_ref(buf, db) == 1);
-		dbuf_evict(db);
-		return (1);
+		ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+		dbuf_destroy(db);
+		return (B_TRUE);
 	}
 
-	mutex_exit(&db->db_mtx);
-	return (0);
+	return (B_FALSE);
 }
 
-__attribute__((__weak__)) void
+void
 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	dbuf_will_dirty(db, tx);
-}
-
-void
-dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
-{
 	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
 
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
-	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
+	/*
+	 * Quick check for dirtiness.  For already dirty blocks, this
+	 * reduces runtime of this function by >90%, and overall performance
+	 * by 50% for some workloads (e.g. file deletion with indirect blocks
+	 * cached).
+	 */
+	mutex_enter(&db->db_mtx);
+	dbuf_dirty_record_t *dr;
+	for (dr = db->db_last_dirty;
+	    dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
+		/*
+		 * It's possible that it is already dirty but not cached,
+		 * because there are some calls to dbuf_dirty() that don't
+		 * go through dmu_buf_will_dirty().
+		 */
+		if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
+			/* This dbuf is already dirty and cached. */
+			dbuf_redirty(dr);
+			mutex_exit(&db->db_mtx);
+			return;
+		}
+	}
+	mutex_exit(&db->db_mtx);
+
+	DB_DNODE_ENTER(db);
+	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
 		rf |= DB_RF_HAVESTRUCT;
+	DB_DNODE_EXIT(db);
 	(void) dbuf_read(db, NULL, rf);
 	(void) dbuf_dirty(db, tx);
 }
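For context, the usual open-context sequence around dmu_buf_will_dirty() is a transaction assign followed by the dirty call; this is a sketch using the standard DMU transaction API, where os, object, offset, size, and db are placeholders rather than identifiers from this diff:

	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	dmu_tx_hold_write(tx, object, offset, size);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_buf_will_dirty(db, tx);	/* read + mark dirty in this txg */
	/* ... modify db->db_data under the tx ... */
	dmu_tx_commit(tx);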
@@ -1303,7 +1910,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dm
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
-	ASSERT(db->db_blkid != DB_BONUS_BLKID);
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(db->db_level == 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
@@ -1315,13 +1922,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dm
 	(void) dbuf_dirty(db, tx);
 }
 
-__attribute__((__weak__)) void
-dmu_buf_fill_done(dmu_buf_t *db_fake, dmu_tx_t *tx)
-{
-	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	dbuf_fill_done(db, tx);
-}
-
+#pragma weak dmu_buf_fill_done = dbuf_fill_done
 /* ARGSUSED */
 void
 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
@@ -1331,7 +1932,7 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_t
 
 	if (db->db_state == DB_FILL) {
 		if (db->db_level == 0 && db->db_freed_in_flight) {
-			ASSERT(db->db_blkid != DB_BONUS_BLKID);
+			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 			/* we were freed while filling */
 			/* XXX dbuf_undirty? */
 			bzero(db->db.db_data, db->db.db_size);
@@ -1343,6 +1944,43 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_t
 	mutex_exit(&db->db_mtx);
 }
 
+void
+dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+    bp_embedded_type_t etype, enum zio_compress comp,
+    int uncompressed_size, int compressed_size, int byteorder,
+    dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+	struct dirty_leaf *dl;
+	dmu_object_type_t type;
+
+	if (etype == BP_EMBEDDED_TYPE_DATA) {
+		ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
+		    SPA_FEATURE_EMBEDDED_DATA));
+	}
+
+	DB_DNODE_ENTER(db);
+	type = DB_DNODE(db)->dn_type;
+	DB_DNODE_EXIT(db);
+
+	ASSERT0(db->db_level);
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+	dmu_buf_will_not_fill(dbuf, tx);
+
+	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+	dl = &db->db_last_dirty->dt.dl;
+	encode_embedded_bp_compressed(&dl->dr_overridden_by,
+	    data, comp, uncompressed_size, compressed_size);
+	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
+	BP_SET_TYPE(&dl->dr_overridden_by, type);
+	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
+	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
+
+	dl->dr_override_state = DR_OVERRIDDEN;
+	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
+}
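dmu_buf_write_embedded() stores the (compressed) payload inside the block pointer itself, so it only applies when the data fits the BP payload area (BPE_PAYLOAD_SIZE, 112 bytes per sys/spa.h -- stated here as an assumption to verify against the headers). A hedged caller-side sketch; cdata, comp, and the size variables are placeholders:

	/* Only worthwhile when the compressed data fits in the bp itself. */
	if (compressed_size <= BPE_PAYLOAD_SIZE &&
	    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
		dmu_buf_write_embedded(dbuf, cdata, BP_EMBEDDED_TYPE_DATA,
		    comp, uncompressed_size, compressed_size,
		    ZFS_HOST_BYTEORDER, tx);
	}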
+
 /*
  * Directly assign a provided arc buf to a given dbuf if it's not referenced
  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
@@ -1351,8 +1989,7 @@ void
 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
 {
 	ASSERT(!refcount_is_zero(&db->db_holds));
-	ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT);
-	ASSERT(db->db_blkid != DB_BONUS_BLKID);
+	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(db->db_level == 0);
 	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
 	ASSERT(buf != NULL);
@@ -1374,7 +2011,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, a
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_dirty(db, tx);
 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
-		VERIFY(arc_buf_remove_ref(buf, db) == 1);
+		arc_buf_destroy(buf, db);
 		xuio_stat_wbuf_copied();
 		return;
 	}
@@ -1392,10 +2029,10 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, a
 				arc_release(db->db_buf, db);
 			}
 			dr->dt.dl.dr_data = buf;
-			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
+			arc_buf_destroy(db->db_buf, db);
 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
 			arc_release(db->db_buf, db);
-			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
+			arc_buf_destroy(db->db_buf, db);
 		}
 		db->db_buf = NULL;
 	}
@@ -1404,71 +2041,109 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, a
 	db->db_state = DB_FILL;
 	mutex_exit(&db->db_mtx);
 	(void) dbuf_dirty(db, tx);
-	dbuf_fill_done(db, tx);
+	dmu_buf_fill_done(&db->db, tx);
 }
 
-/*
- * "Clear" the contents of this dbuf.  This will mark the dbuf
- * EVICTING and clear *most* of its references.  Unfortunetely,
- * when we are not holding the dn_dbufs_mtx, we can't clear the
- * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
- * in this case.  For callers from the DMU we will usually see:
- *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
- * For the arc callback, we will usually see:
- * 	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
- * Sometimes, though, we will get a mix of these two:
- *	DMU: dbuf_clear()->arc_buf_evict()
- *	ARC: dbuf_do_evict()->dbuf_destroy()
- */
 void
-dbuf_clear(dmu_buf_impl_t *db)
+dbuf_destroy(dmu_buf_impl_t *db)
 {
-	dnode_t *dn = db->db_dnode;
+	dnode_t *dn;
 	dmu_buf_impl_t *parent = db->db_parent;
-	dmu_buf_impl_t *dndb = dn->dn_dbuf;
-	int dbuf_gone = FALSE;
+	dmu_buf_impl_t *dndb;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(refcount_is_zero(&db->db_holds));
 
-	dbuf_evict_user(db);
+	if (db->db_buf != NULL) {
+		arc_buf_destroy(db->db_buf, db);
+		db->db_buf = NULL;
+	}
 
-	if (db->db_state == DB_CACHED) {
+	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(db->db.db_data != NULL);
-		if (db->db_blkid == DB_BONUS_BLKID) {
-			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
-			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
-		}
-		db->db.db_data = NULL;
+		zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
+		arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		db->db_state = DB_UNCACHED;
 	}
 
+	dbuf_clear_data(db);
+
+	if (multilist_link_active(&db->db_cache_link)) {
+		multilist_remove(&dbuf_cache, db);
+		(void) refcount_remove_many(&dbuf_cache_size,
+		    db->db.db_size, db);
+	}
+
 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
 	ASSERT(db->db_data_pending == NULL);
 
 	db->db_state = DB_EVICTING;
 	db->db_blkptr = NULL;
 
-	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
-		list_remove(&dn->dn_dbufs, db);
+	/*
+	 * Now that db_state is DB_EVICTING, nobody else can find this via
+	 * the hash table.  We can now drop db_mtx, which allows us to
+	 * acquire the dn_dbufs_mtx.
+	 */
+	mutex_exit(&db->db_mtx);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	dndb = dn->dn_dbuf;
+	if (db->db_blkid != DMU_BONUS_BLKID) {
+		boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+		if (needlock)
+			mutex_enter(&dn->dn_dbufs_mtx);
+		avl_remove(&dn->dn_dbufs, db);
+		atomic_dec_32(&dn->dn_dbufs_count);
+		membar_producer();
+		DB_DNODE_EXIT(db);
+		if (needlock)
+			mutex_exit(&dn->dn_dbufs_mtx);
+		/*
+		 * Decrementing the dbuf count means that the hold corresponding
+		 * to the removed dbuf is no longer discounted in dnode_move(),
+		 * so the dnode cannot be moved until after we release the hold.
+		 * The membar_producer() ensures visibility of the decremented
+		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
+		 * release any lock.
+		 */
 		dnode_rele(dn, db);
-		db->db_dnode = NULL;
+		db->db_dnode_handle = NULL;
+
+		dbuf_hash_remove(db);
+	} else {
+		DB_DNODE_EXIT(db);
 	}
 
-	if (db->db_buf)
-		dbuf_gone = arc_buf_evict(db->db_buf);
+	ASSERT(refcount_is_zero(&db->db_holds));
 
-	if (!dbuf_gone)
-		mutex_exit(&db->db_mtx);
+	db->db_parent = NULL;
+
+	ASSERT(db->db_buf == NULL);
+	ASSERT(db->db.db_data == NULL);
+	ASSERT(db->db_hash_next == NULL);
+	ASSERT(db->db_blkptr == NULL);
+	ASSERT(db->db_data_pending == NULL);
+	ASSERT(!multilist_link_active(&db->db_cache_link));
+
+	kmem_cache_free(dbuf_kmem_cache, db);
+	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 
 	/*
-	 * If this dbuf is referened from an indirect dbuf,
+	 * If this dbuf is referenced from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
 	 */
 	if (parent && parent != dndb)
 		dbuf_rele(parent, db);
 }
 
+/*
+ * Note: While bpp will always be updated if the function returns success,
+ * parentp will not be updated if the dnode does not have dn_dbuf filled in;
+ * this happens when the dnode is the meta-dnode, or a userused or groupused
+ * object.
+ */
 static int
 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
     dmu_buf_impl_t **parentp, blkptr_t **bpp)
@@ -1478,7 +2153,20 @@ dbuf_findbp(dnode_t *dn, int level, uint
 	*parentp = NULL;
 	*bpp = NULL;
 
-	ASSERT(blkid != DB_BONUS_BLKID);
+	ASSERT(blkid != DMU_BONUS_BLKID);
+
+	if (blkid == DMU_SPILL_BLKID) {
+		mutex_enter(&dn->dn_mtx);
+		if (dn->dn_have_spill &&
+		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+			*bpp = &dn->dn_phys->dn_spill;
+		else
+			*bpp = NULL;
+		dbuf_add_ref(dn->dn_dbuf, NULL);
+		*parentp = dn->dn_dbuf;
+		mutex_exit(&dn->dn_mtx);
+		return (0);
+	}
 
 	if (dn->dn_phys->dn_nlevels == 0)
 		nlevels = 1;
@@ -1492,11 +2180,11 @@ dbuf_findbp(dnode_t *dn, int level, uint
 	if (level >= nlevels ||
 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
 		/* the buffer has no parent yet */
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
 	} else if (level < nlevels-1) {
 		/* this block is referenced from an indirect block */
 		int err = dbuf_hold_impl(dn, level+1,
-		    blkid >> epbs, fail_sparse, NULL, parentp);
+		    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
 		if (err)
 			return (err);
 		err = dbuf_read(*parentp, NULL,
@@ -1533,7 +2221,7 @@ dbuf_create(dnode_t *dn, uint8_t level, 
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(dn->dn_type != DMU_OT_NONE);
 
-	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+	db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
 
 	db->db_objset = os;
 	db->db.db_object = dn->dn_object;
@@ -1541,29 +2229,32 @@ dbuf_create(dnode_t *dn, uint8_t level, 
 	db->db_blkid = blkid;
 	db->db_last_dirty = NULL;
 	db->db_dirtycnt = 0;
-	db->db_dnode = dn;
+	db->db_dnode_handle = dn->dn_handle;
 	db->db_parent = parent;
 	db->db_blkptr = blkptr;
 
-	db->db_user_ptr = NULL;
-	db->db_user_data_ptr_ptr = NULL;
-	db->db_evict_func = NULL;
-	db->db_immediate_evict = 0;
-	db->db_freed_in_flight = 0;
+	db->db_user = NULL;
+	db->db_user_immediate_evict = FALSE;
+	db->db_freed_in_flight = FALSE;
+	db->db_pending_evict = FALSE;
 
-	if (blkid == DB_BONUS_BLKID) {
+	if (blkid == DMU_BONUS_BLKID) {
 		ASSERT3P(parent, ==, dn->dn_dbuf);
 		db->db.db_size = DN_MAX_BONUSLEN -
 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
-		db->db.db_offset = DB_BONUS_BLKID;
+		db->db.db_offset = DMU_BONUS_BLKID;
 		db->db_state = DB_UNCACHED;
 		/* the bonus dbuf is not placed in the hash table */
 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 		return (db);
+	} else if (blkid == DMU_SPILL_BLKID) {
+		db->db.db_size = (blkptr != NULL) ?
+		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
+		db->db.db_offset = 0;
 	} else {
 		int blocksize =
-		    db->db_level ? 1<<dn->dn_indblkshift :  dn->dn_datablksz;
+		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
 		db->db.db_size = blocksize;
 		db->db.db_offset = db->db_blkid * blocksize;
 	}
@@ -1579,11 +2270,12 @@ dbuf_create(dnode_t *dn, uint8_t level, 
 	db->db_state = DB_EVICTING;
 	if ((odb = dbuf_hash_insert(db)) != NULL) {
 		/* someone else inserted it first */
-		kmem_cache_free(dbuf_cache, db);
+		kmem_cache_free(dbuf_kmem_cache, db);
 		mutex_exit(&dn->dn_dbufs_mtx);
 		return (odb);
 	}
-	list_insert_head(&dn->dn_dbufs, db);
+	avl_add(&dn->dn_dbufs, db);
+
 	db->db_state = DB_UNCACHED;
 	mutex_exit(&dn->dn_dbufs_mtx);
 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
@@ -1594,120 +2286,241 @@ dbuf_create(dnode_t *dn, uint8_t level, 
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    refcount_count(&dn->dn_holds) > 0);
 	(void) refcount_add(&dn->dn_holds, db);
+	atomic_inc_32(&dn->dn_dbufs_count);
 
 	dprintf_dbuf(db, "db=%p\n", db);
 
 	return (db);
 }
 
-static int
-dbuf_do_evict(void *private)
-{
-	arc_buf_t *buf = private;
-	dmu_buf_impl_t *db = buf->b_private;
+typedef struct dbuf_prefetch_arg {
+	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
+	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
+	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
+	int dpa_curlevel; /* The current level that we're reading */
+	dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
+	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
+	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
+	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+} dbuf_prefetch_arg_t;
 
-	if (!MUTEX_HELD(&db->db_mtx))
-		mutex_enter(&db->db_mtx);
+/*
+ * Actually issue the prefetch read for the block given.
+ */
+static void
+dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
+{
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+		return;
 
-	ASSERT(refcount_is_zero(&db->db_holds));
+	arc_flags_t aflags =
+	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
-	if (db->db_state != DB_EVICTING) {
-		ASSERT(db->db_state == DB_CACHED);
-		DBUF_VERIFY(db);
-		db->db_buf = NULL;
-		dbuf_evict(db);
-	} else {
-		mutex_exit(&db->db_mtx);
-		dbuf_destroy(db);
-	}
-	return (0);
+	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
+	ASSERT(dpa->dpa_zio != NULL);
+	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+	    dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+	    &aflags, &dpa->dpa_zb);
 }
 
+/*
+ * Called when an indirect block above our prefetch target is read in.  This
+ * will either read in the next indirect block down the tree or issue the actual
+ * prefetch if the next block down is our target.
+ */
 static void
-dbuf_destroy(dmu_buf_impl_t *db)
+dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
 {
-	ASSERT(refcount_is_zero(&db->db_holds));
+	dbuf_prefetch_arg_t *dpa = private;
 
-	if (db->db_blkid != DB_BONUS_BLKID) {
-		/*
-		 * If this dbuf is still on the dn_dbufs list,
-		 * remove it from that list.
-		 */
-		if (db->db_dnode) {
-			dnode_t *dn = db->db_dnode;
+	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
+	ASSERT3S(dpa->dpa_curlevel, >, 0);
 
-			mutex_enter(&dn->dn_dbufs_mtx);
-			list_remove(&dn->dn_dbufs, db);
-			mutex_exit(&dn->dn_dbufs_mtx);
-
-			dnode_rele(dn, db);
-			db->db_dnode = NULL;
+	/*
+	 * The dpa_dnode is only valid if we are called with a NULL
+	 * zio. This indicates that the arc_read() returned without
+	 * first calling zio_read() to issue a physical read. Once
+	 * a physical read is made, the dpa_dnode must be invalidated,
+	 * as the locks guarding it may have been dropped. If the
+	 * dpa_dnode is still valid, then we want to add it to the dbuf
+	 * cache. To do so, we must hold the dbuf associated with the block
+	 * we just prefetched, read its contents so that we associate it
+	 * with an arc_buf_t, and then release it.
+	 */
+	if (zio != NULL) {
+		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
+		if (zio->io_flags & ZIO_FLAG_RAW) {
+			ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
+		} else {
+			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
 		}
-		dbuf_hash_remove(db);
-	}
-	db->db_parent = NULL;
-	db->db_buf = NULL;
+		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
 
-	ASSERT(!list_link_active(&db->db_link));
-	ASSERT(db->db.db_data == NULL);
-	ASSERT(db->db_hash_next == NULL);
-	ASSERT(db->db_blkptr == NULL);
-	ASSERT(db->db_data_pending == NULL);
+		dpa->dpa_dnode = NULL;
+	} else if (dpa->dpa_dnode != NULL) {
+		uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
+		    (dpa->dpa_epbs * (dpa->dpa_curlevel -
+		    dpa->dpa_zb.zb_level));
+		dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
+		    dpa->dpa_curlevel, curblkid, FTAG);
+		(void) dbuf_read(db, NULL,
+		    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
+		dbuf_rele(db, FTAG);
+	}
+
+	dpa->dpa_curlevel--;
+
+	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
+	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
+	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
+	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
+	if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
+		kmem_free(dpa, sizeof (*dpa));
+	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
+		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
+		dbuf_issue_final_prefetch(dpa, bp);
+		kmem_free(dpa, sizeof (*dpa));
+	} else {
+		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+		zbookmark_phys_t zb;
+
+		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+
+		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
+		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
+
+		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+		    &iter_aflags, &zb);
+	}
 
-	kmem_cache_free(dbuf_cache, db);
-	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+	arc_buf_destroy(abuf, private);
 }
 
+/*
+ * Issue prefetch reads for the given block on the given level.  If the indirect
+ * blocks above that block are not in memory, we will read them in
+ * asynchronously.  As a result, this call never blocks waiting for a read to
+ * complete.
+ */
 void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+    arc_flags_t aflags)
 {
-	dmu_buf_impl_t *db = NULL;
-	blkptr_t *bp = NULL;
+	blkptr_t bp;
+	int epbs, nlevels, curlevel;
+	uint64_t curblkid;
 
-	ASSERT(blkid != DB_BONUS_BLKID);
+	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
+	if (blkid > dn->dn_maxblkid)
+		return;
+
 	if (dnode_block_freed(dn, blkid))
 		return;
 
-	/* dbuf_find() returns with db_mtx held */
-	if (db = dbuf_find(dn, 0, blkid)) {
-		if (refcount_count(&db->db_holds) > 0) {
-			/*
-			 * This dbuf is active.  We assume that it is
-			 * already CACHED, or else about to be either
-			 * read or filled.
-			 */
-			mutex_exit(&db->db_mtx);
-			return;
-		}
+	/*
+	 * This dnode hasn't been written to disk yet, so there's nothing to
+	 * prefetch.
+	 */
+	nlevels = dn->dn_phys->dn_nlevels;
+	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
+		return;
+
+	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
+		return;
+
+	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
+	    level, blkid);
+	if (db != NULL) {
 		mutex_exit(&db->db_mtx);
-		db = NULL;
+		/*
+		 * This dbuf already exists.  It is either CACHED, or
+		 * (we assume) about to be read or filled.
+		 */
+		return;
+	}
+
+	/*
+	 * Find the closest ancestor (indirect block) of the target block
+	 * that is present in the cache.  In this indirect block, we will
+	 * find the bp that is at curlevel, curblkid.
+	 */
+	curlevel = level;
+	curblkid = blkid;
+	while (curlevel < nlevels - 1) {
+		int parent_level = curlevel + 1;
+		uint64_t parent_blkid = curblkid >> epbs;
+		dmu_buf_impl_t *db;
+
+		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
+		    FALSE, TRUE, FTAG, &db) == 0) {
+			blkptr_t *bpp = db->db_buf->b_data;
+			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
+			dbuf_rele(db, FTAG);
+			break;
+		}
+
+		curlevel = parent_level;
+		curblkid = parent_blkid;
 	}
 
-	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
-		if (bp && !BP_IS_HOLE(bp)) {
-			arc_buf_t *pbuf;
-			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
-			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
-			zbookmark_t zb;
+	if (curlevel == nlevels - 1) {
+		/* No cached indirect blocks found. */
+		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
+		bp = dn->dn_phys->dn_blkptr[curblkid];
+	}
+	if (BP_IS_HOLE(&bp))
+		return;
 
-			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
-			    dn->dn_object, 0, blkid);
+	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
 
-			if (db)
-				pbuf = db->db_buf;
-			else
-				pbuf = dn->dn_objset->os_phys_buf;
+	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
+	    ZIO_FLAG_CANFAIL);
 
-			(void) arc_read(NULL, dn->dn_objset->os_spa,
-			    bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
-			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
-			    &aflags, &zb);
-		}
-		if (db)
-			dbuf_rele(db, NULL);
+	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
+	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+	    dn->dn_object, level, blkid);
+	dpa->dpa_curlevel = curlevel;
+	dpa->dpa_prio = prio;
+	dpa->dpa_aflags = aflags;
+	dpa->dpa_spa = dn->dn_objset->os_spa;
+	dpa->dpa_dnode = dn;
+	dpa->dpa_epbs = epbs;
+	dpa->dpa_zio = pio;
+
+	/*
+	 * If we have the indirect just above us, no need to do the asynchronous
+	 * prefetch chain; we'll just run the last step ourselves.  If we're at
+	 * a higher level, though, we want to issue the prefetches for all the
+	 * indirect blocks asynchronously, so we can go on with whatever we were
+	 * doing.
+	 */
+	if (curlevel == level) {
+		ASSERT3U(curblkid, ==, blkid);
+		dbuf_issue_final_prefetch(dpa, &bp);
+		kmem_free(dpa, sizeof (*dpa));
+	} else {
+		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+		zbookmark_phys_t zb;
+
+		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+		    dn->dn_object, curlevel, curblkid);
+		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+		    &iter_aflags, &zb);
 	}
+	/*
+	 * We use pio here instead of dpa_zio since it's possible that
+	 * dpa may have already been freed.
+	 */
+	zio_nowait(pio);
 }
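A hedged usage sketch of the new interface, roughly what a dmu_prefetch()-style caller would do: hold dn_struct_rwlock as reader and issue one call per level-0 block (start_blkid and nblks are placeholders):

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	for (uint64_t i = 0; i < nblks; i++) {
		dbuf_prefetch(dn, 0, start_blkid + i,
		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREFETCH);
	}
	rw_exit(&dn->dn_struct_rwlock);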
 
 /*
@@ -1715,29 +2528,33 @@ dbuf_prefetch(dnode_t *dn, uint64_t blki
  * Note: dn_struct_rwlock must be held.
  */
 int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
+    boolean_t fail_sparse, boolean_t fail_uncached,
     void *tag, dmu_buf_impl_t **dbp)
 {
 	dmu_buf_impl_t *db, *parent = NULL;
 
-	ASSERT(blkid != DB_BONUS_BLKID);
+	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT3U(dn->dn_nlevels, >, level);
 
 	*dbp = NULL;
 top:
 	/* dbuf_find() returns with db_mtx held */
-	db = dbuf_find(dn, level, blkid);
+	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
 
 	if (db == NULL) {
 		blkptr_t *bp = NULL;
 		int err;
 
+		if (fail_uncached)
+			return (SET_ERROR(ENOENT));
+
 		ASSERT3P(parent, ==, NULL);
 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
 		if (fail_sparse) {
 			if (err == 0 && bp && BP_IS_HOLE(bp))
-				err = ENOENT;
+				err = SET_ERROR(ENOENT);
 			if (err) {
 				if (parent)
 					dbuf_rele(parent, NULL);
@@ -1749,19 +2566,14 @@ top:
 		db = dbuf_create(dn, level, blkid, parent, bp);
 	}
 
-	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
-		arc_buf_add_ref(db->db_buf, db);
-		if (db->db_buf->b_data == NULL) {
-			dbuf_clear(db);
-			if (parent) {
-				dbuf_rele(parent, NULL);
-				parent = NULL;
-			}
-			goto top;
-		}
-		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+	if (fail_uncached && db->db_state != DB_CACHED) {
+		mutex_exit(&db->db_mtx);
+		return (SET_ERROR(ENOENT));
 	}
 
+	if (db->db_buf != NULL)
+		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+
 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
 
 	/*
@@ -1769,7 +2581,7 @@ top:
 	 * still referencing it from db_data, we need to make a copy
 	 * of it in case we decide we want to dirty it again in this txg.
 	 */
-	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    db->db_state == DB_CACHED && db->db_data_pending) {
 		dbuf_dirty_record_t *dr = db->db_data_pending;
@@ -1778,15 +2590,20 @@ top:
 			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
 			dbuf_set_data(db,
-			    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+			    arc_alloc_buf(dn->dn_objset->os_spa,
 			    db->db.db_size, db, type));
 			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
 			    db->db.db_size);
 		}
 	}
 
+	if (multilist_link_active(&db->db_cache_link)) {
+		ASSERT(refcount_is_zero(&db->db_holds));
+		multilist_remove(&dbuf_cache, db);
+		(void) refcount_remove_many(&dbuf_cache_size,
+		    db->db.db_size, db);
+	}
 	(void) refcount_add(&db->db_holds, tag);
-	dbuf_update_data(db);
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);
 
@@ -1794,7 +2611,7 @@ top:
 	if (parent)
 		dbuf_rele(parent, NULL);
 
-	ASSERT3P(db->db_dnode, ==, dn);
+	ASSERT3P(DB_DNODE(db), ==, dn);
 	ASSERT3U(db->db_blkid, ==, blkid);
 	ASSERT3U(db->db_level, ==, level);
 	*dbp = db;
@@ -1805,16 +2622,14 @@ top:
 dmu_buf_impl_t *
 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
 {
-	dmu_buf_impl_t *db;
-	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
-	return (err ? NULL : db);
+	return (dbuf_hold_level(dn, 0, blkid, tag));
 }
 
 dmu_buf_impl_t *
 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *db;
-	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
 	return (err ? NULL : db);
 }
 
@@ -1824,30 +2639,77 @@ dbuf_create_bonus(dnode_t *dn)
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	ASSERT(dn->dn_bonus == NULL);
-	dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
+	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
 }
 
-__attribute__((__weak__)) void
-dmu_buf_add_ref(dmu_buf_t *db_fake, void *tag)
+int
+dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	dbuf_add_ref(db, tag);
+	dnode_t *dn;
+
+	if (db->db_blkid != DMU_SPILL_BLKID)
+		return (SET_ERROR(ENOTSUP));
+	if (blksz == 0)
+		blksz = SPA_MINBLOCKSIZE;
+	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	dbuf_new_size(db, blksz, tx);
+	rw_exit(&dn->dn_struct_rwlock);
+	DB_DNODE_EXIT(db);
+
+	return (0);
 }
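The size normalization above in numbers: a request of 0 is promoted to SPA_MINBLOCKSIZE, and everything else is rounded up to the next 512-byte multiple:

	/* SPA_MINBLOCKSIZE == 512 */
	P2ROUNDUP(3000, 512);	/* -> 3072 */
	P2ROUNDUP(4096, 512);	/* -> 4096 (already aligned) */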
 
 void
+dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
+}
+
+#pragma weak dmu_buf_add_ref = dbuf_add_ref
+void
 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
 {
 	int64_t holds = refcount_add(&db->db_holds, tag);
-	ASSERT(holds > 1);
+	ASSERT3S(holds, >, 1);
 }
 
-__attribute__((__weak__)) void
-dmu_buf_rele(dmu_buf_t *db_fake, void *tag)
+#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
+boolean_t
+dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
+    void *tag)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	dbuf_rele(db, tag);
+	dmu_buf_impl_t *found_db;
+	boolean_t result = B_FALSE;
+
+	if (db->db_blkid == DMU_BONUS_BLKID)
+		found_db = dbuf_find_bonus(os, obj);
+	else
+		found_db = dbuf_find(os, obj, 0, blkid);
+
+	if (found_db != NULL) {
+		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
+			(void) refcount_add(&db->db_holds, tag);
+			result = B_TRUE;
+		}
+		mutex_exit(&db->db_mtx);
+	}
+	return (result);
 }
 
+/*
+ * If you call dbuf_rele() you had better not be referencing the dnode handle
+ * unless you have some other direct or indirect hold on the dnode. (An indirect
+ * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
+ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
+ * dnode's parent dbuf evicting its dnode handles.
+ */
 void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
@@ -1855,6 +2717,12 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag)
 	dbuf_rele_and_unlock(db, tag);
 }
 
+void
+dmu_buf_rele(dmu_buf_t *db, void *tag)
+{
+	dbuf_rele((dmu_buf_impl_t *)db, tag);
+}
+
 /*
  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
  * db_dirtycnt and db_holds to be updated atomically.
@@ -1867,6 +2735,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db,
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	DBUF_VERIFY(db);
 
+	/*
+	 * Remove the reference to the dbuf before removing its hold on the
+	 * dnode so we can guarantee in dnode_move() that a referenced bonus
+	 * buffer has a corresponding dnode hold.
+	 */
 	holds = refcount_remove(&db->db_holds, tag);
 	ASSERT(holds >= 0);
 
@@ -1874,17 +2747,47 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db,
 	 * We can't freeze indirects if there is a possibility that they
 	 * may be modified in the current syncing context.
 	 */
-	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
+	if (db->db_buf != NULL &&
+	    holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
 		arc_buf_freeze(db->db_buf);
+	}
 
 	if (holds == db->db_dirtycnt &&
-	    db->db_level == 0 && db->db_immediate_evict)
+	    db->db_level == 0 && db->db_user_immediate_evict)
 		dbuf_evict_user(db);
 
 	if (holds == 0) {
-		if (db->db_blkid == DB_BONUS_BLKID) {
+		if (db->db_blkid == DMU_BONUS_BLKID) {
+			dnode_t *dn;
+			boolean_t evict_dbuf = db->db_pending_evict;
+
+			/*
+			 * If the dnode moves here, we cannot cross this
+			 * barrier until the move completes.
+			 */
+			DB_DNODE_ENTER(db);
+
+			dn = DB_DNODE(db);
+			atomic_dec_32(&dn->dn_dbufs_count);
+
+			/*
+			 * Decrementing the dbuf count means that the bonus
+			 * buffer's dnode hold is no longer discounted in
+			 * dnode_move(). The dnode cannot move until after
+			 * the dnode_rele() below.
+			 */
+			DB_DNODE_EXIT(db);
+
+			/*
+			 * Do not reference db after its lock is dropped.
+			 * Another thread may evict it.
+			 */
 			mutex_exit(&db->db_mtx);
-			dnode_rele(db->db_dnode, db);
+
+			if (evict_dbuf)
+				dnode_evict_bonus(dn);
+
+			dnode_rele(dn, db);
 		} else if (db->db_buf == NULL) {
 			/*
 			 * This is a special case: we never associated this
@@ -1892,34 +2795,47 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db,
 			 */
 			ASSERT(db->db_state == DB_UNCACHED ||
 			    db->db_state == DB_NOFILL);
-			dbuf_evict(db);
+			dbuf_destroy(db);
 		} else if (arc_released(db->db_buf)) {
-			arc_buf_t *buf = db->db_buf;
 			/*
 			 * This dbuf has anonymous data associated with it.
 			 */
-			dbuf_set_data(db, NULL);
-			VERIFY(arc_buf_remove_ref(buf, db) == 1);
-			dbuf_evict(db);
+			dbuf_destroy(db);
 		} else {
-			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
-			if (!DBUF_IS_CACHEABLE(db))
-				dbuf_clear(db);
-			else
+			boolean_t do_arc_evict = B_FALSE;
+			blkptr_t bp;
+			spa_t *spa = dmu_objset_spa(db->db_objset);
+
+			if (!DBUF_IS_CACHEABLE(db) &&
+			    db->db_blkptr != NULL &&
+			    !BP_IS_HOLE(db->db_blkptr) &&
+			    !BP_IS_EMBEDDED(db->db_blkptr)) {
+				do_arc_evict = B_TRUE;
+				bp = *db->db_blkptr;
+			}
+
+			if (!DBUF_IS_CACHEABLE(db) ||
+			    db->db_pending_evict) {
+				dbuf_destroy(db);
+			} else if (!multilist_link_active(&db->db_cache_link)) {
+				multilist_insert(&dbuf_cache, db);
+				(void) refcount_add_many(&dbuf_cache_size,
+				    db->db.db_size, db);
 				mutex_exit(&db->db_mtx);
+
+				dbuf_evict_notify();
+			}
+
+			if (do_arc_evict)
+				arc_freed(spa, &bp);
 		}
 	} else {
 		mutex_exit(&db->db_mtx);
 	}
-}
 
-__attribute__((__weak__)) uint64_t
-dmu_buf_refcount(dmu_buf_t *db_fake)
-{
-	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	return dbuf_refcount(db);
 }
 
+#pragma weak dmu_buf_refcount = dbuf_refcount
 uint64_t
 dbuf_refcount(dmu_buf_impl_t *db)
 {
@@ -1927,56 +2843,57 @@ dbuf_refcount(dmu_buf_impl_t *db)
 }
 
 void *
-dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
-    dmu_buf_evict_func_t *evict_func)
+dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
+    dmu_buf_user_t *new_user)
 {
-	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
-	    user_data_ptr_ptr, evict_func));
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	mutex_enter(&db->db_mtx);
+	dbuf_verify_user(db, DBVU_NOT_EVICTING);
+	if (db->db_user == old_user)
+		db->db_user = new_user;
+	else
+		old_user = db->db_user;
+	dbuf_verify_user(db, DBVU_NOT_EVICTING);
+	mutex_exit(&db->db_mtx);
+
+	return (old_user);
 }
 
 void *
-dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
-    dmu_buf_evict_func_t *evict_func)
+dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
-	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
-	db->db_immediate_evict = TRUE;
-	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
-	    user_data_ptr_ptr, evict_func));
+	return (dmu_buf_replace_user(db_fake, NULL, user));
 }
 
 void *
-dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
-    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	ASSERT(db->db_level == 0);
-
-	ASSERT((user_ptr == NULL) == (evict_func == NULL));
 
-	mutex_enter(&db->db_mtx);
-
-	if (db->db_user_ptr == old_user_ptr) {
-		db->db_user_ptr = user_ptr;
-		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
-		db->db_evict_func = evict_func;
-
-		dbuf_update_data(db);
-	} else {
-		old_user_ptr = db->db_user_ptr;
-	}
+	db->db_user_immediate_evict = TRUE;
+	return (dmu_buf_set_user(db_fake, user));
+}
 
-	mutex_exit(&db->db_mtx);
-	return (old_user_ptr);
+void *
+dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+	return (dmu_buf_replace_user(db_fake, user, NULL));
 }
 
 void *
 dmu_buf_get_user(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	ASSERT(!refcount_is_zero(&db->db_holds));
 
-	return (db->db_user_ptr);
+	dbuf_verify_user(db, DBVU_NOT_EVICTING);
+	return (db->db_user);
+}
+
+void
+dmu_buf_user_evict_wait()
+{
+	taskq_wait(dbu_evict_taskq);
 }
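The dmu_buf_user_t API expects consumers to embed the user record in their own state, register it, and free that state from the eviction callback, which runs on dbu_evict_taskq (hence the taskq_wait() above). A sketch under the single-callback dmu_buf_init_user() signature of this vintage -- that signature, and all my_* names, are assumptions, since later code splits the callback into sync/async variants:

	typedef struct my_state {
		dmu_buf_user_t	ms_dbu;	/* must be embedded, not pointed to */
		dmu_buf_t	*ms_db;
	} my_state_t;

	static void
	my_state_evict(void *arg)
	{
		my_state_t *ms = arg;

		kmem_free(ms, sizeof (*ms));
	}

	/* Attach; ms_db is cleared automatically when the dbuf is evicted. */
	dmu_buf_init_user(&ms->ms_dbu, my_state_evict, &ms->ms_db);
	VERIFY3P(dmu_buf_set_user(db, &ms->ms_dbu), ==, NULL);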
 
 boolean_t
@@ -1987,11 +2904,40 @@ dmu_buf_freeable(dmu_buf_t *dbuf)
 
 	if (db->db_blkptr)
 		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
-		    db->db_blkptr->blk_birth);
+		    db->db_blkptr, db->db_blkptr->blk_birth);
 
 	return (res);
 }
 
+blkptr_t *
+dmu_buf_get_blkptr(dmu_buf_t *db)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+	return (dbi->db_blkptr);
+}
+
+objset_t *
+dmu_buf_get_objset(dmu_buf_t *db)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+	return (dbi->db_objset);
+}
+
+dnode_t *
+dmu_buf_dnode_enter(dmu_buf_t *db)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+	DB_DNODE_ENTER(dbi);
+	return (DB_DNODE(dbi));
+}
+
+void
+dmu_buf_dnode_exit(dmu_buf_t *db)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+	DB_DNODE_EXIT(dbi);
+}
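These wrappers give DMU consumers safe access to the dnode behind a dmu_buf_t: every dereference must be bracketed by the enter/exit pair so dnode_move() cannot relocate the dnode mid-access. A minimal sketch:

	dnode_t *dn = dmu_buf_dnode_enter(db);
	uint64_t maxblkid = dn->dn_maxblkid;	/* only valid while entered */
	dmu_buf_dnode_exit(db);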
+
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
@@ -2001,6 +2947,11 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_i
 	if (db->db_blkptr != NULL)
 		return;
 
+	if (db->db_blkid == DMU_SPILL_BLKID) {
+		db->db_blkptr = &dn->dn_phys->dn_spill;
+		BP_ZERO(db->db_blkptr);
+		return;
+	}
 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
 		/*
 		 * This buffer was allocated at a time when there was
@@ -2020,8 +2971,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_i
 		if (parent == NULL) {
 			mutex_exit(&db->db_mtx);
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
-			(void) dbuf_hold_impl(dn, db->db_level+1,
-			    db->db_blkid >> epbs, FALSE, db, &parent);
+			parent = dbuf_hold_level(dn, db->db_level + 1,
+			    db->db_blkid >> epbs, db);
 			rw_exit(&dn->dn_struct_rwlock);
 			mutex_enter(&db->db_mtx);
 			db->db_parent = parent;
@@ -2036,7 +2987,7 @@ static void
 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
-	dnode_t *dn = db->db_dnode;
+	dnode_t *dn;
 	zio_t *zio;
 
 	ASSERT(dmu_tx_is_syncing(tx));
@@ -2048,17 +2999,23 @@ dbuf_sync_indirect(dbuf_dirty_record_t *
 	ASSERT(db->db_level > 0);
 	DBUF_VERIFY(db);
 
+	/* Read the block if it hasn't been read yet. */
 	if (db->db_buf == NULL) {
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 		mutex_enter(&db->db_mtx);
 	}
 	ASSERT3U(db->db_state, ==, DB_CACHED);
-	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	ASSERT(db->db_buf != NULL);
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	/* Indirect block size must match what the dnode thinks it is. */
+	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	dbuf_check_blkptr(dn, db);
+	DB_DNODE_EXIT(db);
 
+	/* Provide the pending dirty record to child dbufs */
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
@@ -2066,7 +3023,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *
 
 	zio = dr->dr_zio;
 	mutex_enter(&dr->dt.di.dr_mtx);
-	dbuf_sync_list(&dr->dt.di.dr_children, tx);
+	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 	mutex_exit(&dr->dt.di.dr_mtx);
 	zio_nowait(zio);
@@ -2077,8 +3034,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, 
 {
 	arc_buf_t **datap = &dr->dt.dl.dr_data;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
-	dnode_t *dn = db->db_dnode;
-	objset_t *os = dn->dn_objset;
+	dnode_t *dn;
+	objset_t *os;
 	uint64_t txg = tx->tx_txg;
 
 	ASSERT(dmu_tx_is_syncing(tx));
@@ -2101,19 +3058,30 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, 
 	}
 	DBUF_VERIFY(db);
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	if (db->db_blkid == DMU_SPILL_BLKID) {
+		mutex_enter(&dn->dn_mtx);
+		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
+		mutex_exit(&dn->dn_mtx);
+	}
+
 	/*
 	 * If this is a bonus buffer, simply copy the bonus data into the
 	 * dnode.  It will be written out when the dnode is synced (and it
 	 * will be synced, since it must have been dirty for dbuf_sync to
 	 * be called).
 	 */
-	if (db->db_blkid == DB_BONUS_BLKID) {
+	if (db->db_blkid == DMU_BONUS_BLKID) {
 		dbuf_dirty_record_t **drp;
 
 		ASSERT(*datap != NULL);
-		ASSERT3U(db->db_level, ==, 0);
+		ASSERT0(db->db_level);
 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+		DB_DNODE_EXIT(db);
+
 		if (*datap != db->db.db_data) {
 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
@@ -2125,6 +3093,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, 
 		ASSERT(dr->dr_next == NULL);
 		ASSERT(dr->dr_dbuf == db);
 		*drp = dr->dr_next;
+		if (dr->dr_dbuf->db_level != 0) {
+			list_destroy(&dr->dt.di.dr_children);
+			mutex_destroy(&dr->dt.di.dr_mtx);
+		}
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
@@ -2132,6 +3104,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, 
 		return;
 	}
 
+	os = dn->dn_objset;
+
 	/*
 	 * This function may have dropped the db_mtx lock allowing a dmu_sync
 	 * operation to sneak in. As a result, we need to ensure that we
@@ -2141,7 +3115,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, 
 	dbuf_check_blkptr(dn, db);
 
 	/*
-	 * If this buffer is in the middle of an immdiate write,
+	 * If this buffer is in the middle of an immediate write,
 	 * wait for the synchronous IO to complete.
 	 */
 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
@@ -2168,7 +3142,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, 
 		 */
 		int blksz = arc_buf_size(*datap);
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
+		*datap = arc_alloc_buf(os->os_spa, blksz, db, type);
 		bcopy(db->db.db_data, (*datap)->b_data, blksz);
 	}
 	db->db_data_pending = dr;
@@ -2178,14 +3152,24 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, 
 	dbuf_write(dr, *datap, tx);
 
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
-	if (dn->dn_object == DMU_META_DNODE_OBJECT)
+	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
 		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
-	else
+		DB_DNODE_EXIT(db);
+	} else {
+		/*
+		 * Although zio_nowait() does not "wait for an IO", it does
+		 * initiate the IO. If this is an empty write, the IO could
+		 * complete before zio_nowait() returns. We need to
+		 * DB_DNODE_EXIT() first in case
+		 * zio_nowait() invalidates the dbuf.
+		 */
+		DB_DNODE_EXIT(db);
 		zio_nowait(dr->dr_zio);
+	}
 }
 
 void
-dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;
 
@@ -2202,6 +3186,10 @@ dbuf_sync_list(list_t *list, dmu_tx_t *t
 			    DMU_META_DNODE_OBJECT);
 			break;
 		}
+		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+		}
 		list_remove(list, dr);
 		if (dr->dr_dbuf->db_level > 0)
 			dbuf_sync_indirect(dr, tx);
@@ -2215,33 +3203,46 @@ static void
 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
+	dnode_t *dn;
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
-	dnode_t *dn = db->db_dnode;
 	spa_t *spa = zio->io_spa;
 	int64_t delta;
 	uint64_t fill = 0;
 	int i;
 
-	ASSERT(db->db_blkptr == bp);
+	ASSERT3P(db->db_blkptr, !=, NULL);
+	ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
 	zio->io_prev_space_delta = delta;
 
-	if (BP_IS_HOLE(bp)) {
-		ASSERT(bp->blk_fill == 0);
-		return;
+	if (bp->blk_birth != 0) {
+		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+		    BP_GET_TYPE(bp) == dn->dn_type) ||
+		    (db->db_blkid == DMU_SPILL_BLKID &&
+		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
+		    BP_IS_EMBEDDED(bp));
+		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
 	}
 
-	ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
-	ASSERT(BP_GET_LEVEL(bp) == db->db_level);
-
 	mutex_enter(&db->db_mtx);
 
+#ifdef ZFS_DEBUG
+	if (db->db_blkid == DMU_SPILL_BLKID) {
+		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+		ASSERT(!(BP_IS_HOLE(bp)) &&
+		    db->db_blkptr == &dn->dn_phys->dn_spill);
+	}
+#endif
+
 	if (db->db_level == 0) {
 		mutex_enter(&dn->dn_mtx);
-		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
+		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+		    db->db_blkid != DMU_SPILL_BLKID)
 			dn->dn_phys->dn_maxblkid = db->db_blkid;
 		mutex_exit(&dn->dn_mtx);
 
@@ -2253,7 +3254,11 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *
 					fill++;
 			}
 		} else {
-			fill = 1;
+			if (BP_IS_HOLE(bp)) {
+				fill = 0;
+			} else {
+				fill = 1;
+			}
 		}
 	} else {
 		blkptr_t *ibp = db->db.db_data;
@@ -2261,13 +3266,90 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *
 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
 			if (BP_IS_HOLE(ibp))
 				continue;
-			fill += ibp->blk_fill;
+			fill += BP_GET_FILL(ibp);
 		}
 	}
+	DB_DNODE_EXIT(db);
 
-	bp->blk_fill = fill;
+	if (!BP_IS_EMBEDDED(bp))
+		bp->blk_fill = fill;
 
 	mutex_exit(&db->db_mtx);
+
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	*db->db_blkptr = *bp;
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+/* ARGSUSED */
+/*
+ * This function gets called just prior to running through the compression
+ * stage of the zio pipeline. If we're an indirect block comprised of only
+ * holes, then we want this indirect to be compressed away to a hole. In
+ * order to do that we must zero out any information about the holes that
+ * this indirect points to before we try to compress it.
+ */
+static void
+dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+	dnode_t *dn;
+	blkptr_t *bp;
+	uint64_t i;
+	int epbs;
+
+	ASSERT3U(db->db_level, >, 0);
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+	/* Determine if all our children are holes */
+	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
+		if (!BP_IS_HOLE(bp))
+			break;
+	}
+
+	/*
+	 * If all the children are holes, then zero them all out so that
+	 * we may get compressed away.
+	 */
+	if (i == 1 << epbs) {
+		/* didn't find any non-holes */
+		bzero(db->db.db_data, db->db.db_size);
+	}
+	DB_DNODE_EXIT(db);
+}
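For scale: with the default 128K indirect blocks, epbs = 17 - 7 = 10, so the hole scan above inspects at most 1024 block pointers, and an all-holes indirect gets its full 128K buffer zeroed so the compression stage can collapse it to a hole:

	/* illustrative defaults: dn_indblkshift 17, SPA_BLKPTRSHIFT 7 */
	epbs = 17 - 7;		/* 10, so 1 << 10 == 1024 blkptrs scanned */
	/* all holes -> bzero(db->db.db_data, 131072) -> compresses away */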
+
+/*
+ * The SPA will call this callback several times for each zio - once
+ * for every physical child i/o (zio->io_phys_children times).  This
+ * allows the DMU to monitor the progress of each logical i/o.  For example,
+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
+ * block.  There may be a long delay before all copies/fragments are completed,
+ * so this callback allows us to retire dirty space gradually, as the physical
+ * i/os complete.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+	dmu_buf_impl_t *db = arg;
+	objset_t *os = db->db_objset;
+	dsl_pool_t *dp = dmu_objset_pool(os);
+	dbuf_dirty_record_t *dr;
+	int delta = 0;
+
+	dr = db->db_data_pending;
+	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+	/*
+	 * The callback will be called io_phys_children times.  Retire one
+	 * portion of our dirty space each time we are called.  Any rounding
+	 * error will be cleaned up by dsl_pool_sync()'s call to
+	 * dsl_pool_undirty_space().
+	 */
+	delta = dr->dr_accounted / zio->io_phys_children;
+	dsl_pool_undirty_space(dp, delta, zio->io_txg);
 }
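A worked example of the accounting: a 128K dirty record written with copies=2 has io_phys_children == 2, so each callback retires 131072 / 2 = 65536 bytes. With three children the integer division truncates, and the remainder is swept up later:

	/* dr_accounted = 131072, io_phys_children = 3 */
	delta = 131072 / 3;	/* 43690 per callback; 3 * 43690 = 131070 */
	/* remaining 2 bytes are undirtied by dsl_pool_sync() */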
 
 /* ARGSUSED */
@@ -2275,22 +3357,23 @@ static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
-	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
-	dnode_t *dn = db->db_dnode;
-	objset_t *os = dn->dn_objset;
-	uint64_t txg = zio->io_txg;
+	blkptr_t *bp = db->db_blkptr;
+	objset_t *os = db->db_objset;
+	dmu_tx_t *tx = os->os_synctx;
 	dbuf_dirty_record_t **drp, *dr;
 
-	ASSERT3U(zio->io_error, ==, 0);
+	ASSERT0(zio->io_error);
 	ASSERT(db->db_blkptr == bp);
 
-	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+	/*
+	 * For nopwrites and rewrites we ensure that the bp matches our
+	 * original and bypass all the accounting.
+	 */
+	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
 		ASSERT(BP_EQUAL(bp, bp_orig));
 	} else {
 		dsl_dataset_t *ds = os->os_dsl_dataset;
-		dmu_tx_t *tx = os->os_synctx;
-
 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, bp, tx);
 	}
@@ -2303,33 +3386,46 @@ dbuf_write_done(zio_t *zio, arc_buf_t *b
 	while ((dr = *drp) != db->db_data_pending)
 		drp = &dr->dr_next;
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
-	ASSERT(dr->dr_txg == txg);
 	ASSERT(dr->dr_dbuf == db);
 	ASSERT(dr->dr_next == NULL);
 	*drp = dr->dr_next;
 
+#ifdef ZFS_DEBUG
+	if (db->db_blkid == DMU_SPILL_BLKID) {
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(db);
+		dn = DB_DNODE(db);
+		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+		    db->db_blkptr == &dn->dn_phys->dn_spill);
+		DB_DNODE_EXIT(db);
+	}
+#endif
+
 	if (db->db_level == 0) {
-		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 		if (db->db_state != DB_NOFILL) {
 			if (dr->dt.dl.dr_data != db->db_buf)
-				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
-				    db) == 1);
-			else if (!arc_released(db->db_buf))
-				arc_set_callback(db->db_buf, dbuf_do_evict, db);
+				arc_buf_destroy(dr->dt.dl.dr_data, db);
 		}
 	} else {
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(db);
+		dn = DB_DNODE(db);
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
-		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
 		if (!BP_IS_HOLE(db->db_blkptr)) {
 			int epbs =
 			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+			ASSERT3U(db->db_blkid, <=,
+			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
 			    db->db.db_size);
-			ASSERT3U(dn->dn_phys->dn_maxblkid
-			    >> (db->db_level * epbs), >=, db->db_blkid);
-			arc_set_callback(db->db_buf, dbuf_do_evict, db);
 		}
+		DB_DNODE_EXIT(db);
 		mutex_destroy(&dr->dt.di.dr_mtx);
 		list_destroy(&dr->dt.di.dr_children);
 	}
@@ -2339,7 +3435,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *b
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
-	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
 }
 
 static void
@@ -2381,17 +3477,25 @@ dbuf_write_override_done(zio_t *zio)
 	dbuf_write_done(zio, NULL, db);
 }
 
+/* Issue I/O to commit a dirty buffer to disk. */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
-	dnode_t *dn = db->db_dnode;
-	objset_t *os = dn->dn_objset;
+	dnode_t *dn;
+	objset_t *os;
 	dmu_buf_impl_t *parent = db->db_parent;
 	uint64_t txg = tx->tx_txg;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
+	int wp_flag = 0;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	os = dn->dn_objset;
 
 	if (db->db_state != DB_NOFILL) {
 		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
@@ -2404,20 +3508,31 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 			if (BP_IS_HOLE(db->db_blkptr)) {
 				arc_buf_thaw(data);
 			} else {
-				arc_release(data, db);
+				dbuf_release_bp(db);
 			}
 		}
 	}
 
 	if (parent != dn->dn_dbuf) {
+		/* Our parent is an indirect block. */
+		/* We have a dirty parent that has been scheduled for write. */
 		ASSERT(parent && parent->db_data_pending);
+		/* Our parent's buffer is one level closer to the dnode. */
 		ASSERT(db->db_level == parent->db_level-1);
+		/*
+		 * We're about to modify our parent's db_data by modifying
+		 * our block pointer, so the parent must be released.
+		 */
 		ASSERT(arc_released(parent->db_buf));
 		zio = parent->db_data_pending->dr_zio;
 	} else {
-		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
-		ASSERT3P(db->db_blkptr, ==,
-		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
+		/* Our parent is the dnode itself. */
+		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
+		    db->db_blkid != DMU_SPILL_BLKID) ||
+		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
+		if (db->db_blkid != DMU_SPILL_BLKID)
+			ASSERT3P(db->db_blkptr, ==,
+			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		zio = dn->dn_zio;
 	}
 
@@ -2429,32 +3544,64 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
-	dmu_write_policy(os, dn, db->db_level,
-	    db->db_state == DB_NOFILL ? WP_NOFILL : 0, &zp);
+	if (db->db_blkid == DMU_SPILL_BLKID)
+		wp_flag = WP_SPILL;
+	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+
+	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+	DB_DNODE_EXIT(db);
+
+	/*
+	 * We copy the blkptr now (rather than when we instantiate the dirty
+	 * record), because its value can change between open context and
+	 * syncing context. We do not need to hold dn_struct_rwlock to read
+	 * db_blkptr because we are in syncing context.
+	 */
+	dr->dr_bp_copy = *db->db_blkptr;
+
+	if (db->db_level == 0 &&
+	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+		/*
+		 * The BP for this block has been provided by open context
+		 * (by dmu_sync() or dmu_buf_write_embedded()).
+		 */
+		void *contents = (data != NULL) ? data->b_data : NULL;
 
-	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
-		ASSERT(db->db_state != DB_NOFILL);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
-		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
-		    dbuf_write_override_ready, dbuf_write_override_done, dr,
-		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+		    &dr->dr_bp_copy, contents, db->db.db_size, &zp,
+		    dbuf_write_override_ready, NULL, NULL,
+		    dbuf_write_override_done,
+		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
-		    dr->dt.dl.dr_copies);
+		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
 		mutex_exit(&db->db_mtx);
 	} else if (db->db_state == DB_NOFILL) {
-		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
+		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
-		    db->db_blkptr, NULL, db->db.db_size, &zp,
-		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+		    &dr->dr_bp_copy, NULL, db->db.db_size, &zp,
+		    dbuf_write_nofill_ready, NULL, NULL,
+		    dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
 	} else {
 		ASSERT(arc_released(data));
+
+		/*
+		 * For indirect blocks, we want to setup the children
+		 * ready callback so that we can properly handle an indirect
+		 * block that only contains holes.
+		 */
+		arc_done_func_t *children_ready_cb = NULL;
+		if (db->db_level != 0)
+			children_ready_cb = dbuf_write_children_ready;
+
 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
-		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
-		    dbuf_write_ready, dbuf_write_done, db,
+		    &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
+		    &zp, dbuf_write_ready, children_ready_cb,
+		    dbuf_write_physdone, dbuf_write_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 	}
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c,v
retrieving revision 1.2
diff -u -p -r1.2 ddt.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c	27 Mar 2014 15:50:48 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c	22 Nov 2015 17:22:33 -0000
@@ -20,8 +20,8 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -35,6 +35,17 @@
 #include <sys/dsl_pool.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
+#include <sys/dsl_scan.h>
+
+/*
+ * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
+ */
+int zfs_dedup_prefetch = 1;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS DEDUP");
+SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RWTUN, &zfs_dedup_prefetch,
+    0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed");
 
 static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
 	&ddt_zap_ops,
@@ -53,10 +64,11 @@ ddt_object_create(ddt_t *ddt, enum ddt_t
 	spa_t *spa = ddt->ddt_spa;
 	objset_t *os = ddt->ddt_os;
 	uint64_t *objectp = &ddt->ddt_object[type][class];
-	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
+	    ZCHECKSUM_FLAG_DEDUP;
 	char name[DDT_NAMELEN];
 
-	ddt_object_name(ddt, type, class, name, sizeof(name));
+	ddt_object_name(ddt, type, class, name);
 
 	ASSERT(*objectp == 0);
 	VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
@@ -77,16 +89,18 @@ ddt_object_destroy(ddt_t *ddt, enum ddt_
 	spa_t *spa = ddt->ddt_spa;
 	objset_t *os = ddt->ddt_os;
 	uint64_t *objectp = &ddt->ddt_object[type][class];
+	uint64_t count;
 	char name[DDT_NAMELEN];
 
-	ddt_object_name(ddt, type, class, name, sizeof(name));
+	ddt_object_name(ddt, type, class, name);
 
 	ASSERT(*objectp != 0);
-	ASSERT(ddt_object_count(ddt, type, class) == 0);
+	VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0);
 	ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
 	VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
 	VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
 	VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+	bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
 
 	*objectp = 0;
 }
@@ -94,36 +108,64 @@ ddt_object_destroy(ddt_t *ddt, enum ddt_
 static int
 ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 {
+	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+	dmu_object_info_t doi;
+	uint64_t count;
 	char name[DDT_NAMELEN];
 	int error;
 
-	ddt_object_name(ddt, type, class, name, sizeof(name));
+	ddt_object_name(ddt, type, class, name);
 
 	error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
 	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
 
-	if (error)
+	if (error != 0)
 		return (error);
 
-	error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+	VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
-	    &ddt->ddt_histogram[type][class]);
+	    &ddt->ddt_histogram[type][class]));
 
-	ASSERT(error == 0);
-	return (error);
+	/*
+	 * Seed the cached statistics.
+	 */
+	VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+
+	error = ddt_object_count(ddt, type, class, &count);
+	if (error)
+		return error;
+
+	ddo->ddo_count = count;
+	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+	return (0);
 }
 
 static void
 ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
     dmu_tx_t *tx)
 {
+	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+	dmu_object_info_t doi;
+	uint64_t count;
 	char name[DDT_NAMELEN];
 
-	ddt_object_name(ddt, type, class, name, sizeof(name));
+	ddt_object_name(ddt, type, class, name);
 
 	VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
 	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
 	    &ddt->ddt_histogram[type][class], tx) == 0);
+
+	/*
+	 * Cache DDT statistics; this is the only time they'll change.
+	 */
+	VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+	VERIFY(ddt_object_count(ddt, type, class, &count) == 0);
+
+	ddo->ddo_count = count;
+	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
 }
 
 static int
@@ -131,13 +173,24 @@ ddt_object_lookup(ddt_t *ddt, enum ddt_t
     ddt_entry_t *dde)
 {
 	if (!ddt_object_exists(ddt, type, class))
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
 
 	return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
 	    ddt->ddt_object[type][class], dde));
 }
 
-static int
+static void
+ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    ddt_entry_t *dde)
+{
+	if (!ddt_object_exists(ddt, type, class))
+		return;
+
+	ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
+	    ddt->ddt_object[type][class], dde);
+}
+
+int
 ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
     ddt_entry_t *dde, dmu_tx_t *tx)
 {
@@ -167,13 +220,13 @@ ddt_object_walk(ddt_t *ddt, enum ddt_typ
 	    ddt->ddt_object[type][class], dde, walk));
 }
 
-uint64_t
-ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+int
+ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, uint64_t *count)
 {
 	ASSERT(ddt_object_exists(ddt, type, class));
 
 	return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
-	    ddt->ddt_object[type][class]));
+	    ddt->ddt_object[type][class], count));
 }
 
 int
@@ -181,7 +234,7 @@ ddt_object_info(ddt_t *ddt, enum ddt_typ
     dmu_object_info_t *doi)
 {
 	if (!ddt_object_exists(ddt, type, class))
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
 
 	return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
 	    doi));
@@ -195,9 +248,9 @@ ddt_object_exists(ddt_t *ddt, enum ddt_t
 
 void
 ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
-    char *name, size_t namelen)
+    char *name)
 {
-	(void) snprintf(name, namelen, DMU_POOL_DDT,
+	(void) sprintf(name, DMU_POOL_DDT,
 	    zio_checksum_table[ddt->ddt_checksum].ci_name,
 	    ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
 }
@@ -222,12 +275,13 @@ ddt_bp_create(enum zio_checksum checksum
 		ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
 
 	bp->blk_cksum = ddk->ddk_cksum;
+	bp->blk_fill = 1;
 
 	BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
 	BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
 	BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
 	BP_SET_CHECKSUM(bp, checksum);
-	BP_SET_TYPE(bp, DMU_OT_NONE);
+	BP_SET_TYPE(bp, DMU_OT_DEDUP);
 	BP_SET_LEVEL(bp, 0);
 	BP_SET_DEDUP(bp, 0);
 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
@@ -362,7 +416,7 @@ ddt_stat_update(ddt_t *ddt, ddt_entry_t 
 
 	ddt_stat_generate(ddt, dde, &dds);
 
-	bucket = highbit(dds.dds_ref_blocks) - 1;
+	bucket = highbit64(dds.dds_ref_blocks) - 1;
 	ASSERT(bucket >= 0);
 
 	ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
@@ -400,31 +454,28 @@ ddt_histogram_empty(const ddt_histogram_
 }
 
 void
-ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo)
+ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
 {
-	dmu_object_info_t doi;
-	uint64_t count;
-	int error;
-
+	/* Sum the statistics we cached in ddt_object_sync(). */
 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
 		ddt_t *ddt = spa->spa_ddt[c];
 		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 			for (enum ddt_class class = 0; class < DDT_CLASSES;
 			    class++) {
-				error = ddt_object_info(ddt, type, class, &doi);
-				if (error == ENOENT)
-					continue;
-				ASSERT3U(error, ==, 0);
-
-				count = ddt_object_count(ddt, type, class);
-				ddo->ddo_count += count;
-				ddo->ddo_dspace +=
-				    (doi.doi_physical_blocks_512 << 9) / count;
-				ddo->ddo_mspace += doi.doi_fill_count *
-				    doi.doi_data_block_size / count;
+				ddt_object_t *ddo =
+				    &ddt->ddt_object_stats[type][class];
+				ddo_total->ddo_count += ddo->ddo_count;
+				ddo_total->ddo_dspace += ddo->ddo_dspace;
+				ddo_total->ddo_mspace += ddo->ddo_mspace;
 			}
 		}
 	}
+
+	/* ... and compute the averages. */
+	if (ddo_total->ddo_count != 0) {
+		ddo_total->ddo_dspace /= ddo_total->ddo_count;
+		ddo_total->ddo_mspace /= ddo_total->ddo_count;
+	}
 }
 
 void
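
Note what the rewrite above buys besides caching: the old loop added (dspace / count) per class, a sum of per-class averages, while the new code sums raw totals and divides once at the end, which is the correct weighted average (and no longer divides by a per-class count that could be zero). A standalone sketch of the corrected arithmetic, with illustrative names:

#include <stdint.h>

struct class_stats {
	uint64_t count;		/* entries in this table */
	uint64_t dspace;	/* total on-disk bytes, not an average */
};

/* Sum raw totals across classes, then divide once. */
static uint64_t
avg_dspace_per_entry(const struct class_stats *cs, int nclasses)
{
	uint64_t count = 0, dspace = 0;

	for (int i = 0; i < nclasses; i++) {
		count += cs[i].count;
		dspace += cs[i].dspace;
	}
	return (count != 0 ? dspace / count : 0);
}
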
@@ -436,7 +487,7 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_
 			for (enum ddt_class class = 0; class < DDT_CLASSES;
 			    class++) {
 				ddt_histogram_add(ddh,
-				    &ddt->ddt_histogram[type][class]);
+				    &ddt->ddt_histogram_cache[type][class]);
 			}
 		}
 	}
@@ -543,7 +594,10 @@ ddt_compress(void *src, uchar_t *dst, si
 		bcopy(src, dst, s_len);
 	}
 
-	*version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
+	*version = cpfunc;
+	/* CONSTCOND */
+	if (ZFS_HOST_BYTEORDER)
+		*version |= DDT_COMPRESS_BYTEORDER_MASK;
 
 	return (c_len + 1);
 }
@@ -560,7 +614,8 @@ ddt_decompress(uchar_t *src, void *dst, 
 	else
 		bcopy(src, dst, d_len);
 
-	if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
+	if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
+	    (ZFS_HOST_BYTEORDER != 0))
 		byteswap_uint64_array(dst, d_len);
 }
 
@@ -689,6 +744,30 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *b
 	return (dde);
 }
 
+void
+ddt_prefetch(spa_t *spa, const blkptr_t *bp)
+{
+	ddt_t *ddt;
+	ddt_entry_t dde;
+
+	if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
+		return;
+
+	/*
+	 * We only remove the DDT once all tables are empty and only
+	 * prefetch dedup blocks when there are entries in the DDT.
+	 * Thus no locking is required as the DDT can't disappear on us.
+	 */
+	ddt = ddt_select(spa, bp);
+	ddt_key_fill(&dde.dde_key, bp);
+
+	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+			ddt_object_prefetch(ddt, type, class, &dde);
+		}
+	}
+}
+
 int
 ddt_entry_compare(const void *x1, const void *x2)
 {
@@ -761,15 +840,21 @@ ddt_load(spa_t *spa)
 		return (error == ENOENT ? 0 : error);
 
 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
 		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
 			for (enum ddt_class class = 0; class < DDT_CLASSES;
 			    class++) {
-				ddt_t *ddt = spa->spa_ddt[c];
 				error = ddt_object_load(ddt, type, class);
 				if (error != 0 && error != ENOENT)
 					return (error);
 			}
 		}
+
+		/*
+		 * Seed the cached histograms.
+		 */
+		bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+		    sizeof (ddt->ddt_histogram));
 	}
 
 	return (0);
@@ -967,10 +1052,17 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *
 			ddt_object_create(ddt, ntype, nclass, tx);
 		VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
 
-		if (dp->dp_scrub_func != SCRUB_FUNC_NONE &&
-		    oclass > nclass &&
-		    nclass <= dp->dp_scrub_ddt_class_max)
-			dsl_pool_scrub_ddt_entry(dp, ddt->ddt_checksum, dde);
+		/*
+		 * If the class changes, the order that we scan this bp
+		 * changes.  If it decreases, we could miss it, so
+		 * scan it right now.  (This covers both class changing
+		 * while we are doing ddt_walk(), and when we are
+		 * traversing.)
+		 */
+		if (nclass < oclass) {
+			dsl_scan_ddt_entry(dp->dp_scan,
+			    ddt->ddt_checksum, dde, tx);
+		}
 	}
 }
 
@@ -984,15 +1076,12 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx,
 	if (avl_numnodes(&ddt->ddt_tree) == 0)
 		return;
 
-	ASSERT(spa_sync_pass(spa) == 1);
 	ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
 
 	if (spa->spa_ddt_stat_object == 0) {
-		spa->spa_ddt_stat_object = zap_create(ddt->ddt_os,
-		    DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx);
-		VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
-		    &spa->spa_ddt_stat_object, tx) == 0);
+		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
+		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_DDT_STATS, tx);
 	}
 
 	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
@@ -1001,14 +1090,23 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx,
 	}
 
 	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+		uint64_t add, count = 0;
+		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+			if (ddt_object_exists(ddt, type, class)) {
+				ddt_object_sync(ddt, type, class, tx);
+				VERIFY(ddt_object_count(ddt, type, class,
+				    &add) == 0);
+				count += add;
+			}
+		}
 		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
-			if (!ddt_object_exists(ddt, type, class))
-				continue;
-			ddt_object_sync(ddt, type, class, tx);
-			if (ddt_object_count(ddt, type, class) == 0)
+			if (count == 0 && ddt_object_exists(ddt, type, class))
 				ddt_object_destroy(ddt, type, class, tx);
 		}
 	}
+
+	bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+	    sizeof (ddt->ddt_histogram));
 }
 
 void
@@ -1049,6 +1147,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb
 					    ddb->ddb_type, ddb->ddb_class,
 					    &ddb->ddb_cursor, dde);
 				}
+				dde->dde_type = ddb->ddb_type;
+				dde->dde_class = ddb->ddb_class;
 				if (error == 0)
 					return (0);
 				if (error != ENOENT)
@@ -1060,5 +1160,5 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb
 		ddb->ddb_type = 0;
 	} while (++ddb->ddb_class < DDT_CLASSES);
 
-	return (ENOENT);
+	return (SET_ERROR(ENOENT));
 }
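
The last hunk stamps dde_type and dde_class on every entry handed back, so a caller resuming from a saved ddt_bookmark_t still knows which table an entry came from. A hedged sketch of the walk idiom this supports:

/* Sketch: visit every DDT entry in a pool, resumably. */
static int
example_walk(spa_t *spa)
{
	ddt_bookmark_t ddb;
	ddt_entry_t dde;
	int error;

	bzero(&ddb, sizeof (ddb));	/* start from the beginning */
	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
		/* dde.dde_type / dde.dde_class identify the source table */
	}
	return (error == ENOENT ? 0 : error);	/* ENOENT means "done" */
}
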
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 ddt_zap.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c	27 Feb 2010 22:30:59 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c	16 Feb 2013 16:40:56 -0000
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -30,7 +29,6 @@
 #include <sys/ddt.h>
 #include <sys/zap.h>
 #include <sys/dmu_tx.h>
-#include <util/sscanf.h>
 
 int ddt_zap_leaf_blockshift = 12;
 int ddt_zap_indirect_blockshift = 12;
@@ -81,6 +79,13 @@ ddt_zap_lookup(objset_t *os, uint64_t ob
 	return (0);
 }
 
+static void
+ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+	(void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS);
+}
+
 static int
 ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
 {
@@ -128,14 +133,11 @@ ddt_zap_walk(objset_t *os, uint64_t obje
 	return (error);
 }
 
-static uint64_t
-ddt_zap_count(objset_t *os, uint64_t object)
+static int
+ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count)
 {
-	uint64_t count = 0;
-
-	VERIFY(zap_count(os, object, &count) == 0);
 
-	return (count);
+	return (zap_count(os, object, count));
 }
 
 const ddt_ops_t ddt_zap_ops = {
@@ -143,6 +145,7 @@ const ddt_ops_t ddt_zap_ops = {
 	ddt_zap_create,
 	ddt_zap_destroy,
 	ddt_zap_lookup,
+	ddt_zap_prefetch,
 	ddt_zap_update,
 	ddt_zap_remove,
 	ddt_zap_walk,
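
This initializer now has to match a wider ddt_ops_t: the vtable gains a prefetch slot and the count slot changes from returning uint64_t to returning an error with an out-parameter. A reduced sketch of the relevant slots (field names abbreviated; see ddt.h for the full struct):

typedef struct example_ddt_ops {
	char	name[32];
	int	(*op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde);
	void	(*op_prefetch)(objset_t *os, uint64_t object,
		    ddt_entry_t *dde);			/* new slot */
	int	(*op_count)(objset_t *os, uint64_t object,
		    uint64_t *count);			/* was uint64_t (*)() */
} example_ddt_ops_t;
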
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c,v
retrieving revision 1.5
diff -u -p -r1.5 dmu.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c	11 Apr 2015 00:23:31 -0000	1.5
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c	2 May 2017 16:59:07 -0000
@@ -19,9 +19,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  */
+/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
+/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
+/* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
@@ -40,60 +43,125 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/sa.h>
+#include <sys/zfeature.h>
 #ifdef _KERNEL
-#include <sys/vmsystm.h>
+#include <sys/racct.h>
+#include <sys/vm.h>
 #include <sys/zfs_znode.h>
 #endif
 
+/*
+ * Enable/disable nopwrite feature.
+ */
+int zfs_nopwrite_enabled = 1;
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN,
+    &zfs_nopwrite_enabled, 0, "Enable nopwrite feature");
+
+/*
+ * Tunable to control percentage of dirtied blocks from frees in one TXG.
+ * After this threshold is crossed, additional dirty blocks from frees
+ * wait until the next TXG.
+ * A value of zero will disable this throttle.
+ */
+uint32_t zfs_per_txg_dirty_frees_percent = 30;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
+	&zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg");
+
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
-	{	byteswap_uint8_array,	TRUE,	"unallocated"		},
-	{	zap_byteswap,		TRUE,	"object directory"	},
-	{	byteswap_uint64_array,	TRUE,	"object array"		},
-	{	byteswap_uint8_array,	TRUE,	"packed nvlist"		},
-	{	byteswap_uint64_array,	TRUE,	"packed nvlist size"	},
-	{	byteswap_uint64_array,	TRUE,	"bplist"		},
-	{	byteswap_uint64_array,	TRUE,	"bplist header"		},
-	{	byteswap_uint64_array,	TRUE,	"SPA space map header"	},
-	{	byteswap_uint64_array,	TRUE,	"SPA space map"		},
-	{	byteswap_uint64_array,	TRUE,	"ZIL intent log"	},
-	{	dnode_buf_byteswap,	TRUE,	"DMU dnode"		},
-	{	dmu_objset_byteswap,	TRUE,	"DMU objset"		},
-	{	byteswap_uint64_array,	TRUE,	"DSL directory"		},
-	{	zap_byteswap,		TRUE,	"DSL directory child map"},
-	{	zap_byteswap,		TRUE,	"DSL dataset snap map"	},
-	{	zap_byteswap,		TRUE,	"DSL props"		},
-	{	byteswap_uint64_array,	TRUE,	"DSL dataset"		},
-	{	zfs_znode_byteswap,	TRUE,	"ZFS znode"		},
-	{	zfs_oldacl_byteswap,	TRUE,	"ZFS V0 ACL"		},
-	{	byteswap_uint8_array,	FALSE,	"ZFS plain file"	},
-	{	zap_byteswap,		TRUE,	"ZFS directory"		},
-	{	zap_byteswap,		TRUE,	"ZFS master node"	},
-	{	zap_byteswap,		TRUE,	"ZFS delete queue"	},
-	{	byteswap_uint8_array,	FALSE,	"zvol object"		},
-	{	zap_byteswap,		TRUE,	"zvol prop"		},
-	{	byteswap_uint8_array,	FALSE,	"other uint8[]"		},
-	{	byteswap_uint64_array,	FALSE,	"other uint64[]"	},
-	{	zap_byteswap,		TRUE,	"other ZAP"		},
-	{	zap_byteswap,		TRUE,	"persistent error log"	},
-	{	byteswap_uint8_array,	TRUE,	"SPA history"		},
-	{	byteswap_uint64_array,	TRUE,	"SPA history offsets"	},
-	{	zap_byteswap,		TRUE,	"Pool properties"	},
-	{	zap_byteswap,		TRUE,	"DSL permissions"	},
-	{	zfs_acl_byteswap,	TRUE,	"ZFS ACL"		},
-	{	byteswap_uint8_array,	TRUE,	"ZFS SYSACL"		},
-	{	byteswap_uint8_array,	TRUE,	"FUID table"		},
-	{	byteswap_uint64_array,	TRUE,	"FUID table size"	},
-	{	zap_byteswap,		TRUE,	"DSL dataset next clones"},
-	{	zap_byteswap,		TRUE,	"scrub work queue"	},
-	{	zap_byteswap,		TRUE,	"ZFS user/group used"	},
-	{	zap_byteswap,		TRUE,	"ZFS user/group quota"	},
-	{	zap_byteswap,		TRUE,	"snapshot refcount tags"},
-	{	zap_byteswap,		TRUE,	"DDT ZAP algorithm"	},
-	{	zap_byteswap,		TRUE,	"DDT statistics"	},
+	{	DMU_BSWAP_UINT8,	TRUE,	"unallocated"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"object directory"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"object array"		},
+	{	DMU_BSWAP_UINT8,	TRUE,	"packed nvlist"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"packed nvlist size"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj"			},
+	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj header"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map header"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"ZIL intent log"	},
+	{	DMU_BSWAP_DNODE,	TRUE,	"DMU dnode"		},
+	{	DMU_BSWAP_OBJSET,	TRUE,	"DMU objset"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"DSL directory"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL directory child map"},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset snap map"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL props"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"DSL dataset"		},
+	{	DMU_BSWAP_ZNODE,	TRUE,	"ZFS znode"		},
+	{	DMU_BSWAP_OLDACL,	TRUE,	"ZFS V0 ACL"		},
+	{	DMU_BSWAP_UINT8,	FALSE,	"ZFS plain file"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS directory"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS master node"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS delete queue"	},
+	{	DMU_BSWAP_UINT8,	FALSE,	"zvol object"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"zvol prop"		},
+	{	DMU_BSWAP_UINT8,	FALSE,	"other uint8[]"		},
+	{	DMU_BSWAP_UINT64,	FALSE,	"other uint64[]"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"other ZAP"		},
+	{	DMU_BSWAP_ZAP,		TRUE,	"persistent error log"	},
+	{	DMU_BSWAP_UINT8,	TRUE,	"SPA history"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"SPA history offsets"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"Pool properties"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL permissions"	},
+	{	DMU_BSWAP_ACL,		TRUE,	"ZFS ACL"		},
+	{	DMU_BSWAP_UINT8,	TRUE,	"ZFS SYSACL"		},
+	{	DMU_BSWAP_UINT8,	TRUE,	"FUID table"		},
+	{	DMU_BSWAP_UINT64,	TRUE,	"FUID table size"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset next clones"},
+	{	DMU_BSWAP_ZAP,		TRUE,	"scan work queue"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group used"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group quota"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"snapshot refcount tags"},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DDT ZAP algorithm"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DDT statistics"	},
+	{	DMU_BSWAP_UINT8,	TRUE,	"System attributes"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"SA master node"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr registration"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr layouts"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"scan translations"	},
+	{	DMU_BSWAP_UINT8,	FALSE,	"deduplicated block"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL deadlist map"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"DSL deadlist map hdr"	},
+	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dir clones"	},
+	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj subobj"		}
+};
+
+const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
+	{	byteswap_uint8_array,	"uint8"		},
+	{	byteswap_uint16_array,	"uint16"	},
+	{	byteswap_uint32_array,	"uint32"	},
+	{	byteswap_uint64_array,	"uint64"	},
+	{	zap_byteswap,		"zap"		},
+	{	dnode_buf_byteswap,	"dnode"		},
+	{	dmu_objset_byteswap,	"objset"	},
+	{	zfs_znode_byteswap,	"znode"		},
+	{	zfs_oldacl_byteswap,	"oldacl"	},
+	{	zfs_acl_byteswap,	"acl"		}
 };
 
 int
-dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
+    void *tag, dmu_buf_t **dbp)
+{
+	uint64_t blkid;
+	dmu_buf_impl_t *db;
+
+	blkid = dbuf_whichblock(dn, 0, offset);
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	db = dbuf_hold(dn, blkid, tag);
+	rw_exit(&dn->dn_struct_rwlock);
+
+	if (db == NULL) {
+		*dbp = NULL;
+		return (SET_ERROR(EIO));
+	}
+
+	*dbp = &db->db;
+	return (0);
+}
+int
+dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
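
The table rework above is the core of this hunk: dmu_ot[] no longer holds a byteswap function pointer per object type, it holds a DMU_BSWAP_* index into the new dmu_ot_byteswap[] table, collapsing roughly fifty object types onto ten swap routines. A hedged sketch of the resulting two-level dispatch, assuming the ot_byteswap/ob_func field names used upstream:

/* Sketch, not a committed helper: byteswap a buffer by object type. */
static void
example_byteswap(dmu_object_type_t type, void *buf, size_t size)
{
	dmu_object_byteswap_t bswap = dmu_ot[type].ot_byteswap;

	ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
	dmu_ot_byteswap[bswap].ob_func(buf, size);
}
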
@@ -104,22 +172,64 @@ dmu_buf_hold(objset_t *os, uint64_t obje
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
-	blkid = dbuf_whichblock(dn, offset);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
+	dnode_rele(dn, FTAG);
+
 	if (db == NULL) {
-		err = EIO;
-	} else {
-		err = dbuf_read(db, NULL, DB_RF_CANFAIL);
-		if (err) {
+		*dbp = NULL;
+		return (SET_ERROR(EIO));
+	}
+
+	*dbp = &db->db;
+	return (err);
+}
+
+int
+dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
+    void *tag, dmu_buf_t **dbp, int flags)
+{
+	int err;
+	int db_flags = DB_RF_CANFAIL;
+
+	if (flags & DMU_READ_NO_PREFETCH)
+		db_flags |= DB_RF_NOPREFETCH;
+
+	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
+	if (err == 0) {
+		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
+		err = dbuf_read(db, NULL, db_flags);
+		if (err != 0) {
 			dbuf_rele(db, tag);
-			db = NULL;
+			*dbp = NULL;
+		}
+	}
+
+	return (err);
+}
+
+int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+    void *tag, dmu_buf_t **dbp, int flags)
+{
+	int err;
+	int db_flags = DB_RF_CANFAIL;
+
+	if (flags & DMU_READ_NO_PREFETCH)
+		db_flags |= DB_RF_NOPREFETCH;
+
+	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
+	if (err == 0) {
+		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
+		err = dbuf_read(db, NULL, db_flags);
+		if (err != 0) {
+			dbuf_rele(db, tag);
+			*dbp = NULL;
 		}
 	}
 
-	dnode_rele(dn, FTAG);
-	*dbp = &db->db;
 	return (err);
 }
 
@@ -130,16 +240,79 @@ dmu_bonus_max(void)
 }
 
 int
-dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 {
-	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+	int error;
 
-	if (dn->dn_bonus != (dmu_buf_impl_t *)db)
-		return (EINVAL);
-	if (newsize < 0 || newsize > db->db_size)
-		return (EINVAL);
-	dnode_setbonuslen(dn, newsize, tx);
-	return (0);
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	if (dn->dn_bonus != db) {
+		error = SET_ERROR(EINVAL);
+	} else if (newsize < 0 || newsize > db_fake->db_size) {
+		error = SET_ERROR(EINVAL);
+	} else {
+		dnode_setbonuslen(dn, newsize, tx);
+		error = 0;
+	}
+
+	DB_DNODE_EXIT(db);
+	return (error);
+}
+
+int
+dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+	int error;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	if (!DMU_OT_IS_VALID(type)) {
+		error = SET_ERROR(EINVAL);
+	} else if (dn->dn_bonus != db) {
+		error = SET_ERROR(EINVAL);
+	} else {
+		dnode_setbonus_type(dn, type, tx);
+		error = 0;
+	}
+
+	DB_DNODE_EXIT(db);
+	return (error);
+}
+
+dmu_object_type_t
+dmu_get_bonustype(dmu_buf_t *db_fake)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+	dmu_object_type_t type;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	type = dn->dn_bonustype;
+	DB_DNODE_EXIT(db);
+
+	return (type);
+}
+
+int
+dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int error;
+
+	error = dnode_hold(os, object, FTAG, &dn);
+	dbuf_rm_spill(dn, tx);
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	dnode_rm_spill(dn, tx);
+	rw_exit(&dn->dn_struct_rwlock);
+	dnode_rele(dn, FTAG);
+	return (error);
 }
 
 /*
@@ -164,21 +337,105 @@ dmu_bonus_hold(objset_t *os, uint64_t ob
 			dbuf_create_bonus(dn);
 	}
 	db = dn->dn_bonus;
-	rw_exit(&dn->dn_struct_rwlock);
 
 	/* as long as the bonus buf is held, the dnode will be held */
-	if (refcount_add(&db->db_holds, tag) == 1)
+	if (refcount_add(&db->db_holds, tag) == 1) {
 		VERIFY(dnode_add_ref(dn, db));
+		atomic_inc_32(&dn->dn_dbufs_count);
+	}
+
+	/*
+	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
+	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
+	 * a dnode hold for every dbuf.
+	 */
+	rw_exit(&dn->dn_struct_rwlock);
 
 	dnode_rele(dn, FTAG);
 
-	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
+	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
 
 	*dbp = &db->db;
 	return (0);
 }
 
 /*
+ * returns ENOENT, EIO, or 0.
+ *
+ * This interface will allocate a blank spill dbuf when a spill blk
+ * doesn't already exist on the dnode.
+ *
+ * if you only want to find an already existing spill db, then
+ * dmu_spill_hold_existing() should be used.
+ */
+int
+dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
+{
+	dmu_buf_impl_t *db = NULL;
+	int err;
+
+	if ((flags & DB_RF_HAVESTRUCT) == 0)
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
+
+	if ((flags & DB_RF_HAVESTRUCT) == 0)
+		rw_exit(&dn->dn_struct_rwlock);
+
+	ASSERT(db != NULL);
+	err = dbuf_read(db, NULL, flags);
+	if (err == 0)
+		*dbp = &db->db;
+	else
+		dbuf_rele(db, tag);
+	return (err);
+}
+
+int
+dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+	dnode_t *dn;
+	int err;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
+		err = SET_ERROR(EINVAL);
+	} else {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+		if (!dn->dn_have_spill) {
+			err = SET_ERROR(ENOENT);
+		} else {
+			err = dmu_spill_hold_by_dnode(dn,
+			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
+		}
+
+		rw_exit(&dn->dn_struct_rwlock);
+	}
+
+	DB_DNODE_EXIT(db);
+	return (err);
+}
+
+int
+dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+	dnode_t *dn;
+	int err;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
+	DB_DNODE_EXIT(db);
+
+	return (err);
+}
+
+/*
  * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
  * to take a held dnode rather than <os, object> -- the lookup is wasteful,
  * and can induce severe lock contention when writing to several files
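
dmu_spill_hold_existing() and dmu_spill_hold_by_bonus() give SA code two entry points keyed off a bonus buffer the caller already holds; only dmu_spill_hold_by_dnode() will allocate a blank spill dbuf. A hedged usage sketch for the existing-only variant, where bonus_db is whatever bonus dbuf the caller holds:

/* Sketch: read overflow attributes only if a spill block exists. */
static int
example_read_spill(dmu_buf_t *bonus_db)
{
	dmu_buf_t *spill_db;
	int err;

	err = dmu_spill_hold_existing(bonus_db, FTAG, &spill_db);
	if (err != 0)
		return (err);	/* ENOENT: no spill; EINVAL: pre-SA pool */

	/* ... overflow system attributes live in spill_db->db_data ... */
	dmu_buf_rele(spill_db, FTAG);
	return (0);
}
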
@@ -186,27 +443,29 @@ dmu_bonus_hold(objset_t *os, uint64_t ob
  */
 static int
 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
-    int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
+    boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
-	dsl_pool_t *dp = NULL;
 	dmu_buf_t **dbp;
 	uint64_t blkid, nblks, i;
 	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio;
-	hrtime_t start;
 
 	ASSERT(length <= DMU_MAX_ACCESS);
 
-	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
-	if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
-		dbuf_flags |= DB_RF_NOPREFETCH;
+	/*
+	 * Note: We directly notify the prefetch code of this read, so that
+	 * we can tell it about the multi-block read.  dbuf_read() only knows
+	 * about the one block it is accessing.
+	 */
+	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
+	    DB_RF_NOPREFETCH;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
 		int blkshift = dn->dn_datablkshift;
-		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
-		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
+		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
+		    P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
 	} else {
 		if (offset + length > dn->dn_datablksz) {
 			zfs_panic_recover("zfs: accessing past end of object "
@@ -216,39 +475,52 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn,
 			    (longlong_t)dn->dn_object, dn->dn_datablksz,
 			    (longlong_t)offset, (longlong_t)length);
 			rw_exit(&dn->dn_struct_rwlock);
-			return (EIO);
+			return (SET_ERROR(EIO));
 		}
 		nblks = 1;
 	}
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
-	if (dn->dn_objset->os_dsl_dataset)
-		dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
-	if (dp && dsl_pool_sync_context(dp))
-		start = gethrtime();
+#if defined(_KERNEL) && defined(RACCT)
+	if (racct_enable && !read) {
+		PROC_LOCK(curproc);
+		racct_add_force(curproc, RACCT_WRITEBPS, length);
+		racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
+		PROC_UNLOCK(curproc);
+	}
+#endif
+
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-	blkid = dbuf_whichblock(dn, offset);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	for (i = 0; i < nblks; i++) {
-		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
+		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			zio_nowait(zio);
-			return (EIO);
+			return (SET_ERROR(EIO));
 		}
+
 		/* initiate async i/o */
-		if (read) {
+		if (read)
 			(void) dbuf_read(db, zio, dbuf_flags);
-		}
+
+#ifdef _KERNEL
+		else
+			curthread->td_ru.ru_oublock++;
+#endif
 		dbp[i] = &db->db;
 	}
+
+	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
+	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
+		dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
+		    read && DNODE_IS_CACHEABLE(dn));
+	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* wait for async i/o */
 	err = zio_wait(zio);
-	/* track read overhead when we are in sync context */
-	if (dp && dsl_pool_sync_context(dp))
-		dp->dp_read_overhead += gethrtime() - start;
 	if (err) {
 		dmu_buf_rele_array(dbp, nblks, tag);
 		return (err);
@@ -263,7 +535,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn,
 			    db->db_state == DB_FILL)
 				cv_wait(&db->db_changed, &db->db_mtx);
 			if (db->db_state == DB_UNCACHED)
-				err = EIO;
+				err = SET_ERROR(EIO);
 			mutex_exit(&db->db_mtx);
 			if (err) {
 				dmu_buf_rele_array(dbp, nblks, tag);
@@ -297,14 +569,19 @@ dmu_buf_hold_array(objset_t *os, uint64_
 }
 
 int
-dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
-    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
+    uint64_t length, boolean_t read, void *tag, int *numbufsp,
+    dmu_buf_t ***dbpp)
 {
-	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
 	int err;
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 	    numbufsp, dbpp, DMU_READ_PREFETCH);
+	DB_DNODE_EXIT(db);
 
 	return (err);
 }
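
The cast-and-bracket pattern here recurs throughout the rest of this diff: the old code dereferenced db_dnode directly, but with dnode_move() an unheld dnode_t can be relocated, so every access must sit between DB_DNODE_ENTER() and DB_DNODE_EXIT(). A minimal sketch of the idiom:

/* Sketch: safely read a dnode field through a dmu_buf_t handle. */
static uint32_t
example_datablksz(dmu_buf_t *handle)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
	dnode_t *dn;
	uint32_t blksz;

	DB_DNODE_ENTER(db);	/* pin the dnode against dnode_move() */
	dn = DB_DNODE(db);
	blksz = dn->dn_datablksz;
	DB_DNODE_EXIT(db);	/* dn must not be used past this point */
	return (blksz);
}
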
@@ -326,25 +603,32 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake,
 	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
 }
 
+/*
+ * Issue prefetch i/os for the given blocks.  If level is greater than 0, the
+ * indirect blocks prefetched will be those that point to the blocks containing
+ * the data starting at offset, and continuing to offset + len.
+ *
+ * Note that if the indirect blocks above the blocks being prefetched are not in
+ * cache, they will be asynchronously read in.
+ */
 void
-dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+    uint64_t len, zio_priority_t pri)
 {
 	dnode_t *dn;
 	uint64_t blkid;
-	int nblks, i, err;
-
-	if (zfs_prefetch_disable)
-		return;
+	int nblks, err;
 
 	if (len == 0) {  /* they're interested in the bonus buffer */
-		dn = os->os_meta_dnode;
+		dn = DMU_META_DNODE(os);
 
 		if (object == 0 || object >= DN_MAX_OBJECT)
 			return;
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
-		dbuf_prefetch(dn, blkid);
+		blkid = dbuf_whichblock(dn, level,
+		    object * sizeof (dnode_phys_t));
+		dbuf_prefetch(dn, level, blkid, pri, 0);
 		rw_exit(&dn->dn_struct_rwlock);
 		return;
 	}
@@ -359,18 +643,24 @@ dmu_prefetch(objset_t *os, uint64_t obje
 		return;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	if (dn->dn_datablkshift) {
-		int blkshift = dn->dn_datablkshift;
-		nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
-		    P2ALIGN(offset, 1<<blkshift)) >> blkshift;
+	/*
+	 * offset + len - 1 is the last byte we want to prefetch for, and offset
+	 * is the first.  Then dbuf_whichblock(dn, level, offset + len - 1)
+	 * is the last block we want to prefetch, and dbuf_whichblock(dn,
+	 * level, offset) is the first.  Then the number we need to prefetch
+	 * is last - first + 1.
+	 */
+	if (level > 0 || dn->dn_datablkshift != 0) {
+		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
+		    dbuf_whichblock(dn, level, offset) + 1;
 	} else {
 		nblks = (offset < dn->dn_datablksz);
 	}
 
 	if (nblks != 0) {
-		blkid = dbuf_whichblock(dn, offset);
-		for (i = 0; i < nblks; i++)
-			dbuf_prefetch(dn, blkid+i);
+		blkid = dbuf_whichblock(dn, level, offset);
+		for (int i = 0; i < nblks; i++)
+			dbuf_prefetch(dn, level, blkid + i, pri, 0);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
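
The comment above reduces the count to last - first + 1 in block coordinates. Concretely, with 128K data blocks (datablkshift 17) at level 0, offset 100K and len 300K touch bytes 100K through 400K-1, which is blocks 0 through 3, so nblks is 4. A standalone sketch of the level-0 arithmetic:

#include <stdint.h>

/* Level-0 analog of dbuf_whichblock(): byte offset -> block number. */
static uint64_t
whichblock(uint64_t off, int datablkshift)
{
	return (off >> datablkshift);
}

/* nblks = last - first + 1, as computed in dmu_prefetch() above. */
static uint64_t
prefetch_nblks(uint64_t off, uint64_t len, int datablkshift)
{
	return (whichblock(off + len - 1, datablkshift) -
	    whichblock(off, datablkshift) + 1);
}
/* prefetch_nblks(100 << 10, 300 << 10, 17) == 4 */
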
@@ -383,98 +673,136 @@ dmu_prefetch(objset_t *os, uint64_t obje
  * the end so that the file gets shorter over time (if we crash in the
  * middle, this will leave us in a better state).  We find allocated file
  * data by simply searching the allocated level 1 indirects.
+ *
+ * On input, *start should be the first offset that does not need to be
+ * freed (e.g. "offset + length").  On return, *start will be the first
+ * offset that should be freed.
  */
 static int
-get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit)
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
 {
-	uint64_t len = *start - limit;
-	uint64_t blkcnt = 0;
-	uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
+	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
+	/* bytes of data covered by a level-1 indirect block */
 	uint64_t iblkrange =
 	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
 
-	ASSERT(limit <= *start);
+	ASSERT3U(minimum, <=, *start);
 
-	if (len <= iblkrange * maxblks) {
-		*start = limit;
+	if (*start - minimum <= iblkrange * maxblks) {
+		*start = minimum;
 		return (0);
 	}
 	ASSERT(ISP2(iblkrange));
 
-	while (*start > limit && blkcnt < maxblks) {
+	for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
 		int err;
 
-		/* find next allocated L1 indirect */
+		/*
+		 * dnode_next_offset(BACKWARDS) will find an allocated L1
+		 * indirect block at or before the input offset.  We must
+		 * decrement *start so that it is at the end of the region
+		 * to search.
+		 */
+		(*start)--;
 		err = dnode_next_offset(dn,
 		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 
-		/* if there are no more, then we are done */
+		/* if there are no indirect blocks before start, we are done */
 		if (err == ESRCH) {
-			*start = limit;
-			return (0);
-		} else if (err) {
+			*start = minimum;
+			break;
+		} else if (err != 0) {
 			return (err);
 		}
-		blkcnt += 1;
 
-		/* reset offset to end of "next" block back */
+		/* set start to the beginning of this L1 indirect */
 		*start = P2ALIGN(*start, iblkrange);
-		if (*start <= limit)
-			*start = limit;
-		else
-			*start -= 1;
 	}
+	if (*start < minimum)
+		*start = minimum;
 	return (0);
 }
 
 static int
 dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
-    uint64_t length, boolean_t free_dnode)
+    uint64_t length)
 {
-	dmu_tx_t *tx;
-	uint64_t object_size, start, end, len;
-	boolean_t trunc = (length == DMU_OBJECT_END);
-	int align, err;
-
-	align = 1 << dn->dn_datablkshift;
-	ASSERT(align > 0);
-	object_size = align == 1 ? dn->dn_datablksz :
-	    (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
-
-	end = offset + length;
-	if (trunc || end > object_size)
-		end = object_size;
-	if (end <= offset)
+	uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
+	int err;
+	uint64_t dirty_frees_threshold;
+	dsl_pool_t *dp = dmu_objset_pool(os);
+
+	if (offset >= object_size)
 		return (0);
-	length = end - offset;
 
-	while (length) {
-		start = end;
-		/* assert(offset <= start) */
-		err = get_next_chunk(dn, &start, offset);
+	if (zfs_per_txg_dirty_frees_percent <= 100)
+		dirty_frees_threshold =
+		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
+	else
+		dirty_frees_threshold = zfs_dirty_data_max / 4;
+
+	if (length == DMU_OBJECT_END || offset + length > object_size)
+		length = object_size - offset;
+
+	while (length != 0) {
+		uint64_t chunk_end, chunk_begin, chunk_len;
+		uint64_t long_free_dirty_all_txgs = 0;
+		dmu_tx_t *tx;
+
+		chunk_end = chunk_begin = offset + length;
+
+		/* move chunk_begin backwards to the beginning of this chunk */
+		err = get_next_chunk(dn, &chunk_begin, offset);
 		if (err)
 			return (err);
-		len = trunc ? DMU_OBJECT_END : end - start;
+		ASSERT3U(chunk_begin, >=, offset);
+		ASSERT3U(chunk_begin, <=, chunk_end);
+
+		chunk_len = chunk_end - chunk_begin;
+
+		mutex_enter(&dp->dp_lock);
+		for (int t = 0; t < TXG_SIZE; t++) {
+			long_free_dirty_all_txgs +=
+			    dp->dp_long_free_dirty_pertxg[t];
+		}
+		mutex_exit(&dp->dp_lock);
+
+		/*
+		 * To avoid filling up a TXG with just frees, wait for
+		 * the next TXG to open before freeing more chunks if
+		 * we have reached the threshold of frees.
+		 */
+		if (dirty_frees_threshold != 0 &&
+		    long_free_dirty_all_txgs >= dirty_frees_threshold) {
+			txg_wait_open(dp, 0);
+			continue;
+		}
 
 		tx = dmu_tx_create(os);
-		dmu_tx_hold_free(tx, dn->dn_object, start, len);
+		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
+
+		/*
+		 * Mark this transaction as typically resulting in a net
+		 * reduction in space used.
+		 */
+		dmu_tx_mark_netfree(tx);
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err) {
 			dmu_tx_abort(tx);
 			return (err);
 		}
 
-		dnode_free_range(dn, start, trunc ? -1 : len, tx);
-
-		if (start == 0 && free_dnode) {
-			ASSERT(trunc);
-			dnode_free(dn, tx);
-		}
-
-		length -= end - start;
-
+		mutex_enter(&dp->dp_lock);
+		dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
+		    chunk_len;
+		mutex_exit(&dp->dp_lock);
+		DTRACE_PROBE3(free__long__range,
+		    uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
+		    uint64_t, dmu_tx_get_txg(tx));
+		dnode_free_range(dn, chunk_begin, chunk_len, tx);
 		dmu_tx_commit(tx);
-		end = start;
+
+		length -= chunk_len;
 	}
 	return (0);
 }
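
The throttle added in this hunk is simple arithmetic: the budget is zfs_per_txg_dirty_frees_percent of zfs_dirty_data_max (with out-of-range settings clamped to a quarter of it), and once the freed-block dirty bytes summed over all TXG slots reach that budget, the loop parks in txg_wait_open(). A standalone sketch of the threshold selection; note that a tunable of 0 yields a threshold of 0, which the caller treats as "throttle disabled":

#include <stdint.h>

/* Mirrors the threshold selection in dmu_free_long_range_impl(). */
static uint64_t
dirty_frees_threshold(uint32_t percent, uint64_t dirty_data_max)
{
	if (percent <= 100)
		return ((uint64_t)percent * dirty_data_max / 100);
	return (dirty_data_max / 4);	/* tunable out of range */
}
/* Default percent = 30 with dirty_data_max = 4G gives a 1.2G budget. */
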
@@ -489,38 +817,43 @@ dmu_free_long_range(objset_t *os, uint64
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
-	err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
+	err = dmu_free_long_range_impl(os, dn, offset, length);
+
+	/*
+	 * It is important to zero out the maxblkid when freeing the entire
+	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
+	 * will take the fast path, and (b) dnode_reallocate() can verify
+	 * that the entire file has been freed.
+	 */
+	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
+		dn->dn_maxblkid = 0;
+
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 int
-dmu_free_object(objset_t *os, uint64_t object)
+dmu_free_long_object(objset_t *os, uint64_t object)
 {
-	dnode_t *dn;
 	dmu_tx_t *tx;
 	int err;
 
-	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
-	    FTAG, &dn);
+	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
 	if (err != 0)
 		return (err);
-	if (dn->dn_nlevels == 1) {
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_bonus(tx, object);
-		dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
-		err = dmu_tx_assign(tx, TXG_WAIT);
-		if (err == 0) {
-			dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
-			dnode_free(dn, tx);
-			dmu_tx_commit(tx);
-		} else {
-			dmu_tx_abort(tx);
-		}
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_bonus(tx, object);
+	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+	dmu_tx_mark_netfree(tx);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err == 0) {
+		err = dmu_object_free(os, object, tx);
+		dmu_tx_commit(tx);
 	} else {
-		err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
+		dmu_tx_abort(tx);
 	}
-	dnode_rele(dn, FTAG);
+
 	return (err);
 }
 
@@ -661,6 +994,25 @@ dmu_prealloc(objset_t *os, uint64_t obje
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+    int compressed_size, int byteorder, dmu_tx_t *tx)
+{
+	dmu_buf_t *db;
+
+	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
+	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+	VERIFY0(dmu_buf_hold_noread(os, object, offset,
+	    FTAG, &db));
+
+	dmu_buf_write_embedded(db,
+	    data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
+	    uncompressed_size, compressed_size, byteorder, tx);
+
+	dmu_buf_rele(db, FTAG);
+}
+
 /*
  * DMU support for xuio
  */
@@ -681,12 +1033,10 @@ dmu_xuio_init(xuio_t *xuio, int nblk)
 	priv->iovp = uio->uio_iov;
 	XUIO_XUZC_PRIV(xuio) = priv;
 
-#ifdef PORT_SOLARIS
 	if (XUIO_XUZC_RW(xuio) == UIO_READ)
 		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
 	else
 		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
-#endif
 
 	return (0);
 }
@@ -701,12 +1051,10 @@ dmu_xuio_fini(xuio_t *xuio)
 	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
 	kmem_free(priv, sizeof (dmu_xuio_t));
 
-#ifdef PORT_SOLARIS
 	if (XUIO_XUZC_RW(xuio) == UIO_READ)
 		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
 	else
 		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
-#endif
 }
 
 /*
@@ -755,7 +1103,6 @@ dmu_xuio_clear(xuio_t *xuio, int i)
 	priv->bufs[i] = NULL;
 }
 
-#ifdef PORT_SOLARIS
 static void
 xuio_stat_init(void)
 {
@@ -776,7 +1123,6 @@ xuio_stat_fini(void)
 		xuio_ksp = NULL;
 	}
 }
-#endif
 
 void
 xuio_stat_wbuf_copied()
@@ -791,8 +1137,8 @@ xuio_stat_wbuf_nocopy()
 }
 
 #ifdef _KERNEL
-int
-dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+static int
+dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
 {
 	dmu_buf_t **dbp;
 	int numbufs, i, err;
@@ -802,12 +1148,12 @@ dmu_read_uio(objset_t *os, uint64_t obje
 	 * NB: we could do this block-at-a-time, but it's nice
 	 * to be reading in parallel.
 	 */
-	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
-	    &numbufs, &dbp);
+	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
+	    TRUE, FTAG, &numbufs, &dbp, 0);
 	if (err)
 		return (err);
 
-#ifndef __NetBSD__		/* XXX xuio */
+#ifdef UIO_XUIO
 	if (uio->uio_extflg == UIO_XUIO)
 		xuio = (xuio_t *)uio;
 #endif
@@ -837,8 +1183,18 @@ dmu_read_uio(objset_t *os, uint64_t obje
 			else
 				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
 		} else {
+#ifdef illumos
+			err = uiomove((char *)db->db_data + bufoff, tocpy,
+			    UIO_READ, uio);
+#endif
+#ifdef __FreeBSD__
+			err = vn_io_fault_uiomove((char *)db->db_data + bufoff,
+			    tocpy, uio);
+#endif
+#ifdef __NetBSD__
 			err = uiomove((char *)db->db_data + bufoff, tocpy,
 			    UIO_READ, uio);
+#endif
 		}
 		if (err)
 			break;
@@ -850,28 +1206,77 @@ dmu_read_uio(objset_t *os, uint64_t obje
 	return (err);
 }
 
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From object zdb->db_object.
+ * Starting at offset uio->uio_loffset.
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
+int
+dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+	dnode_t *dn;
+	int err;
+
+	if (size == 0)
+		return (0);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	err = dmu_read_uio_dnode(dn, uio, size);
+	DB_DNODE_EXIT(db);
+
+	return (err);
+}
+
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From the specified object
+ * Starting at offset uio->uio_loffset.
+ */
 int
-dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
-    dmu_tx_t *tx)
+dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
 {
-	dmu_buf_t **dbp;
-	int numbufs, i;
-	int err = 0;
+	dnode_t *dn;
+	int err;
 
 	if (size == 0)
 		return (0);
 
-	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size,
-	    FALSE, FTAG, &numbufs, &dbp);
+	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 
-	for (i = 0; i < numbufs; i++) {
-		int tocpy;
-		int bufoff;
-		dmu_buf_t *db = dbp[i];
+	err = dmu_read_uio_dnode(dn, uio, size);
 
-		ASSERT(size > 0);
+	dnode_rele(dn, FTAG);
+
+	return (err);
+}
+
+static int
+dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	int numbufs;
+	int err = 0;
+	int i;
+
+	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
+	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
+	if (err)
+		return (err);
+
+	for (i = 0; i < numbufs; i++) {
+		int tocpy;
+		int bufoff;
+		dmu_buf_t *db = dbp[i];
+
+		ASSERT(size > 0);
 
 		bufoff = uio->uio_loffset - db->db_offset;
 		tocpy = (int)MIN(db->db_size - bufoff, size);
@@ -883,6 +1288,7 @@ dmu_write_uio(objset_t *os, uint64_t obj
 		else
 			dmu_buf_will_dirty(db, tx);
 
+#ifdef illumos
 		/*
 		 * XXX uiomove could block forever (eg. nfs-backed
 		 * pages).  There needs to be a uiolockdown() function
@@ -891,6 +1297,15 @@ dmu_write_uio(objset_t *os, uint64_t obj
 		 */
 		err = uiomove((char *)db->db_data + bufoff, tocpy,
 		    UIO_WRITE, uio);
+#endif
+#ifdef __FreeBSD__
+		err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy,
+		    uio);
+#endif
+#ifdef __NetBSD__
+		err = uiomove((char *)db->db_data + bufoff, tocpy,
+		    UIO_WRITE, uio);
+#endif
 
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx);
@@ -900,11 +1315,66 @@ dmu_write_uio(objset_t *os, uint64_t obj
 
 		size -= tocpy;
 	}
+
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	return (err);
 }
 
-#ifndef __NetBSD__
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To object zdb->db_object.
+ * Starting at offset uio->uio_loffset.
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
+int
+dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
+    dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+	dnode_t *dn;
+	int err;
+
+	if (size == 0)
+		return (0);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	err = dmu_write_uio_dnode(dn, uio, size, tx);
+	DB_DNODE_EXIT(db);
+
+	return (err);
+}
+
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To the specified object.
+ * Starting at offset uio->uio_loffset.
+ */
+int
+dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int err;
+
+	if (size == 0)
+		return (0);
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
+
+	err = dmu_write_uio_dnode(dn, uio, size, tx);
+
+	dnode_rele(dn, FTAG);
+
+	return (err);
+}
+
+#ifdef illumos
 int
 dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     page_t *pp, dmu_tx_t *tx)
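
dmu_read_uio_dbuf() and the dmu_write_uio_dbuf() added further down exist so a caller that already holds any dbuf in the target object (typically its bonus buffer) can skip the dnode_hold() lookup that the plain dmu_read_uio()/dmu_write_uio() paths pay. A hedged usage sketch of the read side; bonus_db and nbytes are the caller's:

/* Sketch: stream nbytes from the object backing an already-held dbuf. */
static int
example_read(dmu_buf_t *bonus_db, uio_t *uio, uint64_t nbytes)
{
	/* reads start at uio->uio_loffset and advance it on success */
	return (dmu_read_uio_dbuf(bonus_db, uio, nbytes));
}
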
@@ -959,8 +1429,123 @@ dmu_write_pages(objset_t *os, uint64_t o
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	return (err);
 }
-#endif  /* __NetBSD__ */
+#endif /* illumos */
+
+#ifdef __FreeBSD__
+int
+dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    vm_page_t *ma, dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	struct sf_buf *sf;
+	int numbufs, i;
+	int err;
+
+	if (size == 0)
+		return (0);
+
+	err = dmu_buf_hold_array(os, object, offset, size,
+	    FALSE, FTAG, &numbufs, &dbp);
+	if (err)
+		return (err);
+
+	for (i = 0; i < numbufs; i++) {
+		int tocpy, copied, thiscpy;
+		int bufoff;
+		dmu_buf_t *db = dbp[i];
+		caddr_t va;
+
+		ASSERT(size > 0);
+		ASSERT3U(db->db_size, >=, PAGESIZE);
+
+		bufoff = offset - db->db_offset;
+		tocpy = (int)MIN(db->db_size - bufoff, size);
+
+		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+		if (tocpy == db->db_size)
+			dmu_buf_will_fill(db, tx);
+		else
+			dmu_buf_will_dirty(db, tx);
+
+		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
+			ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff);
+			thiscpy = MIN(PAGESIZE, tocpy - copied);
+			va = zfs_map_page(*ma, &sf);
+			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
+			zfs_unmap_page(sf);
+			ma += 1;
+			bufoff += PAGESIZE;
+		}
+
+		if (tocpy == db->db_size)
+			dmu_buf_fill_done(db, tx);
+
+		offset += tocpy;
+		size -= tocpy;
+	}
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+	return (err);
+}
+#endif	/* __FreeBSD__ */
+
+#ifdef __NetBSD__
+int
+dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+    struct vm_page **pgs, dmu_tx_t *tx)
+{
+	dmu_buf_t **dbp;
+	int numbufs, i;
+	int err;
+
+	if (size == 0)
+		return (0);
+
+	err = dmu_buf_hold_array(os, object, offset, size,
+	    FALSE, FTAG, &numbufs, &dbp);
+	if (err)
+		return (err);
+
+	for (i = 0; i < numbufs; i++) {
+		int tocpy, copied, thiscpy;
+		int bufoff;
+		dmu_buf_t *db = dbp[i];
+		caddr_t va;
+
+		ASSERT(size > 0);
+		ASSERT3U(db->db_size, >=, PAGESIZE);
+
+		bufoff = offset - db->db_offset;
+		tocpy = (int)MIN(db->db_size - bufoff, size);
+
+		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+		if (tocpy == db->db_size)
+			dmu_buf_will_fill(db, tx);
+		else
+			dmu_buf_will_dirty(db, tx);
+
+		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
+			ASSERT3U((*pgs)->offset, ==, db->db_offset + bufoff);
+			thiscpy = MIN(PAGESIZE, tocpy - copied);
+			va = zfs_map_page(*pgs, S_READ);
+			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
+			zfs_unmap_page(*pgs, va);
+			pgs++;
+			bufoff += PAGESIZE;
+		}
+
+		if (tocpy == db->db_size)
+			dmu_buf_fill_done(db, tx);
+
+		offset += tocpy;
+		size -= tocpy;
+	}
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+	return (err);
+}
 #endif
+#endif	/* _KERNEL */
 
 /*
  * Allocate a loaned anonymous arc buffer.
@@ -968,9 +1553,9 @@ dmu_write_pages(objset_t *os, uint64_t o
 arc_buf_t *
 dmu_request_arcbuf(dmu_buf_t *handle, int size)
 {
-	dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
 
-	return (arc_loan_buf(dn->dn_objset->os_spa, size));
+	return (arc_loan_buf(db->db_objset->os_spa, size));
 }
 
 /*
@@ -980,7 +1565,7 @@ void
 dmu_return_arcbuf(arc_buf_t *buf)
 {
 	arc_return_buf(buf, FTAG);
-	VERIFY(arc_buf_remove_ref(buf, FTAG) == 1);
+	arc_buf_destroy(buf, FTAG);
 }
 
 /*
@@ -992,23 +1577,53 @@ void
 dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
-	dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode;
+	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+	dnode_t *dn;
 	dmu_buf_impl_t *db;
 	uint32_t blksz = (uint32_t)arc_buf_size(buf);
 	uint64_t blkid;
 
+	DB_DNODE_ENTER(dbuf);
+	dn = DB_DNODE(dbuf);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	blkid = dbuf_whichblock(dn, offset);
+	blkid = dbuf_whichblock(dn, 0, offset);
 	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
 	rw_exit(&dn->dn_struct_rwlock);
+	DB_DNODE_EXIT(dbuf);
 
-	if (offset == db->db.db_offset && blksz == db->db.db_size) {
+	/*
+	 * We can only assign if the offset is aligned, the arc buf is the
+	 * same size as the dbuf, and the dbuf is not metadata.  It
+	 * can't be metadata because the loaned arc buf comes from the
+	 * user-data kmem arena.
+	 */
+	if (offset == db->db.db_offset && blksz == db->db.db_size &&
+	    DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
+#ifdef _KERNEL
+		curthread->td_ru.ru_oublock++;
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_force(curproc, RACCT_WRITEBPS, blksz);
+			racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
+#endif /* _KERNEL */
 		dbuf_assign_arcbuf(db, buf, tx);
 		dbuf_rele(db, FTAG);
 	} else {
+		objset_t *os;
+		uint64_t object;
+
+		DB_DNODE_ENTER(dbuf);
+		dn = DB_DNODE(dbuf);
+		os = dn->dn_objset;
+		object = dn->dn_object;
+		DB_DNODE_EXIT(dbuf);
+
 		dbuf_rele(db, FTAG);
-		dmu_write(dn->dn_objset, dn->dn_object, offset, blksz,
-		    buf->b_data, tx);
+		dmu_write(os, object, offset, blksz, buf->b_data, tx);
 		dmu_return_arcbuf(buf);
 		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
 	}
@@ -1027,7 +1642,6 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *bu
 {
 	dmu_sync_arg_t *dsa = varg;
 	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
-	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_error == 0) {
@@ -1037,8 +1651,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *bu
 			 * block size still needs to be known for replay.
 			 */
 			BP_SET_LSIZE(bp, db->db_size);
-		} else {
-			ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
+		} else if (!BP_IS_EMBEDDED(bp)) {
 			ASSERT(BP_GET_LEVEL(bp) == 0);
 			bp->blk_fill = 1;
 		}
@@ -1062,10 +1675,33 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
 	if (zio->io_error == 0) {
+		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
+		if (dr->dt.dl.dr_nopwrite) {
+			blkptr_t *bp = zio->io_bp;
+			blkptr_t *bp_orig = &zio->io_bp_orig;
+			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
+
+			ASSERT(BP_EQUAL(bp, bp_orig));
+			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
+			ASSERT(zio_checksum_table[chksum].ci_flags &
+			    ZCHECKSUM_FLAG_NOPWRITE);
+		}
 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
-		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
+
+		/*
+		 * Old-style holes are filled with all zeros, whereas
+		 * new-style holes maintain their lsize, type, level,
+		 * and birth time (see zio_write_compress).  While we
+		 * need to undo the BP_SET_LSIZE() call that happened
+		 * in dmu_sync_ready for old-style holes, we do *not*
+		 * want to wipe out the information contained in
+		 * new-style holes.  Thus, only zero out the block
+		 * pointer if it's an old-style hole.
+		 */
+		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
+		    dr->dt.dl.dr_overridden_by.blk_birth == 0)
 			BP_ZERO(&dr->dt.dl.dr_overridden_by);
 	} else {
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@@ -1083,11 +1719,22 @@ dmu_sync_late_arrival_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	dmu_sync_arg_t *dsa = zio->io_private;
+	blkptr_t *bp_orig = &zio->io_bp_orig;
 
 	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
-		ASSERT(zio->io_bp->blk_birth == zio->io_txg);
-		ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
-		zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+		/*
+		 * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
+		 * then there is nothing to do here. Otherwise, free the
+		 * newly allocated block in this txg.
+		 */
+		if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+			ASSERT(BP_EQUAL(bp, bp_orig));
+		} else {
+			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+			ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+		}
 	}
 
 	dmu_tx_commit(dsa->dsa_tx);
@@ -1099,7 +1746,7 @@ dmu_sync_late_arrival_done(zio_t *zio)
 
 static int
 dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
-    zio_prop_t *zp, zbookmark_t *zb)
+    zio_prop_t *zp, zbookmark_phys_t *zb)
 {
 	dmu_sync_arg_t *dsa;
 	dmu_tx_t *tx;
@@ -1108,7 +1755,8 @@ dmu_sync_late_arrival(zio_t *pio, objset
 	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
 	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
 		dmu_tx_abort(tx);
-		return (EIO);	/* Make zl_get_data do txg_waited_synced() */
+		/* Make zl_get_data do txg_wait_synced() */
+		return (SET_ERROR(EIO));
 	}
 
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
@@ -1117,10 +1765,11 @@ dmu_sync_late_arrival(zio_t *pio, objset
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = tx;
 
-	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
-	    zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
-	    dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
-	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx),
+	    zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size,
+	    zp, dmu_sync_late_arrival_ready, NULL,
+	    NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
+	    ZIO_FLAG_CANFAIL, zb));
 
 	return (0);
 }
@@ -1132,7 +1781,7 @@ dmu_sync_late_arrival(zio_t *pio, objset
  *
  * Return values:
  *
- *	EEXIST: this txg has already been synced, so there's nothing to to.
+ *	EEXIST: this txg has already been synced, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
@@ -1159,17 +1808,20 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	dbuf_dirty_record_t *dr;
 	dmu_sync_arg_t *dsa;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 	zio_prop_t zp;
+	dnode_t *dn;
 
 	ASSERT(pio != NULL);
-	ASSERT(BP_IS_HOLE(bp));
 	ASSERT(txg != 0);
 
 	SET_BOOKMARK(&zb, ds->ds_object,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
-	dmu_write_policy(os, db->db_dnode, db->db_level, WP_DMU_SYNC, &zp);
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+	DB_DNODE_EXIT(db);
 
 	/*
 	 * If we're frozen (running ziltest), we always need to generate a bp.
@@ -1190,7 +1842,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
 		 * This txg has already synced.  There's nothing to do.
 		 */
 		mutex_exit(&db->db_mtx);
-		return (EEXIST);
+		return (SET_ERROR(EEXIST));
 	}
 
 	if (txg <= spa_syncing_txg(os->os_spa)) {
@@ -1212,9 +1864,39 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
 		 * There's no need to log writes to freed blocks, so we're done.
 		 */
 		mutex_exit(&db->db_mtx);
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
 	}
 
+	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
+
+	/*
+	 * Assume the on-disk data is X, the current syncing data (in
+	 * txg - 1) is Y, and the current in-memory data is Z (currently
+	 * in dmu_sync).
+	 *
+	 * We usually want to perform a nopwrite if X and Z are the
+	 * same.  However, if Y is different (i.e. the BP is going to
+	 * change before this write takes effect), then a nopwrite will
+	 * be incorrect - we would override with X, which could have
+	 * been freed when Y was written.
+	 *
+	 * (Note that this is not a concern when we are nop-writing from
+	 * syncing context, because X and Y must be identical, because
+	 * all previous txgs have been synced.)
+	 *
+	 * Therefore, we disable nopwrite if the current BP could change
+	 * before this TXG.  There are two ways it could change: by
+	 * being dirty (dr_next is non-NULL), or by being freed
+	 * (dnode_block_freed()).  This behavior is verified by
+	 * zio_done(), which VERIFYs that the override BP is identical
+	 * to the on-disk BP.
+	 */
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
+		zp.zp_nopwrite = B_FALSE;
+	DB_DNODE_EXIT(db);
+
 	ASSERT(dr->dr_txg == txg);
 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
@@ -1224,7 +1906,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
 		 * have been dirtied since, or we would have cleared the state.
 		 */
 		mutex_exit(&db->db_mtx);
-		return (EALREADY);
+		return (SET_ERROR(EALREADY));
 	}
 
 	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
@@ -1238,8 +1920,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
 	dsa->dsa_tx = NULL;
 
 	zio_nowait(arc_write(pio, os->os_spa, txg,
-	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
-	    dmu_sync_ready, dmu_sync_done, dsa,
+	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
+	    &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
 
 	return (0);
@@ -1247,7 +1929,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
 
 int
 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
-	dmu_tx_t *tx)
+    dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
@@ -1262,13 +1944,19 @@ dmu_object_set_blocksize(objset_t *os, u
 
 void
 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
-	dmu_tx_t *tx)
+    dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
-	/* XXX assumes dnode_hold will not get an i/o error */
-	(void) dnode_hold(os, object, FTAG, &dn);
-	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+	/*
+	 * Send streams include each object's checksum function.  This
+	 * check ensures that the receiving system can understand the
+	 * checksum function transmitted.
+	 */
+	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+
+	VERIFY0(dnode_hold(os, object, FTAG, &dn));
+	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
 	dn->dn_checksum = checksum;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
@@ -1276,36 +1964,66 @@ dmu_object_set_checksum(objset_t *os, ui
 
 void
 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
-	dmu_tx_t *tx)
+    dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
-	/* XXX assumes dnode_hold will not get an i/o error */
-	(void) dnode_hold(os, object, FTAG, &dn);
-	ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
+	/*
+	 * Send streams include each object's compression function.  This
+	 * check ensures that the receiving system can understand the
+	 * compression function transmitted.
+	 */
+	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
+
+	VERIFY0(dnode_hold(os, object, FTAG, &dn));
 	dn->dn_compress = compress;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 }
 
 int zfs_mdcomp_disable = 0;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN,
+    &zfs_mdcomp_disable, 0, "Disable metadata compression");
+
+/*
+ * When the "redundant_metadata" property is set to "most", only indirect
+ * blocks of this level and higher will have an additional ditto block.
+ */
+int zfs_redundant_metadata_most_ditto_level = 2;
 
 void
 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 {
 	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
-	boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata);
+	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
+	    (wp & WP_SPILL));
 	enum zio_checksum checksum = os->os_checksum;
 	enum zio_compress compress = os->os_compress;
 	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
-	boolean_t dedup;
+	boolean_t dedup = B_FALSE;
+	boolean_t nopwrite = B_FALSE;
 	boolean_t dedup_verify = os->os_dedup_verify;
 	int copies = os->os_copies;
 
 	/*
-	 * Determine checksum setting.
+	 * We maintain different write policies for each of the following
+	 * types of data:
+	 *	 1. metadata
+	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
+	 *	 3. all other level-0 blocks
 	 */
 	if (ismd) {
+		if (zfs_mdcomp_disable) {
+			compress = ZIO_COMPRESS_EMPTY;
+		} else {
+			/*
+			 * XXX -- we should design a compression algorithm
+			 * that specializes in arrays of bps.
+			 */
+			compress = zio_compress_select(os->os_spa,
+			    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
+		}
+
 		/*
 		 * Metadata always gets checksummed.  If the data
 		 * checksum is multi-bit correctable, and it's not a
@@ -1313,82 +2031,93 @@ dmu_write_policy(objset_t *os, dnode_t *
 		 * as well.  Otherwise, the metadata checksum defaults
 		 * to fletcher4.
 		 */
-		if (zio_checksum_table[checksum].ci_correctable < 1 ||
-		    zio_checksum_table[checksum].ci_eck)
+		if (!(zio_checksum_table[checksum].ci_flags &
+		    ZCHECKSUM_FLAG_METADATA) ||
+		    (zio_checksum_table[checksum].ci_flags &
+		    ZCHECKSUM_FLAG_EMBEDDED))
 			checksum = ZIO_CHECKSUM_FLETCHER_4;
-	} else {
-		checksum = zio_checksum_select(dn->dn_checksum, checksum);
-	}
 
-	/*
-	 * Determine compression setting.
-	 */
-	if (ismd) {
+		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
+		    (os->os_redundant_metadata ==
+		    ZFS_REDUNDANT_METADATA_MOST &&
+		    (level >= zfs_redundant_metadata_most_ditto_level ||
+		    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
+			copies++;
+	} else if (wp & WP_NOFILL) {
+		ASSERT(level == 0);
+
 		/*
-		 * XXX -- we should design a compression algorithm
-		 * that specializes in arrays of bps.
+		 * If we're writing preallocated blocks, we aren't actually
+		 * writing them, so don't set any policy properties.  These
+		 * blocks are currently only used by an external subsystem
+		 * outside of zfs (i.e. dump) and not written by the zio
+		 * pipeline.
 		 */
-		compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
-		    ZIO_COMPRESS_LZJB;
+		compress = ZIO_COMPRESS_OFF;
+		checksum = ZIO_CHECKSUM_NOPARITY;
 	} else {
-		compress = zio_compress_select(dn->dn_compress, compress);
-	}
+		compress = zio_compress_select(os->os_spa, dn->dn_compress,
+		    compress);
 
-	/*
-	 * Determine dedup setting.  If we are in dmu_sync(), we won't
-	 * actually dedup now because that's all done in syncing context;
-	 * but we do want to use the dedup checkum.  If the checksum is not
-	 * strong enough to ensure unique signatures, force dedup_verify.
-	 */
-	dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
-	if (dedup) {
-		checksum = dedup_checksum;
-		if (!zio_checksum_table[checksum].ci_dedup)
-			dedup_verify = 1;
-	}
+		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
+		    zio_checksum_select(dn->dn_checksum, checksum) :
+		    dedup_checksum;
 
-	if (wp & WP_DMU_SYNC)
-		dedup = 0;
-
-	if (wp & WP_NOFILL) {
-		ASSERT(!ismd && level == 0);
-		checksum = ZIO_CHECKSUM_OFF;
-		compress = ZIO_COMPRESS_OFF;
-		dedup = B_FALSE;
+		/*
+		 * Determine dedup setting.  If we are in dmu_sync(),
+		 * we won't actually dedup now because that's all
+		 * done in syncing context; but we do want to use the
+		 * dedup checksum.  If the checksum is not strong
+		 * enough to ensure unique signatures, force
+		 * dedup_verify.
+		 */
+		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
+			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
+			if (!(zio_checksum_table[checksum].ci_flags &
+			    ZCHECKSUM_FLAG_DEDUP))
+				dedup_verify = B_TRUE;
+		}
+
+		/*
+		 * Enable nopwrite if we have a secure-enough checksum
+		 * algorithm (see comment in zio_nop_write) and
+		 * compression is enabled.  We don't enable nopwrite if
+		 * dedup is enabled as the two features are mutually
+		 * exclusive.
+		 */
+		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
+		    ZCHECKSUM_FLAG_NOPWRITE) &&
+		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
 	}
 
 	zp->zp_checksum = checksum;
 	zp->zp_compress = compress;
-	zp->zp_type = type;
+	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
 	zp->zp_level = level;
-	zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
+	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
 	zp->zp_dedup = dedup;
 	zp->zp_dedup_verify = dedup && dedup_verify;
+	zp->zp_nopwrite = nopwrite;
 }
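In summary, and as a hedged restatement of the branches above, the computed policy falls into three classes:

	/*
	 * Illustrative summary of dmu_write_policy() (not part of the patch):
	 *   metadata (ismd):  compression on (unless zfs_mdcomp_disable),
	 *                     checksum falls back to fletcher4 when the
	 *                     inherited one is unsuitable, and copies is
	 *                     bumped per the redundant_metadata setting;
	 *   WP_NOFILL:        compression off, checksum NOPARITY -- these
	 *                     blocks bypass the zio pipeline entirely;
	 *   ordinary data:    inherited checksum/compression, with dedup
	 *                     and/or nopwrite enabled per the rules above.
	 */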
 
 int
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
 	dnode_t *dn;
-	int i, err;
+	int err;
 
-	err = dnode_hold(os, object, FTAG, &dn);
-	if (err)
-		return (err);
 	/*
 	 * Sync any current changes before
 	 * we go trundling through the block pointers.
 	 */
-	for (i = 0; i < TXG_SIZE; i++) {
-		if (list_link_active(&dn->dn_dirty_link[i]))
-			break;
+	err = dmu_object_wait_synced(os, object);
+	if (err) {
+		return (err);
 	}
-	if (i != TXG_SIZE) {
-		dnode_rele(dn, FTAG);
-		txg_wait_synced(dmu_objset_pool(os), 0);
-		err = dnode_hold(os, object, FTAG, &dn);
-		if (err)
-			return (err);
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err) {
+		return (err);
 	}
 
 	err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
@@ -1397,6 +2126,37 @@ dmu_offset_next(objset_t *os, uint64_t o
 	return (err);
 }
 
+/*
+ * Given a ZFS object, if it contains any dirty dnodes,
+ * this function flushes all dirty blocks to disk. This
+ * ensures the DMU object info is updated. A more efficient
+ * future version might just find the TXG with the maximum
+ * ID and wait for that to be synced.
+ */
+int
+dmu_object_wait_synced(objset_t *os, uint64_t object)
+{
+	dnode_t *dn;
+	int error, i;
+
+	error = dnode_hold(os, object, FTAG, &dn);
+	if (error) {
+		return (error);
+	}
+
+	for (i = 0; i < TXG_SIZE; i++) {
+		if (list_link_active(&dn->dn_dirty_link[i])) {
+			break;
+		}
+	}
+	dnode_rele(dn, FTAG);
+	if (i != TXG_SIZE) {
+		txg_wait_synced(dmu_objset_pool(os), 0);
+	}
+
+	return (0);
+}
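A minimal usage sketch, mirroring the dmu_offset_next() caller above (illustrative only; error handling elided):

	dnode_t *dn;
	int err;

	/* Flush any dirty state for 'object' so its block pointers
	 * reflect what is on disk, then take a fresh hold. */
	err = dmu_object_wait_synced(os, object);
	if (err == 0)
		err = dnode_hold(os, object, FTAG, &dn);
	if (err == 0) {
		/* ... inspect on-disk state ... */
		dnode_rele(dn, FTAG);
	}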
+
 void
 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
@@ -1416,11 +2176,12 @@ dmu_object_info_from_dnode(dnode_t *dn, 
 	doi->doi_indirection = dn->dn_nlevels;
 	doi->doi_checksum = dn->dn_checksum;
 	doi->doi_compress = dn->dn_compress;
+	doi->doi_nblkptr = dn->dn_nblkptr;
 	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
-	doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
+	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 	doi->doi_fill_count = 0;
 	for (int i = 0; i < dnp->dn_nblkptr; i++)
-		doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
+		doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
 
 	mutex_exit(&dn->dn_mtx);
 	rw_exit(&dn->dn_struct_rwlock);
@@ -1450,9 +2211,13 @@ dmu_object_info(objset_t *os, uint64_t o
  * As above, but faster; can be used when you have a held dbuf in hand.
  */
 void
-dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
+dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
 {
-	dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	DB_DNODE_ENTER(db);
+	dmu_object_info_from_dnode(DB_DNODE(db), doi);
+	DB_DNODE_EXIT(db);
 }
 
 /*
@@ -1460,14 +2225,20 @@ dmu_object_info_from_db(dmu_buf_t *db, d
  * This is specifically optimized for zfs_getattr().
  */
 void
-dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
+dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
+    u_longlong_t *nblk512)
 {
-	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	dnode_t *dn;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 
 	*blksize = dn->dn_datablksz;
 	/* add 1 for dnode space */
 	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
 	    SPA_MINBLOCKSHIFT) + 1;
+	DB_DNODE_EXIT(db);
 }
 
 void
@@ -1518,25 +2289,29 @@ byteswap_uint8_array(void *vbuf, size_t 
 void
 dmu_init(void)
 {
-	dbuf_init();
+	zfs_dbgmsg_init();
+	sa_cache_init();
+	xuio_stat_init();
+	dmu_objset_init();
 	dnode_init();
 	zfetch_init();
-	arc_init();
+	zio_compress_init();
 	l2arc_init();
-#ifdef PORT_SOLARIS	
-	xuio_stat_init();
-#endif	
+	arc_init();
+	dbuf_init();
 }
 
 void
 dmu_fini(void)
 {
-	arc_fini();
+	arc_fini(); /* arc depends on l2arc, so arc must go first */
+	l2arc_fini();
 	zfetch_fini();
-	dnode_fini();
+	zio_compress_fini();
 	dbuf_fini();
-	l2arc_fini();
-#ifdef PORT_SOLARIS
+	dnode_fini();
+	dmu_objset_fini();
 	xuio_stat_fini();
-#endif	
+	sa_cache_fini();
+	zfs_dbgmsg_fini();
 }
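Note that dmu_fini() now tears subsystems down in (approximately) the reverse of the order in which dmu_init() brings them up, with the stated exception that arc_fini() must precede l2arc_fini().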
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_diff.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_diff.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_diff.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_diff.c	22 Apr 2017 07:54:20 -0000
@@ -0,0 +1,268 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+
+struct diffarg {
+#ifdef __FreeBSD__
+	kthread_t *da_td;
+	struct file *da_fp;		/* file to which we are reporting */
+#else
+	struct vnode *da_vp;		/* file to which we are reporting */
+#endif
+	offset_t *da_offp;
+	int da_err;			/* error that stopped diff search */
+	dmu_diff_record_t da_ddr;
+};
+
+#ifdef __FreeBSD__
+static int
+write_bytes(struct diffarg *da)
+{
+	struct uio auio;
+	struct iovec aiov;
+
+	aiov.iov_base = (caddr_t)&da->da_ddr;
+	aiov.iov_len = sizeof (da->da_ddr);
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_resid = aiov.iov_len;
+	auio.uio_rw = UIO_WRITE;
+	auio.uio_offset = (off_t)-1;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_td = da->da_td;
+#ifdef _KERNEL
+	if (da->da_fp->f_type == DTYPE_VNODE)
+		bwillwrite();
+	return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0, da->da_td));
+#else
+	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
+	return (EOPNOTSUPP);
+#endif
+}
+#endif /* __FreeBSD__ */
+
+static int
+write_record(struct diffarg *da)
+{
+	ssize_t resid; /* have to get resid to get detailed errno */
+
+	if (da->da_ddr.ddr_type == DDR_NONE) {
+		da->da_err = 0;
+		return (0);
+	}
+
+#ifdef __FreeBSD__
+	da->da_err = write_bytes(da);
+#else
+	da->da_err = vn_rdwr(UIO_WRITE, da->da_vp, (caddr_t)&da->da_ddr,
+	    sizeof (da->da_ddr), 0, UIO_SYSSPACE, FAPPEND,
+	    RLIM64_INFINITY, CRED(), &resid);
+#endif
+	*da->da_offp += sizeof (da->da_ddr);
+	return (da->da_err);
+}
+
+static int
+report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last)
+{
+	ASSERT(first <= last);
+	if (da->da_ddr.ddr_type != DDR_FREE ||
+	    first != da->da_ddr.ddr_last + 1) {
+		if (write_record(da) != 0)
+			return (da->da_err);
+		da->da_ddr.ddr_type = DDR_FREE;
+		da->da_ddr.ddr_first = first;
+		da->da_ddr.ddr_last = last;
+		return (0);
+	}
+	da->da_ddr.ddr_last = last;
+	return (0);
+}
+
+static int
+report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp)
+{
+	ASSERT(dnp != NULL);
+	if (dnp->dn_type == DMU_OT_NONE)
+		return (report_free_dnode_range(da, object, object));
+
+	if (da->da_ddr.ddr_type != DDR_INUSE ||
+	    object != da->da_ddr.ddr_last + 1) {
+		if (write_record(da) != 0)
+			return (da->da_err);
+		da->da_ddr.ddr_type = DDR_INUSE;
+		da->da_ddr.ddr_first = da->da_ddr.ddr_last = object;
+		return (0);
+	}
+	da->da_ddr.ddr_last = object;
+	return (0);
+}
+
+#define	DBP_SPAN(dnp, level)				  \
+	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+
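A worked example of the span computation, with assumed values for illustration:

	/*
	 * With dnp->dn_datablkszsec = 32 (a 16 KB data block) and
	 * dnp->dn_indblkshift = 17 (128 KB indirect blocks, each holding
	 * 128 KB / sizeof (blkptr_t) = 1024 block pointers):
	 *   DBP_SPAN(dnp, 0) = 32 << 9        = 16 KB
	 *   DBP_SPAN(dnp, 1) = 32 << (9 + 10) = 16 MB
	 * i.e. one level-1 block pointer covers 1024 level-0 blocks.
	 * diff_cb() below converts such a span into a dnode-object range
	 * by shifting by DNODE_SHIFT (512-byte dnodes).
	 */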
+/* ARGSUSED */
+static int
+diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	struct diffarg *da = arg;
+	int err = 0;
+
+	if (issig(JUSTLOOKING) && issig(FORREAL))
+		return (SET_ERROR(EINTR));
+
+	if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT)
+		return (0);
+
+	if (BP_IS_HOLE(bp)) {
+		uint64_t span = DBP_SPAN(dnp, zb->zb_level);
+		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
+
+		err = report_free_dnode_range(da, dnobj,
+		    dnobj + (span >> DNODE_SHIFT) - 1);
+		if (err)
+			return (err);
+	} else if (zb->zb_level == 0) {
+		dnode_phys_t *blk;
+		arc_buf_t *abuf;
+		arc_flags_t aflags = ARC_FLAG_WAIT;
+		int blksz = BP_GET_LSIZE(bp);
+		int i;
+
+		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+		    &aflags, zb) != 0)
+			return (SET_ERROR(EIO));
+
+		blk = abuf->b_data;
+		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
+			uint64_t dnobj = (zb->zb_blkid <<
+			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+			err = report_dnode(da, dnobj, blk+i);
+			if (err)
+				break;
+		}
+		arc_buf_destroy(abuf, &abuf);
+		if (err)
+			return (err);
+		/* Don't care about the data blocks */
+		return (TRAVERSE_VISIT_NO_CHILDREN);
+	}
+	return (0);
+}
+
+int
+dmu_diff(const char *tosnap_name, const char *fromsnap_name,
+#ifdef __FreeBSD__
+    struct file *fp, offset_t *offp)
+#else
+    struct vnode *vp, offset_t *offp)
+#endif
+{
+	struct diffarg da;
+	dsl_dataset_t *fromsnap;
+	dsl_dataset_t *tosnap;
+	dsl_pool_t *dp;
+	int error;
+	uint64_t fromtxg;
+
+	if (strchr(tosnap_name, '@') == NULL ||
+	    strchr(fromsnap_name, '@') == NULL)
+		return (SET_ERROR(EINVAL));
+
+	error = dsl_pool_hold(tosnap_name, FTAG, &dp);
+	if (error != 0)
+		return (error);
+
+	error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap);
+	if (error != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (error);
+	}
+
+	error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap);
+	if (error != 0) {
+		dsl_dataset_rele(tosnap, FTAG);
+		dsl_pool_rele(dp, FTAG);
+		return (error);
+	}
+
+	if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) {
+		dsl_dataset_rele(fromsnap, FTAG);
+		dsl_dataset_rele(tosnap, FTAG);
+		dsl_pool_rele(dp, FTAG);
+		return (SET_ERROR(EXDEV));
+	}
+
+	fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg;
+	dsl_dataset_rele(fromsnap, FTAG);
+
+	dsl_dataset_long_hold(tosnap, FTAG);
+	dsl_pool_rele(dp, FTAG);
+
+#ifdef __FreeBSD__
+	da.da_td = curthread;
+	da.da_fp = fp;
+#else
+	da.da_vp = vp;
+#endif
+	da.da_offp = offp;
+	da.da_ddr.ddr_type = DDR_NONE;
+	da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0;
+	da.da_err = 0;
+
+	error = traverse_dataset(tosnap, fromtxg,
+	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da);
+
+	if (error != 0) {
+		da.da_err = error;
+	} else {
+		/* write_record() sets the da.da_err we return as a side effect */
+		(void) write_record(&da);
+	}
+
+	dsl_dataset_long_rele(tosnap, FTAG);
+	dsl_dataset_rele(tosnap, FTAG);
+
+	return (da.da_err);
+}
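The records emitted by write_record() form a flat stream of dmu_diff_record_t structures. A hypothetical userland-side consumer (sketch only, assuming <unistd.h> and a descriptor 'fd' backing the fp/vp passed in; the real consumer is the code behind "zfs diff") might read them back as:

	dmu_diff_record_t ddr;

	while (read(fd, &ddr, sizeof (ddr)) == (ssize_t)sizeof (ddr)) {
		if (ddr.ddr_type == DDR_INUSE) {
			/* objects ddr_first..ddr_last are allocated in
			 * tosnap and were created/modified after fromsnap */
		} else if (ddr.ddr_type == DDR_FREE) {
			/* objects ddr_first..ddr_last were freed */
		}
	}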
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dmu_object.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c	27 Feb 2010 22:30:45 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c	27 Mar 2016 02:52:21 -0000
@@ -19,14 +19,17 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
 #include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
 
 uint64_t
 dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
@@ -34,7 +37,7 @@ dmu_object_alloc(objset_t *os, dmu_objec
 {
 	uint64_t object;
 	uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
-	    (os->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
+	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
 	dnode_t *dn = NULL;
 	int restarted = B_FALSE;
 
@@ -47,10 +50,16 @@ dmu_object_alloc(objset_t *os, dmu_objec
 		 * reasonably sparse (at most 1/4 full).  Look from the
 		 * beginning once, but after that keep looking from here.
 		 * If we can't find one, just keep going from here.
+		 *
+		 * Note that dmu_traverse depends on the behavior that we use
+		 * multiple blocks of the dnode object before going back to
+		 * reuse objects.  Any change to this algorithm should preserve
+		 * that property or find another solution to the issues
+		 * described in traverse_visitbp.
 		 */
 		if (P2PHASE(object, L2_dnode_count) == 0) {
 			uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
-			int error = dnode_next_offset(os->os_meta_dnode,
+			int error = dnode_next_offset(DMU_META_DNODE(os),
 			    DNODE_FIND_HOLE,
 			    &offset, 2, DNODES_PER_BLOCK >> 2, 0);
 			restarted = B_TRUE;
@@ -91,7 +100,7 @@ dmu_object_claim(objset_t *os, uint64_t 
 	int err;
 
 	if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
-		return (EBADF);
+		return (SET_ERROR(EBADF));
 
 	err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
 	if (err)
@@ -105,55 +114,22 @@ dmu_object_claim(objset_t *os, uint64_t 
 
 int
 dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
-    int blocksize, dmu_object_type_t bonustype, int bonuslen)
+    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	dnode_t *dn;
-	dmu_tx_t *tx;
-	int nblkptr;
 	int err;
 
 	if (object == DMU_META_DNODE_OBJECT)
-		return (EBADF);
+		return (SET_ERROR(EBADF));
 
 	err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
 	    FTAG, &dn);
 	if (err)
 		return (err);
 
-	if (dn->dn_type == ot && dn->dn_datablksz == blocksize &&
-	    dn->dn_bonustype == bonustype && dn->dn_bonuslen == bonuslen) {
-		/* nothing is changing, this is a noop */
-		dnode_rele(dn, FTAG);
-		return (0);
-	}
-
-	nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
-
-	/*
-	 * If we are losing blkptrs or changing the block size this must
-	 * be a new file instance.   We must clear out the previous file
-	 * contents before we can change this type of metadata in the dnode.
-	 */
-	if (dn->dn_nblkptr > nblkptr || dn->dn_datablksz != blocksize) {
-		err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
-		if (err)
-			goto out;
-	}
-
-	tx = dmu_tx_create(os);
-	dmu_tx_hold_bonus(tx, object);
-	err = dmu_tx_assign(tx, TXG_WAIT);
-	if (err) {
-		dmu_tx_abort(tx);
-		goto out;
-	}
-
 	dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
 
-	dmu_tx_commit(tx);
-out:
 	dnode_rele(dn, FTAG);
-
 	return (err);
 }
 
@@ -178,16 +154,72 @@ dmu_object_free(objset_t *os, uint64_t o
 	return (0);
 }
 
+/*
+ * Return (in *objectp) the next object which is allocated (or a hole)
+ * after *object, taking into account only objects that may have been modified
+ * after the specified txg.
+ */
 int
 dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
 {
 	uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
 	int error;
 
-	error = dnode_next_offset(os->os_meta_dnode,
+	error = dnode_next_offset(DMU_META_DNODE(os),
 	    (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
 
 	*objectp = offset >> DNODE_SHIFT;
 
 	return (error);
 }
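A hedged usage sketch of the iteration pattern this enables (illustrative; the terminal error value is assumed to match dnode_next_offset()):

	uint64_t obj = 0;
	int err;

	/* Visit every allocated object, starting from the beginning
	 * (txg 0 = consider all objects, not just recently modified). */
	while ((err = dmu_object_next(os, &obj, B_FALSE, 0)) == 0) {
		/* ... process object 'obj' ... */
	}
	/* err is ESRCH once no further allocated objects exist. */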
+
+/*
+ * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
+ * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
+ *
+ * Only for use from syncing context, on MOS objects.
+ */
+void
+dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
+    dmu_tx_t *tx)
+{
+	dnode_t *dn;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
+	if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
+		dnode_rele(dn, FTAG);
+		return;
+	}
+	ASSERT3U(dn->dn_type, ==, old_type);
+	ASSERT0(dn->dn_maxblkid);
+	dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
+	    DMU_OTN_ZAP_METADATA;
+	dnode_setdirty(dn, tx);
+	dnode_rele(dn, FTAG);
+
+	mzap_create_impl(mos, object, 0, 0, tx);
+
+	spa_feature_incr(dmu_objset_spa(mos),
+	    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
+}
+
+void
+dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	dmu_object_type_t t;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	VERIFY0(dnode_hold(mos, object, FTAG, &dn));
+	t = dn->dn_type;
+	dnode_rele(dn, FTAG);
+
+	if (t == DMU_OTN_ZAP_METADATA) {
+		spa_feature_decr(dmu_objset_spa(mos),
+		    SPA_FEATURE_EXTENSIBLE_DATASET, tx);
+	}
+	VERIFY0(dmu_object_free(mos, object, tx));
+}
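A hedged sketch of how a syncing-context caller might use the zapify helpers ("example-key", 'value', and the old type are hypothetical):

	uint64_t value = 1;

	/* Promote a plain MOS object to a ZAP, then attach a
	 * name/value pair to it (syncing context assumed). */
	dmu_object_zapify(mos, object, DMU_OT_DSL_DATASET, tx);
	VERIFY0(zap_add(mos, object, "example-key",
	    sizeof (uint64_t), 1, &value, tx));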
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dmu_objset.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c	27 Feb 2010 22:30:49 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c	3 Dec 2016 17:03:49 -0000
@@ -19,10 +19,18 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #include <sys/cred.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
@@ -40,7 +48,38 @@
 #include <sys/zil.h>
 #include <sys/dmu_impl.h>
 #include <sys/zfs_ioctl.h>
-#include <sys/sunddi.h>
+#include <sys/sa.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dsl_destroy.h>
+#include <sys/vdev.h>
+
+/*
+ * Needed to close a window in dnode_move() that allows the objset to be freed
+ * before it can be safely accessed.
+ */
+krwlock_t os_lock;
+
+/*
+ * Tunable to override the maximum number of threads for the parallelization
+ * of dmu_objset_find_dp(), needed to speed up the import of pools with many
+ * datasets.
+ * Default is 4 times the number of leaf vdevs.
+ */
+int dmu_find_threads = 0;
+
+static void dmu_objset_find_dp_cb(void *arg);
+
+void
+dmu_objset_init(void)
+{
+	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
+}
+
+void
+dmu_objset_fini(void)
+{
+	rw_destroy(&os_lock);
+}
 
 spa_t *
 dmu_objset_spa(objset_t *os)
@@ -91,7 +130,13 @@ dmu_objset_id(objset_t *os)
 	return (ds ? ds->ds_object : 0);
 }
 
-uint64_t
+zfs_sync_type_t
+dmu_objset_syncprop(objset_t *os)
+{
+	return (os->os_sync);
+}
+
+zfs_logbias_op_t
 dmu_objset_logbias(objset_t *os)
 {
 	return (os->os_logbias);
@@ -120,7 +165,8 @@ compression_changed_cb(void *arg, uint64
 	 */
 	ASSERT(newval != ZIO_COMPRESS_INHERIT);
 
-	os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
+	os->os_compress = zio_compress_select(os->os_spa, newval,
+	    ZIO_COMPRESS_ON);
 }
 
 static void
@@ -184,6 +230,36 @@ secondary_cache_changed_cb(void *arg, ui
 }
 
 static void
+sync_changed_cb(void *arg, uint64_t newval)
+{
+	objset_t *os = arg;
+
+	/*
+	 * Inheritance and range checking should have been done by now.
+	 */
+	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
+	    newval == ZFS_SYNC_DISABLED);
+
+	os->os_sync = newval;
+	if (os->os_zil)
+		zil_set_sync(os->os_zil, newval);
+}
+
+static void
+redundant_metadata_changed_cb(void *arg, uint64_t newval)
+{
+	objset_t *os = arg;
+
+	/*
+	 * Inheritance and range checking should have been done by now.
+	 */
+	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
+	    newval == ZFS_REDUNDANT_METADATA_MOST);
+
+	os->os_redundant_metadata = newval;
+}
+
+static void
 logbias_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
@@ -195,6 +271,14 @@ logbias_changed_cb(void *arg, uint64_t n
 		zil_set_logbias(os->os_zil, newval);
 }
 
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+	objset_t *os = arg;
+
+	os->os_recordsize = newval;
+}
+
 void
 dmu_objset_byteswap(void *buf, size_t size)
 {
@@ -225,42 +309,36 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat
 	os->os_spa = spa;
 	os->os_rootbp = bp;
 	if (!BP_IS_HOLE(os->os_rootbp)) {
-		uint32_t aflags = ARC_WAIT;
-		zbookmark_t zb;
+		arc_flags_t aflags = ARC_FLAG_WAIT;
+		zbookmark_phys_t zb;
 		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 		if (DMU_OS_IS_L2CACHEABLE(os))
-			aflags |= ARC_L2CACHE;
+			aflags |= ARC_FLAG_L2CACHE;
 
 		dprintf_bp(os->os_rootbp, "reading %s", "");
-		/*
-		 * NB: when bprewrite scrub can change the bp,
-		 * and this is called from dmu_objset_open_ds_os, the bp
-		 * could change, and we'll need a lock.
-		 */
-		err = arc_read_nolock(NULL, spa, os->os_rootbp,
+		err = arc_read(NULL, spa, os->os_rootbp,
 		    arc_getbuf_func, &os->os_phys_buf,
 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
-		if (err) {
+		if (err != 0) {
 			kmem_free(os, sizeof (objset_t));
 			/* convert checksum errors into IO errors */
 			if (err == ECKSUM)
-				err = EIO;
+				err = SET_ERROR(EIO);
 			return (err);
 		}
 
 		/* Increase the blocksize if we are permitted. */
 		if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
 		    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
-			arc_buf_t *buf = arc_buf_alloc(spa,
+			arc_buf_t *buf = arc_alloc_buf(spa,
 			    sizeof (objset_phys_t), &os->os_phys_buf,
 			    ARC_BUFC_METADATA);
 			bzero(buf->b_data, sizeof (objset_phys_t));
 			bcopy(os->os_phys_buf->b_data, buf->b_data,
 			    arc_buf_size(os->os_phys_buf));
-			(void) arc_buf_remove_ref(os->os_phys_buf,
-			    &os->os_phys_buf);
+			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
 			os->os_phys_buf = buf;
 		}
 
@@ -269,7 +347,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat
 	} else {
 		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
 		    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
-		os->os_phys_buf = arc_buf_alloc(spa, size,
+		os->os_phys_buf = arc_alloc_buf(spa, size,
 		    &os->os_phys_buf, ARC_BUFC_METADATA);
 		os->os_phys = os->os_phys_buf->b_data;
 		bzero(os->os_phys, size);
@@ -281,48 +359,91 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat
 	 * default (fletcher2/off).  Snapshots don't need to know about
 	 * checksum/compression/copies.
 	 */
-	if (ds) {
-		err = dsl_prop_register(ds, "primarycache",
+	if (ds != NULL) {
+		boolean_t needlock = B_FALSE;
+
+		/*
+		 * Note: it's valid to open the objset if the dataset is
+		 * long-held, in which case the pool_config lock will not
+		 * be held.
+		 */
+		if (!dsl_pool_config_held(dmu_objset_pool(os))) {
+			needlock = B_TRUE;
+			dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+		}
+		err = dsl_prop_register(ds,
+		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 		    primary_cache_changed_cb, os);
-		if (err == 0)
-			err = dsl_prop_register(ds, "secondarycache",
+		if (err == 0) {
+			err = dsl_prop_register(ds,
+			    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 			    secondary_cache_changed_cb, os);
-		if (!dsl_dataset_is_snapshot(ds)) {
-			if (err == 0)
-				err = dsl_prop_register(ds, "checksum",
+		}
+		if (!ds->ds_is_snapshot) {
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 				    checksum_changed_cb, os);
-			if (err == 0)
-				err = dsl_prop_register(ds, "compression",
+			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 				    compression_changed_cb, os);
-			if (err == 0)
-				err = dsl_prop_register(ds, "copies",
+			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_COPIES),
 				    copies_changed_cb, os);
-			if (err == 0)
-				err = dsl_prop_register(ds, "dedup",
+			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_DEDUP),
 				    dedup_changed_cb, os);
-			if (err == 0)
-				err = dsl_prop_register(ds, "logbias",
+			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
 				    logbias_changed_cb, os);
+			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_SYNC),
+				    sync_changed_cb, os);
+			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(
+				    ZFS_PROP_REDUNDANT_METADATA),
+				    redundant_metadata_changed_cb, os);
+			}
+			if (err == 0) {
+				err = dsl_prop_register(ds,
+				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+				    recordsize_changed_cb, os);
+			}
 		}
-		if (err) {
-			VERIFY(arc_buf_remove_ref(os->os_phys_buf,
-			    &os->os_phys_buf) == 1);
+		if (needlock)
+			dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+		if (err != 0) {
+			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
 			kmem_free(os, sizeof (objset_t));
 			return (err);
 		}
-	} else if (ds == NULL) {
+	} else {
 		/* It's the meta-objset. */
 		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
-		os->os_compress = ZIO_COMPRESS_LZJB;
+		os->os_compress = ZIO_COMPRESS_ON;
 		os->os_copies = spa_max_replication(spa);
 		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
-		os->os_dedup_verify = 0;
-		os->os_logbias = 0;
+		os->os_dedup_verify = B_FALSE;
+		os->os_logbias = ZFS_LOGBIAS_LATENCY;
+		os->os_sync = ZFS_SYNC_STANDARD;
 		os->os_primary_cache = ZFS_CACHE_ALL;
 		os->os_secondary_cache = ZFS_CACHE_ALL;
 	}
 
-	os->os_zil_header = os->os_phys->os_zil_header;
+	if (ds == NULL || !ds->ds_is_snapshot)
+		os->os_zil_header = os->os_phys->os_zil_header;
 	os->os_zil = zil_alloc(os, &os->os_zil_header);
 
 	for (i = 0; i < TXG_SIZE; i++) {
@@ -340,24 +461,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat
 	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 
-	os->os_meta_dnode = dnode_special_open(os,
-	    &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT);
+	dnode_special_open(os, &os->os_phys->os_meta_dnode,
+	    DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
 	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
-		os->os_userused_dnode = dnode_special_open(os,
-		    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT);
-		os->os_groupused_dnode = dnode_special_open(os,
-		    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT);
-	}
-
-	/*
-	 * We should be the only thread trying to do this because we
-	 * have ds_opening_lock
-	 */
-	if (ds) {
-		mutex_enter(&ds->ds_lock);
-		ASSERT(ds->ds_objset == NULL);
-		ds->ds_objset = os;
-		mutex_exit(&ds->ds_lock);
+		dnode_special_open(os, &os->os_phys->os_userused_dnode,
+		    DMU_USERUSED_OBJECT, &os->os_userused_dnode);
+		dnode_special_open(os, &os->os_phys->os_groupused_dnode,
+		    DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
 	}
 
 	*osp = os;
@@ -369,63 +479,160 @@ dmu_objset_from_ds(dsl_dataset_t *ds, ob
 {
 	int err = 0;
 
+	/*
+	 * We shouldn't be doing anything with dsl_dataset_t's unless the
+	 * pool_config lock is held, or the dataset is long-held.
+	 */
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
+	    dsl_dataset_long_held(ds));
+
 	mutex_enter(&ds->ds_opening_lock);
-	*osp = ds->ds_objset;
-	if (*osp == NULL) {
+	if (ds->ds_objset == NULL) {
+		objset_t *os;
+		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
-		    ds, &ds->ds_phys->ds_bp, osp);
+		    ds, dsl_dataset_get_blkptr(ds), &os);
+		rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+		if (err == 0) {
+			mutex_enter(&ds->ds_lock);
+			ASSERT(ds->ds_objset == NULL);
+			ds->ds_objset = os;
+			mutex_exit(&ds->ds_lock);
+		}
 	}
+	*osp = ds->ds_objset;
 	mutex_exit(&ds->ds_opening_lock);
 	return (err);
 }
 
-/* called from zpl */
+/*
+ * Holds the pool while the objset is held.  Therefore only one objset
+ * can be held at a time.
+ */
 int
 dmu_objset_hold(const char *name, void *tag, objset_t **osp)
 {
+	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int err;
 
-	err = dsl_dataset_hold(name, tag, &ds);
-	if (err)
+	err = dsl_pool_hold(name, tag, &dp);
+	if (err != 0)
+		return (err);
+	err = dsl_dataset_hold(dp, name, tag, &ds);
+	if (err != 0) {
+		dsl_pool_rele(dp, tag);
 		return (err);
+	}
 
 	err = dmu_objset_from_ds(ds, osp);
-	if (err)
+	if (err != 0) {
 		dsl_dataset_rele(ds, tag);
+		dsl_pool_rele(dp, tag);
+	}
 
 	return (err);
 }
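A minimal sketch of the hold/release pairing (illustrative; "tank/fs" is a placeholder dataset name):

	objset_t *os;

	if (dmu_objset_hold("tank/fs", FTAG, &os) == 0) {
		/* short-term, read-only use of 'os' ... */
		dmu_objset_rele(os, FTAG);	/* drops dataset + pool holds */
	}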
 
-/* called from zpl */
-int
-dmu_objset_own(const char *name, dmu_objset_type_t type,
+static int
+dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp)
 {
-	dsl_dataset_t *ds;
 	int err;
 
-	err = dsl_dataset_own(name, B_FALSE, tag, &ds);
-	if (err)
-		return (err);
-
 	err = dmu_objset_from_ds(ds, osp);
-	if (err) {
+	if (err != 0) {
 		dsl_dataset_disown(ds, tag);
 	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
-		dmu_objset_disown(*osp, tag);
-		return (EINVAL);
+		dsl_dataset_disown(ds, tag);
+		return (SET_ERROR(EINVAL));
 	} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
-		dmu_objset_disown(*osp, tag);
-		return (EROFS);
+		dsl_dataset_disown(ds, tag);
+		return (SET_ERROR(EROFS));
 	}
 	return (err);
 }
 
+/*
+ * dsl_pool must not be held when this is called.
+ * Upon successful return, there will be a longhold on the dataset,
+ * and the dsl_pool will not be held.
+ */
+int
+dmu_objset_own(const char *name, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
+	int err;
+
+	err = dsl_pool_hold(name, FTAG, &dp);
+	if (err != 0)
+		return (err);
+	err = dsl_dataset_own(dp, name, tag, &ds);
+	if (err != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (err);
+	}
+	err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
+	dsl_pool_rele(dp, FTAG);
+
+	return (err);
+}
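The owning variant, by contrast, long-holds the dataset and drops the pool hold before returning, so it suits long-lived consumers such as a mounted filesystem. A sketch under those assumptions:

	objset_t *os;

	if (dmu_objset_own("tank/fs", DMU_OST_ZFS, B_FALSE, FTAG, &os) == 0) {
		/* long-term use of 'os', across many transactions ... */
		dmu_objset_disown(os, FTAG);
	}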
+
+int
+dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp)
+{
+	dsl_dataset_t *ds;
+	int err;
+
+	err = dsl_dataset_own_obj(dp, obj, tag, &ds);
+	if (err != 0)
+		return (err);
+
+	return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
+}
+
 void
 dmu_objset_rele(objset_t *os, void *tag)
 {
+	dsl_pool_t *dp = dmu_objset_pool(os);
 	dsl_dataset_rele(os->os_dsl_dataset, tag);
+	dsl_pool_rele(dp, tag);
+}
+
+/*
+ * When we are called, os MUST refer to an objset associated with a dataset
+ * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
+ * == tag.  We will then release and reacquire ownership of the dataset while
+ * holding the pool config_rwlock so that no intervening namespace or
+ * ownership changes can occur.
+ *
+ * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
+ * release the hold on its dataset and acquire a new one on the dataset of the
+ * same name so that it can be partially torn down and reconstructed.
+ */
+void
+dmu_objset_refresh_ownership(objset_t *os, void *tag)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds, *newds;
+	char name[ZFS_MAX_DATASET_NAME_LEN];
+
+	ds = os->os_dsl_dataset;
+	VERIFY3P(ds, !=, NULL);
+	VERIFY3P(ds->ds_owner, ==, tag);
+	VERIFY(dsl_dataset_long_held(ds));
+
+	dsl_dataset_name(ds, name);
+	dp = dmu_objset_pool(os);
+	dsl_pool_config_enter(dp, FTAG);
+	dmu_objset_disown(os, tag);
+	VERIFY0(dsl_dataset_own(dp, name, tag, &newds));
+	VERIFY3P(newds, ==, os->os_dsl_dataset);
+	dsl_pool_config_exit(dp, FTAG);
 }
 
 void
@@ -434,45 +641,56 @@ dmu_objset_disown(objset_t *os, void *ta
 	dsl_dataset_disown(os->os_dsl_dataset, tag);
 }
 
-int
+void
 dmu_objset_evict_dbufs(objset_t *os)
 {
+	dnode_t dn_marker;
 	dnode_t *dn;
 
 	mutex_enter(&os->os_lock);
-
-	/* process the mdn last, since the other dnodes have holds on it */
-	list_remove(&os->os_dnodes, os->os_meta_dnode);
-	list_insert_tail(&os->os_dnodes, os->os_meta_dnode);
-
-	/*
-	 * Find the first dnode with holds.  We have to do this dance
-	 * because dnode_add_ref() only works if you already have a
-	 * hold.  If there are no holds then it has no dbufs so OK to
-	 * skip.
-	 */
-	for (dn = list_head(&os->os_dnodes);
-	    dn && !dnode_add_ref(dn, FTAG);
-	    dn = list_next(&os->os_dnodes, dn))
-		continue;
-
-	while (dn) {
-		dnode_t *next_dn = dn;
-
-		do {
-			next_dn = list_next(&os->os_dnodes, next_dn);
-		} while (next_dn && !dnode_add_ref(next_dn, FTAG));
-
-		mutex_exit(&os->os_lock);
-		dnode_evict_dbufs(dn);
-		dnode_rele(dn, FTAG);
-		mutex_enter(&os->os_lock);
-		dn = next_dn;
+	dn = list_head(&os->os_dnodes);
+	while (dn != NULL) {
+		/*
+		 * Skip dnodes without holds.  We have to do this dance
+		 * because dnode_add_ref() only works if there is already a
+		 * hold.  If the dnode has no holds, then it has no dbufs.
+		 */
+		if (dnode_add_ref(dn, FTAG)) {
+			list_insert_after(&os->os_dnodes, dn, &dn_marker);
+			mutex_exit(&os->os_lock);
+
+			dnode_evict_dbufs(dn);
+			dnode_rele(dn, FTAG);
+
+			mutex_enter(&os->os_lock);
+			dn = list_next(&os->os_dnodes, &dn_marker);
+			list_remove(&os->os_dnodes, &dn_marker);
+		} else {
+			dn = list_next(&os->os_dnodes, dn);
+		}
 	}
 	mutex_exit(&os->os_lock);
-	return (list_head(&os->os_dnodes) != os->os_meta_dnode);
+
+	if (DMU_USERUSED_DNODE(os) != NULL) {
+		dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
+		dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
+	}
+	dnode_evict_dbufs(DMU_META_DNODE(os));
 }
 
+/*
+ * Objset eviction processing is split into two pieces.
+ * The first marks the objset as evicting, evicts any dbufs that
+ * have a refcount of zero, and then queues up the objset for the
+ * second phase of eviction.  Once os->os_dnodes has been cleared by
+ * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
+ * The second phase closes the special dnodes, dequeues the objset from
+ * the list of those undergoing eviction, and finally frees the objset.
+ *
+ * NOTE: Due to asynchronous eviction processing (invocation of
+ *       dnode_buf_pageout()), it is possible for the meta dnode for the
+ *       objset to have no holds even though os->os_dnodes is not empty.
+ */
 void
 dmu_objset_evict(objset_t *os)
 {
@@ -481,44 +699,51 @@ dmu_objset_evict(objset_t *os)
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT(!dmu_objset_is_dirty(os, t));
 
-	if (ds) {
-		if (!dsl_dataset_is_snapshot(ds)) {
-			VERIFY(0 == dsl_prop_unregister(ds, "checksum",
-			    checksum_changed_cb, os));
-			VERIFY(0 == dsl_prop_unregister(ds, "compression",
-			    compression_changed_cb, os));
-			VERIFY(0 == dsl_prop_unregister(ds, "copies",
-			    copies_changed_cb, os));
-			VERIFY(0 == dsl_prop_unregister(ds, "dedup",
-			    dedup_changed_cb, os));
-			VERIFY(0 == dsl_prop_unregister(ds, "logbias",
-			    logbias_changed_cb, os));
-		}
-		VERIFY(0 == dsl_prop_unregister(ds, "primarycache",
-		    primary_cache_changed_cb, os));
-		VERIFY(0 == dsl_prop_unregister(ds, "secondarycache",
-		    secondary_cache_changed_cb, os));
+	if (ds)
+		dsl_prop_unregister_all(ds, os);
+
+	if (os->os_sa)
+		sa_tear_down(os);
+
+	dmu_objset_evict_dbufs(os);
+
+	mutex_enter(&os->os_lock);
+	spa_evicting_os_register(os->os_spa, os);
+	if (list_is_empty(&os->os_dnodes)) {
+		mutex_exit(&os->os_lock);
+		dmu_objset_evict_done(os);
+	} else {
+		mutex_exit(&os->os_lock);
 	}
+}
 
-	/*
-	 * We should need only a single pass over the dnode list, since
-	 * nothing can be added to the list at this point.
-	 */
-	(void) dmu_objset_evict_dbufs(os);
-
-	dnode_special_close(os->os_meta_dnode);
-	if (os->os_userused_dnode) {
-		dnode_special_close(os->os_userused_dnode);
-		dnode_special_close(os->os_groupused_dnode);
+void
+dmu_objset_evict_done(objset_t *os)
+{
+	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+
+	dnode_special_close(&os->os_meta_dnode);
+	if (DMU_USERUSED_DNODE(os)) {
+		dnode_special_close(&os->os_userused_dnode);
+		dnode_special_close(&os->os_groupused_dnode);
 	}
 	zil_free(os->os_zil);
 
-	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+	arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
+
+	/*
+	 * This is a barrier to prevent the objset from going away in
+	 * dnode_move() until we can safely ensure that the objset is still in
+	 * use. We consider the objset valid before the barrier and invalid
+	 * after the barrier.
+	 */
+	rw_enter(&os_lock, RW_READER);
+	rw_exit(&os_lock);
 
-	VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1);
 	mutex_destroy(&os->os_lock);
 	mutex_destroy(&os->os_obj_lock);
 	mutex_destroy(&os->os_user_ptr_lock);
+	spa_evicting_os_deregister(os->os_spa, os);
 	kmem_free(os, sizeof (objset_t));
 }
 
@@ -537,12 +762,13 @@ dmu_objset_create_impl(spa_t *spa, dsl_d
 	dnode_t *mdn;
 
 	ASSERT(dmu_tx_is_syncing(tx));
-	if (ds)
-		mutex_enter(&ds->ds_opening_lock);
-	VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &os));
-	if (ds)
-		mutex_exit(&ds->ds_opening_lock);
-	mdn = os->os_meta_dnode;
+
+	if (ds != NULL)
+		VERIFY0(dmu_objset_from_ds(ds, &os));
+	else
+		VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
+
+	mdn = DMU_META_DNODE(os);
 
 	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
 	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
@@ -563,11 +789,17 @@ dmu_objset_create_impl(spa_t *spa, dsl_d
 
 		/*
 		 * Determine the number of levels necessary for the meta-dnode
-		 * to contain DN_MAX_OBJECT dnodes.
+		 * to contain DN_MAX_OBJECT dnodes.  Note that in order to
+		 * ensure that we do not overflow 64 bits, there has to be
+		 * a nlevels that gives us a number of blocks > DN_MAX_OBJECT
+		 * but < 2^64.  Therefore,
+		 * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be
+		 * less than (64 - log2(DN_MAX_OBJECT)) (16).
 		 */
-		while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
+		while ((uint64_t)mdn->dn_nblkptr <<
+		    (mdn->dn_datablkshift - DNODE_SHIFT +
 		    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
-		    DN_MAX_OBJECT * sizeof (dnode_phys_t))
+		    DN_MAX_OBJECT)
 			levels++;
 
 		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
@@ -588,299 +820,202 @@ dmu_objset_create_impl(spa_t *spa, dsl_d
 	return (os);
 }
 
-struct oscarg {
-	void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
-	void *userarg;
-	dsl_dataset_t *clone_origin;
-	const char *lastname;
-	dmu_objset_type_t type;
-	uint64_t flags;
-};
+typedef struct dmu_objset_create_arg {
+	const char *doca_name;
+	cred_t *doca_cred;
+	void (*doca_userfunc)(objset_t *os, void *arg,
+	    cred_t *cr, dmu_tx_t *tx);
+	void *doca_userarg;
+	dmu_objset_type_t doca_type;
+	uint64_t doca_flags;
+} dmu_objset_create_arg_t;
 
 /*ARGSUSED*/
 static int
-dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dmu_objset_create_check(void *arg, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
-	struct oscarg *oa = arg2;
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
-	int err;
-	uint64_t ddobj;
+	dmu_objset_create_arg_t *doca = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dir_t *pdd;
+	const char *tail;
+	int error;
+
+	if (strchr(doca->doca_name, '@') != NULL)
+		return (SET_ERROR(EINVAL));
 
-	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
-	    oa->lastname, sizeof (uint64_t), 1, &ddobj);
-	if (err != ENOENT)
-		return (err ? err : EEXIST);
-
-	if (oa->clone_origin != NULL) {
-		/* You can't clone across pools. */
-		if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool)
-			return (EXDEV);
-
-		/* You can only clone snapshots, not the head datasets. */
-		if (!dsl_dataset_is_snapshot(oa->clone_origin))
-			return (EINVAL);
+	if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
+		return (SET_ERROR(ENAMETOOLONG));
+
+	error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
+	if (error != 0)
+		return (error);
+	if (tail == NULL) {
+		dsl_dir_rele(pdd, FTAG);
+		return (SET_ERROR(EEXIST));
 	}
+	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+	    doca->doca_cred);
+	dsl_dir_rele(pdd, FTAG);
 
-	return (0);
+	return (error);
 }
 
 static void
-dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
-	struct oscarg *oa = arg2;
-	uint64_t dsobj;
-
-	ASSERT(dmu_tx_is_syncing(tx));
+	dmu_objset_create_arg_t *doca = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dir_t *pdd;
+	const char *tail;
+	dsl_dataset_t *ds;
+	uint64_t obj;
+	blkptr_t *bp;
+	objset_t *os;
 
-	dsobj = dsl_dataset_create_sync(dd, oa->lastname,
-	    oa->clone_origin, oa->flags, cr, tx);
+	VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
 
-	if (oa->clone_origin == NULL) {
-		dsl_dataset_t *ds;
-		blkptr_t *bp;
-		objset_t *os;
+	obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
+	    doca->doca_cred, tx);
 
-		VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj,
-		    FTAG, &ds));
-		bp = dsl_dataset_get_blkptr(ds);
-		ASSERT(BP_IS_HOLE(bp));
+	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	bp = dsl_dataset_get_blkptr(ds);
+	os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
+	    ds, bp, doca->doca_type, tx);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
-		os = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
-		    ds, bp, oa->type, tx);
-
-		if (oa->userfunc)
-			oa->userfunc(os, oa->userarg, cr, tx);
-		dsl_dataset_rele(ds, FTAG);
+	if (doca->doca_userfunc != NULL) {
+		doca->doca_userfunc(os, doca->doca_userarg,
+		    doca->doca_cred, tx);
 	}
 
-	spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa,
-	    tx, cr, "dataset = %llu", dsobj);
+	spa_history_log_internal_ds(ds, "create", tx, "");
+	dsl_dataset_rele(ds, FTAG);
+	dsl_dir_rele(pdd, FTAG);
 }
 
 int
 dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
 {
-	dsl_dir_t *pdd;
-	const char *tail;
-	int err = 0;
-	struct oscarg oa = { 0 };
+	dmu_objset_create_arg_t doca;
 
-	ASSERT(strchr(name, '@') == NULL);
-	err = dsl_dir_open(name, FTAG, &pdd, &tail);
-	if (err)
-		return (err);
-	if (tail == NULL) {
-		dsl_dir_close(pdd, FTAG);
-		return (EEXIST);
-	}
-
-	oa.userfunc = func;
-	oa.userarg = arg;
-	oa.lastname = tail;
-	oa.type = type;
-	oa.flags = flags;
-
-	err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
-	    dmu_objset_create_sync, pdd, &oa, 5);
-	dsl_dir_close(pdd, FTAG);
-	return (err);
-}
+	doca.doca_name = name;
+	doca.doca_cred = CRED();
+	doca.doca_flags = flags;
+	doca.doca_userfunc = func;
+	doca.doca_userarg = arg;
+	doca.doca_type = type;
+
+	return (dsl_sync_task(name,
+	    dmu_objset_create_check, dmu_objset_create_sync, &doca,
+	    5, ZFS_SPACE_CHECK_NORMAL));
+}
+
+typedef struct dmu_objset_clone_arg {
+	const char *doca_clone;
+	const char *doca_origin;
+	cred_t *doca_cred;
+} dmu_objset_clone_arg_t;
 
-int
-dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags)
+/*ARGSUSED*/
+static int
+dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
 {
+	dmu_objset_clone_arg_t *doca = arg;
 	dsl_dir_t *pdd;
 	const char *tail;
-	int err = 0;
-	struct oscarg oa = { 0 };
-
-	ASSERT(strchr(name, '@') == NULL);
-	err = dsl_dir_open(name, FTAG, &pdd, &tail);
-	if (err)
-		return (err);
-	if (tail == NULL) {
-		dsl_dir_close(pdd, FTAG);
-		return (EEXIST);
-	}
-
-	oa.lastname = tail;
-	oa.clone_origin = clone_origin;
-	oa.flags = flags;
+	int error;
+	dsl_dataset_t *origin;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
 
-	err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check,
-	    dmu_objset_create_sync, pdd, &oa, 5);
-	dsl_dir_close(pdd, FTAG);
-	return (err);
-}
+	if (strchr(doca->doca_clone, '@') != NULL)
+		return (SET_ERROR(EINVAL));
 
-int
-dmu_objset_destroy(const char *name, boolean_t defer)
-{
-	dsl_dataset_t *ds;
-	int error;
+	if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
+		return (SET_ERROR(ENAMETOOLONG));
 
-	/*
-	 * dsl_dataset_destroy() can free any claimed-but-unplayed
-	 * intent log, but if there is an active log, it has blocks that
-	 * are allocated, but may not yet be reflected in the on-disk
-	 * structure.  Only the ZIL knows how to free them, so we have
-	 * to call into it here.
-	 */
-	error = dsl_dataset_own(name, B_TRUE, FTAG, &ds);
-	if (error == 0) {
-		objset_t *os;
-		if (dmu_objset_from_ds(ds, &os) == 0)
-			zil_destroy(dmu_objset_zil(os), B_FALSE);
-		error = dsl_dataset_destroy(ds, FTAG, defer);
-		/* dsl_dataset_destroy() closes the ds. */
+	error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
+	if (error != 0)
+		return (error);
+	if (tail == NULL) {
+		dsl_dir_rele(pdd, FTAG);
+		return (SET_ERROR(EEXIST));
 	}
 
-	return (error);
-}
+	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+	    doca->doca_cred);
+	if (error != 0) {
+		dsl_dir_rele(pdd, FTAG);
+		return (SET_ERROR(EDQUOT));
+	}
+	dsl_dir_rele(pdd, FTAG);
 
-struct snaparg {
-	dsl_sync_task_group_t *dstg;
-	char *snapname;
-	char failed[MAXPATHLEN];
-	boolean_t recursive;
-	nvlist_t *props;
-};
+	error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
+	if (error != 0)
+		return (error);
 
-static int
-snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	objset_t *os = arg1;
-	struct snaparg *sn = arg2;
-
-	/* The props have already been checked by zfs_check_userprops(). */
+	/* You can only clone snapshots, not the head datasets. */
+	if (!origin->ds_is_snapshot) {
+		dsl_dataset_rele(origin, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+	dsl_dataset_rele(origin, FTAG);
 
-	return (dsl_dataset_snapshot_check(os->os_dsl_dataset,
-	    sn->snapname, tx));
+	return (0);
 }
 
 static void
-snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
 {
-	objset_t *os = arg1;
-	dsl_dataset_t *ds = os->os_dsl_dataset;
-	struct snaparg *sn = arg2;
+	dmu_objset_clone_arg_t *doca = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dir_t *pdd;
+	const char *tail;
+	dsl_dataset_t *origin, *ds;
+	uint64_t obj;
+	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
 
-	dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx);
+	VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
+	VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
 
-	if (sn->props) {
-		dsl_props_arg_t pa;
-		pa.pa_props = sn->props;
-		pa.pa_source = ZPROP_SRC_LOCAL;
-		dsl_props_set_sync(ds->ds_prev, &pa, cr, tx);
-	}
+	obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
+	    doca->doca_cred, tx);
+
+	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
+	dsl_dataset_name(origin, namebuf);
+	spa_history_log_internal_ds(ds, "clone", tx,
+	    "origin=%s (%llu)", namebuf, origin->ds_object);
+	dsl_dataset_rele(ds, FTAG);
+	dsl_dataset_rele(origin, FTAG);
+	dsl_dir_rele(pdd, FTAG);
 }
 
-static int
-dmu_objset_snapshot_one(const char *name, void *arg)
+int
+dmu_objset_clone(const char *clone, const char *origin)
 {
-	struct snaparg *sn = arg;
-	objset_t *os;
-	int err;
-	char *cp;
+	dmu_objset_clone_arg_t doca;
 
-	/*
-	 * If the objset starts with a '%', then ignore it unless it was
-	 * explicitly named (ie, not recursive).  These hidden datasets
-	 * are always inconsistent, and by not opening them here, we can
-	 * avoid a race with dsl_dir_destroy_check().
-	 */
-	cp = strrchr(name, '/');
-	if (cp && cp[1] == '%' && sn->recursive)
-		return (0);
-
-	(void) strcpy(sn->failed, name);
-
-	/*
-	 * Check permissions if we are doing a recursive snapshot.  The
-	 * permission checks for the starting dataset have already been
-	 * performed in zfs_secpolicy_snapshot()
-	 */
-	if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED())))
-		return (err);
-
-	err = dmu_objset_hold(name, sn, &os);
-	if (err != 0)
-		return (err);
-
-	/*
-	 * If the objset is in an inconsistent state (eg, in the process
-	 * of being destroyed), don't snapshot it.  As with %hidden
-	 * datasets, we return EBUSY if this name was explicitly
-	 * requested (ie, not recursive), and otherwise ignore it.
-	 */
-	if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) {
-		dmu_objset_rele(os, sn);
-		return (sn->recursive ? 0 : EBUSY);
-	}
+	doca.doca_clone = clone;
+	doca.doca_origin = origin;
+	doca.doca_cred = CRED();
 
-	/*
-	 * NB: we need to wait for all in-flight changes to get to disk,
-	 * so that we snapshot those changes.  zil_suspend does this as
-	 * a side effect.
-	 */
-	err = zil_suspend(dmu_objset_zil(os));
-	if (err == 0) {
-		dsl_sync_task_create(sn->dstg, snapshot_check,
-		    snapshot_sync, os, sn, 3);
-	} else {
-		dmu_objset_rele(os, sn);
-	}
-
-	return (err);
+	return (dsl_sync_task(clone,
+	    dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
+	    5, ZFS_SPACE_CHECK_NORMAL));
 }
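
Both dmu_objset_create() and dmu_objset_clone() now share the two-phase shape required by dsl_sync_task(): a check callback that validates and may fail without side effects, and a sync callback that must succeed and applies the change. A minimal userland model of that contract follows (the run_sync_task() dispatcher, tx_t type, and demo callbacks are invented for illustration; this is not the DSL implementation):

#include <errno.h>
#include <stdio.h>

typedef struct tx { unsigned long txg; } tx_t;

typedef int (*check_cb_t)(void *arg, tx_t *tx);
typedef void (*sync_cb_t)(void *arg, tx_t *tx);

/*
 * Model of a sync task: check runs first and may fail, in which
 * case nothing has been modified; sync runs only if check passed.
 */
static int
run_sync_task(check_cb_t check, sync_cb_t sync, void *arg, tx_t *tx)
{
	int err = check(arg, tx);

	if (err != 0)
		return (err);
	sync(arg, tx);
	return (0);
}

/* ARGSUSED */
static int
demo_check(void *arg, tx_t *tx)
{
	const char *name = arg;

	return (name[0] == '\0' ? EINVAL : 0);
}

static void
demo_sync(void *arg, tx_t *tx)
{
	printf("txg %lu: creating %s\n", tx->txg, (char *)arg);
}

int
main(void)
{
	tx_t tx = { .txg = 1234 };
	char name[] = "pool/fs/clone";

	return (run_sync_task(demo_check, demo_sync, name, &tx));
}
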
 
 int
-dmu_objset_snapshot(char *fsname, char *snapname,
-    nvlist_t *props, boolean_t recursive)
+dmu_objset_snapshot_one(const char *fsname, const char *snapname)
 {
-	dsl_sync_task_t *dst;
-	struct snaparg sn;
-	spa_t *spa;
 	int err;
+	char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
+	nvlist_t *snaps = fnvlist_alloc();
 
-	(void) strcpy(sn.failed, fsname);
-
-	err = spa_open(fsname, &spa, FTAG);
-	if (err)
-		return (err);
-
-	sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-	sn.snapname = snapname;
-	sn.props = props;
-	sn.recursive = recursive;
-
-	if (recursive) {
-		err = dmu_objset_find(fsname,
-		    dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN);
-	} else {
-		err = dmu_objset_snapshot_one(fsname, &sn);
-	}
-
-	if (err == 0)
-		err = dsl_sync_task_group_wait(sn.dstg);
-
-	for (dst = list_head(&sn.dstg->dstg_tasks); dst;
-	    dst = list_next(&sn.dstg->dstg_tasks, dst)) {
-		objset_t *os = dst->dst_arg1;
-		dsl_dataset_t *ds = os->os_dsl_dataset;
-		if (dst->dst_err)
-			dsl_dataset_name(ds, sn.failed);
-		zil_resume(dmu_objset_zil(os));
-		dmu_objset_rele(os, &sn);
-	}
-
-	if (err)
-		(void) strcpy(fsname, sn.failed);
-	dsl_sync_task_group_destroy(sn.dstg);
-	spa_close(spa, FTAG);
+	fnvlist_add_boolean(snaps, longsnap);
+	strfree(longsnap);
+	err = dsl_dataset_snapshot(snaps, NULL, NULL);
+	fnvlist_free(snaps);
 	return (err);
 }
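
The rewritten dmu_objset_snapshot_one() builds the full "fs@snap" name with kmem_asprintf() and hands a one-entry nvlist to dsl_dataset_snapshot(). A userland sketch of just the name-assembly step, substituting asprintf() for kmem_asprintf() (the _GNU_SOURCE guard is only needed for glibc; asprintf is native on the BSDs):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

/* Build "fs@snap" the way kmem_asprintf() is used above. */
static char *
make_longsnap(const char *fsname, const char *snapname)
{
	char *longsnap;

	if (asprintf(&longsnap, "%s@%s", fsname, snapname) < 0)
		return (NULL);
	return (longsnap);
}

int
main(void)
{
	char *s = make_longsnap("tank/home", "backup-2017");

	if (s == NULL)
		return (1);
	printf("%s\n", s);	/* tank/home@backup-2017 */
	free(s);
	return (0);
}
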
 
@@ -919,9 +1054,9 @@ dmu_objset_write_ready(zio_t *zio, arc_b
 	objset_t *os = arg;
 	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
 
-	ASSERT(bp == os->os_rootbp);
-	ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET);
-	ASSERT(BP_GET_LEVEL(bp) == 0);
+	ASSERT(!BP_IS_EMBEDDED(bp));
+	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
+	ASSERT0(BP_GET_LEVEL(bp));
 
 	/*
 	 * Update rootbp fill count: it should be the number of objects
@@ -931,7 +1066,12 @@ dmu_objset_write_ready(zio_t *zio, arc_b
 	 */
 	bp->blk_fill = 0;
 	for (int i = 0; i < dnp->dn_nblkptr; i++)
-		bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+		bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
+	if (os->os_dsl_dataset != NULL)
+		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
+	*os->os_rootbp = *bp;
+	if (os->os_dsl_dataset != NULL)
+		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
 }
 
 /* ARGSUSED */
@@ -951,6 +1091,7 @@ dmu_objset_write_done(zio_t *zio, arc_bu
 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, bp, tx);
 	}
+	kmem_free(bp, sizeof (*bp));
 }
 
 /* called from dsl */
@@ -958,12 +1099,14 @@ void
 dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 {
 	int txgoff;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
 	list_t *list;
 	list_t *newlist = NULL;
 	dbuf_dirty_record_t *dr;
+	blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
+	*blkptr_copy = *os->os_rootbp;
 
 	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
 
@@ -983,33 +1126,32 @@ dmu_objset_sync(objset_t *os, zio_t *pio
 	/*
 	 * Create the root block IO
 	 */
-	arc_release(os->os_phys_buf, &os->os_phys_buf);
-
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+	arc_release(os->os_phys_buf, &os->os_phys_buf);
 
 	dmu_write_policy(os, NULL, 0, 0, &zp);
 
 	zio = arc_write(pio, os->os_spa, tx->tx_txg,
-	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp,
-	    dmu_objset_write_ready, dmu_objset_write_done, os,
-	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+	    blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
+	    &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
+	    os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 
 	/*
 	 * Sync special dnodes - the parent IO for the sync is the root block
 	 */
-	os->os_meta_dnode->dn_zio = zio;
-	dnode_sync(os->os_meta_dnode, tx);
+	DMU_META_DNODE(os)->dn_zio = zio;
+	dnode_sync(DMU_META_DNODE(os), tx);
 
 	os->os_phys->os_flags = os->os_flags;
 
-	if (os->os_userused_dnode &&
-	    os->os_userused_dnode->dn_type != DMU_OT_NONE) {
-		os->os_userused_dnode->dn_zio = zio;
-		dnode_sync(os->os_userused_dnode, tx);
-		os->os_groupused_dnode->dn_zio = zio;
-		dnode_sync(os->os_groupused_dnode, tx);
+	if (DMU_USERUSED_DNODE(os) &&
+	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
+		DMU_USERUSED_DNODE(os)->dn_zio = zio;
+		dnode_sync(DMU_USERUSED_DNODE(os), tx);
+		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
+		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
 	}
 
 	txgoff = tx->tx_txg & TXG_MASK;
@@ -1027,9 +1169,9 @@ dmu_objset_sync(objset_t *os, zio_t *pio
 	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
 	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
 
-	list = &os->os_meta_dnode->dn_dirty_records[txgoff];
+	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
 	while (dr = list_head(list)) {
-		ASSERT(dr->dr_dbuf->db_level == 0);
+		ASSERT0(dr->dr_dbuf->db_level);
 		list_remove(list, dr);
 		if (dr->dr_zio)
 			zio_nowait(dr->dr_zio);
@@ -1061,80 +1203,296 @@ boolean_t
 dmu_objset_userused_enabled(objset_t *os)
 {
 	return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
-	    used_cbs[os->os_phys->os_type] &&
-	    os->os_userused_dnode);
+	    used_cbs[os->os_phys->os_type] != NULL &&
+	    DMU_USERUSED_DNODE(os) != NULL);
+}
+
+typedef struct userquota_node {
+	uint64_t uqn_id;
+	int64_t uqn_delta;
+	avl_node_t uqn_node;
+} userquota_node_t;
+
+typedef struct userquota_cache {
+	avl_tree_t uqc_user_deltas;
+	avl_tree_t uqc_group_deltas;
+} userquota_cache_t;
+
+static int
+userquota_compare(const void *l, const void *r)
+{
+	const userquota_node_t *luqn = l;
+	const userquota_node_t *ruqn = r;
+
+	if (luqn->uqn_id < ruqn->uqn_id)
+		return (-1);
+	if (luqn->uqn_id > ruqn->uqn_id)
+		return (1);
+	return (0);
 }
 
 static void
-do_userquota_callback(objset_t *os, dnode_phys_t *dnp,
-    boolean_t subtract, dmu_tx_t *tx)
+do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
 {
-	static const char zerobuf[DN_MAX_BONUSLEN] = {0};
-	uint64_t user, group;
+	void *cookie;
+	userquota_node_t *uqn;
 
-	ASSERT(dnp->dn_type != 0 ||
-	    (bcmp(DN_BONUS(dnp), zerobuf, DN_MAX_BONUSLEN) == 0 &&
-	    DN_USED_BYTES(dnp) == 0));
-
-	if ((dnp->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) &&
-	    0 == used_cbs[os->os_phys->os_type](dnp->dn_bonustype,
-	    DN_BONUS(dnp), &user, &group)) {
-		int64_t delta = DNODE_SIZE + DN_USED_BYTES(dnp);
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	cookie = NULL;
+	while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
+	    &cookie)) != NULL) {
+		VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT,
+		    uqn->uqn_id, uqn->uqn_delta, tx));
+		kmem_free(uqn, sizeof (*uqn));
+	}
+	avl_destroy(&cache->uqc_user_deltas);
+
+	cookie = NULL;
+	while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
+	    &cookie)) != NULL) {
+		VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT,
+		    uqn->uqn_id, uqn->uqn_delta, tx));
+		kmem_free(uqn, sizeof (*uqn));
+	}
+	avl_destroy(&cache->uqc_group_deltas);
+}
+
+static void
+userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta)
+{
+	userquota_node_t search = { .uqn_id = id };
+	avl_index_t idx;
+
+	userquota_node_t *uqn = avl_find(avl, &search, &idx);
+	if (uqn == NULL) {
+		uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
+		uqn->uqn_id = id;
+		avl_insert(avl, uqn, idx);
+	}
+	uqn->uqn_delta += delta;
+}
+
+static void
+do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
+    uint64_t user, uint64_t group, boolean_t subtract)
+{
+	if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
+		int64_t delta = DNODE_SIZE + used;
 		if (subtract)
 			delta = -delta;
-		VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
-		    user, delta, tx));
-		VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
-		    group, delta, tx));
+
+		userquota_update_cache(&cache->uqc_user_deltas, user, delta);
+		userquota_update_cache(&cache->uqc_group_deltas, group, delta);
 	}
 }
 
 void
-dmu_objset_do_userquota_callbacks(objset_t *os, dmu_tx_t *tx)
+dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	list_t *list = &os->os_synced_dnodes;
+	userquota_cache_t cache = { 0 };
 
 	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
 
+	avl_create(&cache.uqc_user_deltas, userquota_compare,
+	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+	avl_create(&cache.uqc_group_deltas, userquota_compare,
+	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+
 	while (dn = list_head(list)) {
+		int flags;
 		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
-		ASSERT(dn->dn_oldphys);
 		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
 		    dn->dn_phys->dn_flags &
 		    DNODE_FLAG_USERUSED_ACCOUNTED);
 
 		/* Allocate the user/groupused objects if necessary. */
-		if (os->os_userused_dnode->dn_type == DMU_OT_NONE) {
-			VERIFY(0 == zap_create_claim(os,
+		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
+			VERIFY0(zap_create_claim(os,
 			    DMU_USERUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
-			VERIFY(0 == zap_create_claim(os,
+			VERIFY0(zap_create_claim(os,
 			    DMU_GROUPUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 		}
 
-		/*
-		 * We intentionally modify the zap object even if the
-		 * net delta (due to phys-oldphys) is zero.  Otherwise
-		 * the block of the zap obj could be shared between
-		 * datasets but need to be different between them after
-		 * a bprewrite.
-		 */
-		do_userquota_callback(os, dn->dn_oldphys, B_TRUE, tx);
-		do_userquota_callback(os, dn->dn_phys, B_FALSE, tx);
+		flags = dn->dn_id_flags;
+		ASSERT(flags);
+		if (flags & DN_ID_OLD_EXIST)  {
+			do_userquota_update(&cache,
+			    dn->dn_oldused, dn->dn_oldflags,
+			    dn->dn_olduid, dn->dn_oldgid, B_TRUE);
+		}
+		if (flags & DN_ID_NEW_EXIST) {
+			do_userquota_update(&cache,
+			    DN_USED_BYTES(dn->dn_phys),
+			    dn->dn_phys->dn_flags, dn->dn_newuid,
+			    dn->dn_newgid, B_FALSE);
+		}
 
-		/*
-		 * The mutex is needed here for interlock with dnode_allocate.
-		 */
 		mutex_enter(&dn->dn_mtx);
-		zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t));
-		dn->dn_oldphys = NULL;
+		dn->dn_oldused = 0;
+		dn->dn_oldflags = 0;
+		if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
+			dn->dn_olduid = dn->dn_newuid;
+			dn->dn_oldgid = dn->dn_newgid;
+			dn->dn_id_flags |= DN_ID_OLD_EXIST;
+			if (dn->dn_bonuslen == 0)
+				dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+			else
+				dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+		}
+		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
 		mutex_exit(&dn->dn_mtx);
 
 		list_remove(list, dn);
 		dnode_rele(dn, list);
 	}
+	do_userquota_cacheflush(os, &cache, tx);
+}
+
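
The point of the AVL-backed userquota cache above is that any number of dnode updates touching the same uid/gid collapse into a single zap_increment_int() per id at flush time. A self-contained sketch of the same coalescing idea, using a small sorted array in place of the kernel AVL tree (all names are invented; the printf stands in for the ZAP update):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define	MAX_IDS	64

typedef struct { uint64_t id; int64_t delta; } uq_node_t;

static uq_node_t cache[MAX_IDS];
static int ncached;

/* Coalesce a delta for one id; insertion keeps the array sorted. */
static void
uq_update(uint64_t id, int64_t delta)
{
	int i = 0;

	while (i < ncached && cache[i].id < id)
		i++;
	if (i < ncached && cache[i].id == id) {
		cache[i].delta += delta;	/* existing entry: coalesce */
		return;
	}
	if (ncached == MAX_IDS)
		return;		/* sketch only: no overflow handling */
	memmove(&cache[i + 1], &cache[i],
	    (ncached - i) * sizeof (uq_node_t));
	cache[i].id = id;
	cache[i].delta = delta;
	ncached++;
}

/* Flush: one "ZAP increment" per id, however many updates arrived. */
static void
uq_flush(void)
{
	for (int i = 0; i < ncached; i++)
		printf("zap_increment_int(id=%llu, delta=%lld)\n",
		    (unsigned long long)cache[i].id,
		    (long long)cache[i].delta);
	ncached = 0;
}

int
main(void)
{
	uq_update(1000, 512);
	uq_update(1000, 512);
	uq_update(0, -512);
	uq_flush();		/* prints two lines: ids 0 and 1000 */
	return (0);
}
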
+/*
+ * Returns a pointer to the data from which to determine the uid/gid.
+ *
+ * If a dirty record for the transaction group that is syncing can't
+ * be found, then NULL is returned.  In the NULL case it is assumed
+ * the uid/gid aren't changing.
+ */
+static void *
+dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+	dbuf_dirty_record_t *dr, **drp;
+	void *data;
+
+	if (db->db_dirtycnt == 0)
+		return (db->db.db_data);  /* Nothing is changing */
+
+	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
+		if (dr->dr_txg == tx->tx_txg)
+			break;
+
+	if (dr == NULL) {
+		data = NULL;
+	} else {
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(dr->dr_dbuf);
+		dn = DB_DNODE(dr->dr_dbuf);
+
+		if (dn->dn_bonuslen == 0 &&
+		    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
+			data = dr->dt.dl.dr_data->b_data;
+		else
+			data = dr->dt.dl.dr_data;
+
+		DB_DNODE_EXIT(dr->dr_dbuf);
+	}
+
+	return (data);
+}
+
+void
+dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
+{
+	objset_t *os = dn->dn_objset;
+	void *data = NULL;
+	dmu_buf_impl_t *db = NULL;
+	uint64_t *user = NULL;
+	uint64_t *group = NULL;
+	int flags = dn->dn_id_flags;
+	int error;
+	boolean_t have_spill = B_FALSE;
+
+	if (!dmu_objset_userused_enabled(dn->dn_objset))
+		return;
+
+	if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
+	    DN_ID_CHKED_SPILL)))
+		return;
+
+	if (before && dn->dn_bonuslen != 0)
+		data = DN_BONUS(dn->dn_phys);
+	else if (!before && dn->dn_bonuslen != 0) {
+		if (dn->dn_bonus) {
+			db = dn->dn_bonus;
+			mutex_enter(&db->db_mtx);
+			data = dmu_objset_userquota_find_data(db, tx);
+		} else {
+			data = DN_BONUS(dn->dn_phys);
+		}
+	} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
+			int rf = 0;
+
+			if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
+				rf |= DB_RF_HAVESTRUCT;
+			error = dmu_spill_hold_by_dnode(dn,
+			    rf | DB_RF_MUST_SUCCEED,
+			    FTAG, (dmu_buf_t **)&db);
+			ASSERT(error == 0);
+			mutex_enter(&db->db_mtx);
+			data = (before) ? db->db.db_data :
+			    dmu_objset_userquota_find_data(db, tx);
+			have_spill = B_TRUE;
+	} else {
+		mutex_enter(&dn->dn_mtx);
+		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+		mutex_exit(&dn->dn_mtx);
+		return;
+	}
+
+	if (before) {
+		ASSERT(data);
+		user = &dn->dn_olduid;
+		group = &dn->dn_oldgid;
+	} else if (data) {
+		user = &dn->dn_newuid;
+		group = &dn->dn_newgid;
+	}
+
+	/*
+	 * We must always call the callback in case the object
+	 * type has changed and the new type isn't one we track.
+	 */
+	error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
+	    user, group);
+
+	/*
+	 * Preserve the existing uid/gid when the callback can't determine
+	 * what the new uid/gid are and the callback returned EEXIST.
+	 * The EEXIST error tells us to just use the existing uid/gid.
+	 * If we don't know what the old values are, then just assign
+	 * them to 0, since this is a new file being created.
+	 */
+	if (!before && data == NULL && error == EEXIST) {
+		if (flags & DN_ID_OLD_EXIST) {
+			dn->dn_newuid = dn->dn_olduid;
+			dn->dn_newgid = dn->dn_oldgid;
+		} else {
+			dn->dn_newuid = 0;
+			dn->dn_newgid = 0;
+		}
+		error = 0;
+	}
+
+	if (db)
+		mutex_exit(&db->db_mtx);
+
+	mutex_enter(&dn->dn_mtx);
+	if (error == 0 && before)
+		dn->dn_id_flags |= DN_ID_OLD_EXIST;
+	if (error == 0 && !before)
+		dn->dn_id_flags |= DN_ID_NEW_EXIST;
+
+	if (have_spill) {
+		dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+	} else {
+		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+	}
+	mutex_exit(&dn->dn_mtx);
+	if (have_spill)
+		dmu_buf_rele((dmu_buf_t *)db, FTAG);
 }
 
 boolean_t
@@ -1153,9 +1511,9 @@ dmu_objset_userspace_upgrade(objset_t *o
 	if (dmu_objset_userspace_present(os))
 		return (0);
 	if (!dmu_objset_userused_enabled(os))
-		return (ENOTSUP);
+		return (SET_ERROR(ENOTSUP));
 	if (dmu_objset_is_snapshot(os))
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	/*
 	 * We simply need to mark every object dirty, so that it will be
@@ -1171,15 +1529,15 @@ dmu_objset_userspace_upgrade(objset_t *o
 		int objerr;
 
 		if (issig(JUSTLOOKING) && issig(FORREAL))
-			return (EINTR);
+			return (SET_ERROR(EINTR));
 
 		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
-		if (objerr)
+		if (objerr != 0)
 			continue;
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, obj);
 		objerr = dmu_tx_assign(tx, TXG_WAIT);
-		if (objerr) {
+		if (objerr != 0) {
 			dmu_tx_abort(tx);
 			continue;
 		}
@@ -1234,7 +1592,7 @@ int
 dmu_objset_is_snapshot(objset_t *os)
 {
 	if (os->os_dsl_dataset != NULL)
-		return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
+		return (os->os_dsl_dataset->ds_is_snapshot);
 	else
 		return (B_FALSE);
 }
@@ -1246,12 +1604,12 @@ dmu_snapshot_realname(objset_t *os, char
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	uint64_t ignored;
 
-	if (ds->ds_phys->ds_snapnames_zapobj == 0)
-		return (ENOENT);
+	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
+		return (SET_ERROR(ENOENT));
 
 	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST,
-	    real, maxlen, conflict));
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
+	    MT_FIRST, real, maxlen, conflict));
 }
 
 int
@@ -1262,21 +1620,23 @@ dmu_snapshot_list_next(objset_t *os, int
 	zap_cursor_t cursor;
 	zap_attribute_t attr;
 
-	if (ds->ds_phys->ds_snapnames_zapobj == 0)
-		return (ENOENT);
+	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
+
+	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
+		return (SET_ERROR(ENOENT));
 
 	zap_cursor_init_serialized(&cursor,
 	    ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, *offp);
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
 
 	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
 		zap_cursor_fini(&cursor);
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
 	}
 
 	if (strlen(attr.za_name) + 1 > namelen) {
 		zap_cursor_fini(&cursor);
-		return (ENAMETOOLONG);
+		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	(void) strcpy(name, attr.za_name);
@@ -1301,21 +1661,21 @@ dmu_dir_list_next(objset_t *os, int name
 
 	/* there is no next dir on a snapshot! */
 	if (os->os_dsl_dataset->ds_object !=
-	    dd->dd_phys->dd_head_dataset_obj)
-		return (ENOENT);
+	    dsl_dir_phys(dd)->dd_head_dataset_obj)
+		return (SET_ERROR(ENOENT));
 
 	zap_cursor_init_serialized(&cursor,
 	    dd->dd_pool->dp_meta_objset,
-	    dd->dd_phys->dd_child_dir_zapobj, *offp);
+	    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
 
 	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
 		zap_cursor_fini(&cursor);
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
 	}
 
 	if (strlen(attr.za_name) + 1 > namelen) {
 		zap_cursor_fini(&cursor);
-		return (ENAMETOOLONG);
+		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	(void) strcpy(name, attr.za_name);
@@ -1328,42 +1688,242 @@ dmu_dir_list_next(objset_t *os, int name
 	return (0);
 }
 
-struct findarg {
-	int (*func)(const char *, void *);
-	void *arg;
-};
+typedef struct dmu_objset_find_ctx {
+	taskq_t		*dc_tq;
+	dsl_pool_t	*dc_dp;
+	uint64_t	dc_ddobj;
+	int		(*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
+	void		*dc_arg;
+	int		dc_flags;
+	kmutex_t	*dc_error_lock;
+	int		*dc_error;
+} dmu_objset_find_ctx_t;
 
-/* ARGSUSED */
-static int
-findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+static void
+dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
 {
-	struct findarg *fa = arg;
-	return (fa->func(dsname, fa->arg));
+	dsl_pool_t *dp = dcp->dc_dp;
+	dmu_objset_find_ctx_t *child_dcp;
+	dsl_dir_t *dd;
+	dsl_dataset_t *ds;
+	zap_cursor_t zc;
+	zap_attribute_t *attr;
+	uint64_t thisobj;
+	int err = 0;
+
+	/* don't process if there already was an error */
+	if (*dcp->dc_error != 0)
+		goto out;
+
+	err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
+	if (err != 0)
+		goto out;
+
+	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
+	if (dd->dd_myname[0] == '$') {
+		dsl_dir_rele(dd, FTAG);
+		goto out;
+	}
+
+	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
+	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+	/*
+	 * Iterate over all children.
+	 */
+	if (dcp->dc_flags & DS_FIND_CHILDREN) {
+		for (zap_cursor_init(&zc, dp->dp_meta_objset,
+		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
+		    zap_cursor_retrieve(&zc, attr) == 0;
+		    (void) zap_cursor_advance(&zc)) {
+			ASSERT3U(attr->za_integer_length, ==,
+			    sizeof (uint64_t));
+			ASSERT3U(attr->za_num_integers, ==, 1);
+
+			child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
+			*child_dcp = *dcp;
+			child_dcp->dc_ddobj = attr->za_first_integer;
+			if (dcp->dc_tq != NULL)
+				(void) taskq_dispatch(dcp->dc_tq,
+				    dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
+			else
+				dmu_objset_find_dp_impl(child_dcp);
+		}
+		zap_cursor_fini(&zc);
+	}
+
+	/*
+	 * Iterate over all snapshots.
+	 */
+	if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
+		dsl_dataset_t *ds;
+		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+		if (err == 0) {
+			uint64_t snapobj;
+
+			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+			dsl_dataset_rele(ds, FTAG);
+
+			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
+			    zap_cursor_retrieve(&zc, attr) == 0;
+			    (void) zap_cursor_advance(&zc)) {
+				ASSERT3U(attr->za_integer_length, ==,
+				    sizeof (uint64_t));
+				ASSERT3U(attr->za_num_integers, ==, 1);
+
+				err = dsl_dataset_hold_obj(dp,
+				    attr->za_first_integer, FTAG, &ds);
+				if (err != 0)
+					break;
+				err = dcp->dc_func(dp, ds, dcp->dc_arg);
+				dsl_dataset_rele(ds, FTAG);
+				if (err != 0)
+					break;
+			}
+			zap_cursor_fini(&zc);
+		}
+	}
+
+	dsl_dir_rele(dd, FTAG);
+	kmem_free(attr, sizeof (zap_attribute_t));
+
+	if (err != 0)
+		goto out;
+
+	/*
+	 * Apply to self.
+	 */
+	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+	if (err != 0)
+		goto out;
+	err = dcp->dc_func(dp, ds, dcp->dc_arg);
+	dsl_dataset_rele(ds, FTAG);
+
+out:
+	if (err != 0) {
+		mutex_enter(dcp->dc_error_lock);
+		/* only keep first error */
+		if (*dcp->dc_error == 0)
+			*dcp->dc_error = err;
+		mutex_exit(dcp->dc_error_lock);
+	}
+
+	kmem_free(dcp, sizeof (*dcp));
+}
+
+static void
+dmu_objset_find_dp_cb(void *arg)
+{
+	dmu_objset_find_ctx_t *dcp = arg;
+	dsl_pool_t *dp = dcp->dc_dp;
+
+	/*
+	 * We need to get a pool_config_lock here, as there are several
+	 * assert(pool_config_held) checks down the stack. Taking the lock
+	 * via dsl_pool_config_enter is risky, as it might be stalled by a
+	 * pending writer. This would deadlock, as the write lock can
+	 * only be granted when our parent thread gives up the lock.
+	 * The _prio interface gives us priority over a pending writer.
+	 */
+	dsl_pool_config_enter_prio(dp, FTAG);
+
+	dmu_objset_find_dp_impl(dcp);
+
+	dsl_pool_config_exit(dp, FTAG);
 }
 
 /*
- * Find all objsets under name, and for each, call 'func(child_name, arg)'.
- * Perhaps change all callers to use dmu_objset_find_spa()?
+ * Find objsets under and including ddobj, and call func(ds) on each.
+ * The enumeration order is completely undefined.
+ * func is called with dsl_pool_config held.
  */
 int
-dmu_objset_find(char *name, int func(const char *, void *), void *arg,
-    int flags)
+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
+    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
 {
-	struct findarg fa;
-	fa.func = func;
-	fa.arg = arg;
-	return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags));
+	int error = 0;
+	taskq_t *tq = NULL;
+	int ntasks;
+	dmu_objset_find_ctx_t *dcp;
+	kmutex_t err_lock;
+
+	mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
+	dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
+	dcp->dc_tq = NULL;
+	dcp->dc_dp = dp;
+	dcp->dc_ddobj = ddobj;
+	dcp->dc_func = func;
+	dcp->dc_arg = arg;
+	dcp->dc_flags = flags;
+	dcp->dc_error_lock = &err_lock;
+	dcp->dc_error = &error;
+
+	if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
+		/*
+		 * If a write lock is held we can't make use of
+		 * parallelism, because dsl_pool_config_held is asserted
+		 * down the stack in the worker threads.
+		 * With a read lock this is solved by taking a read lock
+		 * in each worker thread, which isn't possible with a
+		 * writer lock. So we fall back to the synchronous path
+		 * here.
+		 * In the future it might be possible to teach
+		 * dsl_pool_config_held to return true for the worker
+		 * threads, so that a single lock held from this thread
+		 * would suffice. For now, stay single threaded.
+		 */
+		dmu_objset_find_dp_impl(dcp);
+		mutex_destroy(&err_lock);
+
+		return (error);
+	}
+
+	ntasks = dmu_find_threads;
+	if (ntasks == 0)
+		ntasks = vdev_count_leaves(dp->dp_spa) * 4;
+	tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
+	    INT_MAX, 0);
+	if (tq == NULL) {
+		kmem_free(dcp, sizeof (*dcp));
+		mutex_destroy(&err_lock);
+
+		return (SET_ERROR(ENOMEM));
+	}
+	dcp->dc_tq = tq;
+
+	/* dcp will be freed by task */
+	(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
+
+	/*
+	 * PORTING: this code relies on the property of taskq_wait to wait
+	 * until no more tasks are queued and no more tasks are active. As
+	 * we always queue new tasks from within other tasks, taskq_wait
+	 * reliably waits for the full recursion to finish, even though we
+	 * enqueue new tasks after taskq_wait has been called.
+	 * On platforms other than illumos, taskq_wait may not have this
+	 * property.
+	 */
+	taskq_wait(tq);
+	taskq_destroy(tq);
+	mutex_destroy(&err_lock);
+
+	return (error);
 }
 
 /*
- * Find all objsets under name, call func on each
+ * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ * The dp_config_rwlock must not be held when this is called, and it
+ * will not be held when the callback is called.
+ * Therefore this function should only be used when the pool is not changing
+ * (e.g. in syncing context), or the callback can deal with the possible races.
  */
-int
-dmu_objset_find_spa(spa_t *spa, const char *name,
-    int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags)
+static int
+dmu_objset_find_impl(spa_t *spa, const char *name,
+    int func(const char *, void *), void *arg, int flags)
 {
 	dsl_dir_t *dd;
-	dsl_pool_t *dp;
+	dsl_pool_t *dp = spa_get_dsl(spa);
 	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
@@ -1371,43 +1931,50 @@ dmu_objset_find_spa(spa_t *spa, const ch
 	uint64_t thisobj;
 	int err;
 
-	if (name == NULL)
-		name = spa_name(spa);
-	err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL);
-	if (err)
+	dsl_pool_config_enter(dp, FTAG);
+
+	err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
+	if (err != 0) {
+		dsl_pool_config_exit(dp, FTAG);
 		return (err);
+	}
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
-		dsl_dir_close(dd, FTAG);
+		dsl_dir_rele(dd, FTAG);
+		dsl_pool_config_exit(dp, FTAG);
 		return (0);
 	}
 
-	thisobj = dd->dd_phys->dd_head_dataset_obj;
+	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
-	dp = dd->dd_pool;
 
 	/*
 	 * Iterate over all children.
 	 */
 	if (flags & DS_FIND_CHILDREN) {
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
-		    dd->dd_phys->dd_child_dir_zapobj);
+		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
-			ASSERT(attr->za_integer_length == sizeof (uint64_t));
-			ASSERT(attr->za_num_integers == 1);
+			ASSERT3U(attr->za_integer_length, ==,
+			    sizeof (uint64_t));
+			ASSERT3U(attr->za_num_integers, ==, 1);
 
 			child = kmem_asprintf("%s/%s", name, attr->za_name);
-			err = dmu_objset_find_spa(spa, child, func, arg, flags);
+			dsl_pool_config_exit(dp, FTAG);
+			err = dmu_objset_find_impl(spa, child,
+			    func, arg, flags);
+			dsl_pool_config_enter(dp, FTAG);
 			strfree(child);
-			if (err)
+			if (err != 0)
 				break;
 		}
 		zap_cursor_fini(&zc);
 
-		if (err) {
-			dsl_dir_close(dd, FTAG);
+		if (err != 0) {
+			dsl_dir_rele(dd, FTAG);
+			dsl_pool_config_exit(dp, FTAG);
 			kmem_free(attr, sizeof (zap_attribute_t));
 			return (err);
 		}
@@ -1417,77 +1984,61 @@ dmu_objset_find_spa(spa_t *spa, const ch
 	 * Iterate over all snapshots.
 	 */
 	if (flags & DS_FIND_SNAPSHOTS) {
-		if (!dsl_pool_sync_context(dp))
-			rw_enter(&dp->dp_config_rwlock, RW_READER);
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
-		if (!dsl_pool_sync_context(dp))
-			rw_exit(&dp->dp_config_rwlock);
 
 		if (err == 0) {
-			uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+			uint64_t snapobj;
+
+			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 			dsl_dataset_rele(ds, FTAG);
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
 			    zap_cursor_retrieve(&zc, attr) == 0;
 			    (void) zap_cursor_advance(&zc)) {
-				ASSERT(attr->za_integer_length ==
+				ASSERT3U(attr->za_integer_length, ==,
 				    sizeof (uint64_t));
-				ASSERT(attr->za_num_integers == 1);
+				ASSERT3U(attr->za_num_integers, ==, 1);
 
 				child = kmem_asprintf("%s@%s",
 				    name, attr->za_name);
-				err = func(spa, attr->za_first_integer,
-				    child, arg);
+				dsl_pool_config_exit(dp, FTAG);
+				err = func(child, arg);
+				dsl_pool_config_enter(dp, FTAG);
 				strfree(child);
-				if (err)
+				if (err != 0)
 					break;
 			}
 			zap_cursor_fini(&zc);
 		}
 	}
 
-	dsl_dir_close(dd, FTAG);
+	dsl_dir_rele(dd, FTAG);
 	kmem_free(attr, sizeof (zap_attribute_t));
+	dsl_pool_config_exit(dp, FTAG);
 
-	if (err)
+	if (err != 0)
 		return (err);
 
-	/*
-	 * Apply to self if appropriate.
-	 */
-	err = func(spa, thisobj, name, arg);
-	return (err);
+	/* Apply to self. */
+	return (func(name, arg));
 }
 
-/* ARGSUSED */
+/*
+ * See comment above dmu_objset_find_impl().
+ */
 int
-dmu_objset_prefetch(const char *name, void *arg)
+dmu_objset_find(char *name, int func(const char *, void *), void *arg,
+    int flags)
 {
-	dsl_dataset_t *ds;
-
-	if (dsl_dataset_hold(name, FTAG, &ds))
-		return (0);
-
-	if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) {
-		mutex_enter(&ds->ds_opening_lock);
-		if (ds->ds_objset == NULL) {
-			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
-			zbookmark_t zb;
-
-			SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
-			    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
-
-			(void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds),
-			    &ds->ds_phys->ds_bp, NULL, NULL,
-			    ZIO_PRIORITY_ASYNC_READ,
-			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
-			    &aflags, &zb);
-		}
-		mutex_exit(&ds->ds_opening_lock);
-	}
+	spa_t *spa;
+	int error;
 
-	dsl_dataset_rele(ds, FTAG);
-	return (0);
+	error = spa_open(name, &spa, FTAG);
+	if (error != 0)
+		return (error);
+	error = dmu_objset_find_impl(spa, name, func, arg, flags);
+	spa_close(spa, FTAG);
+	return (error);
 }
 
 void
@@ -1503,3 +2054,19 @@ dmu_objset_get_user(objset_t *os)
 	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
 	return (os->os_user_ptr);
 }
+
+/*
+ * Determine the name of the filesystem, given the name of a snapshot.
+ * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes.
+ */
+int
+dmu_fsname(const char *snapname, char *buf)
+{
+	char *atp = strchr(snapname, '@');
+	if (atp == NULL)
+		return (SET_ERROR(EINVAL));
+	if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
+		return (SET_ERROR(ENAMETOOLONG));
+	(void) strlcpy(buf, snapname, atp - snapname + 1);
+	return (0);
+}
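
dmu_fsname() relies on strlcpy() copying at most size - 1 bytes and always NUL-terminating, so passing atp - snapname + 1 as the size yields exactly the filesystem component. A userland check of that arithmetic (strlcpy is native on NetBSD; the NAME_MAX_LEN constant below is a stand-in for ZFS_MAX_DATASET_NAME_LEN):

#include <stdio.h>
#include <string.h>

#define	NAME_MAX_LEN	256	/* stand-in for ZFS_MAX_DATASET_NAME_LEN */

int
main(void)
{
	const char *snapname = "tank/home/user@monday";
	char buf[NAME_MAX_LEN];
	const char *atp = strchr(snapname, '@');

	if (atp == NULL || atp - snapname >= NAME_MAX_LEN)
		return (1);
	/* size = prefix length + 1 => copies the prefix, then a NUL. */
	(void) strlcpy(buf, snapname, atp - snapname + 1);
	printf("%s\n", buf);	/* tank/home/user */
	return (0);
}
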
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dmu_send.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c	27 Feb 2010 22:30:41 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c	16 Jun 2017 21:23:39 -0000
@@ -19,8 +19,14 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright 2016 RackTop Systems.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/dmu.h>
@@ -39,53 +45,208 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <zfs_fletcher.h>
 #include <sys/avl.h>
 #include <sys/ddt.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/blkptr.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/zfeature.h>
+#include <sys/bqueue.h>
+
+#ifdef __FreeBSD__
+#undef dump_write
+#define dump_write dmu_dump_write
+#endif
+
+#ifdef __NetBSD__
+#ifdef _KERNEL
+#define FOF_OFFSET FOF_UPDATE_OFFSET
+#define td_ucred l_cred
+#define bwillwrite() /* nothing */
+
+static int
+fo_write(struct file *fp, struct uio *uio, cred_t *cred, int flags, kthread_t *thr)
+{
+
+	return (*fp->f_ops->fo_write)(fp, &fp->f_offset, uio, cred, flags);
+}
+
+static int
+fo_read(struct file *fp, struct uio *uio, cred_t *cred, int flags, kthread_t *thr)
+{
+
+	return (*fp->f_ops->fo_read)(fp, &fp->f_offset, uio, cred, flags);
+}
+#endif
+#endif
+
+/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
+int zfs_send_corrupt_data = B_FALSE;
+int zfs_send_queue_length = 16 * 1024 * 1024;
+int zfs_recv_queue_length = 16 * 1024 * 1024;
+/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
+int zfs_send_set_freerecords_bit = B_TRUE;
+
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit);
+#endif
 
 static char *dmu_recv_tag = "dmu_recv_tag";
+const char *recv_clone_name = "%recv";
 
-/*
- * The list of data whose inclusion in a send stream can be pending from
- * one call to backup_cb to another.  Multiple calls to dump_free() and
- * dump_freeobjects() can be aggregated into a single DRR_FREE or
- * DRR_FREEOBJECTS replay record.
- */
-typedef enum {
-	PENDING_NONE,
-	PENDING_FREE,
-	PENDING_FREEOBJECTS
-} pendop_t;
+#define	BP_SPAN(datablkszsec, indblkshift, level) \
+	(((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
+	(level) * (indblkshift - SPA_BLKPTRSHIFT)))
+
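
BP_SPAN() computes how many bytes of the object one block pointer at a given level covers: the data block size, widened by (indblkshift - SPA_BLKPTRSHIFT) address bits per indirect level. A worked example with typical values (SPA_MINBLOCKSHIFT = 9 for 512-byte sectors, SPA_BLKPTRSHIFT = 7 for 128-byte block pointers; the 128K block size and indblkshift of 17 are illustrative defaults):

#include <assert.h>
#include <stdint.h>

#define	SPA_MINBLOCKSHIFT	9	/* 512-byte sectors */
#define	SPA_BLKPTRSHIFT		7	/* 128-byte block pointers */

#define	BP_SPAN(datablkszsec, indblkshift, level) \
	(((uint64_t)(datablkszsec)) << (SPA_MINBLOCKSHIFT + \
	(level) * ((indblkshift) - SPA_BLKPTRSHIFT)))

int
main(void)
{
	/* 128K data blocks = 256 sectors; 128K indirects: indblkshift 17. */
	assert(BP_SPAN(256, 17, 0) == 128ULL * 1024);	/* one data block */
	assert(BP_SPAN(256, 17, 1) ==
	    128ULL * 1024 * 1024);	/* 1024 child pointers per indirect */
	return (0);
}
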
+static void byteswap_record(dmu_replay_record_t *drr);
+
+struct send_thread_arg {
+	bqueue_t	q;
+	dsl_dataset_t	*ds;		/* Dataset to traverse */
+	uint64_t	fromtxg;	/* Traverse from this txg */
+	int		flags;		/* flags to pass to traverse_dataset */
+	int		error_code;
+	boolean_t	cancel;
+	zbookmark_phys_t resume;
+};
 
-struct backuparg {
-	dmu_replay_record_t *drr;
-	vnode_t *vp;
-	offset_t *off;
-	objset_t *os;
-	zio_cksum_t zc;
-	uint64_t toguid;
-	int err;
-	pendop_t pending_op;
+struct send_block_record {
+	boolean_t		eos_marker; /* Marks the end of the stream */
+	blkptr_t		bp;
+	zbookmark_phys_t	zb;
+	uint8_t			indblkshift;
+	uint16_t		datablkszsec;
+	bqueue_node_t		ln;
 };
 
 static int
-dump_bytes(struct backuparg *ba, void *buf, int len)
+dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
 {
-	ssize_t resid; /* have to get resid to get detailed errno */
-	ASSERT3U(len % 8, ==, 0);
+	dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
+	struct uio auio;
+	struct iovec aiov;
+
+	/*
+	 * The code does not rely on this (len being a multiple of 8).  We keep
+	 * this assertion because of the corresponding assertion in
+	 * receive_read().  Keeping this assertion ensures that we do not
+	 * inadvertently break backwards compatibility (causing the assertion
+	 * in receive_read() to trigger on old software).
+	 *
+	 * Removing the assertions could be rolled into a new feature that uses
+	 * data that isn't 8-byte aligned; if the assertions were removed, a
+	 * feature flag would have to be added.
+	 */
+
+	ASSERT0(len % 8);
+
+	aiov.iov_base = buf;
+	aiov.iov_len = len;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_resid = len;
+#ifdef __NetBSD__
+#ifdef _KERNEL
+	auio.uio_vmspace = vmspace_kernel();
+#endif
+#else
+	auio.uio_segflg = UIO_SYSSPACE;
+#endif
+	auio.uio_rw = UIO_WRITE;
+	auio.uio_offset = (off_t)-1;
+#ifdef __FreeBSD__
+	auio.uio_td = dsp->dsa_td;
+#endif
+#ifdef _KERNEL
+	if (dsp->dsa_fp->f_type == DTYPE_VNODE)
+		bwillwrite();
+	dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0,
+	    dsp->dsa_td);
+#else
+	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
+	dsp->dsa_err = EOPNOTSUPP;
+#endif
+	mutex_enter(&ds->ds_sendstream_lock);
+	*dsp->dsa_off += len;
+	mutex_exit(&ds->ds_sendstream_lock);
+
+	return (dsp->dsa_err);
+}
 
-	fletcher_4_incremental_native(buf, len, &ba->zc);
-	ba->err = vn_rdwr(UIO_WRITE, ba->vp,
-	    (caddr_t)buf, len,
-	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
-	*ba->off += len;
-	return (ba->err);
+/*
+ * For all record types except BEGIN, fill in the checksum (overlaid in
+ * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
+ * up to the start of the checksum itself.
+ */
+static int
+dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
+{
+	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+	fletcher_4_incremental_native(dsp->dsa_drr,
+	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    &dsp->dsa_zc);
+	if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
+		dsp->dsa_sent_begin = B_TRUE;
+	} else {
+		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
+		    drr_checksum.drr_checksum));
+		dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
+	}
+	if (dsp->dsa_drr->drr_type == DRR_END) {
+		dsp->dsa_sent_end = B_TRUE;
+	}
+	fletcher_4_incremental_native(&dsp->dsa_drr->
+	    drr_u.drr_checksum.drr_checksum,
+	    sizeof (zio_cksum_t), &dsp->dsa_zc);
+	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
+		return (SET_ERROR(EINTR));
+	if (payload_len != 0) {
+		fletcher_4_incremental_native(payload, payload_len,
+		    &dsp->dsa_zc);
+		if (dump_bytes(dsp, payload, payload_len) != 0)
+			return (SET_ERROR(EINTR));
+	}
+	return (0);
 }
 
+/*
+ * Fill in the drr_free struct, or perform aggregation if the previous record is
+ * also a free record, and the two are adjacent.
+ *
+ * Note that we send free records even for a full send, because we want to be
+ * able to receive a full send as a clone, which requires a list of all the free
+ * and freeobject records that were generated on the source.
+ */
 static int
-dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
+dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
     uint64_t length)
 {
-	struct drr_free *drrf = &(ba->drr->drr_u.drr_free);
+	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
+
+	/*
+	 * When we receive a free record, dbuf_free_range() assumes
+	 * that the receiving system doesn't have any dbufs in the range
+	 * being freed.  This is always true because there is a one-record
+	 * constraint: we only send one WRITE record for any given
+	 * object,offset.  We know that the one-record constraint is
+	 * true because we always send data in increasing order by
+	 * object,offset.
+	 *
+	 * If the increasing-order constraint ever changes, we should find
+	 * another way to assert that the one-record constraint is still
+	 * satisfied.
+	 */
+	ASSERT(object > dsp->dsa_last_data_object ||
+	    (object == dsp->dsa_last_data_object &&
+	    offset > dsp->dsa_last_data_offset));
+
+	if (length != -1ULL && offset + length < offset)
+		length = -1ULL;
 
 	/*
 	 * If there is a pending op, but it's not PENDING_FREE, push it out,
@@ -94,13 +255,14 @@ dump_free(struct backuparg *ba, uint64_t
 	 * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
 	 * aggregated with other DRR_FREEOBJECTS records.
 	 */
-	if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) {
-		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
-			return (EINTR);
-		ba->pending_op = PENDING_NONE;
+	if (dsp->dsa_pending_op != PENDING_NONE &&
+	    dsp->dsa_pending_op != PENDING_FREE) {
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (SET_ERROR(EINTR));
+		dsp->dsa_pending_op = PENDING_NONE;
 	}
 
-	if (ba->pending_op == PENDING_FREE) {
+	if (dsp->dsa_pending_op == PENDING_FREE) {
 		/*
 		 * There should never be a PENDING_FREE if length is -1
 		 * (because dump_dnode is the only place where this
@@ -118,35 +280,43 @@ dump_free(struct backuparg *ba, uint64_t
 			return (0);
 		} else {
 			/* not a continuation.  Push out pending record */
-			if (dump_bytes(ba, ba->drr,
-			    sizeof (dmu_replay_record_t)) != 0)
-				return (EINTR);
-			ba->pending_op = PENDING_NONE;
+			if (dump_record(dsp, NULL, 0) != 0)
+				return (SET_ERROR(EINTR));
+			dsp->dsa_pending_op = PENDING_NONE;
 		}
 	}
 	/* create a FREE record and make it pending */
-	bzero(ba->drr, sizeof (dmu_replay_record_t));
-	ba->drr->drr_type = DRR_FREE;
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_FREE;
 	drrf->drr_object = object;
 	drrf->drr_offset = offset;
 	drrf->drr_length = length;
-	drrf->drr_toguid = ba->toguid;
+	drrf->drr_toguid = dsp->dsa_toguid;
 	if (length == -1ULL) {
-		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
-			return (EINTR);
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (SET_ERROR(EINTR));
 	} else {
-		ba->pending_op = PENDING_FREE;
+		dsp->dsa_pending_op = PENDING_FREE;
 	}
 
 	return (0);
 }
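
The pending-op machinery means a run of adjacent frees becomes a single DRR_FREE record: each new free either extends the pending record in place or forces it out first. A stripped-down model of that aggregation (record emission is just a printf here, and the field names are simplified):

#include <stdio.h>
#include <stdint.h>

static struct {
	int		pending;	/* is there an unemitted record? */
	uint64_t	object, offset, length;
} pf;

static void
flush_pending(void)
{
	if (pf.pending) {
		printf("DRR_FREE obj=%llu off=%llu len=%llu\n",
		    (unsigned long long)pf.object,
		    (unsigned long long)pf.offset,
		    (unsigned long long)pf.length);
		pf.pending = 0;
	}
}

static void
note_free(uint64_t object, uint64_t offset, uint64_t length)
{
	if (pf.pending && pf.object == object &&
	    pf.offset + pf.length == offset) {
		pf.length += length;	/* adjacent: extend in place */
		return;
	}
	flush_pending();	/* not adjacent: push out the old record */
	pf.pending = 1;
	pf.object = object;
	pf.offset = offset;
	pf.length = length;
}

int
main(void)
{
	note_free(5, 0, 4096);
	note_free(5, 4096, 4096);	/* coalesces with the previous */
	note_free(7, 0, 4096);		/* different object: flushes first */
	flush_pending();		/* two DRR_FREE lines in total */
	return (0);
}
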
 
 static int
-dump_data(struct backuparg *ba, dmu_object_type_t type,
+dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
     uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
 {
-	struct drr_write *drrw = &(ba->drr->drr_u.drr_write);
+	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
 
+	/*
+	 * We send data in increasing object, offset order.
+	 * See comment in dump_free() for details.
+	 */
+	ASSERT(object > dsp->dsa_last_data_object ||
+	    (object == dsp->dsa_last_data_object &&
+	    offset > dsp->dsa_last_data_offset));
+	dsp->dsa_last_data_object = object;
+	dsp->dsa_last_data_offset = offset + blksz - 1;
 
 	/*
 	 * If there is any kind of pending aggregation (currently either
@@ -154,38 +324,104 @@ dump_data(struct backuparg *ba, dmu_obje
 	 * the stream, since aggregation can't be done across operations
 	 * of different types.
 	 */
-	if (ba->pending_op != PENDING_NONE) {
-		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
+	if (dsp->dsa_pending_op != PENDING_NONE) {
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (SET_ERROR(EINTR));
+		dsp->dsa_pending_op = PENDING_NONE;
+	}
+	/* write a WRITE record */
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_WRITE;
+	drrw->drr_object = object;
+	drrw->drr_type = type;
+	drrw->drr_offset = offset;
+	drrw->drr_length = blksz;
+	drrw->drr_toguid = dsp->dsa_toguid;
+	if (bp == NULL || BP_IS_EMBEDDED(bp)) {
+		/*
+		 * There's no pre-computed checksum for partial-block
+		 * writes or embedded BP's, so (like
+		 * fletcher4-checksummed blocks) userland will have to
+		 * compute a dedup-capable checksum itself.
+		 */
+		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
+	} else {
+		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+		if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
+		    ZCHECKSUM_FLAG_DEDUP)
+			drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
+		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+		drrw->drr_key.ddk_cksum = bp->blk_cksum;
+	}
+
+	if (dump_record(dsp, data, blksz) != 0)
+		return (SET_ERROR(EINTR));
+	return (0);
+}
+
+static int
+dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
+    int blksz, const blkptr_t *bp)
+{
+	char buf[BPE_PAYLOAD_SIZE];
+	struct drr_write_embedded *drrw =
+	    &(dsp->dsa_drr->drr_u.drr_write_embedded);
+
+	if (dsp->dsa_pending_op != PENDING_NONE) {
+		if (dump_record(dsp, NULL, 0) != 0)
 			return (EINTR);
-		ba->pending_op = PENDING_NONE;
+		dsp->dsa_pending_op = PENDING_NONE;
 	}
-	/* write a DATA record */
-	bzero(ba->drr, sizeof (dmu_replay_record_t));
-	ba->drr->drr_type = DRR_WRITE;
+
+	ASSERT(BP_IS_EMBEDDED(bp));
+
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
 	drrw->drr_object = object;
-	drrw->drr_type = type;
 	drrw->drr_offset = offset;
 	drrw->drr_length = blksz;
-	drrw->drr_toguid = ba->toguid;
-	drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
-	if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
-		drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
-	DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
-	DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
-	DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
-	drrw->drr_key.ddk_cksum = bp->blk_cksum;
+	drrw->drr_toguid = dsp->dsa_toguid;
+	drrw->drr_compression = BP_GET_COMPRESS(bp);
+	drrw->drr_etype = BPE_GET_ETYPE(bp);
+	drrw->drr_lsize = BPE_GET_LSIZE(bp);
+	drrw->drr_psize = BPE_GET_PSIZE(bp);
 
-	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
-		return (EINTR);
-	if (dump_bytes(ba, data, blksz) != 0)
+	decode_embedded_bp_compressed(bp, buf);
+
+	if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
 		return (EINTR);
 	return (0);
 }
 
 static int
-dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
+dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
+{
+	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
+
+	if (dsp->dsa_pending_op != PENDING_NONE) {
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (SET_ERROR(EINTR));
+		dsp->dsa_pending_op = PENDING_NONE;
+	}
+
+	/* write a SPILL record */
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_SPILL;
+	drrs->drr_object = object;
+	drrs->drr_length = blksz;
+	drrs->drr_toguid = dsp->dsa_toguid;
+
+	if (dump_record(dsp, data, blksz) != 0)
+		return (SET_ERROR(EINTR));
+	return (0);
+}
+
+static int
+dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
 {
-	struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects);
+	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
 
 	/*
 	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
@@ -194,13 +430,13 @@ dump_freeobjects(struct backuparg *ba, u
 	 * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
 	 * can only be aggregated with other DRR_FREEOBJECTS records.
 	 */
-	if (ba->pending_op != PENDING_NONE &&
-	    ba->pending_op != PENDING_FREEOBJECTS) {
-		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
-			return (EINTR);
-		ba->pending_op = PENDING_NONE;
+	if (dsp->dsa_pending_op != PENDING_NONE &&
+	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (SET_ERROR(EINTR));
+		dsp->dsa_pending_op = PENDING_NONE;
 	}
-	if (ba->pending_op == PENDING_FREEOBJECTS) {
+	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
 		/*
 		 * See whether this free object array can be aggregated
 		 * with pending one
@@ -210,42 +446,54 @@ dump_freeobjects(struct backuparg *ba, u
 			return (0);
 		} else {
 			/* can't be aggregated.  Push out pending record */
-			if (dump_bytes(ba, ba->drr,
-			    sizeof (dmu_replay_record_t)) != 0)
-				return (EINTR);
-			ba->pending_op = PENDING_NONE;
+			if (dump_record(dsp, NULL, 0) != 0)
+				return (SET_ERROR(EINTR));
+			dsp->dsa_pending_op = PENDING_NONE;
 		}
 	}
 
 	/* write a FREEOBJECTS record */
-	bzero(ba->drr, sizeof (dmu_replay_record_t));
-	ba->drr->drr_type = DRR_FREEOBJECTS;
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
 	drrfo->drr_firstobj = firstobj;
 	drrfo->drr_numobjs = numobjs;
-	drrfo->drr_toguid = ba->toguid;
+	drrfo->drr_toguid = dsp->dsa_toguid;
 
-	ba->pending_op = PENDING_FREEOBJECTS;
+	dsp->dsa_pending_op = PENDING_FREEOBJECTS;
 
 	return (0);
 }
 
 static int
-dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
+dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
 {
-	struct drr_object *drro = &(ba->drr->drr_u.drr_object);
+	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
+
+	if (object < dsp->dsa_resume_object) {
+		/*
+		 * Note: when resuming, we will visit all the dnodes in
+		 * the block of dnodes that we are resuming from.  In
+		 * this case it's unnecessary to send the dnodes prior to
+		 * the one we are resuming from.  We should be at most one
+		 * block's worth of dnodes behind the resume point.
+		 */
+		ASSERT3U(dsp->dsa_resume_object - object, <,
+		    1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
+		return (0);
+	}
 
 	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
-		return (dump_freeobjects(ba, object, 1));
+		return (dump_freeobjects(dsp, object, 1));
 
-	if (ba->pending_op != PENDING_NONE) {
-		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
-			return (EINTR);
-		ba->pending_op = PENDING_NONE;
+	if (dsp->dsa_pending_op != PENDING_NONE) {
+		if (dump_record(dsp, NULL, 0) != 0)
+			return (SET_ERROR(EINTR));
+		dsp->dsa_pending_op = PENDING_NONE;
 	}
 
 	/* write an OBJECT record */
-	bzero(ba->drr, sizeof (dmu_replay_record_t));
-	ba->drr->drr_type = DRR_OBJECT;
+	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+	dsp->dsa_drr->drr_type = DRR_OBJECT;
 	drro->drr_object = object;
 	drro->drr_type = dnp->dn_type;
 	drro->drr_bonustype = dnp->dn_bonustype;
@@ -253,488 +501,1317 @@ dump_dnode(struct backuparg *ba, uint64_
 	drro->drr_bonuslen = dnp->dn_bonuslen;
 	drro->drr_checksumtype = dnp->dn_checksum;
 	drro->drr_compress = dnp->dn_compress;
-	drro->drr_toguid = ba->toguid;
+	drro->drr_toguid = dsp->dsa_toguid;
 
-	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
-		return (EINTR);
+	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
+
+	if (dump_record(dsp, DN_BONUS(dnp),
+	    P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) {
+		return (SET_ERROR(EINTR));
+	}
+
+	/* Free anything past the end of the file. */
+	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
+	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
+		return (SET_ERROR(EINTR));
+	if (dsp->dsa_err != 0)
+		return (SET_ERROR(EINTR));
+	return (0);
+}
 
-	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
-		return (EINTR);
+static boolean_t
+backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
+{
+	if (!BP_IS_EMBEDDED(bp))
+		return (B_FALSE);
 
-	/* free anything past the end of the file */
-	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
-	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
-		return (EINTR);
-	if (ba->err)
-		return (EINTR);
-	return (0);
+	/*
+	 * Compression function must be legacy, or explicitly enabled.
+	 */
+	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
+	    !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)))
+		return (B_FALSE);
+
+	/*
+	 * Embed type must be explicitly enabled.
+	 */
+	switch (BPE_GET_ETYPE(bp)) {
+	case BP_EMBEDDED_TYPE_DATA:
+		if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
+			return (B_TRUE);
+		break;
+	default:
+		return (B_FALSE);
+	}
+	return (B_FALSE);
+}
+
+/*
+ * This is the callback function to traverse_dataset that acts as the worker
+ * thread for dmu_send_impl.
+ */
+/*ARGSUSED*/
+static int
+send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
+{
+	struct send_thread_arg *sta = arg;
+	struct send_block_record *record;
+	uint64_t record_size;
+	int err = 0;
+
+	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+	    zb->zb_object >= sta->resume.zb_object);
+
+	if (sta->cancel)
+		return (SET_ERROR(EINTR));
+
+	if (bp == NULL) {
+		ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
+		return (0);
+	} else if (zb->zb_level < 0) {
+		return (0);
+	}
+
+	record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
+	record->eos_marker = B_FALSE;
+	record->bp = *bp;
+	record->zb = *zb;
+	record->indblkshift = dnp->dn_indblkshift;
+	record->datablkszsec = dnp->dn_datablkszsec;
+	record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	bqueue_enqueue(&sta->q, record, record_size);
+
+	return (err);
 }
 
-#define	BP_SPAN(dnp, level) \
-	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
-	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+/*
+ * This function kicks off the traverse_dataset.  It also handles setting the
+ * error code of the thread in case something goes wrong, and pushes the End of
+ * Stream record when the traverse_dataset call has finished.  If there is no
+ * dataset to traverse, the thread immediately pushes the End of Stream marker.
+ */
+static void
+send_traverse_thread(void *arg)
+{
+	struct send_thread_arg *st_arg = arg;
+	int err;
+	struct send_block_record *data;
+
+	if (st_arg->ds != NULL) {
+		err = traverse_dataset_resume(st_arg->ds,
+		    st_arg->fromtxg, &st_arg->resume,
+		    st_arg->flags, send_cb, st_arg);
+
+		if (err != EINTR)
+			st_arg->error_code = err;
+	}
+	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
+	data->eos_marker = B_TRUE;
+	bqueue_enqueue(&st_arg->q, data, 1);
+	thread_exit();
+}
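
send_traverse_thread is the producer half of a bounded-queue pipeline: it enqueues block records as traversal yields them and always terminates the stream with an eos_marker record, so the consumer needs no separate completion signal. A minimal model of that handoff using a one-slot mailbox (bqueue itself is a weighted, multi-entry queue; this is deliberately simpler, and all names are invented):

#include <pthread.h>
#include <stdio.h>

typedef struct { int eos_marker; long blkid; } rec_t;

static pthread_mutex_t mbx_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t mbx_cv = PTHREAD_COND_INITIALIZER;
static rec_t mbx;
static int mbx_full;

static void
enqueue(rec_t r)
{
	pthread_mutex_lock(&mbx_lock);
	while (mbx_full)
		pthread_cond_wait(&mbx_cv, &mbx_lock);
	mbx = r;
	mbx_full = 1;
	pthread_cond_broadcast(&mbx_cv);
	pthread_mutex_unlock(&mbx_lock);
}

static rec_t
dequeue(void)
{
	rec_t r;

	pthread_mutex_lock(&mbx_lock);
	while (!mbx_full)
		pthread_cond_wait(&mbx_cv, &mbx_lock);
	r = mbx;
	mbx_full = 0;
	pthread_cond_broadcast(&mbx_cv);
	pthread_mutex_unlock(&mbx_lock);
	return (r);
}

static void *
producer(void *arg)
{
	for (long i = 0; i < 3; i++)
		enqueue((rec_t){ .eos_marker = 0, .blkid = i });
	enqueue((rec_t){ .eos_marker = 1 });	/* always send EOS last */
	return (NULL);
}

int
main(void)
{
	pthread_t t;
	rec_t r;

	pthread_create(&t, NULL, producer, NULL);
	while (!(r = dequeue()).eos_marker)
		printf("block %ld\n", r.blkid);
	pthread_join(&t, NULL);
	return (0);
}
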
 
-/* ARGSUSED */
+/*
+ * This function actually handles figuring out what kind of record needs to be
+ * dumped, reading the data (which has hopefully been prefetched), and calling
+ * the appropriate helper function.
+ */
 static int
-backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
 {
-	struct backuparg *ba = arg;
+	dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
+	const blkptr_t *bp = &data->bp;
+	const zbookmark_phys_t *zb = &data->zb;
+	uint8_t indblkshift = data->indblkshift;
+	uint16_t dblkszsec = data->datablkszsec;
+	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
 	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
 	int err = 0;
 
-	if (issig(JUSTLOOKING) && issig(FORREAL))
-		return (EINTR);
+	ASSERT3U(zb->zb_level, >=, 0);
+
+	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+	    zb->zb_object >= dsa->dsa_resume_object);
 
 	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
 	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
 		return (0);
-	} else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
-		uint64_t span = BP_SPAN(dnp, zb->zb_level);
+	} else if (BP_IS_HOLE(bp) &&
+	    zb->zb_object == DMU_META_DNODE_OBJECT) {
+		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
 		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
-		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
-	} else if (bp == NULL) {
-		uint64_t span = BP_SPAN(dnp, zb->zb_level);
-		err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span);
+		err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
+	} else if (BP_IS_HOLE(bp)) {
+		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
+		uint64_t offset = zb->zb_blkid * span;
+		err = dump_free(dsa, zb->zb_object, offset, span);
 	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
 		return (0);
 	} else if (type == DMU_OT_DNODE) {
-		dnode_phys_t *blk;
-		int i;
 		int blksz = BP_GET_LSIZE(bp);
-		uint32_t aflags = ARC_WAIT;
+		arc_flags_t aflags = ARC_FLAG_WAIT;
 		arc_buf_t *abuf;
 
-		if (arc_read_nolock(NULL, spa, bp,
-		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
-		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
-			return (EIO);
-
-		blk = abuf->b_data;
-		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
-			uint64_t dnobj = (zb->zb_blkid <<
-			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
-			err = dump_dnode(ba, dnobj, blk+i);
-			if (err)
+		ASSERT0(zb->zb_level);
+
+		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+		    &aflags, zb) != 0)
+			return (SET_ERROR(EIO));
+
+		dnode_phys_t *blk = abuf->b_data;
+		uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT);
+		for (int i = 0; i < blksz >> DNODE_SHIFT; i++) {
+			err = dump_dnode(dsa, dnobj + i, blk + i);
+			if (err != 0)
 				break;
 		}
-		(void) arc_buf_remove_ref(abuf, &abuf);
-	} else { /* it's a level-0 block of a regular object */
-		uint32_t aflags = ARC_WAIT;
+		arc_buf_destroy(abuf, &abuf);
+	} else if (type == DMU_OT_SA) {
+		arc_flags_t aflags = ARC_FLAG_WAIT;
 		arc_buf_t *abuf;
 		int blksz = BP_GET_LSIZE(bp);
 
-		if (arc_read_nolock(NULL, spa, bp,
-		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
-		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
-			return (EIO);
-
-		err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz,
-		    blksz, bp, abuf->b_data);
-		(void) arc_buf_remove_ref(abuf, &abuf);
+		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+		    &aflags, zb) != 0)
+			return (SET_ERROR(EIO));
+
+		err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
+		arc_buf_destroy(abuf, &abuf);
+	} else if (backup_do_embed(dsa, bp)) {
+		/* it's an embedded level-0 block of a regular object */
+		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
+		ASSERT0(zb->zb_level);
+		err = dump_write_embedded(dsa, zb->zb_object,
+		    zb->zb_blkid * blksz, blksz, bp);
+	} else {
+		/* it's a level-0 block of a regular object */
+		arc_flags_t aflags = ARC_FLAG_WAIT;
+		arc_buf_t *abuf;
+		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
+		uint64_t offset;
+
+		ASSERT0(zb->zb_level);
+		ASSERT(zb->zb_object > dsa->dsa_resume_object ||
+		    (zb->zb_object == dsa->dsa_resume_object &&
+		    zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
+
+		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
+		    &aflags, zb) != 0) {
+			if (zfs_send_corrupt_data) {
+				/* Send a block filled with 0x"zfs badd bloc" */
+				abuf = arc_alloc_buf(spa, blksz, &abuf,
+				    ARC_BUFC_DATA);
+				uint64_t *ptr;
+				for (ptr = abuf->b_data;
+				    (char *)ptr < (char *)abuf->b_data + blksz;
+				    ptr++)
+					*ptr = 0x2f5baddb10cULL;
+			} else {
+				return (SET_ERROR(EIO));
+			}
+		}
+
+		offset = zb->zb_blkid * blksz;
+
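+		/*
+		 * If the stream cannot carry large blocks, split this block
+		 * into SPA_OLD_MAXBLOCKSIZE (128K) chunks.  The pieces no
+		 * longer correspond to an on-disk block, so dump_write() is
+		 * given a NULL bp for each of them.
+		 */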
+		if (!(dsa->dsa_featureflags &
+		    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+		    blksz > SPA_OLD_MAXBLOCKSIZE) {
+			char *buf = abuf->b_data;
+			while (blksz > 0 && err == 0) {
+				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+				err = dump_write(dsa, type, zb->zb_object,
+				    offset, n, NULL, buf);
+				offset += n;
+				buf += n;
+				blksz -= n;
+			}
+		} else {
+			err = dump_write(dsa, type, zb->zb_object,
+			    offset, blksz, bp, abuf->b_data);
+		}
+		arc_buf_destroy(abuf, &abuf);
 	}
 
 	ASSERT(err == 0 || err == EINTR);
 	return (err);
 }
 
-int
-dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
+/*
+ * Pop the new data off the queue, and free the old data.
+ */
+static struct send_block_record *
+get_next_record(bqueue_t *bq, struct send_block_record *data)
+{
+	struct send_block_record *tmp = bqueue_dequeue(bq);
+	kmem_free(data, sizeof (*data));
+	return (tmp);
+}
+
+/*
+ * Actually do the bulk of the work in a zfs send.
+ *
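+ * A DRR_BEGIN record is emitted first.  A separate thread then traverses
+ * the dataset, feeding block records through a bounded queue; this thread
+ * drains the queue, converting each record into stream records via
+ * do_dump(), and finishes by emitting a DRR_END record.
+ *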
+ * Note: Releases dp using the specified tag.
+ */
+static int
+dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
+    zfs_bookmark_phys_t *ancestor_zb,
+    boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd,
+    uint64_t resumeobj, uint64_t resumeoff,
+#ifdef illumos
     vnode_t *vp, offset_t *off)
+#else
+    struct file *fp, offset_t *off)
+#endif
 {
-	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
-	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
+	objset_t *os;
 	dmu_replay_record_t *drr;
-	struct backuparg ba;
+	dmu_sendarg_t *dsp;
 	int err;
 	uint64_t fromtxg = 0;
+	uint64_t featureflags = 0;
+	struct send_thread_arg to_arg = { 0 };
 
-	/* tosnap must be a snapshot */
-	if (ds->ds_phys->ds_next_snap_obj == 0)
-		return (EINVAL);
-
-	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
-	if (fromds && (ds->ds_dir != fromds->ds_dir ||
-	    fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
-		return (EXDEV);
-
-	if (fromorigin) {
-		dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
-		if (fromsnap)
-			return (EINVAL);
-
-		if (dsl_dir_is_clone(ds->ds_dir)) {
-			rw_enter(&dp->dp_config_rwlock, RW_READER);
-			err = dsl_dataset_hold_obj(dp,
-			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
-			rw_exit(&dp->dp_config_rwlock);
-			if (err)
-				return (err);
-		} else {
-			fromorigin = B_FALSE;
-		}
+	err = dmu_objset_from_ds(to_ds, &os);
+	if (err != 0) {
+		dsl_pool_rele(dp, tag);
+		return (err);
 	}
 
-
 	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
 	drr->drr_type = DRR_BEGIN;
 	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
 	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
 	    DMU_SUBSTREAM);
+
+#ifdef _KERNEL
+	if (dmu_objset_type(os) == DMU_OST_ZFS) {
+		uint64_t version;
+		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
+			kmem_free(drr, sizeof (dmu_replay_record_t));
+			dsl_pool_rele(dp, tag);
+			return (SET_ERROR(EINVAL));
+		}
+		if (version >= ZPL_VERSION_SA) {
+			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
+		}
+	}
+#endif
+
+	if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
+		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
+	if (embedok &&
+	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
+		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
+		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+			featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
+	}
+
+	if (resumeobj != 0 || resumeoff != 0) {
+		featureflags |= DMU_BACKUP_FEATURE_RESUMING;
+	}
+
+	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
+	    featureflags);
+
 	drr->drr_u.drr_begin.drr_creation_time =
-	    ds->ds_phys->ds_creation_time;
-	drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
-	if (fromorigin)
+	    dsl_dataset_phys(to_ds)->ds_creation_time;
+	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
+	if (is_clone)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
-	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
-	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
+	if (zfs_send_set_freerecords_bit)
+		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
 
-	if (fromds)
-		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
-	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
-
-	if (fromds)
-		fromtxg = fromds->ds_phys->ds_creation_txg;
-	if (fromorigin)
-		dsl_dataset_rele(fromds, FTAG);
+	if (ancestor_zb != NULL) {
+		drr->drr_u.drr_begin.drr_fromguid =
+		    ancestor_zb->zbm_guid;
+		fromtxg = ancestor_zb->zbm_creation_txg;
+	}
+	dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
+	if (!to_ds->ds_is_snapshot) {
+		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
+		    sizeof (drr->drr_u.drr_begin.drr_toname));
+	}
+
+	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
+
+	dsp->dsa_drr = drr;
+	dsp->dsa_outfd = outfd;
+	dsp->dsa_proc = curproc;
+	dsp->dsa_td = curthread;
+	dsp->dsa_fp = fp;
+	dsp->dsa_os = os;
+	dsp->dsa_off = off;
+	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+	dsp->dsa_pending_op = PENDING_NONE;
+	dsp->dsa_featureflags = featureflags;
+	dsp->dsa_resume_object = resumeobj;
+	dsp->dsa_resume_offset = resumeoff;
+
+	mutex_enter(&to_ds->ds_sendstream_lock);
+	list_insert_head(&to_ds->ds_sendstreams, dsp);
+	mutex_exit(&to_ds->ds_sendstream_lock);
+
+	dsl_dataset_long_hold(to_ds, FTAG);
+	dsl_pool_rele(dp, tag);
+
+	void *payload = NULL;
+	size_t payload_len = 0;
+	if (resumeobj != 0 || resumeoff != 0) {
+		dmu_object_info_t to_doi;
+		err = dmu_object_info(os, resumeobj, &to_doi);
+		if (err != 0)
+			goto out;
+		SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0,
+		    resumeoff / to_doi.doi_data_block_size);
 
-	ba.drr = drr;
-	ba.vp = vp;
-	ba.os = tosnap;
-	ba.off = off;
-	ba.toguid = ds->ds_phys->ds_guid;
-	ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
-	ba.pending_op = PENDING_NONE;
-
-	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
-		kmem_free(drr, sizeof (dmu_replay_record_t));
-		return (ba.err);
+		nvlist_t *nvl = fnvlist_alloc();
+		fnvlist_add_uint64(nvl, "resume_object", resumeobj);
+		fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
+		payload = fnvlist_pack(nvl, &payload_len);
+		drr->drr_payloadlen = payload_len;
+		fnvlist_free(nvl);
+	}
+
+	err = dump_record(dsp, payload, payload_len);
+	fnvlist_pack_free(payload, payload_len);
+	if (err != 0) {
+		err = dsp->dsa_err;
+		goto out;
+	}
+
+	err = bqueue_init(&to_arg.q, zfs_send_queue_length,
+	    offsetof(struct send_block_record, ln));
+	to_arg.error_code = 0;
+	to_arg.cancel = B_FALSE;
+	to_arg.ds = to_ds;
+	to_arg.fromtxg = fromtxg;
+	to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
+	(void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, &p0,
+	    TS_RUN, minclsyspri);
+
+	struct send_block_record *to_data;
+	to_data = bqueue_dequeue(&to_arg.q);
+
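+	/*
+	 * Consume block records until the traversal thread posts its
+	 * end-of-stream marker, bailing out early on error or signal.
+	 */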
+	while (!to_data->eos_marker && err == 0) {
+		err = do_dump(dsp, to_data);
+		to_data = get_next_record(&to_arg.q, to_data);
+		if (issig(JUSTLOOKING) && issig(FORREAL))
+			err = EINTR;
 	}
 
-	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
-	    backup_cb, &ba);
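+	/*
+	 * On error, tell the traversal thread to stop and drain its queue,
+	 * so that it can observe the cancel flag and exit.
+	 */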
+	if (err != 0) {
+		to_arg.cancel = B_TRUE;
+		while (!to_data->eos_marker) {
+			to_data = get_next_record(&to_arg.q, to_data);
+		}
+	}
+	kmem_free(to_data, sizeof (*to_data));
 
-	if (ba.pending_op != PENDING_NONE)
-		if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0)
-			err = EINTR;
+	bqueue_destroy(&to_arg.q);
 
-	if (err) {
-		if (err == EINTR && ba.err)
-			err = ba.err;
-		kmem_free(drr, sizeof (dmu_replay_record_t));
-		return (err);
+	if (err == 0 && to_arg.error_code != 0)
+		err = to_arg.error_code;
+
+	if (err != 0)
+		goto out;
+
+	if (dsp->dsa_pending_op != PENDING_NONE)
+		if (dump_record(dsp, NULL, 0) != 0)
+			err = SET_ERROR(EINTR);
+
+	if (err != 0) {
+		if (err == EINTR && dsp->dsa_err != 0)
+			err = dsp->dsa_err;
+		goto out;
 	}
 
 	bzero(drr, sizeof (dmu_replay_record_t));
 	drr->drr_type = DRR_END;
-	drr->drr_u.drr_end.drr_checksum = ba.zc;
-	drr->drr_u.drr_end.drr_toguid = ba.toguid;
+	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
+	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
 
-	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
-		kmem_free(drr, sizeof (dmu_replay_record_t));
-		return (ba.err);
-	}
+	if (dump_record(dsp, NULL, 0) != 0)
+		err = dsp->dsa_err;
+
+out:
+	mutex_enter(&to_ds->ds_sendstream_lock);
+	list_remove(&to_ds->ds_sendstreams, dsp);
+	mutex_exit(&to_ds->ds_sendstream_lock);
+
+	VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));
 
 	kmem_free(drr, sizeof (dmu_replay_record_t));
+	kmem_free(dsp, sizeof (dmu_sendarg_t));
 
-	return (0);
-}
+	dsl_dataset_long_rele(to_ds, FTAG);
 
-struct recvbeginsyncarg {
-	const char *tofs;
-	const char *tosnap;
-	dsl_dataset_t *origin;
-	uint64_t fromguid;
-	dmu_objset_type_t type;
-	void *tag;
-	boolean_t force;
-	uint64_t dsflags;
-	char clonelastname[MAXNAMELEN];
-	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
-};
+	return (err);
+}
 
-/* ARGSUSED */
-static int
-recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
+int
+dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
+#ifdef illumos
+    int outfd, vnode_t *vp, offset_t *off)
+#else
+    int outfd, struct file *fp, offset_t *off)
+#endif
 {
-	dsl_dir_t *dd = arg1;
-	struct recvbeginsyncarg *rbsa = arg2;
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
-	uint64_t val;
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
+	dsl_dataset_t *fromds = NULL;
 	int err;
 
-	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
-	    strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);
-
-	if (err != ENOENT)
-		return (err ? err : EEXIST);
+	err = dsl_pool_hold(pool, FTAG, &dp);
+	if (err != 0)
+		return (err);
 
-	if (rbsa->origin) {
-		/* make sure it's a snap in the same pool */
-		if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
-			return (EXDEV);
-		if (!dsl_dataset_is_snapshot(rbsa->origin))
-			return (EINVAL);
-		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
-			return (ENODEV);
+	err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
+	if (err != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (err);
 	}
 
-	return (0);
-}
-
-static void
-recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-	dsl_dir_t *dd = arg1;
-	struct recvbeginsyncarg *rbsa = arg2;
-	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
-	uint64_t dsobj;
+	if (fromsnap != 0) {
+		zfs_bookmark_phys_t zb;
+		boolean_t is_clone;
 
-	/* Create and open new dataset. */
-	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
-	    rbsa->origin, flags, cr, tx);
-	VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
-	    B_TRUE, dmu_recv_tag, &rbsa->ds));
-
-	if (rbsa->origin == NULL) {
-		(void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
-		    rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
+		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
+		if (err != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			dsl_pool_rele(dp, FTAG);
+			return (err);
+		}
+		if (!dsl_dataset_is_before(ds, fromds, 0))
+			err = SET_ERROR(EXDEV);
+		zb.zbm_creation_time =
+		    dsl_dataset_phys(fromds)->ds_creation_time;
+		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
+		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
+		is_clone = (fromds->ds_dir != ds->ds_dir);
+		dsl_dataset_rele(fromds, FTAG);
+		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+		    embedok, large_block_ok, outfd, 0, 0, fp, off);
+	} else {
+		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+		    embedok, large_block_ok, outfd, 0, 0, fp, off);
 	}
-
-	spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
-	    dd->dd_pool->dp_spa, tx, cr, "dataset = %lld", dsobj);
+	dsl_dataset_rele(ds, FTAG);
+	return (err);
 }
 
-/* ARGSUSED */
-static int
-recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
+int
+dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+    boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
+#ifdef illumos
+    vnode_t *vp, offset_t *off)
+#else
+    struct file *fp, offset_t *off)
+#endif
 {
-	dsl_dataset_t *ds = arg1;
-	struct recvbeginsyncarg *rbsa = arg2;
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
 	int err;
-	uint64_t val;
+	boolean_t owned = B_FALSE;
+
+	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
+		return (SET_ERROR(EINVAL));
 
-	/* must not have any changes since most recent snapshot */
-	if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
-		return (ETXTBSY);
-
-	if (rbsa->fromguid) {
-		/* if incremental, most recent snapshot must match fromguid */
-		if (ds->ds_prev == NULL)
-			return (ENODEV);
+	err = dsl_pool_hold(tosnap, FTAG, &dp);
+	if (err != 0)
+		return (err);
 
+	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
 		/*
-		 * most recent snapshot must match fromguid, or there are no
-		 * changes since the fromguid one
+		 * We are sending a filesystem or volume.  Ensure
+		 * that it doesn't change by owning the dataset.
 		 */
-		if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
-			uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
-			uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
-			while (obj != 0) {
-				dsl_dataset_t *snap;
-				err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
-				    obj, FTAG, &snap);
-				if (err)
-					return (ENODEV);
-				if (snap->ds_phys->ds_creation_txg < birth) {
-					dsl_dataset_rele(snap, FTAG);
-					return (ENODEV);
-				}
-				if (snap->ds_phys->ds_guid == rbsa->fromguid) {
-					dsl_dataset_rele(snap, FTAG);
-					break; /* it's ok */
-				}
-				obj = snap->ds_phys->ds_prev_snap_obj;
-				dsl_dataset_rele(snap, FTAG);
-			}
-			if (obj == 0)
-				return (ENODEV);
-		}
+		err = dsl_dataset_own(dp, tosnap, FTAG, &ds);
+		owned = B_TRUE;
 	} else {
-		/* if full, most recent snapshot must be $ORIGIN */
-		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
-			return (ENODEV);
+		err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
 	}
-
-	/* temporary clone name must not exist */
-	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_dir->dd_phys->dd_child_dir_zapobj,
-	    rbsa->clonelastname, 8, 1, &val);
-	if (err == 0)
-		return (EEXIST);
-	if (err != ENOENT)
+	if (err != 0) {
+		dsl_pool_rele(dp, FTAG);
 		return (err);
+	}
 
-	/* new snapshot name must not exist */
-	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
-	if (err == 0)
-		return (EEXIST);
-	if (err != ENOENT)
-		return (err);
-	return (0);
+	if (fromsnap != NULL) {
+		zfs_bookmark_phys_t zb;
+		boolean_t is_clone = B_FALSE;
+		int fsnamelen = strchr(tosnap, '@') - tosnap;
+
+		/*
+		 * If the fromsnap is in a different filesystem, then
+		 * mark the send stream as a clone.
+		 */
+		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
+		    (fromsnap[fsnamelen] != '@' &&
+		    fromsnap[fsnamelen] != '#')) {
+			is_clone = B_TRUE;
+		}
+
+		if (strchr(fromsnap, '@')) {
+			dsl_dataset_t *fromds;
+			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
+			if (err == 0) {
+				if (!dsl_dataset_is_before(ds, fromds, 0))
+					err = SET_ERROR(EXDEV);
+				zb.zbm_creation_time =
+				    dsl_dataset_phys(fromds)->ds_creation_time;
+				zb.zbm_creation_txg =
+				    dsl_dataset_phys(fromds)->ds_creation_txg;
+				zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
+				is_clone = (ds->ds_dir != fromds->ds_dir);
+				dsl_dataset_rele(fromds, FTAG);
+			}
+		} else {
+			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
+		}
+		if (err != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			dsl_pool_rele(dp, FTAG);
+			return (err);
+		}
+		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+		    embedok, large_block_ok,
+		    outfd, resumeobj, resumeoff, fp, off);
+	} else {
+		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+		    embedok, large_block_ok,
+		    outfd, resumeobj, resumeoff, fp, off);
+	}
+	if (owned)
+		dsl_dataset_disown(ds, FTAG);
+	else
+		dsl_dataset_rele(ds, FTAG);
+	return (err);
 }
 
-/* ARGSUSED */
-static void
-recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+static int
+dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size,
+    uint64_t *sizep)
 {
-	dsl_dataset_t *ohds = arg1;
-	struct recvbeginsyncarg *rbsa = arg2;
-	dsl_pool_t *dp = ohds->ds_dir->dd_pool;
-	dsl_dataset_t *cds;
-	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
-	uint64_t dsobj;
-
-	/* create and open the temporary clone */
-	dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
-	    ohds->ds_prev, flags, cr, tx);
-	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));
+	int err;
+	/*
+	 * Assume that space (both on-disk and in-stream) is dominated by
+	 * data.  We will adjust for indirect blocks and the copies property,
+	 * but ignore per-object space used (e.g., dnodes and DRR_OBJECT
+	 * records).
+	 */
 
 	/*
-	 * If we actually created a non-clone, we need to create the
-	 * objset in our new dataset.
+	 * Subtract out approximate space used by indirect blocks.
+	 * Assume most space is used by data blocks (non-indirect, non-dnode).
+	 * Assume all blocks are recordsize.  Assume ditto blocks and
+	 * internal fragmentation counter out compression.
+	 *
+	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
+	 * block, which we observe in practice.
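+	 *
+	 * For example, with the default 128K recordsize this subtracts one
+	 * 128-byte blkptr_t per 128K of data, i.e. about 0.1% of the size.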
 	 */
-	if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
-		(void) dmu_objset_create_impl(dp->dp_spa,
-		    cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
-	}
+	uint64_t recordsize;
+	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
+	if (err != 0)
+		return (err);
+	size -= size / recordsize * sizeof (blkptr_t);
 
-	rbsa->ds = cds;
+	/* Add in the space for the record associated with each block. */
+	size += size / recordsize * sizeof (dmu_replay_record_t);
 
-	spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
-	    dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
+	*sizep = size;
+
+	return (0);
 }
 
-/*
- * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
- * succeeds; otherwise we will leak the holds on the datasets.
- */
 int
-dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
-    boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
+dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
 {
-	int err = 0;
-	boolean_t byteswap;
-	struct recvbeginsyncarg rbsa = { 0 };
-	uint64_t versioninfo;
-	int flags;
-	dsl_dataset_t *ds;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	int err;
+	uint64_t size;
 
-	if (drrb->drr_magic == DMU_BACKUP_MAGIC)
-		byteswap = FALSE;
-	else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
-		byteswap = TRUE;
-	else
-		return (EINVAL);
+	ASSERT(dsl_pool_config_held(dp));
 
-	rbsa.tofs = tofs;
-	rbsa.tosnap = tosnap;
-	rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
-	rbsa.fromguid = drrb->drr_fromguid;
-	rbsa.type = drrb->drr_type;
-	rbsa.tag = FTAG;
-	rbsa.dsflags = 0;
-	versioninfo = drrb->drr_versioninfo;
-	flags = drrb->drr_flags;
-
-	if (byteswap) {
-		rbsa.type = BSWAP_32(rbsa.type);
-		rbsa.fromguid = BSWAP_64(rbsa.fromguid);
-		versioninfo = BSWAP_64(versioninfo);
-		flags = BSWAP_32(flags);
-	}
-
-	if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
-	    rbsa.type >= DMU_OST_NUMTYPES ||
-	    ((flags & DRR_FLAG_CLONE) && origin == NULL))
-		return (EINVAL);
+	/* tosnap must be a snapshot */
+	if (!ds->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
 
-	if (flags & DRR_FLAG_CI_DATA)
-		rbsa.dsflags = DS_FLAG_CI_DATASET;
+	/* fromsnap, if provided, must be a snapshot */
+	if (fromds != NULL && !fromds->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
 
-	bzero(drc, sizeof (dmu_recv_cookie_t));
-	drc->drc_drrb = drrb;
-	drc->drc_tosnap = tosnap;
-	drc->drc_top_ds = top_ds;
-	drc->drc_force = force;
+	/*
+	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
+	 * or the origin's fs.
+	 */
+	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
+		return (SET_ERROR(EXDEV));
+
+	/* Get uncompressed size estimate of changed data. */
+	if (fromds == NULL) {
+		size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+	} else {
+		uint64_t used, comp;
+		err = dsl_dataset_space_written(fromds, ds,
+		    &used, &comp, &size);
+		if (err != 0)
+			return (err);
+	}
+
+	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+	return (err);
+}
+
+/*
+ * Simple callback used to traverse the blocks of a snapshot and sum their
+ * uncompressed size.
+ */
+/* ARGSUSED */
+static int
+dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	uint64_t *spaceptr = arg;
+	if (bp != NULL && !BP_IS_HOLE(bp)) {
+		*spaceptr += BP_GET_UCSIZE(bp);
+	}
+	return (0);
+}
+
+/*
+ * Given a destination snapshot and a TXG, calculate the approximate size of a
+ * send stream sent from that TXG. from_txg may be zero, indicating that the
+ * whole snapshot will be sent.
+ */
+int
+dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
+    uint64_t *sizep)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	int err;
+	uint64_t size = 0;
+
+	ASSERT(dsl_pool_config_held(dp));
+
+	/* tosnap must be a snapshot */
+	if (!dsl_dataset_is_snapshot(ds))
+		return (SET_ERROR(EINVAL));
+
+	/* verify that from_txg is before the provided snapshot was taken */
+	if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
+		return (SET_ERROR(EXDEV));
+	}
 
 	/*
-	 * Process the begin in syncing context.
+	 * traverse the blocks of the snapshot with birth times after
+	 * from_txg, summing their uncompressed size
 	 */
+	err = traverse_dataset(ds, from_txg, TRAVERSE_POST,
+	    dmu_calculate_send_traversal, &size);
+	if (err)
+		return (err);
 
-	/* open the dataset we are logically receiving into */
-	err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
-	if (err == 0) {
+	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+	return (err);
+}
+
+typedef struct dmu_recv_begin_arg {
+	const char *drba_origin;
+	dmu_recv_cookie_t *drba_cookie;
+	cred_t *drba_cred;
+	uint64_t drba_snapobj;
+} dmu_recv_begin_arg_t;
+
+static int
+recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
+    uint64_t fromguid)
+{
+	uint64_t val;
+	int error;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	/* temporary clone name must not exist */
+	error = zap_lookup(dp->dp_meta_objset,
+	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
+	    8, 1, &val);
+	if (error != ENOENT)
+		return (error == 0 ? EBUSY : error);
+
+	/* new snapshot name must not exist */
+	error = zap_lookup(dp->dp_meta_objset,
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+	    drba->drba_cookie->drc_tosnap, 8, 1, &val);
+	if (error != ENOENT)
+		return (error == 0 ? EEXIST : error);
+
+	/*
+	 * Check the snapshot limit before receiving.  We'll recheck at the
+	 * end, but might as well abort before receiving if we're already over
+	 * the limit.
+	 *
+	 * Note that we do not check the file system limit with
+	 * dsl_dir_fscount_check because the temporary %clones don't count
+	 * against that limit.
+	 */
+	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
+	    NULL, drba->drba_cred);
+	if (error != 0)
+		return (error);
+
+	if (fromguid != 0) {
+		dsl_dataset_t *snap;
+		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+
+		/* Find snapshot in this dir that matches fromguid. */
+		while (obj != 0) {
+			error = dsl_dataset_hold_obj(dp, obj, FTAG,
+			    &snap);
+			if (error != 0)
+				return (SET_ERROR(ENODEV));
+			if (snap->ds_dir != ds->ds_dir) {
+				dsl_dataset_rele(snap, FTAG);
+				return (SET_ERROR(ENODEV));
+			}
+			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
+				break;
+			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+			dsl_dataset_rele(snap, FTAG);
+		}
+		if (obj == 0)
+			return (SET_ERROR(ENODEV));
+
+		if (drba->drba_cookie->drc_force) {
+			drba->drba_snapobj = obj;
+		} else {
+			/*
+			 * If we are not forcing, there must be no
+			 * changes since fromsnap.
+			 */
+			if (dsl_dataset_modified_since_snap(ds, snap)) {
+				dsl_dataset_rele(snap, FTAG);
+				return (SET_ERROR(ETXTBSY));
+			}
+			drba->drba_snapobj = ds->ds_prev->ds_object;
+		}
+
+		dsl_dataset_rele(snap, FTAG);
+	} else {
+		/* if full, then must be forced */
+		if (!drba->drba_cookie->drc_force)
+			return (SET_ERROR(EEXIST));
+		/* start from $ORIGIN@$ORIGIN, if supported */
+		drba->drba_snapobj = dp->dp_origin_snap != NULL ?
+		    dp->dp_origin_snap->ds_object : 0;
+	}
+
+	return (0);
+}
+
+static int
+dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_begin_arg_t *drba = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+	uint64_t fromguid = drrb->drr_fromguid;
+	int flags = drrb->drr_flags;
+	int error;
+	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+	dsl_dataset_t *ds;
+	const char *tofs = drba->drba_cookie->drc_tofs;
+
+	/* already checked */
+	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+	ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
+
+	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+	    DMU_COMPOUNDSTREAM ||
+	    drrb->drr_type >= DMU_OST_NUMTYPES ||
+	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
+		return (SET_ERROR(EINVAL));
+
+	/* Verify pool version supports SA if SA_SPILL feature set */
+	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
+		return (SET_ERROR(ENOTSUP));
+
+	if (drba->drba_cookie->drc_resumable &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
+		return (SET_ERROR(ENOTSUP));
+
+	/*
+	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+	 * record to a plain WRITE record, so the pool must have the
+	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+	 * records.  Same with WRITE_EMBEDDED records that use LZ4 compression.
+	 */
+	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+		return (SET_ERROR(ENOTSUP));
+	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+		return (SET_ERROR(ENOTSUP));
+
+	/*
+	 * The receiving code doesn't know how to translate large blocks
+	 * to smaller ones, so the pool must have the LARGE_BLOCKS
+	 * feature enabled if the stream has LARGE_BLOCKS.
+	 */
+	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+		return (SET_ERROR(ENOTSUP));
+
+	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+	if (error == 0) {
 		/* target fs already exists; recv into temp clone */
 
 		/* Can't recv a clone into an existing fs */
-		if (flags & DRR_FLAG_CLONE) {
-			dsl_dataset_rele(ds, dmu_recv_tag);
-			return (EINVAL);
-		}
-
-		/* must not have an incremental recv already in progress */
-		if (!mutex_tryenter(&ds->ds_recvlock)) {
-			dsl_dataset_rele(ds, dmu_recv_tag);
-			return (EBUSY);
-		}
-
-		/* tmp clone name is: tofs/%tosnap" */
-		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
-		    "%%%s", tosnap);
-		rbsa.force = force;
-		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-		    recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
-		if (err) {
-			mutex_exit(&ds->ds_recvlock);
-			dsl_dataset_rele(ds, dmu_recv_tag);
-			return (err);
+		if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
+			dsl_dataset_rele(ds, FTAG);
+			return (SET_ERROR(EINVAL));
 		}
-		drc->drc_logical_ds = ds;
-		drc->drc_real_ds = rbsa.ds;
-	} else if (err == ENOENT) {
+
+		error = recv_begin_check_existing_impl(drba, ds, fromguid);
+		dsl_dataset_rele(ds, FTAG);
+	} else if (error == ENOENT) {
 		/* target fs does not exist; must be a full backup or clone */
-		char *cp;
+		char buf[ZFS_MAX_DATASET_NAME_LEN];
 
 		/*
 		 * If it's a non-clone incremental, we are missing the
 		 * target fs, so fail the recv.
 		 */
-		if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
-			return (ENOENT);
+		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE ||
+		    drba->drba_origin))
+			return (SET_ERROR(ENOENT));
+
+		/*
+		 * If we're receiving a full send as a clone, and it doesn't
+		 * contain all the necessary free records and freeobject
+		 * records, reject it.
+		 */
+		if (fromguid == 0 && drba->drba_origin &&
+		    !(flags & DRR_FLAG_FREERECORDS))
+			return (SET_ERROR(EINVAL));
 
 		/* Open the parent of tofs */
-		cp = strrchr(tofs, '/');
-		*cp = '\0';
-		err = dsl_dataset_hold(tofs, FTAG, &ds);
-		*cp = '/';
-		if (err)
-			return (err);
+		ASSERT3U(strlen(tofs), <, sizeof (buf));
+		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
+		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
+		if (error != 0)
+			return (error);
+
+		/*
+		 * Check filesystem and snapshot limits before receiving. We'll
+		 * recheck snapshot limits again at the end (we create the
+		 * filesystems and increment those counts during begin_sync).
+		 */
+		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+		    ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
+		if (error != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			return (error);
+		}
 
-		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-		    recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
+		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
+		if (error != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			return (error);
+		}
+
+		if (drba->drba_origin != NULL) {
+			dsl_dataset_t *origin;
+			error = dsl_dataset_hold(dp, drba->drba_origin,
+			    FTAG, &origin);
+			if (error != 0) {
+				dsl_dataset_rele(ds, FTAG);
+				return (error);
+			}
+			if (!origin->ds_is_snapshot) {
+				dsl_dataset_rele(origin, FTAG);
+				dsl_dataset_rele(ds, FTAG);
+				return (SET_ERROR(EINVAL));
+			}
+			if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
+			    fromguid != 0) {
+				dsl_dataset_rele(origin, FTAG);
+				dsl_dataset_rele(ds, FTAG);
+				return (SET_ERROR(ENODEV));
+			}
+			dsl_dataset_rele(origin, FTAG);
+		}
 		dsl_dataset_rele(ds, FTAG);
-		if (err)
-			return (err);
-		drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
-		drc->drc_newfs = B_TRUE;
+		error = 0;
 	}
+	return (error);
+}
 
-	return (err);
+static void
+dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_begin_arg_t *drba = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
+	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+	const char *tofs = drba->drba_cookie->drc_tofs;
+	dsl_dataset_t *ds, *newds;
+	uint64_t dsobj;
+	int error;
+	uint64_t crflags = 0;
+
+	if (drrb->drr_flags & DRR_FLAG_CI_DATA)
+		crflags |= DS_FLAG_CI_DATASET;
+
+	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+	if (error == 0) {
+		/* create temporary clone */
+		dsl_dataset_t *snap = NULL;
+		if (drba->drba_snapobj != 0) {
+			VERIFY0(dsl_dataset_hold_obj(dp,
+			    drba->drba_snapobj, FTAG, &snap));
+		}
+		dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
+		    snap, crflags, drba->drba_cred, tx);
+		if (drba->drba_snapobj != 0)
+			dsl_dataset_rele(snap, FTAG);
+		dsl_dataset_rele(ds, FTAG);
+	} else {
+		dsl_dir_t *dd;
+		const char *tail;
+		dsl_dataset_t *origin = NULL;
+
+		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
+
+		if (drba->drba_origin != NULL) {
+			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
+			    FTAG, &origin));
+		}
+
+		/* Create new dataset. */
+		dsobj = dsl_dataset_create_sync(dd,
+		    strrchr(tofs, '/') + 1,
+		    origin, crflags, drba->drba_cred, tx);
+		if (origin != NULL)
+			dsl_dataset_rele(origin, FTAG);
+		dsl_dir_rele(dd, FTAG);
+		drba->drba_cookie->drc_newfs = B_TRUE;
+	}
+	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
+
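+	/*
+	 * For a resumable receive, record the stream's identity and an
+	 * initial resume cursor in the new dataset's ZAP, so that a resumed
+	 * stream can later be checked against it.
+	 */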
+	if (drba->drba_cookie->drc_resumable) {
+		dsl_dataset_zapify(newds, tx);
+		if (drrb->drr_fromguid != 0) {
+			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
+			    8, 1, &drrb->drr_fromguid, tx));
+		}
+		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
+		    8, 1, &drrb->drr_toguid, tx));
+		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
+		    1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
+		uint64_t one = 1;
+		uint64_t zero = 0;
+		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
+		    8, 1, &one, tx));
+		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
+		    8, 1, &zero, tx));
+		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
+		    8, 1, &zero, tx));
+		if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+		    DMU_BACKUP_FEATURE_EMBED_DATA) {
+			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
+			    8, 1, &one, tx));
+		}
+	}
+
+	dmu_buf_will_dirty(newds->ds_dbuf, tx);
+	dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+	/*
+	 * If we actually created a non-clone, we need to create the
+	 * objset in our new dataset.
+	 */
+	rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
+	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
+		(void) dmu_objset_create_impl(dp->dp_spa,
+		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
+	}
+	rrw_exit(&newds->ds_bp_rwlock, FTAG);
+
+	drba->drba_cookie->drc_ds = newds;
+
+	spa_history_log_internal_ds(newds, "receive", tx, "");
+}
+
+static int
+dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_begin_arg_t *drba = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+	int error;
+	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+	dsl_dataset_t *ds;
+	const char *tofs = drba->drba_cookie->drc_tofs;
+
+	/* already checked */
+	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+	ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);
+
+	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+	    DMU_COMPOUNDSTREAM ||
+	    drrb->drr_type >= DMU_OST_NUMTYPES)
+		return (SET_ERROR(EINVAL));
+
+	/* Verify pool version supports SA if SA_SPILL feature set */
+	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
+		return (SET_ERROR(ENOTSUP));
+
+	/*
+	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+	 * record to a plain WRITE record, so the pool must have the
+	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+	 * records.  Same with WRITE_EMBEDDED records that use LZ4 compression.
+	 */
+	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+		return (SET_ERROR(ENOTSUP));
+	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+		return (SET_ERROR(ENOTSUP));
+
+	/* 6 extra bytes for /%recv */
+	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+	(void) snprintf(recvname, sizeof (recvname), "%s/%s",
+	    tofs, recv_clone_name);
+
+	if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+		/* %recv does not exist; continue in tofs */
+		error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+		if (error != 0)
+			return (error);
+	}
+
+	/* check that ds is marked inconsistent */
+	if (!DS_IS_INCONSISTENT(ds)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/* check that there is resuming data, and that the toguid matches */
+	if (!dsl_dataset_is_zapified(ds)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+	uint64_t val;
+	error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
+	    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
+	if (error != 0 || drrb->drr_toguid != val) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Check if the receive is still running.  If so, it will be owned.
+	 * Note that nothing else can own the dataset (e.g. after the receive
+	 * fails) because it will be marked inconsistent.
+	 */
+	if (dsl_dataset_has_owner(ds)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EBUSY));
+	}
+
+	/* There should not be any snapshots of this fs yet. */
+	if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Note: resume point will be checked when we process the first WRITE
+	 * record.
+	 */
+
+	/* check that the origin matches */
+	val = 0;
+	(void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
+	    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
+	if (drrb->drr_fromguid != val) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
+{
+	dmu_recv_begin_arg_t *drba = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	const char *tofs = drba->drba_cookie->drc_tofs;
+	dsl_dataset_t *ds;
+	uint64_t dsobj;
+	/* 6 extra bytes for /%recv */
+	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+	(void) snprintf(recvname, sizeof (recvname), "%s/%s",
+	    tofs, recv_clone_name);
+
+	if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+		/* %recv does not exist; continue in tofs */
+		VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds));
+		drba->drba_cookie->drc_newfs = B_TRUE;
+	}
+
+	/* clear the inconsistent flag so that we can own it */
+	ASSERT(DS_IS_INCONSISTENT(ds));
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+	dsobj = ds->ds_object;
+	dsl_dataset_rele(ds, FTAG);
+
+	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds));
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)));
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+	drba->drba_cookie->drc_ds = ds;
+
+	spa_history_log_internal_ds(ds, "resume receive", tx, "");
+}
+
+/*
+ * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
+ * succeeds; otherwise we will leak the holds on the datasets.
+ */
+int
+dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
+    boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc)
+{
+	dmu_recv_begin_arg_t drba = { 0 };
+
+	bzero(drc, sizeof (dmu_recv_cookie_t));
+	drc->drc_drr_begin = drr_begin;
+	drc->drc_drrb = &drr_begin->drr_u.drr_begin;
+	drc->drc_tosnap = tosnap;
+	drc->drc_tofs = tofs;
+	drc->drc_force = force;
+	drc->drc_resumable = resumable;
+	drc->drc_cred = CRED();
+
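+	/*
+	 * A byte-swapped magic number means the stream was generated on a
+	 * host of the opposite endianness; checksum the BEGIN record as
+	 * received, then byteswap it for local use.
+	 */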
+	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+		drc->drc_byteswap = B_TRUE;
+		fletcher_4_incremental_byteswap(drr_begin,
+		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
+		byteswap_record(drr_begin);
+	} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
+		fletcher_4_incremental_native(drr_begin,
+		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
+	} else {
+		return (SET_ERROR(EINVAL));
+	}
+
+	drba.drba_origin = origin;
+	drba.drba_cookie = drc;
+	drba.drba_cred = CRED();
+
+	if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
+	    DMU_BACKUP_FEATURE_RESUMING) {
+		return (dsl_sync_task(tofs,
+		    dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
+		    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+	} else {
+		return (dsl_sync_task(tofs,
+		    dmu_recv_begin_check, dmu_recv_begin_sync,
+		    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+	}
 }
 
-struct restorearg {
+struct receive_record_arg {
+	dmu_replay_record_t header;
+	void *payload; /* Pointer to a buffer containing the payload */
+	/*
+	 * If the record is a write, pointer to the arc_buf_t containing the
+	 * payload.
+	 */
+	arc_buf_t *write_buf;
+	int payload_size;
+	uint64_t bytes_read; /* bytes read from stream when record created */
+	boolean_t eos_marker; /* Marks the end of the stream */
+	bqueue_node_t node;
+};
+
+struct receive_writer_arg {
+	objset_t *os;
+	boolean_t byteswap;
+	bqueue_t q;
+
+	/*
+	 * These three args are used to signal to the main thread that we're
+	 * done.
+	 */
+	kmutex_t mutex;
+	kcondvar_t cv;
+	boolean_t done;
+
 	int err;
-	int byteswap;
-	vnode_t *vp;
-	char *buf;
-	uint64_t voff;
-	int bufsize; /* amount of memory allocated for buf */
+	/* A map from guid to dataset to help handle dedup'd streams. */
+	avl_tree_t *guid_to_ds_map;
+	boolean_t resumable;
+	uint64_t last_object, last_offset;
+	uint64_t bytes_read; /* bytes read when current record created */
+};
+
+struct objlist {
+	list_t list; /* List of struct receive_objnode. */
+	/*
+	 * Last object looked up. Used to assert that objects are being looked
+	 * up in ascending order.
+	 */
+	uint64_t last_lookup;
+};
+
+struct receive_objnode {
+	list_node_t node;
+	uint64_t object;
+};
+
+struct receive_arg {
+	objset_t *os;
+	kthread_t *td;
+	struct file *fp;
+	uint64_t voff; /* The current offset in the stream */
+	uint64_t bytes_read;
+	/*
+	 * A record that has had its payload read in, but hasn't yet been
+	 * handed off to the worker thread.
+	 */
+	struct receive_record_arg *rrd;
+	/* A record that has had its header read in, but not its payload. */
+	struct receive_record_arg *next_rrd;
 	zio_cksum_t cksum;
-	avl_tree_t guid_to_ds_map;
+	zio_cksum_t prev_cksum;
+	int err;
+	boolean_t byteswap;
+	/* Sorted list of objects not to issue prefetches for. */
+	struct objlist ignore_objlist;
 };
 
 typedef struct guid_map_entry {
@@ -756,102 +1833,100 @@ guid_compare(const void *arg1, const voi
 	return (0);
 }
 
-/*
- * This function is a callback used by dmu_objset_find() (which
- * enumerates the object sets) to build an avl tree that maps guids
- * to datasets.  The resulting table is used when processing DRR_WRITE_BYREF
- * send stream records.  These records, which are used in dedup'ed
- * streams, do not contain data themselves, but refer to a copy
- * of the data block that has already been written because it was
- * earlier in the stream.  That previous copy is identified by the
- * guid of the dataset with the referenced data.
- */
-int
-find_ds_by_guid(const char *name, void *arg)
+static void
+free_guid_map_onexit(void *arg)
 {
-	avl_tree_t *guid_map = arg;
-	dsl_dataset_t *ds, *snapds;
+	avl_tree_t *ca = arg;
+	void *cookie = NULL;
 	guid_map_entry_t *gmep;
-	dsl_pool_t *dp;
-	int err;
-	uint64_t lastobj, firstobj;
-
-	if (dsl_dataset_hold(name, FTAG, &ds) != 0)
-		return (0);
 
-	dp = ds->ds_dir->dd_pool;
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-	firstobj = ds->ds_dir->dd_phys->dd_origin_obj;
-	lastobj = ds->ds_phys->ds_prev_snap_obj;
-
-	while (lastobj != firstobj) {
-		err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds);
-		if (err) {
-			/*
-			 * Skip this snapshot and move on. It's not
-			 * clear why this would ever happen, but the
-			 * remainder of the snapshot streadm can be
-			 * processed.
-			 */
-			rw_exit(&dp->dp_config_rwlock);
-			dsl_dataset_rele(ds, FTAG);
-			return (0);
-		}
-
-		gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
-		gmep->guid = snapds->ds_phys->ds_guid;
-		gmep->gme_ds = snapds;
-		avl_add(guid_map, gmep);
-		lastobj = snapds->ds_phys->ds_prev_snap_obj;
+	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
+		dsl_dataset_long_rele(gmep->gme_ds, gmep);
+		dsl_dataset_rele(gmep->gme_ds, gmep);
+		kmem_free(gmep, sizeof (guid_map_entry_t));
 	}
+	avl_destroy(ca);
+	kmem_free(ca, sizeof (avl_tree_t));
+}
 
-	rw_exit(&dp->dp_config_rwlock);
-	dsl_dataset_rele(ds, FTAG);
-
-	return (0);
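+/*
+ * Read len bytes at offset off from the stream's file into buf, using the
+ * kernel file-ops read path; *resid reports how many bytes were left unread.
+ * Userland builds have no such path and return EOPNOTSUPP.
+ */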
+static int
+restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off,
+    ssize_t *resid)
+{
+	struct uio auio;
+	struct iovec aiov;
+	int error;
+
+	aiov.iov_base = buf;
+	aiov.iov_len = len;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_resid = len;
+#ifdef __NetBSD__
+#ifdef _KERNEL
+	auio.uio_vmspace = vmspace_kernel();
+#endif
+#else
+	auio.uio_segflg = UIO_SYSSPACE;
+#endif
+	auio.uio_rw = UIO_READ;
+	auio.uio_offset = off;
+#ifdef __FreeBSD__
+	auio.uio_td = ra->td;
+#endif
+#ifdef _KERNEL
+	error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td);
+#else
+	fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__);
+	error = EOPNOTSUPP;
+#endif
+	*resid = auio.uio_resid;
+	return (error);
 }
 
-static void *
-restore_read(struct restorearg *ra, int len)
+static int
+receive_read(struct receive_arg *ra, int len, void *buf)
 {
-	void *rv;
 	int done = 0;
 
-	/* some things will require 8-byte alignment, so everything must */
-	ASSERT3U(len % 8, ==, 0);
+	/*
+	 * The code doesn't rely on this (lengths being multiples of 8).  See
+	 * comment in dump_bytes.
+	 */
+	ASSERT0(len % 8);
 
 	while (done < len) {
 		ssize_t resid;
 
-		ra->err = vn_rdwr(UIO_READ, ra->vp,
-		    (caddr_t)ra->buf + done, len - done,
-		    ra->voff, UIO_SYSSPACE, FAPPEND,
-		    RLIM64_INFINITY, CRED(), &resid);
+		ra->err = restore_bytes(ra, buf + done,
+		    len - done, ra->voff, &resid);
 
-		if (resid == len - done)
-			ra->err = EINVAL;
+		if (resid == len - done) {
+			/*
+			 * Note: ECKSUM indicates that the receive
+			 * was interrupted and can potentially be resumed.
+			 */
+			ra->err = SET_ERROR(ECKSUM);
+		}
 		ra->voff += len - done - resid;
 		done = len - resid;
-		if (ra->err)
-			return (NULL);
+		if (ra->err != 0)
+			return (ra->err);
 	}
 
+	ra->bytes_read += len;
+
 	ASSERT3U(done, ==, len);
-	rv = ra->buf;
-	if (ra->byteswap)
-		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
-	else
-		fletcher_4_incremental_native(rv, len, &ra->cksum);
-	return (rv);
+	return (0);
 }
 
 static void
-backup_byteswap(dmu_replay_record_t *drr)
+byteswap_record(dmu_replay_record_t *drr)
 {
 #define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
 #define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
 	drr->drr_type = BSWAP_32(drr->drr_type);
 	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
+
 	switch (drr->drr_type) {
 	case DRR_BEGIN:
 		DO64(drr_begin.drr_magic);
@@ -864,7 +1939,6 @@ backup_byteswap(dmu_replay_record_t *drr
 		break;
 	case DRR_OBJECT:
 		DO64(drr_object.drr_object);
-		/* DO64(drr_object.drr_allocation_txg); */
 		DO32(drr_object.drr_type);
 		DO32(drr_object.drr_bonustype);
 		DO32(drr_object.drr_blksz);
@@ -882,10 +1956,7 @@ backup_byteswap(dmu_replay_record_t *drr
 		DO64(drr_write.drr_offset);
 		DO64(drr_write.drr_length);
 		DO64(drr_write.drr_toguid);
-		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
-		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
-		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
-		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
+		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
 		DO64(drr_write.drr_key.ddk_prop);
 		break;
 	case DRR_WRITE_BYREF:
@@ -896,538 +1967,1327 @@ backup_byteswap(dmu_replay_record_t *drr
 		DO64(drr_write_byref.drr_refguid);
 		DO64(drr_write_byref.drr_refobject);
 		DO64(drr_write_byref.drr_refoffset);
-		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
-		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
-		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
-		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
+		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref.
+		    drr_key.ddk_cksum);
 		DO64(drr_write_byref.drr_key.ddk_prop);
 		break;
+	case DRR_WRITE_EMBEDDED:
+		DO64(drr_write_embedded.drr_object);
+		DO64(drr_write_embedded.drr_offset);
+		DO64(drr_write_embedded.drr_length);
+		DO64(drr_write_embedded.drr_toguid);
+		DO32(drr_write_embedded.drr_lsize);
+		DO32(drr_write_embedded.drr_psize);
+		break;
 	case DRR_FREE:
 		DO64(drr_free.drr_object);
 		DO64(drr_free.drr_offset);
 		DO64(drr_free.drr_length);
 		DO64(drr_free.drr_toguid);
 		break;
+	case DRR_SPILL:
+		DO64(drr_spill.drr_object);
+		DO64(drr_spill.drr_length);
+		DO64(drr_spill.drr_toguid);
+		break;
 	case DRR_END:
-		DO64(drr_end.drr_checksum.zc_word[0]);
-		DO64(drr_end.drr_checksum.zc_word[1]);
-		DO64(drr_end.drr_checksum.zc_word[2]);
-		DO64(drr_end.drr_checksum.zc_word[3]);
 		DO64(drr_end.drr_toguid);
+		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
 		break;
 	}
+
+	if (drr->drr_type != DRR_BEGIN) {
+		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
+	}
+
 #undef DO64
 #undef DO32
 }
 
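+/*
+ * Determine how many block pointers the dnode described by a DRR_OBJECT
+ * record will have: the bonus buffer shares dnode space with the block
+ * pointers, and an SA bonus buffer can grow to fill that space, so
+ * DMU_OT_SA objects are assumed to keep only the minimum of one.
+ */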
+static inline uint8_t
+deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
+{
+	if (bonus_type == DMU_OT_SA) {
+		return (1);
+	} else {
+		return (1 +
+		    ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
+	}
+}
+
+static void
+save_resume_state(struct receive_writer_arg *rwa,
+    uint64_t object, uint64_t offset, dmu_tx_t *tx)
+{
+	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+	if (!rwa->resumable)
+		return;
+
+	/*
+	 * We use ds_resume_bytes[] != 0 to indicate that we need to
+	 * update this on disk, so it must not be 0.
+	 */
+	ASSERT(rwa->bytes_read != 0);
+
+	/*
+	 * We only resume from write records, which have a valid
+	 * (non-meta-dnode) object number.
+	 */
+	ASSERT(object != 0);
+
+	/*
+	 * For resuming to work correctly, we must receive records in order,
+	 * sorted by object,offset.  This is checked by the callers, but
+	 * assert it here for good measure.
+	 */
+	ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]);
+	ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] ||
+	    offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]);
+	ASSERT3U(rwa->bytes_read, >=,
+	    rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]);
+
+	rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object;
+	rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset;
+	rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read;
+}
+
 static int
-restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
+receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+    void *data)
 {
-	int err;
+	dmu_object_info_t doi;
 	dmu_tx_t *tx;
-	void *data = NULL;
+	uint64_t object;
+	int err;
 
 	if (drro->drr_type == DMU_OT_NONE ||
-	    drro->drr_type >= DMU_OT_NUMTYPES ||
-	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
+	    !DMU_OT_IS_VALID(drro->drr_type) ||
+	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
 	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
-	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
 	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
-	err = dmu_object_info(os, drro->drr_object, NULL);
+	err = dmu_object_info(rwa->os, drro->drr_object, &doi);
 
 	if (err != 0 && err != ENOENT)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
+	object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
 
-	if (drro->drr_bonuslen) {
-		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
-		if (ra->err)
-			return (ra->err);
-	}
+	/*
+	 * If we are losing blkptrs or changing the block size this must
+	 * be a new file instance.  We must clear out the previous file
+	 * contents before we can change this type of metadata in the dnode.
+	 */
+	if (err == 0) {
+		int nblkptr;
 
-	if (err == ENOENT) {
-		/* currently free, want to be allocated */
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-		err = dmu_tx_assign(tx, TXG_WAIT);
-		if (err) {
-			dmu_tx_abort(tx);
-			return (err);
+		nblkptr = deduce_nblkptr(drro->drr_bonustype,
+		    drro->drr_bonuslen);
+
+		if (drro->drr_blksz != doi.doi_data_block_size ||
+		    nblkptr < doi.doi_nblkptr) {
+			err = dmu_free_long_range(rwa->os, drro->drr_object,
+			    0, DMU_OBJECT_END);
+			if (err != 0)
+				return (SET_ERROR(EINVAL));
 		}
-		err = dmu_object_claim(os, drro->drr_object,
-		    drro->drr_type, drro->drr_blksz,
-		    drro->drr_bonustype, drro->drr_bonuslen, tx);
-		dmu_tx_commit(tx);
-	} else {
-		/* currently allocated, want to be allocated */
-		err = dmu_object_reclaim(os, drro->drr_object,
-		    drro->drr_type, drro->drr_blksz,
-		    drro->drr_bonustype, drro->drr_bonuslen);
 	}
-	if (err)
-		return (EINVAL);
 
-	tx = dmu_tx_create(os);
-	dmu_tx_hold_bonus(tx, drro->drr_object);
+	tx = dmu_tx_create(rwa->os);
+	dmu_tx_hold_bonus(tx, object);
 	err = dmu_tx_assign(tx, TXG_WAIT);
-	if (err) {
+	if (err != 0) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
-	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
-	    tx);
-	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
+	if (object == DMU_NEW_OBJECT) {
+		/* currently free, want to be allocated */
+		err = dmu_object_claim(rwa->os, drro->drr_object,
+		    drro->drr_type, drro->drr_blksz,
+		    drro->drr_bonustype, drro->drr_bonuslen, tx);
+	} else if (drro->drr_type != doi.doi_type ||
+	    drro->drr_blksz != doi.doi_data_block_size ||
+	    drro->drr_bonustype != doi.doi_bonus_type ||
+	    drro->drr_bonuslen != doi.doi_bonus_size) {
+		/* currently allocated, but with different properties */
+		err = dmu_object_reclaim(rwa->os, drro->drr_object,
+		    drro->drr_type, drro->drr_blksz,
+		    drro->drr_bonustype, drro->drr_bonuslen, tx);
+	}
+	if (err != 0) {
+		dmu_tx_commit(tx);
+		return (SET_ERROR(EINVAL));
+	}
+
+	dmu_object_set_checksum(rwa->os, drro->drr_object,
+	    drro->drr_checksumtype, tx);
+	dmu_object_set_compress(rwa->os, drro->drr_object,
+	    drro->drr_compress, tx);
 
 	if (data != NULL) {
 		dmu_buf_t *db;
 
-		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
+		VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db));
 		dmu_buf_will_dirty(db, tx);
 
 		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
 		bcopy(data, db->db_data, drro->drr_bonuslen);
-		if (ra->byteswap) {
-			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
+		if (rwa->byteswap) {
+			dmu_object_byteswap_t byteswap =
+			    DMU_OT_BYTESWAP(drro->drr_bonustype);
+			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
 			    drro->drr_bonuslen);
 		}
 		dmu_buf_rele(db, FTAG);
 	}
 	dmu_tx_commit(tx);
+
 	return (0);
 }
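+
+/*
+ * Illustrative summary of the claim/reclaim logic above (a sketch, not
+ * new behavior):
+ *
+ *	object state     stream vs. on-disk properties    action taken
+ *	------------     -----------------------------    -----------------
+ *	free (ENOENT)    n/a                              dmu_object_claim()
+ *	allocated        type/blksz/bonus all match       none needed
+ *	allocated        any of them differ               dmu_object_reclaim()
+ *
+ * In every case the checksum and compression properties from the stream
+ * are then applied to the object.
+ */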
 
 /* ARGSUSED */
 static int
-restore_freeobjects(struct restorearg *ra, objset_t *os,
+receive_freeobjects(struct receive_writer_arg *rwa,
     struct drr_freeobjects *drrfo)
 {
 	uint64_t obj;
+	int next_err = 0;
 
 	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	for (obj = drrfo->drr_firstobj;
-	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
-	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
+	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
+	    next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
 		int err;
 
-		if (dmu_object_info(os, obj, NULL) != 0)
+		if (dmu_object_info(rwa->os, obj, NULL) != 0)
 			continue;
 
-		err = dmu_free_object(os, obj);
-		if (err)
+		err = dmu_free_long_object(rwa->os, obj);
+		if (err != 0)
 			return (err);
 	}
+	if (next_err != ESRCH)
+		return (next_err);
+	return (0);
+}
+
+static int
+receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
+    arc_buf_t *abuf)
+{
+	dmu_tx_t *tx;
+	int err;
+
+	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
+	    !DMU_OT_IS_VALID(drrw->drr_type))
+		return (SET_ERROR(EINVAL));
+
+	/*
+	 * For resuming to work, records must be in increasing order
+	 * by (object, offset).
+	 */
+	if (drrw->drr_object < rwa->last_object ||
+	    (drrw->drr_object == rwa->last_object &&
+	    drrw->drr_offset < rwa->last_offset)) {
+		return (SET_ERROR(EINVAL));
+	}
+	rwa->last_object = drrw->drr_object;
+	rwa->last_offset = drrw->drr_offset;
+
+	if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
+		return (SET_ERROR(EINVAL));
+
+	tx = dmu_tx_create(rwa->os);
+
+	dmu_tx_hold_write(tx, drrw->drr_object,
+	    drrw->drr_offset, drrw->drr_length);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+	if (rwa->byteswap) {
+		dmu_object_byteswap_t byteswap =
+		    DMU_OT_BYTESWAP(drrw->drr_type);
+		dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
+		    drrw->drr_length);
+	}
+
+	dmu_buf_t *bonus;
+	if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) {
+		/* the tx was already assigned, so commit it rather than abort */
+		dmu_tx_commit(tx);
+		return (SET_ERROR(EINVAL));
+	}
+	dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
+
+	/*
+	 * Note: If the receive fails, we want the resume stream to start
+	 * with the same record that we last successfully received (as opposed
+	 * to the next record), so that we can verify that we are
+	 * resuming from the correct location.
+	 */
+	save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
+	dmu_tx_commit(tx);
+	dmu_buf_rele(bonus, FTAG);
+
 	return (0);
 }
 
+/*
+ * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
+ * streams to refer to a copy of the data that is already on the
+ * system because it came in earlier in the stream.  This function
+ * finds the earlier copy of the data, and uses that copy instead of
+ * data from the stream to fulfill this write.
+ */
+static int
+receive_write_byref(struct receive_writer_arg *rwa,
+    struct drr_write_byref *drrwbr)
+{
+	dmu_tx_t *tx;
+	int err;
+	guid_map_entry_t gmesrch;
+	guid_map_entry_t *gmep;
+	avl_index_t where;
+	objset_t *ref_os = NULL;
+	dmu_buf_t *dbp;
+
+	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
+		return (SET_ERROR(EINVAL));
+
+	/*
+	 * If the GUID of the referenced dataset is different from the
+	 * GUID of the target dataset, find the referenced dataset.
+	 */
+	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
+		gmesrch.guid = drrwbr->drr_refguid;
+		if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
+		    &where)) == NULL) {
+			return (SET_ERROR(EINVAL));
+		}
+		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
+			return (SET_ERROR(EINVAL));
+	} else {
+		ref_os = rwa->os;
+	}
+
+	err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
+	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
+	if (err != 0)
+		return (err);
+
+	tx = dmu_tx_create(rwa->os);
+
+	dmu_tx_hold_write(tx, drrwbr->drr_object,
+	    drrwbr->drr_offset, drrwbr->drr_length);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+	dmu_write(rwa->os, drrwbr->drr_object,
+	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
+	dmu_buf_rele(dbp, FTAG);
+
+	/* See comment in receive_write. */
+	save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx);
+	dmu_tx_commit(tx);
+	return (0);
+}
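+
+/*
+ * Illustrative stream fragment (a sketch, not part of this change): the
+ * first copy of a deduplicated block travels as a normal DRR_WRITE, and
+ * later duplicates travel as DRR_WRITE_BYREF records that point back at
+ * it:
+ *
+ *	DRR_WRITE       { object 5, offset 0, <payload> }
+ *	DRR_WRITE_BYREF { object 9, offset 8192,
+ *	                  refguid <guid of dataset holding the copy>,
+ *	                  refobject 5, refoffset 0 }
+ *
+ * The byref record carries no payload; the data is re-read from
+ * (refobject, refoffset) in the referenced dataset, as above.
+ */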
+
+static int
+receive_write_embedded(struct receive_writer_arg *rwa,
+    struct drr_write_embedded *drrwe, void *data)
+{
+	dmu_tx_t *tx;
+	int err;
+
+	if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
+		return (SET_ERROR(EINVAL));
+
+	if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
+		return (SET_ERROR(EINVAL));
+
+	if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
+		return (SET_ERROR(EINVAL));
+	if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
+		return (SET_ERROR(EINVAL));
+
+	tx = dmu_tx_create(rwa->os);
+
+	dmu_tx_hold_write(tx, drrwe->drr_object,
+	    drrwe->drr_offset, drrwe->drr_length);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	dmu_write_embedded(rwa->os, drrwe->drr_object,
+	    drrwe->drr_offset, data, drrwe->drr_etype,
+	    drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
+	    rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+
+	/* See comment in receive_write. */
+	save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+static int
+receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
+    void *data)
+{
+	dmu_tx_t *tx;
+	dmu_buf_t *db, *db_spill;
+	int err;
+
+	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
+	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
+		return (SET_ERROR(EINVAL));
+
+	if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
+		return (SET_ERROR(EINVAL));
+
+	VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
+	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
+		dmu_buf_rele(db, FTAG);
+		return (err);
+	}
+
+	tx = dmu_tx_create(rwa->os);
+
+	dmu_tx_hold_spill(tx, db->db_object);
+
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_buf_rele(db, FTAG);
+		dmu_buf_rele(db_spill, FTAG);
+		dmu_tx_abort(tx);
+		return (err);
+	}
+	dmu_buf_will_dirty(db_spill, tx);
+
+	if (db_spill->db_size < drrs->drr_length)
+		VERIFY0(dbuf_spill_set_blksz(db_spill,
+		    drrs->drr_length, tx));
+	bcopy(data, db_spill->db_data, drrs->drr_length);
+
+	dmu_buf_rele(db, FTAG);
+	dmu_buf_rele(db_spill, FTAG);
+
+	dmu_tx_commit(tx);
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
+{
+	int err;
+
+	if (drrf->drr_length != -1ULL &&
+	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
+		return (SET_ERROR(EINVAL));
+
+	if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
+		return (SET_ERROR(EINVAL));
+
+	err = dmu_free_long_range(rwa->os, drrf->drr_object,
+	    drrf->drr_offset, drrf->drr_length);
+
+	return (err);
+}
+
+/* used to destroy the drc_ds on error */
+static void
+dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
+{
+	if (drc->drc_resumable) {
+		/* wait for our resume state to be written to disk */
+		txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0);
+		dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+	} else {
+		char name[ZFS_MAX_DATASET_NAME_LEN];
+		dsl_dataset_name(drc->drc_ds, name);
+		dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+		(void) dsl_destroy_head(name);
+	}
+}
+
+static void
+receive_cksum(struct receive_arg *ra, int len, void *buf)
+{
+	if (ra->byteswap) {
+		fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
+	} else {
+		fletcher_4_incremental_native(buf, len, &ra->cksum);
+	}
+}
+
+/*
+ * Read the payload into a buffer of size len, and update the current record's
+ * payload field.
+ * Allocate ra->next_rrd and read the next record's header into
+ * ra->next_rrd->header.
+ * Verify checksum of payload and next record.
+ */
+static int
+receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
+{
+	int err;
+
+	if (len != 0) {
+		ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
+		err = receive_read(ra, len, buf);
+		if (err != 0)
+			return (err);
+		receive_cksum(ra, len, buf);
+
+		/* note: rrd is NULL when reading the begin record's payload */
+		if (ra->rrd != NULL) {
+			ra->rrd->payload = buf;
+			ra->rrd->payload_size = len;
+			ra->rrd->bytes_read = ra->bytes_read;
+		}
+	}
+
+	ra->prev_cksum = ra->cksum;
+
+	ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
+	err = receive_read(ra, sizeof (ra->next_rrd->header),
+	    &ra->next_rrd->header);
+	ra->next_rrd->bytes_read = ra->bytes_read;
+	if (err != 0) {
+		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+		ra->next_rrd = NULL;
+		return (err);
+	}
+	if (ra->next_rrd->header.drr_type == DRR_BEGIN) {
+		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+		ra->next_rrd = NULL;
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Note: checksum is of everything up to but not including the
+	 * checksum itself.
+	 */
+	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+	receive_cksum(ra,
+	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+	    &ra->next_rrd->header);
+
+	zio_cksum_t cksum_orig =
+	    ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
+	zio_cksum_t *cksump =
+	    &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
+
+	if (ra->byteswap)
+		byteswap_record(&ra->next_rrd->header);
+
+	if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
+	    !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) {
+		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+		ra->next_rrd = NULL;
+		return (SET_ERROR(ECKSUM));
+	}
+
+	receive_cksum(ra, sizeof (cksum_orig), &cksum_orig);
+
+	return (0);
+}
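+
+/*
+ * Illustrative record framing (a sketch of the layout the code above
+ * assumes):
+ *
+ *	... | payload of record N | header of record N+1 | ...
+ *
+ * Each header's drr_checksum covers the entire stream up to, but not
+ * including, the checksum field itself.  Reading one header ahead lets
+ * the caller know the type (and therefore the payload size) of the next
+ * record before reading its payload.
+ */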
+
+static void
+objlist_create(struct objlist *list)
+{
+	list_create(&list->list, sizeof (struct receive_objnode),
+	    offsetof(struct receive_objnode, node));
+	list->last_lookup = 0;
+}
+
+static void
+objlist_destroy(struct objlist *list)
+{
+	for (struct receive_objnode *n = list_remove_head(&list->list);
+	    n != NULL; n = list_remove_head(&list->list)) {
+		kmem_free(n, sizeof (*n));
+	}
+	list_destroy(&list->list);
+}
+
+/*
+ * Check whether the specified object number is contained in the objlist.
+ * In the process, remove from the list all object numbers smaller than the
+ * one specified.  Consequently, a lookup of an object number smaller than a
+ * previously looked-up one always returns false, so all lookups must be
+ * done in ascending order.
+ */
+static boolean_t
+objlist_exists(struct objlist *list, uint64_t object)
+{
+	struct receive_objnode *node = list_head(&list->list);
+	ASSERT3U(object, >=, list->last_lookup);
+	list->last_lookup = object;
+	while (node != NULL && node->object < object) {
+		VERIFY3P(node, ==, list_remove_head(&list->list));
+		kmem_free(node, sizeof (*node));
+		node = list_head(&list->list);
+	}
+	return (node != NULL && node->object == object);
+}
+
+/*
+ * The objlist is a list of object numbers stored in ascending order.
+ * However, insertion does not seek out the correct location for a new
+ * object number; it simply appends to the tail.  Users must therefore
+ * take care to insert object numbers only in ascending order.
+ */
+static void
+objlist_insert(struct objlist *list, uint64_t object)
+{
+	struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
+	node->object = object;
+#ifdef ZFS_DEBUG
+	struct receive_objnode *last_object = list_tail(&list->list);
+	uint64_t last_objnum = (last_object != NULL ? last_object->object : 0);
+	ASSERT3U(node->object, >, last_objnum);
+#endif
+	list_insert_tail(&list->list, node);
+}
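+
+/*
+ * Illustrative usage (a sketch, not part of this change); both inserts
+ * and lookups must be made in ascending object order:
+ *
+ *	struct objlist list;
+ *	objlist_create(&list);
+ *	objlist_insert(&list, 5);
+ *	objlist_insert(&list, 10);
+ *	ASSERT(objlist_exists(&list, 5));	(5 stays at the head)
+ *	ASSERT(!objlist_exists(&list, 7));	(trims 5 from the list)
+ *	ASSERT(objlist_exists(&list, 10));
+ *	objlist_destroy(&list);
+ */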
+
+/*
+ * Issue the prefetch reads for any necessary indirect blocks.
+ *
+ * We use the object ignore list to tell us whether or not to issue prefetches
+ * for a given object.  We do this for both correctness (in case the blocksize
+ * of an object has changed) and performance (if the object doesn't exist, don't
+ * needlessly try to issue prefetches).  We also trim the list as we go through
+ * the stream to prevent it from growing to an unbounded size.
+ *
+ * The object numbers within will always be in sorted order, and any write
+ * records we see will also be in sorted order, but they're not sorted with
+ * respect to each other (i.e. we can get several object records before
+ * receiving each object's write records).  As a result, once we've reached a
+ * given object number, we can safely remove any reference to lower object
+ * numbers in the ignore list. In practice, we receive up to 32 object records
+ * before receiving write records, so the list can have up to 32 nodes in it.
+ */
+/* ARGSUSED */
+static void
+receive_read_prefetch(struct receive_arg *ra,
+    uint64_t object, uint64_t offset, uint64_t length)
+{
+	if (!objlist_exists(&ra->ignore_objlist, object)) {
+		dmu_prefetch(ra->os, object, 1, offset, length,
+		    ZIO_PRIORITY_SYNC_READ);
+	}
+}
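+
+/*
+ * Illustrative record ordering (a sketch of why trimming the ignore
+ * list is safe):
+ *
+ *	OBJECT(3) OBJECT(5) OBJECT(9) WRITE(3,..) WRITE(5,..) WRITE(9,..)
+ *
+ * By the time a write record for object 9 is seen, no later record can
+ * refer to an object numbered below 9, so those ignore-list entries can
+ * be discarded.
+ */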
+
+/*
+ * Read records off the stream, issuing any necessary prefetches.
+ */
 static int
-restore_write(struct restorearg *ra, objset_t *os,
-    struct drr_write *drrw)
+receive_read_record(struct receive_arg *ra)
 {
-	dmu_tx_t *tx;
-	void *data;
 	int err;
 
-	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
-	    drrw->drr_type >= DMU_OT_NUMTYPES)
-		return (EINVAL);
-
-	data = restore_read(ra, drrw->drr_length);
-	if (data == NULL)
-		return (ra->err);
-
-	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
-		return (EINVAL);
-
-	tx = dmu_tx_create(os);
+	switch (ra->rrd->header.drr_type) {
+	case DRR_OBJECT:
+	{
+		struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
+		uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
+		void *buf = kmem_zalloc(size, KM_SLEEP);
+		dmu_object_info_t doi;
+		err = receive_read_payload_and_next_header(ra, size, buf);
+		if (err != 0) {
+			kmem_free(buf, size);
+			return (err);
+		}
+		err = dmu_object_info(ra->os, drro->drr_object, &doi);
+		/*
+		 * See receive_read_prefetch for an explanation of why we're
+		 * storing this object in the ignore_objlist.
+		 */
+		if (err == ENOENT ||
+		    (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
+			objlist_insert(&ra->ignore_objlist, drro->drr_object);
+			err = 0;
+		}
+		return (err);
+	}
+	case DRR_FREEOBJECTS:
+	{
+		err = receive_read_payload_and_next_header(ra, 0, NULL);
+		return (err);
+	}
+	case DRR_WRITE:
+	{
+		struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
+		arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
+		    drrw->drr_length);
+
+		err = receive_read_payload_and_next_header(ra,
+		    drrw->drr_length, abuf->b_data);
+		if (err != 0) {
+			dmu_return_arcbuf(abuf);
+			return (err);
+		}
+		ra->rrd->write_buf = abuf;
+		receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
+		    drrw->drr_length);
+		return (err);
+	}
+	case DRR_WRITE_BYREF:
+	{
+		struct drr_write_byref *drrwb =
+		    &ra->rrd->header.drr_u.drr_write_byref;
+		err = receive_read_payload_and_next_header(ra, 0, NULL);
+		receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset,
+		    drrwb->drr_length);
+		return (err);
+	}
+	case DRR_WRITE_EMBEDDED:
+	{
+		struct drr_write_embedded *drrwe =
+		    &ra->rrd->header.drr_u.drr_write_embedded;
+		uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
+		void *buf = kmem_zalloc(size, KM_SLEEP);
+
+		err = receive_read_payload_and_next_header(ra, size, buf);
+		if (err != 0) {
+			kmem_free(buf, size);
+			return (err);
+		}
 
-	dmu_tx_hold_write(tx, drrw->drr_object,
-	    drrw->drr_offset, drrw->drr_length);
-	err = dmu_tx_assign(tx, TXG_WAIT);
-	if (err) {
-		dmu_tx_abort(tx);
+		receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset,
+		    drrwe->drr_length);
 		return (err);
 	}
-	if (ra->byteswap)
-		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
-	dmu_write(os, drrw->drr_object,
-	    drrw->drr_offset, drrw->drr_length, data, tx);
-	dmu_tx_commit(tx);
-	return (0);
+	case DRR_FREE:
+	{
+		/*
+		 * It might be beneficial to prefetch indirect blocks here, but
+		 * we don't really have the data to decide for sure.
+		 */
+		err = receive_read_payload_and_next_header(ra, 0, NULL);
+		return (err);
+	}
+	case DRR_END:
+	{
+		struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
+		if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
+			return (SET_ERROR(ECKSUM));
+		return (0);
+	}
+	case DRR_SPILL:
+	{
+		struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
+		void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP);
+		err = receive_read_payload_and_next_header(ra, drrs->drr_length,
+		    buf);
+		if (err != 0)
+			kmem_free(buf, drrs->drr_length);
+		return (err);
+	}
+	default:
+		return (SET_ERROR(EINVAL));
+	}
 }
 
 /*
- * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
- * streams to refer to a copy of the data that is already on the
- * system because it came in earlier in the stream.  This function
- * finds the earlier copy of the data, and uses that copy instead of
- * data from the stream to fulfill this write.
+ * Commit the records to the pool.
  */
 static int
-restore_write_byref(struct restorearg *ra, objset_t *os,
-    struct drr_write_byref *drrwbr)
+receive_process_record(struct receive_writer_arg *rwa,
+    struct receive_record_arg *rrd)
 {
-	dmu_tx_t *tx;
 	int err;
-	guid_map_entry_t gmesrch;
-	guid_map_entry_t *gmep;
-	avl_index_t	where;
-	objset_t *ref_os = NULL;
-	dmu_buf_t *dbp;
 
-	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
-		return (EINVAL);
+	/* Processing in order, therefore bytes_read should be increasing. */
+	ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
+	rwa->bytes_read = rrd->bytes_read;
 
-	/*
-	 * If the GUID of the referenced dataset is different from the
-	 * GUID of the target dataset, find the referenced dataset.
-	 */
-	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
-		gmesrch.guid = drrwbr->drr_refguid;
-		if ((gmep = avl_find(&ra->guid_to_ds_map, &gmesrch,
-		    &where)) == NULL) {
-			return (EINVAL);
-		}
-		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
-			return (EINVAL);
-	} else {
-		ref_os = os;
+	switch (rrd->header.drr_type) {
+	case DRR_OBJECT:
+	{
+		struct drr_object *drro = &rrd->header.drr_u.drr_object;
+		err = receive_object(rwa, drro, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
+		return (err);
 	}
-
-	if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
-	    drrwbr->drr_refoffset, FTAG, &dbp))
+	case DRR_FREEOBJECTS:
+	{
+		struct drr_freeobjects *drrfo =
+		    &rrd->header.drr_u.drr_freeobjects;
+		return (receive_freeobjects(rwa, drrfo));
+	}
+	case DRR_WRITE:
+	{
+		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+		err = receive_write(rwa, drrw, rrd->write_buf);
+		/* if receive_write() is successful, it consumes the arc_buf */
+		if (err != 0)
+			dmu_return_arcbuf(rrd->write_buf);
+		rrd->write_buf = NULL;
+		rrd->payload = NULL;
 		return (err);
-
-	tx = dmu_tx_create(os);
-
-	dmu_tx_hold_write(tx, drrwbr->drr_object,
-	    drrwbr->drr_offset, drrwbr->drr_length);
-	err = dmu_tx_assign(tx, TXG_WAIT);
-	if (err) {
-		dmu_tx_abort(tx);
+	}
+	case DRR_WRITE_BYREF:
+	{
+		struct drr_write_byref *drrwbr =
+		    &rrd->header.drr_u.drr_write_byref;
+		return (receive_write_byref(rwa, drrwbr));
+	}
+	case DRR_WRITE_EMBEDDED:
+	{
+		struct drr_write_embedded *drrwe =
+		    &rrd->header.drr_u.drr_write_embedded;
+		err = receive_write_embedded(rwa, drrwe, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
 		return (err);
 	}
-	dmu_write(os, drrwbr->drr_object,
-	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
-	dmu_buf_rele(dbp, FTAG);
-	dmu_tx_commit(tx);
-	return (0);
+	case DRR_FREE:
+	{
+		struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+		return (receive_free(rwa, drrf));
+	}
+	case DRR_SPILL:
+	{
+		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+		err = receive_spill(rwa, drrs, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
+		return (err);
+	}
+	default:
+		return (SET_ERROR(EINVAL));
+	}
 }
 
-/* ARGSUSED */
-static int
-restore_free(struct restorearg *ra, objset_t *os,
-    struct drr_free *drrf)
+/*
+ * dmu_recv_stream's worker thread; pull records off the queue, and then call
+ * receive_process_record().  When we're done, signal the main thread and exit.
+ */
+static void
+receive_writer_thread(void *arg)
 {
-	int err;
-
-	if (drrf->drr_length != -1ULL &&
-	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
-		return (EINVAL);
+	struct receive_writer_arg *rwa = arg;
+	struct receive_record_arg *rrd;
+	for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
+	    rrd = bqueue_dequeue(&rwa->q)) {
+		/*
+		 * If there's an error, the main thread will stop putting things
+		 * on the queue, but we need to clear everything in it before we
+		 * can exit.
+		 */
+		if (rwa->err == 0) {
+			rwa->err = receive_process_record(rwa, rrd);
+		} else if (rrd->write_buf != NULL) {
+			dmu_return_arcbuf(rrd->write_buf);
+			rrd->write_buf = NULL;
+			rrd->payload = NULL;
+		} else if (rrd->payload != NULL) {
+			kmem_free(rrd->payload, rrd->payload_size);
+			rrd->payload = NULL;
+		}
+		kmem_free(rrd, sizeof (*rrd));
+	}
+	kmem_free(rrd, sizeof (*rrd));
+	mutex_enter(&rwa->mutex);
+	rwa->done = B_TRUE;
+	cv_signal(&rwa->cv);
+	mutex_exit(&rwa->mutex);
+	thread_exit();
+}
 
-	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
-		return (EINVAL);
+static int
+resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
+{
+	uint64_t val;
+	objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset;
+	uint64_t dsobj = dmu_objset_id(ra->os);
+	uint64_t resume_obj, resume_off;
+
+	if (nvlist_lookup_uint64(begin_nvl,
+	    "resume_object", &resume_obj) != 0 ||
+	    nvlist_lookup_uint64(begin_nvl,
+	    "resume_offset", &resume_off) != 0) {
+		return (SET_ERROR(EINVAL));
+	}
+	VERIFY0(zap_lookup(mos, dsobj,
+	    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
+	if (resume_obj != val)
+		return (SET_ERROR(EINVAL));
+	VERIFY0(zap_lookup(mos, dsobj,
+	    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
+	if (resume_off != val)
+		return (SET_ERROR(EINVAL));
 
-	err = dmu_free_long_range(os, drrf->drr_object,
-	    drrf->drr_offset, drrf->drr_length);
-	return (err);
+	return (0);
 }
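+
+/*
+ * Illustrative sketch (an assumption about the sending side, shown for
+ * context): the values checked above arrive packed in the BEGIN record's
+ * payload, roughly as if built with:
+ *
+ *	nvlist_t *nvl = fnvlist_alloc();
+ *	fnvlist_add_uint64(nvl, "resume_object", resume_obj);
+ *	fnvlist_add_uint64(nvl, "resume_offset", resume_off);
+ */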
 
 /*
+ * Read in the stream's records, one by one, and apply them to the pool.  There
+ * are two threads involved; the thread that calls this function will spin up a
+ * worker thread, read the records off the stream one by one, and issue
+ * prefetches for any necessary indirect blocks.  It will then push the records
+ * onto an internal blocking queue.  The worker thread will pull the records off
+ * the queue, and actually write the data into the DMU.  This way, the worker
+ * thread doesn't have to wait for reads to complete, since everything it needs
+ * (the indirect blocks) will be prefetched.
+ *
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
 int
-dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
+dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
+    int cleanup_fd, uint64_t *action_handlep)
 {
-	struct restorearg ra = { 0 };
-	dmu_replay_record_t *drr;
-	objset_t *os;
-	zio_cksum_t pcksum;
-	guid_map_entry_t *gmep;
+	int err = 0;
+	struct receive_arg ra = { 0 };
+	struct receive_writer_arg rwa = { 0 };
 	int featureflags;
+	nvlist_t *begin_nvl = NULL;
 
-	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
-		ra.byteswap = TRUE;
-
-	{
-		/* compute checksum of drr_begin record */
-		dmu_replay_record_t *drr;
-		drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
-
-		drr->drr_type = DRR_BEGIN;
-		drr->drr_u.drr_begin = *drc->drc_drrb;
-		if (ra.byteswap) {
-			fletcher_4_incremental_byteswap(drr,
-			    sizeof (dmu_replay_record_t), &ra.cksum);
-		} else {
-			fletcher_4_incremental_native(drr,
-			    sizeof (dmu_replay_record_t), &ra.cksum);
-		}
-		kmem_free(drr, sizeof (dmu_replay_record_t));
-	}
+	ra.byteswap = drc->drc_byteswap;
+	ra.cksum = drc->drc_cksum;
+	ra.td = curthread;
+	ra.fp = fp;
+	ra.voff = *voffp;
 
-	if (ra.byteswap) {
-		struct drr_begin *drrb = drc->drc_drrb;
-		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
-		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
-		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
-		drrb->drr_type = BSWAP_32(drrb->drr_type);
-		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
-		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
+	if (dsl_dataset_is_zapified(drc->drc_ds)) {
+		(void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
+		    drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
+		    sizeof (ra.bytes_read), 1, &ra.bytes_read);
 	}
 
-	ra.vp = vp;
-	ra.voff = *voffp;
-	ra.bufsize = 1<<20;
-	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
+	objlist_create(&ra.ignore_objlist);
 
 	/* these were verified in dmu_recv_begin */
-	ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
+	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
 	    DMU_SUBSTREAM);
-	ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);
+	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
 
 	/*
 	 * Open the objset we are modifying.
 	 */
-	VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);
+	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os));
 
-	ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
+	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
 
 	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
 
 	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
 	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
-		avl_create(&ra.guid_to_ds_map, guid_compare,
-		    sizeof (guid_map_entry_t),
-		    offsetof(guid_map_entry_t, avlnode));
-		(void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid,
-		    (void *)&ra.guid_to_ds_map,
-		    DS_FIND_CHILDREN);
-	}
+		minor_t minor;
 
-	/*
-	 * Read records and process them.
-	 */
-	pcksum = ra.cksum;
-	while (ra.err == 0 &&
-	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
-		if (issig(JUSTLOOKING) && issig(FORREAL)) {
-			ra.err = EINTR;
+		if (cleanup_fd == -1) {
+			err = SET_ERROR(EBADF);
+			goto out;
+		}
+		err = zfs_onexit_fd_hold(cleanup_fd, &minor);
+		if (err != 0) {
+			cleanup_fd = -1;
+			goto out;
+		}
 
-		if (ra.byteswap)
-			backup_byteswap(drr);
+		if (*action_handlep == 0) {
+			rwa.guid_to_ds_map =
+			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+			avl_create(rwa.guid_to_ds_map, guid_compare,
+			    sizeof (guid_map_entry_t),
+			    offsetof(guid_map_entry_t, avlnode));
+			err = zfs_onexit_add_cb(minor,
+			    free_guid_map_onexit, rwa.guid_to_ds_map,
+			    action_handlep);
+			if (err != 0)
+				goto out;
+		} else {
+			err = zfs_onexit_cb_data(minor, *action_handlep,
+			    (void **)&rwa.guid_to_ds_map);
+			if (err != 0)
+				goto out;
+		}
+
+		drc->drc_guid_to_ds_map = rwa.guid_to_ds_map;
+	}
+
+	uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
+	void *payload = NULL;
+	if (payloadlen != 0)
+		payload = kmem_alloc(payloadlen, KM_SLEEP);
+
+	err = receive_read_payload_and_next_header(&ra, payloadlen, payload);
+	if (err != 0) {
+		if (payloadlen != 0)
+			kmem_free(payload, payloadlen);
+		goto out;
+	}
+	if (payloadlen != 0) {
+		err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP);
+		kmem_free(payload, payloadlen);
+		if (err != 0)
+			goto out;
+	}
 
-		switch (drr->drr_type) {
-		case DRR_OBJECT:
-		{
-			/*
-			 * We need to make a copy of the record header,
-			 * because restore_{object,write} may need to
-			 * restore_read(), which will invalidate drr.
-			 */
-			struct drr_object drro = drr->drr_u.drr_object;
-			ra.err = restore_object(&ra, os, &drro);
-			break;
-		}
-		case DRR_FREEOBJECTS:
-		{
-			struct drr_freeobjects drrfo =
-			    drr->drr_u.drr_freeobjects;
-			ra.err = restore_freeobjects(&ra, os, &drrfo);
-			break;
-		}
-		case DRR_WRITE:
-		{
-			struct drr_write drrw = drr->drr_u.drr_write;
-			ra.err = restore_write(&ra, os, &drrw);
-			break;
-		}
-		case DRR_WRITE_BYREF:
-		{
-			struct drr_write_byref drrwbr =
-			    drr->drr_u.drr_write_byref;
-			ra.err = restore_write_byref(&ra, os, &drrwbr);
+	if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
+		err = resume_check(&ra, begin_nvl);
+		if (err != 0)
+			goto out;
+	}
+
+	(void) bqueue_init(&rwa.q, zfs_recv_queue_length,
+	    offsetof(struct receive_record_arg, node));
+	cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL);
+	rwa.os = ra.os;
+	rwa.byteswap = drc->drc_byteswap;
+	rwa.resumable = drc->drc_resumable;
+
+	(void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0,
+	    TS_RUN, minclsyspri);
+	/*
+	 * We're reading rwa.err without locks, which is safe since we are the
+	 * only reader, and the worker thread is the only writer.  It's ok if we
+	 * miss a write for an iteration or two of the loop, since the writer
+	 * thread will keep freeing records we send it until we send it an eos
+	 * marker.
+	 *
+	 * We can leave this loop in 3 ways:  First, if rwa.err is
+	 * non-zero.  In that case, the writer thread will free the rrd we just
+	 * pushed.  Second, if we're interrupted; in that case, either it's the
+	 * first loop and ra.rrd was never allocated, or it's later, and ra.rrd
+	 * has been handed off to the writer thread, which will free it.
+	 * Finally, if receive_read_record fails or we're at the end of the
+	 * stream, then we free ra.rrd and exit.
+	 */
+	while (rwa.err == 0) {
+		if (issig(JUSTLOOKING) && issig(FORREAL)) {
+			err = SET_ERROR(EINTR);
 			break;
 		}
-		case DRR_FREE:
-		{
-			struct drr_free drrf = drr->drr_u.drr_free;
-			ra.err = restore_free(&ra, os, &drrf);
+
+		ASSERT3P(ra.rrd, ==, NULL);
+		ra.rrd = ra.next_rrd;
+		ra.next_rrd = NULL;
+		/* Allocates and loads header into ra.next_rrd */
+		err = receive_read_record(&ra);
+
+		if (ra.rrd->header.drr_type == DRR_END || err != 0) {
+			kmem_free(ra.rrd, sizeof (*ra.rrd));
+			ra.rrd = NULL;
 			break;
 		}
-		case DRR_END:
-		{
-			struct drr_end drre = drr->drr_u.drr_end;
-			/*
-			 * We compare against the *previous* checksum
-			 * value, because the stored checksum is of
-			 * everything before the DRR_END record.
-			 */
-			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
-				ra.err = ECKSUM;
-			goto out;
-		}
-		default:
-			ra.err = EINVAL;
-			goto out;
-		}
-		pcksum = ra.cksum;
-	}
-	ASSERT(ra.err != 0);
+
+		bqueue_enqueue(&rwa.q, ra.rrd,
+		    sizeof (struct receive_record_arg) + ra.rrd->payload_size);
+		ra.rrd = NULL;
+	}
+	if (ra.next_rrd == NULL)
+		ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP);
+	ra.next_rrd->eos_marker = B_TRUE;
+	bqueue_enqueue(&rwa.q, ra.next_rrd, 1);
+
+	mutex_enter(&rwa.mutex);
+	while (!rwa.done) {
+		cv_wait(&rwa.cv, &rwa.mutex);
+	}
+	mutex_exit(&rwa.mutex);
+
+	cv_destroy(&rwa.cv);
+	mutex_destroy(&rwa.mutex);
+	bqueue_destroy(&rwa.q);
+	if (err == 0)
+		err = rwa.err;
 
 out:
-	if (ra.err != 0) {
+	nvlist_free(begin_nvl);
+	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
+		zfs_onexit_fd_rele(cleanup_fd);
+
+	if (err != 0) {
 		/*
-		 * destroy what we created, so we don't leave it in the
-		 * inconsistent restoring state.
+		 * Clean up references.  If the receive is not resumable,
+		 * destroy what we created, so we don't leave it in
+		 * an inconsistent state.
 		 */
-		txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
-
-		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
-		    B_FALSE);
-		if (drc->drc_real_ds != drc->drc_logical_ds) {
-			mutex_exit(&drc->drc_logical_ds->ds_recvlock);
-			dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
-		}
-	}
-
-	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
-		void *cookie = NULL;
-
-		while (gmep = avl_destroy_nodes(&ra.guid_to_ds_map, &cookie)) {
-			dsl_dataset_rele(gmep->gme_ds, &ra.guid_to_ds_map);
-			kmem_free(gmep, sizeof (guid_map_entry_t));
-		}
-		avl_destroy(&ra.guid_to_ds_map);
+		dmu_recv_cleanup_ds(drc);
 	}
 
-	kmem_free(ra.buf, ra.bufsize);
 	*voffp = ra.voff;
-	return (ra.err);
+	objlist_destroy(&ra.ignore_objlist);
+	return (err);
 }
 
-struct recvendsyncarg {
-	char *tosnap;
-	uint64_t creation_time;
-	uint64_t toguid;
-};
-
 static int
-recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dmu_recv_end_check(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	struct recvendsyncarg *resa = arg2;
+	dmu_recv_cookie_t *drc = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	int error;
+
+	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
+
+	if (!drc->drc_newfs) {
+		dsl_dataset_t *origin_head;
+
+		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
+		if (error != 0)
+			return (error);
+		if (drc->drc_force) {
+			/*
+			 * We will destroy any snapshots in tofs (i.e. before
+			 * origin_head) that are after the origin (which is
+			 * the snap before drc_ds, because drc_ds cannot
+			 * have any snaps of its own).
+			 */
+			uint64_t obj;
+
+			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+			while (obj !=
+			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+				dsl_dataset_t *snap;
+				error = dsl_dataset_hold_obj(dp, obj, FTAG,
+				    &snap);
+				if (error != 0)
+					break;
+				if (snap->ds_dir != origin_head->ds_dir)
+					error = SET_ERROR(EINVAL);
+				if (error == 0)  {
+					error = dsl_destroy_snapshot_check_impl(
+					    snap, B_FALSE);
+				}
+				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+				dsl_dataset_rele(snap, FTAG);
+				if (error != 0)
+					break;
+			}
+			if (error != 0) {
+				dsl_dataset_rele(origin_head, FTAG);
+				return (error);
+			}
+		}
+		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
+		    origin_head, drc->drc_force, drc->drc_owner, tx);
+		if (error != 0) {
+			dsl_dataset_rele(origin_head, FTAG);
+			return (error);
+		}
+		error = dsl_dataset_snapshot_check_impl(origin_head,
+		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
+		dsl_dataset_rele(origin_head, FTAG);
+		if (error != 0)
+			return (error);
 
-	return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
+		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
+	} else {
+		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
+		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
+	}
+	return (error);
 }
 
 static void
-recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	struct recvendsyncarg *resa = arg2;
+	dmu_recv_cookie_t *drc = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
 
-	dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx);
+	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
+	    tx, "snap=%s", drc->drc_tosnap);
 
-	/* set snapshot's creation time and guid */
-	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-	ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
-	ds->ds_prev->ds_phys->ds_guid = resa->toguid;
-	ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+	if (!drc->drc_newfs) {
+		dsl_dataset_t *origin_head;
 
-	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
+		    &origin_head));
+
+		if (drc->drc_force) {
+			/*
+			 * Destroy any snapshots of drc_tofs (origin_head)
+			 * after the origin (the snap before drc_ds).
+			 */
+			uint64_t obj;
+
+			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+			while (obj !=
+			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+				dsl_dataset_t *snap;
+				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
+				    &snap));
+				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
+				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+				dsl_destroy_snapshot_sync_impl(snap,
+				    B_FALSE, tx);
+				dsl_dataset_rele(snap, FTAG);
+			}
+		}
+		VERIFY3P(drc->drc_ds->ds_prev, ==,
+		    origin_head->ds_prev);
+
+		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
+		    origin_head, tx);
+		dsl_dataset_snapshot_sync_impl(origin_head,
+		    drc->drc_tosnap, tx);
+
+		/* set snapshot's creation time and guid */
+		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
+		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
+		    drc->drc_drrb->drr_creation_time;
+		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
+		    drc->drc_drrb->drr_toguid;
+		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
+		    ~DS_FLAG_INCONSISTENT;
+
+		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
+		dsl_dataset_phys(origin_head)->ds_flags &=
+		    ~DS_FLAG_INCONSISTENT;
+
+		drc->drc_newsnapobj =
+		    dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+
+		dsl_dataset_rele(origin_head, FTAG);
+		dsl_destroy_head_sync_impl(drc->drc_ds, tx);
+
+		if (drc->drc_owner != NULL)
+			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
+	} else {
+		dsl_dataset_t *ds = drc->drc_ds;
+
+		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
+
+		/* set snapshot's creation time and guid */
+		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
+		    drc->drc_drrb->drr_creation_time;
+		dsl_dataset_phys(ds->ds_prev)->ds_guid =
+		    drc->drc_drrb->drr_toguid;
+		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
+		    ~DS_FLAG_INCONSISTENT;
+
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+		if (dsl_dataset_has_resume_receive_state(ds)) {
+			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_FROMGUID, tx);
+			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_OBJECT, tx);
+			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_OFFSET, tx);
+			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_BYTES, tx);
+			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_TOGUID, tx);
+			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+			    DS_FIELD_RESUME_TONAME, tx);
+		}
+		drc->drc_newsnapobj =
+		    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
+	}
+	/*
+	 * Release the hold from dmu_recv_begin.  This must be done before
+	 * we return to open context, so that when we free the dataset's dnode,
+	 * we can evict its bonus buffer.
+	 */
+	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+	drc->drc_ds = NULL;
 }
 
 static int
-dmu_recv_existing_end(dmu_recv_cookie_t *drc)
+add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
 {
-	struct recvendsyncarg resa;
-	dsl_dataset_t *ds = drc->drc_logical_ds;
+	dsl_pool_t *dp;
+	dsl_dataset_t *snapds;
+	guid_map_entry_t *gmep;
 	int err;
 
-	/*
-	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
-	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
-	 * can close it.
-	 */
-	txg_wait_synced(ds->ds_dir->dd_pool, 0);
-
-	if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
-		err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
-		    drc->drc_force);
-		if (err)
-			goto out;
-	} else {
-		mutex_exit(&ds->ds_recvlock);
-		dsl_dataset_rele(ds, dmu_recv_tag);
-		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
-		    B_FALSE);
-		return (EBUSY);
-	}
-
-	resa.creation_time = drc->drc_drrb->drr_creation_time;
-	resa.toguid = drc->drc_drrb->drr_toguid;
-	resa.tosnap = drc->drc_tosnap;
-
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    recv_end_check, recv_end_sync, ds, &resa, 3);
-	if (err) {
-		/* swap back */
-		(void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
-	}
+	ASSERT(guid_map != NULL);
 
-out:
-	mutex_exit(&ds->ds_recvlock);
-	dsl_dataset_disown(ds, dmu_recv_tag);
-	(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
+	err = dsl_pool_hold(name, FTAG, &dp);
+	if (err != 0)
+		return (err);
+	gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
+	err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
+	if (err == 0) {
+		gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
+		gmep->gme_ds = snapds;
+		avl_add(guid_map, gmep);
+		dsl_dataset_long_hold(snapds, gmep);
+	} else
+		kmem_free(gmep, sizeof (*gmep));
+
+	dsl_pool_rele(dp, FTAG);
 	return (err);
 }
 
+static int dmu_recv_end_modified_blocks = 3;
+
 static int
-dmu_recv_new_end(dmu_recv_cookie_t *drc)
+dmu_recv_existing_end(dmu_recv_cookie_t *drc)
 {
-	struct recvendsyncarg resa;
-	dsl_dataset_t *ds = drc->drc_logical_ds;
-	int err;
-
+#ifdef _KERNEL
 	/*
-	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
-	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
-	 * can close it.
-	 */
-	txg_wait_synced(ds->ds_dir->dd_pool, 0);
-
-	resa.creation_time = drc->drc_drrb->drr_creation_time;
-	resa.toguid = drc->drc_drrb->drr_toguid;
-	resa.tosnap = drc->drc_tosnap;
-
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    recv_end_check, recv_end_sync, ds, &resa, 3);
-	if (err) {
-		/* clean up the fs we just recv'd into */
-		(void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
-	} else {
-		/* release the hold from dmu_recv_begin */
-		dsl_dataset_disown(ds, dmu_recv_tag);
-	}
-	return (err);
+	 * We will be destroying the ds; make sure its origin is unmounted if
+	 * necessary.
+	 */
+	char name[ZFS_MAX_DATASET_NAME_LEN];
+	dsl_dataset_name(drc->drc_ds, name);
+	zfs_destroy_unmount_origin(name);
+#endif
+
+	return (dsl_sync_task(drc->drc_tofs,
+	    dmu_recv_end_check, dmu_recv_end_sync, drc,
+	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
+}
+
+static int
+dmu_recv_new_end(dmu_recv_cookie_t *drc)
+{
+	return (dsl_sync_task(drc->drc_tofs,
+	    dmu_recv_end_check, dmu_recv_end_sync, drc,
+	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
 }
 
 int
-dmu_recv_end(dmu_recv_cookie_t *drc)
+dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
 {
-	if (drc->drc_logical_ds != drc->drc_real_ds)
-		return (dmu_recv_existing_end(drc));
+	int error;
+
+	drc->drc_owner = owner;
+
+	if (drc->drc_newfs)
+		error = dmu_recv_new_end(drc);
 	else
-		return (dmu_recv_new_end(drc));
+		error = dmu_recv_existing_end(drc);
+
+	if (error != 0) {
+		dmu_recv_cleanup_ds(drc);
+	} else if (drc->drc_guid_to_ds_map != NULL) {
+		(void) add_ds_to_guidmap(drc->drc_tofs,
+		    drc->drc_guid_to_ds_map,
+		    drc->drc_newsnapobj);
+	}
+	return (error);
+}
+
+/*
+ * Return TRUE if this objset is currently being received into.
+ */
+boolean_t
+dmu_objset_is_receiving(objset_t *os)
+{
+	return (os->os_dsl_dataset != NULL &&
+	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
 }
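+
+/*
+ * Illustrative use (a sketch; actual callers vary): code that should not
+ * interfere with an in-flight receive can test this, e.g.:
+ *
+ *	if (dmu_objset_is_receiving(os))
+ *		return (0);
+ *
+ * i.e. skip work that assumes the objset is not being rewritten
+ * underneath it.
+ */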
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dmu_traverse.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c	27 Feb 2010 22:30:49 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c	10 Oct 2016 11:09:56 -0000
@@ -19,8 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2015 Chunwei Chen. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -33,40 +34,57 @@
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_impl.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
 #include <sys/callb.h>
+#include <sys/zfeature.h>
 
-struct prefetch_data {
+int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;	/* 50MB */
+boolean_t send_holes_without_birth_time = B_TRUE;
+
+#ifdef _KERNEL
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, send_holes_without_birth_time, CTLFLAG_RWTUN,
+    &send_holes_without_birth_time, 0, "Send holes without birth time");
+#endif
+
+typedef struct prefetch_data {
 	kmutex_t pd_mtx;
 	kcondvar_t pd_cv;
-	int pd_blks_max;
-	int pd_blks_fetched;
+	int32_t pd_bytes_fetched;
 	int pd_flags;
 	boolean_t pd_cancel;
 	boolean_t pd_exited;
-};
+	zbookmark_phys_t pd_resume;
+} prefetch_data_t;
 
-struct traverse_data {
+typedef struct traverse_data {
 	spa_t *td_spa;
 	uint64_t td_objset;
 	blkptr_t *td_rootbp;
 	uint64_t td_min_txg;
+	zbookmark_phys_t *td_resume;
 	int td_flags;
-	struct prefetch_data *td_pfd;
+	prefetch_data_t *td_pfd;
+	boolean_t td_paused;
+	uint64_t td_hole_birth_enabled_txg;
 	blkptr_cb_t *td_func;
 	void *td_arg;
-};
+	boolean_t td_realloc_possible;
+} traverse_data_t;
 
-static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
-    arc_buf_t *buf, uint64_t objset, uint64_t object);
+static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
+    uint64_t objset, uint64_t object);
+static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
+    uint64_t objset, uint64_t object);
 
-/* ARGSUSED */
 static int
 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
-	struct traverse_data *td = arg;
-	zbookmark_t zb;
+	traverse_data_t *td = arg;
+	zbookmark_phys_t zb;
 
-	if (bp->blk_birth == 0)
+	if (BP_IS_HOLE(bp))
 		return (0);
 
 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
@@ -80,25 +98,24 @@ traverse_zil_block(zilog_t *zilog, blkpt
 	return (0);
 }
 
-/* ARGSUSED */
 static int
 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 {
-	struct traverse_data *td = arg;
+	traverse_data_t *td = arg;
 
 	if (lrc->lrc_txtype == TX_WRITE) {
 		lr_write_t *lr = (lr_write_t *)lrc;
 		blkptr_t *bp = &lr->lr_blkptr;
-		zbookmark_t zb;
+		zbookmark_phys_t zb;
 
-		if (bp->blk_birth == 0)
+		if (BP_IS_HOLE(bp))
 			return (0);
 
 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
 			return (0);
 
-		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL,
-		    lr->lr_offset / BP_GET_LSIZE(bp));
+		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
+		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 
 		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
 		    td->td_arg);
@@ -107,7 +124,7 @@ traverse_zil_record(zilog_t *zilog, lr_t
 }
 
 static void
-traverse_zil(struct traverse_data *td, zil_header_t *zh)
+traverse_zil(traverse_data_t *td, zil_header_t *zh)
 {
 	uint64_t claim_txg = zh->zh_claim_txg;
 	zilog_t *zilog;
@@ -127,184 +144,383 @@ traverse_zil(struct traverse_data *td, z
 	zil_free(zilog);
 }
 
+typedef enum resume_skip {
+	RESUME_SKIP_ALL,
+	RESUME_SKIP_NONE,
+	RESUME_SKIP_CHILDREN
+} resume_skip_t;
+
+/*
+ * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
+ * the block indicated by zb does not need to be visited at all. Returns
+ * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
+ * resume point. This indicates that this block should be visited but not its
+ * children (since they must have been visited in a previous traversal).
+ * Otherwise returns RESUME_SKIP_NONE.
+ */
+static resume_skip_t
+resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
+    const zbookmark_phys_t *zb)
+{
+	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
+		/*
+		 * If we already visited this bp & everything below,
+		 * don't bother doing it again.
+		 */
+		if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
+			return (RESUME_SKIP_ALL);
+
+		/*
+		 * If we found the block we're trying to resume from, zero
+		 * the bookmark out to indicate that we have resumed.
+		 */
+		if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
+			bzero(td->td_resume, sizeof (*zb));
+			if (td->td_flags & TRAVERSE_POST)
+				return (RESUME_SKIP_CHILDREN);
+		}
+	}
+	return (RESUME_SKIP_NONE);
+}
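+
+/*
+ * Illustrative summary of the cases above (not new behavior):
+ *
+ *	bookmark state                                  result
+ *	--------------                                  ------
+ *	subtree at zb already completed                 RESUME_SKIP_ALL
+ *	zb is the resume point, TRAVERSE_POST           RESUME_SKIP_CHILDREN
+ *	anything else (incl. resume point w/ PRE)       RESUME_SKIP_NONE
+ */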
+
+static void
+traverse_prefetch_metadata(traverse_data_t *td,
+    const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+
+	if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
+		return;
+	/*
+	 * If we are in the process of resuming, don't prefetch, because
+	 * some children will not be needed (and in fact may have already
+	 * been freed).
+	 */
+	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
+		return;
+	if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
+		return;
+	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
+		return;
+
+	(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
+	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+}
+
+static boolean_t
+prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
+{
+	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
+	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
+	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
+		return (B_FALSE);
+	return (B_TRUE);
+}
+
 static int
-traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
-    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
+traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
+    const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
-	zbookmark_t czb;
-	int err = 0, lasterr = 0;
+	zbookmark_phys_t czb;
+	int err = 0;
 	arc_buf_t *buf = NULL;
-	struct prefetch_data *pd = td->td_pfd;
+	prefetch_data_t *pd = td->td_pfd;
 	boolean_t hard = td->td_flags & TRAVERSE_HARD;
 
-	if (bp->blk_birth == 0) {
-		err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg);
-		return (err);
+	switch (resume_skip_check(td, dnp, zb)) {
+	case RESUME_SKIP_ALL:
+		return (0);
+	case RESUME_SKIP_CHILDREN:
+		goto post;
+	case RESUME_SKIP_NONE:
+		break;
+	default:
+		ASSERT(0);
 	}
 
-	if (bp->blk_birth <= td->td_min_txg)
+	if (bp->blk_birth == 0) {
+		/*
+		 * Since this block has a birth time of 0 it must be one of
+		 * two things: a hole created before the
+		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
+		 * which has always been a hole in an object.
+		 *
+		 * If a file is written sparsely, then the unwritten parts of
+		 * the file were "always holes" -- that is, they have been
+		 * holes since this object was allocated.  However, we (and
+		 * our callers) can not necessarily tell when an object was
+		 * allocated.  Therefore, if it's possible that this object
+		 * was freed and then its object number reused, we need to
+		 * visit all the holes with birth==0.
+		 *
+		 * If it isn't possible that the object number was reused,
+		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
+		 * all the blocks we will visit as part of this traversal,
+		 * then this hole must have always existed, so we can skip
+		 * it.  We visit blocks born after (exclusive) td_min_txg.
+		 *
+		 * Note that the meta-dnode cannot be reallocated.
+		 */
+		if (!send_holes_without_birth_time &&
+		    (!td->td_realloc_possible ||
+		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
+		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
+			return (0);
+	} else if (bp->blk_birth <= td->td_min_txg) {
 		return (0);
+	}
 
-	if (pd && !pd->pd_exited &&
-	    ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
-	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
+	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
+		uint64_t size = BP_GET_LSIZE(bp);
 		mutex_enter(&pd->pd_mtx);
-		ASSERT(pd->pd_blks_fetched >= 0);
-		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
+		ASSERT(pd->pd_bytes_fetched >= 0);
+		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
 			cv_wait(&pd->pd_cv, &pd->pd_mtx);
-		pd->pd_blks_fetched--;
+		pd->pd_bytes_fetched -= size;
 		cv_broadcast(&pd->pd_cv);
 		mutex_exit(&pd->pd_mtx);
 	}
 
-	if (td->td_flags & TRAVERSE_PRE) {
+	if (BP_IS_HOLE(bp)) {
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
-		if (err)
-			return (err);
+		if (err != 0)
+			goto post;
+		return (0);
+	}
+
+	if (td->td_flags & TRAVERSE_PRE) {
+		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
+		    td->td_arg);
+		if (err == TRAVERSE_VISIT_NO_CHILDREN)
+			return (0);
+		if (err != 0)
+			goto post;
 	}
 
 	if (BP_GET_LEVEL(bp) > 0) {
-		uint32_t flags = ARC_WAIT;
+		arc_flags_t flags = ARC_FLAG_WAIT;
 		int i;
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 
-		err = arc_read(NULL, td->td_spa, bp, pbuf,
-		    arc_getbuf_func, &buf,
+		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-		if (err)
-			return (err);
+		if (err != 0)
+			goto post;
+		cbp = buf->b_data;
+
+		for (i = 0; i < epb; i++) {
+			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+			    zb->zb_level - 1,
+			    zb->zb_blkid * epb + i);
+			traverse_prefetch_metadata(td, &cbp[i], &czb);
+		}
 
 		/* recursively visitbp() blocks below this */
-		cbp = buf->b_data;
-		for (i = 0; i < epb; i++, cbp++) {
+		for (i = 0; i < epb; i++) {
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
-			err = traverse_visitbp(td, dnp, buf, cbp, &czb);
-			if (err) {
-				if (!hard)
-					break;
-				lasterr = err;
-			}
+			err = traverse_visitbp(td, dnp, &cbp[i], &czb);
+			if (err != 0)
+				break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
-		uint32_t flags = ARC_WAIT;
+		arc_flags_t flags = ARC_FLAG_WAIT;
 		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 
-		err = arc_read(NULL, td->td_spa, bp, pbuf,
-		    arc_getbuf_func, &buf,
+		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-		if (err)
-			return (err);
+		if (err != 0)
+			goto post;
+		dnode_phys_t *child_dnp = buf->b_data;
+
+		for (i = 0; i < epb; i++) {
+			prefetch_dnode_metadata(td, &child_dnp[i],
+			    zb->zb_objset, zb->zb_blkid * epb + i);
+		}
 
 		/* recursively visitbp() blocks below this */
-		dnp = buf->b_data;
-		for (i = 0; i < epb; i++, dnp++) {
-			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
-			    zb->zb_blkid * epb + i);
-			if (err) {
-				if (!hard)
-					break;
-				lasterr = err;
-			}
+		for (i = 0; i < epb; i++) {
+			err = traverse_dnode(td, &child_dnp[i],
+			    zb->zb_objset, zb->zb_blkid * epb + i);
+			if (err != 0)
+				break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
-		uint32_t flags = ARC_WAIT;
-		objset_phys_t *osp;
-		dnode_phys_t *dnp;
+		arc_flags_t flags = ARC_FLAG_WAIT;
 
-		err = arc_read_nolock(NULL, td->td_spa, bp,
-		    arc_getbuf_func, &buf,
+		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-		if (err)
-			return (err);
-
-		osp = buf->b_data;
-		traverse_zil(td, &osp->os_zil_header);
+		if (err != 0)
+			goto post;
 
-		dnp = &osp->os_meta_dnode;
-		err = traverse_dnode(td, dnp, buf, zb->zb_objset,
+		objset_phys_t *osp = buf->b_data;
+		prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
 		    DMU_META_DNODE_OBJECT);
-		if (err && hard) {
-			lasterr = err;
-			err = 0;
+		/*
+		 * See the block comment above for the goal of this variable.
+		 * If the maxblkid of the meta-dnode is 0, then we know that
+		 * we've never had more than DNODES_PER_BLOCK objects in the
+		 * dataset, which means we can't have reused any object ids.
+		 */
+		if (osp->os_meta_dnode.dn_maxblkid == 0)
+			td->td_realloc_possible = B_FALSE;
+
+		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
+			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
+			prefetch_dnode_metadata(td, &osp->os_userused_dnode,
+			    zb->zb_objset, DMU_USERUSED_OBJECT);
 		}
+
+		err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset,
+		    DMU_META_DNODE_OBJECT);
 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
-			dnp = &osp->os_userused_dnode;
-			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
-			    DMU_USERUSED_OBJECT);
-		}
-		if (err && hard) {
-			lasterr = err;
-			err = 0;
+			err = traverse_dnode(td, &osp->os_groupused_dnode,
+			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
 		}
 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
-			dnp = &osp->os_groupused_dnode;
-			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
-			    DMU_GROUPUSED_OBJECT);
+			err = traverse_dnode(td, &osp->os_userused_dnode,
+			    zb->zb_objset, DMU_USERUSED_OBJECT);
 		}
 	}
 
 	if (buf)
-		(void) arc_buf_remove_ref(buf, &buf);
+		arc_buf_destroy(buf, &buf);
 
-	if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST))
+post:
+	if (err == 0 && (td->td_flags & TRAVERSE_POST))
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
 
-	return (err != 0 ? err : lasterr);
+	if (hard && (err == EIO || err == ECKSUM)) {
+		/*
+		 * Ignore this disk error as requested by the HARD flag,
+		 * and continue traversal.
+		 */
+		err = 0;
+	}
+
+	/*
+	 * If we are stopping here, set td_resume.
+	 */
+	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
+		td->td_resume->zb_objset = zb->zb_objset;
+		td->td_resume->zb_object = zb->zb_object;
+		td->td_resume->zb_level = 0;
+		/*
+		 * If we have stopped on an indirect block (e.g. due to
+		 * i/o error), we have not visited anything below it.
+		 * Set the bookmark to the first level-0 block that we need
+		 * to visit.  This way, the resuming code does not need to
+		 * deal with resuming from indirect blocks.
+		 *
+		 * Note, if zb_level <= 0, dnp may be NULL, so we don't want
+		 * to dereference it.
+		 */
+		td->td_resume->zb_blkid = zb->zb_blkid;
+		if (zb->zb_level > 0) {
+			td->td_resume->zb_blkid <<= zb->zb_level *
+			    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
+		}
+		td->td_paused = B_TRUE;
+	}
+
+	return (err);
+}
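
The resume-bookmark scaling above is easier to see with concrete numbers. A minimal standalone sketch (the shift values are typical ZFS defaults, assumed here rather than taken from this change):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const int indblkshift = 14;	/* 16K indirect blocks (assumed) */
	const int blkptrshift = 7;	/* sizeof (blkptr_t) == 128 */
	const int zb_level = 2;		/* traversal paused on a level-2 block */
	const uint64_t zb_blkid = 3;

	/*
	 * Each indirect block holds 2^(14 - 7) = 128 block pointers, so
	 * level-2 blkid 3 covers level-0 blkids starting at 3 << (2 * 7).
	 */
	uint64_t resume = zb_blkid << (zb_level * (indblkshift - blkptrshift));
	printf("resume at level-0 blkid %llu\n",	/* prints 49152 */
	    (unsigned long long)resume);
	return (0);
}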
+
+static void
+prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
+    uint64_t objset, uint64_t object)
+{
+	int j;
+	zbookmark_phys_t czb;
+
+	for (j = 0; j < dnp->dn_nblkptr; j++) {
+		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+		traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
+	}
+
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
+		traverse_prefetch_metadata(td, &dnp->dn_spill, &czb);
+	}
 }
 
 static int
-traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
-    arc_buf_t *buf, uint64_t objset, uint64_t object)
+traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
+    uint64_t objset, uint64_t object)
 {
-	int j, err = 0, lasterr = 0;
-	zbookmark_t czb;
-	boolean_t hard = (td->td_flags & TRAVERSE_HARD);
+	int j, err = 0;
+	zbookmark_phys_t czb;
+
+	if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
+	    object < td->td_resume->zb_object)
+		return (0);
+
+	if (td->td_flags & TRAVERSE_PRE) {
+		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+		    ZB_DNODE_BLKID);
+		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+		    td->td_arg);
+		if (err == TRAVERSE_VISIT_NO_CHILDREN)
+			return (0);
+		if (err != 0)
+			return (err);
+	}
 
 	for (j = 0; j < dnp->dn_nblkptr; j++) {
 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
-		err = traverse_visitbp(td, dnp, buf,
-		    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
-		if (err) {
-			if (!hard)
-				break;
-			lasterr = err;
-		}
+		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
+		if (err != 0)
+			break;
+	}
+
+	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
+		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
+		err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
+	}
+
+	if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
+		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+		    ZB_DNODE_BLKID);
+		err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+		    td->td_arg);
+		if (err == TRAVERSE_VISIT_NO_CHILDREN)
+			return (0);
+		if (err != 0)
+			return (err);
 	}
-	return (err != 0 ? err : lasterr);
+	return (err);
 }
 
 /* ARGSUSED */
 static int
 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
-	struct prefetch_data *pfd = arg;
-	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
+	prefetch_data_t *pfd = arg;
+	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
-	ASSERT(pfd->pd_blks_fetched >= 0);
+	ASSERT(pfd->pd_bytes_fetched >= 0);
+	if (bp == NULL)
+		return (0);
 	if (pfd->pd_cancel)
-		return (EINTR);
+		return (SET_ERROR(EINTR));
 
-	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
-	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
-	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
+	if (!prefetch_needed(pfd, bp))
 		return (0);
 
 	mutex_enter(&pfd->pd_mtx);
-	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
+	while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
 		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
-	pfd->pd_blks_fetched++;
+	pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
 	cv_broadcast(&pfd->pd_cv);
 	mutex_exit(&pfd->pd_mtx);
 
-	(void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
-	    ZIO_PRIORITY_ASYNC_READ,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
-	    &aflags, zb);
+	(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
 
 	return (0);
 }
@@ -312,17 +528,18 @@ traverse_prefetcher(spa_t *spa, zilog_t 
 static void
 traverse_prefetch_thread(void *arg)
 {
-	struct traverse_data *td_main = arg;
-	struct traverse_data td = *td_main;
-	zbookmark_t czb;
+	traverse_data_t *td_main = arg;
+	traverse_data_t td = *td_main;
+	zbookmark_phys_t czb;
 
 	td.td_func = traverse_prefetcher;
 	td.td_arg = td_main->td_pfd;
 	td.td_pfd = NULL;
+	td.td_resume = &td_main->td_pfd->pd_resume;
 
 	SET_BOOKMARK(&czb, td.td_objset,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
-	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
+	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
 
 	mutex_enter(&td_main->td_pfd->pd_mtx);
 	td_main->td_pfd->pd_exited = B_TRUE;
@@ -335,36 +552,68 @@ traverse_prefetch_thread(void *arg)
  * in syncing context).
  */
 static int
-traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
-    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
+traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
+    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
+    blkptr_cb_t func, void *arg)
 {
-	struct traverse_data td;
-	struct prefetch_data pd = { 0 };
-	zbookmark_t czb;
+	traverse_data_t td;
+	prefetch_data_t pd = { 0 };
+	zbookmark_phys_t czb;
 	int err;
 
+	ASSERT(ds == NULL || objset == ds->ds_object);
+	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
+
 	td.td_spa = spa;
 	td.td_objset = objset;
 	td.td_rootbp = rootbp;
 	td.td_min_txg = txg_start;
+	td.td_resume = resume;
 	td.td_func = func;
 	td.td_arg = arg;
 	td.td_pfd = &pd;
 	td.td_flags = flags;
+	td.td_paused = B_FALSE;
+	td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);
+
+	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+		VERIFY(spa_feature_enabled_txg(spa,
+		    SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
+	} else {
+		td.td_hole_birth_enabled_txg = UINT64_MAX;
+	}
 
-	pd.pd_blks_max = 100;
 	pd.pd_flags = flags;
+	if (resume != NULL)
+		pd.pd_resume = *resume;
 	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
 
-	if (!(flags & TRAVERSE_PREFETCH) ||
+	/* See comment on ZIL traversal in dsl_scan_visitds. */
+	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
+		arc_flags_t flags = ARC_FLAG_WAIT;
+		objset_phys_t *osp;
+		arc_buf_t *buf;
+
+		err = arc_read(NULL, td.td_spa, rootbp,
+		    arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL);
+		if (err != 0)
+			return (err);
+
+		osp = buf->b_data;
+		traverse_zil(&td, &osp->os_zil_header);
+		arc_buf_destroy(buf, &buf);
+	}
+
+	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
 	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
 	    &td, TQ_NOQUEUE))
 		pd.pd_exited = B_TRUE;
 
-	SET_BOOKMARK(&czb, objset,
+	SET_BOOKMARK(&czb, td.td_objset,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
-	err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
+	err = traverse_visitbp(&td, NULL, rootbp, &czb);
 
 	mutex_enter(&pd.pd_mtx);
 	pd.pd_cancel = B_TRUE;
@@ -384,11 +633,28 @@ traverse_impl(spa_t *spa, uint64_t objse
  * in syncing context).
  */
 int
-traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
+traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
+    zbookmark_phys_t *resume,
+    int flags, blkptr_cb_t func, void *arg)
+{
+	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
+	    &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
+}
+
+int
+traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
+    int flags, blkptr_cb_t func, void *arg)
+{
+	return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
+}
+
+int
+traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
+    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
     blkptr_cb_t func, void *arg)
 {
-	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
-	    &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
+	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
+	    blkptr, txg_start, resume, flags, func, arg));
 }
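
A hypothetical caller of the new resume interface, as a sketch only; the flag choice and the retry policy are illustrative, and scan_resumable is not a real function:

static int
scan_resumable(dsl_dataset_t *ds, blkptr_cb_t func, void *arg)
{
	zbookmark_phys_t resume = { 0 };
	int err;

	err = traverse_dataset_resume(ds, 0, &resume,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, func, arg);
	if (err != 0) {
		/*
		 * Traversal paused and filled in `resume'; a caller could
		 * persist it here and continue later from the same
		 * level-0 block.
		 */
		err = traverse_dataset_resume(ds, 0, &resume,
		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, func, arg);
	}
	return (err);
}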
 
 /*
@@ -398,56 +664,50 @@ int
 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
     blkptr_cb_t func, void *arg)
 {
-	int err, lasterr = 0;
-	uint64_t obj;
+	int err;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	objset_t *mos = dp->dp_meta_objset;
 	boolean_t hard = (flags & TRAVERSE_HARD);
 
 	/* visit the MOS */
-	err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
-	    txg_start, flags, func, arg);
-	if (err)
+	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
+	    txg_start, NULL, flags, func, arg);
+	if (err != 0)
 		return (err);
 
 	/* visit each dataset */
-	for (obj = 1; err == 0 || (err != ESRCH && hard);
-	    err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
+	for (uint64_t obj = 1; err == 0;
+	    err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
 		dmu_object_info_t doi;
 
 		err = dmu_object_info(mos, obj, &doi);
-		if (err) {
-			if (!hard)
-				return (err);
-			lasterr = err;
-			continue;
+		if (err != 0) {
+			if (hard)
+				continue;
+			break;
 		}
 
-		if (doi.doi_type == DMU_OT_DSL_DATASET) {
+		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
 			dsl_dataset_t *ds;
 			uint64_t txg = txg_start;
 
-			rw_enter(&dp->dp_config_rwlock, RW_READER);
+			dsl_pool_config_enter(dp, FTAG);
 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
-			rw_exit(&dp->dp_config_rwlock);
-			if (err) {
-				if (!hard)
-					return (err);
-				lasterr = err;
-				continue;
+			dsl_pool_config_exit(dp, FTAG);
+			if (err != 0) {
+				if (hard)
+					continue;
+				break;
 			}
-			if (ds->ds_phys->ds_prev_snap_txg > txg)
-				txg = ds->ds_phys->ds_prev_snap_txg;
+			if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
+				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 			err = traverse_dataset(ds, txg, flags, func, arg);
 			dsl_dataset_rele(ds, FTAG);
-			if (err) {
-				if (!hard)
-					return (err);
-				lasterr = err;
-			}
+			if (err != 0)
+				break;
 		}
 	}
 	if (err == ESRCH)
 		err = 0;
-	return (err != 0 ? err : lasterr);
+	return (err);
 }
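
For reference, a minimal sketch of a traverse_pool() consumer under the new zbookmark_phys_t callback signature (count_cb and count_pool_blocks are hypothetical):

/* ARGSUSED */
static int
count_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp != NULL && !BP_IS_HOLE(bp))
		(*(uint64_t *)arg)++;
	return (0);
}

static uint64_t
count_pool_blocks(spa_t *spa)
{
	uint64_t nblocks = 0;

	/* TRAVERSE_HARD: keep going past damaged datasets, as above. */
	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_HARD,
	    count_cb, &nblocks);
	return (nblocks);
}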
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dmu_tx.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c	27 Feb 2010 22:30:50 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c	16 May 2017 01:04:48 -0000
@@ -19,8 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/dmu.h>
@@ -33,7 +35,10 @@
 #include <sys/dsl_pool.h>
 #include <sys/zap_impl.h> /* for fzap_default_block_shift */
 #include <sys/spa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
 #include <sys/zfs_context.h>
+#include <sys/varargs.h>
 
 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
     uint64_t arg1, uint64_t arg2);
@@ -44,12 +49,13 @@ dmu_tx_create_dd(dsl_dir_t *dd)
 {
 	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
 	tx->tx_dir = dd;
-	if (dd)
+	if (dd != NULL)
 		tx->tx_pool = dd->dd_pool;
 	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
 	    offsetof(dmu_tx_hold_t, txh_node));
 	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
 	    offsetof(dmu_tx_callback_t, dcb_node));
+	tx->tx_start = gethrtime();
 #ifdef ZFS_DEBUG
 	refcount_create(&tx->tx_space_written);
 	refcount_create(&tx->tx_space_freed);
@@ -123,6 +129,12 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, ob
 	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 	txh->txh_tx = tx;
 	txh->txh_dnode = dn;
+	refcount_create(&txh->txh_space_towrite);
+	refcount_create(&txh->txh_space_tofree);
+	refcount_create(&txh->txh_space_tooverwrite);
+	refcount_create(&txh->txh_space_tounref);
+	refcount_create(&txh->txh_memory_tohold);
+	refcount_create(&txh->txh_fudge);
 #ifdef ZFS_DEBUG
 	txh->txh_type = type;
 	txh->txh_arg1 = arg1;
@@ -156,7 +168,7 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *
 	db = dbuf_hold_level(dn, level, blkid, FTAG);
 	rw_exit(&dn->dn_struct_rwlock);
 	if (db == NULL)
-		return (EIO);
+		return (SET_ERROR(EIO));
 	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 	dbuf_rele(db, FTAG);
 	return (err);
@@ -184,7 +196,7 @@ dmu_tx_count_twig(dmu_tx_hold_t *txh, dn
 		ASSERT(level != 0);
 		db = NULL;
 	} else {
-		ASSERT(db->db_dnode == dn);
+		ASSERT(DB_DNODE(db) == dn);
 		ASSERT(db->db_level == level);
 		ASSERT(db->db.db_size == space);
 		ASSERT(db->db_blkid == blkid);
@@ -193,14 +205,20 @@ dmu_tx_count_twig(dmu_tx_hold_t *txh, dn
 	}
 
 	freeable = (bp && (freeable ||
-	    dsl_dataset_block_freeable(ds, bp->blk_birth)));
+	    dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 
-	if (freeable)
-		txh->txh_space_tooverwrite += space;
-	else
-		txh->txh_space_towrite += space;
-	if (bp)
-		txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
+	if (freeable) {
+		(void) refcount_add_many(&txh->txh_space_tooverwrite,
+		    space, FTAG);
+	} else {
+		(void) refcount_add_many(&txh->txh_space_towrite,
+		    space, FTAG);
+	}
+
+	if (bp) {
+		(void) refcount_add_many(&txh->txh_space_tounref,
+		    bp_get_dsize(os->os_spa, bp), FTAG);
+	}
 
 	dmu_tx_count_twig(txh, dn, parent, level + 1,
 	    blkid >> epbs, freeable, history);
@@ -219,7 +237,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u
 		return;
 
 	min_bs = SPA_MINBLOCKSHIFT;
-	max_bs = SPA_MAXBLOCKSHIFT;
+	max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
 	min_ibs = DN_MIN_INDBLKSHIFT;
 	max_ibs = DN_MAX_INDBLKSHIFT;
 
@@ -280,6 +298,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u
 			delta = P2NPHASE(off, dn->dn_datablksz);
 		}
 
+		min_ibs = max_ibs = dn->dn_indblkshift;
 		if (dn->dn_maxblkid > 0) {
 			/*
 			 * The blocksize can't change,
@@ -287,13 +306,14 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u
 			 */
 			ASSERT(dn->dn_datablkshift != 0);
 			min_bs = max_bs = dn->dn_datablkshift;
-			min_ibs = max_ibs = dn->dn_indblkshift;
-		} else if (dn->dn_indblkshift > max_ibs) {
+		} else {
 			/*
-			 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
-			 * the code will still work correctly on older pools.
+			 * The blocksize can increase up to the recordsize,
+			 * or if it is already more than the recordsize,
+			 * up to the next power of 2.
 			 */
-			min_ibs = max_ibs = dn->dn_indblkshift;
+			min_bs = highbit64(dn->dn_datablksz - 1);
+			max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
 		}
 
 		/*
@@ -308,8 +328,15 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u
 			dmu_buf_impl_t *db;
 
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
-			db = dbuf_hold_level(dn, 0, start, FTAG);
+			err = dbuf_hold_impl(dn, 0, start,
+			    FALSE, FALSE, FTAG, &db);
 			rw_exit(&dn->dn_struct_rwlock);
+
+			if (err) {
+				txh->txh_tx->tx_err = err;
+				return;
+			}
+
 			dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
 			    history);
 			dbuf_rele(db, FTAG);
@@ -321,8 +348,11 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u
 				bits = 64 - min_bs;
 				epbs = min_ibs - SPA_BLKPTRSHIFT;
 				for (bits -= epbs * (nlvls - 1);
-				    bits >= 0; bits -= epbs)
-					txh->txh_fudge += 1ULL << max_ibs;
+				    bits >= 0; bits -= epbs) {
+					(void) refcount_add_many(
+					    &txh->txh_fudge,
+					    1ULL << max_ibs, FTAG);
+				}
 				goto out;
 			}
 			off += delta;
@@ -338,7 +368,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u
 	 */
 	start = P2ALIGN(off, 1ULL << max_bs);
 	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
-	txh->txh_space_towrite += end - start + 1;
+	(void) refcount_add_many(&txh->txh_space_towrite,
+	    end - start + 1, FTAG);
 
 	start >>= min_bs;
 	end >>= min_bs;
@@ -353,20 +384,23 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u
 		start >>= epbs;
 		end >>= epbs;
 		ASSERT3U(end, >=, start);
-		txh->txh_space_towrite += (end - start + 1) << max_ibs;
+		(void) refcount_add_many(&txh->txh_space_towrite,
+		    (end - start + 1) << max_ibs, FTAG);
 		if (start != 0) {
 			/*
 			 * We also need a new blkid=0 indirect block
 			 * to reference any existing file data.
 			 */
-			txh->txh_space_towrite += 1ULL << max_ibs;
+			(void) refcount_add_many(&txh->txh_space_towrite,
+			    1ULL << max_ibs, FTAG);
 		}
 	}
 
 out:
-	if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
+	if (refcount_count(&txh->txh_space_towrite) +
+	    refcount_count(&txh->txh_space_tooverwrite) >
 	    2 * DMU_MAX_ACCESS)
-		err = EFBIG;
+		err = SET_ERROR(EFBIG);
 
 	if (err)
 		txh->txh_tx->tx_err = err;
@@ -376,19 +410,22 @@ static void
 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 {
 	dnode_t *dn = txh->txh_dnode;
-	dnode_t *mdn = txh->txh_tx->tx_objset->os_meta_dnode;
+	dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 	uint64_t space = mdn->dn_datablksz +
 	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 
 	if (dn && dn->dn_dbuf->db_blkptr &&
 	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
-	    dn->dn_dbuf->db_blkptr->blk_birth)) {
-		txh->txh_space_tooverwrite += space;
-		txh->txh_space_tounref += space;
+	    dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
+		(void) refcount_add_many(&txh->txh_space_tooverwrite,
+		    space, FTAG);
+		(void) refcount_add_many(&txh->txh_space_tounref, space, FTAG);
 	} else {
-		txh->txh_space_towrite += space;
-		if (dn && dn->dn_dbuf->db_blkptr)
-			txh->txh_space_tounref += space;
+		(void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
+		if (dn && dn->dn_dbuf->db_blkptr) {
+			(void) refcount_add_many(&txh->txh_space_tounref,
+			    space, FTAG);
+		}
 	}
 }
 
@@ -419,6 +456,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui
 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 	int epbs;
+	uint64_t l0span = 0, nl1blks = 0;
 
 	if (dn->dn_nlevels == 0)
 		return;
@@ -427,7 +465,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui
 	 * The struct_rwlock protects us against dn_nlevels
 	 * changing, in case (against all odds) we manage to dirty &
 	 * sync out the changes after we check for being dirty.
-	 * Also, dbuf_hold_level() wants us to have the struct_rwlock.
+	 * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 	 */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
@@ -443,44 +481,31 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui
 		blkid = off >> dn->dn_datablkshift;
 		nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 
-		if (blkid >= dn->dn_maxblkid) {
+		if (blkid > dn->dn_maxblkid) {
 			rw_exit(&dn->dn_struct_rwlock);
 			return;
 		}
 		if (blkid + nblks > dn->dn_maxblkid)
-			nblks = dn->dn_maxblkid - blkid;
+			nblks = dn->dn_maxblkid - blkid + 1;
 
 	}
+	l0span = nblks;    /* save for later use to calc level > 1 overhead */
 	if (dn->dn_nlevels == 1) {
 		int i;
 		for (i = 0; i < nblks; i++) {
 			blkptr_t *bp = dn->dn_phys->dn_blkptr;
 			ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 			bp += blkid + i;
-			if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
+			if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 				dprintf_bp(bp, "can free old%s", "");
 				space += bp_get_dsize(spa, bp);
 			}
 			unref += BP_GET_ASIZE(bp);
 		}
+		nl1blks = 1;
 		nblks = 0;
 	}
 
-	/*
-	 * Add in memory requirements of higher-level indirects.
-	 * This assumes a worst-possible scenario for dn_nlevels.
-	 */
-	{
-		uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
-		int level = (dn->dn_nlevels > 1) ? 2 : 1;
-
-		while (level++ < DN_MAX_LEVELS) {
-			txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
-			blkcnt = 1 + (blkcnt >> epbs);
-		}
-		ASSERT(blkcnt <= dn->dn_nblkptr);
-	}
-
 	lastblk = blkid + nblks - 1;
 	while (nblks) {
 		dmu_buf_impl_t *dbuf;
@@ -515,9 +540,15 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui
 		blkoff = P2PHASE(blkid, epb);
 		tochk = MIN(epb - blkoff, nblks);
 
-		dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG);
+		err = dbuf_hold_impl(dn, 1, blkid >> epbs,
+		    FALSE, FALSE, FTAG, &dbuf);
+		if (err) {
+			txh->txh_tx->tx_err = err;
+			break;
+		}
 
-		txh->txh_memory_tohold += dbuf->db.db_size;
+		(void) refcount_add_many(&txh->txh_memory_tohold,
+		    dbuf->db.db_size, FTAG);
 
 		/*
 		 * We don't check memory_tohold against DMU_MAX_ACCESS because
@@ -538,7 +569,8 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui
 		bp += blkoff;
 
 		for (i = 0; i < tochk; i++) {
-			if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) {
+			if (dsl_dataset_block_freeable(ds, &bp[i],
+			    bp[i].blk_birth)) {
 				dprintf_bp(&bp[i], "can free old%s", "");
 				space += bp_get_dsize(spa, &bp[i]);
 			}
@@ -546,19 +578,75 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui
 		}
 		dbuf_rele(dbuf, FTAG);
 
+		++nl1blks;
 		blkid += tochk;
 		nblks -= tochk;
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 
+	/*
+	 * Add in memory requirements of higher-level indirects.
+	 * This assumes a worst-possible scenario for dn_nlevels and a
+	 * worst-possible distribution of l1-blocks over the region to free.
+	 */
+	{
+		uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
+		int level = 2;
+		/*
+		 * Here we don't use DN_MAX_LEVELS, but calculate it with
+		 * the given datablkshift and indblkshift.  On large files
+		 * this makes the difference between 19 and 8 levels.
+		 */
+		int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
+		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
+
+		while (level++ < maxlevel) {
+			(void) refcount_add_many(&txh->txh_memory_tohold,
+			    MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift,
+			    FTAG);
+			blkcnt = 1 + (blkcnt >> epbs);
+		}
+	}
+
 	/* account for new level 1 indirect blocks that might show up */
 	if (skipped > 0) {
-		txh->txh_fudge += skipped << dn->dn_indblkshift;
+		(void) refcount_add_many(&txh->txh_fudge,
+		    skipped << dn->dn_indblkshift, FTAG);
 		skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
-		txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
+		(void) refcount_add_many(&txh->txh_memory_tohold,
+		    skipped << dn->dn_indblkshift, FTAG);
 	}
-	txh->txh_space_tofree += space;
-	txh->txh_space_tounref += unref;
+	(void) refcount_add_many(&txh->txh_space_tofree, space, FTAG);
+	(void) refcount_add_many(&txh->txh_space_tounref, unref, FTAG);
+}
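
With assumed typical shifts, the maxlevel computation in dmu_tx_count_free() above works out as follows (a standalone arithmetic check, not part of this change):

#include <stdio.h>

int
main(void)
{
	const int dn_max_offset_shift = 64;	/* DN_MAX_OFFSET_SHIFT */
	const int datablkshift = 17;		/* 128K data blocks (assumed) */
	const int indblkshift = 14;		/* 16K indirect blocks */
	const int spa_blkptrshift = 7;

	int maxlevel = 2 + (dn_max_offset_shift - datablkshift) /
	    (indblkshift - spa_blkptrshift);
	printf("maxlevel = %d\n", maxlevel);	/* prints 8, not 19 */
	return (0);
}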
+
+/*
+ * This function marks the transaction as being a "net free".  The end
+ * result is that refquotas will be disabled for this transaction, and
+ * this transaction will be able to use half of the pool space overhead
+ * (see dsl_pool_adjustedsize()).  Therefore this function should only
+ * be called for transactions that we expect will not cause a net increase
+ * in the amount of space used (but it's OK if that is occasionally not true).
+ */
+void
+dmu_tx_mark_netfree(dmu_tx_t *tx)
+{
+	dmu_tx_hold_t *txh;
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    DMU_NEW_OBJECT, THT_FREE, 0, 0);
+
+	/*
+	 * Pretend that this operation will free 1GB of space.  This
+	 * should be large enough to cancel out the largest write.
+	 * We don't want to use something like UINT64_MAX, because that would
+	 * cause overflows when doing math with these values (e.g. in
+	 * dmu_tx_try_assign()).
+	 */
+	(void) refcount_add_many(&txh->txh_space_tofree,
+	    1024 * 1024 * 1024, FTAG);
+	(void) refcount_add_many(&txh->txh_space_tounref,
+	    1024 * 1024 * 1024, FTAG);
 }
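
A hypothetical free-dominated caller, sketched with only the dmu interfaces shown or referenced in this patch (free_whole_object is not a real function):

static int
free_whole_object(objset_t *os, uint64_t object)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
	dmu_tx_mark_netfree(tx);	/* relax refquota for this net free */
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	err = dmu_free_range(os, object, 0, DMU_OBJECT_END, tx);
	dmu_tx_commit(tx);
	return (err);
}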
 
 void
@@ -566,8 +654,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t 
 {
 	dmu_tx_hold_t *txh;
 	dnode_t *dn;
-	uint64_t start, end, i;
-	int err, shift;
+	int err;
 	zio_t *zio;
 
 	ASSERT(tx->tx_txg == 0);
@@ -577,14 +664,6 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t 
 	if (txh == NULL)
 		return;
 	dn = txh->txh_dnode;
-
-	/* first block */
-	if (off != 0)
-		dmu_tx_count_write(txh, off, 1);
-	/* last block */
-	if (len != DMU_OBJECT_END)
-		dmu_tx_count_write(txh, off+len, 1);
-
 	dmu_tx_count_dnode(txh);
 
 	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
@@ -592,24 +671,54 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t 
 	if (len == DMU_OBJECT_END)
 		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 
+	/*
+	 * For i/o error checking, we read the first and last level-0
+	 * blocks if they are not aligned, and all the level-1 blocks.
+	 *
+	 * Note:  dbuf_free_range() assumes that we have not instantiated
+	 * any level-0 dbufs that will be completely freed.  Therefore we must
+	 * exercise care to not read or count the first and last blocks
+	 * if they are blocksize-aligned.
+	 */
+	if (dn->dn_datablkshift == 0) {
+		if (off != 0 || len < dn->dn_datablksz)
+			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
+	} else {
+		/* first block will be modified if it is not aligned */
+		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
+			dmu_tx_count_write(txh, off, 1);
+		/* last block will be modified if it is not aligned */
+		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
+			dmu_tx_count_write(txh, off+len, 1);
+	}
+
 	/*
-	 * For i/o error checking, read the first and last level-0
-	 * blocks, and all the level-1 blocks.  The above count_write's
-	 * have already taken care of the level-0 blocks.
+	 * Check level-1 blocks.
 	 */
 	if (dn->dn_nlevels > 1) {
-		shift = dn->dn_datablkshift + dn->dn_indblkshift -
+		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 		    SPA_BLKPTRSHIFT;
-		start = off >> shift;
-		end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
+		uint64_t start = off >> shift;
+		uint64_t end = (off + len) >> shift;
+
+		ASSERT(dn->dn_indblkshift != 0);
+
+		/*
+		 * dnode_reallocate() can result in an object with indirect
+		 * blocks having an odd data block size.  In this case,
+		 * just check the single block.
+		 */
+		if (dn->dn_datablkshift == 0)
+			start = end = 0;
 
 		zio = zio_root(tx->tx_pool->dp_spa,
 		    NULL, NULL, ZIO_FLAG_CANFAIL);
-		for (i = start; i <= end; i++) {
+		for (uint64_t i = start; i <= end; i++) {
 			uint64_t ibyte = i << shift;
 			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 			i = ibyte >> shift;
-			if (err == ESRCH)
+			if (err == ESRCH || i > end)
 				break;
 			if (err) {
 				tx->tx_err = err;
@@ -637,8 +746,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t o
 {
 	dmu_tx_hold_t *txh;
 	dnode_t *dn;
-	uint64_t nblocks;
-	int epbs, err;
+	int err;
 
 	ASSERT(tx->tx_txg == 0);
 
@@ -660,9 +768,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t o
 		return;
 	}
 
-	ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
+	ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 
 	if (dn->dn_maxblkid == 0 && !add) {
+		blkptr_t *bp;
+
 		/*
 		 * If there is only one block  (i.e. this is a micro-zap)
 		 * and we are not adding anything, the accounting is simple.
@@ -677,14 +787,19 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t o
 		 * Use max block size here, since we don't know how much
 		 * the size will change between now and the dbuf dirty call.
 		 */
+		bp = &dn->dn_phys->dn_blkptr[0];
 		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
-		    dn->dn_phys->dn_blkptr[0].blk_birth)) {
-			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
+		    bp, bp->blk_birth)) {
+			(void) refcount_add_many(&txh->txh_space_tooverwrite,
+			    MZAP_MAX_BLKSZ, FTAG);
 		} else {
-			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+			(void) refcount_add_many(&txh->txh_space_towrite,
+			    MZAP_MAX_BLKSZ, FTAG);
+		}
+		if (!BP_IS_HOLE(bp)) {
+			(void) refcount_add_many(&txh->txh_space_tounref,
+			    MZAP_MAX_BLKSZ, FTAG);
 		}
-		if (dn->dn_phys->dn_blkptr[0].blk_birth)
-			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
 		return;
 	}
 
@@ -693,27 +808,41 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t o
 		 * access the name in this fat-zap so that we'll check
 		 * for i/o errors to the leaf blocks, etc.
 		 */
-		err = zap_lookup(dn->dn_objset, dn->dn_object, name,
-		    8, 0, NULL);
+		err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
 		if (err == EIO) {
 			tx->tx_err = err;
 			return;
 		}
 	}
 
-	err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
+	err = zap_count_write_by_dnode(dn, name, add,
 	    &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 
 	/*
 	 * If the modified blocks are scattered to the four winds,
-	 * we'll have to modify an indirect twig for each.
+	 * we'll have to modify an indirect twig for each.  We can make
+	 * modifications at up to 3 locations:
+	 *  - header block at the beginning of the object
+	 *  - target leaf block
+	 *  - end of the object, where we might need to write:
+	 *	- a new leaf block if the target block needs to be split
+	 *	- the new pointer table, if it is growing
+	 *	- the new cookie table, if it is growing
 	 */
-	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
-		if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
-			txh->txh_space_towrite += 3 << dn->dn_indblkshift;
-		else
-			txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
+	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+	dsl_dataset_phys_t *ds_phys =
+	    dsl_dataset_phys(dn->dn_objset->os_dsl_dataset);
+	for (int lvl = 1; lvl < dn->dn_nlevels; lvl++) {
+		uint64_t num_indirects = 1 + (dn->dn_maxblkid >> (epbs * lvl));
+		uint64_t spc = MIN(3, num_indirects) << dn->dn_indblkshift;
+		if (ds_phys->ds_prev_snap_obj != 0) {
+			(void) refcount_add_many(&txh->txh_space_towrite,
+			    spc, FTAG);
+		} else {
+			(void) refcount_add_many(&txh->txh_space_tooverwrite,
+			    spc, FTAG);
+		}
+	}
 }
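
For context, a minimal sketch of the kind of caller this ZAP accounting serves (add_dir_entry and dir_zapobj are placeholders, not real names):

static int
add_dir_entry(objset_t *os, uint64_t dir_zapobj, const char *name,
    uint64_t value)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	/* B_TRUE: we are adding an entry, so ZAP growth must be accounted. */
	dmu_tx_hold_zap(tx, dir_zapobj, B_TRUE, name);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	err = zap_add(os, dir_zapobj, name, sizeof (uint64_t), 1, &value, tx);
	dmu_tx_commit(tx);
	return (err);
}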
 
 void
@@ -738,7 +867,7 @@ dmu_tx_hold_space(dmu_tx_t *tx, uint64_t
 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 	    DMU_NEW_OBJECT, THT_SPACE, space, 0);
 
-	txh->txh_space_towrite += space;
+	(void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
 }
 
 int
@@ -773,18 +902,24 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_i
 {
 	dmu_tx_hold_t *txh;
 	int match_object = FALSE, match_offset = FALSE;
-	dnode_t *dn = db->db_dnode;
+	dnode_t *dn;
 
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 	ASSERT3U(dn->dn_object, ==, db->db.db_object);
 
-	if (tx->tx_anyobj)
+	if (tx->tx_anyobj) {
+		DB_DNODE_EXIT(db);
 		return;
+	}
 
 	/* XXX No checking on the meta dnode for now */
-	if (db->db.db_object == DMU_META_DNODE_OBJECT)
+	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+		DB_DNODE_EXIT(db);
 		return;
+	}
 
 	for (txh = list_head(&tx->tx_holds); txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
@@ -813,10 +948,11 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_i
 					match_offset = TRUE;
 				/*
 				 * We will let this hold work for the bonus
-				 * buffer so that we don't need to hold it
-				 * when creating a new object.
+				 * or spill buffer so that we don't need to
+				 * hold it when creating a new object.
 				 */
-				if (blkid == DB_BONUS_BLKID)
+				if (blkid == DMU_BONUS_BLKID ||
+				    blkid == DMU_SPILL_BLKID)
 					match_offset = TRUE;
 				/*
 				 * They might have to increase nlevels,
@@ -837,8 +973,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_i
 				    txh->txh_arg2 == DMU_OBJECT_END))
 					match_offset = TRUE;
 				break;
+			case THT_SPILL:
+				if (blkid == DMU_SPILL_BLKID)
+					match_offset = TRUE;
+				break;
 			case THT_BONUS:
-				if (blkid == DB_BONUS_BLKID)
+				if (blkid == DMU_BONUS_BLKID)
 					match_offset = TRUE;
 				break;
 			case THT_ZAP:
@@ -851,24 +991,186 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_i
 				ASSERT(!"bad txh_type");
 			}
 		}
-		if (match_object && match_offset)
+		if (match_object && match_offset) {
+			DB_DNODE_EXIT(db);
 			return;
+		}
 	}
+	DB_DNODE_EXIT(db);
 	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 	    (u_longlong_t)db->db.db_object, db->db_level,
 	    (u_longlong_t)db->db_blkid);
 }
 #endif
 
+/*
+ * If we can't do 10 iops with the delay capped at zfs_delay_max_ns below,
+ * something is wrong; let writers go ahead and hit zfs_dirty_data_max.
+ */
+hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+
+/*
+ * We delay transactions when we've determined that the backend storage
+ * isn't able to accommodate the rate of incoming writes.
+ *
+ * If there is already a transaction waiting, we delay relative to when
+ * that transaction finishes waiting.  This way the calculated min_time
+ * is independent of the number of threads concurrently executing
+ * transactions.
+ *
+ * If we are the only waiter, wait relative to when the transaction
+ * started, rather than the current time.  This credits the transaction for
+ * "time already served", e.g. reading indirect blocks.
+ *
+ * The minimum time for a transaction to take is calculated as:
+ *     min_time = scale * (dirty - min) / (max - dirty)
+ *     min_time is then capped at zfs_delay_max_ns.
+ *
+ * The delay has two degrees of freedom that can be adjusted via tunables.
+ * The percentage of dirty data at which we start to delay is defined by
+ * zfs_delay_min_dirty_percent. This should typically be at or above
+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
+ * delay after writing at full speed has failed to keep up with the incoming
+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
+ * speaking, this variable determines the amount of delay at the midpoint of
+ * the curve.
+ *
+ * delay
+ *  10ms +-------------------------------------------------------------*+
+ *       |                                                             *|
+ *   9ms +                                                             *+
+ *       |                                                             *|
+ *   8ms +                                                             *+
+ *       |                                                            * |
+ *   7ms +                                                            * +
+ *       |                                                            * |
+ *   6ms +                                                            * +
+ *       |                                                            * |
+ *   5ms +                                                           *  +
+ *       |                                                           *  |
+ *   4ms +                                                           *  +
+ *       |                                                           *  |
+ *   3ms +                                                          *   +
+ *       |                                                          *   |
+ *   2ms +                                              (midpoint) *    +
+ *       |                                                  |    **     |
+ *   1ms +                                                  v ***       +
+ *       |             zfs_delay_scale ---------->     ********         |
+ *     0 +-------------------------------------*********----------------+
+ *       0%                    <- zfs_dirty_data_max ->               100%
+ *
+ * Note that since the delay is added to the outstanding time remaining on the
+ * most recent transaction, the delay is effectively the inverse of IOPS.
+ * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
+ * was chosen such that small changes in the amount of accumulated dirty data
+ * in the first 3/4 of the curve yield relatively small differences in the
+ * amount of delay.
+ *
+ * The effects can be easier to understand when the amount of delay is
+ * represented on a log scale:
+ *
+ * delay
+ * 100ms +-------------------------------------------------------------++
+ *       +                                                              +
+ *       |                                                              |
+ *       +                                                             *+
+ *  10ms +                                                             *+
+ *       +                                                           ** +
+ *       |                                              (midpoint)  **  |
+ *       +                                                  |     **    +
+ *   1ms +                                                  v ****      +
+ *       +             zfs_delay_scale ---------->        *****         +
+ *       |                                             ****             |
+ *       +                                          ****                +
+ * 100us +                                        **                    +
+ *       +                                       *                      +
+ *       |                                      *                       |
+ *       +                                     *                        +
+ *  10us +                                     *                        +
+ *       +                                                              +
+ *       |                                                              |
+ *       +                                                              +
+ *       +--------------------------------------------------------------+
+ *       0%                    <- zfs_dirty_data_max ->               100%
+ *
+ * Note here that only as the amount of dirty data approaches its limit does
+ * the delay start to increase rapidly. The goal of a properly tuned system
+ * should be to keep the amount of dirty data out of that range by first
+ * ensuring that the appropriate limits are set for the I/O scheduler to reach
+ * optimal throughput on the backend storage, and then by changing the value
+ * of zfs_delay_scale to increase the steepness of the curve.
+ */
+static void
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+{
+	dsl_pool_t *dp = tx->tx_pool;
+	uint64_t delay_min_bytes =
+	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	hrtime_t wakeup, min_tx_time, now;
+
+	if (dirty <= delay_min_bytes)
+		return;
+
+	/*
+	 * The caller has already waited until we are under the max.
+	 * We make them pass us the amount of dirty data so we don't
+	 * have to handle the case of it being >= the max, which could
+	 * cause a divide-by-zero if it's == the max.
+	 */
+	ASSERT3U(dirty, <, zfs_dirty_data_max);
+
+	now = gethrtime();
+	min_tx_time = zfs_delay_scale *
+	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+	if (now > tx->tx_start + min_tx_time)
+		return;
+
+	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
+
+	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
+	    uint64_t, min_tx_time);
+
+	mutex_enter(&dp->dp_lock);
+	wakeup = MAX(tx->tx_start + min_tx_time,
+	    dp->dp_last_wakeup + min_tx_time);
+	dp->dp_last_wakeup = wakeup;
+	mutex_exit(&dp->dp_lock);
+
+#ifdef _KERNEL
+#ifdef illumos
+	mutex_enter(&curthread->t_delay_lock);
+	while (cv_timedwait_hires(&curthread->t_delay_cv,
+	    &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
+	    CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
+		continue;
+	mutex_exit(&curthread->t_delay_lock);
+#endif
+#ifdef __FreeBSD__
+	pause_sbt("dmu_tx_delay", wakeup * SBT_1NS,
+	    zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE);
+#endif
+#ifdef __NetBSD__
+	kpause("dmu_tx_delay", false, (wakeup - now) * hz / 1000000000, NULL);
+#endif
+#else
+	hrtime_t delta = wakeup - gethrtime();
+	struct timespec ts;
+	ts.tv_sec = delta / NANOSEC;
+	ts.tv_nsec = delta % NANOSEC;
+	(void) nanosleep(&ts, NULL);
+#endif
+}
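
Plugging illustrative tunable values into the formula confirms the 500us midpoint quoted in the comment above (the values match the usual defaults, but are assumptions here):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t dirty_max = 4000000000ULL;	/* zfs_dirty_data_max */
	const uint64_t scale = 500000;			/* zfs_delay_scale, ns */
	const uint64_t delay_min = dirty_max * 60 / 100; /* 60% start point */

	/*
	 * The midpoint between the 60% start and 100% is 80% dirty; there
	 * (dirty - min) == (max - dirty), so min_tx_time == scale.
	 */
	uint64_t dirty = dirty_max * 80 / 100;
	uint64_t min_tx_time =
	    scale * (dirty - delay_min) / (dirty_max - dirty);
	printf("min_tx_time = %llu ns\n",	/* 500000 ns = 500us, 2000 iops */
	    (unsigned long long)min_tx_time);
	return (0);
}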
+
 static int
-dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
+dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	dmu_tx_hold_t *txh;
 	spa_t *spa = tx->tx_pool->dp_spa;
 	uint64_t memory, asize, fsize, usize;
 	uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
 
-	ASSERT3U(tx->tx_txg, ==, 0);
+	ASSERT0(tx->tx_txg);
 
 	if (tx->tx_err)
 		return (tx->tx_err);
@@ -885,9 +1187,15 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t
 		 */
 		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
 		    txg_how != TXG_WAIT)
-			return (EIO);
+			return (SET_ERROR(EIO));
+
+		return (SET_ERROR(ERESTART));
+	}
 
-		return (ERESTART);
+	if (!tx->tx_waited &&
+	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
+		tx->tx_wait_dirty = B_TRUE;
+		return (SET_ERROR(ERESTART));
 	}
 
 	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
@@ -908,7 +1216,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t
 			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
 				mutex_exit(&dn->dn_mtx);
 				tx->tx_needassign_txh = txh;
-				return (ERESTART);
+				return (SET_ERROR(ERESTART));
 			}
 			if (dn->dn_assigned_txg == 0)
 				dn->dn_assigned_txg = tx->tx_txg;
@@ -916,22 +1224,15 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t
 			(void) refcount_add(&dn->dn_tx_holds, tx);
 			mutex_exit(&dn->dn_mtx);
 		}
-		towrite += txh->txh_space_towrite;
-		tofree += txh->txh_space_tofree;
-		tooverwrite += txh->txh_space_tooverwrite;
-		tounref += txh->txh_space_tounref;
-		tohold += txh->txh_memory_tohold;
-		fudge += txh->txh_fudge;
+		towrite += refcount_count(&txh->txh_space_towrite);
+		tofree += refcount_count(&txh->txh_space_tofree);
+		tooverwrite += refcount_count(&txh->txh_space_tooverwrite);
+		tounref += refcount_count(&txh->txh_space_tounref);
+		tohold += refcount_count(&txh->txh_memory_tohold);
+		fudge += refcount_count(&txh->txh_fudge);
 	}
 
 	/*
-	 * NB: This check must be after we've held the dnodes, so that
-	 * the dmu_tx_unassign() logic will work properly
-	 */
-	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
-		return (ERESTART);
-
-	/*
 	 * If a snapshot has been taken since we made our estimates,
 	 * assume that we won't be able to free or overwrite anything.
 	 */
@@ -984,6 +1285,10 @@ dmu_tx_unassign(dmu_tx_t *tx)
 
 	txg_rele_to_quiesce(&tx->tx_txgh);
 
+	/*
+	 * Walk the transaction's hold list, removing the hold on the
+	 * associated dnode, and notifying waiters if the refcount drops to 0.
+	 */
 	for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
 		dnode_t *dn = txh->txh_dnode;
@@ -1011,26 +1316,33 @@ dmu_tx_unassign(dmu_tx_t *tx)
  *
  * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
  *	a new one.  This should be used when you're not holding locks.
- *	If will only fail if we're truly out of space (or over quota).
+ *	It will only fail if we're truly out of space (or over quota).
  *
  * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
  *	blocking, returns immediately with ERESTART.  This should be used
  *	whenever you're holding locks.  On an ERESTART error, the caller
  *	should drop locks, do a dmu_tx_wait(tx), and try again.
  *
- * (3)	A specific txg.  Use this if you need to ensure that multiple
- *	transactions all sync in the same txg.  Like TXG_NOWAIT, it
- *	returns ERESTART if it can't assign you into the requested txg.
+ * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
+ *      has already been called on behalf of this operation (though
+ *      most likely on a different tx).
  */
 int
-dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	int err;
 
 	ASSERT(tx->tx_txg == 0);
-	ASSERT(txg_how != 0);
+	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
+	    txg_how == TXG_WAITED);
 	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
 
+	/* If we might wait, we must not hold the config lock. */
+	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
+
+	if (txg_how == TXG_WAITED)
+		tx->tx_waited = B_TRUE;
+
 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
 		dmu_tx_unassign(tx);
 
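A sketch of the TXG_NOWAIT -> TXG_WAITED protocol described above (a hypothetical caller; the lock and the bonus hold are placeholders):

static int
update_bonus_locked(objset_t *os, uint64_t object, kmutex_t *lock)
{
	boolean_t waited = B_FALSE;
	dmu_tx_t *tx;
	int err;

top:
	mutex_enter(lock);
	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, object);
	err = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (err == ERESTART) {
		mutex_exit(lock);	/* drop locks before waiting */
		dmu_tx_wait(tx);
		dmu_tx_abort(tx);
		waited = B_TRUE;
		goto top;
	}
	if (err != 0) {
		mutex_exit(lock);
		dmu_tx_abort(tx);
		return (err);
	}
	/* ... modify the bonus buffer here ... */
	dmu_tx_commit(tx);
	mutex_exit(lock);
	return (0);
}
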
@@ -1049,17 +1361,48 @@ void
 dmu_tx_wait(dmu_tx_t *tx)
 {
 	spa_t *spa = tx->tx_pool->dp_spa;
+	dsl_pool_t *dp = tx->tx_pool;
 
 	ASSERT(tx->tx_txg == 0);
+	ASSERT(!dsl_pool_config_held(tx->tx_pool));
 
-	/*
-	 * It's possible that the pool has become active after this thread
-	 * has tried to obtain a tx. If that's the case then his
-	 * tx_lasttried_txg would not have been assigned.
-	 */
-	if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
-		txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
+	if (tx->tx_wait_dirty) {
+		/*
+		 * dmu_tx_try_assign() has determined that we need to wait
+		 * because we've consumed much or all of the dirty buffer
+		 * space.
+		 */
+		mutex_enter(&dp->dp_lock);
+		while (dp->dp_dirty_total >= zfs_dirty_data_max)
+			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
+		uint64_t dirty = dp->dp_dirty_total;
+		mutex_exit(&dp->dp_lock);
+
+		dmu_tx_delay(tx, dirty);
+
+		tx->tx_wait_dirty = B_FALSE;
+
+		/*
+		 * Note: setting tx_waited only has effect if the caller
+		 * used TXG_WAIT.  Otherwise they are going to destroy
+		 * this tx and try again.  The common case, zfs_write(),
+		 * uses TXG_WAIT.
+		 */
+		tx->tx_waited = B_TRUE;
+	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
+		/*
+		 * If the pool is suspended we need to wait until it
+		 * is resumed.  Note that it's possible that the pool
+		 * has become active after this thread has tried to
+		 * obtain a tx.  If that's the case then tx_lasttried_txg
+		 * would not have been set.
+		 */
+		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
 	} else if (tx->tx_needassign_txh) {
+		/*
+		 * A dnode is assigned to the quiescing txg.  Wait for its
+		 * transaction to complete.
+		 */
 		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
 
 		mutex_enter(&dn->dn_mtx);
@@ -1089,20 +1432,59 @@ dmu_tx_willuse_space(dmu_tx_t *tx, int64
 #endif
 }
 
-void
-dmu_tx_commit(dmu_tx_t *tx)
+static void
+dmu_tx_destroy(dmu_tx_t *tx)
 {
 	dmu_tx_hold_t *txh;
 
-	ASSERT(tx->tx_txg != 0);
-
-	while (txh = list_head(&tx->tx_holds)) {
+	while ((txh = list_head(&tx->tx_holds)) != NULL) {
 		dnode_t *dn = txh->txh_dnode;
 
 		list_remove(&tx->tx_holds, txh);
+		refcount_destroy_many(&txh->txh_space_towrite,
+		    refcount_count(&txh->txh_space_towrite));
+		refcount_destroy_many(&txh->txh_space_tofree,
+		    refcount_count(&txh->txh_space_tofree));
+		refcount_destroy_many(&txh->txh_space_tooverwrite,
+		    refcount_count(&txh->txh_space_tooverwrite));
+		refcount_destroy_many(&txh->txh_space_tounref,
+		    refcount_count(&txh->txh_space_tounref));
+		refcount_destroy_many(&txh->txh_memory_tohold,
+		    refcount_count(&txh->txh_memory_tohold));
+		refcount_destroy_many(&txh->txh_fudge,
+		    refcount_count(&txh->txh_fudge));
 		kmem_free(txh, sizeof (dmu_tx_hold_t));
+		if (dn != NULL)
+			dnode_rele(dn, tx);
+	}
+
+	list_destroy(&tx->tx_callbacks);
+	list_destroy(&tx->tx_holds);
+#ifdef ZFS_DEBUG
+	refcount_destroy_many(&tx->tx_space_written,
+	    refcount_count(&tx->tx_space_written));
+	refcount_destroy_many(&tx->tx_space_freed,
+	    refcount_count(&tx->tx_space_freed));
+#endif
+	kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+void
+dmu_tx_commit(dmu_tx_t *tx)
+{
+	ASSERT(tx->tx_txg != 0);
+
+	/*
+	 * Go through the transaction's hold list and remove holds on
+	 * associated dnodes, notifying waiters if no holds remain.
+	 */
+	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+	    txh = list_next(&tx->tx_holds, txh)) {
+		dnode_t *dn = txh->txh_dnode;
+
 		if (dn == NULL)
 			continue;
+
 		mutex_enter(&dn->dn_mtx);
 		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 
@@ -1111,7 +1493,6 @@ dmu_tx_commit(dmu_tx_t *tx)
 			cv_broadcast(&dn->dn_notxholds);
 		}
 		mutex_exit(&dn->dn_mtx);
-		dnode_rele(dn, tx);
 	}
 
 	if (tx->tx_tempreserve_cookie)
@@ -1123,51 +1504,26 @@ dmu_tx_commit(dmu_tx_t *tx)
 	if (tx->tx_anyobj == FALSE)
 		txg_rele_to_sync(&tx->tx_txgh);
 
-	list_destroy(&tx->tx_callbacks);
-	list_destroy(&tx->tx_holds);
 #ifdef ZFS_DEBUG
 	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
 	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
 	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
-	refcount_destroy_many(&tx->tx_space_written,
-	    refcount_count(&tx->tx_space_written));
-	refcount_destroy_many(&tx->tx_space_freed,
-	    refcount_count(&tx->tx_space_freed));
 #endif
-	kmem_free(tx, sizeof (dmu_tx_t));
+	dmu_tx_destroy(tx);
 }
 
 void
 dmu_tx_abort(dmu_tx_t *tx)
 {
-	dmu_tx_hold_t *txh;
-
 	ASSERT(tx->tx_txg == 0);
 
-	while (txh = list_head(&tx->tx_holds)) {
-		dnode_t *dn = txh->txh_dnode;
-
-		list_remove(&tx->tx_holds, txh);
-		kmem_free(txh, sizeof (dmu_tx_hold_t));
-		if (dn != NULL)
-			dnode_rele(dn, tx);
-	}
-
 	/*
 	 * Call any registered callbacks with an error code.
 	 */
 	if (!list_is_empty(&tx->tx_callbacks))
 		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
 
-	list_destroy(&tx->tx_callbacks);
-	list_destroy(&tx->tx_holds);
-#ifdef ZFS_DEBUG
-	refcount_destroy_many(&tx->tx_space_written,
-	    refcount_count(&tx->tx_space_written));
-	refcount_destroy_many(&tx->tx_space_freed,
-	    refcount_count(&tx->tx_space_freed));
-#endif
-	kmem_free(tx, sizeof (dmu_tx_t));
+	dmu_tx_destroy(tx);
 }
 
 uint64_t
@@ -1177,6 +1533,14 @@ dmu_tx_get_txg(dmu_tx_t *tx)
 	return (tx->tx_txg);
 }
 
+dsl_pool_t *
+dmu_tx_pool(dmu_tx_t *tx)
+{
+	ASSERT(tx->tx_pool != NULL);
+	return (tx->tx_pool);
+}
+
 void
 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
 {
@@ -1198,9 +1562,163 @@ dmu_tx_do_callbacks(list_t *cb_list, int
 {
 	dmu_tx_callback_t *dcb;
 
-	while (dcb = list_head(cb_list)) {
+	while ((dcb = list_head(cb_list)) != NULL) {
 		list_remove(cb_list, dcb);
 		dcb->dcb_func(dcb->dcb_data, error);
 		kmem_free(dcb, sizeof (dmu_tx_callback_t));
 	}
 }
+
+/*
+ * Interface to hold a bunch of attributes; used for creating new files.
+ * attrsize is the total size of all attributes to be added during object
+ * creation.
+ *
+ * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
+ */
+
+/*
+ * Hold the necessary attribute name for attribute registration.
+ * It should be a very rare case where this is needed.  If it does
+ * happen it would only happen on the first write to the file system.
+ */
+static void
+dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
+{
+	int i;
+
+	if (!sa->sa_need_attr_registration)
+		return;
+
+	for (i = 0; i != sa->sa_num_attrs; i++) {
+		if (!sa->sa_attr_table[i].sa_registered) {
+			if (sa->sa_reg_attr_obj)
+				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
+				    B_TRUE, sa->sa_attr_table[i].sa_name);
+			else
+				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
+				    B_TRUE, sa->sa_attr_table[i].sa_name);
+		}
+	}
+}
+
+
+void
+dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
+{
+	dnode_t *dn;
+	dmu_tx_hold_t *txh;
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+	    THT_SPILL, 0, 0);
+
+	dn = txh->txh_dnode;
+
+	if (dn == NULL)
+		return;
+
+	/* If blkptr doesn't exist then add space to towrite */
+	if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
+		(void) refcount_add_many(&txh->txh_space_towrite,
+		    SPA_OLD_MAXBLOCKSIZE, FTAG);
+	} else {
+		blkptr_t *bp;
+
+		bp = &dn->dn_phys->dn_spill;
+		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
+		    bp, bp->blk_birth)) {
+			(void) refcount_add_many(&txh->txh_space_tooverwrite,
+			    SPA_OLD_MAXBLOCKSIZE, FTAG);
+		} else {
+			(void) refcount_add_many(&txh->txh_space_towrite,
+			    SPA_OLD_MAXBLOCKSIZE, FTAG);
+		}
+		if (!BP_IS_HOLE(bp)) {
+			(void) refcount_add_many(&txh->txh_space_tounref,
+			    SPA_OLD_MAXBLOCKSIZE, FTAG);
+		}
+	}
+}
+
+void
+dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
+{
+	sa_os_t *sa = tx->tx_objset->os_sa;
+
+	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+	if (tx->tx_objset->os_sa->sa_master_obj == 0)
+		return;
+
+	if (tx->tx_objset->os_sa->sa_layout_attr_obj)
+		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+	else {
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+	}
+
+	dmu_tx_sa_registration_hold(sa, tx);
+
+	if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
+		return;
+
+	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
+	    THT_SPILL, 0, 0);
+}
+
+/*
+ * Hold an SA attribute.
+ *
+ * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+ *
+ * may_grow indicates that the SA data for this object may grow (for
+ * example, a variable-sized attribute is being enlarged); in that case
+ * the layout table and the spill block are also held.
+ */
+void
+dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+{
+	uint64_t object;
+	sa_os_t *sa = tx->tx_objset->os_sa;
+
+	ASSERT(hdl != NULL);
+
+	object = sa_handle_object(hdl);
+
+	dmu_tx_hold_bonus(tx, object);
+
+	if (tx->tx_objset->os_sa->sa_master_obj == 0)
+		return;
+
+	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
+	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+	}
+
+	dmu_tx_sa_registration_hold(sa, tx);
+
+	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
+		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+
+	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
+		ASSERT(tx->tx_txg == 0);
+		dmu_tx_hold_spill(tx, object);
+	} else {
+		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(db);
+		dn = DB_DNODE(db);
+		if (dn->dn_have_spill) {
+			ASSERT(tx->tx_txg == 0);
+			dmu_tx_hold_spill(tx, object);
+		}
+		DB_DNODE_EXIT(db);
+	}
+}
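
A hypothetical single-attribute update built on the holds above (a sketch; update_one_attr is not a real function, and `hdl' is an existing sa_handle_t for an object whose attribute layout is already registered):

static int
update_one_attr(objset_t *os, sa_handle_t *hdl, sa_attr_type_t attr,
    uint64_t value)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	/* B_FALSE: fixed-size update, the attribute cannot grow. */
	dmu_tx_hold_sa(tx, hdl, B_FALSE);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	err = sa_update(hdl, attr, &value, sizeof (value), tx);
	dmu_tx_commit(tx);
	return (err);
}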
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dmu_zfetch.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c	27 Feb 2010 22:30:50 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c	19 Dec 2016 02:06:18 -0000
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
+
 #include <sys/zfs_context.h>
 #include <sys/dnode.h>
 #include <sys/dmu_objset.h>
@@ -32,207 +36,60 @@
 #include <sys/kstat.h>
 
 /*
- * I'm against tune-ables, but these should probably exist as tweakable globals
- * until we can get this working the way we want it to.
+ * This tunable disables predictive prefetch.  Note that it leaves "prescient"
+ * prefetch (e.g. prefetch for zfs send) intact.  Unlike predictive prefetch,
+ * prescient prefetch never issues i/os that end up not being needed,
+ * so it can't hurt performance.
  */
-
-int zfs_prefetch_disable = 0;
+boolean_t zfs_prefetch_disable = B_FALSE;
 
 /* max # of streams per zfetch */
 uint32_t	zfetch_max_streams = 8;
 /* min time before stream reclaim */
 uint32_t	zfetch_min_sec_reap = 2;
-/* max number of blocks to fetch at a time */
-uint32_t	zfetch_block_cap = 256;
-/* number of bytes in a array_read at which we stop prefetching (1Mb) */
+/* max bytes to prefetch per stream (default 8MB) */
+uint32_t	zfetch_max_distance = 8 * 1024 * 1024;
+/* max bytes to prefetch indirects for per stream (default 64MB) */
+uint32_t	zfetch_max_idistance = 64 * 1024 * 1024;
+/* max number of bytes in an array_read in which we allow prefetching (1MB) */
 uint64_t	zfetch_array_rd_sz = 1024 * 1024;
 
-/* forward decls for static routines */
-static int		dmu_zfetch_colinear(zfetch_t *, zstream_t *);
-static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
-static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
-static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
-static int		dmu_zfetch_find(zfetch_t *, zstream_t *, int);
-static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
-static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
-static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
-static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW,
+    &zfs_prefetch_disable, 0, "Disable prefetch");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RWTUN,
+    &zfetch_max_streams, 0, "Max # of streams per zfetch");
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN,
+    &zfetch_min_sec_reap, 0, "Min time before stream reclaim");
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
+    &zfetch_max_distance, 0, "Max bytes to prefetch per stream");
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN,
+    &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream");
+SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN,
+    &zfetch_array_rd_sz, 0,
+    "Number of bytes in an array_read at which we stop prefetching");
 
 typedef struct zfetch_stats {
 	kstat_named_t zfetchstat_hits;
 	kstat_named_t zfetchstat_misses;
-	kstat_named_t zfetchstat_colinear_hits;
-	kstat_named_t zfetchstat_colinear_misses;
-	kstat_named_t zfetchstat_stride_hits;
-	kstat_named_t zfetchstat_stride_misses;
-	kstat_named_t zfetchstat_reclaim_successes;
-	kstat_named_t zfetchstat_reclaim_failures;
-	kstat_named_t zfetchstat_stream_resets;
-	kstat_named_t zfetchstat_stream_noresets;
-	kstat_named_t zfetchstat_bogus_streams;
+	kstat_named_t zfetchstat_max_streams;
 } zfetch_stats_t;
 
 static zfetch_stats_t zfetch_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
 	{ "misses",			KSTAT_DATA_UINT64 },
-	{ "colinear_hits",		KSTAT_DATA_UINT64 },
-	{ "colinear_misses",		KSTAT_DATA_UINT64 },
-	{ "stride_hits",		KSTAT_DATA_UINT64 },
-	{ "stride_misses",		KSTAT_DATA_UINT64 },
-	{ "reclaim_successes",		KSTAT_DATA_UINT64 },
-	{ "reclaim_failures",		KSTAT_DATA_UINT64 },
-	{ "streams_resets",		KSTAT_DATA_UINT64 },
-	{ "streams_noresets",		KSTAT_DATA_UINT64 },
-	{ "bogus_streams",		KSTAT_DATA_UINT64 },
+	{ "max_streams",		KSTAT_DATA_UINT64 },
 };
 
-#define	ZFETCHSTAT_INCR(stat, val) \
-	atomic_add_64(&zfetch_stats.stat.value.ui64, (val));
-
-#define	ZFETCHSTAT_BUMP(stat)		ZFETCHSTAT_INCR(stat, 1);
+#define	ZFETCHSTAT_BUMP(stat) \
+	atomic_inc_64(&zfetch_stats.stat.value.ui64);
 
 kstat_t		*zfetch_ksp;
 
-/*
- * Given a zfetch structure and a zstream structure, determine whether the
- * blocks to be read are part of a co-linear pair of existing prefetch
- * streams.  If a set is found, coalesce the streams, removing one, and
- * configure the prefetch so it looks for a strided access pattern.
- *
- * In other words: if we find two sequential access streams that are
- * the same length and distance N appart, and this read is N from the
- * last stream, then we are probably in a strided access pattern.  So
- * combine the two sequential streams into a single strided stream.
- *
- * If no co-linear streams are found, return NULL.
- */
-static int
-dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
-{
-	zstream_t	*z_walk;
-	zstream_t	*z_comp;
-
-	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
-		return (0);
-
-	if (zh == NULL) {
-		rw_exit(&zf->zf_rwlock);
-		return (0);
-	}
-
-	for (z_walk = list_head(&zf->zf_stream); z_walk;
-	    z_walk = list_next(&zf->zf_stream, z_walk)) {
-		for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
-		    z_comp = list_next(&zf->zf_stream, z_comp)) {
-			int64_t		diff;
-
-			if (z_walk->zst_len != z_walk->zst_stride ||
-			    z_comp->zst_len != z_comp->zst_stride) {
-				continue;
-			}
-
-			diff = z_comp->zst_offset - z_walk->zst_offset;
-			if (z_comp->zst_offset + diff == zh->zst_offset) {
-				z_walk->zst_offset = zh->zst_offset;
-				z_walk->zst_direction = diff < 0 ? -1 : 1;
-				z_walk->zst_stride =
-				    diff * z_walk->zst_direction;
-				z_walk->zst_ph_offset =
-				    zh->zst_offset + z_walk->zst_stride;
-				dmu_zfetch_stream_remove(zf, z_comp);
-				mutex_destroy(&z_comp->zst_lock);
-				kmem_free(z_comp, sizeof (zstream_t));
-
-				dmu_zfetch_dofetch(zf, z_walk);
-
-				rw_exit(&zf->zf_rwlock);
-				return (1);
-			}
-
-			diff = z_walk->zst_offset - z_comp->zst_offset;
-			if (z_walk->zst_offset + diff == zh->zst_offset) {
-				z_walk->zst_offset = zh->zst_offset;
-				z_walk->zst_direction = diff < 0 ? -1 : 1;
-				z_walk->zst_stride =
-				    diff * z_walk->zst_direction;
-				z_walk->zst_ph_offset =
-				    zh->zst_offset + z_walk->zst_stride;
-				dmu_zfetch_stream_remove(zf, z_comp);
-				mutex_destroy(&z_comp->zst_lock);
-				kmem_free(z_comp, sizeof (zstream_t));
-
-				dmu_zfetch_dofetch(zf, z_walk);
-
-				rw_exit(&zf->zf_rwlock);
-				return (1);
-			}
-		}
-	}
-
-	rw_exit(&zf->zf_rwlock);
-	return (0);
-}
-
-/*
- * Given a zstream_t, determine the bounds of the prefetch.  Then call the
- * routine that actually prefetches the individual blocks.
- */
-static void
-dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
-{
-	uint64_t	prefetch_tail;
-	uint64_t	prefetch_limit;
-	uint64_t	prefetch_ofst;
-	uint64_t	prefetch_len;
-	uint64_t	blocks_fetched;
-
-	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
-	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
-
-	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
-	    (int64_t)(zs->zst_offset + zs->zst_stride));
-	/*
-	 * XXX: use a faster division method?
-	 */
-	prefetch_limit = zs->zst_offset + zs->zst_len +
-	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;
-
-	while (prefetch_tail < prefetch_limit) {
-		prefetch_ofst = zs->zst_offset + zs->zst_direction *
-		    (prefetch_tail - zs->zst_offset);
-
-		prefetch_len = zs->zst_len;
-
-		/*
-		 * Don't prefetch beyond the end of the file, if working
-		 * backwards.
-		 */
-		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
-		    (prefetch_ofst > prefetch_tail)) {
-			prefetch_len += prefetch_ofst;
-			prefetch_ofst = 0;
-		}
-
-		/* don't prefetch more than we're supposed to */
-		if (prefetch_len > zs->zst_len)
-			break;
-
-		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
-		    prefetch_ofst, zs->zst_len);
-
-		prefetch_tail += zs->zst_stride;
-		/* stop if we've run out of stuff to prefetch */
-		if (blocks_fetched < zs->zst_len)
-			break;
-	}
-	zs->zst_ph_offset = prefetch_tail;
-	zs->zst_last = ddi_get_lbolt();
-}
-
 void
 zfetch_init(void)
 {
-
 	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
@@ -260,272 +117,41 @@ zfetch_fini(void)
 void
 dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
 {
-	if (zf == NULL) {
+	if (zf == NULL)
 		return;
-	}
 
 	zf->zf_dnode = dno;
-	zf->zf_stream_cnt = 0;
-	zf->zf_alloc_fail = 0;
 
 	list_create(&zf->zf_stream, sizeof (zstream_t),
-	    offsetof(zstream_t, zst_node));
+	    offsetof(zstream_t, zs_node));
 
 	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
 }
 
-/*
- * This function computes the actual size, in blocks, that can be prefetched,
- * and fetches it.
- */
-static uint64_t
-dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
-{
-	uint64_t	fetchsz;
-	uint64_t	i;
-
-	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
-
-	for (i = 0; i < fetchsz; i++) {
-		dbuf_prefetch(dn, blkid + i);
-	}
-
-	return (fetchsz);
-}
-
-/*
- * this function returns the number of blocks that would be prefetched, based
- * upon the supplied dnode, blockid, and nblks.  This is used so that we can
- * update streams in place, and then prefetch with their old value after the
- * fact.  This way, we can delay the prefetch, but subsequent accesses to the
- * stream won't result in the same data being prefetched multiple times.
- */
-static uint64_t
-dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
-{
-	uint64_t	fetchsz;
-
-	if (blkid > dn->dn_maxblkid) {
-		return (0);
-	}
-
-	/* compute fetch size */
-	if (blkid + nblks + 1 > dn->dn_maxblkid) {
-		fetchsz = (dn->dn_maxblkid - blkid) + 1;
-		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
-	} else {
-		fetchsz = nblks;
-	}
-
-
-	return (fetchsz);
-}
-
-/*
- * given a zfetch and a zstream structure, see if there is an associated zstream
- * for this block read.  If so, it starts a prefetch for the stream it
- * located and returns true, otherwise it returns false
- */
-static int
-dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
+static void
+dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
 {
-	zstream_t	*zs;
-	int64_t		diff;
-	int		reset = !prefetched;
-	int		rc = 0;
-
-	if (zh == NULL)
-		return (0);
-
-	/*
-	 * XXX: This locking strategy is a bit coarse; however, it's impact has
-	 * yet to be tested.  If this turns out to be an issue, it can be
-	 * modified in a number of different ways.
-	 */
-
-	rw_enter(&zf->zf_rwlock, RW_READER);
-top:
-
-	for (zs = list_head(&zf->zf_stream); zs;
-	    zs = list_next(&zf->zf_stream, zs)) {
-
-		/*
-		 * XXX - should this be an assert?
-		 */
-		if (zs->zst_len == 0) {
-			/* bogus stream */
-			ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
-			continue;
-		}
-
-		/*
-		 * We hit this case when we are in a strided prefetch stream:
-		 * we will read "len" blocks before "striding".
-		 */
-		if (zh->zst_offset >= zs->zst_offset &&
-		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
-			if (prefetched) {
-				/* already fetched */
-				ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
-				rc = 1;
-				goto out;
-			} else {
-				ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
-			}
-		}
-
-		/*
-		 * This is the forward sequential read case: we increment
-		 * len by one each time we hit here, so we will enter this
-		 * case on every read.
-		 */
-		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
-
-			reset = !prefetched && zs->zst_len > 1;
-
-			mutex_enter(&zs->zst_lock);
-
-			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
-				mutex_exit(&zs->zst_lock);
-				goto top;
-			}
-			zs->zst_len += zh->zst_len;
-			diff = zs->zst_len - zfetch_block_cap;
-			if (diff > 0) {
-				zs->zst_offset += diff;
-				zs->zst_len = zs->zst_len > diff ?
-				    zs->zst_len - diff : 0;
-			}
-			zs->zst_direction = ZFETCH_FORWARD;
-
-			break;
-
-		/*
-		 * Same as above, but reading backwards through the file.
-		 */
-		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
-			/* backwards sequential access */
-
-			reset = !prefetched && zs->zst_len > 1;
-
-			mutex_enter(&zs->zst_lock);
-
-			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
-				mutex_exit(&zs->zst_lock);
-				goto top;
-			}
-
-			zs->zst_offset = zs->zst_offset > zh->zst_len ?
-			    zs->zst_offset - zh->zst_len : 0;
-			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
-			    zs->zst_ph_offset - zh->zst_len : 0;
-			zs->zst_len += zh->zst_len;
-
-			diff = zs->zst_len - zfetch_block_cap;
-			if (diff > 0) {
-				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
-				    zs->zst_ph_offset - diff : 0;
-				zs->zst_len = zs->zst_len > diff ?
-				    zs->zst_len - diff : zs->zst_len;
-			}
-			zs->zst_direction = ZFETCH_BACKWARD;
-
-			break;
-
-		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
-		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
-			/* strided forward access */
-
-			mutex_enter(&zs->zst_lock);
-
-			if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
-			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
-				mutex_exit(&zs->zst_lock);
-				goto top;
-			}
-
-			zs->zst_offset += zs->zst_stride;
-			zs->zst_direction = ZFETCH_FORWARD;
-
-			break;
-
-		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
-		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
-			/* strided reverse access */
-
-			mutex_enter(&zs->zst_lock);
-
-			if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
-			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
-				mutex_exit(&zs->zst_lock);
-				goto top;
-			}
-
-			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
-			    zs->zst_offset - zs->zst_stride : 0;
-			zs->zst_ph_offset = (zs->zst_ph_offset >
-			    (2 * zs->zst_stride)) ?
-			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
-			zs->zst_direction = ZFETCH_BACKWARD;
-
-			break;
-		}
-	}
-
-	if (zs) {
-		if (reset) {
-			zstream_t *remove = zs;
-
-			ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
-			rc = 0;
-			mutex_exit(&zs->zst_lock);
-			rw_exit(&zf->zf_rwlock);
-			rw_enter(&zf->zf_rwlock, RW_WRITER);
-			/*
-			 * Relocate the stream, in case someone removes
-			 * it while we were acquiring the WRITER lock.
-			 */
-			for (zs = list_head(&zf->zf_stream); zs;
-			    zs = list_next(&zf->zf_stream, zs)) {
-				if (zs == remove) {
-					dmu_zfetch_stream_remove(zf, zs);
-					mutex_destroy(&zs->zst_lock);
-					kmem_free(zs, sizeof (zstream_t));
-					break;
-				}
-			}
-		} else {
-			ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
-			rc = 1;
-			dmu_zfetch_dofetch(zf, zs);
-			mutex_exit(&zs->zst_lock);
-		}
-	}
-out:
-	rw_exit(&zf->zf_rwlock);
-	return (rc);
+	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+	list_remove(&zf->zf_stream, zs);
+	mutex_destroy(&zs->zs_lock);
+	kmem_free(zs, sizeof (*zs));
 }
 
 /*
- * Clean-up state associated with a zfetch structure.  This frees allocated
- * structure members, empties the zf_stream tree, and generally makes things
- * nice.  This doesn't free the zfetch_t itself, that's left to the caller.
+ * Clean up state associated with a zfetch structure (e.g. destroy the
+ * streams).  This doesn't free the zfetch_t itself; that's left to the caller.
  */
 void
-dmu_zfetch_rele(zfetch_t *zf)
+dmu_zfetch_fini(zfetch_t *zf)
 {
-	zstream_t	*zs;
-	zstream_t	*zs_next;
+	zstream_t *zs;
 
 	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
 
-	for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
-		zs_next = list_next(&zf->zf_stream, zs);
-
-		list_remove(&zf->zf_stream, zs);
-		mutex_destroy(&zs->zst_lock);
-		kmem_free(zs, sizeof (zstream_t));
-	}
+	rw_enter(&zf->zf_rwlock, RW_WRITER);
+	while ((zs = list_head(&zf->zf_stream)) != NULL)
+		dmu_zfetch_stream_remove(zf, zs);
+	rw_exit(&zf->zf_rwlock);
 	list_destroy(&zf->zf_stream);
 	rw_destroy(&zf->zf_rwlock);
 
@@ -533,192 +159,190 @@ dmu_zfetch_rele(zfetch_t *zf)
 }
 
 /*
- * Given a zfetch and zstream structure, insert the zstream structure into the
- * AVL tree contained within the zfetch structure.  Peform the appropriate
- * book-keeping.  It is possible that another thread has inserted a stream which
- * matches one that we are about to insert, so we must be sure to check for this
- * case.  If one is found, return failure, and let the caller cleanup the
- * duplicates.
+ * If there aren't too many streams already, create a new stream.
+ * The "blkid" argument is the next block that we expect this stream to access.
+ * While we're here, clean up old streams (which haven't been
+ * accessed for at least zfetch_min_sec_reap seconds).
  */
-static int
-dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
+static void
+dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 {
-	zstream_t	*zs_walk;
-	zstream_t	*zs_next;
+	zstream_t *zs_next;
+	int numstreams = 0;
 
 	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
 
-	for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
-		zs_next = list_next(&zf->zf_stream, zs_walk);
-
-		if (dmu_zfetch_streams_equal(zs_walk, zs)) {
-			return (0);
-		}
-	}
-
-	list_insert_head(&zf->zf_stream, zs);
-	zf->zf_stream_cnt++;
-	return (1);
-}
-
-
-/*
- * Walk the list of zstreams in the given zfetch, find an old one (by time), and
- * reclaim it for use by the caller.
- */
-static zstream_t *
-dmu_zfetch_stream_reclaim(zfetch_t *zf)
-{
-	zstream_t	*zs;
-
-	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
-		return (0);
-
-	for (zs = list_head(&zf->zf_stream); zs;
-	    zs = list_next(&zf->zf_stream, zs)) {
-
-		if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap)
-			break;
+	/*
+	 * Clean up old streams.
+	 */
+	for (zstream_t *zs = list_head(&zf->zf_stream);
+	    zs != NULL; zs = zs_next) {
+		zs_next = list_next(&zf->zf_stream, zs);
+		if (((gethrtime() - zs->zs_atime) / NANOSEC) >
+		    zfetch_min_sec_reap)
+			dmu_zfetch_stream_remove(zf, zs);
+		else
+			numstreams++;
 	}
 
-	if (zs) {
-		dmu_zfetch_stream_remove(zf, zs);
-		mutex_destroy(&zs->zst_lock);
-		bzero(zs, sizeof (zstream_t));
-	} else {
-		zf->zf_alloc_fail++;
+	/*
+	 * The maximum number of streams is normally zfetch_max_streams,
+	 * but for small files we lower it such that it's at least possible
+	 * for all the streams to be non-overlapping.
+	 *
+	 * If we are already at the maximum number of streams for this file,
+	 * even after removing old streams, then don't create this stream.
+	 */
+	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
+	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+	    zfetch_max_distance));
+	if (numstreams >= max_streams) {
+		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
+		return;
 	}
-	rw_exit(&zf->zf_rwlock);
 
-	return (zs);
-}
+	zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
+	zs->zs_blkid = blkid;
+	zs->zs_pf_blkid = blkid;
+	zs->zs_ipf_blkid = blkid;
+	zs->zs_atime = gethrtime();
+	mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
 
-/*
- * Given a zfetch and zstream structure, remove the zstream structure from its
- * container in the zfetch structure.  Perform the appropriate book-keeping.
- */
-static void
-dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
-{
-	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
-
-	list_remove(&zf->zf_stream, zs);
-	zf->zf_stream_cnt--;
-}
-
-static int
-dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
-{
-	if (zs1->zst_offset != zs2->zst_offset)
-		return (0);
-
-	if (zs1->zst_len != zs2->zst_len)
-		return (0);
-
-	if (zs1->zst_stride != zs2->zst_stride)
-		return (0);
-
-	if (zs1->zst_ph_offset != zs2->zst_ph_offset)
-		return (0);
-
-	if (zs1->zst_cap != zs2->zst_cap)
-		return (0);
-
-	if (zs1->zst_direction != zs2->zst_direction)
-		return (0);
-
-	return (1);
+	list_insert_head(&zf->zf_stream, zs);
 }
 
 /*
- * This is the prefetch entry point.  It calls all of the other dmu_zfetch
- * routines to create, delete, find, or operate upon prefetch streams.
+ * This is the predictive prefetch entry point.  It associates the dnode
+ * access specified by the blkid and nblks arguments with a prefetch stream,
+ * predicts further accesses based on that stream's history, and initiates
+ * speculative prefetch.  The fetch_data argument controls what is fetched:
+ *   FALSE -- prefetch only indirect blocks for predicted data blocks;
+ *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
  */
 void
-dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
 {
-	zstream_t	zst;
-	zstream_t	*newstream;
-	int		fetched;
-	int		inserted;
-	unsigned int	blkshft;
-	uint64_t	blksz;
+	zstream_t *zs;
+	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+	int64_t pf_ahead_blks, max_blks;
+	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
+	uint64_t end_of_access_blkid = blkid + nblks;
 
 	if (zfs_prefetch_disable)
 		return;
 
-	/* files that aren't ln2 blocksz are only one block -- nothing to do */
-	if (!zf->zf_dnode->dn_datablkshift)
+	/*
+	 * As a fast path for small (single-block) files, ignore access
+	 * to the first block.
+	 */
+	if (blkid == 0)
 		return;
 
-	/* convert offset and size, into blockid and nblocks */
-	blkshft = zf->zf_dnode->dn_datablkshift;
-	blksz = (1 << blkshft);
-
-	bzero(&zst, sizeof (zstream_t));
-	zst.zst_offset = offset >> blkshft;
-	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
-	    P2ALIGN(offset, blksz)) >> blkshft;
-
-	fetched = dmu_zfetch_find(zf, &zst, prefetched);
-	if (fetched) {
-		ZFETCHSTAT_BUMP(zfetchstat_hits);
-	} else {
-		ZFETCHSTAT_BUMP(zfetchstat_misses);
-		if (fetched = dmu_zfetch_colinear(zf, &zst)) {
-			ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
-		} else {
-			ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
+	rw_enter(&zf->zf_rwlock, RW_READER);
+
+	for (zs = list_head(&zf->zf_stream); zs != NULL;
+	    zs = list_next(&zf->zf_stream, zs)) {
+		if (blkid == zs->zs_blkid) {
+			mutex_enter(&zs->zs_lock);
+			/*
+			 * zs_blkid could have changed before we
+			 * acquired zs_lock; re-check it here.
+			 */
+			if (blkid != zs->zs_blkid) {
+				mutex_exit(&zs->zs_lock);
+				continue;
+			}
+			break;
 		}
 	}
 
-	if (!fetched) {
-		newstream = dmu_zfetch_stream_reclaim(zf);
-
+	if (zs == NULL) {
 		/*
-		 * we still couldn't find a stream, drop the lock, and allocate
-		 * one if possible.  Otherwise, give up and go home.
+		 * This access is not part of any existing stream.  Create
+		 * a new stream for it.
 		 */
-		if (newstream) {
-			ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes);
-		} else {
-			uint64_t	maxblocks;
-			uint32_t	max_streams;
-			uint32_t	cur_streams;
-
-			ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
-			cur_streams = zf->zf_stream_cnt;
-			maxblocks = zf->zf_dnode->dn_maxblkid;
-
-			max_streams = MIN(zfetch_max_streams,
-			    (maxblocks / zfetch_block_cap));
-			if (max_streams == 0) {
-				max_streams++;
-			}
+		ZFETCHSTAT_BUMP(zfetchstat_misses);
+		if (rw_tryupgrade(&zf->zf_rwlock))
+			dmu_zfetch_stream_create(zf, end_of_access_blkid);
+		rw_exit(&zf->zf_rwlock);
+		return;
+	}
 
-			if (cur_streams >= max_streams) {
-				return;
-			}
-			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
-		}
+	/*
+	 * This access was to a block that we issued a prefetch for on
+	 * behalf of this stream. Issue further prefetches for this stream.
+	 *
+	 * Normally, we start prefetching where we stopped
+	 * prefetching last (zs_pf_blkid).  But on the first hit on
+	 * this stream, zs_pf_blkid == zs_blkid and we don't want to
+	 * prefetch the block we just accessed.  In that case, start
+	 * just after the block we just accessed.
+	 */
+	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
+
+	/*
+	 * Double our amount of prefetched data, but don't let the
+	 * prefetch get further ahead than zfetch_max_distance.
+	 */
+	if (fetch_data) {
+		max_dist_blks =
+		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
+		/*
+		 * Previously, we were (zs_pf_blkid - blkid) ahead.  We
+		 * want to now be double that, so read that amount again,
+		 * plus the amount we are catching up by (i.e. the amount
+		 * read just now).
+		 */
+		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
+		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
+		pf_nblks = MIN(pf_ahead_blks, max_blks);
+	} else {
+		pf_nblks = 0;
+	}
 
-		newstream->zst_offset = zst.zst_offset;
-		newstream->zst_len = zst.zst_len;
-		newstream->zst_stride = zst.zst_len;
-		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
-		newstream->zst_cap = zst.zst_len;
-		newstream->zst_direction = ZFETCH_FORWARD;
-		newstream->zst_last = ddi_get_lbolt();
+	zs->zs_pf_blkid = pf_start + pf_nblks;
 
-		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
+	/*
+	 * Do the same for indirects, starting from where we stopped last,
+	 * or where we will stop reading data blocks (and the indirects
+	 * that point to them).
+	 */
+	ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
+	max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
+	/*
+	 * We want to double our distance ahead of the data prefetch
+	 * (or reader, if we are not prefetching data).  Previously, we
+	 * were (zs_ipf_blkid - blkid) ahead.  To double that, we read
+	 * that amount again, plus the amount we are catching up by
+	 * (i.e. the amount read now + the amount of data prefetched now).
+	 */
+	pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
+	max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+	ipf_nblks = MIN(pf_ahead_blks, max_blks);
+	zs->zs_ipf_blkid = ipf_start + ipf_nblks;
+
+	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+	ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+	ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
+
+	zs->zs_atime = gethrtime();
+	zs->zs_blkid = end_of_access_blkid;
+	mutex_exit(&zs->zs_lock);
+	rw_exit(&zf->zf_rwlock);
 
-		rw_enter(&zf->zf_rwlock, RW_WRITER);
-		inserted = dmu_zfetch_stream_insert(zf, newstream);
-		rw_exit(&zf->zf_rwlock);
+	/*
+	 * dbuf_prefetch() is asynchronous (even when it needs to read
+	 * indirect blocks), but we still prefer to drop our locks before
+	 * calling it to reduce the time we hold them.
+	 */
 
-		if (!inserted) {
-			mutex_destroy(&newstream->zst_lock);
-			kmem_free(newstream, sizeof (zstream_t));
-		}
+	for (int i = 0; i < pf_nblks; i++) {
+		dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
+		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+	}
+	for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+		dbuf_prefetch(zf->zf_dnode, 1, iblk,
+		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
 	}
+	ZFETCHSTAT_BUMP(zfetchstat_hits);
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c,v
retrieving revision 1.4
diff -u -p -r1.4 dnode.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c	21 Jun 2013 16:22:46 -0000	1.4
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c	26 Apr 2017 00:42:13 -0000
@@ -19,8 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
@@ -35,44 +37,130 @@
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
-
-static int free_range_compar(const void *node1, const void *node2);
+#include <sys/range_tree.h>
 
 static kmem_cache_t *dnode_cache;
+/*
+ * Define DNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when DEBUG is also defined.
+ */
+#ifdef	DEBUG
+#define	DNODE_STATS
+#endif	/* DEBUG */
+
+#ifdef	DNODE_STATS
+#define	DNODE_STAT_ADD(stat)			((stat)++)
+#else
+#define	DNODE_STAT_ADD(stat)			/* nothing */
+#endif	/* DNODE_STATS */
 
 static dnode_phys_t dnode_phys_zero;
 
 int zfs_default_bs = SPA_MINBLOCKSHIFT;
 int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 
+#ifdef illumos
+static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
+#endif
+
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+	const dmu_buf_impl_t *d1 = x1;
+	const dmu_buf_impl_t *d2 = x2;
+
+	if (d1->db_level < d2->db_level) {
+		return (-1);
+	}
+	if (d1->db_level > d2->db_level) {
+		return (1);
+	}
+
+	if (d1->db_blkid < d2->db_blkid) {
+		return (-1);
+	}
+	if (d1->db_blkid > d2->db_blkid) {
+		return (1);
+	}
+
+	if (d1->db_state == DB_SEARCH) {
+		ASSERT3S(d2->db_state, !=, DB_SEARCH);
+		return (-1);
+	} else if (d2->db_state == DB_SEARCH) {
+		ASSERT3S(d1->db_state, !=, DB_SEARCH);
+		return (1);
+	}
+
+	if ((uintptr_t)d1 < (uintptr_t)d2) {
+		return (-1);
+	}
+	if ((uintptr_t)d1 > (uintptr_t)d2) {
+		return (1);
+	}
+	return (0);
+}
+
 /* ARGSUSED */
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
 {
+	dnode_t *dn = arg;
 	int i;
-	dnode_t *dn = unused;
-	bzero(dn, sizeof (dnode_t));
 
+#ifdef __NetBSD__
+	dn = unused;
+#endif
 	rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
 
-	refcount_create(&dn->dn_holds);
+	/*
+	 * Every dbuf has a reference, and dropping a tracked reference is
+	 * O(number of references), so don't track dn_holds.
+	 */
+	refcount_create_untracked(&dn->dn_holds);
 	refcount_create(&dn->dn_tx_holds);
+	list_link_init(&dn->dn_link);
+
+	bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
+	bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
+	bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
+	bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
+	bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
+	bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
+	bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
 
 	for (i = 0; i < TXG_SIZE; i++) {
-		avl_create(&dn->dn_ranges[i], free_range_compar,
-		    sizeof (free_range_t),
-		    offsetof(struct free_range, fr_node));
+		list_link_init(&dn->dn_dirty_link[i]);
+		dn->dn_free_ranges[i] = NULL;
 		list_create(&dn->dn_dirty_records[i],
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
 
-	list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+	dn->dn_allocated_txg = 0;
+	dn->dn_free_txg = 0;
+	dn->dn_assigned_txg = 0;
+	dn->dn_dirtyctx = 0;
+	dn->dn_dirtyctx_firstset = NULL;
+	dn->dn_bonus = NULL;
+	dn->dn_have_spill = B_FALSE;
+	dn->dn_zio = NULL;
+	dn->dn_oldused = 0;
+	dn->dn_oldflags = 0;
+	dn->dn_olduid = 0;
+	dn->dn_oldgid = 0;
+	dn->dn_newuid = 0;
+	dn->dn_newgid = 0;
+	dn->dn_id_flags = 0;
+
+	dn->dn_dbufs_count = 0;
+	avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 
+	dn->dn_moved = 0;
+	POINTER_INVALIDATE(&dn->dn_objset);
 	return (0);
 }
 
@@ -81,35 +169,67 @@ static void
 dnode_dest(void *arg, void *unused)
 {
 	int i;
-	dnode_t *dn = unused;
+	dnode_t *dn = arg;
 
+#ifdef __NetBSD__
+	dn = unused;
+#endif
 	rw_destroy(&dn->dn_struct_rwlock);
 	mutex_destroy(&dn->dn_mtx);
 	mutex_destroy(&dn->dn_dbufs_mtx);
 	cv_destroy(&dn->dn_notxholds);
 	refcount_destroy(&dn->dn_holds);
 	refcount_destroy(&dn->dn_tx_holds);
+	ASSERT(!list_link_active(&dn->dn_link));
 
 	for (i = 0; i < TXG_SIZE; i++) {
-		avl_destroy(&dn->dn_ranges[i]);
+		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 		list_destroy(&dn->dn_dirty_records[i]);
-	}
+		ASSERT0(dn->dn_next_nblkptr[i]);
+		ASSERT0(dn->dn_next_nlevels[i]);
+		ASSERT0(dn->dn_next_indblkshift[i]);
+		ASSERT0(dn->dn_next_bonustype[i]);
+		ASSERT0(dn->dn_rm_spillblk[i]);
+		ASSERT0(dn->dn_next_bonuslen[i]);
+		ASSERT0(dn->dn_next_blksz[i]);
+	}
+
+	ASSERT0(dn->dn_allocated_txg);
+	ASSERT0(dn->dn_free_txg);
+	ASSERT0(dn->dn_assigned_txg);
+	ASSERT0(dn->dn_dirtyctx);
+	ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
+	ASSERT3P(dn->dn_bonus, ==, NULL);
+	ASSERT(!dn->dn_have_spill);
+	ASSERT3P(dn->dn_zio, ==, NULL);
+	ASSERT0(dn->dn_oldused);
+	ASSERT0(dn->dn_oldflags);
+	ASSERT0(dn->dn_olduid);
+	ASSERT0(dn->dn_oldgid);
+	ASSERT0(dn->dn_newuid);
+	ASSERT0(dn->dn_newgid);
+	ASSERT0(dn->dn_id_flags);
 
-	list_destroy(&dn->dn_dbufs);
+	ASSERT0(dn->dn_dbufs_count);
+	avl_destroy(&dn->dn_dbufs);
 }
 
 void
 dnode_init(void)
 {
+	ASSERT(dnode_cache == NULL);
 	dnode_cache = kmem_cache_create("dnode_t",
 	    sizeof (dnode_t),
 	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+	kmem_cache_set_move(dnode_cache, dnode_move);
 }
 
 void
 dnode_fini(void)
 {
 	kmem_cache_destroy(dnode_cache);
+	dnode_cache = NULL;
 }
 
 
@@ -121,8 +241,9 @@ dnode_verify(dnode_t *dn)
 
 	ASSERT(dn->dn_phys);
 	ASSERT(dn->dn_objset);
+	ASSERT(dn->dn_handle->dnh_dnode == dn);
 
-	ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
 	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
 		return;
@@ -141,7 +262,7 @@ dnode_verify(dnode_t *dn)
 			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
 		}
 		ASSERT3U(dn->dn_nlevels, <=, 30);
-		ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES);
+		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
 		ASSERT3U(dn->dn_nblkptr, >=, 1);
 		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 		ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
@@ -207,9 +328,16 @@ dnode_byteswap(dnode_phys_t *dnp)
 		 */
 		int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
 		size_t len = DN_MAX_BONUSLEN - off;
-		ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES);
-		dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len);
+		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
+		dmu_object_byteswap_t byteswap =
+		    DMU_OT_BYTESWAP(dnp->dn_bonustype);
+		dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
 	}
+
+	/* Swap SPILL block if we have one */
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+		byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
+
 }
 
 void
@@ -228,19 +356,6 @@ dnode_buf_byteswap(void *vbuf, size_t si
 	}
 }
 
-static int
-free_range_compar(const void *node1, const void *node2)
-{
-	const free_range_t *rp1 = node1;
-	const free_range_t *rp2 = node2;
-
-	if (rp1->fr_blkid < rp2->fr_blkid)
-		return (-1);
-	else if (rp1->fr_blkid > rp2->fr_blkid)
-		return (1);
-	else return (0);
-}
-
 void
 dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 {
@@ -258,33 +373,66 @@ dnode_setbonuslen(dnode_t *dn, int newsi
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
+void
+dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
+{
+	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+	dnode_setdirty(dn, tx);
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	dn->dn_bonustype = newtype;
+	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+	dnode_setdirty(dn, tx);
+	dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
+	dn->dn_have_spill = B_FALSE;
+}
+
 static void
 dnode_setdblksz(dnode_t *dn, int size)
 {
-	ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
+	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
 	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
 	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
 	dn->dn_datablksz = size;
 	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
-	dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
+	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
 }
 
 static dnode_t *
 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
-    uint64_t object)
+    uint64_t object, dnode_handle_t *dnh)
 {
-	dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
-//	(void) dnode_cons(dn, NULL, 0); /* XXX */
+	dnode_t *dn;
 
-	dn->dn_objset = os;
+	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+	ASSERT(!POINTER_IS_VALID(dn->dn_objset));
+	dn->dn_moved = 0;
+
+	/*
+	 * Defer setting dn_objset until the dnode is ready to be a candidate
+	 * for the dnode_move() callback.
+	 */
 	dn->dn_object = object;
 	dn->dn_dbuf = db;
+	dn->dn_handle = dnh;
 	dn->dn_phys = dnp;
 
-	if (dnp->dn_datablkszsec)
+	if (dnp->dn_datablkszsec) {
 		dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+	} else {
+		dn->dn_datablksz = 0;
+		dn->dn_datablkszsec = 0;
+		dn->dn_datablkshift = 0;
+	}
 	dn->dn_indblkshift = dnp->dn_indblkshift;
 	dn->dn_nlevels = dnp->dn_nlevels;
 	dn->dn_type = dnp->dn_type;
@@ -294,51 +442,100 @@ dnode_create(objset_t *os, dnode_phys_t 
 	dn->dn_bonustype = dnp->dn_bonustype;
 	dn->dn_bonuslen = dnp->dn_bonuslen;
 	dn->dn_maxblkid = dnp->dn_maxblkid;
+	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
+	dn->dn_id_flags = 0;
 
 	dmu_zfetch_init(&dn->dn_zfetch, dn);
 
-	ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES);
+	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+
 	mutex_enter(&os->os_lock);
-	list_insert_head(&os->os_dnodes, dn);
+	if (dnh->dnh_dnode != NULL) {
+		/* Lost the allocation race. */
+		mutex_exit(&os->os_lock);
+		kmem_cache_free(dnode_cache, dn);
+		return (dnh->dnh_dnode);
+	}
+
+	/*
+	 * Exclude special dnodes from os_dnodes so an empty os_dnodes
+	 * signifies that the special dnodes have no references from
+	 * their children (the entries in os_dnodes).  This allows
+	 * dnode_destroy() to easily determine if the last child has
+	 * been removed and then complete eviction of the objset.
+	 */
+	if (!DMU_OBJECT_IS_SPECIAL(object))
+		list_insert_head(&os->os_dnodes, dn);
+	membar_producer();
+
+	/*
+	 * Everything else must be valid before assigning dn_objset
+	 * makes the dnode eligible for dnode_move().
+	 */
+	dn->dn_objset = os;
+
+	dnh->dnh_dnode = dn;
 	mutex_exit(&os->os_lock);
 
 	arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
 	return (dn);
 }
 
+/*
+ * Caller must be holding the dnode handle, which is released upon return.
+ */
 static void
 dnode_destroy(dnode_t *dn)
 {
 	objset_t *os = dn->dn_objset;
+	boolean_t complete_os_eviction = B_FALSE;
 
-#ifdef ZFS_DEBUG
-	int i;
-
-	for (i = 0; i < TXG_SIZE; i++) {
-		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
-		ASSERT(NULL == list_head(&dn->dn_dirty_records[i]));
-		ASSERT(0 == avl_numnodes(&dn->dn_ranges[i]));
-	}
-	ASSERT(NULL == list_head(&dn->dn_dbufs));
-#endif
-	ASSERT(dn->dn_oldphys == NULL);
+	ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 
 	mutex_enter(&os->os_lock);
-	list_remove(&os->os_dnodes, dn);
+	POINTER_INVALIDATE(&dn->dn_objset);
+	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+		list_remove(&os->os_dnodes, dn);
+		complete_os_eviction =
+		    list_is_empty(&os->os_dnodes) &&
+		    list_link_active(&os->os_evicting_node);
+	}
 	mutex_exit(&os->os_lock);
 
-	if (dn->dn_dirtyctx_firstset) {
+	/* the dnode can no longer move, so we can release the handle */
+	zrl_remove(&dn->dn_handle->dnh_zrlock);
+
+	dn->dn_allocated_txg = 0;
+	dn->dn_free_txg = 0;
+	dn->dn_assigned_txg = 0;
+
+	dn->dn_dirtyctx = 0;
+	if (dn->dn_dirtyctx_firstset != NULL) {
 		kmem_free(dn->dn_dirtyctx_firstset, 1);
 		dn->dn_dirtyctx_firstset = NULL;
 	}
-	dmu_zfetch_rele(&dn->dn_zfetch);
-	if (dn->dn_bonus) {
+	if (dn->dn_bonus != NULL) {
 		mutex_enter(&dn->dn_bonus->db_mtx);
-		dbuf_evict(dn->dn_bonus);
+		dbuf_destroy(dn->dn_bonus);
 		dn->dn_bonus = NULL;
 	}
+	dn->dn_zio = NULL;
+
+	dn->dn_have_spill = B_FALSE;
+	dn->dn_oldused = 0;
+	dn->dn_oldflags = 0;
+	dn->dn_olduid = 0;
+	dn->dn_oldgid = 0;
+	dn->dn_newuid = 0;
+	dn->dn_newgid = 0;
+	dn->dn_id_flags = 0;
+
+	dmu_zfetch_fini(&dn->dn_zfetch);
 	kmem_cache_free(dnode_cache, dn);
 	arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+
+	if (complete_os_eviction)
+		dmu_objset_evict_done(os);
 }
 
 void
@@ -347,10 +544,10 @@ dnode_allocate(dnode_t *dn, dmu_object_t
 {
 	int i;
 
+	ASSERT3U(blocksize, <=,
+	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	if (blocksize == 0)
 		blocksize = 1 << zfs_default_bs;
-	else if (blocksize > SPA_MAXBLOCKSIZE)
-		blocksize = SPA_MAXBLOCKSIZE;
 	else
 		blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 
@@ -366,34 +563,42 @@ dnode_allocate(dnode_t *dn, dmu_object_t
 	ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
 	ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
 	ASSERT(ot != DMU_OT_NONE);
-	ASSERT3U(ot, <, DMU_OT_NUMTYPES);
+	ASSERT(DMU_OT_IS_VALID(ot));
 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+	    (bonustype == DMU_OT_SA && bonuslen == 0) ||
 	    (bonustype != DMU_OT_NONE && bonuslen != 0));
-	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+	ASSERT(DMU_OT_IS_VALID(bonustype));
 	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 	ASSERT(dn->dn_type == DMU_OT_NONE);
-	ASSERT3U(dn->dn_maxblkid, ==, 0);
-	ASSERT3U(dn->dn_allocated_txg, ==, 0);
-	ASSERT3U(dn->dn_assigned_txg, ==, 0);
+	ASSERT0(dn->dn_maxblkid);
+	ASSERT0(dn->dn_allocated_txg);
+	ASSERT0(dn->dn_assigned_txg);
 	ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 	ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
-	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+	ASSERT(avl_is_empty(&dn->dn_dbufs));
 
 	for (i = 0; i < TXG_SIZE; i++) {
-		ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
-		ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
-		ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
-		ASSERT3U(dn->dn_next_blksz[i], ==, 0);
+		ASSERT0(dn->dn_next_nblkptr[i]);
+		ASSERT0(dn->dn_next_nlevels[i]);
+		ASSERT0(dn->dn_next_indblkshift[i]);
+		ASSERT0(dn->dn_next_bonuslen[i]);
+		ASSERT0(dn->dn_next_bonustype[i]);
+		ASSERT0(dn->dn_rm_spillblk[i]);
+		ASSERT0(dn->dn_next_blksz[i]);
 		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
-		ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
+		ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
 	}
 
 	dn->dn_type = ot;
 	dnode_setdblksz(dn, blocksize);
 	dn->dn_indblkshift = ibs;
 	dn->dn_nlevels = 1;
-	dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+		dn->dn_nblkptr = 1;
+	else
+		dn->dn_nblkptr = 1 +
+		    ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
 	dn->dn_bonustype = bonustype;
 	dn->dn_bonuslen = bonuslen;
 	dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
@@ -407,10 +612,12 @@ dnode_allocate(dnode_t *dn, dmu_object_t
 	}
 
 	dn->dn_allocated_txg = tx->tx_txg;
+	dn->dn_id_flags = 0;
 
 	dnode_setdirty(dn, tx);
 	dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
 	dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
 	dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
 }
 
@@ -421,18 +628,22 @@ dnode_reallocate(dnode_t *dn, dmu_object
 	int nblkptr;
 
 	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
-	ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
-	ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
+	ASSERT3U(blocksize, <=,
+	    spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+	ASSERT0(blocksize % SPA_MINBLOCKSIZE);
 	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
 	ASSERT(tx->tx_txg != 0);
 	ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
-	    (bonustype != DMU_OT_NONE && bonuslen != 0));
-	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
+	    (bonustype != DMU_OT_NONE && bonuslen != 0) ||
+	    (bonustype == DMU_OT_SA && bonuslen == 0));
+	ASSERT(DMU_OT_IS_VALID(bonustype));
 	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
 
 	/* clean up any unreferenced dbufs */
 	dnode_evict_dbufs(dn);
 
+	dn->dn_id_flags = 0;
+
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_setdirty(dn, tx);
 	if (dn->dn_datablksz != blocksize) {
@@ -445,9 +656,19 @@ dnode_reallocate(dnode_t *dn, dmu_object
 	}
 	if (dn->dn_bonuslen != bonuslen)
 		dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
-	nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+
+	if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+		nblkptr = 1;
+	else
+		nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+	if (dn->dn_bonustype != bonustype)
+		dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
 	if (dn->dn_nblkptr != nblkptr)
 		dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
+	if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		dbuf_rm_spill(dn, tx);
+		dnode_rm_spill(dn, tx);
+	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* change type */
@@ -473,9 +694,304 @@ dnode_reallocate(dnode_t *dn, dmu_object
 	mutex_exit(&dn->dn_mtx);
 }
 
+#ifdef	DNODE_STATS
+static struct {
+	uint64_t dms_dnode_invalid;
+	uint64_t dms_dnode_recheck1;
+	uint64_t dms_dnode_recheck2;
+	uint64_t dms_dnode_special;
+	uint64_t dms_dnode_handle;
+	uint64_t dms_dnode_rwlock;
+	uint64_t dms_dnode_active;
+} dnode_move_stats;
+#endif	/* DNODE_STATS */
+
+static void
+dnode_move_impl(dnode_t *odn, dnode_t *ndn)
+{
+	int i;
+
+	ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
+	ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
+	ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
+	ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
+
+	/* Copy fields. */
+	ndn->dn_objset = odn->dn_objset;
+	ndn->dn_object = odn->dn_object;
+	ndn->dn_dbuf = odn->dn_dbuf;
+	ndn->dn_handle = odn->dn_handle;
+	ndn->dn_phys = odn->dn_phys;
+	ndn->dn_type = odn->dn_type;
+	ndn->dn_bonuslen = odn->dn_bonuslen;
+	ndn->dn_bonustype = odn->dn_bonustype;
+	ndn->dn_nblkptr = odn->dn_nblkptr;
+	ndn->dn_checksum = odn->dn_checksum;
+	ndn->dn_compress = odn->dn_compress;
+	ndn->dn_nlevels = odn->dn_nlevels;
+	ndn->dn_indblkshift = odn->dn_indblkshift;
+	ndn->dn_datablkshift = odn->dn_datablkshift;
+	ndn->dn_datablkszsec = odn->dn_datablkszsec;
+	ndn->dn_datablksz = odn->dn_datablksz;
+	ndn->dn_maxblkid = odn->dn_maxblkid;
+	bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
+	    sizeof (odn->dn_next_nblkptr));
+	bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
+	    sizeof (odn->dn_next_nlevels));
+	bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
+	    sizeof (odn->dn_next_indblkshift));
+	bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
+	    sizeof (odn->dn_next_bonustype));
+	bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
+	    sizeof (odn->dn_rm_spillblk));
+	bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
+	    sizeof (odn->dn_next_bonuslen));
+	bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
+	    sizeof (odn->dn_next_blksz));
+	for (i = 0; i < TXG_SIZE; i++) {
+		list_move_tail(&ndn->dn_dirty_records[i],
+		    &odn->dn_dirty_records[i]);
+	}
+	bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+	    sizeof (odn->dn_free_ranges));
+	ndn->dn_allocated_txg = odn->dn_allocated_txg;
+	ndn->dn_free_txg = odn->dn_free_txg;
+	ndn->dn_assigned_txg = odn->dn_assigned_txg;
+	ndn->dn_dirtyctx = odn->dn_dirtyctx;
+	ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
+	ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
+	refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
+	ASSERT(avl_is_empty(&ndn->dn_dbufs));
+	avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
+	ndn->dn_dbufs_count = odn->dn_dbufs_count;
+	ndn->dn_bonus = odn->dn_bonus;
+	ndn->dn_have_spill = odn->dn_have_spill;
+	ndn->dn_zio = odn->dn_zio;
+	ndn->dn_oldused = odn->dn_oldused;
+	ndn->dn_oldflags = odn->dn_oldflags;
+	ndn->dn_olduid = odn->dn_olduid;
+	ndn->dn_oldgid = odn->dn_oldgid;
+	ndn->dn_newuid = odn->dn_newuid;
+	ndn->dn_newgid = odn->dn_newgid;
+	ndn->dn_id_flags = odn->dn_id_flags;
+	dmu_zfetch_init(&ndn->dn_zfetch, NULL);
+	list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
+	ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
+
+	/*
+	 * Update back pointers. Updating the handle fixes the back pointer of
+	 * every descendant dbuf as well as the bonus dbuf.
+	 */
+	ASSERT(ndn->dn_handle->dnh_dnode == odn);
+	ndn->dn_handle->dnh_dnode = ndn;
+	if (ndn->dn_zfetch.zf_dnode == odn) {
+		ndn->dn_zfetch.zf_dnode = ndn;
+	}
+
+	/*
+	 * Invalidate the original dnode by clearing all of its back pointers.
+	 */
+	odn->dn_dbuf = NULL;
+	odn->dn_handle = NULL;
+	avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
+	    offsetof(dmu_buf_impl_t, db_link));
+	odn->dn_dbufs_count = 0;
+	odn->dn_bonus = NULL;
+	odn->dn_zfetch.zf_dnode = NULL;
+
+	/*
+	 * Set the low bit of the objset pointer to ensure that dnode_move()
+	 * recognizes the dnode as invalid in any subsequent callback.
+	 */
+	POINTER_INVALIDATE(&odn->dn_objset);
+
+	/*
+	 * Satisfy the destructor.
+	 */
+	for (i = 0; i < TXG_SIZE; i++) {
+		list_create(&odn->dn_dirty_records[i],
+		    sizeof (dbuf_dirty_record_t),
+		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
+		odn->dn_free_ranges[i] = NULL;
+		odn->dn_next_nlevels[i] = 0;
+		odn->dn_next_indblkshift[i] = 0;
+		odn->dn_next_bonustype[i] = 0;
+		odn->dn_rm_spillblk[i] = 0;
+		odn->dn_next_bonuslen[i] = 0;
+		odn->dn_next_blksz[i] = 0;
+	}
+	odn->dn_allocated_txg = 0;
+	odn->dn_free_txg = 0;
+	odn->dn_assigned_txg = 0;
+	odn->dn_dirtyctx = 0;
+	odn->dn_dirtyctx_firstset = NULL;
+	odn->dn_have_spill = B_FALSE;
+	odn->dn_zio = NULL;
+	odn->dn_oldused = 0;
+	odn->dn_oldflags = 0;
+	odn->dn_olduid = 0;
+	odn->dn_oldgid = 0;
+	odn->dn_newuid = 0;
+	odn->dn_newgid = 0;
+	odn->dn_id_flags = 0;
+
+	/*
+	 * Mark the dnode.
+	 */
+	ndn->dn_moved = 1;
+	odn->dn_moved = (uint8_t)-1;
+}
+
+#ifdef illumos
+#ifdef	_KERNEL
+/*ARGSUSED*/
+static kmem_cbrc_t
+dnode_move(void *buf, void *newbuf, size_t size, void *arg)
+{
+	dnode_t *odn = buf, *ndn = newbuf;
+	objset_t *os;
+	int64_t refcount;
+	uint32_t dbufs;
+
+	/*
+	 * The dnode is on the objset's list of known dnodes if the objset
+	 * pointer is valid. We set the low bit of the objset pointer when
+	 * freeing the dnode to invalidate it, and the memory patterns written
+	 * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
+	 * A newly created dnode sets the objset pointer last of all to indicate
+	 * that the dnode is known and in a valid state to be moved by this
+	 * function.
+	 */
+	os = odn->dn_objset;
+	if (!POINTER_IS_VALID(os)) {
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
+		return (KMEM_CBRC_DONT_KNOW);
+	}
+
+	/*
+	 * Ensure that the objset does not go away during the move.
+	 */
+	rw_enter(&os_lock, RW_WRITER);
+	if (os != odn->dn_objset) {
+		rw_exit(&os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
+		return (KMEM_CBRC_DONT_KNOW);
+	}
+
+	/*
+	 * If the dnode is still valid, then so is the objset. We know that no
+	 * valid objset can be freed while we hold os_lock, so we can safely
+	 * ensure that the objset remains in use.
+	 */
+	mutex_enter(&os->os_lock);
+
+	/*
+	 * Recheck the objset pointer in case the dnode was removed just before
+	 * acquiring the lock.
+	 */
+	if (os != odn->dn_objset) {
+		mutex_exit(&os->os_lock);
+		rw_exit(&os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
+		return (KMEM_CBRC_DONT_KNOW);
+	}
+
+	/*
+	 * At this point we know that as long as we hold os->os_lock, the dnode
+	 * cannot be freed and fields within the dnode can be safely accessed.
+	 * The objset listing this dnode cannot go away as long as this dnode is
+	 * on its list.
+	 */
+	rw_exit(&os_lock);
+	if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
+		mutex_exit(&os->os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
+		return (KMEM_CBRC_NO);
+	}
+	ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
+
+	/*
+	 * Lock the dnode handle to prevent the dnode from obtaining any new
+	 * holds. This also prevents the descendant dbufs and the bonus dbuf
+	 * from accessing the dnode, so that we can discount their holds. The
+	 * handle is safe to access because we know that while the dnode cannot
+	 * go away, neither can its handle. Once we hold dnh_zrlock, we can
+	 * safely move any dnode referenced only by dbufs.
+	 */
+	if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
+		mutex_exit(&os->os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
+		return (KMEM_CBRC_LATER);
+	}
+
+	/*
+	 * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
+	 * We need to guarantee that there is a hold for every dbuf in order to
+	 * determine whether the dnode is actively referenced. Falsely matching
+	 * a dbuf to an active hold would lead to an unsafe move. It's possible
+	 * that a thread already having an active dnode hold is about to add a
+	 * dbuf, and we can't compare hold and dbuf counts while the add is in
+	 * progress.
+	 */
+	if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
+		zrl_exit(&odn->dn_handle->dnh_zrlock);
+		mutex_exit(&os->os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
+		return (KMEM_CBRC_LATER);
+	}
+
+	/*
+	 * A dbuf may be removed (evicted) without an active dnode hold. In that
+	 * case, the dbuf count is decremented under the handle lock before the
+	 * dbuf's hold is released. This order ensures that if we count the hold
+	 * after the dbuf is removed but before its hold is released, we will
+	 * treat the unmatched hold as active and exit safely. If we count the
+	 * hold before the dbuf is removed, the hold is discounted, and the
+	 * removal is blocked until the move completes.
+	 */
+	refcount = refcount_count(&odn->dn_holds);
+	ASSERT(refcount >= 0);
+	dbufs = odn->dn_dbufs_count;
+
+	/* We can't have more dbufs than dnode holds. */
+	ASSERT3U(dbufs, <=, refcount);
+	DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
+	    uint32_t, dbufs);
+
+	if (refcount > dbufs) {
+		rw_exit(&odn->dn_struct_rwlock);
+		zrl_exit(&odn->dn_handle->dnh_zrlock);
+		mutex_exit(&os->os_lock);
+		DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
+		return (KMEM_CBRC_LATER);
+	}
+
+	rw_exit(&odn->dn_struct_rwlock);
+
+	/*
+	 * At this point we know that anyone with a hold on the dnode is not
+	 * actively referencing it. The dnode is known and in a valid state to
+	 * move. We're holding the locks needed to execute the critical section.
+	 */
+	dnode_move_impl(odn, ndn);
+
+	list_link_replace(&odn->dn_link, &ndn->dn_link);
+	/* If the dnode was safe to move, the refcount cannot have changed. */
+	ASSERT(refcount == refcount_count(&ndn->dn_holds));
+	ASSERT(dbufs == ndn->dn_dbufs_count);
+	zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
+	mutex_exit(&os->os_lock);
+
+	return (KMEM_CBRC_YES);
+}
+#endif	/* _KERNEL */
+#endif	/* illumos */
+
 void
-dnode_special_close(dnode_t *dn)
+dnode_special_close(dnode_handle_t *dnh)
 {
+	dnode_t *dn = dnh->dnh_dnode;
+
 	/*
 	 * Wait for final references to the dnode to clear.  This can
 	 * only happen if the arc is asyncronously evicting state that
@@ -483,49 +999,63 @@ dnode_special_close(dnode_t *dn)
 	 * dnode.
 	 */
 	while (refcount_count(&dn->dn_holds) > 0)
-		xdelay(1);
-	dnode_destroy(dn);
+		delay(1);
+	ASSERT(dn->dn_dbuf == NULL ||
+	    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
+	zrl_add(&dnh->dnh_zrlock);
+	dnode_destroy(dn); /* implicit zrl_remove() */
+	zrl_destroy(&dnh->dnh_zrlock);
+	dnh->dnh_dnode = NULL;
 }
 
-dnode_t *
-dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object)
+void
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
+    dnode_handle_t *dnh)
 {
-	dnode_t *dn = dnode_create(os, dnp, NULL, object);
+	dnode_t *dn;
+
+	dn = dnode_create(os, dnp, NULL, object, dnh);
+	zrl_init(&dnh->dnh_zrlock);
 	DNODE_VERIFY(dn);
-	return (dn);
 }
 
 static void
-dnode_buf_pageout(dmu_buf_t *db, void *arg)
+dnode_buf_evict_async(void *dbu)
 {
-	dnode_t **children_dnodes = arg;
+	dnode_children_t *children_dnodes = dbu;
 	int i;
-	int epb = db->db_size >> DNODE_SHIFT;
 
-	for (i = 0; i < epb; i++) {
-		dnode_t *dn = children_dnodes[i];
-		int n;
+	for (i = 0; i < children_dnodes->dnc_count; i++) {
+		dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
+		dnode_t *dn;
 
-		if (dn == NULL)
+		/*
+		 * The dnode handle lock guards against the dnode moving to
+		 * another valid address, so there is no need here to guard
+		 * against changes to or from NULL.
+		 */
+		if (dnh->dnh_dnode == NULL) {
+			zrl_destroy(&dnh->dnh_zrlock);
 			continue;
-#ifdef ZFS_DEBUG
+		}
+
+		zrl_add(&dnh->dnh_zrlock);
+		dn = dnh->dnh_dnode;
 		/*
 		 * If there are holds on this dnode, then there should
 		 * be holds on the dnode's containing dbuf as well; thus
-		 * it wouldn't be eligable for eviction and this function
+		 * it wouldn't be eligible for eviction and this function
 		 * would not have been called.
 		 */
 		ASSERT(refcount_is_zero(&dn->dn_holds));
-		ASSERT(list_head(&dn->dn_dbufs) == NULL);
 		ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 
-		for (n = 0; n < TXG_SIZE; n++)
-			ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
-#endif
-		children_dnodes[i] = NULL;
-		dnode_destroy(dn);
+		dnode_destroy(dn); /* implicit zrl_remove() */
+		zrl_destroy(&dnh->dnh_zrlock);
+		dnh->dnh_dnode = NULL;
 	}
-	kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+	kmem_free(children_dnodes, sizeof (dnode_children_t) +
+	    children_dnodes->dnc_count * sizeof (dnode_handle_t));
 }
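
The eviction loop above also shows the handle-lock discipline this patch introduces: dnh_dnode may only be dereferenced while the handle's zrlock is held, since dnode_move() relocates the dnode under that lock. A sketch of the reader side (hypothetical helper; zrl_add()/zrl_remove() as used here, dn_mtx state checks omitted):

	/*
	 * Illustrative handle access pattern: pin the handle so the
	 * dnode cannot be relocated while we take a hold on it, then
	 * drop the pin; the hold keeps the dnode stable afterwards.
	 */
	static dnode_t *
	dnode_handle_enter(dnode_handle_t *dnh, void *tag)
	{
		dnode_t *dn;

		zrl_add(&dnh->dnh_zrlock);
		dn = dnh->dnh_dnode;
		if (dn != NULL)
			(void) refcount_add(&dn->dn_holds, tag);
		zrl_remove(&dnh->dnh_zrlock);
		return (dn);
	}
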
 
 /*
@@ -544,24 +1074,29 @@ dnode_hold_impl(objset_t *os, uint64_t o
 	uint64_t blk;
 	dnode_t *mdn, *dn;
 	dmu_buf_impl_t *db;
-	dnode_t **children_dnodes;
+	dnode_children_t *children_dnodes;
+	dnode_handle_t *dnh;
 
 	/*
 	 * If you are holding the spa config lock as writer, you shouldn't
-	 * be asking the DMU to do *anything*.
+	 * be asking the DMU to do *anything* unless it's the root pool
+	 * which may require us to read from the root filesystem while
+	 * holding some (not all) of the locks as writer.
 	 */
-	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0);
+	ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
+	    (spa_is_root(os->os_spa) &&
+	    spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
 
 	if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) {
 		dn = (object == DMU_USERUSED_OBJECT) ?
-		    os->os_userused_dnode : os->os_groupused_dnode;
+		    DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
 		if (dn == NULL)
-			return (ENOENT);
+			return (SET_ERROR(ENOENT));
 		type = dn->dn_type;
 		if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
-			return (ENOENT);
+			return (SET_ERROR(ENOENT));
 		if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
-			return (EEXIST);
+			return (SET_ERROR(EEXIST));
 		DNODE_VERIFY(dn);
 		(void) refcount_add(&dn->dn_holds, tag);
 		*dnp = dn;
@@ -569,9 +1104,10 @@ dnode_hold_impl(objset_t *os, uint64_t o
 	}
 
 	if (object == 0 || object >= DN_MAX_OBJECT)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
-	mdn = os->os_meta_dnode;
+	mdn = DMU_META_DNODE(os);
+	ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
 
 	DNODE_VERIFY(mdn);
 
@@ -580,13 +1116,13 @@ dnode_hold_impl(objset_t *os, uint64_t o
 		drop_struct_lock = TRUE;
 	}
 
-	blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+	blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
 
 	db = dbuf_hold(mdn, blk, FTAG);
 	if (drop_struct_lock)
 		rw_exit(&mdn->dn_struct_rwlock);
 	if (db == NULL)
-		return (EIO);
+		return (SET_ERROR(EIO));
 	err = dbuf_read(db, NULL, DB_RF_CANFAIL);
 	if (err) {
 		dbuf_rele(db, FTAG);
@@ -598,28 +1134,41 @@ dnode_hold_impl(objset_t *os, uint64_t o
 
 	idx = object & (epb-1);
 
+	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
 	children_dnodes = dmu_buf_get_user(&db->db);
 	if (children_dnodes == NULL) {
-		dnode_t **winner;
-		children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *),
-		    KM_SLEEP);
-		if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
-		    dnode_buf_pageout)) {
-			kmem_free(children_dnodes, epb * sizeof (dnode_t *));
+		int i;
+		dnode_children_t *winner;
+		children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
+		    epb * sizeof (dnode_handle_t), KM_SLEEP);
+		children_dnodes->dnc_count = epb;
+		dnh = &children_dnodes->dnc_children[0];
+		for (i = 0; i < epb; i++) {
+			zrl_init(&dnh[i].dnh_zrlock);
+		}
+		dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL,
+		    dnode_buf_evict_async, NULL);
+		winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
+		if (winner != NULL) {
+
+			for (i = 0; i < epb; i++) {
+				zrl_destroy(&dnh[i].dnh_zrlock);
+			}
+
+			kmem_free(children_dnodes, sizeof (dnode_children_t) +
+			    epb * sizeof (dnode_handle_t));
 			children_dnodes = winner;
 		}
 	}
+	ASSERT(children_dnodes->dnc_count == epb);
 
-	if ((dn = children_dnodes[idx]) == NULL) {
-		dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx;
-		dnode_t *winner;
+	dnh = &children_dnodes->dnc_children[idx];
+	zrl_add(&dnh->dnh_zrlock);
+	dn = dnh->dnh_dnode;
+	if (dn == NULL) {
+		dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
 
-		dn = dnode_create(os, dnp, db, object);
-		winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn);
-		if (winner != NULL) {
-			dnode_destroy(dn);
-			dn = winner;
-		}
+		dn = dnode_create(os, phys, db, object, dnh);
 	}
 
 	mutex_enter(&dn->dn_mtx);
@@ -627,15 +1176,18 @@ dnode_hold_impl(objset_t *os, uint64_t o
 	if (dn->dn_free_txg ||
 	    ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
 	    ((flag & DNODE_MUST_BE_FREE) &&
-	    (type != DMU_OT_NONE || dn->dn_oldphys))) {
+	    (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
 		mutex_exit(&dn->dn_mtx);
+		zrl_remove(&dnh->dnh_zrlock);
 		dbuf_rele(db, FTAG);
 		return (type == DMU_OT_NONE ? ENOENT : EEXIST);
 	}
+	if (refcount_add(&dn->dn_holds, tag) == 1)
+		dbuf_add_ref(db, dnh);
 	mutex_exit(&dn->dn_mtx);
 
-	if (refcount_add(&dn->dn_holds, tag) == 1)
-		dbuf_add_ref(db, dn);
+	/* Now we can rely on the hold to prevent the dnode from moving. */
+	zrl_remove(&dnh->dnh_zrlock);
 
 	DNODE_VERIFY(dn);
 	ASSERT3P(dn->dn_dbuf, ==, db);
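
The winner/loser dance above is the standard dmu_buf_set_user() idiom: the first publisher wins, everyone else frees their candidate and adopts the winner. The same idiom in isolation, assuming the dmu_buf_init_user()/dmu_buf_set_user() signatures used in this patch (my_user_t and my_evict_async are hypothetical):

	/*
	 * The dmu_buf_user_t must be the first member so the pointer
	 * returned by dmu_buf_set_user() doubles as the containing
	 * structure, exactly as dnode_children_t does with dnc_dbu.
	 */
	typedef struct my_user {
		dmu_buf_user_t	mu_dbu;
		/* ... payload ... */
	} my_user_t;

	static void my_evict_async(void *dbu);

	static my_user_t *
	attach_user(dmu_buf_t *db)
	{
		my_user_t *u = kmem_zalloc(sizeof (*u), KM_SLEEP);
		my_user_t *winner;

		dmu_buf_init_user(&u->mu_dbu, NULL, my_evict_async, NULL);
		winner = dmu_buf_set_user(db, &u->mu_dbu);
		if (winner != NULL) {
			kmem_free(u, sizeof (*u));	/* lost the race */
			u = winner;		/* adopt the published user */
		}
		return (u);
	}
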
@@ -676,14 +1228,44 @@ dnode_add_ref(dnode_t *dn, void *tag)
 void
 dnode_rele(dnode_t *dn, void *tag)
 {
+	mutex_enter(&dn->dn_mtx);
+	dnode_rele_and_unlock(dn, tag);
+}
+
+void
+dnode_rele_and_unlock(dnode_t *dn, void *tag)
+{
 	uint64_t refs;
+	/* Get while the hold prevents the dnode from moving. */
+	dmu_buf_impl_t *db = dn->dn_dbuf;
+	dnode_handle_t *dnh = dn->dn_handle;
 
-	mutex_enter(&dn->dn_mtx);
 	refs = refcount_remove(&dn->dn_holds, tag);
 	mutex_exit(&dn->dn_mtx);
+
+	/*
+	 * It's unsafe to release the last hold on a dnode by dnode_rele() or
+	 * indirectly by dbuf_rele() while relying on the dnode handle to
+	 * prevent the dnode from moving, since releasing the last hold could
+	 * result in the dnode's parent dbuf evicting its dnode handles. For
+	 * that reason anyone calling dnode_rele() or dbuf_rele() without some
+	 * other direct or indirect hold on the dnode must first drop the dnode
+	 * handle.
+	 */
+	ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+
 	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
-	if (refs == 0 && dn->dn_dbuf)
-		dbuf_rele(dn->dn_dbuf, dn);
+	if (refs == 0 && db != NULL) {
+		/*
+		 * Another thread could add a hold to the dnode handle in
+		 * dnode_hold_impl() while holding the parent dbuf. Since the
+		 * hold on the parent dbuf prevents the handle from being
+		 * destroyed, the hold on the handle is OK. We can't yet assert
+		 * that the handle has zero references, but that will be
+		 * asserted anyway when the handle gets destroyed.
+		 */
+		dbuf_rele(db, dnh);
+	}
 }
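
The ordering rule in that comment can be stated as code: a thread still pinning the handle must drop the pin before releasing its final hold, because the final release may evict the parent dbuf and destroy the very handle it is relying on. A hypothetical caller obeying the rule:

	/*
	 * Drop the handle pin first, then the hold; dnode_rele() on
	 * the last hold may tear down the handle via dbuf eviction.
	 */
	static void
	dnode_release_safely(dnode_handle_t *dnh, dnode_t *dn, void *tag)
	{
		zrl_remove(&dnh->dnh_zrlock);
		dnode_rele(dn, tag);
	}
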
 
 void
@@ -702,10 +1284,15 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx
 #ifdef ZFS_DEBUG
 	mutex_enter(&dn->dn_mtx);
 	ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
-	/* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */
+	ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
 	mutex_exit(&dn->dn_mtx);
 #endif
 
+	/*
+	 * Determine old uid/gid when necessary
+	 */
+	dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
+
 	mutex_enter(&os->os_lock);
 
 	/*
@@ -716,10 +1303,12 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx
 		return;
 	}
 
-	ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+	ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+	    !avl_is_empty(&dn->dn_dbufs));
 	ASSERT(dn->dn_datablksz != 0);
-	ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
-	ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
+	ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
+	ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
+	ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
 
 	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
 	    dn->dn_object, txg);
@@ -735,7 +1324,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx
 	/*
 	 * The dnode maintains a hold on its containing dbuf as
 	 * long as there are holds on it.  Each instantiated child
-	 * dbuf maintaines a hold on the dnode.  When the last child
+	 * dbuf maintains a hold on the dnode.  When the last child
 	 * drops its hold, the dnode will drop its hold on the
 	 * containing dbuf. We add a "dirty hold" here so that the
 	 * dnode will hang around after we finish processing its
@@ -788,13 +1377,12 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx)
 int
 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 {
-	dmu_buf_impl_t *db, *db_next;
+	dmu_buf_impl_t *db;
 	int err;
 
+	ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
 	if (size == 0)
 		size = SPA_MINBLOCKSIZE;
-	if (size > SPA_MAXBLOCKSIZE)
-		size = SPA_MAXBLOCKSIZE;
 	else
 		size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
 
@@ -807,14 +1395,14 @@ dnode_set_blksz(dnode_t *dn, uint64_t si
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 
 	/* Check for any allocated blocks beyond the first */
-	if (dn->dn_phys->dn_maxblkid != 0)
+	if (dn->dn_maxblkid != 0)
 		goto fail;
 
 	mutex_enter(&dn->dn_dbufs_mtx);
-	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
-		db_next = list_next(&dn->dn_dbufs, db);
-
-		if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) {
+	for (db = avl_first(&dn->dn_dbufs); db != NULL;
+	    db = AVL_NEXT(&dn->dn_dbufs, db)) {
+		if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
+		    db->db_blkid != DMU_SPILL_BLKID) {
 			mutex_exit(&dn->dn_dbufs_mtx);
 			goto fail;
 		}
@@ -825,7 +1413,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t si
 		goto fail;
 
 	/* resize the old block */
-	err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+	err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
 	if (err == 0)
 		dbuf_new_size(db, size, tx);
 	else if (err != ENOENT)
@@ -847,7 +1435,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t si
 
 fail:
 	rw_exit(&dn->dn_struct_rwlock);
-	return (ENOTSUP);
+	return (SET_ERROR(ENOTSUP));
 }
 
 /* read-holding callers must not rely on the lock being continuously held */
@@ -858,7 +1446,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t bl
 	int epbs, new_nlevels;
 	uint64_t sz;
 
-	ASSERT(blkid != DB_BONUS_BLKID);
+	ASSERT(blkid != DMU_BONUS_BLKID);
 
 	ASSERT(have_read ?
 	    RW_READ_HELD(&dn->dn_struct_rwlock) :
@@ -905,6 +1493,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t bl
 
 		/* dirty the left indirects */
 		db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+		ASSERT(db != NULL);
 		new = dbuf_dirty(db, tx);
 		dbuf_rele(db, FTAG);
 
@@ -915,7 +1504,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t bl
 		for (dr = list_head(list); dr; dr = dr_next) {
 			dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
 			if (dr->dr_dbuf->db_level != new_nlevels-1 &&
-			    dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) {
+			    dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+			    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
 				ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
 				list_remove(&dn->dn_dirty_records[txgoff], dr);
 				list_insert_tail(&new->dt.di.dr_children, dr);
@@ -931,56 +1521,13 @@ out:
 		rw_downgrade(&dn->dn_struct_rwlock);
 }
 
-void
-dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
 {
-	avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-	avl_index_t where;
-	free_range_t *rp;
-	free_range_t rp_tofind;
-	uint64_t endblk = blkid + nblks;
-
-	ASSERT(MUTEX_HELD(&dn->dn_mtx));
-	ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
-
-	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
-	    blkid, nblks, tx->tx_txg);
-	rp_tofind.fr_blkid = blkid;
-	rp = avl_find(tree, &rp_tofind, &where);
-	if (rp == NULL)
-		rp = avl_nearest(tree, where, AVL_BEFORE);
-	if (rp == NULL)
-		rp = avl_nearest(tree, where, AVL_AFTER);
-
-	while (rp && (rp->fr_blkid <= blkid + nblks)) {
-		uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
-		free_range_t *nrp = AVL_NEXT(tree, rp);
-
-		if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
-			/* clear this entire range */
-			avl_remove(tree, rp);
-			kmem_free(rp, sizeof (free_range_t));
-		} else if (blkid <= rp->fr_blkid &&
-		    endblk > rp->fr_blkid && endblk < fr_endblk) {
-			/* clear the beginning of this range */
-			rp->fr_blkid = endblk;
-			rp->fr_nblks = fr_endblk - endblk;
-		} else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
-		    endblk >= fr_endblk) {
-			/* clear the end of this range */
-			rp->fr_nblks = blkid - rp->fr_blkid;
-		} else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
-			/* clear a chunk out of this range */
-			free_range_t *new_rp =
-			    kmem_alloc(sizeof (free_range_t), KM_SLEEP);
-
-			new_rp->fr_blkid = endblk;
-			new_rp->fr_nblks = fr_endblk - endblk;
-			avl_insert_here(tree, new_rp, rp, AVL_AFTER);
-			rp->fr_nblks = blkid - rp->fr_blkid;
-		}
-		/* there may be no overlap */
-		rp = nrp;
+	dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+	if (db != NULL) {
+		dmu_buf_will_dirty(&db->db, tx);
+		dbuf_rele(db, FTAG);
 	}
 }
 
@@ -998,7 +1545,7 @@ dnode_free_range(dnode_t *dn, uint64_t o
 	blkshift = dn->dn_datablkshift;
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
-	if (len == -1ULL) {
+	if (len == DMU_OBJECT_END) {
 		len = UINT64_MAX - off;
 		trunc = TRUE;
 	}
@@ -1014,7 +1561,13 @@ dnode_free_range(dnode_t *dn, uint64_t o
 	} else {
 		ASSERT(dn->dn_maxblkid == 0);
 		if (off == 0 && len >= blksz) {
-			/* Freeing the whole block; fast-track this request */
+			/*
+			 * Freeing the whole block; fast-track this request.
+			 * Note that we won't dirty any indirect blocks,
+			 * which is fine because we will be freeing the entire
+			 * file and thus all indirect blocks will be freed
+			 * by free_children().
+			 */
 			blkid = 0;
 			nblks = 1;
 			goto done;
@@ -1033,15 +1586,15 @@ dnode_free_range(dnode_t *dn, uint64_t o
 		ASSERT3U(blkoff + head, ==, blksz);
 		if (len < head)
 			head = len;
-		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
-		    FTAG, &db) == 0) {
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+		    TRUE, FALSE, FTAG, &db) == 0) {
 			caddr_t data;
 
 			/* don't dirty if it isn't on disk and isn't dirty */
 			if (db->db_last_dirty ||
 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
 				rw_exit(&dn->dn_struct_rwlock);
-				dbuf_will_dirty(db, tx);
+				dmu_buf_will_dirty(&db->db, tx);
 				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 				data = db->db.db_data;
 				bzero(data + blkoff, head);
@@ -1066,18 +1619,18 @@ dnode_free_range(dnode_t *dn, uint64_t o
 	else
 		tail = P2PHASE(len, blksz);
 
-	ASSERT3U(P2PHASE(off, blksz), ==, 0);
+	ASSERT0(P2PHASE(off, blksz));
 	/* zero out any partial block data at the end of the range */
 	if (tail) {
 		if (len < tail)
 			tail = len;
-		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
-		    TRUE, FTAG, &db) == 0) {
+		if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+		    TRUE, FALSE, FTAG, &db) == 0) {
 			/* don't dirty if not on disk and not dirty */
 			if (db->db_last_dirty ||
 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
 				rw_exit(&dn->dn_struct_rwlock);
-				dbuf_will_dirty(db, tx);
+				dmu_buf_will_dirty(&db->db, tx);
 				rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 				bzero(db->db.db_data, tail);
 			}
@@ -1098,87 +1651,116 @@ dnode_free_range(dnode_t *dn, uint64_t o
 		nblks += 1;
 
 	/*
-	 * Read in and mark all the level-1 indirects dirty,
-	 * so that they will stay in memory until syncing phase.
-	 * Always dirty the first and last indirect to make sure
-	 * we dirty all the partial indirects.
+	 * Dirty all the indirect blocks in this range.  Note that only
+	 * the first and last indirect blocks can actually be written
+	 * (if they were partially freed) -- they must be dirtied, even if
+	 * they do not exist on disk yet.  The interior blocks will
+	 * be freed by free_children(), so they will not actually be written.
+	 * Even though these interior blocks will not be written, we
+	 * dirty them for two reasons:
+	 *
+	 *  - It ensures that the indirect blocks remain in memory until
+	 *    syncing context.  (They have already been prefetched by
+	 *    dmu_tx_hold_free(), so we don't have to worry about reading
+	 *    them serially here.)
+	 *
+	 *  - The dirty space accounting will put pressure on the txg sync
+	 *    mechanism to begin syncing, and to delay transactions if there
+	 *    is a large amount of freeing.  Even though these indirect
+	 *    blocks will not be written, we could need to write the same
+	 *    amount of space if we copy the freed BPs into deadlists.
 	 */
 	if (dn->dn_nlevels > 1) {
-		uint64_t i, first, last;
-		int shift = epbs + dn->dn_datablkshift;
+		uint64_t first, last;
 
 		first = blkid >> epbs;
-		if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
-			dbuf_will_dirty(db, tx);
-			dbuf_rele(db, FTAG);
-		}
+		dnode_dirty_l1(dn, first, tx);
 		if (trunc)
 			last = dn->dn_maxblkid >> epbs;
 		else
 			last = (blkid + nblks - 1) >> epbs;
-		if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
-			dbuf_will_dirty(db, tx);
-			dbuf_rele(db, FTAG);
-		}
-		for (i = first + 1; i < last; i++) {
-			uint64_t ibyte = i << shift;
-			int err;
+		if (last != first)
+			dnode_dirty_l1(dn, last, tx);
 
-			err = dnode_next_offset(dn,
-			    DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0);
+		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+		    SPA_BLKPTRSHIFT;
+		for (uint64_t i = first + 1; i < last; i++) {
+			/*
+			 * Set i to the blockid of the next non-hole
+			 * level-1 indirect block at or after i.  Note
+			 * that dnode_next_offset() operates in terms of
+			 * level-0-equivalent bytes.
+			 */
+			uint64_t ibyte = i << shift;
+			int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+			    &ibyte, 2, 1, 0);
 			i = ibyte >> shift;
-			if (err == ESRCH || i >= last)
+			if (i >= last)
 				break;
-			ASSERT(err == 0);
-			db = dbuf_hold_level(dn, 1, i, FTAG);
-			if (db) {
-				dbuf_will_dirty(db, tx);
-				dbuf_rele(db, FTAG);
-			}
+
+			/*
+			 * Normally we should not see an error, either
+			 * from dnode_next_offset() or dbuf_hold_level()
+			 * (except for ESRCH from dnode_next_offset).
+			 * If there is an i/o error, then when we read
+			 * this block in syncing context, it will use
+			 * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+			 * to the "failmode" property.  dnode_next_offset()
+			 * doesn't have a flag to indicate MUSTSUCCEED.
+			 */
+			if (err != 0)
+				break;
+
+			dnode_dirty_l1(dn, i, tx);
 		}
 	}
+
 done:
 	/*
 	 * Add this range to the dnode range list.
 	 * We will finish up this free operation in the syncing phase.
 	 */
 	mutex_enter(&dn->dn_mtx);
-	dnode_clear_range(dn, blkid, nblks, tx);
-	{
-		free_range_t *rp, *found;
-		avl_index_t where;
-		avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-
-		/* Add new range to dn_ranges */
-		rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
-		rp->fr_blkid = blkid;
-		rp->fr_nblks = nblks;
-		found = avl_find(tree, rp, &where);
-		ASSERT(found == NULL);
-		avl_insert(tree, rp, where);
-		dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
-		    blkid, nblks, tx->tx_txg);
+	int txgoff = tx->tx_txg & TXG_MASK;
+	if (dn->dn_free_ranges[txgoff] == NULL) {
+		dn->dn_free_ranges[txgoff] =
+		    range_tree_create(NULL, NULL, &dn->dn_mtx);
 	}
+	range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
+	range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
+	dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+	    blkid, nblks, tx->tx_txg);
 	mutex_exit(&dn->dn_mtx);
 
 	dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
 	dnode_setdirty(dn, tx);
 out:
-	if (trunc && dn->dn_maxblkid >= (off >> blkshift))
-		dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0);
 
 	rw_exit(&dn->dn_struct_rwlock);
 }
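
The shift arithmetic in the level-1 walk above converts between L1 block ids and the level-0-equivalent byte offsets that dnode_next_offset() speaks. As a worked example (helper hypothetical): with 128K data blocks (dn_datablkshift 17) and 128K indirects (dn_indblkshift 17, SPA_BLKPTRSHIFT 7), shift = 17 + 17 - 7 = 27, so each L1 block covers 2^27 bytes (128 MB) and L1 block i begins at byte i << 27.

	/*
	 * Byte offset covered by the first entry of level-1 block
	 * 'l1blkid'; the inverse of the 'i = ibyte >> shift' step
	 * in the loop above.
	 */
	static uint64_t
	l1_blkid_to_offset(dnode_t *dn, uint64_t l1blkid)
	{
		int shift = dn->dn_datablkshift +
		    dn->dn_indblkshift - SPA_BLKPTRSHIFT;

		return (l1blkid << shift);
	}
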
 
+static boolean_t
+dnode_spill_freed(dnode_t *dn)
+{
+	int i;
+
+	mutex_enter(&dn->dn_mtx);
+	for (i = 0; i < TXG_SIZE; i++) {
+		if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
+			break;
+	}
+	mutex_exit(&dn->dn_mtx);
+	return (i < TXG_SIZE);
+}
+
 /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
 uint64_t
 dnode_block_freed(dnode_t *dn, uint64_t blkid)
 {
-	free_range_t range_tofind;
 	void *dp = spa_get_dsl(dn->dn_objset->os_spa);
 	int i;
 
-	if (blkid == DB_BONUS_BLKID)
+	if (blkid == DMU_BONUS_BLKID)
 		return (FALSE);
 
 	/*
@@ -1191,20 +1773,13 @@ dnode_block_freed(dnode_t *dn, uint64_t 
 	if (dn->dn_free_txg)
 		return (TRUE);
 
-	range_tofind.fr_blkid = blkid;
+	if (blkid == DMU_SPILL_BLKID)
+		return (dnode_spill_freed(dn));
+
 	mutex_enter(&dn->dn_mtx);
 	for (i = 0; i < TXG_SIZE; i++) {
-		free_range_t *range_found;
-		avl_index_t idx;
-
-		range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
-		if (range_found) {
-			ASSERT(range_found->fr_nblks > 0);
-			break;
-		}
-		range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
-		if (range_found &&
-		    range_found->fr_blkid + range_found->fr_nblks > blkid)
+		if (dn->dn_free_ranges[i] != NULL &&
+		    range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
 			break;
 	}
 	mutex_exit(&dn->dn_mtx);
@@ -1231,7 +1806,7 @@ dnode_diduse_space(dnode_t *dn, int64_t 
 	space += delta;
 	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
 		ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
-		ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0);
+		ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
 		dn->dn_phys->dn_used = space >> DEV_BSHIFT;
 	} else {
 		dn->dn_phys->dn_used = space;
@@ -1241,34 +1816,35 @@ dnode_diduse_space(dnode_t *dn, int64_t 
 }
 
 /*
- * Call when we think we're going to write/free space in open context.
- * Be conservative (ie. OK to write less than this or free more than
- * this, but don't write more or free less).
+ * Call when we think we're going to write/free space in open context to track
+ * the amount of memory in use by the currently open txg.
  */
 void
 dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
+	int64_t aspace = spa_get_asize(os->os_spa, space);
 
-	if (space > 0)
-		space = spa_get_asize(os->os_spa, space);
-
-	if (ds)
-		dsl_dir_willuse_space(ds->ds_dir, space, tx);
+	if (ds != NULL) {
+		dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
+		dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
+	}
 
-	dmu_tx_willuse_space(tx, space);
+	dmu_tx_willuse_space(tx, aspace);
 }
 
 /*
- * This function scans a block at the indicated "level" looking for
- * a hole or data (depending on 'flags').  If level > 0, then we are
- * scanning an indirect block looking at its pointers.  If level == 0,
- * then we are looking at a block of dnodes.  If we don't find what we
- * are looking for in the block, we return ESRCH.  Otherwise, return
- * with *offset pointing to the beginning (if searching forwards) or
- * end (if searching backwards) of the range covered by the block
- * pointer we matched on (or dnode).
+ * Scans a block at the indicated "level" looking for a hole or data,
+ * depending on 'flags'.
+ *
+ * If level > 0, then we are scanning an indirect block looking at its
+ * pointers.  If level == 0, then we are looking at a block of dnodes.
+ *
+ * If we don't find what we are looking for in the block, we return ESRCH.
+ * Otherwise, return with *offset pointing to the beginning (if searching
+ * forwards) or end (if searching backwards) of the range covered by the
+ * block pointer we matched on (or dnode).
  *
  * The basic search algorithm used below by dnode_next_offset() is to
  * use this function to search up the block tree (widen the search) until
@@ -1278,7 +1854,7 @@ dnode_willuse_space(dnode_t *dn, int64_t
  */
 static int
 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
-	int lvl, uint64_t blkfill, uint64_t txg)
+    int lvl, uint64_t blkfill, uint64_t txg)
 {
 	dmu_buf_impl_t *db = NULL;
 	void *data = NULL;
@@ -1300,8 +1876,8 @@ dnode_next_offset_level(dnode_t *dn, int
 		epb = dn->dn_phys->dn_nblkptr;
 		data = dn->dn_phys->dn_blkptr;
 	} else {
-		uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
-		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+		uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+		error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
 		if (error) {
 			if (error != ENOENT)
 				return (error);
@@ -1314,7 +1890,7 @@ dnode_next_offset_level(dnode_t *dn, int
 			 * at the pointer to this block in its parent, and its
 			 * going to be unallocated, so we will skip over it.
 			 */
-			return (ESRCH);
+			return (SET_ERROR(ESRCH));
 		}
 		error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
 		if (error) {
@@ -1324,13 +1900,15 @@ dnode_next_offset_level(dnode_t *dn, int
 		data = db->db.db_data;
 	}
 
-	if (db && txg &&
-	    (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
+	if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
+	    db->db_blkptr->blk_birth <= txg ||
+	    BP_IS_HOLE(db->db_blkptr))) {
 		/*
 		 * This can only happen when we are searching up the tree
 		 * and these conditions mean that we need to keep climbing.
 		 */
-		error = ESRCH;
+		error = SET_ERROR(ESRCH);
 	} else if (lvl == 0) {
 		dnode_phys_t *dnp = data;
 		span = DNODE_SHIFT;
@@ -1343,7 +1921,7 @@ dnode_next_offset_level(dnode_t *dn, int
 			*offset += (1ULL << span) * inc;
 		}
 		if (i < 0 || i == blkfill)
-			error = ESRCH;
+			error = SET_ERROR(ESRCH);
 	} else {
 		blkptr_t *bp = data;
 		uint64_t start = *offset;
@@ -1359,8 +1937,8 @@ dnode_next_offset_level(dnode_t *dn, int
 		*offset = *offset >> span;
 		for (i = BF64_GET(*offset, 0, epbs);
 		    i >= 0 && i < epb; i += inc) {
-			if (bp[i].blk_fill >= minfill &&
-			    bp[i].blk_fill <= maxfill &&
+			if (BP_GET_FILL(&bp[i]) >= minfill &&
+			    BP_GET_FILL(&bp[i]) <= maxfill &&
 			    (hole || bp[i].blk_birth > txg))
 				break;
 			if (inc > 0 || *offset > 0)
@@ -1375,7 +1953,7 @@ dnode_next_offset_level(dnode_t *dn, int
 			*offset = start;
 		}
 		if (i < 0 || i >= epb)
-			error = ESRCH;
+			error = SET_ERROR(ESRCH);
 	}
 
 	if (db)
@@ -1419,7 +1997,7 @@ dnode_next_offset(dnode_t *dn, int flags
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	if (dn->dn_phys->dn_nlevels == 0) {
-		error = ESRCH;
+		error = SET_ERROR(ESRCH);
 		goto out;
 	}
 
@@ -1428,7 +2006,7 @@ dnode_next_offset(dnode_t *dn, int flags
 			if (flags & DNODE_FIND_HOLE)
 				*offset = dn->dn_datablksz;
 		} else {
-			error = ESRCH;
+			error = SET_ERROR(ESRCH);
 		}
 		goto out;
 	}
@@ -1447,9 +2025,18 @@ dnode_next_offset(dnode_t *dn, int flags
 		    flags, offset, lvl, blkfill, txg);
 	}
 
+	/*
+	 * There's always a "virtual hole" at the end of the object, even
+	 * if all BP's which physically exist are non-holes.
+	 */
+	if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
+	    minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
+		error = 0;
+	}
+
 	if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
 	    initial_offset < *offset : initial_offset > *offset))
-		error = ESRCH;
+		error = SET_ERROR(ESRCH);
 out:
 	if (!(flags & DNODE_FIND_HAVELOCK))
 		rw_exit(&dn->dn_struct_rwlock);
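
The virtual-hole rule added above makes a forward hole search total: even on a fully populated object the search succeeds just past the allocated end. A hedged usage sketch (the wrapper is hypothetical):

	/*
	 * Find the first hole at or after *off. With minlvl == 1,
	 * blkfill == 1, txg == 0 and a forward search, the virtual
	 * hole converts ESRCH into success, so only genuine i/o
	 * errors remain for the caller to handle.
	 */
	static int
	first_hole_after(dnode_t *dn, uint64_t *off)
	{
		return (dnode_next_offset(dn, DNODE_FIND_HOLE, off, 1, 1, 0));
	}
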
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c,v
retrieving revision 1.5
diff -u -p -r1.5 dnode_sync.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c	21 Jun 2013 16:22:46 -0000	1.5
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c	10 Oct 2016 11:09:56 -0000
@@ -18,9 +18,11 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -31,6 +33,8 @@
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/spa.h>
+#include <sys/range_tree.h>
+#include <sys/zfeature.h>
 
 static void
 dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
@@ -56,28 +60,27 @@ dnode_increase_indirection(dnode_t *dn, 
 	dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
 	    dn->dn_object, dn->dn_phys->dn_nlevels);
 
-	/* check for existing blkptrs in the dnode */
-	for (i = 0; i < nblkptr; i++)
-		if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
-			break;
-	if (i != nblkptr) {
-		/* transfer dnode's block pointers to new indirect block */
-		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
-		ASSERT(db->db.db_data);
-		ASSERT(arc_released(db->db_buf));
-		ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
-		bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
-		    sizeof (blkptr_t) * nblkptr);
-		arc_buf_freeze(db->db_buf);
-	}
+	/* transfer dnode's block pointers to new indirect block */
+	(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+	ASSERT(db->db.db_data);
+	ASSERT(arc_released(db->db_buf));
+	ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
+	bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+	    sizeof (blkptr_t) * nblkptr);
+	arc_buf_freeze(db->db_buf);
 
 	/* set dbuf's parent pointers to new indirect buf */
 	for (i = 0; i < nblkptr; i++) {
-		dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
+		dmu_buf_impl_t *child =
+		    dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
 
 		if (child == NULL)
 			continue;
-		ASSERT3P(child->db_dnode, ==, dn);
+#ifdef	DEBUG
+		DB_DNODE_ENTER(child);
+		ASSERT3P(DB_DNODE(child), ==, dn);
+		DB_DNODE_EXIT(child);
+#endif	/* DEBUG */
 		if (child->db_parent && child->db_parent != dn->dn_dbuf) {
 			ASSERT(child->db_parent->db_level == db->db_level);
 			ASSERT(child->db_blkptr !=
@@ -107,26 +110,44 @@ dnode_increase_indirection(dnode_t *dn, 
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
-static int
+static void
 free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 	uint64_t bytesfreed = 0;
-	int i, blocks_freed = 0;
 
 	dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
 
-	for (i = 0; i < num; i++, bp++) {
+	for (int i = 0; i < num; i++, bp++) {
 		if (BP_IS_HOLE(bp))
 			continue;
 
 		bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
 		ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
+
+		/*
+		 * Save some useful information on the holes being
+		 * punched, including logical size, type, and indirection
+		 * level. Retaining birth time enables detection of when
+		 * holes are punched for reducing the number of free
+		 * records transmitted during a zfs send.
+		 */
+
+		uint64_t lsize = BP_GET_LSIZE(bp);
+		dmu_object_type_t type = BP_GET_TYPE(bp);
+		uint64_t lvl = BP_GET_LEVEL(bp);
+
 		bzero(bp, sizeof (blkptr_t));
-		blocks_freed += 1;
+
+		if (spa_feature_is_active(dn->dn_objset->os_spa,
+		    SPA_FEATURE_HOLE_BIRTH)) {
+			BP_SET_LSIZE(bp, lsize);
+			BP_SET_TYPE(bp, type);
+			BP_SET_LEVEL(bp, lvl);
+			BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
+		}
 	}
 	dnode_diduse_space(dn, -bytesfreed);
-	return (blocks_freed);
 }
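
The point of preserving size, type, level, and a fresh birth txg on the zeroed block pointer is that a punched hole stays distinguishable from a never-written one. A sketch of the consuming side under the hole_birth feature (helper hypothetical; pools that predate the feature still carry birth-0 holes):

	/*
	 * Incremental-send view of hole birth: a hole born after the
	 * base snapshot's txg was punched since that snapshot and
	 * needs a FREE record; a birth-0 hole predates the snapshot
	 * (or the feature) and can be skipped.
	 */
	static boolean_t
	hole_needs_free_record(const blkptr_t *bp, uint64_t fromtxg)
	{
		ASSERT(BP_IS_HOLE(bp));
		return (bp->blk_birth > fromtxg);
	}
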
 
 #ifdef ZFS_DEBUG
@@ -136,15 +157,18 @@ free_verify(dmu_buf_impl_t *db, uint64_t
 	int off, num;
 	int i, err, epbs;
 	uint64_t txg = tx->tx_txg;
+	dnode_t *dn;
 
-	epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	off = start - (db->db_blkid * 1<<epbs);
 	num = end - start + 1;
 
 	ASSERT3U(off, >=, 0);
 	ASSERT3U(num, >=, 0);
 	ASSERT3U(db->db_level, >, 0);
-	ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift);
+	ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
 	ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
 	ASSERT(db->db_blkptr != NULL);
 
@@ -156,10 +180,10 @@ free_verify(dmu_buf_impl_t *db, uint64_t
 
 		ASSERT(db->db_level == 1);
 
-		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
-		err = dbuf_hold_impl(db->db_dnode, db->db_level-1,
-		    (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
-		rw_exit(&db->db_dnode->dn_struct_rwlock);
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		err = dbuf_hold_impl(dn, db->db_level-1,
+		    (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
+		rw_exit(&dn->dn_struct_rwlock);
 		if (err == ENOENT)
 			continue;
 		ASSERT(err == 0);
@@ -201,43 +225,40 @@ free_verify(dmu_buf_impl_t *db, uint64_t
 
 		dbuf_rele(child, FTAG);
 	}
+	DB_DNODE_EXIT(db);
 }
 #endif
 
-#define	ALL -1
-
-static int
-free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
+static void
+free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
     dmu_tx_t *tx)
 {
-	dnode_t *dn = db->db_dnode;
+	dnode_t *dn;
 	blkptr_t *bp;
 	dmu_buf_impl_t *subdb;
 	uint64_t start, end, dbstart, dbend, i;
-	int epbs, shift, err;
-	int all = TRUE;
-	int blocks_freed = 0;
+	int epbs, shift;
 
 	/*
 	 * There is a small possibility that this block will not be cached:
 	 *   1 - if level > 1 and there are no children with level <= 1
-	 *   2 - if we didn't get a dirty hold (because this block had just
-	 *	 finished being written -- and so had no holds), and then this
-	 *	 block got evicted before we got here.
+	 *   2 - if this block was evicted since we read it from
+	 *	 dmu_tx_hold_free().
 	 */
 	if (db->db_state != DB_CACHED)
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 
-	arc_release(db->db_buf, db);
-	bp = (blkptr_t *)db->db.db_data;
+	dbuf_release_bp(db);
+	bp = db->db.db_data;
 
-	epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	shift = (db->db_level - 1) * epbs;
 	dbstart = db->db_blkid << epbs;
 	start = blkid >> shift;
 	if (dbstart < start) {
 		bp += start - dbstart;
-		all = FALSE;
 	} else {
 		start = dbstart;
 	}
@@ -245,68 +266,68 @@ free_children(dmu_buf_impl_t *db, uint64
 	end = (blkid + nblks - 1) >> shift;
 	if (dbend <= end)
 		end = dbend;
-	else if (all)
-		all = trunc;
+
 	ASSERT3U(start, <=, end);
 
 	if (db->db_level == 1) {
 		FREE_VERIFY(db, start, end, tx);
-		blocks_freed = free_blocks(dn, bp, end-start+1, tx);
-		arc_buf_freeze(db->db_buf);
-		ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
-		return (all ? ALL : blocks_freed);
+		free_blocks(dn, bp, end-start+1, tx);
+	} else {
+		for (i = start; i <= end; i++, bp++) {
+			if (BP_IS_HOLE(bp))
+				continue;
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+			VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
+			    i, TRUE, FALSE, FTAG, &subdb));
+			rw_exit(&dn->dn_struct_rwlock);
+			ASSERT3P(bp, ==, subdb->db_blkptr);
+
+			free_children(subdb, blkid, nblks, tx);
+			dbuf_rele(subdb, FTAG);
+		}
 	}
 
-	for (i = start; i <= end; i++, bp++) {
-		if (BP_IS_HOLE(bp))
-			continue;
-		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
-		ASSERT3U(err, ==, 0);
-		rw_exit(&dn->dn_struct_rwlock);
-
-		if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) {
-			ASSERT3P(subdb->db_blkptr, ==, bp);
-			blocks_freed += free_blocks(dn, bp, 1, tx);
-		} else {
-			all = FALSE;
-		}
-		dbuf_rele(subdb, FTAG);
+	/* If this whole block is free, free ourself too. */
+	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
+		if (!BP_IS_HOLE(bp))
+			break;
 	}
-	arc_buf_freeze(db->db_buf);
-#ifdef ZFS_DEBUG
-	bp -= (end-start)+1;
-	for (i = start; i <= end; i++, bp++) {
-		if (i == start && blkid != 0)
-			continue;
-		else if (i == end && !trunc)
-			continue;
-		ASSERT3U(bp->blk_birth, ==, 0);
+	if (i == 1 << epbs) {
+		/* didn't find any non-holes */
+		bzero(db->db.db_data, db->db.db_size);
+		free_blocks(dn, db->db_blkptr, 1, tx);
+	} else {
+		/*
+		 * Partial block free; must be marked dirty so that it
+		 * will be written out.
+		 */
+		ASSERT(db->db_dirtycnt > 0);
 	}
-#endif
-	ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
-	return (all ? ALL : blocks_freed);
+
+	DB_DNODE_EXIT(db);
+	arc_buf_freeze(db->db_buf);
 }
 
 /*
- * free_range: Traverse the indicated range of the provided file
+ * Traverse the indicated range of the provided file
  * and "free" all the blocks contained there.
  */
 static void
-dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
+    dmu_tx_t *tx)
 {
 	blkptr_t *bp = dn->dn_phys->dn_blkptr;
-	dmu_buf_impl_t *db;
-	int trunc, start, end, shift, i, err;
 	int dnlevel = dn->dn_phys->dn_nlevels;
+	boolean_t trunc = B_FALSE;
 
 	if (blkid > dn->dn_phys->dn_maxblkid)
 		return;
 
 	ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
-	trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
-	if (trunc)
+	if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
 		nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
+		trunc = B_TRUE;
+	}
 
 	/* There are no indirect blocks in the object */
 	if (dnlevel == 1) {
@@ -315,102 +336,110 @@ dnode_sync_free_range(dnode_t *dn, uint6
 			return;
 		}
 		ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
-		(void) free_blocks(dn, bp + blkid, nblks, tx);
-		if (trunc) {
-			uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
-			    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
-			dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
-			ASSERT(off < dn->dn_phys->dn_maxblkid ||
-			    dn->dn_phys->dn_maxblkid == 0 ||
-			    dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
-		}
-		return;
-	}
-
-	shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
-	start = blkid >> shift;
-	ASSERT(start < dn->dn_phys->dn_nblkptr);
-	end = (blkid + nblks - 1) >> shift;
-	bp += start;
-	for (i = start; i <= end; i++, bp++) {
-		if (BP_IS_HOLE(bp))
-			continue;
-		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
-		ASSERT3U(err, ==, 0);
-		rw_exit(&dn->dn_struct_rwlock);
+		free_blocks(dn, bp + blkid, nblks, tx);
+	} else {
+		int shift = (dnlevel - 1) *
+		    (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
+		int start = blkid >> shift;
+		int end = (blkid + nblks - 1) >> shift;
+		dmu_buf_impl_t *db;
+
+		ASSERT(start < dn->dn_phys->dn_nblkptr);
+		bp += start;
+		for (int i = start; i <= end; i++, bp++) {
+			if (BP_IS_HOLE(bp))
+				continue;
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+			VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
+			    TRUE, FALSE, FTAG, &db));
+			rw_exit(&dn->dn_struct_rwlock);
 
-		if (free_children(db, blkid, nblks, trunc, tx) == ALL) {
-			ASSERT3P(db->db_blkptr, ==, bp);
-			(void) free_blocks(dn, bp, 1, tx);
+			free_children(db, blkid, nblks, tx);
+			dbuf_rele(db, FTAG);
 		}
-		dbuf_rele(db, FTAG);
 	}
+
 	if (trunc) {
+		dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
+
 		uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
 		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
-		dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
 		ASSERT(off < dn->dn_phys->dn_maxblkid ||
 		    dn->dn_phys->dn_maxblkid == 0 ||
 		    dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
 	}
 }
 
+typedef struct dnode_sync_free_range_arg {
+	dnode_t *dsfra_dnode;
+	dmu_tx_t *dsfra_tx;
+} dnode_sync_free_range_arg_t;
+
+static void
+dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
+{
+	dnode_sync_free_range_arg_t *dsfra = arg;
+	dnode_t *dn = dsfra->dsfra_dnode;
+
+	mutex_exit(&dn->dn_mtx);
+	dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
+	mutex_enter(&dn->dn_mtx);
+}
+
 /*
- * Try to kick all the dnodes dbufs out of the cache...
+ * Try to kick all the dnode's dbufs out of the cache...
  */
 void
 dnode_evict_dbufs(dnode_t *dn)
 {
-	int progress;
-	int pass = 0;
+	dmu_buf_impl_t db_marker;
+	dmu_buf_impl_t *db, *db_next;
 
-	do {
-		dmu_buf_impl_t *db, marker;
-		int evicting = FALSE;
-
-		progress = FALSE;
-		mutex_enter(&dn->dn_dbufs_mtx);
-		list_insert_tail(&dn->dn_dbufs, &marker);
-		db = list_head(&dn->dn_dbufs);
-		for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
-			list_remove(&dn->dn_dbufs, db);
-			list_insert_tail(&dn->dn_dbufs, db);
-			ASSERT3P(db->db_dnode, ==, dn);
-
-			mutex_enter(&db->db_mtx);
-			if (db->db_state == DB_EVICTING) {
-				progress = TRUE;
-				evicting = TRUE;
-				mutex_exit(&db->db_mtx);
-			} else if (refcount_is_zero(&db->db_holds)) {
-				progress = TRUE;
-				dbuf_clear(db); /* exits db_mtx for us */
-			} else {
-				mutex_exit(&db->db_mtx);
-			}
+	mutex_enter(&dn->dn_dbufs_mtx);
+	for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
+
+#ifdef	DEBUG
+		DB_DNODE_ENTER(db);
+		ASSERT3P(DB_DNODE(db), ==, dn);
+		DB_DNODE_EXIT(db);
+#endif	/* DEBUG */
+
+		mutex_enter(&db->db_mtx);
+		if (db->db_state != DB_EVICTING &&
+		    refcount_is_zero(&db->db_holds)) {
+			db_marker.db_level = db->db_level;
+			db_marker.db_blkid = db->db_blkid;
+			db_marker.db_state = DB_SEARCH;
+			avl_insert_here(&dn->dn_dbufs, &db_marker, db,
+			    AVL_BEFORE);
+
+			dbuf_destroy(db);
 
+			db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
+			avl_remove(&dn->dn_dbufs, &db_marker);
+		} else {
+			db->db_pending_evict = TRUE;
+			mutex_exit(&db->db_mtx);
+			db_next = AVL_NEXT(&dn->dn_dbufs, db);
 		}
-		list_remove(&dn->dn_dbufs, &marker);
-		/*
-		 * NB: we need to drop dn_dbufs_mtx between passes so
-		 * that any DB_EVICTING dbufs can make progress.
-		 * Ideally, we would have some cv we could wait on, but
-		 * since we don't, just wait a bit to give the other
-		 * thread a chance to run.
-		 */
-		mutex_exit(&dn->dn_dbufs_mtx);
-		if (evicting)
-			xdelay(1);
-		pass++;
-		ASSERT(pass < 100); /* sanity check */
-	} while (progress);
+	}
+	mutex_exit(&dn->dn_dbufs_mtx);
 
+	dnode_evict_bonus(dn);
+}
+
+void
+dnode_evict_bonus(dnode_t *dn)
+{
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-	if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
-		mutex_enter(&dn->dn_bonus->db_mtx);
-		dbuf_evict(dn->dn_bonus);
-		dn->dn_bonus = NULL;
+	if (dn->dn_bonus != NULL) {
+		if (refcount_is_zero(&dn->dn_bonus->db_holds)) {
+			mutex_enter(&dn->dn_bonus->db_mtx);
+			dbuf_destroy(dn->dn_bonus);
+			dn->dn_bonus = NULL;
+		} else {
+			dn->dn_bonus->db_pending_evict = TRUE;
+		}
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 }
@@ -434,9 +463,12 @@ dnode_undirty_dbufs(list_t *list)
 		db->db_last_dirty = NULL;
 		db->db_dirtycnt -= 1;
 		if (db->db_level == 0) {
-			ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+			ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
 			    dr->dt.dl.dr_data == db->db_buf);
 			dbuf_unoverride(dr);
+		} else {
+			mutex_destroy(&dr->dt.di.dr_mtx);
+			list_destroy(&dr->dt.di.dr_children);
 		}
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
 		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
@@ -454,12 +486,11 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *t
 	 * Our contents should have been freed in dnode_sync() by the
 	 * free range record inserted by the caller of dnode_free().
 	 */
-	ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0);
+	ASSERT0(DN_USED_BYTES(dn->dn_phys));
 	ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
 
 	dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
 	dnode_evict_dbufs(dn);
-	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
 
 	/*
 	 * XXX - It would be nice to assert this, but we may still
@@ -482,7 +513,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *t
 
 	ASSERT(dn->dn_free_txg > 0);
 	if (dn->dn_allocated_txg != dn->dn_free_txg)
-		dbuf_will_dirty(dn->dn_dbuf, tx);
+		dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
 	bzero(dn->dn_phys, sizeof (dnode_phys_t));
 
 	mutex_enter(&dn->dn_mtx);
@@ -490,6 +521,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *t
 	dn->dn_maxblkid = 0;
 	dn->dn_allocated_txg = 0;
 	dn->dn_free_txg = 0;
+	dn->dn_have_spill = B_FALSE;
 	mutex_exit(&dn->dn_mtx);
 
 	ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
@@ -507,11 +539,11 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *t
 void
 dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 {
-	free_range_t *rp;
 	dnode_phys_t *dnp = dn->dn_phys;
 	int txgoff = tx->tx_txg & TXG_MASK;
 	list_t *list = &dn->dn_dirty_records[txgoff];
 	static const dnode_phys_t zerodn = { 0 };
+	boolean_t kill_spill = B_FALSE;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
@@ -523,10 +555,12 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 
 	if (dmu_objset_userused_enabled(dn->dn_objset) &&
 	    !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
-		ASSERT(dn->dn_oldphys == NULL);
-		dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t));
-		*dn->dn_oldphys = *dn->dn_phys; /* struct assignment */
+		mutex_enter(&dn->dn_mtx);
+		dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
+		dn->dn_oldflags = dn->dn_phys->dn_flags;
 		dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
+		mutex_exit(&dn->dn_mtx);
+		dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
 	} else {
 		/* Once we account for it, we should always account for it. */
 		ASSERT(!(dn->dn_phys->dn_flags &
@@ -546,25 +580,34 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 		dnp->dn_bonustype = dn->dn_bonustype;
 		dnp->dn_bonuslen = dn->dn_bonuslen;
 	}
-
 	ASSERT(dnp->dn_nlevels > 1 ||
 	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+	    BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
 	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
 	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+	ASSERT(dnp->dn_nlevels < 2 ||
+	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
 
-	if (dn->dn_next_blksz[txgoff]) {
+	if (dn->dn_next_type[txgoff] != 0) {
+		dnp->dn_type = dn->dn_type;
+		dn->dn_next_type[txgoff] = 0;
+	}
+
+	if (dn->dn_next_blksz[txgoff] != 0) {
 		ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
 		    SPA_MINBLOCKSIZE) == 0);
 		ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
 		    dn->dn_maxblkid == 0 || list_head(list) != NULL ||
 		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
-		    dnp->dn_datablkszsec);
+		    dnp->dn_datablkszsec ||
+		    range_tree_space(dn->dn_free_ranges[txgoff]) != 0);
 		dnp->dn_datablkszsec =
 		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
 		dn->dn_next_blksz[txgoff] = 0;
 	}
 
-	if (dn->dn_next_bonuslen[txgoff]) {
+	if (dn->dn_next_bonuslen[txgoff] != 0) {
 		if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
 			dnp->dn_bonuslen = 0;
 		else
@@ -573,7 +616,26 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 		dn->dn_next_bonuslen[txgoff] = 0;
 	}
 
-	if (dn->dn_next_indblkshift[txgoff]) {
+	if (dn->dn_next_bonustype[txgoff] != 0) {
+		ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
+		dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
+		dn->dn_next_bonustype[txgoff] = 0;
+	}
+
+	boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
+	    dn->dn_free_txg <= tx->tx_txg;
+
+	/*
+	 * Remove the spill block if we have been explicitly asked to
+	 * remove it, or if the object is being removed.
+	 */
+	if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
+		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+			kill_spill = B_TRUE;
+		dn->dn_rm_spillblk[txgoff] = 0;
+	}
+
+	if (dn->dn_next_indblkshift[txgoff] != 0) {
 		ASSERT(dnp->dn_nlevels == 1);
 		dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
 		dn->dn_next_indblkshift[txgoff] = 0;
@@ -589,21 +651,36 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 
 	mutex_exit(&dn->dn_mtx);
 
+	if (kill_spill) {
+		free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
+		mutex_enter(&dn->dn_mtx);
+		dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
+		mutex_exit(&dn->dn_mtx);
+	}
+
 	/* process all the "freed" ranges in the file */
-	while (rp = avl_last(&dn->dn_ranges[txgoff])) {
-		dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
-		/* grab the mutex so we don't race with dnode_block_freed() */
+	if (dn->dn_free_ranges[txgoff] != NULL) {
+		dnode_sync_free_range_arg_t dsfra;
+		dsfra.dsfra_dnode = dn;
+		dsfra.dsfra_tx = tx;
 		mutex_enter(&dn->dn_mtx);
-		avl_remove(&dn->dn_ranges[txgoff], rp);
+		range_tree_vacate(dn->dn_free_ranges[txgoff],
+		    dnode_sync_free_range, &dsfra);
+		range_tree_destroy(dn->dn_free_ranges[txgoff]);
+		dn->dn_free_ranges[txgoff] = NULL;
 		mutex_exit(&dn->dn_mtx);
-		kmem_free(rp, sizeof (free_range_t));
 	}
 
-	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
+	if (freeing_dnode) {
 		dnode_sync_free(dn, tx);
 		return;
 	}
 
+	if (dn->dn_next_nlevels[txgoff]) {
+		dnode_increase_indirection(dn, tx);
+		dn->dn_next_nlevels[txgoff] = 0;
+	}
+
 	if (dn->dn_next_nblkptr[txgoff]) {
 		/* this should only happen on a realloc */
 		ASSERT(dn->dn_allocated_txg == tx->tx_txg);
@@ -628,12 +705,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 		mutex_exit(&dn->dn_mtx);
 	}
 
-	if (dn->dn_next_nlevels[txgoff]) {
-		dnode_increase_indirection(dn, tx);
-		dn->dn_next_nlevels[txgoff] = 0;
-	}
-
-	dbuf_sync_list(list, tx);
+	dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
 
 	if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
 		ASSERT3P(list_head(list), ==, NULL);
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_bookmark.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_bookmark.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_bookmark.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_bookmark.c	10 Oct 2016 11:09:56 -0000
@@ -0,0 +1,457 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/spa.h>
+#include <sys/dsl_bookmark.h>
+#include <zfs_namecheck.h>
+
+static int
+dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname,
+    dsl_dataset_t **dsp, void *tag, char **shortnamep)
+{
+	char buf[ZFS_MAX_DATASET_NAME_LEN];
+	char *hashp;
+
+	if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN)
+		return (SET_ERROR(ENAMETOOLONG));
+	hashp = strchr(fullname, '#');
+	if (hashp == NULL)
+		return (SET_ERROR(EINVAL));
+
+	*shortnamep = hashp + 1;
+	if (zfs_component_namecheck(*shortnamep, NULL, NULL))
+		return (SET_ERROR(EINVAL));
+	(void) strlcpy(buf, fullname, hashp - fullname + 1);
+	return (dsl_dataset_hold(dp, buf, tag, dsp));
+}
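
dsl_bookmark_hold_ds() splits a full bookmark name at '#': everything before it names the dataset, everything after it is the per-dataset short name, so "tank/home#pre-upgrade" holds "tank/home" with shortname "pre-upgrade". The same parsing in isolation, as a sketch (helper hypothetical; strchr()/strlcpy() from the usual zfs_context environment):

	/*
	 * Split "pool/fs#mark" into its dataset and bookmark parts.
	 * strlcpy with length hashp - fullname + 1 copies exactly
	 * the dataset component and NUL-terminates it.
	 */
	static int
	split_bookmark_name(const char *fullname, char *dsbuf,
	    size_t dslen, const char **shortnamep)
	{
		const char *hashp = strchr(fullname, '#');

		if (hashp == NULL || (size_t)(hashp - fullname) >= dslen)
			return (-1);
		(void) strlcpy(dsbuf, fullname, hashp - fullname + 1);
		*shortnamep = hashp + 1;
		return (0);
	}
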
+
+/*
+ * Returns ESRCH if bookmark is not found.
+ */
+static int
+dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname,
+    zfs_bookmark_phys_t *bmark_phys)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	uint64_t bmark_zapobj = ds->ds_bookmarks;
+	matchtype_t mt;
+	int err;
+
+	if (bmark_zapobj == 0)
+		return (SET_ERROR(ESRCH));
+
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+		mt = MT_FIRST;
+	else
+		mt = MT_EXACT;
+
+	err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t),
+	    sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt,
+	    NULL, 0, NULL);
+
+	return (err == ENOENT ? ESRCH : err);
+}
+
+/*
+ * If later_ds is non-NULL, this will return EXDEV if the specified bookmark
+ * does not represent an earlier point in later_ds's timeline.
+ *
+ * Returns ENOENT if the dataset containing the bookmark does not exist.
+ * Returns ESRCH if the dataset exists but the bookmark was not found in it.
+ */
+int
+dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname,
+    dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp)
+{
+	char *shortname;
+	dsl_dataset_t *ds;
+	int error;
+
+	error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname);
+	if (error != 0)
+		return (error);
+
+	error = dsl_dataset_bmark_lookup(ds, shortname, bmp);
+	if (error == 0 && later_ds != NULL) {
+		if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg))
+			error = SET_ERROR(EXDEV);
+	}
+	dsl_dataset_rele(ds, FTAG);
+	return (error);
+}
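
The error contract above separates three failures a caller can act on: ENOENT (the containing dataset is missing), ESRCH (the dataset exists but the bookmark does not), and EXDEV (the bookmark is not in later_ds's past). A hypothetical caller using the later_ds check to validate an incremental source:

	/*
	 * True iff 'fullname' names an existing bookmark that lies
	 * in to_ds's timeline, i.e. one usable as an incremental
	 * "from" for a send to to_ds.
	 */
	static boolean_t
	bookmark_is_valid_incr_source(dsl_pool_t *dp,
	    const char *fullname, dsl_dataset_t *to_ds)
	{
		zfs_bookmark_phys_t bm;

		return (dsl_bookmark_lookup(dp, fullname, to_ds, &bm) == 0);
	}
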
+
+typedef struct dsl_bookmark_create_arg {
+	nvlist_t *dbca_bmarks;
+	nvlist_t *dbca_errors;
+} dsl_bookmark_create_arg_t;
+
+static int
+dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name,
+    dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *bmark_fs;
+	char *shortname;
+	int error;
+	zfs_bookmark_phys_t bmark_phys;
+
+	if (!snapds->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
+
+	error = dsl_bookmark_hold_ds(dp, bookmark_name,
+	    &bmark_fs, FTAG, &shortname);
+	if (error != 0)
+		return (error);
+
+	if (!dsl_dataset_is_before(bmark_fs, snapds, 0)) {
+		dsl_dataset_rele(bmark_fs, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	error = dsl_dataset_bmark_lookup(bmark_fs, shortname,
+	    &bmark_phys);
+	dsl_dataset_rele(bmark_fs, FTAG);
+	if (error == 0)
+		return (SET_ERROR(EEXIST));
+	if (error == ESRCH)
+		return (0);
+	return (error);
+}
+
+static int
+dsl_bookmark_create_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_bookmark_create_arg_t *dbca = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	int rv = 0;
+
+	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
+		return (SET_ERROR(ENOTSUP));
+
+	for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
+		dsl_dataset_t *snapds;
+		int error;
+
+		/* note: validity of nvlist checked by ioctl layer */
+		error = dsl_dataset_hold(dp, fnvpair_value_string(pair),
+		    FTAG, &snapds);
+		if (error == 0) {
+			error = dsl_bookmark_create_check_impl(snapds,
+			    nvpair_name(pair), tx);
+			dsl_dataset_rele(snapds, FTAG);
+		}
+		if (error != 0) {
+			fnvlist_add_int32(dbca->dbca_errors,
+			    nvpair_name(pair), error);
+			rv = error;
+		}
+	}
+
+	return (rv);
+}
+
+static void
+dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_bookmark_create_arg_t *dbca = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
+
+	ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS));
+
+	for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
+		dsl_dataset_t *snapds, *bmark_fs;
+		zfs_bookmark_phys_t bmark_phys;
+		char *shortname;
+
+		VERIFY0(dsl_dataset_hold(dp, fnvpair_value_string(pair),
+		    FTAG, &snapds));
+		VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair),
+		    &bmark_fs, FTAG, &shortname));
+		if (bmark_fs->ds_bookmarks == 0) {
+			bmark_fs->ds_bookmarks =
+			    zap_create_norm(mos, U8_TEXTPREP_TOUPPER,
+			    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+			spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+
+			dsl_dataset_zapify(bmark_fs, tx);
+			VERIFY0(zap_add(mos, bmark_fs->ds_object,
+			    DS_FIELD_BOOKMARK_NAMES,
+			    sizeof (bmark_fs->ds_bookmarks), 1,
+			    &bmark_fs->ds_bookmarks, tx));
+		}
+
+		bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid;
+		bmark_phys.zbm_creation_txg =
+		    dsl_dataset_phys(snapds)->ds_creation_txg;
+		bmark_phys.zbm_creation_time =
+		    dsl_dataset_phys(snapds)->ds_creation_time;
+
+		VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks,
+		    shortname, sizeof (uint64_t),
+		    sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
+		    &bmark_phys, tx));
+
+		spa_history_log_internal_ds(bmark_fs, "bookmark", tx,
+		    "name=%s creation_txg=%llu target_snap=%llu",
+		    shortname,
+		    (longlong_t)bmark_phys.zbm_creation_txg,
+		    (longlong_t)snapds->ds_object);
+
+		dsl_dataset_rele(bmark_fs, FTAG);
+		dsl_dataset_rele(snapds, FTAG);
+	}
+}
+
+/*
+ * The bookmarks must all be in the same pool.
+ */
+int
+dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors)
+{
+	nvpair_t *pair;
+	dsl_bookmark_create_arg_t dbca;
+
+	pair = nvlist_next_nvpair(bmarks, NULL);
+	if (pair == NULL)
+		return (0);
+
+	dbca.dbca_bmarks = bmarks;
+	dbca.dbca_errors = errors;
+
+	return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check,
+	    dsl_bookmark_create_sync, &dbca,
+	    fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL));
+}
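
dsl_bookmark_create() takes its work as an nvlist mapping each new bookmark name to the snapshot it should point at, with per-name errors reported through the second nvlist. A minimal hypothetical driver for a single bookmark:

	/*
	 * Create a bookmark of 'snapshot' named 'bookmark' (both in
	 * the same pool). Per-bookmark errors land in 'errors'; the
	 * return value is the last failure, or 0 on success.
	 */
	static int
	create_one_bookmark(const char *bookmark, const char *snapshot)
	{
		nvlist_t *bmarks = fnvlist_alloc();
		nvlist_t *errors = fnvlist_alloc();
		int err;

		fnvlist_add_string(bmarks, bookmark, snapshot);
		err = dsl_bookmark_create(bmarks, errors);

		fnvlist_free(errors);
		fnvlist_free(bmarks);
		return (err);
	}

For example, create_one_bookmark("tank/home#pre-upgrade", "tank/home@2016-10-10") records that snapshot's guid and creation txg without keeping its data pinned the way a clone would.
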
+
+int
+dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl)
+{
+	int err = 0;
+	zap_cursor_t zc;
+	zap_attribute_t attr;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	uint64_t bmark_zapobj = ds->ds_bookmarks;
+	if (bmark_zapobj == 0)
+		return (0);
+
+	for (zap_cursor_init(&zc, dp->dp_meta_objset, bmark_zapobj);
+	    zap_cursor_retrieve(&zc, &attr) == 0;
+	    zap_cursor_advance(&zc)) {
+		char *bmark_name = attr.za_name;
+		zfs_bookmark_phys_t bmark_phys;
+
+		err = dsl_dataset_bmark_lookup(ds, bmark_name, &bmark_phys);
+		ASSERT3U(err, !=, ENOENT);
+		if (err != 0)
+			break;
+
+		nvlist_t *out_props = fnvlist_alloc();
+		if (nvlist_exists(props,
+		    zfs_prop_to_name(ZFS_PROP_GUID))) {
+			dsl_prop_nvlist_add_uint64(out_props,
+			    ZFS_PROP_GUID, bmark_phys.zbm_guid);
+		}
+		if (nvlist_exists(props,
+		    zfs_prop_to_name(ZFS_PROP_CREATETXG))) {
+			dsl_prop_nvlist_add_uint64(out_props,
+			    ZFS_PROP_CREATETXG, bmark_phys.zbm_creation_txg);
+		}
+		if (nvlist_exists(props,
+		    zfs_prop_to_name(ZFS_PROP_CREATION))) {
+			dsl_prop_nvlist_add_uint64(out_props,
+			    ZFS_PROP_CREATION, bmark_phys.zbm_creation_time);
+		}
+
+		fnvlist_add_nvlist(outnvl, bmark_name, out_props);
+		fnvlist_free(out_props);
+	}
+	zap_cursor_fini(&zc);
+	return (err);
+}
+
+/*
+ * Retrieve the bookmarks that exist in the specified dataset, and the
+ * requested properties of each bookmark.
+ *
+ * The "props" nvlist specifies which properties are requested.
+ * See lzc_get_bookmarks() for the list of valid properties.
+ */
+int
+dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
+	int err;
+
+	err = dsl_pool_hold(dsname, FTAG, &dp);
+	if (err != 0)
+		return (err);
+	err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+	if (err != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (err);
+	}
+
+	err = dsl_get_bookmarks_impl(ds, props, outnvl);
+
+	dsl_dataset_rele(ds, FTAG);
+	dsl_pool_rele(dp, FTAG);
+	return (err);
+}
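A matching sketch for the read side, again with a hypothetical dataset name; the props nvlist only needs the property names present as keys, since dsl_get_bookmarks_impl() tests them with nvlist_exists():

	nvlist_t *props = fnvlist_alloc();
	nvlist_t *outnvl = fnvlist_alloc();
	int error;

	fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_GUID));
	fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATETXG));
	fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATION));
	error = dsl_get_bookmarks("tank/fs", props, outnvl);
	/* outnvl now maps bookmark name -> nvlist of requested values */
	fnvlist_free(props);
	fnvlist_free(outnvl);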
+
+typedef struct dsl_bookmark_destroy_arg {
+	nvlist_t *dbda_bmarks;
+	nvlist_t *dbda_success;
+	nvlist_t *dbda_errors;
+} dsl_bookmark_destroy_arg_t;
+
+static int
+dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	uint64_t bmark_zapobj = ds->ds_bookmarks;
+	matchtype_t mt;
+
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+		mt = MT_FIRST;
+	else
+		mt = MT_EXACT;
+
+	return (zap_remove_norm(mos, bmark_zapobj, name, mt, tx));
+}
+
+static int
+dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_bookmark_destroy_arg_t *dbda = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	int rv = 0;
+
+	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
+		return (0);
+
+	for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) {
+		const char *fullname = nvpair_name(pair);
+		dsl_dataset_t *ds;
+		zfs_bookmark_phys_t bm;
+		int error;
+		char *shortname;
+
+		error = dsl_bookmark_hold_ds(dp, fullname, &ds,
+		    FTAG, &shortname);
+		if (error == ENOENT) {
+			/* ignore it; the bookmark is "already destroyed" */
+			continue;
+		}
+		if (error == 0) {
+			error = dsl_dataset_bmark_lookup(ds, shortname, &bm);
+			dsl_dataset_rele(ds, FTAG);
+			if (error == ESRCH) {
+				/*
+				 * ignore it; the bookmark is
+				 * "already destroyed"
+				 */
+				continue;
+			}
+		}
+		if (error == 0) {
+			fnvlist_add_boolean(dbda->dbda_success, fullname);
+		} else {
+			fnvlist_add_int32(dbda->dbda_errors, fullname, error);
+			rv = error;
+		}
+	}
+	return (rv);
+}
+
+static void
+dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_bookmark_destroy_arg_t *dbda = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
+
+	for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) {
+		dsl_dataset_t *ds;
+		char *shortname;
+		uint64_t zap_cnt;
+
+		VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair),
+		    &ds, FTAG, &shortname));
+		VERIFY0(dsl_dataset_bookmark_remove(ds, shortname, tx));
+
+		/*
+		 * If all of this dataset's bookmarks have been destroyed,
+		 * free the zap object and decrement the feature's use count.
+		 */
+		VERIFY0(zap_count(mos, ds->ds_bookmarks,
+		    &zap_cnt));
+		if (zap_cnt == 0) {
+			dmu_buf_will_dirty(ds->ds_dbuf, tx);
+			VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
+			ds->ds_bookmarks = 0;
+			spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+			VERIFY0(zap_remove(mos, ds->ds_object,
+			    DS_FIELD_BOOKMARK_NAMES, tx));
+		}
+
+		spa_history_log_internal_ds(ds, "remove bookmark", tx,
+		    "name=%s", shortname);
+
+		dsl_dataset_rele(ds, FTAG);
+	}
+}
+
+/*
+ * The bookmarks must all be in the same pool.
+ */
+int
+dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors)
+{
+	int rv;
+	dsl_bookmark_destroy_arg_t dbda;
+	nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
+	if (pair == NULL)
+		return (0);
+
+	dbda.dbda_bmarks = bmarks;
+	dbda.dbda_errors = errors;
+	dbda.dbda_success = fnvlist_alloc();
+
+	rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check,
+	    dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks),
+	    ZFS_SPACE_CHECK_RESERVED);
+	fnvlist_free(dbda.dbda_success);
+	return (rv);
+}
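And the destroy side, completing the picture (hypothetical name again); note that bookmarks which are already gone count as success, per the ENOENT/ESRCH handling in the check function above:

	nvlist_t *bmarks = fnvlist_alloc();
	nvlist_t *errors = fnvlist_alloc();
	int error;

	fnvlist_add_boolean(bmarks, "tank/fs#mybm");
	error = dsl_bookmark_destroy(bmarks, errors);
	fnvlist_free(bmarks);
	fnvlist_free(errors);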
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c,v
retrieving revision 1.2
diff -u -p -r1.2 dsl_dataset.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c	19 Feb 2016 19:25:22 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c	29 Apr 2017 00:45:23 -0000
@@ -19,8 +19,14 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 RackTop Systems.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved.
  */
 
 #include <sys/dmu_objset.h>
@@ -29,29 +35,60 @@
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_traverse.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_send.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
+#include <sys/zfeature.h>
 #include <sys/unique.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/spa.h>
 #include <sys/zfs_znode.h>
+#include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dmu_send.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <zfs_fletcher.h>
 
-static char *dsl_reaper = "the grim reaper";
+SYSCTL_DECL(_vfs_zfs);
 
-static dsl_checkfunc_t dsl_dataset_destroy_begin_check;
-static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
-static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
+/*
+ * The SPA supports block sizes up to 16MB.  However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator.  Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB).  Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
+    &zfs_max_recordsize, 0,
+    "Maximum block size.  Expect dragons when tuning this.");
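Given the declaration above (parent _vfs_zfs, OID name max_recordsize, CTLFLAG_RWTUN), the knob should surface as vfs.zfs.max_recordsize, settable at runtime and as a boot-time tunable; the exact spelling below is inferred from the declaration rather than taken from documentation:

	/*
	 * Inferred usage:
	 *	# sysctl vfs.zfs.max_recordsize=16777216	(runtime)
	 *	vfs.zfs.max_recordsize="16777216"		(loader.conf)
	 */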
+
+#define	SWITCH64(x, y) \
+	{ \
+		uint64_t __tmp = (x); \
+		(x) = (y); \
+		(y) = __tmp; \
+	}
 
 #define	DS_REF_MAX	(1ULL << 62)
 
-#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE
+extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
 
-#define	DSL_DATASET_IS_DESTROYED(ds)	((ds)->ds_owner == dsl_reaper)
+extern int spa_asize_inflation;
 
+static zil_header_t zero_zil;
 
 /*
 * Figure out how much of this delta should be propagated to the dsl_dir
@@ -61,13 +98,15 @@ static dsl_syncfunc_t dsl_dataset_set_re
 static int64_t
 parent_delta(dsl_dataset_t *ds, int64_t delta)
 {
+	dsl_dataset_phys_t *ds_phys;
 	uint64_t old_bytes, new_bytes;
 
 	if (ds->ds_reserved == 0)
 		return (delta);
 
-	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
-	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
+	ds_phys = dsl_dataset_phys(ds);
+	old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
+	new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
 
 	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
 	return (new_bytes - old_bytes);
@@ -81,129 +120,124 @@ dsl_dataset_block_born(dsl_dataset_t *ds
 	int uncompressed = BP_GET_UCSIZE(bp);
 	int64_t delta;
 
-	dprintf_bp(bp, "born, ds=%p\n", ds);
+	dprintf_bp(bp, "ds=%p", ds);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* It could have been compressed away to nothing */
 	if (BP_IS_HOLE(bp))
 		return;
 	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
-	ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES);
+	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
 	if (ds == NULL) {
-		/*
-		 * Account for the meta-objset space in its placeholder
-		 * dsl_dir.
-		 */
-		ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */
-		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
-		    used, compressed, uncompressed, tx);
-		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+		dsl_pool_mos_diduse_space(tx->tx_pool,
+		    used, compressed, uncompressed);
 		return;
 	}
+
+	ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	mutex_enter(&ds->ds_dir->dd_lock);
 	mutex_enter(&ds->ds_lock);
 	delta = parent_delta(ds, used);
-	ds->ds_phys->ds_used_bytes += used;
-	ds->ds_phys->ds_compressed_bytes += compressed;
-	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
-	ds->ds_phys->ds_unique_bytes += used;
+	dsl_dataset_phys(ds)->ds_referenced_bytes += used;
+	dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
+	dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
+	dsl_dataset_phys(ds)->ds_unique_bytes += used;
+
+	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
+		ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
+		    B_TRUE;
+	}
+
+	spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
+	if (f != SPA_FEATURE_NONE)
+		ds->ds_feature_activation_needed[f] = B_TRUE;
+
 	mutex_exit(&ds->ds_lock);
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
 	    compressed, uncompressed, tx);
 	dsl_dir_transfer_space(ds->ds_dir, used - delta,
-	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
-	mutex_exit(&ds->ds_dir->dd_lock);
+	    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
 }
 
 int
 dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
     boolean_t async)
 {
+	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
+	int compressed = BP_GET_PSIZE(bp);
+	int uncompressed = BP_GET_UCSIZE(bp);
+
 	if (BP_IS_HOLE(bp))
 		return (0);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(bp->blk_birth <= tx->tx_txg);
 
-	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
-	int compressed = BP_GET_PSIZE(bp);
-	int uncompressed = BP_GET_UCSIZE(bp);
-
-	ASSERT(used > 0);
 	if (ds == NULL) {
-		/*
-		 * Account for the meta-objset space in its placeholder
-		 * dataset.
-		 */
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
-
-		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD,
-		    -used, -compressed, -uncompressed, tx);
-		dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx);
+		dsl_pool_mos_diduse_space(tx->tx_pool,
+		    -used, -compressed, -uncompressed);
 		return (used);
 	}
 	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
 
-	ASSERT(!dsl_dataset_is_snapshot(ds));
+	ASSERT(!ds->ds_is_snapshot);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
-	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+	if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 		int64_t delta;
 
-		dprintf_bp(bp, "freeing: %s", "");
+		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
 		dsl_free(tx->tx_pool, tx->tx_txg, bp);
 
-		mutex_enter(&ds->ds_dir->dd_lock);
 		mutex_enter(&ds->ds_lock);
-		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
+		ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
 		    !DS_UNIQUE_IS_ACCURATE(ds));
 		delta = parent_delta(ds, -used);
-		ds->ds_phys->ds_unique_bytes -= used;
+		dsl_dataset_phys(ds)->ds_unique_bytes -= used;
 		mutex_exit(&ds->ds_lock);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
 		    delta, -compressed, -uncompressed, tx);
 		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
-		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
-		mutex_exit(&ds->ds_dir->dd_lock);
+		    DD_USED_REFRSRV, DD_USED_HEAD, NULL);
 	} else {
 		dprintf_bp(bp, "putting on dead list: %s", "");
 		if (async) {
 			/*
 			 * We are here as part of zio's write done callback,
 			 * which means we're a zio interrupt thread.  We can't
-			 * call bplist_enqueue() now because it may block
+			 * call dsl_deadlist_insert() now because it may block
 			 * waiting for I/O.  Instead, put bp on the deferred
 			 * queue and let dsl_pool_sync() finish the job.
 			 */
-			bplist_enqueue_deferred(&ds->ds_deadlist, bp);
+			bplist_append(&ds->ds_pending_deadlist, bp);
 		} else {
-			VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
 		}
 		ASSERT3U(ds->ds_prev->ds_object, ==,
-		    ds->ds_phys->ds_prev_snap_obj);
-		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
+		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
 		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
-		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
+		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
 		    ds->ds_object && bp->blk_birth >
-		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
+		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
 			mutex_enter(&ds->ds_prev->ds_lock);
-			ds->ds_prev->ds_phys->ds_unique_bytes += used;
+			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
 			mutex_exit(&ds->ds_prev->ds_lock);
 		}
-		if (bp->blk_birth > ds->ds_origin_txg) {
+		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
 			dsl_dir_transfer_space(ds->ds_dir, used,
 			    DD_USED_HEAD, DD_USED_SNAP, tx);
 		}
 	}
 	mutex_enter(&ds->ds_lock);
-	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
-	ds->ds_phys->ds_used_bytes -= used;
-	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
-	ds->ds_phys->ds_compressed_bytes -= compressed;
-	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
-	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
+	dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
+	dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
+	dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
 	mutex_exit(&ds->ds_lock);
 
 	return (used);
@@ -229,50 +263,78 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t 
 	if (ds->ds_trysnap_txg >
 	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
 		trysnap = ds->ds_trysnap_txg;
-	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
+	return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
 }
 
 boolean_t
-dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
+dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
+    uint64_t blk_birth)
 {
-	return (blk_birth > dsl_dataset_prev_snap_txg(ds));
+	if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
+	    (bp != NULL && BP_IS_HOLE(bp)))
+		return (B_FALSE);
+
+	ddt_prefetch(dsl_dataset_get_spa(ds), bp);
+
+	return (B_TRUE);
 }
 
-/* ARGSUSED */
+/*
+ * We have to release the fsid synchronously or we risk that a subsequent
+ * mount of the same dataset will fail to unique_insert the fsid.  This
+ * failure would manifest itself as the fsid of this dataset changing
+ * between mounts, which makes NFS clients quite unhappy.
+ */
 static void
-dsl_dataset_evict(dmu_buf_t *db, void *dsv)
+dsl_dataset_evict_sync(void *dbu)
 {
-	dsl_dataset_t *ds = dsv;
+	dsl_dataset_t *ds = dbu;
 
-	ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds));
+	ASSERT(ds->ds_owner == NULL);
 
 	unique_remove(ds->ds_fsid_guid);
+}
+
+static void
+dsl_dataset_evict_async(void *dbu)
+{
+	dsl_dataset_t *ds = dbu;
+
+	ASSERT(ds->ds_owner == NULL);
+
+	ds->ds_dbuf = NULL;
 
 	if (ds->ds_objset != NULL)
 		dmu_objset_evict(ds->ds_objset);
 
 	if (ds->ds_prev) {
-		dsl_dataset_drop_ref(ds->ds_prev, ds);
+		dsl_dataset_rele(ds->ds_prev, ds);
 		ds->ds_prev = NULL;
 	}
 
-	bplist_close(&ds->ds_deadlist);
+	bplist_destroy(&ds->ds_pending_deadlist);
+	if (ds->ds_deadlist.dl_os != NULL)
+		dsl_deadlist_close(&ds->ds_deadlist);
 	if (ds->ds_dir)
-		dsl_dir_close(ds->ds_dir, ds);
+		dsl_dir_async_rele(ds->ds_dir, ds);
 
 	ASSERT(!list_link_active(&ds->ds_synced_link));
 
+	list_destroy(&ds->ds_prop_cbs);
+	if (mutex_owned(&ds->ds_lock))
+		mutex_exit(&ds->ds_lock);
 	mutex_destroy(&ds->ds_lock);
-	mutex_destroy(&ds->ds_recvlock);
+	if (mutex_owned(&ds->ds_opening_lock))
+		mutex_exit(&ds->ds_opening_lock);
 	mutex_destroy(&ds->ds_opening_lock);
-	rw_destroy(&ds->ds_rwlock);
-	cv_destroy(&ds->ds_exclusive_cv);
-	bplist_fini(&ds->ds_deadlist);
+	mutex_destroy(&ds->ds_sendstream_lock);
+	refcount_destroy(&ds->ds_longholds);
+	rrw_destroy(&ds->ds_bp_rwlock);
 
 	kmem_free(ds, sizeof (dsl_dataset_t));
 }
 
-static int
+int
 dsl_dataset_get_snapname(dsl_dataset_t *ds)
 {
 	dsl_dataset_phys_t *headphys;
@@ -283,12 +345,12 @@ dsl_dataset_get_snapname(dsl_dataset_t *
 
 	if (ds->ds_snapname[0])
 		return (0);
-	if (ds->ds_phys->ds_next_snap_obj == 0)
+	if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
 		return (0);
 
-	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
+	err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
 	    FTAG, &headdbuf);
-	if (err)
+	if (err != 0)
 		return (err);
 	headphys = headdbuf->db_data;
 	err = zap_value_search(dp->dp_meta_objset,
@@ -297,15 +359,15 @@ dsl_dataset_get_snapname(dsl_dataset_t *
 	return (err);
 }
 
-static int
+int
 dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 	matchtype_t mt;
 	int err;
 
-	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
 		mt = MT_FIRST;
 	else
 		mt = MT_EXACT;
@@ -317,17 +379,18 @@ dsl_dataset_snap_lookup(dsl_dataset_t *d
 	return (err);
 }
 
-static int
-dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx)
+int
+dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+    boolean_t adj_cnt)
 {
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 	matchtype_t mt;
 	int err;
 
 	dsl_dir_snap_cmtime_update(ds->ds_dir);
 
-	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
 		mt = MT_FIRST;
 	else
 		mt = MT_EXACT;
@@ -335,316 +398,305 @@ dsl_dataset_snap_remove(dsl_dataset_t *d
 	err = zap_remove_norm(mos, snapobj, name, mt, tx);
 	if (err == ENOTSUP && mt == MT_FIRST)
 		err = zap_remove(mos, snapobj, name, tx);
+
+	if (err == 0 && adj_cnt)
+		dsl_fs_ss_count_adjust(ds->ds_dir, -1,
+		    DD_FIELD_SNAPSHOT_COUNT, tx);
+
 	return (err);
 }
 
-static int
-dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag,
+boolean_t
+dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
+{
+	dmu_buf_t *dbuf = ds->ds_dbuf;
+	boolean_t result = B_FALSE;
+
+	if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
+	    ds->ds_object, DMU_BONUS_BLKID, tag)) {
+
+		if (ds == dmu_buf_get_user(dbuf))
+			result = B_TRUE;
+		else
+			dmu_buf_rele(dbuf, tag);
+	}
+
+	return (result);
+}
+
+int
+dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
     dsl_dataset_t **dsp)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	dmu_buf_t *dbuf;
 	dsl_dataset_t *ds;
 	int err;
+	dmu_object_info_t doi;
 
-	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
-	    dsl_pool_sync_context(dp));
+	ASSERT(dsl_pool_config_held(dp));
 
 	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
-	if (err)
+	if (err != 0)
 		return (err);
+
+	/* Make sure dsobj has the correct object type. */
+	dmu_object_info_from_db(dbuf, &doi);
+	if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
+		dmu_buf_rele(dbuf, tag);
+		return (SET_ERROR(EINVAL));
+	}
+
 	ds = dmu_buf_get_user(dbuf);
 	if (ds == NULL) {
-		dsl_dataset_t *winner;
+		dsl_dataset_t *winner = NULL;
 
 		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
 		ds->ds_dbuf = dbuf;
 		ds->ds_object = dsobj;
-		ds->ds_phys = dbuf->db_data;
+		ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
 
 		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
-		mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL);
 		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
-		rw_init(&ds->ds_rwlock, 0, 0, 0);
-		cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
-		bplist_init(&ds->ds_deadlist);
-
-		err = bplist_open(&ds->ds_deadlist,
-		    mos, ds->ds_phys->ds_deadlist_obj);
-		if (err == 0) {
-			err = dsl_dir_open_obj(dp,
-			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
+		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
+		rrw_init(&ds->ds_bp_rwlock, B_FALSE);
+		refcount_create(&ds->ds_longholds);
+
+		bplist_create(&ds->ds_pending_deadlist);
+		dsl_deadlist_open(&ds->ds_deadlist,
+		    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
+
+		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
+		    offsetof(dmu_sendarg_t, dsa_link));
+
+		list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
+		    offsetof(dsl_prop_cb_record_t, cbr_ds_node));
+
+		if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+			for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+				if (!(spa_feature_table[f].fi_flags &
+				    ZFEATURE_FLAG_PER_DATASET))
+					continue;
+				err = zap_contains(mos, dsobj,
+				    spa_feature_table[f].fi_guid);
+				if (err == 0) {
+					ds->ds_feature_inuse[f] = B_TRUE;
+				} else {
+					ASSERT3U(err, ==, ENOENT);
+					err = 0;
+				}
+			}
 		}
-		if (err) {
-			/*
-			 * we don't really need to close the blist if we
-			 * just opened it.
-			 */
+
+		err = dsl_dir_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
+		if (err != 0) {
 			mutex_destroy(&ds->ds_lock);
-			mutex_destroy(&ds->ds_recvlock);
 			mutex_destroy(&ds->ds_opening_lock);
-			rw_destroy(&ds->ds_rwlock);
-			cv_destroy(&ds->ds_exclusive_cv);
-			bplist_fini(&ds->ds_deadlist);
+			mutex_destroy(&ds->ds_sendstream_lock);
+			refcount_destroy(&ds->ds_longholds);
+			bplist_destroy(&ds->ds_pending_deadlist);
+			dsl_deadlist_close(&ds->ds_deadlist);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			dmu_buf_rele(dbuf, tag);
 			return (err);
 		}
 
-		if (!dsl_dataset_is_snapshot(ds)) {
+		if (!ds->ds_is_snapshot) {
 			ds->ds_snapname[0] = '\0';
-			if (ds->ds_phys->ds_prev_snap_obj) {
-				err = dsl_dataset_get_ref(dp,
-				    ds->ds_phys->ds_prev_snap_obj,
+			if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+				err = dsl_dataset_hold_obj(dp,
+				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
 				    ds, &ds->ds_prev);
 			}
-
-			if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) {
-				dsl_dataset_t *origin;
-
-				err = dsl_dataset_hold_obj(dp,
-				    ds->ds_dir->dd_phys->dd_origin_obj,
-				    FTAG, &origin);
-				if (err == 0) {
-					ds->ds_origin_txg =
-					    origin->ds_phys->ds_creation_txg;
-					dsl_dataset_rele(origin, FTAG);
-				}
+			if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+				int zaperr = zap_lookup(mos, ds->ds_object,
+				    DS_FIELD_BOOKMARK_NAMES,
+				    sizeof (ds->ds_bookmarks), 1,
+				    &ds->ds_bookmarks);
+				if (zaperr != ENOENT)
+					VERIFY0(zaperr);
 			}
 		} else {
 			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
 				err = dsl_dataset_get_snapname(ds);
-			if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
+			if (err == 0 &&
+			    dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
 				err = zap_count(
 				    ds->ds_dir->dd_pool->dp_meta_objset,
-				    ds->ds_phys->ds_userrefs_obj,
+				    dsl_dataset_phys(ds)->ds_userrefs_obj,
 				    &ds->ds_userrefs);
 			}
 		}
 
-		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
-			/*
-			 * In sync context, we're called with either no lock
-			 * or with the write lock.  If we're not syncing,
-			 * we're always called with the read lock held.
-			 */
-			boolean_t need_lock =
-			    !RW_WRITE_HELD(&dp->dp_config_rwlock) &&
-			    dsl_pool_sync_context(dp);
-
-			if (need_lock)
-				rw_enter(&dp->dp_config_rwlock, RW_READER);
-
-			err = dsl_prop_get_ds(ds,
-			    "refreservation", sizeof (uint64_t), 1,
-			    &ds->ds_reserved, NULL);
+		if (err == 0 && !ds->ds_is_snapshot) {
+			err = dsl_prop_get_int_ds(ds,
+			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+			    &ds->ds_reserved);
 			if (err == 0) {
-				err = dsl_prop_get_ds(ds,
-				    "refquota", sizeof (uint64_t), 1,
-				    &ds->ds_quota, NULL);
+				err = dsl_prop_get_int_ds(ds,
+				    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+				    &ds->ds_quota);
 			}
-
-			if (need_lock)
-				rw_exit(&dp->dp_config_rwlock);
 		} else {
 			ds->ds_reserved = ds->ds_quota = 0;
 		}
 
-		if (err == 0) {
-			winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
-			    dsl_dataset_evict);
-		}
-		if (err || winner) {
-			bplist_close(&ds->ds_deadlist);
+		dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync,
+		    dsl_dataset_evict_async, &ds->ds_dbuf);
+		if (err == 0)
+			winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
+
+		if (err != 0 || winner != NULL) {
+			bplist_destroy(&ds->ds_pending_deadlist);
+			dsl_deadlist_close(&ds->ds_deadlist);
 			if (ds->ds_prev)
-				dsl_dataset_drop_ref(ds->ds_prev, ds);
-			dsl_dir_close(ds->ds_dir, ds);
+				dsl_dataset_rele(ds->ds_prev, ds);
+			dsl_dir_rele(ds->ds_dir, ds);
 			mutex_destroy(&ds->ds_lock);
-			mutex_destroy(&ds->ds_recvlock);
 			mutex_destroy(&ds->ds_opening_lock);
-			rw_destroy(&ds->ds_rwlock);
-			cv_destroy(&ds->ds_exclusive_cv);
-			bplist_fini(&ds->ds_deadlist);
+			mutex_destroy(&ds->ds_sendstream_lock);
+			refcount_destroy(&ds->ds_longholds);
 			kmem_free(ds, sizeof (dsl_dataset_t));
-			if (err) {
+			if (err != 0) {
 				dmu_buf_rele(dbuf, tag);
 				return (err);
 			}
 			ds = winner;
 		} else {
 			ds->ds_fsid_guid =
-			    unique_insert(ds->ds_phys->ds_fsid_guid);
+			    unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
+			if (ds->ds_fsid_guid !=
+			    dsl_dataset_phys(ds)->ds_fsid_guid) {
+				zfs_dbgmsg("ds_fsid_guid changed from "
+				    "%llx to %llx for pool %s dataset id %llu",
+				    (long long)
+				    dsl_dataset_phys(ds)->ds_fsid_guid,
+				    (long long)ds->ds_fsid_guid,
+				    spa_name(dp->dp_spa),
+				    dsobj);
+			}
 		}
 	}
 	ASSERT3P(ds->ds_dbuf, ==, dbuf);
-	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
-	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
+	ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
+	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
 	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
 	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
-	mutex_enter(&ds->ds_lock);
-	if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
-		mutex_exit(&ds->ds_lock);
-		dmu_buf_rele(ds->ds_dbuf, tag);
-		return (ENOENT);
-	}
-	mutex_exit(&ds->ds_lock);
 	*dsp = ds;
 	return (0);
 }
 
-static int
-dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag)
-{
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
-	/*
-	 * In syncing context we don't want the rwlock lock: there
-	 * may be an existing writer waiting for sync phase to
-	 * finish.  We don't need to worry about such writers, since
-	 * sync phase is single-threaded, so the writer can't be
-	 * doing anything while we are active.
-	 */
-	if (dsl_pool_sync_context(dp)) {
-		ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
-		return (0);
-	}
-
-	/*
-	 * Normal users will hold the ds_rwlock as a READER until they
-	 * are finished (i.e., call dsl_dataset_rele()).  "Owners" will
-	 * drop their READER lock after they set the ds_owner field.
-	 *
-	 * If the dataset is being destroyed, the destroy thread will
-	 * obtain a WRITER lock for exclusive access after it's done its
-	 * open-context work and then change the ds_owner to
-	 * dsl_reaper once destruction is assured.  So threads
-	 * may block here temporarily, until the "destructability" of
-	 * the dataset is determined.
-	 */
-	ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock));
-	mutex_enter(&ds->ds_lock);
-	while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) {
-		rw_exit(&dp->dp_config_rwlock);
-		cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock);
-		if (DSL_DATASET_IS_DESTROYED(ds)) {
-			mutex_exit(&ds->ds_lock);
-			dsl_dataset_drop_ref(ds, tag);
-			rw_enter(&dp->dp_config_rwlock, RW_READER);
-			return (ENOENT);
-		}
-		/*
-		 * The dp_config_rwlock lives above the ds_lock. And
-		 * we need to check DSL_DATASET_IS_DESTROYED() while
-		 * holding the ds_lock, so we have to drop and reacquire
-		 * the ds_lock here.
-		 */
-		mutex_exit(&ds->ds_lock);
-		rw_enter(&dp->dp_config_rwlock, RW_READER);
-		mutex_enter(&ds->ds_lock);
-	}
-	mutex_exit(&ds->ds_lock);
-	return (0);
-}
-
-int
-dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
-    dsl_dataset_t **dsp)
-{
-	int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp);
-
-	if (err)
-		return (err);
-	return (dsl_dataset_hold_ref(*dsp, tag));
-}
-
 int
-dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok,
+dsl_dataset_hold(dsl_pool_t *dp, const char *name,
     void *tag, dsl_dataset_t **dsp)
 {
-	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
-	if (err)
-		return (err);
-	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
-		dsl_dataset_rele(*dsp, tag);
-		*dsp = NULL;
-		return (EBUSY);
-	}
-	return (0);
-}
-
-int
-dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp)
-{
 	dsl_dir_t *dd;
-	dsl_pool_t *dp;
 	const char *snapname;
 	uint64_t obj;
 	int err = 0;
+	dsl_dataset_t *ds;
 
-	err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname);
-	if (err)
+	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
+	if (err != 0)
 		return (err);
 
-	dp = dd->dd_pool;
-	obj = dd->dd_phys->dd_head_dataset_obj;
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-	if (obj)
-		err = dsl_dataset_get_ref(dp, obj, tag, dsp);
+	ASSERT(dsl_pool_config_held(dp));
+	obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
+	if (obj != 0)
+		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
 	else
-		err = ENOENT;
-	if (err)
-		goto out;
-
-	err = dsl_dataset_hold_ref(*dsp, tag);
+		err = SET_ERROR(ENOENT);
 
 	/* we may be looking for a snapshot */
 	if (err == 0 && snapname != NULL) {
-		dsl_dataset_t *ds = NULL;
+		dsl_dataset_t *snap_ds;
 
 		if (*snapname++ != '@') {
-			dsl_dataset_rele(*dsp, tag);
-			err = ENOENT;
-			goto out;
+			dsl_dataset_rele(ds, tag);
+			dsl_dir_rele(dd, FTAG);
+			return (SET_ERROR(ENOENT));
 		}
 
 		dprintf("looking for snapshot '%s'\n", snapname);
-		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
+		err = dsl_dataset_snap_lookup(ds, snapname, &obj);
 		if (err == 0)
-			err = dsl_dataset_get_ref(dp, obj, tag, &ds);
-		dsl_dataset_rele(*dsp, tag);
+			err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
+		dsl_dataset_rele(ds, tag);
 
-		ASSERT3U((err == 0), ==, (ds != NULL));
-
-		if (ds) {
-			mutex_enter(&ds->ds_lock);
-			if (ds->ds_snapname[0] == 0)
-				(void) strlcpy(ds->ds_snapname, snapname,
-				    sizeof (ds->ds_snapname));
-			mutex_exit(&ds->ds_lock);
-			err = dsl_dataset_hold_ref(ds, tag);
-			*dsp = err ? NULL : ds;
+		if (err == 0) {
+			mutex_enter(&snap_ds->ds_lock);
+			if (snap_ds->ds_snapname[0] == 0)
+				(void) strlcpy(snap_ds->ds_snapname, snapname,
+				    sizeof (snap_ds->ds_snapname));
+			mutex_exit(&snap_ds->ds_lock);
+			ds = snap_ds;
 		}
 	}
-out:
-	rw_exit(&dp->dp_config_rwlock);
-	dsl_dir_close(dd, FTAG);
+	if (err == 0)
+		*dsp = ds;
+	dsl_dir_rele(dd, FTAG);
 	return (err);
 }
 
 int
-dsl_dataset_own(const char *name, boolean_t inconsistentok,
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
+    void *tag, dsl_dataset_t **dsp)
+{
+	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
+	if (err != 0)
+		return (err);
+	if (!dsl_dataset_tryown(*dsp, tag)) {
+		dsl_dataset_rele(*dsp, tag);
+		*dsp = NULL;
+		return (SET_ERROR(EBUSY));
+	}
+	return (0);
+}
+
+int
+dsl_dataset_own(dsl_pool_t *dp, const char *name,
     void *tag, dsl_dataset_t **dsp)
 {
-	int err = dsl_dataset_hold(name, tag, dsp);
-	if (err)
+	int err = dsl_dataset_hold(dp, name, tag, dsp);
+	if (err != 0)
 		return (err);
-	if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) {
+	if (!dsl_dataset_tryown(*dsp, tag)) {
 		dsl_dataset_rele(*dsp, tag);
-		return (EBUSY);
+		return (SET_ERROR(EBUSY));
 	}
 	return (0);
 }
 
+/*
+ * See the comment above dsl_pool_hold() for details.  In summary, a long
+ * hold is used to prevent destruction of a dataset while the pool hold
+ * is dropped, allowing other concurrent operations (e.g. spa_sync()).
+ *
+ * The dataset and pool must be held when this function is called.  After it
+ * is called, the pool hold may be released while the dataset is still held
+ * and accessed.
+ */
+void
+dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
+{
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+	(void) refcount_add(&ds->ds_longholds, tag);
+}
+
+void
+dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
+{
+	(void) refcount_remove(&ds->ds_longholds, tag);
+}
+
+/* Return B_TRUE if there are any long holds on this dataset. */
+boolean_t
+dsl_dataset_long_held(dsl_dataset_t *ds)
+{
+	return (!refcount_is_zero(&ds->ds_longholds));
+}
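A minimal sketch of the pattern the comment above describes, with hypothetical pool and dataset names; this is how a long-running consumer (a send stream, say) keeps the dataset alive after dropping the pool hold:

	dsl_pool_t *dp;
	dsl_dataset_t *ds;

	VERIFY0(dsl_pool_hold("tank", FTAG, &dp));
	VERIFY0(dsl_dataset_hold(dp, "tank/fs", FTAG, &ds));
	dsl_dataset_long_hold(ds, FTAG);
	dsl_pool_rele(dp, FTAG);	/* ds remains safe to access */

	/* ... long-running work that must not lose the dataset ... */

	dsl_dataset_long_rele(ds, FTAG);
	dsl_dataset_rele(ds, FTAG);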
+
 void
 dsl_dataset_name(dsl_dataset_t *ds, char *name)
 {
@@ -652,106 +704,110 @@ dsl_dataset_name(dsl_dataset_t *ds, char
 		(void) strcpy(name, "mos");
 	} else {
 		dsl_dir_name(ds->ds_dir, name);
-		VERIFY(0 == dsl_dataset_get_snapname(ds));
+		VERIFY0(dsl_dataset_get_snapname(ds));
 		if (ds->ds_snapname[0]) {
-			(void) strcat(name, "@");
+			VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN),
+			    <, ZFS_MAX_DATASET_NAME_LEN);
 			/*
 			 * We use a "recursive" mutex so that we
 			 * can call dprintf_ds() with ds_lock held.
 			 */
 			if (!MUTEX_HELD(&ds->ds_lock)) {
 				mutex_enter(&ds->ds_lock);
-				(void) strcat(name, ds->ds_snapname);
+				VERIFY3U(strlcat(name, ds->ds_snapname,
+				    ZFS_MAX_DATASET_NAME_LEN), <,
+				    ZFS_MAX_DATASET_NAME_LEN);
 				mutex_exit(&ds->ds_lock);
 			} else {
-				(void) strcat(name, ds->ds_snapname);
+				VERIFY3U(strlcat(name, ds->ds_snapname,
+				    ZFS_MAX_DATASET_NAME_LEN), <,
+				    ZFS_MAX_DATASET_NAME_LEN);
 			}
 		}
 	}
 }
 
-static int
+int
 dsl_dataset_namelen(dsl_dataset_t *ds)
 {
-	int result;
-
-	if (ds == NULL) {
-		result = 3;	/* "mos" */
-	} else {
-		result = dsl_dir_namelen(ds->ds_dir);
-		VERIFY(0 == dsl_dataset_get_snapname(ds));
-		if (ds->ds_snapname[0]) {
-			++result;	/* adding one for the @-sign */
-			if (!MUTEX_HELD(&ds->ds_lock)) {
-				mutex_enter(&ds->ds_lock);
-				result += strlen(ds->ds_snapname);
-				mutex_exit(&ds->ds_lock);
-			} else {
-				result += strlen(ds->ds_snapname);
-			}
-		}
-	}
-
-	return (result);
-}
-
-void
-dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
-{
-	dmu_buf_rele(ds->ds_dbuf, tag);
+	VERIFY0(dsl_dataset_get_snapname(ds));
+	mutex_enter(&ds->ds_lock);
+	int len = dsl_dir_namelen(ds->ds_dir) + 1 + strlen(ds->ds_snapname);
+	mutex_exit(&ds->ds_lock);
+	return (len);
 }
 
 void
 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
 {
-	if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
-		rw_exit(&ds->ds_rwlock);
-	}
-	dsl_dataset_drop_ref(ds, tag);
+	dmu_buf_rele(ds->ds_dbuf, tag);
 }
 
 void
 dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
 {
-	ASSERT((ds->ds_owner == tag && ds->ds_dbuf) ||
-	    (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL));
+	ASSERT3P(ds->ds_owner, ==, tag);
+	ASSERT(ds->ds_dbuf != NULL);
 
 	mutex_enter(&ds->ds_lock);
 	ds->ds_owner = NULL;
-	if (RW_WRITE_HELD(&ds->ds_rwlock)) {
-		rw_exit(&ds->ds_rwlock);
-		cv_broadcast(&ds->ds_exclusive_cv);
-	}
 	mutex_exit(&ds->ds_lock);
-	if (ds->ds_dbuf)
-		dsl_dataset_drop_ref(ds, tag);
-	else
-		dsl_dataset_evict(ds->ds_dbuf, ds);
+	dsl_dataset_long_rele(ds, tag);
+	dsl_dataset_rele(ds, tag);
 }
 
 boolean_t
-dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag)
+dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
 {
 	boolean_t gotit = FALSE;
 
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 	mutex_enter(&ds->ds_lock);
-	if (ds->ds_owner == NULL &&
-	    (!DS_IS_INCONSISTENT(ds) || inconsistentok)) {
+	if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
 		ds->ds_owner = tag;
-		if (!dsl_pool_sync_context(ds->ds_dir->dd_pool))
-			rw_exit(&ds->ds_rwlock);
+		dsl_dataset_long_hold(ds, tag);
 		gotit = TRUE;
 	}
 	mutex_exit(&ds->ds_lock);
 	return (gotit);
 }
 
+boolean_t
+dsl_dataset_has_owner(dsl_dataset_t *ds)
+{
+	boolean_t rv;
+	mutex_enter(&ds->ds_lock);
+	rv = (ds->ds_owner != NULL);
+	mutex_exit(&ds->ds_lock);
+	return (rv);
+}
+
+static void
+dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+	uint64_t zero = 0;
+
+	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+	spa_feature_incr(spa, f, tx);
+	dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+
+	VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
+	    sizeof (zero), 1, &zero, tx));
+}
+
 void
-dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner)
+dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
 {
-	ASSERT3P(owner, ==, ds->ds_owner);
-	if (!RW_WRITE_HELD(&ds->ds_rwlock))
-		rw_enter(&ds->ds_rwlock, RW_WRITER);
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+
+	VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+	VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
+	spa_feature_decr(spa, f, tx);
 }
 
 uint64_t
@@ -768,58 +824,93 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd
 		origin = dp->dp_origin_snap;
 
 	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
-	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
+	ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
 	ASSERT(dmu_tx_is_syncing(tx));
-	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
+	ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
-	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	bzero(dsphys, sizeof (dsl_dataset_phys_t));
 	dsphys->ds_dir_obj = dd->dd_object;
 	dsphys->ds_flags = flags;
 	dsphys->ds_fsid_guid = unique_create();
-	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
-	    sizeof (dsphys->ds_guid));
+	do {
+		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+		    sizeof (dsphys->ds_guid));
+	} while (dsphys->ds_guid == 0);
 	dsphys->ds_snapnames_zapobj =
 	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
 	    DMU_OT_NONE, 0, tx);
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
-	dsphys->ds_deadlist_obj =
-	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
 
-	if (origin) {
+	if (origin == NULL) {
+		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
+	} else {
+		dsl_dataset_t *ohds; /* head of the origin snapshot */
+
 		dsphys->ds_prev_snap_obj = origin->ds_object;
 		dsphys->ds_prev_snap_txg =
-		    origin->ds_phys->ds_creation_txg;
-		dsphys->ds_used_bytes =
-		    origin->ds_phys->ds_used_bytes;
+		    dsl_dataset_phys(origin)->ds_creation_txg;
+		dsphys->ds_referenced_bytes =
+		    dsl_dataset_phys(origin)->ds_referenced_bytes;
 		dsphys->ds_compressed_bytes =
-		    origin->ds_phys->ds_compressed_bytes;
+		    dsl_dataset_phys(origin)->ds_compressed_bytes;
 		dsphys->ds_uncompressed_bytes =
-		    origin->ds_phys->ds_uncompressed_bytes;
-		dsphys->ds_bp = origin->ds_phys->ds_bp;
-		dsphys->ds_flags |= origin->ds_phys->ds_flags;
+		    dsl_dataset_phys(origin)->ds_uncompressed_bytes;
+		rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG);
+		dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
+		rrw_exit(&origin->ds_bp_rwlock, FTAG);
+
+		/*
+		 * Inherit flags that describe the dataset's contents
+		 * (INCONSISTENT) or properties (Case Insensitive).
+		 */
+		dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
+		    (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
+
+		for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+			if (origin->ds_feature_inuse[f])
+				dsl_dataset_activate_feature(dsobj, f, tx);
+		}
 
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
-		origin->ds_phys->ds_num_children++;
+		dsl_dataset_phys(origin)->ds_num_children++;
+
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
+		    FTAG, &ohds));
+		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
+		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
+		dsl_dataset_rele(ohds, FTAG);
 
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
-			if (origin->ds_phys->ds_next_clones_obj == 0) {
-				origin->ds_phys->ds_next_clones_obj =
+			if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
+				dsl_dataset_phys(origin)->ds_next_clones_obj =
 				    zap_create(mos,
 				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 			}
-			VERIFY(0 == zap_add_int(mos,
-			    origin->ds_phys->ds_next_clones_obj,
+			VERIFY0(zap_add_int(mos,
+			    dsl_dataset_phys(origin)->ds_next_clones_obj,
 			    dsobj, tx));
 		}
 
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
-		dd->dd_phys->dd_origin_obj = origin->ds_object;
+		dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
+		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+			if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
+				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+				dsl_dir_phys(origin->ds_dir)->dd_clones =
+				    zap_create(mos,
+				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+			}
+			VERIFY0(zap_add_int(mos,
+			    dsl_dir_phys(origin->ds_dir)->dd_clones,
+			    dsobj, tx));
+		}
 	}
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
@@ -828,11 +919,33 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd
 	dmu_buf_rele(dbuf, FTAG);
 
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-	dd->dd_phys->dd_head_dataset_obj = dsobj;
+	dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
 
 	return (dsobj);
 }
 
+static void
+dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	objset_t *os;
+
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+	if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {
+		dsl_pool_t *dp = ds->ds_dir->dd_pool;
+		zio_t *zio;
+
+		bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+
+		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+		dsl_dataset_sync(ds, zio, tx);
+		VERIFY0(zio_wait(zio));
+
+		/* dsl_dataset_sync_done will drop this reference. */
+		dmu_buf_add_ref(ds->ds_dbuf, ds);
+		dsl_dataset_sync_done(ds, tx);
+	}
+}
+
 uint64_t
 dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
@@ -841,343 +954,154 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, 
 	uint64_t dsobj, ddobj;
 	dsl_dir_t *dd;
 
+	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(lastname[0] != '@');
 
 	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
-	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
+	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
 
-	dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
+	dsobj = dsl_dataset_create_sync_dd(dd, origin,
+	    flags & ~DS_CREATE_FLAG_NODIRTY, tx);
 
 	dsl_deleg_set_create_perms(dd, tx, cr);
 
-	dsl_dir_close(dd, FTAG);
+	/*
+	 * Since we're creating a new node, we know it's a leaf, so we can
+	 * initialize the counts if the limit feature is active.
+	 */
+	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+		uint64_t cnt = 0;
+		objset_t *os = dd->dd_pool->dp_meta_objset;
+
+		dsl_dir_zapify(dd, tx);
+		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+		    sizeof (cnt), 1, &cnt, tx));
+		VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+		    sizeof (cnt), 1, &cnt, tx));
+	}
+
+	dsl_dir_rele(dd, FTAG);
+
+	/*
+	 * If we are creating a clone, make sure we zero out any stale
+	 * data from the origin snapshot's zil header.
+	 */
+	if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
+		dsl_dataset_t *ds;
+
+		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+		dsl_dataset_zero_zil(ds, tx);
+		dsl_dataset_rele(ds, FTAG);
+	}
 
 	return (dsobj);
 }
 
+#if defined(__FreeBSD__) || defined(__NetBSD__)
+/* FreeBSD ioctl compat begin */
 struct destroyarg {
-	dsl_sync_task_group_t *dstg;
-	char *snapname;
-	char *failed;
-	boolean_t defer;
+	nvlist_t *nvl;
+	const char *snapname;
 };
 
 static int
-dsl_snapshot_destroy_one(const char *name, void *arg)
+dsl_check_snap_cb(const char *name, void *arg)
 {
 	struct destroyarg *da = arg;
 	dsl_dataset_t *ds;
-	int err;
 	char *dsname;
 
 	dsname = kmem_asprintf("%s@%s", name, da->snapname);
-	err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds);
-	strfree(dsname);
-	if (err == 0) {
-		struct dsl_ds_destroyarg *dsda;
+	fnvlist_add_boolean(da->nvl, dsname);
+	kmem_free(dsname, strlen(dsname) + 1);
 
-		dsl_dataset_make_exclusive(ds, da->dstg);
-		if (ds->ds_objset != NULL) {
-			dmu_objset_evict(ds->ds_objset);
-			ds->ds_objset = NULL;
-		}
-		dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP);
-		dsda->ds = ds;
-		dsda->defer = da->defer;
-		dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check,
-		    dsl_dataset_destroy_sync, dsda, da->dstg, 0);
-	} else if (err == ENOENT) {
-		err = 0;
-	} else {
-		(void) strcpy(da->failed, name);
-	}
-	return (err);
+	return (0);
 }
 
-/*
- * Destroy 'snapname' in all descendants of 'fsname'.
- */
-#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy
 int
-dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer)
+dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
+    nvlist_t *snaps)
 {
+	struct destroyarg *da;
 	int err;
-	struct destroyarg da;
-	dsl_sync_task_t *dst;
-	spa_t *spa;
-
-	err = spa_open(fsname, &spa, FTAG);
-	if (err)
-		return (err);
-	da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-	da.snapname = snapname;
-	da.failed = fsname;
-	da.defer = defer;
-
-	err = dmu_objset_find(fsname,
-	    dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN);
-
-	if (err == 0)
-		err = dsl_sync_task_group_wait(da.dstg);
 
-	for (dst = list_head(&da.dstg->dstg_tasks); dst;
-	    dst = list_next(&da.dstg->dstg_tasks, dst)) {
-		struct dsl_ds_destroyarg *dsda = dst->dst_arg1;
-		dsl_dataset_t *ds = dsda->ds;
-
-		/*
-		 * Return the file system name that triggered the error
-		 */
-		if (dst->dst_err) {
-			dsl_dataset_name(ds, fsname);
-			*strchr(fsname, '@') = '\0';
-		}
-		ASSERT3P(dsda->rm_origin, ==, NULL);
-		dsl_dataset_disown(ds, da.dstg);
-		kmem_free(dsda, sizeof (struct dsl_ds_destroyarg));
-	}
+	da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
+	da->nvl = snaps;
+	da->snapname = snapname;
+	err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
+	    DS_FIND_CHILDREN);
+	kmem_free(da, sizeof (struct destroyarg));
 
-	dsl_sync_task_group_destroy(da.dstg);
-	spa_close(spa, FTAG);
 	return (err);
 }
-
-static boolean_t
-dsl_dataset_might_destroy_origin(dsl_dataset_t *ds)
-{
-	boolean_t might_destroy = B_FALSE;
-
-	mutex_enter(&ds->ds_lock);
-	if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 &&
-	    DS_IS_DEFER_DESTROY(ds))
-		might_destroy = B_TRUE;
-	mutex_exit(&ds->ds_lock);
-
-	return (might_destroy);
-}
+/* FreeBSD ioctl compat end */
+#endif /* defined(__FreeBSD__) || defined(__NetBSD__) */
 
 /*
- * If we're removing a clone, and these three conditions are true:
- *	1) the clone's origin has no other children
- *	2) the clone's origin has no user references
- *	3) the clone's origin has been marked for deferred destruction
- * Then, prepare to remove the origin as part of this sync task group.
+ * The unique space in the head dataset can be calculated by subtracting
+ * the space used in the most recent snapshot that is still being used
+ * in this file system, from the space currently in use.  To figure out
+ * the space in the most recent snapshot still in use, we need to take
+ * the total space used in the snapshot and subtract out the space that
+ * has been freed up since the snapshot was taken.
  */
-static int
-dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag)
+void
+dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
 {
-	dsl_dataset_t *ds = dsda->ds;
-	dsl_dataset_t *origin = ds->ds_prev;
+	uint64_t mrs_used;
+	uint64_t dlused, dlcomp, dluncomp;
 
-	if (dsl_dataset_might_destroy_origin(origin)) {
-		char *name;
-		int namelen;
-		int error;
-
-		namelen = dsl_dataset_namelen(origin) + 1;
-		name = kmem_alloc(namelen, KM_SLEEP);
-		dsl_dataset_name(origin, name);
-#ifdef _KERNEL
-		error = zfs_unmount_snap(name, NULL);
-		if (error) {
-			kmem_free(name, namelen);
-			return (error);
-		}
-#endif
-		error = dsl_dataset_own(name, B_TRUE, tag, &origin);
-		kmem_free(name, namelen);
-		if (error)
-			return (error);
-		dsda->rm_origin = origin;
-		dsl_dataset_make_exclusive(origin, tag);
+	ASSERT(!ds->ds_is_snapshot);
 
-		if (origin->ds_objset != NULL) {
-			dmu_objset_evict(origin->ds_objset);
-			origin->ds_objset = NULL;
-		}
-	}
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
+		mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
+	else
+		mrs_used = 0;
 
-	return (0);
+	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
+
+	ASSERT3U(dlused, <=, mrs_used);
+	dsl_dataset_phys(ds)->ds_unique_bytes =
+	    dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
+
+	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+	    SPA_VERSION_UNIQUE_ACCURATE)
+		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
 }
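A worked example with illustrative numbers makes the subtraction concrete:

	/*
	 * Illustrative numbers: the head references 10G; the most
	 * recent snapshot referenced 8G (mrs_used); 3G of its blocks
	 * have since been freed and sit on the deadlist (dlused).
	 * The snapshot still shares 8G - 3G = 5G with the head, so
	 * ds_unique_bytes = 10G - 5G = 5G.
	 */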
 
-/*
- * ds must be opened as OWNER.  On return (whether successful or not),
- * ds will be closed and caller can no longer dereference it.
- */
-int
-dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer)
+void
+dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
+    dmu_tx_t *tx)
 {
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	uint64_t count;
 	int err;
-	dsl_sync_task_group_t *dstg;
-	objset_t *os;
-	dsl_dir_t *dd;
-	uint64_t obj;
-	struct dsl_ds_destroyarg dsda = { 0 };
-	dsl_dataset_t dummy_ds = { 0 };
-
-	dsda.ds = ds;
-
-	if (dsl_dataset_is_snapshot(ds)) {
-		/* Destroying a snapshot is simpler */
-		dsl_dataset_make_exclusive(ds, tag);
-
-		if (ds->ds_objset != NULL) {
-			dmu_objset_evict(ds->ds_objset);
-			ds->ds_objset = NULL;
-		}
-		dsda.defer = defer;
-		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-		    dsl_dataset_destroy_check, dsl_dataset_destroy_sync,
-		    &dsda, tag, 0);
-		ASSERT3P(dsda.rm_origin, ==, NULL);
-		goto out;
-	} else if (defer) {
-		err = EINVAL;
-		goto out;
-	}
-
-	dd = ds->ds_dir;
-	dummy_ds.ds_dir = dd;
-	dummy_ds.ds_object = ds->ds_object;
-
-	/*
-	 * Check for errors and mark this ds as inconsistent, in
-	 * case we crash while freeing the objects.
-	 */
-	err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check,
-	    dsl_dataset_destroy_begin_sync, ds, NULL, 0);
-	if (err)
-		goto out;
-
-	err = dmu_objset_from_ds(ds, &os);
-	if (err)
-		goto out;
-
-	/*
-	 * remove the objects in open context, so that we won't
-	 * have too much to do in syncing context.
-	 */
-	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE,
-	    ds->ds_phys->ds_prev_snap_txg)) {
-		/*
-		 * Ignore errors, if there is not enough disk space
-		 * we will deal with it in dsl_dataset_destroy_sync().
-		 */
-		(void) dmu_free_object(os, obj);
-	}
-
-	/*
-	 * We need to sync out all in-flight IO before we try to evict
-	 * (the dataset evict func is trying to clear the cached entries
-	 * for this dataset in the ARC).
-	 */
-	txg_wait_synced(dd->dd_pool, 0);
-
-	/*
-	 * If we managed to free all the objects in open
-	 * context, the user space accounting should be zero.
-	 */
-	if (ds->ds_phys->ds_bp.blk_fill == 0 &&
-	    dmu_objset_userused_enabled(os)) {
-		uint64_t count;
-
-		ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 ||
-		    count == 0);
-		ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 ||
-		    count == 0);
-	}
-
-	if (err != ESRCH)
-		goto out;
-
-	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
-	err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd);
-	rw_exit(&dd->dd_pool->dp_config_rwlock);
-
-	if (err)
-		goto out;
-
-	if (ds->ds_objset) {
-		/*
-		 * We need to sync out all in-flight IO before we try
-		 * to evict (the dataset evict func is trying to clear
-		 * the cached entries for this dataset in the ARC).
-		 */
-		txg_wait_synced(dd->dd_pool, 0);
-	}
-
-	/*
-	 * Blow away the dsl_dir + head dataset.
-	 */
-	dsl_dataset_make_exclusive(ds, tag);
-	if (ds->ds_objset) {
-		dmu_objset_evict(ds->ds_objset);
-		ds->ds_objset = NULL;
-	}
 
+	ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
+	err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+	    obj, tx);
 	/*
-	 * If we're removing a clone, we might also need to remove its
-	 * origin.
+	 * The err should not be ENOENT, but a bug in a previous version
+	 * of the code could cause upgrade_clones_cb() to not set
+	 * ds_next_snap_obj when it should, leading to a missing entry.
+	 * If we knew that the pool was created after
+	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
+	 * ENOENT.  However, at least we can check that we don't have
+	 * too many entries in the next_clones_obj even after failing to
+	 * remove this one.
 	 */
-	do {
-		dsda.need_prep = B_FALSE;
-		if (dsl_dir_is_clone(dd)) {
-			err = dsl_dataset_origin_rm_prep(&dsda, tag);
-			if (err) {
-				dsl_dir_close(dd, FTAG);
-				goto out;
-			}
-		}
-
-		dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool);
-		dsl_sync_task_create(dstg, dsl_dataset_destroy_check,
-		    dsl_dataset_destroy_sync, &dsda, tag, 0);
-		dsl_sync_task_create(dstg, dsl_dir_destroy_check,
-		    dsl_dir_destroy_sync, &dummy_ds, FTAG, 0);
-		err = dsl_sync_task_group_wait(dstg);
-		dsl_sync_task_group_destroy(dstg);
-
-		/*
-		 * We could be racing against 'zfs release' or 'zfs destroy -d'
-		 * on the origin snap, in which case we can get EBUSY if we
-		 * needed to destroy the origin snap but were not ready to
-		 * do so.
-		 */
-		if (dsda.need_prep) {
-			ASSERT(err == EBUSY);
-			ASSERT(dsl_dir_is_clone(dd));
-			ASSERT(dsda.rm_origin == NULL);
-		}
-	} while (dsda.need_prep);
-
-	if (dsda.rm_origin != NULL)
-		dsl_dataset_disown(dsda.rm_origin, tag);
-
-	/* if it is successful, dsl_dir_destroy_sync will close the dd */
-	if (err)
-		dsl_dir_close(dd, FTAG);
-out:
-	dsl_dataset_disown(ds, tag);
-	return (err);
+	if (err != ENOENT)
+		VERIFY0(err);
+	ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+	    &count));
+	ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
 }
 
+
 blkptr_t *
 dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
-	return (&ds->ds_phys->ds_bp);
-}
-
-void
-dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
-{
-	ASSERT(dmu_tx_is_syncing(tx));
-	/* If it's the meta-objset, set dp_meta_rootbp */
-	if (ds == NULL) {
-		tx->tx_pool->dp_meta_rootbp = *bp;
-	} else {
-		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ds->ds_phys->ds_bp = *bp;
-	}
+	return (&dsl_dataset_phys(ds)->ds_bp);
 }
 
 spa_t *
@@ -1196,708 +1120,290 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu
 
 	ASSERT(ds->ds_objset != NULL);
 
-	if (ds->ds_phys->ds_next_snap_obj != 0)
+	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
 		panic("dirtying snapshot!");
 
-	dp = ds->ds_dir->dd_pool;
+	/* Must not dirty a dataset in the same txg where it got snapshotted. */
+	ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
 
-	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) {
+	dp = ds->ds_dir->dd_pool;
+	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
 		/* up the hold count until we can be written out */
 		dmu_buf_add_ref(ds->ds_dbuf, ds);
 	}
 }
 
-/*
- * The unique space in the head dataset can be calculated by subtracting
- * the space used in the most recent snapshot, that is still being used
- * in this file system, from the space currently in use.  To figure out
- * the space in the most recent snapshot still in use, we need to take
- * the total space used in the snapshot and subtract out the space that
- * has been freed up since the snapshot was taken.
- */
-static void
-dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
-{
-	uint64_t mrs_used;
-	uint64_t dlused, dlcomp, dluncomp;
-
-	ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj);
-
-	if (ds->ds_phys->ds_prev_snap_obj != 0)
-		mrs_used = ds->ds_prev->ds_phys->ds_used_bytes;
-	else
-		mrs_used = 0;
-
-	VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
-	    &dluncomp));
-
-	ASSERT3U(dlused, <=, mrs_used);
-	ds->ds_phys->ds_unique_bytes =
-	    ds->ds_phys->ds_used_bytes - (mrs_used - dlused);
-
-	if (!DS_UNIQUE_IS_ACCURATE(ds) &&
-	    spa_version(ds->ds_dir->dd_pool->dp_spa) >=
-	    SPA_VERSION_UNIQUE_ACCURATE)
-		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-}
-
-static uint64_t
-dsl_dataset_unique(dsl_dataset_t *ds)
+boolean_t
+dsl_dataset_is_dirty(dsl_dataset_t *ds)
 {
-	if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds))
-		dsl_dataset_recalc_head_uniq(ds);
-
-	return (ds->ds_phys->ds_unique_bytes);
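+	/* The dataset may be dirty in any of the TXG_SIZE open txgs. */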
+	for (int t = 0; t < TXG_SIZE; t++) {
+		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
+		    ds, t))
+			return (B_TRUE);
+	}
+	return (B_FALSE);
 }
 
-struct killarg {
-	dsl_dataset_t *ds;
-	dmu_tx_t *tx;
-};
-
-/* ARGSUSED */
 static int
-kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
-	struct killarg *ka = arg;
-	dmu_tx_t *tx = ka->tx;
+	uint64_t asize;
 
-	if (bp == NULL)
+	if (!dmu_tx_is_syncing(tx))
 		return (0);
 
-	if (zb->zb_level == ZB_ZIL_LEVEL) {
-		ASSERT(zilog != NULL);
-		/*
-		 * It's a block in the intent log.  It has no
-		 * accounting, so just free it.
-		 */
-		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
-	} else {
-		ASSERT(zilog == NULL);
-		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
-		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
-	}
-
-	return (0);
-}
-
-/* ARGSUSED */
-static int
-dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	uint64_t count;
-	int err;
-
 	/*
-	 * Can't delete a head dataset if there are snapshots of it.
-	 * (Except if the only snapshots are from the branch we cloned
-	 * from.)
+	 * If there's an fs-only reservation, any blocks that might become
+	 * owned by the snapshot dataset must be accommodated by space
+	 * outside of the reservation.
 	 */
-	if (ds->ds_prev != NULL &&
-	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
-		return (EBUSY);
+	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
+	asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
+	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
+		return (SET_ERROR(ENOSPC));
 
 	/*
-	 * This is really a dsl_dir thing, but check it here so that
-	 * we'll be less likely to leave this dataset inconsistent &
-	 * nearly destroyed.
+	 * Propagate any reserved space for this snapshot to other
+	 * snapshot checks in this sync group.
 	 */
-	err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
-	if (err)
-		return (err);
-	if (count != 0)
-		return (EEXIST);
+	if (asize > 0)
+		dsl_dir_willuse_space(ds->ds_dir, asize, tx);
 
 	return (0);
 }
 
-/* ARGSUSED */
-static void
-dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
-	/* Mark it as inconsistent on-disk, in case we crash */
-	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
-
-	spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx,
-	    cr, "dataset = %llu", ds->ds_object);
-}
+typedef struct dsl_dataset_snapshot_arg {
+	nvlist_t *ddsa_snaps;
+	nvlist_t *ddsa_props;
+	nvlist_t *ddsa_errors;
+	cred_t *ddsa_cr;
+} dsl_dataset_snapshot_arg_t;
 
-static int
-dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag,
-    dmu_tx_t *tx)
+int
+dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
+    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
 {
-	dsl_dataset_t *ds = dsda->ds;
-	dsl_dataset_t *ds_prev = ds->ds_prev;
+	int error;
+	uint64_t value;
 
-	if (dsl_dataset_might_destroy_origin(ds_prev)) {
-		struct dsl_ds_destroyarg ndsda = {0};
+	ds->ds_trysnap_txg = tx->tx_txg;
 
-		/*
-		 * If we're not prepared to remove the origin, don't remove
-		 * the clone either.
-		 */
-		if (dsda->rm_origin == NULL) {
-			dsda->need_prep = B_TRUE;
-			return (EBUSY);
-		}
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
 
-		ndsda.ds = ds_prev;
-		ndsda.is_origin_rm = B_TRUE;
-		return (dsl_dataset_destroy_check(&ndsda, tag, tx));
-	}
+	/*
+	 * We don't allow multiple snapshots of the same txg.  If there
+	 * is already one, try again.
+	 */
+	if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
+		return (SET_ERROR(EAGAIN));
 
 	/*
-	 * If we're not going to remove the origin after all,
-	 * undo the open context setup.
+	 * Check for conflicting snapshot name.
 	 */
-	if (dsda->rm_origin != NULL) {
-		dsl_dataset_disown(dsda->rm_origin, tag);
-		dsda->rm_origin = NULL;
+	error = dsl_dataset_snap_lookup(ds, snapname, &value);
+	if (error == 0)
+		return (SET_ERROR(EEXIST));
+	if (error != ENOENT)
+		return (error);
+
+	/*
+	 * We don't allow taking snapshots of inconsistent datasets, such as
+	 * those into which we are currently receiving.  However, if we are
+	 * creating this snapshot as part of a receive, this check will be
+	 * executed atomically with respect to the completion of the receive
+	 * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
+	 * case we ignore this, knowing it will be fixed up for us shortly in
+	 * dmu_recv_end_sync().
+	 */
+	if (!recv && DS_IS_INCONSISTENT(ds))
+		return (SET_ERROR(EBUSY));
+
+	/*
+	 * Skip the check for temporary snapshots or if we have already checked
+	 * the counts in dsl_dataset_snapshot_check. This means we really only
+	 * check the count here when we're receiving a stream.
+	 */
+	if (cnt != 0 && cr != NULL) {
+		error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
+		if (error != 0)
+			return (error);
 	}
 
+	error = dsl_dataset_snapshot_reserve_space(ds, tx);
+	if (error != 0)
+		return (error);
+
 	return (0);
 }
 
-/* ARGSUSED */
-int
-dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
+static int
+dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
 {
-	struct dsl_ds_destroyarg *dsda = arg1;
-	dsl_dataset_t *ds = dsda->ds;
-
-	/* we have an owner hold, so noone else can destroy us */
-	ASSERT(!DSL_DATASET_IS_DESTROYED(ds));
-
-	/*
-	 * Only allow deferred destroy on pools that support it.
-	 * NOTE: deferred destroy is only supported on snapshots.
-	 */
-	if (dsda->defer) {
-		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
-		    SPA_VERSION_USERREFS)
-			return (ENOTSUP);
-		ASSERT(dsl_dataset_is_snapshot(ds));
-		return (0);
-	}
+	dsl_dataset_snapshot_arg_t *ddsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvpair_t *pair;
+	int rv = 0;
 
 	/*
-	 * Can't delete a head dataset if there are snapshots of it.
-	 * (Except if the only snapshots are from the branch we cloned
-	 * from.)
+	 * Pre-compute how many total new snapshots will be created for each
+	 * level in the tree and below. This is needed for validating the
+	 * snapshot limit when taking either a recursive snapshot or
+	 * multiple snapshots.
+	 *
+	 * The problem is that the counts are not actually adjusted when
+	 * we are checking, only when we finally sync. For a single snapshot,
+	 * this is easy: the count will increase by 1 at each node up the tree,
+	 * but it's more complicated for the recursive/multiple snapshot case.
+	 *
+	 * The dsl_fs_ss_limit_check function does recursively check the count
+	 * at each level up the tree but since it is validating each snapshot
+	 * independently we need to be sure that we are validating the complete
+	 * count for the entire set of snapshots. We do this by rolling up the
+	 * counts for each component of the name into an nvlist and then
+	 * checking each of those cases with the aggregated count.
+	 *
+	 * This approach properly handles not only the recursive snapshot
+	 * case (where we get all of those on the ddsa_snaps list) but also
+	 * the sibling case (e.g. snapshot a/b and a/c so that we will also
+	 * validate the limit on 'a' using a count of 2).
+	 *
+	 * We validate the snapshot names in the third loop and only report
+	 * name errors once.
 	 */
-	if (ds->ds_prev != NULL &&
-	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
-		return (EBUSY);
+	if (dmu_tx_is_syncing(tx)) {
+		nvlist_t *cnt_track = fnvlist_alloc();
+
+		/* Rollup aggregated counts into the cnt_track list */
+		for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+		    pair != NULL;
+		    pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+			char *pdelim;
+			uint64_t val;
+			char nm[MAXPATHLEN];
 
-	/*
-	 * If we made changes this txg, traverse_dsl_dataset won't find
-	 * them.  Try again.
-	 */
-	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
-		return (EAGAIN);
+			(void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
+			pdelim = strchr(nm, '@');
+			if (pdelim == NULL)
+				continue;
+			*pdelim = '\0';
+
+			do {
+				if (nvlist_lookup_uint64(cnt_track, nm,
+				    &val) == 0) {
+					/* update existing entry */
+					fnvlist_add_uint64(cnt_track, nm,
+					    val + 1);
+				} else {
+					/* add to list */
+					fnvlist_add_uint64(cnt_track, nm, 1);
+				}
 
-	if (dsl_dataset_is_snapshot(ds)) {
-		/*
-		 * If this snapshot has an elevated user reference count,
-		 * we can't destroy it yet.
-		 */
-		if (ds->ds_userrefs > 0 && !dsda->releasing)
-			return (EBUSY);
+				pdelim = strrchr(nm, '/');
+				if (pdelim != NULL)
+					*pdelim = '\0';
+			} while (pdelim != NULL);
+		}
+
+		/* Check aggregated counts at each level */
+		for (pair = nvlist_next_nvpair(cnt_track, NULL);
+		    pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
+			int error = 0;
+			char *name;
+			uint64_t cnt = 0;
+			dsl_dataset_t *ds;
+
+			name = nvpair_name(pair);
+			cnt = fnvpair_value_uint64(pair);
+			ASSERT(cnt > 0);
+
+			error = dsl_dataset_hold(dp, name, FTAG, &ds);
+			if (error == 0) {
+				error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+				    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
+				    ddsa->ddsa_cr);
+				dsl_dataset_rele(ds, FTAG);
+			}
 
-		mutex_enter(&ds->ds_lock);
-		/*
-		 * Can't delete a branch point. However, if we're destroying
-		 * a clone and removing its origin due to it having a user
-		 * hold count of 0 and having been marked for deferred destroy,
-		 * it's OK for the origin to have a single clone.
-		 */
-		if (ds->ds_phys->ds_num_children >
-		    (dsda->is_origin_rm ? 2 : 1)) {
-			mutex_exit(&ds->ds_lock);
-			return (EEXIST);
+			if (error != 0) {
+				if (ddsa->ddsa_errors != NULL)
+					fnvlist_add_int32(ddsa->ddsa_errors,
+					    name, error);
+				rv = error;
+				/* only report one error for this check */
+				break;
+			}
 		}
-		mutex_exit(&ds->ds_lock);
-	} else if (dsl_dir_is_clone(ds->ds_dir)) {
-		return (dsl_dataset_origin_check(dsda, arg2, tx));
+		nvlist_free(cnt_track);
 	}
 
-	/* XXX we should do some i/o error checking... */
-	return (0);
-}
-
-struct refsarg {
-	kmutex_t lock;
-	boolean_t gone;
-	kcondvar_t cv;
-};
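+	/* Validate the snapshot names and run the per-snapshot checks. */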
+	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+		int error = 0;
+		dsl_dataset_t *ds;
+		char *name, *atp;
+		char dsname[ZFS_MAX_DATASET_NAME_LEN];
 
-/* ARGSUSED */
-static void
-dsl_dataset_refs_gone(dmu_buf_t *db, void *argv)
-{
-	struct refsarg *arg = argv;
-
-	mutex_enter(&arg->lock);
-	arg->gone = TRUE;
-	cv_signal(&arg->cv);
-	mutex_exit(&arg->lock);
-}
-
-static void
-dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag)
-{
-	struct refsarg arg;
-
-	mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&arg.cv, NULL, CV_DEFAULT, NULL);
-	arg.gone = FALSE;
-	(void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys,
-	    dsl_dataset_refs_gone);
-	dmu_buf_rele(ds->ds_dbuf, tag);
-	mutex_enter(&arg.lock);
-	while (!arg.gone)
-		cv_wait(&arg.cv, &arg.lock);
-	ASSERT(arg.gone);
-	mutex_exit(&arg.lock);
-	ds->ds_dbuf = NULL;
-	ds->ds_phys = NULL;
-	mutex_destroy(&arg.lock);
-	cv_destroy(&arg.cv);
-}
-
-static void
-remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx)
-{
-	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	uint64_t count;
-	int err;
-
-	ASSERT(ds->ds_phys->ds_num_children >= 2);
-	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
-	/*
-	 * The err should not be ENOENT, but a bug in a previous version
-	 * of the code could cause upgrade_clones_cb() to not set
-	 * ds_next_snap_obj when it should, leading to a missing entry.
-	 * If we knew that the pool was created after
-	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
-	 * ENOENT.  However, at least we can check that we don't have
-	 * too many entries in the next_clones_obj even after failing to
-	 * remove this one.
-	 */
-	if (err != ENOENT) {
-		VERIFY3U(err, ==, 0);
-	}
-	ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj,
-	    &count));
-	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
-}
-
-void
-dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
-{
-	struct dsl_ds_destroyarg *dsda = arg1;
-	dsl_dataset_t *ds = dsda->ds;
-	int err;
-	int after_branch_point = FALSE;
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-	objset_t *mos = dp->dp_meta_objset;
-	dsl_dataset_t *ds_prev = NULL;
-	uint64_t obj;
-
-	ASSERT(ds->ds_owner);
-	ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1);
-	ASSERT(ds->ds_prev == NULL ||
-	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
-	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
-
-	if (dsda->defer) {
-		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
-		if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) {
-			dmu_buf_will_dirty(ds->ds_dbuf, tx);
-			ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
-			return;
-		}
-	}
-
-	/* signal any waiters that this dataset is going away */
-	mutex_enter(&ds->ds_lock);
-	ds->ds_owner = dsl_reaper;
-	cv_broadcast(&ds->ds_exclusive_cv);
-	mutex_exit(&ds->ds_lock);
-
-	/* Remove our reservation */
-	if (ds->ds_reserved != 0) {
-		dsl_prop_setarg_t psa;
-		uint64_t value = 0;
-
-		dsl_prop_setarg_init_uint64(&psa, "refreservation",
-		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
-		    &value);
-		psa.psa_effective_value = 0;	/* predict default value */
-
-		dsl_dataset_set_reservation_sync(ds, &psa, cr, tx);
-		ASSERT3U(ds->ds_reserved, ==, 0);
-	}
-
-	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
-
-	dsl_pool_ds_destroyed(ds, tx);
-
-	obj = ds->ds_object;
-
-	if (ds->ds_phys->ds_prev_snap_obj != 0) {
-		if (ds->ds_prev) {
-			ds_prev = ds->ds_prev;
-		} else {
-			VERIFY(0 == dsl_dataset_hold_obj(dp,
-			    ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
-		}
-		after_branch_point =
-		    (ds_prev->ds_phys->ds_next_snap_obj != obj);
-
-		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
-		if (after_branch_point &&
-		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
-			remove_from_next_clones(ds_prev, obj, tx);
-			if (ds->ds_phys->ds_next_snap_obj != 0) {
-				VERIFY(0 == zap_add_int(mos,
-				    ds_prev->ds_phys->ds_next_clones_obj,
-				    ds->ds_phys->ds_next_snap_obj, tx));
-			}
+		name = nvpair_name(pair);
+		if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)
+			error = SET_ERROR(ENAMETOOLONG);
+		if (error == 0) {
+			atp = strchr(name, '@');
+			if (atp == NULL)
+				error = SET_ERROR(EINVAL);
+			if (error == 0)
+				(void) strlcpy(dsname, name, atp - name + 1);
 		}
-		if (after_branch_point &&
-		    ds->ds_phys->ds_next_snap_obj == 0) {
-			/* This clone is toast. */
-			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
-			ds_prev->ds_phys->ds_num_children--;
-
-			/*
-			 * If the clone's origin has no other clones, no
-			 * user holds, and has been marked for deferred
-			 * deletion, then we should have done the necessary
-			 * destroy setup for it.
-			 */
-			if (ds_prev->ds_phys->ds_num_children == 1 &&
-			    ds_prev->ds_userrefs == 0 &&
-			    DS_IS_DEFER_DESTROY(ds_prev)) {
-				ASSERT3P(dsda->rm_origin, !=, NULL);
-			} else {
-				ASSERT3P(dsda->rm_origin, ==, NULL);
-			}
-		} else if (!after_branch_point) {
-			ds_prev->ds_phys->ds_next_snap_obj =
-			    ds->ds_phys->ds_next_snap_obj;
-		}
-	}
-
-	if (ds->ds_phys->ds_next_snap_obj != 0) {
-		blkptr_t bp;
-		dsl_dataset_t *ds_next;
-		uint64_t itor = 0;
-		uint64_t old_unique;
-		int64_t used = 0, compressed = 0, uncompressed = 0;
-
-		VERIFY(0 == dsl_dataset_hold_obj(dp,
-		    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
-		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
-
-		old_unique = dsl_dataset_unique(ds_next);
-
-		dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
-		ds_next->ds_phys->ds_prev_snap_obj =
-		    ds->ds_phys->ds_prev_snap_obj;
-		ds_next->ds_phys->ds_prev_snap_txg =
-		    ds->ds_phys->ds_prev_snap_txg;
-		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
-		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
-
-		/*
-		 * Transfer to our deadlist (which will become next's
-		 * new deadlist) any entries from next's current
-		 * deadlist which were born before prev, and free the
-		 * other entries.
-		 *
-		 * XXX we're doing this long task with the config lock held
-		 */
-		while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) {
-			if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
-				VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
-				    &bp, tx));
-				if (ds_prev && !after_branch_point &&
-				    bp.blk_birth >
-				    ds_prev->ds_phys->ds_prev_snap_txg) {
-					ds_prev->ds_phys->ds_unique_bytes +=
-					    bp_get_dsize_sync(dp->dp_spa, &bp);
-				}
-			} else {
-				used += bp_get_dsize_sync(dp->dp_spa, &bp);
-				compressed += BP_GET_PSIZE(&bp);
-				uncompressed += BP_GET_UCSIZE(&bp);
-				dsl_free(dp, tx->tx_txg, &bp);
-			}
+		if (error == 0)
+			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+		if (error == 0) {
+			/* passing 0/NULL skips dsl_fs_ss_limit_check */
+			error = dsl_dataset_snapshot_check_impl(ds,
+			    atp + 1, tx, B_FALSE, 0, NULL);
+			dsl_dataset_rele(ds, FTAG);
 		}
 
-		ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
-
-		/* change snapused */
-		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
-		    -used, -compressed, -uncompressed, tx);
-
-		/* free next's deadlist */
-		bplist_close(&ds_next->ds_deadlist);
-		bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
-
-		/* set next's deadlist to our deadlist */
-		bplist_close(&ds->ds_deadlist);
-		ds_next->ds_phys->ds_deadlist_obj =
-		    ds->ds_phys->ds_deadlist_obj;
-		VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
-		    ds_next->ds_phys->ds_deadlist_obj));
-		ds->ds_phys->ds_deadlist_obj = 0;
-
-		if (ds_next->ds_phys->ds_next_snap_obj != 0) {
-			/*
-			 * Update next's unique to include blocks which
-			 * were previously shared by only this snapshot
-			 * and it.  Those blocks will be born after the
-			 * prev snap and before this snap, and will have
-			 * died after the next snap and before the one
-			 * after that (ie. be on the snap after next's
-			 * deadlist).
-			 *
-			 * XXX we're doing this long task with the
-			 * config lock held
-			 */
-			dsl_dataset_t *ds_after_next;
-			uint64_t space;
-
-			VERIFY(0 == dsl_dataset_hold_obj(dp,
-			    ds_next->ds_phys->ds_next_snap_obj,
-			    FTAG, &ds_after_next));
-
-			VERIFY(0 ==
-			    bplist_space_birthrange(&ds_after_next->ds_deadlist,
-			    ds->ds_phys->ds_prev_snap_txg,
-			    ds->ds_phys->ds_creation_txg, &space));
-			ds_next->ds_phys->ds_unique_bytes += space;
-
-			dsl_dataset_rele(ds_after_next, FTAG);
-			ASSERT3P(ds_next->ds_prev, ==, NULL);
-		} else {
-			ASSERT3P(ds_next->ds_prev, ==, ds);
-			dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
-			ds_next->ds_prev = NULL;
-			if (ds_prev) {
-				VERIFY(0 == dsl_dataset_get_ref(dp,
-				    ds->ds_phys->ds_prev_snap_obj,
-				    ds_next, &ds_next->ds_prev));
-			}
-
-			dsl_dataset_recalc_head_uniq(ds_next);
-
-			/*
-			 * Reduce the amount of our unconsmed refreservation
-			 * being charged to our parent by the amount of
-			 * new unique data we have gained.
-			 */
-			if (old_unique < ds_next->ds_reserved) {
-				int64_t mrsdelta;
-				uint64_t new_unique =
-				    ds_next->ds_phys->ds_unique_bytes;
-
-				ASSERT(old_unique <= new_unique);
-				mrsdelta = MIN(new_unique - old_unique,
-				    ds_next->ds_reserved - old_unique);
-				dsl_dir_diduse_space(ds->ds_dir,
-				    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
+		if (error != 0) {
+			if (ddsa->ddsa_errors != NULL) {
+				fnvlist_add_int32(ddsa->ddsa_errors,
+				    name, error);
 			}
+			rv = error;
 		}
-		dsl_dataset_rele(ds_next, FTAG);
-	} else {
-		/*
-		 * There's no next snapshot, so this is a head dataset.
-		 * Destroy the deadlist.  Unless it's a clone, the
-		 * deadlist should be empty.  (If it's a clone, it's
-		 * safe to ignore the deadlist contents.)
-		 */
-		struct killarg ka;
-
-		ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
-		bplist_close(&ds->ds_deadlist);
-		bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
-		ds->ds_phys->ds_deadlist_obj = 0;
-
-		/*
-		 * Free everything that we point to (that's born after
-		 * the previous snapshot, if we are a clone)
-		 *
-		 * NB: this should be very quick, because we already
-		 * freed all the objects in open context.
-		 */
-		ka.ds = ds;
-		ka.tx = tx;
-		err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
-		    TRAVERSE_POST, kill_blkptr, &ka);
-		ASSERT3U(err, ==, 0);
-		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
-		    ds->ds_phys->ds_unique_bytes == 0);
-
-		if (ds->ds_prev != NULL) {
-			dsl_dataset_rele(ds->ds_prev, ds);
-			ds->ds_prev = ds_prev = NULL;
-		}
-	}
-
-	if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) {
-		/* Erase the link in the dir */
-		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
-		ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
-		ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
-		err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx);
-		ASSERT(err == 0);
-	} else {
-		/* remove from snapshot namespace */
-		dsl_dataset_t *ds_head;
-		ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
-		VERIFY(0 == dsl_dataset_hold_obj(dp,
-		    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
-		VERIFY(0 == dsl_dataset_get_snapname(ds));
-#ifdef ZFS_DEBUG
-		{
-			uint64_t val;
-
-			err = dsl_dataset_snap_lookup(ds_head,
-			    ds->ds_snapname, &val);
-			ASSERT3U(err, ==, 0);
-			ASSERT3U(val, ==, obj);
-		}
-#endif
-		err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx);
-		ASSERT(err == 0);
-		dsl_dataset_rele(ds_head, FTAG);
-	}
-
-	if (ds_prev && ds->ds_prev != ds_prev)
-		dsl_dataset_rele(ds_prev, FTAG);
-
-	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
-	spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
-	    cr, "dataset = %llu", ds->ds_object);
-
-	if (ds->ds_phys->ds_next_clones_obj != 0) {
-		uint64_t count;
-		ASSERT(0 == zap_count(mos,
-		    ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
-		VERIFY(0 == dmu_object_free(mos,
-		    ds->ds_phys->ds_next_clones_obj, tx));
-	}
-	if (ds->ds_phys->ds_props_obj != 0)
-		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
-	if (ds->ds_phys->ds_userrefs_obj != 0)
-		VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
-	dsl_dir_close(ds->ds_dir, ds);
-	ds->ds_dir = NULL;
-	dsl_dataset_drain_refs(ds, tag);
-	VERIFY(0 == dmu_object_free(mos, obj, tx));
-
-	if (dsda->rm_origin) {
-		/*
-		 * Remove the origin of the clone we just destroyed.
-		 */
-		struct dsl_ds_destroyarg ndsda = {0};
-
-		ndsda.ds = dsda->rm_origin;
-		dsl_dataset_destroy_sync(&ndsda, tag, cr, tx);
 	}
-}
-
-static int
-dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
-	uint64_t asize;
-
-	if (!dmu_tx_is_syncing(tx))
-		return (0);
-
-	/*
-	 * If there's an fs-only reservation, any blocks that might become
-	 * owned by the snapshot dataset must be accommodated by space
-	 * outside of the reservation.
-	 */
-	asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
-	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE))
-		return (ENOSPC);
-
-	/*
-	 * Propogate any reserved space for this snapshot to other
-	 * snapshot checks in this sync group.
-	 */
-	if (asize > 0)
-		dsl_dir_willuse_space(ds->ds_dir, asize, tx);
-
-	return (0);
-}
-
-/* ARGSUSED */
-int
-dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	const char *snapname = arg2;
-	int err;
-	uint64_t value;
-
-	/*
-	 * We don't allow multiple snapshots of the same txg.  If there
-	 * is already one, try again.
-	 */
-	if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
-		return (EAGAIN);
-
-	/*
-	 * Check for conflicting name snapshot name.
-	 */
-	err = dsl_dataset_snap_lookup(ds, snapname, &value);
-	if (err == 0)
-		return (EEXIST);
-	if (err != ENOENT)
-		return (err);
-
-	/*
-	 * Check that the dataset's name is not too long.  Name consists
-	 * of the dataset's length + 1 for the @-sign + snapshot name's length
-	 */
-	if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN)
-		return (ENAMETOOLONG);
-
-	err = dsl_dataset_snapshot_reserve_space(ds, tx);
-	if (err)
-		return (err);
 
-	ds->ds_trysnap_txg = tx->tx_txg;
-	return (0);
+	return (rv);
 }
 
 void
-dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
+    dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	const char *snapname = arg2;
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
 	uint64_t dsobj, crtxg;
 	objset_t *mos = dp->dp_meta_objset;
-	int err;
+	objset_t *os;
+
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+	/*
+	 * If we are on an old pool, the zil must not be active, in which
+	 * case it will be zeroed.  Usually zil_suspend() accomplishes this.
+	 */
+	ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
+	    dmu_objset_from_ds(ds, &os) != 0 ||
+	    bcmp(&os->os_phys->os_zil_header, &zero_zil,
+	    sizeof (zero_zil)) == 0);
+
+	/* Should not snapshot a dirty dataset. */
+	ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
+	    ds, tx->tx_txg));
 
-	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
+	dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
 
 	/*
 	 * The origin's ds_creation_txg has to be < TXG_INITIAL
@@ -1909,44 +1415,56 @@ dsl_dataset_snapshot_sync(void *arg1, vo
 
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
-	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 	bzero(dsphys, sizeof (dsl_dataset_phys_t));
 	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
 	dsphys->ds_fsid_guid = unique_create();
-	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
-	    sizeof (dsphys->ds_guid));
-	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
-	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
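+	/* Retry until we draw a nonzero guid. */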
+	do {
+		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+		    sizeof (dsphys->ds_guid));
+	} while (dsphys->ds_guid == 0);
+	dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+	dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 	dsphys->ds_next_snap_obj = ds->ds_object;
 	dsphys->ds_num_children = 1;
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = crtxg;
-	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
-	dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
-	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
-	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
-	dsphys->ds_flags = ds->ds_phys->ds_flags;
-	dsphys->ds_bp = ds->ds_phys->ds_bp;
+	dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+	dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
+	dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
+	dsphys->ds_uncompressed_bytes =
+	    dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+	dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	dmu_buf_rele(dbuf, FTAG);
 
-	ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
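+	/* The snapshot inherits all features in use on the head. */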
+	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+		if (ds->ds_feature_inuse[f])
+			dsl_dataset_activate_feature(dsobj, f, tx);
+	}
+
+	ASSERT3U(ds->ds_prev != 0, ==,
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 	if (ds->ds_prev) {
 		uint64_t next_clones_obj =
-		    ds->ds_prev->ds_phys->ds_next_clones_obj;
-		ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
+		    dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
+		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
 		    ds->ds_object ||
-		    ds->ds_prev->ds_phys->ds_num_children > 1);
-		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+		    dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
+		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+		    ds->ds_object) {
 			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
-			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
-			    ds->ds_prev->ds_phys->ds_creation_txg);
-			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
+			ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+			    dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
+			dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
 		} else if (next_clones_obj != 0) {
-			remove_from_next_clones(ds->ds_prev,
+			dsl_dataset_remove_from_next_clones(ds->ds_prev,
 			    dsphys->ds_next_snap_obj, tx);
-			VERIFY3U(0, ==, zap_add_int(mos,
+			VERIFY0(zap_add_int(mos,
 			    next_clones_obj, dsobj, tx));
 		}
 	}
@@ -1957,148 +1475,584 @@ dsl_dataset_snapshot_sync(void *arg1, vo
 	 * since our unique space is going to zero.
 	 */
 	if (ds->ds_reserved) {
-		int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved);
+		int64_t delta;
+		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+		delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
+		    ds->ds_reserved);
 		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
-		    add, 0, 0, tx);
+		    delta, 0, 0, tx);
 	}
 
-	bplist_close(&ds->ds_deadlist);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
-	ds->ds_phys->ds_prev_snap_obj = dsobj;
-	ds->ds_phys->ds_prev_snap_txg = crtxg;
-	ds->ds_phys->ds_unique_bytes = 0;
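+	/*
+	 * The snapshot keeps the existing deadlist object; give the head a
+	 * clone of it, with a new key at the previous snapshot's txg.
+	 */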
+	dsl_dataset_phys(ds)->ds_deadlist_obj =
+	    dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_open(&ds->ds_deadlist, mos,
+	    dsl_dataset_phys(ds)->ds_deadlist_obj);
+	dsl_deadlist_add_key(&ds->ds_deadlist,
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+
+	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
+	dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
+	dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
+	dsl_dataset_phys(ds)->ds_unique_bytes = 0;
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
-		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-	ds->ds_phys->ds_deadlist_obj =
-	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
-	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
-	    ds->ds_phys->ds_deadlist_obj));
-
-	dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
-	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
-	    snapname, 8, 1, &dsobj, tx);
-	ASSERT(err == 0);
+		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
+	VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+	    snapname, 8, 1, &dsobj, tx));
 
 	if (ds->ds_prev)
-		dsl_dataset_drop_ref(ds->ds_prev, ds);
-	VERIFY(0 == dsl_dataset_get_ref(dp,
-	    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+		dsl_dataset_rele(ds->ds_prev, ds);
+	VERIFY0(dsl_dataset_hold_obj(dp,
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
 
-	dsl_pool_ds_snapshotted(ds, tx);
+	dsl_scan_ds_snapshotted(ds, tx);
 
 	dsl_dir_snap_cmtime_update(ds->ds_dir);
 
-	spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
-	    "dataset = %llu", dsobj);
+	spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
 }
 
-void
-dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+static void
+dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
 {
-	ASSERT(dmu_tx_is_syncing(tx));
-	ASSERT(ds->ds_objset != NULL);
-	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+	dsl_dataset_snapshot_arg_t *ddsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvpair_t *pair;
 
-	/*
-	 * in case we had to change ds_fsid_guid when we opened it,
-	 * sync it out now.
-	 */
-	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-	ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
+	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+		dsl_dataset_t *ds;
+		char *name, *atp;
+		char dsname[ZFS_MAX_DATASET_NAME_LEN];
 
-	dsl_dir_dirty(ds->ds_dir, tx);
-	dmu_objset_sync(ds->ds_objset, zio, tx);
+		name = nvpair_name(pair);
+		atp = strchr(name, '@');
+		(void) strlcpy(dsname, name, atp - name + 1);
+		VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
+
+		dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
+		if (ddsa->ddsa_props != NULL) {
+			dsl_props_set_sync_impl(ds->ds_prev,
+			    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
+		}
+		dsl_dataset_rele(ds, FTAG);
+	}
 }
 
-void
-dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
+/*
+ * The snapshots must all be in the same pool.
+ * All-or-nothing: if there are any failures, nothing will be modified.
+ */
+int
+dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
 {
-	uint64_t refd, avail, uobjs, aobjs;
+	dsl_dataset_snapshot_arg_t ddsa;
+	nvpair_t *pair;
+	boolean_t needsuspend;
+	int error;
+	spa_t *spa;
+	char *firstname;
+	nvlist_t *suspended = NULL;
 
-	dsl_dir_stats(ds->ds_dir, nv);
+	pair = nvlist_next_nvpair(snaps, NULL);
+	if (pair == NULL)
+		return (0);
+	firstname = nvpair_name(pair);
 
-	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
+	error = spa_open(firstname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+	spa_close(spa, FTAG);
 
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
-	    ds->ds_phys->ds_creation_time);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
-	    ds->ds_phys->ds_creation_txg);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
-	    ds->ds_quota);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
-	    ds->ds_reserved);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
-	    ds->ds_phys->ds_guid);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
-	    dsl_dataset_unique(ds));
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
-	    ds->ds_object);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
-	    ds->ds_userrefs);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
-	    DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
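+	/*
+	 * On pools older than SPA_VERSION_FAST_SNAP, each dataset's ZIL
+	 * must be suspended (and thereby zeroed) before it is snapshotted.
+	 */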
+	if (needsuspend) {
+		suspended = fnvlist_alloc();
+		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+		    pair = nvlist_next_nvpair(snaps, pair)) {
+			char fsname[ZFS_MAX_DATASET_NAME_LEN];
+			char *snapname = nvpair_name(pair);
+			char *atp;
+			void *cookie;
+
+			atp = strchr(snapname, '@');
+			if (atp == NULL) {
+				error = SET_ERROR(EINVAL);
+				break;
+			}
+			(void) strlcpy(fsname, snapname, atp - snapname + 1);
 
-	if (ds->ds_phys->ds_next_snap_obj) {
-		/*
-		 * This is a snapshot; override the dd's space used with
-		 * our unique space and compression ratio.
-		 */
-		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
-		    ds->ds_phys->ds_unique_bytes);
-		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
-		    ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
-		    (ds->ds_phys->ds_uncompressed_bytes * 100 /
-		    ds->ds_phys->ds_compressed_bytes));
+			error = zil_suspend(fsname, &cookie);
+			if (error != 0)
+				break;
+			fnvlist_add_uint64(suspended, fsname,
+			    (uintptr_t)cookie);
+		}
+	}
+
+	ddsa.ddsa_snaps = snaps;
+	ddsa.ddsa_props = props;
+	ddsa.ddsa_errors = errors;
+	ddsa.ddsa_cr = CRED();
+
+	if (error == 0) {
+		error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
+		    dsl_dataset_snapshot_sync, &ddsa,
+		    fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
+	}
+
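+	/* Resume any suspended ZILs, regardless of the task's outcome. */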
+	if (suspended != NULL) {
+		for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
+		    pair = nvlist_next_nvpair(suspended, pair)) {
+			zil_resume((void *)(uintptr_t)
+			    fnvpair_value_uint64(pair));
+		}
+		fnvlist_free(suspended);
+	}
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	if (error == 0) {
+		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+		    pair = nvlist_next_nvpair(snaps, pair)) {
+			char *snapname = nvpair_name(pair);
+			zvol_create_minors(snapname);
+		}
 	}
+#endif
+#endif
+	return (error);
 }
 
-void
-dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
+typedef struct dsl_dataset_snapshot_tmp_arg {
+	const char *ddsta_fsname;
+	const char *ddsta_snapname;
+	minor_t ddsta_cleanup_minor;
+	const char *ddsta_htag;
+} dsl_dataset_snapshot_tmp_arg_t;
+
+static int
+dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
 {
-	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
-	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
-	stat->dds_guid = ds->ds_phys->ds_guid;
-	if (ds->ds_phys->ds_next_snap_obj) {
-		stat->dds_is_snapshot = B_TRUE;
-		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
-	} else {
-		stat->dds_is_snapshot = B_FALSE;
-		stat->dds_num_clones = 0;
+	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	int error;
+
+	error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	/* NULL cred means no limit check for tmp snapshot */
+	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
+	    tx, B_FALSE, 0, NULL);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
 	}
 
-	/* clone origin is really a dsl_dir thing... */
-	rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
-	if (dsl_dir_is_clone(ds->ds_dir)) {
-		dsl_dataset_t *ods;
-
-		VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
-		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
-		dsl_dataset_name(ods, stat->dds_origin);
-		dsl_dataset_drop_ref(ods, FTAG);
-	} else {
-		stat->dds_origin[0] = '\0';
+	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+	error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
+	    B_TRUE, tx);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
 	}
-	rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
 }
 
-uint64_t
-dsl_dataset_fsid_guid(dsl_dataset_t *ds)
+static void
+dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
 {
-	return (ds->ds_fsid_guid);
-}
+	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
 
-void
-dsl_dataset_space(dsl_dataset_t *ds,
+	VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
+
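+	/*
+	 * Take the snapshot, place the user hold on it, and mark it for
+	 * deferred destroy so it is destroyed once the hold is released.
+	 */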
+	dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
+	dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
+	    ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
+	dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
+
+	dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
+    minor_t cleanup_minor, const char *htag)
+{
+	dsl_dataset_snapshot_tmp_arg_t ddsta;
+	int error;
+	spa_t *spa;
+	boolean_t needsuspend;
+	void *cookie;
+
+	ddsta.ddsta_fsname = fsname;
+	ddsta.ddsta_snapname = snapname;
+	ddsta.ddsta_cleanup_minor = cleanup_minor;
+	ddsta.ddsta_htag = htag;
+
+	error = spa_open(fsname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+	spa_close(spa, FTAG);
+
+	if (needsuspend) {
+		error = zil_suspend(fsname, &cookie);
+		if (error != 0)
+			return (error);
+	}
+
+	error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
+	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
+
+	if (needsuspend)
+		zil_resume(cookie);
+	return (error);
+}
+
+void
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+{
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(ds->ds_objset != NULL);
+	ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
+
+	/*
+	 * in case we had to change ds_fsid_guid when we opened it,
+	 * sync it out now.
+	 */
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
+
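+	/*
+	 * Persist any resumable-receive state recorded for this txg,
+	 * then clear the in-core copies.
+	 */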
+	if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
+		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+		    ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
+		    &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
+		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+		    ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
+		    &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
+		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+		    ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
+		    &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
+		ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
+		ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
+		ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
+	}
+
+	dmu_objset_sync(ds->ds_objset, zio, tx);
+
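+	/* Activate any features this dataset started using this txg. */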
+	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+		if (ds->ds_feature_activation_needed[f]) {
+			if (ds->ds_feature_inuse[f])
+				continue;
+			dsl_dataset_activate_feature(ds->ds_object, f, tx);
+			ds->ds_feature_inuse[f] = B_TRUE;
+		}
+	}
+}
+
+static int
+deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_deadlist_t *dl = arg;
+	dsl_deadlist_insert(dl, bp, tx);
+	return (0);
+}
+
+void
+dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	objset_t *os = ds->ds_objset;
+
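+	/* Move blocks freed in this txg onto the dataset's deadlist. */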
+	bplist_iterate(&ds->ds_pending_deadlist,
+	    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+
+	ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
+
+	dmu_buf_rele(ds->ds_dbuf, ds);
+}
+
+static void
+get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
+{
+	uint64_t count = 0;
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	nvlist_t *propval = fnvlist_alloc();
+	nvlist_t *val;
+
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+
+	/*
+	 * We use nvlist_alloc() instead of fnvlist_alloc() because the
+	 * latter would allocate the list with the NV_UNIQUE_NAME flag.
+	 * As a result, every time a clone name is appended to the list
+	 * it would be (linearly) searched for a duplicate name.
+	 * We already know that all clone names must be unique and we
+	 * want to avoid the quadratic complexity of double-checking that
+	 * because we can have a large number of clones.
+	 */
+	VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP));
+
+	/*
+	 * There may be missing entries in ds_next_clones_obj
+	 * due to a bug in a previous version of the code.
+	 * Only trust it if it has the right number of entries.
+	 */
+	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+		VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+		    &count));
+	}
+	if (count != dsl_dataset_phys(ds)->ds_num_children - 1)
+		goto fail;
+	for (zap_cursor_init(&zc, mos,
+	    dsl_dataset_phys(ds)->ds_next_clones_obj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		dsl_dataset_t *clone;
+		char buf[ZFS_MAX_DATASET_NAME_LEN];
+		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+		    za.za_first_integer, FTAG, &clone));
+		dsl_dir_name(clone->ds_dir, buf);
+		fnvlist_add_boolean(val, buf);
+		dsl_dataset_rele(clone, FTAG);
+	}
+	zap_cursor_fini(&zc);
+	fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
+	fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval);
+fail:
+	nvlist_free(val);
+	nvlist_free(propval);
+}
+
+static void
+get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	if (dsl_dataset_has_resume_receive_state(ds)) {
+		char *str;
+		void *packed;
+		uint8_t *compressed;
+		uint64_t val;
+		nvlist_t *token_nv = fnvlist_alloc();
+		size_t packed_size, compressed_size;
+
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "fromguid", val);
+		}
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "object", val);
+		}
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "offset", val);
+		}
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "bytes", val);
+		}
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
+			fnvlist_add_uint64(token_nv, "toguid", val);
+		}
+		char buf[256];
+		if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
+			fnvlist_add_string(token_nv, "toname", buf);
+		}
+		if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+		    DS_FIELD_RESUME_EMBEDOK) == 0) {
+			fnvlist_add_boolean(token_nv, "embedok");
+		}
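+		/* Pack, compress, checksum, and hex-encode the resume token. */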
+		packed = fnvlist_pack(token_nv, &packed_size);
+		fnvlist_free(token_nv);
+		compressed = kmem_alloc(packed_size, KM_SLEEP);
+
+		compressed_size = gzip_compress(packed, compressed,
+		    packed_size, packed_size, 6);
+
+		zio_cksum_t cksum;
+		fletcher_4_native(compressed, compressed_size, NULL, &cksum);
+
+		str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
+		for (int i = 0; i < compressed_size; i++) {
+			(void) sprintf(str + i * 2, "%02x", compressed[i]);
+		}
+		str[compressed_size * 2] = '\0';
+		char *propval = kmem_asprintf("%u-%llx-%llx-%s",
+		    ZFS_SEND_RESUME_TOKEN_VERSION,
+		    (longlong_t)cksum.zc_word[0],
+		    (longlong_t)packed_size, str);
+		dsl_prop_nvlist_add_string(nv,
+		    ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
+		kmem_free(packed, packed_size);
+		kmem_free(str, compressed_size * 2 + 1);
+		kmem_free(compressed, packed_size);
+		strfree(propval);
+	}
+}
+
+void
+dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	uint64_t refd, avail, uobjs, aobjs, ratio;
+
+	ASSERT(dsl_pool_config_held(dp));
+
+	ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
+	    (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
+	    dsl_dataset_phys(ds)->ds_compressed_bytes);
+
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
+	    dsl_dataset_phys(ds)->ds_uncompressed_bytes);
+
+	if (ds->ds_is_snapshot) {
+		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
+		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
+		    dsl_dataset_phys(ds)->ds_unique_bytes);
+		get_clones_stat(ds, nv);
+	} else {
+		if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
+			char buf[ZFS_MAX_DATASET_NAME_LEN];
+			dsl_dataset_name(ds->ds_prev, buf);
+			dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf);
+		}
+
+		dsl_dir_stats(ds->ds_dir, nv);
+	}
+
+	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
+
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
+	    dsl_dataset_phys(ds)->ds_creation_time);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
+	    dsl_dataset_phys(ds)->ds_creation_txg);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
+	    ds->ds_quota);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
+	    ds->ds_reserved);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
+	    dsl_dataset_phys(ds)->ds_guid);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
+	    dsl_dataset_phys(ds)->ds_unique_bytes);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
+	    ds->ds_object);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
+	    ds->ds_userrefs);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
+	    DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
+
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+		uint64_t written, comp, uncomp;
+		dsl_dataset_t *prev;
+
+		int err = dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+		if (err == 0) {
+			err = dsl_dataset_space_written(prev, ds, &written,
+			    &comp, &uncomp);
+			dsl_dataset_rele(prev, FTAG);
+			if (err == 0) {
+				dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
+				    written);
+			}
+		}
+	}
+
+	if (!dsl_dataset_is_snapshot(ds)) {
+		/*
+		 * A failed "newfs" (e.g. full) resumable receive leaves
+		 * the stats set on this dataset.  Check here for the prop.
+		 */
+		get_receive_resume_stats(ds, nv);
+
+		/*
+		 * A failed incremental resumable receive leaves the
+		 * stats set on our child named "%recv".  Check the child
+		 * for the prop.
+		 */
+		/* 6 extra bytes for /%recv */
+		char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+		dsl_dataset_t *recv_ds;
+		dsl_dataset_name(ds, recvname);
+		if (strlcat(recvname, "/", sizeof (recvname)) <
+		    sizeof (recvname) &&
+		    strlcat(recvname, recv_clone_name, sizeof (recvname)) <
+		    sizeof (recvname) &&
+		    dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
+			get_receive_resume_stats(recv_ds, nv);
+			dsl_dataset_rele(recv_ds, FTAG);
+		}
+	}
+}
+
+void
+dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	ASSERT(dsl_pool_config_held(dp));
+
+	stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg;
+	stat->dds_inconsistent =
+	    dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT;
+	stat->dds_guid = dsl_dataset_phys(ds)->ds_guid;
+	stat->dds_origin[0] = '\0';
+	if (ds->ds_is_snapshot) {
+		stat->dds_is_snapshot = B_TRUE;
+		stat->dds_num_clones =
+		    dsl_dataset_phys(ds)->ds_num_children - 1;
+	} else {
+		stat->dds_is_snapshot = B_FALSE;
+		stat->dds_num_clones = 0;
+
+		if (dsl_dir_is_clone(ds->ds_dir)) {
+			dsl_dataset_t *ods;
+
+			VERIFY0(dsl_dataset_hold_obj(dp,
+			    dsl_dir_phys(ds->ds_dir)->dd_origin_obj,
+			    FTAG, &ods));
+			dsl_dataset_name(ods, stat->dds_origin);
+			dsl_dataset_rele(ods, FTAG);
+		}
+	}
+}
+
+uint64_t
+dsl_dataset_fsid_guid(dsl_dataset_t *ds)
+{
+	return (ds->ds_fsid_guid);
+}
+
+void
+dsl_dataset_space(dsl_dataset_t *ds,
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
-	*refdbytesp = ds->ds_phys->ds_used_bytes;
+	*refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
 	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
-	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
-		*availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
+	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
+		*availbytesp +=
+		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
 	if (ds->ds_quota != 0) {
 		/*
 		 * Adjust available bytes according to refquota
@@ -2109,316 +2063,452 @@ dsl_dataset_space(dsl_dataset_t *ds,
 		else
 			*availbytesp = 0;
 	}
-	*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	*usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
 }
 
 boolean_t
-dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
+dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
 {
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	uint64_t birth;
 
-	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
-	    dsl_pool_sync_context(dp));
-	if (ds->ds_prev == NULL)
+	ASSERT(dsl_pool_config_held(dp));
+	if (snap == NULL)
 		return (B_FALSE);
-	if (ds->ds_phys->ds_bp.blk_birth >
-	    ds->ds_prev->ds_phys->ds_creation_txg)
-		return (B_TRUE);
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	birth = dsl_dataset_get_blkptr(ds)->blk_birth;
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
+	if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
+		objset_t *os, *os_snap;
+		/*
+		 * It may be that only the ZIL differs, because it was
+		 * reset in the head.  Don't count that as being
+		 * modified.
+		 */
+		if (dmu_objset_from_ds(ds, &os) != 0)
+			return (B_TRUE);
+		if (dmu_objset_from_ds(snap, &os_snap) != 0)
+			return (B_TRUE);
+		return (bcmp(&os->os_phys->os_meta_dnode,
+		    &os_snap->os_phys->os_meta_dnode,
+		    sizeof (os->os_phys->os_meta_dnode)) != 0);
+	}
 	return (B_FALSE);
 }
 
+typedef struct dsl_dataset_rename_snapshot_arg {
+	const char *ddrsa_fsname;
+	const char *ddrsa_oldsnapname;
+	const char *ddrsa_newsnapname;
+	boolean_t ddrsa_recursive;
+	dmu_tx_t *ddrsa_tx;
+} dsl_dataset_rename_snapshot_arg_t;
+
 /* ARGSUSED */
 static int
-dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
+    dsl_dataset_t *hds, void *arg)
 {
-	dsl_dataset_t *ds = arg1;
-	char *newsnapname = arg2;
-	dsl_dir_t *dd = ds->ds_dir;
-	dsl_dataset_t *hds;
+	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+	int error;
 	uint64_t val;
-	int err;
-
-	err = dsl_dataset_hold_obj(dd->dd_pool,
-	    dd->dd_phys->dd_head_dataset_obj, FTAG, &hds);
-	if (err)
-		return (err);
 
-	/* new name better not be in use */
-	err = dsl_dataset_snap_lookup(hds, newsnapname, &val);
-	dsl_dataset_rele(hds, FTAG);
+	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+	if (error != 0) {
+		/* ignore nonexistent snapshots */
+		return (error == ENOENT ? 0 : error);
+	}
 
-	if (err == 0)
-		err = EEXIST;
-	else if (err == ENOENT)
-		err = 0;
+	/* new name should not exist */
+	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
+	if (error == 0)
+		error = SET_ERROR(EEXIST);
+	else if (error == ENOENT)
+		error = 0;
 
 	/* dataset name + 1 for the "@" + the new snapshot name must fit */
-	if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN)
-		err = ENAMETOOLONG;
+	if (dsl_dir_namelen(hds->ds_dir) + 1 +
+	    strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN)
+		error = SET_ERROR(ENAMETOOLONG);
 
-	return (err);
+	return (error);
 }
 
-static void
-dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2,
-    cred_t *cr, dmu_tx_t *tx)
+static int
+dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	const char *newsnapname = arg2;
-	dsl_dir_t *dd = ds->ds_dir;
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *hds;
-	int err;
-
-	ASSERT(ds->ds_phys->ds_next_snap_obj != 0);
-
-	VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
-	    dd->dd_phys->dd_head_dataset_obj, FTAG, &hds));
+	int error;
 
-	VERIFY(0 == dsl_dataset_get_snapname(ds));
-	err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx);
-	ASSERT3U(err, ==, 0);
-	mutex_enter(&ds->ds_lock);
-	(void) strcpy(ds->ds_snapname, newsnapname);
-	mutex_exit(&ds->ds_lock);
-	err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj,
-	    ds->ds_snapname, 8, 1, &ds->ds_object, tx);
-	ASSERT3U(err, ==, 0);
+	error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
+	if (error != 0)
+		return (error);
 
-	spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx,
-	    cr, "dataset = %llu", ds->ds_object);
+	if (ddrsa->ddrsa_recursive) {
+		error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+		    dsl_dataset_rename_snapshot_check_impl, ddrsa,
+		    DS_FIND_CHILDREN);
+	} else {
+		error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
+	}
 	dsl_dataset_rele(hds, FTAG);
+	return (error);
 }
 
-struct renamesnaparg {
-	dsl_sync_task_group_t *dstg;
-	char failed[MAXPATHLEN];
-	char *oldsnap;
-	char *newsnap;
-};
-
 static int
-dsl_snapshot_rename_one(const char *name, void *arg)
+dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
+    dsl_dataset_t *hds, void *arg)
 {
-	struct renamesnaparg *ra = arg;
-	dsl_dataset_t *ds = NULL;
-	char *snapname;
-	int err;
-
-	snapname = kmem_asprintf("%s@%s", name, ra->oldsnap);
-	(void) strlcpy(ra->failed, snapname, sizeof (ra->failed));
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	char *oldname, *newname;
+#endif
+#endif
+	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+	dsl_dataset_t *ds;
+	uint64_t val;
+	dmu_tx_t *tx = ddrsa->ddrsa_tx;
+	int error;
 
-	/*
-	 * For recursive snapshot renames the parent won't be changing
-	 * so we just pass name for both the to/from argument.
-	 */
-	err = zfs_secpolicy_rename_perms(snapname, snapname, CRED());
-	if (err != 0) {
-		strfree(snapname);
-		return (err == ENOENT ? 0 : err);
+	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+	ASSERT(error == 0 || error == ENOENT);
+	if (error == ENOENT) {
+		/* ignore nonexistent snapshots */
+		return (0);
 	}
 
+	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
+
+	/* log before we change the name */
+	spa_history_log_internal_ds(ds, "rename", tx,
+	    "-> @%s", ddrsa->ddrsa_newsnapname);
+
+	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
+	    B_FALSE));
+	mutex_enter(&ds->ds_lock);
+	(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
+	mutex_exit(&ds->ds_lock);
+	VERIFY0(zap_add(dp->dp_meta_objset,
+	    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
+	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
+
+#ifdef __FreeBSD__
 #ifdef _KERNEL
-	/*
-	 * For all filesystems undergoing rename, we'll need to unmount it.
-	 */
-	(void) zfs_unmount_snap(snapname, NULL);
+	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
+	    ddrsa->ddrsa_oldsnapname);
+	snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
+	    ddrsa->ddrsa_newsnapname);
+	zfsvfs_update_fromname(oldname, newname);
+	zvol_rename_minors(oldname, newname);
+	kmem_free(newname, MAXPATHLEN);
+	kmem_free(oldname, MAXPATHLEN);
 #endif
-	err = dsl_dataset_hold(snapname, ra->dstg, &ds);
-	strfree(snapname);
-	if (err != 0)
-		return (err == ENOENT ? 0 : err);
-
-	dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check,
-	    dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0);
+#endif
+	dsl_dataset_rele(ds, FTAG);
 
 	return (0);
 }
 
-static int
-dsl_recursive_rename(char *oldname, const char *newname)
+static void
+dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
 {
-	int err;
-	struct renamesnaparg *ra;
-	dsl_sync_task_t *dst;
-	spa_t *spa;
-	char *cp, *fsname = spa_strdup(oldname);
-	int len = strlen(oldname) + 1;
+	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *hds;
 
-	/* truncate the snapshot name to get the fsname */
-	cp = strchr(fsname, '@');
-	*cp = '\0';
-
-	err = spa_open(fsname, &spa, FTAG);
-	if (err) {
-		kmem_free(fsname, len);
-		return (err);
+	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
+	ddrsa->ddrsa_tx = tx;
+	if (ddrsa->ddrsa_recursive) {
+		VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+		    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
+		    DS_FIND_CHILDREN));
+	} else {
+		VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
 	}
-	ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP);
-	ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
+	dsl_dataset_rele(hds, FTAG);
+}
 
-	ra->oldsnap = strchr(oldname, '@') + 1;
-	ra->newsnap = strchr(newname, '@') + 1;
-	*ra->failed = '\0';
+int
+dsl_dataset_rename_snapshot(const char *fsname,
+    const char *oldsnapname, const char *newsnapname, boolean_t recursive)
+{
+	dsl_dataset_rename_snapshot_arg_t ddrsa;
 
-	err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra,
-	    DS_FIND_CHILDREN);
-	kmem_free(fsname, len);
+	ddrsa.ddrsa_fsname = fsname;
+	ddrsa.ddrsa_oldsnapname = oldsnapname;
+	ddrsa.ddrsa_newsnapname = newsnapname;
+	ddrsa.ddrsa_recursive = recursive;
+
+	return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
+	    dsl_dataset_rename_snapshot_sync, &ddrsa,
+	    1, ZFS_SPACE_CHECK_RESERVED));
+}
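
For reference, the new entry point can be driven as below; the pool,
filesystem, and snapshot names are placeholders (sketch only, not part of
the patch):

static int
example_rename(void)	/* hypothetical caller */
{
	/* Rename pool/fs@old to pool/fs@new in all descendants, too. */
	return (dsl_dataset_rename_snapshot("pool/fs", "old", "new",
	    B_TRUE));
}
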
 
-	if (err == 0) {
-		err = dsl_sync_task_group_wait(ra->dstg);
-	}
+/*
+ * If we're doing an ownership handoff, we need to make sure that there is
+ * only one long hold on the dataset.  We're not allowed to change anything
+ * in this check function, so we don't permanently release the long hold or
+ * regular hold here.  We only do this check when syncing, to avoid the
+ * dataset unexpectedly going away when we release the long hold.
+ */
+static int
+dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
+{
+	boolean_t held;
 
-	for (dst = list_head(&ra->dstg->dstg_tasks); dst;
-	    dst = list_next(&ra->dstg->dstg_tasks, dst)) {
-		dsl_dataset_t *ds = dst->dst_arg1;
-		if (dst->dst_err) {
-			dsl_dir_name(ds->ds_dir, ra->failed);
-			(void) strlcat(ra->failed, "@", sizeof (ra->failed));
-			(void) strlcat(ra->failed, ra->newsnap,
-			    sizeof (ra->failed));
-		}
-		dsl_dataset_rele(ds, ra->dstg);
-	}
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
 
-	if (err)
-		(void) strlcpy(oldname, ra->failed, sizeof (ra->failed));
+	if (owner != NULL) {
+		VERIFY3P(ds->ds_owner, ==, owner);
+		dsl_dataset_long_rele(ds, owner);
+	}
 
-	dsl_sync_task_group_destroy(ra->dstg);
-	kmem_free(ra, sizeof (struct renamesnaparg));
-	spa_close(spa, FTAG);
-	return (err);
-}
+	held = dsl_dataset_long_held(ds);
 
-static int
-dsl_valid_rename(const char *oldname, void *arg)
-{
-	int delta = *(int *)arg;
+	if (owner != NULL)
+		dsl_dataset_long_hold(ds, owner);
 
-	if (strlen(oldname) + delta >= MAXNAMELEN)
-		return (ENAMETOOLONG);
+	if (held)
+		return (SET_ERROR(EBUSY));
 
 	return (0);
 }
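
In the common non-handoff case (owner == NULL) this reduces to "fail with
EBUSY if any long hold exists", and per the function above it is only
meaningful in syncing context.  A sketch of that use (name hypothetical):

static int
example_busy_check(dsl_dataset_t *ds, dmu_tx_t *tx)	/* hypothetical */
{
	/* No owner handoff: just refuse if anyone holds ds long-term. */
	return (dsl_dataset_handoff_check(ds, NULL, tx));
}
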
 
-#pragma weak dmu_objset_rename = dsl_dataset_rename
-int
-dsl_dataset_rename(const char *oldname, const char *newname, boolean_t recursive)
+typedef struct dsl_dataset_rollback_arg {
+	const char *ddra_fsname;
+	void *ddra_owner;
+	nvlist_t *ddra_result;
+} dsl_dataset_rollback_arg_t;
+
+static int
+dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd;
+	dsl_dataset_rollback_arg_t *ddra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
-	const char *tail;
-	int err;
+	int64_t unused_refres_delta;
+	int error;
 
-	err = dsl_dir_open(oldname, FTAG, &dd, &tail);
-	if (err)
-		return (err);
+	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
+	if (error != 0)
+		return (error);
 
-	if (tail == NULL) {
-		int delta = strlen(newname) - strlen(oldname);
+	/* must not be a snapshot */
+	if (ds->ds_is_snapshot) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
 
-		/* if we're growing, validate child name lengths */
-		if (delta > 0)
-			err = dmu_objset_find(oldname, dsl_valid_rename,
-			    &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+	/* must have a most recent snapshot */
+	if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
 
-		if (!err) {
-			/*
-			 * If there are more than 2 references there may be
-			 * holds hanging around that haven't been cleared
-			 * out yet.
-			 */
-			if (dmu_buf_refcount(dd->dd_dbuf) > 2)
-				txg_wait_synced(dd->dd_pool, 0);
+	/*
+	 * No rollback to a snapshot created in the current txg, because
+	 * the rollback may dirty the dataset and create blocks that are
+	 * not reachable from the rootbp while having a birth txg that
+	 * falls into the snapshot's range.
+	 */
+	if (dmu_tx_is_syncing(tx) &&
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EAGAIN));
+	}
 
-			err = dsl_dir_rename(dd, newname);
+	/* must not have any bookmarks after the most recent snapshot */
+	nvlist_t *proprequest = fnvlist_alloc();
+	fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
+	nvlist_t *bookmarks = fnvlist_alloc();
+	error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
+	fnvlist_free(proprequest);
+	if (error != 0)
+		return (error);
+	for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
+		nvlist_t *valuenv =
+		    fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
+		    zfs_prop_to_name(ZFS_PROP_CREATETXG));
+		uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
+		if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+			fnvlist_free(bookmarks);
+			dsl_dataset_rele(ds, FTAG);
+			return (SET_ERROR(EEXIST));
 		}
-		dsl_dir_close(dd, FTAG);
-		return (err);
 	}
+	fnvlist_free(bookmarks);
 
-	if (tail[0] != '@') {
-		/* the name ended in a nonexistent component */
-		dsl_dir_close(dd, FTAG);
-		return (ENOENT);
-	}
-
-	dsl_dir_close(dd, FTAG);
-
-	/* new name must be snapshot in same filesystem */
-	tail = strchr(newname, '@');
-	if (tail == NULL)
-		return (EINVAL);
-	tail++;
-	if (strncmp(oldname, newname, tail - newname) != 0)
-		return (EXDEV);
+	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
 
-	if (recursive) {
-		err = dsl_recursive_rename(oldname, newname);
-	} else {
-		err = dsl_dataset_hold(oldname, FTAG, &ds);
-		if (err)
-			return (err);
+	/*
+	 * Check if the snap we are rolling back to uses more than
+	 * the refquota.
+	 */
+	if (ds->ds_quota != 0 &&
+	    dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EDQUOT));
+	}
 
-		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-		    dsl_dataset_snapshot_rename_check,
-		    dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1);
+	/*
+	 * When we do the clone swap, we will temporarily use more space
+	 * due to the refreservation (the head will no longer have any
+	 * unique space, so the entire amount of the refreservation will need
+	 * to be free).  We will immediately destroy the clone, freeing
+	 * this space, but the freeing happens over many txg's.
+	 */
+	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
+	    dsl_dataset_phys(ds)->ds_unique_bytes);
 
+	if (unused_refres_delta > 0 &&
+	    unused_refres_delta >
+	    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
 		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(ENOSPC));
 	}
 
-	return (err);
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
 }
 
-struct promotenode {
-	list_node_t link;
-	dsl_dataset_t *ds;
+static void
+dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_rollback_arg_t *ddra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds, *clone;
+	uint64_t cloneobj;
+	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+
+	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
+
+	dsl_dataset_name(ds->ds_prev, namebuf);
+	fnvlist_add_string(ddra->ddra_result, "target", namebuf);
+
+	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
+	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
+
+	VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
+
+	dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
+	dsl_dataset_zero_zil(ds, tx);
+
+	dsl_destroy_head_sync_impl(clone, tx);
+
+	dsl_dataset_rele(clone, FTAG);
+	dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Rolls back the given filesystem or volume to the most recent snapshot.
+ * The name of the most recent snapshot will be returned under key "target"
+ * in the result nvlist.
+ *
+ * If owner != NULL:
+ * - The existing dataset MUST be owned by the specified owner at entry
+ * - Upon return, dataset will still be held by the same owner, whether we
+ *   succeed or not.
+ *
+ * This mode is required any time the existing filesystem is mounted.  See
+ * notes above zfs_suspend_fs() for further details.
+ */
+int
+dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
+{
+	dsl_dataset_rollback_arg_t ddra;
+
+	ddra.ddra_fsname = fsname;
+	ddra.ddra_owner = owner;
+	ddra.ddra_result = result;
+
+	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
+	    dsl_dataset_rollback_sync, &ddra,
+	    1, ZFS_SPACE_CHECK_RESERVED));
+}
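
A hypothetical usage sketch; the dataset name is a placeholder and printf()
stands in for whatever reporting a real caller would do.  The "target" key
is a string, per dsl_dataset_rollback_sync() above:

static int
example_rollback(void)	/* hypothetical caller */
{
	nvlist_t *result = fnvlist_alloc();
	int error = dsl_dataset_rollback("pool/fs", NULL, result);

	if (error == 0) {
		/* Report which snapshot we rolled back to. */
		printf("rolled back to %s\n",
		    fnvlist_lookup_string(result, "target"));
	}
	fnvlist_free(result);
	return (error);
}
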
+
+struct promotenode {
+	list_node_t link;
+	dsl_dataset_t *ds;
 };
 
-struct promotearg {
+typedef struct dsl_dataset_promote_arg {
+	const char *ddpa_clonename;
+	dsl_dataset_t *ddpa_clone;
 	list_t shared_snaps, origin_snaps, clone_snaps;
-	dsl_dataset_t *origin_origin, *origin_head;
+	dsl_dataset_t *origin_origin; /* origin of the origin */
 	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
 	char *err_ds;
-};
+	cred_t *cr;
+} dsl_dataset_promote_arg_t;
 
 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
+static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
+    void *tag);
+static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
 
-/* ARGSUSED */
 static int
-dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *hds = arg1;
-	struct promotearg *pa = arg2;
-	struct promotenode *snap = list_head(&pa->shared_snaps);
-	dsl_dataset_t *origin_ds = snap->ds;
+	dsl_dataset_promote_arg_t *ddpa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *hds;
+	struct promotenode *snap;
+	dsl_dataset_t *origin_ds;
 	int err;
+	uint64_t unused;
+	uint64_t ss_mv_cnt;
+	size_t max_snap_len;
 
-	/* Check that it is a real clone */
-	if (!dsl_dir_is_clone(hds->ds_dir))
-		return (EINVAL);
+	err = promote_hold(ddpa, dp, FTAG);
+	if (err != 0)
+		return (err);
 
-	/* Since this is so expensive, don't do the preliminary check */
-	if (!dmu_tx_is_syncing(tx))
+	hds = ddpa->ddpa_clone;
+	max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
+
+	if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
+		promote_rele(ddpa, FTAG);
+		return (SET_ERROR(EXDEV));
+	}
+
+	/*
+	 * Compute and check the amount of space to transfer.  Since this is
+	 * so expensive, don't do the preliminary check.
+	 */
+	if (!dmu_tx_is_syncing(tx)) {
+		promote_rele(ddpa, FTAG);
 		return (0);
+	}
 
-	if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)
-		return (EXDEV);
+	snap = list_head(&ddpa->shared_snaps);
+	origin_ds = snap->ds;
 
 	/* compute origin's new unique space */
-	snap = list_tail(&pa->clone_snaps);
-	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
-	err = bplist_space_birthrange(&snap->ds->ds_deadlist,
-	    origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique);
-	if (err)
-		return (err);
+	snap = list_tail(&ddpa->clone_snaps);
+	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+	    origin_ds->ds_object);
+	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+	    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
+	    &ddpa->unique, &unused, &unused);
 
 	/*
 	 * Walk the snapshots that we are moving
 	 *
 	 * Compute space to transfer.  Consider the incremental changes
-	 * to used for each snapshot:
+	 * to used by each snapshot:
 	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
 	 * So each snapshot gave birth to:
 	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
@@ -2429,51 +2519,71 @@ dsl_dataset_promote_check(void *arg1, vo
 	 * Note however, if we stop before we reach the ORIGIN we get:
 	 * uN + kN + kN-1 + ... + kM - uM-1
 	 */
-	pa->used = origin_ds->ds_phys->ds_used_bytes;
-	pa->comp = origin_ds->ds_phys->ds_compressed_bytes;
-	pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
-	for (snap = list_head(&pa->shared_snaps); snap;
-	    snap = list_next(&pa->shared_snaps, snap)) {
+	ss_mv_cnt = 0;
+	ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
+	ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
+	ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
+	for (snap = list_head(&ddpa->shared_snaps); snap;
+	    snap = list_next(&ddpa->shared_snaps, snap)) {
 		uint64_t val, dlused, dlcomp, dluncomp;
 		dsl_dataset_t *ds = snap->ds;
 
+		ss_mv_cnt++;
+
+		/*
+		 * If there are long holds, we won't be able to evict
+		 * the objset.
+		 */
+		if (dsl_dataset_long_held(ds)) {
+			err = SET_ERROR(EBUSY);
+			goto out;
+		}
+
 		/* Check that the snapshot name does not conflict */
-		VERIFY(0 == dsl_dataset_get_snapname(ds));
+		VERIFY0(dsl_dataset_get_snapname(ds));
+		if (strlen(ds->ds_snapname) >= max_snap_len) {
+			err = SET_ERROR(ENAMETOOLONG);
+			goto out;
+		}
 		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
 		if (err == 0) {
-			err = EEXIST;
+			(void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
+			err = SET_ERROR(EEXIST);
 			goto out;
 		}
 		if (err != ENOENT)
 			goto out;
 
 		/* The very first snapshot does not have a deadlist */
-		if (ds->ds_phys->ds_prev_snap_obj == 0)
+		if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
 			continue;
 
-		if (err = bplist_space(&ds->ds_deadlist,
-		    &dlused, &dlcomp, &dluncomp))
-			goto out;
-		pa->used += dlused;
-		pa->comp += dlcomp;
-		pa->uncomp += dluncomp;
+		dsl_deadlist_space(&ds->ds_deadlist,
+		    &dlused, &dlcomp, &dluncomp);
+		ddpa->used += dlused;
+		ddpa->comp += dlcomp;
+		ddpa->uncomp += dluncomp;
 	}
 
 	/*
 	 * If we are a clone of a clone then we never reached ORIGIN,
 	 * so we need to subtract out the clone origin's used space.
 	 */
-	if (pa->origin_origin) {
-		pa->used -= pa->origin_origin->ds_phys->ds_used_bytes;
-		pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes;
-		pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes;
+	if (ddpa->origin_origin) {
+		ddpa->used -=
+		    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
+		ddpa->comp -=
+		    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
+		ddpa->uncomp -=
+		    dsl_dataset_phys(ddpa->origin_origin)->
+		    ds_uncompressed_bytes;
 	}
 
-	/* Check that there is enough space here */
+	/* Check that there is enough space and limit headroom here */
 	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
-	    pa->used);
-	if (err)
-		return (err);
+	    0, ss_mv_cnt, ddpa->used, ddpa->cr);
+	if (err != 0)
+		goto out;
 
 	/*
 	 * Compute the amounts of space that will be used by snapshots
@@ -2481,120 +2591,217 @@ dsl_dataset_promote_check(void *arg1, vo
 	 * it is the amount of space that will be on all of their
 	 * deadlists (that was not born before their new origin).
 	 */
-	if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+	if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		uint64_t space;
 
 		/*
 		 * Note, typically this will not be a clone of a clone,
-		 * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so
-		 * these snaplist_space() -> bplist_space_birthrange()
+		 * so dd_origin_txg will be < TXG_INITIAL, so
+		 * these snaplist_space() -> dsl_deadlist_space_range()
 		 * calls will be fast because they do not have to
 		 * iterate over all bps.
 		 */
-		snap = list_head(&pa->origin_snaps);
-		err = snaplist_space(&pa->shared_snaps,
-		    snap->ds->ds_origin_txg, &pa->cloneusedsnap);
-		if (err)
-			return (err);
+		snap = list_head(&ddpa->origin_snaps);
+		err = snaplist_space(&ddpa->shared_snaps,
+		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
+		if (err != 0)
+			goto out;
 
-		err = snaplist_space(&pa->clone_snaps,
-		    snap->ds->ds_origin_txg, &space);
-		if (err)
-			return (err);
-		pa->cloneusedsnap += space;
+		err = snaplist_space(&ddpa->clone_snaps,
+		    snap->ds->ds_dir->dd_origin_txg, &space);
+		if (err != 0)
+			goto out;
+		ddpa->cloneusedsnap += space;
 	}
-	if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
-		err = snaplist_space(&pa->origin_snaps,
-		    origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap);
-		if (err)
-			return (err);
+	if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
+	    DD_FLAG_USED_BREAKDOWN) {
+		err = snaplist_space(&ddpa->origin_snaps,
+		    dsl_dataset_phys(origin_ds)->ds_creation_txg,
+		    &ddpa->originusedsnap);
+		if (err != 0)
+			goto out;
 	}
 
-	return (0);
 out:
-	pa->err_ds =  snap->ds->ds_snapname;
+	promote_rele(ddpa, FTAG);
 	return (err);
 }
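
To make the telescoping sum in the comment above concrete, a worked example
with made-up numbers:

/*
 * Worked example (made-up numbers): the origin snapshot references
 * uN = 100, the moved snapshots' deadlists hold k = {20, 5, 10}, and
 * the origin's origin references uM-1 = 30:
 *
 *	ddpa->used = uN + sum(k) - uM-1 = 100 + 35 - 30 = 105
 *
 * which is what the loop accumulates before the origin_origin
 * subtraction.
 */
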
 
 static void
-dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *hds = arg1;
-	struct promotearg *pa = arg2;
-	struct promotenode *snap = list_head(&pa->shared_snaps);
-	dsl_dataset_t *origin_ds = snap->ds;
+	dsl_dataset_promote_arg_t *ddpa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *hds;
+	struct promotenode *snap;
+	dsl_dataset_t *origin_ds;
 	dsl_dataset_t *origin_head;
-	dsl_dir_t *dd = hds->ds_dir;
-	dsl_pool_t *dp = hds->ds_dir->dd_pool;
+	dsl_dir_t *dd;
 	dsl_dir_t *odd = NULL;
 	uint64_t oldnext_obj;
 	int64_t delta;
+#if defined(__FreeBSD__) && defined(_KERNEL)
+	char *oldname, *newname;
+#endif
 
-	ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
+	VERIFY0(promote_hold(ddpa, dp, FTAG));
+	hds = ddpa->ddpa_clone;
 
-	snap = list_head(&pa->origin_snaps);
+	ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
+
+	snap = list_head(&ddpa->shared_snaps);
+	origin_ds = snap->ds;
+	dd = hds->ds_dir;
+
+	snap = list_head(&ddpa->origin_snaps);
 	origin_head = snap->ds;
 
 	/*
 	 * We need to explicitly open odd, since origin_ds's dd will be
 	 * changing.
 	 */
-	VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object,
+	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
 	    NULL, FTAG, &odd));
 
 	/* change origin's next snap */
 	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
-	oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
-	snap = list_tail(&pa->clone_snaps);
-	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
-	origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
+	oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
+	snap = list_tail(&ddpa->clone_snaps);
+	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+	    origin_ds->ds_object);
+	dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
 
 	/* change the origin's next clone */
-	if (origin_ds->ds_phys->ds_next_clones_obj) {
-		remove_from_next_clones(origin_ds, snap->ds->ds_object, tx);
-		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
-		    origin_ds->ds_phys->ds_next_clones_obj,
+	if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
+		dsl_dataset_remove_from_next_clones(origin_ds,
+		    snap->ds->ds_object, tx);
+		VERIFY0(zap_add_int(dp->dp_meta_objset,
+		    dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
 		    oldnext_obj, tx));
 	}
 
 	/* change origin */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
-	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
-	hds->ds_origin_txg = origin_head->ds_origin_txg;
+	ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
+	dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
+	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
 	dmu_buf_will_dirty(odd->dd_dbuf, tx);
-	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
-	origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg;
+	dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
+	origin_head->ds_dir->dd_origin_txg =
+	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
+
+	/* change dd_clone entries */
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+		VERIFY0(zap_remove_int(dp->dp_meta_objset,
+		    dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
+		VERIFY0(zap_add_int(dp->dp_meta_objset,
+		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
+		    hds->ds_object, tx));
+
+		VERIFY0(zap_remove_int(dp->dp_meta_objset,
+		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
+		    origin_head->ds_object, tx));
+		if (dsl_dir_phys(dd)->dd_clones == 0) {
+			dsl_dir_phys(dd)->dd_clones =
+			    zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
+			    DMU_OT_NONE, 0, tx);
+		}
+		VERIFY0(zap_add_int(dp->dp_meta_objset,
+		    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
+	}
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+	/* Take the spa_namespace_lock early so zvol renames don't deadlock. */
+	mutex_enter(&spa_namespace_lock);
+
+	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+#endif
 
 	/* move snapshots to this dir */
-	for (snap = list_head(&pa->shared_snaps); snap;
-	    snap = list_next(&pa->shared_snaps, snap)) {
+	for (snap = list_head(&ddpa->shared_snaps); snap;
+	    snap = list_next(&ddpa->shared_snaps, snap)) {
 		dsl_dataset_t *ds = snap->ds;
 
-		/* unregister props as dsl_dir is changing */
+		/*
+		 * Property callbacks are registered to a particular
+		 * dsl_dir.  Since ours is changing, evict the objset
+		 * so that they will be unregistered from the old dsl_dir.
+		 */
 		if (ds->ds_objset) {
 			dmu_objset_evict(ds->ds_objset);
 			ds->ds_objset = NULL;
 		}
+
 		/* move snap name entry */
-		VERIFY(0 == dsl_dataset_get_snapname(ds));
-		VERIFY(0 == dsl_dataset_snap_remove(origin_head,
-		    ds->ds_snapname, tx));
-		VERIFY(0 == zap_add(dp->dp_meta_objset,
-		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
+		VERIFY0(dsl_dataset_get_snapname(ds));
+		VERIFY0(dsl_dataset_snap_remove(origin_head,
+		    ds->ds_snapname, tx, B_TRUE));
+		VERIFY0(zap_add(dp->dp_meta_objset,
+		    dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
 		    8, 1, &ds->ds_object, tx));
+		dsl_fs_ss_count_adjust(hds->ds_dir, 1,
+		    DD_FIELD_SNAPSHOT_COUNT, tx);
+
 		/* change containing dsl_dir */
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
-		ds->ds_phys->ds_dir_obj = dd->dd_object;
+		ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
+		dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
 		ASSERT3P(ds->ds_dir, ==, odd);
-		dsl_dir_close(ds->ds_dir, ds);
-		VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
+		dsl_dir_rele(ds->ds_dir, ds);
+		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
 		    NULL, ds, &ds->ds_dir));
 
-		ASSERT3U(dsl_prop_numcb(ds), ==, 0);
+#if defined(__FreeBSD__) && defined(_KERNEL)
+		dsl_dataset_name(ds, newname);
+		zfsvfs_update_fromname(oldname, newname);
+		zvol_rename_minors(oldname, newname);
+#endif
+
+		/* move any clone references */
+		if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
+		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+			zap_cursor_t zc;
+			zap_attribute_t za;
+
+			for (zap_cursor_init(&zc, dp->dp_meta_objset,
+			    dsl_dataset_phys(ds)->ds_next_clones_obj);
+			    zap_cursor_retrieve(&zc, &za) == 0;
+			    zap_cursor_advance(&zc)) {
+				dsl_dataset_t *cnds;
+				uint64_t o;
+
+				if (za.za_first_integer == oldnext_obj) {
+					/*
+					 * We've already moved the
+					 * origin's reference.
+					 */
+					continue;
+				}
+
+				VERIFY0(dsl_dataset_hold_obj(dp,
+				    za.za_first_integer, FTAG, &cnds));
+				o = dsl_dir_phys(cnds->ds_dir)->
+				    dd_head_dataset_obj;
+
+				VERIFY0(zap_remove_int(dp->dp_meta_objset,
+				    dsl_dir_phys(odd)->dd_clones, o, tx));
+				VERIFY0(zap_add_int(dp->dp_meta_objset,
+				    dsl_dir_phys(dd)->dd_clones, o, tx));
+				dsl_dataset_rele(cnds, FTAG);
+			}
+			zap_cursor_fini(&zc);
+		}
+
+		ASSERT(!dsl_prop_hascb(ds));
 	}
 
+#if defined(__FreeBSD__) && defined(_KERNEL)
+	mutex_exit(&spa_namespace_lock);
+
+	kmem_free(newname, MAXPATHLEN);
+	kmem_free(oldname, MAXPATHLEN);
+#endif
 	/*
 	 * Change space accounting.
 	 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
@@ -2602,32 +2809,31 @@ dsl_dataset_promote_sync(void *arg1, voi
 	 * is true for each of {clone,origin} independently.
 	 */
 
-	delta = pa->cloneusedsnap -
-	    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+	delta = ddpa->cloneusedsnap -
+	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, >=, 0);
-	ASSERT3U(pa->used, >=, delta);
+	ASSERT3U(ddpa->used, >=, delta);
 	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
 	dsl_dir_diduse_space(dd, DD_USED_HEAD,
-	    pa->used - delta, pa->comp, pa->uncomp, tx);
+	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
 
-	delta = pa->originusedsnap -
-	    odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+	delta = ddpa->originusedsnap -
+	    dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
 	ASSERT3S(delta, <=, 0);
-	ASSERT3U(pa->used, >=, -delta);
+	ASSERT3U(ddpa->used, >=, -delta);
 	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
 	dsl_dir_diduse_space(odd, DD_USED_HEAD,
-	    -pa->used - delta, -pa->comp, -pa->uncomp, tx);
+	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
 
-	origin_ds->ds_phys->ds_unique_bytes = pa->unique;
+	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
 
 	/* log history record */
-	spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx,
-	    cr, "dataset = %llu", hds->ds_object);
+	spa_history_log_internal_ds(hds, "promote", tx, "");
 
-	dsl_dir_close(odd, FTAG);
+	dsl_dir_rele(odd, FTAG);
+	promote_rele(ddpa, FTAG);
 }
 
-static char *snaplist_tag = "snaplist";
 /*
  * Make a list of dsl_dataset_t's for the snapshots between first_obj
  * (exclusive) and last_obj (inclusive).  The list will be in reverse
@@ -2635,13 +2841,11 @@ static char *snaplist_tag = "snaplist";
  * snapshots back to this dataset's origin.
  */
 static int
-snaplist_make(dsl_pool_t *dp, boolean_t own,
-    uint64_t first_obj, uint64_t last_obj, list_t *l)
+snaplist_make(dsl_pool_t *dp,
+    uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
 {
 	uint64_t obj = last_obj;
 
-	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
-
 	list_create(l, sizeof (struct promotenode),
 	    offsetof(struct promotenode, link));
 
@@ -2650,31 +2854,18 @@ snaplist_make(dsl_pool_t *dp, boolean_t 
 		struct promotenode *snap;
 		int err;
 
-		if (own) {
-			err = dsl_dataset_own_obj(dp, obj,
-			    0, snaplist_tag, &ds);
-			if (err == 0)
-				dsl_dataset_make_exclusive(ds, snaplist_tag);
-		} else {
-			err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds);
-		}
-		if (err == ENOENT) {
-			/* lost race with snapshot destroy */
-			struct promotenode *last = list_tail(l);
-			ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj);
-			obj = last->ds->ds_phys->ds_prev_snap_obj;
-			continue;
-		} else if (err) {
+		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
+		ASSERT(err != ENOENT);
+		if (err != 0)
 			return (err);
-		}
 
 		if (first_obj == 0)
-			first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
+			first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
 
-		snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
+		snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
 		snap->ds = ds;
 		list_insert_tail(l, snap);
-		obj = ds->ds_phys->ds_prev_snap_obj;
+		obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 	}
 
 	return (0);
@@ -2687,1077 +2878,873 @@ snaplist_space(list_t *l, uint64_t mintx
 
 	*spacep = 0;
 	for (snap = list_head(l); snap; snap = list_next(l, snap)) {
-		uint64_t used;
-		int err = bplist_space_birthrange(&snap->ds->ds_deadlist,
-		    mintxg, UINT64_MAX, &used);
-		if (err)
-			return (err);
+		uint64_t used, comp, uncomp;
+		dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+		    mintxg, UINT64_MAX, &used, &comp, &uncomp);
 		*spacep += used;
 	}
 	return (0);
 }
 
 static void
-snaplist_destroy(list_t *l, boolean_t own)
+snaplist_destroy(list_t *l, void *tag)
 {
 	struct promotenode *snap;
 
-	if (!l || !list_link_active(&l->list_head))
+	if (l == NULL || !list_link_active(&l->list_head))
 		return;
 
 	while ((snap = list_tail(l)) != NULL) {
 		list_remove(l, snap);
-		if (own)
-			dsl_dataset_disown(snap->ds, snaplist_tag);
-		else
-			dsl_dataset_rele(snap->ds, snaplist_tag);
-		kmem_free(snap, sizeof (struct promotenode));
+		dsl_dataset_rele(snap->ds, tag);
+		kmem_free(snap, sizeof (*snap));
 	}
 	list_destroy(l);
 }
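
The three snaplist helpers are always used as a unit: build, measure,
destroy.  A lifecycle sketch (hypothetical wrapper; the object numbers are
assumed to come from the caller):

static int
example_snaplist(dsl_pool_t *dp, uint64_t first, uint64_t last)
{
	list_t l;
	uint64_t space;
	int err;

	err = snaplist_make(dp, first, last, &l, FTAG);
	if (err == 0)
		err = snaplist_space(&l, 0, &space); /* sum all deadlists */
	snaplist_destroy(&l, FTAG);	/* safe even on partial failure */
	return (err);
}
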
 
-/*
- * Promote a clone.  Nomenclature note:
- * "clone" or "cds": the original clone which is being promoted
- * "origin" or "ods": the snapshot which is originally clone's origin
- * "origin head" or "ohds": the dataset which is the head
- * (filesystem/volume) for the origin
- * "origin origin": the origin of the origin's filesystem (typically
- * NULL, indicating that the clone is not a clone of a clone).
- */
-int
-dsl_dataset_promote(const char *name, char *conflsnap)
+static int
+promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
 {
-	dsl_dataset_t *ds;
+	int error;
 	dsl_dir_t *dd;
-	dsl_pool_t *dp;
-	dmu_object_info_t doi;
-	struct promotearg pa = { 0 };
 	struct promotenode *snap;
-	int err;
-
-	err = dsl_dataset_hold(name, FTAG, &ds);
-	if (err)
-		return (err);
-	dd = ds->ds_dir;
-	dp = dd->dd_pool;
 
-	err = dmu_object_info(dp->dp_meta_objset,
-	    ds->ds_phys->ds_snapnames_zapobj, &doi);
-	if (err) {
-		dsl_dataset_rele(ds, FTAG);
-		return (err);
-	}
+	error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
+	    &ddpa->ddpa_clone);
+	if (error != 0)
+		return (error);
+	dd = ddpa->ddpa_clone->ds_dir;
 
-	if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) {
-		dsl_dataset_rele(ds, FTAG);
-		return (EINVAL);
+	if (ddpa->ddpa_clone->ds_is_snapshot ||
+	    !dsl_dir_is_clone(dd)) {
+		dsl_dataset_rele(ddpa->ddpa_clone, tag);
+		return (SET_ERROR(EINVAL));
 	}
 
-	/*
-	 * We are going to inherit all the snapshots taken before our
-	 * origin (i.e., our new origin will be our parent's origin).
-	 * Take ownership of them so that we can rename them into our
-	 * namespace.
-	 */
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-
-	err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj,
-	    &pa.shared_snaps);
-	if (err != 0)
+	error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
+	    &ddpa->shared_snaps, tag);
+	if (error != 0)
 		goto out;
 
-	err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps);
-	if (err != 0)
+	error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
+	    &ddpa->clone_snaps, tag);
+	if (error != 0)
 		goto out;
 
-	snap = list_head(&pa.shared_snaps);
-	ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
-	err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj,
-	    snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps);
-	if (err != 0)
+	snap = list_head(&ddpa->shared_snaps);
+	ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
+	error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
+	    dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
+	    &ddpa->origin_snaps, tag);
+	if (error != 0)
 		goto out;
 
-	if (dsl_dir_is_clone(snap->ds->ds_dir)) {
-		err = dsl_dataset_own_obj(dp,
-		    snap->ds->ds_dir->dd_phys->dd_origin_obj,
-		    0, FTAG, &pa.origin_origin);
-		if (err != 0)
+	if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
+		error = dsl_dataset_hold_obj(dp,
+		    dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
+		    tag, &ddpa->origin_origin);
+		if (error != 0)
 			goto out;
 	}
-
 out:
-	rw_exit(&dp->dp_config_rwlock);
-
-	/*
-	 * Add in 128x the snapnames zapobj size, since we will be moving
-	 * a bunch of snapnames to the promoted ds, and dirtying their
-	 * bonus buffers.
-	 */
-	if (err == 0) {
-		err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
-		    dsl_dataset_promote_sync, ds, &pa,
-		    2 + 2 * doi.doi_physical_blocks_512);
-		if (err && pa.err_ds && conflsnap)
-			(void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
-	}
-
-	snaplist_destroy(&pa.shared_snaps, B_TRUE);
-	snaplist_destroy(&pa.clone_snaps, B_FALSE);
-	snaplist_destroy(&pa.origin_snaps, B_FALSE);
-	if (pa.origin_origin)
-		dsl_dataset_disown(pa.origin_origin, FTAG);
-	dsl_dataset_rele(ds, FTAG);
-	return (err);
-}
-
-struct cloneswaparg {
-	dsl_dataset_t *cds; /* clone dataset */
-	dsl_dataset_t *ohds; /* origin's head dataset */
-	boolean_t force;
-	int64_t unused_refres_delta; /* change in unconsumed refreservation */
-};
-
-/* ARGSUSED */
-static int
-dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	struct cloneswaparg *csa = arg1;
-
-	/* they should both be heads */
-	if (dsl_dataset_is_snapshot(csa->cds) ||
-	    dsl_dataset_is_snapshot(csa->ohds))
-		return (EINVAL);
-
-	/* the branch point should be just before them */
-	if (csa->cds->ds_prev != csa->ohds->ds_prev)
-		return (EINVAL);
-
-	/* cds should be the clone (unless they are unrelated) */
-	if (csa->cds->ds_prev != NULL &&
-	    csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap &&
-	    csa->ohds->ds_object !=
-	    csa->cds->ds_prev->ds_phys->ds_next_snap_obj)
-		return (EINVAL);
-
-	/* the clone should be a child of the origin */
-	if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir)
-		return (EINVAL);
-
-	/* ohds shouldn't be modified unless 'force' */
-	if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds))
-		return (ETXTBSY);
-
-	/* adjust amount of any unconsumed refreservation */
-	csa->unused_refres_delta =
-	    (int64_t)MIN(csa->ohds->ds_reserved,
-	    csa->ohds->ds_phys->ds_unique_bytes) -
-	    (int64_t)MIN(csa->ohds->ds_reserved,
-	    csa->cds->ds_phys->ds_unique_bytes);
-
-	if (csa->unused_refres_delta > 0 &&
-	    csa->unused_refres_delta >
-	    dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE))
-		return (ENOSPC);
-
-	if (csa->ohds->ds_quota != 0 &&
-	    csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota)
-		return (EDQUOT);
-
-	return (0);
+	if (error != 0)
+		promote_rele(ddpa, tag);
+	return (error);
 }
 
-/* ARGSUSED */
 static void
-dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
 {
-	struct cloneswaparg *csa = arg1;
-	dsl_pool_t *dp = csa->cds->ds_dir->dd_pool;
-
-	ASSERT(csa->cds->ds_reserved == 0);
-	ASSERT(csa->ohds->ds_quota == 0 ||
-	    csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota);
-
-	dmu_buf_will_dirty(csa->cds->ds_dbuf, tx);
-	dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx);
-
-	if (csa->cds->ds_objset != NULL) {
-		dmu_objset_evict(csa->cds->ds_objset);
-		csa->cds->ds_objset = NULL;
-	}
-
-	if (csa->ohds->ds_objset != NULL) {
-		dmu_objset_evict(csa->ohds->ds_objset);
-		csa->ohds->ds_objset = NULL;
-	}
-
-	/*
-	 * Reset origin's unique bytes, if it exists.
-	 */
-	if (csa->cds->ds_prev) {
-		dsl_dataset_t *origin = csa->cds->ds_prev;
-		dmu_buf_will_dirty(origin->ds_dbuf, tx);
-		VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
-		    origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
-		    &origin->ds_phys->ds_unique_bytes));
-	}
-
-	/* swap blkptrs */
-	{
-		blkptr_t tmp;
-		tmp = csa->ohds->ds_phys->ds_bp;
-		csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp;
-		csa->cds->ds_phys->ds_bp = tmp;
-	}
-
-	/* set dd_*_bytes */
-	{
-		int64_t dused, dcomp, duncomp;
-		uint64_t cdl_used, cdl_comp, cdl_uncomp;
-		uint64_t odl_used, odl_comp, odl_uncomp;
-
-		ASSERT3U(csa->cds->ds_dir->dd_phys->
-		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
-
-		VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
-		    &cdl_comp, &cdl_uncomp));
-		VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
-		    &odl_comp, &odl_uncomp));
-
-		dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
-		    (csa->ohds->ds_phys->ds_used_bytes + odl_used);
-		dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp -
-		    (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp);
-		duncomp = csa->cds->ds_phys->ds_uncompressed_bytes +
-		    cdl_uncomp -
-		    (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp);
-
-		dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD,
-		    dused, dcomp, duncomp, tx);
-		dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD,
-		    -dused, -dcomp, -duncomp, tx);
-
-		/*
-		 * The difference in the space used by snapshots is the
-		 * difference in snapshot space due to the head's
-		 * deadlist (since that's the only thing that's
-		 * changing that affects the snapused).
-		 */
-		VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
-		    csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used));
-		VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist,
-		    csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used));
-		dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
-		    DD_USED_HEAD, DD_USED_SNAP, tx);
-	}
-
-#define	SWITCH64(x, y) \
-	{ \
-		uint64_t __tmp = (x); \
-		(x) = (y); \
-		(y) = __tmp; \
-	}
-
-	/* swap ds_*_bytes */
-	SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
-	    csa->cds->ds_phys->ds_used_bytes);
-	SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes,
-	    csa->cds->ds_phys->ds_compressed_bytes);
-	SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes,
-	    csa->cds->ds_phys->ds_uncompressed_bytes);
-	SWITCH64(csa->ohds->ds_phys->ds_unique_bytes,
-	    csa->cds->ds_phys->ds_unique_bytes);
-
-	/* apply any parent delta for change in unconsumed refreservation */
-	dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
-	    csa->unused_refres_delta, 0, 0, tx);
-
-	/* swap deadlists */
-	bplist_close(&csa->cds->ds_deadlist);
-	bplist_close(&csa->ohds->ds_deadlist);
-	SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
-	    csa->cds->ds_phys->ds_deadlist_obj);
-	VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
-	    csa->cds->ds_phys->ds_deadlist_obj));
-	VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
-	    csa->ohds->ds_phys->ds_deadlist_obj));
-
-	dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx);
+	snaplist_destroy(&ddpa->shared_snaps, tag);
+	snaplist_destroy(&ddpa->clone_snaps, tag);
+	snaplist_destroy(&ddpa->origin_snaps, tag);
+	if (ddpa->origin_origin != NULL)
+		dsl_dataset_rele(ddpa->origin_origin, tag);
+	dsl_dataset_rele(ddpa->ddpa_clone, tag);
 }
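
Note that both dsl_dataset_promote_check() and dsl_dataset_promote_sync()
acquire and drop their own holds via this pair, so nothing stays held
between the two phases of the sync task.  The shape of each phase (sketch,
hypothetical name):

static int
example_promote_phase(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp)
{
	int err = promote_hold(ddpa, dp, FTAG);

	if (err != 0)
		return (err);
	/* ... work with ddpa->ddpa_clone and the snaplists ... */
	promote_rele(ddpa, FTAG);
	return (0);
}
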
 
 /*
- * Swap 'clone' with its origin head datasets.  Used at the end of "zfs
- * recv" into an existing fs to swizzle the file system to the new
- * version, and by "zfs rollback".  Can also be used to swap two
- * independent head datasets if neither has any snapshots.
+ * Promote a clone.
+ *
+ * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
+ * in with the name.  (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.)
  */
 int
-dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
-    boolean_t force)
-{
-	struct cloneswaparg csa;
-	int error;
-
-	ASSERT(clone->ds_owner);
-	ASSERT(origin_head->ds_owner);
-retry:
-	/* Need exclusive access for the swap */
-	rw_enter(&clone->ds_rwlock, RW_WRITER);
-	if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) {
-		rw_exit(&clone->ds_rwlock);
-		rw_enter(&origin_head->ds_rwlock, RW_WRITER);
-		if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) {
-			rw_exit(&origin_head->ds_rwlock);
-			goto retry;
-		}
-	}
-	csa.cds = clone;
-	csa.ohds = origin_head;
-	csa.force = force;
-	error = dsl_sync_task_do(clone->ds_dir->dd_pool,
-	    dsl_dataset_clone_swap_check,
-	    dsl_dataset_clone_swap_sync, &csa, NULL, 9);
-	return (error);
-}
-
-/*
- * Given a pool name and a dataset object number in that pool,
- * return the name of that dataset.
- */
-int
-dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
+dsl_dataset_promote(const char *name, char *conflsnap)
 {
-	spa_t *spa;
-	dsl_pool_t *dp;
-	dsl_dataset_t *ds;
+	dsl_dataset_promote_arg_t ddpa = { 0 };
+	uint64_t numsnaps;
 	int error;
+	objset_t *os;
 
-	if ((error = spa_open(pname, &spa, FTAG)) != 0)
+	/*
+	 * We will modify space proportional to the number of
+	 * snapshots.  Compute numsnaps.
+	 */
+	error = dmu_objset_hold(name, FTAG, &os);
+	if (error != 0)
+		return (error);
+	error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
+	    dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
+	    &numsnaps);
+	dmu_objset_rele(os, FTAG);
+	if (error != 0)
 		return (error);
-	dp = spa_get_dsl(spa);
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-	if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) {
-		dsl_dataset_name(ds, buf);
-		dsl_dataset_rele(ds, FTAG);
-	}
-	rw_exit(&dp->dp_config_rwlock);
-	spa_close(spa, FTAG);
 
-	return (error);
+	ddpa.ddpa_clonename = name;
+	ddpa.err_ds = conflsnap;
+	ddpa.cr = CRED();
+
+	return (dsl_sync_task(name, dsl_dataset_promote_check,
+	    dsl_dataset_promote_sync, &ddpa,
+	    2 + numsnaps, ZFS_SPACE_CHECK_RESERVED));
 }
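
Hypothetical usage sketch; the clone name is a placeholder and the buffer
follows the sizing rule in the comment above:

static int
example_promote(void)	/* hypothetical caller */
{
	char conflsnap[ZFS_MAX_DATASET_NAME_LEN];
	int error = dsl_dataset_promote("pool/fs/clone", conflsnap);

	if (error == EEXIST)
		printf("conflicting snapshot name: %s\n", conflsnap);
	return (error);
}
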
 
 int
-dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
-    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
+dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
+    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
 {
-	int error = 0;
-
-	ASSERT3S(asize, >, 0);
-
 	/*
-	 * *ref_rsrv is the portion of asize that will come from any
-	 * unconsumed refreservation space.
+	 * "slack" factor for received datasets with refquota set on them.
+	 * See the bottom of this function for details on its use.
 	 */
-	*ref_rsrv = 0;
+	uint64_t refquota_slack = DMU_MAX_ACCESS * spa_asize_inflation;
+	int64_t unused_refres_delta;
 
-	mutex_enter(&ds->ds_lock);
-	/*
-	 * Make a space adjustment for reserved bytes.
-	 */
-	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
-		ASSERT3U(*used, >=,
-		    ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
-		*used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
-		*ref_rsrv =
-		    asize - MIN(asize, parent_delta(ds, asize + inflight));
-	}
-
-	if (!check_quota || ds->ds_quota == 0) {
-		mutex_exit(&ds->ds_lock);
-		return (0);
-	}
-	/*
-	 * If they are requesting more space, and our current estimate
-	 * is over quota, they get to try again unless the actual
-	 * on-disk is over quota and there are no pending changes (which
-	 * may free up space for us).
-	 */
-	if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) {
-		if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota)
-			error = ERESTART;
-		else
-			error = EDQUOT;
-	}
-	mutex_exit(&ds->ds_lock);
-
-	return (error);
-}
-
-/* ARGSUSED */
-static int
-dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	dsl_prop_setarg_t *psa = arg2;
-	int err;
-
-	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA)
-		return (ENOTSUP);
+	/* they should both be heads */
+	if (clone->ds_is_snapshot ||
+	    origin_head->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
+
+	/* if we are not forcing, the branch point should be just before them */
+	if (!force && clone->ds_prev != origin_head->ds_prev)
+		return (SET_ERROR(EINVAL));
+
+	/* clone should be the clone (unless they are unrelated) */
+	if (clone->ds_prev != NULL &&
+	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
+	    origin_head->ds_dir != clone->ds_prev->ds_dir)
+		return (SET_ERROR(EINVAL));
 
-	if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
-		return (err);
+	/* the clone should be a child of the origin */
+	if (clone->ds_dir->dd_parent != origin_head->ds_dir)
+		return (SET_ERROR(EINVAL));
 
-	if (psa->psa_effective_value == 0)
-		return (0);
+	/* origin_head shouldn't be modified unless 'force' */
+	if (!force &&
+	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
+		return (SET_ERROR(ETXTBSY));
+
+	/* origin_head should have no long holds (e.g. is not mounted) */
+	if (dsl_dataset_handoff_check(origin_head, owner, tx))
+		return (SET_ERROR(EBUSY));
+
+	/* check amount of any unconsumed refreservation */
+	unused_refres_delta =
+	    (int64_t)MIN(origin_head->ds_reserved,
+	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
+	    (int64_t)MIN(origin_head->ds_reserved,
+	    dsl_dataset_phys(clone)->ds_unique_bytes);
+
+	if (unused_refres_delta > 0 &&
+	    unused_refres_delta >
+	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
+		return (SET_ERROR(ENOSPC));
 
-	if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes ||
-	    psa->psa_effective_value < ds->ds_reserved)
-		return (ENOSPC);
+	/*
+	 * The clone can't be too much over the head's refquota.
+	 *
+	 * To ensure that the entire refquota can be used, we allow one
+	 * transaction to exceed the refquota.  Therefore, this check
+	 * needs to also allow for the space referenced to be more than the
+	 * refquota.  The maximum amount of space that one transaction can use
+	 * on disk is DMU_MAX_ACCESS * spa_asize_inflation.  Allowing this
+	 * overage ensures that we are able to receive a filesystem that
+	 * exceeds the refquota on the source system.
+	 *
+	 * So that overage is the refquota_slack we use below.
+	 */
+	if (origin_head->ds_quota != 0 &&
+	    dsl_dataset_phys(clone)->ds_referenced_bytes >
+	    origin_head->ds_quota + refquota_slack)
+		return (SET_ERROR(EDQUOT));
 
 	return (0);
 }
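
Restating the final test symbolically (no concrete values assumed;
DMU_MAX_ACCESS and spa_asize_inflation are defined elsewhere):

/*
 * When a refquota is set, EDQUOT is returned iff:
 *
 *	referenced(clone) > refquota(origin_head) +
 *	    DMU_MAX_ACCESS * spa_asize_inflation
 *
 * i.e. the clone may overshoot the head's refquota by at most the
 * worst-case on-disk footprint of a single transaction.
 */
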
 
-extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *);
-
 void
-dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	dsl_prop_setarg_t *psa = arg2;
-	uint64_t effective_value = psa->psa_effective_value;
-
-	dsl_prop_set_sync(ds, psa, cr, tx);
-	DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
-
-	if (ds->ds_quota != effective_value) {
-		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ds->ds_quota = effective_value;
-
-		spa_history_internal_log(LOG_DS_REFQUOTA,
-		    ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu ",
-		    (longlong_t)ds->ds_quota, ds->ds_object);
-	}
-}
-
-int
-dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota)
+dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
+    dsl_dataset_t *origin_head, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds;
-	dsl_prop_setarg_t psa;
-	int err;
-
-	dsl_prop_setarg_init_uint64(&psa, "refquota", source, &quota);
-
-	err = dsl_dataset_hold(dsname, FTAG, &ds);
-	if (err)
-		return (err);
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	int64_t unused_refres_delta;
 
+	ASSERT(clone->ds_reserved == 0);
 	/*
-	 * If someone removes a file, then tries to set the quota, we
-	 * want to make sure the file freeing takes effect.
+	 * NOTE: On DEBUG kernels there could be a race between this and
+	 * the check function if spa_asize_inflation is adjusted...
 	 */
-	txg_wait_open(ds->ds_dir->dd_pool, 0);
-
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync,
-	    ds, &psa, 0);
-
-	dsl_dataset_rele(ds, FTAG);
-	return (err);
-}
-
-static int
-dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	dsl_prop_setarg_t *psa = arg2;
-	uint64_t effective_value;
-	uint64_t unique;
-	int err;
-
-	if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
-	    SPA_VERSION_REFRESERVATION)
-		return (ENOTSUP);
-
-	if (dsl_dataset_is_snapshot(ds))
-		return (EINVAL);
-
-	if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
-		return (err);
-
-	effective_value = psa->psa_effective_value;
+	ASSERT(origin_head->ds_quota == 0 ||
+	    dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota +
+	    DMU_MAX_ACCESS * spa_asize_inflation);
+	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
 
 	/*
-	 * If we are doing the preliminary check in open context, the
-	 * space estimates may be inaccurate.
+	 * Swap per-dataset feature flags.
 	 */
-	if (!dmu_tx_is_syncing(tx))
-		return (0);
+	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+		if (!(spa_feature_table[f].fi_flags &
+		    ZFEATURE_FLAG_PER_DATASET)) {
+			ASSERT(!clone->ds_feature_inuse[f]);
+			ASSERT(!origin_head->ds_feature_inuse[f]);
+			continue;
+		}
 
-	mutex_enter(&ds->ds_lock);
-	unique = dsl_dataset_unique(ds);
-	mutex_exit(&ds->ds_lock);
+		boolean_t clone_inuse = clone->ds_feature_inuse[f];
+		boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];
 
-	if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) {
-		uint64_t delta = MAX(unique, effective_value) -
-		    MAX(unique, ds->ds_reserved);
-
-		if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
-			return (ENOSPC);
-		if (ds->ds_quota > 0 &&
-		    effective_value > ds->ds_quota)
-			return (ENOSPC);
+		if (clone_inuse) {
+			dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
+			clone->ds_feature_inuse[f] = B_FALSE;
+		}
+		if (origin_head_inuse) {
+			dsl_dataset_deactivate_feature(origin_head->ds_object,
+			    f, tx);
+			origin_head->ds_feature_inuse[f] = B_FALSE;
+		}
+		if (clone_inuse) {
+			dsl_dataset_activate_feature(origin_head->ds_object,
+			    f, tx);
+			origin_head->ds_feature_inuse[f] = B_TRUE;
+		}
+		if (origin_head_inuse) {
+			dsl_dataset_activate_feature(clone->ds_object, f, tx);
+			clone->ds_feature_inuse[f] = B_TRUE;
+		}
 	}
 
-	return (0);
-}
+	dmu_buf_will_dirty(clone->ds_dbuf, tx);
+	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
 
-/* ARGSUSED */
-static void
-dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr,
-    dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	dsl_prop_setarg_t *psa = arg2;
-	uint64_t effective_value = psa->psa_effective_value;
-	uint64_t unique;
-	int64_t delta;
-
-	dsl_prop_set_sync(ds, psa, cr, tx);
-	DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa);
-
-	dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
-	mutex_enter(&ds->ds_dir->dd_lock);
-	mutex_enter(&ds->ds_lock);
-	unique = dsl_dataset_unique(ds);
-	delta = MAX(0, (int64_t)(effective_value - unique)) -
-	    MAX(0, (int64_t)(ds->ds_reserved - unique));
-	ds->ds_reserved = effective_value;
-	mutex_exit(&ds->ds_lock);
-
-	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
-	mutex_exit(&ds->ds_dir->dd_lock);
-
-	spa_history_internal_log(LOG_DS_REFRESERV,
-	    ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu",
-	    (longlong_t)effective_value, ds->ds_object);
-}
-
-int
-dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
-    uint64_t reservation)
-{
-	dsl_dataset_t *ds;
-	dsl_prop_setarg_t psa;
-	int err;
-
-	dsl_prop_setarg_init_uint64(&psa, "refreservation", source,
-	    &reservation);
-
-	err = dsl_dataset_hold(dsname, FTAG, &ds);
-	if (err)
-		return (err);
-
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    dsl_dataset_set_reservation_check,
-	    dsl_dataset_set_reservation_sync, ds, &psa, 0);
-
-	dsl_dataset_rele(ds, FTAG);
-	return (err);
-}
-
-struct dsl_ds_holdarg {
-	dsl_sync_task_group_t *dstg;
-	char *htag;
-	char *snapname;
-	boolean_t recursive;
-	boolean_t gotone;
-	boolean_t temphold;
-	char failed[MAXPATHLEN];
-};
+	if (clone->ds_objset != NULL) {
+		dmu_objset_evict(clone->ds_objset);
+		clone->ds_objset = NULL;
+	}
 
-/*
- * The max length of a temporary tag prefix is the number of hex digits
- * required to express UINT64_MAX plus one for the hyphen.
- */
-#define	MAX_TAG_PREFIX_LEN	17
+	if (origin_head->ds_objset != NULL) {
+		dmu_objset_evict(origin_head->ds_objset);
+		origin_head->ds_objset = NULL;
+	}
 
-static int
-dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	struct dsl_ds_holdarg *ha = arg2;
-	char *htag = ha->htag;
-	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	int error = 0;
+	unused_refres_delta =
+	    (int64_t)MIN(origin_head->ds_reserved,
+	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
+	    (int64_t)MIN(origin_head->ds_reserved,
+	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
-	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
-		return (ENOTSUP);
+	/*
+	 * Reset origin's unique bytes, if it exists.
+	 */
+	if (clone->ds_prev) {
+		dsl_dataset_t *origin = clone->ds_prev;
+		uint64_t comp, uncomp;
 
-	if (!dsl_dataset_is_snapshot(ds))
-		return (EINVAL);
+		dmu_buf_will_dirty(origin->ds_dbuf, tx);
+		dsl_deadlist_space_range(&clone->ds_deadlist,
+		    dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
+		    &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
+	}
 
-	/* tags must be unique */
-	mutex_enter(&ds->ds_lock);
-	if (ds->ds_phys->ds_userrefs_obj) {
-		error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag,
-		    8, 1, tx);
-		if (error == 0)
-			error = EEXIST;
-		else if (error == ENOENT)
-			error = 0;
+	/* swap blkptrs */
+	{
+		rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG);
+		rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG);
+		blkptr_t tmp;
+		tmp = dsl_dataset_phys(origin_head)->ds_bp;
+		dsl_dataset_phys(origin_head)->ds_bp =
+		    dsl_dataset_phys(clone)->ds_bp;
+		dsl_dataset_phys(clone)->ds_bp = tmp;
+		rrw_exit(&origin_head->ds_bp_rwlock, FTAG);
+		rrw_exit(&clone->ds_bp_rwlock, FTAG);
 	}
-	mutex_exit(&ds->ds_lock);
 
-	if (error == 0 && ha->temphold &&
-	    strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
-		error = E2BIG;
+	/* set dd_*_bytes */
+	{
+		int64_t dused, dcomp, duncomp;
+		uint64_t cdl_used, cdl_comp, cdl_uncomp;
+		uint64_t odl_used, odl_comp, odl_uncomp;
 
-	return (error);
-}
+		ASSERT3U(dsl_dir_phys(clone->ds_dir)->
+		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
 
-static void
-dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	struct dsl_ds_holdarg *ha = arg2;
-	char *htag = ha->htag;
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-	objset_t *mos = dp->dp_meta_objset;
-	uint64_t now = gethrestime_sec();
-	uint64_t zapobj;
+		dsl_deadlist_space(&clone->ds_deadlist,
+		    &cdl_used, &cdl_comp, &cdl_uncomp);
+		dsl_deadlist_space(&origin_head->ds_deadlist,
+		    &odl_used, &odl_comp, &odl_uncomp);
+
+		dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
+		    cdl_used -
+		    (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
+		    odl_used);
+		dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
+		    cdl_comp -
+		    (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
+		    odl_comp);
+		duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
+		    cdl_uncomp -
+		    (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
+		    odl_uncomp);
+
+		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
+		    dused, dcomp, duncomp, tx);
+		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
+		    -dused, -dcomp, -duncomp, tx);
 
-	mutex_enter(&ds->ds_lock);
-	if (ds->ds_phys->ds_userrefs_obj == 0) {
 		/*
-		 * This is the first user hold for this dataset.  Create
-		 * the userrefs zap object.
+		 * The difference in the space used by snapshots is the
+		 * difference in snapshot space due to the head's
+		 * deadlist (since that's the only thing that's
+		 * changing that affects the snapused).
 		 */
-		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		zapobj = ds->ds_phys->ds_userrefs_obj =
-		    zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
-	} else {
-		zapobj = ds->ds_phys->ds_userrefs_obj;
+		dsl_deadlist_space_range(&clone->ds_deadlist,
+		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
+		    &cdl_used, &cdl_comp, &cdl_uncomp);
+		dsl_deadlist_space_range(&origin_head->ds_deadlist,
+		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
+		    &odl_used, &odl_comp, &odl_uncomp);
+		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
+		    DD_USED_HEAD, DD_USED_SNAP, NULL);
 	}
-	ds->ds_userrefs++;
-	mutex_exit(&ds->ds_lock);
 
-	VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx));
+	/* swap ds_*_bytes */
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
+	    dsl_dataset_phys(clone)->ds_referenced_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
+	    dsl_dataset_phys(clone)->ds_compressed_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
+	    dsl_dataset_phys(clone)->ds_uncompressed_bytes);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
+	    dsl_dataset_phys(clone)->ds_unique_bytes);
 
-	if (ha->temphold) {
-		VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object,
-		    htag, &now, tx));
-	}
+	/* apply any parent delta for change in unconsumed refreservation */
+	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
+	    unused_refres_delta, 0, 0, tx);
+
+	/*
+	 * Swap deadlists.
+	 */
+	dsl_deadlist_close(&clone->ds_deadlist);
+	dsl_deadlist_close(&origin_head->ds_deadlist);
+	SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
+	    dsl_dataset_phys(clone)->ds_deadlist_obj);
+	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
+	    dsl_dataset_phys(clone)->ds_deadlist_obj);
+	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
+	    dsl_dataset_phys(origin_head)->ds_deadlist_obj);
 
-	spa_history_internal_log(LOG_DS_USER_HOLD,
-	    dp->dp_spa, tx, cr, "<%s> temp = %d dataset = %llu", htag,
-	    (int)ha->temphold, ds->ds_object);
+	dsl_scan_ds_clone_swapped(origin_head, clone, tx);
+
+	spa_history_log_internal_ds(clone, "clone swap", tx,
+	    "parent=%s", origin_head->ds_dir->dd_myname);
 }
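
[Editor's note] The four "if" blocks near the top of this function swap which dataset carries each active feature flag; since activation and deactivation have on-disk side effects, both sides are torn down before either is re-established. Below is a minimal userland sketch of that ordering with plain booleans standing in for the feature state; all names here are illustrative, not ZFS APIs.

#include <stdbool.h>
#include <stdio.h>

/*
 * Swap two "active" flags the way the sync task does: deactivate
 * whichever sides were active, then activate each on the other side.
 */
static void
swap_active(bool *clone, bool *origin_head)
{
	bool clone_inuse = *clone, origin_head_inuse = *origin_head;

	if (clone_inuse)
		*clone = false;
	if (origin_head_inuse)
		*origin_head = false;
	if (clone_inuse)
		*origin_head = true;
	if (origin_head_inuse)
		*clone = true;
}

int
main(void)
{
	bool clone = true, origin_head = false;

	swap_active(&clone, &origin_head);
	printf("clone=%d origin_head=%d\n", clone, origin_head);	/* 0 1 */
	return (0);
}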
 
-static int
-dsl_dataset_user_hold_one(const char *dsname, void *arg)
+/*
+ * Given a pool name and a dataset object number in that pool,
+ * return the name of that dataset.
+ */
+int
+dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
 {
-	struct dsl_ds_holdarg *ha = arg;
+	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int error;
-	char *name;
 
-	/* alloc a buffer to hold dsname@snapname plus terminating NULL */
-	name = kmem_asprintf("%s@%s", dsname, ha->snapname);
-	error = dsl_dataset_hold(name, ha->dstg, &ds);
-	strfree(name);
+	error = dsl_pool_hold(pname, FTAG, &dp);
+	if (error != 0)
+		return (error);
+
+	error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
 	if (error == 0) {
-		ha->gotone = B_TRUE;
-		dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check,
-		    dsl_dataset_user_hold_sync, ds, ha, 0);
-	} else if (error == ENOENT && ha->recursive) {
-		error = 0;
-	} else {
-		(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+		dsl_dataset_name(ds, buf);
+		dsl_dataset_rele(ds, FTAG);
 	}
+	dsl_pool_rele(dp, FTAG);
+
 	return (error);
 }
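
[Editor's note] dsl_dsobj_to_dsname() follows the hold discipline used throughout this patch: take the pool hold first, the dataset hold second, and release in reverse order on every path, including errors. A hedged sketch of that shape; the get_*/put_* functions are stand-ins for the hold/rele calls, not ZFS APIs.

#include <stdio.h>

static int get_pool(void) { return (0); }	/* stand-in for dsl_pool_hold() */
static int get_dataset(void) { return (0); }	/* stand-in for dsl_dataset_hold_obj() */
static void put_dataset(void) { printf("dataset released\n"); }
static void put_pool(void) { printf("pool released\n"); }

static int
lookup_name(void)
{
	int error = get_pool();

	if (error != 0)
		return (error);		/* nothing to undo yet */
	error = get_dataset();
	if (error == 0) {
		/* ... copy the name out while both holds are live ... */
		put_dataset();
	}
	put_pool();			/* released on success and failure */
	return (error);
}

int
main(void)
{
	return (lookup_name());
}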
 
 int
-dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
-    boolean_t recursive, boolean_t temphold)
+dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
+    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
 {
-	struct dsl_ds_holdarg *ha;
-	dsl_sync_task_t *dst;
-	spa_t *spa;
-	int error;
+	int error = 0;
 
-	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+	ASSERT3S(asize, >, 0);
 
-	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+	/*
+	 * *ref_rsrv is the portion of asize that will come from any
+	 * unconsumed refreservation space.
+	 */
+	*ref_rsrv = 0;
 
-	error = spa_open(dsname, &spa, FTAG);
-	if (error) {
-		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
-		return (error);
+	mutex_enter(&ds->ds_lock);
+	/*
+	 * Make a space adjustment for reserved bytes.
+	 */
+	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
+		ASSERT3U(*used, >=,
+		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+		*used -=
+		    (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+		*ref_rsrv =
+		    asize - MIN(asize, parent_delta(ds, asize + inflight));
 	}
 
-	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-	ha->htag = htag;
-	ha->snapname = snapname;
-	ha->recursive = recursive;
-	ha->temphold = temphold;
-	if (recursive) {
-		error = dmu_objset_find(dsname, dsl_dataset_user_hold_one,
-		    ha, DS_FIND_CHILDREN);
-	} else {
-		error = dsl_dataset_user_hold_one(dsname, ha);
+	if (!check_quota || ds->ds_quota == 0) {
+		mutex_exit(&ds->ds_lock);
+		return (0);
 	}
-	if (error == 0)
-		error = dsl_sync_task_group_wait(ha->dstg);
-
-	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
-	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
-		dsl_dataset_t *ds = dst->dst_arg1;
-
-		if (dst->dst_err) {
-			dsl_dataset_name(ds, ha->failed);
-			*strchr(ha->failed, '@') = '\0';
-		}
-		dsl_dataset_rele(ds, ha->dstg);
+	/*
+	 * If they are requesting more space, and our current estimate
+	 * is over quota, they get to try again unless the actual
+	 * on-disk is over quota and there are no pending changes (which
+	 * may free up space for us).
+	 */
+	if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
+	    ds->ds_quota) {
+		if (inflight > 0 ||
+		    dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
+			error = SET_ERROR(ERESTART);
+		else
+			error = SET_ERROR(EDQUOT);
 	}
+	mutex_exit(&ds->ds_lock);
 
-	if (error == 0 && recursive && !ha->gotone)
-		error = ENOENT;
-
-	if (error)
-		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
-
-	dsl_sync_task_group_destroy(ha->dstg);
-	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
-	spa_close(spa, FTAG);
 	return (error);
 }
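
[Editor's note] The tail of dsl_dataset_check_quota() distinguishes a transient over-estimate from a hard quota failure: ERESTART tells the caller to retry (in-flight frees may make room), while EDQUOT is final. A standalone model of just that decision, using the same comparison order as above; refquota_decision() is an illustrative name, and ERESTART may be kernel-only on some systems, hence the fallback define.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#ifndef ERESTART
#define	ERESTART	85	/* userland fallback; value is illustrative */
#endif

static int
refquota_decision(uint64_t referenced, uint64_t inflight, uint64_t quota)
{
	if (quota == 0 || referenced + inflight < quota)
		return (0);		/* under quota: allow the write */
	if (inflight > 0 || referenced < quota)
		return (ERESTART);	/* estimate only: caller retries */
	return (EDQUOT);		/* on-disk over quota, nothing pending */
}

int
main(void)
{
	uint64_t mb = 1 << 20;

	printf("%d\n", refquota_decision(9 * mb, 2 * mb, 10 * mb)); /* ERESTART */
	printf("%d\n", refquota_decision(10 * mb, 0, 10 * mb));     /* EDQUOT */
	return (0);
}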
 
-struct dsl_ds_releasearg {
-	dsl_dataset_t *ds;
-	const char *htag;
-	boolean_t own;		/* do we own or just hold ds? */
-};
+typedef struct dsl_dataset_set_qr_arg {
+	const char *ddsqra_name;
+	zprop_source_t ddsqra_source;
+	uint64_t ddsqra_value;
+} dsl_dataset_set_qr_arg_t;
 
+/* ARGSUSED */
 static int
-dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag,
-    boolean_t *might_destroy)
+dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
 {
-	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
-	uint64_t zapobj;
-	uint64_t tmp;
+	dsl_dataset_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
 	int error;
+	uint64_t newval;
 
-	*might_destroy = B_FALSE;
+	if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
+		return (SET_ERROR(ENOTSUP));
 
-	mutex_enter(&ds->ds_lock);
-	zapobj = ds->ds_phys->ds_userrefs_obj;
-	if (zapobj == 0) {
-		/* The tag can't possibly exist */
-		mutex_exit(&ds->ds_lock);
-		return (ESRCH);
+	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	if (ds->ds_is_snapshot) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
 	}
 
-	/* Make sure the tag exists */
-	error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp);
-	if (error) {
-		mutex_exit(&ds->ds_lock);
-		if (error == ENOENT)
-			error = ESRCH;
+	error = dsl_prop_predict(ds->ds_dir,
+	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
-	if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 &&
-	    DS_IS_DEFER_DESTROY(ds))
-		*might_destroy = B_TRUE;
+	if (newval == 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (0);
+	}
+
+	if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
+	    newval < ds->ds_reserved) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(ENOSPC));
+	}
 
-	mutex_exit(&ds->ds_lock);
+	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
-static int
-dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx)
+static void
+dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
 {
-	struct dsl_ds_releasearg *ra = arg1;
-	dsl_dataset_t *ds = ra->ds;
-	boolean_t might_destroy;
-	int error;
+	dsl_dataset_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	uint64_t newval;
 
-	if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS)
-		return (ENOTSUP);
+	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 
-	error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy);
-	if (error)
-		return (error);
+	dsl_prop_set_sync_impl(ds,
+	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+	    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+	    &ddsqra->ddsqra_value, tx);
 
-	if (might_destroy) {
-		struct dsl_ds_destroyarg dsda = {0};
+	VERIFY0(dsl_prop_get_int_ds(ds,
+	    zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
 
-		if (dmu_tx_is_syncing(tx)) {
-			/*
-			 * If we're not prepared to remove the snapshot,
-			 * we can't allow the release to happen right now.
-			 */
-			if (!ra->own)
-				return (EBUSY);
-			if (ds->ds_objset) {
-				dmu_objset_evict(ds->ds_objset);
-				ds->ds_objset = NULL;
-			}
-		}
-		dsda.ds = ds;
-		dsda.releasing = B_TRUE;
-		return (dsl_dataset_destroy_check(&dsda, tag, tx));
+	if (ds->ds_quota != newval) {
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		ds->ds_quota = newval;
 	}
-
-	return (0);
+	dsl_dataset_rele(ds, FTAG);
 }
 
-static void
-dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
+int
+dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
+    uint64_t refquota)
 {
-	struct dsl_ds_releasearg *ra = arg1;
-	dsl_dataset_t *ds = ra->ds;
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-	objset_t *mos = dp->dp_meta_objset;
-	uint64_t zapobj;
-	uint64_t dsobj = ds->ds_object;
-	uint64_t refs;
-	int error;
+	dsl_dataset_set_qr_arg_t ddsqra;
 
-	mutex_enter(&ds->ds_lock);
-	ds->ds_userrefs--;
-	refs = ds->ds_userrefs;
-	mutex_exit(&ds->ds_lock);
-	error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx);
-	VERIFY(error == 0 || error == ENOENT);
-	zapobj = ds->ds_phys->ds_userrefs_obj;
-	VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx));
-	if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 &&
-	    DS_IS_DEFER_DESTROY(ds)) {
-		struct dsl_ds_destroyarg dsda = {0};
-
-		ASSERT(ra->own);
-		dsda.ds = ds;
-		dsda.releasing = B_TRUE;
-		/* We already did the destroy_check */
-		dsl_dataset_destroy_sync(&dsda, tag, cr, tx);
-	}
-
-	spa_history_internal_log(LOG_DS_USER_RELEASE,
-	    dp->dp_spa, tx, cr, "<%s> %lld dataset = %llu",
-	    ra->htag, (longlong_t)refs, dsobj);
+	ddsqra.ddsqra_name = dsname;
+	ddsqra.ddsqra_source = source;
+	ddsqra.ddsqra_value = refquota;
+
+	return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
+	    dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
 }
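
[Editor's note] dsl_dataset_set_refquota() is a typical use of the dsl_sync_task() split this patch converts everything to: a check callback that may veto but must not modify state, and a sync callback that applies the change and is not allowed to fail. A minimal model of that contract; names are illustrative.

#include <errno.h>
#include <stdio.h>

typedef int (*check_fn_t)(void *arg);
typedef void (*sync_fn_t)(void *arg);

/* Run check first; only if it passes is sync allowed to mutate state. */
static int
sync_task(check_fn_t check, sync_fn_t sync, void *arg)
{
	int error = check(arg);

	if (error != 0)
		return (error);
	sync(arg);		/* committed: no error path from here */
	return (0);
}

static int
check_positive(void *arg)
{
	return (*(int *)arg > 0 ? 0 : EINVAL);
}

static void
apply_value(void *arg)
{
	printf("applied %d\n", *(int *)arg);
}

int
main(void)
{
	int v = 42;

	return (sync_task(check_positive, apply_value, &v));
}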
 
 static int
-dsl_dataset_user_release_one(const char *dsname, void *arg)
+dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
 {
-	struct dsl_ds_holdarg *ha = arg;
-	struct dsl_ds_releasearg *ra;
+	dsl_dataset_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	int error;
-	void *dtag = ha->dstg;
-	char *name;
-	boolean_t own = B_FALSE;
-	boolean_t might_destroy;
-
-	/* alloc a buffer to hold dsname@snapname, plus the terminating NULL */
-	name = kmem_asprintf("%s@%s", dsname, ha->snapname);
-	error = dsl_dataset_hold(name, dtag, &ds);
-	strfree(name);
-	if (error == ENOENT && ha->recursive)
-		return (0);
-	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
-	if (error)
-		return (error);
+	uint64_t newval, unique;
 
-	ha->gotone = B_TRUE;
+	if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
+		return (SET_ERROR(ENOTSUP));
 
-	ASSERT(dsl_dataset_is_snapshot(ds));
+	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	if (ds->ds_is_snapshot) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
 
-	error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy);
-	if (error) {
-		dsl_dataset_rele(ds, dtag);
+	error = dsl_prop_predict(ds->ds_dir,
+	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
 		return (error);
 	}
 
-	if (might_destroy) {
-#ifdef _KERNEL
-		error = zfs_unmount_snap(name, NULL);
-		if (error) {
-			dsl_dataset_rele(ds, dtag);
-			return (error);
-		}
-#endif
-		if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
-			dsl_dataset_rele(ds, dtag);
-			return (EBUSY);
-		} else {
-			own = B_TRUE;
-			dsl_dataset_make_exclusive(ds, dtag);
-		}
+	/*
+	 * If we are doing the preliminary check in open context, the
+	 * space estimates may be inaccurate.
+	 */
+	if (!dmu_tx_is_syncing(tx)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (0);
 	}
 
-	ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP);
-	ra->ds = ds;
-	ra->htag = ha->htag;
-	ra->own = own;
-	dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check,
-	    dsl_dataset_user_release_sync, ra, dtag, 0);
+	mutex_enter(&ds->ds_lock);
+	if (!DS_UNIQUE_IS_ACCURATE(ds))
+		dsl_dataset_recalc_head_uniq(ds);
+	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
+	mutex_exit(&ds->ds_lock);
+
+	if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
+		uint64_t delta = MAX(unique, newval) -
+		    MAX(unique, ds->ds_reserved);
+
+		if (delta >
+		    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
+		    (ds->ds_quota > 0 && newval > ds->ds_quota)) {
+			dsl_dataset_rele(ds, FTAG);
+			return (SET_ERROR(ENOSPC));
+		}
+	}
 
+	dsl_dataset_rele(ds, FTAG);
 	return (0);
 }
 
+void
+dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
+    zprop_source_t source, uint64_t value, dmu_tx_t *tx)
+{
+	uint64_t newval;
+	uint64_t unique;
+	int64_t delta;
+
+	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+	    source, sizeof (value), 1, &value, tx);
+
+	VERIFY0(dsl_prop_get_int_ds(ds,
+	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
+
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	mutex_enter(&ds->ds_dir->dd_lock);
+	mutex_enter(&ds->ds_lock);
+	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
+	delta = MAX(0, (int64_t)(newval - unique)) -
+	    MAX(0, (int64_t)(ds->ds_reserved - unique));
+	ds->ds_reserved = newval;
+	mutex_exit(&ds->ds_lock);
+
+	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
+	mutex_exit(&ds->ds_dir->dd_lock);
+}
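
[Editor's note] The delta charged to DD_USED_REFRSRV above is the change in the portion of the reservation not yet backed by unique bytes. A worked example of the same arithmetic, assuming hypothetical values unique = 4, old reservation = 10, new reservation = 6:

#include <stdint.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

static int64_t
refrsrv_delta(uint64_t unique, uint64_t old_resv, uint64_t new_resv)
{
	/* Same form as the sync path: unbacked(new) - unbacked(old). */
	return (MAX(0, (int64_t)(new_resv - unique)) -
	    MAX(0, (int64_t)(old_resv - unique)));
}

int
main(void)
{
	/*
	 * unbacked(new) = 6 - 4 = 2, unbacked(old) = 10 - 4 = 6, so
	 * delta = -4: four units of parent space are given back.
	 */
	printf("%lld\n", (long long)refrsrv_delta(4, 10, 6));
	return (0);
}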
+
+static void
+dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+
+	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+	dsl_dataset_set_refreservation_sync_impl(ds,
+	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
+	dsl_dataset_rele(ds, FTAG);
+}
+
 int
-dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
-    boolean_t recursive)
+dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
+    uint64_t refreservation)
 {
-	struct dsl_ds_holdarg *ha;
-	dsl_sync_task_t *dst;
-	spa_t *spa;
-	int error;
+	dsl_dataset_set_qr_arg_t ddsqra;
 
-top:
-	ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP);
+	ddsqra.ddsqra_name = dsname;
+	ddsqra.ddsqra_source = source;
+	ddsqra.ddsqra_value = refreservation;
+
+	return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
+	    dsl_dataset_set_refreservation_sync, &ddsqra,
+	    0, ZFS_SPACE_CHECK_NONE));
+}
 
-	(void) strlcpy(ha->failed, dsname, sizeof (ha->failed));
+/*
+ * Return (in *usedp) the amount of space written in new that is not
+ * present in oldsnap.  New may be a snapshot or the head.  Old must be
+ * a snapshot before new, in new's filesystem (or its origin).  If not then
+ * fail and return EINVAL.
+ *
+ * The written space is calculated by considering two components:  First, we
+ * ignore any freed space, and calculate the written as new's used space
+ * minus old's used space.  Next, we add in the amount of space that was freed
+ * between the two snapshots, thus reducing new's used space relative to old's.
+ * Specifically, this is the space that was born before old->ds_creation_txg,
+ * and freed before new (ie. on new's deadlist or a previous deadlist).
+ *
+ * space freed                         [---------------------]
+ * snapshots                       ---O-------O--------O-------O------
+ *                                         oldsnap            new
+ */
+int
+dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	int err = 0;
+	uint64_t snapobj;
+	dsl_pool_t *dp = new->ds_dir->dd_pool;
 
-	error = spa_open(dsname, &spa, FTAG);
-	if (error) {
-		kmem_free(ha, sizeof (struct dsl_ds_holdarg));
-		return (error);
-	}
+	ASSERT(dsl_pool_config_held(dp));
 
-	ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa));
-	ha->htag = htag;
-	ha->snapname = snapname;
-	ha->recursive = recursive;
-	if (recursive) {
-		error = dmu_objset_find(dsname, dsl_dataset_user_release_one,
-		    ha, DS_FIND_CHILDREN);
-	} else {
-		error = dsl_dataset_user_release_one(dsname, ha);
-	}
-	if (error == 0)
-		error = dsl_sync_task_group_wait(ha->dstg);
+	*usedp = 0;
+	*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
+	*usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
+
+	*compp = 0;
+	*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
+	*compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
+
+	*uncompp = 0;
+	*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
+	*uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
+
+	snapobj = new->ds_object;
+	while (snapobj != oldsnap->ds_object) {
+		dsl_dataset_t *snap;
+		uint64_t used, comp, uncomp;
 
-	for (dst = list_head(&ha->dstg->dstg_tasks); dst;
-	    dst = list_next(&ha->dstg->dstg_tasks, dst)) {
-		struct dsl_ds_releasearg *ra = dst->dst_arg1;
-		dsl_dataset_t *ds = ra->ds;
+		if (snapobj == new->ds_object) {
+			snap = new;
+		} else {
+			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
+			if (err != 0)
+				break;
+		}
 
-		if (dst->dst_err)
-			dsl_dataset_name(ds, ha->failed);
+		if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
+		    dsl_dataset_phys(oldsnap)->ds_creation_txg) {
+			/*
+			 * The blocks in the deadlist can not be born after
+			 * ds_prev_snap_txg, so get the whole deadlist space,
+			 * which is more efficient (especially for old-format
+			 * deadlists).  Unfortunately the deadlist code
+			 * doesn't have enough information to make this
+			 * optimization itself.
+			 */
+			dsl_deadlist_space(&snap->ds_deadlist,
+			    &used, &comp, &uncomp);
+		} else {
+			dsl_deadlist_space_range(&snap->ds_deadlist,
+			    0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
+			    &used, &comp, &uncomp);
+		}
+		*usedp += used;
+		*compp += comp;
+		*uncompp += uncomp;
 
-		if (ra->own)
-			dsl_dataset_disown(ds, ha->dstg);
-		else
-			dsl_dataset_rele(ds, ha->dstg);
+		/*
+		 * If we get to the beginning of the chain of snapshots
+		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
+		 * was not a snapshot of/before new.
+		 */
+		snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+		if (snap != new)
+			dsl_dataset_rele(snap, FTAG);
+		if (snapobj == 0) {
+			err = SET_ERROR(EINVAL);
+			break;
+		}
 
-		kmem_free(ra, sizeof (struct dsl_ds_releasearg));
 	}
+	return (err);
+}
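
[Editor's note] A small numeric check of the two components described in the block comment above: written space is the difference in referenced bytes plus any space that was live in oldsnap but has since been freed. The numbers are hypothetical.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t new_referenced = 150;	/* ds_referenced_bytes of new */
	uint64_t old_referenced = 100;	/* ds_referenced_bytes of oldsnap */
	/*
	 * Deadlist space born before oldsnap's creation txg but freed
	 * before new: the second component summed by the loop above.
	 */
	uint64_t freed_since_old = 30;

	uint64_t written = new_referenced - old_referenced + freed_since_old;
	printf("written = %llu\n", (unsigned long long)written);	/* 80 */
	return (0);
}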
 
-	if (error == 0 && recursive && !ha->gotone)
-		error = ENOENT;
-
-	if (error && error != EBUSY)
-		(void) strlcpy(dsname, ha->failed, sizeof (ha->failed));
+/*
+ * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
+ * lastsnap, and all snapshots in between are deleted.
+ *
+ * blocks that would be freed            [---------------------------]
+ * snapshots                       ---O-------O--------O-------O--------O
+ *                                        firstsnap        lastsnap
+ *
+ * This is the set of blocks that were born after the snap before firstsnap,
+ * (birth > firstsnap->prev_snap_txg) and died before the snap after the
+ * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
+ * We calculate this by iterating over the relevant deadlists (from the snap
+ * after lastsnap, backward to the snap after firstsnap), summing up the
+ * space on the deadlist that was born after the snap before firstsnap.
+ */
+int
+dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
+    dsl_dataset_t *lastsnap,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	int err = 0;
+	uint64_t snapobj;
+	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
 
-	dsl_sync_task_group_destroy(ha->dstg);
-	kmem_free(ha, sizeof (struct dsl_ds_holdarg));
-	spa_close(spa, FTAG);
+	ASSERT(firstsnap->ds_is_snapshot);
+	ASSERT(lastsnap->ds_is_snapshot);
 
 	/*
-	 * We can get EBUSY if we were racing with deferred destroy and
-	 * dsl_dataset_user_release_check() hadn't done the necessary
-	 * open context setup.  We can also get EBUSY if we're racing
-	 * with destroy and that thread is the ds_owner.  Either way
-	 * the busy condition should be transient, and we should retry
-	 * the release operation.
+	 * Check that the snapshots are in the same dsl_dir, and firstsnap
+	 * is before lastsnap.
 	 */
-	if (error == EBUSY)
-		goto top;
+	if (firstsnap->ds_dir != lastsnap->ds_dir ||
+	    dsl_dataset_phys(firstsnap)->ds_creation_txg >
+	    dsl_dataset_phys(lastsnap)->ds_creation_txg)
+		return (SET_ERROR(EINVAL));
 
-	return (error);
+	*usedp = *compp = *uncompp = 0;
+
+	snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
+	while (snapobj != firstsnap->ds_object) {
+		dsl_dataset_t *ds;
+		uint64_t used, comp, uncomp;
+
+		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
+		if (err != 0)
+			break;
+
+		dsl_deadlist_space_range(&ds->ds_deadlist,
+		    dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
+		    &used, &comp, &uncomp);
+		*usedp += used;
+		*compp += comp;
+		*uncompp += uncomp;
+
+		snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+		ASSERT3U(snapobj, !=, 0);
+		dsl_dataset_rele(ds, FTAG);
+	}
+	return (err);
 }
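
[Editor's note] The walk above visits deadlists from the snapshot after lastsnap back to the snapshot after firstsnap, each time counting only space born after firstsnap's predecessor. A compact model with hypothetical per-snapshot deadlists; array indexes stand in for the ds_prev_snap_obj chain.

#include <stdint.h>
#include <stdio.h>

struct snap {
	uint64_t dead_after_cutoff;	/* deadlist space born after the
					 * txg of the snap before firstsnap */
};

static uint64_t
would_free(const struct snap *s, int first, int last)
{
	uint64_t sum = 0;

	/* from the snap after lastsnap back to the snap after firstsnap */
	for (int i = last + 1; i > first; i--)
		sum += s[i].dead_after_cutoff;
	return (sum);
}

int
main(void)
{
	struct snap s[] = { { 0 }, { 5 }, { 7 }, { 9 }, { 4 } };

	/* deleting snaps 1..3 frees the space on deadlists 2..4 */
	printf("%llu\n", (unsigned long long)would_free(s, 1, 3));	/* 20 */
	return (0);
}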
 
 /*
- * Called at spa_load time to release a stale temporary user hold.
+ * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
+ * For example, they could both be snapshots of the same filesystem, and
+ * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
+ * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
+ * filesystem.  Or 'earlier' could be the origin's origin.
+ *
+ * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
  */
-int
-dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag)
+boolean_t
+dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
+    uint64_t earlier_txg)
 {
-	dsl_dataset_t *ds;
-	char *snap;
-	char *name;
-	int namelen;
+	dsl_pool_t *dp = later->ds_dir->dd_pool;
 	int error;
+	boolean_t ret;
 
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-	error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
-	rw_exit(&dp->dp_config_rwlock);
-	if (error)
-		return (error);
-	namelen = dsl_dataset_namelen(ds)+1;
-	name = kmem_alloc(namelen, KM_SLEEP);
-	dsl_dataset_name(ds, name);
-	dsl_dataset_rele(ds, FTAG);
+	ASSERT(dsl_pool_config_held(dp));
+	ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
 
-	snap = strchr(name, '@');
-	*snap = '\0';
-	++snap;
-	return (dsl_dataset_user_release(name, snap, htag, B_FALSE));
+	if (earlier_txg == 0)
+		earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
+
+	if (later->ds_is_snapshot &&
+	    earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
+		return (B_FALSE);
+
+	if (later->ds_dir == earlier->ds_dir)
+		return (B_TRUE);
+	if (!dsl_dir_is_clone(later->ds_dir))
+		return (B_FALSE);
+
+	if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
+		return (B_TRUE);
+	dsl_dataset_t *origin;
+	error = dsl_dataset_hold_obj(dp,
+	    dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
+	if (error != 0)
+		return (B_FALSE);
+	ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
+	dsl_dataset_rele(origin, FTAG);
+	return (ret);
 }
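
[Editor's note] The recursion above walks later's chain of origins until it either finds earlier's dsl_dir or runs out of clone ancestry. A hedged userland model with illustrative structs (not the ZFS ones); for brevity it drops the earlier_txg override and compares creation txgs directly.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct node;
struct dir {
	const struct node *origin;	/* NULL unless this dir is a clone */
};
struct node {
	const struct dir *dir;
	uint64_t creation_txg;
	bool is_snapshot;
};

static bool
is_before(const struct node *later, const struct node *earlier)
{
	/* A snapshot cannot be "after" something born at or past it. */
	if (later->is_snapshot &&
	    earlier->creation_txg >= later->creation_txg)
		return (false);
	if (later->dir == earlier->dir)
		return (true);
	if (later->dir->origin == NULL)		/* not a clone */
		return (false);
	if (later->dir->origin == earlier)
		return (true);
	return (is_before(later->dir->origin, earlier));
}

int
main(void)
{
	struct dir root = { NULL };
	struct node snap = { &root, 100, true };
	struct dir clonedir = { &snap };
	struct node head = { &clonedir, 0, false };

	return (is_before(&head, &snap) ? 0 : 1);	/* expect 0 */
}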
 
-int
-dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp)
+void
+dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds;
-	int err;
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
+}
 
-	err = dsl_dataset_hold(dsname, FTAG, &ds);
-	if (err)
-		return (err);
+boolean_t
+dsl_dataset_is_zapified(dsl_dataset_t *ds)
+{
+	dmu_object_info_t doi;
 
-	VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP));
-	if (ds->ds_phys->ds_userrefs_obj != 0) {
-		zap_attribute_t *za;
-		zap_cursor_t zc;
-
-		za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
-		for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
-		    ds->ds_phys->ds_userrefs_obj);
-		    zap_cursor_retrieve(&zc, za) == 0;
-		    zap_cursor_advance(&zc)) {
-			VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name,
-			    za->za_first_integer));
-		}
-		zap_cursor_fini(&zc);
-		kmem_free(za, sizeof (zap_attribute_t));
-	}
-	dsl_dataset_rele(ds, FTAG);
-	return (0);
+	dmu_object_info_from_db(ds->ds_dbuf, &doi);
+	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
 }
 
-/*
- * Note, this fuction is used as the callback for dmu_objset_find().  We
- * always return 0 so that we will continue to find and process
- * inconsistent datasets, even if we encounter an error trying to
- * process one of them.
- */
-/* ARGSUSED */
-int
-dsl_destroy_inconsistent(const char *dsname, void *arg)
+boolean_t
+dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
 {
-	dsl_dataset_t *ds;
-
-	if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) {
-		if (DS_IS_INCONSISTENT(ds))
-			(void) dsl_dataset_destroy(ds, FTAG, B_FALSE);
-		else
-			dsl_dataset_disown(ds, FTAG);
-	}
-	return (0);
+	return (dsl_dataset_is_zapified(ds) &&
+	    zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
+	    ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deadlist.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deadlist.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deadlist.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deadlist.c	27 Mar 2016 02:52:21 -0000
@@ -0,0 +1,539 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/dsl_dataset.h>
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_pool.h>
+
+/*
+ * Deadlist concurrency:
+ *
+ * Deadlists can only be modified from the syncing thread.
+ *
+ * Except for dsl_deadlist_insert(), it can only be modified with the
+ * dp_config_rwlock held with RW_WRITER.
+ *
+ * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
+ * be called concurrently, from open context, with the dp_config_rwlock held
+ * with RW_READER.
+ *
+ * Therefore, we only need to provide locking between dsl_deadlist_insert() and
+ * the accessors, protecting:
+ *     dl_phys->dl_used,comp,uncomp
+ *     and protecting the dl_tree from being loaded.
+ * The locking is provided by dl_lock.  Note that locking on the bpobj_t
+ * provides its own locking, and dl_oldfmt is immutable.
+ */
+
+static int
+dsl_deadlist_compare(const void *arg1, const void *arg2)
+{
+	const dsl_deadlist_entry_t *dle1 = arg1;
+	const dsl_deadlist_entry_t *dle2 = arg2;
+
+	if (dle1->dle_mintxg < dle2->dle_mintxg)
+		return (-1);
+	else if (dle1->dle_mintxg > dle2->dle_mintxg)
+		return (+1);
+	else
+		return (0);
+}
+
+static void
+dsl_deadlist_load_tree(dsl_deadlist_t *dl)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	ASSERT(!dl->dl_oldfmt);
+	if (dl->dl_havetree)
+		return;
+
+	avl_create(&dl->dl_tree, dsl_deadlist_compare,
+	    sizeof (dsl_deadlist_entry_t),
+	    offsetof(dsl_deadlist_entry_t, dle_node));
+	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+		dle->dle_mintxg = strtonum(za.za_name, NULL);
+		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
+		    za.za_first_integer));
+		avl_add(&dl->dl_tree, dle);
+	}
+	zap_cursor_fini(&zc);
+	dl->dl_havetree = B_TRUE;
+}
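
[Editor's note] dsl_deadlist_load_tree() is a lazy materialization: the on-disk ZAP of (mintxg, bpobj) pairs is only read into the AVL tree the first time something needs it, guarded by dl_havetree. The pattern in isolation, with illustrative names:

#include <stdbool.h>
#include <stdio.h>

struct deadlist {
	bool havetree;
	/* ... the in-memory tree would live here ... */
};

static void
load_tree(struct deadlist *dl)
{
	if (dl->havetree)
		return;			/* already materialized */
	/* ... iterate the on-disk index, one entry per key ... */
	printf("tree loaded\n");
	dl->havetree = true;
}

int
main(void)
{
	struct deadlist dl = { false };

	load_tree(&dl);		/* does the work */
	load_tree(&dl);		/* cheap no-op */
	return (0);
}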
+
+void
+dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
+{
+	dmu_object_info_t doi;
+
+	mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
+	dl->dl_os = os;
+	dl->dl_object = object;
+	VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
+	dmu_object_info_from_db(dl->dl_dbuf, &doi);
+	if (doi.doi_type == DMU_OT_BPOBJ) {
+		dmu_buf_rele(dl->dl_dbuf, dl);
+		dl->dl_dbuf = NULL;
+		dl->dl_oldfmt = B_TRUE;
+		VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
+		return;
+	}
+
+	dl->dl_oldfmt = B_FALSE;
+	dl->dl_phys = dl->dl_dbuf->db_data;
+	dl->dl_havetree = B_FALSE;
+}
+
+void
+dsl_deadlist_close(dsl_deadlist_t *dl)
+{
+	void *cookie = NULL;
+	dsl_deadlist_entry_t *dle;
+
+	dl->dl_os = NULL;
+
+	if (dl->dl_oldfmt) {
+		dl->dl_oldfmt = B_FALSE;
+		bpobj_close(&dl->dl_bpobj);
+		return;
+	}
+
+	if (dl->dl_havetree) {
+		while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
+		    != NULL) {
+			bpobj_close(&dle->dle_bpobj);
+			kmem_free(dle, sizeof (*dle));
+		}
+		avl_destroy(&dl->dl_tree);
+	}
+	dmu_buf_rele(dl->dl_dbuf, dl);
+	mutex_destroy(&dl->dl_lock);
+	dl->dl_dbuf = NULL;
+	dl->dl_phys = NULL;
+}
+
+uint64_t
+dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
+{
+	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+		return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
+	return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
+	    sizeof (dsl_deadlist_phys_t), tx));
+}
+
+void
+dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
+{
+	dmu_object_info_t doi;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
+	if (doi.doi_type == DMU_OT_BPOBJ) {
+		bpobj_free(os, dlobj, tx);
+		return;
+	}
+
+	for (zap_cursor_init(&zc, os, dlobj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		uint64_t obj = za.za_first_integer;
+		if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
+			bpobj_decr_empty(os, tx);
+		else
+			bpobj_free(os, obj, tx);
+	}
+	zap_cursor_fini(&zc);
+	VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
+}
+
+static void
+dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+    const blkptr_t *bp, dmu_tx_t *tx)
+{
+	if (dle->dle_bpobj.bpo_object ==
+	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+		uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
+		bpobj_close(&dle->dle_bpobj);
+		bpobj_decr_empty(dl->dl_os, tx);
+		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
+		    dle->dle_mintxg, obj, tx));
+	}
+	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+}
+
+static void
+dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+    uint64_t obj, dmu_tx_t *tx)
+{
+	if (dle->dle_bpobj.bpo_object !=
+	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+		bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+	} else {
+		bpobj_close(&dle->dle_bpobj);
+		bpobj_decr_empty(dl->dl_os, tx);
+		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
+		    dle->dle_mintxg, obj, tx));
+	}
+}
+
+void
+dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+
+	if (dl->dl_oldfmt) {
+		bpobj_enqueue(&dl->dl_bpobj, bp, tx);
+		return;
+	}
+
+	dsl_deadlist_load_tree(dl);
+
+	dmu_buf_will_dirty(dl->dl_dbuf, tx);
+	mutex_enter(&dl->dl_lock);
+	dl->dl_phys->dl_used +=
+	    bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
+	dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
+	dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
+	mutex_exit(&dl->dl_lock);
+
+	dle_tofind.dle_mintxg = bp->blk_birth;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	if (dle == NULL)
+		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+	else
+		dle = AVL_PREV(&dl->dl_tree, dle);
+	dle_enqueue(dl, dle, bp, tx);
+}
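
[Editor's note] The AVL lookup above places a dead block pointer in the entry with the greatest mintxg strictly below its birth txg: an exact key match steps back with AVL_PREV, a miss takes the AVL_BEFORE neighbor. The same choice expressed over a sorted key array, with hypothetical keys:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Return the index of the greatest key strictly below birth;
 * keys[] is sorted ascending and keys[0] is the catch-all 0 bucket.
 */
static size_t
pick_bucket(const uint64_t *keys, size_t n, uint64_t birth)
{
	size_t i = 0;

	while (i + 1 < n && keys[i + 1] < birth)
		i++;
	return (i);
}

int
main(void)
{
	uint64_t keys[] = { 0, 100, 200, 300 };

	/* birth 200 maps to the 100 bucket, not the 200 one */
	printf("%zu\n", pick_bucket(keys, 4, 200));	/* 1 */
	return (0);
}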
+
+/*
+ * Insert new key in deadlist, which must be > all current entries.
+ * mintxg is not inclusive.
+ */
+void
+dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+	uint64_t obj;
+	dsl_deadlist_entry_t *dle;
+
+	if (dl->dl_oldfmt)
+		return;
+
+	dsl_deadlist_load_tree(dl);
+
+	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+	dle->dle_mintxg = mintxg;
+	obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
+	VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+	avl_add(&dl->dl_tree, dle);
+
+	VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
+	    mintxg, obj, tx));
+}
+
+/*
+ * Remove this key, merging its entries into the previous key.
+ */
+void
+dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle, *dle_prev;
+
+	if (dl->dl_oldfmt)
+		return;
+
+	dsl_deadlist_load_tree(dl);
+
+	dle_tofind.dle_mintxg = mintxg;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
+	dle_prev = AVL_PREV(&dl->dl_tree, dle);
+
+	dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
+
+	avl_remove(&dl->dl_tree, dle);
+	bpobj_close(&dle->dle_bpobj);
+	kmem_free(dle, sizeof (*dle));
+
+	VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
+}
+
+/*
+ * Walk ds's snapshots to regenerate the ZAP & AVL.
+ */
+static void
+dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
+    uint64_t mrs_obj, dmu_tx_t *tx)
+{
+	dsl_deadlist_t dl;
+	dsl_pool_t *dp = dmu_objset_pool(os);
+
+	dsl_deadlist_open(&dl, os, dlobj);
+	if (dl.dl_oldfmt) {
+		dsl_deadlist_close(&dl);
+		return;
+	}
+
+	while (mrs_obj != 0) {
+		dsl_dataset_t *ds;
+		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
+		dsl_deadlist_add_key(&dl,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+		mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+		dsl_dataset_rele(ds, FTAG);
+	}
+	dsl_deadlist_close(&dl);
+}
+
+uint64_t
+dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+    uint64_t mrs_obj, dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t *dle;
+	uint64_t newobj;
+
+	newobj = dsl_deadlist_alloc(dl->dl_os, tx);
+
+	if (dl->dl_oldfmt) {
+		dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
+		return (newobj);
+	}
+
+	dsl_deadlist_load_tree(dl);
+
+	for (dle = avl_first(&dl->dl_tree); dle;
+	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
+		uint64_t obj;
+
+		if (dle->dle_mintxg >= maxtxg)
+			break;
+
+		obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
+		VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
+		    dle->dle_mintxg, obj, tx));
+	}
+	return (newobj);
+}
+
+void
+dsl_deadlist_space(dsl_deadlist_t *dl,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	if (dl->dl_oldfmt) {
+		VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
+		    usedp, compp, uncompp));
+		return;
+	}
+
+	mutex_enter(&dl->dl_lock);
+	*usedp = dl->dl_phys->dl_used;
+	*compp = dl->dl_phys->dl_comp;
+	*uncompp = dl->dl_phys->dl_uncomp;
+	mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * return space used in the range (mintxg, maxtxg].
+ * Includes maxtxg, does not include mintxg.
+ * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
+ * larger than any bp in the deadlist (eg. UINT64_MAX)).
+ */
+void
+dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	dsl_deadlist_entry_t *dle;
+	dsl_deadlist_entry_t dle_tofind;
+	avl_index_t where;
+
+	if (dl->dl_oldfmt) {
+		VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
+		    mintxg, maxtxg, usedp, compp, uncompp));
+		return;
+	}
+
+	*usedp = *compp = *uncompp = 0;
+
+	mutex_enter(&dl->dl_lock);
+	dsl_deadlist_load_tree(dl);
+	dle_tofind.dle_mintxg = mintxg;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	/*
+	 * If we don't find this mintxg, there shouldn't be anything
+	 * after it either.
+	 */
+	ASSERT(dle != NULL ||
+	    avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
+
+	for (; dle && dle->dle_mintxg < maxtxg;
+	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
+		uint64_t used, comp, uncomp;
+
+		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+		    &used, &comp, &uncomp));
+
+		*usedp += used;
+		*compp += comp;
+		*uncompp += uncomp;
+	}
+	mutex_exit(&dl->dl_lock);
+}
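
[Editor's note] Because each entry keyed k holds blocks with birth txg in (k, next-key], summing the entries whose keys fall in [mintxg, maxtxg) yields exactly the space dead in (mintxg, maxtxg], as the block comment states. A model over hypothetical buckets:

#include <stdint.h>
#include <stdio.h>

struct bucket {
	uint64_t mintxg;	/* exclusive lower bound of births held */
	uint64_t used;
};

static uint64_t
space_range(const struct bucket *b, int n, uint64_t mintxg, uint64_t maxtxg)
{
	uint64_t sum = 0;

	for (int i = 0; i < n; i++)
		if (b[i].mintxg >= mintxg && b[i].mintxg < maxtxg)
			sum += b[i].used;
	return (sum);
}

int
main(void)
{
	struct bucket b[] = { { 0, 5 }, { 100, 7 }, { 200, 9 } };

	/* keys 100 and 200 qualify: births in (100, UINT64_MAX] */
	printf("%llu\n", (unsigned long long)
	    space_range(b, 3, 100, UINT64_MAX));	/* 16 */
	return (0);
}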
+
+static void
+dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
+    dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+	uint64_t used, comp, uncomp;
+	bpobj_t bpo;
+
+	VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+	VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
+	bpobj_close(&bpo);
+
+	dsl_deadlist_load_tree(dl);
+
+	dmu_buf_will_dirty(dl->dl_dbuf, tx);
+	mutex_enter(&dl->dl_lock);
+	dl->dl_phys->dl_used += used;
+	dl->dl_phys->dl_comp += comp;
+	dl->dl_phys->dl_uncomp += uncomp;
+	mutex_exit(&dl->dl_lock);
+
+	dle_tofind.dle_mintxg = birth;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	if (dle == NULL)
+		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+	dle_enqueue_subobj(dl, dle, obj, tx);
+}
+
+static int
+dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_deadlist_t *dl = arg;
+	dsl_deadlist_insert(dl, bp, tx);
+	return (0);
+}
+
+/*
+ * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
+ * an empty deadlist.
+ */
+void
+dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	dmu_buf_t *bonus;
+	dsl_deadlist_phys_t *dlp;
+	dmu_object_info_t doi;
+
+	VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
+	if (doi.doi_type == DMU_OT_BPOBJ) {
+		bpobj_t bpo;
+		VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+		VERIFY3U(0, ==, bpobj_iterate(&bpo,
+		    dsl_deadlist_insert_cb, dl, tx));
+		bpobj_close(&bpo);
+		return;
+	}
+
+	for (zap_cursor_init(&zc, dl->dl_os, obj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		uint64_t mintxg = strtonum(za.za_name, NULL);
+		dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
+		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
+	}
+	zap_cursor_fini(&zc);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
+	dlp = bonus->db_data;
+	dmu_buf_will_dirty(bonus, tx);
+	bzero(dlp, sizeof (*dlp));
+	dmu_buf_rele(bonus, FTAG);
+}
+
+/*
+ * Remove entries on dl that are >= mintxg, and put them on the bpobj.
+ */
+void
+dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+    dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+
+	ASSERT(!dl->dl_oldfmt);
+	dmu_buf_will_dirty(dl->dl_dbuf, tx);
+	dsl_deadlist_load_tree(dl);
+
+	dle_tofind.dle_mintxg = mintxg;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	if (dle == NULL)
+		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
+	while (dle) {
+		uint64_t used, comp, uncomp;
+		dsl_deadlist_entry_t *dle_next;
+
+		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+
+		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+		    &used, &comp, &uncomp));
+		mutex_enter(&dl->dl_lock);
+		ASSERT3U(dl->dl_phys->dl_used, >=, used);
+		ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
+		ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
+		dl->dl_phys->dl_used -= used;
+		dl->dl_phys->dl_comp -= comp;
+		dl->dl_phys->dl_uncomp -= uncomp;
+		mutex_exit(&dl->dl_lock);
+
+		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
+		    dle->dle_mintxg, tx));
+
+		dle_next = AVL_NEXT(&dl->dl_tree, dle);
+		avl_remove(&dl->dl_tree, dle);
+		bpobj_close(&dle->dle_bpobj);
+		kmem_free(dle, sizeof (*dle));
+		dle = dle_next;
+	}
+}
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c,v
retrieving revision 1.3
diff -u -p -r1.3 dsl_deleg.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c	27 Feb 2010 23:43:53 -0000	1.3
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c	10 Oct 2016 11:09:56 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 /*
@@ -107,7 +107,7 @@ dsl_deleg_can_allow(char *ddname, nvlist
 			const char *perm = nvpair_name(permpair);
 
 			if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0)
-				return (EPERM);
+				return (SET_ERROR(EPERM));
 
 			if ((error = dsl_deleg_access(ddname, perm, cr)) != 0)
 				return (error);
@@ -139,42 +139,49 @@ dsl_deleg_can_unallow(char *ddname, nvli
 
 		if (type != ZFS_DELEG_USER &&
 		    type != ZFS_DELEG_USER_SETS)
-			return (EPERM);
+			return (SET_ERROR(EPERM));
 
 		if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0)
-			return (EPERM);
+			return (SET_ERROR(EPERM));
 	}
 	return (0);
 }
 
+typedef struct dsl_deleg_arg {
+	const char *dda_name;
+	nvlist_t *dda_nvlist;
+} dsl_deleg_arg_t;
+
 static void
-dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_set_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
-	nvlist_t *nvp = arg2;
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	dsl_deleg_arg_t *dda = arg;
+	dsl_dir_t *dd;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
 	nvpair_t *whopair = NULL;
-	uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+	uint64_t zapobj;
+
+	VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
 
+	zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 	if (zapobj == 0) {
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
-		zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+		zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
 		    DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
 	}
 
-	while (whopair = nvlist_next_nvpair(nvp, whopair)) {
+	while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) {
 		const char *whokey = nvpair_name(whopair);
 		nvlist_t *perms;
 		nvpair_t *permpair = NULL;
 		uint64_t jumpobj;
 
-		VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
+		perms = fnvpair_value_nvlist(whopair);
 
 		if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) {
-			jumpobj = zap_create(mos, DMU_OT_DSL_PERMS,
-			    DMU_OT_NONE, 0, tx);
-			VERIFY(zap_update(mos, zapobj,
-			    whokey, 8, 1, &jumpobj, tx) == 0);
+			jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS,
+			    zapobj, whokey, tx);
 		}
 
 		while (permpair = nvlist_next_nvpair(perms, permpair)) {
@@ -183,27 +190,31 @@ dsl_deleg_set_sync(void *arg1, void *arg
 
 			VERIFY(zap_update(mos, jumpobj,
 			    perm, 8, 1, &n, tx) == 0);
-			spa_history_internal_log(LOG_DS_PERM_UPDATE,
-			    dd->dd_pool->dp_spa, tx, cr,
-			    "%s %s dataset = %llu", whokey, perm,
-			    dd->dd_phys->dd_head_dataset_obj);
+			spa_history_log_internal_dd(dd, "permission update", tx,
+			    "%s %s", whokey, perm);
 		}
 	}
+	dsl_dir_rele(dd, FTAG);
 }
 
 static void
-dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
-	nvlist_t *nvp = arg2;
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	dsl_deleg_arg_t *dda = arg;
+	dsl_dir_t *dd;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
 	nvpair_t *whopair = NULL;
-	uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+	uint64_t zapobj;
 
-	if (zapobj == 0)
+	VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
+	zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+	if (zapobj == 0) {
+		dsl_dir_rele(dd, FTAG);
 		return;
+	}
 
-	while (whopair = nvlist_next_nvpair(nvp, whopair)) {
+	while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) {
 		const char *whokey = nvpair_name(whopair);
 		nvlist_t *perms;
 		nvpair_t *permpair = NULL;
@@ -215,10 +226,8 @@ dsl_deleg_unset_sync(void *arg1, void *a
 				(void) zap_remove(mos, zapobj, whokey, tx);
 				VERIFY(0 == zap_destroy(mos, jumpobj, tx));
 			}
-			spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE,
-			    dd->dd_pool->dp_spa, tx, cr,
-			    "%s dataset = %llu", whokey,
-			    dd->dd_phys->dd_head_dataset_obj);
+			spa_history_log_internal_dd(dd, "permission who remove",
+			    tx, "%s", whokey);
 			continue;
 		}
 
@@ -236,41 +245,44 @@ dsl_deleg_unset_sync(void *arg1, void *a
 				VERIFY(0 == zap_destroy(mos,
 				    jumpobj, tx));
 			}
-			spa_history_internal_log(LOG_DS_PERM_REMOVE,
-			    dd->dd_pool->dp_spa, tx, cr,
-			    "%s %s dataset = %llu", whokey, perm,
-			    dd->dd_phys->dd_head_dataset_obj);
+			spa_history_log_internal_dd(dd, "permission remove", tx,
+			    "%s %s", whokey, perm);
 		}
 	}
+	dsl_dir_rele(dd, FTAG);
 }
 
-int
-dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
+static int
+dsl_deleg_check(void *arg, dmu_tx_t *tx)
 {
+	dsl_deleg_arg_t *dda = arg;
 	dsl_dir_t *dd;
 	int error;
-	nvpair_t *whopair = NULL;
-	int blocks_modified = 0;
 
-	error = dsl_dir_open(ddname, FTAG, &dd, NULL);
-	if (error)
-		return (error);
-
-	if (spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) <
+	if (spa_version(dmu_tx_pool(tx)->dp_spa) <
 	    SPA_VERSION_DELEGATED_PERMS) {
-		dsl_dir_close(dd, FTAG);
-		return (ENOTSUP);
+		return (SET_ERROR(ENOTSUP));
 	}
 
-	while (whopair = nvlist_next_nvpair(nvp, whopair))
-		blocks_modified++;
+	error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL);
+	if (error == 0)
+		dsl_dir_rele(dd, FTAG);
+	return (error);
+}
 
-	error = dsl_sync_task_do(dd->dd_pool, NULL,
-	    unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
-	    dd, nvp, blocks_modified);
-	dsl_dir_close(dd, FTAG);
+int
+dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
+{
+	dsl_deleg_arg_t dda;
 
-	return (error);
+	/* nvp must already have been verified to be valid */
+
+	dda.dda_name = ddname;
+	dda.dda_nvlist = nvp;
+
+	return (dsl_sync_task(ddname, dsl_deleg_check,
+	    unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
+	    &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
@@ -298,34 +310,36 @@ dsl_deleg_get(const char *ddname, nvlist
 	int error;
 	objset_t *mos;
 
-	error = dsl_dir_open(ddname, FTAG, &startdd, NULL);
-	if (error)
+	error = dsl_pool_hold(ddname, FTAG, &dp);
+	if (error != 0)
 		return (error);
 
+	error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL);
+	if (error != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (error);
+	}
+
 	dp = startdd->dd_pool;
 	mos = dp->dp_meta_objset;
 
 	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
 	for (dd = startdd; dd != NULL; dd = dd->dd_parent) {
 		zap_cursor_t basezc;
 		zap_attribute_t baseza;
 		nvlist_t *sp_nvp;
 		uint64_t n;
-		char source[MAXNAMELEN];
+		char source[ZFS_MAX_DATASET_NAME_LEN];
 
-		if (dd->dd_phys->dd_deleg_zapobj &&
-		    (zap_count(mos, dd->dd_phys->dd_deleg_zapobj,
-		    &n) == 0) && n) {
-			VERIFY(nvlist_alloc(&sp_nvp,
-			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
-		} else {
+		if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 ||
+		    zap_count(mos,
+		    dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0)
 			continue;
-		}
 
+		sp_nvp = fnvlist_alloc();
 		for (zap_cursor_init(&basezc, mos,
-		    dd->dd_phys->dd_deleg_zapobj);
+		    dsl_dir_phys(dd)->dd_deleg_zapobj);
 		    zap_cursor_retrieve(&basezc, &baseza) == 0;
 		    zap_cursor_advance(&basezc)) {
 			zap_cursor_t zc;
@@ -335,29 +349,26 @@ dsl_deleg_get(const char *ddname, nvlist
 			ASSERT(baseza.za_integer_length == 8);
 			ASSERT(baseza.za_num_integers == 1);
 
-			VERIFY(nvlist_alloc(&perms_nvp,
-			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
+			perms_nvp = fnvlist_alloc();
 			for (zap_cursor_init(&zc, mos, baseza.za_first_integer);
 			    zap_cursor_retrieve(&zc, &za) == 0;
 			    zap_cursor_advance(&zc)) {
-				VERIFY(nvlist_add_boolean(perms_nvp,
-				    za.za_name) == 0);
+				fnvlist_add_boolean(perms_nvp, za.za_name);
 			}
 			zap_cursor_fini(&zc);
-			VERIFY(nvlist_add_nvlist(sp_nvp, baseza.za_name,
-			    perms_nvp) == 0);
-			nvlist_free(perms_nvp);
+			fnvlist_add_nvlist(sp_nvp, baseza.za_name, perms_nvp);
+			fnvlist_free(perms_nvp);
 		}
 
 		zap_cursor_fini(&basezc);
 
 		dsl_dir_name(dd, source);
-		VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0);
+		fnvlist_add_nvlist(*nvp, source, sp_nvp);
 		nvlist_free(sp_nvp);
 	}
-	rw_exit(&dp->dp_config_rwlock);
 
-	dsl_dir_close(startdd, FTAG);
+	dsl_dir_rele(startdd, FTAG);
+	dsl_pool_rele(dp, FTAG);
 	return (0);
 }
 
@@ -406,7 +417,7 @@ dsl_check_access(objset_t *mos, uint64_t
 	if (error == 0) {
 		error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero);
 		if (error == ENOENT)
-			error = EPERM;
+			error = SET_ERROR(EPERM);
 	}
 	return (error);
 }
@@ -418,7 +429,7 @@ static int
 dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm,
     int checkflag, cred_t *cr)
 {
-	gid_t	*gids;
+	const	gid_t *gids;
 	int	ngids;
 	int	i;
 	uint64_t id;
@@ -451,7 +462,7 @@ dsl_check_user_access(objset_t *mos, uin
 			return (0);
 	}
 
-	return (EPERM);
+	return (SET_ERROR(EPERM));
 }
 
 /*
@@ -529,9 +540,8 @@ dsl_load_user_sets(objset_t *mos, uint64
  * Check if user has requested permission.
  */
 int
-dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
 {
-	dsl_dataset_t *ds;
 	dsl_dir_t *dd;
 	dsl_pool_t *dp;
 	void *cookie;
@@ -541,25 +551,17 @@ dsl_deleg_access(const char *dsname, con
 	avl_tree_t permsets;
 	perm_set_t *setnode;
 
-	error = dsl_dataset_hold(dsname, FTAG, &ds);
-	if (error)
-		return (error);
-
 	dp = ds->ds_dir->dd_pool;
 	mos = dp->dp_meta_objset;
 
-	if (dsl_delegation_on(mos) == B_FALSE) {
-		dsl_dataset_rele(ds, FTAG);
-		return (ECANCELED);
-	}
+	if (dsl_delegation_on(mos) == B_FALSE)
+		return (SET_ERROR(ECANCELED));
 
 	if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) <
-	    SPA_VERSION_DELEGATED_PERMS) {
-		dsl_dataset_rele(ds, FTAG);
-		return (EPERM);
-	}
+	    SPA_VERSION_DELEGATED_PERMS)
+		return (SET_ERROR(EPERM));
 
-	if (dsl_dataset_is_snapshot(ds)) {
+	if (ds->ds_is_snapshot) {
 		/*
 		 * Snapshots are treated as descendents only,
 		 * local permissions do not apply.
@@ -572,7 +574,7 @@ dsl_deleg_access(const char *dsname, con
 	avl_create(&permsets, perm_set_compare, sizeof (perm_set_t),
 	    offsetof(perm_set_t, p_node));
 
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	ASSERT(dsl_pool_config_held(dp));
 	for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent,
 	    checkflag = ZFS_DELEG_DESCENDENT) {
 		uint64_t zapobj;
@@ -582,7 +584,7 @@ dsl_deleg_access(const char *dsname, con
 		 * If not in global zone then make sure
 		 * the zoned property is set
 		 */
-		if (!INGLOBALZONE(curproc)) {
+		if (!INGLOBALZONE(curthread)) {
 			uint64_t zoned;
 
 			if (dsl_prop_get_dd(dd,
@@ -592,7 +594,7 @@ dsl_deleg_access(const char *dsname, con
 			if (!zoned)
 				break;
 		}
-		zapobj = dd->dd_phys->dd_deleg_zapobj;
+		zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 
 		if (zapobj == 0)
 			continue;
@@ -631,10 +633,8 @@ again:
 		if (error == 0)
 			goto success;
 	}
-	error = EPERM;
+	error = SET_ERROR(EPERM);
 success:
-	rw_exit(&dp->dp_config_rwlock);
-	dsl_dataset_rele(ds, FTAG);
 
 	cookie = NULL;
 	while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
@@ -643,6 +643,26 @@ success:
 	return (error);
 }
 
+int
+dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
+	int error;
+
+	error = dsl_pool_hold(dsname, FTAG, &dp);
+	if (error != 0)
+		return (error);
+	error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+	if (error == 0) {
+		error = dsl_deleg_access_impl(ds, perm, cr);
+		dsl_dataset_rele(ds, FTAG);
+	}
+	dsl_pool_rele(dp, FTAG);
+
+	return (error);
+}
+
 /*
  * Other routines.
  */
@@ -653,7 +673,7 @@ copy_create_perms(dsl_dir_t *dd, uint64_
 {
 	objset_t *mos = dd->dd_pool->dp_meta_objset;
 	uint64_t jumpobj, pjumpobj;
-	uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+	uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	char whokey[ZFS_MAX_DELEG_NAME];
@@ -666,7 +686,7 @@ copy_create_perms(dsl_dir_t *dd, uint64_
 
 	if (zapobj == 0) {
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
-		zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+		zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
 		    DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
 	}
 
@@ -704,7 +724,7 @@ dsl_deleg_set_create_perms(dsl_dir_t *sd
 		return;
 
 	for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) {
-		uint64_t pzapobj = dd->dd_phys->dd_deleg_zapobj;
+		uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
 
 		if (pzapobj == 0)
 			continue;
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_destroy.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_destroy.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_destroy.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_destroy.c	10 Oct 2016 11:09:56 -0000
@@ -0,0 +1,992 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
+
+typedef struct dmu_snapshots_destroy_arg {
+	nvlist_t *dsda_snaps;
+	nvlist_t *dsda_successful_snaps;
+	boolean_t dsda_defer;
+	nvlist_t *dsda_errlist;
+} dmu_snapshots_destroy_arg_t;
+
+int
+dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
+{
+	if (!ds->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
+
+	if (dsl_dataset_long_held(ds))
+		return (SET_ERROR(EBUSY));
+
+	/*
+	 * Only allow deferred destroy on pools that support it.
+	 * NOTE: deferred destroy is only supported on snapshots.
+	 */
+	if (defer) {
+		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+		    SPA_VERSION_USERREFS)
+			return (SET_ERROR(ENOTSUP));
+		return (0);
+	}
+
+	/*
+	 * If this snapshot has an elevated user reference count,
+	 * we can't destroy it yet.
+	 */
+	if (ds->ds_userrefs > 0)
+		return (SET_ERROR(EBUSY));
+
+	/*
+	 * Can't delete a branch point.
+	 */
+	if (dsl_dataset_phys(ds)->ds_num_children > 1)
+		return (SET_ERROR(EEXIST));
+
+	return (0);
+}
+
+static int
+dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+	dmu_snapshots_destroy_arg_t *dsda = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvpair_t *pair;
+	int error = 0;
+
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	for (pair = nvlist_next_nvpair(dsda->dsda_snaps, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_snaps, pair)) {
+		dsl_dataset_t *ds;
+
+		error = dsl_dataset_hold(dp, nvpair_name(pair),
+		    FTAG, &ds);
+
+		/*
+		 * If the snapshot does not exist, silently ignore it
+		 * (it's "already destroyed").
+		 */
+		if (error == ENOENT)
+			continue;
+
+		if (error == 0) {
+			error = dsl_destroy_snapshot_check_impl(ds,
+			    dsda->dsda_defer);
+			dsl_dataset_rele(ds, FTAG);
+		}
+
+		if (error == 0) {
+			fnvlist_add_boolean(dsda->dsda_successful_snaps,
+			    nvpair_name(pair));
+		} else {
+			fnvlist_add_int32(dsda->dsda_errlist,
+			    nvpair_name(pair), error);
+		}
+	}
+
+	pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL);
+	if (pair != NULL)
+		return (fnvpair_value_int32(pair));
+
+	return (0);
+}
+
+struct process_old_arg {
+	dsl_dataset_t *ds;
+	dsl_dataset_t *ds_prev;
+	boolean_t after_branch_point;
+	zio_t *pio;
+	uint64_t used, comp, uncomp;
+};
+
+static int
+process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	struct process_old_arg *poa = arg;
+	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
+		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
+		if (poa->ds_prev && !poa->after_branch_point &&
+		    bp->blk_birth >
+		    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
+			dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
+			    bp_get_dsize_sync(dp->dp_spa, bp);
+		}
+	} else {
+		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
+		poa->comp += BP_GET_PSIZE(bp);
+		poa->uncomp += BP_GET_UCSIZE(bp);
+		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
+	}
+	return (0);
+}
+
+static void
+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
+    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
+{
+	struct process_old_arg poa = { 0 };
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t deadlist_obj;
+
+	ASSERT(ds->ds_deadlist.dl_oldfmt);
+	ASSERT(ds_next->ds_deadlist.dl_oldfmt);
+
+	poa.ds = ds;
+	poa.ds_prev = ds_prev;
+	poa.after_branch_point = after_branch_point;
+	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
+	    process_old_cb, &poa, tx));
+	VERIFY0(zio_wait(poa.pio));
+	ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
+
+	/* change snapused */
+	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+	    -poa.used, -poa.comp, -poa.uncomp, tx);
+
+	/* swap next's deadlist to our deadlist */
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_close(&ds_next->ds_deadlist);
+	deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+	dsl_dataset_phys(ds)->ds_deadlist_obj =
+	    dsl_dataset_phys(ds_next)->ds_deadlist_obj;
+	dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
+	dsl_deadlist_open(&ds->ds_deadlist, mos,
+	    dsl_dataset_phys(ds)->ds_deadlist_obj);
+	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
+	    dsl_dataset_phys(ds_next)->ds_deadlist_obj);
+}
+
+static void
+dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	/*
+	 * If it is the old version, dd_clones doesn't exist so we can't
+	 * find the clones, but dsl_deadlist_remove_key() is a no-op so it
+	 * doesn't matter.
+	 */
+	if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0)
+		return;
+
+	for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		dsl_dataset_t *clone;
+
+		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+		    za.za_first_integer, FTAG, &clone));
+		if (clone->ds_dir->dd_origin_txg > mintxg) {
+			dsl_deadlist_remove_key(&clone->ds_deadlist,
+			    mintxg, tx);
+			dsl_dataset_remove_clones_key(clone, mintxg, tx);
+		}
+		dsl_dataset_rele(clone, FTAG);
+	}
+	zap_cursor_fini(&zc);
+}
+
+void
+dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
+{
+	int err;
+	int after_branch_point = FALSE;
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	dsl_dataset_t *ds_prev = NULL;
+	uint64_t obj;
+
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
+	ASSERT(refcount_is_zero(&ds->ds_longholds));
+
+	if (defer &&
+	    (ds->ds_userrefs > 0 ||
+	    dsl_dataset_phys(ds)->ds_num_children > 1)) {
+		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
+		spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
+		return;
+	}
+
+	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+
+	/* We need to log before removing it from the namespace. */
+	spa_history_log_internal_ds(ds, "destroy", tx, "");
+
+	dsl_scan_ds_destroyed(ds, tx);
+
+	obj = ds->ds_object;
+
+	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+		if (ds->ds_feature_inuse[f]) {
+			dsl_dataset_deactivate_feature(obj, f, tx);
+			ds->ds_feature_inuse[f] = B_FALSE;
+		}
+	}
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+		ASSERT3P(ds->ds_prev, ==, NULL);
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
+		after_branch_point =
+		    (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
+
+		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+		if (after_branch_point &&
+		    dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
+			dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
+			if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
+				VERIFY0(zap_add_int(mos,
+				    dsl_dataset_phys(ds_prev)->
+				    ds_next_clones_obj,
+				    dsl_dataset_phys(ds)->ds_next_snap_obj,
+				    tx));
+			}
+		}
+		if (!after_branch_point) {
+			dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
+			    dsl_dataset_phys(ds)->ds_next_snap_obj;
+		}
+	}
+
+	dsl_dataset_t *ds_next;
+	uint64_t old_unique;
+	uint64_t used = 0, comp = 0, uncomp = 0;
+
+	VERIFY0(dsl_dataset_hold_obj(dp,
+	    dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
+	ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
+
+	old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
+
+	dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
+	dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
+	    dsl_dataset_phys(ds)->ds_prev_snap_obj;
+	dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg;
+	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+	    ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
+
+	if (ds_next->ds_deadlist.dl_oldfmt) {
+		process_old_deadlist(ds, ds_prev, ds_next,
+		    after_branch_point, tx);
+	} else {
+		/* Adjust prev's unique space. */
+		if (ds_prev && !after_branch_point) {
+			dsl_deadlist_space_range(&ds_next->ds_deadlist,
+			    dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
+			    dsl_dataset_phys(ds)->ds_prev_snap_txg,
+			    &used, &comp, &uncomp);
+			dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
+		}
+
+		/* Adjust snapused. */
+		dsl_deadlist_space_range(&ds_next->ds_deadlist,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
+		    &used, &comp, &uncomp);
+		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+		    -used, -comp, -uncomp, tx);
+
+		/* Move blocks to be freed to pool's free list. */
+		dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
+		    &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
+		    tx);
+		dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
+		    DD_USED_HEAD, used, comp, uncomp, tx);
+
+		/* Merge our deadlist into next's and free it. */
+		dsl_deadlist_merge(&ds_next->ds_deadlist,
+		    dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+	}
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
+
+	/* Collapse range in clone heads */
+	dsl_dataset_remove_clones_key(ds,
+	    dsl_dataset_phys(ds)->ds_creation_txg, tx);
+
+	if (ds_next->ds_is_snapshot) {
+		dsl_dataset_t *ds_nextnext;
+
+		/*
+		 * Update next's unique to include blocks which
+		 * were previously shared by only this snapshot
+		 * and it.  Those blocks will be born after the
+		 * prev snap and before this snap, and will have
+		 * died after the next snap and before the one
+		 * died after the next snap and before the one
+		 * deadlist).
+		 */
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds_next)->ds_next_snap_obj,
+		    FTAG, &ds_nextnext));
+		dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
+		    dsl_dataset_phys(ds)->ds_creation_txg,
+		    &used, &comp, &uncomp);
+		dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
+		dsl_dataset_rele(ds_nextnext, FTAG);
+		ASSERT3P(ds_next->ds_prev, ==, NULL);
+
+		/* Collapse range in this head. */
+		dsl_dataset_t *hds;
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
+		dsl_deadlist_remove_key(&hds->ds_deadlist,
+		    dsl_dataset_phys(ds)->ds_creation_txg, tx);
+		dsl_dataset_rele(hds, FTAG);
+
+	} else {
+		ASSERT3P(ds_next->ds_prev, ==, ds);
+		dsl_dataset_rele(ds_next->ds_prev, ds_next);
+		ds_next->ds_prev = NULL;
+		if (ds_prev) {
+			VERIFY0(dsl_dataset_hold_obj(dp,
+			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
+			    ds_next, &ds_next->ds_prev));
+		}
+
+		dsl_dataset_recalc_head_uniq(ds_next);
+
+		/*
+		 * Reduce the amount of our unconsumed refreservation
+		 * being charged to our parent by the amount of
+		 * new unique data we have gained.
+		 */
+		if (old_unique < ds_next->ds_reserved) {
+			int64_t mrsdelta;
+			uint64_t new_unique =
+			    dsl_dataset_phys(ds_next)->ds_unique_bytes;
+
+			ASSERT(old_unique <= new_unique);
+			mrsdelta = MIN(new_unique - old_unique,
+			    ds_next->ds_reserved - old_unique);
+			dsl_dir_diduse_space(ds->ds_dir,
+			    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
+		}
+	}
+	dsl_dataset_rele(ds_next, FTAG);
+
+	/*
+	 * This must be done after the dsl_traverse(), because it will
+	 * re-open the objset.
+	 */
+	if (ds->ds_objset) {
+		dmu_objset_evict(ds->ds_objset);
+		ds->ds_objset = NULL;
+	}
+
+	/* remove from snapshot namespace */
+	dsl_dataset_t *ds_head;
+	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
+	VERIFY0(dsl_dataset_hold_obj(dp,
+	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
+	VERIFY0(dsl_dataset_get_snapname(ds));
+#ifdef ZFS_DEBUG
+	{
+		uint64_t val;
+
+		err = dsl_dataset_snap_lookup(ds_head,
+		    ds->ds_snapname, &val);
+		ASSERT0(err);
+		ASSERT3U(val, ==, obj);
+	}
+#endif
+	VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
+	dsl_dataset_rele(ds_head, FTAG);
+
+	if (ds_prev != NULL)
+		dsl_dataset_rele(ds_prev, FTAG);
+
+	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+		uint64_t count;
+		ASSERT0(zap_count(mos,
+		    dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
+		    count == 0);
+		VERIFY0(dmu_object_free(mos,
+		    dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
+	}
+	if (dsl_dataset_phys(ds)->ds_props_obj != 0)
+		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
+		    tx));
+	if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
+		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+		    tx));
+	dsl_dir_rele(ds->ds_dir, ds);
+	ds->ds_dir = NULL;
+	dmu_object_free_zapified(mos, obj, tx);
+}
+
+static void
+dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+	dmu_snapshots_destroy_arg_t *dsda = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvpair_t *pair;
+
+	for (pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, NULL);
+	    pair != NULL;
+	    pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, pair)) {
+		dsl_dataset_t *ds;
+
+		VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
+
+		dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx);
+		dsl_dataset_rele(ds, FTAG);
+	}
+}
+
+/*
+ * The semantics of this function are described in the comment above
+ * lzc_destroy_snaps().  To summarize:
+ *
+ * The snapshots must all be in the same pool.
+ *
+ * Snapshots that don't exist will be silently ignored (considered to be
+ * "already deleted").
+ *
+ * On success, all snaps will be destroyed and this will return 0.
+ * On failure, no snaps will be destroyed, the errlist will be filled in,
+ * and this will return an errno.
+ */
+int
+dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
+    nvlist_t *errlist)
+{
+	dmu_snapshots_destroy_arg_t dsda;
+	int error;
+	nvpair_t *pair;
+
+	pair = nvlist_next_nvpair(snaps, NULL);
+	if (pair == NULL)
+		return (0);
+
+	dsda.dsda_snaps = snaps;
+	dsda.dsda_successful_snaps = fnvlist_alloc();
+	dsda.dsda_defer = defer;
+	dsda.dsda_errlist = errlist;
+
+	error = dsl_sync_task(nvpair_name(pair),
+	    dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync,
+	    &dsda, 0, ZFS_SPACE_CHECK_NONE);
+	fnvlist_free(dsda.dsda_successful_snaps);
+
+	return (error);
+}
+
+int
+dsl_destroy_snapshot(const char *name, boolean_t defer)
+{
+	int error;
+	nvlist_t *nvl = fnvlist_alloc();
+	nvlist_t *errlist = fnvlist_alloc();
+
+	fnvlist_add_boolean(nvl, name);
+	error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
+	fnvlist_free(errlist);
+	fnvlist_free(nvl);
+	return (error);
+}
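The comment above refers to lzc_destroy_snaps(); a minimal userland caller might look like the following sketch ("tank/fs@snap1" is a placeholder name; link with -lzfs_core -lnvpair):

#include <libzfs_core.h>
#include <libnvpair.h>
#include <stdio.h>

int
main(void)
{
	nvlist_t *snaps, *errlist = NULL;
	nvpair_t *pair;
	int error;

	if (libzfs_core_init() != 0)
		return (1);
	snaps = fnvlist_alloc();
	fnvlist_add_boolean(snaps, "tank/fs@snap1");	/* placeholder */
	error = lzc_destroy_snaps(snaps, B_FALSE, &errlist);
	if (error != 0 && errlist != NULL) {
		/* One errno per snapshot that could not be destroyed. */
		for (pair = nvlist_next_nvpair(errlist, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(errlist, pair))
			fprintf(stderr, "%s: error %d\n", nvpair_name(pair),
			    fnvpair_value_int32(pair));
	}
	fnvlist_free(snaps);
	nvlist_free(errlist);	/* nvlist_free(NULL) is a no-op */
	libzfs_core_fini();
	return (error != 0);
}

On success every requested snapshot is gone; on failure none are, mirroring the all-or-nothing contract documented above.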
+
+struct killarg {
+	dsl_dataset_t *ds;
+	dmu_tx_t *tx;
+};
+
+/* ARGSUSED */
+static int
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+	struct killarg *ka = arg;
+	dmu_tx_t *tx = ka->tx;
+
+	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+		return (0);
+
+	if (zb->zb_level == ZB_ZIL_LEVEL) {
+		ASSERT(zilog != NULL);
+		/*
+		 * It's a block in the intent log.  It has no
+		 * accounting, so just free it.
+		 */
+		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
+	} else {
+		ASSERT(zilog == NULL);
+		ASSERT3U(bp->blk_birth, >,
+		    dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
+		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
+	}
+
+	return (0);
+}
+
+static void
+old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	struct killarg ka;
+
+	/*
+	 * Free everything that we point to (that's born after
+	 * the previous snapshot, if we are a clone)
+	 *
+	 * NB: this should be very quick, because we already
+	 * freed all the objects in open context.
+	 */
+	ka.ds = ds;
+	ka.tx = tx;
+	VERIFY0(traverse_dataset(ds,
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
+	    kill_blkptr, &ka));
+	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+	    dsl_dataset_phys(ds)->ds_unique_bytes == 0);
+}
+
+typedef struct dsl_destroy_head_arg {
+	const char *ddha_name;
+} dsl_destroy_head_arg_t;
+
+int
+dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
+{
+	int error;
+	uint64_t count;
+	objset_t *mos;
+
+	ASSERT(!ds->ds_is_snapshot);
+	if (ds->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
+
+	if (refcount_count(&ds->ds_longholds) != expected_holds)
+		return (SET_ERROR(EBUSY));
+
+	mos = ds->ds_dir->dd_pool->dp_meta_objset;
+
+	/*
+	 * Can't delete a head dataset if there are snapshots of it.
+	 * (Except if the only snapshots are from the branch we cloned
+	 * from.)
+	 */
+	if (ds->ds_prev != NULL &&
+	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
+		return (SET_ERROR(EBUSY));
+
+	/*
+	 * Can't delete if there are children of this fs.
+	 */
+	error = zap_count(mos,
+	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
+	if (error != 0)
+		return (error);
+	if (count != 0)
+		return (SET_ERROR(EEXIST));
+
+	if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
+	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
+	    ds->ds_prev->ds_userrefs == 0) {
+		/* We need to remove the origin snapshot as well. */
+		if (!refcount_is_zero(&ds->ds_prev->ds_longholds))
+			return (SET_ERROR(EBUSY));
+	}
+	return (0);
+}
+
+static int
+dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_destroy_head_arg_t *ddha = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	int error;
+
+	error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	error = dsl_destroy_head_check_impl(ds, 0);
+	dsl_dataset_rele(ds, FTAG);
+	return (error);
+}
+
+static void
+dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
+{
+	dsl_dir_t *dd;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
+	dd_used_t t;
+
+	ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));
+
+	VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
+
+	ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
+
+	/*
+	 * Decrement the filesystem count for all parent filesystems.
+	 *
+	 * When we receive an incremental stream into a filesystem that already
+	 * exists, a temporary clone is created.  We never count this temporary
+	 * clone, whose name begins with a '%'.
+	 */
+	if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
+		dsl_fs_ss_count_adjust(dd->dd_parent, -1,
+		    DD_FIELD_FILESYSTEM_COUNT, tx);
+
+	/*
+	 * Remove our reservation. The impl() routine avoids setting the
+	 * actual property, which would require the (already destroyed) ds.
+	 */
+	dsl_dir_set_reservation_sync_impl(dd, 0, tx);
+
+	ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
+	ASSERT0(dsl_dir_phys(dd)->dd_reserved);
+	for (t = 0; t < DD_USED_NUM; t++)
+		ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
+
+	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
+	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
+	VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
+	VERIFY0(zap_remove(mos,
+	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
+	    dd->dd_myname, tx));
+
+	dsl_dir_rele(dd, FTAG);
+	dmu_object_free_zapified(mos, ddobj, tx);
+}
+
+void
+dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t obj, ddobj, prevobj = 0;
+	boolean_t rmorigin;
+
+	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+	ASSERT(ds->ds_prev == NULL ||
+	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+	/* We need to log before removing it from the namespace. */
+	spa_history_log_internal_ds(ds, "destroy", tx, "");
+
+	rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
+	    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
+	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
+	    ds->ds_prev->ds_userrefs == 0);
+
+	/* Remove our reservation. */
+	if (ds->ds_reserved != 0) {
+		dsl_dataset_set_refreservation_sync_impl(ds,
+		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+		    0, tx);
+		ASSERT0(ds->ds_reserved);
+	}
+
+	obj = ds->ds_object;
+
+	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+		if (ds->ds_feature_inuse[f]) {
+			dsl_dataset_deactivate_feature(obj, f, tx);
+			ds->ds_feature_inuse[f] = B_FALSE;
+		}
+	}
+
+	dsl_scan_ds_destroyed(ds, tx);
+
+	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+		/* This is a clone */
+		ASSERT(ds->ds_prev != NULL);
+		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
+		    obj);
+		ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
+
+		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+		if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
+			dsl_dataset_remove_from_next_clones(ds->ds_prev,
+			    obj, tx);
+		}
+
+		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
+		dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
+	}
+
+	/*
+	 * Destroy the deadlist.  Unless it's a clone, the
+	 * deadlist should be empty.  (If it's a clone, it's
+	 * safe to ignore the deadlist contents.)
+	 */
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
+
+	objset_t *os;
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+
+	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
+		old_synchronous_dataset_destroy(ds, tx);
+	} else {
+		/*
+		 * Move the bptree into the pool's list of trees to
+		 * clean up and update space accounting information.
+		 */
+		uint64_t used, comp, uncomp;
+
+		zil_destroy_sync(dmu_objset_zil(os), tx);
+
+		if (!spa_feature_is_active(dp->dp_spa,
+		    SPA_FEATURE_ASYNC_DESTROY)) {
+			dsl_scan_t *scn = dp->dp_scan;
+			spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
+			    tx);
+			dp->dp_bptree_obj = bptree_alloc(mos, tx);
+			VERIFY0(zap_add(mos,
+			    DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+			    &dp->dp_bptree_obj, tx));
+			ASSERT(!scn->scn_async_destroying);
+			scn->scn_async_destroying = B_TRUE;
+		}
+
+		used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
+		comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
+		uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
+
+		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+		    dsl_dataset_phys(ds)->ds_unique_bytes == used);
+
+		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+		bptree_add(mos, dp->dp_bptree_obj,
+		    &dsl_dataset_phys(ds)->ds_bp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
+		    used, comp, uncomp, tx);
+		rrw_exit(&ds->ds_bp_rwlock, FTAG);
+		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+		    -used, -comp, -uncomp, tx);
+		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+		    used, comp, uncomp, tx);
+	}
+
+	if (ds->ds_prev != NULL) {
+		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+			VERIFY0(zap_remove_int(mos,
+			    dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
+			    ds->ds_object, tx));
+		}
+		prevobj = ds->ds_prev->ds_object;
+		dsl_dataset_rele(ds->ds_prev, ds);
+		ds->ds_prev = NULL;
+	}
+
+	/*
+	 * This must be done after the dsl_traverse(), because it will
+	 * re-open the objset.
+	 */
+	if (ds->ds_objset) {
+		dmu_objset_evict(ds->ds_objset);
+		ds->ds_objset = NULL;
+	}
+
+	/* Erase the link in the dir */
+	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+	dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
+	ddobj = ds->ds_dir->dd_object;
+	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
+	VERIFY0(zap_destroy(mos,
+	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
+
+	if (ds->ds_bookmarks != 0) {
+		VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
+		spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+	}
+
+	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+	ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
+	ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
+	ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
+	dsl_dir_rele(ds->ds_dir, ds);
+	ds->ds_dir = NULL;
+	dmu_object_free_zapified(mos, obj, tx);
+
+	dsl_dir_destroy_sync(ddobj, tx);
+
+	if (rmorigin) {
+		dsl_dataset_t *prev;
+		VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
+		dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
+		dsl_dataset_rele(prev, FTAG);
+	}
+}
+
+static void
+dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_destroy_head_arg_t *ddha = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+
+	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+	dsl_destroy_head_sync_impl(ds, tx);
+	dsl_dataset_rele(ds, FTAG);
+}
+
+static void
+dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_destroy_head_arg_t *ddha = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+
+	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+
+	/* Mark it as inconsistent on-disk, in case we crash */
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+	spa_history_log_internal_ds(ds, "destroy begin", tx, "");
+	dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_destroy_head(const char *name)
+{
+	dsl_destroy_head_arg_t ddha;
+	int error;
+	spa_t *spa;
+	boolean_t isenabled;
+
+#ifdef _KERNEL
+	zfs_destroy_unmount_origin(name);
+#endif
+
+	error = spa_open(name, &spa, FTAG);
+	if (error != 0)
+		return (error);
+	isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
+	spa_close(spa, FTAG);
+
+	ddha.ddha_name = name;
+
+	if (!isenabled) {
+		objset_t *os;
+
+		error = dsl_sync_task(name, dsl_destroy_head_check,
+		    dsl_destroy_head_begin_sync, &ddha,
+		    0, ZFS_SPACE_CHECK_NONE);
+		if (error != 0)
+			return (error);
+
+		/*
+		 * Head deletion is processed in one txg on old pools;
+		 * remove the objects from open context so that the txg sync
+		 * is not too long.
+		 */
+		error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
+		if (error == 0) {
+			uint64_t prev_snap_txg =
+			    dsl_dataset_phys(dmu_objset_ds(os))->
+			    ds_prev_snap_txg;
+			for (uint64_t obj = 0; error == 0;
+			    error = dmu_object_next(os, &obj, FALSE,
+			    prev_snap_txg))
+				(void) dmu_free_long_object(os, obj);
+			/* sync out all frees */
+			txg_wait_synced(dmu_objset_pool(os), 0);
+			dmu_objset_disown(os, FTAG);
+		}
+	}
+
+	return (dsl_sync_task(name, dsl_destroy_head_check,
+	    dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
+}
+
+/*
+ * Note, this function is used as the callback for dmu_objset_find().  We
+ * always return 0 so that we will continue to find and process
+ * inconsistent datasets, even if we encounter an error trying to
+ * process one of them.
+ */
+/* ARGSUSED */
+int
+dsl_destroy_inconsistent(const char *dsname, void *arg)
+{
+	objset_t *os;
+
+	if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
+		boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+
+		/*
+		 * If the dataset is inconsistent because a resumable receive
+		 * has failed, then do not destroy it.
+		 */
+		if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
+			need_destroy = B_FALSE;
+
+		dmu_objset_rele(os, FTAG);
+		if (need_destroy)
+			(void) dsl_destroy_head(dsname);
+	}
+	return (0);
+}
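A side note on dsl_destroy_inconsistent(): callbacks passed to dmu_objset_find() that must visit every dataset follow an always-return-0 convention, recording per-item failures instead of aborting the walk. A standalone model of the convention (names are illustrative):

#include <stdio.h>
#include <string.h>

/* Swallow per-item failures so one bad item cannot stop the sweep. */
static int
cleanup_cb(const char *name, void *arg)
{
	int *failures = arg;

	if (strchr(name, 'x') != NULL)	/* pretend these fail */
		(*failures)++;		/* record, but keep iterating */
	return (0);			/* never abort the walk */
}

int
main(void)
{
	const char *items[] = { "a", "bx", "c" };
	int failures = 0;
	int i;

	for (i = 0; i < 3; i++)
		(void) cleanup_cb(items[i], &failures);
	printf("%d item(s) failed but the sweep completed\n", failures);
	return (0);
}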
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dsl_dir.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c	27 Feb 2010 22:30:56 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c	17 Apr 2017 23:36:24 -0000
@@ -19,8 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -31,27 +36,111 @@
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
 #include <sys/spa.h>
 #include <sys/metaslab.h>
 #include <sys/zap.h>
 #include <sys/zio.h>
 #include <sys/arc.h>
 #include <sys/sunddi.h>
+#include <sys/zvol.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+#include <sys/zfeature.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
 #include "zfs_namecheck.h"
+#include "zfs_prop.h"
 
-static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
-static void dsl_dir_set_reservation_sync(void *arg1, void *arg2,
-    cred_t *cr, dmu_tx_t *tx);
+/*
+ * Filesystem and Snapshot Limits
+ * ------------------------------
+ *
+ * These limits are used to restrict the number of filesystems and/or snapshots
+ * that can be created at a given level in the tree or below. A typical
+ * use-case is with a delegated dataset where the administrator wants to ensure
+ * that a user within the zone is not creating too many additional filesystems
+ * or snapshots, even though they're not exceeding their space quota.
+ *
+ * The filesystem and snapshot counts are stored as extensible properties. This
+ * capability is controlled by a feature flag and must be enabled to be used.
+ * Once enabled, the feature is not active until the first limit is set. At
+ * that point, future operations to create/destroy filesystems or snapshots
+ * will validate and update the counts.
+ *
+ * Because the count properties will not exist before the feature is active,
+ * the counts are updated when a limit is first set on an uninitialized
+ * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
+ * all of the nested filesystems/snapshots. Thus, a new leaf node has a
+ * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
+ * snapshot count properties on a node indicate uninitialized counts on that
+ * node.) When first setting a limit on an uninitialized node, the code starts
+ * at the filesystem with the new limit and descends into all sub-filesystems
+ * to add the count properties.
+ *
+ * In practice this is lightweight since a limit is typically set when the
+ * filesystem is created and thus has no children. Once valid, changing the
+ * limit value won't require a re-traversal since the counts are already valid.
+ * When recursively fixing the counts, if a node with a limit is encountered
+ * during the descent, the counts are known to be valid and there is no need to
+ * descend into that filesystem's children. The counts on filesystems above the
+ * one with the new limit will still be uninitialized, unless a limit is
+ * eventually set on one of those filesystems. The counts are always recursively
+ * updated when a limit is set on a dataset, unless there is already a limit.
+ * When a new limit value is set on a filesystem with an existing limit, it is
+ * possible for the new limit to be less than the current count at that level
+ * since a user who can change the limit is also allowed to exceed the limit.
+ *
+ * Once the feature is active, then whenever a filesystem or snapshot is
+ * created, the code recurses up the tree, validating the new count against the
+ * limit at each initialized level. In practice, most levels will not have a
+ * limit set. If there is a limit at any initialized level up the tree, the
+ * check must pass or the creation will fail. Likewise, when a filesystem or
+ * snapshot is destroyed, the counts are recursively adjusted all the way up
+ * the initialized nodes in the tree. Renaming a filesystem to a different point
+ * in the tree will first validate, then update the counts on each branch up to
+ * the common ancestor. A receive will also validate the counts and then update
+ * them.
+ *
+ * An exception to the above behavior is that the limit is not enforced if the
+ * user has permission to modify the limit. This is primarily so that
+ * recursive snapshots in the global zone always work. We want to prevent a
+ * denial-of-service in which a lower level delegated dataset could max out its
+ * limit and thus block recursive snapshots from being taken in the global zone.
+ * Because of this, it is possible for the snapshot count to be over the limit
+ * and snapshots taken in the global zone could cause a lower level dataset to
+ * hit or exceed its limit. The administrator taking the global zone recursive
+ * snapshot should be aware of this side-effect and behave accordingly.
+ * For consistency, the filesystem limit is also not enforced if the user can
+ * modify the limit.
+ *
+ * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
+ * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
+ * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
+ * dsl_dir_init_fs_ss_count().
+ *
+ * There is a special case when we receive a filesystem that already exists. In
+ * this case a temporary clone name of %X is created (see dmu_recv_begin). We
+ * never update the filesystem counts for temporary clones.
+ *
+ * Likewise, we do not update the snapshot counts for temporary snapshots,
+ * such as those created by zfs diff.
+ */
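As a rough illustration of the validation described above -- recurse from a node toward the root, stop early at an uninitialized level, fail with EDQUOT when an initialized level would be exceeded -- here is a toy userland model (one count/limit pair per node, no ZFS types; NO_LIMIT stands in for an uninitialized count):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define	NO_LIMIT	UINT64_MAX

typedef struct dir {
	struct dir *parent;
	uint64_t fs_count;	/* filesystems at or below this node */
	uint64_t fs_limit;	/* NO_LIMIT if uninitialized */
} dir_t;

static int
limit_check(dir_t *d, uint64_t delta)
{
	for (; d != NULL; d = d->parent) {
		if (d->fs_limit == NO_LIMIT)
			return (0);	/* no limit here or above */
		if (d->fs_count + delta > d->fs_limit)
			return (EDQUOT);
	}
	return (0);
}

int
main(void)
{
	dir_t root = { NULL, 5, NO_LIMIT };
	dir_t child = { &root, 3, 3 };

	/* Creating one more filesystem under child exceeds its limit. */
	printf("%s\n", limit_check(&child, 1) == EDQUOT ? "EDQUOT" : "ok");
	return (0);
}

The real check additionally skips enforcement entirely when the caller is allowed to change the limit (ENFORCE_NEVER), as implemented in dsl_fs_ss_limit_check() below.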
 
+extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
+
+static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
 
-/* ARGSUSED */
 static void
-dsl_dir_evict(dmu_buf_t *db, void *arg)
+dsl_dir_evict_async(void *dbu)
 {
-	dsl_dir_t *dd = arg;
+	dsl_dir_t *dd = dbu;
 	dsl_pool_t *dp = dd->dd_pool;
 	int t;
 
+	dd->dd_dbuf = NULL;
+
 	for (t = 0; t < TXG_SIZE; t++) {
 		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
 		ASSERT(dd->dd_tempreserved[t] == 0);
@@ -59,39 +148,34 @@ dsl_dir_evict(dmu_buf_t *db, void *arg)
 	}
 
 	if (dd->dd_parent)
-		dsl_dir_close(dd->dd_parent, dd);
+		dsl_dir_async_rele(dd->dd_parent, dd);
 
-	spa_close(dd->dd_pool->dp_spa, dd);
+	spa_async_close(dd->dd_pool->dp_spa, dd);
 
-	/*
-	 * The props callback list should be empty since they hold the
-	 * dir open.
-	 */
-	list_destroy(&dd->dd_prop_cbs);
+	dsl_prop_fini(dd);
 	mutex_destroy(&dd->dd_lock);
 	kmem_free(dd, sizeof (dsl_dir_t));
 }
 
 int
-dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
     const char *tail, void *tag, dsl_dir_t **ddp)
 {
 	dmu_buf_t *dbuf;
 	dsl_dir_t *dd;
 	int err;
 
-	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
-	    dsl_pool_sync_context(dp));
+	ASSERT(dsl_pool_config_held(dp));
 
 	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
-	if (err)
+	if (err != 0)
 		return (err);
 	dd = dmu_buf_get_user(dbuf);
 #ifdef ZFS_DEBUG
 	{
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(dbuf, &doi);
-		ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
+		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
 		ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
 	}
 #endif
@@ -102,45 +186,67 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_
 		dd->dd_object = ddobj;
 		dd->dd_dbuf = dbuf;
 		dd->dd_pool = dp;
-		dd->dd_phys = dbuf->db_data;
 		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
-
-		list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
-		    offsetof(dsl_prop_cb_record_t, cbr_node));
+		dsl_prop_init(dd);
 
 		dsl_dir_snap_cmtime_update(dd);
 
-		if (dd->dd_phys->dd_parent_obj) {
-			err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
-			    NULL, dd, &dd->dd_parent);
-			if (err)
+		if (dsl_dir_phys(dd)->dd_parent_obj) {
+			err = dsl_dir_hold_obj(dp,
+			    dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
+			    &dd->dd_parent);
+			if (err != 0)
 				goto errout;
 			if (tail) {
 #ifdef ZFS_DEBUG
 				uint64_t foundobj;
 
 				err = zap_lookup(dp->dp_meta_objset,
-				    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
-				    tail, sizeof (foundobj), 1, &foundobj);
+				    dsl_dir_phys(dd->dd_parent)->
+				    dd_child_dir_zapobj, tail,
+				    sizeof (foundobj), 1, &foundobj);
 				ASSERT(err || foundobj == ddobj);
 #endif
 				(void) strcpy(dd->dd_myname, tail);
 			} else {
 				err = zap_value_search(dp->dp_meta_objset,
-				    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+				    dsl_dir_phys(dd->dd_parent)->
+				    dd_child_dir_zapobj,
 				    ddobj, 0, dd->dd_myname);
 			}
-			if (err)
+			if (err != 0)
 				goto errout;
 		} else {
 			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
 		}
 
-		winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
-		    dsl_dir_evict);
-		if (winner) {
+		if (dsl_dir_is_clone(dd)) {
+			dmu_buf_t *origin_bonus;
+			dsl_dataset_phys_t *origin_phys;
+
+			/*
+			 * We can't open the origin dataset, because
+			 * that would require opening this dsl_dir.
+			 * Just look at its phys directly instead.
+			 */
+			err = dmu_bonus_hold(dp->dp_meta_objset,
+			    dsl_dir_phys(dd)->dd_origin_obj, FTAG,
+			    &origin_bonus);
+			if (err != 0)
+				goto errout;
+			origin_phys = origin_bonus->db_data;
+			dd->dd_origin_txg =
+			    origin_phys->ds_creation_txg;
+			dmu_buf_rele(origin_bonus, FTAG);
+		}
+
+		dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
+		    &dd->dd_dbuf);
+		winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
+		if (winner != NULL) {
 			if (dd->dd_parent)
-				dsl_dir_close(dd->dd_parent, dd);
+				dsl_dir_rele(dd->dd_parent, dd);
+			dsl_prop_fini(dd);
 			mutex_destroy(&dd->dd_lock);
 			kmem_free(dd, sizeof (dsl_dir_t));
 			dd = winner;
@@ -167,29 +273,45 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_
 
 errout:
 	if (dd->dd_parent)
-		dsl_dir_close(dd->dd_parent, dd);
+		dsl_dir_rele(dd->dd_parent, dd);
+	dsl_prop_fini(dd);
 	mutex_destroy(&dd->dd_lock);
 	kmem_free(dd, sizeof (dsl_dir_t));
 	dmu_buf_rele(dbuf, tag);
 	return (err);
-
 }
 
 void
-dsl_dir_close(dsl_dir_t *dd, void *tag)
+dsl_dir_rele(dsl_dir_t *dd, void *tag)
 {
 	dprintf_dd(dd, "%s\n", "");
 	spa_close(dd->dd_pool->dp_spa, tag);
 	dmu_buf_rele(dd->dd_dbuf, tag);
 }
 
-/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
+/*
+ * Remove a reference to the given dsl dir that is being asynchronously
+ * released.  Async releases occur from a taskq performing eviction of
+ * dsl datasets and dirs.  This process is identical to a normal release
+ * with the exception of using the async API for releasing the reference on
+ * the spa.
+ */
+void
+dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
+{
+	dprintf_dd(dd, "%s\n", "");
+	spa_async_close(dd->dd_pool->dp_spa, tag);
+	dmu_buf_rele(dd->dd_dbuf, tag);
+}
+
+/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
 void
 dsl_dir_name(dsl_dir_t *dd, char *buf)
 {
 	if (dd->dd_parent) {
 		dsl_dir_name(dd->dd_parent, buf);
-		(void) strcat(buf, "/");
+		VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
+		    ZFS_MAX_DATASET_NAME_LEN);
 	} else {
 		buf[0] = '\0';
 	}
@@ -199,14 +321,16 @@ dsl_dir_name(dsl_dir_t *dd, char *buf)
 		 * dprintf_dd() with dd_lock held
 		 */
 		mutex_enter(&dd->dd_lock);
-		(void) strcat(buf, dd->dd_myname);
+		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
+		    <, ZFS_MAX_DATASET_NAME_LEN);
 		mutex_exit(&dd->dd_lock);
 	} else {
-		(void) strcat(buf, dd->dd_myname);
+		VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
+		    <, ZFS_MAX_DATASET_NAME_LEN);
 	}
 }
 
-/* Calculate name legnth, avoiding all the strcat calls of dsl_dir_name */
+/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
 int
 dsl_dir_namelen(dsl_dir_t *dd)
 {
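The switch from strcat() to strlcat() with a VERIFY3U on the return value turns silent name truncation into a hard failure: strlcat() returns the length the result would have had, so comparing it against the buffer size detects overflow. A small userland analogue of the idiom (NAME_MAX_LEN is a stand-in for ZFS_MAX_DATASET_NAME_LEN):

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define	NAME_MAX_LEN	256	/* stand-in for ZFS_MAX_DATASET_NAME_LEN */

int
main(void)
{
	char buf[NAME_MAX_LEN] = "tank";
	size_t n;

	n = strlcat(buf, "/", sizeof (buf));
	assert(n < sizeof (buf));	/* would-be length: no truncation */
	n = strlcat(buf, "home", sizeof (buf));
	assert(n < sizeof (buf));
	printf("%s\n", buf);
	return (0);
}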
@@ -233,13 +357,14 @@ static int
 getcomponent(const char *path, char *component, const char **nextp)
 {
 	char *p;
+
 	if ((path == NULL) || (path[0] == '\0'))
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
 	/* This would be a good place to reserve some namespace... */
 	p = strpbrk(path, "/@");
 	if (p && (p[1] == '/' || p[1] == '@')) {
 		/* two separators in a row */
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 	if (p == NULL || p == path) {
 		/*
@@ -249,16 +374,16 @@ getcomponent(const char *path, char *com
 		 */
 		if (p != NULL &&
 		    (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
-			return (EINVAL);
-		if (strlen(path) >= MAXNAMELEN)
-			return (ENAMETOOLONG);
+			return (SET_ERROR(EINVAL));
+		if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
+			return (SET_ERROR(ENAMETOOLONG));
 		(void) strcpy(component, path);
 		p = NULL;
 	} else if (p[0] == '/') {
-		if (p-path >= MAXNAMELEN)
-			return (ENAMETOOLONG);
+		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
+			return (SET_ERROR(ENAMETOOLONG));
 		(void) strncpy(component, path, p - path);
-		component[p-path] = '\0';
+		component[p - path] = '\0';
 		p++;
 	} else if (p[0] == '@') {
 		/*
@@ -266,95 +391,81 @@ getcomponent(const char *path, char *com
 		 * any more slashes.
 		 */
 		if (strchr(path, '/'))
-			return (EINVAL);
-		if (p-path >= MAXNAMELEN)
-			return (ENAMETOOLONG);
+			return (SET_ERROR(EINVAL));
+		if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
+			return (SET_ERROR(ENAMETOOLONG));
 		(void) strncpy(component, path, p - path);
-		component[p-path] = '\0';
+		component[p - path] = '\0';
 	} else {
-		ASSERT(!"invalid p");
+		panic("invalid p=%p", (void *)p);
 	}
 	*nextp = p;
 	return (0);
 }
 
 /*
- * same as dsl_open_dir, ignore the first component of name and use the
- * spa instead
+ * Return the dsl_dir_t, and possibly the last component which couldn't
+ * be found in *tail.  The name must be in the specified dsl_pool_t.  This
+ * thread must hold the dp_config_rwlock for the pool.  Returns NULL if the
+ * path is bogus, or if tail==NULL and we couldn't parse the whole name.
+ * (*tail)[0] == '@' means that the last component is a snapshot.
  */
 int
-dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
+dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
     dsl_dir_t **ddp, const char **tailp)
 {
-	char buf[MAXNAMELEN];
-	const char *next, *nextnext = NULL;
+	char buf[ZFS_MAX_DATASET_NAME_LEN];
+	const char *spaname, *next, *nextnext = NULL;
 	int err;
 	dsl_dir_t *dd;
-	dsl_pool_t *dp;
 	uint64_t ddobj;
-	int openedspa = FALSE;
-
-	dprintf("%s\n", name);
 
 	err = getcomponent(name, buf, &next);
-	if (err)
+	if (err != 0)
 		return (err);
-	if (spa == NULL) {
-		err = spa_open(buf, &spa, FTAG);
-		if (err) {
-			dprintf("spa_open(%s) failed\n", buf);
-			return (err);
-		}
-		openedspa = TRUE;
 
-		/* XXX this assertion belongs in spa_open */
-		ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
-	}
+	/* Make sure the name is in the specified pool. */
+	spaname = spa_name(dp->dp_spa);
+	if (strcmp(buf, spaname) != 0)
+		return (SET_ERROR(EXDEV));
 
-	dp = spa_get_dsl(spa);
+	ASSERT(dsl_pool_config_held(dp));
 
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
-	err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
-	if (err) {
-		rw_exit(&dp->dp_config_rwlock);
-		if (openedspa)
-			spa_close(spa, FTAG);
+	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
+	if (err != 0) {
 		return (err);
 	}
 
 	while (next != NULL) {
-		dsl_dir_t *child_ds;
+		dsl_dir_t *child_dd;
 		err = getcomponent(next, buf, &nextnext);
-		if (err)
+		if (err != 0)
 			break;
 		ASSERT(next[0] != '\0');
 		if (next[0] == '@')
 			break;
 		dprintf("looking up %s in obj%lld\n",
-		    buf, dd->dd_phys->dd_child_dir_zapobj);
+		    buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
 
 		err = zap_lookup(dp->dp_meta_objset,
-		    dd->dd_phys->dd_child_dir_zapobj,
+		    dsl_dir_phys(dd)->dd_child_dir_zapobj,
 		    buf, sizeof (ddobj), 1, &ddobj);
-		if (err) {
+		if (err != 0) {
 			if (err == ENOENT)
 				err = 0;
 			break;
 		}
 
-		err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
-		if (err)
+		err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
+		if (err != 0)
 			break;
-		dsl_dir_close(dd, tag);
-		dd = child_ds;
+		dsl_dir_rele(dd, tag);
+		dd = child_dd;
 		next = nextnext;
 	}
-	rw_exit(&dp->dp_config_rwlock);
 
-	if (err) {
-		dsl_dir_close(dd, tag);
-		if (openedspa)
-			spa_close(spa, FTAG);
+	if (err != 0) {
+		dsl_dir_rele(dd, tag);
 		return (err);
 	}
 
@@ -365,28 +476,416 @@ dsl_dir_open_spa(spa_t *spa, const char 
 	if (next != NULL &&
 	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
 		/* bad path name */
-		dsl_dir_close(dd, tag);
+		dsl_dir_rele(dd, tag);
 		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
-		err = ENOENT;
+		err = SET_ERROR(ENOENT);
 	}
-	if (tailp)
+	if (tailp != NULL)
 		*tailp = next;
-	if (openedspa)
-		spa_close(spa, FTAG);
 	*ddp = dd;
 	return (err);
 }
 
 /*
- * Return the dsl_dir_t, and possibly the last component which couldn't
- * be found in *tail.  Return NULL if the path is bogus, or if
- * tail==NULL and we couldn't parse the whole name.  (*tail)[0] == '@'
- * means that the last component is a snapshot.
+ * If the counts are already initialized for this filesystem and its
+ * descendants then do nothing, otherwise initialize the counts.
+ *
+ * The counts on this filesystem, and those below, may be uninitialized due to
+ * either the use of a pre-existing pool which did not support the
+ * filesystem/snapshot limit feature, or one in which the feature had not yet
+ * been enabled.
+ *
+ * Recursively descend the filesystem tree and update the filesystem/snapshot
+ * counts on each filesystem below, then update the cumulative count on the
+ * current filesystem. If the filesystem already has a count set on it,
+ * then we know that its counts, and the counts on the filesystems below it,
+ * are already correct, so we don't have to update this filesystem.
+ */
+static void
+dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+	uint64_t my_fs_cnt = 0;
+	uint64_t my_ss_cnt = 0;
+	dsl_pool_t *dp = dd->dd_pool;
+	objset_t *os = dp->dp_meta_objset;
+	zap_cursor_t *zc;
+	zap_attribute_t *za;
+	dsl_dataset_t *ds;
+
+	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
+	ASSERT(dsl_pool_config_held(dp));
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dsl_dir_zapify(dd, tx);
+
+	/*
+	 * If the filesystem count has already been initialized then we
+	 * don't need to recurse down any further.
+	 */
+	if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
+		return;
+
+	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+	/* Iterate my child dirs */
+	for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
+	    zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
+		dsl_dir_t *chld_dd;
+		uint64_t count;
+
+		VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
+		    &chld_dd));
+
+		/*
+		 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
+		 * temporary datasets.
+		 */
+		if (chld_dd->dd_myname[0] == '$' ||
+		    chld_dd->dd_myname[0] == '%') {
+			dsl_dir_rele(chld_dd, FTAG);
+			continue;
+		}
+
+		my_fs_cnt++;	/* count this child */
+
+		dsl_dir_init_fs_ss_count(chld_dd, tx);
+
+		VERIFY0(zap_lookup(os, chld_dd->dd_object,
+		    DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
+		my_fs_cnt += count;
+		VERIFY0(zap_lookup(os, chld_dd->dd_object,
+		    DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
+		my_ss_cnt += count;
+
+		dsl_dir_rele(chld_dd, FTAG);
+	}
+	zap_cursor_fini(zc);
+	/* Count my snapshots (we counted children's snapshots above) */
+	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+	    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
+
+	for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
+	    zap_cursor_retrieve(zc, za) == 0;
+	    zap_cursor_advance(zc)) {
+		/* Don't count temporary snapshots */
+		if (za->za_name[0] != '%')
+			my_ss_cnt++;
+	}
+	zap_cursor_fini(zc);
+
+	dsl_dataset_rele(ds, FTAG);
+
+	kmem_free(zc, sizeof (zap_cursor_t));
+	kmem_free(za, sizeof (zap_attribute_t));
+
+	/* we're in a sync task, update counts */
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+	    sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
+	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+	    sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
+}
+
+static int
+dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
+{
+	char *ddname = (char *)arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	dsl_dir_t *dd;
+	int error;
+
+	error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	dd = ds->ds_dir;
+	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
+	    dsl_dir_is_zapified(dd) &&
+	    zap_contains(dp->dp_meta_objset, dd->dd_object,
+	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (SET_ERROR(EALREADY));
+	}
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
+{
+	char *ddname = (char *)arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	spa_t *spa;
+
+	VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
+
+	spa = dsl_dataset_get_spa(ds);
+
+	if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
+		/*
+		 * Since the feature was not active and we're now setting a
+		 * limit, increment the feature-active counter so that the
+		 * feature becomes active for the first time.
+		 *
+		 * We are already in a sync task so we can update the MOS.
+		 */
+		spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
+	}
+
+	/*
+	 * Since we are now setting a non-UINT64_MAX limit on the filesystem,
+	 * we need to ensure the counts are correct. Descend down the tree from
+	 * this point and update all of the counts to be accurate.
+	 */
+	dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
+
+	dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Make sure the feature is enabled and activate it if necessary.
+ * Since we're setting a limit, ensure the on-disk counts are valid.
+ * This is only called by the ioctl path when setting a limit value.
+ *
+ * We do not need to validate the new limit, since users who can change the
+ * limit are also allowed to exceed the limit.
+ */
+int
+dsl_dir_activate_fs_ss_limit(const char *ddname)
+{
+	int error;
+
+	error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
+	    dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
+	    ZFS_SPACE_CHECK_RESERVED);
+
+	if (error == EALREADY)
+		error = 0;
+
+	return (error);
+}
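Note that dsl_dir_activate_fs_ss_limit() maps EALREADY to success, so activating the limit feature is idempotent from the ioctl path's perspective. A minimal sketch of the idiom:

#include <errno.h>
#include <stdio.h>

/* Toy activation; 'active' models the feature already being active. */
static int
activate(int active)
{
	int error = active ? EALREADY : 0;

	if (error == EALREADY)
		error = 0;	/* already done is not a failure */
	return (error);
}

int
main(void)
{
	printf("first: %d, again: %d\n", activate(0), activate(1));
	return (0);
}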
+
+/*
+ * Used to determine if the filesystem_limit or snapshot_limit should be
+ * enforced. We allow the limit to be exceeded if the user has permission to
+ * write the property value. We pass in the creds that we got in the open
+ * context since we will always be the GZ root in syncing context. We also have
+ * to handle the case where we are allowed to change the limit on the current
+ * dataset, but there may be another limit in the tree above.
+ *
+ * We can never modify these two properties within a non-global zone. In
+ * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
+ * can't use that function since we are already holding the dp_config_rwlock.
+ * In addition, we already have the dd and dealing with snapshots is simplified
+ * in this code.
+ */
+
+typedef enum {
+	ENFORCE_ALWAYS,
+	ENFORCE_NEVER,
+	ENFORCE_ABOVE
+} enforce_res_t;
+
+static enforce_res_t
+dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
+{
+	enforce_res_t enforce = ENFORCE_ALWAYS;
+	uint64_t obj;
+	dsl_dataset_t *ds;
+	uint64_t zoned;
+
+	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+#ifdef _KERNEL
+#ifdef illumos
+	if (crgetzoneid(cr) != GLOBAL_ZONEID)
+#endif
+#ifdef __FreeBSD__
+	if (jailed(cr))
+#endif
+#ifdef __NetBSD__
+	if (0)
+#endif		
+		return (ENFORCE_ALWAYS);
+
+	if (secpolicy_zfs(cr) == 0)
+		return (ENFORCE_NEVER);
+#endif
+
+	if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
+		return (ENFORCE_ALWAYS);
+
+	ASSERT(dsl_pool_config_held(dd->dd_pool));
+
+	if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
+		return (ENFORCE_ALWAYS);
+
+	if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
+		/* Only root can access zoned fs's from the GZ */
+		enforce = ENFORCE_ALWAYS;
+	} else {
+		if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
+			enforce = ENFORCE_ABOVE;
+	}
+
+	dsl_dataset_rele(ds, FTAG);
+	return (enforce);
+}
+
+/*
+ * Check if adding additional child filesystem(s) would exceed any filesystem
+ * limits or adding additional snapshot(s) would exceed any snapshot limits.
+ * The prop argument indicates which limit to check.
+ *
+ * Note that all filesystem limits up to the root (or the highest
+ * initialized) filesystem or the given ancestor must be satisfied.
  */
 int
-dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
+dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
+    dsl_dir_t *ancestor, cred_t *cr)
+{
+	objset_t *os = dd->dd_pool->dp_meta_objset;
+	uint64_t limit, count;
+	char *count_prop;
+	enforce_res_t enforce;
+	int err = 0;
+
+	ASSERT(dsl_pool_config_held(dd->dd_pool));
+	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+	/*
+	 * If we're allowed to change the limit, don't enforce the limit
+	 * e.g. this can happen if a snapshot is taken by an administrative
+	 * user in the global zone (i.e. a recursive snapshot by root).
+	 * However, we must handle the case of delegated permissions where we
+	 * are allowed to change the limit on the current dataset, but there
+	 * is another limit in the tree above.
+	 */
+	enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
+	if (enforce == ENFORCE_NEVER)
+		return (0);
+
+	/*
+	 * e.g. if renaming a dataset with no snapshots, count adjustment
+	 * is 0.
+	 */
+	if (delta == 0)
+		return (0);
+
+	if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
+		/*
+		 * We don't enforce the limit for temporary snapshots. This is
+		 * indicated by a NULL cred_t argument.
+		 */
+		if (cr == NULL)
+			return (0);
+
+		count_prop = DD_FIELD_SNAPSHOT_COUNT;
+	} else {
+		count_prop = DD_FIELD_FILESYSTEM_COUNT;
+	}
+
+	/*
+	 * If an ancestor has been provided, stop checking the limit once we
+	 * hit that dir. We need this during rename so that we don't
+	 * double-count once we recurse up to the common ancestor.
+	 */
+	if (ancestor == dd)
+		return (0);
+
+	/*
+	 * If we hit an uninitialized node while recursing up the tree, we can
+	 * stop since we know there is no limit here (or above). The counts
+	 * are not valid on this node, and we know we won't touch its counts.
+	 */
+	if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
+	    count_prop, sizeof (count), 1, &count) == ENOENT)
+		return (0);
+
+	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
+	    B_FALSE);
+	if (err != 0)
+		return (err);
+
+	/* Is there a limit which we've hit? */
+	if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
+		return (SET_ERROR(EDQUOT));
+
+	if (dd->dd_parent != NULL)
+		err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
+		    ancestor, cr);
+
+	return (err);
+}
+
+/*
+ * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
+ * parents. When a new filesystem/snapshot is created, increment the count on
+ * all parents, and when a filesystem/snapshot is destroyed, decrement the
+ * count.
+ */
+void
+dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
+    dmu_tx_t *tx)
 {
-	return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
+	int err;
+	objset_t *os = dd->dd_pool->dp_meta_objset;
+	uint64_t count;
+
+	ASSERT(dsl_pool_config_held(dd->dd_pool));
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
+	    strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
+
+	/*
+	 * When we receive an incremental stream into a filesystem that already
+	 * exists, a temporary clone is created.  We don't count this temporary
+	 * clone, whose name begins with a '%'. We also ignore hidden ($FREE,
+	 * $MOS & $ORIGIN) objsets.
+	 */
+	if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
+	    strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
+		return;
+
+	/*
+	 * e.g. if renaming a dataset with no snapshots, count adjustment is 0
+	 */
+	if (delta == 0)
+		return;
+
+	/*
+	 * If we hit an uninitialized node while recursing up the tree, we can
+	 * stop since we know the counts are not valid on this node and we
+	 * know we shouldn't touch this node's counts. An uninitialized count
+	 * on the node indicates that either the feature has not yet been
+	 * activated or there are no limits on this part of the tree.
+	 */
+	if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
+	    prop, sizeof (count), 1, &count)) == ENOENT)
+		return;
+	VERIFY0(err);
+
+	count += delta;
+	/* Use a signed verify to make sure we're not negative. */
+	VERIFY3S(count, >=, 0);
+
+	VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
+	    tx));
+
+	/* Roll up this additional count into our ancestors */
+	if (dd->dd_parent != NULL)
+		dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
 }
 
 uint64_t
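
The recursive walk above is easier to see in isolation. Below is a minimal
standalone sketch of the pattern dsl_fs_ss_limit_check() follows, using a
hypothetical struct dir in place of dsl_dir_t and treating a limit of 0 as
"none"; the real function additionally consults the enforce_res_t modes,
delegated permissions, and the on-disk ZAP counts, all omitted here.

	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical stand-in for dsl_dir_t with only what the walk needs. */
	struct dir {
		struct dir *parent;
		int has_count;		/* 0 models an un-zapified node */
		uint64_t count;		/* current filesystem or snapshot count */
		uint64_t limit;		/* 0 stands in for "no limit set" */
	};

	/*
	 * Walk toward the root as dsl_fs_ss_limit_check() does: stop at the
	 * given ancestor or at an uninitialized node, otherwise compare the
	 * prospective count against the limit at each level.
	 */
	static int
	limit_check(struct dir *d, uint64_t delta, struct dir *ancestor)
	{
		if (delta == 0 || d == ancestor || !d->has_count)
			return (0);
		if (d->limit != 0 && d->count + delta > d->limit)
			return (EDQUOT);
		if (d->parent != NULL)
			return (limit_check(d->parent, delta, ancestor));
		return (0);
	}

	int
	main(void)
	{
		struct dir root = { NULL, 1, 5, 6 };	/* 5 children, limit 6 */
		struct dir child = { &root, 1, 2, 0 };	/* no limit of its own */

		printf("delta=1: %d\n", limit_check(&child, 1, NULL));	/* 0 */
		printf("delta=2: %d\n", limit_check(&child, 2, NULL));	/* EDQUOT */
		return (0);
	}

Adding one more child passes because the root reaches exactly its limit of
6; adding two fails at the root even though the child itself has no limit.
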
@@ -395,13 +894,13 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_
 {
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t ddobj;
-	dsl_dir_phys_t *dsphys;
+	dsl_dir_phys_t *ddphys;
 	dmu_buf_t *dbuf;
 
 	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
 	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
 	if (pds) {
-		VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
+		VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
 		    name, sizeof (uint64_t), 1, &ddobj, tx));
 	} else {
 		/* it's the root dir */
@@ -410,97 +909,32 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_
 	}
 	VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
-	dsphys = dbuf->db_data;
+	ddphys = dbuf->db_data;
 
-	dsphys->dd_creation_time = gethrestime_sec();
-	if (pds)
-		dsphys->dd_parent_obj = pds->dd_object;
-	dsphys->dd_props_zapobj = zap_create(mos,
+	ddphys->dd_creation_time = gethrestime_sec();
+	if (pds) {
+		ddphys->dd_parent_obj = pds->dd_object;
+
+		/* update the filesystem counts */
+		dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
+	}
+	ddphys->dd_props_zapobj = zap_create(mos,
 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
-	dsphys->dd_child_dir_zapobj = zap_create(mos,
+	ddphys->dd_child_dir_zapobj = zap_create(mos,
 	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
-		dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
+		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
 	dmu_buf_rele(dbuf, FTAG);
 
 	return (ddobj);
 }
 
-/* ARGSUSED */
-int
-dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	dsl_dir_t *dd = ds->ds_dir;
-	dsl_pool_t *dp = dd->dd_pool;
-	objset_t *mos = dp->dp_meta_objset;
-	int err;
-	uint64_t count;
-
-	/*
-	 * There should be exactly two holds, both from
-	 * dsl_dataset_destroy: one on the dd directory, and one on its
-	 * head ds.  Otherwise, someone is trying to lookup something
-	 * inside this dir while we want to destroy it.  The
-	 * config_rwlock ensures that nobody else opens it after we
-	 * check.
-	 */
-	if (dmu_buf_refcount(dd->dd_dbuf) > 2)
-		return (EBUSY);
-
-	err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count);
-	if (err)
-		return (err);
-	if (count != 0)
-		return (EEXIST);
-
-	return (0);
-}
-
-void
-dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds = arg1;
-	dsl_dir_t *dd = ds->ds_dir;
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
-	dsl_prop_setarg_t psa;
-	uint64_t value = 0;
-	uint64_t obj;
-	dd_used_t t;
-
-	ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
-	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
-
-	/* Remove our reservation. */
-	dsl_prop_setarg_init_uint64(&psa, "reservation",
-	    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
-	    &value);
-	psa.psa_effective_value = 0;	/* predict default value */
-
-	dsl_dir_set_reservation_sync(ds, &psa, cr, tx);
-
-	ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0);
-	ASSERT3U(dd->dd_phys->dd_reserved, ==, 0);
-	for (t = 0; t < DD_USED_NUM; t++)
-		ASSERT3U(dd->dd_phys->dd_used_breakdown[t], ==, 0);
-
-	VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
-	VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
-	VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
-	VERIFY(0 == zap_remove(mos,
-	    dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
-
-	obj = dd->dd_object;
-	dsl_dir_close(dd, tag);
-	VERIFY(0 == dmu_object_free(mos, obj, tx));
-}
-
 boolean_t
 dsl_dir_is_clone(dsl_dir_t *dd)
 {
-	return (dd->dd_phys->dd_origin_obj &&
+	return (dsl_dir_phys(dd)->dd_origin_obj &&
 	    (dd->dd_pool->dp_origin_snap == NULL ||
-	    dd->dd_phys->dd_origin_obj !=
+	    dsl_dir_phys(dd)->dd_origin_obj !=
 	    dd->dd_pool->dp_origin_snap->ds_object));
 }
 
@@ -509,39 +943,56 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *n
 {
 	mutex_enter(&dd->dd_lock);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
-	    dd->dd_phys->dd_used_bytes);
-	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
+	    dsl_dir_phys(dd)->dd_used_bytes);
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
+	    dsl_dir_phys(dd)->dd_quota);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
-	    dd->dd_phys->dd_reserved);
+	    dsl_dir_phys(dd)->dd_reserved);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
-	    dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
-	    (dd->dd_phys->dd_uncompressed_bytes * 100 /
-	    dd->dd_phys->dd_compressed_bytes));
-	if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+	    dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
+	    (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
+	    dsl_dir_phys(dd)->dd_compressed_bytes));
+	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
+	    dsl_dir_phys(dd)->dd_uncompressed_bytes);
+	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
-		    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
-		    dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
-		    dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
-		    dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
-		    dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
+		    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
 	}
 	mutex_exit(&dd->dd_lock);
 
-	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
+	if (dsl_dir_is_zapified(dd)) {
+		uint64_t count;
+		objset_t *os = dd->dd_pool->dp_meta_objset;
+
+		if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+		    sizeof (count), 1, &count) == 0) {
+			dsl_prop_nvlist_add_uint64(nv,
+			    ZFS_PROP_FILESYSTEM_COUNT, count);
+		}
+		if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+		    sizeof (count), 1, &count) == 0) {
+			dsl_prop_nvlist_add_uint64(nv,
+			    ZFS_PROP_SNAPSHOT_COUNT, count);
+		}
+	}
+
 	if (dsl_dir_is_clone(dd)) {
 		dsl_dataset_t *ds;
-		char buf[MAXNAMELEN];
+		char buf[ZFS_MAX_DATASET_NAME_LEN];
 
-		VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
-		    dd->dd_phys->dd_origin_obj, FTAG, &ds));
+		VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+		    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
 		dsl_dataset_name(ds, buf);
 		dsl_dataset_rele(ds, FTAG);
 		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
 	}
-	rw_exit(&dd->dd_pool->dp_config_rwlock);
 }
 
 void
@@ -549,9 +1000,9 @@ dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *t
 {
 	dsl_pool_t *dp = dd->dd_pool;
 
-	ASSERT(dd->dd_phys);
+	ASSERT(dsl_dir_phys(dd));
 
-	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
+	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
 		/* up the hold count until we can be written out */
 		dmu_buf_add_ref(dd->dd_dbuf, dd);
 	}
@@ -560,8 +1011,9 @@ dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *t
 static int64_t
 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
 {
-	uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
-	uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
+	uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
+	uint64_t new_accounted =
+	    MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
 	return (new_accounted - old_accounted);
 }
 
@@ -570,10 +1022,8 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 
-	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-
 	mutex_enter(&dd->dd_lock);
-	ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0);
+	ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
 	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
 	    dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
 	dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
@@ -622,9 +1072,9 @@ dsl_dir_space_available(dsl_dir_t *dd,
 	}
 
 	mutex_enter(&dd->dd_lock);
-	if (dd->dd_phys->dd_quota != 0)
-		quota = dd->dd_phys->dd_quota;
-	used = dd->dd_phys->dd_used_bytes;
+	if (dsl_dir_phys(dd)->dd_quota != 0)
+		quota = dsl_dir_phys(dd)->dd_quota;
+	used = dsl_dir_phys(dd)->dd_used_bytes;
 	if (!ondiskonly)
 		used += dsl_dir_space_towrite(dd);
 
@@ -633,12 +1083,12 @@ dsl_dir_space_available(dsl_dir_t *dd,
 		quota = MIN(quota, poolsize);
 	}
 
-	if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
+	if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
 		/*
 		 * We have some space reserved, in addition to what our
 		 * parent gave us.
 		 */
-		parentspace += dd->dd_phys->dd_reserved - used;
+		parentspace += dsl_dir_phys(dd)->dd_reserved - used;
 	}
 
 	if (dd == ancestor) {
@@ -652,15 +1102,6 @@ dsl_dir_space_available(dsl_dir_t *dd,
 	if (used > quota) {
 		/* over quota */
 		myspace = 0;
-
-		/*
-		 * While it's OK to be a little over quota, if
-		 * we think we are using more space than there
-		 * is in the pool (which is already 1.6% more than
-		 * dsl_pool_adjustedsize()), something is very
-		 * wrong.
-		 */
-		ASSERT3U(used, <=, spa_get_dspace(dd->dd_pool->dp_spa));
 	} else {
 		/*
 		 * the lesser of the space provided by our parent and
@@ -676,7 +1117,6 @@ dsl_dir_space_available(dsl_dir_t *dd,
 
 struct tempreserve {
 	list_node_t tr_node;
-	dsl_pool_t *tr_dp;
 	dsl_dir_t *tr_ds;
 	uint64_t tr_size;
 };
@@ -707,7 +1147,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, 
 	est_inflight = dsl_dir_space_towrite(dd);
 	for (i = 0; i < TXG_SIZE; i++)
 		est_inflight += dd->dd_tempreserved[i];
-	used_on_disk = dd->dd_phys->dd_used_bytes;
+	used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
 
 	/*
 	 * On the first iteration, fetch the dataset's used-on-disk and
@@ -730,10 +1170,10 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, 
 	 * If this transaction will result in a net free of space,
 	 * we want to let it through.
 	 */
-	if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
+	if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
 		quota = UINT64_MAX;
 	else
-		quota = dd->dd_phys->dd_quota;
+		quota = dsl_dir_phys(dd)->dd_quota;
 
 	/*
 	 * Adjust the quota against the actual pool size at the root
@@ -770,7 +1210,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, 
 		    used_on_disk>>10, est_inflight>>10,
 		    quota>>10, asize>>10, retval);
 		mutex_exit(&dd->dd_lock);
-		return (retval);
+		return (SET_ERROR(retval));
 	}
 
 	/* We need to up our estimated delta before dropping dd_lock */
@@ -787,7 +1227,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, 
 
 	/* see if it's OK with our parent */
 	if (dd->dd_parent && parent_rsrv) {
-		boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
+		boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
 
 		return (dsl_dir_tempreserve_impl(dd->dd_parent,
 		    parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
@@ -827,29 +1267,29 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd,
 		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 		tr->tr_size = lsize;
 		list_insert_tail(tr_list, tr);
-
-		err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
 	} else {
 		if (err == EAGAIN) {
-			txg_delay(dd->dd_pool, tx->tx_txg, 1);
-			err = ERESTART;
+			/*
+			 * If arc_memory_throttle() detected that pageout
+			 * is running and we are low on memory, we delay new
+			 * non-pageout transactions to give pageout an
+			 * advantage.
+			 *
+			 * It is unfortunate to be delaying while the caller's
+			 * locks are held.
+			 */
+			txg_delay(dd->dd_pool, tx->tx_txg,
+			    MSEC2NSEC(10), MSEC2NSEC(10));
+			err = SET_ERROR(ERESTART);
 		}
-		dsl_pool_memory_pressure(dd->dd_pool);
 	}
 
 	if (err == 0) {
-		struct tempreserve *tr;
-
-		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
-		tr->tr_dp = dd->dd_pool;
-		tr->tr_size = asize;
-		list_insert_tail(tr_list, tr);
-
 		err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
 		    FALSE, asize > usize, tr_list, tx, TRUE);
 	}
 
-	if (err)
+	if (err != 0)
 		dsl_dir_tempreserve_clear(tr_list, tx);
 	else
 		*tr_cookiep = tr_list;
@@ -873,10 +1313,8 @@ dsl_dir_tempreserve_clear(void *tr_cooki
 	if (tr_cookie == NULL)
 		return;
 
-	while (tr = list_head(tr_list)) {
-		if (tr->tr_dp) {
-			dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
-		} else if (tr->tr_ds) {
+	while ((tr = list_head(tr_list)) != NULL) {
+		if (tr->tr_ds) {
 			mutex_enter(&tr->tr_ds->dd_lock);
 			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
 			    tr->tr_size);
@@ -892,8 +1330,14 @@ dsl_dir_tempreserve_clear(void *tr_cooki
 	kmem_free(tr_list, sizeof (list_t));
 }
 
-static void
-dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+/*
+ * This should be called from open context when we think we're going to write
+ * or free space, for example when dirtying data. Be conservative; it's okay
+ * to write less space or free more, but we don't want to write more or free
+ * less than the amount specified.
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 {
 	int64_t parent_space;
 	uint64_t est_used;
@@ -902,7 +1346,7 @@ dsl_dir_willuse_space_impl(dsl_dir_t *dd
 	if (space > 0)
 		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
 
-	est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
+	est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes;
 	parent_space = parent_delta(dd, est_used, space);
 	mutex_exit(&dd->dd_lock);
 
@@ -911,19 +1355,7 @@ dsl_dir_willuse_space_impl(dsl_dir_t *dd
 
 	/* XXX this is potentially expensive and unnecessary... */
 	if (parent_space && dd->dd_parent)
-		dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
-}
-
-/*
- * Call in open context when we think we're going to write/free space,
- * eg. when dirtying data.  Be conservative (ie. OK to write less than
- * this or free more than this, but don't write more or free less).
- */
-void
-dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
-{
-	dsl_pool_willuse_space(dd->dd_pool, space, tx);
-	dsl_dir_willuse_space_impl(dd, space, tx);
+		dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
 }
 
 /* call from syncing context when we actually write/free space for this dd */
@@ -932,35 +1364,44 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_u
     int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
 {
 	int64_t accounted_delta;
+
+	/*
+	 * dsl_dataset_set_refreservation_sync_impl() calls this with
+	 * dd_lock held, so that it can atomically update
+	 * ds->ds_reserved and the dsl_dir accounting, so that
+	 * dsl_dataset_check_quota() can see dataset and dir accounting
+	 * consistently.
+	 */
 	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(type < DD_USED_NUM);
 
-	dsl_dir_dirty(dd, tx);
+	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
 	if (needlock)
 		mutex_enter(&dd->dd_lock);
-	accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
-	ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
+	accounted_delta =
+	    parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
+	ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
 	ASSERT(compressed >= 0 ||
-	    dd->dd_phys->dd_compressed_bytes >= -compressed);
+	    dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
 	ASSERT(uncompressed >= 0 ||
-	    dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
-	dd->dd_phys->dd_used_bytes += used;
-	dd->dd_phys->dd_uncompressed_bytes += uncompressed;
-	dd->dd_phys->dd_compressed_bytes += compressed;
+	    dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
+	dsl_dir_phys(dd)->dd_used_bytes += used;
+	dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
+	dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
 
-	if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+	if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 		ASSERT(used > 0 ||
-		    dd->dd_phys->dd_used_breakdown[type] >= -used);
-		dd->dd_phys->dd_used_breakdown[type] += used;
+		    dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
+		dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
 #ifdef DEBUG
 		dd_used_t t;
 		uint64_t u = 0;
 		for (t = 0; t < DD_USED_NUM; t++)
-			u += dd->dd_phys->dd_used_breakdown[t];
-		ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
+			u += dsl_dir_phys(dd)->dd_used_breakdown[t];
+		ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
 #endif
 	}
 	if (needlock)
@@ -971,7 +1412,7 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_u
 		    accounted_delta, compressed, uncompressed, tx);
 		dsl_dir_transfer_space(dd->dd_parent,
 		    used - accounted_delta,
-		    DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
+		    DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL);
 	}
 }
 
@@ -979,145 +1420,152 @@ void
 dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
     dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
 {
-	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
-
-	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(tx == NULL || dmu_tx_is_syncing(tx));
 	ASSERT(oldtype < DD_USED_NUM);
 	ASSERT(newtype < DD_USED_NUM);
 
-	if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
+	if (delta == 0 ||
+	    !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
 		return;
 
-	dsl_dir_dirty(dd, tx);
-	if (needlock)
-		mutex_enter(&dd->dd_lock);
+	if (tx != NULL)
+		dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	mutex_enter(&dd->dd_lock);
 	ASSERT(delta > 0 ?
-	    dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
-	    dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
-	ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
-	dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
-	dd->dd_phys->dd_used_breakdown[newtype] += delta;
-	if (needlock)
-		mutex_exit(&dd->dd_lock);
+	    dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
+	    dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
+	ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
+	dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
+	dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
+	mutex_exit(&dd->dd_lock);
 }
 
+typedef struct dsl_dir_set_qr_arg {
+	const char *ddsqra_name;
+	zprop_source_t ddsqra_source;
+	uint64_t ddsqra_value;
+} dsl_dir_set_qr_arg_t;
+
 static int
-dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	dsl_dir_t *dd = ds->ds_dir;
-	dsl_prop_setarg_t *psa = arg2;
-	int err;
-	uint64_t towrite;
+	dsl_dir_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	int error;
+	uint64_t towrite, newval;
 
-	if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
-		return (err);
+	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+	if (error != 0)
+		return (error);
+
+	error = dsl_prop_predict(ds->ds_dir, "quota",
+	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
 
-	if (psa->psa_effective_value == 0)
+	if (newval == 0) {
+		dsl_dataset_rele(ds, FTAG);
 		return (0);
+	}
 
-	mutex_enter(&dd->dd_lock);
+	mutex_enter(&ds->ds_dir->dd_lock);
 	/*
 	 * If we are doing the preliminary check in open context, and
 	 * there are pending changes, then don't fail it, since the
 	 * pending changes could under-estimate the amount of space to be
 	 * freed up.
 	 */
-	towrite = dsl_dir_space_towrite(dd);
+	towrite = dsl_dir_space_towrite(ds->ds_dir);
 	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
-	    (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
-	    psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
-		err = ENOSPC;
+	    (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
+	    newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
+		error = SET_ERROR(ENOSPC);
 	}
-	mutex_exit(&dd->dd_lock);
-	return (err);
+	mutex_exit(&ds->ds_dir->dd_lock);
+	dsl_dataset_rele(ds, FTAG);
+	return (error);
 }
 
-extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *);
-
-/* ARGSUSED */
 static void
-dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	dsl_dir_t *dd = ds->ds_dir;
-	dsl_prop_setarg_t *psa = arg2;
-	uint64_t effective_value = psa->psa_effective_value;
+	dsl_dir_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	uint64_t newval;
 
-	dsl_prop_set_sync(ds, psa, cr, tx);
-	DSL_PROP_CHECK_PREDICTION(dd, psa);
+	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 
-	dmu_buf_will_dirty(dd->dd_dbuf, tx);
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
+		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
+		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+		    &ddsqra->ddsqra_value, tx);
 
-	mutex_enter(&dd->dd_lock);
-	dd->dd_phys->dd_quota = effective_value;
-	mutex_exit(&dd->dd_lock);
+		VERIFY0(dsl_prop_get_int_ds(ds,
+		    zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
+	} else {
+		newval = ddsqra->ddsqra_value;
+		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
+		    zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
+	}
 
-	spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa,
-	    tx, cr, "%lld dataset = %llu ",
-	    (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
+	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+	mutex_enter(&ds->ds_dir->dd_lock);
+	dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
+	mutex_exit(&ds->ds_dir->dd_lock);
+	dsl_dataset_rele(ds, FTAG);
 }
 
 int
 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
 {
-	dsl_dir_t *dd;
-	dsl_dataset_t *ds;
-	dsl_prop_setarg_t psa;
-	int err;
-
-	dsl_prop_setarg_init_uint64(&psa, "quota", source, &quota);
+	dsl_dir_set_qr_arg_t ddsqra;
 
-	err = dsl_dataset_hold(ddname, FTAG, &ds);
-	if (err)
-		return (err);
+	ddsqra.ddsqra_name = ddname;
+	ddsqra.ddsqra_source = source;
+	ddsqra.ddsqra_value = quota;
 
-	err = dsl_dir_open(ddname, FTAG, &dd, NULL);
-	if (err) {
-		dsl_dataset_rele(ds, FTAG);
-		return (err);
-	}
-
-	ASSERT(ds->ds_dir == dd);
-
-	/*
-	 * If someone removes a file, then tries to set the quota, we want to
-	 * make sure the file freeing takes effect.
-	 */
-	txg_wait_open(dd->dd_pool, 0);
-
-	err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
-	    dsl_dir_set_quota_sync, ds, &psa, 0);
-
-	dsl_dir_close(dd, FTAG);
-	dsl_dataset_rele(ds, FTAG);
-	return (err);
+	return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
+	    dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 int
-dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	dsl_dir_t *dd = ds->ds_dir;
-	dsl_prop_setarg_t *psa = arg2;
-	uint64_t effective_value;
-	uint64_t used, avail;
-	int err;
-
-	if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
-		return (err);
+	dsl_dir_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
+	dsl_dir_t *dd;
+	uint64_t newval, used, avail;
+	int error;
 
-	effective_value = psa->psa_effective_value;
+	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+	if (error != 0)
+		return (error);
+	dd = ds->ds_dir;
 
 	/*
 	 * If we are doing the preliminary check in open context, the
 	 * space estimates may be inaccurate.
 	 */
-	if (!dmu_tx_is_syncing(tx))
+	if (!dmu_tx_is_syncing(tx)) {
+		dsl_dataset_rele(ds, FTAG);
 		return (0);
+	}
+
+	error = dsl_prop_predict(ds->ds_dir,
+	    zfs_prop_to_name(ZFS_PROP_RESERVATION),
+	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (error);
+	}
 
 	mutex_enter(&dd->dd_lock);
-	used = dd->dd_phys->dd_used_bytes;
+	used = dsl_dir_phys(dd)->dd_used_bytes;
 	mutex_exit(&dd->dd_lock);
 
 	if (dd->dd_parent) {
@@ -1127,41 +1575,32 @@ dsl_dir_set_reservation_check(void *arg1
 		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
 	}
 
-	if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) {
-		uint64_t delta = MAX(used, effective_value) -
-		    MAX(used, dd->dd_phys->dd_reserved);
-
-		if (delta > avail)
-			return (ENOSPC);
-		if (dd->dd_phys->dd_quota > 0 &&
-		    effective_value > dd->dd_phys->dd_quota)
-			return (ENOSPC);
+	if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
+		uint64_t delta = MAX(used, newval) -
+		    MAX(used, dsl_dir_phys(dd)->dd_reserved);
+
+		if (delta > avail ||
+		    (dsl_dir_phys(dd)->dd_quota > 0 &&
+		    newval > dsl_dir_phys(dd)->dd_quota))
+			error = SET_ERROR(ENOSPC);
 	}
 
-	return (0);
+	dsl_dataset_rele(ds, FTAG);
+	return (error);
 }
 
-/* ARGSUSED */
-static void
-dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+void
+dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	dsl_dir_t *dd = ds->ds_dir;
-	dsl_prop_setarg_t *psa = arg2;
-	uint64_t effective_value = psa->psa_effective_value;
 	uint64_t used;
 	int64_t delta;
 
-	dsl_prop_set_sync(ds, psa, cr, tx);
-	DSL_PROP_CHECK_PREDICTION(dd, psa);
-
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
 	mutex_enter(&dd->dd_lock);
-	used = dd->dd_phys->dd_used_bytes;
-	delta = MAX(used, effective_value) -
-	    MAX(used, dd->dd_phys->dd_reserved);
-	dd->dd_phys->dd_reserved = effective_value;
+	used = dsl_dir_phys(dd)->dd_used_bytes;
+	delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
+	dsl_dir_phys(dd)->dd_reserved = value;
 
 	if (dd->dd_parent != NULL) {
 		/* Roll up this additional usage into our ancestors */
@@ -1169,41 +1608,49 @@ dsl_dir_set_reservation_sync(void *arg1,
 		    delta, 0, 0, tx);
 	}
 	mutex_exit(&dd->dd_lock);
-
-	spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa,
-	    tx, cr, "%lld dataset = %llu",
-	    (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj);
 }
 
-int
-dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
-    uint64_t reservation)
+static void
+dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd;
+	dsl_dir_set_qr_arg_t *ddsqra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
-	dsl_prop_setarg_t psa;
-	int err;
+	uint64_t newval;
 
-	dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation);
+	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 
-	err = dsl_dataset_hold(ddname, FTAG, &ds);
-	if (err)
-		return (err);
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
+		dsl_prop_set_sync_impl(ds,
+		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
+		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+		    &ddsqra->ddsqra_value, tx);
 
-	err = dsl_dir_open(ddname, FTAG, &dd, NULL);
-	if (err) {
-		dsl_dataset_rele(ds, FTAG);
-		return (err);
+		VERIFY0(dsl_prop_get_int_ds(ds,
+		    zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
+	} else {
+		newval = ddsqra->ddsqra_value;
+		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
+		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
+		    (longlong_t)newval);
 	}
 
-	ASSERT(ds->ds_dir == dd);
+	dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
+	dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+    uint64_t reservation)
+{
+	dsl_dir_set_qr_arg_t ddsqra;
 
-	err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
-	    dsl_dir_set_reservation_sync, ds, &psa, 0);
+	ddsqra.ddsqra_name = ddname;
+	ddsqra.ddsqra_source = source;
+	ddsqra.ddsqra_value = reservation;
 
-	dsl_dir_close(dd, FTAG);
-	dsl_dataset_rele(ds, FTAG);
-	return (err);
+	return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
+	    dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
 }
 
 static dsl_dir_t *
@@ -1230,84 +1677,219 @@ would_change(dsl_dir_t *dd, int64_t delt
 		return (delta);
 
 	mutex_enter(&dd->dd_lock);
-	delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
+	delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
 	mutex_exit(&dd->dd_lock);
 	return (would_change(dd->dd_parent, delta, ancestor));
 }
 
-struct renamearg {
-	dsl_dir_t *newparent;
-	const char *mynewname;
-};
+typedef struct dsl_dir_rename_arg {
+	const char *ddra_oldname;
+	const char *ddra_newname;
+	cred_t *ddra_cred;
+} dsl_dir_rename_arg_t;
 
-/*ARGSUSED*/
+/* ARGSUSED */
 static int
-dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
-	dsl_dir_t *dd = arg1;
-	struct renamearg *ra = arg2;
-	dsl_pool_t *dp = dd->dd_pool;
-	objset_t *mos = dp->dp_meta_objset;
-	int err;
-	uint64_t val;
+	int *deltap = arg;
+	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
 
-	/* There should be 2 references: the open and the dirty */
-	if (dmu_buf_refcount(dd->dd_dbuf) > 2)
-		return (EBUSY);
-
-	/* check for existing name */
-	err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
-	    ra->mynewname, 8, 1, &val);
-	if (err == 0)
-		return (EEXIST);
-	if (err != ENOENT)
-		return (err);
+	dsl_dataset_name(ds, namebuf);
+
+	if (strlen(namebuf) + *deltap >= ZFS_MAX_DATASET_NAME_LEN)
+		return (SET_ERROR(ENAMETOOLONG));
+	return (0);
+}
+
+static int
+dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_dir_rename_arg_t *ddra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dir_t *dd, *newparent;
+	const char *mynewname;
+	int error;
+	int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname);
+
+	/* target dir should exist */
+	error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
+	if (error != 0)
+		return (error);
+
+	/* new parent should exist */
+	error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
+	    &newparent, &mynewname);
+	if (error != 0) {
+		dsl_dir_rele(dd, FTAG);
+		return (error);
+	}
+
+	/* can't rename to different pool */
+	if (dd->dd_pool != newparent->dd_pool) {
+		dsl_dir_rele(newparent, FTAG);
+		dsl_dir_rele(dd, FTAG);
+		return (SET_ERROR(EXDEV));
+	}
+
+	/* new name should not already exist */
+	if (mynewname == NULL) {
+		dsl_dir_rele(newparent, FTAG);
+		dsl_dir_rele(dd, FTAG);
+		return (SET_ERROR(EEXIST));
+	}
+
+	/* if the name length is growing, validate child name lengths */
+	if (delta > 0) {
+		error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
+		    &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+		if (error != 0) {
+			dsl_dir_rele(newparent, FTAG);
+			dsl_dir_rele(dd, FTAG);
+			return (error);
+		}
+	}
 
-	if (ra->newparent != dd->dd_parent) {
+	if (dmu_tx_is_syncing(tx)) {
+		if (spa_feature_is_active(dp->dp_spa,
+		    SPA_FEATURE_FS_SS_LIMIT)) {
+			/*
+			 * Although this is the check function and we don't
+			 * normally make on-disk changes in check functions,
+			 * we need to do that here.
+			 *
+			 * Ensure this portion of the tree's counts have been
+			 * initialized in case the new parent has limits set.
+			 */
+			dsl_dir_init_fs_ss_count(dd, tx);
+		}
+	}
+
+	if (newparent != dd->dd_parent) {
 		/* is there enough space? */
 		uint64_t myspace =
-		    MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
+		    MAX(dsl_dir_phys(dd)->dd_used_bytes,
+		    dsl_dir_phys(dd)->dd_reserved);
+		objset_t *os = dd->dd_pool->dp_meta_objset;
+		uint64_t fs_cnt = 0;
+		uint64_t ss_cnt = 0;
+
+		if (dsl_dir_is_zapified(dd)) {
+			int err;
+
+			err = zap_lookup(os, dd->dd_object,
+			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+			    &fs_cnt);
+			if (err != ENOENT && err != 0) {
+				dsl_dir_rele(newparent, FTAG);
+				dsl_dir_rele(dd, FTAG);
+				return (err);
+			}
+
+			/*
+			 * have to add 1 for the filesystem itself that we're
+			 * moving
+			 */
+			fs_cnt++;
+
+			err = zap_lookup(os, dd->dd_object,
+			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+			    &ss_cnt);
+			if (err != ENOENT && err != 0) {
+				dsl_dir_rele(newparent, FTAG);
+				dsl_dir_rele(dd, FTAG);
+				return (err);
+			}
+		}
 
 		/* no rename into our descendant */
-		if (closest_common_ancestor(dd, ra->newparent) == dd)
-			return (EINVAL);
+		if (closest_common_ancestor(dd, newparent) == dd) {
+			dsl_dir_rele(newparent, FTAG);
+			dsl_dir_rele(dd, FTAG);
+			return (SET_ERROR(EINVAL));
+		}
 
-		if (err = dsl_dir_transfer_possible(dd->dd_parent,
-		    ra->newparent, myspace))
-			return (err);
+		error = dsl_dir_transfer_possible(dd->dd_parent,
+		    newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred);
+		if (error != 0) {
+			dsl_dir_rele(newparent, FTAG);
+			dsl_dir_rele(dd, FTAG);
+			return (error);
+		}
 	}
 
+	dsl_dir_rele(newparent, FTAG);
+	dsl_dir_rele(dd, FTAG);
 	return (0);
 }
 
 static void
-dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
 {
-	dsl_dir_t *dd = arg1;
-	struct renamearg *ra = arg2;
-	dsl_pool_t *dp = dd->dd_pool;
+	dsl_dir_rename_arg_t *ddra = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dir_t *dd, *newparent;
+	const char *mynewname;
+	int error;
 	objset_t *mos = dp->dp_meta_objset;
-	int err;
 
-	ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
+	VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
+	VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
+	    &mynewname));
+
+	/* Log this before we change the name. */
+	spa_history_log_internal_dd(dd, "rename", tx,
+	    "-> %s", ddra->ddra_newname);
+
+	if (newparent != dd->dd_parent) {
+		objset_t *os = dd->dd_pool->dp_meta_objset;
+		uint64_t fs_cnt = 0;
+		uint64_t ss_cnt = 0;
+
+		/*
+		 * We already made sure the dd counts were initialized in the
+		 * check function.
+		 */
+		if (spa_feature_is_active(dp->dp_spa,
+		    SPA_FEATURE_FS_SS_LIMIT)) {
+			VERIFY0(zap_lookup(os, dd->dd_object,
+			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+			    &fs_cnt));
+			/* add 1 for the filesystem itself that we're moving */
+			fs_cnt++;
+
+			VERIFY0(zap_lookup(os, dd->dd_object,
+			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+			    &ss_cnt));
+		}
+
+		dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
+		    DD_FIELD_FILESYSTEM_COUNT, tx);
+		dsl_fs_ss_count_adjust(newparent, fs_cnt,
+		    DD_FIELD_FILESYSTEM_COUNT, tx);
+
+		dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
+		    DD_FIELD_SNAPSHOT_COUNT, tx);
+		dsl_fs_ss_count_adjust(newparent, ss_cnt,
+		    DD_FIELD_SNAPSHOT_COUNT, tx);
 
-	if (ra->newparent != dd->dd_parent) {
 		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
-		    -dd->dd_phys->dd_used_bytes,
-		    -dd->dd_phys->dd_compressed_bytes,
-		    -dd->dd_phys->dd_uncompressed_bytes, tx);
-		dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
-		    dd->dd_phys->dd_used_bytes,
-		    dd->dd_phys->dd_compressed_bytes,
-		    dd->dd_phys->dd_uncompressed_bytes, tx);
-
-		if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
-			uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
-			    dd->dd_phys->dd_used_bytes;
+		    -dsl_dir_phys(dd)->dd_used_bytes,
+		    -dsl_dir_phys(dd)->dd_compressed_bytes,
+		    -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
+		dsl_dir_diduse_space(newparent, DD_USED_CHILD,
+		    dsl_dir_phys(dd)->dd_used_bytes,
+		    dsl_dir_phys(dd)->dd_compressed_bytes,
+		    dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
+
+		if (dsl_dir_phys(dd)->dd_reserved >
+		    dsl_dir_phys(dd)->dd_used_bytes) {
+			uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
+			    dsl_dir_phys(dd)->dd_used_bytes;
 
 			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
 			    -unused_rsrv, 0, 0, tx);
-			dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV,
+			dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
 			    unused_rsrv, 0, 0, tx);
 		}
 	}
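
The delta computed in dsl_dir_rename_check() applies uniformly: a rename
changes only the common prefix, so every descendant's full name grows by
the same number of bytes. A standalone sketch of the overflow test in
dsl_valid_rename(), assuming the 256-byte ZFS_MAX_DATASET_NAME_LEN used
elsewhere in this patch (the names and helper below are hypothetical):

	#include <stdio.h>
	#include <string.h>

	#define MAX_NAME_LEN 256	/* stands in for ZFS_MAX_DATASET_NAME_LEN */

	/* Each descendant's full name must stay under the cap after a rename. */
	static int
	rename_would_overflow(const char *child_fullname, int delta)
	{
		return ((int)strlen(child_fullname) + delta >= MAX_NAME_LEN);
	}

	int
	main(void)
	{
		char name[MAX_NAME_LEN];

		/* A descendant whose full name is two bytes under the cap... */
		memset(name, 'a', sizeof (name) - 2);
		name[sizeof (name) - 2] = '\0';

		/* ...survives a shrinking rename but not a growing one. */
		printf("%d %d\n", rename_would_overflow(name, -1),
		    rename_would_overflow(name, 2));
		return (0);
	}

This is also why the check is skipped when delta <= 0: names can only
shrink or stay the same length, so nothing can newly overflow.
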
@@ -1315,68 +1897,71 @@ dsl_dir_rename_sync(void *arg1, void *ar
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 
 	/* remove from old parent zapobj */
-	err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+	error = zap_remove(mos,
+	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
 	    dd->dd_myname, tx);
-	ASSERT3U(err, ==, 0);
+	ASSERT0(error);
 
-	(void) strcpy(dd->dd_myname, ra->mynewname);
-	dsl_dir_close(dd->dd_parent, dd);
-	dd->dd_phys->dd_parent_obj = ra->newparent->dd_object;
-	VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
-	    ra->newparent->dd_object, NULL, dd, &dd->dd_parent));
+	(void) strcpy(dd->dd_myname, mynewname);
+	dsl_dir_rele(dd->dd_parent, dd);
+	dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
+	VERIFY0(dsl_dir_hold_obj(dp,
+	    newparent->dd_object, NULL, dd, &dd->dd_parent));
 
 	/* add to new parent zapobj */
-	err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
-	    dd->dd_myname, 8, 1, &dd->dd_object, tx);
-	ASSERT3U(err, ==, 0);
+	VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
+	    dd->dd_myname, 8, 1, &dd->dd_object, tx));
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
+	zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname);
+#endif
+#endif
+
+	dsl_prop_notify_all(dd);
 
-	spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa,
-	    tx, cr, "dataset = %llu", dd->dd_phys->dd_head_dataset_obj);
+	dsl_dir_rele(newparent, FTAG);
+	dsl_dir_rele(dd, FTAG);
 }
 
 int
-dsl_dir_rename(dsl_dir_t *dd, const char *newname)
+dsl_dir_rename(const char *oldname, const char *newname)
 {
-	struct renamearg ra;
-	int err;
+	dsl_dir_rename_arg_t ddra;
 
-	/* new parent should exist */
-	err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname);
-	if (err)
-		return (err);
-
-	/* can't rename to different pool */
-	if (dd->dd_pool != ra.newparent->dd_pool) {
-		err = ENXIO;
-		goto out;
-	}
-
-	/* new name should not already exist */
-	if (ra.mynewname == NULL) {
-		err = EEXIST;
-		goto out;
-	}
-
-	err = dsl_sync_task_do(dd->dd_pool,
-	    dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
+	ddra.ddra_oldname = oldname;
+	ddra.ddra_newname = newname;
+	ddra.ddra_cred = CRED();
 
-out:
-	dsl_dir_close(ra.newparent, FTAG);
-	return (err);
+	return (dsl_sync_task(oldname,
+	    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
+	    3, ZFS_SPACE_CHECK_RESERVED));
 }
 
 int
-dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
+dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+    uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr)
 {
 	dsl_dir_t *ancestor;
 	int64_t adelta;
 	uint64_t avail;
+	int err;
 
 	ancestor = closest_common_ancestor(sdd, tdd);
 	adelta = would_change(sdd, -space, ancestor);
 	avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
 	if (avail < space)
-		return (ENOSPC);
+		return (SET_ERROR(ENOSPC));
+
+	err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
+	    ancestor, cr);
+	if (err != 0)
+		return (err);
+	err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
+	    ancestor, cr);
+	if (err != 0)
+		return (err);
 
 	return (0);
 }
@@ -1403,3 +1988,19 @@ dsl_dir_snap_cmtime_update(dsl_dir_t *dd
 	dd->dd_snap_cmtime = t;
 	mutex_exit(&dd->dd_lock);
 }
+
+void
+dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+	objset_t *mos = dd->dd_pool->dp_meta_objset;
+	dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
+}
+
+boolean_t
+dsl_dir_is_zapified(dsl_dir_t *dd)
+{
+	dmu_object_info_t doi;
+
+	dmu_object_info_from_db(dd->dd_dbuf, &doi);
+	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
+}
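
The two helpers above underpin a convention used throughout this patch:
optional per-directory fields (the filesystem and snapshot counts) live in
a ZAP object, and readers treat ENOENT from zap_lookup() as "never
initialized, nothing to enforce". A toy model of that convention, with a
hypothetical fake_zap_lookup() standing in for the DMU API:

	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Hypothetical stand-in for zap_lookup() on an optional entry:
	 * returns 0 and a value on a hit, ENOENT when the entry was never
	 * created.
	 */
	static int
	fake_zap_lookup(const uint64_t *entry, uint64_t *valp)
	{
		if (entry == NULL)
			return (ENOENT);
		*valp = *entry;
		return (0);
	}

	int
	main(void)
	{
		uint64_t fs_count = 3, val;

		if (fake_zap_lookup(&fs_count, &val) == 0)	/* initialized */
			printf("count=%llu\n", (unsigned long long)val);
		if (fake_zap_lookup(NULL, &val) == ENOENT)	/* never zapified */
			printf("uninitialized: skip enforcement\n");
		return (0);
	}

This is the same shape as the zap_lookup() calls in dsl_fs_ss_limit_check()
and dsl_fs_ss_count_adjust(), which return early on ENOENT.
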
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c,v
retrieving revision 1.3
diff -u -p -r1.3 dsl_pool.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c	27 Feb 2010 23:43:53 -0000	1.3
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c	28 Apr 2017 15:42:59 -0000
@@ -19,14 +19,21 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
+#include <sys/dsl_scan.h>
+#include <sys/dnode.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
@@ -36,33 +43,207 @@
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/bptree.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
+#include <sys/dsl_userhold.h>
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+/*
+ * ZFS Write Throttle
+ * ------------------
+ *
+ * ZFS must limit the rate of incoming writes to the rate at which it is able
+ * to sync data modifications to the backend storage. Throttling by too much
+ * creates an artificial limit; throttling by too little can only be sustained
+ * for short periods and would lead to highly lumpy performance. On a per-pool
+ * basis, ZFS tracks the amount of modified (dirty) data. As operations change
+ * data, the amount of dirty data increases; as ZFS syncs out data, the amount
+ * of dirty data decreases. When the amount of dirty data exceeds a
+ * predetermined threshold further modifications are blocked until the amount
+ * of dirty data decreases (as data is synced out).
+ *
+ * The limit on dirty data is tunable, and should be adjusted according to
+ * both the IO capacity and available memory of the system. The larger the
+ * window, the more ZFS is able to aggregate and amortize metadata (and data)
+ * changes. However, memory is a limited resource, and allowing for more dirty
+ * data comes at the cost of keeping other useful data in memory (for example
+ * ZFS data cached by the ARC).
+ *
+ * Implementation
+ *
+ * As buffers are modified dsl_pool_willuse_space() increments both the per-
+ * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
+ * dirty space used; dsl_pool_dirty_space() decrements those values as data
+ * is synced out from dsl_pool_sync(). While only the poolwide value is
+ * relevant, the per-txg value is useful for debugging. The tunable
+ * zfs_dirty_data_max determines the dirty space limit. Once that value is
+ * exceeded, new writes are halted until space frees up.
+ *
+ * The zfs_dirty_data_sync tunable dictates the threshold at which we
+ * ensure that there is a txg syncing (see the comment in txg.c for a full
+ * description of transaction group stages).
+ *
+ * The IO scheduler uses both the dirty space limit and current amount of
+ * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
+ * issues. See the comment in vdev_queue.c for details of the IO scheduler.
+ *
+ * The delay is also calculated based on the amount of dirty data.  See the
+ * comment above dmu_tx_delay() for details.
+ */
+
+/*
+ * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
+ * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
+ */
+uint64_t zfs_dirty_data_max;
+uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
+int zfs_dirty_data_max_percent = 10;
+
+/*
+ * If there is at least this much dirty data, push out a txg.
+ */
+uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;
+
+/*
+ * Once there is this amount of dirty data, dmu_tx_delay() will kick in
+ * and delay each transaction.
+ * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
+ */
+int zfs_delay_min_dirty_percent = 60;
+
+/*
+ * This controls how quickly the delay approaches infinity.
+ * Larger values cause it to delay more for a given amount of dirty data.
+ * Therefore larger values will cause there to be less dirty data for a
+ * given throughput.
+ *
+ * For the smoothest delay, this value should be about 1 billion divided
+ * by the maximum number of operations per second.  This will smoothly
+ * handle between 10x and 1/10th this number.
+ *
+ * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
+ * multiply in dmu_tx_delay().
+ */
+uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
+
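
To put the zfs_delay_scale comment in concrete terms: dmu_tx_delay()
(referenced above; the kernel code differs in detail) charges roughly
delay = zfs_delay_scale * (dirty - min) / (dirty_max - dirty) nanoseconds,
so the delay rises smoothly and diverges as dirty data approaches the cap.
A standalone sketch using this patch's defaults (60% threshold, 500,000ns
scale); the helper names below are hypothetical:

	#include <stdint.h>
	#include <stdio.h>

	static const uint64_t delay_scale = 1000 * 1000 * 1000 / 2000;	/* ns */

	/*
	 * Approximate per-transaction delay: zero below the
	 * zfs_delay_min_dirty_percent threshold, hyperbolic growth above it.
	 */
	static uint64_t
	tx_delay_ns(uint64_t dirty, uint64_t dirty_max, int min_pct)
	{
		uint64_t min_bytes = dirty_max * min_pct / 100;

		if (dirty <= min_bytes)
			return (0);
		if (dirty >= dirty_max)
			return (UINT64_MAX);	/* writes would block outright */
		return (delay_scale * (dirty - min_bytes) /
		    (dirty_max - dirty));
	}

	int
	main(void)
	{
		uint64_t max = 4ULL << 30;	/* a 4GB zfs_dirty_data_max */

		for (int pct = 60; pct <= 95; pct += 5)
			printf("%2d%% dirty -> %llu ns\n", pct,
			    (unsigned long long)tx_delay_ns(max * pct / 100,
			    max, 60));
		return (0);
	}

At 80% dirty the sketch yields roughly 500us per transaction; at 95% it is
3.5ms, illustrating why the comment notes that larger scale values leave
less dirty data outstanding for a given throughput.
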
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
 
-int zfs_no_write_throttle = 0;
-int zfs_write_limit_shift = 3;			/* 1/8th of physical memory */
-int zfs_txg_synctime_ms = 5000;		/* target millisecs to sync a txg */
-
-uint64_t zfs_write_limit_min = 32 << 20;	/* min write limit is 32MB */
-uint64_t zfs_write_limit_max = 0;		/* max data payload per txg */
-uint64_t zfs_write_limit_inflated = 0;
-uint64_t zfs_write_limit_override = 0;
+extern int zfs_vdev_async_write_active_max_dirty_percent;
 
-kmutex_t zfs_write_limit_lock;
+SYSCTL_DECL(_vfs_zfs);
 
-static pgcnt_t old_physmem = 0;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
+    &zfs_dirty_data_max, 0,
+    "The maximum amount of dirty data in bytes after which new writes are "
+    "halted until space becomes available");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
+    &zfs_dirty_data_max_max, 0,
+    "The absolute cap on dirty_data_max when auto calculating");
+
+static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent,
+    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+    sysctl_zfs_dirty_data_max_percent, "I",
+    "The percent of physical memory used to auto calculate dirty_data_max");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN,
+    &zfs_dirty_data_sync, 0,
+    "Force a txg if the number of dirty buffer bytes exceed this value");
+
+static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
+/* No zfs_delay_min_dirty_percent tunable due to limit requirements */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
+    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
+    sysctl_zfs_delay_min_dirty_percent, "I",
+    "The limit of outstanding dirty data before transactions are delayed");
+
+static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
+/* No zfs_delay_scale tunable due to limit requirements */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
+    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+    sysctl_zfs_delay_scale, "QU",
+    "Controls how quickly the delay approaches infinity");
 
 static int
+sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS)
+{
+	int val, err;
+
+	val = zfs_dirty_data_max_percent;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < 0 || val > 100)
+		return (EINVAL);
+
+	zfs_dirty_data_max_percent = val;
+
+	return (0);
+}
+
+static int
+sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
+{
+	int val, err;
+
+	val = zfs_delay_min_dirty_percent;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < zfs_vdev_async_write_active_max_dirty_percent)
+		return (EINVAL);
+
+	zfs_delay_min_dirty_percent = val;
+
+	return (0);
+}
+
+static int
+sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = zfs_delay_scale;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val > UINT64_MAX / zfs_dirty_data_max)
+		return (EINVAL);
+
+	zfs_delay_scale = val;
+
+	return (0);
+}
+#endif
+
+hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
+hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
+
+int
 dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
 	uint64_t obj;
 	int err;
 
 	err = zap_lookup(dp->dp_meta_objset,
-	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
+	    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
 	    name, sizeof (obj), 1, &obj);
 	if (err)
 		return (err);
 
-	return (dsl_dir_open_obj(dp, obj, name, dp, ddp));
+	return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
 }
 
 static dsl_pool_t *
@@ -74,21 +255,20 @@ dsl_pool_open_impl(spa_t *spa, uint64_t 
 	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
 	dp->dp_spa = spa;
 	dp->dp_meta_rootbp = *bp;
-	rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
-	dp->dp_write_limit = zfs_write_limit_min;
+	rrw_init(&dp->dp_config_rwlock, B_TRUE);
 	txg_init(dp, txg);
 
 	txg_list_create(&dp->dp_dirty_datasets,
 	    offsetof(dsl_dataset_t, ds_dirty_link));
+	txg_list_create(&dp->dp_dirty_zilogs,
+	    offsetof(zilog_t, zl_dirty_link));
 	txg_list_create(&dp->dp_dirty_dirs,
 	    offsetof(dsl_dir_t, dd_dirty_link));
 	txg_list_create(&dp->dp_sync_tasks,
-	    offsetof(dsl_sync_task_group_t, dstg_node));
-	list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t),
-	    offsetof(dsl_dataset_t, ds_synced_link));
+	    offsetof(dsl_sync_task_t, dst_node));
 
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
 
 	dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
 	    1, 4, 0);
@@ -97,26 +277,37 @@ dsl_pool_open_impl(spa_t *spa, uint64_t 
 }
 
 int
-dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
+dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
 {
 	int err;
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
-	dsl_dir_t *dd;
-	dsl_dataset_t *ds;
 
-	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
 	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
 	    &dp->dp_meta_objset);
-	if (err)
-		goto out;
+	if (err != 0)
+		dsl_pool_close(dp);
+	else
+		*dpp = dp;
 
+	return (err);
+}
+
+int
+dsl_pool_open(dsl_pool_t *dp)
+{
+	int err;
+	dsl_dir_t *dd;
+	dsl_dataset_t *ds;
+	uint64_t obj;
+
+	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
 	    &dp->dp_root_dir_obj);
 	if (err)
 		goto out;
 
-	err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 	    NULL, dp, &dp->dp_root_dir);
 	if (err)
 		goto out;
@@ -125,131 +316,122 @@ dsl_pool_open(spa_t *spa, uint64_t txg, 
 	if (err)
 		goto out;
 
-	if (spa_version(spa) >= SPA_VERSION_ORIGIN) {
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
 		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
 		if (err)
 			goto out;
-		err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
-		    FTAG, &ds);
+		err = dsl_dataset_hold_obj(dp,
+		    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
 		if (err == 0) {
 			err = dsl_dataset_hold_obj(dp,
-			    ds->ds_phys->ds_prev_snap_obj, dp,
+			    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
 			    &dp->dp_origin_snap);
 			dsl_dataset_rele(ds, FTAG);
 		}
-		dsl_dir_close(dd, dp);
+		dsl_dir_rele(dd, dp);
 		if (err)
 			goto out;
 	}
 
-	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
-	    &dp->dp_tmp_userrefs_obj);
-	if (err == ENOENT)
-		err = 0;
-	if (err)
-		goto out;
-
-	/* get scrub status */
-	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
-	    &dp->dp_scrub_func);
-	if (err == 0) {
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
-		    &dp->dp_scrub_queue_obj);
-		if (err)
-			goto out;
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
-		    &dp->dp_scrub_min_txg);
-		if (err)
-			goto out;
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
-		    &dp->dp_scrub_max_txg);
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
+		    &dp->dp_free_dir);
 		if (err)
 			goto out;
+
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
-		    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
-		    &dp->dp_scrub_bookmark);
+		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
 		if (err)
 			goto out;
+		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
+		    dp->dp_meta_objset, obj));
+	}
+
+	/*
+	 * Note: errors ignored, because the leak dir will not exist if we
+	 * have not encountered a leak yet.
+	 */
+	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
+	    &dp->dp_leak_dir);
+
+	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
-		    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
-		    &dp->dp_scrub_ddt_bookmark);
-		if (err && err != ENOENT)
-			goto out;
-		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
-		    &dp->dp_scrub_ddt_class_max);
-		if (err && err != ENOENT)
+		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+		    &dp->dp_bptree_obj);
+		if (err != 0)
 			goto out;
+	}
+
+	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-		    &spa->spa_scrub_errors);
-		if (err)
+		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+		    &dp->dp_empty_bpobj);
+		if (err != 0)
 			goto out;
-		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
-			/*
-			 * A new-type scrub was in progress on an old
-			 * pool.  Restart from the beginning, since the
-			 * old software may have changed the pool in the
-			 * meantime.
-			 */
-			dsl_pool_scrub_restart(dp);
-		}
-	} else {
-		/*
-		 * It's OK if there is no scrub in progress (and if
-		 * there was an I/O error, ignore it).
-		 */
-		err = 0;
 	}
 
-out:
-	rw_exit(&dp->dp_config_rwlock);
+	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
+	    &dp->dp_tmp_userrefs_obj);
+	if (err == ENOENT)
+		err = 0;
 	if (err)
-		dsl_pool_close(dp);
-	else
-		*dpp = dp;
+		goto out;
+
+	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
 
+out:
+	rrw_exit(&dp->dp_config_rwlock, FTAG);
 	return (err);
 }
 
 void
 dsl_pool_close(dsl_pool_t *dp)
 {
-	/* drop our references from dsl_pool_open() */
-
 	/*
+	 * Drop our references from dsl_pool_open().
+	 *
 	 * Since we held the origin_snap from "syncing" context (which
 	 * includes pool-opening context), it actually only got a "ref"
 	 * and not a hold, so just drop that here.
 	 */
 	if (dp->dp_origin_snap)
-		dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
+		dsl_dataset_rele(dp->dp_origin_snap, dp);
 	if (dp->dp_mos_dir)
-		dsl_dir_close(dp->dp_mos_dir, dp);
+		dsl_dir_rele(dp->dp_mos_dir, dp);
+	if (dp->dp_free_dir)
+		dsl_dir_rele(dp->dp_free_dir, dp);
+	if (dp->dp_leak_dir)
+		dsl_dir_rele(dp->dp_leak_dir, dp);
 	if (dp->dp_root_dir)
-		dsl_dir_close(dp->dp_root_dir, dp);
+		dsl_dir_rele(dp->dp_root_dir, dp);
+
+	bpobj_close(&dp->dp_free_bpobj);
 
 	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
 	if (dp->dp_meta_objset)
 		dmu_objset_evict(dp->dp_meta_objset);
 
 	txg_list_destroy(&dp->dp_dirty_datasets);
-	txg_list_destroy(&dp->dp_dirty_dirs);
+	txg_list_destroy(&dp->dp_dirty_zilogs);
 	txg_list_destroy(&dp->dp_sync_tasks);
-	list_destroy(&dp->dp_synced_datasets);
+	txg_list_destroy(&dp->dp_dirty_dirs);
+
+	/*
+	 * We can't set retry to TRUE since we're explicitly specifying
+	 * a spa to flush. This is good enough; any missed buffers for
+	 * this spa won't cause trouble, and they'll eventually fall
+	 * out of the ARC just like any other unused buffer.
+	 */
+	arc_flush(dp->dp_spa, FALSE);
 
-	arc_flush(dp->dp_spa);
 	txg_fini(dp);
-	rw_destroy(&dp->dp_config_rwlock);
+	dsl_scan_fini(dp);
+	dmu_buf_user_evict_wait();
+
+	rrw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
-	mutex_destroy(&dp->dp_scrub_cancel_lock);
+	cv_destroy(&dp->dp_spaceavail_cv);
 	taskq_destroy(dp->dp_vnrele_taskq);
 	if (dp->dp_blkstats)
 		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
@@ -264,7 +446,9 @@ dsl_pool_create(spa_t *spa, nvlist_t *zp
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 	objset_t *os;
 	dsl_dataset_t *ds;
-	uint64_t dsobj;
+	uint64_t obj;
+
+	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 
 	/* create and open the MOS (meta-objset) */
 	dp->dp_meta_objset = dmu_objset_create_impl(spa,
@@ -273,28 +457,48 @@ dsl_pool_create(spa_t *spa, nvlist_t *zp
 	/* create the pool directory */
 	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
-	ASSERT3U(err, ==, 0);
+	ASSERT0(err);
+
+	/* Initialize scan structures */
+	VERIFY0(dsl_scan_init(dp, txg));
 
 	/* create and open the root dir */
 	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
-	VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
 	    NULL, dp, &dp->dp_root_dir));
 
 	/* create and open the meta-objset dir */
 	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
-	VERIFY(0 == dsl_pool_open_special_dir(dp,
+	VERIFY0(dsl_pool_open_special_dir(dp,
 	    MOS_DIR_NAME, &dp->dp_mos_dir));
 
+	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+		/* create and open the free dir */
+		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+		    FREE_DIR_NAME, tx);
+		VERIFY0(dsl_pool_open_special_dir(dp,
+		    FREE_DIR_NAME, &dp->dp_free_dir));
+
+		/* create and open the free_bplist */
+		obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
+		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
+		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
+		    dp->dp_meta_objset, obj));
+	}
+
 	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 		dsl_pool_create_origin(dp, tx);
 
 	/* create the root dataset */
-	dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
+	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 
 	/* create the root objset */
-	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+	VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	os = dmu_objset_create_impl(dp->dp_spa, ds,
 	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 #ifdef _KERNEL
 	zfs_create_fs(os, kcred, zplprops, tx);
 #endif
@@ -302,9 +506,54 @@ dsl_pool_create(spa_t *spa, nvlist_t *zp
 
 	dmu_tx_commit(tx);
 
+	rrw_exit(&dp->dp_config_rwlock, FTAG);
+
 	return (dp);
 }
 
+/*
+ * Account for the meta-objset space in its placeholder dsl_dir.
+ */
+void
+dsl_pool_mos_diduse_space(dsl_pool_t *dp,
+    int64_t used, int64_t comp, int64_t uncomp)
+{
+	ASSERT3U(comp, ==, uncomp); /* it's all metadata */
+	mutex_enter(&dp->dp_lock);
+	dp->dp_mos_used_delta += used;
+	dp->dp_mos_compressed_delta += comp;
+	dp->dp_mos_uncompressed_delta += uncomp;
+	mutex_exit(&dp->dp_lock);
+}
+
+static void
+dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	dmu_objset_sync(dp->dp_meta_objset, zio, tx);
+	VERIFY0(zio_wait(zio));
+	dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
+	spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+}
+
+static void
+dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
+{
+	ASSERT(MUTEX_HELD(&dp->dp_lock));
+
+	if (delta < 0)
+		ASSERT3U(-delta, <=, dp->dp_dirty_total);
+
+	dp->dp_dirty_total += delta;
+
+	/*
+	 * Note: we signal even when increasing dp_dirty_total.
+	 * This ensures forward progress -- each thread wakes the next waiter.
+	 */
+	if (dp->dp_dirty_total <= zfs_dirty_data_max)
+		cv_signal(&dp->dp_spaceavail_cv);
+}
+
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
@@ -312,157 +561,150 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t t
 	dmu_tx_t *tx;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
-	dsl_sync_task_group_t *dstg;
 	objset_t *mos = dp->dp_meta_objset;
-	hrtime_t start, write_time;
-	uint64_t data_written;
-	int err;
+	list_t synced_datasets;
 
-	tx = dmu_tx_create_assigned(dp, txg);
+	list_create(&synced_datasets, sizeof (dsl_dataset_t),
+	    offsetof(dsl_dataset_t, ds_synced_link));
 
-	dp->dp_read_overhead = 0;
-	start = gethrtime();
+	tx = dmu_tx_create_assigned(dp, txg);
 
+	/*
+	 * Write out all dirty blocks of dirty datasets.
+	 */
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		/*
 		 * We must not sync any non-MOS datasets twice, because
 		 * we may have taken a snapshot of them.  However, we
 		 * may sync newly-created datasets on pass 2.
 		 */
 		ASSERT(!list_link_active(&ds->ds_synced_link));
-		list_insert_tail(&dp->dp_synced_datasets, ds);
+		list_insert_tail(&synced_datasets, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
-	DTRACE_PROBE(pool_sync__1setup);
-	err = zio_wait(zio);
+	VERIFY0(zio_wait(zio));
 
-	write_time = gethrtime() - start;
-	ASSERT(err == 0);
-	DTRACE_PROBE(pool_sync__2rootzio);
-
-	for (ds = list_head(&dp->dp_synced_datasets); ds;
-	    ds = list_next(&dp->dp_synced_datasets, ds))
-		dmu_objset_do_userquota_callbacks(ds->ds_objset, tx);
+	/*
+	 * We have written all of the accounted dirty data, so our
+	 * dp_space_towrite should now be zero.  However, some seldom-used
+	 * code paths do not adhere to this (e.g. dbuf_undirty(), also
+	 * rounding error in dbuf_write_physdone).
+	 * Shore up the accounting of any dirtied space now.
+	 */
+	dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
+
+	/*
+	 * Update the long range free counter after
+	 * we're done syncing user data
+	 */
+	mutex_enter(&dp->dp_lock);
+	ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
+	    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
+	dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
+	mutex_exit(&dp->dp_lock);
+
+	/*
+	 * After the data blocks have been written (ensured by the zio_wait()
+	 * above), update the user/group space accounting.
+	 */
+	for (ds = list_head(&synced_datasets); ds != NULL;
+	    ds = list_next(&synced_datasets, ds)) {
+		dmu_objset_do_userquota_updates(ds->ds_objset, tx);
+	}
 
 	/*
 	 * Sync the datasets again to push out the changes due to
-	 * userquota updates.  This must be done before we process the
-	 * sync tasks, because that could cause a snapshot of a dataset
-	 * whose ds_bp will be rewritten when we do this 2nd sync.
+	 * userspace updates.  This must be done before we process the
+	 * sync tasks, so that any snapshots will have the correct
+	 * user accounting information (and we won't get confused
+	 * about which blocks are part of the snapshot).
 	 */
 	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
 		ASSERT(list_link_active(&ds->ds_synced_link));
 		dmu_buf_rele(ds->ds_dbuf, ds);
 		dsl_dataset_sync(ds, zio, tx);
 	}
-	err = zio_wait(zio);
+	VERIFY0(zio_wait(zio));
 
 	/*
-	 * If anything was added to a deadlist during a zio done callback,
-	 * it had to be put on the deferred queue.  Enqueue it for real now.
+	 * Now that the datasets have been completely synced, we can
+	 * clean up our in-memory structures accumulated while syncing:
+	 *
+	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
+	 *  - release hold from dsl_dataset_dirty()
 	 */
-	for (ds = list_head(&dp->dp_synced_datasets); ds;
-	    ds = list_next(&dp->dp_synced_datasets, ds))
-		bplist_sync(&ds->ds_deadlist,
-		    bplist_enqueue_cb, &ds->ds_deadlist, tx);
-
-	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
-		/*
-		 * No more sync tasks should have been added while we
-		 * were syncing.
-		 */
-		ASSERT(spa_sync_pass(dp->dp_spa) == 1);
-		dsl_sync_task_group_sync(dstg, tx);
+	while ((ds = list_remove_head(&synced_datasets)) != NULL) {
+		dsl_dataset_sync_done(ds, tx);
 	}
-	DTRACE_PROBE(pool_sync__3task);
-
-	start = gethrtime();
-	while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
+	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
 		dsl_dir_sync(dd, tx);
-	write_time += gethrtime() - start;
+	}
 
-	if (spa_sync_pass(dp->dp_spa) == 1) {
-		dp->dp_scrub_prefetch_zio_root = zio_root(dp->dp_spa, NULL,
-		    NULL, ZIO_FLAG_CANFAIL);
-		dsl_pool_scrub_sync(dp, tx);
-		(void) zio_wait(dp->dp_scrub_prefetch_zio_root);
+	/*
+	 * The MOS's space is accounted for in the pool/$MOS
+	 * (dp_mos_dir).  We can't modify the mos while we're syncing
+	 * it, so we remember the deltas and apply them here.
+	 */
+	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
+	    dp->dp_mos_uncompressed_delta != 0) {
+		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
+		    dp->dp_mos_used_delta,
+		    dp->dp_mos_compressed_delta,
+		    dp->dp_mos_uncompressed_delta, tx);
+		dp->dp_mos_used_delta = 0;
+		dp->dp_mos_compressed_delta = 0;
+		dp->dp_mos_uncompressed_delta = 0;
 	}
 
-	start = gethrtime();
 	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
-		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-		dmu_objset_sync(mos, zio, tx);
-		err = zio_wait(zio);
-		ASSERT(err == 0);
-		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
-		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
-	}
-	write_time += gethrtime() - start;
-	DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
-	    hrtime_t, dp->dp_read_overhead);
-	write_time -= dp->dp_read_overhead;
-
-	dmu_tx_commit(tx);
-
-	data_written = dp->dp_space_towrite[txg & TXG_MASK];
-	dp->dp_space_towrite[txg & TXG_MASK] = 0;
-	ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
-
-	/*
-	 * If the write limit max has not been explicitly set, set it
-	 * to a fraction of available physical memory (default 1/8th).
-	 * Note that we must inflate the limit because the spa
-	 * inflates write sizes to account for data replication.
-	 * Check this each sync phase to catch changing memory size.
-	 */
-	if (physmem != old_physmem && zfs_write_limit_shift) {
-		mutex_enter(&zfs_write_limit_lock);
-		old_physmem = physmem;
-		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
-		zfs_write_limit_inflated = MAX(zfs_write_limit_min,
-		    spa_get_asize(dp->dp_spa, zfs_write_limit_max));
-		mutex_exit(&zfs_write_limit_lock);
+		dsl_pool_sync_mos(dp, tx);
 	}
 
 	/*
-	 * Attempt to keep the sync time consistent by adjusting the
-	 * amount of write traffic allowed into each transaction group.
-	 * Weight the throughput calculation towards the current value:
-	 * 	thru = 3/4 old_thru + 1/4 new_thru
-	 *
-	 * Note: write_time is in nanosecs, so write_time/MICROSEC
-	 * yields millisecs
+	 * If we modify a dataset in the same txg that we want to destroy it,
+	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
+	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
+	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
+	 * and clearing the hold on it) before we process the sync_tasks.
+	 * The MOS data dirtied by the sync_tasks will be synced on the next
+	 * pass.
 	 */
-	ASSERT(zfs_write_limit_min > 0);
-	if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
-		uint64_t throughput = data_written / (write_time / MICROSEC);
-
-		if (dp->dp_throughput)
-			dp->dp_throughput = throughput / 4 +
-			    3 * dp->dp_throughput / 4;
-		else
-			dp->dp_throughput = throughput;
-		dp->dp_write_limit = MIN(zfs_write_limit_inflated,
-		    MAX(zfs_write_limit_min,
-		    dp->dp_throughput * zfs_txg_synctime_ms));
+	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
+		dsl_sync_task_t *dst;
+		/*
+		 * No more sync tasks should have been added while we
+		 * were syncing.
+		 */
+		ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+		while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
+			dsl_sync_task_sync(dst, tx);
 	}
+
+	dmu_tx_commit(tx);
+
+	DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
 }
 
 void
 dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
 {
-	dsl_dataset_t *ds;
-	objset_t *os;
+	zilog_t *zilog;
 
-	while (ds = list_head(&dp->dp_synced_datasets)) {
-		list_remove(&dp->dp_synced_datasets, ds);
-		os = ds->ds_objset;
-		zil_clean(os->os_zil);
-		ASSERT(!dmu_objset_is_dirty(os, txg));
-		dmu_buf_rele(ds->ds_dbuf, ds);
+	while (zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) {
+		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+		/*
+		 * We don't remove the zilog from the dp_dirty_zilogs
+		 * list until after we've cleaned it. This ensures that
+		 * callers of zilog_is_dirty() receive an accurate
+		 * answer when they are racing with the spa sync thread.
+		 */
+		zil_clean(zilog, txg);
+		(void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
+		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
+		dmu_buf_rele(ds->ds_dbuf, zilog);
 	}
 	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
 }
@@ -475,7 +717,7 @@ int
 dsl_pool_sync_context(dsl_pool_t *dp)
 {
 	return (curthread == dp->dp_tx.tx_sync_thread ||
-	    spa_get_dsl(dp->dp_spa) == NULL);
+	    spa_is_initializing(dp->dp_spa));
 }
 
 uint64_t
@@ -484,121 +726,83 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, bo
 	uint64_t space, resv;
 
 	/*
-	 * Reserve about 1.6% (1/64), or at least 32MB, for allocation
-	 * efficiency.
-	 * XXX The intent log is not accounted for, so it must fit
-	 * within this slop.
-	 *
 	 * If we're trying to assess whether it's OK to do a free,
 	 * cut the reservation in half to allow forward progress
 	 * (e.g. make it possible to rm(1) files from a full pool).
 	 */
 	space = spa_get_dspace(dp->dp_spa);
-	resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
+	resv = spa_get_slop_space(dp->dp_spa);
 	if (netfree)
 		resv >>= 1;
 
 	return (space - resv);
 }
 
-int
-dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
+boolean_t
+dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
-	uint64_t reserved = 0;
-	uint64_t write_limit = (zfs_write_limit_override ?
-	    zfs_write_limit_override : dp->dp_write_limit);
-
-	if (zfs_no_write_throttle) {
-		atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
-		    space);
-		return (0);
-	}
-
-	/*
-	 * Check to see if we have exceeded the maximum allowed IO for
-	 * this transaction group.  We can do this without locks since
-	 * a little slop here is ok.  Note that we do the reserved check
-	 * with only half the requested reserve: this is because the
-	 * reserve requests are worst-case, and we really don't want to
-	 * throttle based off of worst-case estimates.
-	 */
-	if (write_limit > 0) {
-		reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
-		    + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;
-
-		if (reserved && reserved > write_limit)
-			return (ERESTART);
-	}
-
-	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
-
-	/*
-	 * If this transaction group is over 7/8ths capacity, delay
-	 * the caller 1 clock tick.  This will slow down the "fill"
-	 * rate until the sync process can catch up with us.
-	 */
-	if (reserved && reserved > (write_limit - (write_limit >> 3)))
-		txg_delay(dp, tx->tx_txg, 1);
-
-	return (0);
+	uint64_t delay_min_bytes =
+	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	boolean_t rv;
+
+	mutex_enter(&dp->dp_lock);
+	if (dp->dp_dirty_total > zfs_dirty_data_sync)
+		txg_kick(dp);
+	rv = (dp->dp_dirty_total > delay_min_bytes);
+	mutex_exit(&dp->dp_lock);
+	return (rv);
 }
 
 void
-dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
 {
-	ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
-	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
-}
-
-void
-dsl_pool_memory_pressure(dsl_pool_t *dp)
-{
-	uint64_t space_inuse = 0;
-	int i;
-
-	if (dp->dp_write_limit == zfs_write_limit_min)
-		return;
-
-	for (i = 0; i < TXG_SIZE; i++) {
-		space_inuse += dp->dp_space_towrite[i];
-		space_inuse += dp->dp_tempreserved[i];
+	if (space > 0) {
+		mutex_enter(&dp->dp_lock);
+		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
+		dsl_pool_dirty_delta(dp, space);
+		mutex_exit(&dp->dp_lock);
 	}
-	dp->dp_write_limit = MAX(zfs_write_limit_min,
-	    MIN(dp->dp_write_limit, space_inuse / 4));
 }
 
 void
-dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
 {
-	if (space > 0) {
-		mutex_enter(&dp->dp_lock);
-		dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
-		mutex_exit(&dp->dp_lock);
+	ASSERT3S(space, >=, 0);
+	if (space == 0)
+		return;
+	mutex_enter(&dp->dp_lock);
+	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
+		/* XXX writing something we didn't dirty? */
+		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
 	}
+	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
+	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
+	ASSERT3U(dp->dp_dirty_total, >=, space);
+	dsl_pool_dirty_delta(dp, -space);
+	mutex_exit(&dp->dp_lock);
 }
 
 /* ARGSUSED */
 static int
-upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
 	dmu_tx_t *tx = arg;
 	dsl_dataset_t *ds, *prev = NULL;
 	int err;
-	dsl_pool_t *dp = spa_get_dsl(spa);
 
-	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
 	if (err)
 		return (err);
 
-	while (ds->ds_phys->ds_prev_snap_obj != 0) {
-		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
-		    FTAG, &prev);
+	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+		err = dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
 		if (err) {
 			dsl_dataset_rele(ds, FTAG);
 			return (err);
 		}
 
-		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
+		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
 			break;
 		dsl_dataset_rele(ds, FTAG);
 		ds = prev;
@@ -612,7 +816,9 @@ upgrade_clones_cb(spa_t *spa, uint64_t d
 		 * The $ORIGIN can't have any data, or the accounting
 		 * will be wrong.
 		 */
-		ASSERT(prev->ds_phys->ds_bp.blk_birth == 0);
+		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+		ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
+		rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 		/* The origin doesn't get attached to itself */
 		if (ds->ds_object == prev->ds_object) {
@@ -621,33 +827,35 @@ upgrade_clones_cb(spa_t *spa, uint64_t d
 		}
 
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
-		ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
-		ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
+		dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
+		dsl_dataset_phys(ds)->ds_prev_snap_txg =
+		    dsl_dataset_phys(prev)->ds_creation_txg;
 
 		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
-		ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
+		dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
 
 		dmu_buf_will_dirty(prev->ds_dbuf, tx);
-		prev->ds_phys->ds_num_children++;
+		dsl_dataset_phys(prev)->ds_num_children++;
 
-		if (ds->ds_phys->ds_next_snap_obj == 0) {
+		if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
 			ASSERT(ds->ds_prev == NULL);
-			VERIFY(0 == dsl_dataset_hold_obj(dp,
-			    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+			VERIFY0(dsl_dataset_hold_obj(dp,
+			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
+			    ds, &ds->ds_prev));
 		}
 	}
 
-	ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object);
-	ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);
+	ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
+	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
 
-	if (prev->ds_phys->ds_next_clones_obj == 0) {
+	if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
 		dmu_buf_will_dirty(prev->ds_dbuf, tx);
-		prev->ds_phys->ds_next_clones_obj =
+		dsl_dataset_phys(prev)->ds_next_clones_obj =
 		    zap_create(dp->dp_meta_objset,
 		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
 	}
-	VERIFY(0 == zap_add_int(dp->dp_meta_objset,
-	    prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
+	VERIFY0(zap_add_int(dp->dp_meta_objset,
+	    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
 
 	dsl_dataset_rele(ds, FTAG);
 	if (prev != dp->dp_origin_snap)
@@ -661,8 +869,62 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dp->dp_origin_snap != NULL);
 
-	VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
-	    tx, DS_FIND_CHILDREN));
+	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
+	    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
+}
+
+/* ARGSUSED */
+static int
+upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+	dmu_tx_t *tx = arg;
+	objset_t *mos = dp->dp_meta_objset;
+
+	if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
+		dsl_dataset_t *origin;
+
+		VERIFY0(dsl_dataset_hold_obj(dp,
+		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
+
+		if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
+			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+			dsl_dir_phys(origin->ds_dir)->dd_clones =
+			    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
+			    0, tx);
+		}
+
+		VERIFY0(zap_add_int(dp->dp_meta_objset,
+		    dsl_dir_phys(origin->ds_dir)->dd_clones,
+		    ds->ds_object, tx));
+
+		dsl_dataset_rele(origin, FTAG);
+	}
+	return (0);
+}
+
+void
+dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	ASSERT(dmu_tx_is_syncing(tx));
+	uint64_t obj;
+
+	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
+	VERIFY0(dsl_pool_open_special_dir(dp,
+	    FREE_DIR_NAME, &dp->dp_free_dir));
+
+	/*
+	 * We can't use bpobj_alloc(), because spa_version() still
+	 * returns the old version, and we need a new-version bpobj with
+	 * subobj support.  So call dmu_object_alloc() directly.
+	 */
+	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
+	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
+
+	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
 }
 
 void
@@ -673,17 +935,16 @@ dsl_pool_create_origin(dsl_pool_t *dp, d
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dp->dp_origin_snap == NULL);
+	ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
 
 	/* create the origin dir, ds, & snap-ds */
-	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
 	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
 	    NULL, 0, kcred, tx);
-	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-	dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx);
-	VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
+	VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
 	    dp, &dp->dp_origin_snap));
 	dsl_dataset_rele(ds, FTAG);
-	rw_exit(&dp->dp_config_rwlock);
 }
 
 taskq_t *
@@ -703,23 +964,34 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *
 	zap_cursor_t zc;
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+	nvlist_t *holds;
 
 	if (zapobj == 0)
 		return;
 	ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
 
+	holds = fnvlist_alloc();
+
 	for (zap_cursor_init(&zc, mos, zapobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    zap_cursor_advance(&zc)) {
 		char *htag;
-		uint64_t dsobj;
+		nvlist_t *tags;
 
 		htag = strchr(za.za_name, '-');
 		*htag = '\0';
 		++htag;
-		dsobj = strtonum(za.za_name, NULL);
-		(void) dsl_dataset_user_release_tmp(dp, dsobj, htag);
+		if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
+			tags = fnvlist_alloc();
+			fnvlist_add_boolean(tags, htag);
+			fnvlist_add_nvlist(holds, za.za_name, tags);
+			fnvlist_free(tags);
+		} else {
+			fnvlist_add_boolean(tags, htag);
+		}
 	}
+	dsl_dataset_user_release_tmp(dp, holds);
+	fnvlist_free(holds);
 	zap_cursor_fini(&zc);
 }
 
@@ -734,16 +1006,13 @@ dsl_pool_user_hold_create_obj(dsl_pool_t
 	ASSERT(dp->dp_tmp_userrefs_obj == 0);
 	ASSERT(dmu_tx_is_syncing(tx));
 
-	dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS,
-	    DMU_OT_NONE, 0, tx);
-
-	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS,
-	    sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0);
+	dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
 }
 
 static int
 dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
-    const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding)
+    const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
 {
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t zapobj = dp->dp_tmp_userrefs_obj;
@@ -762,13 +1031,13 @@ dsl_pool_user_hold_rele_impl(dsl_pool_t 
 			dsl_pool_user_hold_create_obj(dp, tx);
 			zapobj = dp->dp_tmp_userrefs_obj;
 		} else {
-			return (ENOENT);
+			return (SET_ERROR(ENOENT));
 		}
 	}
 
 	name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
 	if (holding)
-		error = zap_add(mos, zapobj, name, 8, 1, now, tx);
+		error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
 	else
 		error = zap_remove(mos, zapobj, name, tx);
 	strfree(name);
@@ -781,7 +1050,7 @@ dsl_pool_user_hold_rele_impl(dsl_pool_t 
  */
 int
 dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
-    uint64_t *now, dmu_tx_t *tx)
+    uint64_t now, dmu_tx_t *tx)
 {
 	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
 }
@@ -793,6 +1062,122 @@ int
 dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
     dmu_tx_t *tx)
 {
-	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
+	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
 	    tx, B_FALSE));
 }
+
+/*
+ * DSL Pool Configuration Lock
+ *
+ * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
+ * creation / destruction / rename / property setting).  It must be held for
+ * read to hold a dataset or dsl_dir.  I.e. you must call
+ * dsl_pool_config_enter() or dsl_pool_hold() before calling
+ * dsl_{dataset,dir}_hold{_obj}.  In most circumstances, the dp_config_rwlock
+ * must be held continuously until all datasets and dsl_dirs are released.
+ *
+ * The only exception to this rule is that if a "long hold" is placed on
+ * a dataset, then the dp_config_rwlock may be dropped while the dataset
+ * is still held.  The long hold will prevent the dataset from being
+ * destroyed -- the destroy will fail with EBUSY.  A long hold can be
+ * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
+ * (by calling dsl_{dataset,objset}_{try}own{_obj}).
+ *
+ * Legitimate long-holders (including owners) should be long-running, cancelable
+ * tasks that should cause "zfs destroy" to fail.  This includes DMU
+ * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
+ * "zfs send", and "zfs diff".  There are several other long-holders whose
+ * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
+ *
+ * The usual formula for long-holding would be:
+ * dsl_pool_hold()
+ * dsl_dataset_hold()
+ * ... perform checks ...
+ * dsl_dataset_long_hold()
+ * dsl_pool_rele()
+ * ... perform long-running task ...
+ * dsl_dataset_long_rele()
+ * dsl_dataset_rele()
+ *
+ * Note that when the long hold is released, the dataset is still held but
+ * the pool is not held.  The dataset may change arbitrarily during this time
+ * (e.g. it could be destroyed).  Therefore you shouldn't do anything to the
+ * dataset except release it.
+ *
+ * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only
+ * or modifying operations.
+ *
+ * Modifying operations should generally use dsl_sync_task().  The synctask
+ * infrastructure enforces proper locking strategy with respect to the
+ * dp_config_rwlock.  See the comment above dsl_sync_task() for details.
+ *
+ * Read-only operations will manually hold the pool, then the dataset, obtain
+ * information from the dataset, then release the pool and dataset.
+ * dmu_objset_{hold,rele}() are convenience routines that also do the pool
+ * hold/rele.
+ */
+
+int
+dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
+{
+	spa_t *spa;
+	int error;
+
+	error = spa_open(name, &spa, tag);
+	if (error == 0) {
+		*dp = spa_get_dsl(spa);
+		dsl_pool_config_enter(*dp, tag);
+	}
+	return (error);
+}
+
+void
+dsl_pool_rele(dsl_pool_t *dp, void *tag)
+{
+	dsl_pool_config_exit(dp, tag);
+	spa_close(dp->dp_spa, tag);
+}
+
+void
+dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
+{
+	/*
+	 * We use a "reentrant" reader-writer lock, but not reentrantly.
+	 *
+	 * The rrwlock can (with the track_all flag) track all reading threads,
+	 * which is very useful for debugging which code path failed to release
+	 * the lock, and for verifying that the *current* thread does hold
+	 * the lock.
+	 *
+	 * (Unlike a rwlock, which knows that N threads hold it for
+	 * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
+	 * if any thread holds it for read, even if this thread doesn't).
+	 */
+	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
+	rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
+}
+
+void
+dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
+{
+	ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
+	rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
+}
+
+void
+dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
+{
+	rrw_exit(&dp->dp_config_rwlock, tag);
+}
+
+boolean_t
+dsl_pool_config_held(dsl_pool_t *dp)
+{
+	return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
+}
+
+boolean_t
+dsl_pool_config_held_writer(dsl_pool_t *dp)
+{
+	return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
+}
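
The dp_config_rwlock comment added above states the long-hold formula abstractly. The following is a minimal sketch of that formula, not part of the patch: dsl_pool_hold() and dsl_pool_rele() match the definitions in this hunk, the dsl_dataset_hold() and dsl_dataset_long_hold()/long_rele() signatures are assumed from the upstream interface, and zfs_do_long_task() is a hypothetical caller.

/*
 * Sketch of the long-hold pattern described in the dp_config_rwlock
 * comment.  Assumed signatures: dsl_dataset_hold(dp, name, tag, &ds),
 * dsl_dataset_long_hold(ds, tag), dsl_dataset_long_rele(ds, tag).
 */
static int
zfs_do_long_task(const char *dsname, void *tag)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int error;

	/* Takes dp_config_rwlock as READER via dsl_pool_config_enter(). */
	error = dsl_pool_hold(dsname, tag, &dp);
	if (error != 0)
		return (error);

	error = dsl_dataset_hold(dp, dsname, tag, &ds);
	if (error != 0) {
		dsl_pool_rele(dp, tag);
		return (error);
	}

	/* ... perform checks while the config lock is still held ... */

	/* The long hold makes "zfs destroy" fail with EBUSY. */
	dsl_dataset_long_hold(ds, tag);
	dsl_pool_rele(dp, tag);

	/*
	 * ... long-running, cancelable task; the dataset may change
	 * arbitrarily here, but it cannot be destroyed ...
	 */

	dsl_dataset_long_rele(ds, tag);
	dsl_dataset_rele(ds, tag);
	return (0);
}

Dropping the config lock via dsl_pool_rele() before the long-running work is the point of the formula: it lets "zfs send"-style tasks proceed without blocking pool-wide DSL changes, while the long hold alone pins the dataset against destruction.
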
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dsl_prop.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c	27 Feb 2010 22:30:58 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c	10 Oct 2016 11:09:56 -0000
@@ -19,8 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
  */
 
 #include <sys/zfs_context.h>
@@ -41,27 +42,27 @@
 #define	ZPROP_RECVD_SUFFIX "$recvd"
 
 static int
-dodefault(const char *propname, int intsz, int numints, void *buf)
+dodefault(zfs_prop_t prop, int intsz, int numints, void *buf)
 {
-	zfs_prop_t prop;
-
 	/*
 	 * The setonce properties are read-only, BUT they still
 	 * have a default value that can be used as the initial
 	 * value.
 	 */
-	if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL ||
+	if (prop == ZPROP_INVAL ||
 	    (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop)))
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
 
 	if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
+		if (zfs_prop_default_string(prop) == NULL)
+			return (SET_ERROR(ENOENT));
 		if (intsz != 1)
-			return (EOVERFLOW);
+			return (SET_ERROR(EOVERFLOW));
 		(void) strncpy(buf, zfs_prop_default_string(prop),
 		    numints);
 	} else {
 		if (intsz != 8 || numints < 1)
-			return (EOVERFLOW);
+			return (SET_ERROR(EOVERFLOW));
 
 		*(uint64_t *)buf = zfs_prop_default_numeric(prop);
 	}
@@ -82,7 +83,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const cha
 	char *inheritstr;
 	char *recvdstr;
 
-	ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
+	ASSERT(dsl_pool_config_held(dd->dd_pool));
 
 	if (setpoint)
 		setpoint[0] = '\0';
@@ -97,8 +98,6 @@ dsl_prop_get_dd(dsl_dir_t *dd, const cha
 	 * after this loop.
 	 */
 	for (; dd != NULL; dd = dd->dd_parent) {
-		ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
-
 		if (dd != target || snapshot) {
 			if (!inheritable)
 				break;
@@ -106,8 +105,8 @@ dsl_prop_get_dd(dsl_dir_t *dd, const cha
 		}
 
 		/* Check for a local value. */
-		err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
-		    intsz, numints, buf);
+		err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+		    propname, intsz, numints, buf);
 		if (err != ENOENT) {
 			if (setpoint != NULL && err == 0)
 				dsl_dir_name(dd, setpoint);
@@ -118,14 +117,14 @@ dsl_prop_get_dd(dsl_dir_t *dd, const cha
 		 * Skip the check for a received value if there is an explicit
 		 * inheritance entry.
 		 */
-		err = zap_contains(mos, dd->dd_phys->dd_props_zapobj,
+		err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
 		    inheritstr);
 		if (err != 0 && err != ENOENT)
 			break;
 
 		if (err == ENOENT) {
 			/* Check for a received value. */
-			err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
+			err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
 			    recvdstr, intsz, numints, buf);
 			if (err != ENOENT) {
 				if (setpoint != NULL && err == 0) {
@@ -146,11 +145,11 @@ dsl_prop_get_dd(dsl_dir_t *dd, const cha
 		 * at the end of the loop (instead of at the beginning) ensures
 		 * that err has a valid post-loop value.
 		 */
-		err = ENOENT;
+		err = SET_ERROR(ENOENT);
 	}
 
 	if (err == ENOENT)
-		err = dodefault(propname, intsz, numints, buf);
+		err = dodefault(prop, intsz, numints, buf);
 
 	strfree(inheritstr);
 	strfree(recvdstr);
@@ -164,19 +163,17 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const
 {
 	zfs_prop_t prop = zfs_name_to_prop(propname);
 	boolean_t inheritable;
-	boolean_t snapshot;
 	uint64_t zapobj;
 
-	ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock));
+	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 	inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
-	snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds));
-	zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj);
+	zapobj = dsl_dataset_phys(ds)->ds_props_obj;
 
 	if (zapobj != 0) {
 		objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 		int err;
 
-		ASSERT(snapshot);
+		ASSERT(ds->ds_is_snapshot);
 
 		/* Check for a local value. */
 		err = zap_lookup(mos, zapobj, propname, intsz, numints, buf);
@@ -216,7 +213,59 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const
 	}
 
 	return (dsl_prop_get_dd(ds->ds_dir, propname,
-	    intsz, numints, buf, setpoint, snapshot));
+	    intsz, numints, buf, setpoint, ds->ds_is_snapshot));
+}
+
+static dsl_prop_record_t *
+dsl_prop_record_find(dsl_dir_t *dd, const char *propname)
+{
+	dsl_prop_record_t *pr = NULL;
+
+	ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+	for (pr = list_head(&dd->dd_props);
+	    pr != NULL; pr = list_next(&dd->dd_props, pr)) {
+		if (strcmp(pr->pr_propname, propname) == 0)
+			break;
+	}
+
+	return (pr);
+}
+
+static dsl_prop_record_t *
+dsl_prop_record_create(dsl_dir_t *dd, const char *propname)
+{
+	dsl_prop_record_t *pr;
+
+	ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+	pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP);
+	pr->pr_propname = spa_strdup(propname);
+	list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t),
+	    offsetof(dsl_prop_cb_record_t, cbr_pr_node));
+	list_insert_head(&dd->dd_props, pr);
+
+	return (pr);
+}
+
+void
+dsl_prop_init(dsl_dir_t *dd)
+{
+	list_create(&dd->dd_props, sizeof (dsl_prop_record_t),
+	    offsetof(dsl_prop_record_t, pr_node));
+}
+
+void
+dsl_prop_fini(dsl_dir_t *dd)
+{
+	dsl_prop_record_t *pr;
+
+	while ((pr = list_remove_head(&dd->dd_props)) != NULL) {
+		list_destroy(&pr->pr_cbs);
+		strfree((char *)pr->pr_propname);
+		kmem_free(pr, sizeof (dsl_prop_record_t));
+	}
+	list_destroy(&dd->dd_props);
 }
 
 /*
@@ -233,38 +282,31 @@ dsl_prop_register(dsl_dataset_t *ds, con
 	dsl_dir_t *dd = ds->ds_dir;
 	dsl_pool_t *dp = dd->dd_pool;
 	uint64_t value;
+	dsl_prop_record_t *pr;
 	dsl_prop_cb_record_t *cbr;
 	int err;
-	int need_rwlock;
 
-	need_rwlock = !RW_WRITE_HELD(&dp->dp_config_rwlock);
-	if (need_rwlock)
-		rw_enter(&dp->dp_config_rwlock, RW_READER);
-
-	err = dsl_prop_get_ds(ds, propname, 8, 1, &value, NULL);
-	if (err != 0) {
-		if (need_rwlock)
-			rw_exit(&dp->dp_config_rwlock);
+	ASSERT(dsl_pool_config_held(dp));
+
+	err = dsl_prop_get_int_ds(ds, propname, &value);
+	if (err != 0)
 		return (err);
-	}
 
 	cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
 	cbr->cbr_ds = ds;
-	cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP);
-	(void) strcpy((char *)cbr->cbr_propname, propname);
 	cbr->cbr_func = callback;
 	cbr->cbr_arg = cbarg;
+
 	mutex_enter(&dd->dd_lock);
-	list_insert_head(&dd->dd_prop_cbs, cbr);
+	pr = dsl_prop_record_find(dd, propname);
+	if (pr == NULL)
+		pr = dsl_prop_record_create(dd, propname);
+	cbr->cbr_pr = pr;
+	list_insert_head(&pr->pr_cbs, cbr);
+	list_insert_head(&ds->ds_prop_cbs, cbr);
 	mutex_exit(&dd->dd_lock);
 
 	cbr->cbr_func(cbr->cbr_arg, value);
-
-	VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
-	    NULL, cbr, &dd));
-	if (need_rwlock)
-		rw_exit(&dp->dp_config_rwlock);
-	/* Leave dir open until this callback is unregistered */
 	return (0);
 }
 
@@ -272,19 +314,18 @@ int
 dsl_prop_get(const char *dsname, const char *propname,
     int intsz, int numints, void *buf, char *setpoint)
 {
-	dsl_dataset_t *ds;
-	int err;
+	objset_t *os;
+	int error;
 
-	err = dsl_dataset_hold(dsname, FTAG, &ds);
-	if (err)
-		return (err);
+	error = dmu_objset_hold(dsname, FTAG, &os);
+	if (error != 0)
+		return (error);
 
-	rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
-	err = dsl_prop_get_ds(ds, propname, intsz, numints, buf, setpoint);
-	rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
+	error = dsl_prop_get_ds(dmu_objset_ds(os), propname,
+	    intsz, numints, buf, setpoint);
 
-	dsl_dataset_rele(ds, FTAG);
-	return (err);
+	dmu_objset_rele(os, FTAG);
+	return (error);
 }
 
 /*
@@ -302,17 +343,11 @@ dsl_prop_get_integer(const char *ddname,
 	return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
 }
 
-void
-dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
-    zprop_source_t source, uint64_t *value)
+int
+dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname,
+    uint64_t *valuep)
 {
-	psa->psa_name = propname;
-	psa->psa_source = source;
-	psa->psa_intsz = 8;
-	psa->psa_numints = 1;
-	psa->psa_value = value;
-
-	psa->psa_effective_value = -1ULL;
+	return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL));
 }
 
 /*
@@ -326,11 +361,10 @@ dsl_prop_setarg_init_uint64(dsl_prop_set
  * a property not handled by this function.
  */
 int
-dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa)
+dsl_prop_predict(dsl_dir_t *dd, const char *propname,
+    zprop_source_t source, uint64_t value, uint64_t *newvalp)
 {
-	const char *propname = psa->psa_name;
 	zfs_prop_t prop = zfs_name_to_prop(propname);
-	zprop_source_t source = psa->psa_source;
 	objset_t *mos;
 	uint64_t zapobj;
 	uint64_t version;
@@ -348,7 +382,7 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl
 	}
 
 	mos = dd->dd_pool->dp_meta_objset;
-	zapobj = dd->dd_phys->dd_props_zapobj;
+	zapobj = dsl_dir_phys(dd)->dd_props_zapobj;
 	recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
 
 	version = spa_version(dd->dd_pool->dp_spa);
@@ -362,36 +396,33 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl
 	switch (source) {
 	case ZPROP_SRC_NONE:
 		/* Revert to the received value, if any. */
-		err = zap_lookup(mos, zapobj, recvdstr, 8, 1,
-		    &psa->psa_effective_value);
+		err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp);
 		if (err == ENOENT)
-			psa->psa_effective_value = 0;
+			*newvalp = 0;
 		break;
 	case ZPROP_SRC_LOCAL:
-		psa->psa_effective_value = *(uint64_t *)psa->psa_value;
+		*newvalp = value;
 		break;
 	case ZPROP_SRC_RECEIVED:
 		/*
 		 * If there's no local setting, then the new received value will
 		 * be the effective value.
 		 */
-		err = zap_lookup(mos, zapobj, propname, 8, 1,
-		    &psa->psa_effective_value);
+		err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
 		if (err == ENOENT)
-			psa->psa_effective_value = *(uint64_t *)psa->psa_value;
+			*newvalp = value;
 		break;
 	case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
 		/*
 		 * We're clearing the received value, so the local setting (if
 		 * it exists) remains the effective value.
 		 */
-		err = zap_lookup(mos, zapobj, propname, 8, 1,
-		    &psa->psa_effective_value);
+		err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
 		if (err == ENOENT)
-			psa->psa_effective_value = 0;
+			*newvalp = 0;
 		break;
 	default:
-		cmn_err(CE_PANIC, "unexpected property source: %d", source);
+		panic("unexpected property source: %d", source);
 	}
 
 	strfree(recvdstr);
@@ -402,92 +433,103 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl
 	return (err);
 }
 
-#ifdef	ZFS_DEBUG
+/*
+ * Unregister all callbacks that are registered with the
+ * given callback argument.
+ */
 void
-dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa)
+dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg)
 {
-	zfs_prop_t prop = zfs_name_to_prop(psa->psa_name);
-	uint64_t intval;
-	char setpoint[MAXNAMELEN];
-	uint64_t version = spa_version(dd->dd_pool->dp_spa);
-	int err;
+	dsl_prop_cb_record_t *cbr, *next_cbr;
 
-	if (version < SPA_VERSION_RECVD_PROPS) {
-		switch (prop) {
-		case ZFS_PROP_QUOTA:
-		case ZFS_PROP_RESERVATION:
-			return;
+	dsl_dir_t *dd = ds->ds_dir;
+
+	mutex_enter(&dd->dd_lock);
+	next_cbr = list_head(&ds->ds_prop_cbs);
+	while (next_cbr != NULL) {
+		cbr = next_cbr;
+		next_cbr = list_next(&ds->ds_prop_cbs, cbr);
+		if (cbr->cbr_arg == cbarg) {
+			list_remove(&ds->ds_prop_cbs, cbr);
+			list_remove(&cbr->cbr_pr->pr_cbs, cbr);
+			kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
 		}
 	}
+	mutex_exit(&dd->dd_lock);
+}
 
-	err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval,
-	    setpoint, B_FALSE);
-	if (err == 0 && intval != psa->psa_effective_value) {
-		cmn_err(CE_PANIC, "%s property, source: %x, "
-		    "predicted effective value: %llu, "
-		    "actual effective value: %llu (setpoint: %s)",
-		    psa->psa_name, psa->psa_source,
-		    (unsigned long long)psa->psa_effective_value,
-		    (unsigned long long)intval, setpoint);
-	}
+boolean_t
+dsl_prop_hascb(dsl_dataset_t *ds)
+{
+	return (!list_is_empty(&ds->ds_prop_cbs));
 }
-#endif
 
-/*
- * Unregister this callback.  Return 0 on success, ENOENT if ddname is
- * invalid, ENOMSG if no matching callback registered.
- */
-int
-dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
-    dsl_prop_changed_cb_t *callback, void *cbarg)
+/* ARGSUSED */
+static int
+dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	dsl_dir_t *dd = ds->ds_dir;
+	dsl_prop_record_t *pr;
 	dsl_prop_cb_record_t *cbr;
 
 	mutex_enter(&dd->dd_lock);
-	for (cbr = list_head(&dd->dd_prop_cbs);
-	    cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
-		if (cbr->cbr_ds == ds &&
-		    cbr->cbr_func == callback &&
-		    cbr->cbr_arg == cbarg &&
-		    strcmp(cbr->cbr_propname, propname) == 0)
-			break;
-	}
+	for (pr = list_head(&dd->dd_props);
+	    pr; pr = list_next(&dd->dd_props, pr)) {
+		for (cbr = list_head(&pr->pr_cbs); cbr;
+		    cbr = list_next(&pr->pr_cbs, cbr)) {
+			uint64_t value;
 
-	if (cbr == NULL) {
-		mutex_exit(&dd->dd_lock);
-		return (ENOMSG);
-	}
+			/*
+			 * Callback entries do not have holds on their
+			 * datasets so that datasets with registered
+			 * callbacks are still eligible for eviction.
+			 * Unlike operations to update properties on a
+			 * single dataset, we are performing a recursive
+			 * descent of related head datasets.  The caller
+			 * of this function only has a dataset hold on
+			 * the passed in head dataset, not the snapshots
+			 * associated with this dataset.  Without a hold,
+			 * the dataset pointer within callback records
+			 * for snapshots can be invalidated by eviction
+			 * at any time.
+			 *
+			 * Use dsl_dataset_try_add_ref() to verify
+			 * that the dataset for a snapshot has not
+			 * begun eviction processing and to prevent
+			 * eviction from occurring for the duration of
+			 * the callback.  If the hold attempt fails,
+			 * this object is already being evicted and the
+			 * callback can be safely ignored.
+			 */
+			if (ds != cbr->cbr_ds &&
+			    !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
+				continue;
+
+			if (dsl_prop_get_ds(cbr->cbr_ds,
+			    cbr->cbr_pr->pr_propname, sizeof (value), 1,
+			    &value, NULL) == 0)
+				cbr->cbr_func(cbr->cbr_arg, value);
 
-	list_remove(&dd->dd_prop_cbs, cbr);
+			if (ds != cbr->cbr_ds)
+				dsl_dataset_rele(cbr->cbr_ds, FTAG);
+		}
+	}
 	mutex_exit(&dd->dd_lock);
-	kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
-	kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
 
-	/* Clean up from dsl_prop_register */
-	dsl_dir_close(dd, cbr);
 	return (0);
 }
 
 /*
- * Return the number of callbacks that are registered for this dataset.
+ * Update all property values for ddobj & its descendants.  This is used
+ * when renaming the dir.
  */
-int
-dsl_prop_numcb(dsl_dataset_t *ds)
+void
+dsl_prop_notify_all(dsl_dir_t *dd)
 {
-	dsl_dir_t *dd = ds->ds_dir;
-	dsl_prop_cb_record_t *cbr;
-	int num = 0;
-
-	mutex_enter(&dd->dd_lock);
-	for (cbr = list_head(&dd->dd_prop_cbs);
-	    cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
-		if (cbr->cbr_ds == ds)
-			num++;
-	}
-	mutex_exit(&dd->dd_lock);
-
-	return (num);
+	dsl_pool_t *dp = dd->dd_pool;
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+	(void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb,
+	    NULL, DS_FIND_CHILDREN);
 }
 
 static void
@@ -495,14 +537,15 @@ dsl_prop_changed_notify(dsl_pool_t *dp, 
     const char *propname, uint64_t value, int first)
 {
 	dsl_dir_t *dd;
+	dsl_prop_record_t *pr;
 	dsl_prop_cb_record_t *cbr;
 	objset_t *mos = dp->dp_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t *za;
 	int err;
 
-	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
-	err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+	err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
 	if (err)
 		return;
 
@@ -511,36 +554,49 @@ dsl_prop_changed_notify(dsl_pool_t *dp, 
 		 * If the prop is set here, then this change is not
 		 * being inherited here or below; stop the recursion.
 		 */
-		err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname);
+		err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+		    propname);
 		if (err == 0) {
-			dsl_dir_close(dd, FTAG);
+			dsl_dir_rele(dd, FTAG);
 			return;
 		}
 		ASSERT3U(err, ==, ENOENT);
 	}
 
 	mutex_enter(&dd->dd_lock);
-	for (cbr = list_head(&dd->dd_prop_cbs); cbr;
-	    cbr = list_next(&dd->dd_prop_cbs, cbr)) {
-		uint64_t propobj = cbr->cbr_ds->ds_phys->ds_props_obj;
+	pr = dsl_prop_record_find(dd, propname);
+	if (pr != NULL) {
+		for (cbr = list_head(&pr->pr_cbs); cbr;
+		    cbr = list_next(&pr->pr_cbs, cbr)) {
+			uint64_t propobj;
 
-		if (strcmp(cbr->cbr_propname, propname) != 0)
-			continue;
+			/*
+			 * cbr->cbr_ds may be invalidated due to eviction,
+			 * requiring the use of dsl_dataset_try_add_ref().
+			 * See comment block in dsl_prop_notify_all_cb()
+			 * for details.
+			 */
+			if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
+				continue;
 
-		/*
-		 * If the property is set on this ds, then it is not
-		 * inherited here; don't call the callback.
-		 */
-		if (propobj && 0 == zap_contains(mos, propobj, propname))
-			continue;
+			propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj;
 
-		cbr->cbr_func(cbr->cbr_arg, value);
+			/*
+			 * If the property is not set on this ds, then it is
+			 * inherited here; call the callback.
+			 */
+			if (propobj == 0 ||
+			    zap_contains(mos, propobj, propname) != 0)
+				cbr->cbr_func(cbr->cbr_arg, value);
+
+			dsl_dataset_rele(cbr->cbr_ds, FTAG);
+		}
 	}
 	mutex_exit(&dd->dd_lock);
 
 	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 	for (zap_cursor_init(&zc, mos,
-	    dd->dd_phys->dd_child_dir_zapobj);
+	    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 	    zap_cursor_retrieve(&zc, za) == 0;
 	    zap_cursor_advance(&zc)) {
 		dsl_prop_changed_notify(dp, za->za_first_integer,
@@ -548,47 +604,41 @@ dsl_prop_changed_notify(dsl_pool_t *dp, 
 	}
 	kmem_free(za, sizeof (zap_attribute_t));
 	zap_cursor_fini(&zc);
-	dsl_dir_close(dd, FTAG);
+	dsl_dir_rele(dd, FTAG);
 }
 
 void
-dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
+    zprop_source_t source, int intsz, int numints, const void *value,
+    dmu_tx_t *tx)
 {
-	dsl_dataset_t *ds = arg1;
-	dsl_prop_setarg_t *psa = arg2;
 	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
 	uint64_t zapobj, intval, dummy;
 	int isint;
 	char valbuf[32];
-	char *valstr = NULL;
+	const char *valstr = NULL;
 	char *inheritstr;
 	char *recvdstr;
 	char *tbuf = NULL;
 	int err;
 	uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa);
-	const char *propname = psa->psa_name;
-	zprop_source_t source = psa->psa_source;
 
-	isint = (dodefault(propname, 8, 1, &intval) == 0);
+	isint = (dodefault(zfs_name_to_prop(propname), 8, 1, &intval) == 0);
 
-	if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
+	if (ds->ds_is_snapshot) {
 		ASSERT(version >= SPA_VERSION_SNAP_PROPS);
-		if (ds->ds_phys->ds_props_obj == 0) {
+		if (dsl_dataset_phys(ds)->ds_props_obj == 0) {
 			dmu_buf_will_dirty(ds->ds_dbuf, tx);
-			ds->ds_phys->ds_props_obj =
+			dsl_dataset_phys(ds)->ds_props_obj =
 			    zap_create(mos,
 			    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 		}
-		zapobj = ds->ds_phys->ds_props_obj;
+		zapobj = dsl_dataset_phys(ds)->ds_props_obj;
 	} else {
-		zapobj = ds->ds_dir->dd_phys->dd_props_zapobj;
+		zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj;
 	}
 
 	if (version < SPA_VERSION_RECVD_PROPS) {
-		zfs_prop_t prop = zfs_name_to_prop(propname);
-		if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION)
-			return;
-
 		if (source & ZPROP_SRC_NONE)
 			source = ZPROP_SRC_NONE;
 		else if (source & ZPROP_SRC_RECEIVED)
@@ -617,8 +667,8 @@ dsl_prop_set_sync(void *arg1, void *arg2
 		 */
 		err = zap_remove(mos, zapobj, inheritstr, tx);
 		ASSERT(err == 0 || err == ENOENT);
-		VERIFY(0 == zap_update(mos, zapobj, propname,
-		    psa->psa_intsz, psa->psa_numints, psa->psa_value, tx));
+		VERIFY0(zap_update(mos, zapobj, propname,
+		    intsz, numints, value, tx));
 		break;
 	case ZPROP_SRC_INHERITED:
 		/*
@@ -629,11 +679,10 @@ dsl_prop_set_sync(void *arg1, void *arg2
 		err = zap_remove(mos, zapobj, propname, tx);
 		ASSERT(err == 0 || err == ENOENT);
 		if (version >= SPA_VERSION_RECVD_PROPS &&
-		    zap_contains(mos, zapobj, ZPROP_HAS_RECVD) == 0) {
+		    dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) {
 			dummy = 0;
-			err = zap_update(mos, zapobj, inheritstr,
-			    8, 1, &dummy, tx);
-			ASSERT(err == 0);
+			VERIFY0(zap_update(mos, zapobj, inheritstr,
+			    8, 1, &dummy, tx));
 		}
 		break;
 	case ZPROP_SRC_RECEIVED:
@@ -641,7 +690,7 @@ dsl_prop_set_sync(void *arg1, void *arg2
 		 * set propname$recvd -> value
 		 */
 		err = zap_update(mos, zapobj, recvdstr,
-		    psa->psa_intsz, psa->psa_numints, psa->psa_value, tx);
+		    intsz, numints, value, tx);
 		ASSERT(err == 0);
 		break;
 	case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED):
@@ -671,9 +720,9 @@ dsl_prop_set_sync(void *arg1, void *arg2
 	strfree(recvdstr);
 
 	if (isint) {
-		VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL));
+		VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval));
 
-		if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
+		if (ds->ds_is_snapshot) {
 			dsl_prop_cb_record_t *cbr;
 			/*
 			 * It's a snapshot; nothing can inherit this
@@ -681,10 +730,10 @@ dsl_prop_set_sync(void *arg1, void *arg2
 			 * ds here.
 			 */
 			mutex_enter(&ds->ds_dir->dd_lock);
-			for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr;
-			    cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) {
-				if (cbr->cbr_ds == ds &&
-				    strcmp(cbr->cbr_propname, propname) == 0)
+			for (cbr = list_head(&ds->ds_prop_cbs); cbr;
+			    cbr = list_next(&ds->ds_prop_cbs, cbr)) {
+				if (strcmp(cbr->cbr_pr->pr_propname,
+				    propname) == 0)
 					cbr->cbr_func(cbr->cbr_arg, intval);
 			}
 			mutex_exit(&ds->ds_dir->dd_lock);
@@ -698,7 +747,7 @@ dsl_prop_set_sync(void *arg1, void *arg2
 		valstr = valbuf;
 	} else {
 		if (source == ZPROP_SRC_LOCAL) {
-			valstr = (char *)psa->psa_value;
+			valstr = value;
 		} else {
 			tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
 			if (dsl_prop_get_ds(ds, propname, 1,
@@ -707,146 +756,81 @@ dsl_prop_set_sync(void *arg1, void *arg2
 		}
 	}
 
-	spa_history_internal_log((source == ZPROP_SRC_NONE ||
-	    source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT :
-	    LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr,
-	    "%s=%s dataset = %llu", propname,
-	    (valstr == NULL ? "" : valstr), ds->ds_object);
+	spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE ||
+	    source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx,
+	    "%s=%s", propname, (valstr == NULL ? "" : valstr));
 
 	if (tbuf != NULL)
 		kmem_free(tbuf, ZAP_MAXVALUELEN);
 }
 
-void
-dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+int
+dsl_prop_set_int(const char *dsname, const char *propname,
+    zprop_source_t source, uint64_t value)
 {
-	dsl_dataset_t *ds = arg1;
-	dsl_props_arg_t *pa = arg2;
-	nvlist_t *props = pa->pa_props;
-	dsl_prop_setarg_t psa;
-	nvpair_t *elem = NULL;
+	nvlist_t *nvl = fnvlist_alloc();
+	int error;
 
-	psa.psa_source = pa->pa_source;
-
-	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
-		nvpair_t *pair = elem;
-
-		psa.psa_name = nvpair_name(pair);
-
-		if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
-			/*
-			 * dsl_prop_get_all_impl() returns properties in this
-			 * format.
-			 */
-			nvlist_t *attrs;
-			VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
-			VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
-			    &pair) == 0);
-		}
-
-		if (nvpair_type(pair) == DATA_TYPE_STRING) {
-			VERIFY(nvpair_value_string(pair,
-			    (char **)&psa.psa_value) == 0);
-			psa.psa_intsz = 1;
-			psa.psa_numints = strlen(psa.psa_value) + 1;
-		} else {
-			uint64_t intval;
-			VERIFY(nvpair_value_uint64(pair, &intval) == 0);
-			psa.psa_intsz = sizeof (intval);
-			psa.psa_numints = 1;
-			psa.psa_value = &intval;
-		}
-		dsl_prop_set_sync(ds, &psa, cr, tx);
-	}
+	fnvlist_add_uint64(nvl, propname, value);
+	error = dsl_props_set(dsname, source, nvl);
+	fnvlist_free(nvl);
+	return (error);
 }
 
-void
-dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
-    cred_t *cr, dmu_tx_t *tx)
+int
+dsl_prop_set_string(const char *dsname, const char *propname,
+    zprop_source_t source, const char *value)
 {
-	objset_t *mos = dd->dd_pool->dp_meta_objset;
-	uint64_t zapobj = dd->dd_phys->dd_props_zapobj;
-
-	ASSERT(dmu_tx_is_syncing(tx));
-
-	VERIFY(0 == zap_update(mos, zapobj, name, sizeof (val), 1, &val, tx));
+	nvlist_t *nvl = fnvlist_alloc();
+	int error;
 
-	dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE);
-
-	spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr,
-	    "%s=%llu dataset = %llu", name, (u_longlong_t)val,
-	    dd->dd_phys->dd_head_dataset_obj);
+	fnvlist_add_string(nvl, propname, value);
+	error = dsl_props_set(dsname, source, nvl);
+	fnvlist_free(nvl);
+	return (error);
 }
 
 int
-dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source,
-    int intsz, int numints, const void *buf)
+dsl_prop_inherit(const char *dsname, const char *propname,
+    zprop_source_t source)
 {
-	dsl_dataset_t *ds;
-	uint64_t version;
-	int err;
-	dsl_prop_setarg_t psa;
-
-	/*
-	 * We must do these checks before we get to the syncfunc, since
-	 * it can't fail.
-	 */
-	if (strlen(propname) >= ZAP_MAXNAMELEN)
-		return (ENAMETOOLONG);
-
-	err = dsl_dataset_hold(dsname, FTAG, &ds);
-	if (err)
-		return (err);
-
-	version = spa_version(ds->ds_dir->dd_pool->dp_spa);
-	if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ?
-	    ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
-		dsl_dataset_rele(ds, FTAG);
-		return (E2BIG);
-	}
-	if (dsl_dataset_is_snapshot(ds) &&
-	    version < SPA_VERSION_SNAP_PROPS) {
-		dsl_dataset_rele(ds, FTAG);
-		return (ENOTSUP);
-	}
-
-	psa.psa_name = propname;
-	psa.psa_source = source;
-	psa.psa_intsz = intsz;
-	psa.psa_numints = numints;
-	psa.psa_value = buf;
-	psa.psa_effective_value = -1ULL;
-
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    NULL, dsl_prop_set_sync, ds, &psa, 2);
+	nvlist_t *nvl = fnvlist_alloc();
+	int error;
 
-	dsl_dataset_rele(ds, FTAG);
-	return (err);
-}
+	fnvlist_add_boolean(nvl, propname);
+	error = dsl_props_set(dsname, source, nvl);
+	fnvlist_free(nvl);
+	return (error);
+}
+
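Note (illustrative, not part of the patch): dsl_prop_set_int(),
dsl_prop_set_string() and dsl_prop_inherit() above all reduce to the
same shape -- wrap a single property in a one-entry nvlist and funnel
it through dsl_props_set(), so every path inherits its all-or-nothing
semantics.  A hypothetical caller-side sketch (dataset and property
names are made up):

	int error;

	/* set one integer property; the change commits atomically */
	error = dsl_prop_set_int("tank/home", "example:prop",
	    ZPROP_SRC_LOCAL, 128);
	if (error != 0)
		return (error);
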
+typedef struct dsl_props_set_arg {
+	const char *dpsa_dsname;
+	zprop_source_t dpsa_source;
+	nvlist_t *dpsa_props;
+} dsl_props_set_arg_t;
 
-int
-dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
+static int
+dsl_props_set_check(void *arg, dmu_tx_t *tx)
 {
+	dsl_props_set_arg_t *dpsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *ds;
 	uint64_t version;
 	nvpair_t *elem = NULL;
-	dsl_props_arg_t pa;
 	int err;
 
-	if (err = dsl_dataset_hold(dsname, FTAG, &ds))
+	err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds);
+	if (err != 0)
 		return (err);
-	/*
-	 * Do these checks before the syncfunc, since it can't fail.
-	 */
+
 	version = spa_version(ds->ds_dir->dd_pool->dp_spa);
-	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+	while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) {
 		if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
 			dsl_dataset_rele(ds, FTAG);
-			return (ENAMETOOLONG);
+			return (SET_ERROR(ENAMETOOLONG));
 		}
 		if (nvpair_type(elem) == DATA_TYPE_STRING) {
-			char *valstr;
-			VERIFY(nvpair_value_string(elem, &valstr) == 0);
+			char *valstr = fnvpair_value_string(elem);
 			if (strlen(valstr) >= (version <
 			    SPA_VERSION_STMF_PROP ?
 			    ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
@@ -856,20 +840,83 @@ dsl_props_set(const char *dsname, zprop_
 		}
 	}
 
-	if (dsl_dataset_is_snapshot(ds) &&
-	    version < SPA_VERSION_SNAP_PROPS) {
+	if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) {
 		dsl_dataset_rele(ds, FTAG);
-		return (ENOTSUP);
+		return (SET_ERROR(ENOTSUP));
 	}
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+void
+dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source,
+    nvlist_t *props, dmu_tx_t *tx)
+{
+	nvpair_t *elem = NULL;
+
+	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+		nvpair_t *pair = elem;
+
+		if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+			/*
+			 * dsl_prop_get_all_impl() returns properties in this
+			 * format.
+			 */
+			nvlist_t *attrs = fnvpair_value_nvlist(pair);
+			pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE);
+		}
 
-	pa.pa_props = props;
-	pa.pa_source = source;
+		if (nvpair_type(pair) == DATA_TYPE_STRING) {
+			const char *value = fnvpair_value_string(pair);
+			dsl_prop_set_sync_impl(ds, nvpair_name(pair),
+			    source, 1, strlen(value) + 1, value, tx);
+		} else if (nvpair_type(pair) == DATA_TYPE_UINT64) {
+			uint64_t intval = fnvpair_value_uint64(pair);
+			dsl_prop_set_sync_impl(ds, nvpair_name(pair),
+			    source, sizeof (intval), 1, &intval, tx);
+		} else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) {
+			dsl_prop_set_sync_impl(ds, nvpair_name(pair),
+			    source, 0, 0, NULL, tx);
+		} else {
+			panic("invalid nvpair type");
+		}
+	}
+}
 
-	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
-	    NULL, dsl_props_set_sync, ds, &pa, 2);
+static void
+dsl_props_set_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_props_set_arg_t *dpsa = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dataset_t *ds;
 
+	VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds));
+	dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx);
 	dsl_dataset_rele(ds, FTAG);
-	return (err);
+}
+
+/*
+ * All-or-nothing; if any prop can't be set, nothing will be modified.
+ */
+int
+dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
+{
+	dsl_props_set_arg_t dpsa;
+	int nblks = 0;
+
+	dpsa.dpsa_dsname = dsname;
+	dpsa.dpsa_source = source;
+	dpsa.dpsa_props = props;
+
+	/*
+	 * If the source includes NONE, then we will only be removing entries
+	 * from the ZAP object.  In that case don't check for ENOSPC.
+	 */
+	if ((source & ZPROP_SRC_NONE) == 0)
+		nblks = 2 * fnvlist_num_pairs(props);
+
+	return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync,
+	    &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED));
 }
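
Note (schematic, not part of the patch): dsl_props_set() above is the
template for the dsl_sync_task() conversions throughout this patch.
The check callback is the only place allowed to fail (it runs once up
front and again in syncing context), while the sync callback runs in
syncing context and must succeed, which is why it re-takes its dataset
hold under VERIFY0().  A minimal skeleton with hypothetical names:

	typedef struct example_arg {
		const char *ea_dsname;
	} example_arg_t;

	static int
	example_check(void *arg, dmu_tx_t *tx)
	{
		example_arg_t *ea = arg;
		dsl_dataset_t *ds;
		int err;

		/* validation; the only place that may return an error */
		err = dsl_dataset_hold(dmu_tx_pool(tx), ea->ea_dsname,
		    FTAG, &ds);
		if (err != 0)
			return (err);
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	static void
	example_sync(void *arg, dmu_tx_t *tx)
	{
		example_arg_t *ea = arg;
		dsl_dataset_t *ds;

		/* syncing context; must not fail */
		VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), ea->ea_dsname,
		    FTAG, &ds));
		/* ... dirty buffers and modify on-disk state here ... */
		dsl_dataset_rele(ds, FTAG);
	}

	/*
	 * Dispatched as:
	 *	dsl_sync_task(dsname, example_check, example_sync,
	 *	    &ea, 0, ZFS_SPACE_CHECK_RESERVED);
	 */
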
 
 typedef enum dsl_prop_getflags {
@@ -1012,20 +1059,20 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, n
 	dsl_pool_t *dp = dd->dd_pool;
 	objset_t *mos = dp->dp_meta_objset;
 	int err = 0;
-	char setpoint[MAXNAMELEN];
+	char setpoint[ZFS_MAX_DATASET_NAME_LEN];
 
 	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
-	if (dsl_dataset_is_snapshot(ds))
+	if (ds->ds_is_snapshot)
 		flags |= DSL_PROP_GET_SNAPSHOT;
 
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	ASSERT(dsl_pool_config_held(dp));
 
-	if (ds->ds_phys->ds_props_obj != 0) {
+	if (dsl_dataset_phys(ds)->ds_props_obj != 0) {
 		ASSERT(flags & DSL_PROP_GET_SNAPSHOT);
 		dsl_dataset_name(ds, setpoint);
-		err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj,
-		    setpoint, flags, *nvp);
+		err = dsl_prop_get_all_impl(mos,
+		    dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp);
 		if (err)
 			goto out;
 	}
@@ -1038,64 +1085,57 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, n
 			flags |= DSL_PROP_GET_INHERITING;
 		}
 		dsl_dir_name(dd, setpoint);
-		err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj,
-		    setpoint, flags, *nvp);
+		err = dsl_prop_get_all_impl(mos,
+		    dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp);
 		if (err)
 			break;
 	}
 out:
-	rw_exit(&dp->dp_config_rwlock);
 	return (err);
 }
 
 boolean_t
-dsl_prop_get_hasrecvd(objset_t *os)
+dsl_prop_get_hasrecvd(const char *dsname)
 {
-	dsl_dataset_t *ds = os->os_dsl_dataset;
-	int rc;
 	uint64_t dummy;
 
-	rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
-	rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL);
-	rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock);
-	ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS);
-	return (rc == 0);
+	return (0 ==
+	    dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL));
 }
 
-static void
-dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source)
+static int
+dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source)
 {
-	dsl_dataset_t *ds = os->os_dsl_dataset;
-	uint64_t dummy = 0;
-	dsl_prop_setarg_t psa;
-
-	if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS)
-		return;
-
-	dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy);
+	uint64_t version;
+	spa_t *spa;
+	int error = 0;
 
-	(void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL,
-	    dsl_prop_set_sync, ds, &psa, 2);
+	VERIFY0(spa_open(dsname, &spa, FTAG));
+	version = spa_version(spa);
+	spa_close(spa, FTAG);
+
+	if (version >= SPA_VERSION_RECVD_PROPS)
+		error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0);
+	return (error);
 }
 
 /*
  * Call after successfully receiving properties to ensure that only the first
  * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties.
  */
-void
-dsl_prop_set_hasrecvd(objset_t *os)
+int
+dsl_prop_set_hasrecvd(const char *dsname)
 {
-	if (dsl_prop_get_hasrecvd(os)) {
-		ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS);
-		return;
-	}
-	dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL);
+	int error = 0;
+	if (!dsl_prop_get_hasrecvd(dsname))
+		error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL);
+	return (error);
 }
 
 void
-dsl_prop_unset_hasrecvd(objset_t *os)
+dsl_prop_unset_hasrecvd(const char *dsname)
 {
-	dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE);
+	VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE));
 }
 
 int
@@ -1105,16 +1145,25 @@ dsl_prop_get_all(objset_t *os, nvlist_t 
 }
 
 int
-dsl_prop_get_received(objset_t *os, nvlist_t **nvp)
+dsl_prop_get_received(const char *dsname, nvlist_t **nvp)
 {
+	objset_t *os;
+	int error;
+
 	/*
 	 * Received properties are not distinguishable from local properties
 	 * until the dataset has received properties on or after
 	 * SPA_VERSION_RECVD_PROPS.
 	 */
-	dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ?
+	dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ?
 	    DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL);
-	return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags));
+
+	error = dmu_objset_hold(dsname, FTAG, &os);
+	if (error != 0)
+		return (error);
+	error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags);
+	dmu_objset_rele(os, FTAG);
+	return (error);
 }
 
 void
@@ -1132,7 +1181,7 @@ dsl_prop_nvlist_add_uint64(nvlist_t *nv,
 	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
 	/* Indicate the default source if we can. */
-	if (dodefault(propname, 8, 1, &default_value) == 0 &&
+	if (dodefault(prop, 8, 1, &default_value) == 0 &&
 	    value == default_value) {
 		VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0);
 	}
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scan.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scan.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scan.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scan.c	10 Oct 2016 11:09:56 -0000
@@ -0,0 +1,1922 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 Gary Mills
+ */
+
+#include <sys/dsl_scan.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zil_impl.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
+    const zbookmark_phys_t *);
+
+static scan_cb_t dsl_scan_scrub_cb;
+static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
+static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *);
+static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *);
+
+unsigned int zfs_top_maxinflight = 32;	/* maximum I/Os per top-level */
+unsigned int zfs_resilver_delay = 2;	/* number of ticks to delay resilver */
+unsigned int zfs_scrub_delay = 4;	/* number of ticks to delay scrub */
+unsigned int zfs_scan_idle = 50;	/* idle window in clock ticks */
+
+unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
+unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver
+						 per txg */
+boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN,
+    &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN,
+    &zfs_resilver_delay, 0, "Number of ticks to delay resilver");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN,
+    &zfs_scrub_delay, 0, "Number of ticks to delay scrub");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN,
+    &zfs_scan_idle, 0, "Idle scan window in clock ticks");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN,
+    &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN,
+    &zfs_free_min_time_ms, 0, "Min millisecs to free per txg");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN,
+    &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN,
+    &zfs_no_scrub_io, 0, "Disable scrub I/O");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN,
+    &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
+
+enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+/* max number of blocks to free in a single TXG */
+uint64_t zfs_free_max_blocks = UINT64_MAX;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN,
+    &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG");
+
+
+#define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
+	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
+	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+
+extern int zfs_txg_timeout;
+
+/*
+ * Enable/disable the processing of the free_bpobj object.
+ */
+boolean_t zfs_free_bpobj_enabled = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN,
+    &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing");
+
+/* the order has to match pool_scan_type */
+static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+	NULL,
+	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
+	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
+};
+
+int
+dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+{
+	int err;
+	dsl_scan_t *scn;
+	spa_t *spa = dp->dp_spa;
+	uint64_t f;
+
+	scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+	scn->scn_dp = dp;
+
+	/*
+	 * It's possible that we're resuming a scan after a reboot so
+	 * make sure that the scan_async_destroying flag is initialized
+	 * appropriately.
+	 */
+	ASSERT(!scn->scn_async_destroying);
+	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
+	    SPA_FEATURE_ASYNC_DESTROY);
+
+	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    "scrub_func", sizeof (uint64_t), 1, &f);
+	if (err == 0) {
+		/*
+		 * There was an old-style scrub in progress.  Restart a
+		 * new-style scrub from the beginning.
+		 */
+		scn->scn_restart_txg = txg;
+		zfs_dbgmsg("old-style scrub was in progress; "
+		    "restarting new-style scrub in txg %llu",
+		    scn->scn_restart_txg);
+
+		/*
+		 * Load the queue obj from the old location so that it
+		 * can be freed by dsl_scan_done().
+		 */
+		(void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    "scrub_queue", sizeof (uint64_t), 1,
+		    &scn->scn_phys.scn_queue_obj);
+	} else {
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+		    &scn->scn_phys);
+		if (err == ENOENT)
+			return (0);
+		else if (err)
+			return (err);
+
+		if (scn->scn_phys.scn_state == DSS_SCANNING &&
+		    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
+			/*
+			 * A new-type scrub was in progress on an old
+			 * pool, and the pool was accessed by old
+			 * software.  Restart from the beginning, since
+			 * the old software may have changed the pool in
+			 * the meantime.
+			 */
+			scn->scn_restart_txg = txg;
+			zfs_dbgmsg("new-style scrub was modified "
+			    "by old software; restarting in txg %llu",
+			    scn->scn_restart_txg);
+		}
+	}
+
+	spa_scan_stat_init(spa);
+	return (0);
+}
+
+void
+dsl_scan_fini(dsl_pool_t *dp)
+{
+	if (dp->dp_scan) {
+		kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
+		dp->dp_scan = NULL;
+	}
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+	if (scn->scn_phys.scn_state == DSS_SCANNING)
+		return (SET_ERROR(EBUSY));
+
+	return (0);
+}
+
+static void
+dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+	pool_scan_func_t *funcp = arg;
+	dmu_object_type_t ot = 0;
+	dsl_pool_t *dp = scn->scn_dp;
+	spa_t *spa = dp->dp_spa;
+
+	ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+	ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+	bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+	scn->scn_phys.scn_func = *funcp;
+	scn->scn_phys.scn_state = DSS_SCANNING;
+	scn->scn_phys.scn_min_txg = 0;
+	scn->scn_phys.scn_max_txg = tx->tx_txg;
+	scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
+	scn->scn_phys.scn_start_time = gethrestime_sec();
+	scn->scn_phys.scn_errors = 0;
+	scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+	scn->scn_restart_txg = 0;
+	scn->scn_done_txg = 0;
+	spa_scan_stat_init(spa);
+
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+		scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
+
+		/* rewrite all disk labels */
+		vdev_config_dirty(spa->spa_root_vdev);
+
+		if (vdev_resilver_needed(spa->spa_root_vdev,
+		    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
+			spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
+		} else {
+			spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START);
+		}
+
+		spa->spa_scrub_started = B_TRUE;
+		/*
+		 * If this is an incremental scrub, limit the DDT scrub phase
+		 * to just the auto-ditto class (for correctness); the rest
+		 * of the scrub should go faster using top-down pruning.
+		 */
+		if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
+			scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+	}
+
+	/* back to the generic stuff */
+
+	if (dp->dp_blkstats == NULL) {
+		dp->dp_blkstats =
+		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+	}
+	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+
+	if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
+		ot = DMU_OT_ZAP_OTHER;
+
+	scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
+	    ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
+
+	dsl_scan_sync_state(scn, tx);
+
+	spa_history_log_internal(spa, "scan setup", tx,
+	    "func=%u mintxg=%llu maxtxg=%llu",
+	    *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+{
+	static const char *old_names[] = {
+		"scrub_bookmark",
+		"scrub_ddt_bookmark",
+		"scrub_ddt_class_max",
+		"scrub_queue",
+		"scrub_min_txg",
+		"scrub_max_txg",
+		"scrub_func",
+		"scrub_errors",
+		NULL
+	};
+
+	dsl_pool_t *dp = scn->scn_dp;
+	spa_t *spa = dp->dp_spa;
+	int i;
+
+	/* Remove any remnants of an old-style scrub. */
+	for (i = 0; old_names[i]; i++) {
+		(void) zap_remove(dp->dp_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
+	}
+
+	if (scn->scn_phys.scn_queue_obj != 0) {
+		VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, tx));
+		scn->scn_phys.scn_queue_obj = 0;
+	}
+
+	/*
+	 * If we were "restarted" from a stopped state, don't bother
+	 * with anything else.
+	 */
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	if (complete)
+		scn->scn_phys.scn_state = DSS_FINISHED;
+	else
+		scn->scn_phys.scn_state = DSS_CANCELED;
+
+	if (dsl_scan_restarting(scn, tx))
+		spa_history_log_internal(spa, "scan aborted, restarting", tx,
+		    "errors=%llu", spa_get_errlog_size(spa));
+	else if (!complete)
+		spa_history_log_internal(spa, "scan cancelled", tx,
+		    "errors=%llu", spa_get_errlog_size(spa));
+	else
+		spa_history_log_internal(spa, "scan done", tx,
+		    "errors=%llu", spa_get_errlog_size(spa));
+
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight > 0) {
+			cv_wait(&spa->spa_scrub_io_cv,
+			    &spa->spa_scrub_lock);
+		}
+		mutex_exit(&spa->spa_scrub_lock);
+		spa->spa_scrub_started = B_FALSE;
+		spa->spa_scrub_active = B_FALSE;
+
+		/*
+		 * If the scrub/resilver completed, update all DTLs to
+		 * reflect this.  Whether it succeeded or not, vacate
+		 * all temporary scrub DTLs.
+		 */
+		vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+		    complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
+		if (complete) {
+			spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ?
+			    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+		}
+		spa_errlog_rotate(spa);
+
+		/*
+		 * We may have finished replacing a device.
+		 * Let the async thread assess this and handle the detach.
+		 */
+		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+	}
+
+	scn->scn_phys.scn_end_time = gethrestime_sec();
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return (SET_ERROR(ENOENT));
+	return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+	dsl_scan_done(scn, B_FALSE, tx);
+	dsl_scan_sync_state(scn, tx);
+}
+
+int
+dsl_scan_cancel(dsl_pool_t *dp)
+{
+	return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
+	    dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
+}
+
+static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+    dmu_objset_type_t ostype, dmu_tx_t *tx);
+static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
+    dmu_objset_type_t ostype,
+    dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
+
+void
+dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
+{
+	zio_free(dp->dp_spa, txg, bp);
+}
+
+void
+dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
+{
+	ASSERT(dsl_pool_sync_context(dp));
+	zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp),
+	    pio->io_flags));
+}
+
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+{
+	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+	if (ds->ds_is_snapshot)
+		return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
+	return (smt);
+}
+
+static void
+dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+	    &scn->scn_phys, tx));
+}
+
+extern int zfs_vdev_async_write_active_min_dirty_percent;
+
+static boolean_t
+dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
+{
+	/* we never skip user/group accounting objects */
+	if (zb && (int64_t)zb->zb_object < 0)
+		return (B_FALSE);
+
+	if (scn->scn_pausing)
+		return (B_TRUE); /* we're already pausing */
+
+	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
+		return (B_FALSE); /* we're resuming */
+
+	/* We only know how to resume from level-0 blocks. */
+	if (zb && zb->zb_level != 0)
+		return (B_FALSE);
+
+	/*
+	 * We pause if:
+	 *  - we have scanned for the maximum time: an entire txg
+	 *    timeout (default 5 sec)
+	 *  or
+	 *  - we have scanned for at least the minimum time (default 1 sec
+	 *    for scrub, 3 sec for resilver), and either we have sufficient
+	 *    dirty data that we are starting to write more quickly
+	 *    (default 30%), or someone is explicitly waiting for this txg
+	 *    to complete.
+	 *  or
+	 *  - the spa is shutting down because this pool is being exported
+	 *    or the machine is rebooting.
+	 */
+	int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+	    zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
+	uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+	int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+	if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
+	    (NSEC2MSEC(elapsed_nanosecs) > mintime &&
+	    (txg_sync_waiting(scn->scn_dp) ||
+	    dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
+	    spa_shutting_down(scn->scn_dp->dp_spa)) {
+		if (zb) {
+			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
+			    (longlong_t)zb->zb_objset,
+			    (longlong_t)zb->zb_object,
+			    (longlong_t)zb->zb_level,
+			    (longlong_t)zb->zb_blkid);
+			scn->scn_phys.scn_bookmark = *zb;
+		}
+		dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+		scn->scn_pausing = B_TRUE;
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
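
Note (illustrative, not part of the patch): with the tunables declared
at the top of this file at their defaults and zfs_txg_timeout = 5, the
pause test above reduces to the predicate below (the 30% figure is the
default of zfs_vdev_async_write_active_min_dirty_percent):

	/* illustrative restatement of the pause predicate */
	static boolean_t
	example_should_pause(uint64_t elapsed_ns, boolean_t resilver,
	    boolean_t sync_waiting, int dirty_pct, boolean_t shutting_down)
	{
		int mintime = resilver ? 3000 : 1000;	/* ms, defaults above */

		return (elapsed_ns / NANOSEC >= 5 ||	/* zfs_txg_timeout */
		    (NSEC2MSEC(elapsed_ns) > mintime &&
		    (sync_waiting || dirty_pct >= 30)) ||
		    shutting_down);
	}
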
+
+typedef struct zil_scan_arg {
+	dsl_pool_t	*zsa_dp;
+	zil_header_t	*zsa_zh;
+} zil_scan_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	zil_scan_arg_t *zsa = arg;
+	dsl_pool_t *dp = zsa->zsa_dp;
+	dsl_scan_t *scn = dp->dp_scan;
+	zil_header_t *zh = zsa->zsa_zh;
+	zbookmark_phys_t zb;
+
+	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+		return (0);
+
+	/*
+	 * One block ("stubby") can be allocated a long time ago; we
+	 * want to visit that one because it has been allocated
+	 * (on-disk) even if it hasn't been claimed (even though for
+	 * scrub there's nothing to do to it).
+	 */
+	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+		return (0);
+
+	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+	if (lrc->lrc_txtype == TX_WRITE) {
+		zil_scan_arg_t *zsa = arg;
+		dsl_pool_t *dp = zsa->zsa_dp;
+		dsl_scan_t *scn = dp->dp_scan;
+		zil_header_t *zh = zsa->zsa_zh;
+		lr_write_t *lr = (lr_write_t *)lrc;
+		blkptr_t *bp = &lr->lr_blkptr;
+		zbookmark_phys_t zb;
+
+		if (BP_IS_HOLE(bp) ||
+		    bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+			return (0);
+
+		/*
+		 * birth can be < claim_txg if this record's txg is
+		 * already txg sync'ed (but this log block contains
+		 * other records that are not synced)
+		 */
+		if (claim_txg == 0 || bp->blk_birth < claim_txg)
+			return (0);
+
+		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+		    lr->lr_foid, ZB_ZIL_LEVEL,
+		    lr->lr_offset / BP_GET_LSIZE(bp));
+
+		VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+	}
+	return (0);
+}
+
+static void
+dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
+{
+	uint64_t claim_txg = zh->zh_claim_txg;
+	zil_scan_arg_t zsa = { dp, zh };
+	zilog_t *zilog;
+
+	/*
+	 * We only want to visit blocks that have been claimed but not yet
+	 * replayed (or, in read-only mode, blocks that *would* be claimed).
+	 */
+	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+		return;
+
+	zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+	(void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
+	    claim_txg);
+
+	zil_free(zilog);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
+    uint64_t objset, uint64_t object, uint64_t blkid)
+{
+	zbookmark_phys_t czb;
+	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+
+	if (zfs_no_scrub_prefetch)
+		return;
+
+	if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
+	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
+		return;
+
+	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+
+	(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
+	    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb);
+}
+
+static boolean_t
+dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
+    const zbookmark_phys_t *zb)
+{
+	/*
+	 * We never skip over user/group accounting objects (obj<0)
+	 */
+	if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
+	    (int64_t)zb->zb_object >= 0) {
+		/*
+		 * If we already visited this bp & everything below (in
+		 * a prior txg sync), don't bother doing it again.
+		 */
+		if (zbookmark_subtree_completed(dnp, zb,
+		    &scn->scn_phys.scn_bookmark))
+			return (B_TRUE);
+
+		/*
+		 * If we found the block we're trying to resume from, or
+		 * we went past it to a different object, zero it out to
+		 * indicate that it's OK to start checking for pausing
+		 * again.
+		 */
+		if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+		    zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+			dprintf("resuming at %llx/%llx/%llx/%llx\n",
+			    (longlong_t)zb->zb_objset,
+			    (longlong_t)zb->zb_object,
+			    (longlong_t)zb->zb_level,
+			    (longlong_t)zb->zb_blkid);
+			bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+		}
+	}
+	return (B_FALSE);
+}
+
+/*
+ * Return nonzero on i/o error.
+ */
+static int
+dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+    dnode_phys_t *dnp, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+	int err;
+
+	if (BP_GET_LEVEL(bp) > 0) {
+		arc_flags_t flags = ARC_FLAG_WAIT;
+		int i;
+		blkptr_t *cbp;
+		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+		arc_buf_t *buf;
+
+		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+		if (err) {
+			scn->scn_phys.scn_errors++;
+			return (err);
+		}
+		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+			dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
+			    zb->zb_object, zb->zb_blkid * epb + i);
+		}
+		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+			zbookmark_phys_t czb;
+
+			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+			    zb->zb_level - 1,
+			    zb->zb_blkid * epb + i);
+			dsl_scan_visitbp(cbp, &czb, dnp,
+			    ds, scn, ostype, tx);
+		}
+		arc_buf_destroy(buf, &buf);
+	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+		arc_flags_t flags = ARC_FLAG_WAIT;
+		dnode_phys_t *cdnp;
+		int i, j;
+		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+		arc_buf_t *buf;
+
+		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+		if (err) {
+			scn->scn_phys.scn_errors++;
+			return (err);
+		}
+		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
+			for (j = 0; j < cdnp->dn_nblkptr; j++) {
+				blkptr_t *cbp = &cdnp->dn_blkptr[j];
+				dsl_scan_prefetch(scn, buf, cbp,
+				    zb->zb_objset, zb->zb_blkid * epb + i, j);
+			}
+		}
+		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
+			dsl_scan_visitdnode(scn, ds, ostype,
+			    cdnp, zb->zb_blkid * epb + i, tx);
+		}
+
+		arc_buf_destroy(buf, &buf);
+	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+		arc_flags_t flags = ARC_FLAG_WAIT;
+		objset_phys_t *osp;
+		arc_buf_t *buf;
+
+		err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+		if (err) {
+			scn->scn_phys.scn_errors++;
+			return (err);
+		}
+
+		osp = buf->b_data;
+
+		dsl_scan_visitdnode(scn, ds, osp->os_type,
+		    &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
+
+		if (OBJSET_BUF_HAS_USERUSED(buf)) {
+			/*
+			 * We also always visit user/group accounting
+			 * objects, and never skip them, even if we are
+			 * pausing.  This is necessary so that the space
+			 * deltas from this txg get integrated.
+			 */
+			dsl_scan_visitdnode(scn, ds, osp->os_type,
+			    &osp->os_groupused_dnode,
+			    DMU_GROUPUSED_OBJECT, tx);
+			dsl_scan_visitdnode(scn, ds, osp->os_type,
+			    &osp->os_userused_dnode,
+			    DMU_USERUSED_OBJECT, tx);
+		}
+		arc_buf_destroy(buf, &buf);
+	}
+
+	return (0);
+}
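
Note (illustrative, not part of the patch): epb above is "entries per
block".  With the standard on-disk constants (SPA_BLKPTRSHIFT = 7, a
128-byte blkptr; DNODE_SHIFT = 9, a 512-byte dnode), a 128K indirect
block fans out to 1024 child block pointers and a 16K dnode block
carries 32 dnodes:

	/* illustrative arithmetic for the two recursion cases above */
	int epb_indirect = (128 * 1024) >> SPA_BLKPTRSHIFT;	/* 1024 */
	int epb_dnode = (16 * 1024) >> DNODE_SHIFT;		/* 32 */
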
+
+static void
+dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+    dmu_objset_type_t ostype, dnode_phys_t *dnp,
+    uint64_t object, dmu_tx_t *tx)
+{
+	int j;
+
+	for (j = 0; j < dnp->dn_nblkptr; j++) {
+		zbookmark_phys_t czb;
+
+		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+		    dnp->dn_nlevels - 1, j);
+		dsl_scan_visitbp(&dnp->dn_blkptr[j],
+		    &czb, dnp, ds, scn, ostype, tx);
+	}
+
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+		zbookmark_phys_t czb;
+		SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+		    0, DMU_SPILL_BLKID);
+		dsl_scan_visitbp(&dnp->dn_spill,
+		    &czb, dnp, ds, scn, ostype, tx);
+	}
+}
+
+/*
+ * The arguments are in this order because mdb can only print the
+ * first 5; we want them to be useful.
+ */
+static void
+dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+    dmu_objset_type_t ostype, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	arc_buf_t *buf = NULL;
+	blkptr_t bp_toread = *bp;
+
+	/* ASSERT(pbuf == NULL || arc_released(pbuf)); */
+
+	if (dsl_scan_check_pause(scn, zb))
+		return;
+
+	if (dsl_scan_check_resume(scn, dnp, zb))
+		return;
+
+	if (BP_IS_HOLE(bp))
+		return;
+
+	scn->scn_visited_this_txg++;
+
+	dprintf_bp(bp,
+	    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
+	    ds, ds ? ds->ds_object : 0,
+	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+	    bp);
+
+	if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+		return;
+
+	if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0)
+		return;
+
+	/*
+	 * If dsl_scan_ddt() has already visited this block, it will have
+	 * already done any translations or scrubbing, so don't call the
+	 * callback again.
+	 */
+	if (ddt_class_contains(dp->dp_spa,
+	    scn->scn_phys.scn_ddt_class_max, bp)) {
+		ASSERT(buf == NULL);
+		return;
+	}
+
+	/*
+	 * If this block is from the future (after cur_max_txg), then we
+	 * are doing this on behalf of a deleted snapshot, and we will
+	 * revisit the future block on the next pass of this dataset.
+	 * Don't scan it now unless we need to because something
+	 * under it was modified.
+	 */
+	if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
+		scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+	}
+}
+
+static void
+dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+    dmu_tx_t *tx)
+{
+	zbookmark_phys_t zb;
+
+	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+	dsl_scan_visitbp(bp, &zb, NULL,
+	    ds, scn, DMU_OST_NONE, tx);
+
+	dprintf_ds(ds, "finished scan%s", "");
+}
+
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+	uint64_t mintxg;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+		if (ds->ds_is_snapshot) {
+			/*
+			 * Note:
+			 *  - scn_cur_{min,max}_txg stays the same.
+			 *  - Setting the flag is not really necessary if
+			 *    scn_cur_max_txg == scn_max_txg, because there
+			 *    is nothing after this snapshot that we care
+			 *    about.  However, we set it anyway and then
+			 *    ignore it when we retraverse it in
+			 *    dsl_scan_visitds().
+			 */
+			scn->scn_phys.scn_bookmark.zb_objset =
+			    dsl_dataset_phys(ds)->ds_next_snap_obj;
+			zfs_dbgmsg("destroying ds %llu; currently traversing; "
+			    "reset zb_objset to %llu",
+			    (u_longlong_t)ds->ds_object,
+			    (u_longlong_t)dsl_dataset_phys(ds)->
+			    ds_next_snap_obj);
+			scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
+		} else {
+			SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+			    ZB_DESTROYED_OBJSET, 0, 0, 0);
+			zfs_dbgmsg("destroying ds %llu; currently traversing; "
+			    "reset bookmark to -1,0,0,0",
+			    (u_longlong_t)ds->ds_object);
+		}
+	} else if (zap_lookup_int_key(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+		ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+		if (ds->ds_is_snapshot) {
+			/*
+			 * We keep the same mintxg; it could be >
+			 * ds_creation_txg if the previous snapshot was
+			 * deleted too.
+			 */
+			VERIFY(zap_add_int_key(dp->dp_meta_objset,
+			    scn->scn_phys.scn_queue_obj,
+			    dsl_dataset_phys(ds)->ds_next_snap_obj,
+			    mintxg, tx) == 0);
+			zfs_dbgmsg("destroying ds %llu; in queue; "
+			    "replacing with %llu",
+			    (u_longlong_t)ds->ds_object,
+			    (u_longlong_t)dsl_dataset_phys(ds)->
+			    ds_next_snap_obj);
+		} else {
+			zfs_dbgmsg("destroying ds %llu; in queue; removing",
+			    (u_longlong_t)ds->ds_object);
+		}
+	}
+
+	/*
+	 * dsl_scan_sync() should be called after this, and should sync
+	 * out our changed state, but just to be safe, do it here.
+	 */
+	dsl_scan_sync_state(scn, tx);
+}
+
+void
+dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+	uint64_t mintxg;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
+
+	if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+		scn->scn_phys.scn_bookmark.zb_objset =
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj;
+		zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+		    "reset zb_objset to %llu",
+		    (u_longlong_t)ds->ds_object,
+		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
+	} else if (zap_lookup_int_key(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
+		zfs_dbgmsg("snapshotting ds %llu; in queue; "
+		    "replacing with %llu",
+		    (u_longlong_t)ds->ds_object,
+		    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
+	}
+	dsl_scan_sync_state(scn, tx);
+}
+
+void
+dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+	dsl_scan_t *scn = dp->dp_scan;
+	uint64_t mintxg;
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
+		scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
+		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+		    "reset zb_objset to %llu",
+		    (u_longlong_t)ds1->ds_object,
+		    (u_longlong_t)ds2->ds_object);
+	} else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
+		scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
+		zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+		    "reset zb_objset to %llu",
+		    (u_longlong_t)ds2->ds_object,
+		    (u_longlong_t)ds1->ds_object);
+	}
+
+	if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+	    ds1->ds_object, &mintxg) == 0) {
+		int err;
+
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
+		err = zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx);
+		VERIFY(err == 0 || err == EEXIST);
+		if (err == EEXIST) {
+			/* Both were there to begin with */
+			VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+			    scn->scn_phys.scn_queue_obj,
+			    ds1->ds_object, mintxg, tx));
+		}
+		zfs_dbgmsg("clone_swap ds %llu; in queue; "
+		    "replacing with %llu",
+		    (u_longlong_t)ds1->ds_object,
+		    (u_longlong_t)ds2->ds_object);
+	} else if (zap_lookup_int_key(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+		ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
+		VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx));
+		zfs_dbgmsg("clone_swap ds %llu; in queue; "
+		    "replacing with %llu",
+		    (u_longlong_t)ds2->ds_object,
+		    (u_longlong_t)ds1->ds_object);
+	}
+
+	dsl_scan_sync_state(scn, tx);
+}
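
Note (schematic, not part of the patch): the three hooks above
(dataset destroy, snapshot, clone swap) preserve scan progress the
same way: if the in-core cursor points at the affected dataset,
redirect zb_objset; otherwise, if the dataset has an entry in the
on-disk work queue, re-key that entry while keeping its mintxg.  With
old_obj, new_obj, mos and queue_obj as shorthand:

	/* schematic of the fix-up shared by the three hooks above */
	if (scn->scn_phys.scn_bookmark.zb_objset == old_obj) {
		scn->scn_phys.scn_bookmark.zb_objset = new_obj;
	} else if (zap_lookup_int_key(mos, queue_obj, old_obj,
	    &mintxg) == 0) {
		VERIFY0(zap_remove_int(mos, queue_obj, old_obj, tx));
		VERIFY0(zap_add_int_key(mos, queue_obj, new_obj,
		    mintxg, tx));
	}
	dsl_scan_sync_state(scn, tx);
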
+
+struct enqueue_clones_arg {
+	dmu_tx_t *tx;
+	uint64_t originobj;
+};
+
+/* ARGSUSED */
+static int
+enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
+{
+	struct enqueue_clones_arg *eca = arg;
+	dsl_dataset_t *ds;
+	int err;
+	dsl_scan_t *scn = dp->dp_scan;
+
+	if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
+		return (0);
+
+	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
+	if (err)
+		return (err);
+
+	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
+		dsl_dataset_t *prev;
+		err = dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+
+		dsl_dataset_rele(ds, FTAG);
+		if (err)
+			return (err);
+		ds = prev;
+	}
+	VERIFY(zap_add_int_key(dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj, ds->ds_object,
+	    dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	dsl_dataset_t *ds;
+	objset_t *os;
+
+	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+	if (scn->scn_phys.scn_cur_min_txg >=
+	    scn->scn_phys.scn_max_txg) {
+		/*
+		 * This can happen if this snapshot was created after the
+		 * scan started, and we already completed a previous snapshot
+		 * that was created after the scan started.  This snapshot
+		 * only references blocks with:
+		 *
+		 *	birth < our ds_creation_txg
+		 *	cur_min_txg is no less than ds_creation_txg.
+		 *	We have already visited these blocks.
+		 * or
+		 *	birth > scn_max_txg
+		 *	The scan requested not to visit these blocks.
+		 *
+		 * Subsequent snapshots (and clones) can reference our
+		 * blocks, or blocks with even higher birth times.
+		 * Therefore we do not need to visit them either,
+		 * so we do not add them to the work queue.
+		 *
+		 * Note that checking for cur_min_txg >= cur_max_txg
+		 * is not sufficient, because in that case we may need to
+		 * visit subsequent snapshots.  This happens when min_txg > 0,
+		 * which raises cur_min_txg.  In this case we will visit
+		 * this dataset but skip all of its blocks, because the
+		 * rootbp's birth time is < cur_min_txg.  Then we will
+		 * add the next snapshots/clones to the work queue.
+		 */
+		char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+		dsl_dataset_name(ds, dsname);
+		zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
+		    "cur_min_txg (%llu) >= max_txg (%llu)",
+		    dsobj, dsname,
+		    scn->scn_phys.scn_cur_min_txg,
+		    scn->scn_phys.scn_max_txg);
+		kmem_free(dsname, MAXNAMELEN);
+
+		goto out;
+	}
+
+	if (dmu_objset_from_ds(ds, &os))
+		goto out;
+
+	/*
+	 * Only the ZIL in the head (non-snapshot) is valid.  Even though
+	 * snapshots can have ZIL block pointers (which may be the same
+	 * BP as in the head), they must be ignored.  So we traverse the
+	 * ZIL here, rather than in scan_recurse(), because the regular
+	 * snapshot block-sharing rules don't apply to it.
+	 */
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot)
+		dsl_scan_zil(dp, &os->os_zil_header);
+
+	/*
+	 * Iterate over the bps in this ds.
+	 */
+	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+	dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
+	rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+	char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+	dsl_dataset_name(ds, dsname);
+	zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+	    "pausing=%u",
+	    (longlong_t)dsobj, dsname,
+	    (longlong_t)scn->scn_phys.scn_cur_min_txg,
+	    (longlong_t)scn->scn_phys.scn_cur_max_txg,
+	    (int)scn->scn_pausing);
+	kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
+
+	if (scn->scn_pausing)
+		goto out;
+
+	/*
+	 * We've finished this pass over this dataset.
+	 */
+
+	/*
+	 * If we did not completely visit this dataset, do another pass.
+	 */
+	if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+		zfs_dbgmsg("incomplete pass; visiting again");
+		scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, ds->ds_object,
+		    scn->scn_phys.scn_cur_max_txg, tx) == 0);
+		goto out;
+	}
+
+	/*
+	 * Add descendant datasets to the work queue.
+	 */
+	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
+		VERIFY(zap_add_int_key(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj,
+		    dsl_dataset_phys(ds)->ds_next_snap_obj,
+		    dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
+	}
+	if (dsl_dataset_phys(ds)->ds_num_children > 1) {
+		boolean_t usenext = B_FALSE;
+		if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+			uint64_t count;
+			/*
+			 * A bug in a previous version of the code could
+			 * cause upgrade_clones_cb() to not set
+			 * ds_next_snap_obj when it should, leading to a
+			 * missing entry.  Therefore we can only use the
+			 * next_clones_obj when its count is correct.
+			 */
+			int err = zap_count(dp->dp_meta_objset,
+			    dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
+			if (err == 0 &&
+			    count == dsl_dataset_phys(ds)->ds_num_children - 1)
+				usenext = B_TRUE;
+		}
+
+		if (usenext) {
+			VERIFY0(zap_join_key(dp->dp_meta_objset,
+			    dsl_dataset_phys(ds)->ds_next_clones_obj,
+			    scn->scn_phys.scn_queue_obj,
+			    dsl_dataset_phys(ds)->ds_creation_txg, tx));
+		} else {
+			struct enqueue_clones_arg eca;
+			eca.tx = tx;
+			eca.originobj = ds->ds_object;
+
+			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+			    enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
+		}
+	}
+
+out:
+	dsl_dataset_rele(ds, FTAG);
+}
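
Note (illustrative, not part of the patch): a concrete instance of the
early-out reasoning above.  Suppose the scan fixed scn_max_txg = 100
at setup, and a snapshot created at txg 150 is later queued with
mintxg = 150, so cur_min_txg = 150 >= max_txg = 100.  Every block the
snapshot references was either born at txg <= 100 (visited when an
earlier snapshot's pass completed) or born after txg 100 (beyond what
the scan was asked to visit), so the dataset is skipped and no
descendants are queued:

	/* illustrative: the early-out condition with concrete txgs */
	uint64_t max_txg = 100;		/* fixed in dsl_scan_setup_sync() */
	uint64_t cur_min_txg = 150;	/* snapshot born after scan start */

	if (cur_min_txg >= max_txg)
		return;			/* nothing in-scope to visit */
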
+
+/* ARGSUSED */
+static int
+enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
+{
+	dmu_tx_t *tx = arg;
+	dsl_dataset_t *ds;
+	int err;
+	dsl_scan_t *scn = dp->dp_scan;
+
+	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
+	if (err)
+		return (err);
+
+	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+		dsl_dataset_t *prev;
+		err = dsl_dataset_hold_obj(dp,
+		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+		if (err) {
+			dsl_dataset_rele(ds, FTAG);
+			return (err);
+		}
+
+		/*
+		 * If this is a clone, we don't need to worry about it for now.
+		 */
+		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
+			dsl_dataset_rele(ds, FTAG);
+			dsl_dataset_rele(prev, FTAG);
+			return (0);
+		}
+		dsl_dataset_rele(ds, FTAG);
+		ds = prev;
+	}
+
+	VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+	    ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+/*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * We leverage the fact that the dde's replication class (enum ddt_class)
+ * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
+ * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Since there are two replication classes which contain blocks with
+ * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
+ * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can so we must account for changes
+ * in a block's replication class.
+ *
+ * Here's an example of what can occur:
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
+ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ */
+static void
+dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+	ddt_entry_t dde = { 0 };
+	int error;
+	uint64_t n = 0;
+
+	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+		ddt_t *ddt;
+
+		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+			break;
+		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+		    (longlong_t)ddb->ddb_class,
+		    (longlong_t)ddb->ddb_type,
+		    (longlong_t)ddb->ddb_checksum,
+		    (longlong_t)ddb->ddb_cursor);
+
+		/* There should be no pending changes to the dedup table */
+		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+		ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+
+		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+		n++;
+
+		if (dsl_scan_check_pause(scn, NULL))
+			break;
+	}
+
+	zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u",
+	    (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max,
+	    (int)scn->scn_pausing);
+
+	ASSERT(error == 0 || error == ENOENT);
+	ASSERT(error != ENOENT ||
+	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+}
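
Note (illustrative, not part of the patch): the two-phase ordering
described in the block comment before dsl_scan_ddt() relies on the
ddt_class enum being sorted from most replicated to least, and on
dsl_scan_visitbp() filtering with ddt_class_contains():

	/*
	 * Phase 1 (dsl_scan_ddt): walk DDT classes up to
	 * scn_ddt_class_max -- DDT_CLASS_DITTO, then
	 * DDT_CLASS_DUPLICATE -- so each block with refcnt > 1 is
	 * scrubbed exactly once.
	 *
	 * Phase 2 (top-down traversal): skip anything phase 1 covered,
	 * leaving only refcnt == 1 blocks to issue.
	 */
	boolean_t issue = !ddt_class_contains(scn->scn_dp->dp_spa,
	    scn->scn_phys.scn_ddt_class_max, bp);
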
+
+/* ARGSUSED */
+void
+dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+    ddt_entry_t *dde, dmu_tx_t *tx)
+{
+	const ddt_key_t *ddk = &dde->dde_key;
+	ddt_phys_t *ddp = dde->dde_phys;
+	blkptr_t bp;
+	zbookmark_phys_t zb = { 0 };
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		if (ddp->ddp_phys_birth == 0 ||
+		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
+			continue;
+		ddt_bp_create(checksum, ddk, ddp, &bp);
+
+		scn->scn_visited_this_txg++;
+		scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+	}
+}
+
+static void
+dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = scn->scn_dp;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+	    scn->scn_phys.scn_ddt_class_max) {
+		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+		dsl_scan_ddt(scn, tx);
+		if (scn->scn_pausing)
+			return;
+	}
+
+	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+		/* First do the MOS & ORIGIN */
+
+		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+		dsl_scan_visit_rootbp(scn, NULL,
+		    &dp->dp_meta_rootbp, tx);
+		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+		if (scn->scn_pausing)
+			return;
+
+		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+			    enqueue_cb, tx, DS_FIND_CHILDREN));
+		} else {
+			dsl_scan_visitds(scn,
+			    dp->dp_origin_snap->ds_object, tx);
+		}
+		ASSERT(!scn->scn_pausing);
+	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
+	    ZB_DESTROYED_OBJSET) {
+		/*
+		 * If we were paused, continue from here.  Note if the
+		 * ds we were paused on was deleted, the zb_objset may
+		 * be -1, so we will skip this and find a new objset
+		 * below.
+		 */
+		dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
+		if (scn->scn_pausing)
+			return;
+	}
+
+	/*
+	 * In case we were paused right at the end of the ds, zero the
+	 * bookmark so we don't think that we're still trying to resume.
+	 */
+	bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
+
+	/* keep pulling things out of the zap-object-as-queue */
+	while (zap_cursor_init(&zc, dp->dp_meta_objset,
+	    scn->scn_phys.scn_queue_obj),
+	    zap_cursor_retrieve(&zc, &za) == 0) {
+		dsl_dataset_t *ds;
+		uint64_t dsobj;
+
+		dsobj = strtonum(za.za_name, NULL);
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    scn->scn_phys.scn_queue_obj, dsobj, tx));
+
+		/* Set up min/max txg */
+		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+		if (za.za_first_integer != 0) {
+			scn->scn_phys.scn_cur_min_txg =
+			    MAX(scn->scn_phys.scn_min_txg,
+			    za.za_first_integer);
+		} else {
+			scn->scn_phys.scn_cur_min_txg =
+			    MAX(scn->scn_phys.scn_min_txg,
+			    dsl_dataset_phys(ds)->ds_prev_snap_txg);
+		}
+		scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
+		dsl_dataset_rele(ds, FTAG);
+
+		dsl_scan_visitds(scn, dsobj, tx);
+		zap_cursor_fini(&zc);
+		if (scn->scn_pausing)
+			return;
+	}
+	zap_cursor_fini(&zc);
+}
+
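+/*
+ * Decide whether the free phase should yield this txg: after
+ * zfs_free_max_blocks blocks, past the txg sync deadline, once a
+ * waiting txg sync has outlasted zfs_free_min_time_ms, or when the
+ * pool is shutting down.  With zfs_recover set we never pause.
+ */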
+static boolean_t
+dsl_scan_free_should_pause(dsl_scan_t *scn)
+{
+	uint64_t elapsed_nanosecs;
+
+	if (zfs_recover)
+		return (B_FALSE);
+
+	if (scn->scn_visited_this_txg >= zfs_free_max_blocks)
+		return (B_TRUE);
+
+	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+	return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+	    (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
+	    txg_sync_waiting(scn->scn_dp)) ||
+	    spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
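+/*
+ * bpobj/bptree iteration callback for the free phase: free the block
+ * asynchronously, deduct the freed space from dp_free_dir's
+ * accounting, and return ERESTART when it is time to pause.
+ */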
+static int
+dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = arg;
+
+	if (!scn->scn_is_bptree ||
+	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
+		if (dsl_scan_free_should_pause(scn))
+			return (SET_ERROR(ERESTART));
+	}
+
+	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
+	    dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0));
+	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
+	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+	scn->scn_visited_this_txg++;
+	return (0);
+}
+
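+/*
+ * A scan is considered active if a scrub/resilver is underway, an
+ * async destroy is still making progress, or the free bpobj still
+ * holds space to reclaim.
+ */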
+boolean_t
+dsl_scan_active(dsl_scan_t *scn)
+{
+	spa_t *spa = scn->scn_dp->dp_spa;
+	uint64_t used = 0, comp, uncomp;
+
+	if (spa->spa_load_state != SPA_LOAD_NONE)
+		return (B_FALSE);
+	if (spa_shutting_down(spa))
+		return (B_FALSE);
+	if (scn->scn_phys.scn_state == DSS_SCANNING ||
+	    (scn->scn_async_destroying && !scn->scn_async_stalled))
+		return (B_TRUE);
+
+	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
+		    &used, &comp, &uncomp);
+	}
+	return (used != 0);
+}
+
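+/*
+ * Called once per txg from spa_sync().  In order: honor a pending
+ * restart request, run the async-destroy free phase, then, if a scan
+ * is underway, visit blocks until done or paused, wait for in-flight
+ * scrub I/O, and persist the scan state.
+ */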
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = dp->dp_scan;
+	spa_t *spa = dp->dp_spa;
+	int err = 0;
+
+	/*
+	 * Check for scn_restart_txg before checking spa_load_state, so
+	 * that we can restart an old-style scan while the pool is being
+	 * imported (see dsl_scan_init).
+	 */
+	if (dsl_scan_restarting(scn, tx)) {
+		pool_scan_func_t func = POOL_SCAN_SCRUB;
+		dsl_scan_done(scn, B_FALSE, tx);
+		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+			func = POOL_SCAN_RESILVER;
+		zfs_dbgmsg("restarting scan func=%u txg=%llu",
+		    func, tx->tx_txg);
+		dsl_scan_setup_sync(&func, tx);
+	}
+
+	/*
+	 * Only process scans in sync pass 1.
+	 */
+	if (spa_sync_pass(dp->dp_spa) > 1)
+		return;
+
+	/*
+	 * If the spa is shutting down, then stop scanning. This will
+	 * ensure that the scan does not dirty any new data during the
+	 * shutdown phase.
+	 */
+	if (spa_shutting_down(spa))
+		return;
+
+	/*
+	 * If the scan is inactive due to a stalled async destroy, try again.
+	 */
+	if (!scn->scn_async_stalled && !dsl_scan_active(scn))
+		return;
+
+	scn->scn_visited_this_txg = 0;
+	scn->scn_pausing = B_FALSE;
+	scn->scn_sync_start_time = gethrtime();
+	spa->spa_scrub_active = B_TRUE;
+
+	/*
+	 * First process the async destroys.  If we pause, don't do
+	 * any scrubbing or resilvering.  This ensures that there are no
+	 * async destroys while we are scanning, so the scan code doesn't
+	 * have to worry about traversing it.  It is also faster to free the
+	 * blocks than to scrub them.
+	 */
+	if (zfs_free_bpobj_enabled &&
+	    spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+		scn->scn_is_bptree = B_FALSE;
+		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+		    NULL, ZIO_FLAG_MUSTSUCCEED);
+		err = bpobj_iterate(&dp->dp_free_bpobj,
+		    dsl_scan_free_block_cb, scn, tx);
+		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+
+		if (err != 0 && err != ERESTART)
+			zfs_panic_recover("error %u from bpobj_iterate()", err);
+	}
+
+	if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+		ASSERT(scn->scn_async_destroying);
+		scn->scn_is_bptree = B_TRUE;
+		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+		    NULL, ZIO_FLAG_MUSTSUCCEED);
+		err = bptree_iterate(dp->dp_meta_objset,
+		    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
+		VERIFY0(zio_wait(scn->scn_zio_root));
+
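+		/*
+		 * A damaged bptree need not be fatal: EIO/ECKSUM are
+		 * swallowed here, and any space we could not reclaim
+		 * is either retried on a later pass or written off via
+		 * the dp_leak_dir transfer below (zfs_free_leak_on_eio).
+		 */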
+		if (err == EIO || err == ECKSUM) {
+			err = 0;
+		} else if (err != 0 && err != ERESTART) {
+			zfs_panic_recover("error %u from "
+			    "traverse_dataset_destroyed()", err);
+		}
+
+		if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
+			/* finished; deactivate async destroy feature */
+			spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
+			ASSERT(!spa_feature_is_active(spa,
+			    SPA_FEATURE_ASYNC_DESTROY));
+			VERIFY0(zap_remove(dp->dp_meta_objset,
+			    DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_BPTREE_OBJ, tx));
+			VERIFY0(bptree_free(dp->dp_meta_objset,
+			    dp->dp_bptree_obj, tx));
+			dp->dp_bptree_obj = 0;
+			scn->scn_async_destroying = B_FALSE;
+			scn->scn_async_stalled = B_FALSE;
+		} else {
+			/*
+			 * If we didn't make progress, mark the async
+			 * destroy as stalled, so that we will not initiate
+			 * a spa_sync() on its behalf.  Note that we only
+			 * check this if we are not finished, because if the
+			 * bptree had no blocks for us to visit, we can
+			 * finish without "making progress".
+			 */
+			scn->scn_async_stalled =
+			    (scn->scn_visited_this_txg == 0);
+		}
+	}
+	if (scn->scn_visited_this_txg) {
+		zfs_dbgmsg("freed %llu blocks in %llums from "
+		    "free_bpobj/bptree txg %llu; err=%d",
+		    (longlong_t)scn->scn_visited_this_txg,
+		    (longlong_t)
+		    NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
+		    (longlong_t)tx->tx_txg, err);
+		scn->scn_visited_this_txg = 0;
+
+		/*
+		 * Write out changes to the DDT that may be required as a
+		 * result of the blocks freed.  This ensures that the DDT
+		 * is clean when a scrub/resilver runs.
+		 */
+		ddt_sync(spa, tx->tx_txg);
+	}
+	if (err != 0)
+		return;
+	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
+	    zfs_free_leak_on_eio &&
+	    (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
+	    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
+	    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
+		/*
+		 * We have finished background destroying, but there is still
+		 * some space left in the dp_free_dir. Transfer this leaked
+		 * space to the dp_leak_dir.
+		 */
+		if (dp->dp_leak_dir == NULL) {
+			rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+			(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+			    LEAK_DIR_NAME, tx);
+			VERIFY0(dsl_pool_open_special_dir(dp,
+			    LEAK_DIR_NAME, &dp->dp_leak_dir));
+			rrw_exit(&dp->dp_config_rwlock, FTAG);
+		}
+		dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
+		    dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+		    dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+		    dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+		    -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+		    -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+		    -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+	}
+	if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
+		/* finished; verify that space accounting went to zero */
+		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
+		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
+		ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
+	}
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+	if (scn->scn_done_txg == tx->tx_txg) {
+		ASSERT(!scn->scn_pausing);
+		/* finished with scan. */
+		zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
+		dsl_scan_done(scn, B_TRUE, tx);
+		ASSERT3U(spa->spa_scrub_inflight, ==, 0);
+		dsl_scan_sync_state(scn, tx);
+		return;
+	}
+
+	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+	    scn->scn_phys.scn_ddt_class_max) {
+		zfs_dbgmsg("doing scan sync txg %llu; "
+		    "ddt bm=%llu/%llu/%llu/%llx",
+		    (longlong_t)tx->tx_txg,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
+		    (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
+		ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+	} else {
+		zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
+		    (longlong_t)tx->tx_txg,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
+		    (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+	}
+
+	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+	    NULL, ZIO_FLAG_CANFAIL);
+	dsl_pool_config_enter(dp, FTAG);
+	dsl_scan_visit(scn, tx);
+	dsl_pool_config_exit(dp, FTAG);
+	(void) zio_wait(scn->scn_zio_root);
+	scn->scn_zio_root = NULL;
+
+	zfs_dbgmsg("visited %llu blocks in %llums",
+	    (longlong_t)scn->scn_visited_this_txg,
+	    (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
+
+	if (!scn->scn_pausing) {
+		scn->scn_done_txg = tx->tx_txg + 1;
+		zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
+		    tx->tx_txg, scn->scn_done_txg);
+	}
+
+	if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight > 0) {
+			cv_wait(&spa->spa_scrub_io_cv,
+			    &spa->spa_scrub_lock);
+		}
+		mutex_exit(&spa->spa_scrub_lock);
+	}
+
+	dsl_scan_sync_state(scn, tx);
+}
+
+/*
+ * This will start a new scan, or restart an existing one.
+ */
+void
+dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+{
+	if (txg == 0) {
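+		/*
+		 * No txg supplied: create and assign a tx just to
+		 * learn the currently open txg, and restart from there.
+		 */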
+		dmu_tx_t *tx;
+		tx = dmu_tx_create_dd(dp->dp_mos_dir);
+		VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+		txg = dmu_tx_get_txg(tx);
+		dp->dp_scan->scn_restart_txg = txg;
+		dmu_tx_commit(tx);
+	} else {
+		dp->dp_scan->scn_restart_txg = txg;
+	}
+	zfs_dbgmsg("restarting resilver txg=%llu", txg);
+}
+
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+	return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
+	    dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+/*
+ * scrub consumers
+ */
+
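+/*
+ * Account one block in the pool-wide block statistics.  Each block is
+ * tallied in four buckets: its (level, type) slot plus the aggregate
+ * DN_MAX_LEVELS and DMU_OT_TOTAL rows.
+ */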
+static void
+count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+{
+	int i;
+
+	/*
+	 * If we resume after a reboot, zab will be NULL; don't record
+	 * incomplete stats in that case.
+	 */
+	if (zab == NULL)
+		return;
+
+	for (i = 0; i < 4; i++) {
+		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+		if (t & DMU_OT_NEWTYPE)
+			t = DMU_OT_OTHER;
+		zfs_blkstat_t *zb = &zab->zab_type[l][t];
+		int equal;
+
+		zb->zb_count++;
+		zb->zb_asize += BP_GET_ASIZE(bp);
+		zb->zb_lsize += BP_GET_LSIZE(bp);
+		zb->zb_psize += BP_GET_PSIZE(bp);
+		zb->zb_gangs += BP_COUNT_GANG(bp);
+
+		switch (BP_GET_NDVAS(bp)) {
+		case 2:
+			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1]))
+				zb->zb_ditto_2_of_2_samevdev++;
+			break;
+		case 3:
+			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2]));
+			if (equal == 1)
+				zb->zb_ditto_2_of_3_samevdev++;
+			else if (equal == 3)
+				zb->zb_ditto_3_of_3_samevdev++;
+			break;
+		}
+	}
+}
+
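+/*
+ * Completion callback for scrub/resilver reads: release the buffer,
+ * wake anyone throttled on spa_scrub_inflight, and count any hard
+ * (non-speculative) error against the scan.
+ */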
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+
+	zio_data_buf_free(zio->io_data, zio->io_size);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_inflight--;
+	cv_broadcast(&spa->spa_scrub_io_cv);
+
+	if (zio->io_error && (zio->io_error != ECKSUM ||
+	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+		spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
+	}
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
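+/*
+ * Per-block scrub/resilver callback: skip blocks outside the txg
+ * window, decide per DVA whether I/O is really needed (for a
+ * resilver, only blocks within a dirty-time-log range qualify), and
+ * issue a throttled read; dsl_scan_scrub_done handles completion.
+ */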
+static int
+dsl_scan_scrub_cb(dsl_pool_t *dp,
+    const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+	dsl_scan_t *scn = dp->dp_scan;
+	size_t size = BP_GET_PSIZE(bp);
+	spa_t *spa = dp->dp_spa;
+	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+	boolean_t needs_io;
+	int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+	unsigned int scan_delay = 0;
+
+	if (phys_birth <= scn->scn_phys.scn_min_txg ||
+	    phys_birth >= scn->scn_phys.scn_max_txg)
+		return (0);
+
+	count_block(dp->dp_blkstats, bp);
+
+	if (BP_IS_EMBEDDED(bp))
+		return (0);
+
+	ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
+	if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
+		zio_flags |= ZIO_FLAG_SCRUB;
+		needs_io = B_TRUE;
+		scan_delay = zfs_scrub_delay;
+	} else {
+		ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
+		zio_flags |= ZIO_FLAG_RESILVER;
+		needs_io = B_FALSE;
+		scan_delay = zfs_resilver_delay;
+	}
+
+	/* If it's an intent log block, failure is expected. */
+	if (zb->zb_level == ZB_ZIL_LEVEL)
+		zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
+		vdev_t *vd = vdev_lookup_top(spa,
+		    DVA_GET_VDEV(&bp->blk_dva[d]));
+
+		/*
+		 * Keep track of how much data we've examined so that
+		 * zpool(1M) status can make useful progress reports.
+		 */
+		scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]);
+		spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]);
+
+		/* if it's a resilver, this may not be in the target range */
+		if (!needs_io) {
+			if (DVA_GET_GANG(&bp->blk_dva[d])) {
+				/*
+				 * Gang members may be spread across multiple
+				 * vdevs, so the best estimate we have is the
+				 * scrub range, which has already been checked.
+				 * XXX -- it would be better to change our
+				 * allocation policy to ensure that all
+				 * gang members reside on the same vdev.
+				 */
+				needs_io = B_TRUE;
+			} else {
+				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
+				    phys_birth, 1);
+			}
+		}
+	}
+
+	if (needs_io && !zfs_no_scrub_io) {
+		vdev_t *rvd = spa->spa_root_vdev;
+		uint64_t maxinflight = rvd->vdev_children *
+		    MAX(zfs_top_maxinflight, 1);
+		void *data = zio_data_buf_alloc(size);
+
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight >= maxinflight)
+			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+		spa->spa_scrub_inflight++;
+		mutex_exit(&spa->spa_scrub_lock);
+
+		/*
+		 * If we're seeing recent (zfs_scan_idle) "important" I/Os
+		 * then throttle our workload to limit the impact of a scan.
+		 */
+		if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
+			delay(MAX((int)scan_delay, 0));
+
+		zio_nowait(zio_read(NULL, spa, bp, data, size,
+		    dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
+		    zio_flags, zb));
+	}
+
+	/* do not relocate this block */
+	return (0);
+}
+
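+/*
+ * Open-context entry point for starting a scan: after the vdev
+ * reopen below, the real setup runs as a sync task.
+ */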
+int
+dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+{
+	spa_t *spa = dp->dp_spa;
+
+	/*
+	 * Purge all vdev caches and probe all devices.  We do this here
+	 * rather than in sync context because this requires a writer lock
+	 * on the spa_config lock, which we can't do from sync context.  The
+	 * spa_scrub_reopen flag indicates that vdev_open() should not
+	 * attempt to start another scrub.
+	 */
+	spa_vdev_state_enter(spa, SCL_NONE);
+	spa->spa_scrub_reopen = B_TRUE;
+	vdev_reopen(spa->spa_root_vdev);
+	spa->spa_scrub_reopen = B_FALSE;
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+
+	return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
+	    dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
+}
+
+static boolean_t
+dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	return (scn->scn_restart_txg != 0 &&
+	    scn->scn_restart_txg <= tx->tx_txg);
+}
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c	27 Feb 2010 22:30:59 -0000	1.1.1.2
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,1200 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/dsl_pool.h>
-#include <sys/dsl_dataset.h>
-#include <sys/dsl_prop.h>
-#include <sys/dsl_dir.h>
-#include <sys/dsl_synctask.h>
-#include <sys/dnode.h>
-#include <sys/dmu_tx.h>
-#include <sys/dmu_objset.h>
-#include <sys/arc.h>
-#include <sys/zap.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/fs/zfs.h>
-#include <sys/zfs_znode.h>
-#include <sys/spa_impl.h>
-#include <sys/vdev_impl.h>
-#include <sys/zil_impl.h>
-#include <sys/zio_checksum.h>
-#include <sys/ddt.h>
-
-typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
-
-static scrub_cb_t dsl_pool_scrub_clean_cb;
-static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
-static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
-    uint64_t objset, uint64_t object);
-
-int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
-int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
-boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetching */
-enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
-
-extern int zfs_txg_timeout;
-
-static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
-	NULL,
-	dsl_pool_scrub_clean_cb
-};
-
-/* ARGSUSED */
-static void
-dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-	dsl_pool_t *dp = arg1;
-	enum scrub_func *funcp = arg2;
-	dmu_object_type_t ot = 0;
-	boolean_t complete = B_FALSE;
-
-	dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);
-
-	ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
-	ASSERT(*funcp > SCRUB_FUNC_NONE);
-	ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);
-
-	dp->dp_scrub_min_txg = 0;
-	dp->dp_scrub_max_txg = tx->tx_txg;
-	dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max;
-
-	if (*funcp == SCRUB_FUNC_CLEAN) {
-		vdev_t *rvd = dp->dp_spa->spa_root_vdev;
-
-		/* rewrite all disk labels */
-		vdev_config_dirty(rvd);
-
-		if (vdev_resilver_needed(rvd,
-		    &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
-			spa_event_notify(dp->dp_spa, NULL,
-			    ESC_ZFS_RESILVER_START);
-			dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
-			    tx->tx_txg);
-		} else {
-			spa_event_notify(dp->dp_spa, NULL,
-			    ESC_ZFS_SCRUB_START);
-		}
-
-		/* zero out the scrub stats in all vdev_stat_t's */
-		vdev_scrub_stat_update(rvd,
-		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
-		    POOL_SCRUB_EVERYTHING, B_FALSE);
-
-		/*
-		 * If this is an incremental scrub, limit the DDT scrub phase
-		 * to just the auto-ditto class (for correctness); the rest
-		 * of the scrub should go faster using top-down pruning.
-		 */
-		if (dp->dp_scrub_min_txg > TXG_INITIAL)
-			dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO;
-
-		dp->dp_spa->spa_scrub_started = B_TRUE;
-	}
-
-	/* back to the generic stuff */
-
-	if (dp->dp_blkstats == NULL) {
-		dp->dp_blkstats =
-		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
-	}
-	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
-
-	if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
-		ot = DMU_OT_ZAP_OTHER;
-
-	dp->dp_scrub_func = *funcp;
-	dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
-	    ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
-	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-	bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
-	dp->dp_scrub_restart = B_FALSE;
-	dp->dp_spa->spa_scrub_errors = 0;
-
-	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
-	    &dp->dp_scrub_func, tx));
-	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
-	    &dp->dp_scrub_queue_obj, tx));
-	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
-	    &dp->dp_scrub_min_txg, tx));
-	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
-	    &dp->dp_scrub_max_txg, tx));
-	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
-	    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
-	    &dp->dp_scrub_bookmark, tx));
-	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
-	    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
-	    &dp->dp_scrub_ddt_bookmark, tx));
-	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
-	    &dp->dp_scrub_ddt_class_max, tx));
-	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-	    &dp->dp_spa->spa_scrub_errors, tx));
-
-	spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
-	    "func=%u mintxg=%llu maxtxg=%llu",
-	    *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
-}
-
-int
-dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
-{
-	return (dsl_sync_task_do(dp, NULL,
-	    dsl_pool_scrub_setup_sync, dp, &func, 0));
-}
-
-/* ARGSUSED */
-static void
-dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
-{
-	dsl_pool_t *dp = arg1;
-	boolean_t *completep = arg2;
-
-	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-		return;
-
-	mutex_enter(&dp->dp_scrub_cancel_lock);
-
-	if (dp->dp_scrub_restart) {
-		dp->dp_scrub_restart = B_FALSE;
-		*completep = B_FALSE;
-	}
-
-	/* XXX this is scrub-clean specific */
-	mutex_enter(&dp->dp_spa->spa_scrub_lock);
-	while (dp->dp_spa->spa_scrub_inflight > 0) {
-		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
-		    &dp->dp_spa->spa_scrub_lock);
-	}
-	mutex_exit(&dp->dp_spa->spa_scrub_lock);
-	dp->dp_spa->spa_scrub_started = B_FALSE;
-	dp->dp_spa->spa_scrub_active = B_FALSE;
-
-	dp->dp_scrub_func = SCRUB_FUNC_NONE;
-	VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
-	    dp->dp_scrub_queue_obj, tx));
-	dp->dp_scrub_queue_obj = 0;
-	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-	bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
-
-	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_QUEUE, tx));
-	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_MIN_TXG, tx));
-	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_MAX_TXG, tx));
-	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_BOOKMARK, tx));
-	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_FUNC, tx));
-	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_ERRORS, tx));
-
-	(void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_DDT_BOOKMARK, tx);
-	(void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_DDT_CLASS_MAX, tx);
-
-	spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
-	    "complete=%u", *completep);
-
-	/* below is scrub-clean specific */
-	vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
-	    *completep);
-	/*
-	 * If the scrub/resilver completed, update all DTLs to reflect this.
-	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
-	 */
-	vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
-	    *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
-	if (*completep)
-		spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ?
-		    ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
-	spa_errlog_rotate(dp->dp_spa);
-
-	/*
-	 * We may have finished replacing a device.
-	 * Let the async thread assess this and handle the detach.
-	 */
-	spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);
-
-	dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
-	mutex_exit(&dp->dp_scrub_cancel_lock);
-}
-
-int
-dsl_pool_scrub_cancel(dsl_pool_t *dp)
-{
-	boolean_t complete = B_FALSE;
-
-	return (dsl_sync_task_do(dp, NULL,
-	    dsl_pool_scrub_cancel_sync, dp, &complete, 3));
-}
-
-void
-dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
-{
-	/*
-	 * This function will be used by bp-rewrite wad to intercept frees.
-	 */
-	zio_free(dp->dp_spa, txg, bpp);
-}
-
-static boolean_t
-bookmark_is_zero(const zbookmark_t *zb)
-{
-	return (zb->zb_objset == 0 && zb->zb_object == 0 &&
-	    zb->zb_level == 0 && zb->zb_blkid == 0);
-}
-
-/* dnp is the dnode for zb1->zb_object */
-static boolean_t
-bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
-    const zbookmark_t *zb2)
-{
-	uint64_t zb1nextL0, zb2thisobj;
-
-	ASSERT(zb1->zb_objset == zb2->zb_objset);
-	ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT);
-	ASSERT(zb2->zb_level == 0);
-
-	/*
-	 * A bookmark in the deadlist is considered to be after
-	 * everything else.
-	 */
-	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
-		return (B_TRUE);
-
-	/* The objset_phys_t isn't before anything. */
-	if (dnp == NULL)
-		return (B_FALSE);
-
-	zb1nextL0 = (zb1->zb_blkid + 1) <<
-	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
-
-	zb2thisobj = zb2->zb_object ? zb2->zb_object :
-	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
-	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
-		uint64_t nextobj = zb1nextL0 *
-		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
-		return (nextobj <= zb2thisobj);
-	}
-
-	if (zb1->zb_object < zb2thisobj)
-		return (B_TRUE);
-	if (zb1->zb_object > zb2thisobj)
-		return (B_FALSE);
-	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
-		return (B_FALSE);
-	return (zb1nextL0 <= zb2->zb_blkid);
-}
-
-static boolean_t
-scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb)
-{
-	uint64_t elapsed_nanosecs;
-	int mintime;
-
-	if (dp->dp_scrub_pausing)
-		return (B_TRUE); /* we're already pausing */
-
-	if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
-		return (B_FALSE); /* we're resuming */
-
-	/* We only know how to resume from level-0 blocks. */
-	if (zb != NULL && zb->zb_level != 0)
-		return (B_FALSE);
-
-	mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time_ms :
-	    zfs_scrub_min_time_ms;
-	elapsed_nanosecs = gethrtime() - dp->dp_scrub_start_time;
-	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
-	    (elapsed_nanosecs / MICROSEC > mintime && txg_sync_waiting(dp))) {
-		if (zb) {
-			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
-			    (longlong_t)zb->zb_objset,
-			    (longlong_t)zb->zb_object,
-			    (longlong_t)zb->zb_level,
-			    (longlong_t)zb->zb_blkid);
-			dp->dp_scrub_bookmark = *zb;
-		}
-		if (ddb) {
-			dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
-			    (longlong_t)ddb->ddb_class,
-			    (longlong_t)ddb->ddb_type,
-			    (longlong_t)ddb->ddb_checksum,
-			    (longlong_t)ddb->ddb_cursor);
-			ASSERT(&dp->dp_scrub_ddt_bookmark == ddb);
-		}
-		dp->dp_scrub_pausing = B_TRUE;
-		return (B_TRUE);
-	}
-	return (B_FALSE);
-}
-
-typedef struct zil_traverse_arg {
-	dsl_pool_t	*zta_dp;
-	zil_header_t	*zta_zh;
-} zil_traverse_arg_t;
-
-/* ARGSUSED */
-static int
-traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
-{
-	zil_traverse_arg_t *zta = arg;
-	dsl_pool_t *dp = zta->zta_dp;
-	zil_header_t *zh = zta->zta_zh;
-	zbookmark_t zb;
-
-	if (bp->blk_birth <= dp->dp_scrub_min_txg)
-		return (0);
-
-	/*
-	 * One block ("stubby") can be allocated a long time ago; we
-	 * want to visit that one because it has been allocated
-	 * (on-disk) even if it hasn't been claimed (even though for
-	 * plain scrub there's nothing to do to it).
-	 */
-	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
-		return (0);
-
-	SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
-	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
-
-	VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
-	return (0);
-}
-
-/* ARGSUSED */
-static int
-traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
-{
-	if (lrc->lrc_txtype == TX_WRITE) {
-		zil_traverse_arg_t *zta = arg;
-		dsl_pool_t *dp = zta->zta_dp;
-		zil_header_t *zh = zta->zta_zh;
-		lr_write_t *lr = (lr_write_t *)lrc;
-		blkptr_t *bp = &lr->lr_blkptr;
-		zbookmark_t zb;
-
-		if (bp->blk_birth <= dp->dp_scrub_min_txg)
-			return (0);
-
-		/*
-		 * birth can be < claim_txg if this record's txg is
-		 * already txg sync'ed (but this log block contains
-		 * other records that are not synced)
-		 */
-		if (claim_txg == 0 || bp->blk_birth < claim_txg)
-			return (0);
-
-		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
-		    lr->lr_foid, ZB_ZIL_LEVEL,
-		    lr->lr_offset / BP_GET_LSIZE(bp));
-
-		VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
-	}
-	return (0);
-}
-
-static void
-traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
-{
-	uint64_t claim_txg = zh->zh_claim_txg;
-	zil_traverse_arg_t zta = { dp, zh };
-	zilog_t *zilog;
-
-	/*
-	 * We only want to visit blocks that have been claimed but not yet
-	 * replayed (or, in read-only mode, blocks that *would* be claimed).
-	 */
-	if (claim_txg == 0 && spa_writeable(dp->dp_spa))
-		return;
-
-	zilog = zil_alloc(dp->dp_meta_objset, zh);
-
-	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta,
-	    claim_txg);
-
-	zil_free(zilog);
-}
-
-static void
-scrub_prefetch(dsl_pool_t *dp, arc_buf_t *buf, blkptr_t *bp, uint64_t objset,
-    uint64_t object, uint64_t blkid)
-{
-	zbookmark_t czb;
-	uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
-
-	if (zfs_no_scrub_prefetch)
-		return;
-
-	if (BP_IS_HOLE(bp) || bp->blk_birth <= dp->dp_scrub_min_txg ||
-	    (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
-		return;
-
-	SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
-
-	(void) arc_read(dp->dp_scrub_prefetch_zio_root, dp->dp_spa, bp,
-	    buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
-	    &flags, &czb);
-}
-
-static void
-scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
-    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
-{
-	int err;
-	arc_buf_t *buf = NULL;
-
-	if (bp->blk_birth <= dp->dp_scrub_min_txg)
-		return;
-
-	if (scrub_pause(dp, zb, NULL))
-		return;
-
-	if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
-		/*
-		 * If we already visited this bp & everything below (in
-		 * a prior txg), don't bother doing it again.
-		 */
-		if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
-			return;
-
-		/*
-		 * If we found the block we're trying to resume from, or
-		 * we went past it to a different object, zero it out to
-		 * indicate that it's OK to start checking for pausing
-		 * again.
-		 */
-		if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
-		    zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
-			dprintf("resuming at %llx/%llx/%llx/%llx\n",
-			    (longlong_t)zb->zb_objset,
-			    (longlong_t)zb->zb_object,
-			    (longlong_t)zb->zb_level,
-			    (longlong_t)zb->zb_blkid);
-			bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
-		}
-	}
-
-	/*
- * If dsl_pool_scrub_ddt() has already scrubbed this block,
-	 * don't scrub it again.
-	 */
-	if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp))
-		(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
-
-	if (BP_GET_LEVEL(bp) > 0) {
-		uint32_t flags = ARC_WAIT;
-		int i;
-		blkptr_t *cbp;
-		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
-
-		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
-		    arc_getbuf_func, &buf,
-		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-		if (err) {
-			mutex_enter(&dp->dp_spa->spa_scrub_lock);
-			dp->dp_spa->spa_scrub_errors++;
-			mutex_exit(&dp->dp_spa->spa_scrub_lock);
-			return;
-		}
-		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
-			scrub_prefetch(dp, buf, cbp, zb->zb_objset,
-			    zb->zb_object, zb->zb_blkid * epb + i);
-		}
-		for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
-			zbookmark_t czb;
-
-			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
-			    zb->zb_level - 1,
-			    zb->zb_blkid * epb + i);
-			scrub_visitbp(dp, dnp, buf, cbp, &czb);
-		}
-	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
-		uint32_t flags = ARC_WAIT;
-		dnode_phys_t *cdnp;
-		int i, j;
-		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
-
-		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
-		    arc_getbuf_func, &buf,
-		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-		if (err) {
-			mutex_enter(&dp->dp_spa->spa_scrub_lock);
-			dp->dp_spa->spa_scrub_errors++;
-			mutex_exit(&dp->dp_spa->spa_scrub_lock);
-			return;
-		}
-		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
-			for (j = 0; j < cdnp->dn_nblkptr; j++) {
-				blkptr_t *cbp = &cdnp->dn_blkptr[j];
-				scrub_prefetch(dp, buf, cbp, zb->zb_objset,
-				    zb->zb_blkid * epb + i, j);
-			}
-		}
-		for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
-			scrub_visitdnode(dp, cdnp, buf, zb->zb_objset,
-			    zb->zb_blkid * epb + i);
-		}
-	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
-		uint32_t flags = ARC_WAIT;
-		objset_phys_t *osp;
-
-		err = arc_read_nolock(NULL, dp->dp_spa, bp,
-		    arc_getbuf_func, &buf,
-		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
-		if (err) {
-			mutex_enter(&dp->dp_spa->spa_scrub_lock);
-			dp->dp_spa->spa_scrub_errors++;
-			mutex_exit(&dp->dp_spa->spa_scrub_lock);
-			return;
-		}
-
-		osp = buf->b_data;
-
-		traverse_zil(dp, &osp->os_zil_header);
-
-		scrub_visitdnode(dp, &osp->os_meta_dnode,
-		    buf, zb->zb_objset, DMU_META_DNODE_OBJECT);
-		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
-			scrub_visitdnode(dp, &osp->os_userused_dnode,
-			    buf, zb->zb_objset, DMU_USERUSED_OBJECT);
-			scrub_visitdnode(dp, &osp->os_groupused_dnode,
-			    buf, zb->zb_objset, DMU_GROUPUSED_OBJECT);
-		}
-	}
-
-	if (buf)
-		(void) arc_buf_remove_ref(buf, &buf);
-}
-
-static void
-scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf,
-    uint64_t objset, uint64_t object)
-{
-	int j;
-
-	for (j = 0; j < dnp->dn_nblkptr; j++) {
-		zbookmark_t czb;
-
-		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
-		scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
-	}
-}
-
-static void
-scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
-{
-	zbookmark_t zb;
-
-	SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
-	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
-	scrub_visitbp(dp, NULL, NULL, bp, &zb);
-}
-
-void
-dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
-	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-		return;
-
-	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
-		SET_BOOKMARK(&dp->dp_scrub_bookmark,
-		    ZB_DESTROYED_OBJSET, 0, 0, 0);
-	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-	    ds->ds_object, tx) != 0) {
-		return;
-	}
-
-	if (ds->ds_phys->ds_next_snap_obj != 0) {
-		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
-	}
-	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
-}
-
-void
-dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
-{
-	dsl_pool_t *dp = ds->ds_dir->dd_pool;
-
-	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-		return;
-
-	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
-
-	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
-		dp->dp_scrub_bookmark.zb_objset =
-		    ds->ds_phys->ds_prev_snap_obj;
-	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-	    ds->ds_object, tx) == 0) {
-		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-		    ds->ds_phys->ds_prev_snap_obj, tx) == 0);
-	}
-}
-
-void
-dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
-{
-	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
-
-	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-		return;
-
-	if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) {
-		dp->dp_scrub_bookmark.zb_objset = ds2->ds_object;
-	} else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) {
-		dp->dp_scrub_bookmark.zb_objset = ds1->ds_object;
-	}
-
-	if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-	    ds1->ds_object, tx) == 0) {
-		int err = zap_add_int(dp->dp_meta_objset,
-		    dp->dp_scrub_queue_obj, ds2->ds_object, tx);
-		VERIFY(err == 0 || err == EEXIST);
-		if (err == EEXIST) {
-			/* Both were there to begin with */
-			VERIFY(0 == zap_add_int(dp->dp_meta_objset,
-			    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
-		}
-	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-	    ds2->ds_object, tx) == 0) {
-		VERIFY(0 == zap_add_int(dp->dp_meta_objset,
-		    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
-	}
-}
-
-struct enqueue_clones_arg {
-	dmu_tx_t *tx;
-	uint64_t originobj;
-};
-
-/* ARGSUSED */
-static int
-enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
-	struct enqueue_clones_arg *eca = arg;
-	dsl_dataset_t *ds;
-	int err;
-	dsl_pool_t *dp;
-
-	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
-	if (err)
-		return (err);
-	dp = ds->ds_dir->dd_pool;
-
-	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
-		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
-			dsl_dataset_t *prev;
-			err = dsl_dataset_hold_obj(dp,
-			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
-
-			dsl_dataset_rele(ds, FTAG);
-			if (err)
-				return (err);
-			ds = prev;
-		}
-		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-		    ds->ds_object, eca->tx) == 0);
-	}
-	dsl_dataset_rele(ds, FTAG);
-	return (0);
-}
-
-static void
-scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
-{
-	dsl_dataset_t *ds;
-	uint64_t min_txg_save;
-
-	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-
-	/*
-	 * Iterate over the bps in this ds.
-	 */
-	min_txg_save = dp->dp_scrub_min_txg;
-	dp->dp_scrub_min_txg =
-	    MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
-	scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
-	dp->dp_scrub_min_txg = min_txg_save;
-
-	if (dp->dp_scrub_pausing)
-		goto out;
-
-	/*
-	 * Add descendent datasets to work queue.
-	 */
-	if (ds->ds_phys->ds_next_snap_obj != 0) {
-		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
-	}
-	if (ds->ds_phys->ds_num_children > 1) {
-		boolean_t usenext = B_FALSE;
-		if (ds->ds_phys->ds_next_clones_obj != 0) {
-			uint64_t count;
-			/*
-			 * A bug in a previous version of the code could
-			 * cause upgrade_clones_cb() to not set
-			 * ds_next_snap_obj when it should, leading to a
-			 * missing entry.  Therefore we can only use the
-			 * next_clones_obj when its count is correct.
-			 */
-			int err = zap_count(dp->dp_meta_objset,
-			    ds->ds_phys->ds_next_clones_obj, &count);
-			if (err == 0 &&
-			    count == ds->ds_phys->ds_num_children - 1)
-				usenext = B_TRUE;
-		}
-
-		if (usenext) {
-			VERIFY(zap_join(dp->dp_meta_objset,
-			    ds->ds_phys->ds_next_clones_obj,
-			    dp->dp_scrub_queue_obj, tx) == 0);
-		} else {
-			struct enqueue_clones_arg eca;
-			eca.tx = tx;
-			eca.originobj = ds->ds_object;
-
-			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
-			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
-		}
-	}
-
-out:
-	dsl_dataset_rele(ds, FTAG);
-}
-
-/* ARGSUSED */
-static int
-enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
-{
-	dmu_tx_t *tx = arg;
-	dsl_dataset_t *ds;
-	int err;
-	dsl_pool_t *dp;
-
-	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
-	if (err)
-		return (err);
-
-	dp = ds->ds_dir->dd_pool;
-
-	while (ds->ds_phys->ds_prev_snap_obj != 0) {
-		dsl_dataset_t *prev;
-		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
-		    FTAG, &prev);
-		if (err) {
-			dsl_dataset_rele(ds, FTAG);
-			return (err);
-		}
-
-		/*
-		 * If this is a clone, we don't need to worry about it for now.
-		 */
-		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
-			dsl_dataset_rele(ds, FTAG);
-			dsl_dataset_rele(prev, FTAG);
-			return (0);
-		}
-		dsl_dataset_rele(ds, FTAG);
-		ds = prev;
-	}
-
-	VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
-	    ds->ds_object, tx) == 0);
-	dsl_dataset_rele(ds, FTAG);
-	return (0);
-}
-
-/*
- * Scrub/dedup interaction.
- *
- * If there are N references to a deduped block, we don't want to scrub it
- * N times -- ideally, we should scrub it exactly once.
- *
- * We leverage the fact that the dde's replication class (enum ddt_class)
- * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
- * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
- *
- * To prevent excess scrubbing, the scrub begins by walking the DDT
- * to find all blocks with refcnt > 1, and scrubs each of these once.
- * Since there are two replication classes which contain blocks with
- * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
- * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
- *
- * There would be nothing more to say if a block's refcnt couldn't change
- * during a scrub, but of course it can so we must account for changes
- * in a block's replication class.
- *
- * Here's an example of what can occur:
- *
- * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
- * when visited during the top-down scrub phase, it will be scrubbed twice.
- * This negates our scrub optimization, but is otherwise harmless.
- *
- * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
- * on each visit during the top-down scrub phase, it will never be scrubbed.
- * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
- * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
- * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
- * while a scrub is in progress, it scrubs the block right then.
- */
-static void
-dsl_pool_scrub_ddt(dsl_pool_t *dp)
-{
-	ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark;
-	ddt_entry_t dde;
-	int error;
-
-	while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) {
-		if (ddb->ddb_class > dp->dp_scrub_ddt_class_max)
-			return;
-		dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde);
-		if (scrub_pause(dp, NULL, ddb))
-			return;
-	}
-	ASSERT(error == ENOENT);
-	ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max);
-}
-
-void
-dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
-    const ddt_entry_t *dde)
-{
-	const ddt_key_t *ddk = &dde->dde_key;
-	const ddt_phys_t *ddp = dde->dde_phys;
-	blkptr_t blk;
-	zbookmark_t zb = { 0 };
-
-	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
-		if (ddp->ddp_phys_birth == 0)
-			continue;
-		ddt_bp_create(checksum, ddk, ddp, &blk);
-		scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
-	}
-}
-
-void
-dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
-{
-	spa_t *spa = dp->dp_spa;
-	zap_cursor_t zc;
-	zap_attribute_t za;
-	boolean_t complete = B_TRUE;
-
-	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
-		return;
-
-	/*
-	 * If the pool is not loaded, or is trying to unload, leave it alone.
-	 */
-	if (spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa))
-		return;
-
-	if (dp->dp_scrub_restart) {
-		enum scrub_func func = dp->dp_scrub_func;
-		dp->dp_scrub_restart = B_FALSE;
-		dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
-	}
-
-	if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
-		/*
-		 * We must have resumed after rebooting; reset the vdev
-		 * stats to know that we're doing a scrub (although it
-		 * will think we're just starting now).
-		 */
-		vdev_scrub_stat_update(spa->spa_root_vdev,
-		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
-		    POOL_SCRUB_EVERYTHING, B_FALSE);
-	}
-
-	dp->dp_scrub_pausing = B_FALSE;
-	dp->dp_scrub_start_time = gethrtime();
-	dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
-	spa->spa_scrub_active = B_TRUE;
-
-	if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) {
-		dsl_pool_scrub_ddt(dp);
-		if (dp->dp_scrub_pausing)
-			goto out;
-	}
-
-	if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) {
-		/* First do the MOS & ORIGIN */
-		scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
-		if (dp->dp_scrub_pausing)
-			goto out;
-
-		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
-			VERIFY(0 == dmu_objset_find_spa(spa,
-			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
-		} else {
-			scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
-		}
-		ASSERT(!dp->dp_scrub_pausing);
-	} else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) {
-		/*
-		 * If we were paused, continue from here.  Note if the ds
-		 * we were paused on was destroyed, the zb_objset will be
-		 * ZB_DESTROYED_OBJSET, so we will skip this and find a new
-		 * objset below.
-		 */
-		scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
-		if (dp->dp_scrub_pausing)
-			goto out;
-	}
-
-	/*
-	 * In case we were paused right at the end of the ds, zero the
-	 * bookmark so we don't think that we're still trying to resume.
-	 */
-	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
-
-	/* keep pulling things out of the zap-object-as-queue */
-	while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
-	    zap_cursor_retrieve(&zc, &za) == 0) {
-		VERIFY(0 == zap_remove(dp->dp_meta_objset,
-		    dp->dp_scrub_queue_obj, za.za_name, tx));
-		scrub_visitds(dp, za.za_first_integer, tx);
-		if (dp->dp_scrub_pausing)
-			break;
-		zap_cursor_fini(&zc);
-	}
-	zap_cursor_fini(&zc);
-	if (dp->dp_scrub_pausing)
-		goto out;
-
-	/* done. */
-
-	dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
-	return;
-out:
-	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
-	    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
-	    &dp->dp_scrub_bookmark, tx));
-	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
-	    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
-	    &dp->dp_scrub_ddt_bookmark, tx));
-	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
-	    &dp->dp_scrub_ddt_class_max, tx));
-	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
-	    &spa->spa_scrub_errors, tx));
-}
-
-void
-dsl_pool_scrub_restart(dsl_pool_t *dp)
-{
-	mutex_enter(&dp->dp_scrub_cancel_lock);
-	dp->dp_scrub_restart = B_TRUE;
-	mutex_exit(&dp->dp_scrub_cancel_lock);
-}
-
-/*
- * scrub consumers
- */
-
-static void
-count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
-{
-	int i;
-
-	/*
-	 * If we resume after a reboot, zab will be NULL; don't record
-	 * incomplete stats in that case.
-	 */
-	if (zab == NULL)
-		return;
-
-	for (i = 0; i < 4; i++) {
-		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
-		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
-		zfs_blkstat_t *zb = &zab->zab_type[l][t];
-		int equal;
-
-		zb->zb_count++;
-		zb->zb_asize += BP_GET_ASIZE(bp);
-		zb->zb_lsize += BP_GET_LSIZE(bp);
-		zb->zb_psize += BP_GET_PSIZE(bp);
-		zb->zb_gangs += BP_COUNT_GANG(bp);
-
-		switch (BP_GET_NDVAS(bp)) {
-		case 2:
-			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
-			    DVA_GET_VDEV(&bp->blk_dva[1]))
-				zb->zb_ditto_2_of_2_samevdev++;
-			break;
-		case 3:
-			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
-			    DVA_GET_VDEV(&bp->blk_dva[1])) +
-			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
-			    DVA_GET_VDEV(&bp->blk_dva[2])) +
-			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
-			    DVA_GET_VDEV(&bp->blk_dva[2]));
-			if (equal == 1)
-				zb->zb_ditto_2_of_3_samevdev++;
-			else if (equal == 3)
-				zb->zb_ditto_3_of_3_samevdev++;
-			break;
-		}
-	}
-}
-
-static void
-dsl_pool_scrub_clean_done(zio_t *zio)
-{
-	spa_t *spa = zio->io_spa;
-
-	zio_data_buf_free(zio->io_data, zio->io_size);
-
-	mutex_enter(&spa->spa_scrub_lock);
-	spa->spa_scrub_inflight--;
-	cv_broadcast(&spa->spa_scrub_io_cv);
-
-	if (zio->io_error && (zio->io_error != ECKSUM ||
-	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)))
-		spa->spa_scrub_errors++;
-	mutex_exit(&spa->spa_scrub_lock);
-}
-
-static int
-dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
-    const blkptr_t *bp, const zbookmark_t *zb)
-{
-	size_t size = BP_GET_PSIZE(bp);
-	spa_t *spa = dp->dp_spa;
-	uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
-	boolean_t needs_io;
-	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
-	int zio_priority;
-
-	if (phys_birth <= dp->dp_scrub_min_txg ||
-	    phys_birth >= dp->dp_scrub_max_txg)
-		return (0);
-
-	count_block(dp->dp_blkstats, bp);
-
-	if (dp->dp_scrub_isresilver == 0) {
-		/* It's a scrub */
-		zio_flags |= ZIO_FLAG_SCRUB;
-		zio_priority = ZIO_PRIORITY_SCRUB;
-		needs_io = B_TRUE;
-	} else {
-		/* It's a resilver */
-		zio_flags |= ZIO_FLAG_RESILVER;
-		zio_priority = ZIO_PRIORITY_RESILVER;
-		needs_io = B_FALSE;
-	}
-
-	/* If it's an intent log block, failure is expected. */
-	if (zb->zb_level == ZB_ZIL_LEVEL)
-		zio_flags |= ZIO_FLAG_SPECULATIVE;
-
-	for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
-		vdev_t *vd = vdev_lookup_top(spa,
-		    DVA_GET_VDEV(&bp->blk_dva[d]));
-
-		/*
-		 * Keep track of how much data we've examined so that
-		 * zpool(1M) status can make useful progress reports.
-		 */
-		mutex_enter(&vd->vdev_stat_lock);
-		vd->vdev_stat.vs_scrub_examined +=
-		    DVA_GET_ASIZE(&bp->blk_dva[d]);
-		mutex_exit(&vd->vdev_stat_lock);
-
-		/* if it's a resilver, this may not be in the target range */
-		if (!needs_io) {
-			if (DVA_GET_GANG(&bp->blk_dva[d])) {
-				/*
-				 * Gang members may be spread across multiple
-				 * vdevs, so the best estimate we have is the
-				 * scrub range, which has already been checked.
-				 * XXX -- it would be better to change our
-				 * allocation policy to ensure that all
-				 * gang members reside on the same vdev.
-				 */
-				needs_io = B_TRUE;
-			} else {
-				needs_io = vdev_dtl_contains(vd, DTL_PARTIAL,
-				    phys_birth, 1);
-			}
-		}
-	}
-
-	if (needs_io && !zfs_no_scrub_io) {
-		void *data = zio_data_buf_alloc(size);
-
-		mutex_enter(&spa->spa_scrub_lock);
-		while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
-			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-		spa->spa_scrub_inflight++;
-		mutex_exit(&spa->spa_scrub_lock);
-
-		zio_nowait(zio_read(NULL, spa, bp, data, size,
-		    dsl_pool_scrub_clean_done, NULL, zio_priority,
-		    zio_flags, zb));
-	}
-
-	/* do not relocate this block */
-	return (0);
-}
-
-int
-dsl_pool_scrub_clean(dsl_pool_t *dp)
-{
-	spa_t *spa = dp->dp_spa;
-
-	/*
-	 * Purge all vdev caches and probe all devices.  We do this here
-	 * rather than in sync context because this requires a writer lock
-	 * on the spa_config lock, which we can't do from sync context.  The
-	 * spa_scrub_reopen flag indicates that vdev_open() should not
-	 * attempt to start another scrub.
-	 */
-	spa_vdev_state_enter(spa, SCL_NONE);
-	spa->spa_scrub_reopen = B_TRUE;
-	vdev_reopen(spa->spa_root_vdev);
-	spa->spa_scrub_reopen = B_FALSE;
-	(void) spa_vdev_state_exit(spa, NULL, 0);
-
-	return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
-}
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dsl_synctask.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c	27 Feb 2010 22:30:59 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c	4 Feb 2015 07:24:17 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu.h>
@@ -28,198 +28,159 @@
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_synctask.h>
-#include <sys/cred.h>
+#include <sys/metaslab.h>
 
 #define	DST_AVG_BLKSHIFT 14
 
 /* ARGSUSED */
 static int
-dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx)
+dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
 {
 	return (0);
 }
 
-dsl_sync_task_group_t *
-dsl_sync_task_group_create(dsl_pool_t *dp)
-{
-	dsl_sync_task_group_t *dstg;
-
-	dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP);
-	list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t),
-	    offsetof(dsl_sync_task_t, dst_node));
-	dstg->dstg_pool = dp;
-	dstg->dstg_cr = CRED();
-
-	return (dstg);
-}
-
-void
-dsl_sync_task_create(dsl_sync_task_group_t *dstg,
-    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
-    void *arg1, void *arg2, int blocks_modified)
-{
-	dsl_sync_task_t *dst;
-
-	if (checkfunc == NULL)
-		checkfunc = dsl_null_checkfunc;
-	dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP);
-	dst->dst_checkfunc = checkfunc;
-	dst->dst_syncfunc = syncfunc;
-	dst->dst_arg1 = arg1;
-	dst->dst_arg2 = arg2;
-	list_insert_tail(&dstg->dstg_tasks, dst);
-
-	dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT;
-}
-
+/*
+ * Called from open context to perform a callback in syncing context.  Waits
+ * for the operation to complete.
+ *
+ * The checkfunc will be called from open context as a preliminary check
+ * which can quickly fail.  If it succeeds, it will be called again from
+ * syncing context.  The checkfunc should generally be designed to work
+ * properly in either context, but if necessary it can check
+ * dmu_tx_is_syncing(tx).
+ *
+ * The synctask infrastructure enforces proper locking strategy with respect
+ * to the dp_config_rwlock -- the lock will always be held when the callbacks
+ * are called.  It will be held for read during the open-context (preliminary)
+ * call to the checkfunc, and then held for write from syncing context during
+ * the calls to the check and sync funcs.
+ *
+ * A dataset or pool name can be passed as the first argument.  Typically,
+ * the check func will hold, check the return value of the hold, and then
+ * release the dataset.  The sync func will VERIFY0(hold()) the dataset.
+ * This is safe because no changes can be made between the check and sync funcs,
+ * and the sync func will only be called if the check func successfully opened
+ * the dataset.
+ */
 int
-dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg)
+dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+    dsl_syncfunc_t *syncfunc, void *arg,
+    int blocks_modified, zfs_space_check_t space_check)
 {
+	spa_t *spa;
 	dmu_tx_t *tx;
-	uint64_t txg;
-	dsl_sync_task_t *dst;
+	int err;
+	dsl_sync_task_t dst = { 0 };
+	dsl_pool_t *dp;
 
-top:
-	tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir);
-	VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+	err = spa_open(pool, &spa, FTAG);
+	if (err != 0)
+		return (err);
+	dp = spa_get_dsl(spa);
 
-	txg = dmu_tx_get_txg(tx);
+top:
+	tx = dmu_tx_create_dd(dp->dp_mos_dir);
+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 
-	/* Do a preliminary error check. */
-	dstg->dstg_err = 0;
-	rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER);
-	for (dst = list_head(&dstg->dstg_tasks); dst;
-	    dst = list_next(&dstg->dstg_tasks, dst)) {
-#ifdef ZFS_DEBUG
-		/*
-		 * Only check half the time, otherwise, the sync-context
-		 * check will almost never fail.
-		 */
-		if (spa_get_random(2) == 0)
-			continue;
-#endif
-		dst->dst_err =
-		    dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
-		if (dst->dst_err)
-			dstg->dstg_err = dst->dst_err;
-	}
-	rw_exit(&dstg->dstg_pool->dp_config_rwlock);
+	dst.dst_pool = dp;
+	dst.dst_txg = dmu_tx_get_txg(tx);
+	dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+	dst.dst_space_check = space_check;
+	dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc;
+	dst.dst_syncfunc = syncfunc;
+	dst.dst_arg = arg;
+	dst.dst_error = 0;
+	dst.dst_nowaiter = B_FALSE;
+
+	dsl_pool_config_enter(dp, FTAG);
+	err = dst.dst_checkfunc(arg, tx);
+	dsl_pool_config_exit(dp, FTAG);
 
-	if (dstg->dstg_err) {
+	if (err != 0) {
 		dmu_tx_commit(tx);
-		return (dstg->dstg_err);
+		spa_close(spa, FTAG);
+		return (err);
 	}
 
-	VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
+	VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, &dst, dst.dst_txg));
 
 	dmu_tx_commit(tx);
 
-	txg_wait_synced(dstg->dstg_pool, txg);
+	txg_wait_synced(dp, dst.dst_txg);
 
-	if (dstg->dstg_err == EAGAIN) {
-		txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE);
+	if (dst.dst_error == EAGAIN) {
+		txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE);
 		goto top;
 	}
 
-	return (dstg->dstg_err);
+	spa_close(spa, FTAG);
+	return (dst.dst_error);
 }
 
 void
-dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
+dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+    int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
 {
-	uint64_t txg;
+	dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
 
-	dstg->dstg_nowaiter = B_TRUE;
-	txg = dmu_tx_get_txg(tx);
-	VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg));
-}
-
-void
-dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg)
-{
-	dsl_sync_task_t *dst;
+	dst->dst_pool = dp;
+	dst->dst_txg = dmu_tx_get_txg(tx);
+	dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+	dst->dst_space_check = space_check;
+	dst->dst_checkfunc = dsl_null_checkfunc;
+	dst->dst_syncfunc = syncfunc;
+	dst->dst_arg = arg;
+	dst->dst_error = 0;
+	dst->dst_nowaiter = B_TRUE;
 
-	while (dst = list_head(&dstg->dstg_tasks)) {
-		list_remove(&dstg->dstg_tasks, dst);
-		kmem_free(dst, sizeof (dsl_sync_task_t));
-	}
-	kmem_free(dstg, sizeof (dsl_sync_task_group_t));
+	VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, dst, dst->dst_txg));
 }
 
+/*
+ * Called in syncing context to execute the synctask.
+ */
 void
-dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx)
+dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
 {
-	dsl_sync_task_t *dst;
-	void *tr_cookie;
+	dsl_pool_t *dp = dst->dst_pool;
 
-	ASSERT3U(dstg->dstg_err, ==, 0);
+	ASSERT0(dst->dst_error);
 
 	/*
 	 * Check for sufficient space.
+	 *
+	 * When the sync task was created, the caller specified the
+	 * type of space checking required.  See the comment in
+	 * zfs_space_check_t for details on the semantics of each
+	 * type of space checking.
+	 *
+	 * We just check against what's on-disk; we don't want any
+	 * in-flight accounting to get in our way, because open context
+	 * may have already used up various in-core limits
+	 * (arc_tempreserve, dsl_pool_tempreserve).
 	 */
-	dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir,
-	    dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx);
-	/* don't bother trying again */
-	if (dstg->dstg_err == ERESTART)
-		dstg->dstg_err = EAGAIN;
-	if (dstg->dstg_err)
-		return;
-
-	/*
-	 * Check for errors by calling checkfuncs.
-	 */
-	rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER);
-	for (dst = list_head(&dstg->dstg_tasks); dst;
-	    dst = list_next(&dstg->dstg_tasks, dst)) {
-		dst->dst_err =
-		    dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx);
-		if (dst->dst_err)
-			dstg->dstg_err = dst->dst_err;
-	}
-
-	if (dstg->dstg_err == 0) {
-		/*
-		 * Execute sync tasks.
-		 */
-		for (dst = list_head(&dstg->dstg_tasks); dst;
-		    dst = list_next(&dstg->dstg_tasks, dst)) {
-			dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2,
-			    dstg->dstg_cr, tx);
+	if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) {
+		uint64_t quota = dsl_pool_adjustedsize(dp,
+		    dst->dst_space_check == ZFS_SPACE_CHECK_RESERVED) -
+		    metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+		uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+		/* MOS space is triple-dittoed, so we multiply by 3. */
+		if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) {
+			dst->dst_error = SET_ERROR(ENOSPC);
+			if (dst->dst_nowaiter)
+				kmem_free(dst, sizeof (*dst));
+			return;
 		}
 	}
-	rw_exit(&dstg->dstg_pool->dp_config_rwlock);
-
-	dsl_dir_tempreserve_clear(tr_cookie, tx);
 
-	if (dstg->dstg_nowaiter)
-		dsl_sync_task_group_destroy(dstg);
-}
-
-int
-dsl_sync_task_do(dsl_pool_t *dp,
-    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
-    void *arg1, void *arg2, int blocks_modified)
-{
-	dsl_sync_task_group_t *dstg;
-	int err;
-
-	dstg = dsl_sync_task_group_create(dp);
-	dsl_sync_task_create(dstg, checkfunc, syncfunc,
-	    arg1, arg2, blocks_modified);
-	err = dsl_sync_task_group_wait(dstg);
-	dsl_sync_task_group_destroy(dstg);
-	return (err);
-}
-
-void
-dsl_sync_task_do_nowait(dsl_pool_t *dp,
-    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
-    void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx)
-{
-	dsl_sync_task_group_t *dstg;
-
-	dstg = dsl_sync_task_group_create(dp);
-	dsl_sync_task_create(dstg, checkfunc, syncfunc,
-	    arg1, arg2, blocks_modified);
-	dsl_sync_task_group_nowait(dstg, tx);
+	/*
+	 * Check for errors by calling checkfunc.
+	 */
+	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+	dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx);
+	if (dst->dst_error == 0)
+		dst->dst_syncfunc(dst->dst_arg, tx);
+	rrw_exit(&dp->dp_config_rwlock, FTAG);
+	if (dst->dst_nowaiter)
+		kmem_free(dst, sizeof (*dst));
 }
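
For reference, a minimal sketch of how a caller drives the new single-synctask
interface; the myfeature_* names and argument structure are hypothetical and
not part of this patch:

typedef struct myfeature_arg {
	const char *mfa_dsname;		/* dataset to operate on */
} myfeature_arg_t;

static int
myfeature_check(void *arg, dmu_tx_t *tx)
{
	myfeature_arg_t *mfa = arg;
	dsl_dataset_t *ds;
	int error;

	/* Runs first in open context (read lock), then again when syncing. */
	error = dsl_dataset_hold(dmu_tx_pool(tx), mfa->mfa_dsname, FTAG, &ds);
	if (error != 0)
		return (error);
	dsl_dataset_rele(ds, FTAG);
	return (0);
}

static void
myfeature_sync(void *arg, dmu_tx_t *tx)
{
	myfeature_arg_t *mfa = arg;
	dsl_dataset_t *ds;

	/* The check func already succeeded, so this hold cannot fail. */
	VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), mfa->mfa_dsname, FTAG, &ds));
	/* ... dirty on-disk state here, under dp_config_rwlock (writer) ... */
	dsl_dataset_rele(ds, FTAG);
}

static int
myfeature_do(const char *dsname)
{
	myfeature_arg_t mfa = { dsname };

	/* One block modified; standard space check. */
	return (dsl_sync_task(dsname, myfeature_check, myfeature_sync,
	    &mfa, 1, ZFS_SPACE_CHECK_NORMAL));
}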
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_userhold.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_userhold.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_userhold.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_userhold.c	10 Oct 2016 11:09:56 -0000
@@ -0,0 +1,666 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+
+typedef struct dsl_dataset_user_hold_arg {
+	nvlist_t *dduha_holds;
+	nvlist_t *dduha_chkholds;
+	nvlist_t *dduha_errlist;
+	minor_t dduha_minor;
+} dsl_dataset_user_hold_arg_t;
+
+/*
+ * If you add new checks here, you may need to add additional checks to the
+ * "temporary" case in snapshot_check() in dmu_objset.c.
+ */
+int
+dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag,
+    boolean_t temphold, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
+	int error = 0;
+
+	ASSERT(dsl_pool_config_held(dp));
+
+	if (strlen(htag) > MAXNAMELEN)
+		return (SET_ERROR(E2BIG));
+	/* Tempholds have a more restricted length */
+	if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
+		return (SET_ERROR(E2BIG));
+
+	/* tags must be unique (if ds already exists) */
+	if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
+		uint64_t value;
+
+		error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+		    htag, 8, 1, &value);
+		if (error == 0)
+			error = SET_ERROR(EEXIST);
+		else if (error == ENOENT)
+			error = 0;
+	}
+
+	return (error);
+}
+
+static int
+dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_user_hold_arg_t *dduha = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+
+	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS)
+		return (SET_ERROR(ENOTSUP));
+
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) {
+		dsl_dataset_t *ds;
+		int error = 0;
+		char *htag, *name;
+
+		/* must be a snapshot */
+		name = nvpair_name(pair);
+		if (strchr(name, '@') == NULL)
+			error = SET_ERROR(EINVAL);
+
+		if (error == 0)
+			error = nvpair_value_string(pair, &htag);
+
+		if (error == 0)
+			error = dsl_dataset_hold(dp, name, FTAG, &ds);
+
+		if (error == 0) {
+			error = dsl_dataset_user_hold_check_one(ds, htag,
+			    dduha->dduha_minor != 0, tx);
+			dsl_dataset_rele(ds, FTAG);
+		}
+
+		if (error == 0) {
+			fnvlist_add_string(dduha->dduha_chkholds, name, htag);
+		} else {
+			/*
+			 * We register ENOENT errors so they can be correctly
+			 * reported if needed, such as when all holds fail.
+			 */
+			fnvlist_add_int32(dduha->dduha_errlist, name, error);
+			if (error != ENOENT)
+				return (error);
+		}
+	}
+
+	return (0);
+}
+
+static void
+dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds,
+    const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t zapobj;
+
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+	if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) {
+		/*
+		 * This is the first user hold for this dataset.  Create
+		 * the userrefs zap object.
+		 */
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj =
+		    zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
+	} else {
+		zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
+	}
+	ds->ds_userrefs++;
+
+	VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx));
+
+	if (minor != 0) {
+		char name[MAXNAMELEN];
+		nvlist_t *tags;
+
+		VERIFY0(dsl_pool_user_hold(dp, ds->ds_object,
+		    htag, now, tx));
+		(void) snprintf(name, sizeof (name), "%llx",
+		    (u_longlong_t)ds->ds_object);
+
+		if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) {
+			tags = fnvlist_alloc();
+			fnvlist_add_boolean(tags, htag);
+			fnvlist_add_nvlist(tmpholds, name, tags);
+			fnvlist_free(tags);
+		} else {
+			fnvlist_add_boolean(tags, htag);
+		}
+	}
+
+	spa_history_log_internal_ds(ds, "hold", tx,
+	    "tag=%s temp=%d refs=%llu",
+	    htag, minor != 0, ds->ds_userrefs);
+}
+
+typedef struct zfs_hold_cleanup_arg {
+	char zhca_spaname[ZFS_MAX_DATASET_NAME_LEN];
+	uint64_t zhca_spa_load_guid;
+	nvlist_t *zhca_holds;
+} zfs_hold_cleanup_arg_t;
+
+static void
+dsl_dataset_user_release_onexit(void *arg)
+{
+	zfs_hold_cleanup_arg_t *ca = arg;
+	spa_t *spa;
+	int error;
+
+	error = spa_open(ca->zhca_spaname, &spa, FTAG);
+	if (error != 0) {
+		zfs_dbgmsg("couldn't release holds on pool=%s "
+		    "because pool is no longer loaded",
+		    ca->zhca_spaname);
+		return;
+	}
+	if (spa_load_guid(spa) != ca->zhca_spa_load_guid) {
+		zfs_dbgmsg("couldn't release holds on pool=%s "
+		    "because pool is no longer loaded (guid doesn't match)",
+		    ca->zhca_spaname);
+		spa_close(spa, FTAG);
+		return;
+	}
+
+	(void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds);
+	fnvlist_free(ca->zhca_holds);
+	kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
+	spa_close(spa, FTAG);
+}
+
+static void
+dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor)
+{
+	zfs_hold_cleanup_arg_t *ca;
+
+	if (minor == 0 || nvlist_empty(holds)) {
+		fnvlist_free(holds);
+		return;
+	}
+
+	ASSERT(spa != NULL);
+	ca = kmem_alloc(sizeof (*ca), KM_SLEEP);
+
+	(void) strlcpy(ca->zhca_spaname, spa_name(spa),
+	    sizeof (ca->zhca_spaname));
+	ca->zhca_spa_load_guid = spa_load_guid(spa);
+	ca->zhca_holds = holds;
+	VERIFY0(zfs_onexit_add_cb(minor,
+	    dsl_dataset_user_release_onexit, ca, NULL));
+}
+
+void
+dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag,
+    minor_t minor, uint64_t now, dmu_tx_t *tx)
+{
+	nvlist_t *tmpholds;
+
+	if (minor != 0)
+		tmpholds = fnvlist_alloc();
+	else
+		tmpholds = NULL;
+	dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx);
+	dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor);
+}
+
+static void
+dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_user_hold_arg_t *dduha = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	nvlist_t *tmpholds;
+	uint64_t now = gethrestime_sec();
+
+	if (dduha->dduha_minor != 0)
+		tmpholds = fnvlist_alloc();
+	else
+		tmpholds = NULL;
+	for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL);
+	    pair != NULL;
+	    pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) {
+		dsl_dataset_t *ds;
+
+		VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
+		dsl_dataset_user_hold_sync_one_impl(tmpholds, ds,
+		    fnvpair_value_string(pair), dduha->dduha_minor, now, tx);
+		dsl_dataset_rele(ds, FTAG);
+	}
+	dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor);
+}
+
+/*
+ * The full semantics of this function are described in the comment above
+ * lzc_hold().
+ *
+ * To summarize:
+ * holds is nvl of snapname -> holdname
+ * errlist will be filled in with snapname -> error
+ *
+ * The snapshots must all be in the same pool.
+ *
+ * Holds for snapshots that don't exist will be skipped.
+ *
+ * If none of the snapshots for requested holds exist then ENOENT will be
+ * returned.
+ *
+ * If cleanup_minor is not 0, the holds will be temporary and will be cleaned
+ * up when the process exits.
+ *
+ * On success all the holds, for snapshots that existed, will be created and 0
+ * will be returned.
+ *
+ * On failure no holds will be created, the errlist will be filled in,
+ * and an errno will be returned.
+ *
+ * In all cases the errlist will contain entries for holds where the snapshot
+ * didn't exist.
+ */
+int
+dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist)
+{
+	dsl_dataset_user_hold_arg_t dduha;
+	nvpair_t *pair;
+	int ret;
+
+	pair = nvlist_next_nvpair(holds, NULL);
+	if (pair == NULL)
+		return (0);
+
+	dduha.dduha_holds = holds;
+	dduha.dduha_chkholds = fnvlist_alloc();
+	dduha.dduha_errlist = errlist;
+	dduha.dduha_minor = cleanup_minor;
+
+	ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check,
+	    dsl_dataset_user_hold_sync, &dduha,
+	    fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED);
+	fnvlist_free(dduha.dduha_chkholds);
+
+	return (ret);
+}
+
+typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag,
+    dsl_dataset_t **dsp);
+
+typedef struct dsl_dataset_user_release_arg {
+	dsl_holdfunc_t *ddura_holdfunc;
+	nvlist_t *ddura_holds;
+	nvlist_t *ddura_todelete;
+	nvlist_t *ddura_errlist;
+	nvlist_t *ddura_chkholds;
+} dsl_dataset_user_release_arg_t;
+
+/* Place a dataset hold on the snapshot identified by the passed dsobj string */
+static int
+dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag,
+    dsl_dataset_t **dsp)
+{
+	return (dsl_dataset_hold_obj(dp, strtonum(dsobj, NULL), tag, dsp));
+}
+
+static int
+dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura,
+    dsl_dataset_t *ds, nvlist_t *holds, const char *snapname)
+{
+	uint64_t zapobj;
+	nvlist_t *holds_found;
+	objset_t *mos;
+	int numholds;
+
+	if (!ds->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
+
+	if (nvlist_empty(holds))
+		return (0);
+
+	numholds = 0;
+	mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
+	holds_found = fnvlist_alloc();
+
+	for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(holds, pair)) {
+		uint64_t tmp;
+		int error;
+		const char *holdname = nvpair_name(pair);
+
+		if (zapobj != 0)
+			error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp);
+		else
+			error = SET_ERROR(ENOENT);
+
+		/*
+		 * Non-existent holds are put on the errlist, but don't
+		 * cause an overall failure.
+		 */
+		if (error == ENOENT) {
+			if (ddura->ddura_errlist != NULL) {
+				char *errtag = kmem_asprintf("%s#%s",
+				    snapname, holdname);
+				fnvlist_add_int32(ddura->ddura_errlist, errtag,
+				    ENOENT);
+				strfree(errtag);
+			}
+			continue;
+		}
+
+		if (error != 0) {
+			fnvlist_free(holds_found);
+			return (error);
+		}
+
+		fnvlist_add_boolean(holds_found, holdname);
+		numholds++;
+	}
+
+	if (DS_IS_DEFER_DESTROY(ds) &&
+	    dsl_dataset_phys(ds)->ds_num_children == 1 &&
+	    ds->ds_userrefs == numholds) {
+		/* we need to destroy the snapshot as well */
+		if (dsl_dataset_long_held(ds)) {
+			fnvlist_free(holds_found);
+			return (SET_ERROR(EBUSY));
+		}
+		fnvlist_add_boolean(ddura->ddura_todelete, snapname);
+	}
+
+	if (numholds != 0) {
+		fnvlist_add_nvlist(ddura->ddura_chkholds, snapname,
+		    holds_found);
+	}
+	fnvlist_free(holds_found);
+
+	return (0);
+}
+
+static int
+dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_user_release_arg_t *ddura;
+	dsl_holdfunc_t *holdfunc;
+	dsl_pool_t *dp;
+
+	if (!dmu_tx_is_syncing(tx))
+		return (0);
+
+	dp = dmu_tx_pool(tx);
+
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+	ddura = arg;
+	holdfunc = ddura->ddura_holdfunc;
+
+	for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_holds, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) {
+		int error;
+		dsl_dataset_t *ds;
+		nvlist_t *holds;
+		const char *snapname = nvpair_name(pair);
+
+		error = nvpair_value_nvlist(pair, &holds);
+		if (error != 0)
+			error = (SET_ERROR(EINVAL));
+		else
+			error = holdfunc(dp, snapname, FTAG, &ds);
+		if (error == 0) {
+			error = dsl_dataset_user_release_check_one(ddura, ds,
+			    holds, snapname);
+			dsl_dataset_rele(ds, FTAG);
+		}
+		if (error != 0) {
+			if (ddura->ddura_errlist != NULL) {
+				fnvlist_add_int32(ddura->ddura_errlist,
+				    snapname, error);
+			}
+			/*
+			 * Non-existent snapshots are put on the errlist,
+			 * but don't cause an overall failure.
+			 */
+			if (error != ENOENT)
+				return (error);
+		}
+	}
+
+	return (0);
+}
+
+static void
+dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds,
+    dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+
+	for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(holds, pair)) {
+		int error;
+		const char *holdname = nvpair_name(pair);
+
+		/* Remove temporary hold if one exists. */
+		error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx);
+		VERIFY(error == 0 || error == ENOENT);
+
+		VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+		    holdname, tx));
+		ds->ds_userrefs--;
+
+		spa_history_log_internal_ds(ds, "release", tx,
+		    "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs);
+	}
+}
+
+static void
+dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_dataset_user_release_arg_t *ddura = arg;
+	dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+
+	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+	for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds,
+	    pair)) {
+		dsl_dataset_t *ds;
+		const char *name = nvpair_name(pair);
+
+		VERIFY0(holdfunc(dp, name, FTAG, &ds));
+
+		dsl_dataset_user_release_sync_one(ds,
+		    fnvpair_value_nvlist(pair), tx);
+		if (nvlist_exists(ddura->ddura_todelete, name)) {
+			ASSERT(ds->ds_userrefs == 0 &&
+			    dsl_dataset_phys(ds)->ds_num_children == 1 &&
+			    DS_IS_DEFER_DESTROY(ds));
+			dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
+		}
+		dsl_dataset_rele(ds, FTAG);
+	}
+}
+
+/*
+ * The full semantics of this function are described in the comment above
+ * lzc_release().
+ *
+ * To summarize:
+ * Releases holds specified in the nvl holds.
+ *
+ * holds is nvl of snapname -> { holdname, ... }
+ * errlist will be filled in with snapname -> error
+ *
+ * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots,
+ * otherwise they should be the names of snapshots.
+ *
+ * As a release may cause snapshots to be destroyed, this tries to ensure they
+ * aren't mounted.
+ *
+ * Releases of non-existent holds are skipped.
+ *
+ * At least one hold must have been released for this function to succeed
+ * and return 0.
+ */
+static int
+dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
+    dsl_pool_t *tmpdp)
+{
+	dsl_dataset_user_release_arg_t ddura;
+	nvpair_t *pair;
+	char *pool;
+	int error;
+
+	pair = nvlist_next_nvpair(holds, NULL);
+	if (pair == NULL)
+		return (0);
+
+	/*
+	 * The release may cause snapshots to be destroyed; make sure they
+	 * are not mounted.
+	 */
+	if (tmpdp != NULL) {
+		/* Temporary holds are specified by dsobj string. */
+		ddura.ddura_holdfunc = dsl_dataset_hold_obj_string;
+		pool = spa_name(tmpdp->dp_spa);
+#ifdef _KERNEL
+		for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+		    pair = nvlist_next_nvpair(holds, pair)) {
+			dsl_dataset_t *ds;
+
+			dsl_pool_config_enter(tmpdp, FTAG);
+			error = dsl_dataset_hold_obj_string(tmpdp,
+			    nvpair_name(pair), FTAG, &ds);
+			if (error == 0) {
+				char name[ZFS_MAX_DATASET_NAME_LEN];
+				dsl_dataset_name(ds, name);
+				dsl_pool_config_exit(tmpdp, FTAG);
+				dsl_dataset_rele(ds, FTAG);
+				(void) zfs_unmount_snap(name);
+			} else {
+				dsl_pool_config_exit(tmpdp, FTAG);
+			}
+		}
+#endif
+	} else {
+		/* Non-temporary holds are specified by name. */
+		ddura.ddura_holdfunc = dsl_dataset_hold;
+		pool = nvpair_name(pair);
+#ifdef _KERNEL
+		for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+		    pair = nvlist_next_nvpair(holds, pair)) {
+			(void) zfs_unmount_snap(nvpair_name(pair));
+		}
+#endif
+	}
+
+	ddura.ddura_holds = holds;
+	ddura.ddura_errlist = errlist;
+	ddura.ddura_todelete = fnvlist_alloc();
+	ddura.ddura_chkholds = fnvlist_alloc();
+
+	error = dsl_sync_task(pool, dsl_dataset_user_release_check,
+	    dsl_dataset_user_release_sync, &ddura, 0, ZFS_SPACE_CHECK_NONE);
+	fnvlist_free(ddura.ddura_todelete);
+	fnvlist_free(ddura.ddura_chkholds);
+
+	return (error);
+}
+
+/*
+ * holds is nvl of snapname -> { holdname, ... }
+ * errlist will be filled in with snapname -> error
+ */
+int
+dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist)
+{
+	return (dsl_dataset_user_release_impl(holds, errlist, NULL));
+}
+
+/*
+ * holds is nvl of snapdsobj -> { holdname, ... }
+ */
+void
+dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds)
+{
+	ASSERT(dp != NULL);
+	(void) dsl_dataset_user_release_impl(holds, NULL, dp);
+}
+
+int
+dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
+	int err;
+
+	err = dsl_pool_hold(dsname, FTAG, &dp);
+	if (err != 0)
+		return (err);
+	err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+	if (err != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (err);
+	}
+
+	if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
+		zap_attribute_t *za;
+		zap_cursor_t zc;
+
+		za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+		for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
+		    dsl_dataset_phys(ds)->ds_userrefs_obj);
+		    zap_cursor_retrieve(&zc, za) == 0;
+		    zap_cursor_advance(&zc)) {
+			fnvlist_add_uint64(nvl, za->za_name,
+			    za->za_first_integer);
+		}
+		zap_cursor_fini(&zc);
+		kmem_free(za, sizeof (zap_attribute_t));
+	}
+	dsl_dataset_rele(ds, FTAG);
+	dsl_pool_rele(dp, FTAG);
+	return (0);
+}
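
For reference, a minimal sketch of a caller under the two nvlist conventions
implemented above; the pool, snapshot, and tag names are placeholders:

static int
example_hold_and_release(void)
{
	nvlist_t *holds = fnvlist_alloc();
	nvlist_t *errlist = fnvlist_alloc();
	int error;

	/* For holds: snapshot name -> hold tag. */
	fnvlist_add_string(holds, "tank/fs@snap1", "mytag");
	fnvlist_add_string(holds, "tank/fs@snap2", "mytag");

	/* cleanup_minor == 0: permanent holds, no on-exit cleanup. */
	error = dsl_dataset_user_hold(holds, 0, errlist);

	if (error == 0) {
		/* For releases: snapshot name -> { hold tag, ... }. */
		nvlist_t *rel = fnvlist_alloc();
		nvlist_t *tags = fnvlist_alloc();

		fnvlist_add_boolean(tags, "mytag");
		fnvlist_add_nvlist(rel, "tank/fs@snap1", tags);
		error = dsl_dataset_user_release(rel, errlist);
		fnvlist_free(tags);
		fnvlist_free(rel);
	}

	/* errlist has one entry per snapshot/hold that failed. */
	fnvlist_free(errlist);
	fnvlist_free(holds);
	return (error);
}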
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/edonr_zfs.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/edonr_zfs.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/edonr_zfs.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/edonr_zfs.c	22 Nov 2015 17:22:31 -0000
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Saso Kiselkov.  All rights reserved.
+ * Use is subject to license terms.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/edonr.h>
+
+#define	EDONR_MODE		512
+#define	EDONR_BLOCK_SIZE	EdonR512_BLOCK_SIZE
+
+/*
+ * Native zio_checksum interface for the Edon-R hash function.
+ */
+/*ARGSUSED*/
+void
+zio_checksum_edonr_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	uint8_t		digest[EDONR_MODE / 8];
+	EdonRState	ctx;
+
+	ASSERT(ctx_template != NULL);
+	bcopy(ctx_template, &ctx, sizeof (ctx));
+	EdonRUpdate(&ctx, buf, size * 8);
+	EdonRFinal(&ctx, digest);
+	bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word));
+}
+
+/*
+ * Byteswapped zio_checksum interface for the Edon-R hash function.
+ */
+void
+zio_checksum_edonr_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	zio_cksum_t	tmp;
+
+	zio_checksum_edonr_native(buf, size, ctx_template, &tmp);
+	zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+	zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+	zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+	zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+
+void *
+zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
+{
+	EdonRState	*ctx;
+	uint8_t		salt_block[EDONR_BLOCK_SIZE];
+
+	/*
+	 * Edon-R needs all but the last hash invocation to be on full-size
+	 * blocks, but the salt is too small. Rather than simply padding it
+	 * with zeros, we expand the salt into a new salt block of proper
+	 * size by double-hashing it (the new salt block will be composed of
+	 * H(salt) || H(H(salt))).
+	 */
+	CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8));
+	EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8,
+	    salt_block);
+	EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block +
+	    EDONR_MODE / 8);
+
+	/*
+	 * Feed the new salt block into the hash function - this will serve
+	 * as our MAC key.
+	 */
+	ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+	EdonRInit(ctx, EDONR_MODE);
+	EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8);
+	return (ctx);
+}
+
+void
+zio_checksum_edonr_tmpl_free(void *ctx_template)
+{
+	EdonRState	*ctx = ctx_template;
+
+	bzero(ctx, sizeof (*ctx));
+	kmem_free(ctx, sizeof (*ctx));
+}
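
For reference, a minimal sketch of the template lifecycle around the salted
Edon-R entry points above; the wrapper name is hypothetical:

static void
example_edonr_usage(const zio_cksum_salt_t *salt, const void *buf,
    uint64_t size)
{
	void *tmpl;
	zio_cksum_t cksum;

	/* Expand the salt once, then reuse the template for many buffers. */
	tmpl = zio_checksum_edonr_tmpl_init(salt);
	zio_checksum_edonr_native(buf, size, tmpl, &cksum);
	/* ... checksum further buffers with the same template ... */
	zio_checksum_edonr_tmpl_free(tmpl);
}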
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/lz4.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/lz4.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/lz4.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/lz4.c	25 Apr 2017 16:40:13 -0000
@@ -0,0 +1,1039 @@
+/*
+ * LZ4 - Fast LZ compression algorithm
+ * Header File
+ * Copyright (C) 2011-2013, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ */
+
+#include <sys/zfs_context.h>
+
+static int real_LZ4_compress(const char *source, char *dest, int isize,
+    int osize);
+static int LZ4_compressBound(int isize);
+static int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
+    int isize, int maxOutputSize);
+static int LZ4_compressCtx(void *ctx, const char *source, char *dest,
+    int isize, int osize);
+static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest,
+    int isize, int osize);
+
+#ifdef __FreeBSD__
+static kmem_cache_t *lz4_ctx_cache;
+#endif
+
+/*ARGSUSED*/
+size_t
+lz4_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+	uint32_t bufsiz;
+	char *dest = d_start;
+
+	ASSERT(d_len >= sizeof (bufsiz));
+
+	bufsiz = real_LZ4_compress(s_start, &dest[sizeof (bufsiz)], s_len,
+	    d_len - sizeof (bufsiz));
+
+	/* Signal an error if the compression routine returned zero. */
+	if (bufsiz == 0)
+		return (s_len);
+
+	/*
+	 * Encode the compressed buffer size at the start. We'll need this in
+	 * decompression to counter the effects of padding which might be
+	 * added to the compressed buffer and which, if unhandled, would
+	 * confuse the hell out of our decompression function.
+	 */
+	*(uint32_t *)dest = BE_32(bufsiz);
+
+	return (bufsiz + sizeof (bufsiz));
+}
+
+/*ARGSUSED*/
+int
+lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+	const char *src = s_start;
+	uint32_t bufsiz = BE_IN32(src);
+
+	/* invalid compressed buffer size encoded at start */
+	if (bufsiz + sizeof (bufsiz) > s_len)
+		return (1);
+
+	/*
+	 * Returns 0 on success (decompression function returned non-negative)
+	 * and non-zero on failure (decompression function returned negative).
+	 */
+	return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)],
+	    d_start, bufsiz, d_len) < 0);
+}
+
+/*
+ * LZ4 API Description:
+ *
+ * Simple Functions:
+ * real_LZ4_compress() :
+ * 	isize  : is the input size. Max supported value is ~1.9GB
+ * 	return : the number of bytes written in buffer dest
+ *		 or 0 if the compression fails (if LZ4_COMPRESSMIN is set).
+ * 	note : destination buffer must be already allocated.
+ * 		destination buffer must be sized to handle worst cases
+ * 		situations (input data not compressible) worst case size
+ * 		evaluation is provided by function LZ4_compressBound().
+ *
+ * Advanced Functions
+ *
+ * LZ4_compressBound() :
+ * 	Provides the maximum size that LZ4 may output in a "worst case"
+ * 	scenario (input data not compressible) primarily useful for memory
+ * 	allocation of output buffer.
+ *
+ * 	isize  : is the input size. Max supported value is ~1.9GB
+ * 	return : maximum output size in a "worst case" scenario
+ * 	note : this function is limited by "int" range (2^31-1)
+ *
+ * LZ4_uncompress_unknownOutputSize() :
+ * 	isize  : is the input size, therefore the compressed size
+ * 	maxOutputSize : is the size of the destination buffer (which must be
+ * 		already allocated)
+ * 	return : the number of bytes decoded in the destination buffer
+ * 		(necessarily <= maxOutputSize). If the source stream is
+ * 		malformed, the function will stop decoding and return a
+ * 		negative result, indicating the byte position of the faulty
+ * 		instruction. This function never writes beyond dest +
+ * 		maxOutputSize, and is therefore protected against malicious
+ * 		data packets.
+ * 	note   : Destination buffer must be already allocated.
+ *
+ * LZ4_compressCtx() :
+ * 	This function explicitly handles the CTX memory structure.
+ *
+ * 	ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
+ * 	by the caller (either on the stack or using kmem_zalloc). Passing NULL
+ * 	isn't valid.
+ *
+ * LZ4_compress64kCtx() :
+ * 	Same as LZ4_compressCtx(), but specific to small inputs (<64KB).
+ * 	isize *Must* be <64KB, otherwise the output will be corrupted.
+ *
+ * 	ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
+ * 	by the caller (either on the stack or using kmem_zalloc). Passing NULL
+ * 	isn't valid.
+ */
+
+/*
+ * Tuning parameters
+ */
+
+/*
+ * COMPRESSIONLEVEL: Increasing this value improves compression ratio
+ *	 Lowering this value reduces memory usage. Reduced memory usage
+ *	typically improves speed, due to cache effect (ex: L1 32KB for Intel,
+ *	L1 64KB for AMD). Memory usage formula : N->2^(N+2) Bytes
+ *	(examples : 12 -> 16KB ; 17 -> 512KB)
+ */
+#define	COMPRESSIONLEVEL 12
+
+/*
+ * NOTCOMPRESSIBLE_CONFIRMATION: Decreasing this value will make the
+ *	algorithm skip faster data segments considered "incompressible".
+ *	This may decrease compression ratio dramatically, but will be
+ *	faster on incompressible data. Increasing this value will make
+ *	the algorithm search more before declaring a segment "incompressible".
+ *	This could improve compression a bit, but will be slower on
+ *	incompressible data. The default value (6) is recommended.
+ */
+#define	NOTCOMPRESSIBLE_CONFIRMATION 6
+
+/*
+ * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE: This will provide a boost to
+ * performance for big-endian CPUs, but the resulting compressed stream
+ * will be incompatible with little-endian CPUs. You can set this option
+ * to 1 in situations where data will stay within a closed environment.
+ * This option is useless on little-endian CPUs (such as x86).
+ */
+/* #define	BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */
+
+/*
+ * CPU Feature Detection
+ */
+
+/* 32 or 64 bits ? */
+#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || \
+    defined(__amd64) || defined(__ppc64__) || defined(_WIN64) || \
+    defined(__LP64__) || defined(_LP64))
+#define	LZ4_ARCH64 1
+#else
+#define	LZ4_ARCH64 0
+#endif
+
+/*
+ * Limits the amount of stack space that the algorithm may consume to hold
+ * the compression lookup table. The value `9' here means we'll never use
+ * more than 2k of stack (see above for a description of COMPRESSIONLEVEL).
+ * If more memory is needed, it is allocated from the heap.
+ */
+/* FreeBSD: Use heap for all platforms for now */
+#define	STACKLIMIT 0
+
+/*
+ * Little Endian or Big Endian?
+ * Note: overwrite the below #define if you know your architecture endianness.
+ */
+#if BYTE_ORDER == BIG_ENDIAN
+#define	LZ4_BIG_ENDIAN 1
+#else
+/*
+ * Little Endian assumed. PDP Endian and other very rare endian format
+ * are unsupported.
+ */
+#endif
+
+/*
+ * Unaligned memory access is automatically enabled for "common" CPUs,
+ * such as x86. For other CPUs, the compiler will be more cautious and
+ * insert extra code to ensure aligned access is respected. If you know
+ * your target CPU supports unaligned memory access, you may want to
+ * force this option manually to improve performance.
+ */
+#if defined(__ARM_FEATURE_UNALIGNED)
+#define	LZ4_FORCE_UNALIGNED_ACCESS 1
+#endif
+
+/*
+ * FreeBSD: can't use GCC's __builtin_ctz when using sparc64 because
+ * gcc currently relies on libcompiler_rt.
+ *
+ * TODO: revisit this when situation changes.
+ */
+#if defined(__sparc64__)
+#define	LZ4_FORCE_SW_BITCOUNT
+#endif
+
+/*
+ * Compiler Options
+ */
+#if __STDC_VERSION__ >= 199901L	/* C99 */
+/* "restrict" is a known keyword */
+#else
+/* Disable restrict */
+#define	restrict
+#endif
+
+#define	lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \
+	(((x) & 0xffu) << 8)))
+
+#define	expect(expr, value)    (__builtin_expect((expr), (value)))
+
+#if defined(likely)
+#undef likely
+#endif
+#if defined(unlikely)
+#undef unlikely
+#endif
+
+#define	likely(expr)	expect((expr) != 0, 1)
+#define	unlikely(expr)	expect((expr) != 0, 0)
+
+/* Basic types */
+#define	BYTE	uint8_t
+#define	U16	uint16_t
+#define	U32	uint32_t
+#define	S32	int32_t
+#define	U64	uint64_t
+
+#ifndef LZ4_FORCE_UNALIGNED_ACCESS
+#pragma pack(1)
+#endif
+
+typedef struct _U16_S {
+	U16 v;
+} U16_S;
+typedef struct _U32_S {
+	U32 v;
+} U32_S;
+typedef struct _U64_S {
+	U64 v;
+} U64_S;
+
+#ifndef LZ4_FORCE_UNALIGNED_ACCESS
+#pragma pack()
+#endif
+
+#define	A64(x) (((U64_S *)(x))->v)
+#define	A32(x) (((U32_S *)(x))->v)
+#define	A16(x) (((U16_S *)(x))->v)
+
+/*
+ * Constants
+ */
+#define	MINMATCH 4
+
+#define	HASH_LOG COMPRESSIONLEVEL
+#define	HASHTABLESIZE (1 << HASH_LOG)
+#define	HASH_MASK (HASHTABLESIZE - 1)
+
+#define	SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION > 2 ? \
+	NOTCOMPRESSIBLE_CONFIRMATION : 2)
+
+/*
+ * Defines if memory is allocated into the stack (local variable),
+ * or into the heap (kmem_alloc()).
+ */
+#define	HEAPMODE (HASH_LOG > STACKLIMIT)
+#define	COPYLENGTH 8
+#define	LASTLITERALS 5
+#define	MFLIMIT (COPYLENGTH + MINMATCH)
+#define	MINLENGTH (MFLIMIT + 1)
+
+#define	MAXD_LOG 16
+#define	MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+
+#define	ML_BITS 4
+#define	ML_MASK ((1U<<ML_BITS)-1)
+#define	RUN_BITS (8-ML_BITS)
+#define	RUN_MASK ((1U<<RUN_BITS)-1)
+
+
+/*
+ * Architecture-specific macros
+ */
+#if LZ4_ARCH64
+#define	STEPSIZE 8
+#define	UARCH U64
+#define	AARCH A64
+#define	LZ4_COPYSTEP(s, d)	A64(d) = A64(s); d += 8; s += 8;
+#define	LZ4_COPYPACKET(s, d)	LZ4_COPYSTEP(s, d)
+#define	LZ4_SECURECOPY(s, d, e)	if (d < e) LZ4_WILDCOPY(s, d, e)
+#define	HTYPE U32
+#define	INITBASE(base)		const BYTE* const base = ip
+#else /* !LZ4_ARCH64 */
+#define	STEPSIZE 4
+#define	UARCH U32
+#define	AARCH A32
+#define	LZ4_COPYSTEP(s, d)	A32(d) = A32(s); d += 4; s += 4;
+#define	LZ4_COPYPACKET(s, d)	LZ4_COPYSTEP(s, d); LZ4_COPYSTEP(s, d);
+#define	LZ4_SECURECOPY		LZ4_WILDCOPY
+#define	HTYPE const BYTE *
+#define	INITBASE(base)		const int base = 0
+#endif /* !LZ4_ARCH64 */
+
+#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE))
+#define	LZ4_READ_LITTLEENDIAN_16(d, s, p) \
+	{ U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; }
+#define	LZ4_WRITE_LITTLEENDIAN_16(p, i) \
+	{ U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p += 2; }
+#else
+#define	LZ4_READ_LITTLEENDIAN_16(d, s, p) { d = (s) - A16(p); }
+#define	LZ4_WRITE_LITTLEENDIAN_16(p, v)  { A16(p) = v; p += 2; }
+#endif
+
+
+/* Local structures */
+struct refTables {
+	HTYPE hashTable[HASHTABLESIZE];
+};
+
+
+/* Macros */
+#define	LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH * 8) - \
+	HASH_LOG))
+#define	LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p))
+#define	LZ4_WILDCOPY(s, d, e) do { LZ4_COPYPACKET(s, d) } while (d < e);
+#define	LZ4_BLINDCOPY(s, d, l) { BYTE* e = (d) + l; LZ4_WILDCOPY(s, d, e); \
+	d = e; }
+
+
+/* Private functions */
+#if LZ4_ARCH64
+
+static inline int
+LZ4_NbCommonBytes(register U64 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+#if !defined(LZ4_FORCE_SW_BITCOUNT)
+	return (__builtin_clzll(val) >> 3);
+#else
+	int r;
+	if (!(val >> 32)) {
+		r = 4;
+	} else {
+		r = 0;
+		val >>= 32;
+	}
+	if (!(val >> 16)) {
+		r += 2;
+		val >>= 8;
+	} else {
+		val >>= 24;
+	}
+	r += (!val);
+	return (r);
+#endif
+#else
+#if !defined(LZ4_FORCE_SW_BITCOUNT)
+	return (__builtin_ctzll(val) >> 3);
+#else
+	static const int DeBruijnBytePos[64] =
+	    { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5,
+		3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5,
+		5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4,
+		4, 5, 7, 2, 6, 5, 7, 6, 7, 7
+	};
+	return DeBruijnBytePos[((U64) ((val & -val) * 0x0218A392CDABBD3F)) >>
+	    58];
+#endif
+#endif
+}
+
+#else
+
+static inline int
+LZ4_NbCommonBytes(register U32 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+#if !defined(LZ4_FORCE_SW_BITCOUNT)
+	return (__builtin_clz(val) >> 3);
+#else
+	int r;
+	if (!(val >> 16)) {
+		r = 2;
+		val >>= 8;
+	} else {
+		r = 0;
+		val >>= 24;
+	}
+	r += (!val);
+	return (r);
+#endif
+#else
+#if !defined(LZ4_FORCE_SW_BITCOUNT)
+	return (__builtin_ctz(val) >> 3);
+#else
+	static const int DeBruijnBytePos[32] = {
+		0, 0, 3, 0, 3, 1, 3, 0,
+		3, 2, 2, 1, 3, 2, 0, 1,
+		3, 3, 1, 2, 2, 2, 2, 0,
+		3, 1, 2, 0, 1, 0, 1, 1
+	};
+	return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >>
+	    27];
+#endif
+#endif
+}
+
+#endif
+
+/* Public functions */
+
+static int
+LZ4_compressBound(int isize)
+{
+	return (isize + (isize / 255) + 16);
+}
+
+/* Compression functions */
+
+/*ARGSUSED*/
+static int
+LZ4_compressCtx(void *ctx, const char *source, char *dest, int isize,
+    int osize)
+{
+#if HEAPMODE
+	struct refTables *srt = (struct refTables *)ctx;
+	HTYPE *HashTable = (HTYPE *) (srt->hashTable);
+#else
+	HTYPE HashTable[HASHTABLESIZE] = { 0 };
+#endif
+
+	const BYTE *ip = (BYTE *) source;
+	INITBASE(base);
+	const BYTE *anchor = ip;
+	const BYTE *const iend = ip + isize;
+	const BYTE *const oend = (BYTE *) dest + osize;
+	const BYTE *const mflimit = iend - MFLIMIT;
+#define	matchlimit (iend - LASTLITERALS)
+
+	BYTE *op = (BYTE *) dest;
+
+	int len, length;
+	const int skipStrength = SKIPSTRENGTH;
+	U32 forwardH;
+
+
+	/* Init */
+	if (isize < MINLENGTH)
+		goto _last_literals;
+
+	/* First Byte */
+	HashTable[LZ4_HASH_VALUE(ip)] = ip - base;
+	ip++;
+	forwardH = LZ4_HASH_VALUE(ip);
+
+	/* Main Loop */
+	for (;;) {
+		int findMatchAttempts = (1U << skipStrength) + 3;
+		const BYTE *forwardIp = ip;
+		const BYTE *ref;
+		BYTE *token;
+
+		/* Find a match */
+		do {
+			U32 h = forwardH;
+			int step = findMatchAttempts++ >> skipStrength;
+			ip = forwardIp;
+			forwardIp = ip + step;
+
+			if unlikely(forwardIp > mflimit) {
+				goto _last_literals;
+			}
+
+			forwardH = LZ4_HASH_VALUE(forwardIp);
+			ref = base + HashTable[h];
+			HashTable[h] = ip - base;
+
+		} while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip)));
+
+		/* Catch up */
+		while ((ip > anchor) && (ref > (BYTE *) source) &&
+		    unlikely(ip[-1] == ref[-1])) {
+			ip--;
+			ref--;
+		}
+
+		/* Encode Literal length */
+		length = ip - anchor;
+		token = op++;
+
+		/* Check output limit */
+		if unlikely(op + length + (2 + 1 + LASTLITERALS) +
+		    (length >> 8) > oend)
+			return (0);
+
+		if (length >= (int)RUN_MASK) {
+			*token = (RUN_MASK << ML_BITS);
+			len = length - RUN_MASK;
+			for (; len > 254; len -= 255)
+				*op++ = 255;
+			*op++ = (BYTE)len;
+		} else
+			*token = (length << ML_BITS);
+
+		/* Copy Literals */
+		LZ4_BLINDCOPY(anchor, op, length);
+
+		_next_match:
+		/* Encode Offset */
+		LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref);
+
+		/* Start Counting */
+		ip += MINMATCH;
+		ref += MINMATCH;	/* MinMatch verified */
+		anchor = ip;
+		while likely(ip < matchlimit - (STEPSIZE - 1)) {
+			UARCH diff = AARCH(ref) ^ AARCH(ip);
+			if (!diff) {
+				ip += STEPSIZE;
+				ref += STEPSIZE;
+				continue;
+			}
+			ip += LZ4_NbCommonBytes(diff);
+			goto _endCount;
+		}
+#if LZ4_ARCH64
+		if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) {
+			ip += 4;
+			ref += 4;
+		}
+#endif
+		if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) {
+			ip += 2;
+			ref += 2;
+		}
+		if ((ip < matchlimit) && (*ref == *ip))
+			ip++;
+		_endCount:
+
+		/* Encode MatchLength */
+		len = (ip - anchor);
+		/* Check output limit */
+		if unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)
+			return (0);
+		if (len >= (int)ML_MASK) {
+			*token += ML_MASK;
+			len -= ML_MASK;
+			for (; len > 509; len -= 510) {
+				*op++ = 255;
+				*op++ = 255;
+			}
+			if (len > 254) {
+				len -= 255;
+				*op++ = 255;
+			}
+			*op++ = (BYTE)len;
+		} else
+			*token += len;
+
+		/* Test end of chunk */
+		if (ip > mflimit) {
+			anchor = ip;
+			break;
+		}
+		/* Fill table */
+		HashTable[LZ4_HASH_VALUE(ip - 2)] = ip - 2 - base;
+
+		/* Test next position */
+		ref = base + HashTable[LZ4_HASH_VALUE(ip)];
+		HashTable[LZ4_HASH_VALUE(ip)] = ip - base;
+		if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) {
+			token = op++;
+			*token = 0;
+			goto _next_match;
+		}
+		/* Prepare next loop */
+		anchor = ip++;
+		forwardH = LZ4_HASH_VALUE(ip);
+	}
+
+	_last_literals:
+	/* Encode Last Literals */
+	{
+		int lastRun = iend - anchor;
+		if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) >
+		    oend)
+			return (0);
+		if (lastRun >= (int)RUN_MASK) {
+			*op++ = (RUN_MASK << ML_BITS);
+			lastRun -= RUN_MASK;
+			for (; lastRun > 254; lastRun -= 255) {
+				*op++ = 255;
+			}
+			*op++ = (BYTE)lastRun;
+		} else
+			*op++ = (lastRun << ML_BITS);
+		(void) memcpy(op, anchor, iend - anchor);
+		op += iend - anchor;
+	}
+
+	/* End */
+	return (int)(((char *)op) - dest);
+}
+
+
+
+/* Note : this function is valid only if isize < LZ4_64KLIMIT */
+#define	LZ4_64KLIMIT ((1 << 16) + (MFLIMIT - 1))
+#define	HASHLOG64K (HASH_LOG + 1)
+#define	HASH64KTABLESIZE (1U << HASHLOG64K)
+#define	LZ4_HASH64K_FUNCTION(i)	(((i) * 2654435761U) >> ((MINMATCH*8) - \
+	HASHLOG64K))
+#define	LZ4_HASH64K_VALUE(p)	LZ4_HASH64K_FUNCTION(A32(p))
+
+/*ARGSUSED*/
+static int
+LZ4_compress64kCtx(void *ctx, const char *source, char *dest, int isize,
+    int osize)
+{
+#if HEAPMODE
+	struct refTables *srt = (struct refTables *)ctx;
+	U16 *HashTable = (U16 *) (srt->hashTable);
+#else
+	U16 HashTable[HASH64KTABLESIZE] = { 0 };
+#endif
+
+	const BYTE *ip = (BYTE *) source;
+	const BYTE *anchor = ip;
+	const BYTE *const base = ip;
+	const BYTE *const iend = ip + isize;
+	const BYTE *const oend = (BYTE *) dest + osize;
+	const BYTE *const mflimit = iend - MFLIMIT;
+#define	matchlimit (iend - LASTLITERALS)
+
+	BYTE *op = (BYTE *) dest;
+
+	int len, length;
+	const int skipStrength = SKIPSTRENGTH;
+	U32 forwardH;
+
+	/* Init */
+	if (isize < MINLENGTH)
+		goto _last_literals;
+
+	/* First Byte */
+	ip++;
+	forwardH = LZ4_HASH64K_VALUE(ip);
+
+	/* Main Loop */
+	for (;;) {
+		int findMatchAttempts = (1U << skipStrength) + 3;
+		const BYTE *forwardIp = ip;
+		const BYTE *ref;
+		BYTE *token;
+
+		/* Find a match */
+		do {
+			U32 h = forwardH;
+			int step = findMatchAttempts++ >> skipStrength;
+			ip = forwardIp;
+			forwardIp = ip + step;
+
+			if (forwardIp > mflimit) {
+				goto _last_literals;
+			}
+
+			forwardH = LZ4_HASH64K_VALUE(forwardIp);
+			ref = base + HashTable[h];
+			HashTable[h] = ip - base;
+
+		} while (A32(ref) != A32(ip));
+
+		/* Catch up */
+		while ((ip > anchor) && (ref > (BYTE *) source) &&
+		    (ip[-1] == ref[-1])) {
+			ip--;
+			ref--;
+		}
+
+		/* Encode Literal length */
+		length = ip - anchor;
+		token = op++;
+
+		/* Check output limit */
+		if unlikely(op + length + (2 + 1 + LASTLITERALS) +
+		    (length >> 8) > oend)
+			return (0);
+
+		if (length >= (int)RUN_MASK) {
+			*token = (RUN_MASK << ML_BITS);
+			len = length - RUN_MASK;
+			for (; len > 254; len -= 255)
+				*op++ = 255;
+			*op++ = (BYTE)len;
+		} else
+			*token = (length << ML_BITS);
+
+		/* Copy Literals */
+		LZ4_BLINDCOPY(anchor, op, length);
+
+		_next_match:
+		/* Encode Offset */
+		LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref);
+
+		/* Start Counting */
+		ip += MINMATCH;
+		ref += MINMATCH;	/* MinMatch verified */
+		anchor = ip;
+		while (ip < matchlimit - (STEPSIZE - 1)) {
+			UARCH diff = AARCH(ref) ^ AARCH(ip);
+			if (!diff) {
+				ip += STEPSIZE;
+				ref += STEPSIZE;
+				continue;
+			}
+			ip += LZ4_NbCommonBytes(diff);
+			goto _endCount;
+		}
+#if LZ4_ARCH64
+		if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) {
+			ip += 4;
+			ref += 4;
+		}
+#endif
+		if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) {
+			ip += 2;
+			ref += 2;
+		}
+		if ((ip < matchlimit) && (*ref == *ip))
+			ip++;
+		_endCount:
+
+		/* Encode MatchLength */
+		len = (ip - anchor);
+		/* Check output limit */
+		if unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)
+			return (0);
+		if (len >= (int)ML_MASK) {
+			*token += ML_MASK;
+			len -= ML_MASK;
+			for (; len > 509; len -= 510) {
+				*op++ = 255;
+				*op++ = 255;
+			}
+			if (len > 254) {
+				len -= 255;
+				*op++ = 255;
+			}
+			*op++ = (BYTE)len;
+		} else
+			*token += len;
+
+		/* Test end of chunk */
+		if (ip > mflimit) {
+			anchor = ip;
+			break;
+		}
+		/* Fill table */
+		HashTable[LZ4_HASH64K_VALUE(ip - 2)] = ip - 2 - base;
+
+		/* Test next position */
+		ref = base + HashTable[LZ4_HASH64K_VALUE(ip)];
+		HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base;
+		if (A32(ref) == A32(ip)) {
+			token = op++;
+			*token = 0;
+			goto _next_match;
+		}
+		/* Prepare next loop */
+		anchor = ip++;
+		forwardH = LZ4_HASH64K_VALUE(ip);
+	}
+
+	_last_literals:
+	/* Encode Last Literals */
+	{
+		int lastRun = iend - anchor;
+		if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) >
+		    oend)
+			return (0);
+		if (lastRun >= (int)RUN_MASK) {
+			*op++ = (RUN_MASK << ML_BITS);
+			lastRun -= RUN_MASK;
+			for (; lastRun > 254; lastRun -= 255)
+				*op++ = 255;
+			*op++ = (BYTE)lastRun;
+		} else
+			*op++ = (lastRun << ML_BITS);
+		(void) memcpy(op, anchor, iend - anchor);
+		op += iend - anchor;
+	}
+
+	/* End */
+	return (int)(((char *)op) - dest);
+}
+
+static int
+real_LZ4_compress(const char *source, char *dest, int isize, int osize)
+{
+#if HEAPMODE
+#ifdef __FreeBSD__
+	void *ctx = kmem_cache_alloc(lz4_ctx_cache, KM_NOSLEEP);
+#else
+	void *ctx = kmem_zalloc(sizeof (struct refTables), KM_NOSLEEP);
+#endif
+	int result;
+
+	/*
+	 * out of kernel memory, gently fall through - this will disable
+	 * compression in zio_compress_data
+	 */
+	if (ctx == NULL)
+		return (0);
+
+	bzero(ctx, sizeof (struct refTables));
+	if (isize < LZ4_64KLIMIT)
+		result = LZ4_compress64kCtx(ctx, source, dest, isize, osize);
+	else
+		result = LZ4_compressCtx(ctx, source, dest, isize, osize);
+
+#ifdef __FreeBSD__
+	kmem_cache_free(lz4_ctx_cache, ctx);
+#else
+	kmem_free(ctx, sizeof (struct refTables));
+#endif
+	return (result);
+#else
+	if (isize < (int)LZ4_64KLIMIT)
+		return (LZ4_compress64kCtx(NULL, source, dest, isize, osize));
+	return (LZ4_compressCtx(NULL, source, dest, isize, osize));
+#endif
+}
+
+/* Decompression functions */
+
+/*
+ * Note: The decoding function LZ4_uncompress_unknownOutputSize() is safe
+ *	against "buffer overflow" attack type. They will never write nor
+ *	read outside of the provided output buffers.
+ *	LZ4_uncompress_unknownOutputSize() also insures that it will never
+ *	read outside of the input buffer.  A corrupted input will produce
+ *	an error result, a negative int, indicating the position of the
+ *	error within input stream.
+ */
+
+static int
+LZ4_uncompress_unknownOutputSize(const char *source, char *dest, int isize,
+    int maxOutputSize)
+{
+	/* Local Variables */
+	const BYTE *restrict ip = (const BYTE *) source;
+	const BYTE *const iend = ip + isize;
+	const BYTE *ref;
+
+	BYTE *op = (BYTE *) dest;
+	BYTE *const oend = op + maxOutputSize;
+	BYTE *cpy;
+
+	size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
+#if LZ4_ARCH64
+	size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
+#endif
+
+	/* Main Loop */
+	while (ip < iend) {
+		unsigned token;
+		size_t length;
+
+		/* get runlength */
+		token = *ip++;
+		if ((length = (token >> ML_BITS)) == RUN_MASK) {
+			int s = 255;
+			while ((ip < iend) && (s == 255)) {
+				s = *ip++;
+				length += s;
+			}
+		}
+		/* copy literals */
+		cpy = op + length;
+		/* CORNER-CASE: cpy might overflow. */
+		if (cpy < op)
+			goto _output_error;	/* cpy was overflowed, bail! */
+		if ((cpy > oend - COPYLENGTH) ||
+		    (ip + length > iend - COPYLENGTH)) {
+			if (cpy > oend)
+				/* Error: writes beyond output buffer */
+				goto _output_error;
+			if (ip + length != iend)
+				/*
+				 * Error: LZ4 format requires to consume all
+				 * input at this stage
+				 */
+				goto _output_error;
+			(void) memcpy(op, ip, length);
+			op += length;
+			/* Necessarily EOF, due to parsing restrictions */
+			break;
+		}
+		LZ4_WILDCOPY(ip, op, cpy);
+		ip -= (op - cpy);
+		op = cpy;
+
+		/* get offset */
+		LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip);
+		ip += 2;
+		if (ref < (BYTE * const) dest)
+			/*
+			 * Error: offset creates reference outside of
+			 * destination buffer
+			 */
+			goto _output_error;
+
+		/* get matchlength */
+		if ((length = (token & ML_MASK)) == ML_MASK) {
+			while (ip < iend) {
+				int s = *ip++;
+				length += s;
+				if (s == 255)
+					continue;
+				break;
+			}
+		}
+		/* copy repeated sequence */
+		if unlikely(op - ref < STEPSIZE) {
+#if LZ4_ARCH64
+			size_t dec64 = dec64table[op-ref];
+#else
+			const int dec64 = 0;
+#endif
+			op[0] = ref[0];
+			op[1] = ref[1];
+			op[2] = ref[2];
+			op[3] = ref[3];
+			op += 4;
+			ref += 4;
+			ref -= dec32table[op-ref];
+			A32(op) = A32(ref);
+			op += STEPSIZE - 4;
+			ref -= dec64;
+		} else {
+			LZ4_COPYSTEP(ref, op);
+		}
+		cpy = op + length - (STEPSIZE - 4);
+		if (cpy > oend - COPYLENGTH) {
+			if (cpy > oend)
+				/*
+				 * Error: request to write outside of
+				 * destination buffer
+				 */
+				goto _output_error;
+			LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
+			while (op < cpy)
+				*op++ = *ref++;
+			op = cpy;
+			if (op == oend)
+				/*
+				 * Check EOF (should never happen, since
+				 * last 5 bytes are supposed to be literals)
+				 */
+				goto _output_error;
+			continue;
+		}
+		LZ4_SECURECOPY(ref, op, cpy);
+		op = cpy;	/* correction */
+	}
+
+	/* end of decoding */
+	return (int)(((char *)op) - dest);
+
+	/* write overflow error detected */
+	_output_error:
+	return (int)(-(((char *)ip) - source));
+}
+
+#ifdef __FreeBSD__
+
+extern void
+lz4_init(void)
+{
+
+#if HEAPMODE
+	lz4_ctx_cache = kmem_cache_create("lz4_ctx", sizeof (struct refTables),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+#endif
+}
+
+extern void
+lz4_fini(void)
+{
+
+#if HEAPMODE
+	kmem_cache_destroy(lz4_ctx_cache);
+#endif
+}
+
+#endif /* __FreeBSD__ */
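
For reference, a minimal round-trip sketch of the wrapper entry points above;
the helper name and buffer sizing are hypothetical.  lz4_compress() returns
s_len to signal that the data did not compress, and the 4-byte big-endian
length header it prepends is internal to the wrapper:

static int
example_lz4_roundtrip(void *src, size_t s_len, void *work, size_t w_len,
    void *out)
{
	size_t c_len;

	/* Destination must leave room for the length header. */
	ASSERT(w_len >= s_len + sizeof (uint32_t));

	c_len = lz4_compress(src, work, s_len, w_len, 0);
	if (c_len >= s_len)
		return (-1);	/* incompressible; caller stores raw data */

	/* Returns 0 on success, non-zero for a corrupt/truncated stream. */
	return (lz4_decompress(work, out, c_len, s_len, 0));
}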
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 lzjb.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c	27 Feb 2010 22:30:59 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c	2 Nov 2013 01:24:10 -0000
@@ -20,12 +20,11 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
- * We keep our own copy of this algorithm for 2 main reasons:
+ * We keep our own copy of this algorithm for 3 main reasons:
  *	1. If we didn't, anyone modifying common/os/compress.c would
  *         directly break our on disk format
  *	2. Our version of lzjb does not have a number of checks that the
@@ -33,11 +32,13 @@
  *	3. We initialize the lempel to ensure deterministic results,
  *	   so that identical blocks can always be deduplicated.
  * In particular, we are adding the "feature" that compress() can
- * take a destination buffer size and return -1 if the data will not
- * compress to d_len or less.
+ * take a destination buffer size and return the compressed length, or the
+ * source length if compression would overflow the destination buffer.
  */
 
+#include <sys/zfs_context.h>
 #include <sys/types.h>
+#include <sys/param.h>
 
 #define	MATCH_BITS	6
 #define	MATCH_MIN	3
@@ -51,7 +52,8 @@ lzjb_compress(void *s_start, void *d_sta
 {
 	uchar_t *src = s_start;
 	uchar_t *dst = d_start;
-	uchar_t *cpy, *copymap;
+	uchar_t *cpy;
+	uchar_t *copymap = NULL;
 	int copymask = 1 << (NBBY - 1);
 	int mlen, offset, hash;
 	uint16_t *hp;
@@ -100,7 +102,8 @@ lzjb_decompress(void *s_start, void *d_s
 	uchar_t *src = s_start;
 	uchar_t *dst = d_start;
 	uchar_t *d_end = (uchar_t *)d_start + d_len;
-	uchar_t *cpy, copymap;
+	uchar_t *cpy;
+	uchar_t copymap = 0;
 	int copymask = 1 << (NBBY - 1);
 
 	while (dst < d_end) {
@@ -114,7 +117,9 @@ lzjb_decompress(void *s_start, void *d_s
 			src += 2;
 			if ((cpy = dst - offset) < (uchar_t *)d_start)
 				return (-1);
-			while (--mlen >= 0 && dst < d_end)
+			if (mlen > (d_end - dst))
+				mlen = d_end - dst;
+			while (--mlen >= 0)
 				*dst++ = *cpy++;
 		} else {
 			*dst++ = *src++;
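A hedged usage sketch of the lzjb_compress() contract described in the comment change above. The first two arguments match the prototype visible in the hunk header; the remaining arguments and the caller logic are assumptions, illustrative only. A return value equal to s_len signals that the data would not fit in d_len, so the caller keeps the block uncompressed:

	size_t c_len;

	c_len = lzjb_compress(src, dst, s_len, d_len, 0);
	if (c_len == s_len) {
		/* Would overflow d_len; store the block uncompressed. */
		(void) memcpy(dst, src, s_len);
	}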
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 metaslab.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c	27 Feb 2010 22:31:01 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c	27 Mar 2017 06:19:45 -0000
@@ -19,8 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
@@ -30,14 +32,107 @@
 #include <sys/metaslab_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zfeature.h>
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
+
+#define	GANG_ALLOCATION(flags) \
+	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
 
 uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
+    &metaslab_gang_bang, 0,
+    "Force gang block allocation for blocks larger than or equal to this value");
+
+/*
+ * The in-core space map representation is more compact than its on-disk form.
+ * The zfs_condense_pct determines how much more compact the in-core
+ * space map representation must be before we compact it on-disk.
+ * Values should be greater than or equal to 100.
+ */
+int zfs_condense_pct = 200;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
+    &zfs_condense_pct, 0,
+    "Condense on-disk spacemap when it is more than this many percents"
+    " of in-memory counterpart");
+
+/*
+ * Condensing a metaslab is not guaranteed to actually reduce the amount of
+ * space used on disk. In particular, a space map uses data in increments of
+ * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
+ * same number of blocks after condensing. Since the goal of condensing is to
+ * reduce the number of IOPs required to read the space map, we only want to
+ * condense when we can be sure we will reduce the number of blocks used by the
+ * space map. Unfortunately, we cannot precisely compute whether or not this is
+ * the case in metaslab_should_condense since we are holding ms_lock. Instead,
+ * we apply the following heuristic: do not condense a spacemap unless the
+ * uncondensed size consumes more than zfs_metaslab_condense_block_threshold
+ * blocks.
+ */
+int zfs_metaslab_condense_block_threshold = 4;
+
+/*
+ * The zfs_mg_noalloc_threshold defines which metaslab groups should
+ * be eligible for allocation. The value is defined as a percentage of
+ * free space. Metaslab groups that have more free space than
+ * zfs_mg_noalloc_threshold are always eligible for allocations. Once
+ * a metaslab group's free space is less than or equal to the
+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
+ * groups are allowed to accept allocations. Gang blocks are always
+ * eligible to allocate on any metaslab group. The default value of 0 means
+ * no metaslab group will be excluded based on this criterion.
+ */
+int zfs_mg_noalloc_threshold = 0;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
+    &zfs_mg_noalloc_threshold, 0,
+    "Percentage of metaslab group size that should be free"
+    " to make it eligible for allocation");
+
+/*
+ * Metaslab groups are considered eligible for allocations if their
+ * fragmentation metric (measured as a percentage) is less than or equal to
+ * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
+ * then it will be skipped unless all metaslab groups within the metaslab
+ * class have also crossed this threshold.
+ */
+int zfs_mg_fragmentation_threshold = 85;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
+    &zfs_mg_fragmentation_threshold, 0,
+    "Percentage of metaslab group size that should be considered "
+    "eligible for allocations unless all metaslab groups within the metaslab class "
+    "have also crossed this threshold");
+
+/*
+ * Allow metaslabs to keep their active state as long as their fragmentation
+ * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
+ * active metaslab that exceeds this threshold will no longer keep its active
+ * status allowing better metaslabs to be selected.
+ */
+int zfs_metaslab_fragmentation_threshold = 70;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
+    &zfs_metaslab_fragmentation_threshold, 0,
+    "Maximum percentage of metaslab fragmentation level to keep their active state");
+
+/*
+ * When set, load all metaslabs when the pool is first opened.
+ */
+int metaslab_debug_load = 0;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
+    &metaslab_debug_load, 0,
+    "Load all metaslabs when pool is first opened");
 
 /*
- * Metaslab debugging: when set, keeps all space maps in core to verify frees.
+ * When set, prevent metaslabs from being unloaded.
  */
-static int metaslab_debug = 0;
+int metaslab_debug_unload = 0;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
+    &metaslab_debug_unload, 0,
+    "Prevent metaslabs from being unloaded");
 
 /*
  * Minimum size which forces the dynamic allocator to change
@@ -45,31 +140,123 @@ static int metaslab_debug = 0;
  * an allocation of this size then it switches to using more
  * aggressive strategy (i.e search by size rather than offset).
  */
-uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
+uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
+    &metaslab_df_alloc_threshold, 0,
+    "Minimum size which forces the dynamic allocator to change it's allocation strategy");
 
 /*
  * The minimum free space, in percent, which must be available
  * in a space map to continue allocations in a first-fit fashion.
- * Once the space_map's free space drops below this level we dynamically
+ * Once the space map's free space drops below this level we dynamically
  * switch to using best-fit allocations.
  */
 int metaslab_df_free_pct = 4;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
+    &metaslab_df_free_pct, 0,
+    "The minimum free space, in percent, which must be available in a "
+    "space map to continue allocations in a first-fit fashion");
 
 /*
  * A metaslab is considered "free" if it contains a contiguous
  * segment which is greater than metaslab_min_alloc_size.
  */
 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
+    &metaslab_min_alloc_size, 0,
+    "A metaslab is considered \"free\" if it contains a contiguous "
+    "segment which is greater than vfs.zfs.metaslab.min_alloc_size");
+
+/*
+ * Percentage of all cpus that can be used by the metaslab taskq.
+ */
+int metaslab_load_pct = 50;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
+    &metaslab_load_pct, 0,
+    "Percentage of cpus that can be used by the metaslab taskq");
+
+/*
+ * Determines how many txgs a metaslab may remain loaded without having any
+ * allocations from it. As long as a metaslab continues to be used we will
+ * keep it loaded.
+ */
+int metaslab_unload_delay = TXG_SIZE * 2;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
+    &metaslab_unload_delay, 0,
+    "Number of TXGs that an unused metaslab can be kept in memory");
+
+/*
+ * Max number of metaslabs per group to preload.
+ */
+int metaslab_preload_limit = SPA_DVAS_PER_BP;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
+    &metaslab_preload_limit, 0,
+    "Max number of metaslabs per group to preload");
+
+/*
+ * Enable/disable preloading of metaslabs.
+ */
+boolean_t metaslab_preload_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
+    &metaslab_preload_enabled, 0,
+    "Enable metaslab preloading");
+
+/*
+ * Enable/disable fragmentation weighting on metaslabs.
+ */
+boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
+    &metaslab_fragmentation_factor_enabled, 0,
+    "Enable fragmentation weighting on metaslabs");
+
+/*
+ * Enable/disable lba weighting (i.e. outer tracks are given preference).
+ */
+boolean_t metaslab_lba_weighting_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
+    &metaslab_lba_weighting_enabled, 0,
+    "Enable LBA weighting (i.e. outer tracks are given preference)");
+
+/*
+ * Enable/disable metaslab group biasing.
+ */
+boolean_t metaslab_bias_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
+    &metaslab_bias_enabled, 0,
+    "Enable metaslab group biasing");
+
+/*
+ * Enable/disable segment-based metaslab selection.
+ */
+boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
+
+/*
+ * When using segment-based metaslab selection, we will continue
+ * allocating from the active metaslab until we have exhausted
+ * zfs_metaslab_switch_threshold of its buckets.
+ */
+int zfs_metaslab_switch_threshold = 2;
 
 /*
- * Max number of space_maps to prefetch.
+ * Internal switch to enable/disable the metaslab allocation tracing
+ * facility.
  */
-int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
+boolean_t metaslab_trace_enabled = B_TRUE;
 
 /*
- * Percentage bonus multiplier for metaslabs that are in the bonus area.
+ * Maximum entries that the metaslab allocation tracing facility will keep
+ * in a given list when running in non-debug mode. We limit the number
+ * of entries in non-debug mode to prevent us from using up too much memory.
+ * The limit should be sufficiently large that we don't expect any allocation
+ * to ever exceed this value. In debug mode, the system will panic if this
+ * limit is ever reached, allowing for further investigation.
  */
-int metaslab_smo_bonus_pct = 150;
+uint64_t metaslab_trace_max_entries = 5000;
+
+static uint64_t metaslab_weight(metaslab_t *);
+static void metaslab_set_fragmentation(metaslab_t *);
+
+kmem_cache_t *metaslab_alloc_trace_cache;
 
 /*
  * ==========================================================================
@@ -77,7 +264,7 @@ int metaslab_smo_bonus_pct = 150;
  * ==========================================================================
  */
 metaslab_class_t *
-metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
+metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 {
 	metaslab_class_t *mc;
 
@@ -86,6 +273,8 @@ metaslab_class_create(spa_t *spa, space_
 	mc->mc_spa = spa;
 	mc->mc_rotor = NULL;
 	mc->mc_ops = ops;
+	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
+	refcount_create_tracked(&mc->mc_alloc_slots);
 
 	return (mc);
 }
@@ -99,6 +288,8 @@ metaslab_class_destroy(metaslab_class_t 
 	ASSERT(mc->mc_space == 0);
 	ASSERT(mc->mc_dspace == 0);
 
+	refcount_destroy(&mc->mc_alloc_slots);
+	mutex_destroy(&mc->mc_lock);
 	kmem_free(mc, sizeof (metaslab_class_t));
 }
 
@@ -138,6 +329,27 @@ metaslab_class_space_update(metaslab_cla
 	atomic_add_64(&mc->mc_dspace, dspace_delta);
 }
 
+void
+metaslab_class_minblocksize_update(metaslab_class_t *mc)
+{
+	metaslab_group_t *mg;
+	vdev_t *vd;
+	uint64_t minashift = UINT64_MAX;
+
+	if ((mg = mc->mc_rotor) == NULL) {
+		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
+		return;
+	}
+
+	do {
+		vd = mg->mg_vd;
+		if (vd->vdev_ashift < minashift)
+			minashift = vd->vdev_ashift;
+	} while ((mg = mg->mg_next) != mc->mc_rotor);
+
+	mc->mc_minblocksize = 1ULL << minashift;
+}
+
 uint64_t
 metaslab_class_get_alloc(metaslab_class_t *mc)
 {
@@ -162,11 +374,133 @@ metaslab_class_get_dspace(metaslab_class
 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
 }
 
+uint64_t
+metaslab_class_get_minblocksize(metaslab_class_t *mc)
+{
+	return (mc->mc_minblocksize);
+}
+
+void
+metaslab_class_histogram_verify(metaslab_class_t *mc)
+{
+	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+	uint64_t *mc_hist;
+	int i;
+
+	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
+		return;
+
+	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+	    KM_SLEEP);
+
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
+
+		/*
+		 * Skip any holes, uninitialized top-levels, or
+		 * vdevs that are not in this metaslab class.
+		 */
+		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
+		    mg->mg_class != mc) {
+			continue;
+		}
+
+		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+			mc_hist[i] += mg->mg_histogram[i];
+	}
+
+	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
+
+	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+}
+
 /*
- * ==========================================================================
- * Metaslab groups
- * ==========================================================================
+ * Calculate the metaslab class's fragmentation metric. The metric
+ * is weighted based on the space contribution of each metaslab group.
+ * The return value will be a number between 0 and 100 (inclusive), or
+ * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
+ * zfs_frag_table for more information about the metric.
+ */
+uint64_t
+metaslab_class_fragmentation(metaslab_class_t *mc)
+{
+	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+	uint64_t fragmentation = 0;
+
+	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
+
+		/*
+		 * Skip any holes, uninitialized top-levels, or
+		 * vdevs that are not in this metaslab class.
+		 */
+		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
+		    mg->mg_class != mc) {
+			continue;
+		}
+
+		/*
+		 * If a metaslab group does not contain a fragmentation
+		 * metric then just bail out.
+		 */
+		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
+			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+			return (ZFS_FRAG_INVALID);
+		}
+
+		/*
+		 * Determine how much this metaslab_group is contributing
+		 * to the overall pool fragmentation metric.
+		 */
+		fragmentation += mg->mg_fragmentation *
+		    metaslab_group_get_space(mg);
+	}
+	fragmentation /= metaslab_class_get_space(mc);
+
+	ASSERT3U(fragmentation, <=, 100);
+	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+	return (fragmentation);
+}
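A worked example of the space-weighted average computed above (numbers illustrative): a class with a 100GB group at 20% fragmentation and a 300GB group at 60% yields

	(20 * 100 + 60 * 300) / (100 + 300) = 50

so the larger, more fragmented group dominates the pool-wide metric.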
+
+/*
+ * Calculate the amount of expandable space that is available in
+ * this metaslab class. If a device is expanded then its expandable
+ * space will be the amount of allocatable space that is currently not
+ * part of this metaslab class.
  */
+uint64_t
+metaslab_class_expandable_space(metaslab_class_t *mc)
+{
+	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+	uint64_t space = 0;
+
+	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
+
+		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
+		    mg->mg_class != mc) {
+			continue;
+		}
+
+		/*
+		 * Calculate if we have enough space to add additional
+		 * metaslabs. We report the expandable space in terms
+		 * of the metaslab size since that's the unit of expansion.
+		 */
+		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
+		    1ULL << tvd->vdev_ms_shift);
+	}
+	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+	return (space);
+}
+
 static int
 metaslab_compare(const void *x1, const void *x2)
 {
@@ -181,9 +515,9 @@ metaslab_compare(const void *x1, const v
 	/*
 	 * If the weights are identical, use the offset to force uniqueness.
 	 */
-	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
+	if (m1->ms_start < m2->ms_start)
 		return (-1);
-	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
+	if (m1->ms_start > m2->ms_start)
 		return (1);
 
 	ASSERT3P(m1, ==, m2);
@@ -191,6 +525,138 @@ metaslab_compare(const void *x1, const v
 	return (0);
 }
 
+/*
+ * Verify that the space accounting on disk matches the in-core range_trees.
+ */
+void
+metaslab_verify_space(metaslab_t *msp, uint64_t txg)
+{
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+	uint64_t allocated = 0;
+	uint64_t freed = 0;
+	uint64_t sm_free_space, msp_free_space;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+		return;
+
+	/*
+	 * We can only verify the metaslab space when we're called
+	 * from syncing context with a loaded metaslab that has an allocated
+	 * space map. Calling this in non-syncing context does not
+	 * provide a consistent view of the metaslab since we're performing
+	 * allocations in the future.
+	 */
+	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
+	    !msp->ms_loaded)
+		return;
+
+	sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) -
+	    space_map_alloc_delta(msp->ms_sm);
+
+	/*
+	 * Account for future allocations since we would have already
+	 * deducted that space from the ms_freetree.
+	 */
+	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
+		allocated +=
+		    range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]);
+	}
+	freed = range_tree_space(msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]);
+
+	msp_free_space = range_tree_space(msp->ms_tree) + allocated +
+	    msp->ms_deferspace + freed;
+
+	VERIFY3U(sm_free_space, ==, msp_free_space);
+}
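The VERIFY3U above asserts a conservation identity between the on-disk and in-core views. With illustrative numbers for a 1024M metaslab with 400M allocated in the space map and no pending alloc delta:

	sm_free_space  = 1024M - 400M - 0M = 624M
	msp_free_space = ms_tree (584M) + future allocs (20M)
			 + ms_deferspace (10M) + freed (10M) = 624M

Any mismatch means the range trees and the space map have diverged.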
+
+/*
+ * ==========================================================================
+ * Metaslab groups
+ * ==========================================================================
+ */
+/*
+ * Update the allocatable flag and the metaslab group's capacity.
+ * The allocatable flag is set to true if the capacity is below
+ * the zfs_mg_noalloc_threshold or has a fragmentation value that is
+ * greater than zfs_mg_fragmentation_threshold. If a metaslab group
+ * transitions from allocatable to non-allocatable or vice versa then the
+ * metaslab group's class is updated to reflect the transition.
+ */
+static void
+metaslab_group_alloc_update(metaslab_group_t *mg)
+{
+	vdev_t *vd = mg->mg_vd;
+	metaslab_class_t *mc = mg->mg_class;
+	vdev_stat_t *vs = &vd->vdev_stat;
+	boolean_t was_allocatable;
+	boolean_t was_initialized;
+
+	ASSERT(vd == vd->vdev_top);
+
+	mutex_enter(&mg->mg_lock);
+	was_allocatable = mg->mg_allocatable;
+	was_initialized = mg->mg_initialized;
+
+	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
+	    (vs->vs_space + 1);
+
+	mutex_enter(&mc->mc_lock);
+
+	/*
+	 * If the metaslab group was just added then it won't
+	 * have any space until we finish syncing out this txg.
+	 * At that point we will consider it initialized and available
+	 * for allocations.  We also don't consider non-activated
+	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
+	 * to be initialized, because they can't be used for allocation.
+	 */
+	mg->mg_initialized = metaslab_group_initialized(mg);
+	if (!was_initialized && mg->mg_initialized) {
+		mc->mc_groups++;
+	} else if (was_initialized && !mg->mg_initialized) {
+		ASSERT3U(mc->mc_groups, >, 0);
+		mc->mc_groups--;
+	}
+	if (mg->mg_initialized)
+		mg->mg_no_free_space = B_FALSE;
+
+	/*
+	 * A metaslab group is considered allocatable if it has plenty
+	 * of free space or is not heavily fragmented. We only take
+	 * fragmentation into account if the metaslab group has a valid
+	 * fragmentation metric (i.e. a value between 0 and 100).
+	 */
+	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
+	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
+	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
+	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
+
+	/*
+	 * The mc_alloc_groups maintains a count of the number of
+	 * groups in this metaslab class that are still above the
+	 * zfs_mg_noalloc_threshold. This is used by the allocating
+	 * threads to determine if they should avoid allocations to
+	 * a given group. The allocator will avoid allocations to a group
+	 * if that group has reached or is below the zfs_mg_noalloc_threshold
+	 * and there are still other groups that are above the threshold.
+	 * When a group transitions from allocatable to non-allocatable or
+	 * vice versa we update the metaslab class to reflect that change.
+	 * When the mc_alloc_groups value drops to 0 that means that all
+	 * groups have reached the zfs_mg_noalloc_threshold making all groups
+	 * eligible for allocations. This effectively means that all devices
+	 * are balanced again.
+	 */
+	if (was_allocatable && !mg->mg_allocatable)
+		mc->mc_alloc_groups--;
+	else if (!was_allocatable && mg->mg_allocatable)
+		mc->mc_alloc_groups++;
+	mutex_exit(&mc->mc_lock);
+
+	mutex_exit(&mg->mg_lock);
+}
+
 metaslab_group_t *
 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 {
@@ -203,6 +669,12 @@ metaslab_group_create(metaslab_class_t *
 	mg->mg_vd = vd;
 	mg->mg_class = mc;
 	mg->mg_activation_count = 0;
+	mg->mg_initialized = B_FALSE;
+	mg->mg_no_free_space = B_TRUE;
+	refcount_create_tracked(&mg->mg_alloc_queue_depth);
+
+	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
+	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
 
 	return (mg);
 }
@@ -219,8 +691,10 @@ metaslab_group_destroy(metaslab_group_t 
 	 */
 	ASSERT(mg->mg_activation_count <= 0);
 
+	taskq_destroy(mg->mg_taskq);
 	avl_destroy(&mg->mg_metaslab_tree);
 	mutex_destroy(&mg->mg_lock);
+	refcount_destroy(&mg->mg_alloc_queue_depth);
 	kmem_free(mg, sizeof (metaslab_group_t));
 }
 
@@ -241,6 +715,7 @@ metaslab_group_activate(metaslab_group_t
 		return;
 
 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+	metaslab_group_alloc_update(mg);
 
 	if ((mgprev = mc->mc_rotor) == NULL) {
 		mg->mg_prev = mg;
@@ -253,6 +728,7 @@ metaslab_group_activate(metaslab_group_t
 		mgnext->mg_prev = mg;
 	}
 	mc->mc_rotor = mg;
+	metaslab_class_minblocksize_update(mc);
 }
 
 void
@@ -271,6 +747,9 @@ metaslab_group_passivate(metaslab_group_
 		return;
 	}
 
+	taskq_wait(mg->mg_taskq);
+	metaslab_group_alloc_update(mg);
+
 	mgprev = mg->mg_prev;
 	mgnext = mg->mg_next;
 
@@ -284,22 +763,125 @@ metaslab_group_passivate(metaslab_group_
 
 	mg->mg_prev = NULL;
 	mg->mg_next = NULL;
+	metaslab_class_minblocksize_update(mc);
+}
+
+boolean_t
+metaslab_group_initialized(metaslab_group_t *mg)
+{
+	vdev_t *vd = mg->mg_vd;
+	vdev_stat_t *vs = &vd->vdev_stat;
+
+	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
+}
+
+uint64_t
+metaslab_group_get_space(metaslab_group_t *mg)
+{
+	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
+}
+
+void
+metaslab_group_histogram_verify(metaslab_group_t *mg)
+{
+	uint64_t *mg_hist;
+	vdev_t *vd = mg->mg_vd;
+	uint64_t ashift = vd->vdev_ashift;
+	int i;
+
+	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
+		return;
+
+	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+	    KM_SLEEP);
+
+	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
+	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
+
+	for (int m = 0; m < vd->vdev_ms_count; m++) {
+		metaslab_t *msp = vd->vdev_ms[m];
+
+		if (msp->ms_sm == NULL)
+			continue;
+
+		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+			mg_hist[i + ashift] +=
+			    msp->ms_sm->sm_phys->smp_histogram[i];
+	}
+
+	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
+
+	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 }
 
 static void
-metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
+metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 {
+	metaslab_class_t *mc = mg->mg_class;
+	uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	if (msp->ms_sm == NULL)
+		return;
+
+	mutex_enter(&mg->mg_lock);
+	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+		mg->mg_histogram[i + ashift] +=
+		    msp->ms_sm->sm_phys->smp_histogram[i];
+		mc->mc_histogram[i + ashift] +=
+		    msp->ms_sm->sm_phys->smp_histogram[i];
+	}
+	mutex_exit(&mg->mg_lock);
+}
+
+void
+metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
+{
+	metaslab_class_t *mc = mg->mg_class;
+	uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	if (msp->ms_sm == NULL)
+		return;
+
 	mutex_enter(&mg->mg_lock);
+	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+		ASSERT3U(mg->mg_histogram[i + ashift], >=,
+		    msp->ms_sm->sm_phys->smp_histogram[i]);
+		ASSERT3U(mc->mc_histogram[i + ashift], >=,
+		    msp->ms_sm->sm_phys->smp_histogram[i]);
+
+		mg->mg_histogram[i + ashift] -=
+		    msp->ms_sm->sm_phys->smp_histogram[i];
+		mc->mc_histogram[i + ashift] -=
+		    msp->ms_sm->sm_phys->smp_histogram[i];
+	}
+	mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
+{
 	ASSERT(msp->ms_group == NULL);
+	mutex_enter(&mg->mg_lock);
 	msp->ms_group = mg;
 	msp->ms_weight = 0;
 	avl_add(&mg->mg_metaslab_tree, msp);
 	mutex_exit(&mg->mg_lock);
+
+	mutex_enter(&msp->ms_lock);
+	metaslab_group_histogram_add(mg, msp);
+	mutex_exit(&msp->ms_lock);
 }
 
 static void
 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
+	mutex_enter(&msp->ms_lock);
+	metaslab_group_histogram_remove(mg, msp);
+	mutex_exit(&msp->ms_lock);
+
 	mutex_enter(&mg->mg_lock);
 	ASSERT(msp->ms_group == mg);
 	avl_remove(&mg->mg_metaslab_tree, msp);
@@ -312,9 +894,9 @@ metaslab_group_sort(metaslab_group_t *mg
 {
 	/*
 	 * Although in principle the weight can be any value, in
-	 * practice we do not use values in the range [1, 510].
+	 * practice we do not use values in the range [1, 511].
 	 */
-	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
+	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	mutex_enter(&mg->mg_lock);
@@ -326,131 +908,314 @@ metaslab_group_sort(metaslab_group_t *mg
 }
 
 /*
- * ==========================================================================
- * Common allocator routines
- * ==========================================================================
+ * Calculate the fragmentation for a given metaslab group. We can use
+ * a simple average here since all metaslabs within the group must have
+ * the same size. The return value will be a value between 0 and 100
+ * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
+ * group have a fragmentation metric.
  */
-static int
-metaslab_segsize_compare(const void *x1, const void *x2)
+uint64_t
+metaslab_group_fragmentation(metaslab_group_t *mg)
 {
-	const space_seg_t *s1 = x1;
-	const space_seg_t *s2 = x2;
-	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
-	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+	vdev_t *vd = mg->mg_vd;
+	uint64_t fragmentation = 0;
+	uint64_t valid_ms = 0;
 
-	if (ss_size1 < ss_size2)
-		return (-1);
-	if (ss_size1 > ss_size2)
-		return (1);
+	for (int m = 0; m < vd->vdev_ms_count; m++) {
+		metaslab_t *msp = vd->vdev_ms[m];
 
-	if (s1->ss_start < s2->ss_start)
-		return (-1);
-	if (s1->ss_start > s2->ss_start)
-		return (1);
+		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
+			continue;
 
-	return (0);
+		valid_ms++;
+		fragmentation += msp->ms_fragmentation;
+	}
+
+	if (valid_ms <= vd->vdev_ms_count / 2)
+		return (ZFS_FRAG_INVALID);
+
+	fragmentation /= valid_ms;
+	ASSERT3U(fragmentation, <=, 100);
+	return (fragmentation);
 }
 
 /*
- * This is a helper function that can be used by the allocator to find
- * a suitable block to allocate. This will search the specified AVL
- * tree looking for a block that matches the specified criteria.
+ * Determine if a given metaslab group should skip allocations. A metaslab
+ * group should avoid allocations if its free capacity is less than the
+ * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
+ * zfs_mg_fragmentation_threshold and there is at least one metaslab group
+ * that can still handle allocations. If the allocation throttle is enabled
+ * then we skip allocations to devices that have reached their maximum
+ * allocation queue depth unless the selected metaslab group is the only
+ * eligible group remaining.
  */
-static uint64_t
-metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
-    uint64_t align)
+static boolean_t
+metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
+    uint64_t psize)
 {
-	space_seg_t *ss, ssearch;
-	avl_index_t where;
+	spa_t *spa = mg->mg_vd->vdev_spa;
+	metaslab_class_t *mc = mg->mg_class;
 
-	ssearch.ss_start = *cursor;
-	ssearch.ss_end = *cursor + size;
+	/*
+	 * We can only consider skipping this metaslab group if it's
+	 * in the normal metaslab class and there are other metaslab
+	 * groups to select from. Otherwise, we always consider it eligible
+	 * for allocations.
+	 */
+	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
+		return (B_TRUE);
+
+	/*
+	 * If the metaslab group's mg_allocatable flag is set (see comments
+	 * in metaslab_group_alloc_update() for more information) and
+	 * the allocation throttle is disabled then allow allocations to this
+	 * device. However, if the allocation throttle is enabled then
+	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
+	 * to determine if we should allow allocations to this metaslab group.
+	 * If all metaslab groups are no longer considered allocatable
+	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
+	 * gang block size then we allow allocations on this metaslab group
+	 * regardless of the mg_allocatable or throttle settings.
+	 */
+	if (mg->mg_allocatable) {
+		metaslab_group_t *mgp;
+		int64_t qdepth;
+		uint64_t qmax = mg->mg_max_alloc_queue_depth;
 
-	ss = avl_find(t, &ssearch, &where);
-	if (ss == NULL)
-		ss = avl_nearest(t, where, AVL_AFTER);
+		if (!mc->mc_alloc_throttle_enabled)
+			return (B_TRUE);
 
-	while (ss != NULL) {
-		uint64_t offset = P2ROUNDUP(ss->ss_start, align);
+		/*
+		 * If this metaslab group does not have any free space, then
+		 * there is no point in looking further.
+		 */
+		if (mg->mg_no_free_space)
+			return (B_FALSE);
 
-		if (offset + size <= ss->ss_end) {
-			*cursor = offset + size;
-			return (offset);
-		}
-		ss = AVL_NEXT(t, ss);
+		qdepth = refcount_count(&mg->mg_alloc_queue_depth);
+
+		/*
+		 * If this metaslab group is below its qmax or it's
+		 * the only allocatable metaslab group, then attempt
+		 * to allocate from it.
+		 */
+		if (qdepth < qmax || mc->mc_alloc_groups == 1)
+			return (B_TRUE);
+		ASSERT3U(mc->mc_alloc_groups, >, 1);
+
+		/*
+		 * Since this metaslab group is at or over its qmax, we
+		 * need to determine if there are metaslab groups after this
+		 * one that might be able to handle this allocation. This is
+		 * racy since we can't hold the locks for all metaslab
+		 * groups at the same time when we make this check.
+		 */
+		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
+			qmax = mgp->mg_max_alloc_queue_depth;
+
+			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
+
+			/*
+			 * If there is another metaslab group that
+			 * might be able to handle the allocation, then
+			 * we return false so that we skip this group.
+			 */
+			if (qdepth < qmax && !mgp->mg_no_free_space)
+				return (B_FALSE);
+		}
+
+		/*
+		 * We didn't find another group to handle the allocation
+		 * so we can't skip this metaslab group even though
+		 * we are at or over our qmax.
+		 */
+		return (B_TRUE);
+
+	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
+		return (B_TRUE);
 	}
+	return (B_FALSE);
+}
 
-	/*
-	 * If we know we've searched the whole map (*cursor == 0), give up.
-	 * Otherwise, reset the cursor to the beginning and try again.
-	 */
-	if (*cursor == 0)
-		return (-1ULL);
+/*
+ * ==========================================================================
+ * Range tree callbacks
+ * ==========================================================================
+ */
 
-	*cursor = 0;
-	return (metaslab_block_picker(t, cursor, size, align));
+/*
+ * Comparison function for the private size-ordered tree. Tree is sorted
+ * by size, larger sizes at the end of the tree.
+ */
+static int
+metaslab_rangesize_compare(const void *x1, const void *x2)
+{
+	const range_seg_t *r1 = x1;
+	const range_seg_t *r2 = x2;
+	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
+
+	if (rs_size1 < rs_size2)
+		return (-1);
+	if (rs_size1 > rs_size2)
+		return (1);
+
+	if (r1->rs_start < r2->rs_start)
+		return (-1);
+
+	if (r1->rs_start > r2->rs_start)
+		return (1);
+
+	return (0);
 }
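For illustration, three segments ordered by the comparator above, size first with rs_start as the tie-breaker (values illustrative):

	(0K, 4K)  ->  (10K, 14K)  ->  (20K, 36K)
	 4K long       4K long        16K long

so avl_last() of this tree is always the largest free segment.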
 
+/*
+ * Create any block allocator specific components. The current allocators
+ * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
+ */
 static void
-metaslab_pp_load(space_map_t *sm)
+metaslab_rt_create(range_tree_t *rt, void *arg)
 {
-	space_seg_t *ss;
-
-	ASSERT(sm->sm_ppd == NULL);
-	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+	metaslab_t *msp = arg;
 
-	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
-	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
-	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT(msp->ms_tree == NULL);
 
-	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-		avl_add(sm->sm_pp_root, ss);
+	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
+	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 }
 
+/*
+ * Destroy the block allocator specific components.
+ */
 static void
-metaslab_pp_unload(space_map_t *sm)
+metaslab_rt_destroy(range_tree_t *rt, void *arg)
 {
-	void *cookie = NULL;
+	metaslab_t *msp = arg;
 
-	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
-	sm->sm_ppd = NULL;
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT3P(msp->ms_tree, ==, rt);
+	ASSERT0(avl_numnodes(&msp->ms_size_tree));
 
-	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
-		/* tear down the tree */
-	}
+	avl_destroy(&msp->ms_size_tree);
+}
+
+static void
+metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+	metaslab_t *msp = arg;
 
-	avl_destroy(sm->sm_pp_root);
-	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
-	sm->sm_pp_root = NULL;
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT3P(msp->ms_tree, ==, rt);
+	VERIFY(!msp->ms_condensing);
+	avl_add(&msp->ms_size_tree, rs);
 }
 
-/* ARGSUSED */
 static void
-metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
-	/* No need to update cursor */
+	metaslab_t *msp = arg;
+
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT3P(msp->ms_tree, ==, rt);
+	VERIFY(!msp->ms_condensing);
+	avl_remove(&msp->ms_size_tree, rs);
 }
 
-/* ARGSUSED */
 static void
-metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_rt_vacate(range_tree_t *rt, void *arg)
 {
-	/* No need to update cursor */
+	metaslab_t *msp = arg;
+
+	ASSERT3P(rt->rt_arg, ==, msp);
+	ASSERT3P(msp->ms_tree, ==, rt);
+
+	/*
+	 * Normally one would walk the tree freeing nodes along the way.
+	 * Since the nodes are shared with the range trees we can avoid
+	 * walking all nodes and just reinitialize the avl tree. The nodes
+	 * will be freed by the range tree, so we don't want to free them here.
+	 */
+	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
+	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 }
 
+static range_tree_ops_t metaslab_rt_ops = {
+	metaslab_rt_create,
+	metaslab_rt_destroy,
+	metaslab_rt_add,
+	metaslab_rt_remove,
+	metaslab_rt_vacate
+};
+
+/*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+
 /*
  * Return the maximum contiguous segment within the metaslab.
  */
 uint64_t
-metaslab_pp_maxsize(space_map_t *sm)
+metaslab_block_maxsize(metaslab_t *msp)
 {
-	avl_tree_t *t = sm->sm_pp_root;
-	space_seg_t *ss;
+	avl_tree_t *t = &msp->ms_size_tree;
+	range_seg_t *rs;
 
-	if (t == NULL || (ss = avl_last(t)) == NULL)
+	if (t == NULL || (rs = avl_last(t)) == NULL)
 		return (0ULL);
 
-	return (ss->ss_end - ss->ss_start);
+	return (rs->rs_end - rs->rs_start);
+}
+
+static range_seg_t *
+metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
+{
+	range_seg_t *rs, rsearch;
+	avl_index_t where;
+
+	rsearch.rs_start = start;
+	rsearch.rs_end = start + size;
+
+	rs = avl_find(t, &rsearch, &where);
+	if (rs == NULL) {
+		rs = avl_nearest(t, where, AVL_AFTER);
+	}
+
+	return (rs);
+}
+
+/*
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
+ */
+static uint64_t
+metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+    uint64_t align)
+{
+	range_seg_t *rs = metaslab_block_find(t, *cursor, size);
+
+	while (rs != NULL) {
+		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
+
+		if (offset + size <= rs->rs_end) {
+			*cursor = offset + size;
+			return (offset);
+		}
+		rs = AVL_NEXT(t, rs);
+	}
+
+	/*
+	 * If we know we've searched the whole map (*cursor == 0), give up.
+	 * Otherwise, reset the cursor to the beginning and try again.
+	 */
+	if (*cursor == 0)
+		return (-1ULL);
+
+	*cursor = 0;
+	return (metaslab_block_picker(t, cursor, size, align));
 }
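A hedged caller's-eye sketch of the cursor behavior above (values illustrative; t is the offset-ordered tree chosen by the allocator):

	uint64_t cursor = 96 * 1024;	/* resume where the last alloc ended */
	uint64_t off;

	off = metaslab_block_picker(t, &cursor, 8 * 1024, 8 * 1024);
	/*
	 * off is the aligned start of the first fitting segment at or
	 * after 96K; -1ULL is returned only after a second, wrapped
	 * pass that restarted from cursor 0 also found nothing.
	 */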
 
 /*
@@ -459,30 +1224,24 @@ metaslab_pp_maxsize(space_map_t *sm)
  * ==========================================================================
  */
 static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
 {
-	avl_tree_t *t = &sm->sm_root;
+	/*
+	 * Find the largest power of 2 block size that evenly divides the
+	 * requested size. This is used to try to allocate blocks with similar
+	 * alignment from the same area of the metaslab (i.e. same cursor
+	 * bucket), but it does not guarantee that other allocation sizes
+	 * will not exist in the same region.
+	 */
 	uint64_t align = size & -size;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
+	avl_tree_t *t = &msp->ms_tree->rt_root;
 
 	return (metaslab_block_picker(t, cursor, size, align));
 }
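To make the cursor-bucket arithmetic above concrete (numbers illustrative): a 24K request has

	align  = 24K & -24K = 8K
	cursor = &msp->ms_lbas[highbit64(8K) - 1] = &msp->ms_lbas[13]

so every request whose size is an odd multiple of 8K shares that cursor and tends to cluster in the same region of the metaslab.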
 
-/* ARGSUSED */
-boolean_t
-metaslab_ff_fragmented(space_map_t *sm)
-{
-	return (B_TRUE);
-}
-
-static space_map_ops_t metaslab_ff_ops = {
-	metaslab_pp_load,
-	metaslab_pp_unload,
-	metaslab_ff_alloc,
-	metaslab_pp_claim,
-	metaslab_pp_free,
-	metaslab_pp_maxsize,
-	metaslab_ff_fragmented
+static metaslab_ops_t metaslab_ff_ops = {
+	metaslab_ff_alloc
 };
 
 /*
@@ -494,16 +1253,24 @@ static space_map_ops_t metaslab_ff_ops =
  * ==========================================================================
  */
 static uint64_t
-metaslab_df_alloc(space_map_t *sm, uint64_t size)
+metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 {
-	avl_tree_t *t = &sm->sm_root;
+	/*
+	 * Find the largest power of 2 block size that evenly divides the
+	 * requested size. This is used to try to allocate blocks with similar
+	 * alignment from the same area of the metaslab (i.e. same cursor
+	 * bucket), but it does not guarantee that other allocation sizes
+	 * will not exist in the same region.
+	 */
 	uint64_t align = size & -size;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-	int free_pct = sm->sm_space * 100 / sm->sm_size;
+	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
+	range_tree_t *rt = msp->ms_tree;
+	avl_tree_t *t = &rt->rt_root;
+	uint64_t max_size = metaslab_block_maxsize(msp);
+	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 
 	if (max_size < size)
 		return (-1ULL);
@@ -514,216 +1281,265 @@ metaslab_df_alloc(space_map_t *sm, uint6
 	 */
 	if (max_size < metaslab_df_alloc_threshold ||
 	    free_pct < metaslab_df_free_pct) {
-		t = sm->sm_pp_root;
+		t = &msp->ms_size_tree;
 		*cursor = 0;
 	}
 
 	return (metaslab_block_picker(t, cursor, size, 1ULL));
 }
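Illustrative numbers for the first-fit/best-fit switch above: a 1024M metaslab with 30M free gives

	free_pct = 30M * 100 / 1024M = 2

which is below metaslab_df_free_pct (4), so the search moves to the size-ordered ms_size_tree with the cursor reset, i.e. best-fit starting from the smallest segment that can satisfy the request.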
 
-static boolean_t
-metaslab_df_fragmented(space_map_t *sm)
-{
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-	int free_pct = sm->sm_space * 100 / sm->sm_size;
-
-	if (max_size >= metaslab_df_alloc_threshold &&
-	    free_pct >= metaslab_df_free_pct)
-		return (B_FALSE);
-
-	return (B_TRUE);
-}
-
-static space_map_ops_t metaslab_df_ops = {
-	metaslab_pp_load,
-	metaslab_pp_unload,
-	metaslab_df_alloc,
-	metaslab_pp_claim,
-	metaslab_pp_free,
-	metaslab_pp_maxsize,
-	metaslab_df_fragmented
+static metaslab_ops_t metaslab_df_ops = {
+	metaslab_df_alloc
 };
 
 /*
  * ==========================================================================
- * Other experimental allocators
+ * Cursor fit block allocator -
+ * Select the largest region in the metaslab, set the cursor to the beginning
+ * of the range and the cursor_end to the end of the range. As allocations
+ * are made advance the cursor. Continue allocating from the cursor until
+ * the range is exhausted and then find a new range.
  * ==========================================================================
  */
 static uint64_t
-metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
+metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 {
-	avl_tree_t *t = &sm->sm_root;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
-	uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-	uint64_t rsize = size;
+	range_tree_t *rt = msp->ms_tree;
+	avl_tree_t *t = &msp->ms_size_tree;
+	uint64_t *cursor = &msp->ms_lbas[0];
+	uint64_t *cursor_end = &msp->ms_lbas[1];
 	uint64_t offset = 0;
 
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
-
-	if (max_size < size)
-		return (-1ULL);
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
 
-	ASSERT3U(*extent_end, >=, *cursor);
+	ASSERT3U(*cursor_end, >=, *cursor);
 
-	/*
-	 * If we're running low on space switch to using the size
-	 * sorted AVL tree (best-fit).
-	 */
-	if ((*cursor + size) > *extent_end) {
+	if ((*cursor + size) > *cursor_end) {
+		range_seg_t *rs;
 
-		t = sm->sm_pp_root;
-		*cursor = *extent_end = 0;
+		rs = avl_last(&msp->ms_size_tree);
+		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
+			return (-1ULL);
 
-		if (max_size > 2 * SPA_MAXBLOCKSIZE)
-			rsize = MIN(metaslab_min_alloc_size, max_size);
-		offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
-		if (offset != -1)
-			*cursor = offset + size;
-	} else {
-		offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
+		*cursor = rs->rs_start;
+		*cursor_end = rs->rs_end;
 	}
-	ASSERT3U(*cursor, <=, *extent_end);
+
+	offset = *cursor;
+	*cursor += size;
+
 	return (offset);
 }
 
-static boolean_t
-metaslab_cdf_fragmented(space_map_t *sm)
-{
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-
-	if (max_size > (metaslab_min_alloc_size * 10))
-		return (B_FALSE);
-	return (B_TRUE);
-}
-
-static space_map_ops_t metaslab_cdf_ops = {
-	metaslab_pp_load,
-	metaslab_pp_unload,
-	metaslab_cdf_alloc,
-	metaslab_pp_claim,
-	metaslab_pp_free,
-	metaslab_pp_maxsize,
-	metaslab_cdf_fragmented
+static metaslab_ops_t metaslab_cf_ops = {
+	metaslab_cf_alloc
 };
 
+/*
+ * ==========================================================================
+ * New dynamic fit allocator -
+ * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
+ * contiguous blocks. If no region is found then just use the largest segment
+ * that remains.
+ * ==========================================================================
+ */
+
+/*
+ * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
+ * to request from the allocator.
+ */
+uint64_t metaslab_ndf_clump_shift = 4;
+
 static uint64_t
-metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
+metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 {
-	avl_tree_t *t = &sm->sm_root;
+	avl_tree_t *t = &msp->ms_tree->rt_root;
 	avl_index_t where;
-	space_seg_t *ss, ssearch;
-	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
-	uint64_t max_size = metaslab_pp_maxsize(sm);
+	range_seg_t *rs, rsearch;
+	uint64_t hbit = highbit64(size);
+	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
+	uint64_t max_size = metaslab_block_maxsize(msp);
 
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 
 	if (max_size < size)
 		return (-1ULL);
 
-	ssearch.ss_start = *cursor;
-	ssearch.ss_end = *cursor + size;
+	rsearch.rs_start = *cursor;
+	rsearch.rs_end = *cursor + size;
 
-	ss = avl_find(t, &ssearch, &where);
-	if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
-		t = sm->sm_pp_root;
-
-		if (max_size > 2 * SPA_MAXBLOCKSIZE)
-			size = MIN(metaslab_min_alloc_size, max_size);
-
-		ssearch.ss_start = 0;
-		ssearch.ss_end = size;
-		ss = avl_find(t, &ssearch, &where);
-		if (ss == NULL)
-			ss = avl_nearest(t, where, AVL_AFTER);
-		ASSERT(ss != NULL);
-	}
-
-	if (ss != NULL) {
-		if (ss->ss_start + size <= ss->ss_end) {
-			*cursor = ss->ss_start + size;
-			return (ss->ss_start);
-		}
+	rs = avl_find(t, &rsearch, &where);
+	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
+		t = &msp->ms_size_tree;
+
+		rsearch.rs_start = 0;
+		rsearch.rs_end = MIN(max_size,
+		    1ULL << (hbit + metaslab_ndf_clump_shift));
+		rs = avl_find(t, &rsearch, &where);
+		if (rs == NULL)
+			rs = avl_nearest(t, where, AVL_AFTER);
+		ASSERT(rs != NULL);
+	}
+
+	if ((rs->rs_end - rs->rs_start) >= size) {
+		*cursor = rs->rs_start + size;
+		return (rs->rs_start);
 	}
 	return (-1ULL);
 }
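Worked numbers for the clump sizing above (illustrative): an 8K request gives

	hbit   = highbit64(8K) = 14		/* 1-based MSB position */
	cursor = &msp->ms_lbas[13]
	rsearch.rs_end = MIN(max_size, 1ULL << (14 + 4)) = 256K

so when the cursor region is exhausted, the fallback looks for a segment large enough to serve a clump of further allocations of this size class.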
 
-static boolean_t
-metaslab_ndf_fragmented(space_map_t *sm)
-{
-	uint64_t max_size = metaslab_pp_maxsize(sm);
-
-	if (max_size > (metaslab_min_alloc_size * 10))
-		return (B_FALSE);
-	return (B_TRUE);
-}
-
-
-static space_map_ops_t metaslab_ndf_ops = {
-	metaslab_pp_load,
-	metaslab_pp_unload,
-	metaslab_ndf_alloc,
-	metaslab_pp_claim,
-	metaslab_pp_free,
-	metaslab_pp_maxsize,
-	metaslab_ndf_fragmented
+static metaslab_ops_t metaslab_ndf_ops = {
+	metaslab_ndf_alloc
 };
 
-space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 
 /*
  * ==========================================================================
  * Metaslabs
  * ==========================================================================
  */
-metaslab_t *
-metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
-	uint64_t start, uint64_t size, uint64_t txg)
+
+/*
+ * Wait for any in-progress metaslab loads to complete.
+ */
+void
+metaslab_load_wait(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	while (msp->ms_loading) {
+		ASSERT(!msp->ms_loaded);
+		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
+	}
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+	int error = 0;
+	boolean_t success = B_FALSE;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT(!msp->ms_loaded);
+	ASSERT(!msp->ms_loading);
+
+	msp->ms_loading = B_TRUE;
+
+	/*
+	 * If the space map has not been allocated yet, then treat
+	 * all the space in the metaslab as free and add it to the
+	 * ms_tree.
+	 */
+	if (msp->ms_sm != NULL)
+		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
+	else
+		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
+
+	success = (error == 0);
+	msp->ms_loading = B_FALSE;
+
+	if (success) {
+		ASSERT3P(msp->ms_group, !=, NULL);
+		msp->ms_loaded = B_TRUE;
+
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			range_tree_walk(msp->ms_defertree[t],
+			    range_tree_remove, msp->ms_tree);
+		}
+		msp->ms_max_size = metaslab_block_maxsize(msp);
+	}
+	cv_broadcast(&msp->ms_load_cv);
+	return (error);
+}
+
+void
+metaslab_unload(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	range_tree_vacate(msp->ms_tree, NULL, NULL);
+	msp->ms_loaded = B_FALSE;
+	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
+	msp->ms_max_size = 0;
+}
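A hedged sketch of how a caller is expected to combine the routines above, given that metaslab_load() asserts that neither ms_loaded nor ms_loading is set (error handling elided; the pattern is illustrative):

	int error = 0;

	mutex_enter(&msp->ms_lock);
	metaslab_load_wait(msp);		/* wait out a concurrent load */
	if (!msp->ms_loaded)
		error = metaslab_load(msp);	/* load only if still needed */
	mutex_exit(&msp->ms_lock);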
+
+int
+metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
+    metaslab_t **msp)
 {
 	vdev_t *vd = mg->mg_vd;
-	metaslab_t *msp;
+	objset_t *mos = vd->vdev_spa->spa_meta_objset;
+	metaslab_t *ms;
+	int error;
+
+	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+	ms->ms_id = id;
+	ms->ms_start = id << vd->vdev_ms_shift;
+	ms->ms_size = 1ULL << vd->vdev_ms_shift;
+
+	/*
+	 * We only open space map objects that already exist. All others
+	 * will be opened when we finally allocate an object for it.
+	 */
+	if (object != 0) {
+		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
+		    ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
 
-	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
-	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+		if (error != 0) {
+			kmem_free(ms, sizeof (metaslab_t));
+			return (error);
+		}
 
-	msp->ms_smo_syncing = *smo;
+		ASSERT(ms->ms_sm != NULL);
+	}
 
 	/*
-	 * We create the main space map here, but we don't create the
-	 * allocmaps and freemaps until metaslab_sync_done().  This serves
+	 * We create the main range tree here, but we don't create the
+	 * alloctree and freetree until metaslab_sync_done().  This serves
 	 * two purposes: it allows metaslab_sync_done() to detect the
 	 * addition of new space; and for debugging, it ensures that we'd
 	 * data fault on any attempt to use this metaslab before it's ready.
 	 */
-	space_map_create(&msp->ms_map, start, size,
-	    vd->vdev_ashift, &msp->ms_lock);
+	ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
+	metaslab_group_add(mg, ms);
 
-	metaslab_group_add(mg, msp);
-
-	if (metaslab_debug && smo->smo_object != 0) {
-		mutex_enter(&msp->ms_lock);
-		VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
-		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
-		mutex_exit(&msp->ms_lock);
-	}
+	metaslab_set_fragmentation(ms);
 
 	/*
 	 * If we're opening an existing pool (txg == 0) or creating
 	 * a new one (txg == TXG_INITIAL), all space is available now.
 	 * If we're adding space to an existing pool, the new space
 	 * does not become available until after this txg has synced.
+	 * The metaslab's weight will also be initialized when we sync
+	 * out this txg. This ensures that we don't attempt to allocate
+	 * from it before we have initialized it completely.
 	 */
 	if (txg <= TXG_INITIAL)
-		metaslab_sync_done(msp, 0);
+		metaslab_sync_done(ms, 0);
+
+	/*
+	 * If metaslab_debug_load is set and we're initializing a metaslab
+	 * that has an allocated space map object, then load its space
+	 * map so that we can verify frees.
+	 */
+	if (metaslab_debug_load && ms->ms_sm != NULL) {
+		mutex_enter(&ms->ms_lock);
+		VERIFY0(metaslab_load(ms));
+		mutex_exit(&ms->ms_lock);
+	}
 
 	if (txg != 0) {
 		vdev_dirty(vd, 0, NULL, txg);
-		vdev_dirty(vd, VDD_METASLAB, msp, txg);
+		vdev_dirty(vd, VDD_METASLAB, ms, txg);
 	}
 
-	return (msp);
+	*msp = ms;
+
+	return (0);
 }
 
 void
@@ -731,52 +1547,187 @@ metaslab_fini(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
 
-	vdev_space_update(mg->mg_vd,
-	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
-
 	metaslab_group_remove(mg, msp);
 
 	mutex_enter(&msp->ms_lock);
+	VERIFY(msp->ms_group == NULL);
+	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
+	    0, -msp->ms_size);
+	space_map_close(msp->ms_sm);
 
-	space_map_unload(&msp->ms_map);
-	space_map_destroy(&msp->ms_map);
+	metaslab_unload(msp);
+	range_tree_destroy(msp->ms_tree);
 
 	for (int t = 0; t < TXG_SIZE; t++) {
-		space_map_destroy(&msp->ms_allocmap[t]);
-		space_map_destroy(&msp->ms_freemap[t]);
+		range_tree_destroy(msp->ms_alloctree[t]);
+		range_tree_destroy(msp->ms_freetree[t]);
 	}
 
-	for (int t = 0; t < TXG_DEFER_SIZE; t++)
-		space_map_destroy(&msp->ms_defermap[t]);
+	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+		range_tree_destroy(msp->ms_defertree[t]);
+	}
 
-	ASSERT3S(msp->ms_deferspace, ==, 0);
+	ASSERT0(msp->ms_deferspace);
 
 	mutex_exit(&msp->ms_lock);
+	cv_destroy(&msp->ms_load_cv);
 	mutex_destroy(&msp->ms_lock);
 
 	kmem_free(msp, sizeof (metaslab_t));
 }
 
-#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
-#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
-#define	METASLAB_ACTIVE_MASK		\
-	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+#define	FRAGMENTATION_TABLE_SIZE	17
+
+/*
+ * This table defines a segment size based fragmentation metric that will
+ * allow each metaslab to derive its own fragmentation value. This is done
+ * by calculating the space in each bucket of the spacemap histogram and
+ * multiplying that by the fragmentation metric in this table. Doing
+ * this for all buckets and dividing it by the total amount of free
+ * space in this metaslab (i.e. the total free space in all buckets) gives
+ * us the fragmentation metric. This means that a high fragmentation metric
+ * equates to most of the free space being comprised of small segments.
+ * Conversely, if the metric is low, then most of the free space is in
+ * large segments. A 10% change in fragmentation equates to approximately
+ * double the number of segments.
+ *
+ * This table defines 0% fragmented space using 16MB segments. Testing has
+ * shown that segments that are greater than or equal to 16MB do not suffer
+ * from drastic performance problems. Using this value, we derive the rest
+ * of the table. Since the fragmentation value is never stored on disk, it
+ * is possible to change these calculations in the future.
+ */
+int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
+	100,	/* 512B	*/
+	100,	/* 1K	*/
+	98,	/* 2K	*/
+	95,	/* 4K	*/
+	90,	/* 8K	*/
+	80,	/* 16K	*/
+	70,	/* 32K	*/
+	60,	/* 64K	*/
+	50,	/* 128K	*/
+	40,	/* 256K	*/
+	30,	/* 512K	*/
+	20,	/* 1M	*/
+	15,	/* 2M	*/
+	10,	/* 4M	*/
+	5,	/* 8M	*/
+	0	/* 16M	*/
+};
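A worked example of the derivation described above (numbers illustrative): if 60% of a metaslab's free space sits in 512K segments (factor 30) and 40% in 4M segments (factor 10), then

	fragmentation = (60 * 30 + 40 * 10) / (60 + 40) = 22

i.e. the free space is mostly in medium-sized segments, well away from the 100 end of the scale.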
+
+/*
+ * Calculate the metaslab's fragmentation metric and record it in
+ * ms_fragmentation. A value of ZFS_FRAG_INVALID means that the metaslab
+ * has not been upgraded and does not support this metric. Otherwise,
+ * the value should be in the range [0, 100].
+ */
+static void
+metaslab_set_fragmentation(metaslab_t *msp)
+{
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+	uint64_t fragmentation = 0;
+	uint64_t total = 0;
+	boolean_t feature_enabled = spa_feature_is_enabled(spa,
+	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
+
+	if (!feature_enabled) {
+		msp->ms_fragmentation = ZFS_FRAG_INVALID;
+		return;
+	}
+
+	/*
+	 * A null space map means that the entire metaslab is free
+	 * and thus is not fragmented.
+	 */
+	if (msp->ms_sm == NULL) {
+		msp->ms_fragmentation = 0;
+		return;
+	}
+
+	/*
+	 * If this metaslab's space map has not been upgraded, flag it
+	 * so that we upgrade next time we encounter it.
+	 */
+	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
+		uint64_t txg = spa_syncing_txg(spa);
+		vdev_t *vd = msp->ms_group->mg_vd;
+
+		if (spa_writeable(spa)) {
+			msp->ms_condense_wanted = B_TRUE;
+			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
+			    "msp %p, vd %p", txg, msp, vd);
+		}
+		msp->ms_fragmentation = ZFS_FRAG_INVALID;
+		return;
+	}
+
+	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+		uint64_t space = 0;
+		uint8_t shift = msp->ms_sm->sm_shift;
+
+		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
+		    FRAGMENTATION_TABLE_SIZE - 1);
+
+		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
+			continue;
+
+		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
+		total += space;
 
+		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
+		fragmentation += space * zfs_frag_table[idx];
+	}
+
+	if (total > 0)
+		fragmentation /= total;
+	ASSERT3U(fragmentation, <=, 100);
+
+	msp->ms_fragmentation = fragmentation;
+}
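
As a concrete illustration of the weighted-average computation above, the
following standalone sketch reproduces the metric for a metaslab whose free
space is split evenly between 8K and 1M segments. It is not kernel code;
the histogram contents and the 512-byte sm_shift are invented:

	#include <stdint.h>
	#include <stdio.h>

	/* same values as zfs_frag_table above; entry 16 defaults to 0 */
	static const int frag_table[17] = {
		100, 100, 98, 95, 90, 80, 70, 60,
		50, 40, 30, 20, 15, 10, 5, 0
	};

	int
	main(void)
	{
		uint64_t histogram[17] = { 0 };
		uint64_t frag = 0, total = 0;

		histogram[4] = 128;	/* 128 x 8K segments = 1M free */
		histogram[11] = 1;	/* one 1M segment    = 1M free */

		for (int i = 0; i < 17; i++) {
			if (histogram[i] == 0)
				continue;
			/* bucket i holds segments of 2^(i + 9) bytes */
			uint64_t space = histogram[i] << (i + 9);

			total += space;
			frag += space * frag_table[i];
		}
		if (total > 0)
			frag /= total;

		/* (1M * 90 + 1M * 20) / 2M = 55 */
		printf("fragmentation = %llu%%\n", (unsigned long long)frag);
		return (0);
	}
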
+
+/*
+ * Compute a weight -- a selection preference value -- for the given metaslab.
+ * This is based on the amount of free space, the level of fragmentation,
+ * the LBA range, and whether the metaslab is loaded.
+ */
 static uint64_t
-metaslab_weight(metaslab_t *msp)
+metaslab_space_weight(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
-	space_map_t *sm = &msp->ms_map;
-	space_map_obj_t *smo = &msp->ms_smo;
 	vdev_t *vd = mg->mg_vd;
 	uint64_t weight, space;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT(!vd->vdev_removing);
 
 	/*
 	 * The baseline weight is the metaslab's free space.
 	 */
-	space = sm->sm_size - smo->smo_alloc;
+	space = msp->ms_size - space_map_allocated(msp->ms_sm);
+
+	if (metaslab_fragmentation_factor_enabled &&
+	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
+		/*
+		 * Use the fragmentation information to inversely scale
+		 * down the baseline weight. We need to ensure that we
+		 * don't exclude this metaslab completely when it's 100%
+		 * fragmented. To avoid this we reduce the fragmentation value
+		 * by 1.
+		 */
+		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
+
+		/*
+		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
+		 * this metaslab again. The fragmentation metric may have
+		 * decreased the space to something smaller than
+		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
+		 * so that we can consume any remaining space.
+		 */
+		if (space > 0 && space < SPA_MINBLOCKSIZE)
+			space = SPA_MINBLOCKSIZE;
+	}
 	weight = space;
 
 	/*
@@ -788,219 +1739,669 @@ metaslab_weight(metaslab_t *msp)
 	 * In effect, this means that we'll select the metaslab with the most
 	 * free bandwidth rather than simply the one with the most free space.
 	 */
-	weight = 2 * weight -
-	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
-	ASSERT(weight >= space && weight <= 2 * space);
+	if (metaslab_lba_weighting_enabled) {
+		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
+		ASSERT(weight >= space && weight <= 2 * space);
+	}
 
 	/*
-	 * For locality, assign higher weight to metaslabs which have
-	 * a lower offset than what we've already activated.
+	 * If this metaslab is one we're actively using, adjust its
+	 * weight to make it preferable to any inactive metaslab so
+	 * we'll polish it off. If the fragmentation on this metaslab
+	 * has exceeded our threshold, then don't mark it active.
 	 */
-	if (sm->sm_start <= mg->mg_bonus_area)
-		weight *= (metaslab_smo_bonus_pct / 100);
-	ASSERT(weight >= space &&
-	    weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
+	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
+	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
+		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+	}
+
+	WEIGHT_SET_SPACEBASED(weight);
+	return (weight);
+}
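
To make the LBA weighting concrete, here is the doubling formula above in
isolation; the vdev geometry in the comment is hypothetical:

	#include <stdint.h>

	/*
	 * With ms_count = 200 and 1GB free in every metaslab, metaslab 0
	 * weighs 2GB, metaslab 100 weighs 1.5GB, and metaslab 199 comes in
	 * just above 1GB -- so lower-LBA metaslabs win ties on free space.
	 */
	uint64_t
	lba_weight(uint64_t space, uint64_t ms_id, uint64_t ms_count)
	{
		return (2 * space - (ms_id * space) / ms_count);
	}
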
+
+/*
+ * Return the weight of the specified metaslab, according to the segment-based
+ * weighting algorithm. The metaslab must be loaded. This function can
+ * be called within a sync pass since it relies only on the metaslab's
+ * range tree which is always accurate when the metaslab is loaded.
+ */
+static uint64_t
+metaslab_weight_from_range_tree(metaslab_t *msp)
+{
+	uint64_t weight = 0;
+	uint32_t segments = 0;
+
+	ASSERT(msp->ms_loaded);
+
+	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
+	    i--) {
+		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
+		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+		segments <<= 1;
+		segments += msp->ms_tree->rt_histogram[i];
 
-	if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
 		/*
-		 * If this metaslab is one we're actively using, adjust its
-		 * weight to make it preferable to any inactive metaslab so
-		 * we'll polish it off.
+		 * The range tree provides more precision than the space map
+		 * and must be downgraded so that all values fit within the
+		 * space map's histogram. This allows us to compare loaded
+		 * vs. unloaded metaslabs to determine which metaslab is
+		 * considered "best".
 		 */
-		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+		if (i > max_idx)
+			continue;
+
+		if (segments != 0) {
+			WEIGHT_SET_COUNT(weight, segments);
+			WEIGHT_SET_INDEX(weight, i);
+			WEIGHT_SET_ACTIVE(weight, 0);
+			break;
+		}
 	}
 	return (weight);
 }
 
-static void
-metaslab_prefetch(metaslab_group_t *mg)
+/*
+ * Calculate the weight based on the on-disk histogram. This should only
+ * be called after a sync pass has completely finished since the on-disk
+ * information is updated in metaslab_sync().
+ */
+static uint64_t
+metaslab_weight_from_spacemap(metaslab_t *msp)
 {
-	spa_t *spa = mg->mg_vd->vdev_spa;
-	metaslab_t *msp;
-	avl_tree_t *t = &mg->mg_metaslab_tree;
-	int m;
+	uint64_t weight = 0;
 
-	mutex_enter(&mg->mg_lock);
+	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
+		if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) {
+			WEIGHT_SET_COUNT(weight,
+			    msp->ms_sm->sm_phys->smp_histogram[i]);
+			WEIGHT_SET_INDEX(weight, i +
+			    msp->ms_sm->sm_shift);
+			WEIGHT_SET_ACTIVE(weight, 0);
+			break;
+		}
+	}
+	return (weight);
+}
+
+/*
+ * Compute a segment-based weight for the specified metaslab. The weight
+ * is determined by the highest bucket in the histogram. The information
+ * for the highest bucket is encoded into the weight value.
+ */
+static uint64_t
+metaslab_segment_weight(metaslab_t *msp)
+{
+	metaslab_group_t *mg = msp->ms_group;
+	uint64_t weight = 0;
+	uint8_t shift = mg->mg_vd->vdev_ashift;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
-	 * Prefetch the next potential metaslabs
+	 * The metaslab is completely free.
 	 */
-	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
-		space_map_t *sm = &msp->ms_map;
-		space_map_obj_t *smo = &msp->ms_smo;
+	if (space_map_allocated(msp->ms_sm) == 0) {
+		int idx = highbit64(msp->ms_size) - 1;
+		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+		if (idx < max_idx) {
+			WEIGHT_SET_COUNT(weight, 1ULL);
+			WEIGHT_SET_INDEX(weight, idx);
+		} else {
+			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
+			WEIGHT_SET_INDEX(weight, max_idx);
+		}
+		WEIGHT_SET_ACTIVE(weight, 0);
+		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
 
-		/* If we have reached our prefetch limit then we're done */
-		if (m >= metaslab_prefetch_limit)
-			break;
+		return (weight);
+	}
 
-		if (!sm->sm_loaded && smo->smo_object != 0) {
-			mutex_exit(&mg->mg_lock);
-			dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
-			    0ULL, smo->smo_objsize);
-			mutex_enter(&mg->mg_lock);
-		}
+	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+
+	/*
+	 * If the metaslab is fully allocated then just make the weight 0.
+	 */
+	if (space_map_allocated(msp->ms_sm) == msp->ms_size)
+		return (0);
+	/*
+	 * If the metaslab is already loaded, then use the range tree to
+	 * determine the weight. Otherwise, we rely on the space map information
+	 * to generate the weight.
+	 */
+	if (msp->ms_loaded) {
+		weight = metaslab_weight_from_range_tree(msp);
+	} else {
+		weight = metaslab_weight_from_spacemap(msp);
 	}
-	mutex_exit(&mg->mg_lock);
+
+	/*
+	 * If the metaslab was active the last time we calculated its weight
+	 * then keep it active. We want to consume the entire region that
+	 * is associated with this weight.
+	 */
+	if (msp->ms_activation_weight != 0 && weight != 0)
+		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
+	return (weight);
 }
 
-static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
+/*
+ * Determine if we should attempt to allocate from this metaslab. If the
+ * metaslab has a maximum size then we can quickly determine if the desired
+ * allocation size can be satisfied. Otherwise, if we're using segment-based
+ * weighting then we can determine the maximum allocation that this metaslab
+ * can accommodate based on the index encoded in the weight. If we're using
+ * space-based weights then rely on the entire weight (excluding the weight
+ * type bit).
+ */
+boolean_t
+metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
 {
-	metaslab_group_t *mg = msp->ms_group;
-	space_map_t *sm = &msp->ms_map;
-	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
+	boolean_t should_allocate;
+
+	if (msp->ms_max_size != 0)
+		return (msp->ms_max_size >= asize);
+
+	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
+		/*
+		 * The metaslab segment weight indicates segments in the
+		 * range [2^i, 2^(i+1)), where i is the index in the weight.
+		 * Since the asize might be in the middle of the range, we
+		 * should attempt the allocation if asize < 2^(i+1).
+		 */
+		should_allocate = (asize <
+		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
+	} else {
+		should_allocate = (asize <=
+		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
+	}
+	return (should_allocate);
+}
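
The decision above can be exercised with a small userland sketch. The bit
layout used here (a 6-bit index at bit 54) mirrors the WEIGHT_GET_INDEX
macro in metaslab_impl.h, but treat it as an assumption of the example:

	#include <stdint.h>
	#include <assert.h>

	#define	IDX_GET(w)	(((w) >> 54) & 0x3f)

	int
	main(void)
	{
		/* hypothetical weight: 37 segments in [1M, 2M) -> index 20 */
		uint64_t weight = ((uint64_t)20 << 54) | 37;

		/* a 512K request: 2^19 < 2^(20+1), worth trying */
		assert((1ULL << 19) < (1ULL << (IDX_GET(weight) + 1)));
		/* a 4M request: 2^22 >= 2^(20+1), skip this metaslab */
		assert(!((1ULL << 22) < (1ULL << (IDX_GET(weight) + 1))));
		return (0);
	}
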
+
+static uint64_t
+metaslab_weight(metaslab_t *msp)
+{
+	vdev_t *vd = msp->ms_group->mg_vd;
+	spa_t *spa = vd->vdev_spa;
+	uint64_t weight;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	/*
+	 * This vdev is in the process of being removed so there is nothing
+	 * for us to do here.
+	 */
+	if (vd->vdev_removing) {
+		ASSERT0(space_map_allocated(msp->ms_sm));
+		ASSERT0(vd->vdev_ms_shift);
+		return (0);
+	}
+
+	metaslab_set_fragmentation(msp);
+
+	/*
+	 * Update the maximum size if the metaslab is loaded. This will
+	 * ensure that we get an accurate maximum size if newly freed space
+	 * has been added back into the free tree.
+	 */
+	if (msp->ms_loaded)
+		msp->ms_max_size = metaslab_block_maxsize(msp);
+
+	/*
+	 * Segment-based weighting requires space map histogram support.
+	 */
+	if (zfs_metaslab_segment_weight_enabled &&
+	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
+	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
+	    sizeof (space_map_phys_t))) {
+		weight = metaslab_segment_weight(msp);
+	} else {
+		weight = metaslab_space_weight(msp);
+	}
+	return (weight);
+}
 
+static int
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
+{
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-		space_map_load_wait(sm);
-		if (!sm->sm_loaded) {
-			int error = space_map_load(sm, sm_ops, SM_FREE,
-			    &msp->ms_smo,
-			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
-			if (error)  {
+		metaslab_load_wait(msp);
+		if (!msp->ms_loaded) {
+			int error = metaslab_load(msp);
+			if (error) {
 				metaslab_group_sort(msp->ms_group, msp, 0);
 				return (error);
 			}
-			for (int t = 0; t < TXG_DEFER_SIZE; t++)
-				space_map_walk(&msp->ms_defermap[t],
-				    space_map_claim, sm);
-
 		}
 
-		/*
-		 * Track the bonus area as we activate new metaslabs.
-		 */
-		if (sm->sm_start > mg->mg_bonus_area) {
-			mutex_enter(&mg->mg_lock);
-			mg->mg_bonus_area = sm->sm_start;
-			mutex_exit(&mg->mg_lock);
-		}
-
-		/*
-		 * If we were able to load the map then make sure
-		 * that this map is still able to satisfy our request.
-		 */
-		if (msp->ms_weight < size)
-			return (ENOSPC);
-
+		msp->ms_activation_weight = msp->ms_weight;
 		metaslab_group_sort(msp->ms_group, msp,
 		    msp->ms_weight | activation_weight);
 	}
-	ASSERT(sm->sm_loaded);
+	ASSERT(msp->ms_loaded);
 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 	return (0);
 }
 
 static void
-metaslab_passivate(metaslab_t *msp, uint64_t size)
+metaslab_passivate(metaslab_t *msp, uint64_t weight)
 {
+	uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
+
 	/*
 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
 	 * this metaslab again.  In that case, it had better be empty,
 	 * or we would be leaving space on the table.
 	 */
-	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
-	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
+	ASSERT(size >= SPA_MINBLOCKSIZE ||
+	    range_tree_space(msp->ms_tree) == 0);
+	ASSERT0(weight & METASLAB_ACTIVE_MASK);
+
+	msp->ms_activation_weight = 0;
+	metaslab_group_sort(msp->ms_group, msp, weight);
 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
 }
 
 /*
+ * Segment-based metaslabs are activated once and remain active until
+ * we either fail an allocation attempt (similar to space-based metaslabs)
+ * or have exhausted the free space in zfs_metaslab_switch_threshold
+ * buckets since the metaslab was activated. This function checks to see
+ * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
+ * metaslab and passivates it proactively. This will allow us to select a
+ * metaslab with a larger contiguous region, if any remains within this
+ * metaslab group. If we're in sync pass > 1, then we continue using this
+ * metaslab so that we don't dirty more blocks and cause more sync passes.
+ */
+void
+metaslab_segment_may_passivate(metaslab_t *msp)
+{
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
+		return;
+
+	/*
+	 * Since we are in the middle of a sync pass, the most accurate
+	 * information that is accessible to us is the in-core range tree
+	 * histogram; calculate the new weight based on that information.
+	 */
+	uint64_t weight = metaslab_weight_from_range_tree(msp);
+	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
+	int current_idx = WEIGHT_GET_INDEX(weight);
+
+	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
+		metaslab_passivate(msp, weight);
+}
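
For example, assuming the upstream default of zfs_metaslab_switch_threshold
= 2: a metaslab activated at index 20 (segments in the 1M-2M bucket) stays
active while allocations drain buckets 20 and 19, and is passivated once
its best remaining bucket drops to index 18 or lower, freeing the slot for
a metaslab that still has larger contiguous regions.
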
+
+static void
+metaslab_preload(void *arg)
+{
+	metaslab_t *msp = arg;
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
+
+	mutex_enter(&msp->ms_lock);
+	metaslab_load_wait(msp);
+	if (!msp->ms_loaded)
+		(void) metaslab_load(msp);
+	msp->ms_selected_txg = spa_syncing_txg(spa);
+	mutex_exit(&msp->ms_lock);
+}
+
+static void
+metaslab_group_preload(metaslab_group_t *mg)
+{
+	spa_t *spa = mg->mg_vd->vdev_spa;
+	metaslab_t *msp;
+	avl_tree_t *t = &mg->mg_metaslab_tree;
+	int m = 0;
+
+	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
+		taskq_wait(mg->mg_taskq);
+		return;
+	}
+
+	mutex_enter(&mg->mg_lock);
+	/*
+	 * Load the next potential metaslabs
+	 */
+	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
+		/*
+		 * We preload only the maximum number of metaslabs specified
+		 * by metaslab_preload_limit. If a metaslab is being forced
+		 * to condense then we preload it too. This will ensure
+		 * that force condensing happens in the next txg.
+		 */
+		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
+			continue;
+		}
+
+		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
+		    msp, TQ_SLEEP) != 0);
+	}
+	mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * Determine if the space map's on-disk footprint is past our tolerance
+ * for inefficiency. We would like to use the following criteria to make
+ * our decision:
+ *
+ * 1. The size of the space map object should not dramatically increase as a
+ * result of writing out the free space range tree.
+ *
+ * 2. The minimal on-disk space map representation is zfs_condense_pct/100
+ * times the size of the free space range tree representation
+ * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
+ *
+ * 3. The on-disk size of the space map should actually decrease.
+ *
+ * Checking the first condition is tricky since we don't want to walk
+ * the entire AVL tree calculating the estimated on-disk size. Instead we
+ * use the size-ordered range tree in the metaslab and calculate the
+ * size required to write out the largest segment in our free tree. If the
+ * size required to represent that segment on disk is larger than the space
+ * map object then we avoid condensing this map.
+ *
+ * To determine the second criterion we use a best-case estimate and assume
+ * each segment can be represented on-disk as a single 64-bit entry. We refer
+ * to this best-case estimate as the space map's minimal form.
+ *
+ * Unfortunately, we cannot compute the on-disk size of the space map in this
+ * context because we cannot accurately compute the effects of compression, etc.
+ * Instead, we apply the heuristic described in the block comment for
+ * zfs_metaslab_condense_block_threshold - we only condense if the space used
+ * is greater than a threshold number of blocks.
+ */
+static boolean_t
+metaslab_should_condense(metaslab_t *msp)
+{
+	space_map_t *sm = msp->ms_sm;
+	range_seg_t *rs;
+	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
+	dmu_object_info_t doi;
+	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT(msp->ms_loaded);
+
+	/*
+	 * Use the ms_size_tree range tree, which is ordered by size, to
+	 * obtain the largest segment in the free tree. We always condense
+	 * metaslabs that are empty and metaslabs for which a condense
+	 * request has been made.
+	 */
+	rs = avl_last(&msp->ms_size_tree);
+	if (rs == NULL || msp->ms_condense_wanted)
+		return (B_TRUE);
+
+	/*
+	 * Calculate the number of 64-bit entries this segment would
+	 * require when written to disk. If this single segment would be
+	 * larger on-disk than the entire current on-disk structure, then
+	 * clearly condensing will increase the on-disk structure size.
+	 */
+	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+	entries = size / (MIN(size, SM_RUN_MAX));
+	segsz = entries * sizeof (uint64_t);
+
+	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
+	object_size = space_map_length(msp->ms_sm);
+
+	dmu_object_info_from_db(sm->sm_dbuf, &doi);
+	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
+
+	return (segsz <= object_size &&
+	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
+	    object_size > zfs_metaslab_condense_block_threshold * record_size);
+}
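
A worked example of the three tests, with invented numbers and the upstream
default of zfs_metaslab_condense_block_threshold = 4 assumed: if the largest
free segment is 8M with sm_shift = 9, it spans 16384 units and fits in a
single 8-byte entry; if the free tree holds 1000 segments, the minimal form
is 8000 bytes. With a 64K on-disk space map and a 4K record size, we have
8 <= 65536, 65536 >= 8000 * 110 / 100 = 8800, and 65536 > 4 * 4096 = 16384,
so all three criteria pass and the metaslab is condensed.
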
+
+/*
+ * Condense the on-disk space map representation to its minimized form.
+ * The minimized form consists of a small number of allocations followed by
+ * the entries of the free range tree.
+ */
+static void
+metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
+{
+	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
+	range_tree_t *condense_tree;
+	space_map_t *sm = msp->ms_sm;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+	ASSERT3U(spa_sync_pass(spa), ==, 1);
+	ASSERT(msp->ms_loaded);
+
+	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
+	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
+	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
+	    msp->ms_group->mg_vd->vdev_spa->spa_name,
+	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
+	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
+
+	msp->ms_condense_wanted = B_FALSE;
+
+	/*
+	 * Create a range tree that is 100% allocated. We remove segments
+	 * that have been freed in this txg, any deferred frees that exist,
+	 * and any allocations in the future. Removing segments should be
+	 * a relatively inexpensive operation since we expect these trees to
+	 * have a small number of nodes.
+	 */
+	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
+	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
+
+	/*
+	 * Remove what's been freed in this txg from the condense_tree.
+	 * Since we're in sync_pass 1, we know that all the frees from
+	 * this txg are in the freetree.
+	 */
+	range_tree_walk(freetree, range_tree_remove, condense_tree);
+
+	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+		range_tree_walk(msp->ms_defertree[t],
+		    range_tree_remove, condense_tree);
+	}
+
+	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
+		    range_tree_remove, condense_tree);
+	}
+
+	/*
+	 * We're about to drop the metaslab's lock thus allowing
+	 * other consumers to change its content. Set the
+	 * metaslab's ms_condensing flag to ensure that
+	 * allocations on this metaslab do not occur while we're
+	 * in the middle of committing it to disk. This is only critical
+	 * for the ms_tree as all other range trees use per txg
+	 * views of their content.
+	 */
+	msp->ms_condensing = B_TRUE;
+
+	mutex_exit(&msp->ms_lock);
+	space_map_truncate(sm, tx);
+	mutex_enter(&msp->ms_lock);
+
+	/*
+	 * While we would ideally like to create a space map representation
+	 * that consists only of allocation records, doing so can be
+	 * prohibitively expensive because the in-core free tree can be
+	 * large, and therefore computationally expensive to subtract
+	 * from the condense_tree. Instead we sync out two trees, a cheap
+	 * allocation only tree followed by the in-core free tree. While not
+	 * optimal, this is typically close to optimal, and much cheaper to
+	 * compute.
+	 */
+	space_map_write(sm, condense_tree, SM_ALLOC, tx);
+	range_tree_vacate(condense_tree, NULL, NULL);
+	range_tree_destroy(condense_tree);
+
+	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
+	msp->ms_condensing = B_FALSE;
+}
+
+/*
  * Write a metaslab to disk in the context of the specified transaction group.
  */
 void
 metaslab_sync(metaslab_t *msp, uint64_t txg)
 {
-	vdev_t *vd = msp->ms_group->mg_vd;
+	metaslab_group_t *mg = msp->ms_group;
+	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
-	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
-	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
-	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
-	space_map_t *sm = &msp->ms_map;
-	space_map_obj_t *smo = &msp->ms_smo_syncing;
-	dmu_buf_t *db;
+	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
+	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
+	range_tree_t **freed_tree =
+	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
 	dmu_tx_t *tx;
+	uint64_t object = space_map_object(msp->ms_sm);
 
 	ASSERT(!vd->vdev_ishole);
 
-	if (allocmap->sm_space == 0 && freemap->sm_space == 0)
+	/*
+	 * This metaslab has just been added so there's no work to do now.
+	 */
+	if (*freetree == NULL) {
+		ASSERT3P(alloctree, ==, NULL);
+		return;
+	}
+
+	ASSERT3P(alloctree, !=, NULL);
+	ASSERT3P(*freetree, !=, NULL);
+	ASSERT3P(*freed_tree, !=, NULL);
+
+	/*
+	 * Normally, we don't want to process a metaslab if there
+	 * are no allocations or frees to perform. However, if the metaslab
+	 * is being forced to condense we need to let it through.
+	 */
+	if (range_tree_space(alloctree) == 0 &&
+	    range_tree_space(*freetree) == 0 &&
+	    !msp->ms_condense_wanted)
 		return;
 
 	/*
 	 * The only state that can actually be changing concurrently with
-	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
-	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
-	 * Therefore, we only hold ms_lock to satify space_map ASSERTs.
-	 * We drop it whenever we call into the DMU, because the DMU
-	 * can call down to us (e.g. via zio_free()) at any time.
+	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
+	 * be modifying this txg's alloctree, freetree, freed_tree, or
+	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
+	 * space map ASSERTs. We drop it whenever we call into the DMU,
+	 * because the DMU can call down to us (e.g. via zio_free()) at
+	 * any time.
 	 */
 
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
-	if (smo->smo_object == 0) {
-		ASSERT(smo->smo_objsize == 0);
-		ASSERT(smo->smo_alloc == 0);
-		smo->smo_object = dmu_object_alloc(mos,
-		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
-		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
-		ASSERT(smo->smo_object != 0);
-		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
-		    (sm->sm_start >> vd->vdev_ms_shift),
-		    sizeof (uint64_t), &smo->smo_object, tx);
+	if (msp->ms_sm == NULL) {
+		uint64_t new_object;
+
+		new_object = space_map_alloc(mos, tx);
+		VERIFY3U(new_object, !=, 0);
+
+		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
+		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
+		    &msp->ms_lock));
+		ASSERT(msp->ms_sm != NULL);
 	}
 
 	mutex_enter(&msp->ms_lock);
 
-	space_map_walk(freemap, space_map_add, freed_map);
+	/*
+	 * Note: metaslab_condense() clears the space map's histogram.
+	 * Therefore we must verify and remove this histogram before
+	 * condensing.
+	 */
+	metaslab_group_histogram_verify(mg);
+	metaslab_class_histogram_verify(mg->mg_class);
+	metaslab_group_histogram_remove(mg, msp);
+
+	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
+	    metaslab_should_condense(msp)) {
+		metaslab_condense(msp, txg, tx);
+	} else {
+		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
+		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
+	}
+
+	if (msp->ms_loaded) {
+		/*
+		 * When the space map is loaded, we have an accurate
+		 * histogram in the range tree. This gives us an opportunity
+		 * to bring the space map's histogram up-to-date so we clear
+		 * it first before updating it.
+		 */
+		space_map_histogram_clear(msp->ms_sm);
+		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
 
-	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
-	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
 		/*
-		 * The in-core space map representation is twice as compact
-		 * as the on-disk one, so it's time to condense the latter
-		 * by generating a pure allocmap from first principles.
-		 *
-		 * This metaslab is 100% allocated,
-		 * minus the content of the in-core map (sm),
-		 * minus what's been freed this txg (freed_map),
-		 * minus deferred frees (ms_defermap[]),
-		 * minus allocations from txgs in the future
-		 * (because they haven't been committed yet).
-		 */
-		space_map_vacate(allocmap, NULL, NULL);
-		space_map_vacate(freemap, NULL, NULL);
-
-		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);
-
-		space_map_walk(sm, space_map_remove, allocmap);
-		space_map_walk(freed_map, space_map_remove, allocmap);
-
-		for (int t = 0; t < TXG_DEFER_SIZE; t++)
-			space_map_walk(&msp->ms_defermap[t],
-			    space_map_remove, allocmap);
-
-		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
-			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
-			    space_map_remove, allocmap);
+		 * Since we've cleared the histogram we need to add back
+		 * any free space that has already been processed, plus
+		 * any deferred space. This allows the on-disk histogram
+		 * to accurately reflect all free space even if some space
+		 * is not yet available for allocation (i.e. deferred).
+		 */
+		space_map_histogram_add(msp->ms_sm, *freed_tree, tx);
 
-		mutex_exit(&msp->ms_lock);
-		space_map_truncate(smo, mos, tx);
-		mutex_enter(&msp->ms_lock);
+		/*
+		 * Add back any deferred free space that has not been
+		 * added back into the in-core free tree yet. This will
+		 * ensure that we don't end up with a space map histogram
+		 * that is completely empty unless the metaslab is fully
+		 * allocated.
+		 */
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			space_map_histogram_add(msp->ms_sm,
+			    msp->ms_defertree[t], tx);
+		}
+	}
+
+	/*
+	 * Always add the free space from this sync pass to the space
+	 * map histogram. We want to make sure that the on-disk histogram
+	 * accounts for all free space. If the space map is not loaded,
+	 * then we will lose some accuracy but will correct it the next
+	 * time we load the space map.
+	 */
+	space_map_histogram_add(msp->ms_sm, *freetree, tx);
+
+	metaslab_group_histogram_add(mg, msp);
+	metaslab_group_histogram_verify(mg);
+	metaslab_class_histogram_verify(mg->mg_class);
+
+	/*
+	 * For sync pass 1, we avoid traversing this txg's free range tree
+	 * and instead will just swap the pointers for freetree and
+	 * freed_tree. We can safely do this since the freed_tree is
+	 * guaranteed to be empty on the initial pass.
+	 */
+	if (spa_sync_pass(spa) == 1) {
+		range_tree_swap(freetree, freed_tree);
+	} else {
+		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
 	}
+	range_tree_vacate(alloctree, NULL, NULL);
 
-	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
-	space_map_sync(freemap, SM_FREE, smo, mos, tx);
+	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
+	ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK]));
+	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
 
 	mutex_exit(&msp->ms_lock);
 
-	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
-	dmu_buf_will_dirty(db, tx);
-	ASSERT3U(db->db_size, >=, sizeof (*smo));
-	bcopy(smo, db->db_data, sizeof (*smo));
-	dmu_buf_rele(db, FTAG);
-
+	if (object != space_map_object(msp->ms_sm)) {
+		object = space_map_object(msp->ms_sm);
+		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
+		    msp->ms_id, sizeof (uint64_t), &object, tx);
+	}
 	dmu_tx_commit(tx);
 }
 
@@ -1011,14 +2412,13 @@ metaslab_sync(metaslab_t *msp, uint64_t 
 void
 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 {
-	space_map_obj_t *smo = &msp->ms_smo;
-	space_map_obj_t *smosync = &msp->ms_smo_syncing;
-	space_map_t *sm = &msp->ms_map;
-	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
-	space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
+	spa_t *spa = vd->vdev_spa;
+	range_tree_t **freed_tree;
+	range_tree_t **defer_tree;
 	int64_t alloc_delta, defer_delta;
+	boolean_t defer_allowed = B_TRUE;
 
 	ASSERT(!vd->vdev_ishole);
 
@@ -1026,46 +2426,79 @@ metaslab_sync_done(metaslab_t *msp, uint
 
 	/*
 	 * If this metaslab is just becoming available, initialize its
-	 * allocmaps and freemaps and add its capacity to the vdev.
+	 * alloctrees, freetrees, and defertree and add its capacity to
+	 * the vdev.
 	 */
-	if (freed_map->sm_size == 0) {
+	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
 		for (int t = 0; t < TXG_SIZE; t++) {
-			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
-			    sm->sm_size, sm->sm_shift, sm->sm_lock);
-			space_map_create(&msp->ms_freemap[t], sm->sm_start,
-			    sm->sm_size, sm->sm_shift, sm->sm_lock);
+			ASSERT(msp->ms_alloctree[t] == NULL);
+			ASSERT(msp->ms_freetree[t] == NULL);
+
+			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
+			    &msp->ms_lock);
+			msp->ms_freetree[t] = range_tree_create(NULL, msp,
+			    &msp->ms_lock);
+		}
+
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			ASSERT(msp->ms_defertree[t] == NULL);
+
+			msp->ms_defertree[t] = range_tree_create(NULL, msp,
+			    &msp->ms_lock);
 		}
 
-		for (int t = 0; t < TXG_DEFER_SIZE; t++)
-			space_map_create(&msp->ms_defermap[t], sm->sm_start,
-			    sm->sm_size, sm->sm_shift, sm->sm_lock);
+		vdev_space_update(vd, 0, 0, msp->ms_size);
+	}
+
+	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
+	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
 
-		vdev_space_update(vd, 0, 0, sm->sm_size);
+	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
+	    metaslab_class_get_alloc(spa_normal_class(spa));
+	if (free_space <= spa_get_slop_space(spa)) {
+		defer_allowed = B_FALSE;
 	}
 
-	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
-	defer_delta = freed_map->sm_space - defer_map->sm_space;
+	defer_delta = 0;
+	alloc_delta = space_map_alloc_delta(msp->ms_sm);
+	if (defer_allowed) {
+		defer_delta = range_tree_space(*freed_tree) -
+		    range_tree_space(*defer_tree);
+	} else {
+		defer_delta -= range_tree_space(*defer_tree);
+	}
 
 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
 
-	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
-	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
+	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
+	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
 
 	/*
-	 * If there's a space_map_load() in progress, wait for it to complete
+	 * If there's a metaslab_load() in progress, wait for it to complete
 	 * so that we have a consistent view of the in-core space map.
-	 * Then, add defer_map (oldest deferred frees) to this map and
-	 * transfer freed_map (this txg's frees) to defer_map.
 	 */
-	space_map_load_wait(sm);
-	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
-	space_map_vacate(freed_map, space_map_add, defer_map);
+	metaslab_load_wait(msp);
+
+	/*
+	 * Move the frees from the defer_tree back to the free
+	 * range tree (if it's loaded). Swap the freed_tree and the
+	 * defer_tree -- this is safe to do because we've just emptied out
+	 * the defer_tree.
+	 */
+	range_tree_vacate(*defer_tree,
+	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
+	if (defer_allowed) {
+		range_tree_swap(freed_tree, defer_tree);
+	} else {
+		range_tree_vacate(*freed_tree,
+		    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
+	}
 
-	*smo = *smosync;
+	space_map_update(msp->ms_sm);
 
 	msp->ms_deferspace += defer_delta;
 	ASSERT3S(msp->ms_deferspace, >=, 0);
-	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
+	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
 	if (msp->ms_deferspace != 0) {
 		/*
 		 * Keep syncing this metaslab until all deferred frees
@@ -1075,22 +2508,25 @@ metaslab_sync_done(metaslab_t *msp, uint
 	}
 
 	/*
-	 * If the map is loaded but no longer active, evict it as soon as all
-	 * future allocations have synced.  (If we unloaded it now and then
-	 * loaded a moment later, the map wouldn't reflect those allocations.)
-	 */
-	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-		int evictable = 1;
-
-		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
-			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
-				evictable = 0;
+	 * Calculate the new weights before unloading any metaslabs.
+	 * This will give us the most accurate weighting.
+	 */
+	metaslab_group_sort(mg, msp, metaslab_weight(msp));
 
-		if (evictable && !metaslab_debug)
-			space_map_unload(sm);
-	}
+	/*
+	 * If the metaslab is loaded and we've not tried to load or allocate
+	 * from it in 'metaslab_unload_delay' txgs, then unload it.
+	 */
+	if (msp->ms_loaded &&
+	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
+		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+			VERIFY0(range_tree_space(
+			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
+		}
 
-	metaslab_group_sort(mg, msp, metaslab_weight(msp));
+		if (!metaslab_debug_unload)
+			metaslab_unload(msp);
+	}
 
 	mutex_exit(&msp->ms_lock);
 }
@@ -1098,27 +2534,13 @@ metaslab_sync_done(metaslab_t *msp, uint
 void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
-	vdev_t *vd = mg->mg_vd;
-
-	/*
-	 * Re-evaluate all metaslabs which have lower offsets than the
-	 * bonus area.
-	 */
-	for (int m = 0; m < vd->vdev_ms_count; m++) {
-		metaslab_t *msp = vd->vdev_ms[m];
-
-		if (msp->ms_map.sm_start > mg->mg_bonus_area)
-			break;
-
-		mutex_enter(&msp->ms_lock);
-		metaslab_group_sort(mg, msp, metaslab_weight(msp));
-		mutex_exit(&msp->ms_lock);
-	}
+	metaslab_group_alloc_update(mg);
+	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 
 	/*
-	 * Prefetch the next potential metaslabs
+	 * Preload the next potential metaslabs
 	 */
-	metaslab_prefetch(mg);
+	metaslab_group_preload(mg);
 }
 
 static uint64_t
@@ -1126,7 +2548,7 @@ metaslab_distance(metaslab_t *msp, dva_t
 {
 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
-	uint64_t start = msp->ms_map.sm_start >> ms_shift;
+	uint64_t start = msp->ms_id;
 
 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
 		return (1ULL << 63);
@@ -1138,13 +2560,205 @@ metaslab_distance(metaslab_t *msp, dva_t
 	return (0);
 }
 
+/*
+ * ==========================================================================
+ * Metaslab allocation tracing facility
+ * ==========================================================================
+ */
+kstat_t *metaslab_trace_ksp;
+kstat_named_t metaslab_trace_over_limit;
+
+void
+metaslab_alloc_trace_init(void)
+{
+	ASSERT(metaslab_alloc_trace_cache == NULL);
+	metaslab_alloc_trace_cache = kmem_cache_create(
+	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
+	    "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
+	if (metaslab_trace_ksp != NULL) {
+		metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
+		kstat_named_init(&metaslab_trace_over_limit,
+		    "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
+		kstat_install(metaslab_trace_ksp);
+	}
+}
+
+void
+metaslab_alloc_trace_fini(void)
+{
+	if (metaslab_trace_ksp != NULL) {
+		kstat_delete(metaslab_trace_ksp);
+		metaslab_trace_ksp = NULL;
+	}
+	kmem_cache_destroy(metaslab_alloc_trace_cache);
+	metaslab_alloc_trace_cache = NULL;
+}
+
+/*
+ * Add an allocation trace element to the allocation tracing list.
+ */
+static void
+metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
+    metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
+{
+	if (!metaslab_trace_enabled)
+		return;
+
+	/*
+	 * When the tracing list reaches its maximum we remove
+	 * the second element in the list before adding a new one.
+	 * By removing the second element we preserve the original
+	 * entry as a clue to what allocation steps have already been
+	 * performed.
+	 */
+	if (zal->zal_size == metaslab_trace_max_entries) {
+		metaslab_alloc_trace_t *mat_next;
+#ifdef DEBUG
+		panic("too many entries in allocation list");
+#endif
+		atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
+		zal->zal_size--;
+		mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
+		list_remove(&zal->zal_list, mat_next);
+		kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
+	}
+
+	metaslab_alloc_trace_t *mat =
+	    kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
+	list_link_init(&mat->mat_list_node);
+	mat->mat_mg = mg;
+	mat->mat_msp = msp;
+	mat->mat_size = psize;
+	mat->mat_dva_id = dva_id;
+	mat->mat_offset = offset;
+	mat->mat_weight = 0;
+
+	if (msp != NULL)
+		mat->mat_weight = msp->ms_weight;
+
+	/*
+	 * The list is part of the zio so locking is not required. Only
+	 * a single thread will perform allocations for a given zio.
+	 */
+	list_insert_tail(&zal->zal_list, mat);
+	zal->zal_size++;
+
+	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
+}
+
+void
+metaslab_trace_init(zio_alloc_list_t *zal)
+{
+	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
+	    offsetof(metaslab_alloc_trace_t, mat_list_node));
+	zal->zal_size = 0;
+}
+
+void
+metaslab_trace_fini(zio_alloc_list_t *zal)
+{
+	metaslab_alloc_trace_t *mat;
+
+	while ((mat = list_remove_head(&zal->zal_list)) != NULL)
+		kmem_cache_free(metaslab_alloc_trace_cache, mat);
+	list_destroy(&zal->zal_list);
+	zal->zal_size = 0;
+}
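
The intended call pattern, sketched from the consumer's side (the
surrounding zio plumbing is elided, so treat the shape as illustrative
rather than as the exact pipeline code):

	zio_alloc_list_t zal;
	int error;

	metaslab_trace_init(&zal);
	error = metaslab_alloc(spa, mc, psize, bp, ndvas, txg, NULL,
	    flags, &zal, zio);
	if (error != 0) {
		/* zal.zal_list records each group/metaslab attempted */
	}
	metaslab_trace_fini(&zal);
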
+
+/*
+ * ==========================================================================
+ * Metaslab block operations
+ * ==========================================================================
+ */
+
+static void
+metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
+{
+	if (!(flags & METASLAB_ASYNC_ALLOC) ||
+	    flags & METASLAB_DONT_THROTTLE)
+		return;
+
+	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+	if (!mg->mg_class->mc_alloc_throttle_enabled)
+		return;
+
+	(void) refcount_add(&mg->mg_alloc_queue_depth, tag);
+}
+
+void
+metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
+{
+	if (!(flags & METASLAB_ASYNC_ALLOC) ||
+	    flags & METASLAB_DONT_THROTTLE)
+		return;
+
+	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+	if (!mg->mg_class->mc_alloc_throttle_enabled)
+		return;
+
+	(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
+}
+
+void
+metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
+{
+#ifdef ZFS_DEBUG
+	const dva_t *dva = bp->blk_dva;
+	int ndvas = BP_GET_NDVAS(bp);
+
+	for (int d = 0; d < ndvas; d++) {
+		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
+		metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+		VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
+	}
+#endif
+}
+
+static uint64_t
+metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
+{
+	uint64_t start;
+	range_tree_t *rt = msp->ms_tree;
+	metaslab_class_t *mc = msp->ms_group->mg_class;
+
+	VERIFY(!msp->ms_condensing);
+
+	start = mc->mc_ops->msop_alloc(msp, size);
+	if (start != -1ULL) {
+		metaslab_group_t *mg = msp->ms_group;
+		vdev_t *vd = mg->mg_vd;
+
+		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
+		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
+		range_tree_remove(rt, start, size);
+
+		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
+			vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
+
+		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size);
+
+		/* Track the last successful allocation */
+		msp->ms_alloc_txg = txg;
+		metaslab_verify_space(msp, txg);
+	}
+
+	/*
+	 * Now that we've attempted the allocation we need to update the
+	 * metaslab's maximum block size since it may have changed.
+	 */
+	msp->ms_max_size = metaslab_block_maxsize(msp);
+	return (start);
+}
+
 static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
-    uint64_t min_distance, dva_t *dva, int d)
+metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
+    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
 {
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
-	avl_tree_t *t = &mg->mg_metaslab_tree;
 	uint64_t activation_weight;
 	uint64_t target_distance;
 	int i;
@@ -1157,33 +2771,70 @@ metaslab_group_alloc(metaslab_group_t *m
 		}
 	}
 
+	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
+	search->ms_weight = UINT64_MAX;
+	search->ms_start = 0;
 	for (;;) {
 		boolean_t was_active;
+		avl_tree_t *t = &mg->mg_metaslab_tree;
+		avl_index_t idx;
 
 		mutex_enter(&mg->mg_lock);
-		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
-			if (msp->ms_weight < size) {
-				mutex_exit(&mg->mg_lock);
-				return (-1ULL);
+
+		/*
+		 * Find the metaslab with the highest weight that is less
+		 * than what we've already tried.  In the common case, this
+		 * means that we will examine each metaslab at most once.
+		 * Note that concurrent callers could reorder metaslabs
+		 * by activation/passivation once we have dropped the mg_lock.
+		 * If a metaslab is activated by another thread, and we fail
+		 * to allocate from the metaslab we have selected, we may
+		 * not try the newly-activated metaslab, and instead activate
+		 * another metaslab.  This is not optimal, but generally
+		 * does not cause any problems (a possible exception being
+		 * if every metaslab is completely full except for the
+		 * newly-activated metaslab which we fail to examine).
+		 */
+		msp = avl_find(t, search, &idx);
+		if (msp == NULL)
+			msp = avl_nearest(t, idx, AVL_AFTER);
+		for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
+
+			if (!metaslab_should_allocate(msp, asize)) {
+				metaslab_trace_add(zal, mg, msp, asize, d,
+				    TRACE_TOO_SMALL);
+				continue;
 			}
 
+			/*
+			 * If the selected metaslab is condensing, skip it.
+			 */
+			if (msp->ms_condensing)
+				continue;
+
 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
 				break;
 
 			target_distance = min_distance +
-			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
+			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
+			    min_distance >> 1);
 
-			for (i = 0; i < d; i++)
+			for (i = 0; i < d; i++) {
 				if (metaslab_distance(msp, &dva[i]) <
 				    target_distance)
 					break;
+			}
 			if (i == d)
 				break;
 		}
 		mutex_exit(&mg->mg_lock);
-		if (msp == NULL)
+		if (msp == NULL) {
+			kmem_free(search, sizeof (*search));
 			return (-1ULL);
+		}
+		search->ms_weight = msp->ms_weight;
+		search->ms_start = msp->ms_start + 1;
 
 		mutex_enter(&msp->ms_lock);
 
@@ -1191,11 +2842,11 @@ metaslab_group_alloc(metaslab_group_t *m
 		 * Ensure that the metaslab we have selected is still
 		 * capable of handling our request. It's possible that
 		 * another thread may have changed the weight while we
-		 * were blocked on the metaslab lock.
+		 * were blocked on the metaslab lock. We check the
+		 * active status first to see if we need to reselect
+		 * a new metaslab.
 		 */
-		if (msp->ms_weight < size || (was_active &&
-		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
-		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
+		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
@@ -1208,53 +2859,157 @@ metaslab_group_alloc(metaslab_group_t *m
 			continue;
 		}
 
-		if (metaslab_activate(msp, activation_weight, size) != 0) {
+		if (metaslab_activate(msp, activation_weight) != 0) {
+			mutex_exit(&msp->ms_lock);
+			continue;
+		}
+		msp->ms_selected_txg = txg;
+
+		/*
+		 * Now that we have the lock, recheck to see if we should
+		 * continue to use this metaslab for this allocation. The
+		 * the metaslab is now loaded so metaslab_should_allocate() can
+		 * accurately determine if the allocation attempt should
+		 * proceed.
+		 */
+		if (!metaslab_should_allocate(msp, asize)) {
+			/* Passivate this metaslab and select a new one. */
+			metaslab_trace_add(zal, mg, msp, asize, d,
+			    TRACE_TOO_SMALL);
+			goto next;
+		}
+
+		/*
+		 * If this metaslab is currently condensing then pick again as
+		 * we can't manipulate this metaslab until it's committed
+		 * to disk.
+		 */
+		if (msp->ms_condensing) {
+			metaslab_trace_add(zal, mg, msp, asize, d,
+			    TRACE_CONDENSING);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
-		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+		offset = metaslab_block_alloc(msp, asize, txg);
+		metaslab_trace_add(zal, mg, msp, asize, d, offset);
+
+		if (offset != -1ULL) {
+			/* Proactively passivate the metaslab, if needed */
+			metaslab_segment_may_passivate(msp);
 			break;
+		}
+next:
+		ASSERT(msp->ms_loaded);
 
-		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
+		/*
+		 * We were unable to allocate from this metaslab so determine
+		 * a new weight for this metaslab. Now that we have loaded
+		 * the metaslab we can provide a better hint to the metaslab
+		 * selector.
+		 *
+		 * For space-based metaslabs, we use the maximum block size.
+		 * This information is only available when the metaslab
+		 * is loaded and is more accurate than the generic free
+		 * space weight that was calculated by metaslab_weight().
+		 * This information allows us to quickly compare the maximum
+		 * available allocation in the metaslab to the allocation
+		 * size being requested.
+		 *
+		 * For segment-based metaslabs, determine the new weight
+		 * based on the highest bucket in the range tree. We
+		 * explicitly use the loaded segment weight (i.e. the range
+		 * tree histogram) since it contains the space that is
+		 * currently available for allocation and is accurate
+		 * even within a sync pass.
+		 */
+		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
+			uint64_t weight = metaslab_block_maxsize(msp);
+			WEIGHT_SET_SPACEBASED(weight);
+			metaslab_passivate(msp, weight);
+		} else {
+			metaslab_passivate(msp,
+			    metaslab_weight_from_range_tree(msp));
+		}
 
+		/*
+		 * We have just failed an allocation attempt; check
+		 * that metaslab_should_allocate() agrees. Otherwise,
+		 * we may end up in an infinite loop retrying the same
+		 * metaslab.
+		 */
+		ASSERT(!metaslab_should_allocate(msp, asize));
 		mutex_exit(&msp->ms_lock);
 	}
+	mutex_exit(&msp->ms_lock);
+	kmem_free(search, sizeof (*search));
+	return (offset);
+}
 
-	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
-		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
-
-	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+static uint64_t
+metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
+    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
+{
+	uint64_t offset;
+	ASSERT(mg->mg_initialized);
 
-	mutex_exit(&msp->ms_lock);
+	offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
+	    min_distance, dva, d);
 
+	mutex_enter(&mg->mg_lock);
+	if (offset == -1ULL) {
+		mg->mg_failed_allocations++;
+		metaslab_trace_add(zal, mg, NULL, asize, d,
+		    TRACE_GROUP_FAILURE);
+		if (asize == SPA_GANGBLOCKSIZE) {
+			/*
+			 * This metaslab group was unable to allocate
+			 * the minimum gang block size so it must be out of
+			 * space. We must notify the allocation throttle
+			 * to start skipping allocation attempts to this
+			 * metaslab group until more space becomes available.
+			 * Note: this failure cannot be caused by the
+			 * allocation throttle since the allocation throttle
+			 * is only responsible for skipping devices and
+			 * not failing block allocations.
+			 */
+			mg->mg_no_free_space = B_TRUE;
+		}
+	}
+	mg->mg_allocations++;
+	mutex_exit(&mg->mg_lock);
 	return (offset);
 }
 
 /*
+ * If we have to write a ditto block (i.e. more than one DVA for a given BP)
+ * on the same vdev as an existing DVA of this BP, then try to allocate it
+ * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the
+ * existing DVAs.
+ */
+int ditto_same_vdev_distance_shift = 3;
+
+/*
  * Allocate a block for the specified i/o.
  */
 static int
 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
-    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
+    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
+    zio_alloc_list_t *zal)
 {
 	metaslab_group_t *mg, *rotor;
 	vdev_t *vd;
-	int dshift = 3;
-	int all_zero;
-	int zio_lock = B_FALSE;
-	boolean_t allocatable;
-	uint64_t offset = -1ULL;
-	uint64_t asize;
-	uint64_t distance;
+	boolean_t try_hard = B_FALSE;
 
 	ASSERT(!DVA_IS_VALID(&dva[d]));
 
 	/*
 	 * For testing, make some blocks above a certain size be gang blocks.
 	 */
-	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
-		return (ENOSPC);
+	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) {
+		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
+		return (SET_ERROR(ENOSPC));
+	}
 
 	/*
 	 * Start at the rotor and loop through all mgs until we find something.
@@ -1311,47 +3066,78 @@ metaslab_alloc_dva(spa_t *spa, metaslab_
 
 	rotor = mg;
 top:
-	all_zero = B_TRUE;
 	do {
-		ASSERT(mg->mg_activation_count == 1);
+		boolean_t allocatable;
 
+		ASSERT(mg->mg_activation_count == 1);
 		vd = mg->mg_vd;
 
 		/*
 		 * Don't allocate from faulted devices.
 		 */
-		if (zio_lock) {
+		if (try_hard) {
 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
 			allocatable = vdev_allocatable(vd);
 			spa_config_exit(spa, SCL_ZIO, FTAG);
 		} else {
 			allocatable = vdev_allocatable(vd);
 		}
-		if (!allocatable)
+
+		/*
+		 * Determine if the selected metaslab group is eligible
+		 * for allocations. If we're ganging then don't allow
+		 * this metaslab group to skip allocations since that would
+		 * inadvertently return ENOSPC and suspend the pool
+		 * even though space is still available.
+		 */
+		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
+			allocatable = metaslab_group_allocatable(mg, rotor,
+			    psize);
+		}
+
+		if (!allocatable) {
+			metaslab_trace_add(zal, mg, NULL, psize, d,
+			    TRACE_NOT_ALLOCATABLE);
 			goto next;
+		}
+
+		ASSERT(mg->mg_initialized);
 
 		/*
-		 * Avoid writing single-copy data to a failing vdev
+		 * Avoid writing single-copy data to a failing,
+		 * non-redundant vdev, unless we've already tried all
+		 * other vdevs.
 		 */
 		if ((vd->vdev_stat.vs_write_errors > 0 ||
 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
-		    d == 0 && dshift == 3) {
-			all_zero = B_FALSE;
+		    d == 0 && !try_hard && vd->vdev_children == 0) {
+			metaslab_trace_add(zal, mg, NULL, psize, d,
+			    TRACE_VDEV_ERROR);
 			goto next;
 		}
 
 		ASSERT(mg->mg_class == mc);
 
-		distance = vd->vdev_asize >> dshift;
-		if (distance <= (1ULL << vd->vdev_ms_shift))
-			distance = 0;
-		else
-			all_zero = B_FALSE;
+		/*
+		 * If we don't need to try hard, then require that the
+		 * block be 1/8th of the device away from any other DVAs
+		 * in this BP.  If we are trying hard, allow any offset
+		 * to be used (distance=0).
+		 */
+		uint64_t distance = 0;
+		if (!try_hard) {
+			distance = vd->vdev_asize >>
+			    ditto_same_vdev_distance_shift;
+			if (distance <= (1ULL << vd->vdev_ms_shift))
+				distance = 0;
+		}
 
-		asize = vdev_psize_to_asize(vd, psize);
+		uint64_t asize = vdev_psize_to_asize(vd, psize);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
-		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
+		    distance, dva, d);
+
 		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
@@ -1359,22 +3145,30 @@ top:
 			 * over- or under-used relative to the pool,
 			 * and set an allocation bias to even it out.
 			 */
-			if (mc->mc_aliquot == 0) {
+			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
 				vdev_stat_t *vs = &vd->vdev_stat;
 				int64_t vu, cu;
 
-				/*
-				 * Determine percent used in units of 0..1024.
-				 * (This is just to avoid floating point.)
-				 */
-				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
-				cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
+				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
+				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
 
 				/*
-				 * Bias by at most +/- 25% of the aliquot.
+				 * Calculate how much more or less we should
+				 * try to allocate from this device during
+				 * this iteration around the rotor.
+				 * For example, if a device is 80% full
+				 * and the pool is 20% full then we should
+				 * reduce allocations by 60% on this device.
+				 *
+				 * mg_bias = (20 - 80) * 512K / 100 = -307K
+				 *
+				 * This reduces allocations by 307K for this
+				 * iteration.
 				 */
 				mg->mg_bias = ((cu - vu) *
-				    (int64_t)mg->mg_aliquot) / (1024 * 4);
+				    (int64_t)mg->mg_aliquot) / 100;
+			} else if (!metaslab_bias_enabled) {
+				mg->mg_bias = 0;
 			}
 
 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
@@ -1395,21 +3189,18 @@ next:
 		mc->mc_aliquot = 0;
 	} while ((mg = mg->mg_next) != rotor);
 
-	if (!all_zero) {
-		dshift++;
-		ASSERT(dshift < 64);
-		goto top;
-	}
-
-	if (!allocatable && !zio_lock) {
-		dshift = 3;
-		zio_lock = B_TRUE;
+	/*
+	 * If we haven't tried hard, do so now.
+	 */
+	if (!try_hard) {
+		try_hard = B_TRUE;
 		goto top;
 	}
 
 	bzero(&dva[d], sizeof (dva_t));
 
-	return (ENOSPC);
+	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
+	return (SET_ERROR(ENOSPC));
 }
 
 /*
@@ -1446,13 +3237,23 @@ metaslab_free_dva(spa_t *spa, const dva_
 	mutex_enter(&msp->ms_lock);
 
 	if (now) {
-		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
+		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
 		    offset, size);
-		space_map_free(&msp->ms_map, offset, size);
+
+		VERIFY(!msp->ms_condensing);
+		VERIFY3U(offset, >=, msp->ms_start);
+		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
+		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
+		    msp->ms_size);
+		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+		range_tree_add(msp->ms_tree, offset, size);
+		msp->ms_max_size = metaslab_block_maxsize(msp);
 	} else {
-		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
+		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
-		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
+		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
+		    offset, size);
 	}
 
 	mutex_exit(&msp->ms_lock);
@@ -1478,7 +3279,7 @@ metaslab_claim_dva(spa_t *spa, const dva
 
 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
-		return (ENXIO);
+		return (SET_ERROR(ENXIO));
 
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
@@ -1487,23 +3288,27 @@ metaslab_claim_dva(spa_t *spa, const dva
 
 	mutex_enter(&msp->ms_lock);
 
-	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
-		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
+		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 
-	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
-		error = ENOENT;
+	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
+		error = SET_ERROR(ENOENT);
 
 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
 		mutex_exit(&msp->ms_lock);
 		return (error);
 	}
 
-	space_map_claim(&msp->ms_map, offset, size);
+	VERIFY(!msp->ms_condensing);
+	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
+	range_tree_remove(msp->ms_tree, offset, size);
 
 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
-		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
+		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
-		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
 	}
 
 	mutex_exit(&msp->ms_lock);
@@ -1511,9 +3316,58 @@ metaslab_claim_dva(spa_t *spa, const dva
 	return (0);
 }
 
+/*
+ * Reserve some allocation slots. The reservation system must be called
+ * before we call into the allocator. If there aren't any available slots
+ * then the I/O will be throttled until an I/O completes and its slots are
+ * freed up. The function returns true if it was successful in placing
+ * the reservation.
+ */
+boolean_t
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
+    int flags)
+{
+	uint64_t available_slots = 0;
+	boolean_t slot_reserved = B_FALSE;
+
+	ASSERT(mc->mc_alloc_throttle_enabled);
+	mutex_enter(&mc->mc_lock);
+
+	uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
+	if (reserved_slots < mc->mc_alloc_max_slots)
+		available_slots = mc->mc_alloc_max_slots - reserved_slots;
+
+	if (slots <= available_slots || GANG_ALLOCATION(flags)) {
+		/*
+		 * We reserve the slots individually so that we can unreserve
+		 * them individually when an I/O completes.
+		 */
+		for (int d = 0; d < slots; d++) {
+			reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
+		}
+		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
+		slot_reserved = B_TRUE;
+	}
+
+	mutex_exit(&mc->mc_lock);
+	return (slot_reserved);
+}
+
+void
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
+{
+	ASSERT(mc->mc_alloc_throttle_enabled);
+	mutex_enter(&mc->mc_lock);
+	for (int d = 0; d < slots; d++) {
+		(void) refcount_remove(&mc->mc_alloc_slots, zio);
+	}
+	mutex_exit(&mc->mc_lock);
+}
+
 int
 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
-    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
+    int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
+    zio_alloc_list_t *zal, zio_t *zio)
 {
 	dva_t *dva = bp->blk_dva;
 	dva_t *hintdva = hintbp->blk_dva;
@@ -1526,24 +3380,35 @@ metaslab_alloc(spa_t *spa, metaslab_clas
 
 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
-		return (ENOSPC);
+		return (SET_ERROR(ENOSPC));
 	}
 
 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
 	ASSERT(BP_GET_NDVAS(bp) == 0);
 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
+	ASSERT3P(zal, !=, NULL);
 
 	for (int d = 0; d < ndvas; d++) {
 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
-		    txg, flags);
-		if (error) {
+		    txg, flags, zal);
+		if (error != 0) {
 			for (d--; d >= 0; d--) {
 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
+				metaslab_group_alloc_decrement(spa,
+				    DVA_GET_VDEV(&dva[d]), zio, flags);
 				bzero(&dva[d], sizeof (dva_t));
 			}
 			spa_config_exit(spa, SCL_ALLOC, FTAG);
 			return (error);
+		} else {
+			/*
+			 * Update the metaslab group's queue depth
+			 * based on the newly allocated dva.
+			 */
+			metaslab_group_alloc_increment(spa,
+			    DVA_GET_VDEV(&dva[d]), zio, flags);
 		}
+
 	}
 	ASSERT(error == 0);
 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
@@ -1602,3 +3467,28 @@ metaslab_claim(spa_t *spa, const blkptr_
 
 	return (error);
 }
+
+void
+metaslab_check_free(spa_t *spa, const blkptr_t *bp)
+{
+	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
+		return;
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
+		vdev_t *vd = vdev_lookup_top(spa, vdev);
+		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
+		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
+		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+		if (msp->ms_loaded)
+			range_tree_verify(msp->ms_tree, offset, size);
+
+		for (int j = 0; j < TXG_SIZE; j++)
+			range_tree_verify(msp->ms_freetree[j], offset, size);
+		for (int j = 0; j < TXG_DEFER_SIZE; j++)
+			range_tree_verify(msp->ms_defertree[j], offset, size);
+	}
+	spa_config_exit(spa, SCL_VDEV, FTAG);
+}
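
[The throttle introduced above is designed to bracket each allocating I/O: reserve one slot per DVA before entering the allocator, and give the slots back when the allocation fails or when the I/O completes. A minimal sketch of that pairing follows; the caller name and the EAGAIN return are illustrative only (the real consumer requeues the throttled I/O instead), and it assumes the metaslab_trace_init()/metaslab_trace_fini() helpers that accompany this change.

#include <sys/zfs_context.h>
#include <sys/metaslab.h>
#include <sys/zio.h>

static int
example_throttled_alloc(spa_t *spa, metaslab_class_t *mc, zio_t *zio,
    blkptr_t *bp, uint64_t psize, int ndvas, uint64_t txg)
{
	zio_alloc_list_t zal;
	int error;

	/* One reservation slot per DVA we intend to allocate. */
	if (!metaslab_class_throttle_reserve(mc, ndvas, zio, 0))
		return (SET_ERROR(EAGAIN));	/* illustrative only */

	metaslab_trace_init(&zal);
	error = metaslab_alloc(spa, mc, psize, bp, ndvas, txg,
	    NULL, 0, &zal, zio);
	metaslab_trace_fini(&zal);

	/* On failure, hand the slots back immediately. */
	if (error != 0)
		metaslab_class_throttle_unreserve(mc, ndvas, zio);

	return (error);
}

On success the slots stay reserved; metaslab_class_throttle_unreserve() then runs from the I/O completion path, which is how completions pace new allocations.]
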
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/multilist.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/multilist.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/multilist.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/multilist.c	30 Aug 2015 02:17:54 -0000
@@ -0,0 +1,366 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/multilist.h>
+
+/* needed for spa_get_random() */
+#include <sys/spa.h>
+
+/*
+ * Given the object contained on the list, return a pointer to the
+ * object's multilist_node_t structure it contains.
+ */
+static multilist_node_t *
+multilist_d2l(multilist_t *ml, void *obj)
+{
+	return ((multilist_node_t *)((char *)obj + ml->ml_offset));
+}
+
+/*
+ * Initialize a new multilist using the parameters specified.
+ *
+ *  - 'size' denotes the size of the structure containing the
+ *     multilist_node_t.
+ *  - 'offset' denotes the byte offset of the multilist_node_t within
+ *     the structure that contains it.
+ *  - 'num' specifies the number of internal sublists to create.
+ *  - 'index_func' is used to determine which sublist to insert into
+ *     when the multilist_insert() function is called; as well as which
+ *     sublist to remove from when multilist_remove() is called. The
+ *     requirements this function must meet are the following:
+ *
+ *      - It must always return the same value when called on the same
+ *        object (to ensure the object is removed from the list it was
+ *        inserted into).
+ *
+ *      - It must return a value in the range [0, number of sublists).
+ *        The multilist_get_num_sublists() function may be used to
+ *        determine the number of sublists in the multilist.
+ *
+ *     Also, in order to reduce internal contention between the sublists
+ *     during insertion and removal, this function should choose evenly
+ *     between all available sublists when inserting. This isn't a hard
+ *     requirement, but a general rule of thumb in order to garner the
+ *     best multi-threaded performance out of the data structure.
+ */
+void
+multilist_create(multilist_t *ml, size_t size, size_t offset, unsigned int num,
+    multilist_sublist_index_func_t *index_func)
+{
+	ASSERT3P(ml, !=, NULL);
+	ASSERT3U(size, >, 0);
+	ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
+	ASSERT3U(num, >, 0);
+	ASSERT3P(index_func, !=, NULL);
+
+	ml->ml_offset = offset;
+	ml->ml_num_sublists = num;
+	ml->ml_index_func = index_func;
+
+	ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
+	    ml->ml_num_sublists, KM_SLEEP);
+
+	ASSERT3P(ml->ml_sublists, !=, NULL);
+
+	for (int i = 0; i < ml->ml_num_sublists; i++) {
+		multilist_sublist_t *mls = &ml->ml_sublists[i];
+		mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL);
+		list_create(&mls->mls_list, size, offset);
+	}
+}
+
+/*
+ * Destroy the given multilist object, and free up any memory it holds.
+ */
+void
+multilist_destroy(multilist_t *ml)
+{
+	ASSERT(multilist_is_empty(ml));
+
+	for (int i = 0; i < ml->ml_num_sublists; i++) {
+		multilist_sublist_t *mls = &ml->ml_sublists[i];
+
+		ASSERT(list_is_empty(&mls->mls_list));
+
+		list_destroy(&mls->mls_list);
+		mutex_destroy(&mls->mls_lock);
+	}
+
+	ASSERT3P(ml->ml_sublists, !=, NULL);
+	kmem_free(ml->ml_sublists,
+	    sizeof (multilist_sublist_t) * ml->ml_num_sublists);
+
+	ml->ml_num_sublists = 0;
+	ml->ml_offset = 0;
+}
+
+/*
+ * Insert the given object into the multilist.
+ *
+ * This function will insert the object specified into the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The sublist locks are automatically acquired if not already held, to
+ * ensure consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_insert(multilist_t *ml, void *obj)
+{
+	unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+	multilist_sublist_t *mls;
+	boolean_t need_lock;
+
+	DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
+	    unsigned int, sublist_idx, void *, obj);
+
+	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+	mls = &ml->ml_sublists[sublist_idx];
+
+	/*
+	 * Note: Callers may already hold the sublist lock by calling
+	 * multilist_sublist_lock().  Here we rely on MUTEX_HELD()
+	 * returning TRUE if and only if the current thread holds the
+	 * lock.  While it's a little ugly to make the lock recursive in
+	 * this way, it works and allows the calling code to be much
+	 * simpler -- otherwise it would have to pass around a flag
+	 * indicating that it already has the lock.
+	 */
+	need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+	if (need_lock)
+		mutex_enter(&mls->mls_lock);
+
+	ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
+
+	multilist_sublist_insert_head(mls, obj);
+
+	if (need_lock)
+		mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * Remove the given object from the multilist.
+ *
+ * This function will remove the object specified from the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The necessary sublist locks are automatically acquired, to ensure
+ * consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_remove(multilist_t *ml, void *obj)
+{
+	unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+	multilist_sublist_t *mls;
+	boolean_t need_lock;
+
+	DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
+	    unsigned int, sublist_idx, void *, obj);
+
+	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+	mls = &ml->ml_sublists[sublist_idx];
+	/* See comment in multilist_insert(). */
+	need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+	if (need_lock)
+		mutex_enter(&mls->mls_lock);
+
+	ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
+
+	multilist_sublist_remove(mls, obj);
+
+	if (need_lock)
+		mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * Check to see if this multilist object is empty.
+ *
+ * This will return TRUE if it finds all of the sublists of this
+ * multilist to be empty, and FALSE otherwise. Each sublist lock will be
+ * automatically acquired as necessary.
+ *
+ * If concurrent insertions and removals are occurring, the semantics
+ * of this function become a little fuzzy. Instead of locking all
+ * sublists for the duration of the call, each sublist is
+ * only locked as it is individually checked for emptiness. Thus, it's
+ * possible for this function to return TRUE with non-empty sublists at
+ * the time the function returns. This would be due to another thread
+ * inserting into a given sublist, after that specific sublist was checked
+ * and deemed empty, but before all sublists have been checked.
+ */
+int
+multilist_is_empty(multilist_t *ml)
+{
+	for (int i = 0; i < ml->ml_num_sublists; i++) {
+		multilist_sublist_t *mls = &ml->ml_sublists[i];
+		/* See comment in multilist_insert(). */
+		boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+		if (need_lock)
+			mutex_enter(&mls->mls_lock);
+
+		if (!list_is_empty(&mls->mls_list)) {
+			if (need_lock)
+				mutex_exit(&mls->mls_lock);
+
+			return (FALSE);
+		}
+
+		if (need_lock)
+			mutex_exit(&mls->mls_lock);
+	}
+
+	return (TRUE);
+}
+
+/* Return the number of sublists composing this multilist */
+unsigned int
+multilist_get_num_sublists(multilist_t *ml)
+{
+	return (ml->ml_num_sublists);
+}
+
+/* Return a randomly selected, valid sublist index for this multilist */
+unsigned int
+multilist_get_random_index(multilist_t *ml)
+{
+	return (spa_get_random(ml->ml_num_sublists));
+}
+
+/* Lock and return the sublist specified at the given index */
+multilist_sublist_t *
+multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+{
+	multilist_sublist_t *mls;
+
+	ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+	mls = &ml->ml_sublists[sublist_idx];
+	mutex_enter(&mls->mls_lock);
+
+	return (mls);
+}
+
+void
+multilist_sublist_unlock(multilist_sublist_t *mls)
+{
+	mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * We're allowing any object to be inserted into this specific sublist,
+ * but this can lead to trouble if multilist_remove() is called to
+ * remove this object. Specifically, if calling ml_index_func on this
+ * object returns an index for sublist different than what is passed as
+ * a parameter here, any call to multilist_remove() with this newly
+ * inserted object is undefined! (the call to multilist_remove() will
+ * remove the object from a list that it isn't contained in)
+ */
+void
+multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	list_insert_head(&mls->mls_list, obj);
+}
+
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	list_insert_tail(&mls->mls_list, obj);
+}
+
+/*
+ * Move the object one element forward in the list.
+ *
+ * This function will move the given object forward in the list (towards
+ * the head) by one object. So, in essence, it will swap its position in
+ * the list with its "prev" pointer. If the given object is already at the
+ * head of the list, it cannot be moved forward any more than it already
+ * is, so no action is taken.
+ *
+ * NOTE: This function **must not** remove any object from the list other
+ *       than the object given as the parameter. This is relied upon in
+ *       arc_evict_state_impl().
+ */
+void
+multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
+{
+	void *prev = list_prev(&mls->mls_list, obj);
+
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	ASSERT(!list_is_empty(&mls->mls_list));
+
+	/* 'obj' must be at the head of the list, nothing to do */
+	if (prev == NULL)
+		return;
+
+	list_remove(&mls->mls_list, obj);
+	list_insert_before(&mls->mls_list, prev, obj);
+}
+
+void
+multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	list_remove(&mls->mls_list, obj);
+}
+
+void *
+multilist_sublist_head(multilist_sublist_t *mls)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	return (list_head(&mls->mls_list));
+}
+
+void *
+multilist_sublist_tail(multilist_sublist_t *mls)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	return (list_tail(&mls->mls_list));
+}
+
+void *
+multilist_sublist_next(multilist_sublist_t *mls, void *obj)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	return (list_next(&mls->mls_list, obj));
+}
+
+void *
+multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
+{
+	ASSERT(MUTEX_HELD(&mls->mls_lock));
+	return (list_prev(&mls->mls_list, obj));
+}
+
+void
+multilist_link_init(multilist_node_t *link)
+{
+	list_link_init(link);
+}
+
+int
+multilist_link_active(multilist_node_t *link)
+{
+	return (list_link_active(link));
+}
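
[A minimal, hypothetical consumer of the API above, showing what an index function that satisfies the stated requirements looks like; the my_obj_t type and its contents are invented for illustration, only the multilist_* calls come from this file.

#include <sys/zfs_context.h>
#include <sys/multilist.h>

typedef struct my_obj {
	uint64_t		mo_id;
	multilist_node_t	mo_node;
} my_obj_t;

/* Stable per-object value in [0, number of sublists), as required. */
static unsigned int
my_obj_index_func(multilist_t *ml, void *obj)
{
	return (((my_obj_t *)obj)->mo_id %
	    multilist_get_num_sublists(ml));
}

static void
my_obj_example(void)
{
	multilist_t ml;
	my_obj_t obj = { .mo_id = 42 };

	multilist_create(&ml, sizeof (my_obj_t),
	    offsetof(my_obj_t, mo_node), 4, my_obj_index_func);
	multilist_link_init(&obj.mo_node);

	multilist_insert(&ml, &obj);	/* goes to sublist 42 % 4 == 2 */
	ASSERT(!multilist_is_empty(&ml));
	multilist_remove(&ml, &obj);	/* same index, so same sublist */

	multilist_destroy(&ml);
}

Because the index function is deterministic per object, insert and remove always compute the same sublist, which is the property multilist_remove() relies on to find the object again.]
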
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/range_tree.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/range_tree.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/range_tree.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/range_tree.c	1 Nov 2014 10:04:53 -0000
@@ -0,0 +1,411 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zio.h>
+#include <sys/range_tree.h>
+
+kmem_cache_t *range_seg_cache;
+
+void
+range_tree_init(void)
+{
+	ASSERT(range_seg_cache == NULL);
+	range_seg_cache = kmem_cache_create("range_seg_cache",
+	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+range_tree_fini(void)
+{
+	kmem_cache_destroy(range_seg_cache);
+	range_seg_cache = NULL;
+}
+
+void
+range_tree_stat_verify(range_tree_t *rt)
+{
+	range_seg_t *rs;
+	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
+	int i;
+
+	for (rs = avl_first(&rt->rt_root); rs != NULL;
+	    rs = AVL_NEXT(&rt->rt_root, rs)) {
+		uint64_t size = rs->rs_end - rs->rs_start;
+		int idx	= highbit64(size) - 1;
+
+		hist[idx]++;
+		ASSERT3U(hist[idx], !=, 0);
+	}
+
+	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+		if (hist[i] != rt->rt_histogram[i]) {
+			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
+			    i, hist, hist[i], rt->rt_histogram[i]);
+		}
+		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
+	}
+}
+
+static void
+range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
+{
+	uint64_t size = rs->rs_end - rs->rs_start;
+	int idx = highbit64(size) - 1;
+
+	ASSERT(size != 0);
+	ASSERT3U(idx, <,
+	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
+
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+	rt->rt_histogram[idx]++;
+	ASSERT3U(rt->rt_histogram[idx], !=, 0);
+}
+
+static void
+range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
+{
+	uint64_t size = rs->rs_end - rs->rs_start;
+	int idx = highbit64(size) - 1;
+
+	ASSERT(size != 0);
+	ASSERT3U(idx, <,
+	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
+
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+	ASSERT3U(rt->rt_histogram[idx], !=, 0);
+	rt->rt_histogram[idx]--;
+}
+
+/*
+ * NOTE: caller is responsible for all locking.
+ */
+static int
+range_tree_seg_compare(const void *x1, const void *x2)
+{
+	const range_seg_t *r1 = x1;
+	const range_seg_t *r2 = x2;
+
+	if (r1->rs_start < r2->rs_start) {
+		if (r1->rs_end > r2->rs_start)
+			return (0);
+		return (-1);
+	}
+	if (r1->rs_start > r2->rs_start) {
+		if (r1->rs_start < r2->rs_end)
+			return (0);
+		return (1);
+	}
+	return (0);
+}
+
+range_tree_t *
+range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
+{
+	range_tree_t *rt;
+
+	rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
+
+	avl_create(&rt->rt_root, range_tree_seg_compare,
+	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
+
+	rt->rt_lock = lp;
+	rt->rt_ops = ops;
+	rt->rt_arg = arg;
+
+	if (rt->rt_ops != NULL)
+		rt->rt_ops->rtop_create(rt, rt->rt_arg);
+
+	return (rt);
+}
+
+void
+range_tree_destroy(range_tree_t *rt)
+{
+	VERIFY0(rt->rt_space);
+
+	if (rt->rt_ops != NULL)
+		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
+
+	avl_destroy(&rt->rt_root);
+	kmem_free(rt, sizeof (*rt));
+}
+
+void
+range_tree_add(void *arg, uint64_t start, uint64_t size)
+{
+	range_tree_t *rt = arg;
+	avl_index_t where;
+	range_seg_t rsearch, *rs_before, *rs_after, *rs;
+	uint64_t end = start + size;
+	boolean_t merge_before, merge_after;
+
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+	VERIFY(size != 0);
+
+	rsearch.rs_start = start;
+	rsearch.rs_end = end;
+	rs = avl_find(&rt->rt_root, &rsearch, &where);
+
+	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
+		zfs_panic_recover("zfs: allocating allocated segment"
+		    "(offset=%llu size=%llu)\n",
+		    (longlong_t)start, (longlong_t)size);
+		return;
+	}
+
+	/* Make sure we don't overlap with either of our neighbors */
+	VERIFY(rs == NULL);
+
+	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
+	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
+
+	merge_before = (rs_before != NULL && rs_before->rs_end == start);
+	merge_after = (rs_after != NULL && rs_after->rs_start == end);
+
+	if (merge_before && merge_after) {
+		avl_remove(&rt->rt_root, rs_before);
+		if (rt->rt_ops != NULL) {
+			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
+			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
+		}
+
+		range_tree_stat_decr(rt, rs_before);
+		range_tree_stat_decr(rt, rs_after);
+
+		rs_after->rs_start = rs_before->rs_start;
+		kmem_cache_free(range_seg_cache, rs_before);
+		rs = rs_after;
+	} else if (merge_before) {
+		if (rt->rt_ops != NULL)
+			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
+
+		range_tree_stat_decr(rt, rs_before);
+
+		rs_before->rs_end = end;
+		rs = rs_before;
+	} else if (merge_after) {
+		if (rt->rt_ops != NULL)
+			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
+
+		range_tree_stat_decr(rt, rs_after);
+
+		rs_after->rs_start = start;
+		rs = rs_after;
+	} else {
+		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
+		rs->rs_start = start;
+		rs->rs_end = end;
+		avl_insert(&rt->rt_root, rs, where);
+	}
+
+	if (rt->rt_ops != NULL)
+		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+
+	range_tree_stat_incr(rt, rs);
+	rt->rt_space += size;
+}
+
+void
+range_tree_remove(void *arg, uint64_t start, uint64_t size)
+{
+	range_tree_t *rt = arg;
+	avl_index_t where;
+	range_seg_t rsearch, *rs, *newseg;
+	uint64_t end = start + size;
+	boolean_t left_over, right_over;
+
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+	VERIFY3U(size, !=, 0);
+	VERIFY3U(size, <=, rt->rt_space);
+
+	rsearch.rs_start = start;
+	rsearch.rs_end = end;
+	rs = avl_find(&rt->rt_root, &rsearch, &where);
+
+	/* Make sure we completely overlap with someone */
+	if (rs == NULL) {
+		zfs_panic_recover("zfs: freeing free segment "
+		    "(offset=%llu size=%llu)",
+		    (longlong_t)start, (longlong_t)size);
+		return;
+	}
+	VERIFY3U(rs->rs_start, <=, start);
+	VERIFY3U(rs->rs_end, >=, end);
+
+	left_over = (rs->rs_start != start);
+	right_over = (rs->rs_end != end);
+
+	range_tree_stat_decr(rt, rs);
+
+	if (rt->rt_ops != NULL)
+		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+	if (left_over && right_over) {
+		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
+		newseg->rs_start = end;
+		newseg->rs_end = rs->rs_end;
+		range_tree_stat_incr(rt, newseg);
+
+		rs->rs_end = start;
+
+		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
+		if (rt->rt_ops != NULL)
+			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
+	} else if (left_over) {
+		rs->rs_end = start;
+	} else if (right_over) {
+		rs->rs_start = end;
+	} else {
+		avl_remove(&rt->rt_root, rs);
+		kmem_cache_free(range_seg_cache, rs);
+		rs = NULL;
+	}
+
+	if (rs != NULL) {
+		range_tree_stat_incr(rt, rs);
+
+		if (rt->rt_ops != NULL)
+			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+	}
+
+	rt->rt_space -= size;
+}
+
+static range_seg_t *
+range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+	avl_index_t where;
+	range_seg_t rsearch;
+	uint64_t end = start + size;
+
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+	VERIFY(size != 0);
+
+	rsearch.rs_start = start;
+	rsearch.rs_end = end;
+	return (avl_find(&rt->rt_root, &rsearch, &where));
+}
+
+static range_seg_t *
+range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+	range_seg_t *rs = range_tree_find_impl(rt, start, size);
+	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
+		return (rs);
+	return (NULL);
+}
+
+void
+range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
+{
+	range_seg_t *rs;
+
+	mutex_enter(rt->rt_lock);
+	rs = range_tree_find(rt, off, size);
+	if (rs != NULL)
+		panic("freeing free block; rs=%p", (void *)rs);
+	mutex_exit(rt->rt_lock);
+}
+
+boolean_t
+range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+	return (range_tree_find(rt, start, size) != NULL);
+}
+
+/*
+ * Ensure that this range is not in the tree, regardless of whether
+ * it is currently in the tree.
+ */
+void
+range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+	range_seg_t *rs;
+
+	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
+		uint64_t free_start = MAX(rs->rs_start, start);
+		uint64_t free_end = MIN(rs->rs_end, start + size);
+		range_tree_remove(rt, free_start, free_end - free_start);
+	}
+}
+
+void
+range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
+{
+	range_tree_t *rt;
+
+	ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
+	ASSERT0(range_tree_space(*rtdst));
+	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
+
+	rt = *rtsrc;
+	*rtsrc = *rtdst;
+	*rtdst = rt;
+}
+
+void
+range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
+{
+	range_seg_t *rs;
+	void *cookie = NULL;
+
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+
+	if (rt->rt_ops != NULL)
+		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
+
+	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
+		if (func != NULL)
+			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
+		kmem_cache_free(range_seg_cache, rs);
+	}
+
+	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
+	rt->rt_space = 0;
+}
+
+void
+range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
+{
+	range_seg_t *rs;
+
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+
+	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
+		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
+}
+
+uint64_t
+range_tree_space(range_tree_t *rt)
+{
+	return (rt->rt_space);
+}
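
[To make the merge and split paths above concrete, here is a hypothetical sequence against a tree with no ops vector; the external lock handling mirrors what the MUTEX_HELD() asserts in this file expect.

#include <sys/zfs_context.h>
#include <sys/range_tree.h>

static void
range_tree_example(void)
{
	kmutex_t lock;
	range_tree_t *rt;

	mutex_init(&lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&lock);

	rt = range_tree_create(NULL, NULL, &lock);

	range_tree_add(rt, 0x1000, 0x2000);	/* segment [0x1000, 0x3000) */
	range_tree_add(rt, 0x3000, 0x1000);	/* merge_before: [0x1000, 0x4000) */
	ASSERT3U(range_tree_space(rt), ==, 0x3000);
	ASSERT(range_tree_contains(rt, 0x1800, 0x100));

	/* Interior remove: splits [0x1000, 0x4000) around [0x2000, 0x2800). */
	range_tree_remove(rt, 0x2000, 0x800);
	ASSERT3U(range_tree_space(rt), ==, 0x2800);

	range_tree_vacate(rt, NULL, NULL);	/* must be empty before destroy */
	range_tree_destroy(rt);

	mutex_exit(&lock);
	mutex_destroy(&lock);
}]
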
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/refcount.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/refcount.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 refcount.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/refcount.c	7 Aug 2009 18:33:07 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/refcount.c	10 Oct 2016 11:09:56 -0000
@@ -19,29 +19,31 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef	ZFS_DEBUG
 
 #ifdef _KERNEL
 int reference_tracking_enable = FALSE; /* runs out of memory too easily */
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN,
+    &reference_tracking_enable, 0,
+    "Track reference holders to refcount_t objects, used mostly by ZFS");
 #else
 int reference_tracking_enable = TRUE;
 #endif
-int reference_history = 4; /* tunable */
+int reference_history = 3; /* tunable */
 
 static kmem_cache_t *reference_cache;
 static kmem_cache_t *reference_history_cache;
 
 void
-refcount_init(void)
+refcount_sysinit(void)
 {
 	reference_cache = kmem_cache_create("reference_cache",
 	    sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
@@ -67,6 +69,21 @@ refcount_create(refcount_t *rc)
 	    offsetof(reference_t, ref_link));
 	rc->rc_count = 0;
 	rc->rc_removed_count = 0;
+	rc->rc_tracked = reference_tracking_enable;
+}
+
+void
+refcount_create_tracked(refcount_t *rc)
+{
+	refcount_create(rc);
+	rc->rc_tracked = B_TRUE;
+}
+
+void
+refcount_create_untracked(refcount_t *rc)
+{
+	refcount_create(rc);
+	rc->rc_tracked = B_FALSE;
 }
 
 void
@@ -99,31 +116,29 @@ refcount_destroy(refcount_t *rc)
 int
 refcount_is_zero(refcount_t *rc)
 {
-	ASSERT(rc->rc_count >= 0);
 	return (rc->rc_count == 0);
 }
 
 int64_t
 refcount_count(refcount_t *rc)
 {
-	ASSERT(rc->rc_count >= 0);
 	return (rc->rc_count);
 }
 
 int64_t
 refcount_add_many(refcount_t *rc, uint64_t number, void *holder)
 {
-	reference_t *ref;
+	reference_t *ref = NULL;
 	int64_t count;
 
-	if (reference_tracking_enable) {
+	if (rc->rc_tracked) {
 		ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
 		ref->ref_holder = holder;
 		ref->ref_number = number;
 	}
 	mutex_enter(&rc->rc_mtx);
 	ASSERT(rc->rc_count >= 0);
-	if (reference_tracking_enable)
+	if (rc->rc_tracked)
 		list_insert_head(&rc->rc_list, ref);
 	rc->rc_count += number;
 	count = rc->rc_count;
@@ -147,7 +162,7 @@ refcount_remove_many(refcount_t *rc, uin
 	mutex_enter(&rc->rc_mtx);
 	ASSERT(rc->rc_count >= number);
 
-	if (!reference_tracking_enable) {
+	if (!rc->rc_tracked) {
 		rc->rc_count -= number;
 		count = rc->rc_count;
 		mutex_exit(&rc->rc_mtx);
@@ -164,7 +179,7 @@ refcount_remove_many(refcount_t *rc, uin
 				    KM_SLEEP);
 				list_insert_head(&rc->rc_removed, ref);
 				rc->rc_removed_count++;
-				if (rc->rc_removed_count >= reference_history) {
+				if (rc->rc_removed_count > reference_history) {
 					ref = list_tail(&rc->rc_removed);
 					list_remove(&rc->rc_removed, ref);
 					kmem_cache_free(reference_history_cache,
@@ -192,4 +207,115 @@ refcount_remove(refcount_t *rc, void *ho
 	return (refcount_remove_many(rc, 1, holder));
 }
 
-#endif
+void
+refcount_transfer(refcount_t *dst, refcount_t *src)
+{
+	int64_t count, removed_count;
+	list_t list, removed;
+
+	list_create(&list, sizeof (reference_t),
+	    offsetof(reference_t, ref_link));
+	list_create(&removed, sizeof (reference_t),
+	    offsetof(reference_t, ref_link));
+
+	mutex_enter(&src->rc_mtx);
+	count = src->rc_count;
+	removed_count = src->rc_removed_count;
+	src->rc_count = 0;
+	src->rc_removed_count = 0;
+	list_move_tail(&list, &src->rc_list);
+	list_move_tail(&removed, &src->rc_removed);
+	mutex_exit(&src->rc_mtx);
+
+	mutex_enter(&dst->rc_mtx);
+	dst->rc_count += count;
+	dst->rc_removed_count += removed_count;
+	list_move_tail(&dst->rc_list, &list);
+	list_move_tail(&dst->rc_removed, &removed);
+	mutex_exit(&dst->rc_mtx);
+
+	list_destroy(&list);
+	list_destroy(&removed);
+}
+
+void
+refcount_transfer_ownership(refcount_t *rc, void *current_holder,
+    void *new_holder)
+{
+	reference_t *ref;
+	boolean_t found = B_FALSE;
+
+	mutex_enter(&rc->rc_mtx);
+	if (!rc->rc_tracked) {
+		mutex_exit(&rc->rc_mtx);
+		return;
+	}
+
+	for (ref = list_head(&rc->rc_list); ref;
+	    ref = list_next(&rc->rc_list, ref)) {
+		if (ref->ref_holder == current_holder) {
+			ref->ref_holder = new_holder;
+			found = B_TRUE;
+			break;
+		}
+	}
+	ASSERT(found);
+	mutex_exit(&rc->rc_mtx);
+}
+
+/*
+ * If tracking is enabled, return true if a reference exists that matches
+ * the "holder" tag. If tracking is disabled, then return true if a reference
+ * might be held.
+ */
+boolean_t
+refcount_held(refcount_t *rc, void *holder)
+{
+	reference_t *ref;
+
+	mutex_enter(&rc->rc_mtx);
+
+	if (!rc->rc_tracked) {
+		mutex_exit(&rc->rc_mtx);
+		return (rc->rc_count > 0);
+	}
+
+	for (ref = list_head(&rc->rc_list); ref;
+	    ref = list_next(&rc->rc_list, ref)) {
+		if (ref->ref_holder == holder) {
+			mutex_exit(&rc->rc_mtx);
+			return (B_TRUE);
+		}
+	}
+	mutex_exit(&rc->rc_mtx);
+	return (B_FALSE);
+}
+
+/*
+ * If tracking is enabled, return true if a reference does not exist that
+ * matches the "holder" tag. If tracking is disabled, always return true
+ * since the reference might not be held.
+ */
+boolean_t
+refcount_not_held(refcount_t *rc, void *holder)
+{
+	reference_t *ref;
+
+	mutex_enter(&rc->rc_mtx);
+
+	if (!rc->rc_tracked) {
+		mutex_exit(&rc->rc_mtx);
+		return (B_TRUE);
+	}
+
+	for (ref = list_head(&rc->rc_list); ref;
+	    ref = list_next(&rc->rc_list, ref)) {
+		if (ref->ref_holder == holder) {
+			mutex_exit(&rc->rc_mtx);
+			return (B_FALSE);
+		}
+	}
+	mutex_exit(&rc->rc_mtx);
+	return (B_TRUE);
+}
+#endif	/* ZFS_DEBUG */
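
[The tracked/untracked split above determines whether refcount_held() and refcount_not_held() can answer per-holder questions or only approximate them from the raw count. A small sketch, with FTAG-style holder tags standing in for real callers:

#include <sys/zfs_context.h>
#include <sys/refcount.h>

static void
refcount_example(void *holder_a, void *holder_b)
{
	refcount_t rc;

	/* Force per-holder tracking regardless of the global tunable. */
	refcount_create_tracked(&rc);

	(void) refcount_add(&rc, holder_a);
	(void) refcount_add(&rc, holder_b);
	ASSERT(refcount_held(&rc, holder_a));

	(void) refcount_remove(&rc, holder_a);
	ASSERT(refcount_not_held(&rc, holder_a));

	(void) refcount_remove(&rc, holder_b);
	ASSERT(refcount_is_zero(&rc));
	refcount_destroy(&rc);
}

Note that this whole implementation is compiled only under ZFS_DEBUG; in other builds the refcount interfaces reduce to plain counter arithmetic, so the per-holder checks are only meaningful in debug kernels.]
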
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c,v
retrieving revision 1.3
diff -u -p -r1.3 rrwlock.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c	27 Feb 2010 23:43:53 -0000	1.3
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c	2 May 2017 18:05:37 -0000
@@ -22,6 +22,9 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
 
 #include <sys/refcount.h>
 #include <sys/rrwlock.h>
@@ -72,8 +75,9 @@
 uint_t rrw_tsd_key;
 
 typedef struct rrw_node {
-	struct rrw_node	*rn_next;
-	rrwlock_t	*rn_rrl;
+	struct rrw_node *rn_next;
+	rrwlock_t *rn_rrl;
+	void *rn_tag;
 } rrw_node_t;
 
 static rrw_node_t *
@@ -95,13 +99,14 @@ rrn_find(rrwlock_t *rrl)
  * Add a node to the head of the singly linked list.
  */
 static void
-rrn_add(rrwlock_t *rrl)
+rrn_add(rrwlock_t *rrl, void *tag)
 {
 	rrw_node_t *rn;
 
 	rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
 	rn->rn_rrl = rrl;
 	rn->rn_next = tsd_get(rrw_tsd_key);
+	rn->rn_tag = tag;
 	VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
 }
 
@@ -110,7 +115,7 @@ rrn_add(rrwlock_t *rrl)
  * thread's list and return TRUE; otherwise return FALSE.
  */
 static boolean_t
-rrn_find_and_remove(rrwlock_t *rrl)
+rrn_find_and_remove(rrwlock_t *rrl, void *tag)
 {
 	rrw_node_t *rn;
 	rrw_node_t *prev = NULL;
@@ -119,7 +124,7 @@ rrn_find_and_remove(rrwlock_t *rrl)
 		return (B_FALSE);
 
 	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
-		if (rn->rn_rrl == rrl) {
+		if (rn->rn_rrl == rrl && rn->rn_tag == tag) {
 			if (prev)
 				prev->rn_next = rn->rn_next;
 			else
@@ -133,7 +138,7 @@ rrn_find_and_remove(rrwlock_t *rrl)
 }
 
 void
-rrw_init(rrwlock_t *rrl)
+rrw_init(rrwlock_t *rrl, boolean_t track_all)
 {
 	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
@@ -141,6 +146,7 @@ rrw_init(rrwlock_t *rrl)
 	refcount_create(&rrl->rr_anon_rcount);
 	refcount_create(&rrl->rr_linked_rcount);
 	rrl->rr_writer_wanted = B_FALSE;
+	rrl->rr_track_all = track_all;
 }
 
 void
@@ -154,11 +160,12 @@ rrw_destroy(rrwlock_t *rrl)
 }
 
 static void
-rrw_enter_read(rrwlock_t *rrl, void *tag)
+rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
 {
 	mutex_enter(&rrl->rr_lock);
 #if !defined(DEBUG) && defined(_KERNEL)
-	if (!rrl->rr_writer && !rrl->rr_writer_wanted) {
+	if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted &&
+	    !rrl->rr_track_all) {
 		rrl->rr_anon_rcount.rc_count++;
 		mutex_exit(&rrl->rr_lock);
 		return;
@@ -168,14 +175,14 @@ rrw_enter_read(rrwlock_t *rrl, void *tag
 	ASSERT(rrl->rr_writer != curthread);
 	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
 
-	while (rrl->rr_writer || (rrl->rr_writer_wanted &&
-	    refcount_is_zero(&rrl->rr_anon_rcount) &&
+	while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
+	    refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
 	    rrn_find(rrl) == NULL))
 		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
 
-	if (rrl->rr_writer_wanted) {
+	if (rrl->rr_writer_wanted || rrl->rr_track_all) {
 		/* may or may not be a re-entrant enter */
-		rrn_add(rrl);
+		rrn_add(rrl, tag);
 		(void) refcount_add(&rrl->rr_linked_rcount, tag);
 	} else {
 		(void) refcount_add(&rrl->rr_anon_rcount, tag);
@@ -184,7 +191,25 @@ rrw_enter_read(rrwlock_t *rrl, void *tag
 	mutex_exit(&rrl->rr_lock);
 }
 
-static void
+void
+rrw_enter_read(rrwlock_t *rrl, void *tag)
+{
+	rrw_enter_read_impl(rrl, B_FALSE, tag);
+}
+
+/*
+ * Take a read lock even if there are pending write lock requests. If we want
+ * to take a lock reentrantly, but from different threads (that have a
+ * relationship to each other), the normal detection mechanism to overrule
+ * the pending writer does not work, so we have to give an explicit hint here.
+ */
+void
+rrw_enter_read_prio(rrwlock_t *rrl, void *tag)
+{
+	rrw_enter_read_impl(rrl, B_TRUE, tag);
+}
+
+void
 rrw_enter_write(rrwlock_t *rrl)
 {
 	mutex_enter(&rrl->rr_lock);
@@ -230,10 +256,12 @@ rrw_exit(rrwlock_t *rrl, void *tag)
 
 	if (rrl->rr_writer == NULL) {
 		int64_t count;
-		if (rrn_find_and_remove(rrl))
+		if (rrn_find_and_remove(rrl, tag)) {
 			count = refcount_remove(&rrl->rr_linked_rcount, tag);
-		else
+		} else {
+			ASSERT(!rrl->rr_track_all);
 			count = refcount_remove(&rrl->rr_anon_rcount, tag);
+		}
 		if (count == 0)
 			cv_broadcast(&rrl->rr_cv);
 	} else {
@@ -246,6 +274,11 @@ rrw_exit(rrwlock_t *rrl, void *tag)
 	mutex_exit(&rrl->rr_lock);
 }
 
+/*
+ * If the lock was created with track_all, rrw_held(RW_READER) will return
+ * B_TRUE iff the current thread has the lock for reader.  Otherwise it may
+ * return B_TRUE if any thread has the lock for reader.
+ */
 boolean_t
 rrw_held(rrwlock_t *rrl, krw_t rw)
 {
@@ -256,9 +289,107 @@ rrw_held(rrwlock_t *rrl, krw_t rw)
 		held = (rrl->rr_writer == curthread);
 	} else {
 		held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
-		    !refcount_is_zero(&rrl->rr_linked_rcount));
+		    rrn_find(rrl) != NULL);
 	}
 	mutex_exit(&rrl->rr_lock);
 
 	return (held);
 }
+
+void
+rrw_tsd_destroy(void *arg)
+{
+	rrw_node_t *rn = arg;
+	if (rn != NULL) {
+		panic("thread %p terminating with rrw lock %p held",
+		    (void *)curthread, (void *)rn->rn_rrl);
+	}
+}
+
+/*
+ * A reader-mostly lock implementation, built on top of reader-writer locks
+ * and tuned for highly parallel read acquisitions, while pessimizing writes.
+ *
+ * The idea is to split a single busy lock into an array of locks, so that
+ * each reader can lock only one of them for read, depending on the result
+ * of a simple hash function.  That proportionally reduces lock congestion.
+ * The writer meanwhile has to sequentially acquire write on all the locks.
+ * That makes write acquisition proportionally slower, but in places where
+ * it is used (filesystem unmount) performance is not critical.
+ *
+ * All the functions below are direct wrappers around functions above.
+ */
+void
+rrm_init(rrmlock_t *rrl, boolean_t track_all)
+{
+	int i;
+
+	for (i = 0; i < RRM_NUM_LOCKS; i++)
+		rrw_init(&rrl->locks[i], track_all);
+}
+
+void
+rrm_destroy(rrmlock_t *rrl)
+{
+	int i;
+
+	for (i = 0; i < RRM_NUM_LOCKS; i++)
+		rrw_destroy(&rrl->locks[i]);
+}
+
+void
+rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
+{
+	if (rw == RW_READER)
+		rrm_enter_read(rrl, tag);
+	else
+		rrm_enter_write(rrl);
+}
+
+/*
+ * This maps the current thread to a specific lock.  Note that the lock
+ * must be released by the same thread that acquired it.  We do this
+ * mapping by taking the thread pointer mod a prime number.  We examine
+ * only the low 32 bits of the thread pointer, because 32-bit division
+ * is faster than 64-bit division, and the high 32 bits have little
+ * entropy anyway.
+ */
+#define	RRM_TD_LOCK()	(((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
+
+void
+rrm_enter_read(rrmlock_t *rrl, void *tag)
+{
+	rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
+}
+
+void
+rrm_enter_write(rrmlock_t *rrl)
+{
+	int i;
+
+	for (i = 0; i < RRM_NUM_LOCKS; i++)
+		rrw_enter_write(&rrl->locks[i]);
+}
+
+void
+rrm_exit(rrmlock_t *rrl, void *tag)
+{
+	int i;
+
+	if (rrl->locks[0].rr_writer == curthread) {
+		for (i = 0; i < RRM_NUM_LOCKS; i++)
+			rrw_exit(&rrl->locks[i], tag);
+	} else {
+		rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
+	}
+}
+
+boolean_t
+rrm_held(rrmlock_t *rrl, krw_t rw)
+{
+	if (rw == RW_WRITER) {
+		return (rrw_held(&rrl->locks[0], rw));
+	} else {
+		return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
+	}
+}
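
[A short sketch of the intended usage of the reader-mostly wrapper; because RRM_TD_LOCK() keys off curthread, a read lock must be released by the same thread that acquired it.

#include <sys/zfs_context.h>
#include <sys/rrwlock.h>

static void
rrm_example(rrmlock_t *rrl)
{
	rrm_init(rrl, B_FALSE);

	/* A reader touches only one of the RRM_NUM_LOCKS internal locks. */
	rrm_enter(rrl, RW_READER, FTAG);
	ASSERT(rrm_held(rrl, RW_READER));
	rrm_exit(rrl, FTAG);		/* must be the acquiring thread */

	/* A writer sweeps all internal locks, so this path is the slow one. */
	rrm_enter(rrl, RW_WRITER, FTAG);
	rrm_exit(rrl, FTAG);

	rrm_destroy(rrl);
}]
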
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sa.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sa.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sa.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sa.c	11 May 2017 16:36:37 -0000
@@ -0,0 +1,2028 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright 2011 iXsystems, Inc
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sunddi.h>
+#include <sys/sa_impl.h>
+#include <sys/dnode.h>
+#include <sys/errno.h>
+#include <sys/zfs_context.h>
+
+/*
+ * ZFS System attributes:
+ *
+ * A generic mechanism to allow for arbitrary attributes
+ * to be stored in a dnode.  The data will be stored in the bonus buffer of
+ * the dnode and if necessary a special "spill" block will be used to handle
+ * overflow situations.  The spill block will be sized to fit the data
+ * from 512 - 128K.  When a spill block is used the BP (blkptr_t) for the
+ * spill block is stored at the end of the current bonus buffer.  Any
+ * attributes that would be in the way of the blkptr_t will be relocated
+ * into the spill block.
+ *
+ * Attribute registration:
+ *
+ * Stored persistently on a per-dataset basis is
+ * a mapping between attribute "string" names and their actual attribute
+ * numeric values, length, and byteswap function.  The names are only used
+ * during registration.  All attributes are known by their unique attribute
+ * id value.  If an attribute can have a variable size then the value
+ * 0 will be used to indicate this.
+ *
+ * Attribute Layout:
+ *
+ * Attribute layouts are a way to compactly store multiple attributes, but
+ * without taking the overhead associated with managing each attribute
+ * individually.  Since you will typically have the same set of attributes
+ * stored in the same order a single table will be used to represent that
+ * layout.  The ZPL for example will usually have only about 10 different
+ * layouts (regular files, device files, symlinks,
+ * regular files + scanstamp, files/dir with extended attributes, and then
+ * you have the possibility of all of those minus ACL, because it would
+ * be kicked out into the spill block)
+ *
+ * Layouts are simply an array of the attributes and their
+ * ordering i.e. [0, 1, 4, 5, 2]
+ *
+ * Each distinct layout is given a unique layout number and that is what is
+ * stored in the header at the beginning of the SA data buffer.
+ *
+ * A layout only covers a single dbuf (bonus or spill).  If a set of
+ * attributes is split up between the bonus buffer and a spill buffer then
+ * two different layouts will be used.  This allows us to byteswap the
+ * spill without looking at the bonus buffer and keeps the on disk format of
+ * the bonus and spill buffer the same.
+ *
+ * Adding a single attribute will cause the entire set of attributes to
+ * be rewritten and could result in a new layout number being constructed
+ * as part of the rewrite if no such layout exists for the new set of
+ * attributes.  The new attribute will be appended to the end of the already
+ * existing attributes.
+ *
+ * Both the attribute registration and attribute layout information are
+ * stored in normal ZAP attributes.  There should be a small number of
+ * known layouts and the set of attributes is assumed to typically be quite
+ * small.
+ *
+ * The registered attributes and layout "table" information is maintained
+ * in core and a special "sa_os_t" is attached to the objset_t.
+ *
+ * A special interface is provided to allow for quickly applying
+ * a large set of attributes at once.  sa_replace_all_by_template() is
+ * used to set an array of attributes.  This is used by the ZPL when
+ * creating a brand new file.  The template that is passed into the function
+ * specifies the attribute, size for variable length attributes, location of
+ * data, and a special "data locator" function if the data isn't in a contiguous
+ * location.
+ *
+ * Byteswap implications:
+ *
+ * Since the SA attributes are not entirely self-describing, we can't do
+ * the normal byteswap processing.  The special ZAP layout attribute and
+ * attribute registration attributes define the byteswap function and the
+ * size of the attributes, unless it is variable sized.
+ * The normal ZFS byteswapping infrastructure assumes you don't need
+ * to read any objects in order to do the necessary byteswapping, whereas
+ * SA attributes can only be properly byteswapped if the dataset is opened
+ * and the layout/attribute ZAP attributes are available.  Because of this
+ * the SA attributes will be byteswapped when they are first accessed by
+ * the SA code that will read the SA data.
+ */
+
+typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
+    uint16_t length, int length_idx, boolean_t, void *userp);
+
+static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
+static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
+static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
+    void *data);
+static void sa_idx_tab_rele(objset_t *os, void *arg);
+static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
+    int buflen);
+static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+    sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+    uint16_t buflen, dmu_tx_t *tx);
+
+arc_byteswap_func_t *sa_bswap_table[] = {
+	byteswap_uint64_array,
+	byteswap_uint32_array,
+	byteswap_uint16_array,
+	byteswap_uint8_array,
+	zfs_acl_byteswap,
+};
+
+#define	SA_COPY_DATA(f, s, t, l) \
+	{ \
+		if (f == NULL) { \
+			if (l == 8) { \
+				*(uint64_t *)t = *(uint64_t *)s; \
+			} else if (l == 16) { \
+				*(uint64_t *)t = *(uint64_t *)s; \
+				*(uint64_t *)((uintptr_t)t + 8) = \
+				    *(uint64_t *)((uintptr_t)s + 8); \
+			} else { \
+				bcopy(s, t, l); \
+			} \
+		} else \
+			sa_copy_data(f, s, t, l); \
+	}
+
+/*
+ * This table is fixed and cannot be changed.  Its purpose is to
+ * allow the SA code to work with both old/new ZPL file systems.
+ * It contains the list of legacy attributes.  These attributes aren't
+ * stored in the "attribute" registry zap objects, since older ZPL file systems
+ * won't have the registry.  Only objsets of type ZFS_TYPE_FILESYSTEM will
+ * use this static table.
+ */
+sa_attr_reg_t sa_legacy_attrs[] = {
+	{"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+	{"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+	{"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+	{"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+	{"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+	{"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+	{"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+	{"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+	{"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+	{"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+	{"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+	{"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+	{"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+	{"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+	{"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+	{"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+};
+
+/*
+ * This is only used for objects of type DMU_OT_ZNODE
+ */
+sa_attr_type_t sa_legacy_zpl_layout[] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+/*
+ * Special dummy layout used for buffers with no attributes.
+ */
+sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
+
+static int sa_legacy_attr_count = 16;
+static kmem_cache_t *sa_cache = NULL;
+
+/*ARGSUSED*/
+static int
+sa_cache_constructor(void *buf, void *unused, int kmflag)
+{
+	sa_handle_t *hdl = buf;
+
+#ifdef __NetBSD__
+	hdl = unused;
+#endif
+	mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+	return (0);
+}
+
+/*ARGSUSED*/
+static void
+sa_cache_destructor(void *buf, void *unused)
+{
+	sa_handle_t *hdl = buf;
+
+#ifdef __NetBSD__
+	hdl = unused;
+#endif
+	mutex_destroy(&hdl->sa_lock);
+}
+
+void
+sa_cache_init(void)
+{
+	sa_cache = kmem_cache_create("sa_cache",
+	    sizeof (sa_handle_t), 0, sa_cache_constructor,
+	    sa_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+sa_cache_fini(void)
+{
+	if (sa_cache)
+		kmem_cache_destroy(sa_cache);
+}
+
+static int
+layout_num_compare(const void *arg1, const void *arg2)
+{
+	const sa_lot_t *node1 = arg1;
+	const sa_lot_t *node2 = arg2;
+
+	if (node1->lot_num > node2->lot_num)
+		return (1);
+	else if (node1->lot_num < node2->lot_num)
+		return (-1);
+	return (0);
+}
+
+static int
+layout_hash_compare(const void *arg1, const void *arg2)
+{
+	const sa_lot_t *node1 = arg1;
+	const sa_lot_t *node2 = arg2;
+
+	if (node1->lot_hash > node2->lot_hash)
+		return (1);
+	if (node1->lot_hash < node2->lot_hash)
+		return (-1);
+	if (node1->lot_instance > node2->lot_instance)
+		return (1);
+	if (node1->lot_instance < node2->lot_instance)
+		return (-1);
+	return (0);
+}
+
+boolean_t
+sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
+{
+	int i;
+
+	if (count != tbf->lot_attr_count)
+		return (1);
+
+	for (i = 0; i != count; i++) {
+		if (attrs[i] != tbf->lot_attrs[i])
+			return (1);
+	}
+	return (0);
+}
+
+#define	SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
+
+static uint64_t
+sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
+{
+	int i;
+	uint64_t crc = -1ULL;
+
+	for (i = 0; i != attr_count; i++)
+		crc ^= SA_ATTR_HASH(attrs[i]);
+
+	return (crc);
+}
+
+static int
+sa_get_spill(sa_handle_t *hdl)
+{
+	int rc;
+	if (hdl->sa_spill == NULL) {
+		if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL,
+		    &hdl->sa_spill)) == 0)
+			VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+	} else {
+		rc = 0;
+	}
+
+	return (rc);
+}
+
+/*
+ * Main attribute lookup/update function.
+ * Returns 0 for success or non-zero for failure.
+ *
+ * Operates on a bulk array; the first failure will abort further processing.
+ */
+int
+sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+    sa_data_op_t data_op, dmu_tx_t *tx)
+{
+	sa_os_t *sa = hdl->sa_os->os_sa;
+	int i;
+	int error = 0;
+	sa_buf_type_t buftypes;
+
+	buftypes = 0;
+
+	ASSERT(count > 0);
+	for (i = 0; i != count; i++) {
+		ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
+
+		bulk[i].sa_addr = NULL;
+		/* First check the bonus buffer */
+
+		if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
+		    hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
+			SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
+			    SA_GET_HDR(hdl, SA_BONUS),
+			    bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
+			if (tx && !(buftypes & SA_BONUS)) {
+				dmu_buf_will_dirty(hdl->sa_bonus, tx);
+				buftypes |= SA_BONUS;
+			}
+		}
+		if (bulk[i].sa_addr == NULL &&
+		    ((error = sa_get_spill(hdl)) == 0)) {
+			if (TOC_ATTR_PRESENT(
+			    hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
+				SA_ATTR_INFO(sa, hdl->sa_spill_tab,
+				    SA_GET_HDR(hdl, SA_SPILL),
+				    bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
+				if (tx && !(buftypes & SA_SPILL) &&
+				    bulk[i].sa_size == bulk[i].sa_length) {
+					dmu_buf_will_dirty(hdl->sa_spill, tx);
+					buftypes |= SA_SPILL;
+				}
+			}
+		}
+		if (error && error != ENOENT) {
+			return ((error == ECKSUM) ? EIO : error);
+		}
+
+		switch (data_op) {
+		case SA_LOOKUP:
+			if (bulk[i].sa_addr == NULL)
+				return (SET_ERROR(ENOENT));
+			if (bulk[i].sa_data) {
+				SA_COPY_DATA(bulk[i].sa_data_func,
+				    bulk[i].sa_addr, bulk[i].sa_data,
+				    bulk[i].sa_size);
+			}
+			continue;
+
+		case SA_UPDATE:
+			/* existing rewrite of attr */
+			if (bulk[i].sa_addr &&
+			    bulk[i].sa_size == bulk[i].sa_length) {
+				SA_COPY_DATA(bulk[i].sa_data_func,
+				    bulk[i].sa_data, bulk[i].sa_addr,
+				    bulk[i].sa_length);
+				continue;
+			} else if (bulk[i].sa_addr) { /* attr size change */
+				error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+				    SA_REPLACE, bulk[i].sa_data_func,
+				    bulk[i].sa_data, bulk[i].sa_length, tx);
+			} else { /* adding new attribute */
+				error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+				    SA_ADD, bulk[i].sa_data_func,
+				    bulk[i].sa_data, bulk[i].sa_length, tx);
+			}
+			if (error)
+				return (error);
+			break;
+		}
+	}
+	return (error);
+}
+
+static sa_lot_t *
+sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
+    uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
+{
+	sa_os_t *sa = os->os_sa;
+	sa_lot_t *tb, *findtb;
+	int i;
+	avl_index_t loc;
+
+	ASSERT(MUTEX_HELD(&sa->sa_lock));
+	tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
+	tb->lot_attr_count = attr_count;
+#ifdef __NetBSD__
+	if (attr_count != 0)
+#endif
+	tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+	    KM_SLEEP);
+	bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
+	tb->lot_num = lot_num;
+	tb->lot_hash = hash;
+	tb->lot_instance = 0;
+
+	if (zapadd) {
+		char attr_name[8];
+
+		if (sa->sa_layout_attr_obj == 0) {
+			sa->sa_layout_attr_obj = zap_create_link(os,
+			    DMU_OT_SA_ATTR_LAYOUTS,
+			    sa->sa_master_obj, SA_LAYOUTS, tx);
+		}
+
+		(void) snprintf(attr_name, sizeof (attr_name),
+		    "%d", (int)lot_num);
+		VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
+		    attr_name, 2, attr_count, attrs, tx));
+	}
+
+	list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
+	    offsetof(sa_idx_tab_t, sa_next));
+
+	for (i = 0; i != attr_count; i++) {
+		if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
+			tb->lot_var_sizes++;
+	}
+
+	avl_add(&sa->sa_layout_num_tree, tb);
+
+	/* verify we don't have a hash collision */
+	if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
+		for (; findtb && findtb->lot_hash == hash;
+		    findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
+			if (findtb->lot_instance != tb->lot_instance)
+				break;
+			tb->lot_instance++;
+		}
+	}
+	avl_add(&sa->sa_layout_hash_tree, tb);
+	return (tb);
+}
+
+static void
+sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
+    int count, dmu_tx_t *tx, sa_lot_t **lot)
+{
+	sa_lot_t *tb, tbsearch;
+	avl_index_t loc;
+	sa_os_t *sa = os->os_sa;
+	boolean_t found = B_FALSE;
+
+	mutex_enter(&sa->sa_lock);
+	tbsearch.lot_hash = hash;
+	tbsearch.lot_instance = 0;
+	tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc);
+	if (tb) {
+		for (; tb && tb->lot_hash == hash;
+		    tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) {
+			if (sa_layout_equal(tb, attrs, count) == 0) {
+				found = B_TRUE;
+				break;
+			}
+		}
+	}
+	if (!found) {
+		tb = sa_add_layout_entry(os, attrs, count,
+		    avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
+	}
+	mutex_exit(&sa->sa_lock);
+	*lot = tb;
+}
+
+static int
+sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
+{
+	int error;
+	uint32_t blocksize;
+
+	if (size == 0) {
+		blocksize = SPA_MINBLOCKSIZE;
+	} else if (size > SPA_OLD_MAXBLOCKSIZE) {
+		ASSERT(0);
+		return (SET_ERROR(EFBIG));
+	} else {
+		blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
+	}
+
+	error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
+	ASSERT(error == 0);
+	return (error);
+}
+
+static void
+sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
+{
+	if (func == NULL) {
+		bcopy(datastart, target, buflen);
+	} else {
+		boolean_t start;
+		int bytes;
+		void *dataptr;
+		void *saptr = target;
+		uint32_t length;
+
+		start = B_TRUE;
+		bytes = 0;
+		while (bytes < buflen) {
+			func(&dataptr, &length, buflen, start, datastart);
+			bcopy(dataptr, saptr, length);
+			saptr = (void *)((caddr_t)saptr + length);
+			bytes += length;
+			start = B_FALSE;
+		}
+	}
+}
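+
+/*
+ * Illustrative sketch (not part of this change): a minimal
+ * sa_data_locator_t satisfying the contract sa_copy_data() relies on
+ * above.  Each call must return a chunk pointer and its length;
+ * sa_copy_data() keeps calling until the lengths sum to buflen.  The
+ * two-chunk structure and the locator name here are hypothetical.
+ *
+ *	struct two_chunk { void *buf1; uint32_t len1; void *buf2; };
+ *
+ *	static void
+ *	two_chunk_locator(void **dataptr, uint32_t *len,
+ *	    uint32_t total_len, boolean_t start, void *userdata)
+ *	{
+ *		struct two_chunk *tc = userdata;
+ *
+ *		if (start) {
+ *			*dataptr = tc->buf1;
+ *			*len = tc->len1;
+ *		} else {
+ *			*dataptr = tc->buf2;
+ *			*len = total_len - tc->len1;
+ *		}
+ *	}
+ */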
+
+/*
+ * Determine several different sizes: the SA header size and the
+ * number of bytes to be stored.  If a spill would occur, the index
+ * into the attribute array at which it happens is returned.
+ *
+ * The boolean will_spill is set when spilling is necessary.  It is
+ * only set when the buftype is SA_BONUS.
+ */
+static int
+sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
+    dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total,
+    boolean_t *will_spill)
+{
+	int var_size = 0;
+	int i;
+	int full_space;
+	int hdrsize;
+	int extra_hdrsize;
+
+	if (buftype == SA_BONUS && sa->sa_force_spill) {
+		*total = 0;
+		*index = 0;
+		*will_spill = B_TRUE;
+		return (0);
+	}
+
+	*index = -1;
+	*total = 0;
+	*will_spill = B_FALSE;
+
+	extra_hdrsize = 0;
+	hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
+	    sizeof (sa_hdr_phys_t);
+
+	full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size;
+	ASSERT(IS_P2ALIGNED(full_space, 8));
+
+	for (i = 0; i != attr_count; i++) {
+		boolean_t is_var_sz;
+
+		*total = P2ROUNDUP(*total, 8);
+		*total += attr_desc[i].sa_length;
+		if (*will_spill)
+			continue;
+
+		is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
+		if (is_var_sz) {
+			var_size++;
+		}
+
+		if (is_var_sz && var_size > 1) {
+			/*
+			 * Don't worry that the spill block might overflow.
+			 * It will be resized if needed in sa_build_layouts().
+			 */
+			if (buftype == SA_SPILL ||
+			    P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
+			    *total < full_space) {
+				/*
+				 * Account for header space used by array of
+				 * optional sizes of variable-length attributes.
+				 * Record the extra header size in case this
+				 * increase needs to be reversed due to
+				 * spill-over.
+				 */
+				hdrsize += sizeof (uint16_t);
+				if (*index != -1)
+					extra_hdrsize += sizeof (uint16_t);
+			} else {
+				ASSERT(buftype == SA_BONUS);
+				if (*index == -1)
+					*index = i;
+				*will_spill = B_TRUE;
+				continue;
+			}
+		}
+
+		/*
+		 * Find the index where a spill *could* occur, then
+		 * continue counting the remaining attribute space.
+		 * The sum is used later for sizing the bonus and
+		 * spill buffers.
+		 */
+		if (buftype == SA_BONUS && *index == -1 &&
+		    (*total + P2ROUNDUP(hdrsize, 8)) >
+		    (full_space - sizeof (blkptr_t))) {
+			*index = i;
+		}
+
+		if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space &&
+		    buftype == SA_BONUS)
+			*will_spill = B_TRUE;
+	}
+
+	if (*will_spill)
+		hdrsize -= extra_hdrsize;
+
+	hdrsize = P2ROUNDUP(hdrsize, 8);
+	return (hdrsize);
+}
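+
+/*
+ * Worked example for sa_find_sizes() above (a sketch, not part of
+ * this change): for a DMU_OT_SA bonus buffer holding three
+ * variable-length attributes, the first variable-length attribute is
+ * covered by the base header and each additional one adds a uint16_t
+ * length entry:
+ *
+ *	hdrsize = sizeof (sa_hdr_phys_t)	base header
+ *	hdrsize += 2 * sizeof (uint16_t)	2nd and 3rd var-size attrs
+ *	hdrsize = P2ROUNDUP(hdrsize, 8)		final 8-byte alignment
+ *
+ * *total is the 8-byte-aligned sum of the attribute lengths, and
+ * spilling out of the bonus buffer is triggered once
+ * *total + P2ROUNDUP(hdrsize, 8) exceeds full_space.
+ */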
+
+#define	BUF_SPACE_NEEDED(total, header) (total + header)
+
+/*
+ * Find the layout that corresponds to the ordering of the attributes.
+ * If none is found, a new layout number is created and added to the
+ * persistent layout tables.
+ */
+static int
+sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
+    dmu_tx_t *tx)
+{
+	sa_os_t *sa = hdl->sa_os->os_sa;
+	uint64_t hash;
+	sa_buf_type_t buftype;
+	sa_hdr_phys_t *sahdr;
+	void *data_start;
+	int buf_space;
+	sa_attr_type_t *attrs, *attrs_start;
+	int i, lot_count;
+	int hdrsize;
+	int spillhdrsize = 0;
+	int used;
+	dmu_object_type_t bonustype;
+	sa_lot_t *lot;
+	int len_idx;
+	int spill_used;
+	boolean_t spilling;
+
+	dmu_buf_will_dirty(hdl->sa_bonus, tx);
+	bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
+
+	/* first determine bonus header size and sum of all attributes */
+	hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
+	    SA_BONUS, &i, &used, &spilling);
+
+	if (used > SPA_OLD_MAXBLOCKSIZE)
+		return (SET_ERROR(EFBIG));
+
+	VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
+	    MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) :
+	    used + hdrsize, tx));
+
+	ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
+	    bonustype == DMU_OT_SA);
+
+	/* setup and size spill buffer when needed */
+	if (spilling) {
+		boolean_t dummy;
+
+		if (hdl->sa_spill == NULL) {
+			VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL,
+			    &hdl->sa_spill) == 0);
+		}
+		dmu_buf_will_dirty(hdl->sa_spill, tx);
+
+		spillhdrsize = sa_find_sizes(sa, &attr_desc[i],
+		    attr_count - i, hdl->sa_spill, SA_SPILL, &i,
+		    &spill_used, &dummy);
+
+		if (spill_used > SPA_OLD_MAXBLOCKSIZE)
+			return (SET_ERROR(EFBIG));
+
+		buf_space = hdl->sa_spill->db_size - spillhdrsize;
+		if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
+		    hdl->sa_spill->db_size)
+			VERIFY(0 == sa_resize_spill(hdl,
+			    BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
+	}
+
+	/* setup starting pointers to lay down data */
+	data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
+	sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
+	buftype = SA_BONUS;
+
+	if (spilling)
+		buf_space = (sa->sa_force_spill) ?
+		    0 : SA_BLKPTR_SPACE - hdrsize;
+	else
+		buf_space = hdl->sa_bonus->db_size - hdrsize;
+
+	attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+	    KM_SLEEP);
+	lot_count = 0;
+
+	for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
+		uint16_t length;
+
+		ASSERT(IS_P2ALIGNED(data_start, 8));
+		ASSERT(IS_P2ALIGNED(buf_space, 8));
+		attrs[i] = attr_desc[i].sa_attr;
+		length = SA_REGISTERED_LEN(sa, attrs[i]);
+		if (length == 0)
+			length = attr_desc[i].sa_length;
+		else
+			VERIFY(length == attr_desc[i].sa_length);
+
+		if (buf_space < length) {  /* switch to spill buffer */
+			VERIFY(spilling);
+			VERIFY(bonustype == DMU_OT_SA);
+			if (buftype == SA_BONUS && !sa->sa_force_spill) {
+				sa_find_layout(hdl->sa_os, hash, attrs_start,
+				    lot_count, tx, &lot);
+				SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
+			}
+
+			buftype = SA_SPILL;
+			hash = -1ULL;
+			len_idx = 0;
+
+			sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
+			sahdr->sa_magic = SA_MAGIC;
+			data_start = (void *)((uintptr_t)sahdr +
+			    spillhdrsize);
+			attrs_start = &attrs[i];
+			buf_space = hdl->sa_spill->db_size - spillhdrsize;
+			lot_count = 0;
+		}
+		hash ^= SA_ATTR_HASH(attrs[i]);
+		attr_desc[i].sa_addr = data_start;
+		attr_desc[i].sa_size = length;
+		SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
+		    data_start, length);
+		if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
+			sahdr->sa_lengths[len_idx++] = length;
+		}
+		VERIFY((uintptr_t)data_start % 8 == 0);
+		data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+		    length), 8);
+		buf_space -= P2ROUNDUP(length, 8);
+		lot_count++;
+	}
+
+	sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
+
+	/*
+	 * Verify that old znodes always have layout number 0.
+	 * Must be DMU_OT_SA for arbitrary layouts
+	 */
+	VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
+	    (bonustype == DMU_OT_SA && lot->lot_num > 1));
+
+	if (bonustype == DMU_OT_SA) {
+		SA_SET_HDR(sahdr, lot->lot_num,
+		    buftype == SA_BONUS ? hdrsize : spillhdrsize);
+	}
+
+	kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
+	if (hdl->sa_bonus_tab) {
+		sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+		hdl->sa_bonus_tab = NULL;
+	}
+	if (!sa->sa_force_spill)
+		VERIFY(0 == sa_build_index(hdl, SA_BONUS));
+	if (hdl->sa_spill) {
+		sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+		if (!spilling) {
+			/*
+			 * remove spill block that is no longer needed.
+			 */
+			dmu_buf_rele(hdl->sa_spill, NULL);
+			hdl->sa_spill = NULL;
+			hdl->sa_spill_tab = NULL;
+			VERIFY(0 == dmu_rm_spill(hdl->sa_os,
+			    sa_handle_object(hdl), tx));
+		} else {
+			VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+		}
+	}
+
+	return (0);
+}
+
+static void
+sa_free_attr_table(sa_os_t *sa)
+{
+	int i;
+
+	if (sa->sa_attr_table == NULL)
+		return;
+
+	for (i = 0; i != sa->sa_num_attrs; i++) {
+		if (sa->sa_attr_table[i].sa_name)
+			kmem_free(sa->sa_attr_table[i].sa_name,
+			    strlen(sa->sa_attr_table[i].sa_name) + 1);
+	}
+
+	kmem_free(sa->sa_attr_table,
+	    sizeof (sa_attr_table_t) * sa->sa_num_attrs);
+
+	sa->sa_attr_table = NULL;
+}
+
+static int
+sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
+{
+	sa_os_t *sa = os->os_sa;
+	uint64_t sa_attr_count = 0;
+	uint64_t sa_reg_count = 0;
+	int error = 0;
+	uint64_t attr_value;
+	sa_attr_table_t *tb;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	int registered_count = 0;
+	int i;
+	dmu_objset_type_t ostype = dmu_objset_type(os);
+
+	sa->sa_user_table =
+	    kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
+	sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
+
+	if (sa->sa_reg_attr_obj != 0) {
+		error = zap_count(os, sa->sa_reg_attr_obj,
+		    &sa_attr_count);
+
+		/*
+		 * Make sure we retrieved a count and that it isn't zero
+		 */
+		if (error || (error == 0 && sa_attr_count == 0)) {
+			if (error == 0)
+				error = SET_ERROR(EINVAL);
+			goto bail;
+		}
+		sa_reg_count = sa_attr_count;
+	}
+
+	if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
+		sa_attr_count += sa_legacy_attr_count;
+
+	/* Allocate attribute numbers for attributes that aren't registered */
+	for (i = 0; i != count; i++) {
+		boolean_t found = B_FALSE;
+		int j;
+
+		if (ostype == DMU_OST_ZFS) {
+			for (j = 0; j != sa_legacy_attr_count; j++) {
+				if (strcmp(reg_attrs[i].sa_name,
+				    sa_legacy_attrs[j].sa_name) == 0) {
+					sa->sa_user_table[i] =
+					    sa_legacy_attrs[j].sa_attr;
+					found = B_TRUE;
+				}
+			}
+		}
+		if (found)
+			continue;
+
+		if (sa->sa_reg_attr_obj)
+			error = zap_lookup(os, sa->sa_reg_attr_obj,
+			    reg_attrs[i].sa_name, 8, 1, &attr_value);
+		else
+			error = SET_ERROR(ENOENT);
+		switch (error) {
+		case ENOENT:
+			sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
+			sa_attr_count++;
+			break;
+		case 0:
+			sa->sa_user_table[i] = ATTR_NUM(attr_value);
+			break;
+		default:
+			goto bail;
+		}
+	}
+
+	sa->sa_num_attrs = sa_attr_count;
+	tb = sa->sa_attr_table =
+	    kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
+
+	/*
+	 * Attribute table is constructed from requested attribute list,
+	 * previously foreign registered attributes, and also the legacy
+	 * ZPL set of attributes.
+	 */
+
+	if (sa->sa_reg_attr_obj) {
+		for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
+		    (error = zap_cursor_retrieve(&zc, &za)) == 0;
+		    zap_cursor_advance(&zc)) {
+			uint64_t value;
+			value  = za.za_first_integer;
+
+			registered_count++;
+			tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
+			tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
+			tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
+			tb[ATTR_NUM(value)].sa_registered = B_TRUE;
+
+			if (tb[ATTR_NUM(value)].sa_name) {
+				continue;
+			}
+			tb[ATTR_NUM(value)].sa_name =
+			    kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
+			(void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
+			    strlen(za.za_name) +1);
+		}
+		zap_cursor_fini(&zc);
+		/*
+		 * Make sure we processed the correct number of registered
+		 * attributes
+		 */
+		if (registered_count != sa_reg_count) {
+			ASSERT(error != 0);
+			goto bail;
+		}
+
+	}
+
+	if (ostype == DMU_OST_ZFS) {
+		for (i = 0; i != sa_legacy_attr_count; i++) {
+			if (tb[i].sa_name)
+				continue;
+			tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
+			tb[i].sa_length = sa_legacy_attrs[i].sa_length;
+			tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
+			tb[i].sa_registered = B_FALSE;
+			tb[i].sa_name =
+			    kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
+			    KM_SLEEP);
+			(void) strlcpy(tb[i].sa_name,
+			    sa_legacy_attrs[i].sa_name,
+			    strlen(sa_legacy_attrs[i].sa_name) + 1);
+		}
+	}
+
+	for (i = 0; i != count; i++) {
+		sa_attr_type_t attr_id;
+
+		attr_id = sa->sa_user_table[i];
+		if (tb[attr_id].sa_name)
+			continue;
+
+		tb[attr_id].sa_length = reg_attrs[i].sa_length;
+		tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
+		tb[attr_id].sa_attr = attr_id;
+		tb[attr_id].sa_name =
+		    kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
+		(void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
+		    strlen(reg_attrs[i].sa_name) + 1);
+	}
+
+	sa->sa_need_attr_registration =
+	    (sa_attr_count != registered_count);
+
+	return (0);
+bail:
+	kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
+	sa->sa_user_table = NULL;
+	sa_free_attr_table(sa);
+	return ((error != 0) ? error : EINVAL);
+}
+
+int
+sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
+    sa_attr_type_t **user_table)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	sa_os_t *sa;
+	dmu_objset_type_t ostype = dmu_objset_type(os);
+	sa_attr_type_t *tb;
+	int error;
+
+	mutex_enter(&os->os_user_ptr_lock);
+	if (os->os_sa) {
+		mutex_enter(&os->os_sa->sa_lock);
+		mutex_exit(&os->os_user_ptr_lock);
+		tb = os->os_sa->sa_user_table;
+		mutex_exit(&os->os_sa->sa_lock);
+		*user_table = tb;
+		return (0);
+	}
+
+	sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
+	mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+	sa->sa_master_obj = sa_obj;
+
+	os->os_sa = sa;
+	mutex_enter(&sa->sa_lock);
+	mutex_exit(&os->os_user_ptr_lock);
+	avl_create(&sa->sa_layout_num_tree, layout_num_compare,
+	    sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
+	avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
+	    sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
+
+	if (sa_obj) {
+		error = zap_lookup(os, sa_obj, SA_LAYOUTS,
+		    8, 1, &sa->sa_layout_attr_obj);
+		if (error != 0 && error != ENOENT)
+			goto fail;
+		error = zap_lookup(os, sa_obj, SA_REGISTRY,
+		    8, 1, &sa->sa_reg_attr_obj);
+		if (error != 0 && error != ENOENT)
+			goto fail;
+	}
+
+	if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
+		goto fail;
+
+	if (sa->sa_layout_attr_obj != 0) {
+		uint64_t layout_count;
+
+		error = zap_count(os, sa->sa_layout_attr_obj,
+		    &layout_count);
+
+		/*
+		 * Layout number count should be > 0
+		 */
+		if (error || (error == 0 && layout_count == 0)) {
+			if (error == 0)
+				error = SET_ERROR(EINVAL);
+			goto fail;
+		}
+
+		for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
+		    (error = zap_cursor_retrieve(&zc, &za)) == 0;
+		    zap_cursor_advance(&zc)) {
+			sa_attr_type_t *lot_attrs;
+			uint64_t lot_num;
+
+			lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
+			    za.za_num_integers, KM_SLEEP);
+
+			if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
+			    za.za_name, 2, za.za_num_integers,
+			    lot_attrs))) != 0) {
+				kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+				    za.za_num_integers);
+				break;
+			}
+			VERIFY(ddi_strtoull(za.za_name, NULL, 10,
+			    (unsigned long long *)&lot_num) == 0);
+
+			(void) sa_add_layout_entry(os, lot_attrs,
+			    za.za_num_integers, lot_num,
+			    sa_layout_info_hash(lot_attrs,
+			    za.za_num_integers), B_FALSE, NULL);
+			kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+			    za.za_num_integers);
+		}
+		zap_cursor_fini(&zc);
+
+		/*
+		 * Make sure layout count matches number of entries added
+		 * to AVL tree
+		 */
+		if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
+			ASSERT(error != 0);
+			goto fail;
+		}
+	}
+
+	/* Add special layout number for old ZNODES */
+	if (ostype == DMU_OST_ZFS) {
+		(void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
+		    sa_legacy_attr_count, 0,
+		    sa_layout_info_hash(sa_legacy_zpl_layout,
+		    sa_legacy_attr_count), B_FALSE, NULL);
+
+		(void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
+		    0, B_FALSE, NULL);
+	}
+	*user_table = os->os_sa->sa_user_table;
+	mutex_exit(&sa->sa_lock);
+	return (0);
+fail:
+	os->os_sa = NULL;
+	sa_free_attr_table(sa);
+	if (sa->sa_user_table)
+		kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+	mutex_exit(&sa->sa_lock);
+	avl_destroy(&sa->sa_layout_hash_tree);
+	avl_destroy(&sa->sa_layout_num_tree);
+	mutex_destroy(&sa->sa_lock);
+	kmem_free(sa, sizeof (sa_os_t));
+	return ((error == ECKSUM) ? EIO : error);
+}
+
+void
+sa_tear_down(objset_t *os)
+{
+	sa_os_t *sa = os->os_sa;
+	sa_lot_t *layout;
+	void *cookie;
+
+	kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+
+	/* Free up attr table */
+
+	sa_free_attr_table(sa);
+
+	cookie = NULL;
+	while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) {
+		sa_idx_tab_t *tab;
+		while (tab = list_head(&layout->lot_idx_tab)) {
+			ASSERT(refcount_count(&tab->sa_refcount));
+			sa_idx_tab_rele(os, tab);
+		}
+	}
+
+	cookie = NULL;
+	while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) {
+#ifdef __NetBSD__
+		if (layout->lot_attr_count != 0)
+#endif
+		kmem_free(layout->lot_attrs,
+		    sizeof (sa_attr_type_t) * layout->lot_attr_count);
+		kmem_free(layout, sizeof (sa_lot_t));
+	}
+
+	avl_destroy(&sa->sa_layout_hash_tree);
+	avl_destroy(&sa->sa_layout_num_tree);
+	mutex_destroy(&sa->sa_lock);
+
+	kmem_free(sa, sizeof (sa_os_t));
+	os->os_sa = NULL;
+}
+
+void
+sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
+    uint16_t length, int length_idx, boolean_t var_length, void *userp)
+{
+	sa_idx_tab_t *idx_tab = userp;
+
+	if (var_length) {
+		ASSERT(idx_tab->sa_variable_lengths);
+		idx_tab->sa_variable_lengths[length_idx] = length;
+	}
+	TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx,
+	    (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr));
+}
+
+static void
+sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
+    sa_iterfunc_t func, sa_lot_t *tab, void *userp)
+{
+	void *data_start;
+	sa_lot_t *tb = tab;
+	sa_lot_t search;
+	avl_index_t loc;
+	sa_os_t *sa = os->os_sa;
+	int i;
+	uint16_t *length_start = NULL;
+	uint8_t length_idx = 0;
+
+	if (tab == NULL) {
+		search.lot_num = SA_LAYOUT_NUM(hdr, type);
+		tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+		ASSERT(tb);
+	}
+
+	if (IS_SA_BONUSTYPE(type)) {
+		data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
+		    offsetof(sa_hdr_phys_t, sa_lengths) +
+		    (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
+		length_start = hdr->sa_lengths;
+	} else {
+		data_start = hdr;
+	}
+
+	for (i = 0; i != tb->lot_attr_count; i++) {
+		int attr_length, reg_length;
+		uint8_t idx_len;
+
+		reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
+		if (reg_length) {
+			attr_length = reg_length;
+			idx_len = 0;
+		} else {
+			attr_length = length_start[length_idx];
+			idx_len = length_idx++;
+		}
+
+		func(hdr, data_start, tb->lot_attrs[i], attr_length,
+		    idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
+
+		data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+		    attr_length), 8);
+	}
+}
+
+/*ARGSUSED*/
+void
+sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
+    uint16_t length, int length_idx, boolean_t variable_length, void *userp)
+{
+	sa_handle_t *hdl = userp;
+	sa_os_t *sa = hdl->sa_os->os_sa;
+
+	sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length);
+}
+
+void
+sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+	sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+	dmu_buf_impl_t *db;
+	sa_os_t *sa = hdl->sa_os->os_sa;
+	int num_lengths = 1;
+	int i;
+
+	ASSERT(MUTEX_HELD(&sa->sa_lock));
+	if (sa_hdr_phys->sa_magic == SA_MAGIC)
+		return;
+
+	db = SA_GET_DB(hdl, buftype);
+
+	if (buftype == SA_SPILL) {
+		arc_release(db->db_buf, NULL);
+		arc_buf_thaw(db->db_buf);
+	}
+
+	sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
+	sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
+
+	/*
+	 * Determine the number of variable lengths in the header.
+	 * The standard 8 byte header has one for free and a
+	 * 16 byte header would have 4 + 1.
+	 */
+	if (SA_HDR_SIZE(sa_hdr_phys) > 8)
+		num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
+	for (i = 0; i != num_lengths; i++)
+		sa_hdr_phys->sa_lengths[i] =
+		    BSWAP_16(sa_hdr_phys->sa_lengths[i]);
+
+	sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
+	    sa_byteswap_cb, NULL, hdl);
+
+	if (buftype == SA_SPILL)
+		arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
+}
+
+static int
+sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+	sa_hdr_phys_t *sa_hdr_phys;
+	dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
+	dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
+	sa_os_t *sa = hdl->sa_os->os_sa;
+	sa_idx_tab_t *idx_tab;
+
+	sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+
+	mutex_enter(&sa->sa_lock);
+
+	/* Do we need to byteswap? */
+
+	/* only check if not old znode */
+	if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC &&
+	    sa_hdr_phys->sa_magic != 0) {
+		VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC);
+		sa_byteswap(hdl, buftype);
+	}
+
+	idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
+
+	if (buftype == SA_BONUS)
+		hdl->sa_bonus_tab = idx_tab;
+	else
+		hdl->sa_spill_tab = idx_tab;
+
+	mutex_exit(&sa->sa_lock);
+	return (0);
+}
+
+/*ARGSUSED*/
+static void
+sa_evict_sync(void *dbu)
+{
+	panic("evicting sa dbuf\n");
+}
+
+static void
+sa_idx_tab_rele(objset_t *os, void *arg)
+{
+	sa_os_t *sa = os->os_sa;
+	sa_idx_tab_t *idx_tab = arg;
+
+	if (idx_tab == NULL)
+		return;
+
+	mutex_enter(&sa->sa_lock);
+	if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) {
+		list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
+		if (idx_tab->sa_variable_lengths)
+			kmem_free(idx_tab->sa_variable_lengths,
+			    sizeof (uint16_t) *
+			    idx_tab->sa_layout->lot_var_sizes);
+		refcount_destroy(&idx_tab->sa_refcount);
+		kmem_free(idx_tab->sa_idx_tab,
+		    sizeof (uint32_t) * sa->sa_num_attrs);
+		kmem_free(idx_tab, sizeof (sa_idx_tab_t));
+	}
+	mutex_exit(&sa->sa_lock);
+}
+
+static void
+sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
+{
+	sa_os_t *sa = os->os_sa;
+
+	ASSERT(MUTEX_HELD(&sa->sa_lock));
+	(void) refcount_add(&idx_tab->sa_refcount, NULL);
+}
+
+void
+sa_handle_destroy(sa_handle_t *hdl)
+{
+	dmu_buf_t *db = hdl->sa_bonus;
+
+	mutex_enter(&hdl->sa_lock);
+	(void) dmu_buf_remove_user(db, &hdl->sa_dbu);
+
+	if (hdl->sa_bonus_tab)
+		sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+
+	if (hdl->sa_spill_tab)
+		sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+
+	dmu_buf_rele(hdl->sa_bonus, NULL);
+
+	if (hdl->sa_spill)
+		dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL);
+	mutex_exit(&hdl->sa_lock);
+
+	kmem_cache_free(sa_cache, hdl);
+}
+
+int
+sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
+    sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+	int error = 0;
+	dmu_object_info_t doi;
+	sa_handle_t *handle = NULL;
+
+#ifdef ZFS_DEBUG
+	dmu_object_info_from_db(db, &doi);
+	ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
+	    doi.doi_bonus_type == DMU_OT_ZNODE);
+#endif
+	/* find handle, if it exists */
+	/* if one doesn't exist then create a new one, and initialize it */
+
+	if (hdl_type == SA_HDL_SHARED)
+		handle = dmu_buf_get_user(db);
+
+	if (handle == NULL) {
+		sa_handle_t *winner = NULL;
+
+		handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
+		handle->sa_dbu.dbu_evict_func_sync = NULL;
+		handle->sa_dbu.dbu_evict_func_async = NULL;
+		handle->sa_userp = userp;
+		handle->sa_bonus = db;
+		handle->sa_os = os;
+		handle->sa_spill = NULL;
+		handle->sa_bonus_tab = NULL;
+		handle->sa_spill_tab = NULL;
+
+		error = sa_build_index(handle, SA_BONUS);
+
+		if (hdl_type == SA_HDL_SHARED) {
+			dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL,
+			    NULL);
+			winner = dmu_buf_set_user_ie(db, &handle->sa_dbu);
+		}
+
+		if (winner != NULL) {
+			kmem_cache_free(sa_cache, handle);
+			handle = winner;
+		}
+	}
+	*handlepp = handle;
+
+	return (error);
+}
+
+int
+sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
+    sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+	dmu_buf_t *db;
+	int error;
+
+	if (error = dmu_bonus_hold(objset, objid, NULL, &db))
+		return (error);
+
+	return (sa_handle_get_from_db(objset, db, userp, hdl_type,
+	    handlepp));
+}
+
+int
+sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
+{
+	return (dmu_bonus_hold(objset, obj_num, tag, db));
+}
+
+void
+sa_buf_rele(dmu_buf_t *db, void *tag)
+{
+	dmu_buf_rele(db, tag);
+}
+
+int
+sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
+{
+	ASSERT(hdl);
+	ASSERT(MUTEX_HELD(&hdl->sa_lock));
+	return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
+}
+
+int
+sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
+{
+	int error;
+	sa_bulk_attr_t bulk;
+
+	bulk.sa_attr = attr;
+	bulk.sa_data = buf;
+	bulk.sa_length = buflen;
+	bulk.sa_data_func = NULL;
+
+	ASSERT(hdl);
+	mutex_enter(&hdl->sa_lock);
+	error = sa_lookup_impl(hdl, &bulk, 1);
+	mutex_exit(&hdl->sa_lock);
+	return (error);
+}
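+
+/*
+ * Usage sketch (illustrative only; "attr_id" and the uint64_t value
+ * layout are assumptions, not part of this change):
+ *
+ *	uint64_t val;
+ *	int err = sa_lookup(hdl, attr_id, &val, sizeof (val));
+ *
+ * err is ENOENT when the attribute is not present on the object.
+ */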
+
+#ifdef _KERNEL
+int
+sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
+{
+	int error;
+	sa_bulk_attr_t bulk;
+
+	bulk.sa_data = NULL;
+	bulk.sa_attr = attr;
+	bulk.sa_data_func = NULL;
+
+	ASSERT(hdl);
+
+	mutex_enter(&hdl->sa_lock);
+	if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
+		error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
+		    uio->uio_resid), UIO_READ, uio);
+	}
+	mutex_exit(&hdl->sa_lock);
+	return (error);
+
+}
+#endif
+
+void *
+sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data)
+{
+	sa_idx_tab_t *idx_tab;
+	sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data;
+	sa_os_t *sa = os->os_sa;
+	sa_lot_t *tb, search;
+	avl_index_t loc;
+
+	/*
+	 * Determine the layout number.  If this is an SA node and
+	 * header == 0, then
+	 * force the index table to the dummy "1" empty layout.
+	 *
+	 * The layout number would only be zero for a newly created file
+	 * that has not added any attributes yet, or with crypto enabled which
+	 * doesn't write any attributes to the bonus buffer.
+	 */
+
+	search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
+
+	tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+
+	/* Verify header size is consistent with layout information */
+	ASSERT(tb);
+	ASSERT(IS_SA_BONUSTYPE(bonustype) &&
+	    SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) ||
+	    (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
+
+	/*
+	 * See if any of the already existing TOC entries can be reused.
+	 */
+
+	for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
+	    idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
+		boolean_t valid_idx = B_TRUE;
+		int i;
+
+		if (tb->lot_var_sizes != 0 &&
+		    idx_tab->sa_variable_lengths != NULL) {
+			for (i = 0; i != tb->lot_var_sizes; i++) {
+				if (hdr->sa_lengths[i] !=
+				    idx_tab->sa_variable_lengths[i]) {
+					valid_idx = B_FALSE;
+					break;
+				}
+			}
+		}
+		if (valid_idx) {
+			sa_idx_tab_hold(os, idx_tab);
+			return (idx_tab);
+		}
+	}
+
+	/* No such luck, create a new entry */
+	idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
+	idx_tab->sa_idx_tab =
+	    kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
+	idx_tab->sa_layout = tb;
+	refcount_create(&idx_tab->sa_refcount);
+	if (tb->lot_var_sizes)
+		idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
+		    tb->lot_var_sizes, KM_SLEEP);
+
+	sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
+	    tb, idx_tab);
+	sa_idx_tab_hold(os, idx_tab);   /* one hold for consumer */
+	sa_idx_tab_hold(os, idx_tab);	/* one for layout */
+	list_insert_tail(&tb->lot_idx_tab, idx_tab);
+	return (idx_tab);
+}
+
+void
+sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
+    boolean_t start, void *userdata)
+{
+	ASSERT(start);
+
+	*dataptr = userdata;
+	*len = total_len;
+}
+
+static void
+sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+	uint64_t attr_value = 0;
+	sa_os_t *sa = hdl->sa_os->os_sa;
+	sa_attr_table_t *tb = sa->sa_attr_table;
+	int i;
+
+	mutex_enter(&sa->sa_lock);
+
+	if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) {
+		mutex_exit(&sa->sa_lock);
+		return;
+	}
+
+	if (sa->sa_reg_attr_obj == 0) {
+		sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
+		    DMU_OT_SA_ATTR_REGISTRATION,
+		    sa->sa_master_obj, SA_REGISTRY, tx);
+	}
+	for (i = 0; i != sa->sa_num_attrs; i++) {
+		if (sa->sa_attr_table[i].sa_registered)
+			continue;
+		ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
+		    tb[i].sa_byteswap);
+		VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
+		    tb[i].sa_name, 8, 1, &attr_value, tx));
+		tb[i].sa_registered = B_TRUE;
+	}
+	sa->sa_need_attr_registration = B_FALSE;
+	mutex_exit(&sa->sa_lock);
+}
+
+/*
+ * Replace all attributes with the attributes specified in the template.
+ * If the dnode had a spill buffer, then those attributes will also be
+ * replaced, possibly with just an empty spill block.
+ *
+ * This interface is intended to only be used for bulk adding of
+ * attributes for a new file.  It will also be used by the ZPL
+ * when converting an old-format znode to native SA support.
+ */
+int
+sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+    int attr_count, dmu_tx_t *tx)
+{
+	sa_os_t *sa = hdl->sa_os->os_sa;
+
+	if (sa->sa_need_attr_registration)
+		sa_attr_register_sync(hdl, tx);
+	return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
+}
+
+int
+sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+    int attr_count, dmu_tx_t *tx)
+{
+	int error;
+
+	mutex_enter(&hdl->sa_lock);
+	error = sa_replace_all_by_template_locked(hdl, attr_desc,
+	    attr_count, tx);
+	mutex_exit(&hdl->sa_lock);
+	return (error);
+}
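+
+/*
+ * Bulk-create usage sketch (illustrative only; the attribute ids and
+ * values are hypothetical):
+ *
+ *	sa_bulk_attr_t attrs[2];
+ *	int count = 0;
+ *
+ *	SA_ADD_BULK_ATTR(attrs, count, mode_attr, NULL, &mode, 8);
+ *	SA_ADD_BULK_ATTR(attrs, count, size_attr, NULL, &size, 8);
+ *	error = sa_replace_all_by_template(hdl, attrs, count, tx);
+ */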
+
+/*
+ * Add/remove a single attribute or replace a variable-sized attribute value
+ * with a value of a different size, and then rewrite the entire set
+ * of attributes.
+ * Same-length attribute value replacement (including fixed-length attributes)
+ * is handled more efficiently by the upper layers.
+ */
+static int
+sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+    sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+    uint16_t buflen, dmu_tx_t *tx)
+{
+	sa_os_t *sa = hdl->sa_os->os_sa;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+	dnode_t *dn;
+	sa_bulk_attr_t *attr_desc;
+	void *old_data[2];
+	int bonus_attr_count = 0;
+	int bonus_data_size = 0;
+	int spill_data_size = 0;
+	int spill_attr_count = 0;
+	int error;
+	uint16_t length, reg_length;
+	int i, j, k, length_idx;
+	sa_hdr_phys_t *hdr;
+	sa_idx_tab_t *idx_tab;
+	int attr_count;
+	int count;
+
+	ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+	/* First make a copy of the old data */
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	if (dn->dn_bonuslen != 0) {
+		bonus_data_size = hdl->sa_bonus->db_size;
+		old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
+		bcopy(hdl->sa_bonus->db_data, old_data[0],
+		    hdl->sa_bonus->db_size);
+		bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
+	} else {
+		old_data[0] = NULL;
+	}
+	DB_DNODE_EXIT(db);
+
+	/* Bring spill buffer online if it isn't currently */
+
+	if ((error = sa_get_spill(hdl)) == 0) {
+		spill_data_size = hdl->sa_spill->db_size;
+		old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP);
+		bcopy(hdl->sa_spill->db_data, old_data[1],
+		    hdl->sa_spill->db_size);
+		spill_attr_count =
+		    hdl->sa_spill_tab->sa_layout->lot_attr_count;
+	} else if (error && error != ENOENT) {
+		if (old_data[0])
+			kmem_free(old_data[0], bonus_data_size);
+		return (error);
+	} else {
+		old_data[1] = NULL;
+	}
+
+	/* build descriptor of all attributes */
+
+	attr_count = bonus_attr_count + spill_attr_count;
+	if (action == SA_ADD)
+		attr_count++;
+	else if (action == SA_REMOVE)
+		attr_count--;
+
+	attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
+
+	/*
+	 * Loop through the bonus buffer, and the spill buffer if one
+	 * exists, and build up a new attribute descriptor to reset
+	 * the attributes.
+	 */
+	k = j = 0;
+	count = bonus_attr_count;
+	hdr = SA_GET_HDR(hdl, SA_BONUS);
+	idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
+	for (; k != 2; k++) {
+		/*
+		 * Iterate over each attribute in layout.  Fetch the
+		 * size of variable-length attributes needing rewrite
+		 * from sa_lengths[].
+		 */
+		for (i = 0, length_idx = 0; i != count; i++) {
+			sa_attr_type_t attr;
+
+			attr = idx_tab->sa_layout->lot_attrs[i];
+			reg_length = SA_REGISTERED_LEN(sa, attr);
+			if (reg_length == 0) {
+				length = hdr->sa_lengths[length_idx];
+				length_idx++;
+			} else {
+				length = reg_length;
+			}
+			if (attr == newattr) {
+				/*
+				 * There is nothing to do for SA_REMOVE,
+				 * so it is just skipped.
+				 */
+				if (action == SA_REMOVE)
+					continue;
+
+				/*
+				 * Duplicate attributes are not allowed, so the
+				 * action can not be SA_ADD here.
+				 */
+				ASSERT3S(action, ==, SA_REPLACE);
+
+				/*
+				 * Only a variable-sized attribute can be
+				 * replaced here, and its size must be changing.
+				 */
+				ASSERT3U(reg_length, ==, 0);
+				ASSERT3U(length, !=, buflen);
+				SA_ADD_BULK_ATTR(attr_desc, j, attr,
+				    locator, datastart, buflen);
+			} else {
+				SA_ADD_BULK_ATTR(attr_desc, j, attr,
+				    NULL, (void *)
+				    (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
+				    (uintptr_t)old_data[k]), length);
+			}
+		}
+		if (k == 0 && hdl->sa_spill) {
+			hdr = SA_GET_HDR(hdl, SA_SPILL);
+			idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
+			count = spill_attr_count;
+		} else {
+			break;
+		}
+	}
+	if (action == SA_ADD) {
+		reg_length = SA_REGISTERED_LEN(sa, newattr);
+		IMPLY(reg_length != 0, reg_length == buflen);
+		SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
+		    datastart, buflen);
+	}
+	ASSERT3U(j, ==, attr_count);
+
+	error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
+
+	if (old_data[0])
+		kmem_free(old_data[0], bonus_data_size);
+	if (old_data[1])
+		kmem_free(old_data[1], spill_data_size);
+	kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
+
+	return (error);
+}
+
+static int
+sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+    dmu_tx_t *tx)
+{
+	int error;
+	sa_os_t *sa = hdl->sa_os->os_sa;
+	dmu_object_type_t bonustype;
+
+	bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
+
+	ASSERT(hdl);
+	ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+	/* sync out registration table if necessary */
+	if (sa->sa_need_attr_registration)
+		sa_attr_register_sync(hdl, tx);
+
+	error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
+	if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
+		sa->sa_update_cb(hdl, tx);
+
+	return (error);
+}
+
+/*
+ * Update an existing attribute or add a new one.
+ */
+int
+sa_update(sa_handle_t *hdl, sa_attr_type_t type,
+    void *buf, uint32_t buflen, dmu_tx_t *tx)
+{
+	int error;
+	sa_bulk_attr_t bulk;
+
+	bulk.sa_attr = type;
+	bulk.sa_data_func = NULL;
+	bulk.sa_length = buflen;
+	bulk.sa_data = buf;
+
+	mutex_enter(&hdl->sa_lock);
+	error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+	mutex_exit(&hdl->sa_lock);
+	return (error);
+}
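+
+/*
+ * Transaction-wrapping sketch for sa_update() (illustrative only,
+ * under the usual DMU rules; "attr_id" and "val" are hypothetical):
+ *
+ *	tx = dmu_tx_create(os);
+ *	dmu_tx_hold_sa(tx, hdl, B_FALSE);
+ *	if ((error = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
+ *		dmu_tx_abort(tx);
+ *	} else {
+ *		error = sa_update(hdl, attr_id, &val, sizeof (val), tx);
+ *		dmu_tx_commit(tx);
+ *	}
+ */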
+
+int
+sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr,
+    uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx)
+{
+	int error;
+	sa_bulk_attr_t bulk;
+
+	bulk.sa_attr = attr;
+	bulk.sa_data = userdata;
+	bulk.sa_data_func = locator;
+	bulk.sa_length = buflen;
+
+	mutex_enter(&hdl->sa_lock);
+	error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+	mutex_exit(&hdl->sa_lock);
+	return (error);
+}
+
+/*
+ * Return size of an attribute
+ */
+
+int
+sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
+{
+	sa_bulk_attr_t bulk;
+	int error;
+
+	bulk.sa_data = NULL;
+	bulk.sa_attr = attr;
+	bulk.sa_data_func = NULL;
+
+	ASSERT(hdl);
+	mutex_enter(&hdl->sa_lock);
+	if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) {
+		mutex_exit(&hdl->sa_lock);
+		return (error);
+	}
+	*size = bulk.sa_size;
+
+	mutex_exit(&hdl->sa_lock);
+	return (0);
+}
+
+int
+sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+	ASSERT(hdl);
+	ASSERT(MUTEX_HELD(&hdl->sa_lock));
+	return (sa_lookup_impl(hdl, attrs, count));
+}
+
+int
+sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+	int error;
+
+	ASSERT(hdl);
+	mutex_enter(&hdl->sa_lock);
+	error = sa_bulk_lookup_locked(hdl, attrs, count);
+	mutex_exit(&hdl->sa_lock);
+	return (error);
+}
+
+int
+sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
+{
+	int error;
+
+	ASSERT(hdl);
+	mutex_enter(&hdl->sa_lock);
+	error = sa_bulk_update_impl(hdl, attrs, count, tx);
+	mutex_exit(&hdl->sa_lock);
+	return (error);
+}
+
+int
+sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
+{
+	int error;
+
+	mutex_enter(&hdl->sa_lock);
+	error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL,
+	    NULL, 0, tx);
+	mutex_exit(&hdl->sa_lock);
+	return (error);
+}
+
+void
+sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
+{
+	dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi);
+}
+
+void
+sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
+{
+	dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus,
+	    blksize, nblocks);
+}
+
+void
+sa_set_userp(sa_handle_t *hdl, void *ptr)
+{
+	hdl->sa_userp = ptr;
+}
+
+dmu_buf_t *
+sa_get_db(sa_handle_t *hdl)
+{
+	return ((dmu_buf_t *)hdl->sa_bonus);
+}
+
+void *
+sa_get_userdata(sa_handle_t *hdl)
+{
+	return (hdl->sa_userp);
+}
+
+void
+sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
+{
+	ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
+	os->os_sa->sa_update_cb = func;
+}
+
+void
+sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
+{
+
+	mutex_enter(&os->os_sa->sa_lock);
+	sa_register_update_callback_locked(os, func);
+	mutex_exit(&os->os_sa->sa_lock);
+}
+
+uint64_t
+sa_handle_object(sa_handle_t *hdl)
+{
+	return (hdl->sa_bonus->db_object);
+}
+
+boolean_t
+sa_enabled(objset_t *os)
+{
+	return (os->os_sa == NULL);
+}
+
+int
+sa_set_sa_object(objset_t *os, uint64_t sa_object)
+{
+	sa_os_t *sa = os->os_sa;
+
+	if (sa->sa_master_obj)
+		return (1);
+
+	sa->sa_master_obj = sa_object;
+
+	return (0);
+}
+
+int
+sa_hdrsize(void *arg)
+{
+	sa_hdr_phys_t *hdr = arg;
+
+	return (SA_HDR_SIZE(hdr));
+}
+
+void
+sa_handle_lock(sa_handle_t *hdl)
+{
+	ASSERT(hdl);
+	mutex_enter(&hdl->sa_lock);
+}
+
+void
+sa_handle_unlock(sa_handle_t *hdl)
+{
+	ASSERT(hdl);
+	mutex_exit(&hdl->sa_lock);
+}
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 sha256.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c	27 Feb 2010 22:31:01 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c	28 Jun 2017 00:06:33 -0000
@@ -22,19 +22,34 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+#include <crypto/sha2/sha256.h>
+#include <crypto/sha2/sha512t.h>
+#else
+#include <sha256.h>
+#include <sha512t.h>
+#endif
+#else
 #include <sys/sha2.h>
+#endif
 
+/*ARGSUSED*/
 void
-zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
+zio_checksum_SHA256(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
-	SHA2_CTX ctx;
+	SHA256_CTX ctx;
 	zio_cksum_t tmp;
 
-	SHA2Init(SHA256, &ctx);
-	SHA2Update(&ctx, buf, size);
-	SHA2Final(&tmp, &ctx);
+	SHA256_Init(&ctx);
+	SHA256_Update(&ctx, buf, size);
+	SHA256_Final((unsigned char *)&tmp, &ctx);
 
 	/*
 	 * A prior implementation of this function had a
@@ -48,3 +63,31 @@ zio_checksum_SHA256(const void *buf, uin
 	zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
 	zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
 }
+
+#ifndef __NetBSD__
+/*ARGSUSED*/
+void
+zio_checksum_SHA512_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	SHA512_CTX	ctx;
+
+	SHA512_256_Init(&ctx);
+	SHA512_256_Update(&ctx, buf, size);
+	SHA512_256_Final((unsigned char *)zcp, &ctx);
+}
+
+/*ARGSUSED*/
+void
+zio_checksum_SHA512_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	zio_cksum_t	tmp;
+
+	zio_checksum_SHA512_native(buf, size, ctx_template, &tmp);
+	zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+	zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+	zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+	zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+#endif
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/skein_zfs.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/skein_zfs.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/skein_zfs.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/skein_zfs.c	28 Jun 2017 00:19:57 -0000
@@ -0,0 +1,95 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Saso Kiselkov.  All rights reserved.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#ifdef _KERNEL
+#include <crypto/skein/skein.h>
+#else
+#include <skein.h>
+#endif
+
+/*
+ * Computes a native 256-bit skein MAC checksum. Please note that this
+ * function requires the presence of a ctx_template that should be allocated
+ * using zio_checksum_skein_tmpl_init.
+ */
+/*ARGSUSED*/
+void
+zio_checksum_skein_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	Skein_512_Ctxt_t	ctx;
+
+	ASSERT(ctx_template != NULL);
+	bcopy(ctx_template, &ctx, sizeof (ctx));
+	(void) Skein_512_Update(&ctx, buf, size);
+	(void) Skein_512_Final(&ctx, (uint8_t *)zcp);
+	bzero(&ctx, sizeof (ctx));
+}
+
+/*
+ * Byteswapped version of zio_checksum_skein_native. This just invokes
+ * the native checksum function and byteswaps the resulting checksum (since
+ * skein is internally endian-insensitive).
+ */
+void
+zio_checksum_skein_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	zio_cksum_t	tmp;
+
+	zio_checksum_skein_native(buf, size, ctx_template, &tmp);
+	zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+	zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+	zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+	zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+
+/*
+ * Allocates a skein MAC template suitable for use in skein MAC checksum
+ * computations and returns a pointer to it.
+ */
+void *
+zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
+{
+	Skein_512_Ctxt_t	*ctx;
+
+	ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+	(void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0,
+	    salt->zcs_bytes, sizeof (salt->zcs_bytes));
+	return (ctx);
+}
+
+/*
+ * Frees a skein context template previously allocated using
+ * zio_checksum_skein_tmpl_init.
+ */
+void
+zio_checksum_skein_tmpl_free(void *ctx_template)
+{
+	Skein_512_Ctxt_t	*ctx = ctx_template;
+
+	bzero(ctx, sizeof (*ctx));
+	kmem_free(ctx, sizeof (*ctx));
+}
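+
+/*
+ * Lifecycle sketch for the template interface above (illustrative
+ * only; "salt" is assumed to be the pool's checksum salt):
+ *
+ *	void *tmpl = zio_checksum_skein_tmpl_init(salt);
+ *	zio_checksum_skein_native(buf, size, tmpl, &cksum);
+ *	...
+ *	zio_checksum_skein_tmpl_free(tmpl);
+ */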
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c,v
retrieving revision 1.6
diff -u -p -r1.6 spa.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c	27 Mar 2014 15:50:48 -0000	1.6
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c	30 Apr 2017 14:02:00 -0000
@@ -1,4 +1,3 @@
-
 /*
  * CDDL HEADER START
  *
@@ -21,11 +20,19 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
  */
 
 /*
+ * SPA: Storage Pool Allocator
+ *
  * This file contains all the routines used when modifying on-disk SPA state.
  * This includes opening, importing, destroying, exporting a pool, and syncing a
  * pool.
@@ -58,40 +65,60 @@
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/callb.h>
-#include <sys/systeminfo.h>
 #include <sys/spa_boot.h>
 #include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zvol.h>
+#include <sys/trim_map.h>
 
 #ifdef	_KERNEL
-#include <sys/bootprops.h>
 #include <sys/callb.h>
 #include <sys/cpupart.h>
-#include <sys/pool.h>
-#include <sys/sysdc.h>
 #include <sys/zone.h>
 #endif	/* _KERNEL */
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 
+/* Check hostid on import? */
+static int check_hostid = 1;
+
+/*
+ * The interval, in seconds, at which failed configuration cache file writes
+ * should be retried.
+ */
+static int zfs_ccw_retry_interval = 300;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
+    "Check hostid on import?");
+TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
+    &zfs_ccw_retry_interval, 0,
+    "Configuration cache file write, retry after failure, interval (seconds)");
+
 typedef enum zti_modes {
-	zti_mode_fixed,			/* value is # of threads (min 1) */
-	zti_mode_online_percent,	/* value is % of online CPUs */
-	zti_mode_batch,			/* cpu-intensive; value is ignored */
-	zti_mode_null,			/* don't create a taskq */
-	zti_nmodes
+	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
+	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
+	ZTI_MODE_NULL,			/* don't create a taskq */
+	ZTI_NMODES
 } zti_modes_t;
 
-#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
-#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
-#define	ZTI_BATCH	{ zti_mode_batch, 0 }
-#define	ZTI_NULL	{ zti_mode_null, 0 }
+#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
+#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
+#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
 
-#define	ZTI_ONE		ZTI_FIX(1)
+#define	ZTI_N(n)	ZTI_P(n, 1)
+#define	ZTI_ONE		ZTI_N(1)
 
 typedef struct zio_taskq_info {
-	enum zti_modes zti_mode;
+	zti_modes_t zti_mode;
 	uint_t zti_value;
+	uint_t zti_count;
 } zio_taskq_info_t;
 
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
@@ -99,31 +126,53 @@ static const char *const zio_taskq_types
 };
 
 /*
- * Define the taskq threads for the following I/O types:
- * 	NULL, READ, WRITE, FREE, CLAIM, and IOCTL
+ * This table defines the taskq settings for each ZFS I/O type. When
+ * initializing a pool, we use this table to create an appropriately sized
+ * taskq. Some operations are low volume and therefore have a small, static
+ * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
+ * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macro causes us to create a taskq oriented for throughput. Some operations
+ * are so high frequency and short-lived that the taskq itself can become a
+ * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
+ * additional degree of parallelism specified by the number of threads per-
+ * taskq and the number of taskqs; when dispatching an event in this case, the
+ * particular taskq is chosen at random.
+ *
+ * The different taskq priorities are to handle the different contexts (issue
+ * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
+ * need to be handled with minimum delay.
  */
 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
-	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
-	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
-	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
-	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
-	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
-	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
+	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
+	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
+	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
 };
 
-static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
+static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, const char *name);
+static void spa_event_post(sysevent_t *ev);
+static void spa_sync_version(void *arg, dmu_tx_t *tx);
+static void spa_sync_props(void *arg, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
     char **ereport);
+static void spa_vdev_resilver_done(spa_t *spa);
 
-uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
+uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
+#ifdef PSRSET_BIND
 id_t		zio_taskq_psrset_bind = PS_NONE;
+#endif
+#ifdef SYSDC
 boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
 uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
+#endif
 
 boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
+extern int	zfs_sync_pass_deferred_free;
 
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
@@ -165,15 +214,16 @@ spa_prop_add_list(nvlist_t *nvl, zpool_p
 static void
 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 {
-	uint64_t size;
-	uint64_t alloc;
-	uint64_t cap, version;
+	vdev_t *rvd = spa->spa_root_vdev;
+	dsl_pool_t *pool = spa->spa_dsl_pool;
+	uint64_t size, alloc, cap, version;
 	zprop_source_t src = ZPROP_SRC_NONE;
 	spa_config_dirent_t *dp;
+	metaslab_class_t *mc = spa_normal_class(spa);
 
 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 
-	if (spa->spa_root_vdev != NULL) {
+	if (rvd != NULL) {
 		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 		size = metaslab_class_get_space(spa_normal_class(spa));
 		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
@@ -182,6 +232,13 @@ spa_prop_get_config(spa_t *spa, nvlist_t
 		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
 		    size - alloc, src);
 
+		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
+		    metaslab_class_fragmentation(mc), src);
+		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
+		    metaslab_class_expandable_space(mc), src);
+		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
+		    (spa_mode(spa) == FREAD), src);
+
 		cap = (size == 0) ? 0 : (alloc * 100 / size);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 
@@ -189,7 +246,7 @@ spa_prop_get_config(spa_t *spa, nvlist_t
 		    ddt_get_pool_dedup_ratio(spa), src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
-		    spa->spa_root_vdev->vdev_state, src);
+		    rvd->vdev_state, src);
 
 		version = spa_version(spa);
 		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
@@ -199,12 +256,49 @@ spa_prop_get_config(spa_t *spa, nvlist_t
 		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
 	}
 
+	if (pool != NULL) {
+		/*
+		 * The $FREE directory was introduced in
+		 * SPA_VERSION_DEADLISTS; when opening pools created before
+		 * this version, freedir will be NULL.
+		 */
+		if (pool->dp_free_dir != NULL) {
+			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
+			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
+			    src);
+		} else {
+			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
+			    NULL, 0, src);
+		}
+
+		if (pool->dp_leak_dir != NULL) {
+			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
+			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
+			    src);
+		} else {
+			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
+			    NULL, 0, src);
+		}
+	}
+
 	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 
+	if (spa->spa_comment != NULL) {
+		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
+		    0, ZPROP_SRC_LOCAL);
+	}
+
 	if (spa->spa_root != NULL)
 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
 		    0, ZPROP_SRC_LOCAL);
 
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
+	} else {
+		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
+	}
+
 	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 		if (dp->scd_path == NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -268,19 +362,18 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
 				dsl_dataset_t *ds = NULL;
 
 				dp = spa_get_dsl(spa);
-				rw_enter(&dp->dp_config_rwlock, RW_READER);
+				dsl_pool_config_enter(dp, FTAG);
 				if (err = dsl_dataset_hold_obj(dp,
 				    za.za_first_integer, FTAG, &ds)) {
-					rw_exit(&dp->dp_config_rwlock);
+					dsl_pool_config_exit(dp, FTAG);
 					break;
 				}
 
-				strval = kmem_alloc(
-				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
+				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
 				    KM_SLEEP);
 				dsl_dataset_name(ds, strval);
 				dsl_dataset_rele(ds, FTAG);
-				rw_exit(&dp->dp_config_rwlock);
+				dsl_pool_config_exit(dp, FTAG);
 			} else {
 				strval = NULL;
 				intval = za.za_first_integer;
@@ -289,8 +382,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
 			spa_prop_add_list(*nvp, prop, strval, intval, src);
 
 			if (strval != NULL)
-				kmem_free(strval,
-				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
+				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
 
 			break;
 
@@ -332,27 +424,57 @@ spa_prop_validate(spa_t *spa, nvlist_t *
 {
 	nvpair_t *elem;
 	int error = 0, reset_bootfs = 0;
-	uint64_t objnum;
+	uint64_t objnum = 0;
+	boolean_t has_feature = B_FALSE;
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
-		zpool_prop_t prop;
-		char *propname, *strval;
 		uint64_t intval;
-		objset_t *os;
-		char *slash;
+		char *strval, *slash, *check, *fname;
+		const char *propname = nvpair_name(elem);
+		zpool_prop_t prop = zpool_name_to_prop(propname);
+
+		switch (prop) {
+		case ZPROP_INVAL:
+			if (!zpool_prop_feature(propname)) {
+				error = SET_ERROR(EINVAL);
+				break;
+			}
 
-		propname = nvpair_name(elem);
+			/*
+			 * Sanitize the input.
+			 */
+			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
+				error = SET_ERROR(EINVAL);
+				break;
+			}
 
-		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
-			return (EINVAL);
+			if (nvpair_value_uint64(elem, &intval) != 0) {
+				error = SET_ERROR(EINVAL);
+				break;
+			}
+
+			if (intval != 0) {
+				error = SET_ERROR(EINVAL);
+				break;
+			}
+
+			fname = strchr(propname, '@') + 1;
+			if (zfeature_lookup_name(fname, NULL) != 0) {
+				error = SET_ERROR(EINVAL);
+				break;
+			}
+
+			has_feature = B_TRUE;
+			break;
 
-		switch (prop) {
 		case ZPOOL_PROP_VERSION:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error &&
-			    (intval < spa_version(spa) || intval > SPA_VERSION))
-				error = EINVAL;
+			    (intval < spa_version(spa) ||
+			    intval > SPA_VERSION_BEFORE_FEATURES ||
+			    has_feature))
+				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_DELEGATION:
@@ -361,7 +483,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *
 		case ZPOOL_PROP_AUTOEXPAND:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > 1)
-				error = EINVAL;
+				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_BOOTFS:
@@ -371,7 +493,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *
 			 * the bootfs property cannot be set.
 			 */
 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
-				error = ENOTSUP;
+				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
@@ -379,7 +501,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *
 			 * Make sure the vdev config is bootable
 			 */
 			if (!vdev_is_bootable(spa->spa_root_vdev)) {
-				error = ENOTSUP;
+				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
@@ -388,7 +510,8 @@ spa_prop_validate(spa_t *spa, nvlist_t *
 			error = nvpair_value_string(elem, &strval);
 
 			if (!error) {
-				uint64_t compress;
+				objset_t *os;
+				uint64_t propval;
 
 				if (strval == NULL || strval[0] == '\0') {
 					objnum = zpool_prop_default_numeric(
@@ -399,15 +522,20 @@ spa_prop_validate(spa_t *spa, nvlist_t *
 				if (error = dmu_objset_hold(strval, FTAG, &os))
 					break;
 
-				/* Must be ZPL and not gzip compressed. */
+				/*
+				 * Must be ZPL, and its property settings
+				 * must be supported by GRUB (compression
+				 * is not gzip, and large blocks are not used).
+				 */
 
 				if (dmu_objset_type(os) != DMU_OST_ZFS) {
-					error = ENOTSUP;
-				} else if ((error = dsl_prop_get_integer(strval,
+					error = SET_ERROR(ENOTSUP);
+				} else if ((error =
+				    dsl_prop_get_int_ds(dmu_objset_ds(os),
 				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
-				    &compress, NULL)) == 0 &&
-				    !BOOTFS_COMPRESS_VALID(compress)) {
-					error = ENOTSUP;
+				    &propval)) == 0 &&
+				    !BOOTFS_COMPRESS_VALID(propval)) {
+					error = SET_ERROR(ENOTSUP);
 				} else {
 					objnum = dmu_objset_id(os);
 				}
@@ -419,7 +547,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
 			    intval > ZIO_FAILURE_MODE_PANIC))
-				error = EINVAL;
+				error = SET_ERROR(EINVAL);
 
 			/*
 			 * This is a special case which only occurs when
@@ -433,7 +561,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *
 			 */
 			if (!error && spa_suspended(spa)) {
 				spa->spa_failmode = intval;
-				error = EIO;
+				error = SET_ERROR(EIO);
 			}
 			break;
 
@@ -448,7 +576,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *
 				break;
 
 			if (strval[0] != '/') {
-				error = EINVAL;
+				error = SET_ERROR(EINVAL);
 				break;
 			}
 
@@ -457,17 +585,36 @@ spa_prop_validate(spa_t *spa, nvlist_t *
 
 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
 			    strcmp(slash, "/..") == 0)
-				error = EINVAL;
+				error = SET_ERROR(EINVAL);
+			break;
+
+		case ZPOOL_PROP_COMMENT:
+			if ((error = nvpair_value_string(elem, &strval)) != 0)
+				break;
+			for (check = strval; *check != '\0'; check++) {
+				/*
+				 * The kernel doesn't have an easy isprint()
+				 * check.  Here we merely accept ASCII
+				 * apart from DEL.  Fix this if
+				 * there is an easy-to-use kernel isprint().
+				 */
+				if (*check >= 0x7f) {
+					error = SET_ERROR(EINVAL);
+					break;
+				}
+			}
+			if (strlen(strval) > ZPROP_MAX_COMMENT)
+				error = SET_ERROR(E2BIG);
 			break;
 
 		case ZPOOL_PROP_DEDUPDITTO:
 			if (spa_version(spa) < SPA_VERSION_DEDUP)
-				error = ENOTSUP;
+				error = SET_ERROR(ENOTSUP);
 			else
 				error = nvpair_value_uint64(elem, &intval);
 			if (error == 0 &&
 			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
-				error = EINVAL;
+				error = SET_ERROR(EINVAL);
 			break;
 		}
 
@@ -517,31 +664,59 @@ int
 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 {
 	int error;
-	nvpair_t *elem;
+	nvpair_t *elem = NULL;
 	boolean_t need_sync = B_FALSE;
-	zpool_prop_t prop;
 
 	if ((error = spa_prop_validate(spa, nvp)) != 0)
 		return (error);
 
-	elem = NULL;
 	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
-		if ((prop = zpool_name_to_prop(
-		    nvpair_name(elem))) == ZPROP_INVAL)
-			return (EINVAL);
+		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
+
+		if (prop == ZPOOL_PROP_CACHEFILE ||
+		    prop == ZPOOL_PROP_ALTROOT ||
+		    prop == ZPOOL_PROP_READONLY)
+			continue;
+
+		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
+			uint64_t ver;
 
-		if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
+			if (prop == ZPOOL_PROP_VERSION) {
+				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
+			} else {
+				ASSERT(zpool_prop_feature(nvpair_name(elem)));
+				ver = SPA_VERSION_FEATURES;
+				need_sync = B_TRUE;
+			}
+
+			/* Save time if the version is already set. */
+			if (ver == spa_version(spa))
+				continue;
+
+			/*
+			 * In addition to the pool directory object, we might
+			 * create the pool properties object, the features for
+			 * read object, the features for write object, or the
+			 * feature descriptions object.
+			 */
+			error = dsl_sync_task(spa->spa_name, NULL,
+			    spa_sync_version, &ver,
+			    6, ZFS_SPACE_CHECK_RESERVED);
+			if (error)
+				return (error);
 			continue;
+		}
 
 		need_sync = B_TRUE;
 		break;
 	}
 
-	if (need_sync)
-		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
-		    spa, nvp, 3));
-	else
-		return (0);
+	if (need_sync) {
+		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
+		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
+	}
+
+	return (0);
 }
 
 /*
@@ -558,6 +733,80 @@ spa_prop_clear_bootfs(spa_t *spa, uint64
 	}
 }
 
+/*ARGSUSED*/
+static int
+spa_change_guid_check(void *arg, dmu_tx_t *tx)
+{
+	uint64_t *newguid = arg;
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	vdev_t *rvd = spa->spa_root_vdev;
+	uint64_t vdev_state;
+
+	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+	vdev_state = rvd->vdev_state;
+	spa_config_exit(spa, SCL_STATE, FTAG);
+
+	if (vdev_state != VDEV_STATE_HEALTHY)
+		return (SET_ERROR(ENXIO));
+
+	ASSERT3U(spa_guid(spa), !=, *newguid);
+
+	return (0);
+}
+
+static void
+spa_change_guid_sync(void *arg, dmu_tx_t *tx)
+{
+	uint64_t *newguid = arg;
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	uint64_t oldguid;
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	oldguid = spa_guid(spa);
+
+	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+	rvd->vdev_guid = *newguid;
+	rvd->vdev_guid_sum += (*newguid - oldguid);
+	vdev_config_dirty(rvd);
+	spa_config_exit(spa, SCL_STATE, FTAG);
+
+	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
+	    oldguid, *newguid);
+}
+
+/*
+ * Change the GUID for the pool.  This is done so that we can later
+ * re-import a pool built from a clone of our own vdevs.  We will modify
+ * the root vdev's guid, our own pool guid, and then mark all of our
+ * vdevs dirty.  Note that we must make sure that all our vdevs are
+ * online when we do this, or else any vdevs that weren't present
+ * would be orphaned from our pool.  We are also going to issue a
+ * sysevent to update any watchers.
+ */
+int
+spa_change_guid(spa_t *spa)
+{
+	int error;
+	uint64_t guid;
+
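+	/*
+	 * Hold spa_vdev_top_lock and spa_namespace_lock across the sync
+	 * task so that no top-level vdev operation can race with the
+	 * GUID change.
+	 */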
+	mutex_enter(&spa->spa_vdev_top_lock);
+	mutex_enter(&spa_namespace_lock);
+	guid = spa_generate_guid(NULL);
+
+	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
+	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
+
+	if (error == 0) {
+		spa_config_sync(spa, B_FALSE, B_TRUE);
+		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
+	}
+
+	mutex_exit(&spa_namespace_lock);
+	mutex_exit(&spa->spa_vdev_top_lock);
+
+	return (error);
+}
+
 /*
  * ==========================================================================
  * SPA state manipulation (open/create/destroy/import/export)
@@ -572,7 +821,7 @@ spa_error_entry_compare(const void *a, c
 	int ret;
 
 	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
-	    sizeof (zbookmark_t));
+	    sizeof (zbookmark_phys_t));
 
 	if (ret < 0)
 		return (-1);
@@ -602,48 +851,141 @@ spa_get_errlists(spa_t *spa, avl_tree_t 
 	    offsetof(spa_error_entry_t, se_avl));
 }
 
-static taskq_t *
-spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
-    uint_t value)
+static void
+spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
-	uint_t flags = TASKQ_PREPOPULATE;
+	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+	enum zti_modes mode = ztip->zti_mode;
+	uint_t value = ztip->zti_value;
+	uint_t count = ztip->zti_count;
+	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+	char name[32];
+	uint_t flags = 0;
 	boolean_t batch = B_FALSE;
 
-	switch (mode) {
-	case zti_mode_null:
-		return (NULL);		/* no taskq needed */
+	if (mode == ZTI_MODE_NULL) {
+		tqs->stqs_count = 0;
+		tqs->stqs_taskq = NULL;
+		return;
+	}
 
-	case zti_mode_fixed:
+	ASSERT3U(count, >, 0);
+
+	tqs->stqs_count = count;
+	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
+
+	switch (mode) {
+	case ZTI_MODE_FIXED:
 		ASSERT3U(value, >=, 1);
 		value = MAX(value, 1);
 		break;
 
-	case zti_mode_batch:
+	case ZTI_MODE_BATCH:
 		batch = B_TRUE;
 		flags |= TASKQ_THREADS_CPU_PCT;
 		value = zio_taskq_batch_pct;
 		break;
 
-	case zti_mode_online_percent:
-		flags |= TASKQ_THREADS_CPU_PCT;
-		break;
-
 	default:
-		panic("unrecognized mode for %s taskq (%u:%u) in "
+		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 		    "spa_activate()",
-		    name, mode, value);
+		    zio_type_name[t], zio_taskq_types[q], mode, value);
 		break;
 	}
 
-	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
-		if (batch)
-			flags |= TASKQ_DC_BATCH;
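+	/*
+	 * Create the requested number of taskqs.  When more than one
+	 * taskq backs a type/queue pair, append an index to the name so
+	 * the individual queues can be told apart.
+	 */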
+	for (uint_t i = 0; i < count; i++) {
+		taskq_t *tq;
+
+		if (count > 1) {
+			(void) snprintf(name, sizeof (name), "%s_%s_%u",
+			    zio_type_name[t], zio_taskq_types[q], i);
+		} else {
+			(void) snprintf(name, sizeof (name), "%s_%s",
+			    zio_type_name[t], zio_taskq_types[q]);
+		}
+
+#ifdef SYSDC
+		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+			if (batch)
+				flags |= TASKQ_DC_BATCH;
+
+			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
+			    spa->spa_proc, zio_taskq_basedc, flags);
+		} else {
+#endif
+			pri_t pri = maxclsyspri;
+			/*
+			 * The write issue taskq can be extremely CPU
+			 * intensive.  Run it at slightly lower priority
+			 * than the other taskqs.
+			 * FreeBSD notes:
+			 * - numerically higher priorities are lower priorities;
+			 * - if priorities divided by four (RQ_PPQ) are equal
+			 *   then a difference between them is insignificant.
+			 */
+			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
+#ifdef illumos
+				pri--;
+#else
+				pri += 4;
+#endif
+
+			tq = taskq_create_proc(name, value, pri, 50,
+			    INT_MAX, spa->spa_proc, flags);
+#ifdef SYSDC
+		}
+#endif
+
+		tqs->stqs_taskq[i] = tq;
+	}
+}
+
+static void
+spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
+{
+	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+
+	if (tqs->stqs_taskq == NULL) {
+		ASSERT0(tqs->stqs_count);
+		return;
+	}
+
+	for (uint_t i = 0; i < tqs->stqs_count; i++) {
+		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
+		taskq_destroy(tqs->stqs_taskq[i]);
+	}
+
+	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
+	tqs->stqs_taskq = NULL;
+}
+
+/*
+ * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
+ * Note that a type may have multiple discrete taskqs to avoid lock contention
+ * on the taskq itself. In that case we choose which taskq at random by using
+ * the low bits of gethrtime().
+ */
+void
+spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
+{
+	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+	taskq_t *tq;
+
+	ASSERT3P(tqs->stqs_taskq, !=, NULL);
+	ASSERT3U(tqs->stqs_count, !=, 0);
 
-		return (taskq_create_sysdc(name, value, 50, INT_MAX,
-		    spa->spa_proc, zio_taskq_basedc, flags));
+	if (tqs->stqs_count == 1) {
+		tq = tqs->stqs_taskq[0];
+	} else {
+#if defined(__FreeBSD__) && defined(_KERNEL)
+		tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
+#else
+		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
+#endif
 	}
-	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
-	    spa->spa_proc, flags));
+
+	taskq_dispatch_ent(tq, func, arg, flags, ent);
 }
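+
+/*
+ * The typical caller is zio_taskq_dispatch() in zio.c, which does
+ * roughly:
+ *
+ *	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute,
+ *	    zio, flags, &zio->io_tqent);
+ */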
 
 static void
@@ -651,39 +993,30 @@ spa_create_zio_taskqs(spa_t *spa)
 {
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
-			enum zti_modes mode = ztip->zti_mode;
-			uint_t value = ztip->zti_value;
-			char name[32];
-
-			(void) snprintf(name, sizeof (name),
-			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);
-
-			spa->spa_zio_taskq[t][q] =
-			    spa_taskq_create(spa, name, mode, value);
+			spa_taskqs_init(spa, t, q);
 		}
 	}
 }
 
 #ifdef _KERNEL
+#ifdef SPA_PROCESS
 static void
 spa_thread(void *arg)
 {
 	callb_cpr_t cprinfo;
 
 	spa_t *spa = arg;
+	user_t *pu = PTOU(curproc);
 
 	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
 	    spa->spa_name);
 
 	ASSERT(curproc != &p0);
-#ifdef PORT_SOLARIS
-	user_t *pu = PTOU(curproc);
-
 	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
 	    "zpool-%s", spa->spa_name);
 	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
 
+#ifdef PSRSET_BIND
 	/* bind this thread to the requested psrset */
 	if (zio_taskq_psrset_bind != PS_NONE) {
 		pool_lock();
@@ -705,12 +1038,16 @@ spa_thread(void *arg)
 		mutex_exit(&cpu_lock);
 		pool_unlock();
 	}
+#endif
 
+#ifdef SYSDC
 	if (zio_taskq_sysdc) {
 		sysdc_thread_enter(curthread, 100, 0);
 	}
-#endif /* PORT_SOLARIS */
+#endif
+
 	spa->spa_proc = curproc;
+	spa->spa_did = curthread->t_did;
 
 	spa_create_zio_taskqs(spa);
 
@@ -731,9 +1068,10 @@ spa_thread(void *arg)
 	cv_broadcast(&spa->spa_proc_cv);
 	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
 
-/*	mutex_enter(curproc->p_lock);
-	lwp_exit(curproc); */
+	mutex_enter(&curproc->p_lock);
+	lwp_exit();
 }
+#endif	/* SPA_PROCESS */
 #endif
 
 /*
@@ -755,7 +1093,8 @@ spa_activate(spa_t *spa, int mode)
 	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
 	ASSERT(spa->spa_proc == &p0);
 	spa->spa_did = 0;
-#if 0
+
+#ifdef SPA_PROCESS
 	/* Only create a process if we're going to be around a while. */
 	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
 		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
@@ -775,17 +1114,25 @@ spa_activate(spa_t *spa, int mode)
 			    spa->spa_name);
 #endif
 		}
-	}	
-#endif
+	}
+#endif	/* SPA_PROCESS */
 	mutex_exit(&spa->spa_proc_lock);
 
 	/* If we didn't create a process, we need to create our taskqs. */
+	ASSERT(spa->spa_proc == &p0);
 	if (spa->spa_proc == &p0) {
 		spa_create_zio_taskqs(spa);
 	}
 
+	/*
+	 * Start TRIM thread.
+	 */
+	trim_thread_create(spa);
+
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
+	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
+	    offsetof(objset_t, os_evicting_node));
 	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_state_dirty_node));
 
@@ -812,16 +1159,23 @@ spa_deactivate(spa_t *spa)
 	ASSERT(spa->spa_async_zio_root == NULL);
 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
 
+	/*
+	 * Stop TRIM thread in case spa_unload() wasn't called directly
+	 * before spa_deactivate().
+	 */
+	trim_thread_destroy(spa);
+
+	spa_evicting_os_wait(spa);
+
 	txg_list_destroy(&spa->spa_vdev_txg_list);
 
 	list_destroy(&spa->spa_config_dirty_list);
+	list_destroy(&spa->spa_evicting_os_list);
 	list_destroy(&spa->spa_state_dirty_list);
 
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-			if (spa->spa_zio_taskq[t][q] != NULL)
-				taskq_destroy(spa->spa_zio_taskq[t][q]);
-			spa->spa_zio_taskq[t][q] = NULL;
+			spa_taskqs_fini(spa, t, q);
 		}
 	}
 
@@ -857,6 +1211,7 @@ spa_deactivate(spa_t *spa)
 	ASSERT(spa->spa_proc == &p0);
 	mutex_exit(&spa->spa_proc_lock);
 
+#ifdef SPA_PROCESS
 	/*
 	 * We want to make sure spa_thread() has actually exited the ZFS
 	 * module, so that the module can't be unloaded out from underneath
@@ -866,6 +1221,7 @@ spa_deactivate(spa_t *spa)
 		thread_join(spa->spa_did);
 		spa->spa_did = 0;
 	}
+#endif	/* SPA_PROCESS */
 }
 
 /*
@@ -897,7 +1253,7 @@ spa_config_parse(spa_t *spa, vdev_t **vd
 	if (error) {
 		vdev_free(*vdp);
 		*vdp = NULL;
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	for (int c = 0; c < children; c++) {
@@ -926,6 +1282,11 @@ spa_unload(spa_t *spa)
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/*
+	 * Stop TRIM thread.
+	 */
+	trim_thread_destroy(spa);
+
+	/*
 	 * Stop async tasks.
 	 */
 	spa_async_suspend(spa);
@@ -939,13 +1300,39 @@ spa_unload(spa_t *spa)
 	}
 
 	/*
+	 * Even though vdev_free() also calls vdev_metaslab_fini, we need
+	 * to call it earlier, before we wait for async i/o to complete.
+	 * This ensures that there is no async metaslab prefetching, by
+	 * calling taskq_wait(mg_taskq).
+	 */
+	if (spa->spa_root_vdev != NULL) {
+		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
+			vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
+		spa_config_exit(spa, SCL_ALL, FTAG);
+	}
+
+	/*
 	 * Wait for any outstanding async I/O to complete.
 	 */
 	if (spa->spa_async_zio_root != NULL) {
-		(void) zio_wait(spa->spa_async_zio_root);
+		for (int i = 0; i < max_ncpus; i++)
+			(void) zio_wait(spa->spa_async_zio_root[i]);
+		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
 		spa->spa_async_zio_root = NULL;
 	}
 
+	bpobj_close(&spa->spa_deferred_bpobj);
+
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+	/*
+	 * Close all vdevs.
+	 */
+	if (spa->spa_root_vdev)
+		vdev_free(spa->spa_root_vdev);
+	ASSERT(spa->spa_root_vdev == NULL);
+
 	/*
 	 * Close the dsl pool.
 	 */
@@ -957,20 +1344,11 @@ spa_unload(spa_t *spa)
 
 	ddt_unload(spa);
 
-	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-
 	/*
 	 * Drop and purge level 2 cache
 	 */
 	spa_l2cache_drop(spa);
 
-	/*
-	 * Close all vdevs.
-	 */
-	if (spa->spa_root_vdev)
-		vdev_free(spa->spa_root_vdev);
-	ASSERT(spa->spa_root_vdev == NULL);
-
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		vdev_free(spa->spa_spares.sav_vdevs[i]);
 	if (spa->spa_spares.sav_vdevs) {
@@ -984,8 +1362,10 @@ spa_unload(spa_t *spa)
 	}
 	spa->spa_spares.sav_count = 0;
 
-	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
+	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
+		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
 		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
+	}
 	if (spa->spa_l2cache.sav_vdevs) {
 		kmem_free(spa->spa_l2cache.sav_vdevs,
 		    spa->spa_l2cache.sav_count * sizeof (void *));
@@ -999,6 +1379,11 @@ spa_unload(spa_t *spa)
 
 	spa->spa_async_suspended = 0;
 
+	if (spa->spa_comment != NULL) {
+		spa_strfree(spa->spa_comment);
+		spa->spa_comment = NULL;
+	}
+
 	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
@@ -1109,7 +1494,7 @@ spa_load_spares(spa_t *spa)
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		spares[i] = vdev_config_generate(spa,
-		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
+		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
 	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
@@ -1143,6 +1528,7 @@ spa_load_l2cache(spa_t *spa)
 		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
 	} else {
 		nl2cache = 0;
+		newvdevs = NULL;
 	}
 
 	oldvdevs = sav->sav_vdevs;
@@ -1208,11 +1594,13 @@ spa_load_l2cache(spa_t *spa)
 
 		vd = oldvdevs[i];
 		if (vd != NULL) {
+			ASSERT(vd->vdev_isl2cache);
+
 			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
 			    pool != 0ULL && l2arc_vdev_present(vd))
 				l2arc_remove_vdev(vd);
-			(void) vdev_close(vd);
-			spa_l2cache_remove(vd);
+			vdev_clear_stats(vd);
+			vdev_free(vd);
 		}
 	}
 
@@ -1235,7 +1623,7 @@ spa_load_l2cache(spa_t *spa)
 	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
 	for (i = 0; i < sav->sav_count; i++)
 		l2cache[i] = vdev_config_generate(spa,
-		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
+		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
 	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
 out:
@@ -1254,7 +1642,10 @@ load_nvlist(spa_t *spa, uint64_t obj, nv
 	int error;
 	*value = NULL;
 
-	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
+	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
+	if (error != 0)
+		return (error);
+
 	nvsize = *(uint64_t *)db->db_data;
 	dmu_buf_rele(db, FTAG);
 
@@ -1278,70 +1669,193 @@ spa_check_removed(vdev_t *vd)
 	for (int c = 0; c < vd->vdev_children; c++)
 		spa_check_removed(vd->vdev_child[c]);
 
-	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
+	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
+	    !vd->vdev_ishole) {
 		zfs_post_autoreplace(vd->vdev_spa, vd);
 		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
 	}
 }
 
-/*
- * Load the slog device state from the config object since it's possible
- * that the label does not contain the most up-to-date information.
- */
-void
-spa_load_log_state(spa_t *spa, nvlist_t *nv)
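+/*
+ * Copy the per-vdev ZAP object numbers from the MOS copy of the vdev
+ * tree (mvd) into the in-core tree (vd), recursing over all children.
+ */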
+static void
+spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
 {
-	vdev_t *ovd, *rvd = spa->spa_root_vdev;
+	ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);
 
-	/*
-	 * Load the original root vdev tree from the passed config.
-	 */
-	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-	VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
+	vd->vdev_top_zap = mvd->vdev_top_zap;
+	vd->vdev_leaf_zap = mvd->vdev_leaf_zap;
 
-	for (int c = 0; c < rvd->vdev_children; c++) {
-		vdev_t *cvd = rvd->vdev_child[c];
-		if (cvd->vdev_islog)
-			vdev_load_log_state(cvd, ovd->vdev_child[c]);
+	for (uint64_t i = 0; i < vd->vdev_children; i++) {
+		spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
 	}
-	vdev_free(ovd);
-	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
- * Check for missing log devices
+ * Validate the current config against the MOS config
  */
-int
-spa_check_logs(spa_t *spa)
-{
-	switch (spa->spa_log_state) {
-	case SPA_LOG_MISSING:
-		/* need to recheck in case slog has been restored */
-	case SPA_LOG_UNKNOWN:
-		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
-		    DS_FIND_CHILDREN)) {
-			spa_set_log_state(spa, SPA_LOG_MISSING);
-			return (1);
-		}
-		break;
-	}
-	return (0);
-}
-
 static boolean_t
-spa_passivate_log(spa_t *spa)
+spa_config_valid(spa_t *spa, nvlist_t *config)
 {
-	vdev_t *rvd = spa->spa_root_vdev;
-	boolean_t slog_found = B_FALSE;
+	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
+	nvlist_t *nv;
 
-	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
 
-	if (!spa_has_slogs(spa))
-		return (B_FALSE);
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
 
-	for (int c = 0; c < rvd->vdev_children; c++) {
-		vdev_t *tvd = rvd->vdev_child[c];
-		metaslab_group_t *mg = tvd->vdev_mg;
+	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
+
+	/*
+	 * If we're doing a normal import, then build up any additional
+	 * diagnostic information about missing devices in this config.
+	 * We'll pass this up to the user for further processing.
+	 */
+	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
+		nvlist_t **child, *nv;
+		uint64_t idx = 0;
+
+		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
+		    KM_SLEEP);
+		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+		for (int c = 0; c < rvd->vdev_children; c++) {
+			vdev_t *tvd = rvd->vdev_child[c];
+			vdev_t *mtvd  = mrvd->vdev_child[c];
+
+			if (tvd->vdev_ops == &vdev_missing_ops &&
+			    mtvd->vdev_ops != &vdev_missing_ops &&
+			    mtvd->vdev_islog)
+				child[idx++] = vdev_config_generate(spa, mtvd,
+				    B_FALSE, 0);
+		}
+
+		if (idx) {
+			VERIFY(nvlist_add_nvlist_array(nv,
+			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
+			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
+
+			for (int i = 0; i < idx; i++)
+				nvlist_free(child[i]);
+		}
+		nvlist_free(nv);
+		kmem_free(child, rvd->vdev_children * sizeof (nvlist_t **));
+	}
+
+	/*
+	 * Compare the root vdev tree with the information we have
+	 * from the MOS config (mrvd). Check each top-level vdev
+	 * with the corresponding MOS config top-level (mtvd).
+	 */
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		vdev_t *mtvd  = mrvd->vdev_child[c];
+
+		/*
+		 * Resolve any "missing" vdevs in the current configuration.
+		 * If we find that the MOS config has more accurate information
+		 * about the top-level vdev then use that vdev instead.
+		 */
+		if (tvd->vdev_ops == &vdev_missing_ops &&
+		    mtvd->vdev_ops != &vdev_missing_ops) {
+
+			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
+				continue;
+
+			/*
+			 * Device specific actions.
+			 */
+			if (mtvd->vdev_islog) {
+				spa_set_log_state(spa, SPA_LOG_CLEAR);
+			} else {
+				/*
+				 * XXX - once we have 'readonly' pool
+				 * support we should be able to handle
+				 * missing data devices by transitioning
+				 * the pool to readonly.
+				 */
+				continue;
+			}
+
+			/*
+			 * Swap the missing vdev with the data we were
+			 * able to obtain from the MOS config.
+			 */
+			vdev_remove_child(rvd, tvd);
+			vdev_remove_child(mrvd, mtvd);
+
+			vdev_add_child(rvd, mtvd);
+			vdev_add_child(mrvd, tvd);
+
+			spa_config_exit(spa, SCL_ALL, FTAG);
+			vdev_load(mtvd);
+			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+			vdev_reopen(rvd);
+		} else {
+			if (mtvd->vdev_islog) {
+				/*
+				 * Load the slog device's state from the MOS
+				 * config since it's possible that the label
+				 * does not contain the most up-to-date
+				 * information.
+				 */
+				vdev_load_log_state(tvd, mtvd);
+				vdev_reopen(tvd);
+			}
+
+			/*
+			 * Per-vdev ZAP info is stored exclusively in the MOS.
+			 */
+			spa_config_valid_zaps(tvd, mtvd);
+		}
+	}
+
+	vdev_free(mrvd);
+	spa_config_exit(spa, SCL_ALL, FTAG);
+
+	/*
+	 * Ensure we were able to validate the config.
+	 */
+	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
+}
+
+/*
+ * Check for missing log devices
+ */
+static boolean_t
+spa_check_logs(spa_t *spa)
+{
+	boolean_t rv = B_FALSE;
+	dsl_pool_t *dp = spa_get_dsl(spa);
+
+	switch (spa->spa_log_state) {
+	case SPA_LOG_MISSING:
+		/* need to recheck in case slog has been restored */
+	case SPA_LOG_UNKNOWN:
+		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
+		if (rv)
+			spa_set_log_state(spa, SPA_LOG_MISSING);
+		break;
+	}
+	return (rv);
+}
+
+static boolean_t
+spa_passivate_log(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+	boolean_t slog_found = B_FALSE;
+
+	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+	if (!spa_has_slogs(spa))
+		return (B_FALSE);
+
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
 
 		if (tvd->vdev_islog) {
 			metaslab_group_passivate(mg);
@@ -1371,11 +1885,11 @@ spa_activate_log(spa_t *spa)
 int
 spa_offline_log(spa_t *spa)
 {
-	int error = 0;
-
-	if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
-	    NULL, DS_FIND_CHILDREN)) == 0) {
+	int error;
 
+	error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
+	    NULL, DS_FIND_CHILDREN);
+	if (error == 0) {
 		/*
 		 * We successfully offlined the log device, sync out the
 		 * current txg so that the "stubby" block can be removed
@@ -1389,7 +1903,9 @@ spa_offline_log(spa_t *spa)
 static void
 spa_aux_check_removed(spa_aux_vdev_t *sav)
 {
-	for (int i = 0; i < sav->sav_count; i++)
+	int i;
+
+	for (i = 0; i < sav->sav_count; i++)
 		spa_check_removed(sav->sav_vdevs[i]);
 }
 
@@ -1419,32 +1935,85 @@ spa_load_verify_done(zio_t *zio)
 	spa_load_error_t *sle = zio->io_private;
 	dmu_object_type_t type = BP_GET_TYPE(bp);
 	int error = zio->io_error;
+	spa_t *spa = zio->io_spa;
 
 	if (error) {
-		if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
+		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
 		    type != DMU_OT_INTENT_LOG)
-			atomic_add_64(&sle->sle_meta_count, 1);
+			atomic_inc_64(&sle->sle_meta_count);
 		else
-			atomic_add_64(&sle->sle_data_count, 1);
+			atomic_inc_64(&sle->sle_data_count);
 	}
 	zio_data_buf_free(zio->io_data, zio->io_size);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_inflight--;
+	cv_broadcast(&spa->spa_scrub_io_cv);
+	mutex_exit(&spa->spa_scrub_lock);
 }
 
+/*
+ * Maximum number of concurrent scrub i/os to create while verifying
+ * a pool during import.
+ */
+int spa_load_verify_maxinflight = 10000;
+boolean_t spa_load_verify_metadata = B_TRUE;
+boolean_t spa_load_verify_data = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
+    &spa_load_verify_maxinflight, 0,
+    "Maximum number of concurrent scrub I/Os to create while verifying a "
+    "pool during import");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
+    &spa_load_verify_metadata, 0,
+    "Check metadata on import?");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
+    &spa_load_verify_data, 0,
+    "Check user data on import?");
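+
+/*
+ * All three knobs above are CTLFLAG_RWTUN, so they can be set as loader
+ * tunables or adjusted at runtime, e.g.:
+ *
+ *	sysctl vfs.zfs.spa_load_verify_maxinflight=1000
+ */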
+
 /*ARGSUSED*/
 static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
-	if (bp != NULL) {
-		zio_t *rio = arg;
-		size_t size = BP_GET_PSIZE(bp);
-		void *data = zio_data_buf_alloc(size);
-
-		zio_nowait(zio_read(rio, spa, bp, data, size,
-		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
-		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
-		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
-	}
+	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+		return (0);
+	/*
+	 * Note: normally this routine will not be called if
+	 * spa_load_verify_metadata is not set.  However, it may be useful
+	 * to manually set the flag after the traversal has begun.
+	 */
+	if (!spa_load_verify_metadata)
+		return (0);
+	if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
+		return (0);
+
+	zio_t *rio = arg;
+	size_t size = BP_GET_PSIZE(bp);
+	void *data = zio_data_buf_alloc(size);
+
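+	/*
+	 * Throttle the verification reads: block until fewer than
+	 * spa_load_verify_maxinflight i/os are outstanding, then account
+	 * for this one.  spa_load_verify_done() decrements the count and
+	 * signals the cv.
+	 */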
+	mutex_enter(&spa->spa_scrub_lock);
+	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
+		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+	spa->spa_scrub_inflight++;
+	mutex_exit(&spa->spa_scrub_lock);
+
+	zio_nowait(zio_read(rio, spa, bp, data, size,
+	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
+	return (0);
+}
+
+/* ARGSUSED */
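+/*
+ * dmu_objset_find_dp() callback used by spa_load_verify() below to
+ * reject importing pools containing dataset names that do not fit in
+ * a ZFS_MAX_DATASET_NAME_LEN buffer.
+ */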
+int
+verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+	if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
+		return (SET_ERROR(ENAMETOOLONG));
+
 	return (0);
 }
 
@@ -1455,18 +2024,29 @@ spa_load_verify(spa_t *spa)
 	spa_load_error_t sle = { 0 };
 	zpool_rewind_policy_t policy;
 	boolean_t verify_ok = B_FALSE;
-	int error;
+	int error = 0;
 
 	zpool_get_rewind_policy(spa->spa_config, &policy);
 
 	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
 		return (0);
 
+	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
+	error = dmu_objset_find_dp(spa->spa_dsl_pool,
+	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
+	    DS_FIND_CHILDREN);
+	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+	if (error != 0)
+		return (error);
+
 	rio = zio_root(spa, NULL, &sle,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
 
-	error = traverse_pool(spa, spa->spa_verify_min_txg,
-	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
+	if (spa_load_verify_metadata) {
+		error = traverse_pool(spa, spa->spa_verify_min_txg,
+		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+		    spa_load_verify_cb, rio);
+	}
 
 	(void) zio_wait(rio);
 
@@ -1475,16 +2055,26 @@ spa_load_verify(spa_t *spa)
 
 	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
 	    sle.sle_data_count <= policy.zrp_maxdata) {
+		int64_t loss = 0;
+
 		verify_ok = B_TRUE;
 		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
 		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
+		VERIFY(nvlist_add_uint64(spa->spa_load_info,
+		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
+		VERIFY(nvlist_add_int64(spa->spa_load_info,
+		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
+		VERIFY(nvlist_add_uint64(spa->spa_load_info,
+		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
 	} else {
 		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
 	}
 
 	if (error) {
 		if (error != ENXIO && error != EIO)
-			error = EIO;
+			error = SET_ERROR(EIO);
 		return (error);
 	}
 
@@ -1606,12 +2196,17 @@ spa_load(spa_t *spa, spa_load_state_t st
 {
 	nvlist_t *config = spa->spa_config;
 	char *ereport = FM_EREPORT_ZFS_POOL;
+	char *comment;
 	int error;
 	uint64_t pool_guid;
 	nvlist_t *nvl;
 
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
+
+	ASSERT(spa->spa_comment == NULL);
+	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
+		spa->spa_comment = spa_strdup(comment);
 
 	/*
 	 * Versioning wasn't explicitly added to the label until later, so if
@@ -1626,9 +2221,9 @@ spa_load(spa_t *spa, spa_load_state_t st
 
 	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
 	    spa_guid_exists(pool_guid, 0)) {
-		error = EEXIST;
+		error = SET_ERROR(EEXIST);
 	} else {
-		spa->spa_load_guid = pool_guid;
+		spa->spa_config_guid = pool_guid;
 
 		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
 		    &nvl) == 0) {
@@ -1636,13 +2231,29 @@ spa_load(spa_t *spa, spa_load_state_t st
 			    KM_SLEEP) == 0);
 		}
 
+		nvlist_free(spa->spa_load_info);
+		spa->spa_load_info = fnvlist_alloc();
+
+		gethrestime(&spa->spa_loaded_ts);
 		error = spa_load_impl(spa, pool_guid, config, state, type,
 		    mosconfig, &ereport);
 	}
 
+	/*
+	 * Don't count references from objsets that are already closed
+	 * and are making their way through the eviction process.
+	 */
+	spa_evicting_os_wait(spa);
 	spa->spa_minref = refcount_count(&spa->spa_refcount);
-	if (error && error != EBADF)
-		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+	if (error) {
+		if (error != EEXIST) {
+			spa->spa_loaded_ts.tv_sec = 0;
+			spa->spa_loaded_ts.tv_nsec = 0;
+		}
+		if (error != EBADF) {
+			zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
+		}
+	}
 	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
 	spa->spa_ena = 0;
 
@@ -1650,6 +2261,34 @@ spa_load(spa_t *spa, spa_load_state_t st
 }
 
 /*
+ * Count the number of per-vdev ZAPs associated with all of the vdevs in the
+ * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
+ * spa's per-vdev ZAP list.
+ */
+static uint64_t
+vdev_count_verify_zaps(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	uint64_t total = 0;
+	if (vd->vdev_top_zap != 0) {
+		total++;
+		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, vd->vdev_top_zap));
+	}
+	if (vd->vdev_leaf_zap != 0) {
+		total++;
+		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
+	}
+
+	for (uint64_t i = 0; i < vd->vdev_children; i++) {
+		total += vdev_count_verify_zaps(vd->vdev_child[i]);
+	}
+
+	return (total);
+}
+
+/*
  * Load an existing storage pool, using the pool's builtin spa_config as a
  * source of configuration information.
  */
@@ -1659,12 +2298,15 @@ spa_load_impl(spa_t *spa, uint64_t pool_
     char **ereport)
 {
 	int error = 0;
-	nvlist_t *nvconfig, *nvroot = NULL;
+	nvlist_t *nvroot = NULL;
+	nvlist_t *label;
 	vdev_t *rvd;
 	uberblock_t *ub = &spa->spa_uberblock;
-	uint64_t config_cache_txg = spa->spa_config_txg;
+	uint64_t children, config_cache_txg = spa->spa_config_txg;
 	int orig_mode = spa->spa_mode;
 	int parse;
+	uint64_t obj;
+	boolean_t missing_feat_write = B_FALSE;
 
 	/*
 	 * If this is an untrusted config, access the pool in read-only mode.
@@ -1678,7 +2320,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 	spa->spa_load_state = state;
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	parse = (type == SPA_IMPORT_EXISTING ?
 	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
@@ -1686,8 +2328,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
-	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
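+	/*
+	 * One godfather zio is created per CPU; spreading the async
+	 * children across several roots avoids contention on a single
+	 * root zio's lock.
+	 */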
+	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
+	    KM_SLEEP);
+	for (int i = 0; i < max_ncpus; i++) {
+		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+		    ZIO_FLAG_GODFATHER);
+	}
 
 	/*
 	 * Parse the configuration into a vdev tree.  We explicitly set the
@@ -1702,6 +2349,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 		return (error);
 
 	ASSERT(spa->spa_root_vdev == rvd);
+	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
 
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_guid(spa) == pool_guid);
@@ -1731,38 +2380,101 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-		error = vdev_validate(rvd);
+		error = vdev_validate(rvd, mosconfig);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 
 		if (error != 0)
 			return (error);
 
 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
-			return (ENXIO);
+			return (SET_ERROR(ENXIO));
 	}
 
 	/*
 	 * Find the best uberblock.
 	 */
-	vdev_uberblock_load(NULL, rvd, ub);
+	vdev_uberblock_load(rvd, ub, &label);
 
 	/*
 	 * If we weren't able to find a single valid uberblock, return failure.
 	 */
-	if (ub->ub_txg == 0)
+	if (ub->ub_txg == 0) {
+		nvlist_free(label);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
+	}
 
 	/*
-	 * If the pool is newer than the code, we can't open it.
+	 * If the pool has an unsupported version we can't open it.
 	 */
-	if (ub->ub_version > SPA_VERSION)
+	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
+		nvlist_free(label);
 		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
+	}
+
+	if (ub->ub_version >= SPA_VERSION_FEATURES) {
+		nvlist_t *features;
+
+		/*
+		 * If we weren't able to find what's necessary for reading the
+		 * MOS in the label, return failure.
+		 */
+		if (label == NULL || nvlist_lookup_nvlist(label,
+		    ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
+			nvlist_free(label);
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+			    ENXIO));
+		}
+
+		/*
+		 * Update our in-core representation with the definitive values
+		 * from the label.
+		 */
+		nvlist_free(spa->spa_label_features);
+		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
+	}
+
+	nvlist_free(label);
+
+	/*
+	 * Look through entries in the label nvlist's features_for_read. If
+	 * there is a feature listed there which we don't understand then we
+	 * cannot open a pool.
+	 */
+	if (ub->ub_version >= SPA_VERSION_FEATURES) {
+		nvlist_t *unsup_feat;
+
+		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
+		    0);
+
+		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
+		    NULL); nvp != NULL;
+		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
+			if (!zfeature_is_supported(nvpair_name(nvp))) {
+				VERIFY(nvlist_add_string(unsup_feat,
+				    nvpair_name(nvp), "") == 0);
+			}
+		}
+
+		if (!nvlist_empty(unsup_feat)) {
+			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
+			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
+			nvlist_free(unsup_feat);
+			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+			    ENOTSUP));
+		}
+
+		nvlist_free(unsup_feat);
+	}
 
 	/*
 	 * If the vdev guid sum doesn't match the uberblock, we have an
-	 * incomplete configuration.
+	 * incomplete configuration.  We first check to see if the pool
+	 * is aware of the complete config (i.e., ZPOOL_CONFIG_VDEV_CHILDREN).
+	 * If it is, defer the vdev_guid_sum check till later so we
+	 * can handle missing vdevs.
 	 */
-	if (mosconfig && type != SPA_IMPORT_ASSEMBLE &&
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+	    &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
 	    rvd->vdev_guid_sum != ub->ub_guid_sum)
 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
 
@@ -1784,8 +2496,9 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
 	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
 	spa->spa_claim_max_txg = spa->spa_first_txg;
+	spa->spa_prev_software_version = ub->ub_software_version;
 
-	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 	if (error)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
@@ -1793,12 +2506,119 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
-	if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
+		boolean_t missing_feat_read = B_FALSE;
+		nvlist_t *unsup_feat, *enabled_feat;
+
+		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
+		    &spa->spa_feat_for_read_obj) != 0) {
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+		}
+
+		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
+		    &spa->spa_feat_for_write_obj) != 0) {
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+		}
+
+		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
+		    &spa->spa_feat_desc_obj) != 0) {
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+		}
+
+		enabled_feat = fnvlist_alloc();
+		unsup_feat = fnvlist_alloc();
+
+		if (!spa_features_check(spa, B_FALSE,
+		    unsup_feat, enabled_feat))
+			missing_feat_read = B_TRUE;
+
+		if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
+			if (!spa_features_check(spa, B_TRUE,
+			    unsup_feat, enabled_feat)) {
+				missing_feat_write = B_TRUE;
+			}
+		}
+
+		fnvlist_add_nvlist(spa->spa_load_info,
+		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
+
+		if (!nvlist_empty(unsup_feat)) {
+			fnvlist_add_nvlist(spa->spa_load_info,
+			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
+		}
+
+		fnvlist_free(enabled_feat);
+		fnvlist_free(unsup_feat);
+
+		if (!missing_feat_read) {
+			fnvlist_add_boolean(spa->spa_load_info,
+			    ZPOOL_CONFIG_CAN_RDONLY);
+		}
+
+		/*
+		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
+		 * twofold: to determine whether the pool is available for
+		 * import in read-write mode and (if it is not) whether the
+		 * pool is available for import in read-only mode. If the pool
+		 * is available for import in read-write mode, it is displayed
+		 * as available in userland; if it is not available for import
+		 * in read-only mode, it is displayed as unavailable in
+		 * userland. If the pool is available for import in read-only
+		 * mode but not read-write mode, it is displayed as unavailable
+		 * in userland with a special note that the pool is actually
+		 * available for open in read-only mode.
+		 *
+		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
+		 * missing a feature for write, we must first determine whether
+		 * the pool can be opened read-only before returning to
+		 * userland in order to know whether to display the
+		 * abovementioned note.
+		 */
+		if (missing_feat_read || (missing_feat_write &&
+		    spa_writeable(spa))) {
+			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+			    ENOTSUP));
+		}
+
+		/*
+		 * Load refcounts for ZFS features from disk into an in-memory
+		 * cache during SPA initialization.
+		 */
+		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+			uint64_t refcount;
+
+			error = feature_get_refcount_from_disk(spa,
+			    &spa_feature_table[i], &refcount);
+			if (error == 0) {
+				spa->spa_feat_refcount_cache[i] = refcount;
+			} else if (error == ENOTSUP) {
+				spa->spa_feat_refcount_cache[i] =
+				    SPA_FEATURE_DISABLED;
+			} else {
+				return (spa_vdev_err(rvd,
+				    VDEV_AUX_CORRUPT_DATA, EIO));
+			}
+		}
+	}
+
+	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
+		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
+		    &spa->spa_feat_enabled_txg_obj) != 0)
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+	}
+
+	spa->spa_is_initializing = B_TRUE;
+	error = dsl_pool_open(spa->spa_dsl_pool);
+	spa->spa_is_initializing = B_FALSE;
+	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	if (!mosconfig) {
 		uint64_t hostid;
-		nvlist_t *policy = NULL;
+		nvlist_t *policy = NULL, *nvconfig;
+
+		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 		if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
 		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
@@ -1817,15 +2637,16 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 			 */
 			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
 #endif	/* _KERNEL */
-			if (hostid != 0 && myhostid != 0 &&
+			if (check_hostid && hostid != 0 && myhostid != 0 &&
 			    hostid != myhostid) {
+				nvlist_free(nvconfig);
 				cmn_err(CE_WARN, "pool '%s' could not be "
 				    "loaded as it was last accessed by "
 				    "another system (host: %s hostid: 0x%lx). "
-				    "See: http://www.sun.com/msg/ZFS-8000-EY",
+				    "See: http://illumos.org/msg/ZFS-8000-EY",
 				    spa_name(spa), hostname,
 				    (unsigned long)hostid);
-				return (EBADF);
+				return (SET_ERROR(EBADF));
 			}
 		}
 		if (nvlist_lookup_nvlist(spa->spa_config,
@@ -1841,8 +2662,23 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
 	}
 
-	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPLIST,
-	    &spa->spa_deferred_bplist_obj) != 0)
+	/* Grab the secret checksum salt from the MOS. */
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_CHECKSUM_SALT, 1,
+	    sizeof (spa->spa_cksum_salt.zcs_bytes),
+	    spa->spa_cksum_salt.zcs_bytes);
+	if (error == ENOENT) {
+		/* Generate a new salt for subsequent use */
+		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+		    sizeof (spa->spa_cksum_salt.zcs_bytes));
+	} else if (error != 0) {
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+	}
+
+	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
+	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
@@ -1854,6 +2690,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
+	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
+	    &spa->spa_creation_version);
+	if (error != 0 && error != ENOENT)
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
 	/*
 	 * Load the persistent error log.  If we have an older pool, this will
 	 * not be present.
@@ -1876,6 +2717,39 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
+	 * Load the per-vdev ZAP map. If we have an older pool, this will not
+	 * be present; in this case, defer its creation to a later time to
+	 * avoid dirtying the MOS this early / out of sync context. See
+	 * spa_sync_config_object.
+	 */
+
+	/* The sentinel is only available in the MOS config. */
+	nvlist_t *mos_config;
+	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
+	    &spa->spa_all_vdev_zaps);
+
+	if (error != ENOENT && error != 0) {
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+	} else if (error == 0 && !nvlist_exists(mos_config,
+	    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
+		/*
+		 * An older version of ZFS overwrote the sentinel value, so
+		 * we have orphaned per-vdev ZAPs in the MOS. Defer their
+		 * destruction to later; see spa_sync_config_object.
+		 */
+		spa->spa_avz_action = AVZ_ACTION_DESTROY;
+		/*
+		 * We're assuming that no vdevs have had their ZAPs created
+		 * before this. Better be sure of it.
+		 */
+		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
+	}
+	nvlist_free(mos_config);
+
+	/*
 	 * If we're assembling the pool from the split-off vdevs of
 	 * an existing pool, we don't want to attach the spares & cache
 	 * devices.
@@ -1973,13 +2847,6 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
-	 * Check the state of the root vdev.  If it can't be opened, it
-	 * indicates one or more toplevel vdevs are faulted.
-	 */
-	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
-		return (ENXIO);
-
-	/*
 	 * Load the DDTs (dedup tables).
 	 */
 	error = ddt_load(spa);
@@ -1988,33 +2855,66 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 
 	spa_update_dspace(spa);
 
-	if (state != SPA_LOAD_TRYIMPORT) {
-		error = spa_load_verify(spa);
-		if (error)
-			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
-			    error));
-	}
-
 	/*
-	 * Load the intent log state and check log integrity.  If we're
-	 * assembling a pool from a split, the log is not transferred over.
+	 * Validate the config, using the MOS config to fill in any
+	 * information which might be missing.  If we fail to validate
+	 * the config then declare the pool unfit for use. If we're
+	 * assembling a pool from a split, the log is not transferred
+	 * over.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE) {
-		VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
-		    &nvroot) == 0);
-		spa_load_log_state(spa, nvroot);
+		nvlist_t *nvconfig;
+
+		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+		if (!spa_config_valid(spa, nvconfig)) {
+			nvlist_free(nvconfig);
+			return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
+			    ENXIO));
+		}
 		nvlist_free(nvconfig);
 
-		if (spa_check_logs(spa)) {
+		/*
+		 * Now that we've validated the config, check the state of the
+		 * root vdev.  If it can't be opened, it indicates one or
+		 * more toplevel vdevs are faulted.
+		 */
+		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
+			return (SET_ERROR(ENXIO));
+
+		if (spa_writeable(spa) && spa_check_logs(spa)) {
 			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
 		}
 	}
 
+	if (missing_feat_write) {
+		ASSERT(state == SPA_LOAD_TRYIMPORT);
+
+		/*
+		 * At this point, we know that we can open the pool in
+		 * read-only mode but not read-write mode. We now have enough
+		 * information and can return to userland.
+		 */
+		return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
+	}
+
+	/*
+	 * We've successfully opened the pool, verify that we're ready
+	 * to start pushing transactions.
+	 */
+	if (state != SPA_LOAD_TRYIMPORT) {
+		if (error = spa_load_verify(spa))
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+			    error));
+	}
+
 	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
 	    spa->spa_load_max_txg == UINT64_MAX)) {
 		dmu_tx_t *tx;
 		int need_update = B_FALSE;
+		dsl_pool_t *dp = spa_get_dsl(spa);
 
 		ASSERT(state != SPA_LOAD_TRYIMPORT);
 
@@ -2027,9 +2927,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 		 */
 		spa->spa_claiming = B_TRUE;
 
-		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
-		    spa_first_txg(spa));
-		(void) dmu_objset_find(spa_name(spa),
+		tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
+		(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 		    zil_claim, tx, DS_FIND_CHILDREN);
 		dmu_tx_commit(tx);
 
@@ -2052,12 +2951,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 		 * If the config cache is stale, or we have uninitialized
 		 * metaslabs (see spa_vdev_add()), then update the config.
 		 *
-		 * If spa_load_verbatim is true, trust the current
+		 * If this is a verbatim import, trust the current
 		 * in-core spa_config and update the disk labels.
 		 */
 		if (config_cache_txg != spa->spa_config_txg ||
-		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
-		    state == SPA_LOAD_RECOVER)
+		    state == SPA_LOAD_IMPORT ||
+		    state == SPA_LOAD_RECOVER ||
+		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
 			need_update = B_TRUE;
 
 		for (int c = 0; c < rvd->vdev_children; c++)
@@ -2074,10 +2974,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 		/*
 		 * Check all DTLs to see if anything needs resilvering.
 		 */
-		if (vdev_resilver_needed(rvd, NULL, NULL))
+		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+		    vdev_resilver_needed(rvd, NULL, NULL))
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 
 		/*
+		 * Log the fact that we booted up (so that we can detect if
+		 * we rebooted in the middle of an operation).
+		 */
+		spa_history_log_version(spa, "open");
+
+		/*
 		 * Delete any inconsistent datasets.
 		 */
 		(void) dmu_objset_find(spa_name(spa),
@@ -2095,21 +3002,31 @@ spa_load_impl(spa_t *spa, uint64_t pool_
 static int
 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
 {
+	int mode = spa->spa_mode;
+
 	spa_unload(spa);
 	spa_deactivate(spa);
 
-	spa->spa_load_max_txg--;
+	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
 
-	spa_activate(spa, spa_mode_global);
+	spa_activate(spa, mode);
 	spa_async_suspend(spa);
 
 	return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
 }
 
+/*
+ * If spa_load() fails, this function will try loading prior txgs.  If
+ * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
+ * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
+ * function will not rewind the pool and will return the same error as
+ * spa_load().
+ */
 static int
 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
     uint64_t max_request, int rewind_flags)
 {
+	nvlist_t *loadinfo = NULL;
 	nvlist_t *config = NULL;
 	int load_error, rewind_error;
 	uint64_t safe_rewind_txg;
@@ -2120,6 +3037,8 @@ spa_load_best(spa_t *spa, spa_load_state
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		spa->spa_load_max_txg = max_request;
+		if (max_request != UINT64_MAX)
+			spa->spa_extreme_rewind = B_TRUE;
 	}
 
 	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
@@ -2138,11 +3057,20 @@ spa_load_best(spa_t *spa, spa_load_state
 		return (load_error);
 	}
 
-	/* Price of rolling back is discarding txgs, including log */
-	if (state == SPA_LOAD_RECOVER)
+	if (state == SPA_LOAD_RECOVER) {
+		/* Price of rolling back is discarding txgs, including log */
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
-
-	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
+	} else {
+		/*
+		 * If we aren't rolling back save the load info from our first
+		 * import attempt so that we can restore it after attempting
+		 * to rewind.
+		 */
+		loadinfo = spa->spa_load_info;
+		spa->spa_load_info = fnvlist_alloc();
+	}
+
+	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
 	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
 	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
 	    TXG_INITIAL : safe_rewind_txg;
@@ -2158,16 +3086,26 @@ spa_load_best(spa_t *spa, spa_load_state
 		rewind_error = spa_load_retry(spa, state, mosconfig);
 	}
 
-	if (config)
-		spa_rewind_data_to_nvlist(spa, config);
-
 	spa->spa_extreme_rewind = B_FALSE;
 	spa->spa_load_max_txg = UINT64_MAX;
 
 	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
 		spa_config_set(spa, config);
 
-	return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
+	if (state == SPA_LOAD_RECOVER) {
+		ASSERT3P(loadinfo, ==, NULL);
+		return (rewind_error);
+	} else {
+		/* Store the rewind info as part of the initial load info */
+		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
+		    spa->spa_load_info);
+
+		/* Restore the initial load info */
+		fnvlist_free(spa->spa_load_info);
+		spa->spa_load_info = loadinfo;
+
+		return (load_error);
+	}
 }
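
The spa_load_info handling above is a stash-and-merge pattern that is easier to see in isolation. Below is a minimal userland sketch using plain libnvpair calls; the helper name and the "rewind_info" literal (which stands in for ZPOOL_CONFIG_REWIND_INFO) are illustrative, not part of the patch.

	#include <libnvpair.h>

	/*
	 * Stash the first attempt's info list, let the retry fill a fresh
	 * list, then fold the retry's findings back into the original
	 * under a sub-key and restore it.
	 */
	void
	stash_and_merge(nvlist_t **infop)
	{
		nvlist_t *saved = *infop;	/* first attempt's load info */
		nvlist_t *retry;

		if (nvlist_alloc(&retry, NV_UNIQUE_NAME, 0) != 0)
			return;
		*infop = retry;			/* the rewind retry writes here */

		/* ... the retry runs, adding pairs to *infop ... */

		(void) nvlist_add_nvlist(saved, "rewind_info", retry);
		nvlist_free(retry);
		*infop = saved;			/* restore the original list */
	}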
 
 /*
@@ -2187,10 +3125,10 @@ spa_open_common(const char *pool, spa_t 
     nvlist_t **config)
 {
 	spa_t *spa;
-	zpool_rewind_policy_t policy;
 	spa_load_state_t state = SPA_LOAD_OPEN;
 	int error;
 	int locked = B_FALSE;
+	int firstopen = B_FALSE;
 
 	*spapp = NULL;
 
@@ -2208,27 +3146,20 @@ spa_open_common(const char *pool, spa_t 
 	if ((spa = spa_lookup(pool)) == NULL) {
 		if (locked)
 			mutex_exit(&spa_namespace_lock);
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
 	}
 
-	zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy);
-	if (policy.zrp_request & ZPOOL_DO_REWIND)
-		state = SPA_LOAD_RECOVER;
-
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+		zpool_rewind_policy_t policy;
 
-		spa_activate(spa, spa_mode_global);
+		firstopen = B_TRUE;
 
-		if (spa->spa_last_open_failed && (policy.zrp_request &
-		    (ZPOOL_NO_REWIND | ZPOOL_NEVER_REWIND))) {
-			if (config != NULL && spa->spa_config)
-				VERIFY(nvlist_dup(spa->spa_config,
-				    config, KM_SLEEP) == 0);
-			spa_deactivate(spa);
-			if (locked)
-				mutex_exit(&spa_namespace_lock);
-			return (spa->spa_last_open_failed);
-		}
+		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
+		    &policy);
+		if (policy.zrp_request & ZPOOL_DO_REWIND)
+			state = SPA_LOAD_RECOVER;
+
+		spa_activate(spa, spa_mode_global);
 
 		if (state != SPA_LOAD_RECOVER)
 			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
@@ -2250,7 +3181,7 @@ spa_open_common(const char *pool, spa_t 
 			spa_remove(spa);
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
-			return (ENOENT);
+			return (SET_ERROR(ENOENT));
 		}
 
 		if (error) {
@@ -2259,9 +3190,13 @@ spa_open_common(const char *pool, spa_t 
 			 * information: the state of each vdev after the
 			 * attempted vdev_open().  Return this to the user.
 			 */
-			if (config != NULL && spa->spa_config)
+			if (config != NULL && spa->spa_config) {
 				VERIFY(nvlist_dup(spa->spa_config, config,
 				    KM_SLEEP) == 0);
+				VERIFY(nvlist_add_nvlist(*config,
+				    ZPOOL_CONFIG_LOAD_INFO,
+				    spa->spa_load_info) == 0);
+			}
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa->spa_last_open_failed = error;
@@ -2270,20 +3205,33 @@ spa_open_common(const char *pool, spa_t 
 			*spapp = NULL;
 			return (error);
 		}
-
 	}
 
 	spa_open_ref(spa, tag);
 
-
 	if (config != NULL)
 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
+	/*
+	 * If we've recovered the pool, pass back any information we
+	 * gathered while doing the load.
+	 */
+	if (state == SPA_LOAD_RECOVER) {
+		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
+		    spa->spa_load_info) == 0);
+	}
+
 	if (locked) {
 		spa->spa_last_open_failed = 0;
 		spa->spa_last_ubsync_txg = 0;
 		spa->spa_load_txg = 0;
 		mutex_exit(&spa_namespace_lock);
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+		if (firstopen)
+			zvol_create_minors(spa->spa_name);
+#endif
+#endif
 	}
 
 	*spapp = spa;
@@ -2372,7 +3320,7 @@ spa_add_spares(spa_t *spa, nvlist_t *con
 			if (spa_spare_exists(guid, &pool, NULL) &&
 			    pool != 0ULL) {
 				VERIFY(nvlist_lookup_uint64_array(
-				    spares[i], ZPOOL_CONFIG_STATS,
+				    spares[i], ZPOOL_CONFIG_VDEV_STATS,
 				    (uint64_t **)&vs, &vsc) == 0);
 				vs->vs_state = VDEV_STATE_CANT_OPEN;
 				vs->vs_aux = VDEV_AUX_SPARED;
@@ -2429,14 +3377,62 @@ spa_add_l2cache(spa_t *spa, nvlist_t *co
 			ASSERT(vd != NULL);
 
 			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
-			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
+			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+			    == 0);
 			vdev_get_stats(vd, vs);
 		}
 	}
 }
 
+static void
+spa_add_feature_stats(spa_t *spa, nvlist_t *config)
+{
+	nvlist_t *features;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+	VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+	/* We may be unable to read features if pool is suspended. */
+	if (spa_suspended(spa))
+		goto out;
+
+	if (spa->spa_feat_for_read_obj != 0) {
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    spa->spa_feat_for_read_obj);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+			    za.za_num_integers == 1);
+			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
+			    za.za_first_integer));
+		}
+		zap_cursor_fini(&zc);
+	}
+
+	if (spa->spa_feat_for_write_obj != 0) {
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    spa->spa_feat_for_write_obj);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+			    za.za_num_integers == 1);
+			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
+			    za.za_first_integer));
+		}
+		zap_cursor_fini(&zc);
+	}
+
+out:
+	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
+	    features) == 0);
+	nvlist_free(features);
+}
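
For context on how the list built above is consumed: every enabled feature appears under ZPOOL_CONFIG_FEATURE_STATS as a guid-string/uint64 refcount pair. A userland walker might look like the following sketch ("feature_stats" is ZPOOL_CONFIG_FEATURE_STATS's value; the function name is made up).

	#include <libnvpair.h>
	#include <stdio.h>

	void
	print_feature_stats(nvlist_t *config)
	{
		nvlist_t *features;
		nvpair_t *elem;

		if (nvlist_lookup_nvlist(config, "feature_stats", &features) != 0)
			return;		/* config carries no feature stats */

		for (elem = nvlist_next_nvpair(features, NULL); elem != NULL;
		    elem = nvlist_next_nvpair(features, elem)) {
			uint64_t refcount;

			if (nvpair_value_uint64(elem, &refcount) == 0)
				(void) printf("%s\t%llu\n", nvpair_name(elem),
				    (unsigned long long)refcount);
		}
	}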
+
 int
-spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
+spa_get_stats(const char *name, nvlist_t **config,
+    char *altroot, size_t buflen)
 {
 	int error;
 	spa_t *spa;
@@ -2453,6 +3449,13 @@ spa_get_stats(const char *name, nvlist_t
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 		if (*config != NULL) {
+			uint64_t loadtimes[2];
+
+			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
+			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
+			VERIFY(nvlist_add_uint64_array(*config,
+			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
+
 			VERIFY(nvlist_add_uint64(*config,
 			    ZPOOL_CONFIG_ERRCOUNT,
 			    spa_get_errlog_size(spa)) == 0);
@@ -2464,6 +3467,7 @@ spa_get_stats(const char *name, nvlist_t
 
 			spa_add_spares(spa, *config);
 			spa_add_l2cache(spa, *config);
+			spa_add_feature_stats(spa, *config);
 		}
 	}
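
The two-element array added above packs the pool's load time as { tv_sec, tv_nsec }. A decode-side sketch (the "initial_load_time" literal mirrors ZPOOL_CONFIG_LOADED_TIME as defined in zfs.h; the helper name is hypothetical):

	#include <libnvpair.h>
	#include <time.h>

	int
	get_loaded_time(nvlist_t *config, struct timespec *ts)
	{
		uint64_t *times;
		uint_t n;

		if (nvlist_lookup_uint64_array(config, "initial_load_time",
		    &times, &n) != 0 || n != 2)
			return (-1);

		ts->tv_sec = (time_t)times[0];	/* spa_loaded_ts.tv_sec */
		ts->tv_nsec = (long)times[1];	/* spa_loaded_ts.tv_nsec */
		return (0);
	}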
 
@@ -2519,14 +3523,14 @@ spa_validate_aux_devs(spa_t *spa, nvlist
 		return (0);
 
 	if (ndev == 0)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Make sure the pool is formatted with a version that supports this
 	 * device type.
 	 */
 	if (spa_version(spa) < version)
-		return (ENOTSUP);
+		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Set the pending device list so we correctly handle device in-use
@@ -2542,7 +3546,7 @@ spa_validate_aux_devs(spa_t *spa, nvlist
 
 		if (!vd->vdev_ops->vdev_op_leaf) {
 			vdev_free(vd);
-			error = EINVAL;
+			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 
@@ -2553,7 +3557,8 @@ spa_validate_aux_devs(spa_t *spa, nvlist
 #ifdef _KERNEL
 		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
 		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
-			error = ENOTBLK;
+			error = SET_ERROR(ENOTBLK);
+			vdev_free(vd);
 			goto out;
 		}
 #endif
@@ -2663,10 +3668,6 @@ spa_l2cache_drop(spa_t *spa)
 		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
 		    pool != 0ULL && l2arc_vdev_present(vd))
 			l2arc_remove_vdev(vd);
-		if (vd->vdev_isl2cache)
-			spa_l2cache_remove(vd);
-		vdev_clear_stats(vd);
-		(void) vdev_close(vd);
 	}
 }
 
@@ -2675,7 +3676,7 @@ spa_l2cache_drop(spa_t *spa)
  */
 int
 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
-    const char *history_str, nvlist_t *zplprops)
+    nvlist_t *zplprops)
 {
 	spa_t *spa;
 	char *altroot = NULL;
@@ -2686,7 +3687,8 @@ spa_create(const char *pool, nvlist_t *n
 	uint64_t txg = TXG_INITIAL;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
-	uint64_t version;
+	uint64_t version, obj;
+	boolean_t has_features;
 
 	/*
 	 * If this pool already exists, return failure.
@@ -2694,7 +3696,7 @@ spa_create(const char *pool, nvlist_t *n
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(pool) != NULL) {
 		mutex_exit(&spa_namespace_lock);
-		return (EEXIST);
+		return (SET_ERROR(EEXIST));
 	}
 
 	/*
@@ -2712,21 +3714,35 @@ spa_create(const char *pool, nvlist_t *n
 		return (error);
 	}
 
-	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
-	    &version) != 0)
+	has_features = B_FALSE;
+	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
+	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
+		if (zpool_prop_feature(nvpair_name(elem)))
+			has_features = B_TRUE;
+	}
+
+	if (has_features || nvlist_lookup_uint64(props,
+	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
 		version = SPA_VERSION;
-	ASSERT(version <= SPA_VERSION);
+	}
+	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 
 	spa->spa_first_txg = txg;
 	spa->spa_uberblock.ub_txg = txg - 1;
 	spa->spa_uberblock.ub_version = version;
 	spa->spa_ubsync = spa->spa_uberblock;
+	spa->spa_load_state = SPA_LOAD_CREATE;
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
-	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
-	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
+	    KM_SLEEP);
+	for (int i = 0; i < max_ncpus; i++) {
+		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+		    ZIO_FLAG_GODFATHER);
+	}
 
 	/*
 	 * Create the root vdev.
@@ -2739,13 +3755,14 @@ spa_create(const char *pool, nvlist_t *n
 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
 
 	if (error == 0 && !zfs_allocatable_devs(nvroot))
-		error = EINVAL;
+		error = SET_ERROR(EINVAL);
 
 	if (error == 0 &&
 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
 	    (error = spa_validate_aux(spa, nvroot, txg,
 	    VDEV_ALLOC_ADD)) == 0) {
 		for (int c = 0; c < rvd->vdev_children; c++) {
+			vdev_ashift_optimize(rvd->vdev_child[c]);
 			vdev_metaslab_set_size(rvd->vdev_child[c]);
 			vdev_expand(rvd->vdev_child[c], txg);
 		}
@@ -2791,8 +3808,10 @@ spa_create(const char *pool, nvlist_t *n
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
+	spa->spa_is_initializing = B_TRUE;
 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
 	spa->spa_meta_objset = dp->dp_meta_objset;
+	spa->spa_is_initializing = B_FALSE;
 
 	/*
 	 * Create DDTs (dedup tables).
@@ -2816,6 +3835,15 @@ spa_create(const char *pool, nvlist_t *n
 		cmn_err(CE_PANIC, "failed to add pool config");
 	}
 
+	if (spa_version(spa) >= SPA_VERSION_FEATURES)
+		spa_feature_create_zap_objects(spa, tx);
+
+	if (zap_add(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
+	    sizeof (uint64_t), 1, &version, tx) != 0) {
+		cmn_err(CE_PANIC, "failed to add pool version");
+	}
+
 	/* Newly created pools with the right version are always deflated. */
 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		spa->spa_deflate = TRUE;
@@ -2827,20 +3855,20 @@ spa_create(const char *pool, nvlist_t *n
 	}
 
 	/*
-	 * Create the deferred-free bplist object.  Turn off compression
+	 * Create the deferred-free bpobj.  Turn off compression
 	 * because sync-to-convergence takes longer if the blocksize
 	 * keeps changing.
 	 */
-	spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset,
-	    1 << 14, tx);
-	dmu_object_set_compress(spa->spa_meta_objset,
-	    spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx);
-
+	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
+	dmu_object_set_compress(spa->spa_meta_objset, obj,
+	    ZIO_COMPRESS_OFF, tx);
 	if (zap_add(spa->spa_meta_objset,
-	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
-	    sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) {
-		cmn_err(CE_PANIC, "failed to add bplist");
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
+	    sizeof (uint64_t), 1, &obj, tx) != 0) {
+		cmn_err(CE_PANIC, "failed to add bpobj");
 	}
+	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
+	    spa->spa_meta_objset, obj));
 
 	/*
 	 * Create the pool's history object.
@@ -2849,6 +3877,12 @@ spa_create(const char *pool, nvlist_t *n
 		spa_history_create_obj(spa, tx);
 
 	/*
+	 * Generate some random noise for salted checksums to operate on.
+	 */
+	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+	    sizeof (spa->spa_cksum_salt.zcs_bytes));
+
+	/*
 	 * Set pool properties.
 	 */
 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
@@ -2858,7 +3892,7 @@ spa_create(const char *pool, nvlist_t *n
 
 	if (props != NULL) {
 		spa_configfile_set(spa, props, B_FALSE);
-		spa_sync_props(spa, props, CRED(), tx);
+		spa_sync_props(props, tx);
 	}
 
 	dmu_tx_commit(tx);
@@ -2873,19 +3907,26 @@ spa_create(const char *pool, nvlist_t *n
 	txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	spa_config_sync(spa, B_FALSE, B_TRUE);
+	spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE);
 
-	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
-		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
-	spa_history_log_version(spa, LOG_POOL_CREATE);
+	spa_history_log_version(spa, "create");
 
+	/*
+	 * Don't count references from objsets that are already closed
+	 * and are making their way through the eviction process.
+	 */
+	spa_evicting_os_wait(spa);
 	spa->spa_minref = refcount_count(&spa->spa_refcount);
+	spa->spa_load_state = SPA_LOAD_NONE;
 
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
+#ifndef __NetBSD__
 #ifdef _KERNEL
+#ifdef illumos
 /*
  * Get the root pool information from the root disk, then import the root pool
  * during the system boot up time.
@@ -3000,9 +4041,9 @@ spa_import_rootpool(char *devpath, char 
 	}
 #endif
 	if (config == NULL) {
-		cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
+		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
 		    devpath);
-		return (EIO);
+		return (SET_ERROR(EIO));
 	}
 
 	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
@@ -3020,7 +4061,7 @@ spa_import_rootpool(char *devpath, char 
 
 	spa = spa_add(pname, config, NULL);
 	spa->spa_is_root = B_TRUE;
-	spa->spa_load_verbatim = B_TRUE;
+	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
 
 	/*
 	 * Build up a vdev tree based on the boot device's label config.
@@ -3045,7 +4086,7 @@ spa_import_rootpool(char *devpath, char 
 	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
 		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
 		    (u_longlong_t)guid);
-		error = ENOENT;
+		error = SET_ERROR(ENOENT);
 		goto out;
 	}
 
@@ -3057,7 +4098,7 @@ spa_import_rootpool(char *devpath, char 
 	if (avd != bvd) {
 		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
 		    "try booting from '%s'", avd->vdev_path);
-		error = EINVAL;
+		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
@@ -3069,13 +4110,13 @@ spa_import_rootpool(char *devpath, char 
 	    !bvd->vdev_isspare) {
 		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
 		    "try booting from '%s'",
-		    bvd->vdev_parent->vdev_child[1]->vdev_path);
-		error = EINVAL;
+		    bvd->vdev_parent->
+		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
+		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	error = 0;
-	spa_history_log_version(spa, LOG_POOL_IMPORT);
 out:
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_free(rvd);
@@ -3086,51 +4127,215 @@ out:
 	return (error);
 }
 
-#endif
+#else	/* !illumos */
+
+extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
+    uint64_t *count);
+
+static nvlist_t *
+spa_generate_rootconf(const char *name)
+{
+	nvlist_t **configs, **tops;
+	nvlist_t *config;
+	nvlist_t *best_cfg, *nvtop, *nvroot;
+	uint64_t *holes;
+	uint64_t best_txg;
+	uint64_t nchildren;
+	uint64_t pgid;
+	uint64_t count;
+	uint64_t i;
+	uint_t   nholes;
+
+	if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
+		return (NULL);
+
+	ASSERT3U(count, !=, 0);
+	best_txg = 0;
+	for (i = 0; i < count; i++) {
+		uint64_t txg;
+
+		VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
+		    &txg) == 0);
+		if (txg > best_txg) {
+			best_txg = txg;
+			best_cfg = configs[i];
+		}
+	}
+
+	nchildren = 1;
+	nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
+	holes = NULL;
+	nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
+	    &holes, &nholes);
+
+	tops = kmem_zalloc(nchildren * sizeof (void *), KM_SLEEP);
+	for (i = 0; i < nchildren; i++) {
+		if (i >= count)
+			break;
+		if (configs[i] == NULL)
+			continue;
+		VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
+		    &nvtop) == 0);
+		nvlist_dup(nvtop, &tops[i], KM_SLEEP);
+	}
+	for (i = 0; holes != NULL && i < nholes; i++) {
+		if (i >= nchildren)
+			continue;
+		if (tops[holes[i]] != NULL)
+			continue;
+		nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
+		VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
+		    VDEV_TYPE_HOLE) == 0);
+		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
+		    holes[i]) == 0);
+		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
+		    0) == 0);
+	}
+	for (i = 0; i < nchildren; i++) {
+		if (tops[i] != NULL)
+			continue;
+		nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
+		VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
+		    VDEV_TYPE_MISSING) == 0);
+		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
+		    i) == 0);
+		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
+		    0) == 0);
+	}
+
+	/*
+	 * Create pool config based on the best vdev config.
+	 */
+	nvlist_dup(best_cfg, &config, KM_SLEEP);
+
+	/*
+	 * Put this pool's top-level vdevs into a root vdev.
+	 */
+	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &pgid) == 0);
+	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_ROOT) == 0);
+	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
+	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
+	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    tops, nchildren) == 0);
+
+	/*
+	 * Replace the existing vdev_tree with the new root vdev in
+	 * this pool's configuration (remove the old, add the new).
+	 */
+	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+
+	/*
+	 * Drop vdev config elements that should not be present at pool level.
+	 */
+	nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
+	nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
+
+	for (i = 0; i < count; i++)
+		nvlist_free(configs[i]);
+	kmem_free(configs, count * sizeof (void *));
+	for (i = 0; i < nchildren; i++)
+		nvlist_free(tops[i]);
+	kmem_free(tops, nchildren * sizeof (void *));
+	nvlist_free(nvroot);
+	return (config);
+}
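
Note that spa_generate_rootconf() assembles the child array in three passes so that the array index always matches each vdev's ZPOOL_CONFIG_ID: it first copies the vdev tree from every label actually read, then synthesizes VDEV_TYPE_HOLE entries for the slots recorded in ZPOOL_CONFIG_HOLE_ARRAY, and finally fills any slot still empty with a VDEV_TYPE_MISSING placeholder, so the import can report exactly which top-level vdevs were not found.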
 
-/*
- * Take a pool and insert it into the namespace as if it had been loaded at
- * boot.
- */
 int
-spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
+spa_import_rootpool(const char *name)
 {
 	spa_t *spa;
-	char *altroot = NULL;
+	vdev_t *rvd, *bvd, *avd = NULL;
+	nvlist_t *config, *nvtop;
+	uint64_t txg;
+	char *pname;
+	int error;
 
-	mutex_enter(&spa_namespace_lock);
-	if (spa_lookup(pool) != NULL) {
-		mutex_exit(&spa_namespace_lock);
-		return (EEXIST);
-	}
+	/*
+	 * Read the label from the boot device and generate a configuration.
+	 */
+	config = spa_generate_rootconf(name);
 
-	(void) nvlist_lookup_string(props,
-	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
-	spa = spa_add(pool, config, altroot);
+	mutex_enter(&spa_namespace_lock);
+	if (config != NULL) {
+		VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+		    &pname) == 0 && strcmp(name, pname) == 0);
+		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
+		    == 0);
 
-	spa->spa_load_verbatim = B_TRUE;
+		if ((spa = spa_lookup(pname)) != NULL) {
+			/*
+			 * Remove the existing root pool from the namespace so
+			 * that we can replace it with the correct config
+			 * we just read in.
+			 */
+			spa_remove(spa);
+		}
+		spa = spa_add(pname, config, NULL);
 
-	if (props != NULL)
-		spa_configfile_set(spa, props, B_FALSE);
+		/*
+		 * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
+		 * via spa_version().
+		 */
+		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+		    &spa->spa_ubsync.ub_version) != 0)
+			spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+	} else if ((spa = spa_lookup(name)) == NULL) {
+		mutex_exit(&spa_namespace_lock);
+		nvlist_free(config);
+		cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
+		    name);
+		return (SET_ERROR(EIO));
+	} else {
+		VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
+	}
+	spa->spa_is_root = B_TRUE;
+	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
 
-	spa_config_sync(spa, B_FALSE, B_TRUE);
+	/*
+	 * Build up a vdev tree based on the boot device's label config.
+	 */
+	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvtop) == 0);
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
+	    VDEV_ALLOC_ROOTPOOL);
+	spa_config_exit(spa, SCL_ALL, FTAG);
+	if (error) {
+		mutex_exit(&spa_namespace_lock);
+		nvlist_free(config);
+		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
+		    pname);
+		return (error);
+	}
 
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+	vdev_free(rvd);
+	spa_config_exit(spa, SCL_ALL, FTAG);
 	mutex_exit(&spa_namespace_lock);
-	spa_history_log_version(spa, LOG_POOL_IMPORT);
 
+	nvlist_free(config);
 	return (0);
 }
 
+#endif	/* illumos */
+#endif	/* _KERNEL */
+#endif	/* !__NetBSD__ */
+
 /*
  * Import a non-root pool into the system.
  */
 int
-spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
+spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 {
 	spa_t *spa;
 	char *altroot = NULL;
 	spa_load_state_t state = SPA_LOAD_IMPORT;
 	zpool_rewind_policy_t policy;
+	uint64_t mode = spa_mode_global;
+	uint64_t readonly = B_FALSE;
 	int error;
 	nvlist_t *nvroot;
 	nvlist_t **spares, **l2cache;
@@ -3142,26 +4347,47 @@ spa_import(const char *pool, nvlist_t *c
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(pool) != NULL) {
 		mutex_exit(&spa_namespace_lock);
-		return (EEXIST);
+		return (SET_ERROR(EEXIST));
 	}
 
-	zpool_get_rewind_policy(config, &policy);
-	if (policy.zrp_request & ZPOOL_DO_REWIND)
-		state = SPA_LOAD_RECOVER;
-
 	/*
 	 * Create and initialize the spa structure.
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+	(void) nvlist_lookup_uint64(props,
+	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
+	if (readonly)
+		mode = FREAD;
 	spa = spa_add(pool, config, altroot);
-	spa_activate(spa, spa_mode_global);
+	spa->spa_import_flags = flags;
+
+	/*
+	 * Verbatim import - Take a pool and insert it into the namespace
+	 * as if it had been loaded at boot.
+	 */
+	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
+		if (props != NULL)
+			spa_configfile_set(spa, props, B_FALSE);
+
+		spa_config_sync(spa, B_FALSE, B_TRUE);
+		spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
+
+		mutex_exit(&spa_namespace_lock);
+		return (0);
+	}
+
+	spa_activate(spa, mode);
 
 	/*
 	 * Don't start async tasks until we know everything is healthy.
 	 */
 	spa_async_suspend(spa);
 
+	zpool_get_rewind_policy(config, &policy);
+	if (policy.zrp_request & ZPOOL_DO_REWIND)
+		state = SPA_LOAD_RECOVER;
+
 	/*
 	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
 	 * because the user-supplied config is actually the one to trust when
@@ -3169,14 +4395,16 @@ spa_import(const char *pool, nvlist_t *c
 	 */
 	if (state != SPA_LOAD_RECOVER)
 		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+
 	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
 	    policy.zrp_request);
 
 	/*
-	 * Propagate anything learned about failing or best txgs
-	 * back to caller
+	 * Propagate anything learned while loading the pool and pass it
+	 * back to the caller (i.e. rewind info, missing devices, etc.).
 	 */
-	spa_rewind_data_to_nvlist(spa, config);
+	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+	    spa->spa_load_info) == 0);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	/*
@@ -3274,9 +4502,17 @@ spa_import(const char *pool, nvlist_t *c
 	 */
 	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 
+	spa_history_log_version(spa, "import");
+
+	spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
+
 	mutex_exit(&spa_namespace_lock);
-	spa_history_log_version(spa, LOG_POOL_IMPORT);
 
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	zvol_create_minors(pool);
+#endif
+#endif
 	return (0);
 }
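
With the new flags argument, a verbatim import needs only the cached config: no rewind policy is consulted and no spa_load() pass runs. A caller-side sketch (the wrapper is hypothetical; ZFS_IMPORT_VERBATIM is the flag exercised above):

	/*
	 * Re-insert a pool into the namespace from its cached config,
	 * as if it had been loaded at boot.
	 */
	int
	import_verbatim(const char *pool, nvlist_t *cached_config)
	{
		return (spa_import(pool, cached_config, NULL,
		    ZFS_IMPORT_VERBATIM));
	}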
 
@@ -3320,6 +4556,8 @@ spa_tryimport(nvlist_t *tryconfig)
 		    state) == 0);
 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
 		    spa->spa_uberblock.ub_timestamp) == 0);
+		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+		    spa->spa_load_info) == 0);
 
 		/*
 		 * If the bootfs property exists on this pool then we
@@ -3389,12 +4627,12 @@ spa_export_common(char *pool, int new_st
 		*oldconfig = NULL;
 
 	if (!(spa_mode_global & FWRITE))
-		return (EROFS);
+		return (SET_ERROR(EROFS));
 
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(pool)) == NULL) {
 		mutex_exit(&spa_namespace_lock);
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
 	}
 
 	/*
@@ -3417,6 +4655,7 @@ spa_export_common(char *pool, int new_st
 		 * have to force it to sync before checking spa_refcnt.
 		 */
 		txg_wait_synced(spa->spa_dsl_pool, 0);
+		spa_evicting_os_wait(spa);
 
 		/*
 		 * A pool cannot be exported or destroyed if there are active
@@ -3428,7 +4667,7 @@ spa_export_common(char *pool, int new_st
 		    new_state != POOL_STATE_UNINITIALIZED)) {
 			spa_async_resume(spa);
 			mutex_exit(&spa_namespace_lock);
-			return (EBUSY);
+			return (SET_ERROR(EBUSY));
 		}
 
 		/*
@@ -3441,7 +4680,7 @@ spa_export_common(char *pool, int new_st
 		    spa_has_active_shared_spare(spa)) {
 			spa_async_resume(spa);
 			mutex_exit(&spa_namespace_lock);
-			return (EXDEV);
+			return (SET_ERROR(EXDEV));
 		}
 
 		/*
@@ -3452,7 +4691,8 @@ spa_export_common(char *pool, int new_st
 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa->spa_state = new_state;
-			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
+			spa->spa_final_txg = spa_last_synced_txg(spa) +
+			    TXG_DEFER_SIZE + 1;
 			vdev_config_dirty(spa->spa_root_vdev);
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
@@ -3529,6 +4769,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroo
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
+	ASSERT(spa_writeable(spa));
+
 	txg = spa_vdev_enter(spa);
 
 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
@@ -3611,6 +4853,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroo
 
 	mutex_enter(&spa_namespace_lock);
 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+	spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD);
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
@@ -3632,7 +4875,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroo
 int
 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 {
-	uint64_t txg, open_txg;
+	uint64_t txg, dtl_max_txg;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
 	vdev_ops_t *pvops;
@@ -3640,6 +4883,8 @@ spa_vdev_attach(spa_t *spa, uint64_t gui
 	int newvd_isspare;
 	int error;
 
+	ASSERT(spa_writeable(spa));
+
 	txg = spa_vdev_enter(spa);
 
 	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
@@ -3653,7 +4898,7 @@ spa_vdev_attach(spa_t *spa, uint64_t gui
 	pvd = oldvd->vdev_parent;
 
 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
-	    VDEV_ALLOC_ADD)) != 0)
+	    VDEV_ALLOC_ATTACH)) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	if (newrootvd->vdev_children != 1)
@@ -3689,7 +4934,7 @@ spa_vdev_attach(spa_t *spa, uint64_t gui
 		 * spares.
 		 */
 		if (pvd->vdev_ops == &vdev_spare_ops &&
-		    pvd->vdev_child[1] == oldvd &&
+		    oldvd->vdev_isspare &&
 		    !spa_has_spare(spa, newvd->vdev_guid))
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
@@ -3701,13 +4946,15 @@ spa_vdev_attach(spa_t *spa, uint64_t gui
 		 * the same (spare replaces spare, non-spare replaces
 		 * non-spare).
 		 */
-		if (pvd->vdev_ops == &vdev_replacing_ops)
+		if (pvd->vdev_ops == &vdev_replacing_ops &&
+		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-		else if (pvd->vdev_ops == &vdev_spare_ops &&
-		    newvd->vdev_isspare != oldvd->vdev_isspare)
+		} else if (pvd->vdev_ops == &vdev_spare_ops &&
+		    newvd->vdev_isspare != oldvd->vdev_isspare) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
-		else if (pvd->vdev_ops != &vdev_spare_ops &&
-		    newvd->vdev_isspare)
+		}
+
+		if (newvd->vdev_isspare)
 			pvops = &vdev_spare_ops;
 		else
 			pvops = &vdev_replacing_ops;
@@ -3731,10 +4978,10 @@ spa_vdev_attach(spa_t *spa, uint64_t gui
 	 * to make it distinguishable from newvd, and unopenable from now on.
 	 */
 	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
-		size_t plen = strlen(newvd->vdev_path) + 5;
 		spa_strfree(oldvd->vdev_path);
-		oldvd->vdev_path = kmem_alloc(plen, KM_SLEEP);
-		snprintf(oldvd->vdev_path, plen, "%s/%s",
+		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+		    KM_SLEEP);
+		(void) sprintf(oldvd->vdev_path, "%s/%s",
 		    newvd->vdev_path, "old");
 		if (oldvd->vdev_devid != NULL) {
 			spa_strfree(oldvd->vdev_devid);
@@ -3742,6 +4989,9 @@ spa_vdev_attach(spa_t *spa, uint64_t gui
 		}
 	}
 
+	/* mark the device being resilvered */
+	newvd->vdev_resilver_txg = txg;
+
 	/*
 	 * If the parent is not a mirror, or if we're replacing, insert the new
 	 * mirror/replacing/spare vdev above oldvd.
@@ -3768,13 +5018,14 @@ spa_vdev_attach(spa_t *spa, uint64_t gui
 	vdev_config_dirty(tvd);
 
 	/*
-	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
-	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
+	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
+	 * for any dmu_sync-ed blocks.  It will propagate upward when
+	 * spa_vdev_exit() calls vdev_dtl_reassess().
 	 */
-	open_txg = txg + TXG_CONCURRENT_STATES - 1;
+	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
-	vdev_dtl_dirty(newvd, DTL_MISSING,
-	    TXG_INITIAL, open_txg - TXG_INITIAL + 1);
+	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
+	    dtl_max_txg - TXG_INITIAL);
 
 	if (newvd->vdev_isspare) {
 		spa_spare_activate(newvd);
@@ -3790,10 +5041,25 @@ spa_vdev_attach(spa_t *spa, uint64_t gui
 	 */
 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
-	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
+	/*
+	 * Schedule the resilver to restart in the future. We do this to
+	 * ensure that dmu_sync-ed blocks have been stitched into the
+	 * respective datasets.
+	 */
+	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+
+	if (spa->spa_bootfs)
+		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
+
+	spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH);
+
+	/*
+	 * Commit the config
+	 */
+	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
 
-	spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL,
-	    CRED(),  "%s vdev=%s %s vdev=%s",
+	spa_history_log_internal(spa, "vdev attach", NULL,
+	    "%s vdev=%s %s vdev=%s",
 	    replacing && newvd_isspare ? "spare in" :
 	    replacing ? "replace" : "attach", newvdpath,
 	    replacing ? "for" : "to", oldvdpath);
@@ -3801,16 +5067,12 @@ spa_vdev_attach(spa_t *spa, uint64_t gui
 	spa_strfree(oldvdpath);
 	spa_strfree(newvdpath);
 
-	/*
-	 * Kick off a resilver to update newvd.
-	 */
-	VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
-
 	return (0);
 }
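
A worked example of the DTL bound change above, assuming the stock values TXG_CONCURRENT_STATES == 3 and TXG_INITIAL == 4: for an attach entering at txg 100, the old code computed open_txg = 100 + 3 - 1 = 102 and dirtied the inclusive range [4, 102] (length 102 - 4 + 1 = 99); the new code computes dtl_max_txg = 100 + 3 = 103 and dirties the same txgs as the half-open range [4, 103) (length 103 - 4 = 99). The coverage is identical; what changes is that dtl_max_txg now also bounds the config commit and schedules dsl_resilver_restart(), replacing the immediate resilver kick-off so that dmu_sync-ed blocks are stitched into their datasets first.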
 
 /*
  * Detach a device from a mirror or replacing vdev.
+ *
  * If 'replace_done' is specified, only detach if the parent
  * is a replacing vdev.
  */
@@ -3822,10 +5084,11 @@ spa_vdev_detach(spa_t *spa, uint64_t gui
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *pvd, *cvd, *tvd;
 	boolean_t unspare = B_FALSE;
-	uint64_t unspare_guid;
-	size_t len;
+	uint64_t unspare_guid = 0;
 	char *vdpath;
 
+	ASSERT(spa_writeable(spa));
+
 	txg = spa_vdev_enter(spa);
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
@@ -3855,18 +5118,11 @@ spa_vdev_detach(spa_t *spa, uint64_t gui
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	/*
-	 * If replace_done is specified, only remove this device if it's
-	 * the first child of a replacing vdev.  For the 'spare' vdev, either
-	 * disk can be removed.
-	 */
-	if (replace_done) {
-		if (pvd->vdev_ops == &vdev_replacing_ops) {
-			if (vd->vdev_id != 0)
-				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-		} else if (pvd->vdev_ops != &vdev_spare_ops) {
-			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
-		}
-	}
+	 * Only 'replacing' or 'spare' vdevs can be replaced.
+	 */
+	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
+	    pvd->vdev_ops != &vdev_spare_ops)
+		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
 	    spa_version(spa) >= SPA_VERSION_SPARES);
@@ -3893,16 +5149,22 @@ spa_vdev_detach(spa_t *spa, uint64_t gui
 	 * check to see if we changed the original vdev's path to have "/old"
 	 * at the end in spa_vdev_attach().  If so, undo that change now.
 	 */
-	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 &&
-	    pvd->vdev_child[0]->vdev_path != NULL &&
-	    pvd->vdev_child[1]->vdev_path != NULL) {
-		ASSERT(pvd->vdev_child[1] == vd);
-		cvd = pvd->vdev_child[0];
-		len = strlen(vd->vdev_path);
-		if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
-		    strcmp(cvd->vdev_path + len, "/old") == 0) {
-			spa_strfree(cvd->vdev_path);
-			cvd->vdev_path = spa_strdup(vd->vdev_path);
+	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
+	    vd->vdev_path != NULL) {
+		size_t len = strlen(vd->vdev_path);
+
+		for (int c = 0; c < pvd->vdev_children; c++) {
+			cvd = pvd->vdev_child[c];
+
+			if (cvd == vd || cvd->vdev_path == NULL)
+				continue;
+
+			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
+			    strcmp(cvd->vdev_path + len, "/old") == 0) {
+				spa_strfree(cvd->vdev_path);
+				cvd->vdev_path = spa_strdup(vd->vdev_path);
+				break;
+			}
 		}
 	}
 
@@ -3912,7 +5174,8 @@ spa_vdev_detach(spa_t *spa, uint64_t gui
 	 * active spare list for the pool.
 	 */
 	if (pvd->vdev_ops == &vdev_spare_ops &&
-	    vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
+	    vd->vdev_id == 0 &&
+	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
 		unspare = B_TRUE;
 
 	/*
@@ -3934,7 +5197,7 @@ spa_vdev_detach(spa_t *spa, uint64_t gui
 	/*
 	 * Remember one of the remaining children so we can get tvd below.
 	 */
-	cvd = pvd->vdev_child[0];
+	cvd = pvd->vdev_child[pvd->vdev_children - 1];
 
 	/*
 	 * If we need to remove the remaining child from the list of hot spares,
@@ -3950,14 +5213,19 @@ spa_vdev_detach(spa_t *spa, uint64_t gui
 		spa_spare_remove(cvd);
 		unspare_guid = cvd->vdev_guid;
 		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+		cvd->vdev_unspare = B_TRUE;
 	}
 
 	/*
 	 * If the parent mirror/replacing vdev only has one child,
 	 * the parent is no longer needed.  Remove it from the tree.
 	 */
-	if (pvd->vdev_children == 1)
+	if (pvd->vdev_children == 1) {
+		if (pvd->vdev_ops == &vdev_spare_ops)
+			cvd->vdev_unspare = B_FALSE;
 		vdev_remove_parent(cvd);
+	}
+
 
 	/*
 	 * We don't set tvd until now because the parent we just removed
@@ -3999,9 +5267,12 @@ spa_vdev_detach(spa_t *spa, uint64_t gui
 
 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 
+	/* hang on to the spa before we release the lock */
+	spa_open_ref(spa, FTAG);
+
 	error = spa_vdev_exit(spa, vd, txg, 0);
 
-	spa_history_internal_log(LOG_POOL_VDEV_DETACH, spa, NULL, CRED(),
+	spa_history_log_internal(spa, "detach", NULL,
 	    "vdev=%s", vdpath);
 	spa_strfree(vdpath);
 
@@ -4011,23 +5282,31 @@ spa_vdev_detach(spa_t *spa, uint64_t gui
 	 * list of every other pool.
 	 */
 	if (unspare) {
-		spa_t *myspa = spa;
-		spa = NULL;
+		spa_t *altspa = NULL;
+
 		mutex_enter(&spa_namespace_lock);
-		while ((spa = spa_next(spa)) != NULL) {
-			if (spa->spa_state != POOL_STATE_ACTIVE)
-				continue;
-			if (spa == myspa)
+		while ((altspa = spa_next(altspa)) != NULL) {
+			if (altspa->spa_state != POOL_STATE_ACTIVE ||
+			    altspa == spa)
 				continue;
-			spa_open_ref(spa, FTAG);
+
+			spa_open_ref(altspa, FTAG);
 			mutex_exit(&spa_namespace_lock);
-			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
 			mutex_enter(&spa_namespace_lock);
-			spa_close(spa, FTAG);
+			spa_close(altspa, FTAG);
 		}
 		mutex_exit(&spa_namespace_lock);
+
+		/* search the rest of the vdevs for spares to remove */
+		spa_vdev_resilver_done(spa);
 	}
 
+	/* all done with the spa; OK to release */
+	mutex_enter(&spa_namespace_lock);
+	spa_close(spa, FTAG);
+	mutex_exit(&spa_namespace_lock);
+
 	return (error);
 }
 
@@ -4048,8 +5327,7 @@ spa_vdev_split_mirror(spa_t *spa, char *
 	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
 	boolean_t activate_slog;
 
-	if (!spa_writeable(spa))
-		return (EROFS);
+	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
@@ -4115,7 +5393,7 @@ spa_vdev_split_mirror(spa_t *spa, char *
 			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
 				continue;
 			} else {
-				error = EINVAL;
+				error = SET_ERROR(EINVAL);
 				break;
 			}
 		}
@@ -4123,14 +5401,14 @@ spa_vdev_split_mirror(spa_t *spa, char *
 		/* which disk is going to be split? */
 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
 		    &glist[c]) != 0) {
-			error = EINVAL;
+			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		/* look it up in the spa */
 		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
 		if (vml[c] == NULL) {
-			error = ENODEV;
+			error = SET_ERROR(ENODEV);
 			break;
 		}
 
@@ -4144,12 +5422,12 @@ spa_vdev_split_mirror(spa_t *spa, char *
 		    vml[c]->vdev_children != 0 ||
 		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
 		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
-			error = EINVAL;
+			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		if (vdev_dtl_required(vml[c])) {
-			error = EBUSY;
+			error = SET_ERROR(EBUSY);
 			break;
 		}
 
@@ -4162,6 +5440,16 @@ spa_vdev_split_mirror(spa_t *spa, char *
 		    vml[c]->vdev_top->vdev_asize) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
 		    vml[c]->vdev_top->vdev_ashift) == 0);
+
+		/* transfer per-vdev ZAPs */
+		ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
+		VERIFY0(nvlist_add_uint64(child[c],
+		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
+
+		ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
+		VERIFY0(nvlist_add_uint64(child[c],
+		    ZPOOL_CONFIG_VDEV_TOP_ZAP,
+		    vml[c]->vdev_parent->vdev_top_zap));
 	}
 
 	if (error != 0) {
@@ -4186,8 +5474,10 @@ spa_vdev_split_mirror(spa_t *spa, char *
 	    glist, children) == 0);
 	kmem_free(glist, children * sizeof (uint64_t));
 
+	mutex_enter(&spa->spa_props_lock);
 	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
 	    nvl) == 0);
+	mutex_exit(&spa->spa_props_lock);
 	spa->spa_config_splitting = nvl;
 	vdev_config_dirty(spa->spa_root_vdev);
 
@@ -4201,11 +5491,13 @@ spa_vdev_split_mirror(spa_t *spa, char *
 	    spa->spa_config_txg) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    spa_generate_guid(NULL)) == 0);
+	VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 
 	/* add the new pool to the namespace */
 	newspa = spa_add(newname, config, altroot);
+	newspa->spa_avz_action = AVZ_ACTION_REBUILD;
 	newspa->spa_config_txg = spa->spa_config_txg;
 	spa_set_log_state(newspa, SPA_LOG_CLEAR);
 
@@ -4218,8 +5510,15 @@ spa_vdev_split_mirror(spa_t *spa, char *
 	spa_activate(newspa, spa_mode_global);
 	spa_async_suspend(newspa);
 
+#ifndef illumos
+	/* mark that we are creating new spa by splitting */
+	newspa->spa_splitting_newspa = B_TRUE;
+#endif
 	/* create the new pool from the disks of the original pool */
 	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
+#ifndef illumos
+	newspa->spa_splitting_newspa = B_FALSE;
+#endif
 	if (error)
 		goto out;
 
@@ -4261,12 +5560,13 @@ spa_vdev_split_mirror(spa_t *spa, char *
 		if (vml[c] != NULL) {
 			vdev_split(vml[c]);
 			if (error == 0)
-				spa_history_internal_log(LOG_POOL_VDEV_DETACH,
-				    spa, tx, CRED(), "vdev=%s",
-				    vml[c]->vdev_path);
+				spa_history_log_internal(spa, "detach", tx,
+				    "vdev=%s", vml[c]->vdev_path);
+
 			vdev_free(vml[c]);
 		}
 	}
+	spa->spa_avz_action = AVZ_ACTION_REBUILD;
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa->spa_config_splitting = NULL;
 	nvlist_free(nvl);
@@ -4278,8 +5578,8 @@ spa_vdev_split_mirror(spa_t *spa, char *
 		zio_handle_panic_injection(spa, FTAG, 3);
 
 	/* split is complete; log a history record */
-	spa_history_internal_log(LOG_POOL_SPLIT, newspa, NULL, CRED(),
-	    "split new pool %s from pool %s", newname, spa_name(spa));
+	spa_history_log_internal(newspa, "split", NULL,
+	    "from pool %s", spa_name(spa));
 
 	kmem_free(vml, children * sizeof (vdev_t *));
 
@@ -4296,6 +5596,14 @@ out:
 	spa_remove(newspa);
 
 	txg = spa_vdev_config_enter(spa);
+
+	/* re-online all offlined disks */
+	for (c = 0; c < children; c++) {
+		if (vml[c] != NULL)
+			vml[c]->vdev_offline = B_FALSE;
+	}
+	vdev_reopen(spa->spa_root_vdev);
+
 	nvlist_free(spa->spa_config_splitting);
 	spa->spa_config_splitting = NULL;
 	(void) spa_vdev_exit(spa, NULL, txg, error);
@@ -4322,7 +5630,7 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvp
 
 static void
 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
-	nvlist_t *dev_to_remove)
+    nvlist_t *dev_to_remove)
 {
 	nvlist_t **newdev = NULL;
 
@@ -4346,21 +5654,13 @@ spa_vdev_remove_aux(nvlist_t *config, ch
 }
 
 /*
- * Removing a device from the vdev namespace requires several steps
- * and can take a significant amount of time.  As a result we use
- * the spa_vdev_config_[enter/exit] functions which allow us to
- * grab and release the spa_config_lock while still holding the namespace
- * lock.  During each step the configuration is synced out.
+ * Evacuate the device.
  */
-
-/*
- * Evacuate the device.
- */
-int
+static int
 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
 {
-	int error = 0;
 	uint64_t txg;
+	int error = 0;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
@@ -4373,14 +5673,12 @@ spa_vdev_remove_evacuate(spa_t *spa, vde
 	 * should no longer have any blocks allocated on it.
 	 */
 	if (vd->vdev_islog) {
-		error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
-		    NULL, DS_FIND_CHILDREN);
+		if (vd->vdev_stat.vs_alloc != 0)
+			error = spa_offline_log(spa);
 	} else {
-		error = ENOTSUP;	/* until we have bp rewrite */
+		error = SET_ERROR(ENOTSUP);
 	}
 
-	txg_wait_synced(spa_get_dsl(spa), 0);
-
 	if (error)
 		return (error);
 
@@ -4388,9 +5686,10 @@ spa_vdev_remove_evacuate(spa_t *spa, vde
 	 * The evacuation succeeded.  Remove any remaining MOS metadata
 	 * associated with this vdev, and wait for these changes to sync.
 	 */
+	ASSERT0(vd->vdev_stat.vs_alloc);
 	txg = spa_vdev_config_enter(spa);
 	vd->vdev_removing = B_TRUE;
-	vdev_dirty(vd, 0, NULL, txg);
+	vdev_dirty_leaves(vd, VDD_DTL, txg);
 	vdev_config_dirty(vd);
 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 
@@ -4400,7 +5699,7 @@ spa_vdev_remove_evacuate(spa_t *spa, vde
 /*
  * Complete the removal by cleaning up the namespace.
  */
-void
+static void
 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
@@ -4411,6 +5710,12 @@ spa_vdev_remove_from_namespace(spa_t *sp
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(vd == vd->vdev_top);
 
+	/*
+	 * Only remove devices that are empty.
+	 */
+	if (vd->vdev_stat.vs_alloc != 0)
+		return;
+
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	if (list_link_active(&vd->vdev_state_dirty_node))
@@ -4435,13 +5740,22 @@ spa_vdev_remove_from_namespace(spa_t *sp
 }
 
 /*
- * Remove a device from the pool.  Currently, this supports removing only hot
- * spares, slogs, and level 2 ARC devices.
+ * Remove a device from the pool -
+ *
+ * Removing a device from the vdev namespace requires several steps
+ * and can take a significant amount of time.  As a result we use
+ * the spa_vdev_config_[enter/exit] functions which allow us to
+ * grab and release the spa_config_lock while still holding the namespace
+ * lock.  During each step the configuration is synced out.
+ *
+ * Currently, this supports removing only hot spares, slogs, and level 2 ARC
+ * devices.
  */
 int
 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 {
 	vdev_t *vd;
+	sysevent_t *ev = NULL;
 	metaslab_group_t *mg;
 	nvlist_t **spares, **l2cache, *nv;
 	uint64_t txg = 0;
@@ -4449,6 +5763,8 @@ spa_vdev_remove(spa_t *spa, uint64_t gui
 	int error = 0;
 	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
 
+	ASSERT(spa_writeable(spa));
+
 	if (!locked)
 		txg = spa_vdev_enter(spa);
 
@@ -4463,12 +5779,15 @@ spa_vdev_remove(spa_t *spa, uint64_t gui
 		 * in this pool.
 		 */
 		if (vd == NULL || unspare) {
+			if (vd == NULL)
+				vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+			ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX);
 			spa_vdev_remove_aux(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
 			spa_load_spares(spa);
 			spa->spa_spares.sav_sync = B_TRUE;
 		} else {
-			error = EBUSY;
+			error = SET_ERROR(EBUSY);
 		}
 	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
 	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
@@ -4477,6 +5796,8 @@ spa_vdev_remove(spa_t *spa, uint64_t gui
 		/*
 		 * Cache devices can always be removed.
 		 */
+		vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+		ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX);
 		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
 		spa_load_l2cache(spa);
@@ -4485,11 +5806,6 @@ spa_vdev_remove(spa_t *spa, uint64_t gui
 		ASSERT(!locked);
 		ASSERT(vd == vd->vdev_top);
 
-		/*
-		 * XXX - Once we have bp-rewrite this should
-		 * become the common case.
-		 */
-
 		mg = vd->vdev_mg;
 
 		/*
@@ -4522,29 +5838,33 @@ spa_vdev_remove(spa_t *spa, uint64_t gui
 		/*
 		 * Clean up the vdev namespace.
 		 */
+		ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_DEV);
 		spa_vdev_remove_from_namespace(spa, vd);
 
 	} else if (vd != NULL) {
 		/*
 		 * Normal vdevs cannot be removed (yet).
 		 */
-		error = ENOTSUP;
+		error = SET_ERROR(ENOTSUP);
 	} else {
 		/*
 		 * There is no vdev of any kind with the specified guid.
 		 */
-		error = ENOENT;
+		error = SET_ERROR(ENOENT);
 	}
 
 	if (!locked)
-		return (spa_vdev_exit(spa, NULL, txg, error));
+		error = spa_vdev_exit(spa, NULL, txg, error);
+
+	if (ev)
+		spa_event_post(ev);
 
 	return (error);
 }
 
 /*
  * Find any device that's done replacing, or a vdev marked 'unspare' that's
- * current spared, so we can detach it.
+ * currently spared, so we can detach it.
  */
 static vdev_t *
 spa_vdev_resilver_done_hunt(vdev_t *vd)
@@ -4558,13 +5878,21 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
 	}
 
 	/*
-	 * Check for a completed replacement.
+	 * Check for a completed replacement.  We always consider the first
+	 * vdev in the list to be the oldest vdev, and the last one to be
+	 * the newest (see spa_vdev_attach() for how that works).  In
+	 * the case where the newest vdev is faulted, we will not automatically
+	 * remove it after a resilver completes.  This is OK as it will require
+	 * user intervention to determine which disk the admin wishes to keep.
 	 */
-	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
+	if (vd->vdev_ops == &vdev_replacing_ops) {
+		ASSERT(vd->vdev_children > 1);
+
+		newvd = vd->vdev_child[vd->vdev_children - 1];
 		oldvd = vd->vdev_child[0];
-		newvd = vd->vdev_child[1];
 
 		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
+		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
 		    !vdev_dtl_required(oldvd))
 			return (oldvd);
 	}
@@ -4572,15 +5900,41 @@ spa_vdev_resilver_done_hunt(vdev_t *vd)
 	/*
 	 * Check for a completed resilver with the 'unspare' flag set.
 	 */
-	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
-		newvd = vd->vdev_child[0];
-		oldvd = vd->vdev_child[1];
+	if (vd->vdev_ops == &vdev_spare_ops) {
+		vdev_t *first = vd->vdev_child[0];
+		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
+
+		if (last->vdev_unspare) {
+			oldvd = first;
+			newvd = last;
+		} else if (first->vdev_unspare) {
+			oldvd = last;
+			newvd = first;
+		} else {
+			oldvd = NULL;
+		}
 
-		if (newvd->vdev_unspare &&
+		if (oldvd != NULL &&
 		    vdev_dtl_empty(newvd, DTL_MISSING) &&
-		    !vdev_dtl_required(oldvd)) {
-			newvd->vdev_unspare = 0;
+		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
+		    !vdev_dtl_required(oldvd))
 			return (oldvd);
+
+		/*
+		 * If there are more than two spares attached to a disk,
+		 * and those spares are not required, then we want to
+		 * attempt to free them up now so that they can be used
+		 * by other pools.  Once we're back down to a single
+		 * disk+spare, we stop removing them.
+		 */
+		if (vd->vdev_children > 2) {
+			newvd = vd->vdev_child[1];
+
+			if (newvd->vdev_isspare && last->vdev_isspare &&
+			    vdev_dtl_empty(last, DTL_MISSING) &&
+			    vdev_dtl_empty(last, DTL_OUTAGE) &&
+			    !vdev_dtl_required(newvd))
+				return (newvd);
 		}
 	}
 
@@ -4607,11 +5961,13 @@ spa_vdev_resilver_done(spa_t *spa)
 		 * we need to detach the parent's first child (the original hot
 		 * spare) as well.
 		 */
-		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
+		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
+		    ppvd->vdev_children == 2) {
 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
-			ASSERT(ppvd->vdev_children == 2);
 			sguid = ppvd->vdev_child[1]->vdev_guid;
 		}
+		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
+
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
 			return;
@@ -4631,6 +5987,9 @@ spa_vdev_set_common(spa_t *spa, uint64_t
     boolean_t ispath)
 {
 	vdev_t *vd;
+	boolean_t sync = B_FALSE;
+
+	ASSERT(spa_writeable(spa));
 
 	spa_vdev_state_enter(spa, SCL_ALL);
 
@@ -4641,15 +6000,23 @@ spa_vdev_set_common(spa_t *spa, uint64_t
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	if (ispath) {
-		spa_strfree(vd->vdev_path);
-		vd->vdev_path = spa_strdup(value);
+		if (strcmp(value, vd->vdev_path) != 0) {
+			spa_strfree(vd->vdev_path);
+			vd->vdev_path = spa_strdup(value);
+			sync = B_TRUE;
+		}
 	} else {
-		if (vd->vdev_fru != NULL)
+		if (vd->vdev_fru == NULL) {
+			vd->vdev_fru = spa_strdup(value);
+			sync = B_TRUE;
+		} else if (strcmp(value, vd->vdev_fru) != 0) {
 			spa_strfree(vd->vdev_fru);
-		vd->vdev_fru = spa_strdup(value);
+			vd->vdev_fru = spa_strdup(value);
+			sync = B_TRUE;
+		}
 	}
 
-	return (spa_vdev_state_exit(spa, vd, 0));
+	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
 }
 
 int
@@ -4666,40 +6033,38 @@ spa_vdev_setfru(spa_t *spa, uint64_t gui
 
 /*
  * ==========================================================================
- * SPA Scrubbing
+ * SPA Scanning
  * ==========================================================================
  */
 
 int
-spa_scrub(spa_t *spa, pool_scrub_type_t type)
+spa_scan_stop(spa_t *spa)
 {
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+	if (dsl_scan_resilvering(spa->spa_dsl_pool))
+		return (SET_ERROR(EBUSY));
+	return (dsl_scan_cancel(spa->spa_dsl_pool));
+}
 
-	if ((uint_t)type >= POOL_SCRUB_TYPES)
-		return (ENOTSUP);
+int
+spa_scan(spa_t *spa, pool_scan_func_t func)
+{
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+
+	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
+		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * If a resilver was requested, but there is no DTL on a
 	 * writeable leaf device, we have nothing to do.
 	 */
-	if (type == POOL_SCRUB_RESILVER &&
+	if (func == POOL_SCAN_RESILVER &&
 	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 		return (0);
 	}
 
-	if (type == POOL_SCRUB_EVERYTHING &&
-	    spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE &&
-	    spa->spa_dsl_pool->dp_scrub_isresilver)
-		return (EBUSY);
-
-	if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) {
-		return (dsl_pool_scrub_clean(spa->spa_dsl_pool));
-	} else if (type == POOL_SCRUB_NONE) {
-		return (dsl_pool_scrub_cancel(spa->spa_dsl_pool));
-	} else {
-		return (EINVAL);
-	}
+	return (dsl_scan(spa->spa_dsl_pool, func));
 }
 
 /*
@@ -4712,7 +6077,8 @@ static void
 spa_async_remove(spa_t *spa, vdev_t *vd)
 {
 	if (vd->vdev_remove_wanted) {
-		vd->vdev_remove_wanted = 0;
+		vd->vdev_remove_wanted = B_FALSE;
+		vd->vdev_delayed_close = B_FALSE;
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
 
 		/*
@@ -4726,6 +6092,8 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
 		vd->vdev_stat.vs_checksum_errors = 0;
 
 		vdev_state_dirty(vd->vdev_top);
+		/* Tell userspace that the vdev is gone. */
+		zfs_post_remove(spa, vd);
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++)
@@ -4736,7 +6104,7 @@ static void
 spa_async_probe(spa_t *spa, vdev_t *vd)
 {
 	if (vd->vdev_probe_wanted) {
-		vd->vdev_probe_wanted = 0;
+		vd->vdev_probe_wanted = B_FALSE;
 		vdev_reopen(vd);	/* vdev_open() does the actual probe */
 	}
 
@@ -4769,7 +6137,7 @@ spa_async_autoexpand(spa_t *spa, vdev_t 
 	VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
 
 	(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
-	    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
+	    ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);
 
 	nvlist_free(attr);
 	kmem_free(physpath, MAXPATHLEN);
@@ -4778,14 +6146,14 @@ spa_async_autoexpand(spa_t *spa, vdev_t 
 static void
 spa_async_thread(void *arg)
 {
-	int tasks;
 	spa_t *spa = arg;
+	int tasks;
 
 	ASSERT(spa->spa_sync_on);
 
 	mutex_enter(&spa->spa_async_lock);
 	tasks = spa->spa_async_tasks;
-	spa->spa_async_tasks = 0;
+	spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
 	mutex_exit(&spa->spa_async_lock);
 
 	/*
@@ -4805,26 +6173,12 @@ spa_async_thread(void *arg)
 		 * then log an internal history event.
 		 */
 		if (new_space != old_space) {
-			spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
-			    spa, NULL, CRED(),
+			spa_history_log_internal(spa, "vdev online", NULL,
 			    "pool '%s' size: %llu(+%llu)",
 			    spa_name(spa), new_space, new_space - old_space);
 		}
 	}
 
-	/*
-	 * See if any devices need to be marked REMOVED.
-	 */
-	if (tasks & SPA_ASYNC_REMOVE) {
-		spa_vdev_state_enter(spa, SCL_NONE);
-		spa_async_remove(spa, spa->spa_root_vdev);
-		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
-			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
-		for (int i = 0; i < spa->spa_spares.sav_count; i++)
-			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
-		(void) spa_vdev_state_exit(spa, NULL, 0);
-	}
-
 	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_async_autoexpand(spa, spa->spa_root_vdev);
@@ -4850,7 +6204,7 @@ spa_async_thread(void *arg)
 	 * Kick off a resilver.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER)
-		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0);
+		dsl_resilver_restart(spa->spa_dsl_pool, 0);
 
 	/*
 	 * Let the world know that we're done.
@@ -4862,12 +6216,53 @@ spa_async_thread(void *arg)
 	thread_exit();
 }
 
+static void
+spa_async_thread_vd(void *arg)
+{
+	spa_t *spa = arg;
+	int tasks;
+
+	ASSERT(spa->spa_sync_on);
+
+	mutex_enter(&spa->spa_async_lock);
+	tasks = spa->spa_async_tasks;
+retry:
+	spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
+	mutex_exit(&spa->spa_async_lock);
+
+	/*
+	 * See if any devices need to be marked REMOVED.
+	 */
+	if (tasks & SPA_ASYNC_REMOVE) {
+		spa_vdev_state_enter(spa, SCL_NONE);
+		spa_async_remove(spa, spa->spa_root_vdev);
+		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
+			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
+		for (int i = 0; i < spa->spa_spares.sav_count; i++)
+			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
+		(void) spa_vdev_state_exit(spa, NULL, 0);
+	}
+
+	/*
+	 * Let the world know that we're done.
+	 */
+	mutex_enter(&spa->spa_async_lock);
+	tasks = spa->spa_async_tasks;
+	if ((tasks & SPA_ASYNC_REMOVE) != 0)
+		goto retry;
+	spa->spa_async_thread_vd = NULL;
+	cv_broadcast(&spa->spa_async_cv);
+	mutex_exit(&spa->spa_async_lock);
+	thread_exit();
+}
+
 void
 spa_async_suspend(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_suspended++;
-	while (spa->spa_async_thread != NULL)
+	while (spa->spa_async_thread != NULL ||
+	    spa->spa_async_thread_vd != NULL)
 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 	mutex_exit(&spa->spa_async_lock);
 }
@@ -4881,24 +6276,61 @@ spa_async_resume(spa_t *spa)
 	mutex_exit(&spa->spa_async_lock);
 }
 
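+/*
+ * Decide whether spa_async_thread() has work to do.  SPA_ASYNC_REMOVE is
+ * ignored because it is serviced by the dedicated vdev thread.  A pending
+ * SPA_ASYNC_CONFIG_UPDATE alone does not count while config cachefile
+ * writes are being rate limited, i.e. within zfs_ccw_retry_interval
+ * seconds of the last write failure (spa_ccw_fail_time).
+ */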
+static boolean_t
+spa_async_tasks_pending(spa_t *spa)
+{
+	uint_t non_config_tasks;
+	uint_t config_task;
+	boolean_t config_task_suspended;
+
+	non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
+	    SPA_ASYNC_REMOVE);
+	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
+	if (spa->spa_ccw_fail_time == 0) {
+		config_task_suspended = B_FALSE;
+	} else {
+		config_task_suspended =
+		    (gethrtime() - spa->spa_ccw_fail_time) <
+		    (zfs_ccw_retry_interval * NANOSEC);
+	}
+
+	return (non_config_tasks || (config_task && !config_task_suspended));
+}
+
 static void
 spa_async_dispatch(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
-	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
+	if (spa_async_tasks_pending(spa) &&
+	    !spa->spa_async_suspended &&
 	    spa->spa_async_thread == NULL &&
-	    rootdir != NULL && !vn_is_readonly(rootdir))
+	    rootdir != NULL)
 		spa->spa_async_thread = thread_create(NULL, 0,
 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
 	mutex_exit(&spa->spa_async_lock);
 }
 
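+/*
+ * Dispatch the thread that handles vdev removal events.  These are kept
+ * out of spa_async_thread() so that marking devices REMOVED is not
+ * delayed by slower tasks such as config cachefile rewrites.
+ */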
+static void
+spa_async_dispatch_vd(spa_t *spa)
+{
+	mutex_enter(&spa->spa_async_lock);
+	if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
+	    !spa->spa_async_suspended &&
+	    spa->spa_async_thread_vd == NULL &&
+	    rootdir != NULL)
+		spa->spa_async_thread_vd = thread_create(NULL, 0,
+		    spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
+	mutex_exit(&spa->spa_async_lock);
+}
+
 void
 spa_async_request(spa_t *spa, int task)
 {
+	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_tasks |= task;
 	mutex_exit(&spa->spa_async_lock);
+	spa_async_dispatch_vd(spa);
 }
 
 /*
@@ -4906,36 +6338,51 @@ spa_async_request(spa_t *spa, int task)
  * SPA syncing routines
  * ==========================================================================
  */
-static void
-spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg)
-{
-	blkptr_t blk;
-	uint64_t itor = 0;
-	uint8_t c = 1;
 
-	while (bplist_iterate(bpl, &itor, &blk) == 0) {
-		ASSERT(blk.blk_birth < txg);
-		zio_free(spa, txg, &blk);
-	}
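+/*
+ * Callback for bplist_iterate(): append each freed block pointer to the
+ * bpobj passed as 'arg', deferring the actual free to a later txg.
+ */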
+static int
+bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	bpobj_t *bpo = arg;
+	bpobj_enqueue(bpo, bp, tx);
+	return (0);
+}
 
-	bplist_vacate(bpl, tx);
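+/*
+ * Callback for bplist/bpobj iteration: issue an asynchronous free for
+ * each block pointer as a child of the root zio passed as 'arg'.
+ */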
+static int
+spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	zio_t *zio = arg;
 
-	/*
-	 * Pre-dirty the first block so we sync to convergence faster.
-	 * (Usually only the first block is needed.)
-	 */
-	dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx);
+	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
+	    BP_GET_PSIZE(bp), zio->io_flags));
+	return (0);
 }
 
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing frees.
+ */
 static void
-spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
 {
-	zio_t *zio = arg;
+	zio_t *zio = zio_root(spa, NULL, NULL, 0);
+	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
+	VERIFY(zio_wait(zio) == 0);
+}
 
-	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
-	    zio->io_flags));
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing deferred frees.
+ */
+static void
+spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
+{
+	zio_t *zio = zio_root(spa, NULL, NULL, 0);
+	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
+	    spa_free_sync_cb, zio, tx), ==, 0);
+	VERIFY0(zio_wait(zio));
 }
 
 static void
 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 {
@@ -4948,10 +6395,10 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj
 
 	/*
 	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
-	 * information.  This avoids the dbuf_will_dirty() path and
+	 * information.  This avoids the dmu_buf_will_dirty() path and
 	 * saves us a pre-read to get data we don't actually care about.
 	 */
-	bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
+	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
 	packed = kmem_alloc(bufsize, KM_SLEEP);
 
 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
@@ -5000,7 +6447,7 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vde
 		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
 		for (i = 0; i < sav->sav_count; i++)
 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
-			    B_FALSE, B_FALSE, B_TRUE);
+			    B_FALSE, VDEV_CONFIG_L2CACHE);
 		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
 		    sav->sav_count) == 0);
 		for (i = 0; i < sav->sav_count; i++)
@@ -5014,63 +6461,200 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vde
 	sav->sav_sync = B_FALSE;
 }
 
+/*
+ * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
+ * The all-vdev ZAP must be empty.
+ */
+static void
+spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
+{
+	spa_t *spa = vd->vdev_spa;
+	if (vd->vdev_top_zap != 0) {
+		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+		    vd->vdev_top_zap, tx));
+	}
+	if (vd->vdev_leaf_zap != 0) {
+		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+		    vd->vdev_leaf_zap, tx));
+	}
+	for (uint64_t i = 0; i < vd->vdev_children; i++) {
+		spa_avz_build(vd->vdev_child[i], avz, tx);
+	}
+}
+
 static void
 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 {
 	nvlist_t *config;
 
-	if (list_is_empty(&spa->spa_config_dirty_list))
+	/*
+	 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
+	 * its config may not be dirty but we still need to build per-vdev ZAPs.
+	 * Similarly, if the pool is being assembled (e.g. after a split), we
+	 * need to rebuild the AVZ although the config may not be dirty.
+	 */
+	if (list_is_empty(&spa->spa_config_dirty_list) &&
+	    spa->spa_avz_action == AVZ_ACTION_NONE)
 		return;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
+	ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
+	    spa->spa_all_vdev_zaps != 0);
+
+	if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
+		/* Make and build the new AVZ */
+		uint64_t new_avz = zap_create(spa->spa_meta_objset,
+		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+		spa_avz_build(spa->spa_root_vdev, new_avz, tx);
+
+		/* Diff old AVZ with new one */
+		zap_cursor_t zc;
+		zap_attribute_t za;
+
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			uint64_t vdzap = za.za_first_integer;
+			if (zap_lookup_int(spa->spa_meta_objset, new_avz,
+			    vdzap) == ENOENT) {
+				/*
+				 * ZAP is listed in old AVZ but not in new one;
+				 * destroy it
+				 */
+				VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
+				    tx));
+			}
+		}
+
+		zap_cursor_fini(&zc);
+
+		/* Destroy the old AVZ */
+		VERIFY0(zap_destroy(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, tx));
+
+		/* Replace the old AVZ in the dir obj with the new one */
+		VERIFY0(zap_update(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
+		    sizeof (new_avz), 1, &new_avz, tx));
+
+		spa->spa_all_vdev_zaps = new_avz;
+	} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
+		zap_cursor_t zc;
+		zap_attribute_t za;
+
+		/* Walk through the AVZ and destroy all listed ZAPs */
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			uint64_t zap = za.za_first_integer;
+			VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
+		}
+
+		zap_cursor_fini(&zc);
+
+		/* Destroy and unlink the AVZ itself */
+		VERIFY0(zap_destroy(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, tx));
+		VERIFY0(zap_remove(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
+		spa->spa_all_vdev_zaps = 0;
+	}
+
+	if (spa->spa_all_vdev_zaps == 0) {
+		spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
+		    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_VDEV_ZAP_MAP, tx);
+	}
+	spa->spa_avz_action = AVZ_ACTION_NONE;
+
+	/* Create ZAPs for vdevs that don't have them. */
+	vdev_construct_zaps(spa->spa_root_vdev, tx);
+
 	config = spa_config_generate(spa, spa->spa_root_vdev,
 	    dmu_tx_get_txg(tx), B_FALSE);
 
+	/*
+	 * If we're upgrading the spa version then make sure that
+	 * the config object gets updated with the correct version.
+	 */
+	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
+		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+		    spa->spa_uberblock.ub_version);
+
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
-	if (spa->spa_config_syncing)
-		nvlist_free(spa->spa_config_syncing);
+	nvlist_free(spa->spa_config_syncing);
 	spa->spa_config_syncing = config;
 
 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
 }
 
+static void
+spa_sync_version(void *arg, dmu_tx_t *tx)
+{
+	uint64_t *versionp = arg;
+	uint64_t version = *versionp;
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+	/*
+	 * Setting the version is special cased when first creating the pool.
+	 */
+	ASSERT(tx->tx_txg != TXG_INITIAL);
+
+	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
+	ASSERT(version >= spa_version(spa));
+
+	spa->spa_uberblock.ub_version = version;
+	vdev_config_dirty(spa->spa_root_vdev);
+	spa_history_log_internal(spa, "set", tx, "version=%lld", version);
+}
+
 /*
  * Set zpool properties.
  */
 static void
-spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+spa_sync_props(void *arg, dmu_tx_t *tx)
 {
-	spa_t *spa = arg1;
+	nvlist_t *nvp = arg;
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t *mos = spa->spa_meta_objset;
-	nvlist_t *nvp = arg2;
-	nvpair_t *elem;
-	uint64_t intval;
-	char *strval;
-	zpool_prop_t prop;
-	const char *propname;
-	zprop_type_t proptype;
+	nvpair_t *elem = NULL;
 
 	mutex_enter(&spa->spa_props_lock);
 
-	elem = NULL;
 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
+		uint64_t intval;
+		char *strval, *fname;
+		zpool_prop_t prop;
+		const char *propname;
+		zprop_type_t proptype;
+		spa_feature_t fid;
+
 		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
+		case ZPROP_INVAL:
+			/*
+			 * We checked this earlier in spa_prop_validate().
+			 */
+			ASSERT(zpool_prop_feature(nvpair_name(elem)));
+
+			fname = strchr(nvpair_name(elem), '@') + 1;
+			VERIFY0(zfeature_lookup_name(fname, &fid));
+
+			spa_feature_enable(spa, fid, tx);
+			spa_history_log_internal(spa, "set", tx,
+			    "%s=enabled", nvpair_name(elem));
+			break;
+
 		case ZPOOL_PROP_VERSION:
+			intval = fnvpair_value_uint64(elem);
 			/*
-			 * Only set version for non-zpool-creation cases
-			 * (set/import). spa_create() needs special care
-			 * for version setting.
+			 * The version is synced separately before other
+			 * properties and should be correct by now.
 			 */
-			if (tx->tx_txg != TXG_INITIAL) {
-				VERIFY(nvpair_value_uint64(elem,
-				    &intval) == 0);
-				ASSERT(intval <= SPA_VERSION);
-				ASSERT(intval >= spa_version(spa));
-				spa->spa_uberblock.ub_version = intval;
-				vdev_config_dirty(spa->spa_root_vdev);
-			}
+			ASSERT3U(spa_version(spa), >=, intval);
 			break;
 
 		case ZPOOL_PROP_ALTROOT:
@@ -5081,24 +6665,38 @@ spa_sync_props(void *arg1, void *arg2, c
 			ASSERT(spa->spa_root != NULL);
 			break;
 
+		case ZPOOL_PROP_READONLY:
 		case ZPOOL_PROP_CACHEFILE:
 			/*
-			 * 'cachefile' is also a non-persisitent property.
+			 * 'readonly' and 'cachefile' are also non-persistent
+			 * properties.
 			 */
 			break;
+		case ZPOOL_PROP_COMMENT:
+			strval = fnvpair_value_string(elem);
+			if (spa->spa_comment != NULL)
+				spa_strfree(spa->spa_comment);
+			spa->spa_comment = spa_strdup(strval);
+			/*
+			 * We need to dirty the configuration on all the vdevs
+			 * so that their labels get updated.  It's unnecessary
+			 * to do this for pool creation since the vdev's
+			 * configuration has already been dirtied.
+			 */
+			if (tx->tx_txg != TXG_INITIAL)
+				vdev_config_dirty(spa->spa_root_vdev);
+			spa_history_log_internal(spa, "set", tx,
+			    "%s=%s", nvpair_name(elem), strval);
+			break;
 		default:
 			/*
 			 * Set pool property values in the poolprops mos object.
 			 */
 			if (spa->spa_pool_props_object == 0) {
-				VERIFY((spa->spa_pool_props_object =
-				    zap_create(mos, DMU_OT_POOL_PROPS,
-				    DMU_OT_NONE, 0, tx)) > 0);
-
-				VERIFY(zap_update(mos,
+				spa->spa_pool_props_object =
+				    zap_create_link(mos, DMU_OT_POOL_PROPS,
 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
-				    8, 1, &spa->spa_pool_props_object, tx)
-				    == 0);
+				    tx);
 			}
 
 			/* normalize the property name */
@@ -5107,22 +6705,25 @@ spa_sync_props(void *arg1, void *arg2, c
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				ASSERT(proptype == PROP_TYPE_STRING);
-				VERIFY(nvpair_value_string(elem, &strval) == 0);
-				VERIFY(zap_update(mos,
+				strval = fnvpair_value_string(elem);
+				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
-				    1, strlen(strval) + 1, strval, tx) == 0);
-
+				    1, strlen(strval) + 1, strval, tx));
+				spa_history_log_internal(spa, "set", tx,
+				    "%s=%s", nvpair_name(elem), strval);
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
-				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
+				intval = fnvpair_value_uint64(elem);
 
 				if (proptype == PROP_TYPE_INDEX) {
 					const char *unused;
-					VERIFY(zpool_prop_index_to_string(
-					    prop, intval, &unused) == 0);
+					VERIFY0(zpool_prop_index_to_string(
+					    prop, intval, &unused));
 				}
-				VERIFY(zap_update(mos,
+				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
-				    8, 1, &intval, tx) == 0);
+				    8, 1, &intval, tx));
+				spa_history_log_internal(spa, "set", tx,
+				    "%s=%lld", nvpair_name(elem), intval);
 			} else {
 				ASSERT(0); /* not allowed */
 			}
@@ -5139,7 +6740,9 @@ spa_sync_props(void *arg1, void *arg2, c
 				break;
 			case ZPOOL_PROP_AUTOEXPAND:
 				spa->spa_autoexpand = intval;
-				spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
+				if (tx->tx_txg != TXG_INITIAL)
+					spa_async_request(spa,
+					    SPA_ASYNC_AUTOEXPAND);
 				break;
 			case ZPOOL_PROP_DEDUPDITTO:
 				spa->spa_dedup_ditto = intval;
@@ -5149,33 +6752,104 @@ spa_sync_props(void *arg1, void *arg2, c
 			}
 		}
 
-		/* log internal history if this is not a zpool create */
-		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
-		    tx->tx_txg != TXG_INITIAL) {
-			spa_history_internal_log(LOG_POOL_PROPSET,
-			    spa, tx, cr, "%s %lld %s",
-			    nvpair_name(elem), intval, spa_name(spa));
-		}
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 }
 
 /*
+ * Perform one-time upgrade on-disk changes.  spa_version() does not
+ * reflect the new version this txg, so there must be no changes this
+ * txg to anything that the upgrade code depends on after it executes.
+ * Therefore this must be called after dsl_pool_sync() does the sync
+ * tasks.
+ */
+static void
+spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+
+	ASSERT(spa->spa_sync_pass == 1);
+
+	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
+		dsl_pool_create_origin(dp, tx);
+
+		/* Keeping the origin open increases spa_minref */
+		spa->spa_minref += 3;
+	}
+
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
+		dsl_pool_upgrade_clones(dp, tx);
+	}
+
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
+		dsl_pool_upgrade_dir_clones(dp, tx);
+
+		/* Keeping the freedir open increases spa_minref */
+		spa->spa_minref += 3;
+	}
+
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+		spa_feature_create_zap_objects(spa, tx);
+	}
+
+	/*
+	 * The LZ4_COMPRESS feature's behaviour was changed to
+	 * activate_on_enable when the ability to use lz4 compression for
+	 * metadata was added.  Old pools that have this feature enabled
+	 * must be upgraded to have this feature active.
+	 */
+	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+		boolean_t lz4_en = spa_feature_is_enabled(spa,
+		    SPA_FEATURE_LZ4_COMPRESS);
+		boolean_t lz4_ac = spa_feature_is_active(spa,
+		    SPA_FEATURE_LZ4_COMPRESS);
+
+		if (lz4_en && !lz4_ac)
+			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
+	}
+
+	/*
+	 * If we haven't written the salt, do so now.  Note that the
+	 * feature may not be activated yet, but that's fine since
+	 * the presence of this ZAP entry is backwards compatible.
+	 */
+	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
+		VERIFY0(zap_add(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
+		    sizeof (spa->spa_cksum_salt.zcs_bytes),
+		    spa->spa_cksum_salt.zcs_bytes, tx));
+	}
+
+	rrw_exit(&dp->dp_config_rwlock, FTAG);
+}
+
+/*
  * Sync the specified transaction group.  New blocks may be dirtied as
  * part of the process, so we iterate until it converges.
  */
 void
 spa_sync(spa_t *spa, uint64_t txg)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	objset_t *mos = spa->spa_meta_objset;
-	bplist_t *defer_bpl = &spa->spa_deferred_bplist;
 	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
 	dmu_tx_t *tx;
 	int error;
+	uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
+	    zfs_vdev_queue_depth_pct / 100;
+
+	VERIFY(spa_writeable(spa));
 
 	/*
 	 * Lock out configuration changes.
@@ -5185,6 +6859,10 @@ spa_sync(spa_t *spa, uint64_t txg)
 	spa->spa_syncing_txg = txg;
 	spa->spa_sync_pass = 0;
 
+	mutex_enter(&spa->spa_alloc_lock);
+	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
+	mutex_exit(&spa->spa_alloc_lock);
+
 	/*
 	 * If there are any pending vdev state changes, convert them
 	 * into config changes that go out with this transaction group.
@@ -5209,10 +6887,26 @@ spa_sync(spa_t *spa, uint64_t txg)
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
-	VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj));
-
 	tx = dmu_tx_create_assigned(dp, txg);
 
+	spa->spa_sync_starttime = gethrtime();
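+	/*
+	 * Arm the "deadman" timer: if syncing this txg takes longer than
+	 * spa_deadman_synctime nanoseconds, the cyclic/callout fires and
+	 * flags the pool as hung.  It is disarmed again right after
+	 * dmu_tx_commit() below.
+	 */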
+#ifdef illumos
+	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
+	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
+#endif	/* illumos */
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	callout_schedule(&spa->spa_deadman_cycid,
+	    hz * spa->spa_deadman_synctime / NANOSEC);
+#endif
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
+#ifdef _KERNEL
+	callout_schedule(&spa->spa_deadman_cycid,
+	    hz * spa->spa_deadman_synctime / NANOSEC);
+#endif
+#endif /* __NetBSD__ */
+
 	/*
 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
 	 * set spa_deflate if we have no raid-z vdevs.
@@ -5234,28 +6928,37 @@ spa_sync(spa_t *spa, uint64_t txg)
 		}
 	}
 
-	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
-	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
-		dsl_pool_create_origin(dp, tx);
+	/*
+	 * Set the top-level vdev's max queue depth. Evaluate each
+	 * top-level's async write queue depth in case it changed.
+	 * The max queue depth will not change in the middle of syncing
+	 * out this txg.
+	 */
+	uint64_t queue_depth_total = 0;
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
 
-		/* Keeping the origin open increases spa_minref */
-		spa->spa_minref += 3;
-	}
+		if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
+		    !metaslab_group_initialized(mg))
+			continue;
 
-	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
-	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
-		dsl_pool_upgrade_clones(dp, tx);
-	}
+		/*
+		 * It is safe to do a lock-free check here because only async
+		 * allocations look at mg_max_alloc_queue_depth, and async
+		 * allocations all happen from spa_sync().
+		 */
+		ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
+		mg->mg_max_alloc_queue_depth = max_queue_depth;
+		queue_depth_total += mg->mg_max_alloc_queue_depth;
+	}
+	metaslab_class_t *mc = spa_normal_class(spa);
+	ASSERT0(refcount_count(&mc->mc_alloc_slots));
+	mc->mc_alloc_max_slots = queue_depth_total;
+	mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 
-	/*
-	 * If anything has changed in this txg, push the deferred frees
-	 * from the previous txg.  If not, leave them alone so that we
-	 * don't generate work on an otherwise idle system.
-	 */
-	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
-	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
-	    !txg_list_empty(&dp->dp_sync_tasks, txg))
-		spa_sync_deferred_bplist(spa, defer_bpl, tx, txg);
+	ASSERT3U(mc->mc_alloc_max_slots, <=,
+	    max_queue_depth * rvd->vdev_children);
 
 	/*
 	 * Iterate to convergence.
@@ -5271,29 +6974,72 @@ spa_sync(spa_t *spa, uint64_t txg)
 		spa_errlog_sync(spa, txg);
 		dsl_pool_sync(dp, txg);
 
-		if (pass <= SYNC_PASS_DEFERRED_FREE) {
-			zio_t *zio = zio_root(spa, NULL, NULL, 0);
-			bplist_sync(free_bpl, spa_sync_free, zio, tx);
-			VERIFY(zio_wait(zio) == 0);
+		if (pass < zfs_sync_pass_deferred_free) {
+			spa_sync_frees(spa, free_bpl, tx);
 		} else {
-			bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx);
+			/*
+			 * We can not defer frees in pass 1, because
+			 * we sync the deferred frees later in pass 1.
+			 */
+			ASSERT3U(pass, >, 1);
+			bplist_iterate(free_bpl, bpobj_enqueue_cb,
+			    &spa->spa_deferred_bpobj, tx);
 		}
 
 		ddt_sync(spa, txg);
-
-		mutex_enter(&spa->spa_scrub_lock);
-		while (spa->spa_scrub_inflight > 0)
-			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-		mutex_exit(&spa->spa_scrub_lock);
+		dsl_scan_sync(dp, tx);
 
 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
 			vdev_sync(vd, txg);
 
-	} while (dmu_objset_is_dirty(mos, txg));
+		if (pass == 1) {
+			spa_sync_upgrades(spa, tx);
+			ASSERT3U(txg, >=,
+			    spa->spa_uberblock.ub_rootbp.blk_birth);
+			/*
+			 * Note: We need to check if the MOS is dirty
+			 * because we could have marked the MOS dirty
+			 * without updating the uberblock (e.g. if we
+			 * have sync tasks but no dirty user data).  We
+			 * need to check the uberblock's rootbp because
+			 * it is updated if we have synced out dirty
+			 * data (though in this case the MOS will most
+			 * likely also be dirty due to second order
+			 * effects, we don't want to rely on that here).
+			 */
+			if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+			    !dmu_objset_is_dirty(mos, txg)) {
+				/*
+				 * Nothing changed on the first pass,
+				 * therefore this TXG is a no-op.  Avoid
+				 * syncing deferred frees, so that we
+				 * can keep this TXG as a no-op.
+				 */
+				ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
+				    txg));
+				ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+				ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+				break;
+			}
+			spa_sync_deferred_frees(spa, tx);
+		}
 
-	ASSERT(free_bpl->bpl_queue == NULL);
+	} while (dmu_objset_is_dirty(mos, txg));
 
-	bplist_close(defer_bpl);
+	if (!list_is_empty(&spa->spa_config_dirty_list)) {
+		/*
+		 * Make sure that the number of ZAPs for all the vdevs matches
+		 * the number of ZAPs in the per-vdev ZAP list. This only gets
+		 * called if the config is dirty; otherwise there may be
+		 * outstanding AVZ operations that weren't completed in
+		 * spa_sync_config_object.
+		 */
+		uint64_t all_vdev_zap_entry_count;
+		ASSERT0(zap_count(spa->spa_meta_objset,
+		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
+		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
+		    all_vdev_zap_entry_count);
+	}
 
 	/*
 	 * Rewrite the vdev configuration (which includes the uberblock)
@@ -5325,18 +7071,15 @@ spa_sync(spa_t *spa, uint64_t txg)
 				if (svdcount == SPA_DVAS_PER_BP)
 					break;
 			}
-			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
-			if (error != 0)
-				error = vdev_config_sync(svd, svdcount, txg,
-				    B_TRUE);
+			error = vdev_config_sync(svd, svdcount, txg);
 		} else {
 			error = vdev_config_sync(rvd->vdev_child,
-			    rvd->vdev_children, txg, B_FALSE);
-			if (error != 0)
-				error = vdev_config_sync(rvd->vdev_child,
-				    rvd->vdev_children, txg, B_TRUE);
+			    rvd->vdev_children, txg);
 		}
 
+		if (error == 0)
+			spa->spa_last_synced_guid = rvd->vdev_guid;
+
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 		if (error == 0)
@@ -5346,6 +7089,20 @@ spa_sync(spa_t *spa, uint64_t txg)
 	}
 	dmu_tx_commit(tx);
 
+#ifdef illumos
+	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
+#endif	/* illumos */
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	callout_drain(&spa->spa_deadman_cycid);
+#endif
+#endif	/* __FreeBSD__ */
+#ifdef __NetBSD__
+#ifdef _KERNEL
+	callout_drain(&spa->spa_deadman_cycid);
+#endif
+#endif	/* __NetBSD__ */
+
 	/*
 	 * Clear the dirty config list.
 	 */
@@ -5362,10 +7119,12 @@ spa_sync(spa_t *spa, uint64_t txg)
 		spa->spa_config_syncing = NULL;
 	}
 
-	spa->spa_ubsync = spa->spa_uberblock;
-
 	dsl_pool_sync_done(dp, txg);
 
+	mutex_enter(&spa->spa_alloc_lock);
+	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
+	mutex_exit(&spa->spa_alloc_lock);
+
 	/*
 	 * Update usable space statistics.
 	 */
@@ -5381,11 +7140,16 @@ spa_sync(spa_t *spa, uint64_t txg)
 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
-	ASSERT(defer_bpl->bpl_queue == NULL);
-	ASSERT(free_bpl->bpl_queue == NULL);
 
 	spa->spa_sync_pass = 0;
 
+	/*
+	 * Update the last synced uberblock here. We want to do this at
+	 * the end of spa_sync() so that consumers of spa_last_synced_txg()
+	 * will be guaranteed that all the processing associated with
+	 * that txg has been completed.
+	 */
+	spa->spa_ubsync = spa->spa_uberblock;
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	spa_handle_ignored_writes(spa);
@@ -5394,6 +7158,7 @@ spa_sync(spa_t *spa, uint64_t txg)
 	 * If any async tasks have been requested, kick them off.
 	 */
 	spa_async_dispatch(spa);
+	spa_async_dispatch_vd(spa);
 }
 
 /*
@@ -5407,7 +7172,8 @@ spa_sync_allpools(void)
 	spa_t *spa = NULL;
 	mutex_enter(&spa_namespace_lock);
 	while ((spa = spa_next(spa)) != NULL) {
-		if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
+		if (spa_state(spa) != POOL_STATE_ACTIVE ||
+		    !spa_writeable(spa) || spa_suspended(spa))
 			continue;
 		spa_open_ref(spa, FTAG);
 		mutex_exit(&spa_namespace_lock);
@@ -5487,6 +7253,8 @@ spa_lookup_by_guid(spa_t *spa, uint64_t 
 void
 spa_upgrade(spa_t *spa, uint64_t version)
 {
+	ASSERT(spa_writeable(spa));
+
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
@@ -5494,8 +7262,8 @@ spa_upgrade(spa_t *spa, uint64_t version
 	 * future version would result in an unopenable pool, this shouldn't be
 	 * possible.
 	 */
-	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
-	ASSERT(version >= spa->spa_uberblock.ub_version);
+	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
+	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
 
 	spa->spa_uberblock.ub_version = version;
 	vdev_config_dirty(spa->spa_root_vdev);
@@ -5546,25 +7314,17 @@ spa_has_active_shared_spare(spa_t *spa)
 	return (B_FALSE);
 }
 
-/*
- * Post a sysevent corresponding to the given event.  The 'name' must be one of
- * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
- * filled in from the spa and (optionally) the vdev.  This doesn't do anything
- * in the userland libzpool, as we don't want consumers to misinterpret ztest
- * or zdb as real changes.
- */
-void
-spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
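+/*
+ * Allocate and fill in a sysevent for the given pool and (optionally)
+ * vdev.  Posting is split out into spa_event_post() so that an event can
+ * be created first and posted later, e.g. once locks have been dropped.
+ */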
+static sysevent_t *
+spa_event_create(spa_t *spa, vdev_t *vd, const char *name)
 {
-#ifndef __NetBSD__	
+	sysevent_t		*ev = NULL;
 #ifdef _KERNEL
-	sysevent_t		*ev;
 	sysevent_attr_list_t	*attr = NULL;
 	sysevent_value_t	value;
-	sysevent_id_t		eid;
 
 	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
 	    SE_SLEEP);
+	ASSERT(ev != NULL);
 
 	value.value_type = SE_DATA_TYPE_STRING;
 	value.value.sv_string = spa_name(spa);
@@ -5596,12 +7356,34 @@ spa_event_notify(spa_t *spa, vdev_t *vd,
 		goto done;
 	attr = NULL;
 
-	(void) log_sysevent(ev, SE_SLEEP, &eid);
-
 done:
 	if (attr)
 		sysevent_free_attr(attr);
+
+#endif
+	return (ev);
+}
+
+static void
+spa_event_post(sysevent_t *ev)
+{
+#ifdef _KERNEL
+	sysevent_id_t		eid;
+
+	(void) log_sysevent(ev, SE_SLEEP, &eid);
 	sysevent_free(ev);
 #endif
-#endif /* __NetBSD__ */
+}
+
+/*
+ * Post a sysevent corresponding to the given event.  The 'name' must be one of
+ * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
+ * filled in from the spa and (optionally) the vdev.  This doesn't do anything
+ * in the userland libzpool, as we don't want consumers to misinterpret ztest
+ * or zdb as real changes.
+ */
+void
+spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
+{
+	spa_event_post(spa_event_create(spa, vd, name));
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c,v
retrieving revision 1.6
diff -u -p -r1.6 spa_config.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c	21 Nov 2011 17:51:03 -0000	1.6
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c	14 Apr 2017 20:19:21 -0000
@@ -20,11 +20,14 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
+#include <sys/zfs_context.h>
 #include <sys/spa.h>
+#include <sys/fm/fs/zfs.h>
 #include <sys/spa_impl.h>
 #include <sys/nvpair.h>
 #include <sys/uio.h>
@@ -32,8 +35,8 @@
 #include <sys/vdev_impl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/utsname.h>
-#include <sys/systeminfo.h>
 #include <sys/sunddi.h>
+#include <sys/zfeature.h>
 #ifdef _KERNEL
 #include <sys/kobj.h>
 #include <sys/zone.h>
@@ -83,6 +86,7 @@ spa_config_load(void)
 	 * Open the configuration file.
 	 */
 	pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
 	(void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);
 
 	file = kobj_open_file(pathname);
@@ -122,7 +126,7 @@ spa_config_load(void)
 		if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
 			continue;
 
-		VERIFY(nvpair_value_nvlist(nvpair, &child) == 0);
+		child = fnvpair_value_nvlist(nvpair);
 
 		if (spa_lookup(nvpair_name(nvpair)) != NULL)
 			continue;
@@ -140,6 +144,26 @@ out:
 }
 
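+/*
+ * Recursively strip the volatile vdev and scan statistics out of a pool
+ * config nvlist, so that ephemeral counters are not written to the
+ * cachefile.
+ */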
 static void
+spa_config_clean(nvlist_t *nvl)
+{
+	nvlist_t **child;
+	nvlist_t *nvroot = NULL;
+	uint_t c, children;
+
+	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
+	    &children) == 0) {
+		for (c = 0; c < children; c++)
+			spa_config_clean(child[c]);
+	}
+
+	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0)
+		spa_config_clean(nvroot);
+
+	nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, DATA_TYPE_UINT64_ARRAY);
+	nvlist_remove(nvl, ZPOOL_CONFIG_SCAN_STATS, DATA_TYPE_UINT64_ARRAY);
+}
+
+static int
 spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
 {
 	size_t buflen;
@@ -147,25 +171,22 @@ spa_config_write(spa_config_dirent_t *dp
 	vnode_t *vp;
 	int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
 	char *temp;
+	int err;
+
 	/*
 	 * If the nvlist is empty (NULL), then remove the old cachefile.
 	 */
 	if (nvl == NULL) {
-		(void) vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE);
-		return;
+		err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE);
+		return (err);
 	}
 
 	/*
 	 * Pack the configuration into a buffer.
 	 */
-	VERIFY(nvlist_size(nvl, &buflen, NV_ENCODE_XDR) == 0);
-
-	buf = kmem_alloc(buflen, KM_SLEEP);
+	buf = fnvlist_pack(nvl, &buflen);
 	temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 
-	VERIFY(nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR,
-	    KM_SLEEP) == 0);
-
 	/*
 	 * Write the configuration to disk.  We need to do the traditional
 	 * 'write to temporary file, sync, move over original' to make sure we
@@ -173,30 +194,40 @@ spa_config_write(spa_config_dirent_t *dp
 	 */
 	(void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path);
 
-	if (vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) == 0) {
-		if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
-		    0, RLIM64_INFINITY, kcred, NULL) == 0 &&
-		    VOP_FSYNC(vp, FSYNC, kcred, NULL) == 0) {
-			(void) vn_rename(temp, dp->scd_path, UIO_SYSSPACE);
-		}
+	err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0);
+	if (err == 0) {
+		err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
+		    0, RLIM64_INFINITY, kcred, NULL);
+		if (err == 0)
+			err = VOP_FSYNC(vp, FSYNC, kcred, NULL);
+		if (err == 0)
+			err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE);
 		(void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL);
 	}
 
 	(void) vn_remove(temp, UIO_SYSSPACE, RMFILE);
 
-	kmem_free(buf, buflen);
+	fnvlist_pack_free(buf, buflen);
 	kmem_free(temp, MAXPATHLEN);
+	return (err);
 }
 
 /*
  * Synchronize pool configuration to disk.  This must be called with the
- * namespace lock held.
+ * namespace lock held. Synchronizing the pool cache is typically done after
+ * the configuration has been synced to the MOS. This exposes a window where
+ * the MOS config will have been updated but the cache file has not. If
+ * the system were to crash at that instant then the cached config may not
+ * contain the correct information to open the pool and an explicit import
+ * would be required.
  */
 void
 spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
 {
 	spa_config_dirent_t *dp, *tdp;
 	nvlist_t *nvl;
+	boolean_t ccw_failure;
+	int error;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
@@ -208,6 +239,7 @@ spa_config_sync(spa_t *target, boolean_t
 	 * cachefile is changed, the new one is pushed onto this list, allowing
 	 * us to update previous cachefiles that no longer contain this pool.
 	 */
+	ccw_failure = B_FALSE;
 	for (dp = list_head(&target->spa_config_list); dp != NULL;
 	    dp = list_next(&target->spa_config_list, dp)) {
 		spa_t *spa = NULL;
@@ -219,8 +251,19 @@ spa_config_sync(spa_t *target, boolean_t
 		 */
 		nvl = NULL;
 		while ((spa = spa_next(spa)) != NULL) {
-			if (spa == target && removing)
+			nvlist_t *nvroot = NULL;
+			/*
+			 * Skip over our own pool if we're about to remove
+			 * ourselves from the spa namespace or any pool that
+			 * is readonly. Since we cannot guarantee that a
+			 * readonly pool would successfully import upon reboot,
+			 * we don't allow it to be written to the cache file.
+			 */
+			if ((spa == target && removing) ||
+			    (spa_state(spa) == POOL_STATE_ACTIVE &&
+			    !spa_writeable(spa)))
 				continue;
+
 			mutex_enter(&spa->spa_props_lock);
 			tdp = list_head(&spa->spa_config_list);
 			if (spa->spa_config == NULL ||
@@ -231,18 +274,42 @@ spa_config_sync(spa_t *target, boolean_t
 			}
 
 			if (nvl == NULL)
-				VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME,
-				    KM_SLEEP) == 0);
+				nvl = fnvlist_alloc();
 
-			VERIFY(nvlist_add_nvlist(nvl, spa->spa_name,
-			    spa->spa_config) == 0);
+			fnvlist_add_nvlist(nvl, spa->spa_name,
+			    spa->spa_config);
 			mutex_exit(&spa->spa_props_lock);
+
+			if (nvlist_lookup_nvlist(nvl, spa->spa_name, &nvroot) == 0)
+				spa_config_clean(nvroot);
 		}
 
-		spa_config_write(dp, nvl);
+		error = spa_config_write(dp, nvl);
+		if (error != 0)
+			ccw_failure = B_TRUE;
 		nvlist_free(nvl);
 	}
 
+	if (ccw_failure) {
+		/*
+		 * Keep trying so that configuration data is
+		 * written if/when any temporary filesystem
+		 * resource issues are resolved.
+		 */
+		if (target->spa_ccw_fail_time == 0) {
+			zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
+			    target, NULL, NULL, 0, 0);
+		}
+		target->spa_ccw_fail_time = gethrtime();
+		spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
+	} else {
+		/*
+		 * Do not rate limit future attempts to update
+		 * the config cache.
+		 */
+		target->spa_ccw_fail_time = 0;
+	}
+
 	/*
 	 * Remove any config entries older than the current one.
 	 */
@@ -275,15 +342,15 @@ spa_all_configs(uint64_t *generation)
 	if (*generation == spa_config_generation)
 		return (NULL);
 
-	VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	pools = fnvlist_alloc();
 
 	mutex_enter(&spa_namespace_lock);
 	while ((spa = spa_next(spa)) != NULL) {
-		if (INGLOBALZONE(curproc) ||
+		if (INGLOBALZONE(curthread) ||
 		    zone_dataset_visible(spa_name(spa), NULL)) {
 			mutex_enter(&spa->spa_props_lock);
-			VERIFY(nvlist_add_nvlist(pools, spa_name(spa),
-			    spa->spa_config) == 0);
+			fnvlist_add_nvlist(pools, spa_name(spa),
+			    spa->spa_config);
 			mutex_exit(&spa->spa_props_lock);
 		}
 	}
@@ -297,32 +364,14 @@ void
 spa_config_set(spa_t *spa, nvlist_t *config)
 {
 	mutex_enter(&spa->spa_props_lock);
-	if (spa->spa_config != NULL)
-		nvlist_free(spa->spa_config);
+	nvlist_free(spa->spa_config);
 	spa->spa_config = config;
 	mutex_exit(&spa->spa_props_lock);
 }
 
-/* Add discovered rewind info, if any to the provided nvlist */
-void
-spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *tonvl)
-{
-	int64_t loss = 0;
-
-	if (tonvl == NULL || spa->spa_load_txg == 0)
-		return;
-
-	VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_TIME,
-	    spa->spa_load_txg_ts) == 0);
-	if (spa->spa_last_ubsync_txg)
-		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
-	VERIFY(nvlist_add_int64(tonvl, ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
-	VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_DATA_ERRORS,
-	    spa->spa_load_data_errors) == 0);
-}
-
 /*
  * Generate the pool's configuration based on the current in-core state.
+ *
  * We infer whether to generate a complete config or just one top-level config
  * based on whether vd is the root vdev.
  */
@@ -350,18 +399,18 @@ spa_config_generate(spa_t *spa, vdev_t *
 	if (txg == -1ULL)
 		txg = spa->spa_config_txg;
 
-	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	config = fnvlist_alloc();
+
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
+	fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, spa_name(spa));
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa));
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
+	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
+	if (spa->spa_comment != NULL) {
+		fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
+		    spa->spa_comment);
+	}
 
-	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
-	    spa_version(spa)) == 0);
-	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
-	    spa_name(spa)) == 0);
-	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
-	    spa_state(spa)) == 0);
-	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
-	    txg) == 0);
-	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
-	    spa_guid(spa)) == 0);
 #ifdef	_KERNEL
 	hostid = zone_get_hostid(NULL);
 #else	/* _KERNEL */
@@ -372,23 +421,24 @@ spa_config_generate(spa_t *spa, vdev_t *
 	(void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
 #endif	/* _KERNEL */
 	if (hostid != 0) {
-		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
-		    hostid) == 0);
+		fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid);
 	}
-	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
-	    utsname.nodename) == 0);
+	fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename);
 
+	int config_gen_flags = 0;
 	if (vd != rvd) {
-		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
-		    vd->vdev_top->vdev_guid) == 0);
-		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
-		    vd->vdev_guid) == 0);
-		if (vd->vdev_isspare)
-			VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE,
-			    1ULL) == 0);
-		if (vd->vdev_islog)
-			VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG,
-			    1ULL) == 0);
+		fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+		    vd->vdev_top->vdev_guid);
+		fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
+		    vd->vdev_guid);
+		if (vd->vdev_isspare) {
+			fnvlist_add_uint64(config,
+			    ZPOOL_CONFIG_IS_SPARE, 1ULL);
+		}
+		if (vd->vdev_islog) {
+			fnvlist_add_uint64(config,
+			    ZPOOL_CONFIG_IS_LOG, 1ULL);
+		}
 		vd = vd->vdev_top;		/* label contains top config */
 	} else {
 		/*
@@ -396,14 +446,17 @@ spa_config_generate(spa_t *spa, vdev_t *
 		 * in the mos config, and not in the vdev labels
 		 */
 		if (spa->spa_config_splitting != NULL)
-			VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
-			    spa->spa_config_splitting) == 0);
+			fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
+			    spa->spa_config_splitting);
+		fnvlist_add_boolean(config,
+		    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS);
+
+		config_gen_flags |= VDEV_CONFIG_MOS;
 	}
 
 	/*
 	 * Add the top-level config.  We even add this on pools which
-	 * don't support holes in the namespace as older pools will
-	 * just ignore it.
+	 * don't support holes in the namespace.
 	 */
 	vdev_top_config_generate(spa, config);
 
@@ -413,14 +466,20 @@ spa_config_generate(spa_t *spa, vdev_t *
 	if (spa->spa_config_splitting != NULL &&
 	    nvlist_lookup_uint64(spa->spa_config_splitting,
 	    ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) {
-		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID,
-		    split_guid) == 0);
+		fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID,
+		    split_guid);
 	}
 
-	nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE);
-	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+	nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags);
+	fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot);
 	nvlist_free(nvroot);
 
+	/*
+	 * Store what's necessary for reading the MOS in the label.
+	 */
+	fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
+	    spa->spa_label_features);
+
 	if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
 		ddt_histogram_t *ddh;
 		ddt_stat_t *dds;
@@ -428,28 +487,26 @@ spa_config_generate(spa_t *spa, vdev_t *
 
 		ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
 		ddt_get_dedup_histogram(spa, ddh);
-		VERIFY(nvlist_add_uint64_array(config,
+		fnvlist_add_uint64_array(config,
 		    ZPOOL_CONFIG_DDT_HISTOGRAM,
-		    (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)) == 0);
+		    (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t));
 		kmem_free(ddh, sizeof (ddt_histogram_t));
 
 		ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP);
 		ddt_get_dedup_object_stats(spa, ddo);
-		VERIFY(nvlist_add_uint64_array(config,
+		fnvlist_add_uint64_array(config,
 		    ZPOOL_CONFIG_DDT_OBJ_STATS,
-		    (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)) == 0);
+		    (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t));
 		kmem_free(ddo, sizeof (ddt_object_t));
 
 		dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP);
 		ddt_get_dedup_stats(spa, dds);
-		VERIFY(nvlist_add_uint64_array(config,
+		fnvlist_add_uint64_array(config,
 		    ZPOOL_CONFIG_DDT_STATS,
-		    (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)) == 0);
+		    (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t));
 		kmem_free(dds, sizeof (ddt_stat_t));
 	}
 
-	spa_rewind_data_to_nvlist(spa, config);
-
 	if (locked)
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 
@@ -484,8 +541,10 @@ spa_config_update(spa_t *spa, int what)
 		 */
 		for (c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
-			if (tvd->vdev_ms_array == 0)
+			if (tvd->vdev_ms_array == 0) {
+				vdev_ashift_optimize(tvd);
 				vdev_metaslab_set_size(tvd);
+			}
 			vdev_expand(tvd, txg);
 		}
 	}
@@ -499,8 +558,7 @@ spa_config_update(spa_t *spa, int what)
 	/*
 	 * Update the global config cache to reflect the new mosconfig.
 	 */
-	if (!spa->spa_is_root)
-		spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
+	spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
 
 	if (what == SPA_CONFIG_UPDATE_POOL)
 		spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 spa_errlog.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c	27 Feb 2010 22:31:06 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c	17 Jul 2014 16:23:26 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  */
 
 /*
@@ -35,7 +35,7 @@
  * deleted from the log when the scrub completes.
  *
  * The log is stored using a ZAP object whose key is a string form of the
- * zbookmark tuple (objset, object, level, blkid), and whose contents is an
+ * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an
  * optional 'objset:object' human-readable string describing the data.  When an
  * error is first logged, this string will be empty, indicating that no name is
  * known.  This prevents us from having to issue a potentially large amount of
@@ -54,42 +54,12 @@
 #include <sys/zap.h>
 #include <sys/zio.h>
 
-/*
- * This is a stripped-down version of strtoull, suitable only for converting
- * lowercase hexidecimal numbers that don't overflow.
- */
-uint64_t
-strtonum(const char *str, char **nptr)
-{
-	uint64_t val = 0;
-	char c;
-	int digit;
-
-	while ((c = *str) != '\0') {
-		if (c >= '0' && c <= '9')
-			digit = c - '0';
-		else if (c >= 'a' && c <= 'f')
-			digit = 10 + c - 'a';
-		else
-			break;
-
-		val *= 16;
-		val += digit;
-
-		str++;
-	}
-
-	if (nptr)
-		*nptr = (char *)str;
-
-	return (val);
-}
 
 /*
  * Convert a bookmark to a string.
  */
 static void
-bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
+bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
 {
 	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
 	    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
@@ -101,7 +71,7 @@ bookmark_to_name(zbookmark_t *zb, char *
  */
 #ifdef _KERNEL
 static void
-name_to_bookmark(char *buf, zbookmark_t *zb)
+name_to_bookmark(char *buf, zbookmark_phys_t *zb)
 {
 	zb->zb_objset = strtonum(buf, &buf);
 	ASSERT(*buf == ':');
@@ -122,7 +92,7 @@ name_to_bookmark(char *buf, zbookmark_t 
 void
 spa_log_error(spa_t *spa, zio_t *zio)
 {
-	zbookmark_t *zb = &zio->io_logical->io_bookmark;
+	zbookmark_phys_t *zb = &zio->io_logical->io_bookmark;
 	spa_error_entry_t search;
 	spa_error_entry_t *new;
 	avl_tree_t *tree;
@@ -195,7 +165,7 @@ process_error_log(spa_t *spa, uint64_t o
 {
 	zap_cursor_t zc;
 	zap_attribute_t za;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 
 	if (obj == 0)
 		return (0);
@@ -206,15 +176,17 @@ process_error_log(spa_t *spa, uint64_t o
 
 		if (*count == 0) {
 			zap_cursor_fini(&zc);
-			return (ENOMEM);
+			return (SET_ERROR(ENOMEM));
 		}
 
 		name_to_bookmark(za.za_name, &zb);
 
 		if (copyout(&zb, (char *)addr +
-		    (*count - 1) * sizeof (zbookmark_t),
-		    sizeof (zbookmark_t)) != 0)
-			return (EFAULT);
+		    (*count - 1) * sizeof (zbookmark_phys_t),
+		    sizeof (zbookmark_phys_t)) != 0) {
+			zap_cursor_fini(&zc);
+			return (SET_ERROR(EFAULT));
+		}
 
 		*count -= 1;
 	}
@@ -232,12 +204,12 @@ process_error_list(avl_tree_t *list, voi
 	for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
 
 		if (*count == 0)
-			return (ENOMEM);
+			return (SET_ERROR(ENOMEM));
 
 		if (copyout(&se->se_bookmark, (char *)addr +
-		    (*count - 1) * sizeof (zbookmark_t),
-		    sizeof (zbookmark_t)) != 0)
-			return (EFAULT);
+		    (*count - 1) * sizeof (zbookmark_phys_t),
+		    sizeof (zbookmark_phys_t)) != 0)
+			return (SET_ERROR(EFAULT));
 
 		*count -= 1;
 	}
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c,v
retrieving revision 1.3
diff -u -p -r1.3 spa_history.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c	27 Feb 2010 23:43:53 -0000	1.3
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c	14 Apr 2017 20:24:24 -0000
@@ -20,8 +20,9 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/spa.h>
@@ -30,10 +31,14 @@
 #include <sys/dsl_synctask.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
 #include <sys/utsname.h>
-#include <sys/cmn_err.h>
 #include <sys/sunddi.h>
+#include <sys/cred.h>
+#include "zfs_comutil.h"
 #ifdef _KERNEL
+#include <sys/cmn_err.h>
 #include <sys/zone.h>
 #endif
 
@@ -86,7 +91,7 @@ spa_history_create_obj(spa_t *spa, dmu_t
 
 	ASSERT(spa->spa_history == 0);
 	spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
-	    SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
 	    sizeof (spa_history_phys_t), tx);
 
 	VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
@@ -101,11 +106,11 @@ spa_history_create_obj(spa_t *spa, dmu_t
 
 	/*
 	 * Figure out maximum size of history log.  We set it at
-	 * 1% of pool size, with a max of 32MB and min of 128KB.
+	 * 0.1% of pool size, with a max of 1G and min of 128KB.
 	 */
 	shpp->sh_phys_max_off =
-	    metaslab_class_get_dspace(spa_normal_class(spa)) / 100;
-	shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20);
+	    metaslab_class_get_dspace(spa_normal_class(spa)) / 1000;
+	shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30);
 	shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
 
 	dmu_buf_rele(dbp, FTAG);
@@ -175,30 +180,32 @@ spa_history_write(spa_t *spa, void *buf,
 }
 
 static char *
-spa_history_zone()
+spa_history_zone(void)
 {
-#if defined(_KERNEL) && !defined(__NetBSD__)
-	return (curproc->p_zone->zone_name);
-#else
-	return ("global");
+#ifdef _KERNEL
+#ifdef __FreeBSD__
+	/* XXX: pr_hostname can be changed by default from within a jail! */
+	if (jailed(curthread->td_ucred))
+		return (curthread->td_ucred->cr_prison->pr_hostname);
+#endif
 #endif
+	return (NULL);
 }
 
 /*
  * Write out a history event.
  */
+/*ARGSUSED*/
 static void
-spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+spa_history_log_sync(void *arg, dmu_tx_t *tx)
 {
-	spa_t		*spa = arg1;
-	history_arg_t	*hap = arg2;
-	const char	*history_str = hap->ha_history_str;
+	nvlist_t	*nvl = arg;
+	spa_t		*spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t	*mos = spa->spa_meta_objset;
 	dmu_buf_t	*dbp;
 	spa_history_phys_t *shpp;
 	size_t		reclen;
 	uint64_t	le_len;
-	nvlist_t	*nvrecord;
 	char		*record_packed = NULL;
 	int		ret;
 
@@ -215,7 +222,7 @@ spa_history_log_sync(void *arg1, void *a
 	 * Get the offset of where we need to write via the bonus buffer.
 	 * Update the offset when the write completes.
 	 */
-	VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+	VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
 	shpp = dbp->db_data;
 
 	dmu_buf_will_dirty(dbp, tx);
@@ -228,40 +235,35 @@ spa_history_log_sync(void *arg1, void *a
 	}
 #endif
 
-	VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-	VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME,
-	    gethrestime_sec()) == 0);
-	VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO,
-	    (uint64_t)crgetuid(cr)) == 0);
-	if (hap->ha_zone[0] != '\0')
-		VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE,
-		    hap->ha_zone) == 0);
+	fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec());
 #ifdef _KERNEL
-	VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST,
-	    utsname.nodename) == 0);
+	fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename);
 #endif
-	if (hap->ha_log_type == LOG_CMD_POOL_CREATE ||
-	    hap->ha_log_type == LOG_CMD_NORMAL) {
-		VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD,
-		    history_str) == 0);
-	} else {
-		VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT,
-		    hap->ha_event) == 0);
-		VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG,
-		    tx->tx_txg) == 0);
-		VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR,
-		    history_str) == 0);
+	if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
+		zfs_dbgmsg("command: %s",
+		    fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD));
+	} else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) {
+		if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) {
+			zfs_dbgmsg("txg %lld %s %s (id %llu) %s",
+			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
+			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
+			    fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME),
+			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID),
+			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
+		} else {
+			zfs_dbgmsg("txg %lld %s %s",
+			    fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
+			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
+			    fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
+		}
+	} else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
+		zfs_dbgmsg("ioctl %s",
+		    fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL));
 	}
 
-	VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0);
-	record_packed = kmem_alloc(reclen, KM_SLEEP);
-
-	VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen,
-	    NV_ENCODE_XDR, KM_SLEEP) == 0);
+	record_packed = fnvlist_pack(nvl, &reclen);
 
 	mutex_enter(&spa->spa_history_lock);
-	if (hap->ha_log_type == LOG_CMD_POOL_CREATE)
-		VERIFY(shpp->sh_eof == shpp->sh_pool_create_len);
 
 	/* write out the packed length as little endian */
 	le_len = LE_64((uint64_t)reclen);
@@ -269,37 +271,68 @@ spa_history_log_sync(void *arg1, void *a
 	if (!ret)
 		ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
 
-	if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) {
-		shpp->sh_pool_create_len += sizeof (le_len) + reclen;
-		shpp->sh_bof = shpp->sh_pool_create_len;
+	/* The first command is the create, which we keep forever */
+	if (ret == 0 && shpp->sh_pool_create_len == 0 &&
+	    nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
+		shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof;
 	}
 
 	mutex_exit(&spa->spa_history_lock);
-	nvlist_free(nvrecord);
-	kmem_free(record_packed, reclen);
+	fnvlist_pack_free(record_packed, reclen);
 	dmu_buf_rele(dbp, FTAG);
-
-	if (hap->ha_log_type == LOG_INTERNAL) {
-		kmem_free((void*)hap->ha_history_str, HIS_MAX_RECORD_LEN);
-		kmem_free(hap, sizeof (history_arg_t));
-	}
+	fnvlist_free(nvl);
 }
 
 /*
  * Write out a history event.
  */
 int
-spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what)
+spa_history_log(spa_t *spa, const char *msg)
 {
-	history_arg_t ha;
+	int err;
+	nvlist_t *nvl = fnvlist_alloc();
 
-	ASSERT(what != LOG_INTERNAL);
+	fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg);
+	err = spa_history_log_nvl(spa, nvl);
+	fnvlist_free(nvl);
+	return (err);
+}
+
+int
+spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
+{
+	int err = 0;
+	dmu_tx_t *tx;
+	nvlist_t *nvarg;
+
+	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa))
+		return (SET_ERROR(EINVAL));
+
+	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err) {
+		dmu_tx_abort(tx);
+		return (err);
+	}
+
+	nvarg = fnvlist_dup(nvl);
+	if (spa_history_zone() != NULL) {
+		fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE,
+		    spa_history_zone());
+	}
+	fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED()));
+
+	/* Kick this off asynchronously; errors are ignored. */
+	dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync,
+	    nvarg, 0, ZFS_SPACE_CHECK_NONE, tx);
+	dmu_tx_commit(tx);
+
+	/* spa_history_log_sync will free nvarg */
+	return (err);
 
-	ha.ha_history_str = history_str;
-	ha.ha_log_type = what;
-	(void) strlcpy(ha.ha_zone, spa_history_zone(), sizeof (ha.ha_zone));
-	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync,
-	    spa, &ha, 0));
 }
 
 /*
@@ -316,11 +349,19 @@ spa_history_get(spa_t *spa, uint64_t *of
 	int err;
 
 	/*
-	 * If the command history  doesn't exist (older pool),
+	 * If the command history doesn't exist (older pool),
 	 * that's ok, just return ENOENT.
 	 */
 	if (!spa->spa_history)
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
+
+	/*
+	 * The history is logged asynchronously, so when the caller requests
+	 * the first chunk of history, make sure everything has been
+	 * synced to disk so that we get it.
+	 */
+	if (*offp == 0 && spa_writeable(spa))
+		txg_wait_synced(spa_get_dsl(spa), 0);
 
 	if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
 		return (err);
@@ -391,42 +432,50 @@ spa_history_get(spa_t *spa, uint64_t *of
 	return (err);
 }
 
+/*
+ * The nvlist will be consumed by this call.
+ */
 static void
-log_internal(history_internal_events_t event, spa_t *spa,
-    dmu_tx_t *tx, cred_t *cr, const char *fmt, va_list adx)
+log_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
+    dmu_tx_t *tx, const char *fmt, va_list adx)
 {
-	history_arg_t *hap;
-	char *str;
+	char *msg;
+	va_list adx2;
 
 	/*
 	 * If this is part of creating a pool, not everything is
 	 * initialized yet, so don't bother logging the internal events.
+	 * Likewise if the pool is not writeable.
 	 */
-	if (tx->tx_txg == TXG_INITIAL)
+	if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) {
+		fnvlist_free(nvl);
 		return;
+	}
+
+	va_copy(adx2, adx);
 
-	hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
-	str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
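+	/* Size the buffer with a counting vsnprintf(), then format into it. */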
+	msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP);
+	(void) vsprintf(msg, fmt, adx2);
+	fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg);
+	strfree(msg);
 
-	(void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx);
+	va_end(adx2);
 
-	hap->ha_log_type = LOG_INTERNAL;
-	hap->ha_history_str = str;
-	hap->ha_event = event;
-	hap->ha_zone[0] = '\0';
+	fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation);
+	fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg);
 
 	if (dmu_tx_is_syncing(tx)) {
-		spa_history_log_sync(spa, hap, cr, tx);
+		spa_history_log_sync(nvl, tx);
 	} else {
-		dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL,
-		    spa_history_log_sync, spa, hap, 0, tx);
+		dsl_sync_task_nowait(spa_get_dsl(spa),
+		    spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx);
 	}
-	/* spa_history_log_sync() will free hap and str */
+	/* spa_history_log_sync() will free nvl */
 }
 
 void
-spa_history_internal_log(history_internal_events_t event, spa_t *spa,
-    dmu_tx_t *tx, cred_t *cr, const char *fmt, ...)
+spa_history_log_internal(spa_t *spa, const char *operation,
+    dmu_tx_t *tx, const char *fmt, ...)
 {
 	dmu_tx_t *htx = tx;
 	va_list adx;
@@ -441,7 +490,7 @@ spa_history_internal_log(history_interna
 	}
 
 	va_start(adx, fmt);
-	log_internal(event, spa, htx, cr, fmt, adx);
+	log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx);
 	va_end(adx);
 
 	/* if we didn't get a tx from the caller, commit the one we made */
@@ -450,21 +499,50 @@ spa_history_internal_log(history_interna
 }
 
 void
-spa_history_log_version(spa_t *spa, history_internal_events_t event)
+spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation,
+    dmu_tx_t *tx, const char *fmt, ...)
 {
-#ifdef _KERNEL
-	uint64_t current_vers = spa_version(spa);
+	va_list adx;
+	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+	nvlist_t *nvl = fnvlist_alloc();
 
-	if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) {
-		spa_history_internal_log(event, spa, NULL, CRED(),
-		    "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s",
-		    (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION,
-		    utsname.nodename, utsname.release, utsname.version,
-		    utsname.machine);
-	}
-	cmn_err(CE_CONT, "!%s version %llu pool %s using %llu",
-	    event == LOG_POOL_IMPORT ? "imported" :
-	    event == LOG_POOL_CREATE ? "created" : "accessed",
-	    (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION);
-#endif
+	ASSERT(tx != NULL);
+
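+	/* Tag the record with the dataset's name and object id. */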
+	dsl_dataset_name(ds, namebuf);
+	fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
+	fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object);
+
+	va_start(adx, fmt);
+	log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx);
+	va_end(adx);
+}
+
+void
+spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
+    dmu_tx_t *tx, const char *fmt, ...)
+{
+	va_list adx;
+	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+	nvlist_t *nvl = fnvlist_alloc();
+
+	ASSERT(tx != NULL);
+
+	dsl_dir_name(dd, namebuf);
+	fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
+	fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID,
+	    dsl_dir_phys(dd)->dd_head_dataset_obj);
+
+	va_start(adx, fmt);
+	log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx);
+	va_end(adx);
+}
+
+void
+spa_history_log_version(spa_t *spa, const char *operation)
+{
+	spa_history_log_internal(spa, operation, NULL,
+	    "pool version %llu; software version %llu/%d; uts %s %s %s %s",
+	    (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION,
+	    utsname.nodename, utsname.release, utsname.version,
+	    utsname.machine);
 }
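
For illustration (not part of the patch): a hypothetical call site for the
reworked interface above.  The _ds variant attaches the dataset name and
object id to the record itself, so a caller passes only the operation name
and an optional printf-style detail string:

	static void
	example_log_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
	{
		/* Recorded as: txg <txg> destroy <dsname> (id <id>) ... */
		spa_history_log_internal_ds(ds, "destroy", tx, "");
	}
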
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c,v
retrieving revision 1.2
diff -u -p -r1.2 spa_misc.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c	28 Mar 2014 03:46:56 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c	6 May 2017 15:16:38 -0000
@@ -19,12 +19,18 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa_impl.h>
+#include <sys/spa_boot.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
@@ -41,11 +47,18 @@
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
+#include <sys/dsl_scan.h>
 #include <sys/fs/zfs.h>
 #include <sys/metaslab_impl.h>
 #include <sys/arc.h>
 #include <sys/ddt.h>
 #include "zfs_prop.h"
+#include <sys/zfeature.h>
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
 
 /*
  * SPA locking
@@ -214,7 +227,7 @@
  * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
  * locking is, always, based on spa_namespace_lock and spa_config_lock[].
  *
- * spa_rename() is also implemented within this file since is requires
+ * spa_rename() is also implemented within this file since it requires
  * manipulation of the namespace.
  */
 
@@ -233,8 +246,8 @@ kmem_cache_t *spa_buffer_pool;
 int spa_mode_global;
 
 #ifdef ZFS_DEBUG
-/* Everything except dprintf is on by default in debug builds */
-int zfs_flags = ~ZFS_DEBUG_DPRINTF;
+/* Everything except dprintf and spa is on by default in debug builds */
+int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA);
 #else
 int zfs_flags = 0;
 #endif
@@ -243,9 +256,212 @@ int zfs_flags = 0;
  * zfs_recover can be set to nonzero to attempt to recover from
  * otherwise-fatal errors, typically caused by on-disk corruption.  When
  * set, calls to zfs_panic_recover() will turn into warning messages.
+ * This should only be used as a last resort, as it typically results
+ * in leaked space, or worse.
+ */
+boolean_t zfs_recover = B_FALSE;
+
+/*
+ * If destroy encounters an EIO while reading metadata (e.g. indirect
+ * blocks), space referenced by the missing metadata can not be freed.
+ * Normally this causes the background destroy to become "stalled", as
+ * it is unable to make forward progress.  While in this stalled state,
+ * all remaining space to free from the error-encountering filesystem is
+ * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
+ * permanently leak the space from indirect blocks that can not be read,
+ * and continue to free everything else that it can.
+ *
+ * The default, "stalling" behavior is useful if the storage partially
+ * fails (i.e. some but not all i/os fail), and then later recovers.  In
+ * this case, we will be able to continue pool operations while it is
+ * partially failed, and when it recovers, we can continue to free the
+ * space, with no leaks.  However, note that this case is actually
+ * fairly rare.
+ *
+ * Typically pools either (a) fail completely (but perhaps temporarily,
+ * e.g. a top-level vdev going offline), or (b) have localized,
+ * permanent errors (e.g. disk returns the wrong data due to bit flip or
+ * firmware bug).  In case (a), this setting does not matter because the
+ * pool will be suspended and the sync thread will not be able to make
+ * forward progress regardless.  In case (b), because the error is
+ * permanent, the best we can do is leak the minimum amount of space,
+ * which is what setting this flag will do.  Therefore, it is reasonable
+ * for this flag to normally be set, but we chose the more conservative
+ * approach of not setting it, so that there is no possibility of
+ * leaking space in the "partial temporary" failure case.
+ */
+boolean_t zfs_free_leak_on_eio = B_FALSE;
+
+/*
+ * Expiration time in milliseconds. This value has two meanings. First it is
+ * used to determine when the spa_deadman() logic should fire. By default the
+ * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
+ * Secondly, the value determines if an I/O is considered "hung". Any I/O that
+ * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
+ * in a system panic.
+ */
+uint64_t zfs_deadman_synctime_ms = 1000000ULL;
+
+/*
+ * Check time in milliseconds. This defines the frequency at which we check
+ * for hung I/O.
+ */
+uint64_t zfs_deadman_checktime_ms = 5000ULL;
+
+/*
+ * Default value of -1 for zfs_deadman_enabled is resolved in
+ * zfs_deadman_init()
+ */
+int zfs_deadman_enabled = -1;
+
+/*
+ * The worst case is single-sector max-parity RAID-Z blocks, in which
+ * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
+ * times the size; so just assume that.  Add to this the fact that
+ * we can have up to 3 DVAs per bp, and one more factor of 2 because
+ * the block may be dittoed with up to 3 DVAs by ddt_sync().  All together,
+ * the worst case is:
+ *     (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
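+ *     (i.e. (3 + 1) * 3 * 2, since VDEV_RAIDZ_MAXPARITY == 3 and
+ *     SPA_DVAS_PER_BP == 3).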
  */
-int zfs_recover = 0;
+int spa_asize_inflation = 24;
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0,
+    "Try to recover from otherwise-fatal errors.");
+
+static int
+sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
+{
+	int err, val;
+
+	val = zfs_flags;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	/*
+	 * ZFS_DEBUG_MODIFY must be enabled prior to boot so all
+	 * arc buffers in the system have the necessary additional
+	 * checksum data.  However, it is safe to disable at any
+	 * time.
+	 */
+	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+		val &= ~ZFS_DEBUG_MODIFY;
+	zfs_flags = val;
+
+	return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, debug_flags,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+    sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN,
+    &zfs_deadman_synctime_ms, 0,
+    "Stalled ZFS I/O expiration time in milliseconds");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN,
+    &zfs_deadman_checktime_ms, 0,
+    "Period of checks for stalled ZFS I/O in milliseconds");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
+    &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN,
+    &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes");
+#endif
+
 
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+static void
+zfs_deadman_init(void)
+{
+	/*
+	 * If we are not i386 or amd64 or in a virtual machine,
+	 * disable ZFS deadman thread by default
+	 */
+	if (zfs_deadman_enabled == -1) {
+#if defined(__amd64__) || defined(__i386__)
+		zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0;
+#else
+		zfs_deadman_enabled = 0;
+#endif
+	}
+}
+#endif	/* _KERNEL */
+#endif	/* __FreeBSD__ */
+
+#ifdef __NetBSD__
+#ifdef _HARDKERNEL
+static struct workqueue *spa_workqueue;
+
+static void spa_deadman(void *arg);
+
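+/* Bounce from callout context to a workqueue so spa_deadman() may sleep. */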
+static void
+spa_deadman_wq(struct work *wk, void *arg)
+{
+	spa_t *spa = container_of(wk, struct spa, spa_deadman_work);
+
+	spa_deadman(spa);
+}
+
+static void
+zfs_deadman_init(void)
+{
+	int error;
+
+	error = workqueue_create(&spa_workqueue, "spa_deadman",
+	    spa_deadman_wq, NULL, PRI_NONE, IPL_NONE, WQ_MPSAFE);
+	VERIFY0(error);
+}
+
+static void
+zfs_deadman_fini(void)
+{
+	workqueue_destroy(spa_workqueue);
+	spa_workqueue = NULL;
+}
+#else /* !_HARDKERNEL */
+#define zfs_deadman_init() /* nothing */
+#define zfs_deadman_fini() /* nothing */
+#endif /* !_HARDKERNEL */
+#endif /* __NetBSD__ */
+
+/*
+ * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
+ * the pool to be consumed.  This ensures that we don't run the pool
+ * completely out of space, due to unaccounted changes (e.g. to the MOS).
+ * It also limits the worst-case time to allocate space.  If we have
+ * less than this amount of free space, most ZPL operations (e.g. write,
+ * create) will return ENOSPC.
+ *
+ * Certain operations (e.g. file removal, most administrative actions) can
+ * use half the slop space.  They will only return ENOSPC if less than half
+ * the slop space is free.  Typically, once the pool has less than the slop
+ * space free, the user will use these operations to free up space in the pool.
+ * These are the operations that call dsl_pool_adjustedsize() with the netfree
+ * argument set to TRUE.
+ *
+ * A very restricted set of operations are always permitted, regardless of
+ * the amount of free space.  These are the operations that call
+ * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy".  If these
+ * operations result in a net increase in the amount of space used,
+ * it is possible to run the pool completely out of space, causing it to
+ * be permanently read-only.
+ *
+ * Note that on very small pools, the slop space will be larger than
+ * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
+ * but we never allow it to be more than half the pool size.
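+ *
+ * For example, a 1GB pool gets the 128MB floor (1GB >> 5 is only 32MB),
+ * while a 10GB pool gets 10GB >> 5 = 320MB of slop.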
+ *
+ * See also the comments in zfs_space_check_t.
+ */
+int spa_slop_shift = 5;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN,
+    &spa_slop_shift, 0,
+    "Shift value of reserved space (1/(2^spa_slop_shift)).");
+uint64_t spa_min_slop = 128 * 1024 * 1024;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN,
+    &spa_min_slop, 0,
+    "Minimal value of reserved space");
 
 /*
  * ==========================================================================
@@ -259,7 +475,7 @@ spa_config_lock_init(spa_t *spa)
 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
 		mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
 		cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
-		refcount_create(&scl->scl_count);
+		refcount_create_untracked(&scl->scl_count);
 		scl->scl_writer = NULL;
 		scl->scl_write_wanted = 0;
 	}
@@ -289,14 +505,16 @@ spa_config_tryenter(spa_t *spa, int lock
 		if (rw == RW_READER) {
 			if (scl->scl_writer || scl->scl_write_wanted) {
 				mutex_exit(&scl->scl_lock);
-				spa_config_exit(spa, locks ^ (1 << i), tag);
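+				/* Back out only locks we already hold. */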
+				spa_config_exit(spa, locks & ((1 << i) - 1),
+				    tag);
 				return (0);
 			}
 		} else {
 			ASSERT(scl->scl_writer != curthread);
 			if (!refcount_is_zero(&scl->scl_count)) {
 				mutex_exit(&scl->scl_lock);
-				spa_config_exit(spa, locks ^ (1 << i), tag);
+				spa_config_exit(spa, locks & ((1 << i) - 1),
+				    tag);
 				return (0);
 			}
 			scl->scl_writer = curthread;
@@ -312,6 +530,8 @@ spa_config_enter(spa_t *spa, int locks, 
 {
 	int wlocks_held = 0;
 
+	ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
+
 	for (int i = 0; i < SCL_LOCKS; i++) {
 		spa_config_lock_t *scl = &spa->spa_config_lock[i];
 		if (scl->scl_writer == curthread)
@@ -390,31 +610,76 @@ spa_lookup(const char *name)
 	static spa_t search;	/* spa_t is large; don't allocate on stack */
 	spa_t *spa;
 	avl_index_t where;
-	char c;
 	char *cp;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
+	(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
+
 	/*
 	 * If it's a full dataset name, figure out the pool name and
 	 * just use that.
 	 */
-	cp = strpbrk(name, "/@");
-	if (cp) {
-		c = *cp;
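+	/* Truncate only the local copy; the caller's name is untouched. */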
+	cp = strpbrk(search.spa_name, "/@#");
+	if (cp != NULL)
 		*cp = '\0';
-	}
 
-	(void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
 	spa = avl_find(&spa_namespace_avl, &search, &where);
 
-	if (cp)
-		*cp = c;
-
 	return (spa);
 }
 
 /*
+ * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
+ * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
+ * looking for potentially hung I/Os.
+ */
+static void
+spa_deadman(void *arg)
+{
+	spa_t *spa = arg;
+
+	/*
+	 * Disable the deadman timer if the pool is suspended.
+	 */
+	if (spa_suspended(spa)) {
+#ifdef illumos
+		VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
+#else
+		/* Nothing; just don't schedule any future callouts. */
+#endif
+		return;
+	}
+
+	zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
+	    (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
+	    ++spa->spa_deadman_calls);
+	if (zfs_deadman_enabled)
+		vdev_deadman(spa->spa_root_vdev);
+#ifndef illumos
+#ifdef _KERNEL
+	callout_schedule(&spa->spa_deadman_cycid,
+	    hz * zfs_deadman_checktime_ms / MILLISEC);
+#endif
+#endif
+}
+
+#ifdef _HARDKERNEL
+static void
+spa_deadman_timeout(void *arg)
+{
+	spa_t *spa = arg;
+
+#ifdef __FreeBSD__
+	taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task);
+#endif
+#ifdef __NetBSD__
+	workqueue_enqueue(spa_workqueue, &spa->spa_deadman_work, NULL);
+#endif
+}
+#endif /* _HARDKERNEL */
+
+/*
  * Create an uninitialized spa_t with the given name.  Requires
  * spa_namespace_lock.  The caller must ensure that the spa_t doesn't already
  * exist by calling spa_lookup() first.
@@ -424,6 +689,10 @@ spa_add(const char *name, nvlist_t *conf
 {
 	spa_t *spa;
 	spa_config_dirent_t *dp;
+#ifndef __FreeBSD__
+	cyc_handler_t hdlr;
+	cyc_time_t when;
+#endif
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
@@ -432,21 +701,24 @@ spa_add(const char *name, nvlist_t *conf
 	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
 
 	for (int t = 0; t < TXG_SIZE; t++)
-		bplist_init(&spa->spa_free_bplist[t]);
-	bplist_init(&spa->spa_deferred_bplist);
+		bplist_create(&spa->spa_free_bplist[t]);
 
 	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
@@ -456,6 +728,55 @@ spa_add(const char *name, nvlist_t *conf
 	spa->spa_proc = &p0;
 	spa->spa_proc_state = SPA_PROC_NONE;
 
+#ifndef __FreeBSD__
+	hdlr.cyh_func = spa_deadman;
+	hdlr.cyh_arg = spa;
+	hdlr.cyh_level = CY_LOW_LEVEL;
+#endif
+
+	spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
+
+#ifdef illumos
+	/*
+	 * This determines how often we need to check for hung I/Os after
+	 * the cyclic has already fired. Since checking for hung I/Os is
+	 * an expensive operation we don't want to check too frequently.
+	 * Instead wait for 5 seconds before checking again.
+	 */
+	when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
+	when.cyt_when = CY_INFINITY;
+	mutex_enter(&cpu_lock);
+	spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
+	mutex_exit(&cpu_lock);
+#endif
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	/*
+	 * callout(9) does not provide a way to initialize a callout with
+	 * a function and an argument, so we use callout_reset() to schedule
+	 * the callout in the very distant future.  Even if that event ever
+	 * fires, it should be okay as we won't have any active zio-s.
+	 * But normally spa_sync() will reschedule the callout with a proper
+	 * timeout.
+	 * callout(9) does not allow the callback function to sleep, but
+	 * vdev_deadman() needs to acquire vq_lock, and illumos mutexes are
+	 * emulated using sx(9).  For this reason spa_deadman_timeout()
+	 * will schedule spa_deadman() as a task on a taskqueue that allows
+	 * sleeping.
+	 */
+	TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa);
+	callout_init(&spa->spa_deadman_cycid, 1);
+	callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0,
+	    spa_deadman_timeout, spa, 0);
+#endif
+#endif
+#ifdef __NetBSD__
+#ifdef _HARDKERNEL
+	callout_init(&spa->spa_deadman_cycid, 0);
+	callout_setfunc(&spa->spa_deadman_cycid, spa_deadman_timeout, spa);
+#endif
+#endif
+
 	refcount_create(&spa->spa_refcount);
 	spa_config_lock_init(spa);
 
@@ -469,6 +790,9 @@ spa_add(const char *name, nvlist_t *conf
 		spa_active_count++;
 	}
 
+	avl_create(&spa->spa_alloc_tree, zio_timestamp_compare,
+	    sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+
 	/*
 	 * Every pool starts with the default cachefile
 	 */
@@ -476,11 +800,42 @@ spa_add(const char *name, nvlist_t *conf
 	    offsetof(spa_config_dirent_t, scd_link));
 
 	dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
-	dp->scd_path = spa_strdup(spa_config_path);
+	dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
 	list_insert_head(&spa->spa_config_list, dp);
 
-	if (config != NULL)
+	VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
+	    KM_SLEEP) == 0);
+
+	if (config != NULL) {
+		nvlist_t *features;
+
+		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
+		    &features) == 0) {
+			VERIFY(nvlist_dup(features, &spa->spa_label_features,
+			    0) == 0);
+		}
+
 		VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+	}
+
+	if (spa->spa_label_features == NULL) {
+		VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
+		    KM_SLEEP) == 0);
+	}
+
+	spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
+
+	spa->spa_min_ashift = INT_MAX;
+	spa->spa_max_ashift = 0;
+
+	/*
+	 * As a pool is being created, treat all features as disabled by
+	 * setting SPA_FEATURE_DISABLED for all entries in the feature
+	 * refcount cache.
+	 */
+	for (int i = 0; i < SPA_FEATURES; i++) {
+		spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
+	}
 
 	return (spa);
 }
@@ -497,6 +852,7 @@ spa_remove(spa_t *spa)
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+	ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);
 
 	nvlist_free(spa->spa_config_splitting);
 
@@ -515,29 +871,56 @@ spa_remove(spa_t *spa)
 		kmem_free(dp, sizeof (spa_config_dirent_t));
 	}
 
+	avl_destroy(&spa->spa_alloc_tree);
 	list_destroy(&spa->spa_config_list);
 
+	nvlist_free(spa->spa_label_features);
+	nvlist_free(spa->spa_load_info);
 	spa_config_set(spa, NULL);
 
+#ifdef illumos
+	mutex_enter(&cpu_lock);
+	if (spa->spa_deadman_cycid != CYCLIC_NONE)
+		cyclic_remove(spa->spa_deadman_cycid);
+	mutex_exit(&cpu_lock);
+	spa->spa_deadman_cycid = CYCLIC_NONE;
+#endif /* illumos */
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	callout_drain(&spa->spa_deadman_cycid);
+	taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task);
+#endif
+#endif
+#ifdef __NetBSD__
+#ifdef _HARDKERNEL
+	callout_drain(&spa->spa_deadman_cycid);
+#endif
+#endif
+
 	refcount_destroy(&spa->spa_refcount);
 
 	spa_config_lock_destroy(spa);
 
 	for (int t = 0; t < TXG_SIZE; t++)
-		bplist_fini(&spa->spa_free_bplist[t]);
-	bplist_fini(&spa->spa_deferred_bplist);
+		bplist_destroy(&spa->spa_free_bplist[t]);
+
+	zio_checksum_templates_free(spa);
 
 	cv_destroy(&spa->spa_async_cv);
+	cv_destroy(&spa->spa_evicting_os_cv);
 	cv_destroy(&spa->spa_proc_cv);
 	cv_destroy(&spa->spa_scrub_io_cv);
 	cv_destroy(&spa->spa_suspend_cv);
 
+	mutex_destroy(&spa->spa_alloc_lock);
 	mutex_destroy(&spa->spa_async_lock);
 	mutex_destroy(&spa->spa_errlist_lock);
 	mutex_destroy(&spa->spa_errlog_lock);
+	mutex_destroy(&spa->spa_evicting_os_lock);
 	mutex_destroy(&spa->spa_history_lock);
 	mutex_destroy(&spa->spa_proc_lock);
 	mutex_destroy(&spa->spa_props_lock);
+	mutex_destroy(&spa->spa_cksum_tmpls_lock);
 	mutex_destroy(&spa->spa_scrub_lock);
 	mutex_destroy(&spa->spa_suspend_lock);
 	mutex_destroy(&spa->spa_vdev_top_lock);
@@ -591,6 +974,20 @@ spa_close(spa_t *spa, void *tag)
 }
 
 /*
+ * Remove a reference to the given spa_t held by a dsl dir that is
+ * being asynchronously released.  Async releases occur from a taskq
+ * performing eviction of dsl datasets and dirs.  The namespace lock
+ * isn't held and the hold by the object being evicted may contribute to
+ * spa_minref (e.g. dataset or directory released during pool export),
+ * so the asserts in spa_close() do not apply.
+ */
+void
+spa_async_close(spa_t *spa, void *tag)
+{
+	(void) refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
  * Check to see if the spa refcount is zero.  Must be called with
  * spa_namespace_lock held.  We really compare against spa_minref, which is the
  * number of references acquired when opening a pool
@@ -888,11 +1285,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t 
 	 */
 	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
 
-	/*
-	 * If the config changed, notify the scrub thread that it must restart.
-	 */
 	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
-		dsl_pool_scrub_restart(spa->spa_dsl_pool);
 		config_changed = B_TRUE;
 		spa->spa_config_generation++;
 	}
@@ -922,7 +1315,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t 
 		txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	if (vd != NULL) {
-		ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
+		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
 		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 		vdev_free(vd);
 		spa_config_exit(spa, SCL_ALL, spa);
@@ -959,22 +1352,46 @@ spa_vdev_state_enter(spa_t *spa, int opl
 {
 	int locks = SCL_STATE_ALL | oplocks;
 
-	spa_config_enter(spa, locks, spa, RW_WRITER);
+	/*
+	 * Root pools may need to read of the underlying devfs filesystem
+	 * when opening up a vdev.  Unfortunately if we're holding the
+	 * SCL_ZIO lock it will result in a deadlock when we try to issue
+	 * the read from the root filesystem.  Instead we "prefetch"
+	 * the associated vnodes that we need prior to opening the
+	 * underlying devices and cache them so that we can prevent
+	 * any I/O when we are doing the actual open.
+	 */
+	if (spa_is_root(spa)) {
+		int low = locks & ~(SCL_ZIO - 1);
+		int high = locks & ~low;
+
+		spa_config_enter(spa, high, spa, RW_WRITER);
+		vdev_hold(spa->spa_root_vdev);
+		spa_config_enter(spa, low, spa, RW_WRITER);
+	} else {
+		spa_config_enter(spa, locks, spa, RW_WRITER);
+	}
 	spa->spa_vdev_locks = locks;
 }
 
 int
 spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
 {
+	boolean_t config_changed = B_FALSE;
+
 	if (vd != NULL || error == 0)
 		vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev,
 		    0, 0, B_FALSE);
 
 	if (vd != NULL) {
 		vdev_state_dirty(vd->vdev_top);
+		config_changed = B_TRUE;
 		spa->spa_config_generation++;
 	}
 
+	if (spa_is_root(spa))
+		vdev_rele(spa->spa_root_vdev);
+
 	ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
 	spa_config_exit(spa, spa->spa_vdev_locks, spa);
 
@@ -987,6 +1404,15 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *
 	if (vd != NULL)
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 
+	/*
+	 * If the config changed, update the config cache.
+	 */
+	if (config_changed) {
+		mutex_enter(&spa_namespace_lock);
+		spa_config_sync(spa, B_FALSE, B_TRUE);
+		mutex_exit(&spa_namespace_lock);
+	}
+
 	return (error);
 }
 
@@ -996,6 +1422,30 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *
  * ==========================================================================
  */
 
+void
+spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
+{
+	if (!nvlist_exists(spa->spa_label_features, feature)) {
+		fnvlist_add_boolean(spa->spa_label_features, feature);
+		/*
+		 * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
+		 * dirty the vdev config because lock SCL_CONFIG is not held.
+		 * Thankfully, in this case we don't need to dirty the config
+		 * because it will be written out anyway when we finish
+		 * creating the pool.
+		 */
+		if (tx->tx_txg != TXG_INITIAL)
+			vdev_config_dirty(spa->spa_root_vdev);
+	}
+}
+
+void
+spa_deactivate_mos_feature(spa_t *spa, const char *feature)
+{
+	if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
+		vdev_config_dirty(spa->spa_root_vdev);
+}
+
 /*
  * Rename a spa_t.
  */
@@ -1046,14 +1496,13 @@ spa_rename(const char *name, const char 
 	return (0);
 }
 
-
 /*
- * Determine whether a pool with given pool_guid exists.  If device_guid is
- * non-zero, determine whether the pool exists *and* contains a device with the
- * specified device_guid.
+ * Return the spa_t associated with given pool_guid, if it exists.  If
+ * device_guid is non-zero, determine whether the pool exists *and* contains
+ * a device with the specified device_guid.
  */
-boolean_t
-spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+spa_t *
+spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
 {
 	spa_t *spa;
 	avl_tree_t *t = &spa_namespace_avl;
@@ -1084,7 +1533,16 @@ spa_guid_exists(uint64_t pool_guid, uint
 		}
 	}
 
-	return (spa != NULL);
+	return (spa);
+}
+
+/*
+ * Determine whether a pool with the given pool_guid exists.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+	return (spa_by_guid(pool_guid, device_guid) != NULL);
 }
 
 char *
@@ -1138,11 +1596,31 @@ spa_generate_guid(spa_t *spa)
 void
 snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
 {
-	char *type = dmu_ot[BP_GET_TYPE(bp)].ot_name;
-	char *checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
-	char *compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
+	char type[256];
+	char *checksum = NULL;
+	char *compress = NULL;
+
+	if (bp != NULL) {
+		if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
+			dmu_object_byteswap_t bswap =
+			    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
+			(void) snprintf(type, sizeof (type), "bswap %s %s",
+			    DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
+			    "metadata" : "data",
+			    dmu_ot_byteswap[bswap].ob_name);
+		} else {
+			(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
+			    sizeof (type));
+		}
+		if (!BP_IS_EMBEDDED(bp)) {
+			checksum =
+			    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+		}
+		compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
+	}
 
-	SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, compress);
+	SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
+	    compress);
 }
 
 void
@@ -1171,6 +1649,37 @@ zfs_panic_recover(const char *fmt, ...)
 }
 
 /*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexadecimal numbers that don't overflow.
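+ * For example, zfs_strtonum("ff", NULL) returns 255.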
+ */
+uint64_t
+zfs_strtonum(const char *str, char **nptr)
+{
+	uint64_t val = 0;
+	char c;
+	int digit;
+
+	while ((c = *str) != '\0') {
+		if (c >= '0' && c <= '9')
+			digit = c - '0';
+		else if (c >= 'a' && c <= 'f')
+			digit = 10 + c - 'a';
+		else
+			break;
+
+		val *= 16;
+		val += digit;
+
+		str++;
+	}
+
+	if (nptr)
+		*nptr = (char *)str;
+
+	return (val);
+}
+
+/*
  * ==========================================================================
  * Accessor functions
  * ==========================================================================
@@ -1188,6 +1697,12 @@ spa_get_dsl(spa_t *spa)
 	return (spa->spa_dsl_pool);
 }
 
+boolean_t
+spa_is_initializing(spa_t *spa)
+{
+	return (spa->spa_is_initializing);
+}
+
 blkptr_t *
 spa_get_rootblkptr(spa_t *spa)
 {
@@ -1224,16 +1739,40 @@ spa_name(spa_t *spa)
 uint64_t
 spa_guid(spa_t *spa)
 {
+	dsl_pool_t *dp = spa_get_dsl(spa);
+	uint64_t guid;
+
 	/*
 	 * If we fail to parse the config during spa_load(), we can go through
 	 * the error path (which posts an ereport) and end up here with no root
-	 * vdev.  We stash the original pool guid in 'spa_load_guid' to handle
+	 * vdev.  We stash the original pool guid in 'spa_config_guid' to handle
 	 * this case.
 	 */
-	if (spa->spa_root_vdev != NULL)
+	if (spa->spa_root_vdev == NULL)
+		return (spa->spa_config_guid);
+
+	guid = spa->spa_last_synced_guid != 0 ?
+	    spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
+
+	/*
+	 * Return the most recently synced out guid unless we're
+	 * in syncing context.
+	 */
+	if (dp && dsl_pool_sync_context(dp))
 		return (spa->spa_root_vdev->vdev_guid);
 	else
-		return (spa->spa_load_guid);
+		return (guid);
+}
+
+uint64_t
+spa_load_guid(spa_t *spa)
+{
+	/*
+	 * This is a GUID that exists solely as a reference for the
+	 * purposes of the arc.  It is generated at load time, and
+	 * is never written to persistent storage.
+	 */
+	return (spa->spa_load_guid);
 }
 
 uint64_t
@@ -1276,14 +1815,21 @@ spa_freeze_txg(spa_t *spa)
 uint64_t
 spa_get_asize(spa_t *spa, uint64_t lsize)
 {
-	/*
-	 * The worst case is single-sector max-parity RAID-Z blocks, in which
-	 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
-	 * times the size; so just assume that.  Add to this the fact that
-	 * we can have up to 3 DVAs per bp, and one more factor of 2 because
-	 * the block may be dittoed with up to 3 DVAs by ddt_sync().
-	 */
-	return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
+	return (lsize * spa_asize_inflation);
+}
+
+/*
+ * Return the amount of slop space in bytes.  It is 1/32 of the pool (3.2%),
+ * or at least 128MB, unless that would cause it to be more than half the
+ * pool size.
+ *
+ * See the comment above spa_slop_shift for details.
+ */
+uint64_t
+spa_get_slop_space(spa_t *spa)
+{
+	uint64_t space = spa_get_dspace(spa);
+	return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
 }
 
 uint64_t
@@ -1339,6 +1885,34 @@ spa_log_class(spa_t *spa)
 	return (spa->spa_log_class);
 }
 
+void
+spa_evicting_os_register(spa_t *spa, objset_t *os)
+{
+	mutex_enter(&spa->spa_evicting_os_lock);
+	list_insert_head(&spa->spa_evicting_os_list, os);
+	mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_deregister(spa_t *spa, objset_t *os)
+{
+	mutex_enter(&spa->spa_evicting_os_lock);
+	list_remove(&spa->spa_evicting_os_list, os);
+	cv_broadcast(&spa->spa_evicting_os_cv);
+	mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_wait(spa_t *spa)
+{
+	mutex_enter(&spa->spa_evicting_os_lock);
+	while (!list_is_empty(&spa->spa_evicting_os_list))
+		cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
+	mutex_exit(&spa->spa_evicting_os_lock);
+
+	dmu_buf_user_evict_wait();
+}
+
 int
 spa_max_replication(spa_t *spa)
 {
@@ -1352,6 +1926,18 @@ spa_max_replication(spa_t *spa)
 	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
 }
 
+int
+spa_prev_software_version(spa_t *spa)
+{
+	return (spa->spa_prev_software_version);
+}
+
+uint64_t
+spa_deadman_synctime(spa_t *spa)
+{
+	return (spa->spa_deadman_synctime);
+}
+
 uint64_t
 dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
 {
@@ -1361,7 +1947,13 @@ dva_get_dsize_sync(spa_t *spa, const dva
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	if (asize != 0 && spa->spa_deflate) {
-		vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+		uint64_t vdev = DVA_GET_VDEV(dva);
+		vdev_t *vd = vdev_lookup_top(spa, vdev);
+		if (vd == NULL) {
+			panic(
+			    "dva_get_dsize_sync(): bad DVA %llu:%llu",
+			    (u_longlong_t)vdev, (u_longlong_t)asize);
+		}
 		dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
 	}
 
@@ -1373,7 +1965,7 @@ bp_get_dsize_sync(spa_t *spa, const blkp
 {
 	uint64_t dsize = 0;
 
-	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
 		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
 
 	return (dsize);
@@ -1386,7 +1978,7 @@ bp_get_dsize(spa_t *spa, const blkptr_t 
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
-	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+	for (int d = 0; d < BP_GET_NDVAS(bp); d++)
 		dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
 
 	spa_config_exit(spa, SCL_VDEV, FTAG);
@@ -1427,6 +2019,12 @@ spa_boot_init()
 	spa_config_load();
 }
 
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
+#endif
+#endif
+
 void
 spa_init(int mode)
 {
@@ -1446,21 +2044,51 @@ spa_init(int mode)
 
 	spa_mode_global = mode;
 
-	refcount_init();
+#ifdef illumos
+#ifdef _KERNEL
+	spa_arch_init();
+#else
+	if (spa_mode_global != FREAD && dprintf_find_string("watch")) {
+		arc_procfd = open("/proc/self/ctl", O_WRONLY);
+		if (arc_procfd == -1) {
+			perror("could not enable watchpoints: "
+			    "opening /proc/self/ctl failed: ");
+		} else {
+			arc_watch = B_TRUE;
+		}
+	}
+#endif
+#endif /* illumos */
+	refcount_sysinit();
 	unique_init();
+	range_tree_init();
+	metaslab_alloc_trace_init();
 	zio_init();
+	lz4_init();
 	dmu_init();
 	zil_init();
 	vdev_cache_stat_init();
 	zfs_prop_init();
 	zpool_prop_init();
+	zpool_feature_init();
 	spa_config_load();
 	l2arc_start();
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	zfs_deadman_init();
+#endif
+#endif	/* __FreeBSD__ */
+#ifdef __NetBSD__
+	zfs_deadman_init();
+#endif
 }
 
 void
 spa_fini(void)
 {
+#ifdef __NetBSD__
+	zfs_deadman_fini();
+#endif
 	l2arc_stop();
 
 	spa_evict_all();
@@ -1468,7 +2096,10 @@ spa_fini(void)
 	vdev_cache_stat_fini();
 	zil_fini();
 	dmu_fini();
+	lz4_fini();
 	zio_fini();
+	metaslab_alloc_trace_fini();
+	range_tree_fini();
 	unique_fini();
 	refcount_fini();
 
@@ -1517,6 +2148,16 @@ spa_writeable(spa_t *spa)
 	return (!!(spa->spa_mode & FWRITE));
 }
 
+/*
+ * Returns true if there is a pending sync task in any of the current
+ * syncing txg, the current quiescing txg, or the current open txg.
+ */
+boolean_t
+spa_has_pending_synctask(spa_t *spa)
+{
+	return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
+}
+
 int
 spa_mode(spa_t *spa)
 {
@@ -1546,3 +2187,60 @@ spa_dedup_checksum(spa_t *spa)
 {
 	return (spa->spa_dedup_checksum);
 }
+
+/*
+ * Reset pool scan stat per scan pass (or reboot).
+ */
+void
+spa_scan_stat_init(spa_t *spa)
+{
+	/* data not stored on disk */
+	spa->spa_scan_pass_start = gethrestime_sec();
+	spa->spa_scan_pass_exam = 0;
+	vdev_scan_stat_init(spa->spa_root_vdev);
+}
+
+/*
+ * Get scan stats for zpool status reports
+ */
+int
+spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
+{
+	dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
+
+	if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
+		return (SET_ERROR(ENOENT));
+	bzero(ps, sizeof (pool_scan_stat_t));
+
+	/* data stored on disk */
+	ps->pss_func = scn->scn_phys.scn_func;
+	ps->pss_start_time = scn->scn_phys.scn_start_time;
+	ps->pss_end_time = scn->scn_phys.scn_end_time;
+	ps->pss_to_examine = scn->scn_phys.scn_to_examine;
+	ps->pss_examined = scn->scn_phys.scn_examined;
+	ps->pss_to_process = scn->scn_phys.scn_to_process;
+	ps->pss_processed = scn->scn_phys.scn_processed;
+	ps->pss_errors = scn->scn_phys.scn_errors;
+	ps->pss_state = scn->scn_phys.scn_state;
+
+	/* data not stored on disk */
+	ps->pss_pass_start = spa->spa_scan_pass_start;
+	ps->pss_pass_exam = spa->spa_scan_pass_exam;
+
+	return (0);
+}
+
+boolean_t
+spa_debug_enabled(spa_t *spa)
+{
+	return (spa->spa_debug);
+}
+
+int
+spa_maxblocksize(spa_t *spa)
+{
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+		return (SPA_MAXBLOCKSIZE);
+	else
+		return (SPA_OLD_MAXBLOCKSIZE);
+}
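
As a sanity check (not part of the patch), the slop computation introduced
in spa_get_slop_space() above can be exercised in isolation; a standalone,
compilable sketch:

	#include <stdio.h>
	#include <stdint.h>

	#define MAX(a, b)	((a) > (b) ? (a) : (b))
	#define MIN(a, b)	((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		const int spa_slop_shift = 5;		/* 1/32 of the pool */
		const uint64_t spa_min_slop = 128ULL << 20;	/* 128MB */
		const uint64_t pools[] =
		    { 1ULL << 30, 10ULL << 30, 1ULL << 40 };

		for (int i = 0; i < 3; i++) {
			uint64_t space = pools[i];
			/* Same expression as spa_get_slop_space(). */
			uint64_t slop = MAX(space >> spa_slop_shift,
			    MIN(space >> 1, spa_min_slop));
			printf("pool %8llu MB -> slop %6llu MB\n",
			    (unsigned long long)(space >> 20),
			    (unsigned long long)(slop >> 20));
		}
		return (0);
	}

This prints 128MB of slop for a 1GB pool (the spa_min_slop floor takes
over), 320MB for a 10GB pool, and 32GB for a 1TB pool.
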
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c,v
retrieving revision 1.4
diff -u -p -r1.4 space_map.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c	19 May 2010 17:50:59 -0000	1.4
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c	27 Mar 2017 06:19:45 -0000
@@ -22,296 +22,81 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/dsl_pool.h>
 #include <sys/zio.h>
 #include <sys/space_map.h>
+#include <sys/refcount.h>
+#include <sys/zfeature.h>
 
-/*
- * Space map routines.
- * NOTE: caller is responsible for all locking.
- */
-static int
-space_map_seg_compare(const void *x1, const void *x2)
-{
-	const space_seg_t *s1 = x1;
-	const space_seg_t *s2 = x2;
-
-	if (s1->ss_start < s2->ss_start) {
-		if (s1->ss_end > s2->ss_start)
-			return (0);
-		return (-1);
-	}
-	if (s1->ss_start > s2->ss_start) {
-		if (s1->ss_start < s2->ss_end)
-			return (0);
-		return (1);
-	}
-	return (0);
-}
-
-void
-space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
-	kmutex_t *lp)
-{
-	bzero(sm, sizeof (*sm));
-
-	cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);
-
-	avl_create(&sm->sm_root, space_map_seg_compare,
-	    sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
-
-	sm->sm_start = start;
-	sm->sm_size = size;
-	sm->sm_shift = shift;
-	sm->sm_lock = lp;
-}
-
-void
-space_map_destroy(space_map_t *sm)
-{
-	ASSERT(!sm->sm_loaded && !sm->sm_loading);
-	VERIFY3U(sm->sm_space, ==, 0);
-	avl_destroy(&sm->sm_root);
-	cv_destroy(&sm->sm_load_cv);
-}
-
-void
-space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	avl_index_t where;
-	space_seg_t ssearch, *ss_before, *ss_after, *ss;
-	uint64_t end = start + size;
-	int merge_before, merge_after;
-
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	VERIFY(size != 0);
-	VERIFY3U(start, >=, sm->sm_start);
-	VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
-	VERIFY(sm->sm_space + size <= sm->sm_size);
-	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
-	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
-	ssearch.ss_start = start;
-	ssearch.ss_end = end;
-	ss = avl_find(&sm->sm_root, &ssearch, &where);
-
-	if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) {
-		zfs_panic_recover("zfs: allocating allocated segment"
-		    "(offset=%llu size=%llu)\n",
-		    (longlong_t)start, (longlong_t)size);
-		return;
-	}
-
-	/* Make sure we don't overlap with either of our neighbors */
-	VERIFY(ss == NULL);
-
-	ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
-	ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
-
-	merge_before = (ss_before != NULL && ss_before->ss_end == start);
-	merge_after = (ss_after != NULL && ss_after->ss_start == end);
-
-	if (merge_before && merge_after) {
-		avl_remove(&sm->sm_root, ss_before);
-		if (sm->sm_pp_root) {
-			avl_remove(sm->sm_pp_root, ss_before);
-			avl_remove(sm->sm_pp_root, ss_after);
-		}
-		ss_after->ss_start = ss_before->ss_start;
-		kmem_free(ss_before, sizeof (*ss_before));
-		ss = ss_after;
-	} else if (merge_before) {
-		ss_before->ss_end = end;
-		if (sm->sm_pp_root)
-			avl_remove(sm->sm_pp_root, ss_before);
-		ss = ss_before;
-	} else if (merge_after) {
-		ss_after->ss_start = start;
-		if (sm->sm_pp_root)
-			avl_remove(sm->sm_pp_root, ss_after);
-		ss = ss_after;
-	} else {
-		ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
-		ss->ss_start = start;
-		ss->ss_end = end;
-		avl_insert(&sm->sm_root, ss, where);
-	}
-
-	if (sm->sm_pp_root)
-		avl_add(sm->sm_pp_root, ss);
-
-	sm->sm_space += size;
-}
-
-void
-space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	avl_index_t where;
-	space_seg_t ssearch, *ss, *newseg;
-	uint64_t end = start + size;
-	int left_over, right_over;
-
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	VERIFY(size != 0);
-	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
-	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
-	ssearch.ss_start = start;
-	ssearch.ss_end = end;
-	ss = avl_find(&sm->sm_root, &ssearch, &where);
-
-	/* Make sure we completely overlap with someone */
-	if (ss == NULL) {
-		zfs_panic_recover("zfs: freeing free segment "
-		    "(offset=%llu size=%llu)",
-		    (longlong_t)start, (longlong_t)size);
-		return;
-	}
-	VERIFY3U(ss->ss_start, <=, start);
-	VERIFY3U(ss->ss_end, >=, end);
-	VERIFY(sm->sm_space - size <= sm->sm_size);
-
-	left_over = (ss->ss_start != start);
-	right_over = (ss->ss_end != end);
-
-	if (sm->sm_pp_root)
-		avl_remove(sm->sm_pp_root, ss);
-
-	if (left_over && right_over) {
-		newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
-		newseg->ss_start = end;
-		newseg->ss_end = ss->ss_end;
-		ss->ss_end = start;
-		avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
-		if (sm->sm_pp_root)
-			avl_add(sm->sm_pp_root, newseg);
-	} else if (left_over) {
-		ss->ss_end = start;
-	} else if (right_over) {
-		ss->ss_start = end;
-	} else {
-		avl_remove(&sm->sm_root, ss);
-		kmem_free(ss, sizeof (*ss));
-		ss = NULL;
-	}
-
-	if (sm->sm_pp_root && ss != NULL)
-		avl_add(sm->sm_pp_root, ss);
-
-	sm->sm_space -= size;
-}
-
-boolean_t
-space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	avl_index_t where;
-	space_seg_t ssearch, *ss;
-	uint64_t end = start + size;
-
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-	VERIFY(size != 0);
-	VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
-	VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
-	ssearch.ss_start = start;
-	ssearch.ss_end = end;
-	ss = avl_find(&sm->sm_root, &ssearch, &where);
-
-	return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end);
-}
-
-void
-space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
-{
-	space_seg_t *ss;
-	void *cookie = NULL;
-
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-
-	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
-		if (func != NULL)
-			func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
-		kmem_free(ss, sizeof (*ss));
-	}
-	sm->sm_space = 0;
-}
-
-void
-space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
-{
-	space_seg_t *ss;
-
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-
-	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-		func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
-}
+SYSCTL_DECL(_vfs_zfs);
 
 /*
- * Wait for any in-progress space_map_load() to complete.
+ * The data for a given space map can be kept on blocks of any size.
+ * Larger blocks entail fewer i/o operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more i/o bandwidth
+ * when only a few blocks have changed since the last transaction group.
  */
-void
-space_map_load_wait(space_map_t *sm)
-{
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-
-	while (sm->sm_loading) {
-		ASSERT(!sm->sm_loaded);
-		cv_wait(&sm->sm_load_cv, sm->sm_lock);
-	}
-}
+int space_map_blksz = (1 << 12);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_blksz, CTLFLAG_RDTUN, &space_map_blksz, 0,
+    "Maximum block size for space map.  Must be power of 2 and greater than 4096.");
 
 /*
+ * Load the space map from disk into the specified range tree. Segments of
+ * maptype are added to the range tree; other segment types are removed.
+ *
  * Note: space_map_load() will drop sm_lock across dmu_read() calls.
  * The caller must be OK with this.
  */
 int
-space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
-	space_map_obj_t *smo, objset_t *os)
+space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
 {
 	uint64_t *entry, *entry_map, *entry_map_end;
 	uint64_t bufsize, size, offset, end, space;
-	uint64_t mapstart = sm->sm_start;
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
-	ASSERT(!sm->sm_loaded);
-	ASSERT(!sm->sm_loading);
 
-	sm->sm_loading = B_TRUE;
-	end = smo->smo_objsize;
-	space = smo->smo_alloc;
+	end = space_map_length(sm);
+	space = space_map_allocated(sm);
 
-	ASSERT(sm->sm_ops == NULL);
-	VERIFY3U(sm->sm_space, ==, 0);
+	VERIFY0(range_tree_space(rt));
 
 	if (maptype == SM_FREE) {
-		space_map_add(sm, sm->sm_start, sm->sm_size);
+		range_tree_add(rt, sm->sm_start, sm->sm_size);
 		space = sm->sm_size - space;
 	}
 
-	bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
+	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
 	entry_map = zio_buf_alloc(bufsize);
 
 	mutex_exit(sm->sm_lock);
-	if (end > bufsize)
-		dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
+	if (end > bufsize) {
+		dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
+		    end - bufsize, ZIO_PRIORITY_SYNC_READ);
+	}
 	mutex_enter(sm->sm_lock);
 
 	for (offset = 0; offset < end; offset += bufsize) {
 		size = MIN(end - offset, bufsize);
 		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
 		VERIFY(size != 0);
+		ASSERT3U(sm->sm_blksz, !=, 0);
 
 		dprintf("object=%llu  offset=%llx  size=%llx\n",
-		    smo->smo_object, offset, size);
+		    space_map_object(sm), offset, size);
 
 		mutex_exit(sm->sm_lock);
-		error = dmu_read(os, smo->smo_object, offset, size, entry_map,
-		    DMU_READ_PREFETCH);
+		error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
+		    entry_map, DMU_READ_PREFETCH);
 		mutex_enter(sm->sm_lock);
 		if (error != 0)
 			break;
@@ -319,115 +104,176 @@ space_map_load(space_map_t *sm, space_ma
 		entry_map_end = entry_map + (size / sizeof (uint64_t));
 		for (entry = entry_map; entry < entry_map_end; entry++) {
 			uint64_t e = *entry;
+			uint64_t offset, size;
 
 			if (SM_DEBUG_DECODE(e))		/* Skip debug entries */
 				continue;
 
-			(SM_TYPE_DECODE(e) == maptype ?
-			    space_map_add : space_map_remove)(sm,
-			    (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
-			    SM_RUN_DECODE(e) << sm->sm_shift);
+			offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
+			    sm->sm_start;
+			size = SM_RUN_DECODE(e) << sm->sm_shift;
+
+			VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
+			VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
+			VERIFY3U(offset, >=, sm->sm_start);
+			VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
+			if (SM_TYPE_DECODE(e) == maptype) {
+				VERIFY3U(range_tree_space(rt) + size, <=,
+				    sm->sm_size);
+				range_tree_add(rt, offset, size);
+			} else {
+				range_tree_remove(rt, offset, size);
+			}
 		}
 	}
 
-	if (error == 0) {
-		VERIFY3U(sm->sm_space, ==, space);
-
-		sm->sm_loaded = B_TRUE;
-		sm->sm_ops = ops;
-		if (ops != NULL)
-			ops->smop_load(sm);
-	} else {
-		space_map_vacate(sm, NULL, NULL);
-	}
+	if (error == 0)
+		VERIFY3U(range_tree_space(rt), ==, space);
+	else
+		range_tree_vacate(rt, NULL, NULL);
 
 	zio_buf_free(entry_map, bufsize);
-
-	sm->sm_loading = B_FALSE;
-
-	cv_broadcast(&sm->sm_load_cv);
-
 	return (error);
 }
 
 void
-space_map_unload(space_map_t *sm)
+space_map_histogram_clear(space_map_t *sm)
 {
-	ASSERT(MUTEX_HELD(sm->sm_lock));
-
-	if (sm->sm_loaded && sm->sm_ops != NULL)
-		sm->sm_ops->smop_unload(sm);
-
-	sm->sm_loaded = B_FALSE;
-	sm->sm_ops = NULL;
+	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
+		return;
 
-	space_map_vacate(sm, NULL, NULL);
+	bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
 }
 
-uint64_t
-space_map_maxsize(space_map_t *sm)
+boolean_t
+space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
 {
-	ASSERT(sm->sm_ops != NULL);
-	return (sm->sm_ops->smop_max(sm));
+	/*
+	 * Verify that the in-core range tree does not have any
+	 * ranges smaller than our sm_shift size.
+	 */
+	for (int i = 0; i < sm->sm_shift; i++) {
+		if (rt->rt_histogram[i] != 0)
+			return (B_FALSE);
+	}
+	return (B_TRUE);
 }
 
-uint64_t
-space_map_alloc(space_map_t *sm, uint64_t size)
+void
+space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
 {
-	uint64_t start;
+	int idx = 0;
 
-	start = sm->sm_ops->smop_alloc(sm, size);
-	if (start != -1ULL)
-		space_map_remove(sm, start, size);
-	return (start);
-}
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+	ASSERT(dmu_tx_is_syncing(tx));
+	VERIFY3U(space_map_object(sm), !=, 0);
 
-void
-space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
-	sm->sm_ops->smop_claim(sm, start, size);
-	space_map_remove(sm, start, size);
+	if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
+		return;
+
+	dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+	ASSERT(space_map_histogram_verify(sm, rt));
+	/*
+	 * Transfer the content of the range tree histogram to the space
+	 * map histogram. The space map histogram contains 32 buckets ranging
+	 * between 2^sm_shift to 2^(32+sm_shift-1). The range tree,
+	 * however, can represent ranges from 2^0 to 2^63. Since the space
+	 * map only cares about allocatable blocks (minimum of sm_shift) we
+	 * can safely ignore all ranges in the range tree smaller than sm_shift.
+	 */
+	for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+
+		/*
+		 * Since the largest histogram bucket in the space map is
+		 * 2^(32+sm_shift-1), we need to normalize the values in
+		 * the range tree for any bucket larger than that size. For
+		 * example given an sm_shift of 9, ranges larger than 2^40
+		 * would get normalized as if they were 1TB ranges. Assume
+		 * the range tree had a count of 5 in the 2^44 (16TB) bucket,
+		 * the calculation below would normalize this to 5 * 2^4 (16).
+		 */
+		ASSERT3U(i, >=, idx + sm->sm_shift);
+		sm->sm_phys->smp_histogram[idx] +=
+		    rt->rt_histogram[i] << (i - idx - sm->sm_shift);
+
+		/*
+		 * Increment the space map's index as long as we haven't
+		 * reached the maximum bucket size. Accumulate all ranges
+		 * larger than the max bucket size into the last bucket.
+		 */
+		if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+			ASSERT3U(idx + sm->sm_shift, ==, i);
+			idx++;
+			ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+		}
+	}
 }
 
-void
-space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
+uint64_t
+space_map_entries(space_map_t *sm, range_tree_t *rt)
 {
-	space_map_add(sm, start, size);
-	sm->sm_ops->smop_free(sm, start, size);
+	avl_tree_t *t = &rt->rt_root;
+	range_seg_t *rs;
+	uint64_t size, entries;
+
+	/*
+	 * All space_maps always have a debug entry so account for it here.
+	 */
+	entries = 1;
+
+	/*
+	 * Traverse the range tree and calculate the number of space map
+	 * entries that would be required to write out the range tree.
+	 */
+	for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
+		size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
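+		/* Runs longer than SM_RUN_MAX need multiple entries. */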
+		entries += howmany(size, SM_RUN_MAX);
+	}
+	return (entries);
 }
 
 /*
- * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
+ * Note: space_map_write() will drop sm_lock across dmu_write() calls.
  */
 void
-space_map_sync(space_map_t *sm, uint8_t maptype,
-	space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
+space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+    dmu_tx_t *tx)
 {
+	objset_t *os = sm->sm_os;
 	spa_t *spa = dmu_objset_spa(os);
-	void *cookie = NULL;
-	space_seg_t *ss;
-	uint64_t bufsize, start, size, run_len;
+	avl_tree_t *t = &rt->rt_root;
+	range_seg_t *rs;
+	uint64_t size, total, rt_space, nodes;
 	uint64_t *entry, *entry_map, *entry_map_end;
+	uint64_t expected_entries, actual_entries = 1;
 
-	ASSERT(MUTEX_HELD(sm->sm_lock));
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+	VERIFY3U(space_map_object(sm), !=, 0);
+	dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+	/*
+	 * This field is no longer necessary since the in-core space map
+	 * now contains the object number, but it is maintained for
+	 * backwards compatibility.
+	 */
+	sm->sm_phys->smp_object = sm->sm_object;
 
-	if (sm->sm_space == 0)
+	if (range_tree_space(rt) == 0) {
+		VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
 		return;
-
-	dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
-	    smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
-	    maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
-	    sm->sm_space);
+	}
 
 	if (maptype == SM_ALLOC)
-		smo->smo_alloc += sm->sm_space;
+		sm->sm_phys->smp_alloc += range_tree_space(rt);
 	else
-		smo->smo_alloc -= sm->sm_space;
+		sm->sm_phys->smp_alloc -= range_tree_space(rt);
 
-	bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
-	bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
-	entry_map = zio_buf_alloc(bufsize);
-	entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
+	expected_entries = space_map_entries(sm, rt);
+
+	entry_map = zio_buf_alloc(sm->sm_blksz);
+	entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t));
 	entry = entry_map;
 
 	*entry++ = SM_DEBUG_ENCODE(1) |
@@ -435,22 +281,29 @@ space_map_sync(space_map_t *sm, uint8_t 
 	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
 	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
 
-	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
-		size = ss->ss_end - ss->ss_start;
-		start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
+	total = 0;
+	nodes = avl_numnodes(&rt->rt_root);
+	rt_space = range_tree_space(rt);
+	for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
+		uint64_t start;
+
+		size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+		start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
+
+		total += size << sm->sm_shift;
 
-		sm->sm_space -= size;
-		size >>= sm->sm_shift;
+		while (size != 0) {
+			uint64_t run_len;
 
-		while (size) {
 			run_len = MIN(size, SM_RUN_MAX);
 
 			if (entry == entry_map_end) {
-				mutex_exit(sm->sm_lock);
-				dmu_write(os, smo->smo_object, smo->smo_objsize,
-				    bufsize, entry_map, tx);
-				mutex_enter(sm->sm_lock);
-				smo->smo_objsize += bufsize;
+				mutex_exit(rt->rt_lock);
+				dmu_write(os, space_map_object(sm),
+				    sm->sm_phys->smp_objsize, sm->sm_blksz,
+				    entry_map, tx);
+				mutex_enter(rt->rt_lock);
+				sm->sm_phys->smp_objsize += sm->sm_blksz;
 				entry = entry_map;
 			}
 
@@ -460,157 +313,234 @@ space_map_sync(space_map_t *sm, uint8_t 
 
 			start += run_len;
 			size -= run_len;
+			actual_entries++;
 		}
-		kmem_free(ss, sizeof (*ss));
 	}
 
 	if (entry != entry_map) {
 		size = (entry - entry_map) * sizeof (uint64_t);
-		mutex_exit(sm->sm_lock);
-		dmu_write(os, smo->smo_object, smo->smo_objsize,
+		mutex_exit(rt->rt_lock);
+		dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize,
 		    size, entry_map, tx);
-		mutex_enter(sm->sm_lock);
-		smo->smo_objsize += size;
+		mutex_enter(rt->rt_lock);
+		sm->sm_phys->smp_objsize += size;
 	}
+	ASSERT3U(expected_entries, ==, actual_entries);
 
-	zio_buf_free(entry_map, bufsize);
+	/*
+	 * Ensure that the space_map's accounting wasn't changed
+	 * while we were in the middle of writing it out.
+	 */
+	VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
+	VERIFY3U(range_tree_space(rt), ==, rt_space);
+	VERIFY3U(range_tree_space(rt), ==, total);
 
-	VERIFY3U(sm->sm_space, ==, 0);
+	zio_buf_free(entry_map, sm->sm_blksz);
 }
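
The SM_*_ENCODE macros used above pack each map entry into a single 64-bit word; debug words use the SM_DEBUG_* group instead. A hedged sketch of the non-debug layout (the bit positions here are assumptions based on space_map.h: prefix bit 63, 47-bit offset at bits 16-62, 1-bit type at bit 15, and the run length stored as run-1 in bits 0-14):

#include <stdio.h>
#include <stdint.h>

/* Assumed one-word entry layout from space_map.h. */
#define	SM_OFFSET_ENCODE(x)	(((x) & ((1ULL << 47) - 1)) << 16)
#define	SM_OFFSET_DECODE(e)	(((e) >> 16) & ((1ULL << 47) - 1))
#define	SM_TYPE_ENCODE(x)	(((uint64_t)(x) & 1ULL) << 15)
#define	SM_TYPE_DECODE(e)	(((e) >> 15) & 1ULL)
#define	SM_RUN_ENCODE(x)	(((x) - 1) & 0x7fffULL)
#define	SM_RUN_DECODE(e)	(((e) & 0x7fffULL) + 1)

enum { SM_ALLOC = 0, SM_FREE = 1 };	/* assumed maptype_t values */

int
main(void)
{
	/* Encode a maximum-length (2^15 unit) ALLOC run at unit offset 1000. */
	uint64_t e = SM_OFFSET_ENCODE(1000ULL) | SM_TYPE_ENCODE(SM_ALLOC) |
	    SM_RUN_ENCODE(32768ULL);

	/* Prints: offset 1000 type 0 run 32768 */
	printf("offset %llu type %llu run %llu\n",
	    (unsigned long long)SM_OFFSET_DECODE(e),
	    (unsigned long long)SM_TYPE_DECODE(e),
	    (unsigned long long)SM_RUN_DECODE(e));
	return (0);
}
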
 
-void
-space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
+static int
+space_map_open_impl(space_map_t *sm)
 {
-	VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);
+	int error;
+	u_longlong_t blocks;
+
+	error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
+	if (error)
+		return (error);
 
-	smo->smo_objsize = 0;
-	smo->smo_alloc = 0;
+	dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
+	sm->sm_phys = sm->sm_dbuf->db_data;
+	return (0);
 }
 
-/*
- * Space map reference trees.
- *
- * A space map is a collection of integers.  Every integer is either
- * in the map, or it's not.  A space map reference tree generalizes
- * the idea: it allows its members to have arbitrary reference counts,
- * as opposed to the implicit reference count of 0 or 1 in a space map.
- * This representation comes in handy when computing the union or
- * intersection of multiple space maps.  For example, the union of
- * N space maps is the subset of the reference tree with refcnt >= 1.
- * The intersection of N space maps is the subset with refcnt >= N.
- *
- * [It's very much like a Fourier transform.  Unions and intersections
- * are hard to perform in the 'space map domain', so we convert the maps
- * into the 'reference count domain', where it's trivial, then invert.]
- *
- * vdev_dtl_reassess() uses computations of this form to determine
- * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
- * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
- * has an outage wherever refcnt >= vdev_children.
- */
-static int
-space_map_ref_compare(const void *x1, const void *x2)
+int
+space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
+    uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp)
 {
-	const space_ref_t *sr1 = x1;
-	const space_ref_t *sr2 = x2;
+	space_map_t *sm;
+	int error;
+
+	ASSERT(*smp == NULL);
+	ASSERT(os != NULL);
+	ASSERT(object != 0);
 
-	if (sr1->sr_offset < sr2->sr_offset)
-		return (-1);
-	if (sr1->sr_offset > sr2->sr_offset)
-		return (1);
-
-	if (sr1 < sr2)
-		return (-1);
-	if (sr1 > sr2)
-		return (1);
+	sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
+
+	sm->sm_start = start;
+	sm->sm_size = size;
+	sm->sm_shift = shift;
+	sm->sm_lock = lp;
+	sm->sm_os = os;
+	sm->sm_object = object;
+
+	error = space_map_open_impl(sm);
+	if (error != 0) {
+		space_map_close(sm);
+		return (error);
+	}
+
+	*smp = sm;
 
 	return (0);
 }
 
 void
-space_map_ref_create(avl_tree_t *t)
+space_map_close(space_map_t *sm)
 {
-	avl_create(t, space_map_ref_compare,
-	    sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
+	if (sm == NULL)
+		return;
+
+	if (sm->sm_dbuf != NULL)
+		dmu_buf_rele(sm->sm_dbuf, sm);
+	sm->sm_dbuf = NULL;
+	sm->sm_phys = NULL;
+
+	kmem_free(sm, sizeof (*sm));
 }
 
 void
-space_map_ref_destroy(avl_tree_t *t)
+space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
 {
-	space_ref_t *sr;
-	void *cookie = NULL;
+	objset_t *os = sm->sm_os;
+	spa_t *spa = dmu_objset_spa(os);
+	dmu_object_info_t doi;
 
-	while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
-		kmem_free(sr, sizeof (*sr));
+	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+	ASSERT(dmu_tx_is_syncing(tx));
 
-	avl_destroy(t);
-}
+	dmu_object_info_from_db(sm->sm_dbuf, &doi);
 
-static void
-space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
-{
-	space_ref_t *sr;
+	/*
+	 * If the space map has the wrong bonus size (because
+	 * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
+	 * the wrong block size (because space_map_blksz has changed),
+	 * free and re-allocate its object with the updated sizes.
+	 *
+	 * Otherwise, just truncate the current object.
+	 */
+	if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
+	    doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
+	    doi.doi_data_block_size != space_map_blksz) {
+		zfs_dbgmsg("txg %llu, spa %s, reallocating: "
+		    "old bonus %u, old blocksz %u", dmu_tx_get_txg(tx),
+		    spa_name(spa), doi.doi_bonus_size, doi.doi_data_block_size);
 
-	sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
-	sr->sr_offset = offset;
-	sr->sr_refcnt = refcnt;
+		space_map_free(sm, tx);
+		dmu_buf_rele(sm->sm_dbuf, sm);
 
-	avl_add(t, sr);
-}
+		sm->sm_object = space_map_alloc(sm->sm_os, tx);
+		VERIFY0(space_map_open_impl(sm));
+	} else {
+		VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
 
-void
-space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
-	int64_t refcnt)
-{
-	space_map_ref_add_node(t, start, refcnt);
-	space_map_ref_add_node(t, end, -refcnt);
+		/*
+		 * If the space map is reallocated, its histogram
+		 * will be reset.  Do the same in the common case so that
+		 * bugs related to the uncommon case do not go unnoticed.
+		 */
+		bzero(sm->sm_phys->smp_histogram,
+		    sizeof (sm->sm_phys->smp_histogram));
+	}
+
+	dmu_buf_will_dirty(sm->sm_dbuf, tx);
+	sm->sm_phys->smp_objsize = 0;
+	sm->sm_phys->smp_alloc = 0;
 }
 
 /*
- * Convert (or add) a space map into a reference tree.
+ * Update the in-core space_map allocation and length values.
  */
 void
-space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt)
+space_map_update(space_map_t *sm)
 {
-	space_seg_t *ss;
+	if (sm == NULL)
+		return;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
 
-	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-		space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt);
+	sm->sm_alloc = sm->sm_phys->smp_alloc;
+	sm->sm_length = sm->sm_phys->smp_objsize;
 }
 
-/*
- * Convert a reference tree into a space map.  The space map will contain
- * all members of the reference tree for which refcnt >= minref.
- */
-void
-space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref)
+uint64_t
+space_map_alloc(objset_t *os, dmu_tx_t *tx)
 {
-	uint64_t start = -1ULL;
-	int64_t refcnt = 0;
-	space_ref_t *sr;
+	spa_t *spa = dmu_objset_spa(os);
+	uint64_t object;
+	int bonuslen;
 
-	ASSERT(MUTEX_HELD(sm->sm_lock));
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+		spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
+		bonuslen = sizeof (space_map_phys_t);
+		ASSERT3U(bonuslen, <=, dmu_bonus_max());
+	} else {
+		bonuslen = SPACE_MAP_SIZE_V0;
+	}
 
-	space_map_vacate(sm, NULL, NULL);
+	object = dmu_object_alloc(os,
+	    DMU_OT_SPACE_MAP, space_map_blksz,
+	    DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
 
-	for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
-		refcnt += sr->sr_refcnt;
-		if (refcnt >= minref) {
-			if (start == -1ULL) {
-				start = sr->sr_offset;
-			}
-		} else {
-			if (start != -1ULL) {
-				uint64_t end = sr->sr_offset;
-				ASSERT(start <= end);
-				if (end > start)
-					space_map_add(sm, start, end - start);
-				start = -1ULL;
-			}
+	return (object);
+}
+
+void
+space_map_free(space_map_t *sm, dmu_tx_t *tx)
+{
+	spa_t *spa;
+
+	if (sm == NULL)
+		return;
+
+	spa = dmu_objset_spa(sm->sm_os);
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+		dmu_object_info_t doi;
+
+		dmu_object_info_from_db(sm->sm_dbuf, &doi);
+		if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
+			VERIFY(spa_feature_is_active(spa,
+			    SPA_FEATURE_SPACEMAP_HISTOGRAM));
+			spa_feature_decr(spa,
+			    SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
 		}
 	}
-	ASSERT(refcnt == 0);
-	ASSERT(start == -1ULL);
+
+	VERIFY3U(dmu_object_free(sm->sm_os, space_map_object(sm), tx), ==, 0);
+	sm->sm_object = 0;
+}
+
+uint64_t
+space_map_object(space_map_t *sm)
+{
+	return (sm != NULL ? sm->sm_object : 0);
+}
+
+/*
+ * Returns the already synced, on-disk allocated space.
+ */
+uint64_t
+space_map_allocated(space_map_t *sm)
+{
+	return (sm != NULL ? sm->sm_alloc : 0);
+}
+
+/*
+ * Returns the already synced, on-disk length.
+ */
+uint64_t
+space_map_length(space_map_t *sm)
+{
+	return (sm != NULL ? sm->sm_length : 0);
+}
+
+/*
+ * Returns the allocated space that is currently syncing.
+ */
+int64_t
+space_map_alloc_delta(space_map_t *sm)
+{
+	if (sm == NULL)
+		return (0);
+	ASSERT(sm->sm_dbuf != NULL);
+	return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/space_reftree.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/space_reftree.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/space_reftree.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/space_reftree.c	22 Nov 2015 17:22:31 -0000
@@ -0,0 +1,159 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/range_tree.h>
+#include <sys/space_reftree.h>
+
+/*
+ * Space reference trees.
+ *
+ * A range tree is a collection of integers.  Every integer is either
+ * in the tree, or it's not.  A space reference tree generalizes
+ * the idea: it allows its members to have arbitrary reference counts,
+ * as opposed to the implicit reference count of 0 or 1 in a range tree.
+ * This representation comes in handy when computing the union or
+ * intersection of multiple space maps.  For example, the union of
+ * N range trees is the subset of the reference tree with refcnt >= 1.
+ * The intersection of N range trees is the subset with refcnt >= N.
+ *
+ * [It's very much like a Fourier transform.  Unions and intersections
+ * are hard to perform in the 'range tree domain', so we convert the trees
+ * into the 'reference count domain', where it's trivial, then invert.]
+ *
+ * vdev_dtl_reassess() uses computations of this form to determine
+ * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
+ * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
+ * has an outage wherever refcnt >= vdev_children.
+ */
+static int
+space_reftree_compare(const void *x1, const void *x2)
+{
+	const space_ref_t *sr1 = x1;
+	const space_ref_t *sr2 = x2;
+
+	if (sr1->sr_offset < sr2->sr_offset)
+		return (-1);
+	if (sr1->sr_offset > sr2->sr_offset)
+		return (1);
+
+	if (sr1 < sr2)
+		return (-1);
+	if (sr1 > sr2)
+		return (1);
+
+	return (0);
+}
+
+void
+space_reftree_create(avl_tree_t *t)
+{
+	avl_create(t, space_reftree_compare,
+	    sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
+}
+
+void
+space_reftree_destroy(avl_tree_t *t)
+{
+	space_ref_t *sr;
+	void *cookie = NULL;
+
+	while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
+		kmem_free(sr, sizeof (*sr));
+
+	avl_destroy(t);
+}
+
+static void
+space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
+{
+	space_ref_t *sr;
+
+	sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
+	sr->sr_offset = offset;
+	sr->sr_refcnt = refcnt;
+
+	avl_add(t, sr);
+}
+
+void
+space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
+    int64_t refcnt)
+{
+	space_reftree_add_node(t, start, refcnt);
+	space_reftree_add_node(t, end, -refcnt);
+}
+
+/*
+ * Convert (or add) a range tree into a reference tree.
+ */
+void
+space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt)
+{
+	range_seg_t *rs;
+
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+
+	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
+		space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt);
+}
+
+/*
+ * Convert a reference tree into a range tree.  The range tree will contain
+ * all members of the reference tree for which refcnt >= minref.
+ */
+void
+space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref)
+{
+	uint64_t start = -1ULL;
+	int64_t refcnt = 0;
+	space_ref_t *sr;
+
+	ASSERT(MUTEX_HELD(rt->rt_lock));
+
+	range_tree_vacate(rt, NULL, NULL);
+
+	for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
+		refcnt += sr->sr_refcnt;
+		if (refcnt >= minref) {
+			if (start == -1ULL) {
+				start = sr->sr_offset;
+			}
+		} else {
+			if (start != -1ULL) {
+				uint64_t end = sr->sr_offset;
+				ASSERT(start <= end);
+				if (end > start)
+					range_tree_add(rt, start, end - start);
+				start = -1ULL;
+			}
+		}
+	}
+	ASSERT(refcnt == 0);
+	ASSERT(start == -1ULL);
+}
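
The sweep in space_reftree_generate_map() is easy to model outside the kernel. Below is a standalone sketch (an illustration, not part of the patch) that uses a sorted array of boundary points in place of the AVL tree; the real comparator additionally breaks equal-offset ties by node address. It computes the union (minref = 1) and intersection (minref = 2) of two ranges:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Boundary point: +refcnt at a segment start, -refcnt at its end. */
typedef struct {
	uint64_t off;
	int64_t ref;
} ref_t;

static int
ref_cmp(const void *a, const void *b)
{
	const ref_t *x = a;
	const ref_t *y = b;

	return (x->off < y->off ? -1 : x->off > y->off);
}

/* Emit every interval where the running refcount is >= minref. */
static void
generate(ref_t *r, int n, int64_t minref)
{
	uint64_t start = UINT64_MAX;	/* mirrors the -1ULL sentinel above */
	int64_t refcnt = 0;

	qsort(r, n, sizeof (ref_t), ref_cmp);
	for (int i = 0; i < n; i++) {
		refcnt += r[i].ref;
		if (refcnt >= minref) {
			if (start == UINT64_MAX)
				start = r[i].off;
		} else if (start != UINT64_MAX) {
			printf("[%llu, %llu)\n", (unsigned long long)start,
			    (unsigned long long)r[i].off);
			start = UINT64_MAX;
		}
	}
}

int
main(void)
{
	/* Range tree A covers [0,100); range tree B covers [50,150). */
	ref_t r[] = { { 0, 1 }, { 100, -1 }, { 50, 1 }, { 150, -1 } };

	printf("union (minref 1):\n");
	generate(r, 4, 1);	/* prints [0, 150) */
	printf("intersection (minref 2):\n");
	generate(r, 4, 2);	/* prints [50, 100) */
	return (0);
}
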
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/trim_map.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/trim_map.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/trim_map.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/trim_map.c	22 Apr 2017 10:02:25 -0000
@@ -0,0 +1,657 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/trim_map.h>
+#include <sys/time.h>
+
+/*
+ * Calculate the zio end, upgrading based on ashift as would be
+ * done by zio_vdev_io_start.
+ *
+ * This makes free range consolidation much more effective than it
+ * would otherwise be, and also ensures that entire blocks are
+ * invalidated by writes.
+ */
+#define	TRIM_ZIO_END(vd, offset, size)	(offset +		\
+	P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift))
+
+/* Maximum segment size for ATA TRIM. */
+#define TRIM_MAP_SIZE_FACTOR	(512 << 16)
+
+#define TRIM_MAP_SEGS(size)	(1 + (size) / TRIM_MAP_SIZE_FACTOR)
+
+#define TRIM_MAP_ADD(tm, ts)	do {				\
+	list_insert_tail(&(tm)->tm_head, (ts));			\
+	(tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
+} while (0)
+
+#define TRIM_MAP_REM(tm, ts)	do {				\
+	list_remove(&(tm)->tm_head, (ts));			\
+	(tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
+} while (0)
+
+typedef struct trim_map {
+	list_t		tm_head;		/* List of segments sorted by txg. */
+	avl_tree_t	tm_queued_frees;	/* AVL tree of segments waiting for TRIM. */
+	avl_tree_t	tm_inflight_frees;	/* AVL tree of in-flight TRIMs. */
+	avl_tree_t	tm_inflight_writes;	/* AVL tree of in-flight writes. */
+	list_t		tm_pending_writes;	/* Writes blocked on in-flight frees. */
+	kmutex_t	tm_lock;
+	uint64_t	tm_pending;		/* Count of pending TRIMs. */
+} trim_map_t;
+
+typedef struct trim_seg {
+	avl_node_t	ts_node;	/* AVL node. */
+	list_node_t	ts_next;	/* List element. */
+	uint64_t	ts_start;	/* Starting offset of this segment. */
+	uint64_t	ts_end;		/* Ending offset (non-inclusive). */
+	uint64_t	ts_txg;		/* Segment creation txg. */
+	hrtime_t	ts_time;	/* Segment creation time. */
+} trim_seg_t;
+
+extern boolean_t zfs_trim_enabled;
+
+static u_int trim_txg_delay = 32;	/* Keep deleted data up to 32 TXGs */
+static u_int trim_timeout = 30;		/* Keep deleted data up to 30s */
+static u_int trim_max_interval = 1;	/* 1s delays between TRIMs */
+static u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM");
+
+SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay,
+    0, "Delay TRIMs by up to this many TXGs");
+SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0,
+    "Delay TRIMs by up to this many seconds");
+SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN,
+    &trim_max_interval, 0,
+    "Maximum interval between TRIM queue processing (seconds)");
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN,
+    &trim_vdev_max_pending, 0,
+    "Maximum pending TRIM segments for a vdev");
+
+static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);
+
+static int
+trim_map_seg_compare(const void *x1, const void *x2)
+{
+	const trim_seg_t *s1 = x1;
+	const trim_seg_t *s2 = x2;
+
+	if (s1->ts_start < s2->ts_start) {
+		if (s1->ts_end > s2->ts_start)
+			return (0);
+		return (-1);
+	}
+	if (s1->ts_start > s2->ts_start) {
+		if (s1->ts_start < s2->ts_end)
+			return (0);
+		return (1);
+	}
+	return (0);
+}
+
+static int
+trim_map_zio_compare(const void *x1, const void *x2)
+{
+	const zio_t *z1 = x1;
+	const zio_t *z2 = x2;
+
+	if (z1->io_offset < z2->io_offset) {
+		if (z1->io_offset + z1->io_size > z2->io_offset)
+			return (0);
+		return (-1);
+	}
+	if (z1->io_offset > z2->io_offset) {
+		if (z1->io_offset < z2->io_offset + z2->io_size)
+			return (0);
+		return (1);
+	}
+	return (0);
+}
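
Both comparators above deliberately report overlapping ranges as equal, so an avl_find() on the tree locates any colliding node rather than only exact matches. A minimal standalone sketch of that overlap-is-equal rule (field names abbreviated; end offsets are exclusive, as in trim_seg_t):

#include <stdio.h>
#include <stdint.h>

typedef struct {
	uint64_t start;
	uint64_t end;
} seg_t;

/* Same overlap-is-equal rule as trim_map_seg_compare() above. */
static int
seg_cmp(const seg_t *a, const seg_t *b)
{
	if (a->start < b->start)
		return (a->end > b->start ? 0 : -1);
	if (a->start > b->start)
		return (a->start < b->end ? 0 : 1);
	return (0);
}

int
main(void)
{
	seg_t queued = { 100, 200 };
	seg_t w1 = { 150, 160 };	/* overlaps, compares "equal" */
	seg_t w2 = { 200, 300 };	/* end is exclusive, so disjoint */

	/* Prints: 0 1 */
	printf("%d %d\n", seg_cmp(&w1, &queued), seg_cmp(&w2, &queued));
	return (0);
}
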
+
+void
+trim_map_create(vdev_t *vd)
+{
+	trim_map_t *tm;
+
+	ASSERT(zfs_trim_enabled && !vd->vdev_notrim &&
+		vd->vdev_ops->vdev_op_leaf);
+
+	tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
+	mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&tm->tm_head, sizeof (trim_seg_t),
+	    offsetof(trim_seg_t, ts_next));
+	list_create(&tm->tm_pending_writes, sizeof (zio_t),
+	    offsetof(zio_t, io_trim_link));
+	avl_create(&tm->tm_queued_frees, trim_map_seg_compare,
+	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
+	avl_create(&tm->tm_inflight_frees, trim_map_seg_compare,
+	    sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node));
+	avl_create(&tm->tm_inflight_writes, trim_map_zio_compare,
+	    sizeof (zio_t), offsetof(zio_t, io_trim_node));
+	vd->vdev_trimmap = tm;
+}
+
+void
+trim_map_destroy(vdev_t *vd)
+{
+	trim_map_t *tm;
+	trim_seg_t *ts;
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	if (!zfs_trim_enabled)
+		return;
+
+	tm = vd->vdev_trimmap;
+	if (tm == NULL)
+		return;
+
+	/*
+	 * We may have been called before trim_map_vdev_commit_done()
+	 * had a chance to run, so do it now to prune the remaining
+	 * inflight frees.
+	 */
+	trim_map_vdev_commit_done(vd->vdev_spa, vd);
+
+	mutex_enter(&tm->tm_lock);
+	while ((ts = list_head(&tm->tm_head)) != NULL) {
+		avl_remove(&tm->tm_queued_frees, ts);
+		TRIM_MAP_REM(tm, ts);
+		kmem_free(ts, sizeof (*ts));
+	}
+	mutex_exit(&tm->tm_lock);
+
+	avl_destroy(&tm->tm_queued_frees);
+	avl_destroy(&tm->tm_inflight_frees);
+	avl_destroy(&tm->tm_inflight_writes);
+	list_destroy(&tm->tm_pending_writes);
+	list_destroy(&tm->tm_head);
+	mutex_destroy(&tm->tm_lock);
+	kmem_free(tm, sizeof (*tm));
+	vd->vdev_trimmap = NULL;
+}
+
+static void
+trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
+{
+	avl_index_t where;
+	trim_seg_t tsearch, *ts_before, *ts_after, *ts;
+	boolean_t merge_before, merge_after;
+	hrtime_t time;
+
+	ASSERT(MUTEX_HELD(&tm->tm_lock));
+	VERIFY(start < end);
+
+	time = gethrtime();
+	tsearch.ts_start = start;
+	tsearch.ts_end = end;
+
+	ts = avl_find(&tm->tm_queued_frees, &tsearch, &where);
+	if (ts != NULL) {
+		if (start < ts->ts_start)
+			trim_map_segment_add(tm, start, ts->ts_start, txg);
+		if (end > ts->ts_end)
+			trim_map_segment_add(tm, ts->ts_end, end, txg);
+		return;
+	}
+
+	ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
+	ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);
+
+	merge_before = (ts_before != NULL && ts_before->ts_end == start);
+	merge_after = (ts_after != NULL && ts_after->ts_start == end);
+
+	if (merge_before && merge_after) {
+		avl_remove(&tm->tm_queued_frees, ts_before);
+		TRIM_MAP_REM(tm, ts_before);
+		TRIM_MAP_REM(tm, ts_after);
+		ts_after->ts_start = ts_before->ts_start;
+		ts_after->ts_txg = txg;
+		ts_after->ts_time = time;
+		TRIM_MAP_ADD(tm, ts_after);
+		kmem_free(ts_before, sizeof (*ts_before));
+	} else if (merge_before) {
+		TRIM_MAP_REM(tm, ts_before);
+		ts_before->ts_end = end;
+		ts_before->ts_txg = txg;
+		ts_before->ts_time = time;
+		TRIM_MAP_ADD(tm, ts_before);
+	} else if (merge_after) {
+		TRIM_MAP_REM(tm, ts_after);
+		ts_after->ts_start = start;
+		ts_after->ts_txg = txg;
+		ts_after->ts_time = time;
+		TRIM_MAP_ADD(tm, ts_after);
+	} else {
+		ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
+		ts->ts_start = start;
+		ts->ts_end = end;
+		ts->ts_txg = txg;
+		ts->ts_time = time;
+		avl_insert(&tm->tm_queued_frees, ts, where);
+		TRIM_MAP_ADD(tm, ts);
+	}
+}
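
The four branches above reduce to a case analysis on whether the new free range touches its neighbours. A standalone sketch of just that decision (a simplification: the real code also re-stamps ts_txg and ts_time on whichever segment survives the merge):

#include <stdio.h>
#include <stdint.h>

typedef struct {
	uint64_t start;
	uint64_t end;
} seg_t;

/* Outcome of adding a free range [s, e) between two queued neighbours. */
static const char *
add_case(const seg_t *before, const seg_t *after, uint64_t s, uint64_t e)
{
	int merge_before = (before != NULL && before->end == s);
	int merge_after = (after != NULL && after->start == e);

	if (merge_before && merge_after)
		return ("join both neighbours into one segment");
	if (merge_before)
		return ("extend the left neighbour");
	if (merge_after)
		return ("extend the right neighbour");
	return ("insert a new segment");
}

int
main(void)
{
	seg_t left = { 0, 100 };
	seg_t right = { 200, 300 };

	printf("%s\n", add_case(&left, &right, 100, 200)); /* join both */
	printf("%s\n", add_case(&left, &right, 100, 150)); /* extend left */
	printf("%s\n", add_case(&left, &right, 120, 150)); /* insert new */
	return (0);
}
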
+
+static void
+trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start,
+    uint64_t end)
+{
+	trim_seg_t *nts;
+	boolean_t left_over, right_over;
+
+	ASSERT(MUTEX_HELD(&tm->tm_lock));
+
+	left_over = (ts->ts_start < start);
+	right_over = (ts->ts_end > end);
+
+	TRIM_MAP_REM(tm, ts);
+	if (left_over && right_over) {
+		nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
+		nts->ts_start = end;
+		nts->ts_end = ts->ts_end;
+		nts->ts_txg = ts->ts_txg;
+		nts->ts_time = ts->ts_time;
+		ts->ts_end = start;
+		avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
+		TRIM_MAP_ADD(tm, ts);
+		TRIM_MAP_ADD(tm, nts);
+	} else if (left_over) {
+		ts->ts_end = start;
+		TRIM_MAP_ADD(tm, ts);
+	} else if (right_over) {
+		ts->ts_start = end;
+		TRIM_MAP_ADD(tm, ts);
+	} else {
+		avl_remove(&tm->tm_queued_frees, ts);
+		kmem_free(ts, sizeof (*ts));
+	}
+}
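
Symmetrically, removing [start, end) from a queued segment leaves zero, one, or two pieces. A standalone sketch of the left_over/right_over case analysis above:

#include <stdio.h>
#include <stdint.h>

/* Print what survives when [start, end) is removed from [ts_start, ts_end). */
static void
remove_range(uint64_t ts_start, uint64_t ts_end, uint64_t start, uint64_t end)
{
	int left_over = (ts_start < start);
	int right_over = (ts_end > end);

	if (!left_over && !right_over) {
		printf("segment fully consumed\n");
		return;
	}
	if (left_over)
		printf("keep [%llu, %llu) ",
		    (unsigned long long)ts_start, (unsigned long long)start);
	if (right_over)
		printf("keep [%llu, %llu)",
		    (unsigned long long)end, (unsigned long long)ts_end);
	printf("\n");
}

int
main(void)
{
	remove_range(100, 200, 120, 150); /* split: [100,120) and [150,200) */
	remove_range(100, 200, 100, 150); /* trim left: keep [150,200) */
	remove_range(100, 200, 100, 200); /* fully consumed */
	return (0);
}
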
+
+static void
+trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg)
+{
+	zio_t zsearch, *zs;
+
+	ASSERT(MUTEX_HELD(&tm->tm_lock));
+
+	zsearch.io_offset = start;
+	zsearch.io_size = end - start;
+
+	zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL);
+	if (zs == NULL) {
+		trim_map_segment_add(tm, start, end, txg);
+		return;
+	}
+	if (start < zs->io_offset)
+		trim_map_free_locked(tm, start, zs->io_offset, txg);
+	if (zs->io_offset + zs->io_size < end)
+		trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg);
+}
+
+void
+trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
+{
+	trim_map_t *tm = vd->vdev_trimmap;
+
+	if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
+		return;
+
+	mutex_enter(&tm->tm_lock);
+	trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg);
+	mutex_exit(&tm->tm_lock);
+}
+
+boolean_t
+trim_map_write_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	trim_map_t *tm = vd->vdev_trimmap;
+	trim_seg_t tsearch, *ts;
+	boolean_t left_over, right_over;
+	uint64_t start, end;
+
+	if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL)
+		return (B_TRUE);
+
+	start = zio->io_offset;
+	end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size);
+	tsearch.ts_start = start;
+	tsearch.ts_end = end;
+
+	mutex_enter(&tm->tm_lock);
+
+	/*
+	 * Check for colliding in-flight frees.
+	 */
+	ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL);
+	if (ts != NULL) {
+		list_insert_tail(&tm->tm_pending_writes, zio);
+		mutex_exit(&tm->tm_lock);
+		return (B_FALSE);
+	}
+
+	ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
+	if (ts != NULL) {
+		/*
+		 * Loop until all overlapping segments are removed.
+		 */
+		do {
+			trim_map_segment_remove(tm, ts, start, end);
+			ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL);
+		} while (ts != NULL);
+	}
+	avl_add(&tm->tm_inflight_writes, zio);
+
+	mutex_exit(&tm->tm_lock);
+
+	return (B_TRUE);
+}
+
+void
+trim_map_write_done(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	trim_map_t *tm = vd->vdev_trimmap;
+
+	/*
+	 * Don't check for vdev_notrim, since the write could have
+	 * started before vdev_notrim was set.
+	 */
+	if (!zfs_trim_enabled || tm == NULL)
+		return;
+
+	mutex_enter(&tm->tm_lock);
+	/*
+	 * Don't fail if the write isn't in the tree, since the write
+	 * could have started after vdev_notrim was set.
+	 */
+	if (zio->io_trim_node.avl_child[0] ||
+	    zio->io_trim_node.avl_child[1] ||
+	    AVL_XPARENT(&zio->io_trim_node) ||
+	    tm->tm_inflight_writes.avl_root == &zio->io_trim_node)
+		avl_remove(&tm->tm_inflight_writes, zio);
+	mutex_exit(&tm->tm_lock);
+}
+
+/*
+ * Return the oldest segment (the one with the lowest txg / time) or NULL if:
+ * 1. The list is empty
+ * 2. The first element's txg is greater than txgsafe
+ * 3. The first element's txg is greater than the txg argument, its time
+ *    is greater than the time argument, and force is not set
+ */
+static trim_seg_t *
+trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time,
+    boolean_t force)
+{
+	trim_seg_t *ts;
+
+	ASSERT(MUTEX_HELD(&tm->tm_lock));
+	VERIFY(txgsafe >= txg);
+
+	ts = list_head(&tm->tm_head);
+	if (ts != NULL && ts->ts_txg <= txgsafe &&
+	    (ts->ts_txg <= txg || ts->ts_time <= time || force))
+		return (ts);
+	return (NULL);
+}
+
+static void
+trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
+{
+	trim_map_t *tm = vd->vdev_trimmap;
+	trim_seg_t *ts;
+	uint64_t size, offset, txgtarget, txgsafe;
+	int64_t hard, soft;
+	hrtime_t timelimit;
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	if (tm == NULL)
+		return;
+
+	timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC;
+	if (vd->vdev_isl2cache) {
+		txgsafe = UINT64_MAX;
+		txgtarget = UINT64_MAX;
+	} else {
+		txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
+		if (txgsafe > trim_txg_delay)
+			txgtarget = txgsafe - trim_txg_delay;
+		else
+			txgtarget = 0;
+	}
+
+	mutex_enter(&tm->tm_lock);
+	hard = 0;
+	if (tm->tm_pending > trim_vdev_max_pending)
+		hard = (tm->tm_pending - trim_vdev_max_pending) / 4;
+	soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64);
+	/* Loop until we have sent all outstanding frees. */
+	while (soft > 0 &&
+	    (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0))
+	    != NULL) {
+		TRIM_MAP_REM(tm, ts);
+		avl_remove(&tm->tm_queued_frees, ts);
+		avl_add(&tm->tm_inflight_frees, ts);
+		size = ts->ts_end - ts->ts_start;
+		offset = ts->ts_start;
+		/*
+		 * We drop the lock while we call zio_nowait, as the I/O
+		 * scheduler can run a different I/O (e.g. a write), which
+		 * would otherwise result in a recursive lock.
+		 */
+		mutex_exit(&tm->tm_lock);
+
+		zio_nowait(zio_trim(zio, spa, vd, offset, size));
+
+		soft -= TRIM_MAP_SEGS(size);
+		hard -= TRIM_MAP_SEGS(size);
+		mutex_enter(&tm->tm_lock);
+	}
+	mutex_exit(&tm->tm_lock);
+}
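
The hard/soft arithmetic above bounds how many TRIM segments one pass issues: "hard" forces out a quarter of the backlog beyond trim_vdev_max_pending, and "soft" adds the steady-state drain rate, rounded up to 64. A worked sketch with the default tunables (P2ROUNDUP defined as in the illumos headers):

#include <stdio.h>
#include <stdint.h>

#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	int64_t tm_pending = 50000;	/* queued TRIM segments */
	int64_t max_pending = 10000;	/* trim_vdev_max_pending default */
	int64_t timeout = 30;		/* trim_timeout default */
	int64_t hard = 0, soft;

	if (tm_pending > max_pending)
		hard = (tm_pending - max_pending) / 4;
	soft = P2ROUNDUP(hard + tm_pending / timeout + 1, (int64_t)64);

	/* hard = 10000 forced; soft = 11712 (10000 + 1666 + 1, rounded). */
	printf("hard %lld soft %lld\n", (long long)hard, (long long)soft);
	return (0);
}
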
+
+static void
+trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd)
+{
+	trim_map_t *tm = vd->vdev_trimmap;
+	trim_seg_t *ts;
+	list_t pending_writes;
+	zio_t *zio;
+	uint64_t start, size;
+	void *cookie;
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	if (tm == NULL)
+		return;
+
+	mutex_enter(&tm->tm_lock);
+	if (!avl_is_empty(&tm->tm_inflight_frees)) {
+		cookie = NULL;
+		while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees,
+		    &cookie)) != NULL) {
+			kmem_free(ts, sizeof (*ts));
+		}
+	}
+	list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t,
+	    io_trim_link));
+	list_move_tail(&pending_writes, &tm->tm_pending_writes);
+	mutex_exit(&tm->tm_lock);
+
+	while ((zio = list_remove_head(&pending_writes)) != NULL) {
+		zio_vdev_io_reissue(zio);
+		zio_execute(zio);
+	}
+	list_destroy(&pending_writes);
+}
+
+static void
+trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
+{
+	int c;
+
+	if (vd == NULL)
+		return;
+
+	if (vd->vdev_ops->vdev_op_leaf) {
+		trim_map_vdev_commit(spa, zio, vd);
+	} else {
+		for (c = 0; c < vd->vdev_children; c++)
+			trim_map_commit(spa, zio, vd->vdev_child[c]);
+	}
+}
+
+static void
+trim_map_commit_done(spa_t *spa, vdev_t *vd)
+{
+	int c;
+
+	if (vd == NULL)
+		return;
+
+	if (vd->vdev_ops->vdev_op_leaf) {
+		trim_map_vdev_commit_done(spa, vd);
+	} else {
+		for (c = 0; c < vd->vdev_children; c++)
+			trim_map_commit_done(spa, vd->vdev_child[c]);
+	}
+}
+
+static void
+trim_thread(void *arg)
+{
+	spa_t *spa = arg;
+	zio_t *zio;
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	(void) snprintf(curthread->td_name, sizeof(curthread->td_name),
+	    "trim %s", spa_name(spa));
+#endif
+#endif
+#ifdef __NetBSD__
+#ifdef _KERNEL
+	size_t sz;
+	char *name, *oname;
+	struct lwp *l = curlwp;
+
+	name = kmem_alloc(MAXCOMLEN, KM_SLEEP);
+	snprintf(name, MAXCOMLEN, "trim %s", spa_name(spa));
+	name[MAXCOMLEN - 1] = 0;
+
+	lwp_lock(l);
+	oname = l->l_name;
+	l->l_name = name;
+	lwp_unlock(l);
+
+	if (oname != NULL)
+		kmem_free(oname, MAXCOMLEN);
+#endif
+#endif
+
+	for (;;) {
+		mutex_enter(&spa->spa_trim_lock);
+		if (spa->spa_trim_thread == NULL) {
+			spa->spa_trim_thread = curthread;
+			cv_signal(&spa->spa_trim_cv);
+			mutex_exit(&spa->spa_trim_lock);
+			thread_exit();
+		}
+
+		(void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock,
+		    hz * trim_max_interval);
+		mutex_exit(&spa->spa_trim_lock);
+
+		zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+		trim_map_commit(spa, zio, spa->spa_root_vdev);
+		(void) zio_wait(zio);
+		trim_map_commit_done(spa, spa->spa_root_vdev);
+		spa_config_exit(spa, SCL_STATE, FTAG);
+	}
+}
+
+void
+trim_thread_create(spa_t *spa)
+{
+
+	if (!zfs_trim_enabled)
+		return;
+
+	mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL);
+	mutex_enter(&spa->spa_trim_lock);
+	spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0,
+	    TS_RUN, minclsyspri);
+	mutex_exit(&spa->spa_trim_lock);
+}
+
+void
+trim_thread_destroy(spa_t *spa)
+{
+
+	if (!zfs_trim_enabled)
+		return;
+	if (spa->spa_trim_thread == NULL)
+		return;
+
+	mutex_enter(&spa->spa_trim_lock);
+	/* Setting spa_trim_thread to NULL tells the thread to stop. */
+	spa->spa_trim_thread = NULL;
+	cv_signal(&spa->spa_trim_cv);
+	/* The thread will set it back to != NULL on exit. */
+	while (spa->spa_trim_thread == NULL)
+		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
+	spa->spa_trim_thread = NULL;
+	mutex_exit(&spa->spa_trim_lock);
+
+	cv_destroy(&spa->spa_trim_cv);
+	mutex_destroy(&spa->spa_trim_lock);
+}
+
+void
+trim_thread_wakeup(spa_t *spa)
+{
+
+	if (!zfs_trim_enabled)
+		return;
+	if (spa->spa_trim_thread == NULL)
+		return;
+
+	mutex_enter(&spa->spa_trim_lock);
+	cv_signal(&spa->spa_trim_cv);
+	mutex_exit(&spa->spa_trim_lock);
+}
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c,v
retrieving revision 1.4
diff -u -p -r1.4 txg.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c	20 Nov 2011 02:54:25 -0000	1.4
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c	4 Feb 2015 07:24:17 -0000
@@ -19,8 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org>
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -28,16 +29,91 @@
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
 #include <sys/callb.h>
 
 /*
- * Pool-wide transaction groups.
- */
-
-static void txg_sync_thread(void *);
-static void txg_quiesce_thread(void *);
-
-int zfs_txg_timeout = 30;	/* max seconds worth of delta per txg */
+ * ZFS Transaction Groups
+ * ----------------------
+ *
+ * ZFS transaction groups are, as the name implies, groups of transactions
+ * that act on persistent state. ZFS asserts consistency at the granularity of
+ * these transaction groups. Each successive transaction group (txg) is
+ * assigned a 64-bit consecutive identifier. There are three active
+ * transaction group states: open, quiescing, or syncing. At any given time,
+ * there may be an active txg associated with each state; each active txg may
+ * either be processing, or blocked waiting to enter the next state. There may
+ * be up to three active txgs, and there is always a txg in the open state
+ * (though it may be blocked waiting to enter the quiescing state). In broad
+ * strokes, transactions -- operations that change in-memory structures -- are
+ * accepted into the txg in the open state, and are completed while the txg is
+ * in the open or quiescing states. The accumulated changes are written to
+ * disk in the syncing state.
+ *
+ * Open
+ *
+ * When a new txg becomes active, it first enters the open state. New
+ * transactions -- updates to in-memory structures -- are assigned to the
+ * currently open txg. There is always a txg in the open state so that ZFS can
+ * accept new changes (though the txg may refuse new changes if it has hit
+ * some limit). ZFS advances the open txg to the next state for a variety of
+ * reasons, such as hitting a time or size threshold or the execution of an
+ * administrative action that must be completed in the syncing state.
+ *
+ * Quiescing
+ *
+ * After a txg exits the open state, it enters the quiescing state. The
+ * quiescing state is intended to provide a buffer between accepting new
+ * transactions in the open state and writing them out to stable storage in
+ * the syncing state. While quiescing, transactions can continue their
+ * operation without delaying either of the other states. Typically, a txg is
+ * in the quiescing state very briefly since the operations are bounded by
+ * software latencies rather than, say, slower I/O latencies. After all
+ * transactions complete, the txg is ready to enter the next state.
+ *
+ * Syncing
+ *
+ * In the syncing state, the in-memory state built up during the open and (to
+ * a lesser degree) the quiescing states is written to stable storage. The
+ * process of writing out modified data can, in turn, modify more data. For
+ * example when we write new blocks, we need to allocate space for them; those
+ * allocations modify metadata (space maps)... which themselves must be
+ * written to stable storage. During the sync state, ZFS iterates, writing out
+ * data until it converges and all in-memory changes have been written out.
+ * The first such pass is the largest as it encompasses all the modified user
+ * data (as opposed to filesystem metadata). Subsequent passes typically have
+ * far less data to write as they consist exclusively of filesystem metadata.
+ *
+ * To ensure convergence, after a certain number of passes ZFS begins
+ * overwriting locations on stable storage that had been allocated earlier in
+ * the syncing state (and subsequently freed). ZFS usually allocates new
+ * blocks to optimize for large, contiguous writes. For the syncing state to
+ * converge, however, it must complete a pass where no new blocks are allocated,
+ * since each allocation requires a modification of persistent metadata.
+ * Further, to hasten convergence, after a prescribed number of passes, ZFS
+ * also defers frees, and stops compressing.
+ *
+ * In addition to writing out user data, we must also execute synctasks during
+ * the syncing context. A synctask is the mechanism by which some
+ * administrative activities work such as creating and destroying snapshots or
+ * datasets. Note that when a synctask is initiated it enters the open txg,
+ * and ZFS then pushes that txg as quickly as possible to completion of the
+ * syncing state in order to reduce the latency of the administrative
+ * activity. To complete the syncing state, ZFS writes out a new uberblock,
+ * the root of the tree of blocks that comprise all state stored on the ZFS
+ * pool. Finally, if there is a quiesced txg waiting, we signal that it can
+ * now transition to the syncing state.
+ */
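
A toy model of the pipeline just described (a sketch only; it ignores blocking and waiters, and the names are invented for illustration rather than taken from the real tx_state_t): at any instant there is at most one txg per state, the identifiers are consecutive, and completing a sync shifts everything down one stage.

#include <stdio.h>
#include <stdint.h>

/* Toy model: txg N is open, N-1 quiescing, N-2 syncing. */
typedef struct {
	uint64_t open_txg;	/* accepting new transactions */
	uint64_t quiescing_txg;	/* draining in-flight transactions */
	uint64_t syncing_txg;	/* being written to stable storage */
} txg_pipeline_t;

static void
advance(txg_pipeline_t *p)
{
	/* The syncing txg completes; everything shifts down one stage. */
	p->syncing_txg = p->quiescing_txg;
	p->quiescing_txg = p->open_txg;
	p->open_txg++;		/* there is always an open txg */
}

int
main(void)
{
	txg_pipeline_t p = { 4, 3, 2 };

	advance(&p);
	/* Prints: open 5 quiescing 4 syncing 3 */
	printf("open %llu quiescing %llu syncing %llu\n",
	    (unsigned long long)p.open_txg,
	    (unsigned long long)p.quiescing_txg,
	    (unsigned long long)p.syncing_txg);
	return (0);
}
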
+
+static void txg_sync_thread(void *arg);
+static void txg_quiesce_thread(void *arg);
+
+int zfs_txg_timeout = 5;	/* max seconds worth of delta per txg */
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG");
+SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RWTUN, &zfs_txg_timeout, 0,
+    "Maximum seconds worth of delta per txg");
 
 /*
  * Prepare the txg subsystem.
@@ -55,6 +131,8 @@ txg_init(dsl_pool_t *dp, uint64_t txg)
 		int i;
 
 		mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
+		mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
+		    NULL);
 		for (i = 0; i < TXG_SIZE; i++) {
 			cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
 			    NULL);
@@ -97,6 +175,7 @@ txg_fini(dsl_pool_t *dp)
 	for (c = 0; c < max_ncpus; c++) {
 		int i;
 
+		mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
 		mutex_destroy(&tx->tx_cpu[c].tc_lock);
 		for (i = 0; i < TXG_SIZE; i++) {
 			cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
@@ -136,7 +215,7 @@ txg_sync_start(dsl_pool_t *dp)
 	 * 32-bit x86.  This is due in part to nested pools and
 	 * scrub_visitbp() recursion.
 	 */
-	tx->tx_sync_thread = thread_create(NULL, 12<<10, txg_sync_thread,
+	tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread,
 	    dp, 0, &p0, TS_RUN, minclsyspri);
 
 	mutex_exit(&tx->tx_sync_lock);
@@ -161,7 +240,7 @@ txg_thread_exit(tx_state_t *tx, callb_cp
 }
 
 static void
-txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time)
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
 {
 	CALLB_CPR_SAFE_BEGIN(cpr);
 
@@ -220,10 +299,12 @@ txg_hold_open(dsl_pool_t *dp, txg_handle
 	tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
 	uint64_t txg;
 
-	mutex_enter(&tc->tc_lock);
-
+	mutex_enter(&tc->tc_open_lock);
 	txg = tx->tx_open_txg;
+
+	mutex_enter(&tc->tc_lock);
 	tc->tc_count[txg & TXG_MASK]++;
+	mutex_exit(&tc->tc_lock);
 
 	th->th_cpu = tc;
 	th->th_txg = txg;
@@ -236,7 +317,8 @@ txg_rele_to_quiesce(txg_handle_t *th)
 {
 	tx_cpu_t *tc = th->th_cpu;
 
-	mutex_exit(&tc->tc_lock);
+	ASSERT(!MUTEX_HELD(&tc->tc_lock));
+	mutex_exit(&tc->tc_open_lock);
 }
 
 void
@@ -265,7 +347,13 @@ txg_rele_to_sync(txg_handle_t *th)
 	th->th_cpu = NULL;	/* defensive */
 }
 
-static void
+/*
+ * Blocks until all transactions in the group are committed.
+ *
+ * On return, the transaction group has reached a stable state in which it can
+ * then be passed off to the syncing context.
+ */
+static __noinline void
 txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 {
 	tx_state_t *tx = &dp->dp_tx;
@@ -273,20 +361,24 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg
 	int c;
 
 	/*
-	 * Grab all tx_cpu locks so nobody else can get into this txg.
+	 * Grab all tc_open_locks so nobody else can get into this txg.
 	 */
 	for (c = 0; c < max_ncpus; c++)
-		mutex_enter(&tx->tx_cpu[c].tc_lock);
+		mutex_enter(&tx->tx_cpu[c].tc_open_lock);
 
 	ASSERT(txg == tx->tx_open_txg);
 	tx->tx_open_txg++;
+	tx->tx_open_time = gethrtime();
+
+	DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
+	DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
 
 	/*
 	 * Now that we've incremented tx_open_txg, we can let threads
 	 * enter the next transaction group.
 	 */
 	for (c = 0; c < max_ncpus; c++)
-		mutex_exit(&tx->tx_cpu[c].tc_lock);
+		mutex_exit(&tx->tx_cpu[c].tc_open_lock);
 
 	/*
 	 * Quiesce the transaction group by waiting for everyone to txg_exit().
@@ -301,8 +393,10 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg
 }
 
 static void
-txg_do_callbacks(list_t *cb_list)
+txg_do_callbacks(void *arg)
 {
+	list_t *cb_list = arg;
+
 	dmu_tx_do_callbacks(cb_list, 0);
 
 	list_destroy(cb_list);
@@ -312,6 +406,9 @@ txg_do_callbacks(list_t *cb_list)
 
 /*
  * Dispatch the commit callbacks registered on this txg to worker threads.
+ *
+ * If no callbacks are registered for a given TXG, nothing happens.
+ * This function creates a taskq for the associated pool, if needed.
  */
 static void
 txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
@@ -322,7 +419,10 @@ txg_dispatch_callbacks(dsl_pool_t *dp, u
 
 	for (c = 0; c < max_ncpus; c++) {
 		tx_cpu_t *tc = &tx->tx_cpu[c];
-		/* No need to lock tx_cpu_t at this point */
+		/*
+		 * No need to lock tx_cpu_t at this point, since this can
+		 * only be called once a txg has been synced.
+		 */
 
 		int g = txg & TXG_MASK;
 
@@ -342,7 +442,7 @@ txg_dispatch_callbacks(dsl_pool_t *dp, u
 		list_create(cb_list, sizeof (dmu_tx_callback_t),
 		    offsetof(dmu_tx_callback_t, dcb_node));
 
-		list_move_tail(&tc->tc_callbacks[g], cb_list);
+		list_move_tail(cb_list, &tc->tc_callbacks[g]);
 
 		(void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
 		    txg_do_callbacks, cb_list, TQ_SLEEP);
@@ -359,24 +459,24 @@ txg_sync_thread(void *arg)
 	uint64_t start, delta;
 
 	txg_thread_enter(tx, &cpr);
-	dprintf("txg_sync_thread called\n");
+
 	start = delta = 0;
 	for (;;) {
-		uint64_t timer, timeout = zfs_txg_timeout * hz;
+		uint64_t timeout = zfs_txg_timeout * hz;
+		uint64_t timer;
 		uint64_t txg;
-		dprintf("txg_sync_thread thread for\n");
+
 		/*
-		 * We sync when we're scrubbing, there's someone waiting
+		 * We sync when we're scanning, there's someone waiting
 		 * on us, or the quiesce thread has handed off a txg to
 		 * us, or we have reached our timeout.
 		 */
 		timer = (delta >= timeout ? 0 : timeout - delta);
-		while ((dp->dp_scrub_func == SCRUB_FUNC_NONE ||
-		    spa_load_state(spa) != SPA_LOAD_NONE ||
-		    spa_shutting_down(spa)) &&
+		while (!dsl_scan_active(dp->dp_scan) &&
 		    !tx->tx_exiting && timer > 0 &&
 		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
-		    tx->tx_quiesced_txg == 0) {
+		    tx->tx_quiesced_txg == 0 &&
+		    dp->dp_dirty_total < zfs_dirty_data_sync) {
 			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
 			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
 			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
@@ -406,6 +506,7 @@ txg_sync_thread(void *arg)
 		txg = tx->tx_quiesced_txg;
 		tx->tx_quiesced_txg = 0;
 		tx->tx_syncing_txg = txg;
+		DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
 		cv_broadcast(&tx->tx_quiesce_more_cv);
 
 		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
@@ -419,6 +520,7 @@ txg_sync_thread(void *arg)
 		mutex_enter(&tx->tx_sync_lock);
 		tx->tx_synced_txg = txg;
 		tx->tx_syncing_txg = 0;
+		DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
 		cv_broadcast(&tx->tx_sync_done_cv);
 
 		/*
@@ -468,23 +570,24 @@ txg_quiesce_thread(void *arg)
 		 */
 		dprintf("quiesce done, handing off txg %llu\n", txg);
 		tx->tx_quiesced_txg = txg;
+		DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
 		cv_broadcast(&tx->tx_sync_more_cv);
 		cv_broadcast(&tx->tx_quiesce_done_cv);
 	}
 }
 
 /*
- * Delay this thread by 'ticks' if we are still in the open transaction
- * group and there is already a waiting txg quiesing or quiesced.  Abort
- * the delay if this txg stalls or enters the quiesing state.
+ * Delay this thread by 'delay' nanoseconds if we are still in the open
+ * transaction group and there is already a waiting txg quiescing or quiesced.
+ * Abort the delay if this txg stalls or enters the quiescing state.
  */
 void
-txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
+txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
 {
 	tx_state_t *tx = &dp->dp_tx;
-	int timeout = ddi_get_lbolt() + ticks;
+	hrtime_t start = gethrtime();
 
-	/* don't delay if this txg could transition to quiesing immediately */
+	/* don't delay if this txg could transition to quiescing immediately */
 	if (tx->tx_open_txg > txg ||
 	    tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
 		return;
@@ -495,10 +598,11 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, 
 		return;
 	}
 
-	while (ddi_get_lbolt() < timeout &&
-	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp))
-		(void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock,
-		    timeout);
+	while (gethrtime() - start < delay &&
+	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
+		(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
+		    &tx->tx_sync_lock, delay, resolution, 0);
+	}
 
 	mutex_exit(&tx->tx_sync_lock);
 }
@@ -508,6 +612,8 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t
 {
 	tx_state_t *tx = &dp->dp_tx;
 
+	ASSERT(!dsl_pool_config_held(dp));
+
 	mutex_enter(&tx->tx_sync_lock);
 	ASSERT(tx->tx_threads == 2);
 	if (txg == 0)
@@ -531,6 +637,8 @@ txg_wait_open(dsl_pool_t *dp, uint64_t t
 {
 	tx_state_t *tx = &dp->dp_tx;
 
+	ASSERT(!dsl_pool_config_held(dp));
+
 	mutex_enter(&tx->tx_sync_lock);
 	ASSERT(tx->tx_threads == 2);
 	if (txg == 0)
@@ -546,6 +654,28 @@ txg_wait_open(dsl_pool_t *dp, uint64_t t
 	mutex_exit(&tx->tx_sync_lock);
 }
 
+/*
+ * If there isn't a txg syncing or in the pipeline, push another txg through
+ * the pipeline by quiescing the open txg.
+ */
+void
+txg_kick(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	ASSERT(!dsl_pool_config_held(dp));
+
+	mutex_enter(&tx->tx_sync_lock);
+	if (tx->tx_syncing_txg == 0 &&
+	    tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
+	    tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
+	    tx->tx_quiesced_txg <= tx->tx_synced_txg) {
+		tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
+		cv_broadcast(&tx->tx_quiesce_more_cv);
+	}
+	mutex_exit(&tx->tx_sync_lock);
+}
+
 boolean_t
 txg_stalled(dsl_pool_t *dp)
 {
@@ -589,33 +719,80 @@ txg_list_destroy(txg_list_t *tl)
 	mutex_destroy(&tl->tl_lock);
 }
 
-int
+boolean_t
 txg_list_empty(txg_list_t *tl, uint64_t txg)
 {
 	return (tl->tl_head[txg & TXG_MASK] == NULL);
 }
 
 /*
- * Add an entry to the list.
- * Returns 0 if it's a new entry, 1 if it's already there.
+ * Returns true if all txg lists are empty.
+ *
+ * Warning: this is inherently racy (an item could be added immediately
+ * after this function returns). We don't bother with the lock because it
+ * wouldn't change the semantics.
+ */
+boolean_t
+txg_all_lists_empty(txg_list_t *tl)
+{
+	for (int i = 0; i < TXG_SIZE; i++) {
+		if (!txg_list_empty(tl, i)) {
+			return (B_FALSE);
+		}
+	}
+	return (B_TRUE);
+}
+
+/*
+ * Add an entry to the list (unless it's already on the list).
+ * Returns B_TRUE if it was actually added.
  */
-int
+boolean_t
 txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
 {
 	int t = txg & TXG_MASK;
 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
-	int already_on_list;
+	boolean_t add;
 
 	mutex_enter(&tl->tl_lock);
-	already_on_list = tn->tn_member[t];
-	if (!already_on_list) {
+	add = (tn->tn_member[t] == 0);
+	if (add) {
 		tn->tn_member[t] = 1;
 		tn->tn_next[t] = tl->tl_head[t];
 		tl->tl_head[t] = tn;
 	}
 	mutex_exit(&tl->tl_lock);
 
-	return (already_on_list);
+	return (add);
+}
+
+/*
+ * Add an entry to the end of the list, unless it's already on the list.
+ * (This walks the list to find the end.)
+ * Returns B_TRUE if it was actually added.
+ */
+boolean_t
+txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+	boolean_t add;
+
+	mutex_enter(&tl->tl_lock);
+	add = (tn->tn_member[t] == 0);
+	if (add) {
+		txg_node_t **tp;
+
+		for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
+			continue;
+
+		tn->tn_member[t] = 1;
+		tn->tn_next[t] = NULL;
+		*tp = tn;
+	}
+	mutex_exit(&tl->tl_lock);
+
+	return (add);
 }
 
 /*
@@ -666,13 +843,13 @@ txg_list_remove_this(txg_list_t *tl, voi
 	return (NULL);
 }
 
-int
+boolean_t
 txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
 {
 	int t = txg & TXG_MASK;
 	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
 
-	return (tn->tn_member[t]);
+	return (tn->tn_member[t] != 0);
 }
 
 /*
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/uberblock.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/uberblock.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 uberblock.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/uberblock.c	7 Aug 2009 18:33:14 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/uberblock.c	4 Feb 2015 07:24:13 -0000
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/zfs_context.h>
 #include <sys/uberblock_impl.h>
 #include <sys/vdev_impl.h>
@@ -36,16 +34,16 @@ uberblock_verify(uberblock_t *ub)
 		byteswap_uint64_array(ub, sizeof (uberblock_t));
 
 	if (ub->ub_magic != UBERBLOCK_MAGIC)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	return (0);
 }
 
 /*
- * Update the uberblock and return a boolean value indicating whether
- * anything changed in this transaction group.
+ * Update the uberblock and return TRUE if anything changed in this
+ * transaction group.
  */
-int
+boolean_t
 uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
 {
 	ASSERT(ub->ub_txg < txg);
@@ -58,6 +56,7 @@ uberblock_update(uberblock_t *ub, vdev_t
 	ub->ub_txg = txg;
 	ub->ub_guid_sum = rvd->vdev_guid_sum;
 	ub->ub_timestamp = gethrestime_sec();
+	ub->ub_software_version = SPA_VERSION;
 
 	return (ub->ub_rootbp.blk_birth == txg);
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c,v
retrieving revision 1.4
diff -u -p -r1.4 vdev.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c	20 Nov 2011 02:54:25 -0000	1.4
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c	4 May 2017 13:47:07 -0000
@@ -20,8 +20,12 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
  */
 
 #include <sys/zfs_context.h>
@@ -35,31 +39,134 @@
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/space_map.h>
+#include <sys/space_reftree.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/zil.h>
+#include <sys/dsl_scan.h>
+#include <sys/trim_map.h>
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
 
 /*
  * Virtual device management.
  */
 
+/*
+ * The limit for ZFS to automatically increase a top-level vdev's ashift
+ * from logical ashift to physical ashift.
+ *
+ * Example: one or more 512B emulation child vdevs
+ *          child->vdev_ashift = 9 (512 bytes)
+ *          child->vdev_physical_ashift = 12 (4096 bytes)
+ *          zfs_max_auto_ashift = 11 (2048 bytes)
+ *          zfs_min_auto_ashift = 9 (512 bytes)
+ *
+ * On pool creation or the addition of a new top-level vdev, ZFS will
+ * increase the ashift of the top-level vdev to 2048 as limited by
+ * zfs_max_auto_ashift.
+ *
+ * Example: one or more 512B emulation child vdevs
+ *          child->vdev_ashift = 9 (512 bytes)
+ *          child->vdev_physical_ashift = 12 (4096 bytes)
+ *          zfs_max_auto_ashift = 13 (8192 bytes)
+ *          zfs_min_auto_ashift = 9 (512 bytes)
+ *
+ * On pool creation or the addition of a new top-level vdev, ZFS will
+ * increase the ashift of the top-level vdev to 4096 to match the
+ * max vdev_physical_ashift.
+ *
+ * Example: one or more 512B emulation child vdevs
+ *          child->vdev_ashift = 9 (512 bytes)
+ *          child->vdev_physical_ashift = 9 (512 bytes)
+ *          zfs_max_auto_ashift = 13 (8192 bytes)
+ *          zfs_min_auto_ashift = 12 (4096 bytes)
+ *
+ * On pool creation or the addition of a new top-level vdev, ZFS will
+ * increase the ashift of the top-level vdev to 4096 to match the
+ * zfs_min_auto_ashift.
+ */
+static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT;
+static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT;
+
+#ifdef __FreeBSD__
+static int
+sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = zfs_max_auto_ashift;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift)
+		return (EINVAL);
+
+	zfs_max_auto_ashift = val;
+
+	return (0);
+}
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
+    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+    sysctl_vfs_zfs_max_auto_ashift, "QU",
+    "Max ashift used when optimising for logical -> physical sectors size on "
+    "new top-level vdevs.");
+
+static int
+sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = zfs_min_auto_ashift;
+	err = sysctl_handle_64(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift)
+		return (EINVAL);
+
+	zfs_min_auto_ashift = val;
+
+	return (0);
+}
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
+    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+    sysctl_vfs_zfs_min_auto_ashift, "QU",
+    "Min ashift used when creating new top-level vdevs.");
+#endif
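
As a rough illustration (FreeBSD userland, not part of this diff), the two
tunables above can be adjusted at runtime with sysctlbyname(3); the handlers
keep SPA_MINASHIFT <= min_auto_ashift <= max_auto_ashift <= SPA_MAXASHIFT,
so an out-of-range write fails with EINVAL:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t val = 12;	/* prefer 4 KiB alignment for new top-level vdevs */

		/* Rejected if val < SPA_MINASHIFT or val > zfs_max_auto_ashift. */
		if (sysctlbyname("vfs.zfs.min_auto_ashift", NULL, NULL,
		    &val, sizeof (val)) != 0)
			perror("vfs.zfs.min_auto_ashift");
		return (0);
	}
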
+
 static vdev_ops_t *vdev_ops_table[] = {
 	&vdev_root_ops,
 	&vdev_raidz_ops,
 	&vdev_mirror_ops,
 	&vdev_replacing_ops,
 	&vdev_spare_ops,
+#if defined(__FreeBSD__) && defined(_KERNEL)
+	&vdev_geom_ops,
+#else
 	&vdev_disk_ops,
+#endif
 	&vdev_file_ops,
 	&vdev_missing_ops,
 	&vdev_hole_ops,
 	NULL
 };
 
-/* maximum scrub/resilver I/O queue per leaf vdev */
-int zfs_scrub_limit = 10;
+
+/*
+ * When a vdev is added, it will be divided into approximately (but no
+ * more than) this number of metaslabs.
+ */
+int metaslabs_per_vdev = 200;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN,
+    &metaslabs_per_vdev, 0,
+    "When a vdev is added, how many metaslabs the vdev should be divided into");
 
 /*
  * Given a vdev type, return the appropriate ops vector.
@@ -106,7 +213,7 @@ vdev_get_min_asize(vdev_t *vd)
 	vdev_t *pvd = vd->vdev_parent;
 
 	/*
-	 * The our parent is NULL (inactive spare or cache) or is the root,
+	 * If our parent is NULL (inactive spare or cache) or is the root,
 	 * just return our own asize.
 	 */
 	if (pvd == NULL)
@@ -169,14 +276,35 @@ vdev_lookup_by_guid(vdev_t *vd, uint64_t
 	return (NULL);
 }
 
+static int
+vdev_count_leaves_impl(vdev_t *vd)
+{
+	int n = 0;
+
+	if (vd->vdev_ops->vdev_op_leaf)
+		return (1);
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		n += vdev_count_leaves_impl(vd->vdev_child[c]);
+
+	return (n);
+}
+
+int
+vdev_count_leaves(spa_t *spa)
+{
+	return (vdev_count_leaves_impl(spa->spa_root_vdev));
+}
+
 void
 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 {
 	size_t oldsize, newsize;
 	uint64_t id = cvd->vdev_id;
 	vdev_t **newchild;
+	spa_t *spa = cvd->vdev_spa;
 
-	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(cvd->vdev_parent == NULL);
 
 	cvd->vdev_parent = pvd;
@@ -207,9 +335,6 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
-
-	if (cvd->vdev_ops->vdev_op_leaf)
-		cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
 }
 
 void
@@ -244,9 +369,6 @@ vdev_remove_child(vdev_t *pvd, vdev_t *c
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
-
-	if (cvd->vdev_ops->vdev_op_leaf)
-		cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
 }
 
 /*
@@ -292,6 +414,7 @@ vdev_alloc_common(spa_t *spa, uint_t id,
 	if (spa->spa_root_vdev == NULL) {
 		ASSERT(ops == &vdev_root_ops);
 		spa->spa_root_vdev = vd;
+		spa->spa_load_guid = spa_generate_guid(NULL);
 	}
 
 	if (guid == 0 && ops != &vdev_hole_ops) {
@@ -321,8 +444,9 @@ vdev_alloc_common(spa_t *spa, uint_t id,
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 	for (int t = 0; t < DTL_TYPES; t++) {
-		space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
+		vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
 		    &vd->vdev_dtl_lock);
 	}
 	txg_list_create(&vd->vdev_ms_list,
@@ -353,10 +477,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	if ((ops = vdev_getops(type)) == NULL)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	/*
 	 * If this is a load, get the vdev guid from the nvlist.
@@ -367,26 +491,26 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
 		    label_id != id)
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_SPARE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * The first allocated vdev must be of type 'root'.
 	 */
 	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Determine whether we're a log vdev.
@@ -394,10 +518,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl
 	islog = 0;
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
 	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
-		return (ENOTSUP);
+		return (SET_ERROR(ENOTSUP));
 
 	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
-		return (ENOTSUP);
+		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Set the nparity property for RAID-Z vdevs.
@@ -407,24 +531,24 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
 		    &nparity) == 0) {
 			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
-				return (EINVAL);
+				return (SET_ERROR(EINVAL));
 			/*
 			 * Previous versions could only support 1 or 2 parity
 			 * device.
 			 */
 			if (nparity > 1 &&
 			    spa_version(spa) < SPA_VERSION_RAIDZ2)
-				return (ENOTSUP);
+				return (SET_ERROR(ENOTSUP));
 			if (nparity > 2 &&
 			    spa_version(spa) < SPA_VERSION_RAIDZ3)
-				return (ENOTSUP);
+				return (SET_ERROR(ENOTSUP));
 		} else {
 			/*
 			 * We require the parity to be specified for SPAs that
 			 * support multiple parity levels.
 			 */
 			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
-				return (EINVAL);
+				return (SET_ERROR(EINVAL));
 			/*
 			 * Otherwise, we default to 1 parity device for RAID-Z.
 			 */
@@ -487,9 +611,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl
 		    &vd->vdev_ms_shift);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    &vd->vdev_asize);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
+		    &vd->vdev_removing);
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
+		    &vd->vdev_top_zap);
+	} else {
+		ASSERT0(vd->vdev_top_zap);
 	}
 
-	if (parent && !parent->vdev_parent) {
+	if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
 		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
 		    alloctype == VDEV_ALLOC_ADD ||
 		    alloctype == VDEV_ALLOC_SPLIT ||
@@ -498,15 +628,24 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl
 		    spa_log_class(spa) : spa_normal_class(spa), vd);
 	}
 
+	if (vd->vdev_ops->vdev_op_leaf &&
+	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
+		(void) nvlist_lookup_uint64(nv,
+		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
+	} else {
+		ASSERT0(vd->vdev_leaf_zap);
+	}
+
 	/*
 	 * If we're a leaf vdev, try to load the DTL object and other state.
 	 */
+
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
 		if (alloctype == VDEV_ALLOC_LOAD) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
-			    &vd->vdev_dtl_smo.smo_object);
+			    &vd->vdev_dtl_object);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 			    &vd->vdev_unspare);
 		}
@@ -522,6 +661,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 		    &vd->vdev_offline);
 
+		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+		    &vd->vdev_resilver_txg);
+
 		/*
 		 * When importing a pool, we want to ignore the persistent fault
 		 * state, as the diagnosis made on another system may not be
@@ -590,9 +732,9 @@ vdev_free(vdev_t *vd)
 		metaslab_group_destroy(vd->vdev_mg);
 	}
 
-	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
-	ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
-	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
+	ASSERT0(vd->vdev_stat.vs_space);
+	ASSERT0(vd->vdev_stat.vs_dspace);
+	ASSERT0(vd->vdev_stat.vs_alloc);
 
 	/*
 	 * Remove this vdev from its parent's child list.
@@ -625,12 +767,14 @@ vdev_free(vdev_t *vd)
 	txg_list_destroy(&vd->vdev_dtl_list);
 
 	mutex_enter(&vd->vdev_dtl_lock);
+	space_map_close(vd->vdev_dtl_sm);
 	for (int t = 0; t < DTL_TYPES; t++) {
-		space_map_unload(&vd->vdev_dtl[t]);
-		space_map_destroy(&vd->vdev_dtl[t]);
+		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
+		range_tree_destroy(vd->vdev_dtl[t]);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 
+	mutex_destroy(&vd->vdev_queue_lock);
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
@@ -657,11 +801,15 @@ vdev_top_transfer(vdev_t *svd, vdev_t *t
 	tvd->vdev_ms_array = svd->vdev_ms_array;
 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
 	tvd->vdev_ms_count = svd->vdev_ms_count;
+	tvd->vdev_top_zap = svd->vdev_top_zap;
 
 	svd->vdev_ms_array = 0;
 	svd->vdev_ms_shift = 0;
 	svd->vdev_ms_count = 0;
+	svd->vdev_top_zap = 0;
 
+	if (tvd->vdev_mg)
+		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
 	tvd->vdev_mg = svd->vdev_mg;
 	tvd->vdev_ms = svd->vdev_ms;
 
@@ -733,7 +881,10 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t 
 
 	mvd->vdev_asize = cvd->vdev_asize;
 	mvd->vdev_min_asize = cvd->vdev_min_asize;
+	mvd->vdev_max_asize = cvd->vdev_max_asize;
 	mvd->vdev_ashift = cvd->vdev_ashift;
+	mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
+	mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
 	mvd->vdev_state = cvd->vdev_state;
 	mvd->vdev_crtxg = cvd->vdev_crtxg;
 
@@ -765,6 +916,8 @@ vdev_remove_parent(vdev_t *cvd)
 	    mvd->vdev_ops == &vdev_replacing_ops ||
 	    mvd->vdev_ops == &vdev_spare_ops);
 	cvd->vdev_ashift = mvd->vdev_ashift;
+	cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
+	cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
 
 	vdev_remove_child(mvd, cvd);
 	vdev_remove_child(pvd, mvd);
@@ -815,9 +968,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t 
 
 	/*
 	 * Compute the raidz-deflation ratio.  Note, we hard-code
-	 * in 128k (1 << 17) because it is the current "typical" blocksize.
-	 * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
-	 * or we will inconsistently account for existing bp's.
+	 * in 128k (1 << 17) because it is the "typical" blocksize.
+	 * Even though SPA_MAXBLOCKSIZE changed, this algorithm cannot change;
+	 * otherwise it would inconsistently account for existing bp's.
 	 */
 	vd->vdev_deflate_ratio = (1 << 17) /
 	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
@@ -835,33 +988,31 @@ vdev_metaslab_init(vdev_t *vd, uint64_t 
 	vd->vdev_ms_count = newc;
 
 	for (m = oldc; m < newc; m++) {
-		space_map_obj_t smo = { 0, 0, 0 };
+		uint64_t object = 0;
+
 		if (txg == 0) {
-			uint64_t object = 0;
 			error = dmu_read(mos, vd->vdev_ms_array,
 			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
 			    DMU_READ_PREFETCH);
 			if (error)
 				return (error);
-			if (object != 0) {
-				dmu_buf_t *db;
-				error = dmu_bonus_hold(mos, object, FTAG, &db);
-				if (error)
-					return (error);
-				ASSERT3U(db->db_size, >=, sizeof (smo));
-				bcopy(db->db_data, &smo, sizeof (smo));
-				ASSERT3U(smo.smo_object, ==, object);
-				dmu_buf_rele(db, FTAG);
-			}
 		}
-		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
-		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
+
+		error = metaslab_init(vd->vdev_mg, m, object, txg,
+		    &(vd->vdev_ms[m]));
+		if (error)
+			return (error);
 	}
 
 	if (txg == 0)
 		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 
-	if (oldc == 0)
+	/*
+	 * If the vdev is being removed, we don't activate
+	 * the metaslabs since we want to ensure that no new
+	 * allocations are performed on this device.
+	 */
+	if (oldc == 0 && !vd->vdev_removing)
 		metaslab_group_activate(vd->vdev_mg);
 
 	if (txg == 0)
@@ -878,9 +1029,12 @@ vdev_metaslab_fini(vdev_t *vd)
 
 	if (vd->vdev_ms != NULL) {
 		metaslab_group_passivate(vd->vdev_mg);
-		for (m = 0; m < count; m++)
-			if (vd->vdev_ms[m] != NULL)
-				metaslab_fini(vd->vdev_ms[m]);
+		for (m = 0; m < count; m++) {
+			metaslab_t *msp = vd->vdev_ms[m];
+
+			if (msp != NULL)
+				metaslab_fini(msp);
+		}
 		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
 		vd->vdev_ms = NULL;
 	}
@@ -929,7 +1083,7 @@ vdev_probe_done(zio_t *zio)
 			ASSERT(zio->io_error != 0);
 			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
 			    spa, vd, NULL, 0, 0);
-			zio->io_error = ENXIO;
+			zio->io_error = SET_ERROR(ENXIO);
 		}
 
 		mutex_enter(&vd->vdev_probe_lock);
@@ -937,18 +1091,21 @@ vdev_probe_done(zio_t *zio)
 		vd->vdev_probe_zio = NULL;
 		mutex_exit(&vd->vdev_probe_lock);
 
-		while ((pio = zio_walk_parents(zio)) != NULL)
+		zio_link_t *zl = NULL;
+		while ((pio = zio_walk_parents(zio, &zl)) != NULL)
 			if (!vdev_accessible(vd, pio))
-				pio->io_error = ENXIO;
+				pio->io_error = SET_ERROR(ENXIO);
 
 		kmem_free(vps, sizeof (*vps));
 	}
 }
 
 /*
- * Determine whether this device is accessible by reading and writing
- * to several known locations: the pad regions of each vdev label
- * but the first (which we leave alone in case it contains a VTOC).
+ * Determine whether this device is accessible.
+ *
+ * Read and write to several known locations: the pad regions of each
+ * vdev label but the first, which we leave alone in case it contains
+ * a VTOC.
  */
 zio_t *
 vdev_probe(vdev_t *vd, zio_t *zio)
@@ -1005,6 +1162,10 @@ vdev_probe(vdev_t *vd, zio_t *zio)
 		    vdev_probe_done, vps,
 		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
 
+		/*
+		 * We can't change the vdev state in this context, so we
+		 * kick off an async task to do it on our behalf.
+		 */
 		if (zio != NULL) {
 			vd->vdev_probe_wanted = B_TRUE;
 			spa_async_request(spa, SPA_ASYNC_PROBE);
@@ -1070,7 +1231,7 @@ vdev_open_children(vdev_t *vd)
 	 * in a single thread so that the same thread holds the
 	 * spa_namespace_lock
 	 */
-	if (vdev_uses_zvols(vd)) {
+	if (B_TRUE || vdev_uses_zvols(vd)) {
 		for (int c = 0; c < children; c++)
 			vd->vdev_child[c]->vdev_open_error =
 			    vdev_open(vd->vdev_child[c]);
@@ -1095,8 +1256,10 @@ vdev_open(vdev_t *vd)
 	spa_t *spa = vd->vdev_spa;
 	int error;
 	uint64_t osize = 0;
-	uint64_t asize, psize;
-	uint64_t ashift = 0;
+	uint64_t max_osize = 0;
+	uint64_t asize, max_asize, psize;
+	uint64_t logical_ashift = 0;
+	uint64_t physical_ashift = 0;
 
 	ASSERT(vd->vdev_open_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
@@ -1107,6 +1270,7 @@ vdev_open(vdev_t *vd)
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	vd->vdev_cant_read = B_FALSE;
 	vd->vdev_cant_write = B_FALSE;
+	vd->vdev_notrim = B_FALSE;
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	/*
@@ -1119,14 +1283,15 @@ vdev_open(vdev_t *vd)
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
-		return (ENXIO);
+		return (SET_ERROR(ENXIO));
 	} else if (vd->vdev_offline) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
-		return (ENXIO);
+		return (SET_ERROR(ENXIO));
 	}
 
-	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
+	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
+	    &logical_ashift, &physical_ashift);
 
 	/*
 	 * Reset the vdev_reopening flag so that we actually close
@@ -1158,7 +1323,7 @@ vdev_open(vdev_t *vd)
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
-		return (ENXIO);
+		return (SET_ERROR(ENXIO));
 	}
 
 	if (vd->vdev_degraded) {
@@ -1175,6 +1340,9 @@ vdev_open(vdev_t *vd)
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
 		return (0);
 
+	if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf)
+		trim_map_create(vd);
+
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
@@ -1184,24 +1352,28 @@ vdev_open(vdev_t *vd)
 	}
 
 	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
+	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
 
 	if (vd->vdev_children == 0) {
 		if (osize < SPA_MINDEVSIZE) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
-			return (EOVERFLOW);
+			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = osize;
 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
+		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
+		    VDEV_LABEL_END_SIZE);
 	} else {
 		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
-			return (EOVERFLOW);
+			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = 0;
 		asize = osize;
+		max_asize = max_osize;
 	}
 
 	vd->vdev_psize = psize;
@@ -1212,6 +1384,17 @@ vdev_open(vdev_t *vd)
 	if (asize < vd->vdev_min_asize) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_LABEL);
+		return (SET_ERROR(EINVAL));
+	}
+
+	vd->vdev_physical_ashift =
+	    MAX(physical_ashift, vd->vdev_physical_ashift);
+	vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
+	vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
+
+	if (vd->vdev_logical_ashift > SPA_MAXASHIFT) {
+		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_ASHIFT_TOO_BIG);
 		return (EINVAL);
 	}
 
@@ -1221,16 +1404,18 @@ vdev_open(vdev_t *vd)
 		 * For testing purposes, a higher ashift can be requested.
 		 */
 		vd->vdev_asize = asize;
-		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
+		vd->vdev_max_asize = max_asize;
 	} else {
 		/*
 		 * Make sure the alignment requirement hasn't increased.
 		 */
-		if (ashift > vd->vdev_top->vdev_ashift) {
+		if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
+		    vd->vdev_ops->vdev_op_leaf) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (EINVAL);
 		}
+		vd->vdev_max_asize = max_asize;
 	}
 
 	/*
@@ -1250,12 +1435,23 @@ vdev_open(vdev_t *vd)
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
-		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
-		    VDEV_AUX_IO_FAILURE);
+		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+		    VDEV_AUX_ERR_EXCEEDED);
 		return (error);
 	}
 
 	/*
+	 * Track the min and max ashift values for normal data devices.
+	 */
+	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+	    !vd->vdev_islog && vd->vdev_aux == NULL) {
+		if (vd->vdev_ashift > spa->spa_max_ashift)
+			spa->spa_max_ashift = vd->vdev_ashift;
+		if (vd->vdev_ashift < spa->spa_min_ashift)
+			spa->spa_min_ashift = vd->vdev_ashift;
+	}
+
+	/*
 	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
 	 * resilver.  But don't do this if we are doing a reopen for a scrub,
 	 * since this would just restart the scrub we are already doing.
@@ -1272,13 +1468,18 @@ vdev_open(vdev_t *vd)
  * contents.  This needs to be done before vdev_load() so that we don't
  * inadvertently do repair I/Os to the wrong device.
  *
+ * If 'strict' is false, ignore the spa guid check. This is necessary because
+ * if the machine crashed during a re-guid, the new guid might have been
+ * written to all of the vdev labels but not to the cached config. The strict
+ * check will be performed when the pool is opened again using the MOS config.
+ *
  * This function will only return failure if one of the vdevs indicates that it
  * has since been destroyed or exported.  This is only possible if
  * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
  * will be updated but the function will return 0.
  */
 int
-vdev_validate(vdev_t *vd)
+vdev_validate(vdev_t *vd, boolean_t strict)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *label;
@@ -1286,8 +1487,8 @@ vdev_validate(vdev_t *vd)
 	uint64_t state;
 
 	for (int c = 0; c < vd->vdev_children; c++)
-		if (vdev_validate(vd->vdev_child[c]) != 0)
-			return (EBADF);
+		if (vdev_validate(vd->vdev_child[c], strict) != 0)
+			return (SET_ERROR(EBADF));
 
 	/*
 	 * If the device has already failed, or was marked offline, don't do
@@ -1297,8 +1498,10 @@ vdev_validate(vdev_t *vd)
 	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
 		uint64_t aux_guid = 0;
 		nvlist_t *nvl;
+		uint64_t txg = spa_last_synced_txg(spa) != 0 ?
+		    spa_last_synced_txg(spa) : -1ULL;
 
-		if ((label = vdev_label_read_config(vd)) == NULL) {
+		if ((label = vdev_label_read_config(vd, txg)) == NULL) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_BAD_LABEL);
 			return (0);
@@ -1316,8 +1519,9 @@ vdev_validate(vdev_t *vd)
 			return (0);
 		}
 
-		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
-		    &guid) != 0 || guid != spa_guid(spa)) {
+		if (strict && (nvlist_lookup_uint64(label,
+		    ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
+		    guid != spa_guid(spa))) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			nvlist_free(label);
@@ -1364,13 +1568,13 @@ vdev_validate(vdev_t *vd)
 		nvlist_free(label);
 
 		/*
-		 * If spa->spa_load_verbatim is true, no need to check the
+		 * If this is a verbatim import, no need to check the
 		 * state of the pool.
 		 */
-		if (!spa->spa_load_verbatim &&
+		if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
 		    spa_load_state(spa) == SPA_LOAD_OPEN &&
 		    state != POOL_STATE_ACTIVE)
-			return (EBADF);
+			return (SET_ERROR(EBADF));
 
 		/*
 		 * If we were able to open and validate a vdev that was
@@ -1406,6 +1610,9 @@ vdev_close(vdev_t *vd)
 
 	vdev_cache_purge(vd);
 
+	if (vd->vdev_ops->vdev_op_leaf)
+		trim_map_destroy(vd);
+
 	/*
 	 * We record the previous state before we close it, so that if we are
 	 * doing a reopen(), we don't generate FMA ereports if we notice that
@@ -1420,6 +1627,35 @@ vdev_close(vdev_t *vd)
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 }
 
+void
+vdev_hold(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+
+	ASSERT(spa_is_root(spa));
+	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+		return;
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		vdev_hold(vd->vdev_child[c]);
+
+	if (vd->vdev_ops->vdev_op_leaf)
+		vd->vdev_ops->vdev_op_hold(vd);
+}
+
+void
+vdev_rele(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+
+	ASSERT(spa_is_root(spa));
+	for (int c = 0; c < vd->vdev_children; c++)
+		vdev_rele(vd->vdev_child[c]);
+
+	if (vd->vdev_ops->vdev_op_leaf)
+		vd->vdev_ops->vdev_op_rele(vd);
+}
+
 /*
  * Reopen all interior vdevs and any unopened leaves.  We don't actually
  * reopen leaf vdevs which had previously been opened as they might deadlock
@@ -1450,7 +1686,7 @@ vdev_reopen(vdev_t *vd)
 		    !l2arc_vdev_present(vd))
 			l2arc_add_vdev(spa, vd);
 	} else {
-		(void) vdev_validate(vd);
+		(void) vdev_validate(vd, B_TRUE);
 	}
 
 	/*
@@ -1477,9 +1713,10 @@ vdev_create(vdev_t *vd, uint64_t txg, bo
 	}
 
 	/*
-	 * Recursively initialize all labels.
+	 * Recursively load DTLs and initialize all labels.
 	 */
-	if ((error = vdev_label_init(vd, txg, isreplacing ?
+	if ((error = vdev_dtl_load(vd)) != 0 ||
+	    (error = vdev_label_init(vd, txg, isreplacing ?
 	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
 		vdev_close(vd);
 		return (error);
@@ -1492,18 +1729,47 @@ void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	/*
-	 * Aim for roughly 200 metaslabs per vdev.
+	 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
 	 */
-	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
+	vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
 	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
 }
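
To sanity-check the arithmetic: a 1 TiB vdev with the default
metaslabs_per_vdev of 200 gets ms_shift = 33, i.e. 128 metaslabs of 8 GiB
each. A user-space sketch (highbit64() is re-stated here under the
assumption that it returns the 1-based index of the highest set bit):

	#include <stdio.h>
	#include <stdint.h>

	/* Assumed semantics of highbit64(): 1-based index of the highest set bit. */
	static int
	highbit64(uint64_t x)
	{
		int h = 0;

		while (x != 0) {
			h++;
			x >>= 1;
		}
		return (h);
	}

	int
	main(void)
	{
		uint64_t asize = 1ULL << 40;		/* 1 TiB vdev */
		int ms_shift = highbit64(asize / 200);	/* default metaslabs_per_vdev */

		/* Prints: ms_shift=33 -> 128 metaslabs of 8192 MiB */
		printf("ms_shift=%d -> %llu metaslabs of %llu MiB\n", ms_shift,
		    (unsigned long long)(asize >> ms_shift),
		    (unsigned long long)(1ULL << (ms_shift - 20)));
		return (0);
	}
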
 
+/*
+ * Maximize performance by inflating the configured ashift for top level
+ * vdevs to be as close to the physical ashift as possible while maintaining
+ * administrator defined limits and ensuring it doesn't go below the
+ * logical ashift.
+ */
+void
+vdev_ashift_optimize(vdev_t *vd)
+{
+	if (vd == vd->vdev_top) {
+		if (vd->vdev_ashift < vd->vdev_physical_ashift) {
+			vd->vdev_ashift = MIN(
+			    MAX(zfs_max_auto_ashift, vd->vdev_ashift),
+			    MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift));
+		} else {
+			/*
+			 * Unusual case where logical ashift > physical
+			 * ashift, so we can't cap the calculated ashift
+			 * based on the max ashift, as that would cause
+			 * failures.  We still check whether we need to
+			 * increase it to match the min ashift.
+			 */
+			vd->vdev_ashift = MAX(zfs_min_auto_ashift,
+			    vd->vdev_ashift);
+		}
+	}
+}
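
To tie this back to the three scenarios in the block comment near the top of
this file, here is a stand-alone re-statement of the clamp (a sketch, not the
kernel code; auto_ashift() and the local MIN/MAX are hypothetical):

	#include <stdio.h>
	#include <stdint.h>

	#define	MIN(a, b)	((a) < (b) ? (a) : (b))
	#define	MAX(a, b)	((a) > (b) ? (a) : (b))

	static uint64_t
	auto_ashift(uint64_t ashift, uint64_t phys, uint64_t amin, uint64_t amax)
	{
		if (ashift < phys)
			return (MIN(MAX(amax, ashift), MAX(amin, phys)));
		return (MAX(amin, ashift));	/* logical >= physical: apply the floor only */
	}

	int
	main(void)
	{
		/* The three examples above: expect 11, 12, 12. */
		printf("%llu\n", (unsigned long long)auto_ashift(9, 12, 9, 11));
		printf("%llu\n", (unsigned long long)auto_ashift(9, 12, 9, 13));
		printf("%llu\n", (unsigned long long)auto_ashift(9, 9, 12, 13));
		return (0);
	}
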
+
 void
 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
 {
 	ASSERT(vd == vd->vdev_top);
 	ASSERT(!vd->vdev_ishole);
 	ASSERT(ISP2(flags));
+	ASSERT(spa_writeable(vd->vdev_spa));
 
 	if (flags & VDD_METASLAB)
 		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
@@ -1514,11 +1780,21 @@ vdev_dirty(vdev_t *vd, int flags, void *
 	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
 }
 
+void
+vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
+{
+	for (int c = 0; c < vd->vdev_children; c++)
+		vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
+
+	if (vd->vdev_ops->vdev_op_leaf)
+		vdev_dirty(vd->vdev_top, flags, vd, txg);
+}
+
 /*
  * DTLs.
  *
  * A vdev's DTL (dirty time log) is the set of transaction groups for which
- * the vdev has less than perfect replication.  There are three kinds of DTL:
+ * the vdev has less than perfect replication.  There are four kinds of DTL:
  *
  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
  *
@@ -1555,30 +1831,31 @@ vdev_dirty(vdev_t *vd, int flags, void *
 void
 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
-	space_map_t *sm = &vd->vdev_dtl[t];
+	range_tree_t *rt = vd->vdev_dtl[t];
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+	ASSERT(spa_writeable(vd->vdev_spa));
 
-	mutex_enter(sm->sm_lock);
-	if (!space_map_contains(sm, txg, size))
-		space_map_add(sm, txg, size);
-	mutex_exit(sm->sm_lock);
+	mutex_enter(rt->rt_lock);
+	if (!range_tree_contains(rt, txg, size))
+		range_tree_add(rt, txg, size);
+	mutex_exit(rt->rt_lock);
 }
 
 boolean_t
 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
-	space_map_t *sm = &vd->vdev_dtl[t];
+	range_tree_t *rt = vd->vdev_dtl[t];
 	boolean_t dirty = B_FALSE;
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
-	mutex_enter(sm->sm_lock);
-	if (sm->sm_space != 0)
-		dirty = space_map_contains(sm, txg, size);
-	mutex_exit(sm->sm_lock);
+	mutex_enter(rt->rt_lock);
+	if (range_tree_space(rt) != 0)
+		dirty = range_tree_contains(rt, txg, size);
+	mutex_exit(rt->rt_lock);
 
 	return (dirty);
 }
@@ -1586,17 +1863,86 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_t
 boolean_t
 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 {
-	space_map_t *sm = &vd->vdev_dtl[t];
+	range_tree_t *rt = vd->vdev_dtl[t];
 	boolean_t empty;
 
-	mutex_enter(sm->sm_lock);
-	empty = (sm->sm_space == 0);
-	mutex_exit(sm->sm_lock);
+	mutex_enter(rt->rt_lock);
+	empty = (range_tree_space(rt) == 0);
+	mutex_exit(rt->rt_lock);
 
 	return (empty);
 }
 
 /*
+ * Returns the lowest txg in the DTL range.
+ */
+static uint64_t
+vdev_dtl_min(vdev_t *vd)
+{
+	range_seg_t *rs;
+
+	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+	ASSERT0(vd->vdev_children);
+
+	rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
+	return (rs->rs_start - 1);
+}
+
+/*
+ * Returns the highest txg in the DTL.
+ */
+static uint64_t
+vdev_dtl_max(vdev_t *vd)
+{
+	range_seg_t *rs;
+
+	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+	ASSERT0(vd->vdev_children);
+
+	rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
+	return (rs->rs_end);
+}
+
+/*
+ * Determine if a resilvering vdev should remove any DTL entries from
+ * its range. If the vdev was resilvering for the entire duration of the
+ * scan then it should excise that range from its DTLs. Otherwise, this
+ * vdev is considered partially resilvered and should leave its DTL
+ * entries intact. The comment in vdev_dtl_reassess() describes how we
+ * excise the DTLs.
+ */
+static boolean_t
+vdev_dtl_should_excise(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
+	ASSERT0(scn->scn_phys.scn_errors);
+	ASSERT0(vd->vdev_children);
+
+	if (vd->vdev_resilver_txg == 0 ||
+	    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
+		return (B_TRUE);
+
+	/*
+	 * When a resilver is initiated, the scan will assign the scn_max_txg
+	 * value to the highest txg value that exists in all DTLs. If this
+	 * device's max DTL is not part of this scan (i.e. it is not in
+	 * the range (scn_min_txg, scn_max_txg]), then it is not eligible
+	 * for excision.
+	 */
+	if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
+		ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
+		ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
+		ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+/*
  * Reassess DTLs after a config change or scrub completion.
  */
 void
@@ -1616,10 +1962,20 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t t
 		return;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
+		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
 		mutex_enter(&vd->vdev_dtl_lock);
+
+		/*
+		 * If we've completed a scan cleanly, then determine
+		 * if this vdev should remove any DTLs. We only want to
+		 * excise regions on vdevs that were available during
+		 * the entire duration of this scan.
+		 */
 		if (scrub_txg != 0 &&
-		    (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
-			/* XXX should check scrub_done? */
+		    (spa->spa_scrub_started ||
+		    (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
+		    vdev_dtl_should_excise(vd)) {
 			/*
 			 * We completed a scrub up to scrub_txg.  If we
 			 * did it without rebooting, then the scrub dtl
@@ -1637,27 +1993,40 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t t
 			 * positive refcnt -- either 1 or 2.  We then convert
 			 * the reference tree into the new DTL_MISSING map.
 			 */
-			space_map_ref_create(&reftree);
-			space_map_ref_add_map(&reftree,
-			    &vd->vdev_dtl[DTL_MISSING], 1);
-			space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
-			space_map_ref_add_map(&reftree,
-			    &vd->vdev_dtl[DTL_SCRUB], 2);
-			space_map_ref_generate_map(&reftree,
-			    &vd->vdev_dtl[DTL_MISSING], 1);
-			space_map_ref_destroy(&reftree);
-		}
-		space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
-		space_map_walk(&vd->vdev_dtl[DTL_MISSING],
-		    space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
+			space_reftree_create(&reftree);
+			space_reftree_add_map(&reftree,
+			    vd->vdev_dtl[DTL_MISSING], 1);
+			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
+			space_reftree_add_map(&reftree,
+			    vd->vdev_dtl[DTL_SCRUB], 2);
+			space_reftree_generate_map(&reftree,
+			    vd->vdev_dtl[DTL_MISSING], 1);
+			space_reftree_destroy(&reftree);
+		}
+		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
+		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
 		if (scrub_done)
-			space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
-		space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
+			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
+		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
 		if (!vdev_readable(vd))
-			space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
+			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
 		else
-			space_map_walk(&vd->vdev_dtl[DTL_MISSING],
-			    space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
+			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
+
+		/*
+		 * If the vdev was resilvering and no longer has any
+		 * DTLs then reset its resilvering flag and dirty
+		 * the top level so that we persist the change.
+		 */
+		if (vd->vdev_resilver_txg != 0 &&
+		    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
+		    range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) {
+			vd->vdev_resilver_txg = 0;
+			vdev_config_dirty(vd->vdev_top);
+		}
+
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		if (txg != 0)
@@ -1677,112 +2046,187 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t t
 			minref = vd->vdev_nparity + 1;	/* RAID-Z */
 		else
 			minref = vd->vdev_children;	/* any kind of mirror */
-		space_map_ref_create(&reftree);
+		space_reftree_create(&reftree);
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			mutex_enter(&cvd->vdev_dtl_lock);
-			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
+			space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
 			mutex_exit(&cvd->vdev_dtl_lock);
 		}
-		space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
-		space_map_ref_destroy(&reftree);
+		space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
+		space_reftree_destroy(&reftree);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 }
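
The reference-tree arithmetic above can be sanity-checked in user space.  In
this toy per-txg re-statement (hypothetical; the real code works on segments),
a txg survives in DTL_MISSING only while its refcount stays >= 1, so
everything the scrub covered is excised unless it was re-dirtied into
DTL_SCRUB while the scrub ran:

	#include <stdio.h>

	int
	main(void)
	{
		int scrub_txg = 100;

		/* DTL_MISSING covers [90, 110); txg 95 was re-dirtied mid-scrub. */
		for (int txg = 90; txg < 110; txg += 5) {
			int in_scrub = (txg == 95);
			/* +1 from DTL_MISSING, -1 below scrub_txg, +2 from DTL_SCRUB. */
			int ref = 1 - (txg < scrub_txg) + 2 * in_scrub;

			printf("txg %3d: ref %d -> %s\n", txg, ref,
			    ref >= 1 ? "keep" : "excise");
		}
		return (0);
	}
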
 
-static int
+int
 vdev_dtl_load(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
-	space_map_obj_t *smo = &vd->vdev_dtl_smo;
 	objset_t *mos = spa->spa_meta_objset;
-	dmu_buf_t *db;
-	int error;
+	int error = 0;
 
-	ASSERT(vd->vdev_children == 0);
+	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
+		ASSERT(!vd->vdev_ishole);
 
-	if (smo->smo_object == 0)
-		return (0);
+		error = space_map_open(&vd->vdev_dtl_sm, mos,
+		    vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+		if (error)
+			return (error);
+		ASSERT(vd->vdev_dtl_sm != NULL);
 
-	ASSERT(!vd->vdev_ishole);
+		mutex_enter(&vd->vdev_dtl_lock);
 
-	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
-		return (error);
+		/*
+		 * Now that we've opened the space_map we need to update
+		 * the in-core DTL.
+		 */
+		space_map_update(vd->vdev_dtl_sm);
 
-	ASSERT3U(db->db_size, >=, sizeof (*smo));
-	bcopy(db->db_data, smo, sizeof (*smo));
-	dmu_buf_rele(db, FTAG);
+		error = space_map_load(vd->vdev_dtl_sm,
+		    vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
+		mutex_exit(&vd->vdev_dtl_lock);
 
-	mutex_enter(&vd->vdev_dtl_lock);
-	error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
-	    NULL, SM_ALLOC, smo, mos);
-	mutex_exit(&vd->vdev_dtl_lock);
+		return (error);
+	}
+
+	for (int c = 0; c < vd->vdev_children; c++) {
+		error = vdev_dtl_load(vd->vdev_child[c]);
+		if (error != 0)
+			break;
+	}
 
 	return (error);
 }
 
 void
+vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
+{
+	spa_t *spa = vd->vdev_spa;
+
+	VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
+	VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
+	    zapobj, tx));
+}
+
+uint64_t
+vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
+{
+	spa_t *spa = vd->vdev_spa;
+	uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
+	    DMU_OT_NONE, 0, tx);
+
+	ASSERT(zap != 0);
+	VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
+	    zap, tx));
+
+	return (zap);
+}
+
+void
+vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
+{
+	if (vd->vdev_ops != &vdev_hole_ops &&
+	    vd->vdev_ops != &vdev_missing_ops &&
+	    vd->vdev_ops != &vdev_root_ops &&
+	    !vd->vdev_top->vdev_removing) {
+		if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
+			vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
+		}
+		if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
+			vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
+		}
+	}
+	for (uint64_t i = 0; i < vd->vdev_children; i++) {
+		vdev_construct_zaps(vd->vdev_child[i], tx);
+	}
+}
+
+void
 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
-	space_map_obj_t *smo = &vd->vdev_dtl_smo;
-	space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
+	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
 	objset_t *mos = spa->spa_meta_objset;
-	space_map_t smsync;
-	kmutex_t smlock;
-	dmu_buf_t *db;
+	range_tree_t *rtsync;
+	kmutex_t rtlock;
 	dmu_tx_t *tx;
+	uint64_t object = space_map_object(vd->vdev_dtl_sm);
 
 	ASSERT(!vd->vdev_ishole);
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
-	if (vd->vdev_detached) {
-		if (smo->smo_object != 0) {
-			int err = dmu_object_free(mos, smo->smo_object, tx);
-			ASSERT3U(err, ==, 0);
-			smo->smo_object = 0;
+	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
+		mutex_enter(&vd->vdev_dtl_lock);
+		space_map_free(vd->vdev_dtl_sm, tx);
+		space_map_close(vd->vdev_dtl_sm);
+		vd->vdev_dtl_sm = NULL;
+		mutex_exit(&vd->vdev_dtl_lock);
+
+		/*
+		 * We only destroy the leaf ZAP for detached leaves or for
+		 * removed log devices. Removed data devices handle leaf ZAP
+		 * cleanup later, once cancellation is no longer possible.
+		 */
+		if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
+		    vd->vdev_top->vdev_islog)) {
+			vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
+			vd->vdev_leaf_zap = 0;
 		}
+
 		dmu_tx_commit(tx);
 		return;
 	}
 
-	if (smo->smo_object == 0) {
-		ASSERT(smo->smo_objsize == 0);
-		ASSERT(smo->smo_alloc == 0);
-		smo->smo_object = dmu_object_alloc(mos,
-		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
-		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
-		ASSERT(smo->smo_object != 0);
-		vdev_config_dirty(vd->vdev_top);
+	if (vd->vdev_dtl_sm == NULL) {
+		uint64_t new_object;
+
+		new_object = space_map_alloc(mos, tx);
+		VERIFY3U(new_object, !=, 0);
+
+		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
+		    0, -1ULL, 0, &vd->vdev_dtl_lock));
+		ASSERT(vd->vdev_dtl_sm != NULL);
 	}
 
-	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
+	bzero(&rtlock, sizeof (rtlock));
+	mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
 
-	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
-	    &smlock);
+	rtsync = range_tree_create(NULL, NULL, &rtlock);
 
-	mutex_enter(&smlock);
+	mutex_enter(&rtlock);
 
 	mutex_enter(&vd->vdev_dtl_lock);
-	space_map_walk(sm, space_map_add, &smsync);
+	range_tree_walk(rt, range_tree_add, rtsync);
 	mutex_exit(&vd->vdev_dtl_lock);
 
-	space_map_truncate(smo, mos, tx);
-	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
+	space_map_truncate(vd->vdev_dtl_sm, tx);
+	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
+	range_tree_vacate(rtsync, NULL, NULL);
 
-	space_map_destroy(&smsync);
+	range_tree_destroy(rtsync);
 
-	mutex_exit(&smlock);
-	mutex_destroy(&smlock);
+	mutex_exit(&rtlock);
+	mutex_destroy(&rtlock);
 
-	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
-	dmu_buf_will_dirty(db, tx);
-	ASSERT3U(db->db_size, >=, sizeof (*smo));
-	bcopy(smo, db->db_data, sizeof (*smo));
-	dmu_buf_rele(db, FTAG);
+	/*
+	 * If the object for the space map has changed, then dirty
+	 * the top level so that we update the config.
+	 */
+	if (object != space_map_object(vd->vdev_dtl_sm)) {
+		zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
+		    "new object %llu", txg, spa_name(spa), object,
+		    space_map_object(vd->vdev_dtl_sm));
+		vdev_config_dirty(vd->vdev_top);
+	}
 
 	dmu_tx_commit(tx);
+
+	mutex_enter(&vd->vdev_dtl_lock);
+	space_map_update(vd->vdev_dtl_sm);
+	mutex_exit(&vd->vdev_dtl_lock);
 }
 
 /*
@@ -1813,6 +2257,9 @@ vdev_dtl_required(vdev_t *vd)
 	vd->vdev_cant_read = cant_read;
 	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
 
+	if (!required && zio_injection_enabled)
+		required = !!zio_handle_device_injection(vd, NULL, ECHILD);
+
 	return (required);
 }
 
@@ -1828,14 +2275,11 @@ vdev_resilver_needed(vdev_t *vd, uint64_
 
 	if (vd->vdev_children == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
-		if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
+		if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
 		    vdev_writeable(vd)) {
-			space_seg_t *ss;
 
-			ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
-			thismin = ss->ss_start - 1;
-			ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
-			thismax = ss->ss_end;
+			thismin = vdev_dtl_min(vd);
+			thismax = vdev_dtl_max(vd);
 			needed = B_TRUE;
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
@@ -1902,14 +2346,14 @@ vdev_validate_aux(vdev_t *vd)
 	if (!vdev_readable(vd))
 		return (0);
 
-	if ((label = vdev_label_read_config(vd)) == NULL) {
+	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		return (-1);
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
-	    version > SPA_VERSION ||
+	    !SPA_VERSION_IS_SUPPORTED(version) ||
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
 	    guid != vd->vdev_guid ||
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
@@ -1935,30 +2379,53 @@ vdev_remove(vdev_t *vd, uint64_t txg)
 	dmu_tx_t *tx;
 
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
-
-	if (vd->vdev_dtl_smo.smo_object) {
-		ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0);
-		(void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
-		vd->vdev_dtl_smo.smo_object = 0;
-	}
+	ASSERT(vd == vd->vdev_top);
+	ASSERT3U(txg, ==, spa_syncing_txg(spa));
 
 	if (vd->vdev_ms != NULL) {
+		metaslab_group_t *mg = vd->vdev_mg;
+
+		metaslab_group_histogram_verify(mg);
+		metaslab_class_histogram_verify(mg->mg_class);
+
 		for (int m = 0; m < vd->vdev_ms_count; m++) {
 			metaslab_t *msp = vd->vdev_ms[m];
 
-			if (msp == NULL || msp->ms_smo.smo_object == 0)
+			if (msp == NULL || msp->ms_sm == NULL)
 				continue;
 
-			ASSERT3U(msp->ms_smo.smo_alloc, ==, 0);
-			(void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
-			msp->ms_smo.smo_object = 0;
+			mutex_enter(&msp->ms_lock);
+			/*
+			 * If the metaslab was not loaded when the vdev
+			 * was removed then the histogram accounting may
+			 * not be accurate. Update the histogram information
+			 * here so that we ensure that the metaslab group
+			 * and metaslab class are up-to-date.
+			 */
+			metaslab_group_histogram_remove(mg, msp);
+
+			VERIFY0(space_map_allocated(msp->ms_sm));
+			space_map_free(msp->ms_sm, tx);
+			space_map_close(msp->ms_sm);
+			msp->ms_sm = NULL;
+			mutex_exit(&msp->ms_lock);
 		}
+
+		metaslab_group_histogram_verify(mg);
+		metaslab_class_histogram_verify(mg->mg_class);
+		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+			ASSERT0(mg->mg_histogram[i]);
+
 	}
 
 	if (vd->vdev_ms_array) {
 		(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
 		vd->vdev_ms_array = 0;
-		vd->vdev_ms_shift = 0;
+	}
+
+	if (vd->vdev_islog && vd->vdev_top_zap != 0) {
+		vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
+		vd->vdev_top_zap = 0;
 	}
 	dmu_tx_commit(tx);
 }
@@ -1998,7 +2465,10 @@ vdev_sync(vdev_t *vd, uint64_t txg)
 		dmu_tx_commit(tx);
 	}
 
-	if (vd->vdev_removing)
+	/*
+	 * Remove the metadata associated with this vdev once it's empty.
+	 */
+	if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
 		vdev_remove(vd, txg);
 
 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
@@ -2025,7 +2495,7 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t
 int
 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
-	vdev_t *vd;
+	vdev_t *vd, *tvd;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
@@ -2035,6 +2505,8 @@ vdev_fault(spa_t *spa, uint64_t guid, vd
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
+	tvd = vd->vdev_top;
+
 	/*
 	 * We don't directly use the aux state here, but if we do a
 	 * vdev_reopen(), we need this value to be present to remember why we
@@ -2045,17 +2517,16 @@ vdev_fault(spa_t *spa, uint64_t guid, vd
 	/*
 	 * Faulted state takes precedence over degraded.
 	 */
+	vd->vdev_delayed_close = B_FALSE;
 	vd->vdev_faulted = 1ULL;
 	vd->vdev_degraded = 0ULL;
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
 
 	/*
-	 * If marking the vdev as faulted cause the top-level vdev to become
-	 * unavailable, then back off and simply mark the vdev as degraded
-	 * instead.
+	 * If this device has the only valid copy of the data, then
+	 * back off and simply mark the vdev as degraded instead.
 	 */
-	if (vdev_is_dead(vd->vdev_top) && !vd->vdev_islog &&
-	    vd->vdev_aux == NULL) {
+	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
 		vd->vdev_degraded = 1ULL;
 		vd->vdev_faulted = 0ULL;
 
@@ -2063,7 +2534,7 @@ vdev_fault(spa_t *spa, uint64_t guid, vd
 		 * If we reopen the device and it's not dead, only then do we
 		 * mark it degraded.
 		 */
-		vdev_reopen(vd);
+		vdev_reopen(tvd);
 
 		if (vdev_readable(vd))
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
@@ -2105,15 +2576,18 @@ vdev_degrade(spa_t *spa, uint64_t guid, 
 }
 
 /*
- * Online the given vdev.  If 'unspare' is set, it implies two things.  First,
- * any attached spare device should be detached when the device finishes
- * resilvering.  Second, the online should be treated like a 'test' online case,
- * so no FMA events are generated if the device fails to open.
+ * Online the given vdev.
+ *
+ * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
+ * spare device should be detached when the device finishes resilvering.
+ * Second, the online should be treated like a 'test' online case, so no FMA
+ * events are generated if the device fails to open.
  */
 int
 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 {
 	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
+	boolean_t postevent = B_FALSE;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
@@ -2123,6 +2597,10 @@ vdev_online(spa_t *spa, uint64_t guid, u
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
+	postevent = (vd->vdev_offline || vd->vdev_tmpoffline);
+
 	tvd = vd->vdev_top;
 	vd->vdev_offline = B_FALSE;
 	vd->vdev_tmpoffline = B_FALSE;
@@ -2158,6 +2636,10 @@ vdev_online(spa_t *spa, uint64_t guid, u
 			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
+
+	if (postevent)
+		spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE);
+
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
@@ -2223,7 +2705,7 @@ top:
 				(void) spa_vdev_state_exit(spa, vd, 0);
 				goto top;
 			}
-			ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0);
+			ASSERT0(tvd->vdev_stat.vs_alloc);
 		}
 
 		/*
@@ -2289,6 +2771,14 @@ vdev_clear(spa_t *spa, vdev_t *vd)
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_clear(spa, vd->vdev_child[c]);
 
+	if (vd == rvd) {
+		for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
+			vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);
+
+		for (int c = 0; c < spa->spa_spares.sav_count; c++)
+			vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
+	}
+
 	/*
 	 * If we're in the FAULTED state or have experienced failed I/O, then
 	 * clear the persistent state and attempt to reopen the device.  We
@@ -2305,15 +2795,15 @@ vdev_clear(spa_t *spa, vdev_t *vd)
 		 */
 		vd->vdev_forcefault = B_TRUE;
 
-		vd->vdev_faulted = vd->vdev_degraded = 0;
+		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
 		vd->vdev_cant_read = B_FALSE;
 		vd->vdev_cant_write = B_FALSE;
 
-		vdev_reopen(vd);
+		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
 
 		vd->vdev_forcefault = B_FALSE;
 
-		if (vd != rvd)
+		if (vd != rvd && vdev_writeable(vd->vdev_top))
 			vdev_state_dirty(vd->vdev_top);
 
 		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
@@ -2373,7 +2863,8 @@ vdev_allocatable(vdev_t *vd)
 	 * we're asking two separate questions about it.
 	 */
 	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
-	    !vd->vdev_cant_write && !vd->vdev_ishole && !vd->vdev_removing);
+	    !vd->vdev_cant_write && !vd->vdev_ishole &&
+	    vd->vdev_mg->mg_initialized);
 }
 
 boolean_t
@@ -2399,17 +2890,35 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 void
 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 {
-	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+	spa_t *spa = vd->vdev_spa;
+	vdev_t *rvd = spa->spa_root_vdev;
+	vdev_t *tvd = vd->vdev_top;
+
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	mutex_enter(&vd->vdev_stat_lock);
 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
-	vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
 	vs->vs_state = vd->vdev_state;
 	vs->vs_rsize = vdev_get_min_asize(vd);
 	if (vd->vdev_ops->vdev_op_leaf)
 		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
-	mutex_exit(&vd->vdev_stat_lock);
+	/*
+	 * Report expandable space on top-level, non-auxiliary devices only.
+	 * The expandable space is reported in terms of metaslab-sized units
+	 * since that determines how much space the pool can expand.
+	 */
+	if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) {
+		vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize,
+		    1ULL << tvd->vdev_ms_shift);
+	}
+	vs->vs_configured_ashift = vd->vdev_top != NULL
+	    ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
+	vs->vs_logical_ashift = vd->vdev_logical_ashift;
+	vs->vs_physical_ashift = vd->vdev_physical_ashift;
+	if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
+		vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
+	}
 
 	/*
 	 * If we're getting stats on the root vdev, aggregate the I/O counts
@@ -2420,15 +2929,14 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *
 			vdev_t *cvd = rvd->vdev_child[c];
 			vdev_stat_t *cvs = &cvd->vdev_stat;
 
-			mutex_enter(&vd->vdev_stat_lock);
 			for (int t = 0; t < ZIO_TYPES; t++) {
 				vs->vs_ops[t] += cvs->vs_ops[t];
 				vs->vs_bytes[t] += cvs->vs_bytes[t];
 			}
-			vs->vs_scrub_examined += cvs->vs_scrub_examined;
-			mutex_exit(&vd->vdev_stat_lock);
+			cvs->vs_scan_removing = cvd->vdev_removing;
 		}
 	}
+	mutex_exit(&vd->vdev_stat_lock);
 }
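
A worked example of the vs_esize rounding (a sketch; P2ALIGN is re-stated
with its usual sysmacros.h definition): with 100 GiB of slack beyond
vdev_asize and 8 GiB metaslabs (ms_shift = 33), only 96 GiB is reported as
expandable, because the pool can only grow by whole metaslabs:

	#include <stdio.h>
	#include <stdint.h>

	#define	P2ALIGN(x, align)	((x) & -(align))	/* round down to a multiple */

	int
	main(void)
	{
		uint64_t slack = 100ULL << 30;	/* max_asize - asize: 100 GiB */
		uint64_t ms = 1ULL << 33;	/* 8 GiB metaslab size */

		/* Prints: esize = 96 GiB */
		printf("esize = %llu GiB\n",
		    (unsigned long long)(P2ALIGN(slack, ms) >> 30));
		return (0);
	}
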
 
 void
@@ -2442,6 +2950,19 @@ vdev_clear_stats(vdev_t *vd)
 }
 
 void
+vdev_scan_stat_init(vdev_t *vd)
+{
+	vdev_stat_t *vs = &vd->vdev_stat;
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		vdev_scan_stat_init(vd->vdev_child[c]);
+
+	mutex_enter(&vd->vdev_stat_lock);
+	vs->vs_scan_processed = 0;
+	mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
 vdev_stat_update(zio_t *zio, uint64_t psize)
 {
 	spa_t *spa = zio->io_spa;
@@ -2485,8 +3006,17 @@ vdev_stat_update(zio_t *zio, uint64_t ps
 		mutex_enter(&vd->vdev_stat_lock);
 
 		if (flags & ZIO_FLAG_IO_REPAIR) {
-			if (flags & ZIO_FLAG_SCRUB_THREAD)
-				vs->vs_scrub_repaired += psize;
+			if (flags & ZIO_FLAG_SCAN_THREAD) {
+				dsl_scan_phys_t *scn_phys =
+				    &spa->spa_dsl_pool->dp_scan->scn_phys;
+				uint64_t *processed = &scn_phys->scn_processed;
+
+				/* XXX cleanup? */
+				if (vd->vdev_ops->vdev_op_leaf)
+					atomic_add_64(processed, psize);
+				vs->vs_scan_processed += psize;
+			}
+
 			if (flags & ZIO_FLAG_SELF_HEAL)
 				vs->vs_self_healed += psize;
 		}
@@ -2532,7 +3062,7 @@ vdev_stat_update(zio_t *zio, uint64_t ps
 
 	if (type == ZIO_TYPE_WRITE && txg != 0 &&
 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
-	    (flags & ZIO_FLAG_SCRUB_THREAD) ||
+	    (flags & ZIO_FLAG_SCAN_THREAD) ||
 	    spa->spa_claiming)) {
 		/*
 		 * This is either a normal write (not a repair), or it's
@@ -2551,7 +3081,7 @@ vdev_stat_update(zio_t *zio, uint64_t ps
 		 */
 		if (vd->vdev_ops->vdev_op_leaf) {
 			uint64_t commit_txg = txg;
-			if (flags & ZIO_FLAG_SCRUB_THREAD) {
+			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				ASSERT(spa_sync_pass(spa) == 1);
 				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
@@ -2572,35 +3102,6 @@ vdev_stat_update(zio_t *zio, uint64_t ps
 	}
 }
 
-void
-vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
-{
-	vdev_stat_t *vs = &vd->vdev_stat;
-
-	for (int c = 0; c < vd->vdev_children; c++)
-		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
-
-	mutex_enter(&vd->vdev_stat_lock);
-
-	if (type == POOL_SCRUB_NONE) {
-		/*
-		 * Update completion and end time.  Leave everything else alone
-		 * so we can report what happened during the previous scrub.
-		 */
-		vs->vs_scrub_complete = complete;
-		vs->vs_scrub_end = gethrestime_sec();
-	} else {
-		vs->vs_scrub_type = type;
-		vs->vs_scrub_complete = 0;
-		vs->vs_scrub_examined = 0;
-		vs->vs_scrub_repaired = 0;
-		vs->vs_scrub_start = gethrestime_sec();
-		vs->vs_scrub_end = 0;
-	}
-
-	mutex_exit(&vd->vdev_stat_lock);
-}
-
 /*
  * Update the in-core space usage stats for this vdev, its metaslab class,
  * and the root vdev.
@@ -2663,6 +3164,8 @@ vdev_config_dirty(vdev_t *vd)
 	vdev_t *rvd = spa->spa_root_vdev;
 	int c;
 
+	ASSERT(spa_writeable(spa));
+
 	/*
 	 * If this is an aux vdev (as with l2cache and spare devices), then we
 	 * update the vdev config manually and set the sync flag.
@@ -2700,7 +3203,7 @@ vdev_config_dirty(vdev_t *vd)
 		 * sketchy, but it will work.
 		 */
 		nvlist_free(aux[c]);
-		aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);
+		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
 
 		return;
 	}
@@ -2751,6 +3254,7 @@ vdev_state_dirty(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
+	ASSERT(spa_writeable(spa));
 	ASSERT(vd == vd->vdev_top);
 
 	/*
@@ -2865,28 +3369,18 @@ vdev_set_state(vdev_t *vd, boolean_t iso
 
 	/*
 	 * If we are setting the vdev state to anything but an open state, then
-	 * always close the underlying device.  Otherwise, we keep accessible
-	 * but invalid devices open forever.  We don't call vdev_close() itself,
-	 * because that implies some extra checks (offline, etc) that we don't
-	 * want here.  This is limited to leaf devices, because otherwise
-	 * closing the device will affect other children.
+	 * always close the underlying device unless the device has requested
+	 * a delayed close (i.e. we're about to remove or fault the device).
+	 * Otherwise, we keep accessible but invalid devices open forever.
+	 * We don't call vdev_close() itself, because that implies some extra
+	 * checks (offline, etc) that we don't want here.  This is limited to
+	 * leaf devices, because otherwise closing the device will affect other
+	 * children.
 	 */
-	if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
+	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
+	    vd->vdev_ops->vdev_op_leaf)
 		vd->vdev_ops->vdev_op_close(vd);
 
-	/*
-	 * If we have brought this vdev back into service, we need
-	 * to notify fmd so that it can gracefully repair any outstanding
-	 * cases due to a missing device.  We do this in all cases, even those
-	 * that probably don't correlate to a repaired fault.  This is sure to
-	 * catch all cases, and we let the zfs-retire agent sort it out.  If
-	 * this is a transient state it's OK, as the retire agent will
-	 * double-check the state of the vdev before repairing it.
-	 */
-	if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
-	    vd->vdev_prevstate != state)
-		zfs_post_state_change(spa, vd);
-
 	if (vd->vdev_removed &&
 	    state == VDEV_STATE_CANT_OPEN &&
 	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
@@ -2905,12 +3399,13 @@ vdev_set_state(vdev_t *vd, boolean_t iso
 		vd->vdev_removed = B_TRUE;
 	} else if (state == VDEV_STATE_CANT_OPEN) {
 		/*
-		 * If we fail to open a vdev during an import, we mark it as
-		 * "not available", which signifies that it was never there to
-		 * begin with.  Failure to open such a device is not considered
-		 * an error.
+		 * If we fail to open a vdev during an import or recovery, we
+		 * mark it as "not available", which signifies that it was
+		 * never there to begin with.  Failure to open such a device
+		 * is not considered an error.
 		 */
-		if (spa_load_state(spa) == SPA_LOAD_IMPORT &&
+		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
+		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
 		    vd->vdev_ops->vdev_op_leaf)
 			vd->vdev_not_present = 1;
 
@@ -2953,9 +3448,6 @@ vdev_set_state(vdev_t *vd, boolean_t iso
 			case VDEV_AUX_BAD_LABEL:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
 				break;
-			case VDEV_AUX_IO_FAILURE:
-				class = FM_EREPORT_ZFS_IO_FAILURE;
-				break;
 			default:
 				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
 			}
@@ -2969,69 +3461,99 @@ vdev_set_state(vdev_t *vd, boolean_t iso
 		vd->vdev_removed = B_FALSE;
 	}
 
+	/*
+	 * Notify the fmd of the state change.  Be verbose and post
+	 * notifications even for stuff that's not important; the fmd agent can
+	 * sort it out.  Don't emit state change events for non-leaf vdevs since
+	 * they can't change state on their own.  The FMD can check their state
+	 * if it wants to when it sees that a leaf vdev had a state change.
+	 */
+	if (vd->vdev_ops->vdev_op_leaf)
+		zfs_post_state_change(spa, vd);
+
 	if (!isopen && vd->vdev_parent)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
  * Check the vdev configuration to ensure that it's capable of supporting
- * a root pool. Currently, we do not support RAID-Z or partial configuration.
- * In addition, only a single top-level vdev is allowed and none of the leaves
- * can be wholedisks.
+ * a root pool. We do not support partial configuration.
+ * In addition, only a single top-level vdev is allowed.
+ *
+ * FreeBSD does not have the above limitations.
  */
 boolean_t
 vdev_is_bootable(vdev_t *vd)
 {
+#ifdef illumos
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		char *vdev_type = vd->vdev_ops->vdev_op_type;
 
 		if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
 		    vd->vdev_children > 1) {
 			return (B_FALSE);
-		} else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
-		    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
+		} else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
 			return (B_FALSE);
 		}
-	} else if (vd->vdev_wholedisk == 1) {
-		return (B_FALSE);
 	}
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (!vdev_is_bootable(vd->vdev_child[c]))
 			return (B_FALSE);
 	}
+#endif	/* illumos */
 	return (B_TRUE);
 }
 
 /*
  * Load the state from the original vdev tree (ovd) which
  * we've retrieved from the MOS config object. If the original
- * vdev was offline then we transfer that state to the device
- * in the current vdev tree (nvd).
+ * vdev was offline or faulted then we transfer that state to the
+ * device in the current vdev tree (nvd).
  */
 void
 vdev_load_log_state(vdev_t *nvd, vdev_t *ovd)
 {
 	spa_t *spa = nvd->vdev_spa;
 
+	ASSERT(nvd->vdev_top->vdev_islog);
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 	ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid);
 
 	for (int c = 0; c < nvd->vdev_children; c++)
 		vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]);
 
-	if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) {
+	if (nvd->vdev_ops->vdev_op_leaf) {
 		/*
-		 * It would be nice to call vdev_offline()
-		 * directly but the pool isn't fully loaded and
-		 * the txg threads have not been started yet.
+		 * Restore the persistent vdev state
 		 */
 		nvd->vdev_offline = ovd->vdev_offline;
-		vdev_reopen(nvd->vdev_top);
+		nvd->vdev_faulted = ovd->vdev_faulted;
+		nvd->vdev_degraded = ovd->vdev_degraded;
+		nvd->vdev_removed = ovd->vdev_removed;
 	}
 }
 
 /*
+ * Determine if a log device has valid content.  If the vdev was
+ * removed or faulted in the MOS config then we know that
+ * the content on the log device has already been written to the pool.
+ */
+boolean_t
+vdev_log_state_valid(vdev_t *vd)
+{
+	if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
+	    !vd->vdev_removed)
+		return (B_TRUE);
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		if (vdev_log_state_valid(vd->vdev_child[c]))
+			return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
  * Expand a vdev if possible.
  */
 void
@@ -3064,3 +3586,50 @@ vdev_split(vdev_t *vd)
 	}
 	vdev_propagate_state(cvd);
 }
+
+void
+vdev_deadman(vdev_t *vd)
+{
+	for (int c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		vdev_deadman(cvd);
+	}
+
+	if (vd->vdev_ops->vdev_op_leaf) {
+		vdev_queue_t *vq = &vd->vdev_queue;
+
+		mutex_enter(&vq->vq_lock);
+		if (avl_numnodes(&vq->vq_active_tree) > 0) {
+			spa_t *spa = vd->vdev_spa;
+			zio_t *fio;
+			uint64_t delta;
+
+			/*
+			 * Look at the head of all the pending queues;
+			 * if any I/O has been outstanding for longer than
+			 * spa_deadman_synctime, we panic the system.
+			 */
+			fio = avl_first(&vq->vq_active_tree);
+			delta = gethrtime() - fio->io_timestamp;
+			if (delta > spa_deadman_synctime(spa)) {
+				zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
+				    "delta %lluns, last io %lluns",
+				    fio->io_timestamp, delta,
+				    vq->vq_io_complete_ts);
+
+				printf("SLOW IO: zio timestamp %lluns, "
+				    "delta %luns, last io %lluns",
+				    fio->io_timestamp, delta,
+				    vq->vq_io_complete_ts);
+
+				fm_panic("I/O to pool '%s' appears to be "
+				    "hung on vdev guid %llu at '%s'.",
+				    spa_name(spa),
+				    (long long unsigned int) vd->vdev_guid,
+				    vd->vdev_path);
+			}
+		}
+		mutex_exit(&vq->vq_lock);
+	}
+}
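
NOTE: the new vdev_deadman() above is a depth-first walk of the vdev tree
that panics once the oldest active I/O on any leaf queue has been
outstanding longer than the pool's deadman threshold.  A simplified,
self-contained sketch of that pattern with hypothetical types (the real
code takes vq_lock and uses gethrtime()):

	#include <stdint.h>
	#include <stdio.h>

	struct node {
		struct node	**children;
		int		nchildren;
		uint64_t	oldest_io_start;	/* 0 if queue is empty */
	};

	/* Walk the tree depth-first and flag any node whose oldest
	 * in-flight I/O has been outstanding longer than threshold ns. */
	static void
	deadman_check(struct node *n, uint64_t now, uint64_t threshold)
	{
		for (int c = 0; c < n->nchildren; c++)
			deadman_check(n->children[c], now, threshold);

		if (n->oldest_io_start != 0 &&
		    now - n->oldest_io_start > threshold)
			printf("SLOW IO: outstanding for %llu ns\n",
			    (unsigned long long)(now - n->oldest_io_start));
	}

	int
	main(void)
	{
		struct node leaf = { NULL, 0, 1000 };

		deadman_check(&leaf, 2000000001000ULL, 1000000000ULL);
		return (0);
	}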
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 vdev_cache.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c	27 Feb 2010 22:31:10 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c	10 Oct 2016 11:09:56 -0000
@@ -22,6 +22,9 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
@@ -71,13 +74,29 @@
  * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
  * track buffer).  At most zfs_vdev_cache_size bytes will be kept in each
  * vdev's vdev_cache.
+ *
+ * TODO: Note that with the current ZFS code, it turns out that the
+ * vdev cache is not helpful, and in some cases actually harmful.  It
+ * is better if we disable this.  Once some time has passed, we should
+ * actually remove this to simplify the code.  For now we just disable
+ * it by setting the zfs_vdev_cache_size to zero.  Note that Solaris 11
+ * has made these same changes.
  */
 int zfs_vdev_cache_max = 1<<14;			/* 16KB */
-int zfs_vdev_cache_size = 10ULL << 20;		/* 10MB */
+int zfs_vdev_cache_size = 0;
 int zfs_vdev_cache_bshift = 16;
 
 #define	VCBS (1 << zfs_vdev_cache_bshift)	/* 64KB */
 
+SYSCTL_DECL(_vfs_zfs_vdev);
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN,
+    &zfs_vdev_cache_max, 0, "Maximum I/O request size that increase read size");
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN,
+    &zfs_vdev_cache_size, 0, "Size of VDEV cache");
+SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, bshift, CTLFLAG_RDTUN,
+    &zfs_vdev_cache_bshift, 0, "Turn too small requests into 1 << this value");
+
 kstat_t	*vdc_ksp = NULL;
 
 typedef struct vdc_stats {
@@ -92,7 +111,7 @@ static vdc_stats_t vdc_stats = {
 	{ "misses",		KSTAT_DATA_UINT64 }
 };
 
-#define	VDCSTAT_BUMP(stat)	atomic_add_64(&vdc_stats.stat.value.ui64, 1);
+#define	VDCSTAT_BUMP(stat)	atomic_inc_64(&vdc_stats.stat.value.ui64);
 
 static int
 vdev_cache_offset_compare(const void *a1, const void *a2)
@@ -228,7 +247,8 @@ vdev_cache_fill(zio_t *fio)
 	 * any reads that were queued up before the missed update are still
 	 * valid, so we can satisfy them from this line before we evict it.
 	 */
-	while ((pio = zio_walk_parents(fio)) != NULL)
+	zio_link_t *zl = NULL;
+	while ((pio = zio_walk_parents(fio, &zl)) != NULL)
 		vdev_cache_hit(vc, ve, pio);
 
 	if (fio->io_error || ve->ve_missed_update)
@@ -238,9 +258,9 @@ vdev_cache_fill(zio_t *fio)
 }
 
 /*
- * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
+ * Read data from the cache.  Returns B_TRUE on cache hit, B_FALSE on miss.
  */
-int
+boolean_t
 vdev_cache_read(zio_t *zio)
 {
 	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
@@ -252,16 +272,16 @@ vdev_cache_read(zio_t *zio)
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
 
 	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
-		return (EINVAL);
+		return (B_FALSE);
 
 	if (zio->io_size > zfs_vdev_cache_max)
-		return (EOVERFLOW);
+		return (B_FALSE);
 
 	/*
 	 * If the I/O straddles two or more cache blocks, don't cache it.
 	 */
 	if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
-		return (EXDEV);
+		return (B_FALSE);
 
 	ASSERT(cache_phase + zio->io_size <= VCBS);
 
@@ -273,7 +293,7 @@ vdev_cache_read(zio_t *zio)
 	if (ve != NULL) {
 		if (ve->ve_missed_update) {
 			mutex_exit(&vc->vc_lock);
-			return (ESTALE);
+			return (B_FALSE);
 		}
 
 		if ((fio = ve->ve_fill_io) != NULL) {
@@ -281,7 +301,7 @@ vdev_cache_read(zio_t *zio)
 			zio_add_child(zio, fio);
 			mutex_exit(&vc->vc_lock);
 			VDCSTAT_BUMP(vdc_stat_delegations);
-			return (0);
+			return (B_TRUE);
 		}
 
 		vdev_cache_hit(vc, ve, zio);
@@ -289,18 +309,18 @@ vdev_cache_read(zio_t *zio)
 
 		mutex_exit(&vc->vc_lock);
 		VDCSTAT_BUMP(vdc_stat_hits);
-		return (0);
+		return (B_TRUE);
 	}
 
 	ve = vdev_cache_allocate(zio);
 
 	if (ve == NULL) {
 		mutex_exit(&vc->vc_lock);
-		return (ENOMEM);
+		return (B_FALSE);
 	}
 
 	fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
-	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
+	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
 	    ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
 
 	ve->ve_fill_io = fio;
@@ -311,7 +331,7 @@ vdev_cache_read(zio_t *zio)
 	zio_nowait(fio);
 	VDCSTAT_BUMP(vdc_stat_misses);
 
-	return (0);
+	return (B_TRUE);
 }
 
 /*
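
NOTE: converting vdev_cache_read() from errno returns to boolean_t matches
how its single caller actually uses it -- the old EINVAL/EOVERFLOW/EXDEV/
ESTALE/ENOMEM values were never distinguished.  A hedged sketch of the
call-site shape, with stand-in types:

	#include <stdbool.h>
	#include <stdio.h>

	struct zio { int handled_by_cache; };	/* hypothetical stand-in */

	static bool
	cache_read(struct zio *zio)
	{
		return (zio->handled_by_cache != 0);
	}

	/* The caller only branches on handled/not handled, so a boolean
	 * is the honest return type. */
	static void
	io_start(struct zio *zio)
	{
		if (cache_read(zio)) {
			printf("satisfied from or delegated to the cache\n");
			return;
		}
		printf("issued to disk\n");
	}

	int
	main(void)
	{
		struct zio hit = { 1 }, miss = { 0 };

		io_start(&hit);
		io_start(&miss);
		return (0);
	}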
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c,v
retrieving revision 1.6
diff -u -p -r1.6 vdev_disk.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c	8 Dec 2015 20:56:21 -0000	1.6
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c	12 May 2017 02:10:16 -0000
@@ -1,4 +1,3 @@
-
 /*
  * CDDL HEADER START
  *
@@ -20,8 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013 Joyent, Inc.  All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -37,6 +38,15 @@
 #include <sys/dkio.h>
 #include <sys/workqueue.h>
 
+#ifdef __NetBSD__
+static int
+geterror(struct buf *bp)
+{
+
+	return (bp->b_error);
+}
+#endif
+
 /*
  * Virtual device vector for disks.
  */
@@ -44,6 +54,70 @@
 static void	vdev_disk_io_intr(buf_t *);
 
 static void
+vdev_disk_alloc(vdev_t *vd)
+{
+	vdev_disk_t *dvd;
+
+	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+#ifdef illumos
+	/*
+	 * Create the LDI event callback list.
+	 */
+	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
+	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
+#endif
+}
+
+
+static void
+vdev_disk_free(vdev_t *vd)
+{
+	vdev_disk_t *dvd = vd->vdev_tsd;
+#ifdef illumos
+	vdev_disk_ldi_cb_t *lcb;
+#endif
+
+	if (dvd == NULL)
+		return;
+
+#ifdef illumos
+	/*
+	 * We have already closed the LDI handle. Clean up the LDI event
+	 * callbacks and free vd->vdev_tsd.
+	 */
+	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
+		list_remove(&dvd->vd_ldi_cbs, lcb);
+		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
+		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
+	}
+	list_destroy(&dvd->vd_ldi_cbs);
+#endif
+	kmem_free(dvd, sizeof (vdev_disk_t));
+	vd->vdev_tsd = NULL;
+}
+
+
+/*
+ * It's not clear what these hold/rele functions are supposed to do.
+ */
+static void
+vdev_disk_hold(vdev_t *vd)
+{
+
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+}
+
+static void
+vdev_disk_rele(vdev_t *vd)
+{
+
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+}
+
+static void
 vdev_disk_flush(struct work *work, void *cookie)
 {
 	vdev_disk_t *dvd;
@@ -54,20 +128,20 @@ vdev_disk_flush(struct work *work, void 
 	bp = (struct buf *)work;
 	vp = bp->b_vp;
 	dvd = cookie;
-	
-	KASSERT(vp == dvd->vd_vn);
 
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	KASSERT(vp == dvd->vd_vp);
+
 	cmd = 1;
-	error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd, FREAD|FWRITE,
-	    kauth_cred_get());
-	VOP_UNLOCK(vp);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd, FREAD|FWRITE, kcred);
+	VOP_UNLOCK(vp, 0);
 	bp->b_error = error;
 	vdev_disk_io_intr(bp);
 }
 
 static int
-vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *ashift, uint64_t *pashift)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_disk_t *dvd;
@@ -80,7 +154,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psi
 	 */
 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	/*
@@ -90,10 +164,16 @@ vdev_disk_open(vdev_t *vd, uint64_t *psi
 	if (vd->vdev_tsd != NULL) {
 		ASSERT(vd->vdev_reopening);
 		dvd = vd->vdev_tsd;
+		vp = dvd->vd_vp;
+		KASSERT(vp != NULL);
 		goto skip_open;
 	}
 
-	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+	/*
+	 * Create vd->vdev_tsd.
+	 */
+	vdev_disk_alloc(vd);
+	dvd = vd->vdev_tsd;
 
 	/*
 	 * When opening a disk device, we want to preserve the user's original
@@ -113,6 +193,13 @@ vdev_disk_open(vdev_t *vd, uint64_t *psi
 	 */
 	if (vd->vdev_devid != NULL) {
 		/* XXXNETBSD wedges */
+#ifdef illumos
+		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
+		    &dvd->vd_minor) != 0) {
+			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+			return (SET_ERROR(EINVAL));
+		}
+#endif
 	}
 
 	error = EINVAL;		/* presume failure */
@@ -121,74 +208,137 @@ vdev_disk_open(vdev_t *vd, uint64_t *psi
 	    &vp, CRCREAT, 0);
 	if (error != 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
-		return error;
+		return (SET_ERROR(error));
 	}
 	if (vp->v_type != VBLK) {
 		vrele(vp);
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
-		return EINVAL;
+		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * XXXNETBSD Compare the devid to the stored value.
 	 */
 
+	/*
+	 * Create a workqueue to process cache-flushes concurrently.
+	 */
+	error = workqueue_create(&dvd->vd_wq, "vdevsync",
+	    vdev_disk_flush, dvd, PRI_NONE, IPL_NONE, WQ_MPSAFE);
+	if (error != 0) {
+		vrele(vp);
+		return (SET_ERROR(error));
+	}
+
+	dvd->vd_vp = vp;
+
 skip_open:
 	/*
 	 * Determine the actual size of the device.
 	 * XXXNETBSD wedges.
 	 */
-	error = VOP_IOCTL(vp, DIOCGPARTINFO, &pinfo, FREAD|FWRITE,
-	    kauth_cred_get());
+	error = VOP_IOCTL(vp, DIOCGPARTINFO, &pinfo, FREAD|FWRITE, kcred);
 	if (error != 0) {
-		vrele(vp);
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
-		return error;
+		return (SET_ERROR(error));
 	}
 	*psize = pinfo.pi_size * pinfo.pi_secsize;
+	*max_psize = *psize;
+
 	*ashift = highbit(MAX(pinfo.pi_secsize, SPA_MINBLOCKSIZE)) - 1;
+	*pashift = *ashift;
 	vd->vdev_wholedisk = (pinfo.pi_offset == 0); /* XXXNETBSD */
 
 	/*
-	 * Create a workqueue to process cache-flushes concurrently.
-	 */
-	error = workqueue_create(&dvd->vd_wq, "vdevsync",
-	    vdev_disk_flush, dvd, PRI_NONE, IPL_NONE, WQ_MPSAFE);
-	if (error != 0) {
-		vrele(vp);
-		return error;
-	}
-
-	/*
 	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
 	 * try again.
 	 */
 	vd->vdev_nowritecache = B_FALSE;
 
-	dvd->vd_vn = vp;
-	return 0;
+	return (0);
 }
 
 static void
 vdev_disk_close(vdev_t *vd)
 {
 	vdev_disk_t *dvd = vd->vdev_tsd;
-	vnode_t *vp;
 
 	if (vd->vdev_reopening || dvd == NULL)
 		return;
 
-	if ((vp = dvd->vd_vn) != NULL) {
-/* XXX NetBSD Sometimes we deadlock on this why ? */
-//		vprint("vnode close info", vp);
-		vn_close(vp, FREAD|FWRITE, kauth_cred_get());
-//		vprint("vnode close info", vp);
-/* XXX is this needed ?		vrele(vp); */
+#ifdef illumos
+	if (dvd->vd_minor != NULL) {
+		ddi_devid_str_free(dvd->vd_minor);
+		dvd->vd_minor = NULL;
+	}
+
+	if (dvd->vd_devid != NULL) {
+		ddi_devid_free(dvd->vd_devid);
+		dvd->vd_devid = NULL;
+	}
+
+	if (dvd->vd_lh != NULL) {
+		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
+		dvd->vd_lh = NULL;
+	}
+#endif
+
+#ifdef __NetBSD__
+	if (dvd->vd_vp != NULL) {
+		vn_close(dvd->vd_vp, FREAD|FWRITE, kcred);
+		dvd->vd_vp = NULL;
+	}
+	if (dvd->vd_wq != NULL) {
 		workqueue_destroy(dvd->vd_wq);
+		dvd->vd_wq = NULL;
 	}
+#endif
 
-	kmem_free(dvd, sizeof (vdev_disk_t));
-	vd->vdev_tsd = NULL;
+	vd->vdev_delayed_close = B_FALSE;
+#ifdef illumos
+	/*
+	 * If we closed the LDI handle due to an offline notify from LDI,
+	 * don't free vd->vdev_tsd or unregister the callbacks here;
+	 * the offline finalize callback or a reopen will take care of it.
+	 */
+	if (dvd->vd_ldi_offline)
+		return;
+#endif
+
+	vdev_disk_free(vd);
+}
+
+int
+vdev_disk_physio(vdev_t *vd, caddr_t data,
+    size_t size, uint64_t offset, int flags, boolean_t isdump)
+{
+#ifdef illumos
+	vdev_disk_t *dvd = vd->vdev_tsd;
+
+	/*
+	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+	 * Nothing to be done here but return failure.
+	 */
+	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
+		return (EIO);
+
+	ASSERT(vd->vdev_ops == &vdev_disk_ops);
+
+	/*
+	 * If in the context of an active crash dump, use the ldi_dump(9F)
+	 * call instead of ldi_strategy(9F) as usual.
+	 */
+	if (isdump) {
+		ASSERT3P(dvd, !=, NULL);
+		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
+		    lbtodb(size)));
+	}
+
+	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
+#endif
+#ifdef __NetBSD__
+	return (EIO);
+#endif
 }
 
 static void
@@ -201,19 +351,13 @@ vdev_disk_io_intr(buf_t *bp)
 	 * Rather than teach the rest of the stack about other error
 	 * possibilities (EFAULT, etc), we normalize the error value here.
 	 */
-	if (bp->b_error == 0) {
-		if (bp->b_resid != 0) {
-			zio->io_error = EIO;
-		} else {
-			zio->io_error = 0;
-		}
-	} else {
-		zio->io_error = EIO;
-	}
+	zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
 
+	if (zio->io_error == 0 && bp->b_resid != 0)
+		zio->io_error = SET_ERROR(EIO);
 
 	putiobuf(bp);
-	zio_interrupt(zio);
+	zio_delay_interrupt(zio);
 }
 
 static void
@@ -237,7 +381,7 @@ vdev_disk_ioctl_done(void *zio_arg, int 
 	zio_interrupt(zio);
 }
 
-static int
+static void
 vdev_disk_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
@@ -246,12 +390,32 @@ vdev_disk_io_start(zio_t *zio)
 	buf_t *bp, *nbp;
 	int error, size, off, resid;
 
-	vp = dvd->vd_vn;
+	/*
+	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+	 * Nothing to be done here but return failure.
+	 */
+#ifdef illumos
+	if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
+		zio->io_error = SET_ERROR(ENXIO);
+		zio_interrupt(zio);
+		return;
+	}
+#endif
+#ifdef __NetBSD__
+	if (dvd == NULL) {
+		zio->io_error = SET_ERROR(ENXIO);
+		zio_interrupt(zio);
+		return;
+	}
+	vp = dvd->vd_vp;
+#endif
+
 	if (zio->io_type == ZIO_TYPE_IOCTL) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
-			zio->io_error = ENXIO;
-			return (ZIO_PIPELINE_CONTINUE);
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return;
 		}
 
 		switch (zio->io_cmd) {
@@ -268,15 +432,15 @@ vdev_disk_io_start(zio_t *zio)
 			bp = getiobuf(vp, true);
 			bp->b_private = zio;
 			workqueue_enqueue(dvd->vd_wq, &bp->b_work, NULL);
-			return (ZIO_PIPELINE_STOP);
-			break;
+			return;
 
 		default:
-			zio->io_error = ENOTSUP;
+			zio->io_error = SET_ERROR(ENOTSUP);
 			break;
 		}
 
-		return (ZIO_PIPELINE_CONTINUE);
+		zio_execute(zio);
+		return;
 	}
 
 	bp = getiobuf(vp, true);
@@ -294,7 +458,7 @@ vdev_disk_io_start(zio_t *zio)
 		vp->v_numoutput++;
 		mutex_exit(vp->v_interlock);
 	}
-	
+
 	if (bp->b_bcount <= MAXPHYS) {
 		/* We can do this I/O in one pass. */
 		(void)VOP_STRATEGY(vp, bp);
@@ -316,15 +480,40 @@ vdev_disk_io_start(zio_t *zio)
 			off += size;
 		}
 	}
-	
-	return (ZIO_PIPELINE_STOP);
 }
 
 static void
 vdev_disk_io_done(zio_t *zio)
 {
+#ifdef illumos
+	vdev_t *vd = zio->io_vd;
 
-	/* NetBSD: nothing */
+	/*
+	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
+	 * the device has been removed.  If this is the case, then we trigger an
+	 * asynchronous removal of the device. Otherwise, probe the device and
+	 * make sure it's still accessible.
+	 */
+	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
+		vdev_disk_t *dvd = vd->vdev_tsd;
+		int state = DKIO_NONE;
+
+		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
+		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
+			/*
+			 * We post the resource as soon as possible, instead of
+			 * when the async removal actually happens, because the
+			 * DE is using this information to discard previous I/O
+			 * errors.
+			 */
+			zfs_post_remove(zio->io_spa, vd);
+			vd->vdev_remove_wanted = B_TRUE;
+			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+		} else if (!vd->vdev_delayed_close) {
+			vd->vdev_delayed_close = B_TRUE;
+		}
+	}
+#endif
 }
 
 vdev_ops_t vdev_disk_ops = {
@@ -334,6 +523,8 @@ vdev_ops_t vdev_disk_ops = {
 	vdev_disk_io_start,
 	vdev_disk_io_done,
 	NULL,
+	vdev_disk_hold,
+	vdev_disk_rele,
 	VDEV_TYPE_DISK,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
@@ -345,6 +536,78 @@ vdev_ops_t vdev_disk_ops = {
 int
 vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
 {
+#ifdef __NetBSD__
+	return (ENOTSUP);
+#else
+	ldi_handle_t vd_lh;
+	vdev_label_t *label;
+	uint64_t s, size;
+	int l;
+	ddi_devid_t tmpdevid;
+	int error = -1;
+	char *minor_name;
+
+	/*
+	 * Read the device label and build the nvlist.
+	 */
+	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
+	    &minor_name) == 0) {
+		error = ldi_open_by_devid(tmpdevid, minor_name,
+		    FREAD, kcred, &vd_lh, zfs_li);
+		ddi_devid_free(tmpdevid);
+		ddi_devid_str_free(minor_name);
+	}
+
+	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
+	    zfs_li)))
+		return (error);
+
+	if (ldi_get_size(vd_lh, &s)) {
+		(void) ldi_close(vd_lh, FREAD, kcred);
+		return (SET_ERROR(EIO));
+	}
+
+	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
+	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
+
+	*config = NULL;
+	for (l = 0; l < VDEV_LABELS; l++) {
+		uint64_t offset, state, txg = 0;
+
+		/* read vdev label */
+		offset = vdev_label_offset(size, l, 0);
+		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
+		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
+			continue;
+
+		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
+		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
+			*config = NULL;
+			continue;
+		}
+
+		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
+		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
+			nvlist_free(*config);
+			*config = NULL;
+			continue;
+		}
+
+		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
+		    &txg) != 0 || txg == 0) {
+			nvlist_free(*config);
+			*config = NULL;
+			continue;
+		}
+
+		break;
+	}
+
+	kmem_free(label, sizeof (vdev_label_t));
+	(void) ldi_close(vd_lh, FREAD, kcred);
+	if (*config == NULL)
+		error = SET_ERROR(EIDRM);
 
-	return EOPNOTSUPP;
+	return (error);
+#endif
 }
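
NOTE: a change that recurs throughout this file (and the rest of the port)
is returning SET_ERROR(errno) instead of a bare errno, so that every error
return has a traceable origin.  A minimal sketch of the idea, assuming a
stderr hook where the real macro (sys/sdt.h) fires a DTrace probe instead:

	#include <errno.h>
	#include <stdio.h>

	/* Hedged sketch: tag each error return with its origin. */
	#define	SET_ERROR(err) \
		(fprintf(stderr, "error %d set at %s:%d\n", (err), \
		    __FILE__, __LINE__), (err))

	static int
	open_thing(const char *path)
	{
		/* Mirrors the absolute-path check in vdev_disk_open(). */
		if (path == NULL || path[0] != '/')
			return (SET_ERROR(EINVAL));
		return (0);
	}

	int
	main(void)
	{
		return (open_thing("not-absolute") == EINVAL ? 0 : 1);
	}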
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c,v
retrieving revision 1.2
diff -u -p -r1.2 vdev_file.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c	23 Sep 2013 20:41:19 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c	5 May 2017 18:02:28 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -35,8 +35,21 @@
  * Virtual device vector for files.
  */
 
+static void
+vdev_file_hold(vdev_t *vd)
+{
+	ASSERT(vd->vdev_path != NULL);
+}
+
+static void
+vdev_file_rele(vdev_t *vd)
+{
+	ASSERT(vd->vdev_path != NULL);
+}
+
 static int
-vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_file_t *vf;
 	vnode_t *vp;
@@ -48,7 +61,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psi
 	 */
 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	/*
@@ -58,6 +71,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psi
 	if (vd->vdev_tsd != NULL) {
 		ASSERT(vd->vdev_reopening);
 		vf = vd->vdev_tsd;
+		vp = vf->vf_vnode;
 		goto skip_open;
 	}
 
@@ -75,6 +89,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psi
 
 	if (error) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
+		vd->vdev_tsd = NULL;
 		return (error);
 	}
 
@@ -85,24 +101,40 @@ vdev_file_open(vdev_t *vd, uint64_t *psi
 	 * Make sure it's a regular file.
 	 */
 	if (vp->v_type != VREG) {
+		(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
+		kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
+		vd->vdev_tsd = NULL;
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
-		return (ENODEV);
+		return (SET_ERROR(ENODEV));
 	}
-#endif
+#endif	/* _KERNEL */
 
 skip_open:
 	/*
 	 * Determine the physical size of the file.
 	 */
 	vattr.va_mask = AT_SIZE;
-	error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
+#ifdef __FreeBSD__
+	vn_lock(vp, LK_SHARED | LK_RETRY);
+	error = VOP_GETATTR(vp, &vattr, kcred);
+	VOP_UNLOCK(vp, 0);
+#endif
+#ifdef __NetBSD__
+	error = VOP_GETATTR(vp, &vattr, 0, kcred, NULL);
+#endif
 	if (error) {
+		(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		kmem_free(vd->vdev_tsd, sizeof (vdev_file_t));
+		vd->vdev_tsd = NULL;
 		return (error);
 	}
 
-	*psize = vattr.va_size;
-	*ashift = SPA_MINBLOCKSHIFT;
+	vd->vdev_notrim = B_TRUE;
+
+	*max_psize = *psize = vattr.va_size;
+	*logical_ashift = SPA_MINBLOCKSHIFT;
+	*physical_ashift = SPA_MINBLOCKSHIFT;
 
 	return (0);
 }
@@ -116,53 +148,62 @@ vdev_file_close(vdev_t *vd)
 		return;
 
 	if (vf->vf_vnode != NULL) {
-		(void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
 		(void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
 		    kcred, NULL);
-		VN_RELE(vf->vf_vnode);
 	}
 
+	vd->vdev_delayed_close = B_FALSE;
 	kmem_free(vf, sizeof (vdev_file_t));
 	vd->vdev_tsd = NULL;
 }
 
-static int
+static void
 vdev_file_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
-	vdev_file_t *vf = vd->vdev_tsd;
-	ssize_t resid = 0;
+	vdev_file_t *vf;
+	vnode_t *vp;
+	ssize_t resid;
 
-	if (zio->io_type == ZIO_TYPE_IOCTL) {
-		/* XXPOLICY */
-		if (!vdev_readable(vd)) {
-			zio->io_error = ENXIO;
-			return (ZIO_PIPELINE_CONTINUE);
-		}
+	if (!vdev_readable(vd)) {
+		zio->io_error = SET_ERROR(ENXIO);
+		zio_interrupt(zio);
+		return;
+	}
+
+	vf = vd->vdev_tsd;
+	vp = vf->vf_vnode;
 
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
 		switch (zio->io_cmd) {
 		case DKIOCFLUSHWRITECACHE:
-			zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
+			zio->io_error = VOP_FSYNC(vp, FSYNC | FDSYNC,
 			    kcred, NULL);
 			break;
 		default:
-			zio->io_error = ENOTSUP;
+			zio->io_error = SET_ERROR(ENOTSUP);
 		}
 
-		return (ZIO_PIPELINE_CONTINUE);
+		zio_execute(zio);
+		return;
 	}
 
+	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+	zio->io_target_timestamp = zio_handle_io_delay(zio);
+
 	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
-	    UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
-	    zio->io_size, zio->io_offset, UIO_SYSSPACE,
-	    0, RLIM64_INFINITY, kcred, &resid);
+	    UIO_READ : UIO_WRITE, vp, zio->io_data, zio->io_size,
+	    zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 
 	if (resid != 0 && zio->io_error == 0)
 		zio->io_error = ENOSPC;
 
-	zio_interrupt(zio);
+	zio_delay_interrupt(zio);
 
-	return (ZIO_PIPELINE_STOP);
+#ifdef illumos
+	VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp,
+	    TQ_SLEEP), !=, 0);
+#endif
 }
 
 /* ARGSUSED */
@@ -178,6 +219,8 @@ vdev_ops_t vdev_file_ops = {
 	vdev_file_io_start,
 	vdev_file_io_done,
 	NULL,
+	vdev_file_hold,
+	vdev_file_rele,
 	VDEV_TYPE_FILE,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
@@ -194,6 +237,8 @@ vdev_ops_t vdev_disk_ops = {
 	vdev_file_io_start,
 	vdev_file_io_done,
 	NULL,
+	vdev_file_hold,
+	vdev_file_rele,
 	VDEV_TYPE_DISK,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
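
NOTE: vdev_file_io_start() above maps a short but error-free vn_rdwr()
transfer to ENOSPC, because ZFS expects reads and writes to complete in
full.  A small self-contained sketch of that normalization:

	#include <assert.h>
	#include <errno.h>
	#include <stddef.h>

	/* A short transfer with no reported error is still a failure. */
	static int
	normalize_rw_error(int error, size_t resid)
	{
		if (error == 0 && resid != 0)
			return (ENOSPC);
		return (error);
	}

	int
	main(void)
	{
		assert(normalize_rw_error(0, 0) == 0);		/* full I/O */
		assert(normalize_rw_error(0, 512) == ENOSPC);	/* short I/O */
		assert(normalize_rw_error(EIO, 512) == EIO);	/* error wins */
		return (0);
	}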
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_geom.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_geom.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_geom.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_geom.c	3 Dec 2016 17:03:49 -0000
@@ -0,0 +1,1077 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/disk.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <geom/geom.h>
+#include <geom/geom_int.h>
+
+/*
+ * Virtual device vector for GEOM.
+ */
+
+static g_attrchanged_t vdev_geom_attrchanged;
+struct g_class zfs_vdev_class = {
+	.name = "ZFS::VDEV",
+	.version = G_VERSION,
+	.attrchanged = vdev_geom_attrchanged,
+};
+
+DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+/* Don't send BIO_FLUSH. */
+static int vdev_geom_bio_flush_disable;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
+    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
+/* Don't send BIO_DELETE. */
+static int vdev_geom_bio_delete_disable;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
+    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
+
+/* Declare local functions */
+static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
+
+/*
+ * Thread local storage used to indicate when a thread is probing geoms
+ * for their guids.  If NULL, this thread is not tasting geoms.  If non NULL,
+ * it is looking for a replacement for the vdev_t* that is its value.
+ */
+uint_t zfs_geom_probe_vdev_key;
+
+static void
+vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
+{
+	int error;
+	uint16_t rate;
+
+	error = g_getattr("GEOM::rotation_rate", cp, &rate);
+	if (error == 0)
+		vd->vdev_rotation_rate = rate;
+	else
+		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
+}
+
+static void
+vdev_geom_set_physpath(struct g_consumer *cp, boolean_t do_null_update)
+{
+	boolean_t needs_update = B_FALSE;
+	vdev_t *vd;
+	char *physpath;
+	int error, physpath_len;
+
+	if (g_access(cp, 1, 0, 0) != 0)
+		return;
+
+	vd = cp->private;
+	physpath_len = MAXPATHLEN;
+	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
+	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
+	g_access(cp, -1, 0, 0);
+	if (error == 0) {
+		char *old_physpath;
+
+		/* g_topology lock ensures that vdev has not been closed */
+		g_topology_assert();
+		old_physpath = vd->vdev_physpath;
+		vd->vdev_physpath = spa_strdup(physpath);
+
+		if (old_physpath != NULL) {
+			needs_update = (strcmp(old_physpath,
+						vd->vdev_physpath) != 0);
+			spa_strfree(old_physpath);
+		} else
+			needs_update = do_null_update;
+	}
+	g_free(physpath);
+
+	/*
+	 * If the physical path changed, update the config.
+	 * Only request an update for previously unset physpaths if
+	 * requested by the caller.
+	 */
+	if (needs_update)
+		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
+
+}
+
+static void
+vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
+{
+	vdev_t *vd;
+
+	vd = cp->private;
+	if (vd == NULL)
+		return;
+
+	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
+		vdev_geom_set_rotation_rate(vd, cp);
+		return;
+	}
+
+	if (strcmp(attr, "GEOM::physpath") == 0) {
+		vdev_geom_set_physpath(cp, /*do_null_update*/B_TRUE);
+		return;
+	}
+}
+
+static void
+vdev_geom_orphan(struct g_consumer *cp)
+{
+	vdev_t *vd;
+
+	g_topology_assert();
+
+	vd = cp->private;
+	if (vd == NULL) {
+		/* Vdev close in progress.  Ignore the event. */
+		return;
+	}
+
+	/*
+	 * Orphan callbacks occur from the GEOM event thread.
+	 * Concurrent with this call, new I/O requests may be
+	 * working their way through GEOM about to find out
+	 * (only once executed by the g_down thread) that we've
+	 * been orphaned from our disk provider.  These I/Os
+	 * must be retired before we can detach our consumer.
+	 * This is most easily achieved by acquiring the
+	 * SPA ZIO configuration lock as a writer, but doing
+	 * so with the GEOM topology lock held would cause
+	 * a lock order reversal.  Instead, rely on the SPA's
+	 * async removal support to invoke a close on this
+	 * vdev once it is safe to do so.
+	 */
+	vd->vdev_remove_wanted = B_TRUE;
+	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
+}
+
+static struct g_consumer *
+vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
+{
+	struct g_geom *gp;
+	struct g_consumer *cp;
+	int error;
+
+	g_topology_assert();
+
+	ZFS_LOG(1, "Attaching to %s.", pp->name);
+
+	if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
+		ZFS_LOG(1, "Failing attach of %s. Incompatible sectorsize %d\n",
+		    pp->name, pp->sectorsize);
+		return (NULL);
+	} else if (pp->mediasize < SPA_MINDEVSIZE) {
+		ZFS_LOG(1, "Failing attach of %s. Incompatible mediasize %ju\n",
+		    pp->name, pp->mediasize);
+		return (NULL);
+	}
+
+	/* Do we have geom already? No? Create one. */
+	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
+		if (gp->flags & G_GEOM_WITHER)
+			continue;
+		if (strcmp(gp->name, "zfs::vdev") != 0)
+			continue;
+		break;
+	}
+	if (gp == NULL) {
+		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
+		gp->orphan = vdev_geom_orphan;
+		gp->attrchanged = vdev_geom_attrchanged;
+		cp = g_new_consumer(gp);
+		error = g_attach(cp, pp);
+		if (error != 0) {
+			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
+			    __LINE__, error);
+			vdev_geom_detach(cp, B_FALSE);
+			return (NULL);
+		}
+		error = g_access(cp, 1, 0, 1);
+		if (error != 0) {
+			ZFS_LOG(1, "%s(%d): g_access failed: %d", __func__,
+			       __LINE__, error);
+			vdev_geom_detach(cp, B_FALSE);
+			return (NULL);
+		}
+		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
+	} else {
+		/* Check if we are already connected to this provider. */
+		LIST_FOREACH(cp, &gp->consumer, consumer) {
+			if (cp->provider == pp) {
+				ZFS_LOG(1, "Found consumer for %s.", pp->name);
+				break;
+			}
+		}
+		if (cp == NULL) {
+			cp = g_new_consumer(gp);
+			error = g_attach(cp, pp);
+			if (error != 0) {
+				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
+				    __func__, __LINE__, error);
+				vdev_geom_detach(cp, B_FALSE);
+				return (NULL);
+			}
+			error = g_access(cp, 1, 0, 1);
+			if (error != 0) {
+				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+				    __func__, __LINE__, error);
+				vdev_geom_detach(cp, B_FALSE);
+				return (NULL);
+			}
+			ZFS_LOG(1, "Created consumer for %s.", pp->name);
+		} else {
+			error = g_access(cp, 1, 0, 1);
+			if (error != 0) {
+				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+				    __func__, __LINE__, error);
+				return (NULL);
+			}
+			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
+		}
+	}
+
+	/* 
+	 * BUG: cp may already belong to a vdev.  This could happen if:
+	 * 1) That vdev is a shared spare, or
+	 * 2) We are trying to reopen a missing vdev and we are scanning by
+	 *    guid.  In that case, we'll ultimately fail to open this consumer,
+	 *    but not until after setting the private field.
+	 * The solution is to:
+	 * 1) Don't set the private field until after the open succeeds, and
+	 * 2) Set it to a linked list of vdevs, not just a single vdev
+	 */
+	cp->private = vd;
+	if (vd != NULL) {
+		vd->vdev_tsd = cp;
+		vdev_geom_set_physpath(cp, /*do_null_update*/B_FALSE);
+	}
+
+	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
+	return (cp);
+}
+
+static void
+vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
+{
+	struct g_geom *gp;
+	vdev_t *vd;
+
+	g_topology_assert();
+
+	ZFS_LOG(1, "Detaching from %s.",
+	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");
+
+	vd = cp->private;
+	cp->private = NULL;
+
+	gp = cp->geom;
+	if (open_for_read)
+		g_access(cp, -1, 0, -1);
+	/* Destroy consumer on last close. */
+	if (cp->acr == 0 && cp->ace == 0) {
+		if (cp->acw > 0)
+			g_access(cp, 0, -cp->acw, 0);
+		if (cp->provider != NULL) {
+			ZFS_LOG(1, "Destroying consumer for %s.",
+			    cp->provider->name ? cp->provider->name : "NULL");
+			g_detach(cp);
+		}
+		g_destroy_consumer(cp);
+	}
+	/* Destroy geom if there are no consumers left. */
+	if (LIST_EMPTY(&gp->consumer)) {
+		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
+		g_wither_geom(gp, ENXIO);
+	}
+}
+
+static void
+vdev_geom_close_locked(vdev_t *vd)
+{
+	struct g_consumer *cp;
+
+	g_topology_assert();
+
+	cp = vd->vdev_tsd;
+	vd->vdev_tsd = NULL;
+	vd->vdev_delayed_close = B_FALSE;
+	if (cp == NULL)
+		return;
+
+	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
+
+	vdev_geom_detach(cp, B_TRUE);
+}
+
+/*
+ * Issue one or more bios to the vdev in parallel.
+ * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each
+ * I/O operation is described by parallel entries from each array.  There may
+ * be more bios actually issued than entries in the arrays, since a command
+ * larger than the maximum I/O size is split into multiple bios.
+ */
+static void
+vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
+    off_t *sizes, int *errors, int ncmds)
+{
+	struct bio **bios;
+	u_char *p;
+	off_t off, maxio, s, end;
+	int i, n_bios, j;
+	size_t bios_size;
+
+	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
+	n_bios = 0;
+
+	/* How many bios are required for all commands ? */
+	for (i = 0; i < ncmds; i++)
+		n_bios += (sizes[i] + maxio - 1) / maxio;
+
+	/* Allocate memory for the bios */
+	bios_size = n_bios * sizeof(struct bio*);
+	bios = kmem_zalloc(bios_size, KM_SLEEP);
+
+	/* Prepare and issue all of the bios */
+	for (i = j = 0; i < ncmds; i++) {
+		off = offsets[i];
+		p = datas[i];
+		s = sizes[i];
+		end = off + s;
+		ASSERT((off % cp->provider->sectorsize) == 0);
+		ASSERT((s % cp->provider->sectorsize) == 0);
+
+		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
+			bios[j] = g_alloc_bio();
+			bios[j]->bio_cmd = cmds[i];
+			bios[j]->bio_done = NULL;
+			bios[j]->bio_offset = off;
+			bios[j]->bio_length = MIN(s, maxio);
+			bios[j]->bio_data = p;
+			g_io_request(bios[j], cp);
+		}
+	}
+	ASSERT(j == n_bios);
+
+	/* Wait for all of the bios to complete, and clean them up */
+	for (i = j = 0; i < ncmds; i++) {
+		off = offsets[i];
+		s = sizes[i];
+		end = off + s;
+
+		for (; off < end; off += maxio, s -= maxio, j++) {
+			errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i];
+			g_destroy_bio(bios[j]);
+		}
+	}
+	kmem_free(bios, bios_size);
+}
+
+static int
+vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
+{
+	struct g_provider *pp;
+	vdev_phys_t *vdev_lists[VDEV_LABELS];
+	char *p, *buf;
+	size_t buflen;
+	uint64_t psize, state, txg;
+	off_t offsets[VDEV_LABELS];
+	off_t size;
+	off_t sizes[VDEV_LABELS];
+	int cmds[VDEV_LABELS];
+	int errors[VDEV_LABELS];
+	int l, len;
+
+	g_topology_assert_not();
+
+	pp = cp->provider;
+	ZFS_LOG(1, "Reading config from %s...", pp->name);
+
+	psize = pp->mediasize;
+	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
+
+	size = sizeof(*vdev_lists[0]) + pp->sectorsize -
+	    ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
+
+	buflen = sizeof(vdev_lists[0]->vp_nvlist);
+
+	*config = NULL;
+	/* Create all of the IO requests */
+	for (l = 0; l < VDEV_LABELS; l++) {
+		cmds[l] = BIO_READ;
+		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
+		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
+		sizes[l] = size;
+		errors[l] = 0;
+		ASSERT(offsets[l] % pp->sectorsize == 0);
+	}
+
+	/* Issue the IO requests */
+	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
+	    VDEV_LABELS);
+
+	/* Parse the labels */
+	for (l = 0; l < VDEV_LABELS; l++) {
+		if (errors[l] != 0)
+			continue;
+
+		buf = vdev_lists[l]->vp_nvlist;
+
+		if (nvlist_unpack(buf, buflen, config, 0) != 0)
+			continue;
+
+		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
+		    &state) != 0 || state > POOL_STATE_L2CACHE) {
+			nvlist_free(*config);
+			*config = NULL;
+			continue;
+		}
+
+		if (state != POOL_STATE_SPARE &&
+		    state != POOL_STATE_L2CACHE &&
+		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
+		    &txg) != 0 || txg == 0)) {
+			nvlist_free(*config);
+			*config = NULL;
+			continue;
+		}
+
+		break;
+	}
+
+	/* Free the label storage */
+	for (l = 0; l < VDEV_LABELS; l++)
+		kmem_free(vdev_lists[l], size);
+
+	return (*config == NULL ? ENOENT : 0);
+}
+
+static void
+resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
+{
+	nvlist_t **new_configs;
+	uint64_t i;
+
+	if (id < *count)
+		return;
+	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
+	    KM_SLEEP);
+	for (i = 0; i < *count; i++)
+		new_configs[i] = (*configs)[i];
+	if (*configs != NULL)
+		kmem_free(*configs, *count * sizeof(void *));
+	*configs = new_configs;
+	*count = id + 1;
+}
+
+static void
+process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
+    const char *name, uint64_t* known_pool_guid)
+{
+	nvlist_t *vdev_tree;
+	uint64_t pool_guid;
+	uint64_t vdev_guid, known_guid;
+	uint64_t id, txg, known_txg;
+	char *pname;
+	int i;
+
+	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
+	    strcmp(pname, name) != 0)
+		goto ignore;
+
+	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+		goto ignore;
+
+	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
+		goto ignore;
+
+	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
+		goto ignore;
+
+	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
+		goto ignore;
+
+	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
+
+	if (*known_pool_guid != 0) {
+		if (pool_guid != *known_pool_guid)
+			goto ignore;
+	} else
+		*known_pool_guid = pool_guid;
+
+	resize_configs(configs, count, id);
+
+	if ((*configs)[id] != NULL) {
+		VERIFY(nvlist_lookup_uint64((*configs)[id],
+		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
+		if (txg <= known_txg)
+			goto ignore;
+		nvlist_free((*configs)[id]);
+	}
+
+	(*configs)[id] = cfg;
+	return;
+
+ignore:
+	nvlist_free(cfg);
+}
+
+int
+vdev_geom_read_pool_label(const char *name,
+    nvlist_t ***configs, uint64_t *count)
+{
+	struct g_class *mp;
+	struct g_geom *gp;
+	struct g_provider *pp;
+	struct g_consumer *zcp;
+	nvlist_t *vdev_cfg;
+	uint64_t pool_guid;
+	int error;
+
+	DROP_GIANT();
+	g_topology_lock();
+
+	*configs = NULL;
+	*count = 0;
+	pool_guid = 0;
+	LIST_FOREACH(mp, &g_classes, class) {
+		if (mp == &zfs_vdev_class)
+			continue;
+		LIST_FOREACH(gp, &mp->geom, geom) {
+			if (gp->flags & G_GEOM_WITHER)
+				continue;
+			LIST_FOREACH(pp, &gp->provider, provider) {
+				if (pp->flags & G_PF_WITHER)
+					continue;
+				zcp = vdev_geom_attach(pp, NULL);
+				if (zcp == NULL)
+					continue;
+				g_topology_unlock();
+				error = vdev_geom_read_config(zcp, &vdev_cfg);
+				g_topology_lock();
+				vdev_geom_detach(zcp, B_TRUE);
+				if (error)
+					continue;
+				ZFS_LOG(1, "successfully read vdev config");
+
+				process_vdev_config(configs, count,
+				    vdev_cfg, name, &pool_guid);
+			}
+		}
+	}
+	g_topology_unlock();
+	PICKUP_GIANT();
+
+	return (*count > 0 ? 0 : ENOENT);
+}
+
+enum match {
+	NO_MATCH,
+	TOP_MATCH,
+	FULL_MATCH
+};
+
+static enum match
+vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
+{
+	nvlist_t *config;
+	uint64_t pool_guid, top_guid, vdev_guid;
+	struct g_consumer *cp;
+
+	cp = vdev_geom_attach(pp, NULL);
+	if (cp == NULL) {
+		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
+		    pp->name);
+		return (NO_MATCH);
+	}
+	g_topology_unlock();
+	if (vdev_geom_read_config(cp, &config) != 0) {
+		g_topology_lock();
+		vdev_geom_detach(cp, B_TRUE);
+		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
+		return (NO_MATCH);
+	}
+	g_topology_lock();
+	vdev_geom_detach(cp, B_TRUE);
+
+	pool_guid = 0;
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
+	top_guid = 0;
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
+	vdev_guid = 0;
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+	nvlist_free(config);
+
+	/*
+	 * Check that the label's pool guid matches the desired guid.
+	 * Inactive spares and L2ARCs do not have any pool guid in the label.
+	 */
+	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
+		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
+		    pp->name,
+		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
+		return (NO_MATCH);
+	}
+
+	/*
+	 * Check that the label's vdev guid matches the desired guid.
+	 * The second condition handles possible race on vdev detach, when
+	 * remaining vdev receives GUID of destroyed top level mirror vdev.
+	 */
+	if (vdev_guid == vd->vdev_guid) {
+		ZFS_LOG(1, "guids match for provider %s.", pp->name);
+		return (FULL_MATCH);
+	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
+		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
+		return (TOP_MATCH);
+	}
+	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
+	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
+	return (NO_MATCH);
+}
+
+static struct g_consumer *
+vdev_geom_attach_by_guids(vdev_t *vd)
+{
+	struct g_class *mp;
+	struct g_geom *gp;
+	struct g_provider *pp;
+	struct g_consumer *cp;
+	enum match m;
+
+	g_topology_assert();
+
+	cp = NULL;
+	LIST_FOREACH(mp, &g_classes, class) {
+		if (mp == &zfs_vdev_class)
+			continue;
+		LIST_FOREACH(gp, &mp->geom, geom) {
+			if (gp->flags & G_GEOM_WITHER)
+				continue;
+			LIST_FOREACH(pp, &gp->provider, provider) {
+				m = vdev_attach_ok(vd, pp);
+				if (m == NO_MATCH)
+					continue;
+				if (cp != NULL) {
+					if (m == FULL_MATCH)
+						vdev_geom_detach(cp, B_TRUE);
+					else
+						continue;
+				}
+				cp = vdev_geom_attach(pp, vd);
+				if (cp == NULL) {
+					printf("ZFS WARNING: Unable to "
+					    "attach to %s.\n", pp->name);
+					continue;
+				}
+				if (m == FULL_MATCH)
+					return (cp);
+			}
+		}
+	}
+	return (cp);
+}
+
+static struct g_consumer *
+vdev_geom_open_by_guids(vdev_t *vd)
+{
+	struct g_consumer *cp;
+	char *buf;
+	size_t len;
+
+	g_topology_assert();
+
+	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
+		(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
+	cp = vdev_geom_attach_by_guids(vd);
+	if (cp != NULL) {
+		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
+		buf = kmem_alloc(len, KM_SLEEP);
+
+		snprintf(buf, len, "/dev/%s", cp->provider->name);
+		spa_strfree(vd->vdev_path);
+		vd->vdev_path = buf;
+
+		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
+		    (uintmax_t)spa_guid(vd->vdev_spa),
+		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
+	} else {
+		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
+		    (uintmax_t)spa_guid(vd->vdev_spa),
+		    (uintmax_t)vd->vdev_guid);
+	}
+
+	return (cp);
+}
+
+static struct g_consumer *
+vdev_geom_open_by_path(vdev_t *vd, int check_guid)
+{
+	struct g_provider *pp;
+	struct g_consumer *cp;
+
+	g_topology_assert();
+
+	cp = NULL;
+	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
+	if (pp != NULL) {
+		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
+		if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
+			cp = vdev_geom_attach(pp, vd);
+	}
+
+	return (cp);
+}
+
+static int
+vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+	struct g_provider *pp;
+	struct g_consumer *cp;
+	size_t bufsize;
+	int error;
+
+	/* Set the TLS to indicate downstack that we should not access zvols */
+	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
+
+	/*
+	 * We must have a pathname, and it must be absolute.
+	 */
+	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (EINVAL);
+	}
+
+	/*
+	 * Reopen the device if it's not currently open. Otherwise,
+	 * just update the physical size of the device.
+	 */
+	if ((cp = vd->vdev_tsd) != NULL) {
+		ASSERT(vd->vdev_reopening);
+		goto skip_open;
+	}
+
+	DROP_GIANT();
+	g_topology_lock();
+	error = 0;
+
+	if (vd->vdev_spa->spa_splitting_newspa ||
+	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
+	     vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
+	     vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
+		/*
+		 * We are dealing with a vdev that hasn't been previously
+		 * opened (since boot), and we are not loading an
+		 * existing pool configuration.  This looks like a
+		 * vdev add operation to a new or existing pool.
+		 * Assume the user knows what he/she is doing and find
+		 * GEOM provider by its name, ignoring GUID mismatches.
+		 *
+		 * XXPOLICY: It would be safer to only allow a device
+		 *           that is unlabeled or labeled but missing
+		 *           GUID information to be opened in this fashion,
+		 *           unless we are doing a split, in which case we
+		 *           should allow any guid.
+		 */
+		cp = vdev_geom_open_by_path(vd, 0);
+	} else {
+		/*
+		 * Try using the recorded path for this device, but only
+		 * accept it if its label data contains the expected GUIDs.
+		 */
+		cp = vdev_geom_open_by_path(vd, 1);
+		if (cp == NULL) {
+			/*
+			 * The device at vd->vdev_path doesn't have the
+			 * expected GUIDs. The disks might have merely
+			 * moved around so try all other GEOM providers
+			 * to find one with the right GUIDs.
+			 */
+			cp = vdev_geom_open_by_guids(vd);
+		}
+	}
+
+	/* Clear the TLS now that tasting is done */
+	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
+
+	if (cp == NULL) {
+		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
+		error = ENOENT;
+	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
+	    !ISP2(cp->provider->sectorsize)) {
+		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
+		    vd->vdev_path);
+
+		vdev_geom_close_locked(vd);
+		error = EINVAL;
+		cp = NULL;
+	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
+		int i;
+
+		for (i = 0; i < 5; i++) {
+			error = g_access(cp, 0, 1, 0);
+			if (error == 0)
+				break;
+			g_topology_unlock();
+			tsleep(vd, 0, "vdev", hz / 2);
+			g_topology_lock();
+		}
+		if (error != 0) {
+			printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
+			    vd->vdev_path, error);
+			vdev_geom_close_locked(vd);
+			cp = NULL;
+		}
+	}
+
+	/* Fetch initial physical path information for this device. */
+	if (cp != NULL)
+		vdev_geom_attrchanged(cp, "GEOM::physpath");
+
+	g_topology_unlock();
+	PICKUP_GIANT();
+	if (cp == NULL) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+skip_open:
+	pp = cp->provider;
+
+	/*
+	 * Determine the actual size of the device.
+	 */
+	*max_psize = *psize = pp->mediasize;
+
+	/*
+	 * Determine the device's minimum transfer size and preferred
+	 * transfer size.
+	 */
+	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
+	*physical_ashift = 0;
+	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
+	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
+		*physical_ashift = highbit(pp->stripesize) - 1;
+
+	/*
+	 * Clear the nowritecache settings, so that on a vdev_reopen()
+	 * we will try again.
+	 */
+	vd->vdev_nowritecache = B_FALSE;
+
+	/*
+	 * Determine the device's rotation rate.
+	 */
+	vdev_geom_set_rotation_rate(vd, cp);
+
+	return (0);
+}
+
+static void
+vdev_geom_close(vdev_t *vd)
+{
+
+	if (vd->vdev_reopening)
+		return;
+
+	DROP_GIANT();
+	g_topology_lock();
+	vdev_geom_close_locked(vd);
+	g_topology_unlock();
+	PICKUP_GIANT();
+}
+
+static void
+vdev_geom_io_intr(struct bio *bp)
+{
+	vdev_t *vd;
+	zio_t *zio;
+
+	zio = bp->bio_caller1;
+	vd = zio->io_vd;
+	zio->io_error = bp->bio_error;
+	if (zio->io_error == 0 && bp->bio_resid != 0)
+		zio->io_error = SET_ERROR(EIO);
+
+	switch(zio->io_error) {
+	case ENOTSUP:
+		/*
+		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
+		 * that future attempts will never succeed. In this case
+		 * we set a persistent flag so that we don't bother with
+		 * requests in the future.
+		 */
+		switch(bp->bio_cmd) {
+		case BIO_FLUSH:
+			vd->vdev_nowritecache = B_TRUE;
+			break;
+		case BIO_DELETE:
+			vd->vdev_notrim = B_TRUE;
+			break;
+		}
+		break;
+	case ENXIO:
+		if (!vd->vdev_remove_wanted) {
+			/*
+			 * If provider's error is set we assume it is being
+			 * removed.
+			 */
+			if (bp->bio_to->error != 0) {
+				vd->vdev_remove_wanted = B_TRUE;
+				spa_async_request(zio->io_spa,
+				    SPA_ASYNC_REMOVE);
+			} else if (!vd->vdev_delayed_close) {
+				vd->vdev_delayed_close = B_TRUE;
+			}
+		}
+		break;
+	}
+	g_destroy_bio(bp);
+	zio_delay_interrupt(zio);
+}
+
+static void
+vdev_geom_io_start(zio_t *zio)
+{
+	vdev_t *vd;
+	struct g_consumer *cp;
+	struct bio *bp;
+	int error;
+
+	vd = zio->io_vd;
+
+	switch (zio->io_type) {
+	case ZIO_TYPE_IOCTL:
+		/* XXPOLICY */
+		if (!vdev_readable(vd)) {
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return;
+		} else {
+			switch (zio->io_cmd) {
+			case DKIOCFLUSHWRITECACHE:
+				if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
+					break;
+				if (vd->vdev_nowritecache) {
+					zio->io_error = SET_ERROR(ENOTSUP);
+					break;
+				}
+				goto sendreq;
+			default:
+				zio->io_error = SET_ERROR(ENOTSUP);
+			}
+		}
+
+		zio_execute(zio);
+		return;
+	case ZIO_TYPE_FREE:
+		if (vd->vdev_notrim) {
+			zio->io_error = SET_ERROR(ENOTSUP);
+		} else if (!vdev_geom_bio_delete_disable) {
+			goto sendreq;
+		}
+		zio_execute(zio);
+		return;
+	}
+sendreq:
+	ASSERT(zio->io_type == ZIO_TYPE_READ ||
+	    zio->io_type == ZIO_TYPE_WRITE ||
+	    zio->io_type == ZIO_TYPE_FREE ||
+	    zio->io_type == ZIO_TYPE_IOCTL);
+
+	cp = vd->vdev_tsd;
+	if (cp == NULL) {
+		zio->io_error = SET_ERROR(ENXIO);
+		zio_interrupt(zio);
+		return;
+	}
+	bp = g_alloc_bio();
+	bp->bio_caller1 = zio;
+	switch (zio->io_type) {
+	case ZIO_TYPE_READ:
+	case ZIO_TYPE_WRITE:
+		zio->io_target_timestamp = zio_handle_io_delay(zio);
+		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
+		bp->bio_data = zio->io_data;
+		bp->bio_offset = zio->io_offset;
+		bp->bio_length = zio->io_size;
+		break;
+	case ZIO_TYPE_FREE:
+		bp->bio_cmd = BIO_DELETE;
+		bp->bio_data = NULL;
+		bp->bio_offset = zio->io_offset;
+		bp->bio_length = zio->io_size;
+		break;
+	case ZIO_TYPE_IOCTL:
+		bp->bio_cmd = BIO_FLUSH;
+		bp->bio_flags |= BIO_ORDERED;
+		bp->bio_data = NULL;
+		bp->bio_offset = cp->provider->mediasize;
+		bp->bio_length = 0;
+		break;
+	}
+	bp->bio_done = vdev_geom_io_intr;
+
+	g_io_request(bp, cp);
+}
+
+static void
+vdev_geom_io_done(zio_t *zio)
+{
+}
+
+static void
+vdev_geom_hold(vdev_t *vd)
+{
+}
+
+static void
+vdev_geom_rele(vdev_t *vd)
+{
+}
+
+vdev_ops_t vdev_geom_ops = {
+	vdev_geom_open,
+	vdev_geom_close,
+	vdev_default_asize,
+	vdev_geom_io_start,
+	vdev_geom_io_done,
+	NULL,
+	vdev_geom_hold,
+	vdev_geom_rele,
+	VDEV_TYPE_DISK,		/* name of this vdev type */
+	B_TRUE			/* leaf vdev */
+};
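
For reference when reading the vdev_ops tables in this patch, a sketch of the
positional layout they initialize (assumed from the initializers here; see
sys/vdev_impl.h in this import for the authoritative definition):

	typedef struct vdev_ops {
		vdev_open_func_t		*vdev_op_open;
		vdev_close_func_t		*vdev_op_close;
		vdev_asize_func_t		*vdev_op_asize;
		vdev_io_start_func_t		*vdev_op_io_start;
		vdev_io_done_func_t		*vdev_op_io_done;
		vdev_state_change_func_t	*vdev_op_state_change;
		vdev_hold_func_t		*vdev_op_hold;	/* may be NULL */
		vdev_rele_func_t		*vdev_op_rele;	/* may be NULL */
		char				vdev_op_type[16];
		boolean_t			vdev_op_leaf;
	} vdev_ops_t;

The pairs of NULL entries added to vdev_mirror_ops, vdev_replacing_ops and
vdev_spare_ops further down fill the new vdev_op_hold/vdev_op_rele slots.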
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 vdev_label.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c	27 Feb 2010 22:31:12 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c	3 Dec 2016 17:03:49 -0000
@@ -18,9 +18,10 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 /*
@@ -122,6 +123,8 @@
  * 	txg		Transaction group in which this label was written
  * 	pool_guid	Unique identifier for this pool
  * 	vdev_tree	An nvlist describing vdev tree.
+ *	features_for_read
+ *			An nvlist of the features necessary for reading the MOS.
  *
  * Each leaf device label also contains the following:
  *
@@ -141,8 +144,15 @@
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/zio.h>
+#include <sys/dsl_scan.h>
+#include <sys/trim_map.h>
 #include <sys/fs/zfs.h>
 
+static boolean_t vdev_trim_on_init = B_TRUE;
+SYSCTL_DECL(_vfs_zfs_vdev);
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW,
+    &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation");
+
 /*
  * Basic routines to read and write from a vdev label.
  * Used throughout the rest of this file.
@@ -175,7 +185,7 @@ vdev_label_number(uint64_t psize, uint64
 
 static void
 vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
-	uint64_t size, zio_done_func_t *done, void *private, int flags)
+    uint64_t size, zio_done_func_t *done, void *private, int flags)
 {
 	ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) ==
 	    SCL_STATE_ALL);
@@ -189,7 +199,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, 
 
 static void
 vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
-	uint64_t size, zio_done_func_t *done, void *private, int flags)
+    uint64_t size, zio_done_func_t *done, void *private, int flags)
 {
 	ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
 	    (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
@@ -208,34 +218,29 @@ vdev_label_write(zio_t *zio, vdev_t *vd,
  */
 nvlist_t *
 vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
-    boolean_t isspare, boolean_t isl2cache)
+    vdev_config_flag_t flags)
 {
 	nvlist_t *nv = NULL;
 
-	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	nv = fnvlist_alloc();
 
-	VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
-	    vd->vdev_ops->vdev_op_type) == 0);
-	if (!isspare && !isl2cache)
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
-		    == 0);
-	VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
+	fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
+	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
 
 	if (vd->vdev_path != NULL)
-		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
-		    vd->vdev_path) == 0);
+		fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
 
 	if (vd->vdev_devid != NULL)
-		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
-		    vd->vdev_devid) == 0);
+		fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
 
 	if (vd->vdev_physpath != NULL)
-		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
-		    vd->vdev_physpath) == 0);
+		fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+		    vd->vdev_physpath);
 
 	if (vd->vdev_fru != NULL)
-		VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU,
-		    vd->vdev_fru) == 0);
+		fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
 
 	if (vd->vdev_nparity != 0) {
 		ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
@@ -256,65 +261,103 @@ vdev_config_generate(spa_t *spa, vdev_t 
 		 * that only support a single parity device -- older software
 		 * will just ignore it.
 		 */
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY,
-		    vd->vdev_nparity) == 0);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
 	}
 
 	if (vd->vdev_wholedisk != -1ULL)
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
-		    vd->vdev_wholedisk) == 0);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+		    vd->vdev_wholedisk);
 
 	if (vd->vdev_not_present)
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
 
 	if (vd->vdev_isspare)
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
 
-	if (!isspare && !isl2cache && vd == vd->vdev_top) {
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
-		    vd->vdev_ms_array) == 0);
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
-		    vd->vdev_ms_shift) == 0);
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
-		    vd->vdev_ashift) == 0);
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
-		    vd->vdev_asize) == 0);
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG,
-		    vd->vdev_islog) == 0);
-	}
-
-	if (vd->vdev_dtl_smo.smo_object != 0)
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
-		    vd->vdev_dtl_smo.smo_object) == 0);
+	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
+	    vd == vd->vdev_top) {
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+		    vd->vdev_ms_array);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+		    vd->vdev_ms_shift);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+		    vd->vdev_asize);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
+		if (vd->vdev_removing)
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+			    vd->vdev_removing);
+	}
+
+	if (vd->vdev_dtl_sm != NULL) {
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+		    space_map_object(vd->vdev_dtl_sm));
+	}
 
 	if (vd->vdev_crtxg)
-		VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
-		    vd->vdev_crtxg) == 0);
+		fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
+
+	if (flags & VDEV_CONFIG_MOS) {
+		if (vd->vdev_leaf_zap != 0) {
+			ASSERT(vd->vdev_ops->vdev_op_leaf);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP,
+			    vd->vdev_leaf_zap);
+		}
+
+		if (vd->vdev_top_zap != 0) {
+			ASSERT(vd == vd->vdev_top);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
+			    vd->vdev_top_zap);
+		}
+	}
 
 	if (getstats) {
 		vdev_stat_t vs;
+		pool_scan_stat_t ps;
+
 		vdev_get_stats(vd, &vs);
-		VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS,
-		    (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+		    (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
+
+		/* provide either current or previous scan information */
+		if (spa_scan_get_stats(spa, &ps) == 0) {
+			fnvlist_add_uint64_array(nv,
+			    ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+			    sizeof (pool_scan_stat_t) / sizeof (uint64_t));
+		}
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		nvlist_t **child;
-		int c;
+		int c, idx;
 
 		ASSERT(!vd->vdev_ishole);
 
 		child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
 		    KM_SLEEP);
 
-		for (c = 0; c < vd->vdev_children; c++)
-			child[c] = vdev_config_generate(spa, vd->vdev_child[c],
-			    getstats, isspare, isl2cache);
+		for (c = 0, idx = 0; c < vd->vdev_children; c++) {
+			vdev_t *cvd = vd->vdev_child[c];
+
+			/*
+			 * If we're generating an nvlist of removing
+			 * vdevs then skip over any device which is
+			 * not being removed.
+			 */
+			if ((flags & VDEV_CONFIG_REMOVING) &&
+			    !cvd->vdev_removing)
+				continue;
 
-		VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
-		    child, vd->vdev_children) == 0);
+			child[idx++] = vdev_config_generate(spa, cvd,
+			    getstats, flags);
+		}
+
+		if (idx) {
+			fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+			    child, idx);
+		}
 
-		for (c = 0; c < vd->vdev_children; c++)
+		for (c = 0; c < idx; c++)
 			nvlist_free(child[c]);
 
 		kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
@@ -323,23 +366,20 @@ vdev_config_generate(spa_t *spa, vdev_t 
 		const char *aux = NULL;
 
 		if (vd->vdev_offline && !vd->vdev_tmpoffline)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
-			    B_TRUE) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
+		if (vd->vdev_resilver_txg != 0)
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+			    vd->vdev_resilver_txg);
 		if (vd->vdev_faulted)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED,
-			    B_TRUE) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
 		if (vd->vdev_degraded)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED,
-			    B_TRUE) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
 		if (vd->vdev_removed)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED,
-			    B_TRUE) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
 		if (vd->vdev_unspare)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
-			    B_TRUE) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
 		if (vd->vdev_ishole)
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE,
-			    B_TRUE) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
 
 		switch (vd->vdev_stat.vs_aux) {
 		case VDEV_AUX_ERR_EXCEEDED:
@@ -352,12 +392,11 @@ vdev_config_generate(spa_t *spa, vdev_t 
 		}
 
 		if (aux != NULL)
-			VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE,
-			    aux) == 0);
+			fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
 
 		if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
-			VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
-			    vd->vdev_orig_guid) == 0);
+			fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
+			    vd->vdev_orig_guid);
 		}
 	}
 
@@ -375,12 +414,11 @@ vdev_top_config_generate(spa_t *spa, nvl
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t *array;
-	uint_t idx;
+	uint_t c, idx;
 
 	array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
 
-	idx = 0;
-	for (int c = 0; c < rvd->vdev_children; c++) {
+	for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 
 		if (tvd->vdev_ishole)
@@ -398,13 +436,23 @@ vdev_top_config_generate(spa_t *spa, nvl
 	kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
 }
 
+/*
+ * Returns the configuration from the label of the given vdev. For vdevs
+ * which don't have a txg value stored on their label (i.e. spares/cache)
+ * or have not been completely initialized (txg = 0) just return
+ * the configuration from the first valid label we find. Otherwise,
+ * find the most up-to-date label that does not exceed the specified
+ * 'txg' value.
+ */
 nvlist_t *
-vdev_label_read_config(vdev_t *vd)
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *config = NULL;
 	vdev_phys_t *vp;
 	zio_t *zio;
+	uint64_t best_txg = 0;
+	int error = 0;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SPECULATIVE;
 
@@ -417,6 +465,7 @@ vdev_label_read_config(vdev_t *vd)
 
 retry:
 	for (int l = 0; l < VDEV_LABELS; l++) {
+		nvlist_t *label = NULL;
 
 		zio = zio_root(spa, NULL, NULL, flags);
 
@@ -426,12 +475,31 @@ retry:
 
 		if (zio_wait(zio) == 0 &&
 		    nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
-		    &config, 0) == 0)
-			break;
+		    &label, 0) == 0) {
+			uint64_t label_txg = 0;
 
-		if (config != NULL) {
-			nvlist_free(config);
-			config = NULL;
+			/*
+			 * Auxiliary vdevs won't have txg values in their
+			 * labels and newly added vdevs may not have been
+			 * completely initialized so just return the
+			 * configuration from the first valid label we
+			 * encounter.
+			 */
+			error = nvlist_lookup_uint64(label,
+			    ZPOOL_CONFIG_POOL_TXG, &label_txg);
+			if ((error || label_txg == 0) && !config) {
+				config = label;
+				break;
+			} else if (label_txg <= txg && label_txg > best_txg) {
+				best_txg = label_txg;
+				nvlist_free(config);
+				config = fnvlist_dup(label);
+			}
+		}
+
+		if (label != NULL) {
+			nvlist_free(label);
+			label = NULL;
 		}
 	}
 
@@ -466,7 +534,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, v
 	/*
 	 * Read the label, if any, and perform some basic sanity checks.
 	 */
-	if ((label = vdev_label_read_config(vd)) == NULL)
+	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
 		return (B_FALSE);
 
 	(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
@@ -544,6 +612,16 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, v
 		return (B_TRUE);
 
 	/*
+	 * We can't rely on a pool's state if it's been imported
+	 * read-only.  Instead we look to see if the pools is marked
+	 * read-only in the namespace and set the state to active.
+	 */
+	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+	    (spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+	    spa_mode(spa) == FREAD)
+		state = POOL_STATE_ACTIVE;
+
+	/*
 	 * If the device is marked ACTIVE, then this device is in use by another
 	 * pool on the system.
 	 */
@@ -583,21 +661,21 @@ vdev_label_init(vdev_t *vd, uint64_t crt
 	/* Track the creation time for this vdev */
 	vd->vdev_crtxg = crtxg;
 
-	if (!vd->vdev_ops->vdev_op_leaf)
+	if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
 		return (0);
 
 	/*
 	 * Dead vdevs cannot be initialized.
 	 */
 	if (vdev_is_dead(vd))
-		return (EIO);
+		return (SET_ERROR(EIO));
 
 	/*
 	 * Determine if the vdev is in use.
 	 */
 	if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
 	    vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
-		return (EBUSY);
+		return (SET_ERROR(EBUSY));
 
 	/*
 	 * If this is a request to add or replace a spare or l2cache device
@@ -645,6 +723,17 @@ vdev_label_init(vdev_t *vd, uint64_t crt
 	}
 
 	/*
+	 * TRIM the whole thing so that we start with a clean slate.
+	 * It's just an optimization, so we don't care if it fails.
+	 * Don't TRIM if removing so that we don't interfere with zpool
+	 * disaster recovery.
+	 */
+	if (zfs_trim_enabled && vdev_trim_on_init && !vd->vdev_notrim &&
+	    (reason == VDEV_LABEL_CREATE || reason == VDEV_LABEL_SPARE ||
+	    reason == VDEV_LABEL_L2CACHE))
+		zio_wait(zio_trim(NULL, spa, vd, 0, vd->vdev_psize));
+
+	/*
 	 * Initialize its label.
 	 */
 	vp = zio_buf_alloc(sizeof (vdev_phys_t));
@@ -782,6 +871,44 @@ retry:
 	return (error);
 }
 
+int
+vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
+{
+	spa_t *spa = vd->vdev_spa;
+	zio_t *zio;
+	char *pad2;
+	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+	int error;
+
+	if (size > VDEV_PAD_SIZE)
+		return (EINVAL);
+
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return (ENODEV);
+	if (vdev_is_dead(vd))
+		return (ENXIO);
+
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+	pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
+	bzero(pad2, VDEV_PAD_SIZE);
+	memcpy(pad2, buf, size);
+
+retry:
+	zio = zio_root(spa, NULL, NULL, flags);
+	vdev_label_write(zio, vd, 0, pad2,
+	    offsetof(vdev_label_t, vl_pad2),
+	    VDEV_PAD_SIZE, NULL, NULL, flags);
+	error = zio_wait(zio);
+	if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+		flags |= ZIO_FLAG_TRYHARD;
+		goto retry;
+	}
+
+	zio_buf_free(pad2, VDEV_PAD_SIZE);
+	return (error);
+}
+
 /*
  * ==========================================================================
  * uberblock load/sync
@@ -794,7 +921,7 @@ retry:
  * come back up, we fail to see the uberblock for txg + 1 because, say,
  * it was on a mirrored device and the replica to which we wrote txg + 1
  * is now offline.  If we then make some changes and sync txg + 1, and then
- * the missing replica comes back, then for a new seconds we'll have two
+ * the missing replica comes back, then for a few seconds we'll have two
  * conflicting uberblocks on disk with the same txg.  The solution is simple:
  * among uberblocks with equal txg, choose the one with the latest timestamp.
  */
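
For illustration, the selection rule just described reduces to the following
(assuming the ub_txg and ub_timestamp fields of uberblock_t; the in-tree
comparator is vdev_uberblock_compare(), patched below):

	if (ub1->ub_txg != ub2->ub_txg)
		return (ub1->ub_txg < ub2->ub_txg ? -1 : 1);
	/* equal txg: the uberblock written latest wins */
	if (ub1->ub_timestamp != ub2->ub_timestamp)
		return (ub1->ub_timestamp < ub2->ub_timestamp ? -1 : 1);
	return (0);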
@@ -814,46 +941,47 @@ vdev_uberblock_compare(uberblock_t *ub1,
 	return (0);
 }
 
+struct ubl_cbdata {
+	uberblock_t	*ubl_ubbest;	/* Best uberblock */
+	vdev_t		*ubl_vd;	/* vdev associated with the above */
+};
+
 static void
 vdev_uberblock_load_done(zio_t *zio)
 {
+	vdev_t *vd = zio->io_vd;
 	spa_t *spa = zio->io_spa;
 	zio_t *rio = zio->io_private;
 	uberblock_t *ub = zio->io_data;
-	uberblock_t *ubbest = rio->io_private;
+	struct ubl_cbdata *cbp = rio->io_private;
 
-	ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd));
+	ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
 
 	if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
 		mutex_enter(&rio->io_lock);
 		if (ub->ub_txg <= spa->spa_load_max_txg &&
-		    vdev_uberblock_compare(ub, ubbest) > 0)
-			*ubbest = *ub;
+		    vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
+			/*
+			 * Keep track of the vdev in which this uberblock
+			 * was found. We will use this information later
+			 * to obtain the config nvlist associated with
+			 * this uberblock.
+			 */
+			*cbp->ubl_ubbest = *ub;
+			cbp->ubl_vd = vd;
+		}
 		mutex_exit(&rio->io_lock);
 	}
 
 	zio_buf_free(zio->io_data, zio->io_size);
 }
 
-void
-vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest)
+static void
+vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
+    struct ubl_cbdata *cbp)
 {
-	spa_t *spa = vd->vdev_spa;
-	vdev_t *rvd = spa->spa_root_vdev;
-	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
-	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
-
-	if (vd == rvd) {
-		ASSERT(zio == NULL);
-		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-		zio = zio_root(spa, NULL, ubbest, flags);
-		bzero(ubbest, sizeof (uberblock_t));
-	}
-
-	ASSERT(zio != NULL);
-
 	for (int c = 0; c < vd->vdev_children; c++)
-		vdev_uberblock_load(zio, vd->vdev_child[c], ubbest);
+		vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
 
 	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
 		for (int l = 0; l < VDEV_LABELS; l++) {
@@ -866,11 +994,46 @@ vdev_uberblock_load(zio_t *zio, vdev_t *
 			}
 		}
 	}
+}
 
-	if (vd == rvd) {
-		(void) zio_wait(zio);
-		spa_config_exit(spa, SCL_ALL, FTAG);
-	}
+/*
+ * Reads the 'best' uberblock from disk along with its associated
+ * configuration. First, we read the uberblock array of each label of each
+ * vdev, keeping track of the uberblock with the highest txg in each array.
+ * Then, we read the configuration from the same vdev as the best uberblock.
+ */
+void
+vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
+{
+	zio_t *zio;
+	spa_t *spa = rvd->vdev_spa;
+	struct ubl_cbdata cb;
+	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
+
+	ASSERT(ub);
+	ASSERT(config);
+
+	bzero(ub, sizeof (uberblock_t));
+	*config = NULL;
+
+	cb.ubl_ubbest = ub;
+	cb.ubl_vd = NULL;
+
+	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+	zio = zio_root(spa, NULL, &cb, flags);
+	vdev_uberblock_load_impl(zio, rvd, flags, &cb);
+	(void) zio_wait(zio);
+
+	/*
+	 * It's possible that the best uberblock was discovered on a label
+	 * that has a configuration which was written in a future txg.
+	 * Search all labels on this vdev to find the configuration that
+	 * matches the txg for our uberblock.
+	 */
+	if (cb.ubl_vd != NULL)
+		*config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
+	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
@@ -883,7 +1046,7 @@ vdev_uberblock_sync_done(zio_t *zio)
 	uint64_t *good_writes = zio->io_private;
 
 	if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
-		atomic_add_64(good_writes, 1);
+		atomic_inc_64(good_writes);
 }
 
 /*
@@ -919,6 +1082,7 @@ vdev_uberblock_sync(zio_t *zio, uberbloc
 	zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
 }
 
+/* Sync the uberblocks to all vdevs in svd[] */
 int
 vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
 {
@@ -957,7 +1121,7 @@ vdev_label_sync_done(zio_t *zio)
 	uint64_t *good_writes = zio->io_private;
 
 	if (zio->io_error == 0)
-		atomic_add_64(good_writes, 1);
+		atomic_inc_64(good_writes);
 }
 
 /*
@@ -969,7 +1133,7 @@ vdev_label_sync_top_done(zio_t *zio)
 	uint64_t *good_writes = zio->io_private;
 
 	if (*good_writes == 0)
-		zio->io_error = EIO;
+		zio->io_error = SET_ERROR(EIO);
 
 	kmem_free(good_writes, sizeof (uint64_t));
 }
@@ -1082,15 +1246,16 @@ vdev_label_sync_list(spa_t *spa, int l, 
  * at any time, you can just call it again, and it will resume its work.
  */
 int
-vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard)
+vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
 {
 	spa_t *spa = svd[0]->vdev_spa;
 	uberblock_t *ub = &spa->spa_uberblock;
 	vdev_t *vd;
 	zio_t *zio;
-	int error;
+	int error = 0;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
 
+retry:
 	/*
 	 * Normally, we don't want to try too hard to write every label and
 	 * uberblock.  If there is a flaky disk, we don't want the rest of the
@@ -1098,8 +1263,11 @@ vdev_config_sync(vdev_t **svd, int svdco
 	 * single label out, we should retry with ZIO_FLAG_TRYHARD before
 	 * bailing out and declaring the pool faulted.
 	 */
-	if (tryhard)
+	if (error != 0) {
+		if ((flags & ZIO_FLAG_TRYHARD) != 0)
+			return (error);
 		flags |= ZIO_FLAG_TRYHARD;
+	}
 
 	ASSERT(ub->ub_txg <= txg);
 
@@ -1143,7 +1311,7 @@ vdev_config_sync(vdev_t **svd, int svdco
 	 * are committed to stable storage before the uberblock update.
 	 */
 	if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0)
-		return (error);
+		goto retry;
 
 	/*
 	 * Sync the uberblocks to all vdevs in svd[].
@@ -1161,7 +1329,7 @@ vdev_config_sync(vdev_t **svd, int svdco
 	 *	to the new uberblocks.
 	 */
 	if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
-		return (error);
+		goto retry;
 
 	/*
 	 * Sync out odd labels for every dirty vdev.  If the system dies
@@ -1173,5 +1341,10 @@ vdev_config_sync(vdev_t **svd, int svdco
 	 * to disk to ensure that all odd-label updates are committed to
 	 * stable storage before the next transaction group begins.
 	 */
-	return (vdev_label_sync_list(spa, 1, txg, flags));
+	if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0)
+		goto retry;
+
+	trim_thread_wakeup(spa);
+
+	return (0);
 }
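
The reworked vdev_config_sync() above replaces the caller-supplied "tryhard"
argument with a self-retry. A sketch of the new control flow, for review:

	error = 0;
retry:
	if (error != 0) {
		if (flags & ZIO_FLAG_TRYHARD)
			return (error);		/* second failure: give up */
		flags |= ZIO_FLAG_TRYHARD;	/* first failure: escalate */
	}
	/*
	 * Sync even labels, then uberblocks, then odd labels; any
	 * failure sets error and jumps back to retry.
	 */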
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 vdev_mirror.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c	27 Feb 2010 22:31:06 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c	18 Apr 2017 00:26:18 -0000
@@ -19,10 +19,14 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_impl.h>
@@ -37,27 +41,105 @@ typedef struct mirror_child {
 	vdev_t		*mc_vd;
 	uint64_t	mc_offset;
 	int		mc_error;
+	int		mc_load;
 	uint8_t		mc_tried;
 	uint8_t		mc_skipped;
 	uint8_t		mc_speculative;
 } mirror_child_t;
 
 typedef struct mirror_map {
+	int		*mm_preferred;
+	int		mm_preferred_cnt;
 	int		mm_children;
-	int		mm_replacing;
-	int		mm_preferred;
-	int		mm_root;
-	mirror_child_t	mm_child[1];
+	boolean_t	mm_replacing;
+	boolean_t	mm_root;
+	mirror_child_t	mm_child[];
 } mirror_map_t;
 
-int vdev_mirror_shift = 21;
+static int vdev_mirror_shift = 21;
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+SYSCTL_DECL(_vfs_zfs_vdev);
+static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
+    "ZFS VDEV Mirror");
+#endif
+#endif
+
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * non_rotating_seek_inc to 0 may well provide better results, as it
+ * will direct more reads to the non-rotating vdevs, which are likely
+ * to deliver higher performance.
+ */
+
+/* Rotating media load calculation configuration. */
+static int rotating_inc = 0;
+#ifdef _KERNEL
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RWTUN,
+    &rotating_inc, 0, "Rotating media load increment for non-seeking I/O's");
+#endif
+
+static int rotating_seek_inc = 5;
+#ifdef _KERNEL
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RWTUN,
+    &rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's");
+#endif
+
+static int rotating_seek_offset = 1 * 1024 * 1024;
+#ifdef _KERNEL
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RWTUN,
+    &rotating_seek_offset, 0, "Offset in bytes from the last I/O which "
+    "triggers a reduced rotating media seek increment");
+#endif
+
+/* Non-rotating media load calculation configuration. */
+static int non_rotating_inc = 0;
+#ifdef _KERNEL
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RWTUN,
+    &non_rotating_inc, 0,
+    "Non-rotating media load increment for non-seeking I/O's");
+#endif
+
+static int non_rotating_seek_inc = 1;
+#ifdef _KERNEL
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RWTUN,
+    &non_rotating_seek_inc, 0,
+    "Non-rotating media load increment for seeking I/O's");
+#endif
+
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+	return (offsetof(mirror_map_t, mm_child[children]) +
+	    sizeof(int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t replacing, boolean_t root)
+{
+	mirror_map_t *mm;
+
+	mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+	mm->mm_children = children;
+	mm->mm_replacing = replacing;
+	mm->mm_root = root;
+	mm->mm_preferred = (int *)((uintptr_t)mm +
+	    offsetof(mirror_map_t, mm_child[children]));
+
+	return mm;
+}
 
 static void
 vdev_mirror_map_free(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
 
-	kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+	kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
 }
 
 static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
@@ -65,55 +147,80 @@ static const zio_vsd_ops_t vdev_mirror_v
 	zio_vsd_default_cksum_report
 };
 
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+	uint64_t lastoffset;
+	int load;
+
+	/* All DVAs have equal weight at the root. */
+	if (mm->mm_root)
+		return (INT_MAX);
+
+	/*
+	 * We don't return INT_MAX if the device is resilvering (i.e.
+	 * vdev_resilver_txg != 0): in testing, overall performance was
+	 * slightly worse with that behaviour than without it.
+	 */
+
+	/* Standard load based on pending queue length. */
+	load = vdev_queue_length(vd);
+	lastoffset = vdev_queue_lastoffset(vd);
+
+	if (vd->vdev_rotation_rate == VDEV_RATE_NON_ROTATING) {
+		/* Non-rotating media. */
+		if (lastoffset == zio_offset)
+			return (load + non_rotating_inc);
+
+		/*
+		 * Apply a seek penalty even for non-rotating devices as
+		 * sequential I/Os can be aggregated into fewer operations
+		 * on the device, thus avoiding unnecessary per-command
+		 * overhead and boosting performance.
+		 */
+		return (load + non_rotating_seek_inc);
+	}
+
+	/* Rotating media I/O's which directly follow the last I/O. */
+	if (lastoffset == zio_offset)
+		return (load + rotating_inc);
+
+	/*
+	 * Apply half the seek increment to I/O's within seek offset
+	 * of the last I/O queued to this vdev, as they should incur less
+	 * of a seek increment.
+	 */
+	if (ABS(lastoffset - zio_offset) < rotating_seek_offset)
+		return (load + (rotating_seek_inc / 2));
+
+	/* Apply the full seek increment to all other I/O's. */
+	return (load + rotating_seek_inc);
+}
+
+
 static mirror_map_t *
-vdev_mirror_map_alloc(zio_t *zio)
+vdev_mirror_map_init(zio_t *zio)
 {
 	mirror_map_t *mm = NULL;
 	mirror_child_t *mc;
 	vdev_t *vd = zio->io_vd;
-	int c, d;
+	int c;
 
 	if (vd == NULL) {
 		dva_t *dva = zio->io_bp->blk_dva;
 		spa_t *spa = zio->io_spa;
 
-		c = BP_GET_NDVAS(zio->io_bp);
-
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
-		mm->mm_children = c;
-		mm->mm_replacing = B_FALSE;
-		mm->mm_preferred = spa_get_random(c);
-		mm->mm_root = B_TRUE;
-
-		/*
-		 * Check the other, lower-index DVAs to see if they're on
-		 * the same vdev as the child we picked.  If they are, use
-		 * them since they are likely to have been allocated from
-		 * the primary metaslab in use at the time, and hence are
-		 * more likely to have locality with single-copy data.
-		 */
-		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
-			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
-				mm->mm_preferred = d;
-		}
-
+		mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
+		    B_TRUE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
-
 			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
 			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
 		}
 	} else {
-		c = vd->vdev_children;
-
-		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
-		mm->mm_children = c;
-		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
-		    vd->vdev_ops == &vdev_spare_ops);
-		mm->mm_preferred = mm->mm_replacing ? 0 :
-		    (zio->io_offset >> vdev_mirror_shift) % c;
-		mm->mm_root = B_FALSE;
-
+		mm = vdev_mirror_map_alloc(vd->vdev_children,
+		    (vd->vdev_ops == &vdev_replacing_ops ||
+		    vd->vdev_ops == &vdev_spare_ops), B_FALSE);
 		for (c = 0; c < mm->mm_children; c++) {
 			mc = &mm->mm_child[c];
 			mc->mc_vd = vd->vdev_child[c];
@@ -127,14 +234,15 @@ vdev_mirror_map_alloc(zio_t *zio)
 }
 
 static int
-vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	int numerrors = 0;
 	int lasterror = 0;
 
 	if (vd->vdev_children == 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	vdev_open_children(vd);
@@ -149,7 +257,10 @@ vdev_mirror_open(vdev_t *vd, uint64_t *a
 		}
 
 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
-		*ashift = MAX(*ashift, cvd->vdev_ashift);
+		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
+		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+		*physical_ashift = MAX(*physical_ashift,
+		    cvd->vdev_physical_ashift);
 	}
 
 	if (numerrors == vd->vdev_children) {
@@ -184,9 +295,10 @@ vdev_mirror_scrub_done(zio_t *zio)
 
 	if (zio->io_error == 0) {
 		zio_t *pio;
+		zio_link_t *zl = NULL;
 
 		mutex_enter(&zio->io_lock);
-		while ((pio = zio_walk_parents(zio)) != NULL) {
+		while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
 			mutex_enter(&pio->io_lock);
 			ASSERT3U(zio->io_size, >=, pio->io_size);
 			bcopy(zio->io_data, pio->io_data, pio->io_size);
@@ -203,50 +315,124 @@ vdev_mirror_scrub_done(zio_t *zio)
 }
 
 /*
- * Try to find a child whose DTL doesn't contain the block we want to read.
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked.  If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+	dva_t *dva = zio->io_bp->blk_dva;
+	mirror_map_t *mm = zio->io_vsd;
+	int preferred;
+	int c;
+
+	preferred = mm->mm_preferred[p];
+	for (p-- ; p >= 0; p--) {
+		c = mm->mm_preferred[p];
+		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+			preferred = c;
+	}
+	return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+	mirror_map_t *mm = zio->io_vsd;
+	int p;
+
+	if (mm->mm_root) {
+		p = spa_get_random(mm->mm_preferred_cnt);
+		return (vdev_mirror_dva_select(zio, p));
+	}
+
+	/*
+	 * To ensure we don't always favour the first matching vdev,
+	 * which could lead to wear leveling issues on SSD's, we
+	 * use the I/O offset as a pseudo random seed into the vdevs
+	 * which have the lowest load.
+	 */
+	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+	return (mm->mm_preferred[p]);
+}
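
Worked example of the offset-seeded pick above, assuming the default
vdev_mirror_shift of 21 (2 MB granularity) and two equally-loaded children:

	/*
	 * offset 0x000000 -> (0x000000 >> 21) % 2 = 0 -> mm_preferred[0]
	 * offset 0x200000 -> (0x200000 >> 21) % 2 = 1 -> mm_preferred[1]
	 * Adjacent 2 MB spans alternate between the lowest-load children,
	 * so reads are spread rather than always hitting the first match.
	 */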
+
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read,
+ * preferring vdevs based on their determined load.
+ *
  * If we can't, try the read on any vdev we haven't already tried.
  */
 static int
 vdev_mirror_child_select(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
-	mirror_child_t *mc;
 	uint64_t txg = zio->io_txg;
-	int i, c;
+	int c, lowest_load;
 
 	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
 
-	/*
-	 * Try to find a child whose DTL doesn't contain the block to read.
-	 * If a child is known to be completely inaccessible (indicated by
-	 * vdev_readable() returning B_FALSE), don't even try.
-	 */
-	for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
-		if (c >= mm->mm_children)
-			c = 0;
+	lowest_load = INT_MAX;
+	mm->mm_preferred_cnt = 0;
+	for (c = 0; c < mm->mm_children; c++) {
+		mirror_child_t *mc;
+
 		mc = &mm->mm_child[c];
 		if (mc->mc_tried || mc->mc_skipped)
 			continue;
+
 		if (!vdev_readable(mc->mc_vd)) {
-			mc->mc_error = ENXIO;
+			mc->mc_error = SET_ERROR(ENXIO);
 			mc->mc_tried = 1;	/* don't even try */
 			mc->mc_skipped = 1;
 			continue;
 		}
-		if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
-			return (c);
-		mc->mc_error = ESTALE;
-		mc->mc_skipped = 1;
-		mc->mc_speculative = 1;
+
+		if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+			mc->mc_error = SET_ERROR(ESTALE);
+			mc->mc_skipped = 1;
+			mc->mc_speculative = 1;
+			continue;
+		}
+
+		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+		if (mc->mc_load > lowest_load)
+			continue;
+
+		if (mc->mc_load < lowest_load) {
+			lowest_load = mc->mc_load;
+			mm->mm_preferred_cnt = 0;
+		}
+		mm->mm_preferred[mm->mm_preferred_cnt] = c;
+		mm->mm_preferred_cnt++;
+	}
+
+	if (mm->mm_preferred_cnt == 1) {
+		vdev_queue_register_lastoffset(
+		    mm->mm_child[mm->mm_preferred[0]].mc_vd, zio);
+		return (mm->mm_preferred[0]);
+	}
+
+	if (mm->mm_preferred_cnt > 1) {
+		int c = vdev_mirror_preferred_child_randomize(zio);
+
+		vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio);
+		return (c);
 	}
 
 	/*
 	 * Every device is either missing or has this txg in its DTL.
 	 * Look for any child we haven't already tried before giving up.
 	 */
-	for (c = 0; c < mm->mm_children; c++)
-		if (!mm->mm_child[c].mc_tried)
+	for (c = 0; c < mm->mm_children; c++) {
+		if (!mm->mm_child[c].mc_tried) {
+			vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd,
+			    zio);
 			return (c);
+		}
+	}
 
 	/*
 	 * Every child failed.  There's no place left to look.
@@ -254,17 +440,18 @@ vdev_mirror_child_select(zio_t *zio)
 	return (-1);
 }
 
-static int
+static void
 vdev_mirror_io_start(zio_t *zio)
 {
 	mirror_map_t *mm;
 	mirror_child_t *mc;
 	int c, children;
 
-	mm = vdev_mirror_map_alloc(zio);
+	mm = vdev_mirror_map_init(zio);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
-		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
+		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing &&
+		    mm->mm_children > 1) {
 			/*
 			 * For scrubbing reads we need to allocate a read
 			 * buffer for each child and issue reads to all
@@ -279,7 +466,8 @@ vdev_mirror_io_start(zio_t *zio)
 				    zio->io_type, zio->io_priority, 0,
 				    vdev_mirror_scrub_done, mc));
 			}
-			return (ZIO_PIPELINE_CONTINUE);
+			zio_execute(zio);
+			return;
 		}
 		/*
 		 * For normal reads just pick one child.
@@ -287,10 +475,11 @@ vdev_mirror_io_start(zio_t *zio)
 		c = vdev_mirror_child_select(zio);
 		children = (c >= 0);
 	} else {
-		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+		ASSERT(zio->io_type == ZIO_TYPE_WRITE ||
+		    zio->io_type == ZIO_TYPE_FREE);
 
 		/*
-		 * Writes go to all children.
+		 * Writes and frees go to all children.
 		 */
 		c = 0;
 		children = mm->mm_children;
@@ -305,7 +494,7 @@ vdev_mirror_io_start(zio_t *zio)
 		c++;
 	}
 
-	return (ZIO_PIPELINE_CONTINUE);
+	zio_execute(zio);
 }
 
 static int
@@ -371,6 +560,8 @@ vdev_mirror_io_done(zio_t *zio)
 				zio->io_error = vdev_mirror_worst_error(mm);
 		}
 		return;
+	} else if (zio->io_type == ZIO_TYPE_FREE) {
+		return;
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -420,13 +611,13 @@ vdev_mirror_io_done(zio_t *zio)
 				    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
 				    zio->io_txg, 1))
 					continue;
-				mc->mc_error = ESTALE;
+				mc->mc_error = SET_ERROR(ESTALE);
 			}
 
 			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
 			    mc->mc_vd, mc->mc_offset,
 			    zio->io_data, zio->io_size,
-			    ZIO_TYPE_WRITE, zio->io_priority,
+			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
@@ -452,6 +643,8 @@ vdev_ops_t vdev_mirror_ops = {
 	vdev_mirror_io_start,
 	vdev_mirror_io_done,
 	vdev_mirror_state_change,
+	NULL,
+	NULL,
 	VDEV_TYPE_MIRROR,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
@@ -463,6 +656,8 @@ vdev_ops_t vdev_replacing_ops = {
 	vdev_mirror_io_start,
 	vdev_mirror_io_done,
 	vdev_mirror_state_change,
+	NULL,
+	NULL,
 	VDEV_TYPE_REPLACING,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
@@ -474,6 +669,8 @@ vdev_ops_t vdev_spare_ops = {
 	vdev_mirror_io_start,
 	vdev_mirror_io_done,
 	vdev_mirror_state_change,
+	NULL,
+	NULL,
 	VDEV_TYPE_SPARE,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
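
Layout sketch of the reworked mirror_map_t allocation above: a single
kmem_zalloc() of vdev_mirror_map_size(children) now carries both arrays,
with mm_preferred pointed just past the flexible mm_child[] member:

	/*
	 * mm -> +-------------------------------+
	 *       | mirror_map_t header           |
	 *       | mm_child[0 .. children-1]     |  flexible array member
	 *       +-------------------------------+  <- mm + offsetof(
	 *       | mm_preferred[0 .. children-1] |     mirror_map_t,
	 *       +-------------------------------+     mm_child[children])
	 */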
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 vdev_missing.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c	27 Feb 2010 22:31:12 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c	19 Nov 2014 12:25:12 -0000
@@ -19,11 +19,15 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+/*
  * The 'missing' vdev is a special vdev type used only during import.  It
  * signifies a placeholder in the root vdev for some vdev that we know is
  * missing.  We pass it down to the kernel to allow the rest of the
@@ -40,7 +44,8 @@
 
 /* ARGSUSED */
 static int
-vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift)
+vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	/*
 	 * Really this should just fail.  But then the root vdev will be in the
@@ -49,7 +54,9 @@ vdev_missing_open(vdev_t *vd, uint64_t *
 	 * will fail the GUID sum check before ever trying to open the pool.
 	 */
 	*psize = 0;
-	*ashift = 0;
+	*max_psize = 0;
+	*logical_ashift = 0;
+	*physical_ashift = 0;
 	return (0);
 }
 
@@ -60,11 +67,11 @@ vdev_missing_close(vdev_t *vd)
 }
 
 /* ARGSUSED */
-static int
+static void
 vdev_missing_io_start(zio_t *zio)
 {
-	zio->io_error = ENOTSUP;
-	return (ZIO_PIPELINE_CONTINUE);
+	zio->io_error = SET_ERROR(ENOTSUP);
+	zio_execute(zio);
 }
 
 /* ARGSUSED */
@@ -80,6 +87,8 @@ vdev_ops_t vdev_missing_ops = {
 	vdev_missing_io_start,
 	vdev_missing_io_done,
 	NULL,
+	NULL,
+	NULL,
 	VDEV_TYPE_MISSING,	/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
@@ -91,6 +100,8 @@ vdev_ops_t vdev_hole_ops = {
 	vdev_missing_io_start,
 	vdev_missing_io_done,
 	NULL,
+	NULL,
+	NULL,
 	VDEV_TYPE_HOLE,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 vdev_queue.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c	27 Feb 2010 22:31:12 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c	27 Mar 2017 06:19:46 -0000
@@ -23,28 +23,148 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
 #include <sys/zfs_context.h>
 #include <sys/vdev_impl.h>
+#include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/avl.h>
+#include <sys/dsl_pool.h>
+#include <sys/metaslab_impl.h>
+
+/*
+ * ZFS I/O Scheduler
+ * ---------------
+ *
+ * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios.  The
+ * I/O scheduler determines when and in what order those operations are
+ * issued.  The I/O scheduler divides operations into six I/O classes
+ * prioritized in the following order: sync read, sync write, async read,
+ * async write, scrub/resilver and trim.  Each queue defines the minimum and
+ * maximum number of concurrent operations that may be issued to the device.
+ * In addition, the device has an aggregate maximum. Note that the sum of the
+ * per-queue minimums must not exceed the aggregate maximum, and if the
+ * aggregate maximum is equal to or greater than the sum of the per-queue
+ * maximums, the per-queue minimum has no effect.
+ *
+ * For many physical devices, throughput increases with the number of
+ * concurrent operations, but latency typically suffers. Further, physical
+ * devices typically have a limit at which more concurrent operations have no
+ * effect on throughput or can actually cause it to decrease.
+ *
+ * The scheduler selects the next operation to issue by first looking for an
+ * I/O class whose minimum has not been satisfied. Once all are satisfied and
+ * the aggregate maximum has not been hit, the scheduler looks for classes
+ * whose maximum has not been satisfied. Iteration through the I/O classes is
+ * done in the order specified above. No further operations are issued if the
+ * aggregate maximum number of concurrent operations has been hit or if there
+ * are no operations queued for an I/O class that has not hit its maximum.
+ * Every time an I/O is queued or an operation completes, the I/O scheduler
+ * looks for new operations to issue.
+ *
+ * All I/O classes have a fixed maximum number of outstanding operations
+ * except for the async write class. Asynchronous writes represent the data
+ * that is committed to stable storage during the syncing stage for
+ * transaction groups (see txg.c). Transaction groups enter the syncing state
+ * periodically so the number of queued async writes will quickly burst up and
+ * then bleed down to zero. Rather than servicing them as quickly as possible,
+ * the I/O scheduler changes the maximum number of active async write I/Os
+ * according to the amount of dirty data in the pool (see dsl_pool.c). Since
+ * both throughput and latency typically increase with the number of
+ * concurrent operations issued to physical devices, reducing the burstiness
+ * in the number of concurrent operations also stabilizes the response time of
+ * operations from other -- and in particular synchronous -- queues. In broad
+ * strokes, the I/O scheduler will issue more concurrent operations from the
+ * async write queue as there's more dirty data in the pool.
+ *
+ * Async Writes
+ *
+ * The number of concurrent operations issued for the async write I/O class
+ * follows a piece-wise linear function defined by a few adjustable points.
+ *
+ *        |                   o---------| <-- zfs_vdev_async_write_max_active
+ *   ^    |                  /^         |
+ *   |    |                 / |         |
+ * active |                /  |         |
+ *  I/O   |               /   |         |
+ * count  |              /    |         |
+ *        |             /     |         |
+ *        |------------o      |         | <-- zfs_vdev_async_write_min_active
+ *       0|____________^______|_________|
+ *        0%           |      |       100% of zfs_dirty_data_max
+ *                     |      |
+ *                     |      `-- zfs_vdev_async_write_active_max_dirty_percent
+ *                     `--------- zfs_vdev_async_write_active_min_dirty_percent
+ *
+ * Until the amount of dirty data exceeds a minimum percentage of the dirty
+ * data allowed in the pool, the I/O scheduler will limit the number of
+ * concurrent operations to the minimum. As that threshold is crossed, the
+ * number of concurrent operations issued increases linearly to the maximum at
+ * the specified maximum percentage of the dirty data allowed in the pool.
+ *
+ * Ideally, the amount of dirty data on a busy pool will stay in the sloped
+ * part of the function between zfs_vdev_async_write_active_min_dirty_percent
+ * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
+ * maximum percentage, this indicates that the rate of incoming data is
+ * greater than the rate that the backend storage can handle. In this case, we
+ * must further throttle incoming writes (see dmu_tx_delay() for details).
+ */
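
A minimal sketch of the piece-wise linear function above; the in-tree
implementation is vdev_queue_max_async_writes() (beyond this hunk), and the
tunables referenced here are the ones declared just below:

	static uint32_t
	example_max_async_writes(uint64_t dirty, uint64_t dirty_max)
	{
		uint64_t lo = dirty_max *
		    zfs_vdev_async_write_active_min_dirty_percent / 100;
		uint64_t hi = dirty_max *
		    zfs_vdev_async_write_active_max_dirty_percent / 100;

		if (dirty <= lo)
			return (zfs_vdev_async_write_min_active);
		if (dirty >= hi)
			return (zfs_vdev_async_write_max_active);
		/* linear interpolation between the two anchor points */
		return (zfs_vdev_async_write_min_active +
		    (dirty - lo) * (zfs_vdev_async_write_max_active -
		    zfs_vdev_async_write_min_active) / (hi - lo));
	}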
 
 /*
- * These tunables are for performance analysis.
+ * The maximum number of I/Os active to each device.  Ideally, this will be >=
+ * the sum of each queue's max_active.  It must be at least the sum of each
+ * queue's min_active.
  */
+uint32_t zfs_vdev_max_active = 1000;
+
+/*
+ * Per-queue limits on the number of I/Os active to each device.  If the
+ * sum of the queue's max_active is < zfs_vdev_max_active, then the
+ * min_active comes into play.  We will send min_active from each queue,
+ * and then select from queues in the order defined by zio_priority_t.
+ *
+ * In general, smaller max_active's will lead to lower latency of synchronous
+ * operations.  Larger max_active's may lead to higher overall throughput,
+ * depending on underlying storage.
+ *
+ * The ratio of the queues' max_actives determines the balance of performance
+ * between reads, writes, and scrubs.  E.g., increasing
+ * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
+ * more quickly, but reads and writes to have higher latency and lower
+ * throughput.
+ */
+uint32_t zfs_vdev_sync_read_min_active = 10;
+uint32_t zfs_vdev_sync_read_max_active = 10;
+uint32_t zfs_vdev_sync_write_min_active = 10;
+uint32_t zfs_vdev_sync_write_max_active = 10;
+uint32_t zfs_vdev_async_read_min_active = 1;
+uint32_t zfs_vdev_async_read_max_active = 3;
+uint32_t zfs_vdev_async_write_min_active = 1;
+uint32_t zfs_vdev_async_write_max_active = 10;
+uint32_t zfs_vdev_scrub_min_active = 1;
+uint32_t zfs_vdev_scrub_max_active = 2;
+uint32_t zfs_vdev_trim_min_active = 1;
 /*
- * zfs_vdev_max_pending is the maximum number of i/os concurrently
- * pending to each device.  zfs_vdev_min_pending is the initial number
- * of i/os pending to each device (before it starts ramping up to
- * max_pending).
+ * TRIM max active is large in comparison to the other values because
+ * TRIM I/Os are coalesced at the device layer. This value is set such
+ * that a typical SSD can process the queued I/Os in a single request.
  */
-int zfs_vdev_max_pending = 10;
-int zfs_vdev_min_pending = 4;
+uint32_t zfs_vdev_trim_max_active = 64;
 
-/* deadline = pri + ddi_get_lbolt64() >> time_shift) */
-int zfs_vdev_time_shift = 6;
 
-/* exponential I/O issue ramp-up rate */
-int zfs_vdev_ramp_rate = 2;
+/*
+ * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
+ * dirty data, use zfs_vdev_async_write_min_active.  When it has more than
+ * zfs_vdev_async_write_active_max_dirty_percent, use
+ * zfs_vdev_async_write_max_active. The value is linearly interpolated
+ * between min and max.
+ */
+int zfs_vdev_async_write_active_min_dirty_percent = 30;
+int zfs_vdev_async_write_active_max_dirty_percent = 60;
 
 /*
  * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
@@ -52,24 +172,135 @@ int zfs_vdev_ramp_rate = 2;
  * we include spans of optional I/Os to aid aggregation at the disk even when
  * they aren't able to help us aggregate at this level.
  */
-int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
 int zfs_vdev_read_gap_limit = 32 << 10;
 int zfs_vdev_write_gap_limit = 4 << 10;
 
 /*
- * Virtual device vector for disk I/O scheduling.
+ * Define the queue depth percentage for each top-level vdev. This
+ * percentage is used in conjunction with zfs_vdev_async_write_max_active
+ * to determine how many allocations a specific top-level vdev should
+ * handle. Once the queue depth reaches
+ * zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100,
+ * the allocator will stop allocating blocks on that top-level device.
+ * The default kernel setting is 1000%, which yields 100 allocations per
+ * device. For userland testing, the default setting is 300%, which
+ * equates to 30 allocations per device.
  */
+#ifdef _KERNEL
+int zfs_vdev_queue_depth_pct = 1000;
+#else
+int zfs_vdev_queue_depth_pct = 300;
+#endif
+
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+SYSCTL_DECL(_vfs_zfs_vdev);
+
+static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+    sysctl_zfs_async_write_active_min_dirty_percent, "I",
+    "Percentage of async write dirty data below which "
+    "async_write_min_active is used.");
+
+static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+    sysctl_zfs_async_write_active_max_dirty_percent, "I",
+    "Percentage of async write dirty data above which "
+    "async_write_max_active is used.");
+
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN,
+    &zfs_vdev_max_active, 0,
+    "The maximum number of I/Os of all types active for each device.");
+
+#define ZFS_VDEV_QUEUE_KNOB_MIN(name)					\
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\
+    &zfs_vdev_ ## name ## _min_active, 0,				\
+    "Initial number of I/O requests of type " #name			\
+    " active for each device");
+
+#define ZFS_VDEV_QUEUE_KNOB_MAX(name)					\
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN,\
+    &zfs_vdev_ ## name ## _max_active, 0,				\
+    "Maximum number of I/O requests of type " #name			\
+    " active for each device");
+
+ZFS_VDEV_QUEUE_KNOB_MIN(sync_read);
+ZFS_VDEV_QUEUE_KNOB_MAX(sync_read);
+ZFS_VDEV_QUEUE_KNOB_MIN(sync_write);
+ZFS_VDEV_QUEUE_KNOB_MAX(sync_write);
+ZFS_VDEV_QUEUE_KNOB_MIN(async_read);
+ZFS_VDEV_QUEUE_KNOB_MAX(async_read);
+ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
+ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
+ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
+ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
+ZFS_VDEV_QUEUE_KNOB_MIN(trim);
+ZFS_VDEV_QUEUE_KNOB_MAX(trim);
+
+#undef ZFS_VDEV_QUEUE_KNOB_MIN
+#undef ZFS_VDEV_QUEUE_KNOB_MAX
+
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN,
+    &zfs_vdev_aggregation_limit, 0,
+    "I/O requests are aggregated up to this size");
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN,
+    &zfs_vdev_read_gap_limit, 0,
+    "Acceptable gap between two reads being aggregated");
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN,
+    &zfs_vdev_write_gap_limit, 0,
+    "Acceptable gap between two writes being aggregated");
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN,
+    &zfs_vdev_queue_depth_pct, 0,
+    "Queue depth percentage for each top-level");
+
+static int
+sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS)
+{
+	int val, err;
+
+	val = zfs_vdev_async_write_active_min_dirty_percent;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < 0 || val > 100 ||
+	    val >= zfs_vdev_async_write_active_max_dirty_percent)
+		return (EINVAL);
+
+	zfs_vdev_async_write_active_min_dirty_percent = val;
+
+	return (0);
+}
+
+static int
+sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS)
+{
+	int val, err;
+
+	val = zfs_vdev_async_write_active_max_dirty_percent;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < 0 || val > 100 ||
+	    val <= zfs_vdev_async_write_active_min_dirty_percent)
+		return (EINVAL);
+
+	zfs_vdev_async_write_active_max_dirty_percent = val;
+
+	return (0);
+}
+#endif
+#endif
+
 int
-vdev_queue_deadline_compare(const void *x1, const void *x2)
+vdev_queue_offset_compare(const void *x1, const void *x2)
 {
 	const zio_t *z1 = x1;
 	const zio_t *z2 = x2;
 
-	if (z1->io_deadline < z2->io_deadline)
-		return (-1);
-	if (z1->io_deadline > z2->io_deadline)
-		return (1);
-
 	if (z1->io_offset < z2->io_offset)
 		return (-1);
 	if (z1->io_offset > z2->io_offset)
@@ -83,12 +314,34 @@ vdev_queue_deadline_compare(const void *
 	return (0);
 }
 
+static inline avl_tree_t *
+vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
+{
+	return (&vq->vq_class[p].vqc_queued_tree);
+}
+
+static inline avl_tree_t *
+vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
+{
+	if (t == ZIO_TYPE_READ)
+		return (&vq->vq_read_offset_tree);
+	else if (t == ZIO_TYPE_WRITE)
+		return (&vq->vq_write_offset_tree);
+	else
+		return (NULL);
+}
+
 int
-vdev_queue_offset_compare(const void *x1, const void *x2)
+vdev_queue_timestamp_compare(const void *x1, const void *x2)
 {
 	const zio_t *z1 = x1;
 	const zio_t *z2 = x2;
 
+	if (z1->io_timestamp < z2->io_timestamp)
+		return (-1);
+	if (z1->io_timestamp > z2->io_timestamp)
+		return (1);
+
 	if (z1->io_offset < z2->io_offset)
 		return (-1);
 	if (z1->io_offset > z2->io_offset)
@@ -108,18 +361,35 @@ vdev_queue_init(vdev_t *vd)
 	vdev_queue_t *vq = &vd->vdev_queue;
 
 	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+	vq->vq_vdev = vd;
+
+	avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
+	    sizeof (zio_t), offsetof(struct zio, io_queue_node));
+	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
+	    vdev_queue_offset_compare, sizeof (zio_t),
+	    offsetof(struct zio, io_offset_node));
+	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
+	    vdev_queue_offset_compare, sizeof (zio_t),
+	    offsetof(struct zio, io_offset_node));
 
-	avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
-	    sizeof (zio_t), offsetof(struct zio, io_deadline_node));
+	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+		int (*compfn) (const void *, const void *);
 
-	avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
-	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
+		/*
+		 * The synchronous i/o queues are dispatched in FIFO rather
+		 * than LBA order.  This provides more consistent latency for
+		 * these i/os.
+		 */
+		if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
+			compfn = vdev_queue_timestamp_compare;
+		else
+			compfn = vdev_queue_offset_compare;
 
-	avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
-	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
+		avl_create(vdev_queue_class_tree(vq, p), compfn,
+		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
+	}
 
-	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
-	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
+	vq->vq_lastoffset = 0;
 }
 
 void
@@ -127,10 +397,11 @@ vdev_queue_fini(vdev_t *vd)
 {
 	vdev_queue_t *vq = &vd->vdev_queue;
 
-	avl_destroy(&vq->vq_deadline_tree);
-	avl_destroy(&vq->vq_read_tree);
-	avl_destroy(&vq->vq_write_tree);
-	avl_destroy(&vq->vq_pending_tree);
+	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
+		avl_destroy(vdev_queue_class_tree(vq, p));
+	avl_destroy(&vq->vq_active_tree);
+	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
+	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
 
 	mutex_destroy(&vq->vq_lock);
 }
@@ -138,30 +409,231 @@ vdev_queue_fini(vdev_t *vd)
 static void
 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 {
-	avl_add(&vq->vq_deadline_tree, zio);
-	avl_add(zio->io_vdev_tree, zio);
+	spa_t *spa = zio->io_spa;
+	avl_tree_t *qtt;
+
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+	qtt = vdev_queue_type_tree(vq, zio->io_type);
+	if (qtt)
+		avl_add(qtt, zio);
+
+#ifdef illumos
+	mutex_enter(&spa->spa_iokstat_lock);
+	spa->spa_queue_stats[zio->io_priority].spa_queued++;
+	if (spa->spa_iokstat != NULL)
+		kstat_waitq_enter(spa->spa_iokstat->ks_data);
+	mutex_exit(&spa->spa_iokstat_lock);
+#endif
 }
 
 static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
-	avl_remove(&vq->vq_deadline_tree, zio);
-	avl_remove(zio->io_vdev_tree, zio);
+	spa_t *spa = zio->io_spa;
+	avl_tree_t *qtt;
+
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+	qtt = vdev_queue_type_tree(vq, zio->io_type);
+	if (qtt)
+		avl_remove(qtt, zio);
+
+#ifdef illumos
+	mutex_enter(&spa->spa_iokstat_lock);
+	ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
+	spa->spa_queue_stats[zio->io_priority].spa_queued--;
+	if (spa->spa_iokstat != NULL)
+		kstat_waitq_exit(spa->spa_iokstat->ks_data);
+	mutex_exit(&spa->spa_iokstat_lock);
+#endif
 }
 
 static void
-vdev_queue_agg_io_done(zio_t *aio)
+vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 {
-	zio_t *pio;
+	spa_t *spa = zio->io_spa;
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	vq->vq_class[zio->io_priority].vqc_active++;
+	avl_add(&vq->vq_active_tree, zio);
+
+#ifdef illumos
+	mutex_enter(&spa->spa_iokstat_lock);
+	spa->spa_queue_stats[zio->io_priority].spa_active++;
+	if (spa->spa_iokstat != NULL)
+		kstat_runq_enter(spa->spa_iokstat->ks_data);
+	mutex_exit(&spa->spa_iokstat_lock);
+#endif
+}
 
-	while ((pio = zio_walk_parents(aio)) != NULL)
-		if (aio->io_type == ZIO_TYPE_READ)
+static void
+vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+	vq->vq_class[zio->io_priority].vqc_active--;
+	avl_remove(&vq->vq_active_tree, zio);
+
+#ifdef illumos
+	mutex_enter(&spa->spa_iokstat_lock);
+	ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
+	spa->spa_queue_stats[zio->io_priority].spa_active--;
+	if (spa->spa_iokstat != NULL) {
+		kstat_io_t *ksio = spa->spa_iokstat->ks_data;
+
+		kstat_runq_exit(spa->spa_iokstat->ks_data);
+		if (zio->io_type == ZIO_TYPE_READ) {
+			ksio->reads++;
+			ksio->nread += zio->io_size;
+		} else if (zio->io_type == ZIO_TYPE_WRITE) {
+			ksio->writes++;
+			ksio->nwritten += zio->io_size;
+		}
+	}
+	mutex_exit(&spa->spa_iokstat_lock);
+#endif
+}
+
+static void
+vdev_queue_agg_io_done(zio_t *aio)
+{
+	if (aio->io_type == ZIO_TYPE_READ) {
+		zio_t *pio;
+		zio_link_t *zl = NULL;
+		while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
 			bcopy((char *)aio->io_data + (pio->io_offset -
 			    aio->io_offset), pio->io_data, pio->io_size);
+		}
+	}
 
 	zio_buf_free(aio->io_data, aio->io_size);
 }
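
The read path of the aggregation callback above hands each parent its
slice of the aggregate buffer at (parent offset - aggregate offset).
A standalone sketch of that copy-back arithmetic, with invented offsets
and buffer contents:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char agg[16] = "ABCDEFGHIJKLMNO";	/* aggregate read at offset 100 */
	char parent[5] = { 0 };			/* parent i/o: offset 104, size 4 */

	memcpy(parent, agg + (104 - 100), 4);
	printf("%s\n", parent);			/* EFGH */
	return (0);
}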
 
+static int
+vdev_queue_class_min_active(zio_priority_t p)
+{
+	switch (p) {
+	case ZIO_PRIORITY_SYNC_READ:
+		return (zfs_vdev_sync_read_min_active);
+	case ZIO_PRIORITY_SYNC_WRITE:
+		return (zfs_vdev_sync_write_min_active);
+	case ZIO_PRIORITY_ASYNC_READ:
+		return (zfs_vdev_async_read_min_active);
+	case ZIO_PRIORITY_ASYNC_WRITE:
+		return (zfs_vdev_async_write_min_active);
+	case ZIO_PRIORITY_SCRUB:
+		return (zfs_vdev_scrub_min_active);
+	case ZIO_PRIORITY_TRIM:
+		return (zfs_vdev_trim_min_active);
+	default:
+		panic("invalid priority %u", p);
+		return (0);
+	}
+}
+
+static __noinline int
+vdev_queue_max_async_writes(spa_t *spa)
+{
+	int writes;
+	uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
+	uint64_t min_bytes = zfs_dirty_data_max *
+	    zfs_vdev_async_write_active_min_dirty_percent / 100;
+	uint64_t max_bytes = zfs_dirty_data_max *
+	    zfs_vdev_async_write_active_max_dirty_percent / 100;
+
+	/*
+	 * Sync tasks correspond to interactive user actions. To reduce the
+	 * execution time of those actions, we push data out as fast as possible.
+	 */
+	if (spa_has_pending_synctask(spa)) {
+		return (zfs_vdev_async_write_max_active);
+	}
+
+	if (dirty < min_bytes)
+		return (zfs_vdev_async_write_min_active);
+	if (dirty > max_bytes)
+		return (zfs_vdev_async_write_max_active);
+
+	/*
+	 * linear interpolation:
+	 * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
+	 * move right by min_bytes
+	 * move up by min_writes
+	 */
+	writes = (dirty - min_bytes) *
+	    (zfs_vdev_async_write_max_active -
+	    zfs_vdev_async_write_min_active) /
+	    (max_bytes - min_bytes) +
+	    zfs_vdev_async_write_min_active;
+	ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
+	ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
+	return (writes);
+}
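
A worked instance of the interpolation above, as a standalone program.
The values used (1/10 min/max active writes, 30%/60% dirty thresholds,
a 4 GB zfs_dirty_data_max) are assumptions for illustration; the real
numbers come from the tunables:

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	uint64_t dirty_max = 4ULL << 30;		/* 4 GB */
	uint64_t min_bytes = dirty_max * 30 / 100;
	uint64_t max_bytes = dirty_max * 60 / 100;
	int min_active = 1, max_active = 10;
	uint64_t dirty = dirty_max * 45 / 100;		/* halfway between */
	int writes;

	writes = (dirty - min_bytes) * (max_active - min_active) /
	    (max_bytes - min_bytes) + min_active;
	printf("%d\n", writes);		/* 5: roughly midway from 1 to 10 */
	return (0);
}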
+
+static int
+vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
+{
+	switch (p) {
+	case ZIO_PRIORITY_SYNC_READ:
+		return (zfs_vdev_sync_read_max_active);
+	case ZIO_PRIORITY_SYNC_WRITE:
+		return (zfs_vdev_sync_write_max_active);
+	case ZIO_PRIORITY_ASYNC_READ:
+		return (zfs_vdev_async_read_max_active);
+	case ZIO_PRIORITY_ASYNC_WRITE:
+		return (vdev_queue_max_async_writes(spa));
+	case ZIO_PRIORITY_SCRUB:
+		return (zfs_vdev_scrub_max_active);
+	case ZIO_PRIORITY_TRIM:
+		return (zfs_vdev_trim_max_active);
+	default:
+		panic("invalid priority %u", p);
+		return (0);
+	}
+}
+
+/*
+ * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if
+ * there is no eligible class.
+ */
+static zio_priority_t
+vdev_queue_class_to_issue(vdev_queue_t *vq)
+{
+	spa_t *spa = vq->vq_vdev->vdev_spa;
+	zio_priority_t p;
+
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
+		return (ZIO_PRIORITY_NUM_QUEUEABLE);
+
+	/* find a queue that has not reached its minimum # outstanding i/os */
+	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+		    vq->vq_class[p].vqc_active <
+		    vdev_queue_class_min_active(p))
+			return (p);
+	}
+
+	/*
+	 * If we haven't found a queue, look for one that hasn't reached its
+	 * maximum # outstanding i/os.
+	 */
+	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+		    vq->vq_class[p].vqc_active <
+		    vdev_queue_class_max_active(spa, p))
+			return (p);
+	}
+
+	/* No eligible queued i/os */
+	return (ZIO_PRIORITY_NUM_QUEUEABLE);
+}
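
A standalone sketch of the two-pass scan above: the first pass serves
any class still below its minimum, the second any class below its
maximum, always in priority order.  The per-class numbers are invented:

#include <stdio.h>

#define	NCLASS	3

int
main(void)
{
	int queued[NCLASS] = { 4, 2, 7 };	/* queued i/os per class */
	int active[NCLASS] = { 1, 0, 3 };	/* in-flight i/os per class */
	int minq[NCLASS] = { 1, 1, 2 };
	int maxq[NCLASS] = { 3, 3, 5 };
	int p;

	for (p = 0; p < NCLASS; p++)
		if (queued[p] > 0 && active[p] < minq[p])
			break;
	if (p == NCLASS)
		for (p = 0; p < NCLASS; p++)
			if (queued[p] > 0 && active[p] < maxq[p])
				break;
	if (p < NCLASS)
		printf("issue from class %d\n", p);	/* 1: below its min */
	else
		printf("nothing eligible\n");
	return (0);
}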
+
 /*
  * Compute the range spanned by two i/os, which is the endpoint of the last
  * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
@@ -172,153 +644,192 @@ vdev_queue_agg_io_done(zio_t *aio)
 #define	IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
 
 static zio_t *
-vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
+vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 {
-	zio_t *fio, *lio, *aio, *dio, *nio, *mio;
+	zio_t *first, *last, *aio, *dio, *mandatory, *nio;
+	void *abuf;
+	uint64_t maxgap = 0;
+	uint64_t size;
+	boolean_t stretch;
 	avl_tree_t *t;
-	int flags;
-	uint64_t maxspan = zfs_vdev_aggregation_limit;
-	uint64_t maxgap;
-	int stretch;
+	enum zio_flag flags;
 
-again:
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 
-	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
-	    avl_numnodes(&vq->vq_deadline_tree) == 0)
+	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
 		return (NULL);
 
-	fio = lio = avl_first(&vq->vq_deadline_tree);
+	first = last = zio;
 
-	t = fio->io_vdev_tree;
-	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
-	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;
+	if (zio->io_type == ZIO_TYPE_READ)
+		maxgap = zfs_vdev_read_gap_limit;
 
-	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
-		/*
-		 * We can aggregate I/Os that are sufficiently adjacent and of
-		 * the same flavor, as expressed by the AGG_INHERIT flags.
-		 * The latter requirement is necessary so that certain
-		 * attributes of the I/O, such as whether it's a normal I/O
-		 * or a scrub/resilver, can be preserved in the aggregate.
-		 * We can include optional I/Os, but don't allow them
-		 * to begin a range as they add no benefit in that situation.
-		 */
+	/*
+	 * We can aggregate I/Os that are sufficiently adjacent and of
+	 * the same flavor, as expressed by the AGG_INHERIT flags.
+	 * The latter requirement is necessary so that certain
+	 * attributes of the I/O, such as whether it's a normal I/O
+	 * or a scrub/resilver, can be preserved in the aggregate.
+	 * We can include optional I/Os, but don't allow them
+	 * to begin a range as they add no benefit in that situation.
+	 */
 
-		/*
-		 * We keep track of the last non-optional I/O.
-		 */
-		mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
+	/*
+	 * We keep track of the last non-optional I/O.
+	 */
+	mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
 
-		/*
-		 * Walk backwards through sufficiently contiguous I/Os
-		 * recording the last non-option I/O.
-		 */
-		while ((dio = AVL_PREV(t, fio)) != NULL &&
-		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
-		    IO_SPAN(dio, lio) <= maxspan &&
-		    IO_GAP(dio, fio) <= maxgap) {
-			fio = dio;
-			if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
-				mio = fio;
-		}
+	/*
+	 * Walk backwards through sufficiently contiguous I/Os
+	 * recording the last non-optional I/O.
+	 */
+	flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+	t = vdev_queue_type_tree(vq, zio->io_type);
+	while (t != NULL && (dio = AVL_PREV(t, first)) != NULL &&
+	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+	    IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
+	    IO_GAP(dio, first) <= maxgap) {
+		first = dio;
+		if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
+			mandatory = first;
+	}
 
-		/*
-		 * Skip any initial optional I/Os.
-		 */
-		while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
-			fio = AVL_NEXT(t, fio);
-			ASSERT(fio != NULL);
-		}
+	/*
+	 * Skip any initial optional I/Os.
+	 */
+	while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
+		first = AVL_NEXT(t, first);
+		ASSERT(first != NULL);
+	}
 
-		/*
-		 * Walk forward through sufficiently contiguous I/Os.
-		 */
-		while ((dio = AVL_NEXT(t, lio)) != NULL &&
-		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
-		    IO_SPAN(fio, dio) <= maxspan &&
-		    IO_GAP(lio, dio) <= maxgap) {
-			lio = dio;
-			if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
-				mio = lio;
-		}
+	/*
+	 * Walk forward through sufficiently contiguous I/Os.
+	 */
+	while ((dio = AVL_NEXT(t, last)) != NULL &&
+	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+	    IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit &&
+	    IO_GAP(last, dio) <= maxgap) {
+		last = dio;
+		if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
+			mandatory = last;
+	}
 
-		/*
-		 * Now that we've established the range of the I/O aggregation
-		 * we must decide what to do with trailing optional I/Os.
-		 * For reads, there's nothing to do. While we are unable to
-		 * aggregate further, it's possible that a trailing optional
-		 * I/O would allow the underlying device to aggregate with
-		 * subsequent I/Os. We must therefore determine if the next
-		 * non-optional I/O is close enough to make aggregation
-		 * worthwhile.
-		 */
-		stretch = B_FALSE;
-		if (t != &vq->vq_read_tree && mio != NULL) {
-			nio = lio;
-			while ((dio = AVL_NEXT(t, nio)) != NULL &&
-			    IO_GAP(nio, dio) == 0 &&
-			    IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
-				nio = dio;
-				if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
-					stretch = B_TRUE;
-					break;
-				}
+	/*
+	 * Now that we've established the range of the I/O aggregation
+	 * we must decide what to do with trailing optional I/Os.
+	 * For reads, there's nothing to do. While we are unable to
+	 * aggregate further, it's possible that a trailing optional
+	 * I/O would allow the underlying device to aggregate with
+	 * subsequent I/Os. We must therefore determine if the next
+	 * non-optional I/O is close enough to make aggregation
+	 * worthwhile.
+	 */
+	stretch = B_FALSE;
+	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
+		zio_t *nio = last;
+		while ((dio = AVL_NEXT(t, nio)) != NULL &&
+		    IO_GAP(nio, dio) == 0 &&
+		    IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
+			nio = dio;
+			if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
+				stretch = B_TRUE;
+				break;
 			}
 		}
+	}
 
-		if (stretch) {
-			/* This may be a no-op. */
-			VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
-			dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
-		} else {
-			while (lio != mio && lio != fio) {
-				ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
-				lio = AVL_PREV(t, lio);
-				ASSERT(lio != NULL);
-			}
+	if (stretch) {
+		/* This may be a no-op. */
+		dio = AVL_NEXT(t, last);
+		dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
+	} else {
+		while (last != mandatory && last != first) {
+			ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
+			last = AVL_PREV(t, last);
+			ASSERT(last != NULL);
 		}
 	}
 
-	if (fio != lio) {
-		uint64_t size = IO_SPAN(fio, lio);
-		ASSERT(size <= zfs_vdev_aggregation_limit);
-
-		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
-		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
-		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
-		    vdev_queue_agg_io_done, NULL);
-
-		nio = fio;
-		do {
-			dio = nio;
-			nio = AVL_NEXT(t, dio);
-			ASSERT(dio->io_type == aio->io_type);
-			ASSERT(dio->io_vdev_tree == t);
-
-			if (dio->io_flags & ZIO_FLAG_NODATA) {
-				ASSERT(dio->io_type == ZIO_TYPE_WRITE);
-				bzero((char *)aio->io_data + (dio->io_offset -
-				    aio->io_offset), dio->io_size);
-			} else if (dio->io_type == ZIO_TYPE_WRITE) {
-				bcopy(dio->io_data, (char *)aio->io_data +
-				    (dio->io_offset - aio->io_offset),
-				    dio->io_size);
-			}
+	if (first == last)
+		return (NULL);
+
+	size = IO_SPAN(first, last);
+	ASSERT3U(size, <=, zfs_vdev_aggregation_limit);
+
+	abuf = zio_buf_alloc_nowait(size);
+	if (abuf == NULL)
+		return (NULL);
+
+	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
+	    abuf, size, first->io_type, zio->io_priority,
+	    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+	    vdev_queue_agg_io_done, NULL);
+	aio->io_timestamp = first->io_timestamp;
+
+	nio = first;
+	do {
+		dio = nio;
+		nio = AVL_NEXT(t, dio);
+		ASSERT3U(dio->io_type, ==, aio->io_type);
+
+		if (dio->io_flags & ZIO_FLAG_NODATA) {
+			ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
+			bzero((char *)aio->io_data + (dio->io_offset -
+			    aio->io_offset), dio->io_size);
+		} else if (dio->io_type == ZIO_TYPE_WRITE) {
+			bcopy(dio->io_data, (char *)aio->io_data +
+			    (dio->io_offset - aio->io_offset),
+			    dio->io_size);
+		}
+
+		zio_add_child(dio, aio);
+		vdev_queue_io_remove(vq, dio);
+		zio_vdev_io_bypass(dio);
+		zio_execute(dio);
+	} while (dio != last);
+
+	return (aio);
+}
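
The walk above is bounded by the IO_SPAN/IO_GAP tests.  A standalone
illustration of that arithmetic, with invented offsets and limits:

#include <inttypes.h>
#include <stdio.h>

struct io { uint64_t off, size; };

#define	SPAN(f, l)	((l)->off + (l)->size - (f)->off)
#define	GAP(f, l)	((l)->off - ((f)->off + (f)->size))

int
main(void)
{
	struct io a = { 0, 8192 };
	struct io b = { 8192, 8192 };	/* adjacent to a: gap 0 */
	struct io c = { 20480, 4096 };	/* gap of 4096 after b */
	uint64_t agg_limit = 131072, gap_limit = 32768;

	if (SPAN(&a, &c) <= agg_limit && GAP(&b, &c) <= gap_limit)
		printf("a, b, c may aggregate (span %" PRIu64
		    ", gap %" PRIu64 ")\n", SPAN(&a, &c), GAP(&b, &c));
	return (0);
}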
+
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq)
+{
+	zio_t *zio, *aio;
+	zio_priority_t p;
+	avl_index_t idx;
+	avl_tree_t *tree;
+	zio_t search;
 
-			zio_add_child(dio, aio);
-			vdev_queue_io_remove(vq, dio);
-			zio_vdev_io_bypass(dio);
-			zio_execute(dio);
-		} while (dio != lio);
+again:
+	ASSERT(MUTEX_HELD(&vq->vq_lock));
 
-		avl_add(&vq->vq_pending_tree, aio);
+	p = vdev_queue_class_to_issue(vq);
 
-		return (aio);
+	if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
+		/* No eligible queued i/os */
+		return (NULL);
 	}
 
-	ASSERT(fio->io_vdev_tree == t);
-	vdev_queue_io_remove(vq, fio);
+	/*
+	 * For LBA-ordered queues (async / scrub), issue the i/o which follows
+	 * the most recently issued i/o in LBA (offset) order.
+	 *
+	 * For FIFO queues (sync), issue the i/o with the lowest timestamp.
+	 */
+	tree = vdev_queue_class_tree(vq, p);
+	search.io_timestamp = 0;
+	search.io_offset = vq->vq_last_offset + 1;
+	VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
+	zio = avl_nearest(tree, idx, AVL_AFTER);
+	if (zio == NULL)
+		zio = avl_first(tree);
+	ASSERT3U(zio->io_priority, ==, p);
+
+	aio = vdev_queue_aggregate(vq, zio);
+	if (aio != NULL)
+		zio = aio;
+	else
+		vdev_queue_io_remove(vq, zio);
 
 	/*
 	 * If the I/O is or was optional and therefore has no data, we need to
@@ -326,17 +837,18 @@ again:
 	 * deadlock that we could encounter since this I/O will complete
 	 * immediately.
 	 */
-	if (fio->io_flags & ZIO_FLAG_NODATA) {
+	if (zio->io_flags & ZIO_FLAG_NODATA) {
 		mutex_exit(&vq->vq_lock);
-		zio_vdev_io_bypass(fio);
-		zio_execute(fio);
+		zio_vdev_io_bypass(zio);
+		zio_execute(zio);
 		mutex_enter(&vq->vq_lock);
 		goto again;
 	}
 
-	avl_add(&vq->vq_pending_tree, fio);
+	vdev_queue_pending_add(vq, zio);
+	vq->vq_last_offset = zio->io_offset;
 
-	return (fio);
+	return (zio);
 }
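
For the LBA-ordered case, avl_nearest(..., AVL_AFTER) above implements a
simple elevator: resume from the last issued offset and wrap to the
lowest queued offset when nothing lies beyond it.  A sketch with a
sorted array standing in for the AVL tree (offsets invented):

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	uint64_t queued[] = { 100, 300, 700 };
	uint64_t last = 300;		/* most recently issued offset */
	uint64_t pick = queued[0];	/* wrap default: lowest offset */
	int i;

	for (i = 0; i < 3; i++) {
		if (queued[i] > last) {
			pick = queued[i];
			break;
		}
	}
	printf("next offset: %" PRIu64 "\n", pick);	/* 700 */
	return (0);
}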
 
 zio_t *
@@ -345,27 +857,33 @@ vdev_queue_io(zio_t *zio)
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 	zio_t *nio;
 
-	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
-
 	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
 		return (zio);
 
-	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+	/*
+	 * Children i/os inherit their parent's priority, which might
+	 * not match the child's i/o type.  Fix it up here.
+	 */
+	if (zio->io_type == ZIO_TYPE_READ) {
+		if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
+		    zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
+		    zio->io_priority != ZIO_PRIORITY_SCRUB)
+			zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
+	} else if (zio->io_type == ZIO_TYPE_WRITE) {
+		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
+		    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
+			zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+	} else {
+		ASSERT(zio->io_type == ZIO_TYPE_FREE);
+		zio->io_priority = ZIO_PRIORITY_TRIM;
+	}
 
-	if (zio->io_type == ZIO_TYPE_READ)
-		zio->io_vdev_tree = &vq->vq_read_tree;
-	else
-		zio->io_vdev_tree = &vq->vq_write_tree;
+	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
 
 	mutex_enter(&vq->vq_lock);
-
-	zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) +
-	    zio->io_priority;
-
+	zio->io_timestamp = gethrtime();
 	vdev_queue_io_add(vq, zio);
-
-	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
-
+	nio = vdev_queue_io_to_issue(vq);
 	mutex_exit(&vq->vq_lock);
 
 	if (nio == NULL)
@@ -383,15 +901,15 @@ void
 vdev_queue_io_done(zio_t *zio)
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+	zio_t *nio;
 
 	mutex_enter(&vq->vq_lock);
 
-	avl_remove(&vq->vq_pending_tree, zio);
+	vdev_queue_pending_remove(vq, zio);
 
-	for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
-		zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
-		if (nio == NULL)
-			break;
+	vq->vq_io_complete_ts = gethrtime();
+
+	while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
 		mutex_exit(&vq->vq_lock);
 		if (nio->io_done == vdev_queue_agg_io_done) {
 			zio_nowait(nio);
@@ -404,3 +922,26 @@ vdev_queue_io_done(zio_t *zio)
 
 	mutex_exit(&vq->vq_lock);
 }
+
+/*
+ * As these three functions are only used for load calculations, we're not
+ * concerned if we get an incorrect value on 32-bit platforms due to the
+ * lack of vq_lock protection here; instead we prefer to keep them
+ * lock-free for performance.
+ */
+int
+vdev_queue_length(vdev_t *vd)
+{
+	return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+}
+
+uint64_t
+vdev_queue_lastoffset(vdev_t *vd)
+{
+	return (vd->vdev_queue.vq_lastoffset);
+}
+
+void
+vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio)
+{
+	vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size;
+}
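
These accessors exist so that callers can do load-aware child selection
(e.g. balancing mirror reads).  A hypothetical standalone sketch of that
kind of policy, preferring the child with the shorter active queue and
breaking ties by seek distance from its last registered offset:

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

struct child { int qlen; uint64_t lastoff; };

int
main(void)
{
	struct child c[2] = { { 3, 1000 }, { 3, 9000 } };
	uint64_t target = 8192;		/* offset of the read to place */
	int best = 0;

	if (c[1].qlen < c[0].qlen)
		best = 1;
	else if (c[1].qlen == c[0].qlen &&
	    llabs((long long)(c[1].lastoff - target)) <
	    llabs((long long)(c[0].lastoff - target)))
		best = 1;
	printf("read from child %d\n", best);	/* 1: same load, nearer */
	return (0);
}
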
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 vdev_raidz.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c	27 Feb 2010 22:31:14 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c	5 May 2017 18:03:34 -0000
@@ -20,17 +20,27 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_impl.h>
+#ifndef __FreeBSD__
+#include <sys/vdev_disk.h>
+#endif
+#include <sys/vdev_file.h>
+#include <sys/vdev_raidz.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
+#ifdef __FreeBSD__
+#include <sys/bio.h>
+#endif
 
 /*
  * Virtual device vector for RAID-Z.
@@ -60,6 +70,7 @@
  *   o addition (+) is represented by a bitwise XOR
  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  *   o multiplication of A by 2 is defined by the following bitwise expression:
+ *
  *	(A * 2)_7 = A_6
  *	(A * 2)_6 = A_5
  *	(A * 2)_5 = A_4
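
The bitwise expression described here is the usual GF(2^8)
multiply-by-two with 0x1d feedback (the low byte of the generator
polynomial); note that 0x80 is followed by 0x1d in vdev_raidz_pow2[]
below.  A standalone rendering for reference:

#include <stdint.h>
#include <stdio.h>

static uint8_t
gf_mul2(uint8_t a)
{
	/* Shift left; if bit 7 was set, fold in the 0x1d feedback term. */
	return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

int
main(void)
{
	printf("%#x\n", gf_mul2(0x80));	/* 0x1d, matching the table */
	return (0);
}
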
@@ -118,7 +129,7 @@ typedef struct raidz_map {
 	uint64_t rm_missingparity;	/* Count of missing parity devices */
 	uint64_t rm_firstdatacol;	/* First data column/parity count */
 	uint64_t rm_nskip;		/* Skipped sectors for padding */
-	uint64_t rm_skipstart;	/* Column index of padding start */
+	uint64_t rm_skipstart;		/* Column index of padding start */
 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
 	uintptr_t rm_reports;		/* # of referencing checksum reports */
 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
@@ -153,15 +164,14 @@ typedef struct raidz_map {
 	VDEV_RAIDZ_64MUL_2((x), mask); \
 }
 
+#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)
+
 /*
  * Force reconstruction to use the general purpose method.
  */
 int vdev_raidz_default_to_general;
 
-/*
- * These two tables represent powers and logs of 2 in the Galois field defined
- * above. These values were computed by repeatedly multiplying by 2 as above.
- */
+/* Powers of 2 in the Galois field defined above. */
 static const uint8_t vdev_raidz_pow2[256] = {
 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
@@ -196,6 +206,7 @@ static const uint8_t vdev_raidz_pow2[256
 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
 };
+/* Logs of 2 in the Galois field defined above. */
 static const uint8_t vdev_raidz_log2[256] = {
 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
@@ -259,7 +270,9 @@ vdev_raidz_map_free(raidz_map_t *rm)
 	size_t size;
 
 	for (c = 0; c < rm->rm_firstdatacol; c++) {
-		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
+		if (rm->rm_col[c].rc_data != NULL)
+			zio_buf_free(rm->rm_col[c].rc_data,
+			    rm->rm_col[c].rc_size);
 
 		if (rm->rm_col[c].rc_gdata != NULL)
 			zio_buf_free(rm->rm_col[c].rc_gdata,
@@ -281,7 +294,7 @@ vdev_raidz_map_free_vsd(zio_t *zio)
 {
 	raidz_map_t *rm = zio->io_vsd;
 
-	ASSERT3U(rm->rm_freed, ==, 0);
+	ASSERT0(rm->rm_freed);
 	rm->rm_freed = 1;
 
 	if (rm->rm_reports == 0)
@@ -431,23 +444,50 @@ static const zio_vsd_ops_t vdev_raidz_vs
 	vdev_raidz_cksum_report
 };
 
+/*
+ * Divides the IO evenly across all child vdevs; usually, dcols is
+ * the number of children in the target vdev.
+ */
 static raidz_map_t *
-vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
-    uint64_t nparity)
+vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
+    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
 {
 	raidz_map_t *rm;
-	uint64_t b = zio->io_offset >> unit_shift;
-	uint64_t s = zio->io_size >> unit_shift;
+	/* The starting RAIDZ (parent) vdev sector of the block. */
+	uint64_t b = offset >> unit_shift;
+	/* The zio's size in units of the vdev's minimum sector size. */
+	uint64_t s = size >> unit_shift;
+	/* The first column for this stripe. */
 	uint64_t f = b % dcols;
+	/* The starting byte offset on each child vdev. */
 	uint64_t o = (b / dcols) << unit_shift;
 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 
+	/*
+	 * "Quotient": The number of data sectors for this stripe on all but
+	 * the "big column" child vdevs that also contain "remainder" data.
+	 */
 	q = s / (dcols - nparity);
+
+	/*
+	 * "Remainder": The number of partial stripe data sectors in this I/O.
+	 * This will add a sector to some, but not all, child vdevs.
+	 */
 	r = s - q * (dcols - nparity);
+
+	/* The number of "big columns" - those which contain remainder data. */
 	bc = (r == 0 ? 0 : r + nparity);
+
+	/*
+	 * The total number of data and parity sectors associated with
+	 * this I/O.
+	 */
 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
 
+	/* acols: The columns that will be accessed. */
+	/* scols: The columns that will be accessed or skipped. */
 	if (q == 0) {
+		/* Our I/O request doesn't span all child vdevs. */
 		acols = bc;
 		scols = MIN(dcols, roundup(bc, nparity + 1));
 	} else {
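
A worked instance of the geometry above: a 32 KB write with 512-byte
sectors (unit_shift = 9) on a 5-wide RAID-Z1 (dcols = 5, nparity = 1).
The configuration is invented for illustration:

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	uint64_t size = 32768, unit_shift = 9, dcols = 5, nparity = 1;
	uint64_t s = size >> unit_shift;		/* 64 sectors */
	uint64_t q = s / (dcols - nparity);		/* 16 full rows */
	uint64_t r = s - q * (dcols - nparity);		/* 0 left over */
	uint64_t bc = (r == 0 ? 0 : r + nparity);	/* no big columns */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	printf("q=%" PRIu64 " r=%" PRIu64 " bc=%" PRIu64 " tot=%" PRIu64 "\n",
	    q, r, bc, tot);		/* q=16 r=0 bc=0 tot=80 */
	return (0);
}
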
@@ -504,14 +544,20 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_
 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
 	ASSERT3U(rm->rm_nskip, <=, nparity);
 
-	for (c = 0; c < rm->rm_firstdatacol; c++)
-		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
-
-	rm->rm_col[c].rc_data = zio->io_data;
+	if (!dofree) {
+		for (c = 0; c < rm->rm_firstdatacol; c++) {
+			rm->rm_col[c].rc_data =
+			    zio_buf_alloc(rm->rm_col[c].rc_size);
+		}
 
-	for (c = c + 1; c < acols; c++)
-		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
-		    rm->rm_col[c - 1].rc_size;
+		rm->rm_col[c].rc_data = data;
+
+		for (c = c + 1; c < acols; c++) {
+			rm->rm_col[c].rc_data =
+			    (char *)rm->rm_col[c - 1].rc_data +
+			    rm->rm_col[c - 1].rc_size;
+		}
+	}
 
 	/*
 	 * If all data stored spans all columns, there's a danger that parity
@@ -536,7 +582,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_
 	ASSERT(rm->rm_cols >= 2);
 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
 
-	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
 		devidx = rm->rm_col[0].rc_devidx;
 		o = rm->rm_col[0].rc_offset;
 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
@@ -548,8 +594,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_
 			rm->rm_skipstart = 1;
 	}
 
-	zio->io_vsd = rm;
-	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
 	return (rm);
 }
 
@@ -959,12 +1003,9 @@ vdev_raidz_reconstruct_pq(raidz_map_t *r
  *           ~~                               ~~
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
- *           | 128  64  32  16  8   4   2   1  |
  *           |  19 205 116  29  64  16  4   1  |
  *           |  1   0   0   0   0   0   0   0  |
- *           |  0   1   0   0   0   0   0   0  |
- *  (V|I)' = |  0   0   1   0   0   0   0   0  |
- *           |  0   0   0   1   0   0   0   0  |
+ *  (V|I)' = |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
@@ -1134,7 +1175,7 @@ vdev_raidz_matrix_invert(raidz_map_t *rm
 	 */
 	for (i = 0; i < nmissing; i++) {
 		for (j = 0; j < missing[i]; j++) {
-			ASSERT3U(rows[i][j], ==, 0);
+			ASSERT0(rows[i][j]);
 		}
 		ASSERT3U(rows[i][missing[i]], !=, 0);
 
@@ -1175,7 +1216,7 @@ vdev_raidz_matrix_invert(raidz_map_t *rm
 			if (j == missing[i]) {
 				ASSERT3U(rows[i][j], ==, 1);
 			} else {
-				ASSERT3U(rows[i][j], ==, 0);
+				ASSERT0(rows[i][j]);
 			}
 		}
 	}
@@ -1190,7 +1231,8 @@ vdev_raidz_matrix_reconstruct(raidz_map_
 	uint64_t ccount;
 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
-	uint8_t log, val;
+	uint8_t log = 0;
+	uint8_t val;
 	int ll;
 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
 	uint8_t *p, *pp;
@@ -1442,7 +1484,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, 
 }
 
 static int
-vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	vdev_t *cvd;
 	uint64_t nparity = vd->vdev_nparity;
@@ -1455,7 +1498,7 @@ vdev_raidz_open(vdev_t *vd, uint64_t *as
 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
 	    vd->vdev_children < nparity + 1) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	vdev_open_children(vd);
@@ -1470,10 +1513,14 @@ vdev_raidz_open(vdev_t *vd, uint64_t *as
 		}
 
 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
-		*ashift = MAX(*ashift, cvd->vdev_ashift);
+		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
+		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+		*physical_ashift = MAX(*physical_ashift,
+		    cvd->vdev_physical_ashift);
 	}
 
 	*asize *= vd->vdev_children;
+	*max_asize *= vd->vdev_children;
 
 	if (numerrors > nparity) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
@@ -1492,6 +1539,154 @@ vdev_raidz_close(vdev_t *vd)
 		vdev_close(vd->vdev_child[c]);
 }
 
+#ifdef illumos
+/*
+ * Handle a read or write I/O to a RAID-Z dump device.
+ *
+ * The dump device is in a unique situation compared to other ZFS datasets:
+ * writing to this device should be as simple and fast as possible.  In
+ * addition, durability matters much less since the dump will be extracted
+ * once the machine reboots.  For that reason, this function eschews parity for
+ * performance and simplicity.  The dump device uses the checksum setting
+ * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
+ * dataset.
+ *
+ * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
+ * 128 KB will not fill an entire block; in addition, they may not be properly
+ * aligned.  In that case, this function uses the preallocated 128 KB block and
+ * omits reading or writing any "empty" portions of that block, as opposed to
+ * allocating a fresh appropriately-sized block.
+ *
+ * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
+ *
+ *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
+ *
+ * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
+ * allocated which spans all five child vdevs.  8 KB of data would be written to
+ * each of four vdevs, with the fifth containing the parity bits.
+ *
+ *       parity    data     data     data     data
+ *     |   PP   |   XX   |   XX   |   XX   |   XX   |
+ *         ^        ^        ^        ^        ^
+ *         |        |        |        |        |
+ *   8 KB parity    ------8 KB data blocks------
+ *
+ * However, when writing to the dump device, the behavior is different:
+ *
+ *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
+ *
+ * Unlike the normal RAID-Z case in which the block is allocated based on the
+ * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
+ * I/O size is less than 128 KB, only the actual portions of data are written.
+ * In this example the data is written to the third data vdev since that vdev
+ * contains the offset [64 KB, 96 KB).
+ *
+ *       parity    data     data     data     data
+ *     |        |        |        |   XX   |        |
+ *                                    ^
+ *                                    |
+ *                             32 KB data block
+ *
+ * As a result, an individual I/O may not span all child vdevs; moreover, a
+ * small I/O may only operate on a single child vdev.
+ *
+ * Note that since there are no parity bits calculated or written, this format
+ * remains the same no matter how many parity bits are used in a normal RAID-Z
+ * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
+ * would look like:
+ *
+ *       parity   parity   parity    data     data     data     data
+ *     |        |        |        |        |        |   XX   |        |
+ *                                                      ^
+ *                                                      |
+ *                                               32 KB data block
+ */
+int
+vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
+    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
+{
+	vdev_t *tvd = vd->vdev_top;
+	vdev_t *cvd;
+	raidz_map_t *rm;
+	raidz_col_t *rc;
+	int c, err = 0;
+
+	uint64_t start, end, colstart, colend;
+	uint64_t coloffset, colsize, colskip;
+
+	int flags = doread ? BIO_READ : BIO_WRITE;
+
+#ifdef	_KERNEL
+
+	/*
+	 * Don't write past the end of the block
+	 */
+	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
+
+	start = offset;
+	end = start + size;
+
+	/*
+	 * Allocate a RAID-Z map for this block.  Note that this block starts
+	 * from the "original" offset, that is, the offset of the extent which
+	 * contains the requisite offset of the data being read or written.
+	 *
+	 * Even if this I/O operation doesn't span the full block size, let's
+	 * treat the on-disk format as if the only blocks are the complete 128
+	 * KB size.
+	 */
+	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
+	    SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
+	    vd->vdev_children, vd->vdev_nparity);
+
+	coloffset = origoffset;
+
+	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
+	    c++, coloffset += rc->rc_size) {
+		rc = &rm->rm_col[c];
+		cvd = vd->vdev_child[rc->rc_devidx];
+
+		/*
+		 * Find the start and end of this column in the RAID-Z map,
+		 * keeping in mind that the stated size and offset of the
+		 * operation may not fill the entire column for this vdev.
+		 *
+		 * If any portion of the data spans this column, issue the
+		 * appropriate operation to the vdev.
+		 */
+		if (coloffset + rc->rc_size <= start)
+			continue;
+		if (coloffset >= end)
+			continue;
+
+		colstart = MAX(coloffset, start);
+		colend = MIN(end, coloffset + rc->rc_size);
+		colsize = colend - colstart;
+		colskip = colstart - coloffset;
+
+		VERIFY3U(colsize, <=, rc->rc_size);
+		VERIFY3U(colskip, <=, rc->rc_size);
+
+		/*
+		 * Note that the child vdev will have a vdev label at the start
+		 * of its range of offsets, hence the need for
+		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
+		 * example of why this calculation is needed.
+		 */
+		if ((err = vdev_disk_physio(cvd,
+		    ((char *)rc->rc_data) + colskip, colsize,
+		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
+		    flags, isdump)) != 0)
+			break;
+	}
+
+	vdev_raidz_map_free(rm);
+#endif	/* _KERNEL */
+
+	return (err);
+}
+#endif
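
The column clipping in the loop above reduces to a pair of MAX/MIN
operations.  A standalone illustration with invented numbers (a 32 KB
I/O exactly covering one 32 KB column):

#include <inttypes.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t start = 65536, end = start + 32768;	/* the I/O */
	uint64_t coloffset = 65536, colsz = 32768;	/* one column */
	uint64_t colstart = MAX(coloffset, start);
	uint64_t colend = MIN(end, coloffset + colsz);

	printf("colsize=%" PRIu64 " colskip=%" PRIu64 "\n",
	    colend - colstart, colstart - coloffset);	/* 32768, 0 */
	return (0);
}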
+
 static uint64_t
 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
 {
@@ -1517,7 +1712,24 @@ vdev_raidz_child_done(zio_t *zio)
 	rc->rc_skipped = 0;
 }
 
-static int
+/*
+ * Start an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ *   1. Generate the parity data
+ *   2. Create child zio write operations to each column's vdev, for both
+ *      data and parity.
+ *   3. If the column skips any sectors for padding, create optional dummy
+ *      write zio children for those areas to improve aggregation continuity.
+ * - For read operations:
+ *   1. Create child zio read operations to each data column's vdev to read
+ *      the range of data required for zio.
+ *   2. If this is a scrub or resilver operation, or if any of the data
+ *      vdevs have had errors, then create zio read operations to the parity
+ *      columns' VDevs as well.
+ */
+static void
 vdev_raidz_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
@@ -1527,11 +1739,30 @@ vdev_raidz_io_start(zio_t *zio)
 	raidz_col_t *rc;
 	int c, i;
 
-	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
+	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
+	    zio->io_type == ZIO_TYPE_FREE,
+	    tvd->vdev_ashift, vd->vdev_children,
 	    vd->vdev_nparity);
 
+	zio->io_vsd = rm;
+	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
 
+	if (zio->io_type == ZIO_TYPE_FREE) {
+		for (c = 0; c < rm->rm_cols; c++) {
+			rc = &rm->rm_col[c];
+			cvd = vd->vdev_child[rc->rc_devidx];
+			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+			    rc->rc_offset, rc->rc_data, rc->rc_size,
+			    zio->io_type, zio->io_priority, 0,
+			    vdev_raidz_child_done, rc));
+		}
+
+		zio_execute(zio);
+		return;
+	}
+
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		vdev_raidz_generate_parity(rm);
 
@@ -1561,7 +1792,8 @@ vdev_raidz_io_start(zio_t *zio)
 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
 		}
 
-		return (ZIO_PIPELINE_CONTINUE);
+		zio_execute(zio);
+		return;
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -1578,7 +1810,7 @@ vdev_raidz_io_start(zio_t *zio)
 				rm->rm_missingdata++;
 			else
 				rm->rm_missingparity++;
-			rc->rc_error = ENXIO;
+			rc->rc_error = SET_ERROR(ENXIO);
 			rc->rc_tried = 1;	/* don't even try */
 			rc->rc_skipped = 1;
 			continue;
@@ -1588,7 +1820,7 @@ vdev_raidz_io_start(zio_t *zio)
 				rm->rm_missingdata++;
 			else
 				rm->rm_missingparity++;
-			rc->rc_error = ESTALE;
+			rc->rc_error = SET_ERROR(ESTALE);
 			rc->rc_skipped = 1;
 			continue;
 		}
@@ -1601,9 +1833,10 @@ vdev_raidz_io_start(zio_t *zio)
 		}
 	}
 
-	return (ZIO_PIPELINE_CONTINUE);
+	zio_execute(zio);
 }
 
+
 /*
  * Report a checksum error for a child of a RAID-Z device.
  */
@@ -1659,6 +1892,13 @@ raidz_parity_verify(zio_t *zio, raidz_ma
 	int c, ret = 0;
 	raidz_col_t *rc;
 
+	blkptr_t *bp = zio->io_bp;
+	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+
+	if (checksum == ZIO_CHECKSUM_NOPARITY)
+		return (ret);
+
 	for (c = 0; c < rm->rm_firstdatacol; c++) {
 		rc = &rm->rm_col[c];
 		if (!rc->rc_tried || rc->rc_error != 0)
@@ -1675,7 +1915,7 @@ raidz_parity_verify(zio_t *zio, raidz_ma
 			continue;
 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
 			raidz_checksum_error(zio, rc, orig[c]);
-			rc->rc_error = ECKSUM;
+			rc->rc_error = SET_ERROR(ECKSUM);
 			ret++;
 		}
 		zio_buf_free(orig[c], rc->rc_size);
@@ -1799,7 +2039,7 @@ vdev_raidz_combrec(zio_t *zio, int total
 					if (rc->rc_tried)
 						raidz_checksum_error(zio, rc,
 						    orig[i]);
-					rc->rc_error = ECKSUM;
+					rc->rc_error = SET_ERROR(ECKSUM);
 				}
 
 				ret = code;
@@ -1856,6 +2096,27 @@ done:
 	return (ret);
 }
 
+/*
+ * Complete an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ *   1. Check for errors on the child IOs.
+ *   2. Return, setting an error code if too few child VDevs were written
+ *      to reconstruct the data later.  Note that partial writes are
+ *      considered successful if they can be reconstructed at all.
+ * - For read operations:
+ *   1. Check for errors on the child IOs.
+ *   2. If data errors occurred:
+ *      a. Try to reassemble the data from the parity available.
+ *      b. If we haven't yet read the parity drives, read them now.
+ *      c. If all parity drives have been read but the data still doesn't
+ *         reassemble with a correct checksum, then try combinatorial
+ *         reconstruction.
+ *      d. If that doesn't work, return an error.
+ *   3. If there were unexpected errors or this is a resilver operation,
+ *      rewrite the vdevs that had errors.
+ */
 static void
 vdev_raidz_io_done(zio_t *zio)
 {
@@ -1913,6 +2174,8 @@ vdev_raidz_io_done(zio_t *zio)
 			zio->io_error = vdev_raidz_worst_error(rm);
 
 		return;
+	} else if (zio->io_type == ZIO_TYPE_FREE) {
+		return;
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -2075,7 +2338,7 @@ vdev_raidz_io_done(zio_t *zio)
 		 * Start checksum ereports for all children which haven't
 		 * failed, and the IO wasn't speculative.
 		 */
-		zio->io_error = ECKSUM;
+		zio->io_error = SET_ERROR(ECKSUM);
 
 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 			for (c = 0; c < rm->rm_cols; c++) {
@@ -2113,7 +2376,7 @@ done:
 
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_data, rc->rc_size,
-			    ZIO_TYPE_WRITE, zio->io_priority,
+			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
 		}
@@ -2139,6 +2402,8 @@ vdev_ops_t vdev_raidz_ops = {
 	vdev_raidz_io_start,
 	vdev_raidz_io_done,
 	vdev_raidz_state_change,
+	NULL,
+	NULL,
 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 vdev_root.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c	27 Feb 2010 22:31:14 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c	2 Sep 2013 11:38:20 -0000
@@ -19,10 +19,14 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_impl.h>
@@ -50,14 +54,15 @@ too_many_errors(vdev_t *vd, int numerror
 }
 
 static int
-vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
+vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	int lasterror = 0;
 	int numerrors = 0;
 
 	if (vd->vdev_children == 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	vdev_open_children(vd);
@@ -77,7 +82,9 @@ vdev_root_open(vdev_t *vd, uint64_t *asi
 	}
 
 	*asize = 0;
-	*ashift = 0;
+	*max_asize = 0;
+	*logical_ashift = 0;
+	*physical_ashift = 0;
 
 	return (0);
 }
@@ -109,6 +116,8 @@ vdev_ops_t vdev_root_ops = {
 	NULL,			/* io_start - not applicable to the root */
 	NULL,			/* io_done - not applicable to the root */
 	vdev_root_state_change,
+	NULL,
+	NULL,
 	VDEV_TYPE_ROOT,		/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zap.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c	27 Feb 2010 22:31:15 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c	2 May 2017 00:32:08 -0000
@@ -19,8 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 /*
@@ -50,9 +51,9 @@
 
 int fzap_default_block_shift = 14; /* 16k blocksize */
 
-static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
-static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+extern inline zap_phys_t *zap_f_phys(zap_t *zap);
 
+static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
 
 void
 fzap_byteswap(void *vbuf, size_t size)
@@ -80,13 +81,13 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, z
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 	zap->zap_ismicro = FALSE;
 
-	(void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
-	    &zap->zap_f.zap_phys, zap_evict);
+	zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync;
+	zap->zap_dbu.dbu_evict_func_async = NULL;
 
 	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
-	zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
+	zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
 
-	zp = zap->zap_f.zap_phys;
+	zp = zap_f_phys(zap);
 	/*
 	 * explicitly zero it since it might be coming from an
 	 * initialized microzap
@@ -112,12 +113,11 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, z
 	 * set up block 1 - the first leaf
 	 */
 	VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
-	    1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db));
+	    1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
 	dmu_buf_will_dirty(db, tx);
 
 	l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 	l->l_dbuf = db;
-	l->l_phys = db->db_data;
 
 	zap_leaf_init(l, zp->zap_normflags != 0);
 
@@ -162,9 +162,10 @@ zap_table_grow(zap_t *zap, zap_table_phy
 	} else {
 		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
 		tbl->zt_nextblk = newblk;
-		ASSERT3U(tbl->zt_blks_copied, ==, 0);
-		dmu_prefetch(zap->zap_objset, zap->zap_object,
-		    tbl->zt_blk << bs, tbl->zt_numblks << bs);
+		ASSERT0(tbl->zt_blks_copied);
+		dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+		    tbl->zt_blk << bs, tbl->zt_numblks << bs,
+		    ZIO_PRIORITY_SYNC_READ);
 	}
 
 	/*
@@ -173,20 +174,20 @@ zap_table_grow(zap_t *zap, zap_table_phy
 
 	b = tbl->zt_blks_copied;
 	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-	    (tbl->zt_blk + b) << bs, FTAG, &db_old);
+	    (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
 	if (err)
 		return (err);
 
 	/* first half of entries in old[b] go to new[2*b+0] */
 	VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
-	    (newblk + 2*b+0) << bs, FTAG, &db_new));
+	    (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 	dmu_buf_will_dirty(db_new, tx);
 	transfer_func(db_old->db_data, db_new->db_data, hepb);
 	dmu_buf_rele(db_new, FTAG);
 
 	/* second half of entries in old[b] go to new[2*b+1] */
 	VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
-	    (newblk + 2*b+1) << bs, FTAG, &db_new));
+	    (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
 	dmu_buf_will_dirty(db_new, tx);
 	transfer_func((uint64_t *)db_old->db_data + hepb,
 	    db_new->db_data, hepb);
@@ -234,7 +235,7 @@ zap_table_store(zap_t *zap, zap_table_ph
 	off = idx & ((1<<(bs-3))-1);
 
 	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-	    (tbl->zt_blk + blk) << bs, FTAG, &db);
+	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
 	if (err)
 		return (err);
 	dmu_buf_will_dirty(db, tx);
@@ -246,7 +247,8 @@ zap_table_store(zap_t *zap, zap_table_ph
 		dmu_buf_t *db2;
 
 		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-		    (tbl->zt_nextblk + blk2) << bs, FTAG, &db2);
+		    (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
+		    DMU_READ_NO_PREFETCH);
 		if (err) {
 			dmu_buf_rele(db, FTAG);
 			return (err);
@@ -269,6 +271,7 @@ zap_table_load(zap_t *zap, zap_table_phy
 	uint64_t blk, off;
 	int err;
 	dmu_buf_t *db;
+	dnode_t *dn;
 	int bs = FZAP_BLOCK_SHIFT(zap);
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
@@ -276,8 +279,15 @@ zap_table_load(zap_t *zap, zap_table_phy
 	blk = idx >> (bs-3);
 	off = idx & ((1<<(bs-3))-1);
 
-	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-	    (tbl->zt_blk + blk) << bs, FTAG, &db);
+	/*
+	 * Note: this is equivalent to dmu_buf_hold(), but we use
+	 * _dnode_enter / _by_dnode because it's faster: we don't have to
+	 * hold the dnode.
+	 */
+	dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+	err = dmu_buf_hold_by_dnode(dn,
+	    (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
+	dmu_buf_dnode_exit(zap->zap_dbuf);
 	if (err)
 		return (err);
 	*valp = ((uint64_t *)db->db_data)[off];
@@ -291,9 +301,13 @@ zap_table_load(zap_t *zap, zap_table_phy
 		 */
 		blk = (idx*2) >> (bs-3);
 
-		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-		    (tbl->zt_nextblk + blk) << bs, FTAG, &db);
-		dmu_buf_rele(db, FTAG);
+		dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+		err = dmu_buf_hold_by_dnode(dn,
+		    (tbl->zt_nextblk + blk) << bs, FTAG, &db,
+		    DMU_READ_NO_PREFETCH);
+		dmu_buf_dnode_exit(zap->zap_dbuf);
+		if (err == 0)
+			dmu_buf_rele(db, FTAG);
 	}
 	return (err);
 }
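
The blk/off split above is plain fixed-width indexing: with the default
16 KB blocks (fzap_default_block_shift = 14), each block holds
2^(14-3) = 2048 64-bit entries.  A standalone check:

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	int bs = 14;
	uint64_t idx = 5000;
	uint64_t blk = idx >> (bs - 3);			/* 5000 / 2048 = 2 */
	uint64_t off = idx & ((1 << (bs - 3)) - 1);	/* 5000 % 2048 = 904 */

	printf("blk=%" PRIu64 " off=%" PRIu64 "\n", blk, off);
	return (0);
}
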
@@ -322,10 +336,10 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx
 	 * If we are within 2 bits of running out, stop growing, since
 	 * this is already an aberrant condition.
 	 */
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
-		return (ENOSPC);
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
+		return (SET_ERROR(ENOSPC));
 
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 		/*
 		 * We are outgrowing the "embedded" ptrtbl (the one
 		 * stored in the header block).  Give it its own entire
@@ -335,13 +349,14 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx
 		dmu_buf_t *db_new;
 		int err;
 
-		ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+		ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
 		    ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
-		ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
+		ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
 
 		newblk = zap_allocate_blocks(zap, 1);
 		err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-		    newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new);
+		    newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
+		    DMU_READ_NO_PREFETCH);
 		if (err)
 			return (err);
 		dmu_buf_will_dirty(db_new, tx);
@@ -349,17 +364,17 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx
 		    db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
 		dmu_buf_rele(db_new, FTAG);
 
-		zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
-		zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
-		zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
+		zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
+		zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
+		zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
 
-		ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
-		    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+		ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
+		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
 		    (FZAP_BLOCK_SHIFT(zap)-3));
 
 		return (0);
 	} else {
-		return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+		return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
 		    zap_ptrtbl_transfer, tx));
 	}
 }
@@ -369,8 +384,8 @@ zap_increment_num_entries(zap_t *zap, in
 {
 	dmu_buf_will_dirty(zap->zap_dbuf, tx);
 	mutex_enter(&zap->zap_f.zap_num_entries_mtx);
-	ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
-	zap->zap_f.zap_phys->zap_num_entries += delta;
+	ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
+	zap_f_phys(zap)->zap_num_entries += delta;
 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 }
 
@@ -379,16 +394,25 @@ zap_allocate_blocks(zap_t *zap, int nblo
 {
 	uint64_t newblk;
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	newblk = zap->zap_f.zap_phys->zap_freeblk;
-	zap->zap_f.zap_phys->zap_freeblk += nblocks;
+	newblk = zap_f_phys(zap)->zap_freeblk;
+	zap_f_phys(zap)->zap_freeblk += nblocks;
 	return (newblk);
 }
 
+static void
+zap_leaf_evict_sync(void *dbu)
+{
+	zap_leaf_t *l = dbu;
+
+	rw_destroy(&l->l_rwlock);
+	kmem_free(l, sizeof (zap_leaf_t));
+}
+
 static zap_leaf_t *
 zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
 {
 	void *winner;
-	zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+	zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
@@ -396,17 +420,18 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx
 	rw_enter(&l->l_rwlock, RW_WRITER);
 	l->l_blkid = zap_allocate_blocks(zap, 1);
 	l->l_dbuf = NULL;
-	l->l_phys = NULL;
 
 	VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
-	    l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf));
-	winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
+	    l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
+	    DMU_READ_NO_PREFETCH));
+	dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
+	winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
 	ASSERT(winner == NULL);
 	dmu_buf_will_dirty(l->l_dbuf, tx);
 
 	zap_leaf_init(l, zap->zap_normflags != 0);
 
-	zap->zap_f.zap_phys->zap_num_leafs++;
+	zap_f_phys(zap)->zap_num_leafs++;
 
 	return (l);
 }
@@ -416,7 +441,7 @@ fzap_count(zap_t *zap, uint64_t *count)
 {
 	ASSERT(!zap->zap_ismicro);
 	mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
-	*count = zap->zap_f.zap_phys->zap_num_entries;
+	*count = zap_f_phys(zap)->zap_num_entries;
 	mutex_exit(&zap->zap_f.zap_num_entries_mtx);
 	return (0);
 }
@@ -432,16 +457,6 @@ zap_put_leaf(zap_leaf_t *l)
 	dmu_buf_rele(l->l_dbuf, NULL);
 }
 
-_NOTE(ARGSUSED(0))
-static void
-zap_leaf_pageout(dmu_buf_t *db, void *vl)
-{
-	zap_leaf_t *l = vl;
-
-	rw_destroy(&l->l_rwlock);
-	kmem_free(l, sizeof (zap_leaf_t));
-}
-
 static zap_leaf_t *
 zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
 {
@@ -449,20 +464,20 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t 
 
 	ASSERT(blkid != 0);
 
-	l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+	l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
 	rw_init(&l->l_rwlock, 0, 0, 0);
 	rw_enter(&l->l_rwlock, RW_WRITER);
 	l->l_blkid = blkid;
-	l->l_bs = highbit(db->db_size)-1;
+	l->l_bs = highbit64(db->db_size) - 1;
 	l->l_dbuf = db;
-	l->l_phys = NULL;
 
-	winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
+	dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
+	winner = dmu_buf_set_user(db, &l->l_dbu);
 
 	rw_exit(&l->l_rwlock);
 	if (winner != NULL) {
 		/* someone else set it first */
-		zap_leaf_pageout(NULL, l);
+		zap_leaf_evict_sync(&l->l_dbu);
 		l = winner;
 	}
 
@@ -471,7 +486,7 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t 
 	 * chain.  There should be no chained leafs (as we have removed
 	 * support for them).
 	 */
-	ASSERT3U(l->l_phys->l_hdr.lh_pad1, ==, 0);
+	ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
 
 	/*
 	 * There should be more hash entries than there can be
@@ -481,11 +496,11 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t 
 
 	/* The chunks should begin at the end of the hash table */
 	ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
-	    &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
+	    &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
 
 	/* The chunks should end at the end of the block */
 	ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
-	    (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size);
+	    (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
 
 	return (l);
 }
@@ -501,8 +516,10 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t 
 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
-	err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-	    blkid << bs, NULL, &db);
+	dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+	err = dmu_buf_hold_by_dnode(dn,
+	    blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
+	dmu_buf_dnode_exit(zap->zap_dbuf);
 	if (err)
 		return (err);
 
@@ -518,16 +535,15 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t 
 
 	rw_enter(&l->l_rwlock, lt);
 	/*
-	 * Must lock before dirtying, otherwise l->l_phys could change,
+	 * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
 	 * causing ASSERT below to fail.
 	 */
 	if (lt == RW_WRITER)
 		dmu_buf_will_dirty(db, tx);
 	ASSERT3U(l->l_blkid, ==, blkid);
 	ASSERT3P(l->l_dbuf, ==, db);
-	ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
-	ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF);
-	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
 	*lp = l;
 	return (0);
@@ -538,13 +554,13 @@ zap_idx_to_blk(zap_t *zap, uint64_t idx,
 {
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 		ASSERT3U(idx, <,
-		    (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
+		    (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
 		*valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
 		return (0);
 	} else {
-		return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+		return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
 		    idx, valp));
 	}
 }
@@ -555,11 +571,11 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t 
 	ASSERT(tx != NULL);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
 		ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
 		return (0);
 	} else {
-		return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+		return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
 		    idx, blk, tx));
 	}
 }
@@ -571,52 +587,61 @@ zap_deref_leaf(zap_t *zap, uint64_t h, d
 	int err;
 
 	ASSERT(zap->zap_dbuf == NULL ||
-	    zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
-	ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
-	idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+	    zap_f_phys(zap) == zap->zap_dbuf->db_data);
+
+	/* Reality check for corrupt zap objects (leaf or header). */
+	if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF &&
+	    zap_f_phys(zap)->zap_block_type != ZBT_HEADER) ||
+	    zap_f_phys(zap)->zap_magic != ZAP_MAGIC) {
+		return (SET_ERROR(EIO));
+	}
+
+	idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	err = zap_idx_to_blk(zap, idx, &blk);
 	if (err != 0)
 		return (err);
 	err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
 
-	ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) ==
-	    (*lp)->l_phys->l_hdr.lh_prefix);
+	ASSERT(err ||
+	    ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
+	    zap_leaf_phys(*lp)->l_hdr.lh_prefix);
 	return (err);
 }
 
 static int
-zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
+zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
+    void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
 {
 	zap_t *zap = zn->zn_zap;
 	uint64_t hash = zn->zn_hash;
 	zap_leaf_t *nl;
 	int prefix_diff, i, err;
 	uint64_t sibling;
-	int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+	int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
-	ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+	ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
-	    l->l_phys->l_hdr.lh_prefix);
+	    zap_leaf_phys(l)->l_hdr.lh_prefix);
 
 	if (zap_tryupgradedir(zap, tx) == 0 ||
-	    old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+	    old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 		/* We failed to upgrade, or need to grow the pointer table */
 		objset_t *os = zap->zap_objset;
 		uint64_t object = zap->zap_object;
 
 		zap_put_leaf(l);
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, tag);
 		err = zap_lockdir(os, object, tx, RW_WRITER,
-		    FALSE, FALSE, &zn->zn_zap);
+		    FALSE, FALSE, tag, &zn->zn_zap);
 		zap = zn->zn_zap;
 		if (err)
 			return (err);
 		ASSERT(!zap->zap_ismicro);
 
 		while (old_prefix_len ==
-		    zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+		    zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
 			err = zap_grow_ptrtbl(zap, tx);
 			if (err)
 				return (err);
@@ -626,18 +651,18 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf
 		if (err)
 			return (err);
 
-		if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) {
+		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
 			/* it split while our locks were down */
 			*lp = l;
 			return (0);
 		}
 	}
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+	ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
 	ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
-	    l->l_phys->l_hdr.lh_prefix);
+	    zap_leaf_phys(l)->l_hdr.lh_prefix);
 
-	prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
+	prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
 	    (old_prefix_len + 1);
 	sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
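
The arithmetic above is compact enough to deserve a worked example: after a
split the leaf's prefix grows by one bit, and all 2^prefix_diff
pointer-table slots matching the sibling's prefix must point at the new
leaf. A standalone model with made-up values:

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Standalone model of the sibling computation in
	 * zap_expand_leaf(); names mirror the patch, values are
	 * illustrative only.
	 */
	int
	main(void)
	{
		uint64_t hash = 0xC000000000000000ULL; /* top bits 1100... */
		int old_prefix_len = 2;	/* leaf covered prefix '11' */
		int zt_shift = 5;	/* table indexed by top 5 bits */

		/* Table slots per (old_prefix_len + 1)-bit prefix. */
		int prefix_diff = zt_shift - (old_prefix_len + 1);

		/*
		 * First slot of the sibling: take the hash's top
		 * old_prefix_len + 1 bits, force the low bit to 1 (the
		 * sibling half), then scale up to table resolution.
		 */
		uint64_t sibling =
		    ((hash >> (64 - (old_prefix_len + 1))) | 1)
		    << prefix_diff;

		/* Prints: prefix_diff=2, sibling slots 28..31 */
		printf("prefix_diff=%d, sibling slots %llu..%llu\n",
		    prefix_diff, (unsigned long long)sibling,
		    (unsigned long long)(sibling +
		    (1ULL << prefix_diff) - 1));
		return (0);
	}
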
 
@@ -654,12 +679,12 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf
 	zap_leaf_split(l, nl, zap->zap_normflags != 0);
 
 	/* set sibling pointers */
-	for (i = 0; i < (1ULL<<prefix_diff); i++) {
+	for (i = 0; i < (1ULL << prefix_diff); i++) {
 		err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
-		ASSERT3U(err, ==, 0); /* we checked for i/o errors above */
+		ASSERT0(err); /* we checked for i/o errors above */
 	}
 
-	if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) {
+	if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
 		/* we want the sibling */
 		zap_put_leaf(l);
 		*lp = nl;
@@ -672,16 +697,17 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf
 }
 
 static void
-zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
+zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
+    void *tag, dmu_tx_t *tx)
 {
 	zap_t *zap = zn->zn_zap;
-	int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
-	int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift &&
-	    l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
+	int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+	int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
+	    zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
 
 	zap_put_leaf(l);
 
-	if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) {
+	if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
 		int err;
 
 		/*
@@ -692,16 +718,16 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_
 			objset_t *os = zap->zap_objset;
 			uint64_t zapobj = zap->zap_object;
 
-			zap_unlockdir(zap);
+			zap_unlockdir(zap, tag);
 			err = zap_lockdir(os, zapobj, tx,
-			    RW_WRITER, FALSE, FALSE, &zn->zn_zap);
+			    RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
 			zap = zn->zn_zap;
 			if (err)
 				return;
 		}
 
 		/* could have finished growing while our locks were down */
-		if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift)
+		if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
 			(void) zap_grow_ptrtbl(zap, tx);
 	}
 }
@@ -710,7 +736,7 @@ static int
 fzap_checkname(zap_name_t *zn)
 {
 	if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
-		return (ENAMETOOLONG);
+		return (SET_ERROR(ENAMETOOLONG));
 	return (0);
 }
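
Bare errno returns become SET_ERROR() here and throughout the rest of the
patch. The returned value is unchanged; on illumos and FreeBSD the macro
additionally fires a static probe so the origin of an error is traceable.
A simplified model (the probe name below is made up for illustration):

	/*
	 * SET_ERROR(x) evaluates to x; the side effect is a
	 * tracepoint recording file and line, so "where did this EIO
	 * come from?" is answerable without a debugger.
	 */
	#define	SET_ERROR(err) \
		(__set_error_probe(__FILE__, __LINE__, (err)), (err))
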
 
@@ -725,7 +751,7 @@ fzap_checksize(uint64_t integer_size, ui
 	case 8:
 		break;
 	default:
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	if (integer_size * num_integers > ZAP_MAXVALUELEN)
@@ -784,7 +810,7 @@ fzap_lookup(zap_name_t *zn,
 int
 fzap_add_cd(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
-    const void *val, uint32_t cd, dmu_tx_t *tx)
+    const void *val, uint32_t cd, void *tag, dmu_tx_t *tx)
 {
 	zap_leaf_t *l;
 	int err;
@@ -801,7 +827,7 @@ fzap_add_cd(zap_name_t *zn,
 retry:
 	err = zap_leaf_lookup(l, zn, &zeh);
 	if (err == 0) {
-		err = EEXIST;
+		err = SET_ERROR(EEXIST);
 		goto out;
 	}
 	if (err != ENOENT)
@@ -813,7 +839,7 @@ retry:
 	if (err == 0) {
 		zap_increment_num_entries(zap, 1, tx);
 	} else if (err == EAGAIN) {
-		err = zap_expand_leaf(zn, l, tx, &l);
+		err = zap_expand_leaf(zn, l, tag, tx, &l);
 		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
 		if (err == 0)
 			goto retry;
@@ -821,26 +847,27 @@ retry:
 
 out:
 	if (zap != NULL)
-		zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
+		zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
 	return (err);
 }
 
 int
 fzap_add(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx)
+    const void *val, void *tag, dmu_tx_t *tx)
 {
 	int err = fzap_check(zn, integer_size, num_integers);
 	if (err != 0)
 		return (err);
 
 	return (fzap_add_cd(zn, integer_size, num_integers,
-	    val, ZAP_NEED_CD, tx));
+	    val, ZAP_NEED_CD, tag, tx));
 }
 
 int
 fzap_update(zap_name_t *zn,
-    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+    int integer_size, uint64_t num_integers, const void *val,
+    void *tag, dmu_tx_t *tx)
 {
 	zap_leaf_t *l;
 	int err, create;
@@ -870,14 +897,14 @@ retry:
 	}
 
 	if (err == EAGAIN) {
-		err = zap_expand_leaf(zn, l, tx, &l);
+		err = zap_expand_leaf(zn, l, tag, tx, &l);
 		zap = zn->zn_zap;	/* zap_expand_leaf() may change zap */
 		if (err == 0)
 			goto retry;
 	}
 
 	if (zap != NULL)
-		zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
+		zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
 	return (err);
 }
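
fzap_add_cd() above and fzap_update() here share the same convergence
loop: attempt the entry operation, and on EAGAIN (leaf out of free chunks)
split the leaf and retry. Because zap_expand_leaf() can drop and retake
the directory lock, zn->zn_zap must be reloaded after the call. Condensed
kernel-context sketch of the control flow:

	retry:
		err = zap_entry_create(l, zn, cd, integer_size,
		    num_integers, val, &zeh);
		if (err == EAGAIN) {
			/* Leaf is full: split it, then try again. */
			err = zap_expand_leaf(zn, l, tag, tx, &l);
			zap = zn->zn_zap;	/* may have been reopened */
			if (err == 0)
				goto retry;
		}
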
 
@@ -924,10 +951,39 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx
 	return (err);
 }
 
+void
+fzap_prefetch(zap_name_t *zn)
+{
+	uint64_t idx, blk;
+	zap_t *zap = zn->zn_zap;
+	int bs;
+
+	idx = ZAP_HASH_IDX(zn->zn_hash,
+	    zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+	if (zap_idx_to_blk(zap, idx, &blk) != 0)
+		return;
+	bs = FZAP_BLOCK_SHIFT(zap);
+	dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
+	    ZIO_PRIORITY_SYNC_READ);
+}
+
 /*
  * Helper functions for consumers.
  */
 
+uint64_t
+zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
+    const char *name, dmu_tx_t *tx)
+{
+	uint64_t new_obj;
+
+	VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0);
+	VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
+	    tx));
+
+	return (new_obj);
+}
+
 int
 zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
     char *name)
@@ -960,18 +1016,77 @@ zap_join(objset_t *os, uint64_t fromobj,
 	zap_attribute_t za;
 	int err;
 
+	err = 0;
 	for (zap_cursor_init(&zc, os, fromobj);
 	    zap_cursor_retrieve(&zc, &za) == 0;
 	    (void) zap_cursor_advance(&zc)) {
-		if (za.za_integer_length != 8 || za.za_num_integers != 1)
-			return (EINVAL);
+		if (za.za_integer_length != 8 || za.za_num_integers != 1) {
+			err = SET_ERROR(EINVAL);
+			break;
+		}
 		err = zap_add(os, intoobj, za.za_name,
 		    8, 1, &za.za_first_integer, tx);
 		if (err)
-			return (err);
+			break;
 	}
 	zap_cursor_fini(&zc);
-	return (0);
+	return (err);
+}
+
+int
+zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    uint64_t value, dmu_tx_t *tx)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	int err;
+
+	err = 0;
+	for (zap_cursor_init(&zc, os, fromobj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    (void) zap_cursor_advance(&zc)) {
+		if (za.za_integer_length != 8 || za.za_num_integers != 1) {
+			err = SET_ERROR(EINVAL);
+			break;
+		}
+		err = zap_add(os, intoobj, za.za_name,
+		    8, 1, &value, tx);
+		if (err)
+			break;
+	}
+	zap_cursor_fini(&zc);
+	return (err);
+}
+
+int
+zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    dmu_tx_t *tx)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	int err;
+
+	err = 0;
+	for (zap_cursor_init(&zc, os, fromobj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    (void) zap_cursor_advance(&zc)) {
+		uint64_t delta = 0;
+
+		if (za.za_integer_length != 8 || za.za_num_integers != 1) {
+			err = SET_ERROR(EINVAL);
+			break;
+		}
+
+		err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta);
+		if (err != 0 && err != ENOENT)
+			break;
+		delta += za.za_first_integer;
+		err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx);
+		if (err)
+			break;
+	}
+	zap_cursor_fini(&zc);
+	return (err);
 }
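
The three join variants differ only in what gets written per source entry:
zap_join() copies the value, the new zap_join_key() stores a
caller-supplied constant, and the new zap_join_increment() adds the source
value into the destination. All three now report the first failure instead
of returning 0 after a partial merge, and zap_cursor_fini() runs on every
path. Hedged usage sketch (the object numbers are hypothetical):

	/*
	 * Kernel-context sketch: merge per-snapshot counts into a
	 * running total; 'fromobj' and 'totalobj' are int-valued ZAPs.
	 */
	error = zap_join_increment(os, fromobj, totalobj, tx);
	if (error != 0)
		return (error);	/* a partial merge is reported */
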
 
 int
@@ -1002,17 +1117,44 @@ zap_lookup_int(objset_t *os, uint64_t ob
 }
 
 int
-zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
-    dmu_tx_t *tx)
+zap_add_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx)
 {
 	char name[20];
+
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+	return (zap_add(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_update_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+	char name[20];
+
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+	return (zap_update(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
+{
+	char name[20];
+
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+	return (zap_lookup(os, obj, name, 8, 1, valuep));
+}
+
+int
+zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+    dmu_tx_t *tx)
+{
 	uint64_t value = 0;
 	int err;
 
 	if (delta == 0)
 		return (0);
 
-	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
 	err = zap_lookup(os, obj, name, 8, 1, &value);
 	if (err != 0 && err != ENOENT)
 		return (err);
@@ -1024,6 +1166,15 @@ zap_increment_int(objset_t *os, uint64_t
 	return (err);
 }
 
+int
+zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+    dmu_tx_t *tx)
+{
+	char name[20];
+
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+	return (zap_increment(os, obj, name, delta, tx));
+}
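
zap_increment_int() is now a thin wrapper over the new string-keyed
zap_increment(): the integer key is printed as a hex name, and the 20-byte
buffer comfortably holds any 64-bit value (16 hex digits plus NUL). A
standalone model of the key convention:

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Standalone model of the "%llx" int-key convention used by
	 * the zap_*_int_key() and zap_*_int() helpers.
	 */
	int
	main(void)
	{
		char name[20];
		uint64_t key = 123456789ULL;

		(void) snprintf(name, sizeof (name), "%llx",
		    (unsigned long long)key);
		/* Prints: key 123456789 -> zap name "75bcd15" */
		printf("key %llu -> zap name \"%s\"\n",
		    (unsigned long long)key, name);
		return (0);
	}
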
 
 /*
  * Routines for iterating over the attributes.
@@ -1041,8 +1192,8 @@ fzap_cursor_retrieve(zap_t *zap, zap_cur
 
 	if (zc->zc_leaf &&
 	    (ZAP_HASH_IDX(zc->zc_hash,
-	    zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) !=
-	    zc->zc_leaf->l_phys->l_hdr.lh_prefix)) {
+	    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+	    zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
 		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
 		zap_put_leaf(zc->zc_leaf);
 		zc->zc_leaf = NULL;
@@ -1063,10 +1214,11 @@ again:
 
 	if (err == ENOENT) {
 		uint64_t nocare =
-		    (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1;
+		    (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
 		zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
 		zc->zc_cd = 0;
-		if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) {
+		if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 ||
+		    zc->zc_hash == 0) {
 			zc->zc_hash = -1ULL;
 		} else {
 			zap_put_leaf(zc->zc_leaf);
@@ -1098,7 +1250,6 @@ again:
 	return (err);
 }
 
-
 static void
 zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
 {
@@ -1133,7 +1284,7 @@ fzap_cursor_move_to_key(zap_cursor_t *zc
 	zap_entry_handle_t zeh;
 
 	if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
-		return (ENAMETOOLONG);
+		return (SET_ERROR(ENAMETOOLONG));
 
 	err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l);
 	if (err != 0)
@@ -1159,43 +1310,44 @@ fzap_get_stats(zap_t *zap, zap_stats_t *
 	/*
 	 * Set zap_phys_t fields
 	 */
-	zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
-	zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
-	zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
-	zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type;
-	zs->zs_magic = zap->zap_f.zap_phys->zap_magic;
-	zs->zs_salt = zap->zap_f.zap_phys->zap_salt;
+	zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
+	zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
+	zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
+	zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
+	zs->zs_magic = zap_f_phys(zap)->zap_magic;
+	zs->zs_salt = zap_f_phys(zap)->zap_salt;
 
 	/*
 	 * Set zap_ptrtbl fields
 	 */
-	zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
-	zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk;
+	zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+	zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
 	zs->zs_ptrtbl_blks_copied =
-	    zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied;
-	zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk;
-	zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
-	zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+	    zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
+	zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
+	zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+	zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 
-	if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+	if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 		/* the ptrtbl is entirely in the header block. */
 		zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
 		    1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
 	} else {
 		int b;
 
-		dmu_prefetch(zap->zap_objset, zap->zap_object,
-		    zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs,
-		    zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs);
+		dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+		    zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
+		    zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
+		    ZIO_PRIORITY_SYNC_READ);
 
-		for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+		for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
 		    b++) {
 			dmu_buf_t *db;
 			int err;
 
 			err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
-			    (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
-			    FTAG, &db);
+			    (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
+			    FTAG, &db, DMU_READ_NO_PREFETCH);
 			if (err == 0) {
 				zap_stats_ptrtbl(zap, db->db_data,
 				    1<<(bs-3), zs);
@@ -1206,8 +1358,8 @@ fzap_get_stats(zap_t *zap, zap_stats_t *
 }
 
 int
-fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
-    uint64_t *tooverwrite)
+fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite,
+    refcount_t *tooverwrite)
 {
 	zap_t *zap = zn->zn_zap;
 	zap_leaf_t *l;
@@ -1217,9 +1369,11 @@ fzap_count_write(zap_name_t *zn, int add
 	 * Account for the header block of the fatzap.
 	 */
 	if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
-		*tooverwrite += zap->zap_dbuf->db_size;
+		(void) refcount_add_many(tooverwrite,
+		    zap->zap_dbuf->db_size, FTAG);
 	} else {
-		*towrite += zap->zap_dbuf->db_size;
+		(void) refcount_add_many(towrite,
+		    zap->zap_dbuf->db_size, FTAG);
 	}
 
 	/*
@@ -1231,10 +1385,13 @@ fzap_count_write(zap_name_t *zn, int add
 	 *   could extend the table.
 	 */
 	if (add) {
-		if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0)
-			*towrite += zap->zap_dbuf->db_size;
-		else
-			*towrite += (zap->zap_dbuf->db_size * 3);
+		if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
+			(void) refcount_add_many(towrite,
+			    zap->zap_dbuf->db_size, FTAG);
+		} else {
+			(void) refcount_add_many(towrite,
+			    zap->zap_dbuf->db_size * 3, FTAG);
+		}
 	}
 
 	/*
@@ -1247,13 +1404,14 @@ fzap_count_write(zap_name_t *zn, int add
 	}
 
 	if (!add && dmu_buf_freeable(l->l_dbuf)) {
-		*tooverwrite += l->l_dbuf->db_size;
+		(void) refcount_add_many(tooverwrite, l->l_dbuf->db_size, FTAG);
 	} else {
 		/*
 		 * If this is an add operation, the leaf block could split.
 		 * Hence, we need to account for an additional leaf block.
 		 */
-		*towrite += (add ? 2 : 1) * l->l_dbuf->db_size;
+		(void) refcount_add_many(towrite,
+		    (add ? 2 : 1) * l->l_dbuf->db_size, FTAG);
 	}
 
 	zap_put_leaf(l);
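
fzap_count_write() closes out the zap.c changes: dmu-tx write estimation
no longer sums into bare uint64_t counters but charges each contribution
to a refcount_t with FTAG as the holder, which makes over- and
under-accounting debuggable with reference tracking. Producer-side sketch
(the matching removals are assumed to happen in the dmu_tx code that owns
the tallies):

	/* Charge this ZAP's header block to the write estimate. */
	(void) refcount_add_many(towrite, zap->zap_dbuf->db_size, FTAG);
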
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zap_leaf.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c	27 Feb 2010 22:31:15 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c	22 Nov 2015 17:22:33 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */
 
 /*
@@ -37,6 +37,7 @@
 #include <sys/zap.h>
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
+#include <sys/arc.h>
 
 static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
 
@@ -47,10 +48,12 @@ static uint16_t *zap_leaf_rehash_entry(z
 
 #define	LEAF_HASH(l, h) \
 	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
-	((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
+	((h) >> \
+	(64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
 
-#define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
+#define	LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
 
+extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
 
 static void
 zap_memset(void *a, int c, size_t n)
@@ -104,16 +107,19 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, 
 {
 	int i;
 	zap_leaf_t l;
-	l.l_bs = highbit(size)-1;
-	l.l_phys = buf;
+	dmu_buf_t l_dbuf;
 
-	buf->l_hdr.lh_block_type = 	BSWAP_64(buf->l_hdr.lh_block_type);
-	buf->l_hdr.lh_prefix = 		BSWAP_64(buf->l_hdr.lh_prefix);
-	buf->l_hdr.lh_magic = 		BSWAP_32(buf->l_hdr.lh_magic);
-	buf->l_hdr.lh_nfree = 		BSWAP_16(buf->l_hdr.lh_nfree);
-	buf->l_hdr.lh_nentries = 	BSWAP_16(buf->l_hdr.lh_nentries);
-	buf->l_hdr.lh_prefix_len = 	BSWAP_16(buf->l_hdr.lh_prefix_len);
-	buf->l_hdr.lh_freelist = 	BSWAP_16(buf->l_hdr.lh_freelist);
+	l_dbuf.db_data = buf;
+	l.l_bs = highbit64(size) - 1;
+	l.l_dbuf = &l_dbuf;
+
+	buf->l_hdr.lh_block_type =	BSWAP_64(buf->l_hdr.lh_block_type);
+	buf->l_hdr.lh_prefix =		BSWAP_64(buf->l_hdr.lh_prefix);
+	buf->l_hdr.lh_magic =		BSWAP_32(buf->l_hdr.lh_magic);
+	buf->l_hdr.lh_nfree =		BSWAP_16(buf->l_hdr.lh_nfree);
+	buf->l_hdr.lh_nentries =	BSWAP_16(buf->l_hdr.lh_nentries);
+	buf->l_hdr.lh_prefix_len =	BSWAP_16(buf->l_hdr.lh_prefix_len);
+	buf->l_hdr.lh_freelist =	BSWAP_16(buf->l_hdr.lh_freelist);
 
 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
 		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
@@ -156,19 +162,21 @@ zap_leaf_init(zap_leaf_t *l, boolean_t s
 {
 	int i;
 
-	l->l_bs = highbit(l->l_dbuf->db_size)-1;
-	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
-	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+	l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
+	zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
+	    sizeof (struct zap_leaf_header));
+	zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
 	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
 		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
 		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
 	}
 	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
-	l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
-	l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
-	l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+	zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
+	zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+	zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
 	if (sort)
-		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+		zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
 }
 
 /*
@@ -180,15 +188,16 @@ zap_leaf_chunk_alloc(zap_leaf_t *l)
 {
 	int chunk;
 
-	ASSERT(l->l_phys->l_hdr.lh_nfree > 0);
+	ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
 
-	chunk = l->l_phys->l_hdr.lh_freelist;
+	chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 	ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
 
-	l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
+	zap_leaf_phys(l)->l_hdr.lh_freelist =
+	    ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
 
-	l->l_phys->l_hdr.lh_nfree--;
+	zap_leaf_phys(l)->l_hdr.lh_nfree--;
 
 	return (chunk);
 }
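
Leaf chunks are managed as an intrusive singly linked freelist threaded
through the chunk array itself: lh_freelist holds the head index and each
free chunk's lf_next holds its successor. A standalone model of
zap_leaf_chunk_alloc()/zap_leaf_chunk_free():

	#include <stdint.h>
	#include <stdio.h>

	#define	NUMCHUNKS	8
	#define	CHAIN_END	0xffff	/* assumed sentinel value */

	static uint16_t lf_next[NUMCHUNKS];	/* per-chunk next index */
	static uint16_t lh_freelist;		/* head of the freelist */
	static uint16_t lh_nfree;

	static uint16_t
	chunk_alloc(void)
	{
		uint16_t chunk = lh_freelist;

		lh_freelist = lf_next[chunk];	/* pop the head */
		lh_nfree--;
		return (chunk);
	}

	static void
	chunk_free(uint16_t chunk)
	{
		lf_next[chunk] = lh_freelist;	/* push on the head */
		lh_freelist = chunk;
		lh_nfree++;
	}

	int
	main(void)
	{
		int i;

		for (i = 0; i < NUMCHUNKS; i++)
			lf_next[i] = (i == NUMCHUNKS - 1) ?
			    CHAIN_END : i + 1;
		lh_freelist = 0;
		lh_nfree = NUMCHUNKS;

		uint16_t a = chunk_alloc(), b = chunk_alloc();
		chunk_free(a);
		/* Prints: alloc 0 and 1; after free, head=0 nfree=7 */
		printf("alloc %u and %u; after free, head=%u nfree=%u\n",
		    (unsigned)a, (unsigned)b, (unsigned)lh_freelist,
		    (unsigned)lh_nfree);
		return (0);
	}
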
@@ -197,16 +206,16 @@ static void
 zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
 {
 	struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
-	ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
 	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
 	ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
 
 	zlf->lf_type = ZAP_CHUNK_FREE;
-	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
+	zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
 	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
-	l->l_phys->l_hdr.lh_freelist = chunk;
+	zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
 
-	l->l_phys->l_hdr.lh_nfree++;
+	zap_leaf_phys(l)->l_hdr.lh_nfree++;
 }
 
 /*
@@ -220,7 +229,7 @@ zap_leaf_array_create(zap_leaf_t *l, con
 	uint16_t chunk_head;
 	uint16_t *chunkp = &chunk_head;
 	int byten = 0;
-	uint64_t value;
+	uint64_t value = 0;
 	int shift = (integer_size-1)*8;
 	int len = num_integers;
 
@@ -392,7 +401,7 @@ zap_leaf_lookup(zap_leaf_t *l, zap_name_
 	uint16_t *chunkp;
 	struct zap_leaf_entry *le;
 
-	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
 again:
 	for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
@@ -412,7 +421,7 @@ again:
 		 * lowest-cd match for MT_FIRST.
 		 */
 		ASSERT(zn->zn_matchtype == MT_EXACT ||
-		    (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
+		    (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
 		if (zap_leaf_array_match(l, zn, le->le_name_chunk,
 		    le->le_name_numints)) {
 			zeh->zeh_num_integers = le->le_value_numints;
@@ -434,7 +443,7 @@ again:
 		goto again;
 	}
 
-	return (ENOENT);
+	return (SET_ERROR(ENOENT));
 }
 
 /* Return (h1,cd1 >= h2,cd2) */
@@ -452,10 +461,10 @@ zap_leaf_lookup_closest(zap_leaf_t *l,
 	uint16_t lh;
 	struct zap_leaf_entry *le;
 
-	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+	ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
 
 	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
-		for (chunk = l->l_phys->l_hash[lh];
+		for (chunk = zap_leaf_phys(l)->l_hash[lh];
 		    chunk != CHAIN_END; chunk = le->le_next) {
 			le = ZAP_LEAF_ENTRY(l, chunk);
 
@@ -492,14 +501,14 @@ zap_entry_read(const zap_entry_handle_t 
 	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
 
 	if (le->le_value_intlen > integer_size)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
 	    le->le_value_intlen, le->le_value_numints,
 	    integer_size, num_integers, buf);
 
 	if (zeh->zeh_num_integers > num_integers)
-		return (EOVERFLOW);
+		return (SET_ERROR(EOVERFLOW));
 	return (0);
 
 }
@@ -520,13 +529,13 @@ zap_entry_read_name(zap_t *zap, const za
 		    le->le_name_numints, 1, buflen, buf);
 	}
 	if (le->le_name_numints > buflen)
-		return (EOVERFLOW);
+		return (SET_ERROR(EOVERFLOW));
 	return (0);
 }
 
 int
 zap_entry_update(zap_entry_handle_t *zeh,
-	uint8_t integer_size, uint64_t num_integers, const void *buf)
+    uint8_t integer_size, uint64_t num_integers, const void *buf)
 {
 	int delta_chunks;
 	zap_leaf_t *l = zeh->zeh_leaf;
@@ -535,16 +544,8 @@ zap_entry_update(zap_entry_handle_t *zeh
 	delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
 	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
 
-	if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
-		return (EAGAIN);
-
-	/*
-	 * We should search other chained leaves (via
-	 * zap_entry_remove,create?) otherwise returning EAGAIN will
-	 * just send us into an infinite loop if we have to chain
-	 * another leaf block, rather than being able to split this
-	 * block.
-	 */
+	if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
+		return (SET_ERROR(EAGAIN));
 
 	zap_leaf_array_free(l, &le->le_value_chunk);
 	le->le_value_chunk =
@@ -573,7 +574,7 @@ zap_entry_remove(zap_entry_handle_t *zeh
 	*zeh->zeh_chunkp = le->le_next;
 	zap_leaf_chunk_free(l, entry_chunk);
 
-	l->l_phys->l_hdr.lh_nentries--;
+	zap_leaf_phys(l)->l_hdr.lh_nentries--;
 }
 
 int
@@ -597,7 +598,7 @@ zap_entry_create(zap_leaf_t *l, zap_name
 
 	if (cd == ZAP_NEED_CD) {
 		/* find the lowest unused cd */
-		if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
+		if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
 			cd = 0;
 
 			for (chunk = *LEAF_HASH_ENTPTR(l, h);
@@ -633,8 +634,8 @@ zap_entry_create(zap_leaf_t *l, zap_name
 		ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
 	}
 
-	if (l->l_phys->l_hdr.lh_nfree < numchunks)
-		return (EAGAIN);
+	if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
+		return (SET_ERROR(EAGAIN));
 
 	/* make the entry */
 	chunk = zap_leaf_chunk_alloc(l);
@@ -654,7 +655,7 @@ zap_entry_create(zap_leaf_t *l, zap_name
 	/* XXX if we did the search above, we could just use that */
 	chunkp = zap_leaf_rehash_entry(l, chunk);
 
-	l->l_phys->l_hdr.lh_nentries++;
+	zap_leaf_phys(l)->l_hdr.lh_nentries++;
 
 	zeh->zeh_leaf = l;
 	zeh->zeh_num_integers = num_integers;
@@ -788,8 +789,8 @@ zap_leaf_transfer_entry(zap_leaf_t *l, i
 
 	zap_leaf_chunk_free(l, entry);
 
-	l->l_phys->l_hdr.lh_nentries--;
-	nl->l_phys->l_hdr.lh_nentries++;
+	zap_leaf_phys(l)->l_hdr.lh_nentries--;
+	zap_leaf_phys(nl)->l_hdr.lh_nentries++;
 }
 
 /*
@@ -799,19 +800,22 @@ void
 zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
 {
 	int i;
-	int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
+	int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
 	/* set new prefix and prefix_len */
-	l->l_phys->l_hdr.lh_prefix <<= 1;
-	l->l_phys->l_hdr.lh_prefix_len++;
-	nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
-	nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+	zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
+	zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
+	zap_leaf_phys(nl)->l_hdr.lh_prefix =
+	    zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
+	zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
+	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 
 	/* break existing hash chains */
-	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+	zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
 
 	if (sort)
-		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+		zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
 
 	/*
 	 * Transfer entries whose hash bit 'bit' is set to nl; rehash
@@ -839,25 +843,25 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l
 {
 	int i, n;
 
-	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
-	    l->l_phys->l_hdr.lh_prefix_len;
+	n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+	    zap_leaf_phys(l)->l_hdr.lh_prefix_len;
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_leafs_with_2n_pointers[n]++;
 
 
-	n = l->l_phys->l_hdr.lh_nentries/5;
+	n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_blocks_with_n5_entries[n]++;
 
 	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
-	    l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+	    zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
 	    (1<<FZAP_BLOCK_SHIFT(zap));
 	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
 	zs->zs_blocks_n_tenths_full[n]++;
 
 	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
 		int nentries = 0;
-		int chunk = l->l_phys->l_hash[i];
+		int chunk = zap_leaf_phys(l)->l_hash[i];
 
 		while (chunk != CHAIN_END) {
 			struct zap_leaf_entry *le =
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c,v
retrieving revision 1.3
diff -u -p -r1.3 zap_micro.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c	27 Feb 2010 23:43:53 -0000	1.3
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c	6 May 2017 23:27:27 -0000
@@ -19,8 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zio.h>
@@ -32,19 +34,24 @@
 #include <sys/zap_impl.h>
 #include <sys/zap_leaf.h>
 #include <sys/avl.h>
+#include <sys/arc.h>
+#include <sys/dmu_objset.h>
 
 #ifdef _KERNEL
 #include <sys/sunddi.h>
 #endif
 
-static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
+extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
+
+static int mzap_upgrade(zap_t **zapp,
+    void *tag, dmu_tx_t *tx, zap_flags_t flags);
 
 uint64_t
 zap_getflags(zap_t *zap)
 {
 	if (zap->zap_ismicro)
 		return (0);
-	return (zap->zap_u.zap_fat.zap_phys->zap_flags);
+	return (zap_f_phys(zap)->zap_flags);
 }
 
 int
@@ -255,27 +262,33 @@ mze_compare(const void *arg1, const void
 		return (+1);
 	if (mze1->mze_hash < mze2->mze_hash)
 		return (-1);
-	if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd)
+	if (mze1->mze_cd > mze2->mze_cd)
 		return (+1);
-	if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd)
+	if (mze1->mze_cd < mze2->mze_cd)
 		return (-1);
 	return (0);
 }
 
-static void
-mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
+static int
+mze_insert(zap_t *zap, int chunkid, uint64_t hash)
 {
 	mzap_ent_t *mze;
+	avl_index_t idx;
 
 	ASSERT(zap->zap_ismicro);
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
-	ASSERT(mzep->mze_cd < zap_maxcd(zap));
 
 	mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
 	mze->mze_chunkid = chunkid;
 	mze->mze_hash = hash;
-	mze->mze_phys = *mzep;
-	avl_add(&zap->zap_m.zap_avl, mze);
+	mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
+	ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
+	if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) {
+		kmem_free(mze, sizeof (mzap_ent_t));
+		return (EEXIST);
+	}
+	avl_insert(&zap->zap_m.zap_avl, mze, idx);
+	return (0);
 }
 
 static mzap_ent_t *
@@ -290,14 +303,15 @@ mze_find(zap_name_t *zn)
 	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
 
 	mze_tofind.mze_hash = zn->zn_hash;
-	mze_tofind.mze_phys.mze_cd = 0;
+	mze_tofind.mze_cd = 0;
 
 again:
 	mze = avl_find(avl, &mze_tofind, &idx);
 	if (mze == NULL)
 		mze = avl_nearest(avl, idx, AVL_AFTER);
 	for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
-		if (zap_match(zn, mze->mze_phys.mze_name))
+		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
+		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
 			return (mze);
 	}
 	if (zn->zn_matchtype == MT_BEST) {
@@ -320,12 +334,12 @@ mze_find_unused_cd(zap_t *zap, uint64_t 
 	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
 
 	mze_tofind.mze_hash = hash;
-	mze_tofind.mze_phys.mze_cd = 0;
+	mze_tofind.mze_cd = 0;
 
 	cd = 0;
 	for (mze = avl_find(avl, &mze_tofind, &idx);
 	    mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
-		if (mze->mze_phys.mze_cd != cd)
+		if (mze->mze_cd != cd)
 			break;
 		cd++;
 	}
@@ -360,6 +374,9 @@ mzap_open(objset_t *os, uint64_t obj, dm
 	zap_t *winner;
 	zap_t *zap;
 	int i;
+	uint64_t *zap_hdr = (uint64_t *)db->db_data;
+	uint64_t zap_block_type = zap_hdr[0];
+	uint64_t zap_magic = zap_hdr[1];
 
 	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
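
mzap_open() now validates the block before trusting it: the first two
64-bit words of a ZAP block are the block type and, for fat-ZAP header
blocks, the magic. Anything other than ZBT_MICRO, or ZBT_HEADER plus
ZAP_MAGIC, makes zap_lockdir_impl() fail with EIO instead of interpreting
garbage. Sketch of the classification under that layout assumption:

	/* Classify a ZAP block from its leading words. */
	const uint64_t *zap_hdr = db->db_data;

	if (zap_hdr[0] == ZBT_MICRO) {
		/* microzap */
	} else if (zap_hdr[0] == ZBT_HEADER && zap_hdr[1] == ZAP_MAGIC) {
		/* fat-zap header block */
	} else {
		return (SET_ERROR(EIO));	/* corruption */
	}
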
 
@@ -370,9 +387,13 @@ mzap_open(objset_t *os, uint64_t obj, dm
 	zap->zap_object = obj;
 	zap->zap_dbuf = db;
 
-	if (*(uint64_t *)db->db_data != ZBT_MICRO) {
+	if (zap_block_type != ZBT_MICRO) {
 		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
-		zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
+		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
+		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
+			winner = NULL;	/* No actual winner here... */
+			goto handle_winner;
+		}
 	} else {
 		zap->zap_ismicro = TRUE;
 	}
@@ -382,40 +403,40 @@ mzap_open(objset_t *os, uint64_t obj, dm
 	 * it, because zap_lockdir() checks zap_ismicro without the lock
 	 * held.
 	 */
-	winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);
+	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
+	winner = dmu_buf_set_user(db, &zap->zap_dbu);
 
-	if (winner != NULL) {
-		rw_exit(&zap->zap_rwlock);
-		rw_destroy(&zap->zap_rwlock);
-		if (!zap->zap_ismicro)
-			mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
-		kmem_free(zap, sizeof (zap_t));
-		return (winner);
-	}
+	if (winner != NULL)
+		goto handle_winner;
 
 	if (zap->zap_ismicro) {
-		zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
-		zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags;
+		zap->zap_salt = zap_m_phys(zap)->mz_salt;
+		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
 		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
 		avl_create(&zap->zap_m.zap_avl, mze_compare,
 		    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
 
 		for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
 			mzap_ent_phys_t *mze =
-			    &zap->zap_m.zap_phys->mz_chunk[i];
+			    &zap_m_phys(zap)->mz_chunk[i];
 			if (mze->mze_name[0]) {
 				zap_name_t *zn;
 
-				zap->zap_m.zap_num_entries++;
 				zn = zap_name_alloc(zap, mze->mze_name,
 				    MT_EXACT);
-				mze_insert(zap, i, zn->zn_hash, mze);
+				if (mze_insert(zap, i, zn->zn_hash) == 0)
+					zap->zap_m.zap_num_entries++;
+				else {
+					printf("ZFS WARNING: Duplicated ZAP "
+					    "entry detected (%s).\n",
+					    mze->mze_name);
+				}
 				zap_name_free(zn);
 			}
 		}
 	} else {
-		zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
-		zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags;
+		zap->zap_salt = zap_f_phys(zap)->zap_salt;
+		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
 
 		ASSERT3U(sizeof (struct zap_leaf_header), ==,
 		    2*ZAP_LEAF_CHUNKSIZE);
@@ -425,7 +446,7 @@ mzap_open(objset_t *os, uint64_t obj, dm
 		 * other members.
 		 */
 		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
-		    &zap->zap_f.zap_phys->zap_salt);
+		    &zap_f_phys(zap)->zap_salt);
 
 		/*
 		 * The embedded pointer table should end at the end of
@@ -433,39 +454,53 @@ mzap_open(objset_t *os, uint64_t obj, dm
 		 */
 		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
 		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
-		    (uintptr_t)zap->zap_f.zap_phys, ==,
+		    (uintptr_t)zap_f_phys(zap), ==,
 		    zap->zap_dbuf->db_size);
 	}
 	rw_exit(&zap->zap_rwlock);
 	return (zap);
+
+handle_winner:
+	rw_exit(&zap->zap_rwlock);
+	rw_destroy(&zap->zap_rwlock);
+	if (!zap->zap_ismicro)
+		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+	kmem_free(zap, sizeof (zap_t));
+	return (winner);
 }
 
-int
-zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+static int
+zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
     krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
 {
 	zap_t *zap;
-	dmu_buf_t *db;
 	krw_t lt;
-	int err;
 
-	*zapp = NULL;
+	ASSERT0(db->db_offset);
+	objset_t *os = dmu_buf_get_objset(db);
+	uint64_t obj = db->db_object;
 
-	err = dmu_buf_hold(os, obj, 0, NULL, &db);
-	if (err)
-		return (err);
+	*zapp = NULL;
 
 #ifdef ZFS_DEBUG
 	{
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(db, &doi);
-		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
 	}
 #endif
 
 	zap = dmu_buf_get_user(db);
-	if (zap == NULL)
+	if (zap == NULL) {
 		zap = mzap_open(os, obj, db);
+		if (zap == NULL) {
+			/*
+			 * mzap_open() didn't like what it saw on-disk.
+			 * Check for corruption!
+			 */
+			return (SET_ERROR(EIO));
+		}
+	}
 
 	/*
 	 * We're checking zap_ismicro without the lock held, in order to
@@ -501,10 +536,12 @@ zap_lockdir(objset_t *os, uint64_t obj, 
 			dprintf("upgrading obj %llu: num_entries=%u\n",
 			    obj, zap->zap_m.zap_num_entries);
 			*zapp = zap;
-			return (mzap_upgrade(zapp, tx, 0));
+			int err = mzap_upgrade(zapp, tag, tx, 0);
+			if (err != 0)
+				rw_exit(&zap->zap_rwlock);
+			return (err);
 		}
-		err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
-		ASSERT3U(err, ==, 0);
+		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
 		zap->zap_m.zap_num_chunks =
 		    db->db_size / MZAP_ENT_LEN - 1;
 	}
@@ -513,15 +550,49 @@ zap_lockdir(objset_t *os, uint64_t obj, 
 	return (0);
 }
 
+static int
+zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
+    krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+{
+	dmu_buf_t *db;
+	int err;
+
+	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0) {
+		return (err);
+	}
+	err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
+	if (err != 0) {
+		dmu_buf_rele(db, tag);
+	}
+	return (err);
+}
+
+int
+zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+    krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+{
+	dmu_buf_t *db;
+	int err;
+
+	err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
+	if (err != 0)
+		return (err);
+	err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
+	if (err != 0)
+		dmu_buf_rele(db, tag);
+	return (err);
+}
+
 void
-zap_unlockdir(zap_t *zap)
+zap_unlockdir(zap_t *zap, void *tag)
 {
 	rw_exit(&zap->zap_rwlock);
-	dmu_buf_rele(zap->zap_dbuf, NULL);
+	dmu_buf_rele(zap->zap_dbuf, tag);
 }
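
Every zap_lockdir()/zap_unlockdir() pair is now tagged: the directory hold
is a dbuf hold underneath, and dbuf holds are reference-counted per tag,
so the tag passed to zap_unlockdir() must be the one given at lock time
(FTAG within the zap code itself, NULL for the long-lived cursor holds).
Sketch of the discipline:

	/* Kernel-context sketch: the same tag brackets the hold. */
	zap_t *zap;
	int err = zap_lockdir(os, zapobj, NULL, RW_READER,
	    TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	/* ... operate on zap ... */
	zap_unlockdir(zap, FTAG);
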
 
 static int
-mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
+mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
 {
 	mzap_phys_t *mzp;
 	int i, sz, nchunks;
@@ -531,7 +602,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx,
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	sz = zap->zap_dbuf->db_size;
-	mzp = kmem_alloc(sz, KM_SLEEP);
+	mzp = zio_buf_alloc(sz);
 	bcopy(zap->zap_dbuf->db_data, mzp, sz);
 	nchunks = zap->zap_m.zap_num_chunks;
 
@@ -539,7 +610,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx,
 		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
 		    1ULL << fzap_default_block_shift, 0, tx);
 		if (err) {
-			kmem_free(mzp, sz);
+			zio_buf_free(mzp, sz);
 			return (err);
 		}
 	}
@@ -559,31 +630,32 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx,
 		dprintf("adding %s=%llu\n",
 		    mze->mze_name, mze->mze_value);
 		zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT);
-		err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx);
+		err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
+		    tag, tx);
 		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
 		zap_name_free(zn);
 		if (err)
 			break;
 	}
-	kmem_free(mzp, sz);
+	zio_buf_free(mzp, sz);
 	*zapp = zap;
 	return (err);
 }
 
-static void
+void
 mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
     dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 	mzap_phys_t *zp;
 
-	VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
+	VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
 
 #ifdef ZFS_DEBUG
 	{
 		dmu_object_info_t doi;
 		dmu_object_info_from_db(db, &doi);
-		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
+		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
 	}
 #endif
 
@@ -598,9 +670,9 @@ mzap_create_impl(objset_t *os, uint64_t 
 		zap_t *zap;
 		/* Only fat zap supports flags; upgrade immediately. */
 		VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
-		    B_FALSE, B_FALSE, &zap));
-		VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
-		zap_unlockdir(zap);
+		    B_FALSE, B_FALSE, FTAG, &zap));
+		VERIFY3U(0, ==, mzap_upgrade(&zap, FTAG, tx, flags));
+		zap_unlockdir(zap, FTAG);
 	}
 }
 
@@ -651,9 +723,9 @@ zap_create_flags(objset_t *os, int normf
 	uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
 
 	ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
-	    leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+	    leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
 	    indirect_blockshift >= SPA_MINBLOCKSHIFT &&
-	    indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+	    indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
 
 	VERIFY(dmu_object_set_blocksize(os, obj,
 	    1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
@@ -674,11 +746,10 @@ zap_destroy(objset_t *os, uint64_t zapob
 	return (dmu_object_free(os, zapobj, tx));
 }
 
-_NOTE(ARGSUSED(0))
 void
-zap_evict(dmu_buf_t *db, void *vzap)
+zap_evict_sync(void *dbu)
 {
-	zap_t *zap = vzap;
+	zap_t *zap = dbu;
 
 	rw_destroy(&zap->zap_rwlock);
 
@@ -696,7 +767,7 @@ zap_count(objset_t *os, uint64_t zapobj,
 	zap_t *zap;
 	int err;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	if (!zap->zap_ismicro) {
@@ -704,7 +775,7 @@ zap_count(objset_t *os, uint64_t zapobj,
 	} else {
 		*count = zap->zap_m.zap_num_entries;
 	}
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -728,11 +799,11 @@ again:
 	    other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
 
 		if (zn == NULL) {
-			zn = zap_name_alloc(zap, mze->mze_phys.mze_name,
+			zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
 			    MT_FIRST);
 			allocdzn = B_TRUE;
 		}
-		if (zap_match(zn, other->mze_phys.mze_name)) {
+		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
 			if (allocdzn)
 				zap_name_free(zn);
 			return (B_TRUE);
@@ -761,25 +832,19 @@ zap_lookup(objset_t *os, uint64_t zapobj
 	    num_integers, buf, MT_EXACT, NULL, 0, NULL));
 }
 
-int
-zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
+static int
+zap_lookup_impl(zap_t *zap, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     matchtype_t mt, char *realname, int rn_len,
     boolean_t *ncp)
 {
-	zap_t *zap;
-	int err;
+	int err = 0;
 	mzap_ent_t *mze;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
-	if (err)
-		return (err);
 	zn = zap_name_alloc(zap, name, mt);
-	if (zn == NULL) {
-		zap_unlockdir(zap);
-		return (ENOTSUP);
-	}
+	if (zn == NULL)
+		return (SET_ERROR(ENOTSUP));
 
 	if (!zap->zap_ismicro) {
 		err = fzap_lookup(zn, integer_size, num_integers, buf,
@@ -787,17 +852,18 @@ zap_lookup_norm(objset_t *os, uint64_t z
 	} else {
 		mze = mze_find(zn);
 		if (mze == NULL) {
-			err = ENOENT;
+			err = SET_ERROR(ENOENT);
 		} else {
 			if (num_integers < 1) {
-				err = EOVERFLOW;
+				err = SET_ERROR(EOVERFLOW);
 			} else if (integer_size != 8) {
-				err = EINVAL;
+				err = SET_ERROR(EINVAL);
 			} else {
-				*(uint64_t *)buf = mze->mze_phys.mze_value;
+				*(uint64_t *)buf =
+				    MZE_PHYS(zap, mze)->mze_value;
 				if (realname != NULL)
 					(void) strlcpy(realname,
-					    mze->mze_phys.mze_name, rn_len);
+					    MZE_PHYS(zap, mze)->mze_name, rn_len);
 				if (ncp) {
 					*ncp = mzap_normalization_conflict(zap,
 					    zn, mze);
@@ -806,7 +872,74 @@ zap_lookup_norm(objset_t *os, uint64_t z
 		}
 	}
 	zap_name_free(zn);
-	zap_unlockdir(zap);
+	return (err);
+}
+
+int
+zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_lookup_impl(zap, name, integer_size,
+	    num_integers, buf, mt, realname, rn_len, ncp);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+int
+zap_lookup_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
+	    num_integers, buf, MT_EXACT, NULL, 0, NULL));
+}
+
+int
+zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+	    FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_lookup_impl(zap, name, integer_size,
+	    num_integers, buf, mt, realname, rn_len, ncp);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+int
+zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints)
+{
+	zap_t *zap;
+	int err;
+	zap_name_t *zn;
+
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+	if (err)
+		return (err);
+	zn = zap_name_alloc_uint64(zap, key, key_numints);
+	if (zn == NULL) {
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	fzap_prefetch(zn);
+	zap_name_free(zn);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
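
The new *_by_dnode entry points let callers that already hold a dnode (the
ZPL during pathname lookup, for instance) skip the objset/object
resolution on every directory access, and zap_prefetch_uint64() lets
fat-ZAP consumers warm the target leaf before taking locks. Hedged usage
sketch ('dn' is assumed to be a held directory dnode):

	/* Look up a 64-bit directory entry through the held dnode. */
	uint64_t value;
	error = zap_lookup_by_dnode(dn, name, sizeof (uint64_t), 1,
	    &value);
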
 
@@ -818,27 +951,27 @@ zap_lookup_uint64(objset_t *os, uint64_t
 	int err;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
-		return (ENOTSUP);
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
 	}
 
 	err = fzap_lookup(zn, integer_size, num_integers, buf,
 	    NULL, 0, NULL);
 	zap_name_free(zn);
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
 int
 zap_contains(objset_t *os, uint64_t zapobj, const char *name)
 {
-	int err = (zap_lookup_norm(os, zapobj, name, 0,
-	    0, NULL, MT_EXACT, NULL, 0, NULL));
+	int err = zap_lookup_norm(os, zapobj, name, 0,
+	    0, NULL, MT_EXACT, NULL, 0, NULL);
 	if (err == EOVERFLOW || err == EINVAL)
 		err = 0; /* found, but skipped reading the value */
 	return (err);
@@ -853,20 +986,20 @@ zap_length(objset_t *os, uint64_t zapobj
 	mzap_ent_t *mze;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc(zap, name, MT_EXACT);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
-		return (ENOTSUP);
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
 		err = fzap_length(zn, integer_size, num_integers);
 	} else {
 		mze = mze_find(zn);
 		if (mze == NULL) {
-			err = ENOENT;
+			err = SET_ERROR(ENOENT);
 		} else {
 			if (integer_size)
 				*integer_size = 8;
@@ -875,7 +1008,7 @@ zap_length(objset_t *os, uint64_t zapobj
 		}
 	}
 	zap_name_free(zn);
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -887,17 +1020,17 @@ zap_length_uint64(objset_t *os, uint64_t
 	int err;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
-		return (ENOTSUP);
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_length(zn, integer_size, num_integers);
 	zap_name_free(zn);
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -913,7 +1046,7 @@ mzap_addent(zap_name_t *zn, uint64_t val
 
 #ifdef ZFS_DEBUG
 	for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
-		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
 		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
 	}
 #endif
@@ -924,7 +1057,7 @@ mzap_addent(zap_name_t *zn, uint64_t val
 
 again:
 	for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
-		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
 		if (mze->mze_name[0] == 0) {
 			mze->mze_value = value;
 			mze->mze_cd = cd;
@@ -934,7 +1067,7 @@ again:
 			if (zap->zap_m.zap_alloc_next ==
 			    zap->zap_m.zap_num_chunks)
 				zap->zap_m.zap_alloc_next = 0;
-			mze_insert(zap, i, zn->zn_hash, mze);
+			VERIFY(0 == mze_insert(zap, i, zn->zn_hash));
 			return;
 		}
 	}
@@ -956,27 +1089,29 @@ zap_add(objset_t *os, uint64_t zapobj, c
 	const uint64_t *intval = val;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc(zap, key, MT_EXACT);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
-		return (ENOTSUP);
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
-		err = fzap_add(zn, integer_size, num_integers, val, tx);
+		err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
 		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(key) >= MZAP_NAME_LEN) {
-		err = mzap_upgrade(&zn->zn_zap, tx, 0);
-		if (err == 0)
-			err = fzap_add(zn, integer_size, num_integers, val, tx);
+		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
+		if (err == 0) {
+			err = fzap_add(zn, integer_size, num_integers, val,
+			    FTAG, tx);
+		}
 		zap = zn->zn_zap;	/* fzap_add() may change zap */
 	} else {
 		mze = mze_find(zn);
 		if (mze != NULL) {
-			err = EEXIST;
+			err = SET_ERROR(EEXIST);
 		} else {
 			mzap_addent(zn, *intval);
 		}
@@ -984,7 +1119,7 @@ zap_add(objset_t *os, uint64_t zapobj, c
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 	return (err);
 }
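
zap_add() and zap_update() encode the microzap limits directly: a microzap
entry holds exactly one 64-bit integer under a name shorter than
MZAP_NAME_LEN, so anything else forces mzap_upgrade() to a fat ZAP before
the operation proceeds. A standalone model of the predicate:

	#include <stdbool.h>
	#include <stdint.h>
	#include <string.h>

	#define	MZAP_NAME_LEN	50	/* assumed value for illustration */

	/*
	 * Standalone model of the "must this microzap upgrade?" test
	 * in zap_add()/zap_update().
	 */
	static bool
	needs_fatzap(uint64_t integer_size, uint64_t num_integers,
	    const char *key)
	{
		return (integer_size != 8 || num_integers != 1 ||
		    strlen(key) >= MZAP_NAME_LEN);
	}
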
 
@@ -997,19 +1132,19 @@ zap_add_uint64(objset_t *os, uint64_t za
 	int err;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
-		return (ENOTSUP);
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
 	}
-	err = fzap_add(zn, integer_size, num_integers, val, tx);
+	err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
 	zap = zn->zn_zap;	/* fzap_add() may change zap */
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -1019,36 +1154,47 @@ zap_update(objset_t *os, uint64_t zapobj
 {
 	zap_t *zap;
 	mzap_ent_t *mze;
+	uint64_t oldval;
 	const uint64_t *intval = val;
 	zap_name_t *zn;
 	int err;
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+#ifdef ZFS_DEBUG
+	/*
+	 * If there is an old value, it shouldn't change across the
+	 * lockdir (e.g., due to bprewrite's translation).
+	 */
+	if (integer_size == 8 && num_integers == 1)
+		(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
+#endif
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc(zap, name, MT_EXACT);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
-		return (ENOTSUP);
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
-		err = fzap_update(zn, integer_size, num_integers, val, tx);
+		err = fzap_update(zn, integer_size, num_integers, val,
+		    FTAG, tx);
 		zap = zn->zn_zap;	/* fzap_update() may change zap */
 	} else if (integer_size != 8 || num_integers != 1 ||
 	    strlen(name) >= MZAP_NAME_LEN) {
 		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
 		    zapobj, integer_size, num_integers, name);
-		err = mzap_upgrade(&zn->zn_zap, tx, 0);
-		if (err == 0)
+		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
+		if (err == 0) {
 			err = fzap_update(zn, integer_size, num_integers,
-			    val, tx);
+			    val, FTAG, tx);
+		}
 		zap = zn->zn_zap;	/* fzap_update() may change zap */
 	} else {
 		mze = mze_find(zn);
 		if (mze != NULL) {
-			mze->mze_phys.mze_value = *intval;
-			zap->zap_m.zap_phys->mz_chunk
-			    [mze->mze_chunkid].mze_value = *intval;
+			ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
+			MZE_PHYS(zap, mze)->mze_value = *intval;
 		} else {
 			mzap_addent(zn, *intval);
 		}
@@ -1056,7 +1202,7 @@ zap_update(objset_t *os, uint64_t zapobj
 	ASSERT(zap == zn->zn_zap);
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -1069,19 +1215,19 @@ zap_update_uint64(objset_t *os, uint64_t
 	zap_name_t *zn;
 	int err;
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
-		return (ENOTSUP);
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
 	}
-	err = fzap_update(zn, integer_size, num_integers, val, tx);
+	err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
 	zap = zn->zn_zap;	/* fzap_update() may change zap */
 	zap_name_free(zn);
 	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
-		zap_unlockdir(zap);
+		zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -1100,29 +1246,29 @@ zap_remove_norm(objset_t *os, uint64_t z
 	mzap_ent_t *mze;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc(zap, name, mt);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
-		return (ENOTSUP);
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
 	}
 	if (!zap->zap_ismicro) {
 		err = fzap_remove(zn, tx);
 	} else {
 		mze = mze_find(zn);
 		if (mze == NULL) {
-			err = ENOENT;
+			err = SET_ERROR(ENOENT);
 		} else {
 			zap->zap_m.zap_num_entries--;
-			bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+			bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
 			    sizeof (mzap_ent_phys_t));
 			mze_remove(zap, mze);
 		}
 	}
 	zap_name_free(zn);
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -1134,17 +1280,17 @@ zap_remove_uint64(objset_t *os, uint64_t
 	int err;
 	zap_name_t *zn;
 
-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 	zn = zap_name_alloc_uint64(zap, key, key_numints);
 	if (zn == NULL) {
-		zap_unlockdir(zap);
-		return (ENOTSUP);
+		zap_unlockdir(zap, FTAG);
+		return (SET_ERROR(ENOTSUP));
 	}
 	err = fzap_remove(zn, tx);
 	zap_name_free(zn);
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
 
@@ -1176,7 +1322,7 @@ zap_cursor_fini(zap_cursor_t *zc)
 {
 	if (zc->zc_zap) {
 		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
-		zap_unlockdir(zc->zc_zap);
+		zap_unlockdir(zc->zc_zap, NULL);
 		zc->zc_zap = NULL;
 	}
 	if (zc->zc_leaf) {
@@ -1218,12 +1364,12 @@ zap_cursor_retrieve(zap_cursor_t *zc, za
 	mzap_ent_t *mze;
 
 	if (zc->zc_hash == -1ULL)
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
 
 	if (zc->zc_zap == NULL) {
 		int hb;
 		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
-		    RW_READER, TRUE, FALSE, &zc->zc_zap);
+		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
 		if (err)
 			return (err);
 
@@ -1244,10 +1390,8 @@ zap_cursor_retrieve(zap_cursor_t *zc, za
 	if (!zc->zc_zap->zap_ismicro) {
 		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
 	} else {
-		err = ENOENT;
-
 		mze_tofind.mze_hash = zc->zc_hash;
-		mze_tofind.mze_phys.mze_cd = zc->zc_cd;
+		mze_tofind.mze_cd = zc->zc_cd;
 
 		mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
 		if (mze == NULL) {
@@ -1255,21 +1399,20 @@ zap_cursor_retrieve(zap_cursor_t *zc, za
 			    idx, AVL_AFTER);
 		}
 		if (mze) {
-			ASSERT(0 == bcmp(&mze->mze_phys,
-			    &zc->zc_zap->zap_m.zap_phys->mz_chunk
-			    [mze->mze_chunkid], sizeof (mze->mze_phys)));
-
+			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
+			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
 			za->za_normalization_conflict =
 			    mzap_normalization_conflict(zc->zc_zap, NULL, mze);
 			za->za_integer_length = 8;
 			za->za_num_integers = 1;
-			za->za_first_integer = mze->mze_phys.mze_value;
-			(void) strcpy(za->za_name, mze->mze_phys.mze_name);
+			za->za_first_integer = mzep->mze_value;
+			(void) strcpy(za->za_name, mzep->mze_name);
 			zc->zc_hash = mze->mze_hash;
-			zc->zc_cd = mze->mze_phys.mze_cd;
+			zc->zc_cd = mze->mze_cd;
 			err = 0;
 		} else {
 			zc->zc_hash = -1ULL;
+			err = SET_ERROR(ENOENT);
 		}
 	}
 	rw_exit(&zc->zc_zap->zap_rwlock);
@@ -1293,7 +1436,7 @@ zap_cursor_move_to_key(zap_cursor_t *zc,
 
 	if (zc->zc_zap == NULL) {
 		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
-		    RW_READER, TRUE, FALSE, &zc->zc_zap);
+		    RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap);
 		if (err)
 			return (err);
 	} else {
@@ -1303,7 +1446,7 @@ zap_cursor_move_to_key(zap_cursor_t *zc,
 	zn = zap_name_alloc(zc->zc_zap, name, mt);
 	if (zn == NULL) {
 		rw_exit(&zc->zc_zap->zap_rwlock);
-		return (ENOTSUP);
+		return (SET_ERROR(ENOTSUP));
 	}
 
 	if (!zc->zc_zap->zap_ismicro) {
@@ -1311,11 +1454,11 @@ zap_cursor_move_to_key(zap_cursor_t *zc,
 	} else {
 		mze = mze_find(zn);
 		if (mze == NULL) {
-			err = ENOENT;
+			err = SET_ERROR(ENOENT);
 			goto out;
 		}
 		zc->zc_hash = mze->mze_hash;
-		zc->zc_cd = mze->mze_phys.mze_cd;
+		zc->zc_cd = mze->mze_cd;
 	}
 
 out:
@@ -1330,7 +1473,7 @@ zap_get_stats(objset_t *os, uint64_t zap
 	int err;
 	zap_t *zap;
 
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
 	if (err)
 		return (err);
 
@@ -1343,42 +1486,43 @@ zap_get_stats(objset_t *os, uint64_t zap
 	} else {
 		fzap_get_stats(zap, zs);
 	}
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (0);
 }
 
 int
-zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
-    uint64_t *towrite, uint64_t *tooverwrite)
+zap_count_write_by_dnode(dnode_t *dn, const char *name, int add,
+    refcount_t *towrite, refcount_t *tooverwrite)
 {
 	zap_t *zap;
 	int err = 0;
 
-
 	/*
 	 * Since we don't have a name, we cannot figure out which blocks will
 	 * be affected in this operation. So, account for the worst case:
 	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
 	 * - 4 new blocks written if adding:
-	 * 	- 2 blocks for possibly split leaves,
-	 * 	- 2 grown ptrtbl blocks
+	 *    - 2 blocks for possibly split leaves,
+	 *    - 2 grown ptrtbl blocks
 	 *
-	 * This also accomodates the case where an add operation to a fairly
+	 * This also accommodates the case where an add operation to a fairly
 	 * large microzap results in a promotion to fatzap.
 	 */
 	if (name == NULL) {
-		*towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+		(void) refcount_add_many(towrite,
+		    (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG);
 		return (err);
 	}
 
 	/*
-	 * We lock the zap with adding ==  FALSE. Because, if we pass
+	 * We lock the zap with adding == FALSE, because if we pass
 	 * the actual value of add, it could trigger a mzap_upgrade().
 	 * At present we are just evaluating the possibility of this operation
-	 * and hence we donot want to trigger an upgrade.
+	 * and hence we do not want to trigger an upgrade.
 	 */
-	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
-	if (err)
+	err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+	    FTAG, &zap);
+	if (err != 0)
 		return (err);
 
 	if (!zap->zap_ismicro) {
@@ -1391,7 +1535,8 @@ zap_count_write(objset_t *os, uint64_t z
 			/*
 			 * We treat this case as similar to (name == NULL)
 			 */
-			*towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+			(void) refcount_add_many(towrite,
+			    (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG);
 		}
 	} else {
 		/*
@@ -1409,16 +1554,20 @@ zap_count_write(objset_t *os, uint64_t z
 		 * 4 new blocks written : 2 new split leaf, 2 grown
 		 *			ptrtbl blocks
 		 */
-		if (dmu_buf_freeable(zap->zap_dbuf))
-			*tooverwrite += SPA_MAXBLOCKSIZE;
-		else
-			*towrite += SPA_MAXBLOCKSIZE;
+		if (dmu_buf_freeable(zap->zap_dbuf)) {
+			(void) refcount_add_many(tooverwrite,
+			    MZAP_MAX_BLKSZ, FTAG);
+		} else {
+			(void) refcount_add_many(towrite,
+			    MZAP_MAX_BLKSZ, FTAG);
+		}
 
 		if (add) {
-			*towrite += 4 * SPA_MAXBLOCKSIZE;
+			(void) refcount_add_many(towrite,
+			    4 * MZAP_MAX_BLKSZ, FTAG);
 		}
 	}
 
-	zap_unlockdir(zap);
+	zap_unlockdir(zap, FTAG);
 	return (err);
 }
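
For intuition about the charge taken above when no name is supplied: with
SPA_OLD_MAXBLOCKSIZE at 128 KiB, the worst case comes to 3 blocks (384 KiB)
for an overwrite-only operation and 7 blocks (896 KiB) when adding.  A
minimal sketch of that arithmetic (the helper name is hypothetical, not
part of the patch):

	/* Hypothetical helper mirroring the worst-case accounting above. */
	uint64_t
	zap_worst_case_write_bytes(boolean_t add)
	{
		/* 3 overwritten blocks, plus 4 newly written if adding */
		return ((3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE);
	}
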
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfeature.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfeature.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/zfeature.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfeature.c	22 Nov 2015 17:22:31 -0000
@@ -0,0 +1,509 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfeature.h>
+#include <sys/dmu.h>
+#include <sys/nvpair.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include "zfeature_common.h"
+#include <sys/spa_impl.h>
+
+/*
+ * ZFS Feature Flags
+ * -----------------
+ *
+ * ZFS feature flags are used to provide fine-grained versioning to the ZFS
+ * on-disk format. Once enabled on a pool, feature flags replace the old
+ * spa_version() number.
+ *
+ * Each new on-disk format change will be given a uniquely identifying string
+ * guid rather than a version number. This avoids the problem of different
+ * organizations creating new on-disk formats with the same version number. To
+ * keep feature guids unique, they should consist of the reverse DNS name of the
+ * organization which implemented the feature and a short name for the feature,
+ * separated by a colon (e.g. com.delphix:async_destroy).
+ *
+ * Reference Counts
+ * ----------------
+ *
+ * Within each pool features can be in one of three states: disabled, enabled,
+ * or active. These states are differentiated by a reference count stored on
+ * disk for each feature:
+ *
+ *   1) If there is no reference count stored on disk the feature is disabled.
+ *   2) If the reference count is 0 a system administrator has enabled the
+ *      feature, but the feature has not been used yet, so no on-disk
+ *      format changes have been made.
+ *   3) If the reference count is greater than 0 the feature is active.
+ *      The format changes required by the feature are currently on disk.
+ *      Note that if the feature's format changes are reversed the feature
+ *      may choose to set its reference count back to 0.
+ *
+ * Feature flags make no differentiation between non-zero reference counts
+ * for an active feature (e.g. a reference count of 1 means the same thing as a
+ * reference count of 27834721), but feature implementations may choose to use
+ * the reference count to store meaningful information. For example, a new RAID
+ * implementation might set the reference count to the number of vdevs using
+ * it. If all those disks are removed from the pool the feature goes back to
+ * having a reference count of 0.
+ *
+ * It is the responsibility of the individual features to maintain a non-zero
+ * reference count as long as the feature's format changes are present on disk.
+ *
+ * Dependencies
+ * ------------
+ *
+ * Each feature may depend on other features. The only effect of this
+ * relationship is that when a feature is enabled, all of its dependencies are
+ * automatically enabled as well. Any future work to support disabling of
+ * features would need to ensure that features cannot be disabled if other
+ * enabled features depend on them.
+ *
+ * On-disk Format
+ * --------------
+ *
+ * When feature flags are enabled, spa_version() is set to
+ * SPA_VERSION_FEATURES (5000). In order for this to work, the pool is
+ * automatically upgraded to SPA_VERSION_BEFORE_FEATURES (28) first, so all
+ * pre-feature-flags on-disk format changes will be in use.
+ *
+ * Information about features is stored in 3 ZAP objects in the pool's MOS.
+ * These objects are linked to by the following names in the pool directory
+ * object:
+ *
+ * 1) features_for_read: feature guid -> reference count
+ *    Features needed to open the pool for reading.
+ * 2) features_for_write: feature guid -> reference count
+ *    Features needed to open the pool for writing.
+ * 3) feature_descriptions: feature guid -> descriptive string
+ *    A human readable string.
+ *
+ * All enabled features appear in either features_for_read or
+ * features_for_write, but not both.
+ *
+ * To open a pool in read-only mode only the features listed in
+ * features_for_read need to be supported.
+ *
+ * To open the pool in read-write mode features in both features_for_read and
+ * features_for_write need to be supported.
+ *
+ * Some features may be required to read the ZAP objects containing feature
+ * information. To allow software to check for compatibility with these features
+ * before the pool is opened, their names must be stored in the label in a
+ * new "features_for_read" entry (note that features that are only required
+ * to write to a pool never need to be stored in the label since the
+ * features_for_write ZAP object can be read before the pool is written to).
+ * To save space in the label, features must be explicitly marked as needing
+ * to be written to the label. Also, reference counts are not stored in the
+ * label; instead, any feature whose reference count drops to 0 is removed
+ * from the label.
+ *
+ * Adding New Features
+ * -------------------
+ *
+ * Features must be registered in the zpool_feature_init() function in
+ * zfeature_common.c using the zfeature_register() function. This function
+ * has arguments to specify if the feature should be stored in the
+ * features_for_read or features_for_write ZAP object and if it needs to be
+ * written to the label when active.
+ *
+ * Once a feature is registered, it will appear as a "feature@<feature name>"
+ * property, which can be set by an administrator. Feature implementors should
+ * use the spa_feature_is_enabled() and spa_feature_is_active() functions to
+ * query the state of a feature and the spa_feature_incr() and
+ * spa_feature_decr() functions to change an enabled feature's reference count.
+ * Reference counts may only be updated in the syncing context.
+ *
+ * Features may not perform enable-time initialization. Instead, any such
+ * initialization should occur when the feature is first used. This design
+ * enforces that on-disk changes be made only when features are used. Code
+ * should only check if a feature is enabled using spa_feature_is_enabled(),
+ * not by relying on any feature-specific metadata existing. If a feature is
+ * enabled, but the feature's metadata is not on disk yet, then it should be
+ * created as needed.
+ *
+ * As an example, consider the com.delphix:async_destroy feature. This feature
+ * relies on the existence of a bptree in the MOS that stores blocks for
+ * asynchronous freeing. This bptree is not created when async_destroy is
+ * enabled. Instead, when a dataset is destroyed, spa_feature_is_enabled() is
+ * called to check if async_destroy is enabled. If it is and the bptree object
+ * does not exist yet, the bptree object is created as part of the dataset
+ * destroy and async_destroy's reference count is incremented to indicate it
+ * has made an on-disk format change. Later, after the destroyed dataset's
+ * blocks have all been asynchronously freed, there is no longer any use for
+ * the bptree object, so it is destroyed and async_destroy's reference count is
+ * decremented back to 0 to indicate that it has undone its on-disk format
+ * changes.
+ */
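
A compact sketch of the three states described above (the enum and helper
are illustrative, not part of the patch):

	typedef enum {
		FSTATE_DISABLED,	/* no refcount stored on disk */
		FSTATE_ENABLED,		/* refcount stored, currently 0 */
		FSTATE_ACTIVE		/* refcount > 0: changes on disk */
	} fstate_t;			/* hypothetical type */

	static fstate_t
	fstate(boolean_t refcount_on_disk, uint64_t refcount)
	{
		if (!refcount_on_disk)
			return (FSTATE_DISABLED);
		return (refcount == 0 ? FSTATE_ENABLED : FSTATE_ACTIVE);
	}
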
+
+typedef enum {
+	FEATURE_ACTION_INCR,
+	FEATURE_ACTION_DECR,
+} feature_action_t;
+
+/*
+ * Checks that the active features in the pool are supported by
+ * this software.  Adds each unsupported feature (name -> description) to
+ * the supplied nvlist.
+ */
+boolean_t
+spa_features_check(spa_t *spa, boolean_t for_write,
+    nvlist_t *unsup_feat, nvlist_t *enabled_feat)
+{
+	objset_t *os = spa->spa_meta_objset;
+	boolean_t supported;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	uint64_t obj = for_write ?
+	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+	supported = B_TRUE;
+	for (zap_cursor_init(&zc, os, obj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+		    za.za_num_integers == 1);
+
+		if (NULL != enabled_feat) {
+			fnvlist_add_uint64(enabled_feat, za.za_name,
+			    za.za_first_integer);
+		}
+
+		if (za.za_first_integer != 0 &&
+		    !zfeature_is_supported(za.za_name)) {
+			supported = B_FALSE;
+
+			if (NULL != unsup_feat) {
+				char *desc = "";
+				char buf[MAXPATHLEN];
+
+				if (zap_lookup(os, spa->spa_feat_desc_obj,
+				    za.za_name, 1, sizeof (buf), buf) == 0)
+					desc = buf;
+
+				VERIFY(nvlist_add_string(unsup_feat, za.za_name,
+				    desc) == 0);
+			}
+		}
+	}
+	zap_cursor_fini(&zc);
+
+	return (supported);
+}
+
+/*
+ * Use an in-memory cache of feature refcounts for quick retrieval.
+ *
+ * Note: well-designed features will not need to use this; they should
+ * use spa_feature_is_enabled() and spa_feature_is_active() instead.
+ * However, this is non-static for zdb and zhack.
+ */
+int
+feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
+{
+	ASSERT(VALID_FEATURE_FID(feature->fi_feature));
+	if (spa->spa_feat_refcount_cache[feature->fi_feature] ==
+	    SPA_FEATURE_DISABLED) {
+		return (SET_ERROR(ENOTSUP));
+	}
+	*res = spa->spa_feat_refcount_cache[feature->fi_feature];
+	return (0);
+}
+
+/*
+ * Note: well-designed features will not need to use this; they should
+ * use spa_feature_is_enabled() and spa_feature_is_active() instead.
+ * However, this is non-static for zdb and zhack.
+ */
+int
+feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
+    uint64_t *res)
+{
+	int err;
+	uint64_t refcount;
+	uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+	/*
+	 * If the pool is currently being created, the feature objects may not
+	 * have been allocated yet.  Act as though all features are disabled.
+	 */
+	if (zapobj == 0)
+		return (SET_ERROR(ENOTSUP));
+
+	err = zap_lookup(spa->spa_meta_objset, zapobj,
+	    feature->fi_guid, sizeof (uint64_t), 1, &refcount);
+	if (err != 0) {
+		if (err == ENOENT)
+			return (SET_ERROR(ENOTSUP));
+		else
+			return (err);
+	}
+	*res = refcount;
+	return (0);
+}
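
A hedged sketch of how a consumer such as zdb might combine the two lookup
paths above (the caller-side helper is hypothetical):

	static uint64_t
	feature_refcount_or_zero(spa_t *spa, zfeature_info_t *fi)
	{
		uint64_t rc;

		/* Prefer the in-memory cache ... */
		if (feature_get_refcount(spa, fi, &rc) == 0)
			return (rc);
		/* ... then fall back to the on-disk ZAP. */
		if (feature_get_refcount_from_disk(spa, fi, &rc) == 0)
			return (rc);
		return (0);	/* ENOTSUP from both: treat as disabled */
	}
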
+
+
+static int
+feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
+{
+	uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj;
+
+	ASSERT(zfeature_depends_on(feature->fi_feature,
+	    SPA_FEATURE_ENABLED_TXG));
+
+	if (!spa_feature_is_enabled(spa, feature->fi_feature)) {
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	ASSERT(enabled_txg_obj != 0);
+
+	VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj,
+	    feature->fi_guid, sizeof (uint64_t), 1, res));
+
+	return (0);
+}
+
+/*
+ * This function is non-static for zhack; it should otherwise not be used
+ * outside this file.
+ */
+void
+feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
+    dmu_tx_t *tx)
+{
+	ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature));
+	uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+	VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid,
+	    sizeof (uint64_t), 1, &refcount, tx));
+
+	/*
+	 * feature_sync is called directly from zhack, allowing the
+	 * creation of arbitrary features whose fi_feature field may
+	 * be greater than SPA_FEATURES. When called from zhack, the
+	 * zfeature_info_t object's fi_feature field will be set to
+	 * SPA_FEATURE_NONE.
+	 */
+	if (feature->fi_feature != SPA_FEATURE_NONE) {
+		uint64_t *refcount_cache =
+		    &spa->spa_feat_refcount_cache[feature->fi_feature];
+#ifdef atomic_swap_64
+		VERIFY3U(*refcount_cache, ==,
+		    atomic_swap_64(refcount_cache, refcount));
+#else
+		*refcount_cache = refcount;
+#endif
+	}
+
+	if (refcount == 0)
+		spa_deactivate_mos_feature(spa, feature->fi_guid);
+	else if (feature->fi_flags & ZFEATURE_FLAG_MOS)
+		spa_activate_mos_feature(spa, feature->fi_guid, tx);
+}
+
+/*
+ * This function is non-static for zhack; it should otherwise not be used
+ * outside this file.
+ */
+void
+feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+{
+	uint64_t initial_refcount =
+	    (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0;
+	uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+	ASSERT(0 != zapobj);
+	ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+
+	/*
+	 * If the feature is already enabled, ignore the request.
+	 */
+	if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0)
+		return;
+
+	for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++)
+		spa_feature_enable(spa, feature->fi_depends[i], tx);
+
+	VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj,
+	    feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
+	    feature->fi_desc, tx));
+
+	feature_sync(spa, feature, initial_refcount, tx);
+
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) {
+		uint64_t enabling_txg = dmu_tx_get_txg(tx);
+
+		if (spa->spa_feat_enabled_txg_obj == 0ULL) {
+			spa->spa_feat_enabled_txg_obj =
+			    zap_create_link(spa->spa_meta_objset,
+			    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+			    DMU_POOL_FEATURE_ENABLED_TXG, tx);
+		}
+		spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx);
+
+		VERIFY0(zap_add(spa->spa_meta_objset,
+		    spa->spa_feat_enabled_txg_obj, feature->fi_guid,
+		    sizeof (uint64_t), 1, &enabling_txg, tx));
+	}
+}
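
A sketch of the MOS state this leaves behind, for a hypothetical feature fid
that depends on SPA_FEATURE_ENABLED_TXG and lacks
ZFEATURE_FLAG_ACTIVATE_ON_ENABLE:

	spa_feature_enable(spa, fid, tx);	/* syncing context */
	/*
	 * Afterwards, per the code above:
	 *   features_for_read (or _for_write): fi_guid -> 0
	 *   feature_descriptions:              fi_guid -> fi_desc
	 *   feature_enabled_txg:               fi_guid -> dmu_tx_get_txg(tx)
	 * and SPA_FEATURE_ENABLED_TXG's own reference count is bumped.
	 */
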
+
+static void
+feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action,
+    dmu_tx_t *tx)
+{
+	uint64_t refcount;
+	zfeature_info_t *feature = &spa_feature_table[fid];
+	uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+	    spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+	ASSERT(VALID_FEATURE_FID(fid));
+	ASSERT(0 != zapobj);
+	ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+
+	VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP);
+
+	switch (action) {
+	case FEATURE_ACTION_INCR:
+		VERIFY3U(refcount, !=, UINT64_MAX);
+		refcount++;
+		break;
+	case FEATURE_ACTION_DECR:
+		VERIFY3U(refcount, !=, 0);
+		refcount--;
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	feature_sync(spa, feature, refcount, tx);
+}
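
The usual lifecycle built on feature_do_action(), sketched for an
illustrative feature making its first on-disk change in syncing context:

	/* Sketch only; fid and the surrounding logic are illustrative. */
	if (!spa_feature_is_active(spa, fid))
		spa_feature_incr(spa, fid, tx);	/* first on-disk use */
	/* ... create or modify the feature's on-disk structures ... */

	/* Later, once the last such structure has been destroyed: */
	spa_feature_decr(spa, fid, tx);		/* may drop refcount to 0 */
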
+
+void
+spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx)
+{
+	/*
+	 * We create feature flags ZAP objects in two instances: during pool
+	 * creation and during pool upgrade.
+	 */
+	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on &&
+	    tx->tx_txg == TXG_INITIAL));
+
+	spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset,
+	    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_FEATURES_FOR_READ, tx);
+	spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset,
+	    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_FEATURES_FOR_WRITE, tx);
+	spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset,
+	    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_FEATURE_DESCRIPTIONS, tx);
+}
+
+/*
+ * Enable any required dependencies, then enable the requested feature.
+ */
+void
+spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
+{
+	ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+	ASSERT(VALID_FEATURE_FID(fid));
+	feature_enable_sync(spa, &spa_feature_table[fid], tx);
+}
+
+void
+spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
+{
+	feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx);
+}
+
+void
+spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
+{
+	feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx);
+}
+
+boolean_t
+spa_feature_is_enabled(spa_t *spa, spa_feature_t fid)
+{
+	int err;
+	uint64_t refcount;
+
+	ASSERT(VALID_FEATURE_FID(fid));
+	if (spa_version(spa) < SPA_VERSION_FEATURES)
+		return (B_FALSE);
+
+	err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
+	ASSERT(err == 0 || err == ENOTSUP);
+	return (err == 0);
+}
+
+boolean_t
+spa_feature_is_active(spa_t *spa, spa_feature_t fid)
+{
+	int err;
+	uint64_t refcount;
+
+	ASSERT(VALID_FEATURE_FID(fid));
+	if (spa_version(spa) < SPA_VERSION_FEATURES)
+		return (B_FALSE);
+
+	err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
+	ASSERT(err == 0 || err == ENOTSUP);
+	return (err == 0 && refcount > 0);
+}
+
+/*
+ * For the feature specified by fid (which must depend on
+ * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the
+ * OUT txg argument.
+ *
+ * Returns B_TRUE if the feature is enabled, in which case txg will be filled
+ * with the transaction group in which the specified feature was enabled.
+ * Returns B_FALSE otherwise (i.e. if the feature is not enabled).
+ */
+boolean_t
+spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg)
+{
+	int err;
+
+	ASSERT(VALID_FEATURE_FID(fid));
+	if (spa_version(spa) < SPA_VERSION_FEATURES)
+		return (B_FALSE);
+
+	err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg);
+	ASSERT(err == 0 || err == ENOTSUP);
+
+	return (err == 0);
+}
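
A usage sketch for the query above; SPA_FEATURE_HOLE_BIRTH is one in-tree
feature that depends on SPA_FEATURE_ENABLED_TXG (the surrounding caller is
hypothetical):

	uint64_t txg;

	if (spa_feature_enabled_txg(spa, SPA_FEATURE_HOLE_BIRTH, &txg))
		dprintf("hole_birth enabled in txg %llu\n",
		    (u_longlong_t)txg);
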
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs.conf
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs.conf
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs.conf
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs.conf	12 Jun 2012 05:57:28 -0000
@@ -0,0 +1,28 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+name="zfs" parent="pseudo";
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c,v
retrieving revision 1.3
diff -u -p -r1.3 zfs_acl.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c	27 Feb 2010 23:43:53 -0000	1.3
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c	4 May 2017 20:15:44 -0000
@@ -19,8 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -31,7 +32,6 @@
 #include <sys/resource.h>
 #include <sys/vfs.h>
 #include <sys/vnode.h>
-#include <sys/sid.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/kmem.h>
@@ -40,7 +40,6 @@
 #include <sys/unistd.h>
 #include <sys/sdt.h>
 #include <sys/fs/zfs.h>
-#include <sys/mode.h>
 #include <sys/policy.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_fuid.h>
@@ -50,8 +49,8 @@
 #include <sys/dmu.h>
 #include <sys/dnode.h>
 #include <sys/zap.h>
-#include <sys/fs/fs_subr.h>
-#include <sys/acl/acl_common.h>
+#include <sys/sa.h>
+#include <acl/acl_common.h>
 
 #define	ALLOW	ACE_ACCESS_ALLOWED_ACE_TYPE
 #define	DENY	ACE_ACCESS_DENIED_ACE_TYPE
@@ -321,6 +320,117 @@ static acl_ops_t zfs_acl_fuid_ops = {
 	zfs_ace_fuid_data
 };
 
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL versions in order to determine whether the file used to have
+ * an external ACL and what version of ACL previously existed on the
+ * file.  Would really be nice to not need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+	zfs_acl_phys_t acl_phys;
+	int error;
+
+	if (zp->z_is_sa)
+		return (0);
+
+	/*
+	 * Need to deal with a potential
+	 * race where zfs_sa_upgrade could cause
+	 * z_is_sa to change.
+	 *
+	 * If the lookup fails then the state of z_is_sa should have
+	 * changed.
+	 */
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+	    &acl_phys, sizeof (acl_phys))) == 0)
+		return (acl_phys.z_acl_extern_obj);
+	else {
+		/*
+		 * after upgrade the SA_ZPL_ZNODE_ACL should have been
+		 * removed
+		 */
+		VERIFY(zp->z_is_sa && error == ENOENT);
+		return (0);
+	}
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+    zfs_acl_phys_t *aclphys)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint64_t acl_count;
+	int size;
+	int error;
+
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+	if (zp->z_is_sa) {
+		if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+		    &size)) != 0)
+			return (error);
+		*aclsize = size;
+		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+		    &acl_count, sizeof (acl_count))) != 0)
+			return (error);
+		*aclcount = acl_count;
+	} else {
+		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+		    aclphys, sizeof (*aclphys))) != 0)
+			return (error);
+
+		if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+			*aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+			*aclcount = aclphys->z_acl_size;
+		} else {
+			*aclsize = aclphys->z_acl_size;
+			*aclcount = aclphys->z_acl_count;
+		}
+	}
+	return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+	zfs_acl_phys_t acl_phys;
+
+	if (zp->z_is_sa)
+		return (ZFS_ACL_VERSION_FUID);
+	else {
+		int error;
+
+		/*
+		 * Need to deal with a potential
+		 * race where zfs_sa_upgrade could cause
+		 * z_is_sa to change.
+		 *
+		 * If the lookup fails then the state of z_is_sa should have
+		 * changed.
+		 */
+		if ((error = sa_lookup(zp->z_sa_hdl,
+		    SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+		    &acl_phys, sizeof (acl_phys))) == 0)
+			return (acl_phys.z_acl_version);
+		else {
+			/*
+			 * After upgrade SA_ZPL_ZNODE_ACL should have
+			 * been removed.
+			 */
+			VERIFY(zp->z_is_sa && error == ENOENT);
+			return (ZFS_ACL_VERSION_FUID);
+		}
+	}
+}
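
The race the two compat helpers above tolerate can be summarized as follows
(a sketch, not part of the patch):

	/*
	 * Interleavings with a concurrent zfs_sa_upgrade(), which sets
	 * z_is_sa and removes SA_ZPL_ZNODE_ACL:
	 *
	 *   z_is_sa at check   sa_lookup(ZNODE_ACL)   conclusion
	 *   B_TRUE             (not attempted)        already SA layout
	 *   B_FALSE            0                      legacy layout; use it
	 *   B_FALSE            ENOENT                 upgraded meanwhile;
	 *                                             VERIFY re-checks z_is_sa
	 */
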
+
 static int
 zfs_acl_version(int version)
 {
@@ -336,7 +446,7 @@ zfs_acl_version_zp(znode_t *zp)
 	return (zfs_acl_version(zp->z_zfsvfs->z_version));
 }
 
-static zfs_acl_t *
+zfs_acl_t *
 zfs_acl_alloc(int vers)
 {
 	zfs_acl_t *aclp;
@@ -352,7 +462,7 @@ zfs_acl_alloc(int vers)
 	return (aclp);
 }
 
-static zfs_acl_node_t *
+zfs_acl_node_t *
 zfs_acl_node_alloc(size_t bytes)
 {
 	zfs_acl_node_t *aclnode;
@@ -463,6 +573,8 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *
 {
 	zfs_acl_node_t *aclnode;
 
+	ASSERT(aclp);
+
 	if (start == NULL) {
 		aclnode = list_head(&aclp->z_acl);
 		if (aclnode == NULL)
@@ -509,6 +621,7 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void *
 		*who = aclp->z_ops.ace_who_get(acep);
 		aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
 		aclnode->z_ace_idx++;
+
 		return ((void *)acep);
 	}
 	return (NULL);
@@ -542,7 +655,7 @@ zfs_acl_curr_node(zfs_acl_t *aclp)
  */
 int
 zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
-    void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size,
+    void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
     zfs_fuid_info_t **fuidp, cred_t *cr)
 {
 	int i;
@@ -569,7 +682,7 @@ zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vt
 		 */
 		if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type,
 		    aceptr->z_hdr.z_flags) != B_TRUE)
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 
 		switch (acep->a_type) {
 		case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
@@ -676,7 +789,7 @@ zfs_copy_ace_2_oldace(vtype_t obj_type, 
 		 */
 		if (zfs_ace_valid(obj_type, aclp, aceptr->z_type,
 		    aceptr->z_flags) != B_TRUE)
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 	}
 	*size = (caddr_t)aceptr - (caddr_t)z_acl;
 	return (0);
@@ -771,10 +884,10 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep,
 
 /*
  * Determine mode of file based on ACL.
- * Also, create FUIDs for any User/Group ACEs
  */
-static uint64_t
-zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp)
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
+    uint64_t *pflags, uint64_t fuid, uint64_t fgid)
 {
 	int		entry_type;
 	mode_t		mode;
@@ -785,7 +898,7 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t 
 	uint32_t	access_mask;
 	boolean_t	an_exec_denied = B_FALSE;
 
-	mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+	mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
 
 	while (acep = zfs_acl_next_ace(aclp, acep, &who,
 	    &access_mask, &iflags, &type)) {
@@ -796,14 +909,13 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t 
 		entry_type = (iflags & ACE_TYPE_FLAGS);
 
 		/*
-		 * Skip over owner@, group@ or everyone@ inherit only ACEs
+		 * Skip over any inherit_only ACEs
 		 */
-		if ((iflags & ACE_INHERIT_ONLY_ACE) &&
-		    (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
-		    entry_type == OWNING_GROUP))
+		if (iflags & ACE_INHERIT_ONLY_ACE)
 			continue;
 
-		if (entry_type == ACE_OWNER) {
+		if (entry_type == ACE_OWNER || (entry_type == 0 &&
+		    who == fuid)) {
 			if ((access_mask & ACE_READ_DATA) &&
 			    (!(seen & S_IRUSR))) {
 				seen |= S_IRUSR;
@@ -825,7 +937,8 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t 
 					mode |= S_IXUSR;
 				}
 			}
-		} else if (entry_type == OWNING_GROUP) {
+		} else if (entry_type == OWNING_GROUP ||
+		    (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
 			if ((access_mask & ACE_READ_DATA) &&
 			    (!(seen & S_IRGRP))) {
 				seen |= S_IRGRP;
@@ -930,48 +1043,13 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t 
 		an_exec_denied = B_TRUE;
 
 	if (an_exec_denied)
-		zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED;
+		*pflags &= ~ZFS_NO_EXECS_DENIED;
 	else
-		zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED;
+		*pflags |= ZFS_NO_EXECS_DENIED;
 
 	return (mode);
 }
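
A worked illustration of the computation above, under hypothetical inputs: a
regular file whose ACL holds a single owner@ ALLOW ACE granting read, write
and execute (aclp, fuid and fgid are assumed to be set up elsewhere):

	uint64_t pflags = 0;
	uint64_t mode;

	mode = zfs_mode_compute((uint64_t)S_IFREG, aclp, &pflags, fuid, fgid);
	/*
	 * mode == S_IFREG | S_IRUSR | S_IWUSR | S_IXUSR (0100700); each
	 * bit is taken once, tracked via `seen`.  No ACE denied execute
	 * to anyone, so ZFS_NO_EXECS_DENIED is set in pflags.
	 */
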
 
-static zfs_acl_t *
-zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify)
-{
-	zfs_acl_t	*aclp;
-	zfs_acl_node_t	*aclnode;
-
-	aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version);
-
-	/*
-	 * Version 0 to 1 znode_acl_phys has the size/count fields swapped.
-	 * Version 0 didn't have a size field, only a count.
-	 */
-	if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) {
-		aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_size;
-		aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count);
-	} else {
-		aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
-		aclp->z_acl_bytes = zp->z_phys->zp_acl.z_acl_size;
-	}
-
-	aclnode = zfs_acl_node_alloc(will_modify ? aclp->z_acl_bytes : 0);
-	aclnode->z_ace_count = aclp->z_acl_count;
-	if (will_modify) {
-		bcopy(zp->z_phys->zp_acl.z_ace_data, aclnode->z_acldata,
-		    aclp->z_acl_bytes);
-	} else {
-		aclnode->z_size = aclp->z_acl_bytes;
-		aclnode->z_acldata = &zp->z_phys->zp_acl.z_ace_data[0];
-	}
-
-	list_insert_head(&aclp->z_acl, aclnode);
-
-	return (aclp);
-}
-
 /*
  * Read an external acl object.  If the intent is to modify, always
  * create a new acl and leave any cached acl in place.
@@ -979,60 +1057,100 @@ zfs_acl_node_read_internal(znode_t *zp, 
 static int
 zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
 {
-	uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
 	zfs_acl_t	*aclp;
-	size_t		aclsize;
-	size_t		acl_count;
+	int		aclsize;
+	int		acl_count;
 	zfs_acl_node_t	*aclnode;
-	int error;
+	zfs_acl_phys_t	znode_acl;
+	int		version;
+	int		error;
 
 	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+	ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
 
 	if (zp->z_acl_cached && !will_modify) {
 		*aclpp = zp->z_acl_cached;
 		return (0);
 	}
 
-	if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
-		*aclpp = zfs_acl_node_read_internal(zp, will_modify);
-		if (!will_modify)
-			zp->z_acl_cached = *aclpp;
-		return (0);
+	version = zfs_znode_acl_version(zp);
+
+	if ((error = zfs_acl_znode_info(zp, &aclsize,
+	    &acl_count, &znode_acl)) != 0) {
+		goto done;
 	}
 
-	aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version);
-	if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) {
-		zfs_acl_phys_v0_t *zacl0 =
-		    (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl;
+	aclp = zfs_acl_alloc(version);
 
-		aclsize = ZFS_ACL_SIZE(zacl0->z_acl_count);
-		acl_count = zacl0->z_acl_count;
-	} else {
-		aclsize = zp->z_phys->zp_acl.z_acl_size;
-		acl_count = zp->z_phys->zp_acl.z_acl_count;
-		if (aclsize == 0)
-			aclsize = acl_count * sizeof (zfs_ace_t);
-	}
-	aclnode = zfs_acl_node_alloc(aclsize);
-	list_insert_head(&aclp->z_acl, aclnode);
-	error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
-	    aclsize, aclnode->z_acldata, DMU_READ_PREFETCH);
-	aclnode->z_ace_count = acl_count;
 	aclp->z_acl_count = acl_count;
 	aclp->z_acl_bytes = aclsize;
 
+	aclnode = zfs_acl_node_alloc(aclsize);
+	aclnode->z_ace_count = aclp->z_acl_count;
+	aclnode->z_size = aclsize;
+
+	if (!zp->z_is_sa) {
+		if (znode_acl.z_acl_extern_obj) {
+			error = dmu_read(zp->z_zfsvfs->z_os,
+			    znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+			    aclnode->z_acldata, DMU_READ_PREFETCH);
+		} else {
+			bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+			    aclnode->z_size);
+		}
+	} else {
+		error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs),
+		    aclnode->z_acldata, aclnode->z_size);
+	}
+
 	if (error != 0) {
 		zfs_acl_free(aclp);
+		zfs_acl_node_free(aclnode);
 		/* convert checksum errors into IO errors */
 		if (error == ECKSUM)
-			error = EIO;
-		return (error);
+			error = SET_ERROR(EIO);
+		goto done;
 	}
 
+	list_insert_head(&aclp->z_acl, aclnode);
+
 	*aclpp = aclp;
 	if (!will_modify)
 		zp->z_acl_cached = aclp;
-	return (0);
+done:
+	return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+    boolean_t start, void *userdata)
+{
+	zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+	if (start) {
+		cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+	} else {
+		cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+		    cb->cb_acl_node);
+	}
+	*dataptr = cb->cb_acl_node->z_acldata;
+	*length = cb->cb_acl_node->z_size;
+}
+
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+	int error;
+	zfs_acl_t *aclp;
+
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+	if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0)
+		zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
+		    &zp->z_pflags, zp->z_uid, zp->z_gid);
+	return (error);
 }
 
 /*
@@ -1045,28 +1163,35 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t
 int
 zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
 {
-	int		error;
-	znode_phys_t	*zphys = zp->z_phys;
-	zfs_acl_phys_t	*zacl = &zphys->zp_acl;
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	uint64_t	aoid = zphys->zp_acl.z_acl_extern_obj;
-	uint64_t	off = 0;
-	dmu_object_type_t otype;
-	zfs_acl_node_t	*aclnode;
-
-	dmu_buf_will_dirty(zp->z_dbuf, tx);
+	int			error;
+	zfsvfs_t		*zfsvfs = zp->z_zfsvfs;
+	dmu_object_type_t	otype;
+	zfs_acl_locator_cb_t	locate = { 0 };
+	uint64_t		mode;
+	sa_bulk_attr_t		bulk[5];
+	uint64_t		ctime[2];
+	int			count = 0;
+
+	mode = zp->z_mode;
+
+	mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+	    zp->z_uid, zp->z_gid);
+
+	zp->z_mode = mode;
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+	    &mode, sizeof (mode));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+	    &ctime, sizeof (ctime));
 
 	if (zp->z_acl_cached) {
 		zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = NULL;
 	}
 
-	zphys->zp_mode = zfs_mode_compute(zp, aclp);
-
 	/*
-	 * Decide which object type to use.  If we are forced to
-	 * use old ACL format then transform ACL into zfs_oldace_t
-	 * layout.
+	 * Upgrade needed?
 	 */
 	if (!zfsvfs->z_use_fuids) {
 		otype = DMU_OT_OLDACL;
@@ -1078,462 +1203,196 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t
 		otype = DMU_OT_ACL;
 	}
 
-	if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
-		/*
-		 * If ACL was previously external and we are now
-		 * converting to new ACL format then release old
-		 * ACL object and create a new one.
-		 */
-		if (aoid && aclp->z_version != zacl->z_acl_version) {
-			error = dmu_object_free(zfsvfs->z_os,
-			    zp->z_phys->zp_acl.z_acl_extern_obj, tx);
-			if (error)
-				return (error);
-			aoid = 0;
-		}
-		if (aoid == 0) {
-			aoid = dmu_object_alloc(zfsvfs->z_os,
-			    otype, aclp->z_acl_bytes,
-			    otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE,
-			    otype == DMU_OT_ACL ? DN_MAX_BONUSLEN : 0, tx);
-		} else {
-			(void) dmu_object_set_blocksize(zfsvfs->z_os, aoid,
-			    aclp->z_acl_bytes, 0, tx);
-		}
-		zphys->zp_acl.z_acl_extern_obj = aoid;
-		for (aclnode = list_head(&aclp->z_acl); aclnode;
-		    aclnode = list_next(&aclp->z_acl, aclnode)) {
-			if (aclnode->z_ace_count == 0)
-				continue;
-			dmu_write(zfsvfs->z_os, aoid, off,
-			    aclnode->z_size, aclnode->z_acldata, tx);
-			off += aclnode->z_size;
-		}
-	} else {
-		void *start = zacl->z_ace_data;
-		/*
-		 * Migrating back embedded?
-		 */
-		if (zphys->zp_acl.z_acl_extern_obj) {
-			error = dmu_object_free(zfsvfs->z_os,
-			    zp->z_phys->zp_acl.z_acl_extern_obj, tx);
-			if (error)
-				return (error);
-			zphys->zp_acl.z_acl_extern_obj = 0;
-		}
-
-		for (aclnode = list_head(&aclp->z_acl); aclnode;
-		    aclnode = list_next(&aclp->z_acl, aclnode)) {
-			if (aclnode->z_ace_count == 0)
-				continue;
-			bcopy(aclnode->z_acldata, start, aclnode->z_size);
-			start = (caddr_t)start + aclnode->z_size;
-		}
-	}
-
-	/*
-	 * If Old version then swap count/bytes to match old
-	 * layout of znode_acl_phys_t.
-	 */
-	if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
-		zphys->zp_acl.z_acl_size = aclp->z_acl_count;
-		zphys->zp_acl.z_acl_count = aclp->z_acl_bytes;
-	} else {
-		zphys->zp_acl.z_acl_size = aclp->z_acl_bytes;
-		zphys->zp_acl.z_acl_count = aclp->z_acl_count;
-	}
-
-	zphys->zp_acl.z_acl_version = aclp->z_version;
-
 	/*
-	 * Replace ACL wide bits, but first clear them.
+	 * Arrgh, we have to handle the old on-disk format
+	 * as well as the newer (preferred) SA format.
 	 */
-	zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS;
 
-	zp->z_phys->zp_flags |= aclp->z_hints;
+	if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+		locate.cb_aclp = aclp;
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+		    zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+		    NULL, &aclp->z_acl_count, sizeof (uint64_t));
+	} else { /* Painful legacy way */
+		zfs_acl_node_t *aclnode;
+		uint64_t off = 0;
+		zfs_acl_phys_t acl_phys;
+		uint64_t aoid;
 
-	if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
-		zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL;
-
-	return (0);
-}
+		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+		    &acl_phys, sizeof (acl_phys))) != 0)
+			return (error);
 
-/*
- * Update access mask for prepended ACE
- *
- * This applies the "groupmask" value for aclmode property.
- */
-static void
-zfs_acl_prepend_fixup(zfs_acl_t *aclp, void  *acep, void  *origacep,
-    mode_t mode, uint64_t owner)
-{
-	int	rmask, wmask, xmask;
-	int	user_ace;
-	uint16_t aceflags;
-	uint32_t origmask, acepmask;
-	uint64_t fuid;
-
-	aceflags = aclp->z_ops.ace_flags_get(acep);
-	fuid = aclp->z_ops.ace_who_get(acep);
-	origmask = aclp->z_ops.ace_mask_get(origacep);
-	acepmask = aclp->z_ops.ace_mask_get(acep);
-
-	user_ace = (!(aceflags &
-	    (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP)));
-
-	if (user_ace && (fuid == owner)) {
-		rmask = S_IRUSR;
-		wmask = S_IWUSR;
-		xmask = S_IXUSR;
-	} else {
-		rmask = S_IRGRP;
-		wmask = S_IWGRP;
-		xmask = S_IXGRP;
-	}
+		aoid = acl_phys.z_acl_extern_obj;
 
-	if (origmask & ACE_READ_DATA) {
-		if (mode & rmask) {
-			acepmask &= ~ACE_READ_DATA;
+		if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+			/*
+			 * If ACL was previously external and we are now
+			 * converting to new ACL format then release old
+			 * ACL object and create a new one.
+			 */
+			if (aoid &&
+			    aclp->z_version != acl_phys.z_acl_version) {
+				error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+				if (error)
+					return (error);
+				aoid = 0;
+			}
+			if (aoid == 0) {
+				aoid = dmu_object_alloc(zfsvfs->z_os,
+				    otype, aclp->z_acl_bytes,
+				    otype == DMU_OT_ACL ?
+				    DMU_OT_SYSACL : DMU_OT_NONE,
+				    otype == DMU_OT_ACL ?
+				    DN_MAX_BONUSLEN : 0, tx);
+			} else {
+				(void) dmu_object_set_blocksize(zfsvfs->z_os,
+				    aoid, aclp->z_acl_bytes, 0, tx);
+			}
+			acl_phys.z_acl_extern_obj = aoid;
+			for (aclnode = list_head(&aclp->z_acl); aclnode;
+			    aclnode = list_next(&aclp->z_acl, aclnode)) {
+				if (aclnode->z_ace_count == 0)
+					continue;
+				dmu_write(zfsvfs->z_os, aoid, off,
+				    aclnode->z_size, aclnode->z_acldata, tx);
+				off += aclnode->z_size;
+			}
 		} else {
-			acepmask |= ACE_READ_DATA;
-		}
-	}
+			void *start = acl_phys.z_ace_data;
+			/*
+			 * Migrating back embedded?
+			 */
+			if (acl_phys.z_acl_extern_obj) {
+				error = dmu_object_free(zfsvfs->z_os,
+				    acl_phys.z_acl_extern_obj, tx);
+				if (error)
+					return (error);
+				acl_phys.z_acl_extern_obj = 0;
+			}
 
-	if (origmask & ACE_WRITE_DATA) {
-		if (mode & wmask) {
-			acepmask &= ~ACE_WRITE_DATA;
-		} else {
-			acepmask |= ACE_WRITE_DATA;
+			for (aclnode = list_head(&aclp->z_acl); aclnode;
+			    aclnode = list_next(&aclp->z_acl, aclnode)) {
+				if (aclnode->z_ace_count == 0)
+					continue;
+				bcopy(aclnode->z_acldata, start,
+				    aclnode->z_size);
+				start = (caddr_t)start + aclnode->z_size;
+			}
 		}
-	}
-
-	if (origmask & ACE_APPEND_DATA) {
-		if (mode & wmask) {
-			acepmask &= ~ACE_APPEND_DATA;
+		/*
+		 * If the old version, swap count/bytes to match the old
+		 * layout of znode_acl_phys_t.
+		 */
+		if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+			acl_phys.z_acl_size = aclp->z_acl_count;
+			acl_phys.z_acl_count = aclp->z_acl_bytes;
 		} else {
-			acepmask |= ACE_APPEND_DATA;
+			acl_phys.z_acl_size = aclp->z_acl_bytes;
+			acl_phys.z_acl_count = aclp->z_acl_count;
 		}
-	}
+		acl_phys.z_acl_version = aclp->z_version;
 
-	if (origmask & ACE_EXECUTE) {
-		if (mode & xmask) {
-			acepmask &= ~ACE_EXECUTE;
-		} else {
-			acepmask |= ACE_EXECUTE;
-		}
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+		    &acl_phys, sizeof (acl_phys));
 	}
-	aclp->z_ops.ace_mask_set(acep, acepmask);
-}
-
-/*
- * Apply mode to canonical six ACEs.
- */
-static void
-zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode)
-{
-	zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl);
-	void	*acep;
-	int	maskoff = aclp->z_ops.ace_mask_off();
-	size_t abstract_size = aclp->z_ops.ace_abstract_size();
-
-	ASSERT(aclnode != NULL);
-
-	acep = (void *)((caddr_t)aclnode->z_acldata +
-	    aclnode->z_size - (abstract_size * 6));
 
 	/*
-	 * Fixup final ACEs to match the mode
+	 * Replace ACL wide bits, but first clear them.
 	 */
+	zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
 
-	adjust_ace_pair_common(acep, maskoff, abstract_size,
-	    (mode & 0700) >> 6);	/* owner@ */
-
-	acep = (caddr_t)acep + (abstract_size * 2);
-
-	adjust_ace_pair_common(acep, maskoff, abstract_size,
-	    (mode & 0070) >> 3);	/* group@ */
-
-	acep = (caddr_t)acep + (abstract_size * 2);
-	adjust_ace_pair_common(acep, maskoff,
-	    abstract_size, mode);	/* everyone@ */
-}
-
-
-static int
-zfs_acl_ace_match(zfs_acl_t *aclp, void *acep, int allow_deny,
-    int entry_type, int accessmask)
-{
-	uint32_t mask = aclp->z_ops.ace_mask_get(acep);
-	uint16_t type = aclp->z_ops.ace_type_get(acep);
-	uint16_t flags = aclp->z_ops.ace_flags_get(acep);
-
-	return (mask == accessmask && type == allow_deny &&
-	    ((flags & ACE_TYPE_FLAGS) == entry_type));
-}
-
-/*
- * Can prepended ACE be reused?
- */
-static int
-zfs_reuse_deny(zfs_acl_t *aclp, void *acep, void *prevacep)
-{
-	int okay_masks;
-	uint16_t prevtype;
-	uint16_t prevflags;
-	uint16_t flags;
-	uint32_t mask, prevmask;
-
-	if (prevacep == NULL)
-		return (B_FALSE);
-
-	prevtype = aclp->z_ops.ace_type_get(prevacep);
-	prevflags = aclp->z_ops.ace_flags_get(prevacep);
-	flags = aclp->z_ops.ace_flags_get(acep);
-	mask = aclp->z_ops.ace_mask_get(acep);
-	prevmask = aclp->z_ops.ace_mask_get(prevacep);
-
-	if (prevtype != DENY)
-		return (B_FALSE);
-
-	if (prevflags != (flags & ACE_IDENTIFIER_GROUP))
-		return (B_FALSE);
-
-	okay_masks = (mask & OKAY_MASK_BITS);
-
-	if (prevmask & ~okay_masks)
-		return (B_FALSE);
-
-	return (B_TRUE);
-}
-
-
-/*
- * Insert new ACL node into chain of zfs_acl_node_t's
- *
- * This will result in two possible results.
- * 1. If the ACL is currently just a single zfs_acl_node and
- *    we are prepending the entry then current acl node will have
- *    a new node inserted above it.
- *
- * 2. If we are inserting in the middle of current acl node then
- *    the current node will be split in two and new node will be inserted
- *    in between the two split nodes.
- */
-static zfs_acl_node_t *
-zfs_acl_ace_insert(zfs_acl_t *aclp, void  *acep)
-{
-	zfs_acl_node_t 	*newnode;
-	zfs_acl_node_t 	*trailernode = NULL;
-	zfs_acl_node_t 	*currnode = zfs_acl_curr_node(aclp);
-	int		curr_idx = aclp->z_curr_node->z_ace_idx;
-	int		trailer_count;
-	size_t		oldsize;
-
-	newnode = zfs_acl_node_alloc(aclp->z_ops.ace_size(acep));
-	newnode->z_ace_count = 1;
-
-	oldsize = currnode->z_size;
-
-	if (curr_idx != 1) {
-		trailernode = zfs_acl_node_alloc(0);
-		trailernode->z_acldata = acep;
-
-		trailer_count = currnode->z_ace_count - curr_idx + 1;
-		currnode->z_ace_count = curr_idx - 1;
-		currnode->z_size = (caddr_t)acep - (caddr_t)currnode->z_acldata;
-		trailernode->z_size = oldsize - currnode->z_size;
-		trailernode->z_ace_count = trailer_count;
-	}
-
-	aclp->z_acl_count += 1;
-	aclp->z_acl_bytes += aclp->z_ops.ace_size(acep);
-
-	if (curr_idx == 1)
-		list_insert_before(&aclp->z_acl, currnode, newnode);
-	else
-		list_insert_after(&aclp->z_acl, currnode, newnode);
-	if (trailernode) {
-		list_insert_after(&aclp->z_acl, newnode, trailernode);
-		aclp->z_curr_node = trailernode;
-		trailernode->z_ace_idx = 1;
-	}
-
-	return (newnode);
-}
-
-/*
- * Prepend deny ACE
- */
-static void *
-zfs_acl_prepend_deny(uint64_t uid, zfs_acl_t *aclp, void *acep,
-    mode_t mode)
-{
-	zfs_acl_node_t *aclnode;
-	void  *newacep;
-	uint64_t fuid;
-	uint16_t flags;
-
-	aclnode = zfs_acl_ace_insert(aclp, acep);
-	newacep = aclnode->z_acldata;
-	fuid = aclp->z_ops.ace_who_get(acep);
-	flags = aclp->z_ops.ace_flags_get(acep);
-	zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS));
-	zfs_acl_prepend_fixup(aclp, newacep, acep, mode, uid);
-
-	return (newacep);
-}
-
-/*
- * Split an inherited ACE into inherit_only ACE
- * and original ACE with inheritance flags stripped off.
- */
-static void
-zfs_acl_split_ace(zfs_acl_t *aclp, zfs_ace_hdr_t *acep)
-{
-	zfs_acl_node_t *aclnode;
-	zfs_acl_node_t *currnode;
-	void  *newacep;
-	uint16_t type, flags;
-	uint32_t mask;
-	uint64_t fuid;
-
-	type = aclp->z_ops.ace_type_get(acep);
-	flags = aclp->z_ops.ace_flags_get(acep);
-	mask = aclp->z_ops.ace_mask_get(acep);
-	fuid = aclp->z_ops.ace_who_get(acep);
-
-	aclnode = zfs_acl_ace_insert(aclp, acep);
-	newacep = aclnode->z_acldata;
-
-	aclp->z_ops.ace_type_set(newacep, type);
-	aclp->z_ops.ace_flags_set(newacep, flags | ACE_INHERIT_ONLY_ACE);
-	aclp->z_ops.ace_mask_set(newacep, mask);
-	aclp->z_ops.ace_type_set(newacep, type);
-	aclp->z_ops.ace_who_set(newacep, fuid);
-	aclp->z_next_ace = acep;
-	flags &= ~ALL_INHERIT;
-	aclp->z_ops.ace_flags_set(acep, flags);
-	currnode = zfs_acl_curr_node(aclp);
-	ASSERT(currnode->z_ace_idx >= 1);
-	currnode->z_ace_idx -= 1;
-}
+	zp->z_pflags |= aclp->z_hints;
 
-/*
- * Are ACES started at index i, the canonical six ACES?
- */
-static int
-zfs_have_canonical_six(zfs_acl_t *aclp)
-{
-	void *acep;
-	zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl);
-	int		i = 0;
-	size_t abstract_size = aclp->z_ops.ace_abstract_size();
-
-	ASSERT(aclnode != NULL);
-
-	if (aclnode->z_ace_count < 6)
-		return (0);
-
-	acep = (void *)((caddr_t)aclnode->z_acldata +
-	    aclnode->z_size - (aclp->z_ops.ace_abstract_size() * 6));
-
-	if ((zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
-	    DENY, ACE_OWNER, 0) &&
-	    zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
-	    ALLOW, ACE_OWNER, OWNER_ALLOW_MASK) &&
-	    zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY,
-	    OWNING_GROUP, 0) && zfs_acl_ace_match(aclp, (caddr_t)acep +
-	    (abstract_size * i++),
-	    ALLOW, OWNING_GROUP, 0) &&
-	    zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
-	    DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) &&
-	    zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++),
-	    ALLOW, ACE_EVERYONE, EVERYONE_ALLOW_MASK))) {
-		return (1);
-	} else {
-		return (0);
-	}
-}
-
-
-/*
- * Apply step 1g, to group entries
- *
- * Need to deal with corner case where group may have
- * greater permissions than owner.  If so then limit
- * group permissions, based on what extra permissions
- * group has.
- */
-static void
-zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep,
-    mode_t mode)
-{
-	uint32_t prevmask = aclp->z_ops.ace_mask_get(prevacep);
-	uint32_t mask = aclp->z_ops.ace_mask_get(acep);
-	uint16_t prevflags = aclp->z_ops.ace_flags_get(prevacep);
-	mode_t extramode = (mode >> 3) & 07;
-	mode_t ownermode = (mode >> 6);
-
-	if (prevflags & ACE_IDENTIFIER_GROUP) {
-
-		extramode &= ~ownermode;
+	if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
+		zp->z_pflags |= ZFS_ACL_TRIVIAL;
 
-		if (extramode) {
-			if (extramode & S_IROTH) {
-				prevmask &= ~ACE_READ_DATA;
-				mask &= ~ACE_READ_DATA;
-			}
-			if (extramode & S_IWOTH) {
-				prevmask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
-				mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
-			}
-			if (extramode & S_IXOTH) {
-				prevmask  &= ~ACE_EXECUTE;
-				mask &= ~ACE_EXECUTE;
-			}
-		}
-	}
-	aclp->z_ops.ace_mask_set(acep, mask);
-	aclp->z_ops.ace_mask_set(prevacep, prevmask);
+	zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE);
+	return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
 }
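
The three storage layouts this function juggles can be summarized in a small
sketch (the helper is hypothetical, for illustration only):

	static const char *
	zfs_acl_storage(znode_t *zp, zfs_acl_t *aclp)
	{
		if (zp->z_is_sa)		/* the easy case */
			return ("SA attribute SA_ZPL_DACL_ACES");
		if (aclp->z_acl_bytes > ZFS_ACE_SPACE)	/* legacy, large */
			return ("external object z_acl_extern_obj");
		return ("embedded znode_acl_phys_t.z_ace_data");
	}
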
 
-/*
- * Apply the chmod algorithm as described
- * in PSARC/2002/240
- */
 static void
-zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid,
-    uint64_t mode, zfs_acl_t *aclp)
+zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim,
+    zfs_acl_t *aclp)
 {
-	void		*acep = NULL, *prevacep = NULL;
+	void		*acep = NULL;
 	uint64_t	who;
-	int 		i;
+	int		new_count, new_bytes;
+	int		ace_size;
 	int 		entry_type;
-	int 		reuse_deny;
-	int 		need_canonical_six = 1;
 	uint16_t	iflags, type;
 	uint32_t	access_mask;
-
-	/*
-	 * If discard then just discard all ACL nodes which
-	 * represent the ACEs.
-	 *
-	 * New owner@/group@/everone@ ACEs will be added
-	 * later.
-	 */
-	if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
-		zfs_acl_release_nodes(aclp);
+	zfs_acl_node_t	*newnode;
+	size_t 		abstract_size = aclp->z_ops.ace_abstract_size();
+	void 		*zacep;
+	boolean_t	isdir;
+	trivial_acl_t	masks;
+
+	new_count = new_bytes = 0;
+
+	isdir = (vtype == VDIR);
+
+	acl_trivial_access_masks((mode_t)mode, isdir, &masks);
+
+	newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
+
+	zacep = newnode->z_acldata;
+	if (masks.allow0) {
+		zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER);
+		zacep = (void *)((uintptr_t)zacep + abstract_size);
+		new_count++;
+		new_bytes += abstract_size;
+	}
+	if (masks.deny1) {
+		zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER);
+		zacep = (void *)((uintptr_t)zacep + abstract_size);
+		new_count++;
+		new_bytes += abstract_size;
+	}
+	if (masks.deny2) {
+		zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP);
+		zacep = (void *)((uintptr_t)zacep + abstract_size);
+		new_count++;
+		new_bytes += abstract_size;
+	}
 
 	while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
 	    &iflags, &type)) {
-
 		entry_type = (iflags & ACE_TYPE_FLAGS);
-		iflags = (iflags & ALL_INHERIT);
+		/*
+		 * ACEs used to represent the file mode may be divided
+		 * into an equivalent pair of inherit-only and regular
+		 * ACEs, if they are inheritable.
+		 * Skip regular ACEs, which are replaced by the new mode.
+		 */
+		if (split && (entry_type == ACE_OWNER ||
+		    entry_type == OWNING_GROUP ||
+		    entry_type == ACE_EVERYONE)) {
+			if (!isdir || !(iflags &
+			    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+				continue;
+			/*
+			 * We preserve owner@, group@, or everyone@
+			 * permissions, if they are inheritable, by
+			 * copying them to inherit_only ACEs. This
+			 * prevents inheritable permissions from being
+			 * altered along with the file mode.
+			 */
+			iflags |= ACE_INHERIT_ONLY_ACE;
+		}
+
+		/*
+		 * If this ACL has any inheritable ACEs, mark that in
+		 * the hints (which are later masked into the pflags)
+		 * so create knows to do inheritance.
+		 */
+		if (isdir && (iflags &
+		    (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+			aclp->z_hints |= ZFS_INHERIT_ACE;
 
 		if ((type != ALLOW && type != DENY) ||
 		    (iflags & ACE_INHERIT_ONLY_ACE)) {
-			if (iflags)
-				aclp->z_hints |= ZFS_INHERIT_ACE;
 			switch (type) {
 			case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
 			case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
@@ -1542,131 +1401,58 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t
 				aclp->z_hints |= ZFS_ACL_OBJ_ACE;
 				break;
 			}
-			goto nextace;
-		}
-
-		/*
-		 * Need to split ace into two?
-		 */
-		if ((iflags & (ACE_FILE_INHERIT_ACE|
-		    ACE_DIRECTORY_INHERIT_ACE)) &&
-		    (!(iflags & ACE_INHERIT_ONLY_ACE))) {
-			zfs_acl_split_ace(aclp, acep);
-			aclp->z_hints |= ZFS_INHERIT_ACE;
-			goto nextace;
-		}
-
-		if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
-		    (entry_type == OWNING_GROUP)) {
-			access_mask &= ~OGE_CLEAR;
-			aclp->z_ops.ace_mask_set(acep, access_mask);
-			goto nextace;
 		} else {
-			reuse_deny = B_TRUE;
-			if (type == ALLOW) {
-
-				/*
-				 * Check preceding ACE if any, to see
-				 * if we need to prepend a DENY ACE.
-				 * This is only applicable when the acl_mode
-				 * property == groupmask.
-				 */
-				if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) {
-
-					reuse_deny = zfs_reuse_deny(aclp, acep,
-					    prevacep);
-
-					if (!reuse_deny) {
-						prevacep =
-						    zfs_acl_prepend_deny(uid,
-						    aclp, acep, mode);
-					} else {
-						zfs_acl_prepend_fixup(
-						    aclp, prevacep,
-						    acep, mode, uid);
-					}
-					zfs_fixup_group_entries(aclp, acep,
-					    prevacep, mode);
-				}
-			}
-		}
-nextace:
-		prevacep = acep;
-	}
-
-	/*
-	 * Check out last six aces, if we have six.
-	 */
-
-	if (aclp->z_acl_count >= 6) {
-		if (zfs_have_canonical_six(aclp)) {
-			need_canonical_six = 0;
+			/*
+			 * Limit permissions granted by ACEs to be no greater
+			 * than permissions of the requested group mode.
+			 * Applies when the "aclmode" property is set to
+			 * "groupmask".
+			 */
+			if ((type == ALLOW) && trim)
+				access_mask &= masks.group;
 		}
-	}
-
-	if (need_canonical_six) {
-		size_t abstract_size = aclp->z_ops.ace_abstract_size();
-		void *zacep;
-		zfs_acl_node_t *aclnode =
-		    zfs_acl_node_alloc(abstract_size * 6);
-
-		aclnode->z_size = abstract_size * 6;
-		aclnode->z_ace_count = 6;
-		aclp->z_acl_bytes += aclnode->z_size;
-		list_insert_tail(&aclp->z_acl, aclnode);
-
-		zacep = aclnode->z_acldata;
-
-		i = 0;
-		zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
-		    0, DENY, -1, ACE_OWNER);
-		zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
-		    OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER);
-		zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0,
-		    DENY, -1, OWNING_GROUP);
-		zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0,
-		    ALLOW, -1, OWNING_GROUP);
-		zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
-		    EVERYONE_DENY_MASK, DENY, -1, ACE_EVERYONE);
-		zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++),
-		    EVERYONE_ALLOW_MASK, ALLOW, -1, ACE_EVERYONE);
-		aclp->z_acl_count += 6;
-	}
+		zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
+		ace_size = aclp->z_ops.ace_size(acep);
+		zacep = (void *)((uintptr_t)zacep + ace_size);
+		new_count++;
+		new_bytes += ace_size;
+	}
+	zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER);
+	zacep = (void *)((uintptr_t)zacep + abstract_size);
+	zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP);
+	zacep = (void *)((uintptr_t)zacep + abstract_size);
+	zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE);
 
-	zfs_acl_fixup_canonical_six(aclp, mode);
+	new_count += 3;
+	new_bytes += abstract_size * 3;
+	zfs_acl_release_nodes(aclp);
+	aclp->z_acl_count = new_count;
+	aclp->z_acl_bytes = new_bytes;
+	newnode->z_ace_count = new_count;
+	newnode->z_size = new_bytes;
+	list_insert_tail(&aclp->z_acl, newnode);
 }
 
 int
 zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
 {
-	int error;
+	int error = 0;
 
-	mutex_enter(&zp->z_lock);
 	mutex_enter(&zp->z_acl_lock);
-	*aclp = NULL;
-	error = zfs_acl_node_read(zp, aclp, B_TRUE);
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
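+	/*
+	 * With aclmode=discard, start from an empty ACL and let
+	 * zfs_acl_chmod() below rebuild just the trivial entries
+	 * for the new mode.
+	 */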
+	if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
+		*aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+	else
+		error = zfs_acl_node_read(zp, aclp, B_TRUE);
+
 	if (error == 0) {
-		(*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS;
-		zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp);
+		(*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+		zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE,
+		    (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
 	}
 	mutex_exit(&zp->z_acl_lock);
-	mutex_exit(&zp->z_lock);
-	return (error);
-}
 
-/*
- * strip off write_owner and write_acl
- */
-static void
-zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep)
-{
-	uint32_t mask = aclp->z_ops.ace_mask_get(acep);
-
-	if ((zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) &&
-	    (aclp->z_ops.ace_type_get(acep) == ALLOW)) {
-		mask &= ~RESTRICTED_CLEAR;
-		aclp->z_ops.ace_mask_set(acep, mask);
-	}
+	return (error);
 }
 
 /*
@@ -1690,11 +1476,11 @@ zfs_ace_can_use(vtype_t vtype, uint16_t 
  */
 static zfs_acl_t *
 zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
-    uint64_t mode, boolean_t *need_chmod)
+    uint64_t mode)
 {
-	void		*pacep;
-	void		*acep, *acep2;
-	zfs_acl_node_t  *aclnode, *aclnode2;
+	void		*pacep = NULL;
+	void		*acep;
+	zfs_acl_node_t  *aclnode;
 	zfs_acl_t	*aclp = NULL;
 	uint64_t	who;
 	uint32_t	access_mask;
@@ -1702,22 +1488,14 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_
 	size_t		ace_size;
 	void		*data1, *data2;
 	size_t		data1sz, data2sz;
-	boolean_t	vdir = vtype == VDIR;
-	boolean_t	vreg = vtype == VREG;
-	boolean_t	passthrough, passthrough_x, noallow;
-
-	passthrough_x =
-	    zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH_X;
-	passthrough = passthrough_x ||
-	    zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH;
-	noallow =
-	    zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW;
+	uint_t		aclinherit;
+	boolean_t	isdir = (vtype == VDIR);
 
-	*need_chmod = B_TRUE;
-	pacep = NULL;
 	aclp = zfs_acl_alloc(paclp->z_version);
-	if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD)
+	aclinherit = zfsvfs->z_acl_inherit;
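+	/* Symlinks never inherit ACEs; aclinherit=discard inherits nothing. */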
+	if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK)
 		return (aclp);
+
 	while (pacep = zfs_acl_next_ace(paclp, pacep, &who,
 	    &access_mask, &iflags, &type)) {
 
@@ -1727,31 +1505,31 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_
 		if (!zfs_acl_valid_ace_type(type, iflags))
 			continue;
 
-		if (noallow && type == ALLOW)
+		/*
+		 * Check if ACE is inheritable by this vnode
+		 */
+		if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) ||
+		    !zfs_ace_can_use(vtype, iflags))
 			continue;
 
-		ace_size = aclp->z_ops.ace_size(pacep);
-
-		if (!zfs_ace_can_use(vtype, iflags))
-			continue;
+		/*
+		 * Strip inherited execute permission from file if
+		 * not in mode
+		 */
+		if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW &&
+		    !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) {
+			access_mask &= ~ACE_EXECUTE;
+		}
 
 		/*
-		 * If owner@, group@, or everyone@ inheritable
-		 * then zfs_acl_chmod() isn't needed.
+		 * Strip write_acl and write_owner from permissions
+		 * when inheriting an ACE
 		 */
-		if (passthrough &&
-		    ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
-		    ((iflags & OWNING_GROUP) ==
-		    OWNING_GROUP)) && (vreg || (vdir && (iflags &
-		    ACE_DIRECTORY_INHERIT_ACE)))) {
-			*need_chmod = B_FALSE;
-
-			if (!vdir && passthrough_x &&
-			    ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) {
-				access_mask &= ~ACE_EXECUTE;
-			}
+		if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) {
+			access_mask &= ~RESTRICTED_CLEAR;
 		}
 
+		ace_size = aclp->z_ops.ace_size(pacep);
 		aclnode = zfs_acl_node_alloc(ace_size);
 		list_insert_tail(&aclp->z_acl, aclnode);
 		acep = aclnode->z_acldata;
@@ -1767,66 +1545,51 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_
 			    &data2)) == data1sz);
 			bcopy(data1, data2, data2sz);
 		}
+
 		aclp->z_acl_count++;
 		aclnode->z_ace_count++;
 		aclp->z_acl_bytes += aclnode->z_size;
 		newflags = aclp->z_ops.ace_flags_get(acep);
 
-		if (vdir)
-			aclp->z_hints |= ZFS_INHERIT_ACE;
-
-		if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) || !vdir) {
+		/*
+		 * If ACE is not to be inherited further, or if the vnode is
+		 * not a directory, remove all inheritance flags
+		 */
+		if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) {
 			newflags &= ~ALL_INHERIT;
 			aclp->z_ops.ace_flags_set(acep,
 			    newflags|ACE_INHERITED_ACE);
-			zfs_restricted_update(zfsvfs, aclp, acep);
 			continue;
 		}
 
-		ASSERT(vdir);
+		/*
+		 * This directory has an inheritable ACE
+		 */
+		aclp->z_hints |= ZFS_INHERIT_ACE;
 
-		newflags = aclp->z_ops.ace_flags_get(acep);
+		/*
+		 * If only FILE_INHERIT is set then turn on
+		 * inherit_only
+		 */
 		if ((iflags & (ACE_FILE_INHERIT_ACE |
-		    ACE_DIRECTORY_INHERIT_ACE)) !=
-		    ACE_FILE_INHERIT_ACE) {
-			aclnode2 = zfs_acl_node_alloc(ace_size);
-			list_insert_tail(&aclp->z_acl, aclnode2);
-			acep2 = aclnode2->z_acldata;
-			zfs_set_ace(aclp, acep2,
-			    access_mask, type, who,
-			    iflags|ACE_INHERITED_ACE);
+		    ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
 			newflags |= ACE_INHERIT_ONLY_ACE;
-			aclp->z_ops.ace_flags_set(acep, newflags);
-			newflags &= ~ALL_INHERIT;
-			aclp->z_ops.ace_flags_set(acep2,
+			aclp->z_ops.ace_flags_set(acep,
 			    newflags|ACE_INHERITED_ACE);
-
-			/*
-			 * Copy special opaque data if any
-			 */
-			if ((data1sz = aclp->z_ops.ace_data(acep,
-			    &data1)) != 0) {
-				VERIFY((data2sz =
-				    aclp->z_ops.ace_data(acep2,
-				    &data2)) == data1sz);
-				bcopy(data1, data2, data1sz);
-			}
-			aclp->z_acl_count++;
-			aclnode2->z_ace_count++;
-			aclp->z_acl_bytes += aclnode->z_size;
-			zfs_restricted_update(zfsvfs, aclp, acep2);
 		} else {
-			newflags |= ACE_INHERIT_ONLY_ACE;
+			newflags &= ~ACE_INHERIT_ONLY_ACE;
 			aclp->z_ops.ace_flags_set(acep,
 			    newflags|ACE_INHERITED_ACE);
 		}
 	}
+
 	return (aclp);
 }
 
 /*
  * Create file system object initial permissions
  * including inheritable ACEs.
+ * Also, create FUIDs for owner and group.
  */
 int
 zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
@@ -1836,8 +1599,12 @@ zfs_acl_ids_create(znode_t *dzp, int fla
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zfs_acl_t	*paclp;
 	gid_t		gid;
-	boolean_t	need_chmod = B_TRUE;
+	boolean_t	trim = B_FALSE;
+	boolean_t	inherited = B_FALSE;
 
+#ifndef __NetBSD__
+	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+#endif
 	bzero(acl_ids, sizeof (zfs_acl_ids_t));
 	acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
 
@@ -1845,7 +1612,6 @@ zfs_acl_ids_create(znode_t *dzp, int fla
 		if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
 		    &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
 			return (error);
-
 	/*
 	 * Determine uid and gid.
 	 */
@@ -1867,17 +1633,17 @@ zfs_acl_ids_create(znode_t *dzp, int fla
 			    (uint64_t)vap->va_gid,
 			    cr, ZFS_GROUP, &acl_ids->z_fuidp);
 			gid = vap->va_gid;
-			if (acl_ids->z_fgid != dzp->z_phys->zp_gid &&
+			if (acl_ids->z_fgid != dzp->z_gid &&
 			    !groupmember(vap->va_gid, cr) &&
 			    secpolicy_vnode_create_gid(cr) != 0)
 				acl_ids->z_fgid = 0;
 		}
 		if (acl_ids->z_fgid == 0) {
-			if (dzp->z_phys->zp_mode & S_ISGID) {
+			if (dzp->z_mode & S_ISGID) {
 				char		*domain;
 				uint32_t	rid;
 
-				acl_ids->z_fgid = dzp->z_phys->zp_gid;
+				acl_ids->z_fgid = dzp->z_gid;
 				gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
 				    cr, ZFS_GROUP);
 
@@ -1895,7 +1661,11 @@ zfs_acl_ids_create(znode_t *dzp, int fla
 			} else {
 				acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
 				    ZFS_GROUP, cr, &acl_ids->z_fuidp);
+#ifdef __FreeBSD_kernel__
+				gid = acl_ids->z_fgid = dzp->z_gid;
+#else
 				gid = crgetgid(cr);
+#endif
 			}
 		}
 	}
@@ -1907,36 +1677,48 @@ zfs_acl_ids_create(znode_t *dzp, int fla
 	 * file's new group, clear the file's set-GID bit.
 	 */
 
-	if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) &&
+	if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
 	    (vap->va_type == VDIR)) {
 		acl_ids->z_mode |= S_ISGID;
 	} else {
 		if ((acl_ids->z_mode & S_ISGID) &&
-		    secpolicy_vnode_setids_setgids(cr, gid) != 0)
+		    secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0)
 			acl_ids->z_mode &= ~S_ISGID;
 	}
 
 	if (acl_ids->z_aclp == NULL) {
-		mutex_enter(&dzp->z_lock);
-		if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR &&
-		    (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) &&
-		    !(dzp->z_phys->zp_flags & ZFS_XATTR)) {
-			mutex_enter(&dzp->z_acl_lock);
+		mutex_enter(&dzp->z_acl_lock);
+		if (!(flag & IS_ROOT_NODE) &&
+		    (dzp->z_pflags & ZFS_INHERIT_ACE) &&
+		    !(dzp->z_pflags & ZFS_XATTR)) {
 			VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
-			mutex_exit(&dzp->z_acl_lock);
 			acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
-			    vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
+			    vap->va_type, paclp, acl_ids->z_mode);
+			inherited = B_TRUE;
 		} else {
 			acl_ids->z_aclp =
 			    zfs_acl_alloc(zfs_acl_version_zp(dzp));
+			acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
 		}
-		mutex_exit(&dzp->z_lock);
-		if (need_chmod) {
-			acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ?
-			    ZFS_ACL_AUTO_INHERIT : 0;
-			zfs_acl_chmod(zfsvfs, acl_ids->z_fuid,
-			    acl_ids->z_mode, acl_ids->z_aclp);
-		}
+		mutex_exit(&dzp->z_acl_lock);
+
+		if (vap->va_type == VDIR)
+			acl_ids->z_aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
+
+		if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK &&
+		    zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH &&
+		    zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X)
+			trim = B_TRUE;
+		zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, trim,
+		    acl_ids->z_aclp);
+	}
+
+	if (inherited || vsecp) {
+		acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+		    acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
+		    acl_ids->z_fuid, acl_ids->z_fgid);
+		if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+			acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
 	}
 
 	return (0);
@@ -1959,12 +1741,12 @@ zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
 boolean_t
 zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids)
 {
-	return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
-	    zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
+	return (zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) ||
+	    zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid));
 }
 
 /*
- * Retrieve a files ACL
+ * Retrieve a file's ACL
  */
 int
 zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
@@ -1978,14 +1760,15 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsec
 	mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
 	    VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
 
+	if (mask == 0)
+		return (SET_ERROR(ENOSYS));
+
 	if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))
 		return (error);
 
-	if (mask == 0)
-		return (ENOSYS);
-
 	mutex_enter(&zp->z_acl_lock);
 
+	ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
 	error = zfs_acl_node_read(zp, &aclp, B_FALSE);
 	if (error != 0) {
 		mutex_exit(&zp->z_acl_lock);
@@ -1995,8 +1778,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsec
 	/*
 	 * Scan ACL to determine number of ACEs
 	 */
-	if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) &&
-	    !(mask & VSA_ACE_ALLTYPES)) {
+	if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
 		void *zacep = NULL;
 		uint64_t who;
 		uint32_t access_mask;
@@ -2017,7 +1799,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsec
 		}
 		vsecp->vsa_aclcnt = count;
 	} else
-		count = aclp->z_acl_count;
+		count = (int)aclp->z_acl_count;
 
 	if (mask & VSA_ACECNT) {
 		vsecp->vsa_aclcnt = count;
@@ -2051,11 +1833,11 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsec
 	}
 	if (mask & VSA_ACE_ACLFLAGS) {
 		vsecp->vsa_aclflags = 0;
-		if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED)
+		if (zp->z_pflags & ZFS_ACL_DEFAULTED)
 			vsecp->vsa_aclflags |= ACL_DEFAULTED;
-		if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED)
+		if (zp->z_pflags & ZFS_ACL_PROTECTED)
 			vsecp->vsa_aclflags |= ACL_PROTECTED;
-		if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT)
+		if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
 			vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
 	}
 
@@ -2074,7 +1856,7 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_
 	int error;
 
 	if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
 
@@ -2120,7 +1902,7 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_
 }
 
 /*
- * Set a files ACL
+ * Set a file's ACL
  */
 int
 zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
@@ -2133,12 +1915,14 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsec
 	zfs_acl_t	*aclp;
 	zfs_fuid_info_t	*fuidp = NULL;
 	boolean_t	fuid_dirtied;
+	uint64_t	acl_obj;
 
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 	if (mask == 0)
-		return (ENOSYS);
+		return (SET_ERROR(ENOSYS));
 
-	if (zp->z_phys->zp_flags & ZFS_IMMUTABLE)
-		return (EPERM);
+	if (zp->z_pflags & ZFS_IMMUTABLE)
+		return (SET_ERROR(EPERM));
 
 	if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))
 		return (error);
@@ -2153,41 +1937,43 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsec
 	 * existing flags.
 	 */
 	if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
-		aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
+		aclp->z_hints |=
+		    (zp->z_pflags & V4_ACL_WIDE_FLAGS);
 	}
 top:
-	mutex_enter(&zp->z_lock);
 	mutex_enter(&zp->z_acl_lock);
 
 	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_bonus(tx, zp->z_id);
 
-	if (zp->z_phys->zp_acl.z_acl_extern_obj) {
-		/* Are we upgrading ACL? */
-		if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
-		    zp->z_phys->zp_acl.z_acl_version ==
-		    ZFS_ACL_VERSION_INITIAL) {
-			dmu_tx_hold_free(tx,
-			    zp->z_phys->zp_acl.z_acl_extern_obj,
-			    0, DMU_OBJECT_END);
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
-			    0, aclp->z_acl_bytes);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+
+	/*
+	 * If old version and ACL won't fit in bonus and we aren't
+	 * upgrading then take out necessary DMU holds
+	 */
+
+	if ((acl_obj = zfs_external_acl(zp)) != 0) {
+		if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+		    zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+			dmu_tx_hold_free(tx, acl_obj, 0,
+			    DMU_OBJECT_END);
+			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+			    aclp->z_acl_bytes);
 		} else {
-			dmu_tx_hold_write(tx,
-			    zp->z_phys->zp_acl.z_acl_extern_obj,
-			    0, aclp->z_acl_bytes);
+			dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
 		}
-	} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+	} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
 	}
-	fuid_dirtied = zfsvfs->z_fuid_dirty;
-	if (fuid_dirtied)
-		zfs_fuid_txhold(zfsvfs, tx);
 
+	zfs_sa_upgrade_txholds(tx, zp);
 	error = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (error) {
 		mutex_exit(&zp->z_acl_lock);
-		mutex_exit(&zp->z_lock);
 
 		if (error == ERESTART) {
 			dmu_tx_wait(tx);
@@ -2201,20 +1987,18 @@ top:
 
 	error = zfs_aclset_common(zp, aclp, cr, tx);
 	ASSERT(error == 0);
+	ASSERT(zp->z_acl_cached == NULL);
 	zp->z_acl_cached = aclp;
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
-	zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
 	zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
 
 	if (fuidp)
 		zfs_fuid_info_free(fuidp);
 	dmu_tx_commit(tx);
-done:
 	mutex_exit(&zp->z_acl_lock);
-	mutex_exit(&zp->z_lock);
 
 	return (error);
 }
@@ -2231,7 +2015,7 @@ zfs_zaccess_dataset_check(znode_t *zp, u
 	    (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
 	    (!IS_DEVVP(ZTOV(zp)) ||
 	    (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) {
-		return (EROFS);
+		return (SET_ERROR(EROFS));
 	}
 
 	/*
@@ -2239,20 +2023,32 @@ zfs_zaccess_dataset_check(znode_t *zp, u
 	 */
 	if ((v4_mode & WRITE_MASK_DATA) &&
 	    (((ZTOV(zp)->v_type != VDIR) &&
-	    (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
+	    (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
 	    (ZTOV(zp)->v_type == VDIR &&
-	    (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) {
-		return (EPERM);
+	    (zp->z_pflags & ZFS_IMMUTABLE)))) {
+		return (SET_ERROR(EPERM));
 	}
 
+#ifdef illumos
 	if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
-	    (zp->z_phys->zp_flags & ZFS_NOUNLINK)) {
+	    (zp->z_pflags & ZFS_NOUNLINK)) {
+		return (SET_ERROR(EPERM));
+	}
+#else
+	/*
+	 * In FreeBSD we allow modifying a directory's contents if
+	 * ZFS_NOUNLINK (sunlnk) is set; we just don't allow directory
+	 * removal, which is handled in zfs_zaccess_delete().
+	 */
+	if ((v4_mode & ACE_DELETE) &&
+	    (zp->z_pflags & ZFS_NOUNLINK)) {
 		return (EPERM);
 	}
+#endif
 
 	if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
-	    (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) {
-		return (EACCES);
+	    (zp->z_pflags & ZFS_AV_QUARANTINED))) {
+		return (SET_ERROR(EACCES));
 	}
 
 	return (0);
@@ -2298,19 +2094,22 @@ zfs_zaccess_aces_check(znode_t *zp, uint
 	uint32_t	deny_mask = 0;
 	zfs_ace_hdr_t	*acep = NULL;
 	boolean_t	checkit;
-	uid_t		fowner;
 	uid_t		gowner;
+	uid_t		fowner;
 
 	zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
 
 	mutex_enter(&zp->z_acl_lock);
 
+	ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
 	error = zfs_acl_node_read(zp, &aclp, B_FALSE);
 	if (error != 0) {
 		mutex_exit(&zp->z_acl_lock);
 		return (error);
 	}
 
+	ASSERT(zp->z_acl_cached);
+
 	while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
 	    &iflags, &type)) {
 		uint32_t mask_matched;
@@ -2358,7 +2157,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint
 				break;
 			} else {
 				mutex_exit(&zp->z_acl_lock);
-				return (EIO);
+				return (SET_ERROR(EIO));
 			}
 		}
 
@@ -2392,7 +2191,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint
 	/* Put the found 'denies' back on the working mode */
 	if (deny_mask) {
 		*working_mode |= deny_mask;
-		return (EACCES);
+		return (SET_ERROR(EACCES));
 	} else if (*working_mode) {
 		return (-1);
 	}
@@ -2410,18 +2209,10 @@ zfs_has_access(znode_t *zp, cred_t *cr)
 	uint32_t have = ACE_ALL_PERMS;
 
 	if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
-		uid_t		owner;
-
-		owner = zfs_fuid_map_id(zp->z_zfsvfs,
-		    zp->z_phys->zp_uid, cr, ZFS_OWNER);
+		uid_t owner;
 
-		return (
-		    secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 ||
-		    secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 ||
-		    secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 ||
-		    secpolicy_vnode_chown(cr, owner) == 0 ||
-		    secpolicy_vnode_setdac(cr, owner) == 0 ||
-		    secpolicy_vnode_remove(cr) == 0);
+		owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+		return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0);
 	}
 	return (B_TRUE);
 }
@@ -2467,7 +2258,7 @@ zfs_zaccess_append(znode_t *zp, uint32_t
     cred_t *cr)
 {
 	if (*working_mode != ACE_WRITE_DATA)
-		return (EACCES);
+		return (SET_ERROR(EACCES));
 
 	return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
 	    check_privs, B_FALSE, cr));
@@ -2479,38 +2270,33 @@ zfs_fastaccesschk_execute(znode_t *zdp, 
 	boolean_t owner = B_FALSE;
 	boolean_t groupmbr = B_FALSE;
 	boolean_t is_attr;
-	uid_t fowner;
-	uid_t gowner;
 	uid_t uid = crgetuid(cr);
 	int error;
 
-	if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
-		return (EACCES);
+	if (zdp->z_pflags & ZFS_AV_QUARANTINED)
+		return (SET_ERROR(EACCES));
 
-	is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) &&
+	is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
 	    (ZTOV(zdp)->v_type == VDIR));
 	if (is_attr)
 		goto slow;
 
 	mutex_enter(&zdp->z_acl_lock);
 
-	if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) {
+	if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) {
 		mutex_exit(&zdp->z_acl_lock);
 		return (0);
 	}
 
-	if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 ||
-	    FUID_INDEX(zdp->z_phys->zp_gid) != 0) {
+	if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) {
 		mutex_exit(&zdp->z_acl_lock);
 		goto slow;
 	}
 
-	fowner = (uid_t)zdp->z_phys->zp_uid;
-	gowner = (uid_t)zdp->z_phys->zp_gid;
-
-	if (uid == fowner) {
+	if (uid == zdp->z_uid) {
 		owner = B_TRUE;
-		if (zdp->z_phys->zp_mode & S_IXUSR) {
+		if (zdp->z_mode & S_IXUSR) {
 			mutex_exit(&zdp->z_acl_lock);
 			return (0);
 		} else {
@@ -2518,9 +2304,9 @@ zfs_fastaccesschk_execute(znode_t *zdp, 
 			goto slow;
 		}
 	}
-	if (groupmember(gowner, cr)) {
+	if (groupmember(zdp->z_gid, cr)) {
 		groupmbr = B_TRUE;
-		if (zdp->z_phys->zp_mode & S_IXGRP) {
+		if (zdp->z_mode & S_IXGRP) {
 			mutex_exit(&zdp->z_acl_lock);
 			return (0);
 		} else {
@@ -2529,7 +2315,7 @@ zfs_fastaccesschk_execute(znode_t *zdp, 
 		}
 	}
 	if (!owner && !groupmbr) {
-		if (zdp->z_phys->zp_mode & S_IXOTH) {
+		if (zdp->z_mode & S_IXOTH) {
 			mutex_exit(&zdp->z_acl_lock);
 			return (0);
 		}
@@ -2546,8 +2332,10 @@ slow:
 }
 
 /*
- * Determine whether Access should be granted/denied, invoking least
- * priv subsytem when a deny is determined.
+ * Determine whether Access should be granted/denied.
+ *
+ * The least-privilege subsystem is always consulted, as a basic
+ * privilege can define any form of access.
  */
 int
 zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
@@ -2555,20 +2343,36 @@ zfs_zaccess(znode_t *zp, int mode, int f
 	uint32_t	working_mode;
 	int		error;
 	int		is_attr;
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	boolean_t 	check_privs;
 	znode_t		*xzp;
 	znode_t 	*check_zp = zp;
+	mode_t		needed_bits;
+	uid_t		owner;
 
-	is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) &&
-	    (ZTOV(zp)->v_type == VDIR));
+	is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
 
+#ifndef illumos
+	/*
+	 * In FreeBSD, we don't care about permissions of individual ADS.
+	 * Note that not checking them is not just an optimization - without
+	 * this shortcut, EA operations may bogusly fail with EACCES.
+	 */
+	if (zp->z_pflags & ZFS_XATTR)
+		return (0);
+#else
 	/*
 	 * If attribute then validate against base file
 	 */
 	if (is_attr) {
+		uint64_t	parent;
+
+		if ((error = sa_lookup(zp->z_sa_hdl,
+		    SA_ZPL_PARENT(zp->z_zfsvfs), &parent,
+		    sizeof (parent))) != 0)
+			return (error);
+
 		if ((error = zfs_zget(zp->z_zfsvfs,
-		    zp->z_phys->zp_parent, &xzp)) != 0)	{
+		    parent, &xzp)) != 0)	{
 			return (error);
 		}
 
@@ -2588,12 +2392,38 @@ zfs_zaccess(znode_t *zp, int mode, int f
 			mode |= ACE_READ_NAMED_ATTRS;
 		}
 	}
+#endif
+
+	owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+	/*
+	 * Map the requested v4 bits to the standard vnode flags
+	 * VREAD|VWRITE|VEXEC in needed_bits.  Bits still missing after
+	 * the ACL walk (working_mode) are mapped the same way into
+	 * checkmode below, and secpolicy_vnode_access2() is called with
+	 * (needed_bits & ~checkmode, needed_bits).
+	 */
+	needed_bits = 0;
+
+	working_mode = mode;
+	if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+	    owner == crgetuid(cr))
+		working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+	if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+	    ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+		needed_bits |= VREAD;
+	if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+	    ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+		needed_bits |= VWRITE;
+	if (working_mode & ACE_EXECUTE)
+		needed_bits |= VEXEC;
 
 	if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
 	    &check_privs, skipaclchk, cr)) == 0) {
 		if (is_attr)
 			VN_RELE(ZTOV(xzp));
-		return (0);
+		return (secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+		    needed_bits, needed_bits));
 	}
 
 	if (error && !check_privs) {
@@ -2607,12 +2437,8 @@ zfs_zaccess(znode_t *zp, int mode, int f
 	}
 
 	if (error && check_privs) {
-		uid_t		owner;
 		mode_t		checkmode = 0;
 
-		owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr,
-		    ZFS_OWNER);
-
 		/*
 		 * First check for implicit owner permission on
 		 * read_acl/read_attributes
@@ -2634,21 +2460,20 @@ zfs_zaccess(znode_t *zp, int mode, int f
 		if (working_mode & ACE_EXECUTE)
 			checkmode |= VEXEC;
 
-		if (checkmode)
-			error = secpolicy_vnode_access(cr, ZTOV(check_zp),
-			    owner, checkmode);
+		error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner,
+		    needed_bits & ~checkmode, needed_bits);
 
 		if (error == 0 && (working_mode & ACE_WRITE_OWNER))
-			error = secpolicy_vnode_chown(cr, owner);
+			error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner);
 		if (error == 0 && (working_mode & ACE_WRITE_ACL))
-			error = secpolicy_vnode_setdac(cr, owner);
+			error = secpolicy_vnode_setdac(ZTOV(check_zp), cr, owner);
 
 		if (error == 0 && (working_mode &
 		    (ACE_DELETE|ACE_DELETE_CHILD)))
-			error = secpolicy_vnode_remove(cr);
+			error = secpolicy_vnode_remove(ZTOV(check_zp), cr);
 
 		if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
-			error = secpolicy_vnode_chown(cr, owner);
+			error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner);
 		}
 		if (error == 0) {
 			/*
@@ -2656,11 +2481,15 @@ zfs_zaccess(znode_t *zp, int mode, int f
 			 * for are still present.  If so then return EACCES
 			 */
 			if (working_mode & ~(ZFS_CHECKED_MASKS)) {
-				error = EACCES;
+				error = SET_ERROR(EACCES);
 			}
 		}
+	} else if (error == 0) {
+		error = secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+		    needed_bits, needed_bits);
 	}
 
 	if (is_attr)
 		VN_RELE(ZTOV(xzp));
 
@@ -2690,15 +2519,15 @@ zfs_zaccess_unix(znode_t *zp, mode_t mod
 
 static int
 zfs_delete_final_check(znode_t *zp, znode_t *dzp,
-    mode_t missing_perms, cred_t *cr)
+    mode_t available_perms, cred_t *cr)
 {
 	int error;
 	uid_t downer;
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
-	downer = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr, ZFS_OWNER);
+	downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER);
 
-	error = secpolicy_vnode_access(cr, ZTOV(dzp), downer, missing_perms);
+	error = secpolicy_vnode_access2(cr, ZTOV(dzp),
+	    downer, available_perms, VWRITE|VEXEC);
 
 	if (error == 0)
 		error = zfs_sticky_remove_access(dzp, zp, cr);
@@ -2710,7 +2539,6 @@ zfs_delete_final_check(znode_t *zp, znod
  * Determine whether Access should be granted/deny, without
  * consulting least priv subsystem.
  *
- *
  * The following chart is the recommended NFSv4 enforcement for
  * ability to delete an object.
  *
@@ -2747,7 +2575,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t
 	uint32_t dzp_working_mode = 0;
 	uint32_t zp_working_mode = 0;
 	int dzp_error, zp_error;
-	mode_t missing_perms;
+	mode_t available_perms;
 	boolean_t dzpcheck_privs = B_TRUE;
 	boolean_t zpcheck_privs = B_TRUE;
 
@@ -2765,8 +2593,8 @@ zfs_zaccess_delete(znode_t *dzp, znode_t
 	 * to determine what was found.
 	 */
 
-	if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
-		return (EPERM);
+	if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+		return (SET_ERROR(EPERM));
 
 	/*
 	 * First row
@@ -2801,30 +2629,27 @@ zfs_zaccess_delete(znode_t *dzp, znode_t
 	 */
 
 	if (dzp_error == EACCES)
-		return (secpolicy_vnode_remove(cr));
+		return (secpolicy_vnode_remove(ZTOV(dzp), cr));	/* XXXPJD: s/dzp/zp/ ? */
 
 	/*
 	 * Third Row
 	 * only need to see if we have write/execute on directory.
 	 */
 
-	if ((dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
-	    &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
-		return (zfs_sticky_remove_access(dzp, zp, cr));
+	dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
+	    &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
 
-	if (!dzpcheck_privs)
+	if (dzp_error != 0 && !dzpcheck_privs)
 		return (dzp_error);
 
 	/*
 	 * Fourth row
 	 */
 
-	missing_perms = (dzp_working_mode & ACE_WRITE_DATA) ? VWRITE : 0;
-	missing_perms |= (dzp_working_mode & ACE_EXECUTE) ? VEXEC : 0;
-
-	ASSERT(missing_perms);
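+	/*
+	 * available_perms holds the bits the ACE walk above already
+	 * granted; zfs_delete_final_check() asks the least-priv
+	 * subsystem only for the remainder.
+	 */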
+	available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE;
+	available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC;
 
-	return (zfs_delete_final_check(zp, dzp, missing_perms, cr));
+	return (zfs_delete_final_check(zp, dzp, available_perms, cr));
 
 }
 
@@ -2835,8 +2660,8 @@ zfs_zaccess_rename(znode_t *sdzp, znode_
 	int add_perm;
 	int error;
 
-	if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
-		return (EACCES);
+	if (szp->z_pflags & ZFS_AV_QUARANTINED)
+		return (SET_ERROR(EACCES));
 
 	add_perm = (ZTOV(szp)->v_type == VDIR) ?
 	    ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
@@ -2844,7 +2669,15 @@ zfs_zaccess_rename(znode_t *sdzp, znode_
 	/*
 	 * Rename permissions are combination of delete permission +
 	 * add file/subdir permission.
+	 *
+	 * BSD operating systems also require write permission
+	 * on the directory being moved from one parent directory
+	 * to another.
 	 */
+	if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) {
+		if (error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr))
+			return (error);
+	}
 
 	/*
 	 * first make sure we do the delete portion.
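
For reference, the rewritten zfs_acl_chmod() above always emits its entries in
a fixed order: the abstract allow0/deny1/deny2 entries first, then any ACEs
copied from the old ACL, then the trailing owner@/group@/everyone@ allows.
The following is a minimal standalone sketch of the mask derivation only; it
collapses each rwx triplet to three bits instead of the real trivial_acl_t
and ACE_* masks, so all names and the sample mode are illustrative, not the
ZFS implementation:

/*
 * Build-and-run sketch: derive the abstract entry masks that
 * acl_trivial_access_masks() computes for a POSIX mode, using
 * plain rwx bits in place of the real ACE_* permission sets.
 */
#include <stdio.h>

int
main(void)
{
	unsigned mode = 0645;			/* example mode */
	unsigned owner = (mode >> 6) & 07;
	unsigned group = (mode >> 3) & 07;
	unsigned everyone = mode & 07;

	/* allow0: bits the owner has but group lacks while everyone has */
	unsigned allow0 = owner & ~group & everyone;
	/* deny1: bits the owner lacks that group or everyone would grant */
	unsigned deny1 = (group | everyone) & ~owner;
	/* deny2: bits the group lacks that everyone would grant */
	unsigned deny2 = everyone & ~group;

	printf("allow0 (owner@ ALLOW) %o\n", allow0);
	printf("deny1  (owner@ DENY)  %o\n", deny1);
	printf("deny2  (group@ DENY)  %o\n", deny2);
	printf("trailing allows: owner@ %o group@ %o everyone@ %o\n",
	    owner, group, everyone);
	return (0);
}

With mode 0645 this prints deny1 = deny2 = 01: execute is granted to
everyone@ but not to owner or group, so matching DENY entries must precede
the everyone@ allow.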
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_byteswap.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c	27 Feb 2010 22:31:18 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c	23 Mar 2013 15:29:23 -0000
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -27,6 +27,7 @@
 #include <sys/vfs.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
 #include <sys/zfs_acl.h>
 
 void
@@ -50,7 +51,7 @@ zfs_ace_byteswap(void *buf, size_t size,
 {
 	caddr_t end;
 	caddr_t ptr;
-	zfs_ace_t *zacep;
+	zfs_ace_t *zacep = NULL;
 	ace_t *acep;
 	uint16_t entry_type;
 	size_t entry_size;
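
The only functional change here is initializing zacep, which is otherwise
assigned only under a version check; the walk itself advances a byte cursor
by each entry's own size.  Below is a minimal sketch of that walk pattern
over a hypothetical record layout; rec_hdr_t and BSWAP_16 are stand-ins, not
the ZFS ACE wire format, and the sketch assumes a foreign-endian buffer
being swapped to native order (the real function also handles the opposite
direction):

#include <stdint.h>
#include <stddef.h>

/* Hypothetical wire header: every record starts with a type and size. */
typedef struct rec_hdr {
	uint16_t rh_type;
	uint16_t rh_size;	/* total record size, in bytes */
} rec_hdr_t;

#define	BSWAP_16(x)	((uint16_t)(((x) << 8) | ((x) >> 8)))

/* Swap every header in a buffer of variable-size records. */
void
rec_byteswap(void *buf, size_t size)
{
	char *ptr = buf;
	char *end = ptr + size;

	while (ptr + sizeof (rec_hdr_t) <= end) {
		rec_hdr_t *hdr = (rec_hdr_t *)(void *)ptr;

		hdr->rh_type = BSWAP_16(hdr->rh_type);
		hdr->rh_size = BSWAP_16(hdr->rh_size);
		if (hdr->rh_size < sizeof (rec_hdr_t))
			break;		/* corrupt input: stop walking */
		ptr += hdr->rh_size;	/* native order after the swap */
	}
}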
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c,v
retrieving revision 1.2
diff -u -p -r1.2 zfs_ctldir.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c	14 Dec 2010 01:28:18 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c	11 May 2017 23:15:03 -0000
@@ -19,8 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
  */
 
 /*
@@ -64,95 +65,248 @@
  * so that it cannot be freed until all snapshots have been unmounted.
  */
 
-#include <fs/fs_subr.h>
+#ifdef __FreeBSD__
+
+#include <sys/zfs_context.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_vfsops.h>
-#include <sys/vfs_opreg.h>
-#include <sys/gfs.h>
+#include <sys/namei.h>
 #include <sys/stat.h>
 #include <sys/dmu.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
 #include <sys/dsl_deleg.h>
 #include <sys/mount.h>
-#include <sys/sunddi.h>
+#include <sys/zap.h>
 
 #include "zfs_namecheck.h"
 
-typedef struct zfsctl_node {
-	gfs_dir_t	zc_gfs_private;
-	uint64_t	zc_id;
-	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
-} zfsctl_node_t;
-
-typedef struct zfsctl_snapdir {
-	zfsctl_node_t	sd_node;
-	kmutex_t	sd_lock;
-	avl_tree_t	sd_snaps;
-} zfsctl_snapdir_t;
+/*
+ * "Synthetic" filesystem implementation.
+ */
 
-typedef struct {
-	char		*se_name;
-	vnode_t		*se_root;
-	avl_node_t	se_node;
-} zfs_snapentry_t;
-
-static int
-snapentry_compare(const void *a, const void *b)
-{
-	const zfs_snapentry_t *sa = a;
-	const zfs_snapentry_t *sb = b;
-	int ret = strcmp(sa->se_name, sb->se_name);
-
-	if (ret < 0)
-		return (-1);
-	else if (ret > 0)
-		return (1);
-	else
-		return (0);
-}
-
-//vnodeops_t *zfsctl_ops_shares_dir;
-
-static struct vnodeopv_entry_desc zfsctl_ops_root;
-static struct vnodeopv_entry_desc zfsctl_ops_snapdir;
-static struct vnodeopv_entry_desc zfsctl_ops_snapshot;
-static struct vnodeopv_entry_desc zfsctl_ops_shares;
-
-
-static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
-static vnode_t *zfsctl_mknode_shares(vnode_t *);
-static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
-static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
-
-static gfs_opsvec_t zfsctl_opsvec[] = {
-	{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
-	{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
-	{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
-	{ ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir },
-	{ ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares },
-	{ NULL }
-};
+/*
+ * Assert that A implies B.
+ */
+#define KASSERT_IMPLY(A, B, msg)	KASSERT(!(A) || (B), (msg))
+
+static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes");
+
+typedef struct sfs_node {
+	char		sn_name[ZFS_MAX_DATASET_NAME_LEN];
+	uint64_t	sn_parent_id;
+	uint64_t	sn_id;
+} sfs_node_t;
 
 /*
- * Root directory elements.  We only have two entries
- * snapshot and shares.
+ * Check the parent's ID as well as the node's to account for the chance
+ * that IDs originating from different domains (snapshot IDs, artificial
+ * IDs, znode IDs) may clash.
  */
-static gfs_dirent_t zfsctl_root_entries[] = {
-	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
-	{ "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE },
-	{ NULL }
-};
+static int
+sfs_compare_ids(struct vnode *vp, void *arg)
+{
+	sfs_node_t *n1 = vp->v_data;
+	sfs_node_t *n2 = arg;
+	bool equal;
+
+	equal = n1->sn_id == n2->sn_id &&
+	    n1->sn_parent_id == n2->sn_parent_id;
+
+	/* Zero means equality. */
+	return (!equal);
+}
 
-/* include . and .. in the calculation */
-#define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
-    sizeof (gfs_dirent_t)) + 1)
+static int
+sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id,
+   uint64_t id, struct vnode **vpp)
+{
+	sfs_node_t search;
+	int err;
+
+	search.sn_id = id;
+	search.sn_parent_id = parent_id;
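+	/*
+	 * Only the low bits of the id form the hash key; the compare
+	 * callback checks the full id and the parent id to resolve
+	 * any collisions.
+	 */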
+	err = vfs_hash_get(mp, (u_int)id, flags, curthread, vpp,
+	    sfs_compare_ids, &search);
+	return (err);
+}
+
+static int
+sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id,
+   uint64_t id, struct vnode **vpp)
+{
+	int err;
+
+	KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data"));
+	err = vfs_hash_insert(vp, (u_int)id, flags, curthread, vpp,
+	    sfs_compare_ids, vp->v_data);
+	return (err);
+}
+
+static void
+sfs_vnode_remove(struct vnode *vp)
+{
+	vfs_hash_remove(vp);
+}
+
+typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg);
+
+static int
+sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id,
+    const char *tag, struct vop_vector *vops,
+    sfs_vnode_setup_fn setup, void *arg,
+    struct vnode **vpp)
+{
+	struct vnode *vp;
+	int error;
+
+	error = sfs_vnode_get(mp, flags, parent_id, id, vpp);
+	if (error != 0 || *vpp != NULL) {
+		KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+		    "sfs vnode with no data");
+		return (error);
+	}
+
+	/* Allocate a new vnode/inode. */
+	error = getnewvnode(tag, mp, vops, &vp);
+	if (error != 0) {
+		*vpp = NULL;
+		return (error);
+	}
+
+	/*
+	 * Exclusively lock the vnode while it's being constructed.
+	 */
+	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
+	error = insmntque(vp, mp);
+	if (error != 0) {
+		*vpp = NULL;
+		return (error);
+	}
+
+	setup(vp, arg);
+
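+	/*
+	 * Publish the vnode in the VFS hash.  If another thread raced
+	 * us and inserted first, its vnode comes back in *vpp and the
+	 * one we just constructed is released.
+	 */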
+	error = sfs_vnode_insert(vp, flags, parent_id, id, vpp);
+	if (error != 0 || *vpp != NULL) {
+		KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+		    "sfs vnode with no data");
+		return (error);
+	}
+
+	*vpp = vp;
+	return (0);
+}
+
+static void
+sfs_print_node(sfs_node_t *node)
+{
+	printf("\tname = %s\n", node->sn_name);
+	printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id);
+	printf("\tid = %ju\n", (uintmax_t)node->sn_id);
+}
+
+static sfs_node_t *
+sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id)
+{
+	struct sfs_node *node;
+
+	KASSERT(strlen(name) < sizeof(node->sn_name),
+	    ("sfs node name is too long"));
+	KASSERT(size >= sizeof(*node), ("sfs node size is too small"));
+	node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO);
+	strlcpy(node->sn_name, name, sizeof(node->sn_name));
+	node->sn_parent_id = parent_id;
+	node->sn_id = id;
+
+	return (node);
+}
+
+static void
+sfs_destroy_node(sfs_node_t *node)
+{
+	free(node, M_SFSNODES);
+}
+
+static void *
+sfs_reclaim_vnode(vnode_t *vp)
+{
+	sfs_node_t *node;
+	void *data;
+
+	sfs_vnode_remove(vp);
+	data = vp->v_data;
+	vp->v_data = NULL;
+	return (data);
+}
+
+static int
+sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap,
+    uio_t *uio, off_t *offp)
+{
+	struct dirent entry;
+	int error;
+
+	/* Reset ncookies for subsequent use of vfs_read_dirent. */
+	if (ap->a_ncookies != NULL)
+		*ap->a_ncookies = 0;
+
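+	/*
+	 * Offsets are fixed: 0 holds ".", sizeof(entry) holds "..", and
+	 * the caller's real entries begin at 2 * sizeof(entry), which
+	 * is handed back through *offp.
+	 */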
+	if (uio->uio_resid < sizeof(entry))
+		return (SET_ERROR(EINVAL));
+
+	if (uio->uio_offset < 0)
+		return (SET_ERROR(EINVAL));
+	if (uio->uio_offset == 0) {
+		entry.d_fileno = id;
+		entry.d_type = DT_DIR;
+		entry.d_name[0] = '.';
+		entry.d_name[1] = '\0';
+		entry.d_namlen = 1;
+		entry.d_reclen = sizeof(entry);
+		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+		if (error != 0)
+			return (SET_ERROR(error));
+	}
+
+	if (uio->uio_offset < sizeof(entry))
+		return (SET_ERROR(EINVAL));
+	if (uio->uio_offset == sizeof(entry)) {
+		entry.d_fileno = parent_id;
+		entry.d_type = DT_DIR;
+		entry.d_name[0] = '.';
+		entry.d_name[1] = '.';
+		entry.d_name[2] = '\0';
+		entry.d_namlen = 2;
+		entry.d_reclen = sizeof(entry);
+		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+		if (error != 0)
+			return (SET_ERROR(error));
+	}
+
+	if (offp != NULL)
+		*offp = 2 * sizeof(entry);
+	return (0);
+}
 
 
 /*
- * Initialize the various GFS pieces we'll need to create and manipulate .zfs
- * directories.  This is called from the ZFS init routine, and initializes the
- * vnode ops vectors that we'll be using.
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem.  We use the following scheme:
+ *
+ * 	ENTRY			ZFSCTL_INODE
+ * 	.zfs			1
+ * 	.zfs/snapshot		2
+ * 	.zfs/snapshot/<snap>	objectid(snap)
  */
+#define	ZFSCTL_INO_SNAP(id)	(id)
+
+static struct vop_vector zfsctl_ops_root;
+static struct vop_vector zfsctl_ops_snapdir;
+static struct vop_vector zfsctl_ops_snapshot;
+static struct vop_vector zfsctl_ops_shares_dir;
+
 void
 zfsctl_init(void)
 {
@@ -163,80 +317,120 @@ zfsctl_fini(void)
 {
 }
 
-/*
- * Return the inode number associated with the 'snapshot' or
- * 'shares' directory.
- */
-/* ARGSUSED */
-static ino64_t
-zfsctl_root_inode_cb(vnode_t *vp, int index)
+boolean_t
+zfsctl_is_node(vnode_t *vp)
 {
-	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	return (vn_matchops(vp, zfsctl_ops_root) ||
+	    vn_matchops(vp, zfsctl_ops_snapdir) ||
+	    vn_matchops(vp, zfsctl_ops_snapshot) ||
+	    vn_matchops(vp, zfsctl_ops_shares_dir));
 
-	ASSERT(index <= 2);
+}
 
-	if (index == 0)
-		return (ZFSCTL_INO_SNAPDIR);
+typedef struct zfsctl_root {
+	sfs_node_t	node;
+	sfs_node_t	*snapdir;
+	timestruc_t	cmtime;
+} zfsctl_root_t;
 
-	return (zfsvfs->z_shares_dir);
-}
 
 /*
- * Create the '.zfs' directory.  This directory is cached as part of the VFS
- * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
- * therefore checks against a vfs_count of 2 instead of 1.  This reference
- * is removed when the ctldir is destroyed in the unmount.
+ * Create the '.zfs' directory.
  */
 void
 zfsctl_create(zfsvfs_t *zfsvfs)
 {
-	vnode_t *vp, *rvp;
-	zfsctl_node_t *zcp;
+	zfsctl_root_t *dot_zfs;
+	sfs_node_t *snapdir;
+	vnode_t *rvp;
+	uint64_t crtime[2];
 
 	ASSERT(zfsvfs->z_ctldir == NULL);
 
-	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
-	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
-	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
-	zcp = vp->v_data;
-	zcp->zc_id = ZFSCTL_INO_ROOT;
-
-	VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
-	ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
-	VN_RELE(rvp);
+	snapdir = sfs_alloc_node(sizeof(*snapdir), "snapshot", ZFSCTL_INO_ROOT,
+	    ZFSCTL_INO_SNAPDIR);
+	dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof(*dot_zfs), ".zfs", 0,
+	    ZFSCTL_INO_ROOT);
+	dot_zfs->snapdir = snapdir;
+
+	VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
+	VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+	    &crtime, sizeof(crtime)));
+	ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
+	vput(rvp);
 
-	/*
-	 * We're only faking the fact that we have a root of a filesystem for
-	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
-	 * for us.
-	 */
-	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
-
-	zfsvfs->z_ctldir = vp;
+	zfsvfs->z_ctldir = dot_zfs;
 }
 
 /*
  * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
- * There might still be more references if we were force unmounted, but only
- * new zfs_inactive() calls can occur and they don't reference .zfs
+ * The nodes must not have any associated vnodes by now as they should be
+ * vflush-ed.
  */
 void
 zfsctl_destroy(zfsvfs_t *zfsvfs)
 {
-	VN_RELE(zfsvfs->z_ctldir);
+	sfs_destroy_node(zfsvfs->z_ctldir->snapdir);
+	sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir);
 	zfsvfs->z_ctldir = NULL;
 }
 
+static int
+zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags,
+    struct vnode **vpp)
+{
+	return (VFS_ROOT(mp, flags, vpp));
+}
+
+static void
+zfsctl_common_vnode_setup(vnode_t *vp, void *arg)
+{
+	ASSERT_VOP_ELOCKED(vp, __func__);
+
+	/* We support shared locking. */
+	VN_LOCK_ASHARE(vp);
+	vp->v_type = VDIR;
+	vp->v_data = arg;
+}
+
+static int
+zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags,
+    struct vnode **vpp)
+{
+	void *node;
+	int err;
+
+	node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir;
+	err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root,
+	    zfsctl_common_vnode_setup, node, vpp);
+	return (err);
+}
+
+static int
+zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags,
+    struct vnode **vpp)
+{
+	void *node;
+	int err;
+
+	node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir->snapdir;
+	err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs",
+	   &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp);
+	return (err);
+}
+
 /*
  * Given a root znode, retrieve the associated .zfs directory.
  * Add a hold to the vnode and return it.
  */
-vnode_t *
-zfsctl_root(znode_t *zp)
+int
+zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp)
 {
-	ASSERT(zfs_has_ctldir(zp));
-	VN_HOLD(zp->z_zfsvfs->z_ctldir);
-	return (zp->z_zfsvfs->z_ctldir);
+	vnode_t *vp;
+	int error;
+
+	error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp);
+	return (error);
 }
 
 /*
@@ -244,10 +438,12 @@ zfsctl_root(znode_t *zp)
  */
 /* ARGSUSED */
 static int
-zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct)
+zfsctl_common_open(struct vop_open_args *ap)
 {
+	int flags = ap->a_mode;
+
 	if (flags & FWRITE)
-		return (EACCES);
+		return (SET_ERROR(EACCES));
 
 	return (0);
 }
@@ -257,8 +453,7 @@ zfsctl_common_open(vnode_t **vpp, int fl
  */
 /* ARGSUSED */
 static int
-zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
-    cred_t *cr, caller_context_t *ct)
+zfsctl_common_close(struct vop_close_args *ap)
 {
 	return (0);
 }
@@ -268,17 +463,18 @@ zfsctl_common_close(vnode_t *vpp, int fl
  */
 /* ARGSUSED */
 static int
-zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
-    caller_context_t *ct)
+zfsctl_common_access(ap)
+	struct vop_access_args /* {
+		struct vnode *a_vp;
+		accmode_t a_accmode;
+		struct ucred *a_cred;
+		struct thread *a_td;
+	} */ *ap;
 {
-	if (flags & V_ACE_MASK) {
-		if (mode & ACE_ALL_WRITE_PERMS)
-			return (EACCES);
-	} else {
-		if (mode & VWRITE)
-			return (EACCES);
-	}
+	accmode_t accmode = ap->a_accmode;
 
+	if (accmode & VWRITE)
+		return (SET_ERROR(EACCES));
 	return (0);
 }
 
@@ -289,6 +485,9 @@ static void
 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
 {
 	timestruc_t	now;
+	sfs_node_t *node;
+
+	node = vp->v_data;
 
 	vap->va_uid = 0;
 	vap->va_gid = 0;
@@ -300,7 +499,7 @@ zfsctl_common_getattr(vnode_t *vp, vattr
 	vap->va_blksize = 0;
 	vap->va_nblocks = 0;
 	vap->va_seq = 0;
-	vap->va_fsid = vp->v_vfsp->vfs_dev;
+	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
 	    S_IROTH | S_IXOTH;
 	vap->va_type = VDIR;
@@ -309,417 +508,314 @@ zfsctl_common_getattr(vnode_t *vp, vattr
 	 */
 	gethrestime(&now);
 	vap->va_atime = now;
+	/* FreeBSD: Reset chflags(2) flags. */
+	vap->va_flags = 0;
+
+	vap->va_nodeid = node->sn_id;
+
+	/* At least '.' and '..'. */
+	vap->va_nlink = 2;
 }
 
 /*ARGSUSED*/
 static int
-zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
-{
-	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
-	zfsctl_node_t	*zcp = vp->v_data;
-	uint64_t	object = zcp->zc_id;
+zfsctl_common_fid(ap)
+	struct vop_fid_args /* {
+		struct vnode *a_vp;
+		struct fid *a_fid;
+	} */ *ap;
+{
+	vnode_t		*vp = ap->a_vp;
+	fid_t		*fidp = (void *)ap->a_fid;
+	sfs_node_t	*node = vp->v_data;
+	uint64_t	object = node->sn_id;
 	zfid_short_t	*zfid;
 	int		i;
 
-	ZFS_ENTER(zfsvfs);
-
-	if (fidp->fid_len < SHORT_FID_LEN) {
-		fidp->fid_len = SHORT_FID_LEN;
-		ZFS_EXIT(zfsvfs);
-		return (ENOSPC);
-	}
-
 	zfid = (zfid_short_t *)fidp;
-
 	zfid->zf_len = SHORT_FID_LEN;
 
-	for (i = 0; i < sizeof (zfid->zf_object); i++)
+	for (i = 0; i < sizeof(zfid->zf_object); i++)
 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 
-	/* .zfs znodes always have a generation number of 0 */
-	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+	/* .zfs nodes always have a generation number of 0 */
+	for (i = 0; i < sizeof(zfid->zf_gen); i++)
 		zfid->zf_gen[i] = 0;
 
-	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
-
-/*ARGSUSED*/
 static int
-zfsctl_shares_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
+zfsctl_common_reclaim(ap)
+	struct vop_reclaim_args /* {
+		struct vnode *a_vp;
+		struct thread *a_td;
+	} */ *ap;
 {
-	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
-	znode_t		*dzp;
-	int		error;
-
-	ZFS_ENTER(zfsvfs);
+	vnode_t *vp = ap->a_vp;
 
-	if (zfsvfs->z_shares_dir == 0) {
-		ZFS_EXIT(zfsvfs);
-		return (ENOTSUP);
-	}
-
-	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
-		error = VOP_FID(ZTOV(dzp), fidp, ct);
-		VN_RELE(ZTOV(dzp));
-	}
-
-	ZFS_EXIT(zfsvfs);
-	return (error);
+	(void) sfs_reclaim_vnode(vp);
+	return (0);
 }
-/*
- * .zfs inode namespace
- *
- * We need to generate unique inode numbers for all files and directories
- * within the .zfs pseudo-filesystem.  We use the following scheme:
- *
- * 	ENTRY			ZFSCTL_INODE
- * 	.zfs			1
- * 	.zfs/snapshot		2
- * 	.zfs/snapshot/<snap>	objectid(snap)
- */
 
-#define	ZFSCTL_INO_SNAP(id)	(id)
+static int
+zfsctl_common_print(ap)
+	struct vop_print_args /* {
+		struct vnode *a_vp;
+	} */ *ap;
+{
+	sfs_print_node(ap->a_vp->v_data);
+	return (0);
+}
 
 /*
  * Get root directory attributes.
  */
 /* ARGSUSED */
 static int
-zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
-    caller_context_t *ct)
-{
-	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-	zfsctl_node_t *zcp = vp->v_data;
-
-	ZFS_ENTER(zfsvfs);
-	vap->va_nodeid = ZFSCTL_INO_ROOT;
-	vap->va_nlink = vap->va_size = NROOT_ENTRIES;
-	vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
+zfsctl_root_getattr(ap)
+	struct vop_getattr_args /* {
+		struct vnode *a_vp;
+		struct vattr *a_vap;
+		struct ucred *a_cred;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+	struct vattr *vap = ap->a_vap;
+	zfsctl_root_t *node = vp->v_data;
 
 	zfsctl_common_getattr(vp, vap);
-	ZFS_EXIT(zfsvfs);
+	vap->va_ctime = node->cmtime;
+	vap->va_mtime = vap->va_ctime;
+	vap->va_birthtime = vap->va_ctime;
+	vap->va_nlink += 1; /* snapdir */
+	vap->va_size = vap->va_nlink;
+	return (0);
+}
 
+/*
+ * A lookup of "." may request a lock type different from the one
+ * dvp already holds; upgrade or downgrade the lock here.
+ */
+int
+zfsctl_relock_dot(vnode_t *dvp, int ltype)
+{
+	vref(dvp);
+	if (ltype != VOP_ISLOCKED(dvp)) {
+		if (ltype == LK_EXCLUSIVE)
+			vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+		else /* if (ltype == LK_SHARED) */
+			vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+		/* Relocking for "." may have left us with a reclaimed vnode. */
+		if ((dvp->v_iflag & VI_DOOMED) != 0) {
+			vrele(dvp);
+			return (SET_ERROR(ENOENT));
+		}
+	}
 	return (0);
 }
 
 /*
  * Special case the handling of "..".
  */
-/* ARGSUSED */
 int
-zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp)
-{
-	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+zfsctl_root_lookup(ap)
+	struct vop_lookup_args /* {
+		struct vnode *a_dvp;
+		struct vnode **a_vpp;
+		struct componentname *a_cnp;
+	} */ *ap;
+{
+	struct componentname *cnp = ap->a_cnp;
+	vnode_t *dvp = ap->a_dvp;
+	vnode_t **vpp = ap->a_vpp;
+	cred_t *cr = ap->a_cnp->cn_cred;
+	int flags = ap->a_cnp->cn_flags;
+	int lkflags = ap->a_cnp->cn_lkflags;
+	int nameiop = ap->a_cnp->cn_nameiop;
 	int err;
+	int ltype;
 
-	/*
-	 * No extended attributes allowed under .zfs
-	 */
-	if (flags & LOOKUP_XATTR)
-		return (EINVAL);
+	ASSERT(dvp->v_type == VDIR);
 
-	ZFS_ENTER(zfsvfs);
+	if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+		return (SET_ERROR(ENOTSUP));
 
-	if (strcmp(nm, "..") == 0) {
-		err = VFS_ROOT(dvp->v_vfsp, vpp);
+	if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+		err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+		if (err == 0)
+			*vpp = dvp;
+	} else if ((flags & ISDOTDOT) != 0) {
+		err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL,
+		    lkflags, vpp);
+	} else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) {
+		err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp);
 	} else {
-		err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
-		    cr, ct, direntflags, realpnp);
+		err = SET_ERROR(ENOENT);
 	}
-
-	ZFS_EXIT(zfsvfs);
-
+	if (err != 0)
+		*vpp = NULL;
 	return (err);
 }
 
 static int
-zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
-    caller_context_t *ct)
-{
-	/*
-	 * We only care about ACL_ENABLED so that libsec can
-	 * display ACL correctly and not default to POSIX draft.
-	 */
-	if (cmd == _PC_ACL_ENABLED) {
-		*valp = _ACL_ACE_ENABLED;
-		return (0);
-	}
-
-	return (fs_pathconf(vp, cmd, valp, cr, ct));
-}
-
-static const fs_operation_def_t zfsctl_tops_root[] = {
-	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
-	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
-	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
-	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_root_getattr }	},
-	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
-	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir } 	},
-	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_root_lookup }	},
-	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
-	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive }	},
-	{ VOPNAME_PATHCONF,	{ .vop_pathconf = zfsctl_pathconf }	},
-	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid	}	},
-	{ NULL }
-};
-
-static int
-zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
-{
-	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
-
-	if (snapshot_namecheck(name, NULL, NULL) != 0)
-		return (EILSEQ);
-	dmu_objset_name(os, zname);
-	if (strlen(zname) + 1 + strlen(name) >= len)
-		return (ENAMETOOLONG);
-	(void) strcat(zname, "@");
-	(void) strcat(zname, name);
-	return (0);
-}
-
-static int
-zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
+zfsctl_root_readdir(ap)
+	struct vop_readdir_args /* {
+		struct vnode *a_vp;
+		struct uio *a_uio;
+		struct ucred *a_cred;
+		int *a_eofflag;
+		int *ncookies;
+		u_long **a_cookies;
+	} */ *ap;
 {
-	vnode_t *svp = sep->se_root;
+	struct dirent entry;
+	vnode_t *vp = ap->a_vp;
+	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+	zfsctl_root_t *node = vp->v_data;
+	uio_t *uio = ap->a_uio;
+	int *eofp = ap->a_eofflag;
+	off_t dots_offset;
 	int error;
 
-	ASSERT(vn_ismntpt(svp));
-
-	/* this will be dropped by dounmount() */
-	if ((error = vn_vfswlock(svp)) != 0)
-		return (error);
+	ASSERT(vp->v_type == VDIR);
 
-	VN_HOLD(svp);
-	error = dounmount(vn_mountedvfs(svp), fflags, cr);
-	if (error) {
-		VN_RELE(svp);
+	error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio,
+	    &dots_offset);
+	if (error != 0) {
+		if (error == ENAMETOOLONG) /* ran out of destination space */
+			error = 0;
 		return (error);
 	}
+	if (uio->uio_offset != dots_offset)
+		return (SET_ERROR(EINVAL));
 
-	/*
-	 * We can't use VN_RELE(), as that will try to invoke
-	 * zfsctl_snapdir_inactive(), which would cause us to destroy
-	 * the sd_lock mutex held by our caller.
-	 */
-	ASSERT(svp->v_count == 1);
-	gfs_vop_inactive(svp, cr, NULL);
-
-	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-	kmem_free(sep, sizeof (zfs_snapentry_t));
-
+	CTASSERT(sizeof(node->snapdir->sn_name) <= sizeof(entry.d_name));
+	entry.d_fileno = node->snapdir->sn_id;
+	entry.d_type = DT_DIR;
+	strcpy(entry.d_name, node->snapdir->sn_name);
+	entry.d_namlen = strlen(entry.d_name);
+	entry.d_reclen = sizeof(entry);
+	error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+	if (error != 0) {
+		if (error == ENAMETOOLONG)
+			error = 0;
+		return (SET_ERROR(error));
+	}
+	if (eofp != NULL)
+		*eofp = 1;
 	return (0);
 }
 
-static void
-zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
+static int
+zfsctl_root_vptocnp(struct vop_vptocnp_args *ap)
 {
-	avl_index_t where;
-	vfs_t *vfsp;
-	refstr_t *pathref;
-	char newpath[MAXNAMELEN];
-	char *tail;
-
-	ASSERT(MUTEX_HELD(&sdp->sd_lock));
-	ASSERT(sep != NULL);
-
-	vfsp = vn_mountedvfs(sep->se_root);
-	ASSERT(vfsp != NULL);
+	static const char dotzfs_name[4] = ".zfs";
+	vnode_t *dvp;
+	int error;
 
-	vfs_lock_wait(vfsp);
+	if (*ap->a_buflen < sizeof (dotzfs_name))
+		return (SET_ERROR(ENOMEM));
 
-	/*
-	 * Change the name in the AVL tree.
-	 */
-	avl_remove(&sdp->sd_snaps, sep);
-	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
-	(void) strcpy(sep->se_name, nm);
-	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
-	avl_insert(&sdp->sd_snaps, sep, where);
-
-	/*
-	 * Change the current mountpoint info:
-	 * 	- update the tail of the mntpoint path
-	 *	- update the tail of the resource path
-	 */
-	pathref = vfs_getmntpoint(vfsp);
-	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
-	VERIFY((tail = strrchr(newpath, '/')) != NULL);
-	*(tail+1) = '\0';
-	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
-	(void) strcat(newpath, nm);
-	refstr_rele(pathref);
-	vfs_setmntpoint(vfsp, newpath);
-
-	pathref = vfs_getresource(vfsp);
-	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
-	VERIFY((tail = strrchr(newpath, '@')) != NULL);
-	*(tail+1) = '\0';
-	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
-	(void) strcat(newpath, nm);
-	refstr_rele(pathref);
-	vfs_setresource(vfsp, newpath);
+	error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL,
+	    LK_SHARED, &dvp);
+	if (error != 0)
+		return (SET_ERROR(error));
 
-	vfs_unlock(vfsp);
+	VOP_UNLOCK(dvp, 0);
+	*ap->a_vpp = dvp;
+	*ap->a_buflen -= sizeof (dotzfs_name);
+	bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name));
+	return (0);
 }
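
The vptocnp convention assumed above copies the component name to the
tail of the caller-supplied buffer and shrinks *a_buflen, so repeated
parent lookups prepend path components right to left.  A minimal
userland sketch of that buffer discipline (the helper and all names
here are hypothetical):

#include <stdio.h>
#include <string.h>

static int
prepend_name(char *buf, size_t *buflen, const char *name, size_t namelen)
{
	if (*buflen < namelen)
		return (-1);		/* ENOMEM in the kernel version */
	*buflen -= namelen;
	memcpy(buf + *buflen, name, namelen);
	return (0);
}

int
main(void)
{
	char buf[64];
	size_t buflen = sizeof(buf);

	/* Walk from leaf toward the root, as repeated vptocnp calls do. */
	prepend_name(buf, &buflen, "snap1", 5);
	prepend_name(buf, &buflen, "/", 1);
	prepend_name(buf, &buflen, "snapshot", 8);
	prepend_name(buf, &buflen, "/", 1);
	prepend_name(buf, &buflen, ".zfs", 4);
	/* Prints ".zfs/snapshot/snap1". */
	printf("%.*s\n", (int)(sizeof(buf) - buflen), buf + buflen);
	return (0);
}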
 
-/*ARGSUSED*/
+static struct vop_vector zfsctl_ops_root = {
+	.vop_default =	&default_vnodeops,
+	.vop_open =	zfsctl_common_open,
+	.vop_close =	zfsctl_common_close,
+	.vop_ioctl =	VOP_EINVAL,
+	.vop_getattr =	zfsctl_root_getattr,
+	.vop_access =	zfsctl_common_access,
+	.vop_readdir =	zfsctl_root_readdir,
+	.vop_lookup =	zfsctl_root_lookup,
+	.vop_inactive =	VOP_NULL,
+	.vop_reclaim =	zfsctl_common_reclaim,
+	.vop_fid =	zfsctl_common_fid,
+	.vop_print =	zfsctl_common_print,
+	.vop_vptocnp =	zfsctl_root_vptocnp,
+};
+
 static int
-zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
-    cred_t *cr, caller_context_t *ct, int flags)
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
 {
-	zfsctl_snapdir_t *sdp = sdvp->v_data;
-	zfs_snapentry_t search, *sep;
-	zfsvfs_t *zfsvfs;
-	avl_index_t where;
-	char from[MAXNAMELEN], to[MAXNAMELEN];
-	char real[MAXNAMELEN];
-	int err;
-
-	zfsvfs = sdvp->v_vfsp->vfs_data;
-	ZFS_ENTER(zfsvfs);
-
-	if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
-		err = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
-		    MAXNAMELEN, NULL);
-		if (err == 0) {
-			snm = real;
-		} else if (err != ENOTSUP) {
-			ZFS_EXIT(zfsvfs);
-			return (err);
-		}
-	}
-
-	ZFS_EXIT(zfsvfs);
-
-	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
-	if (!err)
-		err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
-	if (!err)
-		err = zfs_secpolicy_rename_perms(from, to, cr);
-	if (err)
-		return (err);
-
-	/*
-	 * Cannot move snapshots out of the snapdir.
-	 */
-	if (sdvp != tdvp)
-		return (EINVAL);
-
-	if (strcmp(snm, tnm) == 0)
-		return (0);
-
-	mutex_enter(&sdp->sd_lock);
-
-	search.se_name = (char *)snm;
-	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
-		mutex_exit(&sdp->sd_lock);
-		return (ENOENT);
-	}
-
-	err = dmu_objset_rename(from, to, B_FALSE);
-	if (err == 0)
-		zfsctl_rename_snap(sdp, sep, tnm);
-
-	mutex_exit(&sdp->sd_lock);
+	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
 
-	return (err);
+	dmu_objset_name(os, zname);
+	if (strlen(zname) + 1 + strlen(name) >= len)
+		return (SET_ERROR(ENAMETOOLONG));
+	(void) strcat(zname, "@");
+	(void) strcat(zname, name);
+	return (0);
 }
 
-/* ARGSUSED */
 static int
-zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
-    caller_context_t *ct, int flags)
+zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id)
 {
-	zfsctl_snapdir_t *sdp = dvp->v_data;
-	zfs_snapentry_t *sep;
-	zfs_snapentry_t search;
-	zfsvfs_t *zfsvfs;
-	char snapname[MAXNAMELEN];
-	char real[MAXNAMELEN];
+	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
 	int err;
 
-	zfsvfs = dvp->v_vfsp->vfs_data;
-	ZFS_ENTER(zfsvfs);
-
-	if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
-
-		err = dmu_snapshot_realname(zfsvfs->z_os, name, real,
-		    MAXNAMELEN, NULL);
-		if (err == 0) {
-			name = real;
-		} else if (err != ENOTSUP) {
-			ZFS_EXIT(zfsvfs);
-			return (err);
-		}
-	}
-
-	ZFS_EXIT(zfsvfs);
-
-	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
-	if (!err)
-		err = zfs_secpolicy_destroy_perms(snapname, cr);
-	if (err)
-		return (err);
-
-	mutex_enter(&sdp->sd_lock);
-
-	search.se_name = name;
-	sep = avl_find(&sdp->sd_snaps, &search, NULL);
-	if (sep) {
-		avl_remove(&sdp->sd_snaps, sep);
-		err = zfsctl_unmount_snap(sep, MS_FORCE, cr);
-		if (err)
-			avl_add(&sdp->sd_snaps, sep);
-		else
-			err = dmu_objset_destroy(snapname, B_FALSE);
-	} else {
-		err = ENOENT;
-	}
-
-	mutex_exit(&sdp->sd_lock);
-
+	err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id);
 	return (err);
 }
 
 /*
- * This creates a snapshot under '.zfs/snapshot'.
+ * Given a vnode, get the root vnode of the filesystem mounted on top
+ * of it, if any.  The root vnode is returned referenced and locked.
+ * If no filesystem is mounted, then the original vnode remains
+ * referenced and locked.  If any error happens, the original vnode is
+ * unlocked and released.
  */
-/* ARGSUSED */
 static int
-zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t  **vpp,
-    cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp)
+zfsctl_mounted_here(vnode_t **vpp, int flags)
 {
-	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-	char name[MAXNAMELEN];
+	struct mount *mp;
 	int err;
-	static enum symfollow follow = NO_FOLLOW;
-	static enum uio_seg seg = UIO_SYSSPACE;
 
-	if (snapshot_namecheck(dirname, NULL, NULL) != 0)
-		return (EILSEQ);
+	ASSERT_VOP_LOCKED(*vpp, __func__);
+	ASSERT3S((*vpp)->v_type, ==, VDIR);
 
-	dmu_objset_name(zfsvfs->z_os, name);
+	if ((mp = (*vpp)->v_mountedhere) != NULL) {
+		err = vfs_busy(mp, 0);
+		KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
+		KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
+		vput(*vpp);
+		err = VFS_ROOT(mp, flags, vpp);
+		vfs_unbusy(mp);
+		return (err);
+	}
+	return (EJUSTRETURN);
+}
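
EJUSTRETURN is used above as a sentinel meaning "nothing mounted here,
*vpp untouched".  Condensed from zfsctl_snapdir_lookup() below, the
consumer is assumed to look like:

	err = zfsctl_mounted_here(vpp, lkflags);
	if (err != EJUSTRETURN)
		return (err);	/* mounted (or failed): done either way */
	/* Not covered yet: a mount must be in flight, so retry. */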
 
-	*vpp = NULL;
+typedef struct {
+	const char *snap_name;
+	uint64_t    snap_id;
+} snapshot_setup_arg_t;
 
-	err = zfs_secpolicy_snapshot_perms(name, cr);
-	if (err)
-		return (err);
+static void
+zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
+{
+	snapshot_setup_arg_t *ssa = arg;
+	sfs_node_t *node;
 
-	if (err == 0) {
-		err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE);
-		if (err)
-			return (err);
-		err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
-	}
+	ASSERT_VOP_ELOCKED(vp, __func__);
 
-	return (err);
+	node = sfs_alloc_node(sizeof(sfs_node_t),
+	    ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id);
+	zfsctl_common_vnode_setup(vp, node);
+
+	/* We have to support recursive locking. */
+	VN_LOCK_AREC(vp);
 }
 
 /*
@@ -728,151 +824,112 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char 
  * Perform a mount of the associated dataset on top of the vnode.
  */
 /* ARGSUSED */
-static int
-zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp)
-{
-	zfsctl_snapdir_t *sdp = dvp->v_data;
-	objset_t *snap;
-	char snapname[MAXNAMELEN];
-	char real[MAXNAMELEN];
+int
+zfsctl_snapdir_lookup(ap)
+	struct vop_lookup_args /* {
+		struct vnode *a_dvp;
+		struct vnode **a_vpp;
+		struct componentname *a_cnp;
+	} */ *ap;
+{
+	vnode_t *dvp = ap->a_dvp;
+	vnode_t **vpp = ap->a_vpp;
+	struct componentname *cnp = ap->a_cnp;
+	char name[NAME_MAX + 1];
+	char fullname[ZFS_MAX_DATASET_NAME_LEN];
 	char *mountpoint;
-	zfs_snapentry_t *sep, search;
-	struct mounta margs;
-	vfs_t *vfsp;
 	size_t mountpoint_len;
-	avl_index_t where;
 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+	uint64_t snap_id;
+	int nameiop = cnp->cn_nameiop;
+	int lkflags = cnp->cn_lkflags;
+	int flags = cnp->cn_flags;
 	int err;
 
-	/*
-	 * No extended attributes allowed under .zfs
-	 */
-	if (flags & LOOKUP_XATTR)
-		return (EINVAL);
-
 	ASSERT(dvp->v_type == VDIR);
 
-	/*
-	 * If we get a recursive call, that means we got called
-	 * from the domount() code while it was trying to look up the
-	 * spec (which looks like a local path for zfs).  We need to
-	 * add some flag to domount() to tell it not to do this lookup.
-	 */
-	if (MUTEX_HELD(&sdp->sd_lock))
-		return (ENOENT);
-
-	ZFS_ENTER(zfsvfs);
+	if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+		return (SET_ERROR(ENOTSUP));
 
-	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
-		ZFS_EXIT(zfsvfs);
-		return (0);
+	if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+		err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+		if (err == 0)
+			*vpp = dvp;
+		return (err);
+	}
+	if (flags & ISDOTDOT) {
+		err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags,
+		    vpp);
+		return (err);
 	}
 
-	if (flags & FIGNORECASE) {
-		boolean_t conflict = B_FALSE;
+	if (cnp->cn_namelen >= sizeof(name))
+		return (SET_ERROR(ENAMETOOLONG));
 
-		err = dmu_snapshot_realname(zfsvfs->z_os, nm, real,
-		    MAXNAMELEN, &conflict);
-		if (err == 0) {
-			nm = real;
-		} else if (err != ENOTSUP) {
-			ZFS_EXIT(zfsvfs);
+	strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
+	err = zfsctl_snapshot_lookup(dvp, name, &snap_id);
+	if (err != 0)
+		return (SET_ERROR(ENOENT));
+
+	for (;;) {
+		snapshot_setup_arg_t ssa;
+
+		ssa.snap_name = name;
+		ssa.snap_id = snap_id;
+		err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR,
+		   snap_id, "zfs", &zfsctl_ops_snapshot,
+		   zfsctl_snapshot_vnode_setup, &ssa, vpp);
+		if (err != 0)
 			return (err);
-		}
-		if (realpnp)
-			(void) strlcpy(realpnp->pn_buf, nm,
-			    realpnp->pn_bufsize);
-		if (conflict && direntflags)
-			*direntflags = ED_CASE_CONFLICT;
-	}
-
-	mutex_enter(&sdp->sd_lock);
-	search.se_name = (char *)nm;
-	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
-		*vpp = sep->se_root;
-		VN_HOLD(*vpp);
-		err = traverse(vpp);
-		if (err) {
-			VN_RELE(*vpp);
-			*vpp = NULL;
-		} else if (*vpp == sep->se_root) {
-			/*
-			 * The snapshot was unmounted behind our backs,
-			 * try to remount it.
-			 */
-			goto domount;
-		} else {
-			/*
-			 * VROOT was set during the traverse call.  We need
-			 * to clear it since we're pretending to be part
-			 * of our parent's vfs.
-			 */
-			(*vpp)->v_flag &= ~VROOT;
-		}
-		mutex_exit(&sdp->sd_lock);
-		ZFS_EXIT(zfsvfs);
-		return (err);
-	}
 
-	/*
-	 * The requested snapshot is not currently mounted, look it up.
-	 */
-	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
-	if (err) {
-		mutex_exit(&sdp->sd_lock);
-		ZFS_EXIT(zfsvfs);
+		/* Check if a new vnode has just been created. */
+		if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)
+			break;
+
 		/*
-		 * handle "ls *" or "?" in a graceful manner,
-		 * forcing EILSEQ to ENOENT.
-		 * Since shell ultimately passes "*" or "?" as name to lookup
+		 * The vnode must be referenced at least by this thread and
+		 * the mounted snapshot or the thread doing the mounting.
+		 * There can be more references from concurrent lookups.
 		 */
-		return (err == EILSEQ ? ENOENT : err);
-	}
-	if (dmu_objset_hold(snapname, FTAG, &snap) != 0) {
-		mutex_exit(&sdp->sd_lock);
-		ZFS_EXIT(zfsvfs);
-		return (ENOENT);
-	}
-
-	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
-	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
-	(void) strcpy(sep->se_name, nm);
-	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
-	avl_insert(&sdp->sd_snaps, sep, where);
-
-	dmu_objset_rele(snap, FTAG);
-domount:
-	mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
-	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
-	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
-	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
-	    refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
+		KASSERT(vrefcnt(*vpp) > 1, ("found unreferenced mountpoint"));
 
-	margs.spec = snapname;
-	margs.dir = mountpoint;
-	margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
-	margs.fstype = "zfs";
-	margs.dataptr = NULL;
-	margs.datalen = 0;
-	margs.optptr = NULL;
-	margs.optlen = 0;
+		/*
+		 * Check if a snapshot is already mounted on top of the vnode.
+		 */
+		err = zfsctl_mounted_here(vpp, lkflags);
+		if (err != EJUSTRETURN)
+			return (err);
 
-	err = domount("zfs", &margs, *vpp, kcred, &vfsp);
-	kmem_free(mountpoint, mountpoint_len);
+#ifdef INVARIANTS
+		/*
+		 * If the vnode is not covered yet, then the mount operation
+		 * must be in progress.
+		 */
+		VI_LOCK(*vpp);
+		KASSERT(((*vpp)->v_iflag & VI_MOUNT) != 0,
+		    ("snapshot vnode not covered"));
+		VI_UNLOCK(*vpp);
+#endif
+		vput(*vpp);
 
-	if (err == 0) {
 		/*
-		 * Return the mounted root rather than the covered mount point.
-		 * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
-		 * the ZFS vnode mounted on top of the GFS node.  This ZFS
-		 * vnode is the root of the newly created vfsp.
+		 * In this situation we can loop on uncontested locks and starve
+		 * the thread doing the lengthy, non-trivial mount operation.
 		 */
-		VFS_RELE(vfsp);
-		err = traverse(vpp);
+		kern_yield(PRI_USER);
 	}
 
+	VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof(fullname), fullname));
+
+	mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
+	    strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1;
+	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
+	(void) snprintf(mountpoint, mountpoint_len,
+	    "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
+	    dvp->v_vfsp->mnt_stat.f_mntonname, name);
+
+	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
+	kmem_free(mountpoint, mountpoint_len);
 	if (err == 0) {
 		/*
 		 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
@@ -883,385 +940,246 @@ domount:
 		 */
 		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
 		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
-		(*vpp)->v_vfsp = zfsvfs->z_vfs;
-		(*vpp)->v_flag &= ~VROOT;
+
+		/* Clear the root flag (set via VFS_ROOT) as well. */
+		(*vpp)->v_vflag &= ~VV_ROOT;
 	}
-	mutex_exit(&sdp->sd_lock);
-	ZFS_EXIT(zfsvfs);
 
-	/*
-	 * If we had an error, drop our hold on the vnode and
-	 * zfsctl_snapshot_inactive() will clean up.
-	 */
-	if (err) {
-		VN_RELE(*vpp);
+	if (err != 0)
 		*vpp = NULL;
-	}
 	return (err);
 }
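
The mountpoint string assembled near the end of the function is plain
path concatenation; a self-contained userland sketch with hypothetical
inputs ("/tank" standing in for f_mntonname, "snap1" for the snapshot
name; ZFS_CTLDIR_NAME is ".zfs" in the ZFS headers):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ZFS_CTLDIR_NAME ".zfs"

int
main(void)
{
	const char *mnton = "/tank", *name = "snap1";
	size_t len;
	char *mountpoint;

	len = strlen(mnton) +
	    strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1;
	mountpoint = malloc(len);
	(void) snprintf(mountpoint, len, "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
	    mnton, name);
	printf("%s\n", mountpoint);	/* -> /tank/.zfs/snapshot/snap1 */
	free(mountpoint);
	return (0);
}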
 
-/* ARGSUSED */
 static int
-zfsctl_shares_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp)
-{
-	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
-	znode_t *dzp;
-	int error;
-
-	ZFS_ENTER(zfsvfs);
-
-	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
-		ZFS_EXIT(zfsvfs);
-		return (0);
-	}
-
-	if (zfsvfs->z_shares_dir == 0) {
-		ZFS_EXIT(zfsvfs);
-		return (ENOTSUP);
-	}
-	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0)
-		error = VOP_LOOKUP(ZTOV(dzp), nm, vpp, pnp,
-		    flags, rdir, cr, ct, direntflags, realpnp);
-
-	VN_RELE(ZTOV(dzp));
-	ZFS_EXIT(zfsvfs);
-
-	return (error);
-}
-
-/* ARGSUSED */
-static int
-zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
-    offset_t *offp, offset_t *nextp, void *data, int flags)
-{
+zfsctl_snapdir_readdir(ap)
+	struct vop_readdir_args /* {
+		struct vnode *a_vp;
+		struct uio *a_uio;
+		struct ucred *a_cred;
+		int *a_eofflag;
+		int *ncookies;
+		u_long **a_cookies;
+	} */ *ap;
+{
+	char snapname[ZFS_MAX_DATASET_NAME_LEN];
+	struct dirent entry;
+	vnode_t *vp = ap->a_vp;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-	char snapname[MAXNAMELEN];
-	uint64_t id, cookie;
-	boolean_t case_conflict;
+	uio_t *uio = ap->a_uio;
+	int *eofp = ap->a_eofflag;
+	off_t dots_offset;
 	int error;
 
-	ZFS_ENTER(zfsvfs);
+	ASSERT(vp->v_type == VDIR);
 
-	cookie = *offp;
-	error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
-	    &cookie, &case_conflict);
-	if (error) {
-		ZFS_EXIT(zfsvfs);
-		if (error == ENOENT) {
-			*eofp = 1;
-			return (0);
-		}
+	error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio,
+	    &dots_offset);
+	if (error != 0) {
+		if (error == ENAMETOOLONG) /* ran out of destination space */
+			error = 0;
 		return (error);
 	}
 
-	if (flags & V_RDDIR_ENTFLAGS) {
-		edirent_t *eodp = dp;
-
-		(void) strcpy(eodp->ed_name, snapname);
-		eodp->ed_ino = ZFSCTL_INO_SNAP(id);
-		eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0;
-	} else {
-		struct dirent64 *odp = dp;
+	for (;;) {
+		uint64_t cookie;
+		uint64_t id;
+
+		cookie = uio->uio_offset - dots_offset;
+
+		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
+		    snapname, &id, &cookie, NULL);
+		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		if (error != 0) {
+			if (error == ENOENT) {
+				if (eofp != NULL)
+					*eofp = 1;
+				error = 0;
+			}
+			return (error);
+		}
 
-		(void) strcpy(odp->d_name, snapname);
-		odp->d_ino = ZFSCTL_INO_SNAP(id);
+		entry.d_fileno = id;
+		entry.d_type = DT_DIR;
+		strcpy(entry.d_name, snapname);
+		entry.d_namlen = strlen(entry.d_name);
+		entry.d_reclen = sizeof(entry);
+		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+		if (error != 0) {
+			if (error == ENAMETOOLONG)
+				error = 0;
+			return (SET_ERROR(error));
+		}
+		uio->uio_offset = cookie + dots_offset;
 	}
-	*nextp = cookie;
-
-	ZFS_EXIT(zfsvfs);
-
-	return (0);
+	/* NOTREACHED */
 }
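
The offset bookkeeping above assumes a linear mapping between directory
offsets and ZAP cursor cookies: everything below dots_offset belongs to
"." and "..", and cookie N lands at offset dots_offset + N.  A trivial
self-checking sketch of that arithmetic (the dots_offset value is
hypothetical):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	const uint64_t dots_offset = 2;		/* past "." and ".." */
	uint64_t uio_offset, cookie;

	uio_offset = dots_offset;		/* first snapshot entry */
	cookie = uio_offset - dots_offset;	/* ZAP cursor resumes at 0 */
	assert(cookie == 0);

	cookie = 5;				/* next cookie from the ZAP */
	uio_offset = cookie + dots_offset;	/* stored back into the uio */
	assert(uio_offset == 7);
	return (0);
}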
 
 /* ARGSUSED */
 static int
-zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
-    caller_context_t *ct, int flags)
+zfsctl_snapdir_getattr(ap)
+	struct vop_getattr_args /* {
+		struct vnode *a_vp;
+		struct vattr *a_vap;
+		struct ucred *a_cred;
+	} */ *ap;
 {
+	vnode_t *vp = ap->a_vp;
+	vattr_t *vap = ap->a_vap;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-	znode_t *dzp;
-	int error;
-
-	ZFS_ENTER(zfsvfs);
+	dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os);
+	sfs_node_t *node = vp->v_data;
+	uint64_t snap_count;
+	int err;
 
-	if (zfsvfs->z_shares_dir == 0) {
-		ZFS_EXIT(zfsvfs);
-		return (ENOTSUP);
-	}
-	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
-		error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ct, flags);
-		VN_RELE(ZTOV(dzp));
-	} else {
-		*eofp = 1;
-		error = ENOENT;
+	zfsctl_common_getattr(vp, vap);
+	vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+	vap->va_mtime = vap->va_ctime;
+	vap->va_birthtime = vap->va_ctime;
+	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
+		err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
+		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+		if (err != 0)
+			return (err);
+		vap->va_nlink += snap_count;
 	}
+	vap->va_size = vap->va_nlink;
 
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-
-/*
- * pvp is the '.zfs' directory (zfsctl_node_t).
- * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
- *
- * This function is the callback to create a GFS vnode for '.zfs/snapshot'
- * when a lookup is performed on .zfs for "snapshot".
- */
-vnode_t *
-zfsctl_mknode_snapdir(vnode_t *pvp)
-{
-	vnode_t *vp;
-	zfsctl_snapdir_t *sdp;
-
-	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
-	    zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
-	    zfsctl_snapdir_readdir_cb, NULL);
-	sdp = vp->v_data;
-	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
-	sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
-	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
-	avl_create(&sdp->sd_snaps, snapentry_compare,
-	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
-	return (vp);
+	return (0);
 }
 
-vnode_t *
-zfsctl_mknode_shares(vnode_t *pvp)
-{
-	vnode_t *vp;
-	zfsctl_node_t *sdp;
-
-	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
-	    zfsctl_ops_shares, NULL, NULL, MAXNAMELEN,
-	    NULL, NULL);
-	sdp = vp->v_data;
-	sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
-	return (vp);
-
-}
+static struct vop_vector zfsctl_ops_snapdir = {
+	.vop_default =	&default_vnodeops,
+	.vop_open =	zfsctl_common_open,
+	.vop_close =	zfsctl_common_close,
+	.vop_getattr =	zfsctl_snapdir_getattr,
+	.vop_access =	zfsctl_common_access,
+	.vop_readdir =	zfsctl_snapdir_readdir,
+	.vop_lookup =	zfsctl_snapdir_lookup,
+	.vop_reclaim =	zfsctl_common_reclaim,
+	.vop_fid =	zfsctl_common_fid,
+	.vop_print =	zfsctl_common_print,
+};
 
-/* ARGSUSED */
 static int
-zfsctl_shares_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
-    caller_context_t *ct)
+zfsctl_snapshot_inactive(ap)
+	struct vop_inactive_args /* {
+		struct vnode *a_vp;
+		struct thread *a_td;
+	} */ *ap;
 {
-	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-	znode_t *dzp;
-	int error;
-
-	ZFS_ENTER(zfsvfs);
-	if (zfsvfs->z_shares_dir == 0) {
-		ZFS_EXIT(zfsvfs);
-		return (ENOTSUP);
-	}
-	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
-		error = VOP_GETATTR(ZTOV(dzp), vap, flags, cr, ct);
-		VN_RELE(ZTOV(dzp));
-	}
-	ZFS_EXIT(zfsvfs);
-	return (error);
-
+	vnode_t *vp = ap->a_vp;
 
+	VERIFY(vrecycle(vp) == 1);
+	return (0);
 }
 
-/* ARGSUSED */
 static int
-zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
-    caller_context_t *ct)
+zfsctl_snapshot_reclaim(ap)
+	struct vop_reclaim_args /* {
+		struct vnode *a_vp;
+		struct thread *a_td;
+	} */ *ap;
 {
-	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
-	zfsctl_snapdir_t *sdp = vp->v_data;
-
-	ZFS_ENTER(zfsvfs);
-	zfsctl_common_getattr(vp, vap);
-	vap->va_nodeid = gfs_file_inode(vp);
-	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
-	vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
-	ZFS_EXIT(zfsvfs);
+	vnode_t *vp = ap->a_vp;
+	void *data = vp->v_data;
 
+	sfs_reclaim_vnode(vp);
+	sfs_destroy_node(data);
 	return (0);
 }
 
-/* ARGSUSED */
-static void
-zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
-{
-	zfsctl_snapdir_t *sdp = vp->v_data;
-	void *private;
-
-	private = gfs_dir_inactive(vp);
-	if (private != NULL) {
-		ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
-		mutex_destroy(&sdp->sd_lock);
-		avl_destroy(&sdp->sd_snaps);
-		kmem_free(private, sizeof (zfsctl_snapdir_t));
-	}
-}
-
-#ifndef __NetBSD__
-static const fs_operation_def_t zfsctl_tops_snapdir[] = {
-	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
-	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
-	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
-	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_snapdir_getattr } },
-	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
-	{ VOPNAME_RENAME,	{ .vop_rename = zfsctl_snapdir_rename }	},
-	{ VOPNAME_RMDIR,	{ .vop_rmdir = zfsctl_snapdir_remove }	},
-	{ VOPNAME_MKDIR,	{ .vop_mkdir = zfsctl_snapdir_mkdir }	},
-	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir }	},
-	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_snapdir_lookup }	},
-	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
-	{ VOPNAME_INACTIVE,	{ .vop_inactive = zfsctl_snapdir_inactive } },
-	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid }	},
-	{ NULL }
-};
-
-static const fs_operation_def_t zfsctl_tops_shares[] = {
-	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
-	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
-	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
-	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_shares_getattr } },
-	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
-	{ VOPNAME_READDIR,	{ .vop_readdir = zfsctl_shares_readdir } },
-	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_shares_lookup }	},
-	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
-	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive } },
-	{ VOPNAME_FID,		{ .vop_fid = zfsctl_shares_fid } },
-	{ NULL }
-};
-#endif
-/*
- * pvp is the GFS vnode '.zfs/snapshot'.
- *
- * This creates a GFS node under '.zfs/snapshot' representing each
- * snapshot.  This newly created GFS node is what we mount snapshot
- * vfs_t's ontop of.
- */
-static vnode_t *
-zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
-{
-	vnode_t *vp;
-	zfsctl_node_t *zcp;
-
-	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
-	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
-	zcp = vp->v_data;
-	zcp->zc_id = objset;
-
-	return (vp);
-}
-
-static void
-zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+static int
+zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
 {
-	zfsctl_snapdir_t *sdp;
-	zfs_snapentry_t *sep, *next;
+	struct mount *mp;
 	vnode_t *dvp;
+	vnode_t *vp;
+	sfs_node_t *node;
+	size_t len;
+	int locked;
+	int error;
 
-	VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0);
-	sdp = dvp->v_data;
-
-	mutex_enter(&sdp->sd_lock);
-
-	if (vp->v_count > 1) {
-		mutex_exit(&sdp->sd_lock);
-		return;
-	}
-	ASSERT(!vn_ismntpt(vp));
-
-	sep = avl_first(&sdp->sd_snaps);
-	while (sep != NULL) {
-		next = AVL_NEXT(&sdp->sd_snaps, sep);
+	vp = ap->a_vp;
+	node = vp->v_data;
+	len = strlen(node->sn_name);
+	if (*ap->a_buflen < len)
+		return (SET_ERROR(ENOMEM));
 
-		if (sep->se_root == vp) {
-			avl_remove(&sdp->sd_snaps, sep);
-			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
-			kmem_free(sep, sizeof (zfs_snapentry_t));
-			break;
-		}
-		sep = next;
-	}
-	ASSERT(sep != NULL);
-
-	mutex_exit(&sdp->sd_lock);
-	VN_RELE(dvp);
+	/*
+	 * Prevent unmounting of the snapshot while the vnode lock
+	 * is not held.  That is not strictly required, but allows
+	 * us to assert that an uncovered snapshot vnode is never
+	 * "leaked".
+	 */
+	mp = vp->v_mountedhere;
+	if (mp == NULL)
+		return (SET_ERROR(ENOENT));
+	error = vfs_busy(mp, 0);
+	KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error));
 
 	/*
-	 * Dispose of the vnode for the snapshot mount point.
-	 * This is safe to do because once this entry has been removed
-	 * from the AVL tree, it can't be found again, so cannot become
-	 * "active".  If we lookup the same name again we will end up
-	 * creating a new vnode.
+	 * We can vput the vnode as we can now depend on the reference owned
+	 * by the busied mp.  But we also need to hold the vnode, because
+	 * the reference may go after vfs_unbusy() which has to be called
+	 * before we can lock the vnode again.
 	 */
-	gfs_vop_inactive(vp, cr, ct);
+	locked = VOP_ISLOCKED(vp);
+	vhold(vp);
+	vput(vp);
+
+	/* Look up .zfs/snapshot, our parent. */
+	error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
+	if (error == 0) {
+		VOP_UNLOCK(dvp, 0);
+		*ap->a_vpp = dvp;
+		*ap->a_buflen -= len;
+		bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
+	}
+	vfs_unbusy(mp);
+	vget(vp, locked | LK_VNHELD | LK_RETRY, curthread);
+	return (error);
 }
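
The reference juggling above reduces to a hold-then-relock skeleton:
remember the lock state, convert the lock-plus-reference into a bare
hold, do the work that must not hold the vnode lock, then relock while
consuming the hold.  Condensed from the function above (a sketch, not a
drop-in):

	locked = VOP_ISLOCKED(vp);	/* remember the caller's lock type */
	vhold(vp);			/* keep vp alive without a lock */
	vput(vp);			/* drop the lock and the reference */

	/* ... work that may sleep or lock other vnodes goes here ... */

	vfs_unbusy(mp);
	vget(vp, locked | LK_VNHELD | LK_RETRY, curthread);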
 
-#ifndef __NetBSD__
 /*
  * These VP's should never see the light of day.  They should always
  * be covered.
  */
-static const fs_operation_def_t zfsctl_tops_snapshot[] = {
-	VOPNAME_INACTIVE, { .vop_inactive =  zfsctl_snapshot_inactive },
-	NULL, NULL
+static struct vop_vector zfsctl_ops_snapshot = {
+	.vop_default =		NULL, /* ensure very restricted access */
+	.vop_inactive =		zfsctl_snapshot_inactive,
+	.vop_reclaim =		zfsctl_snapshot_reclaim,
+	.vop_vptocnp =		zfsctl_snapshot_vptocnp,
+	.vop_lock1 =		vop_stdlock,
+	.vop_unlock =		vop_stdunlock,
+	.vop_islocked =		vop_stdislocked,
+	.vop_advlockpurge =	vop_stdadvlockpurge, /* called by vgone */
+	.vop_print =		zfsctl_common_print,
 };
-#endif
 
 int
 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
 {
+	struct mount *mp;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	vnode_t *dvp, *vp;
-	zfsctl_snapdir_t *sdp;
-	zfsctl_node_t *zcp;
-	zfs_snapentry_t *sep;
+	vnode_t *vp;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
-	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
-	    NULL, 0, NULL, kcred, NULL, NULL, NULL);
-	if (error != 0)
-		return (error);
-	sdp = dvp->v_data;
-
-	mutex_enter(&sdp->sd_lock);
-	sep = avl_first(&sdp->sd_snaps);
-	while (sep != NULL) {
-		vp = sep->se_root;
-		zcp = vp->v_data;
-		if (zcp->zc_id == objsetid)
-			break;
-
-		sep = AVL_NEXT(&sdp->sd_snaps, sep);
-	}
-
-	if (sep != NULL) {
-		VN_HOLD(vp);
+	*zfsvfsp = NULL;
+	error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+	    ZFSCTL_INO_SNAPDIR, objsetid, &vp);
+	if (error == 0 && vp != NULL) {
 		/*
-		 * Return the mounted root rather than the covered mount point.
-		 * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid>
-		 * and returns the ZFS vnode mounted on top of the GFS node.
-		 * This ZFS vnode is the root of the vfs for objset 'objsetid'.
+		 * XXX Probably need to at least reference, if not busy, the mp.
 		 */
-		error = traverse(&vp);
-		if (error == 0) {
-			if (vp == sep->se_root)
-				error = EINVAL;
-			else
-				*zfsvfsp = VTOZ(vp)->z_zfsvfs;
-		}
-		mutex_exit(&sdp->sd_lock);
-		VN_RELE(vp);
-	} else {
-		error = EINVAL;
-		mutex_exit(&sdp->sd_lock);
+		if (vp->v_mountedhere != NULL)
+			*zfsvfsp = vp->v_mountedhere->mnt_data;
+		vput(vp);
 	}
-
-	VN_RELE(dvp);
-
-	return (error);
+	if (*zfsvfsp == NULL)
+		return (SET_ERROR(EINVAL));
+	return (0);
 }
 
 /*
@@ -1272,43 +1190,84 @@ zfsctl_lookup_objset(vfs_t *vfsp, uint64
 int
 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
 {
+	char snapname[ZFS_MAX_DATASET_NAME_LEN];
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	struct mount *mp;
 	vnode_t *dvp;
-	zfsctl_snapdir_t *sdp;
-	zfs_snapentry_t *sep, *next;
+	vnode_t *vp;
+	sfs_node_t *node;
+	sfs_node_t *snap;
+	uint64_t cookie;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
-	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
-	    NULL, 0, NULL, cr, NULL, NULL, NULL);
-	if (error != 0)
-		return (error);
-	sdp = dvp->v_data;
 
-	mutex_enter(&sdp->sd_lock);
+	cookie = 0;
+	for (;;) {
+		uint64_t id;
+
+		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
+		    snapname, &id, &cookie, NULL);
+		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		if (error != 0) {
+			if (error == ENOENT)
+				error = 0;
+			break;
+		}
 
-	sep = avl_first(&sdp->sd_snaps);
-	while (sep != NULL) {
-		next = AVL_NEXT(&sdp->sd_snaps, sep);
+		for (;;) {
+			error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+			    ZFSCTL_INO_SNAPDIR, id, &vp);
+			if (error != 0 || vp == NULL)
+				break;
 
-		/*
-		 * If this snapshot is not mounted, then it must
-		 * have just been unmounted by somebody else, and
-		 * will be cleaned up by zfsctl_snapdir_inactive().
-		 */
-		if (vn_ismntpt(sep->se_root)) {
-			avl_remove(&sdp->sd_snaps, sep);
-			error = zfsctl_unmount_snap(sep, fflags, cr);
-			if (error) {
-				avl_add(&sdp->sd_snaps, sep);
+			mp = vp->v_mountedhere;
+
+			/*
+			 * v_mountedhere being NULL means that the
+			 * (uncovered) vnode is in a transient state
+			 * (mounting or unmounting), so loop until it
+			 * settles down.
+			 */
+			if (mp != NULL)
 				break;
-			}
+			vput(vp);
 		}
-		sep = next;
+		if (error != 0)
+			break;
+		if (vp == NULL)
+			continue;	/* no mountpoint, nothing to do */
+
+		/*
+		 * The mount-point vnode is kept locked to avoid spurious EBUSY
+		 * from a concurrent umount.
+		 * The vnode lock must have recursive locking enabled.
+		 */
+		vfs_ref(mp);
+		error = dounmount(mp, fflags, curthread);
+		KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1,
+		    ("extra references after unmount"));
+		vput(vp);
+		if (error != 0)
+			break;
 	}
+	KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0,
+	    ("force unmounting failed"));
+	return (error);
+}
 
-	mutex_exit(&sdp->sd_lock);
-	VN_RELE(dvp);
+#endif /* __FreeBSD__ */
 
-	return (error);
+#ifdef __NetBSD__
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_ctldir.h>
+
+boolean_t
+zfsctl_is_node(vnode_t *vp)
+{
+
+	return (B_FALSE);
 }
+#endif /* __NetBSD__ */
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_debug.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_debug.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_debug.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_debug.c	31 Jul 2014 04:37:12 -0000
@@ -0,0 +1,112 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+
+list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size;
+kmutex_t zfs_dbgmsgs_lock;
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+
+void
+zfs_dbgmsg_init(void)
+{
+	list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
+	    offsetof(zfs_dbgmsg_t, zdm_node));
+	mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+	zfs_dbgmsg_t *zdm;
+
+	while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) {
+		int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
+		kmem_free(zdm, size);
+		zfs_dbgmsg_size -= size;
+	}
+	mutex_destroy(&zfs_dbgmsgs_lock);
+	ASSERT0(zfs_dbgmsg_size);
+}
+
+/*
+ * Print these messages by running:
+ * echo ::zfs_dbgmsg | mdb -k
+ *
+ * Monitor these messages by running:
+ * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ *
+ * When used with libzpool, monitor with:
+ * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}'
+ */
+void
+zfs_dbgmsg(const char *fmt, ...)
+{
+	int size;
+	va_list adx;
+	zfs_dbgmsg_t *zdm;
+
+	va_start(adx, fmt);
+	size = vsnprintf(NULL, 0, fmt, adx);
+	va_end(adx);
+
+	/*
+	 * There is one byte of string in sizeof (zfs_dbgmsg_t), used
+	 * for the terminating null.
+	 */
+	zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP);
+	zdm->zdm_timestamp = gethrestime_sec();
+
+	va_start(adx, fmt);
+	(void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx);
+	va_end(adx);
+
+	DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg);
+
+	mutex_enter(&zfs_dbgmsgs_lock);
+	list_insert_tail(&zfs_dbgmsgs, zdm);
+	zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size;
+	while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) {
+		zdm = list_remove_head(&zfs_dbgmsgs);
+		size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg);
+		kmem_free(zdm, size);
+		zfs_dbgmsg_size -= size;
+	}
+	mutex_exit(&zfs_dbgmsgs_lock);
+}
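
The allocation above relies on the size-then-format vsnprintf() idiom:
a first pass with a NULL buffer only measures the formatted length, and
the one byte of string inside the struct covers the terminating NUL.  A
runnable userland sketch (struct msg and msg_printf() are hypothetical
stand-ins for zfs_dbgmsg_t and the kmem_alloc() path):

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct msg {
	char m_text[1];		/* one byte for the terminating NUL */
};

static struct msg *
msg_printf(const char *fmt, ...)
{
	va_list adx;
	struct msg *m;
	int size;

	va_start(adx, fmt);
	size = vsnprintf(NULL, 0, fmt, adx);	/* measure only */
	va_end(adx);

	m = malloc(sizeof (struct msg) + size);	/* NUL already counted */
	va_start(adx, fmt);
	(void) vsnprintf(m->m_text, size + 1, fmt, adx);
	va_end(adx);
	return (m);
}

int
main(void)
{
	struct msg *m = msg_printf("txg %d synced", 42);

	printf("%s\n", m->m_text);
	free(m);
	return (0);
}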
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+	zfs_dbgmsg_t *zdm;
+
+	(void) printf("ZFS_DBGMSG(%s):\n", tag);
+	mutex_enter(&zfs_dbgmsgs_lock);
+	for (zdm = list_head(&zfs_dbgmsgs); zdm;
+	    zdm = list_next(&zfs_dbgmsgs, zdm))
+		(void) printf("%s\n", zdm->zdm_msg);
+	mutex_exit(&zfs_dbgmsgs_lock);
+}
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c,v
retrieving revision 1.10
diff -u -p -r1.10 zfs_dir.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c	9 Apr 2015 05:02:28 -0000	1.10
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c	28 Apr 2017 21:13:09 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -32,10 +32,8 @@
 #include <sys/vfs.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
-#include <sys/mode.h>
 #include <sys/kmem.h>
 #include <sys/uio.h>
-#include <sys/pathname.h>
 #include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/stat.h>
@@ -43,6 +41,11 @@
 #include <sys/sunddi.h>
 #include <sys/random.h>
 #include <sys/policy.h>
+#ifdef __FreeBSD__
+#include <sys/kcondvar.h>
+#include <sys/callb.h>
+#include <sys/smp.h>
+#endif
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/fs/zfs.h>
@@ -51,135 +54,70 @@
 #include <sys/atomic.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
 #include <sys/dnlc.h>
 #include <sys/extdirent.h>
 
 /*
- * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups
+ * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups
  * of names after deciding which is the appropriate lookup interface.
  */
 static int
-zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact,
-    boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+    boolean_t exact, uint64_t *zoid)
 {
 	int error;
 
 	if (zfsvfs->z_norm) {
-		matchtype_t mt = MT_FIRST;
-		boolean_t conflict = B_FALSE;
-		size_t bufsz = 0;
-		char *buf = NULL;
-
-		if (rpnp) {
-			buf = rpnp->pn_buf;
-			bufsz = rpnp->pn_bufsize;
-		}
-		if (exact)
-			mt = MT_EXACT;
+		matchtype_t mt = exact ? MT_EXACT : MT_FIRST;
+
 		/*
 		 * In the non-mixed case we only expect there would ever
 		 * be one match, but we need to use the normalizing lookup.
 		 */
 		error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
-		    zoid, mt, buf, bufsz, &conflict);
-		if (!error && deflags)
-			*deflags = conflict ? ED_CASE_CONFLICT : 0;
+		    zoid, mt, NULL, 0, NULL);
 	} else {
 		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
 	}
 	*zoid = ZFS_DIRENT_OBJ(*zoid);
 
-	if (error == ENOENT && update)
-		dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
-
 	return (error);
 }
 
 /*
- * Reference counting for dirlocks.  Solaris destroys the condvar as
- * soon as it broadcasts, which works for them because cv_wait doesn't
- * need to use the condvar after it is woken, but which is too fast and
- * loose with the abstraction for us in NetBSD.
- */
-
-static int
-zfs_dirlock_hold(zfs_dirlock_t *dl)
-{
-
-	KASSERT(mutex_owned(&dl->dl_dzp->z_lock));
-
-	if (dl->dl_refcnt >= ULONG_MAX) /* XXX Name this constant.  */
-		return (ENFILE);	/* XXX What to do?  */
-
-	dl->dl_refcnt++;
-	return (0);
-}
-
-static void
-zfs_dirlock_rele(zfs_dirlock_t *dl)
-{
-
-	KASSERT(mutex_owned(&dl->dl_dzp->z_lock));
-	KASSERT(dl->dl_refcnt > 0);
-
-	if (--dl->dl_refcnt == 0) {
-		if (dl->dl_namesize != 0)
-			kmem_free(dl->dl_name, dl->dl_namesize);
-		cv_destroy(&dl->dl_cv);
-		kmem_free(dl, sizeof(*dl));
-	}
-}
-
-/*
- * Lock a directory entry.  A dirlock on <dzp, name> protects that name
- * in dzp's directory zap object.  As long as you hold a dirlock, you can
- * assume two things: (1) dzp cannot be reaped, and (2) no other thread
- * can change the zap entry for (i.e. link or unlink) this name.
+ * Look up a directory entry under a locked vnode.
+ * dvp being locked gives us a guarantee that there is no concurrent
+ * modification of the directory and, thus, if a node can be found in
+ * the directory, then it must not be unlinked.
  *
  * Input arguments:
  *	dzp	- znode for directory
  *	name	- name of entry to lock
  *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
  *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
- *		  ZSHARED: allow concurrent access with other ZSHARED callers.
  *		  ZXATTR: we want dzp's xattr directory
- *		  ZCILOOK: On a mixed sensitivity file system,
- *			   this lookup should be case-insensitive.
- *		  ZCIEXACT: On a purely case-insensitive file system,
- *			    this lookup should be case-sensitive.
- *		  ZRENAMING: we are locking for renaming, force narrow locks
- *		  ZHAVELOCK: Don't grab the z_name_lock for this call. The
- *			     current thread already holds it.
  *
  * Output arguments:
  *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
- *	dlpp	- pointer to the dirlock for this entry (NULL on error)
- *      direntflags - (case-insensitive lookup only)
- *		flags if multiple case-sensitive matches exist in directory
- *      realpnp     - (case-insensitive lookup only)
- *		actual name matched within the directory
  *
  * Return value: 0 on success or errno on failure.
  *
  * NOTE: Always checks for, and rejects, '.' and '..'.
- * NOTE: For case-insensitive file systems we take wide locks (see below),
- *	 but return znode pointers to a single match.
  */
 int
-zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
-    int flag, int *direntflags, pathname_t *realpnp)
+zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
 {
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zfs_dirlock_t	*dl;
-	boolean_t	update;
 	boolean_t	exact;
 	uint64_t	zoid;
 	vnode_t		*vp = NULL;
 	int		error = 0;
-	int		cmpflags;
+
+	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
 
 	*zpp = NULL;
-	*dlpp = NULL;
 
 	/*
 	 * Verify that we are not trying to lock '.', '..', or '.zfs'
@@ -187,287 +125,99 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, zn
 	if (name[0] == '.' &&
 	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
 	    zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
-		return (EEXIST);
+		return (SET_ERROR(EEXIST));
 
 	/*
 	 * Case sensitivity and normalization preferences are set when
 	 * the file system is created.  These are stored in the
 	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
-	 * affect what vnodes can be cached in the DNLC, how we
-	 * perform zap lookups, and the "width" of our dirlocks.
+	 * affect how we perform zap lookups.
 	 *
-	 * A normal dirlock locks a single name.  Note that with
-	 * normalization a name can be composed multiple ways, but
-	 * when normalized, these names all compare equal.  A wide
-	 * dirlock locks multiple names.  We need these when the file
-	 * system is supporting mixed-mode access.  It is sometimes
-	 * necessary to lock all case permutations of file name at
-	 * once so that simultaneous case-insensitive/case-sensitive
-	 * behaves as rationally as possible.
-	 */
-
-	/*
 	 * Decide if exact matches should be requested when performing
 	 * a zap lookup on file systems supporting case-insensitive
 	 * access.
-	 */
-	exact =
-	    ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) ||
-	    ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK));
-
-	/*
-	 * Only look in or update the DNLC if we are looking for the
-	 * name on a file system that does not require normalization
-	 * or case folding.  We can also look there if we happen to be
-	 * on a non-normalizing, mixed sensitivity file system IF we
-	 * are looking for the exact name.
 	 *
-	 * Maybe can add TO-UPPERed version of name to dnlc in ci-only
-	 * case for performance improvement?
+	 * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
+	 * because in that case MT_EXACT and MT_FIRST should produce exactly
+	 * the same result.
 	 */
-	update = !zfsvfs->z_norm ||
-	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
-	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
+	exact = zfsvfs->z_case == ZFS_CASE_MIXED;
 
-	/*
-	 * ZRENAMING indicates we are in a situation where we should
-	 * take narrow locks regardless of the file system's
-	 * preferences for normalizing and case folding.  This will
-	 * prevent us deadlocking trying to grab the same wide lock
-	 * twice if the two names happen to be case-insensitive
-	 * matches.
-	 */
-	if (flag & ZRENAMING)
-		cmpflags = 0;
-	else
-		cmpflags = zfsvfs->z_norm;
-
-	/*
-	 * Wait until there are no locks on this name.
-	 *
-	 * Don't grab the the lock if it is already held. However, cannot
-	 * have both ZSHARED and ZHAVELOCK together.
-	 */
-	ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
-	if (!(flag & ZHAVELOCK))
-		rw_enter(&dzp->z_name_lock, RW_READER);
-
-	mutex_enter(&dzp->z_lock);
-	for (;;) {
-		if (dzp->z_unlinked) {
-			mutex_exit(&dzp->z_lock);
-			if (!(flag & ZHAVELOCK))
-				rw_exit(&dzp->z_name_lock);
-			return (ENOENT);
-		}
-		for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
-			if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
-			    U8_UNICODE_LATEST, &error) == 0) || error != 0)
-				break;
-		}
-		if (error != 0) {
-			mutex_exit(&dzp->z_lock);
-			if (!(flag & ZHAVELOCK))
-				rw_exit(&dzp->z_name_lock);
-			return (ENOENT);
-		}
-		if (dl == NULL)	{
-			/*
-			 * Allocate a new dirlock and add it to the list.
-			 */
-			dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
-			cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
-			dl->dl_name = name;
-			dl->dl_sharecnt = 0;
-			dl->dl_namelock = 0;
-			dl->dl_namesize = 0;
-			dl->dl_refcnt = 1;
-			dl->dl_dzp = dzp;
-			dl->dl_next = dzp->z_dirlocks;
-			dzp->z_dirlocks = dl;
-			break;
-		}
-		if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
-			break;
-		error = zfs_dirlock_hold(dl);
-		if (error) {
-			mutex_exit(&dzp->z_lock);
-			if (!(flag & ZHAVELOCK))
-				rw_exit(&dzp->z_name_lock);
-			return (error);
-		}
-		cv_wait(&dl->dl_cv, &dzp->z_lock);
-		zfs_dirlock_rele(dl);
-	}
-
-	/*
-	 * If the z_name_lock was NOT held for this dirlock record it.
-	 */
-	if (flag & ZHAVELOCK)
-		dl->dl_namelock = 1;
-
-	if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
-		/*
-		 * We're the second shared reference to dl.  Make a copy of
-		 * dl_name in case the first thread goes away before we do.
-		 * Note that we initialize the new name before storing its
-		 * pointer into dl_name, because the first thread may load
-		 * dl->dl_name at any time.  He'll either see the old value,
-		 * which is his, or the new shared copy; either is OK.
-		 */
-		dl->dl_namesize = strlen(dl->dl_name) + 1;
-		name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
-		bcopy(dl->dl_name, name, dl->dl_namesize);
-		dl->dl_name = name;
-	}
-
-	mutex_exit(&dzp->z_lock);
-
-	/*
-	 * We have a dirlock on the name.  (Note that it is the dirlock,
-	 * not the dzp's z_lock, that protects the name in the zap object.)
-	 * See if there's an object by this name; if so, put a hold on it.
-	 */
+	if (dzp->z_unlinked && !(flag & ZXATTR))
+		return (ENOENT);
 	if (flag & ZXATTR) {
-		zoid = dzp->z_phys->zp_xattr;
-		error = (zoid == 0 ? ENOENT : 0);
+		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+		    sizeof (zoid));
+		if (error == 0)
+			error = (zoid == 0 ? ENOENT : 0);
 	} else {
-		if (update)
-			vp = dnlc_lookup(ZTOV(dzp), name);
-		if (vp == DNLC_NO_VNODE) {
-			VN_RELE(vp);
-			error = ENOENT;
-		} else if (vp) {
-			if (flag & ZNEW) {
-				zfs_dirent_unlock(dl);
-				VN_RELE(vp);
-				return (EEXIST);
-			}
-			*dlpp = dl;
-			*zpp = VTOZ(vp);
-			return (0);
-		} else {
-			error = zfs_match_find(zfsvfs, dzp, name, exact,
-			    update, direntflags, realpnp, &zoid);
-		}
+		error = zfs_match_find(zfsvfs, dzp, name, exact, &zoid);
 	}
 	if (error) {
 		if (error != ENOENT || (flag & ZEXISTS)) {
-			zfs_dirent_unlock(dl);
 			return (error);
 		}
 	} else {
 		if (flag & ZNEW) {
-			zfs_dirent_unlock(dl);
-			return (EEXIST);
+			return (SET_ERROR(EEXIST));
 		}
 		error = zfs_zget(zfsvfs, zoid, zpp);
-		if (error) {
-			zfs_dirent_unlock(dl);
+		if (error)
 			return (error);
-		}
-		if (!(flag & ZXATTR) && update)
-			dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
+		ASSERT(!(*zpp)->z_unlinked);
 	}
 
-	*dlpp = dl;
-
 	return (0);
 }
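
A condensed sketch of the locking contract described in the comment
above zfs_dirent_lookup(): the caller, not the function, holds the
directory vnode lock (the surrounding caller code here is hypothetical):

	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
	error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
	if (error == 0) {
		/* zp names an existing, not-unlinked entry; use it, */
		/* then release the reference obtained via zfs_zget(). */
	}
	VOP_UNLOCK(ZTOV(dzp), 0);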
 
-/*
- * Unlock this directory entry and wake anyone who was waiting for it.
- */
-void
-zfs_dirent_unlock(zfs_dirlock_t *dl)
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
 {
-	znode_t *dzp = dl->dl_dzp;
-	zfs_dirlock_t **prev_dl, *cur_dl;
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	znode_t *zp;
+	uint64_t parent;
+	int error;
 
-	mutex_enter(&dzp->z_lock);
+	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
 
-	if (!dl->dl_namelock)
-		rw_exit(&dzp->z_name_lock);
-
-	if (dl->dl_sharecnt > 1) {
-		dl->dl_sharecnt--;
-		mutex_exit(&dzp->z_lock);
-		return;
-	}
-	prev_dl = &dzp->z_dirlocks;
-	while ((cur_dl = *prev_dl) != dl)
-		prev_dl = &cur_dl->dl_next;
-	*prev_dl = dl->dl_next;
-	cv_broadcast(&dl->dl_cv);
-	zfs_dirlock_rele(dl);
-	mutex_exit(&dzp->z_lock);
+	if (dzp->z_unlinked)
+		return (ENOENT);
+
+	if ((error = sa_lookup(dzp->z_sa_hdl,
+	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+		return (error);
+
+	error = zfs_zget(zfsvfs, parent, &zp);
+	if (error == 0)
+		*zpp = zp;
+	return (error);
 }
 
-/*
- * Look up an entry in a directory.
- *
- * NOTE: '.' and '..' are handled as special cases because
- *	no directory entries are actually stored for them.  If this is
- *	the root of a filesystem, then '.zfs' is also treated as a
- *	special pseudo-directory.
- */
 int
-zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
-    int *deflg, pathname_t *rpnp)
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
 {
-	zfs_dirlock_t *dl;
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
 	znode_t *zp;
 	int error = 0;
 
+	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+
+	if (dzp->z_unlinked)
+		return (SET_ERROR(ENOENT));
+
 	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
-		*vpp = ZTOV(dzp);
-		VN_HOLD(*vpp);
+		*zpp = dzp;
 	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
-		zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
-		/*
-		 * If we are a snapshot mounted under .zfs, return
-		 * the vp for the snapshot directory.
-		 */
-		if (dzp->z_phys->zp_parent == dzp->z_id &&
-		    zfsvfs->z_parent != zfsvfs) {
-			error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
-			    "snapshot", vpp, NULL, 0, NULL, kcred,
-			    NULL, NULL, NULL);
-			return (error);
-		}
-		rw_enter(&dzp->z_parent_lock, RW_READER);
-		mutex_enter(&dzp->z_lock);
-		if (dzp->z_phys->zp_links == 0) {
-			/* Directory has been rmdir'd.  */
-			error = ENOENT;
-		} else {
-			error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp);
-			if (error == 0)
-				*vpp = ZTOV(zp);
-		}
-		mutex_exit(&dzp->z_lock);
-		rw_exit(&dzp->z_parent_lock);
-	} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
-		*vpp = zfsctl_root(dzp);
+		error = zfs_dd_lookup(dzp, zpp);
 	} else {
-		int zf;
-
-		zf = ZEXISTS | ZSHARED;
-		if (flags & FIGNORECASE)
-			zf |= ZCILOOK;
-
-		error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
+		error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
 		if (error == 0) {
-			*vpp = ZTOV(zp);
-			zfs_dirent_unlock(dl);
 			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+			*zpp = zp;
 		}
-		rpnp = NULL;
 	}
-
-	if ((flags & FIGNORECASE) && rpnp && !error)
-		(void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
-
 	return (error);
 }
 
@@ -491,7 +241,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
 	ASSERT(zp->z_unlinked);
-	ASSERT3U(zp->z_phys->zp_links, ==, 0);
+	ASSERT(zp->z_links == 0);
 
 	VERIFY3U(0, ==,
 	    zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
@@ -543,8 +293,9 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs)
 		if (error != 0)
 			continue;
 
+		vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
 		zp->z_unlinked = B_TRUE;
-		VN_RELE(ZTOV(zp));
+		vput(ZTOV(zp));
 	}
 	zap_cursor_fini(&zc);
 }
@@ -568,7 +319,6 @@ zfs_purgedir(znode_t *dzp)
 	znode_t		*xzp;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
-	zfs_dirlock_t	dl;
 	int skipped = 0;
 	int error;
 
@@ -582,31 +332,32 @@ zfs_purgedir(znode_t *dzp)
 			continue;
 		}
 
+		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
 		ASSERT((ZTOV(xzp)->v_type == VREG) ||
 		    (ZTOV(xzp)->v_type == VLNK));
 
 		tx = dmu_tx_create(zfsvfs->z_os);
-		dmu_tx_hold_bonus(tx, dzp->z_id);
+		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
-		dmu_tx_hold_bonus(tx, xzp->z_id);
+		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+		/* Is this really needed? */
+		zfs_sa_upgrade_txholds(tx, xzp);
+		dmu_tx_mark_netfree(tx);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
-			VN_RELE(ZTOV(xzp));
+			vput(ZTOV(xzp));
 			skipped += 1;
 			continue;
 		}
-		bzero(&dl, sizeof (dl));
-		dl.dl_dzp = dzp;
-		dl.dl_name = zap.za_name;
 
-		error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+		error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
 		if (error)
 			skipped += 1;
 		dmu_tx_commit(tx);
 
-		VN_RELE(ZTOV(xzp));
+		vput(ZTOV(xzp));
 	}
 	zap_cursor_fini(&zc);
 	if (error != ENOENT)
@@ -622,15 +373,19 @@ zfs_rmnode(znode_t *zp)
 	znode_t		*xzp = NULL;
 	dmu_tx_t	*tx;
 	uint64_t	acl_obj;
+	uint64_t	xattr_obj;
 	int		error;
 
-	ASSERT(ZTOV(zp)->v_count == 0);
-	ASSERT(zp->z_phys->zp_links == 0);
+	ASSERT(zp->z_links == 0);
+#ifndef __NetBSD__
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+#endif
 
 	/*
 	 * If this is an attribute directory, purge its contents.
 	 */
-	if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) {
+	if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
+	    (zp->z_pflags & ZFS_XATTR)) {
 		if (zfs_purgedir(zp) != 0) {
 			/*
 			 * Not enough space to delete some xattrs.
@@ -640,31 +395,40 @@ zfs_rmnode(znode_t *zp)
 			zfs_znode_free(zp);
 			return;
 		}
-	}
-
-	/*
-	 * Free up all the data in the file.
-	 */
-	error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
-	if (error) {
+	} else {
 		/*
-		 * Not enough space.  Leave the file in the unlinked set.
+		 * Free up all the data in the file.  We don't do this for
+		 * XATTR directories because we need truncate and remove to be
+		 * in the same tx, like in zfs_znode_delete(). Otherwise, if
+		 * we crash here we'll end up with an inconsistent truncated
+		 * zap object in the delete queue.  Note a truncated file is
+		 * harmless since it only contains user data.
 		 */
-		zfs_znode_dmu_fini(zp);
-		zfs_znode_free(zp);
-		return;
+		error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+		if (error) {
+			/*
+			 * Not enough space.  Leave the file in the unlinked
+			 * set.
+			 */
+			zfs_znode_dmu_fini(zp);
+			zfs_znode_free(zp);
+			return;
+		}
 	}
 
 	/*
 	 * If the file has extended attributes, we're going to unlink
 	 * the xattr dir.
 	 */
-	if (zp->z_phys->zp_xattr) {
-		error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
-		ASSERT(error == 0);
+	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+	    &xattr_obj, sizeof (xattr_obj));
+	if (error == 0 && xattr_obj) {
+		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+		ASSERT3S(error, ==, 0);
+		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
 	}
 
-	acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+	acl_obj = zfs_external_acl(zp);
 
 	/*
 	 * Set up the final transaction.
@@ -673,11 +437,13 @@ zfs_rmnode(znode_t *zp)
 	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	if (xzp) {
-		dmu_tx_hold_bonus(tx, xzp->z_id);
 		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 	}
 	if (acl_obj)
 		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+	zfs_sa_upgrade_txholds(tx, zp);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		/*
@@ -692,11 +458,11 @@ zfs_rmnode(znode_t *zp)
 	}
 
 	if (xzp) {
-		dmu_buf_will_dirty(xzp->z_dbuf, tx);
-		mutex_enter(&xzp->z_lock);
+		ASSERT(error == 0);
 		xzp->z_unlinked = B_TRUE;	/* mark xzp for deletion */
-		xzp->z_phys->zp_links = 0;	/* no more links to it */
-		mutex_exit(&xzp->z_lock);
+		xzp->z_links = 0;	/* no more links to it */
+		VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+		    &xzp->z_links, sizeof (xzp->z_links), tx));
 		zfs_unlinked_add(xzp, tx);
 	}
 
@@ -709,140 +475,211 @@ zfs_rmnode(znode_t *zp)
 	dmu_tx_commit(tx);
 out:
 	if (xzp)
-		VN_RELE(ZTOV(xzp));
+		vput(ZTOV(xzp));
 }
 
 static uint64_t
-zfs_dirent(znode_t *zp)
+zfs_dirent(znode_t *zp, uint64_t mode)
 {
 	uint64_t de = zp->z_id;
+
 	if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
-		de |= IFTODT((zp)->z_phys->zp_mode) << 60;
+		de |= IFTODT(mode) << 60;
 	return (de);
 }
 
 /*
- * Link zp into dl.  Can only fail if zp has been unlinked.
+ * Link zp into dzp.  Can only fail if zp has been unlinked.
  */
 int
-zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag)
 {
-	znode_t *dzp = dl->dl_dzp;
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	vnode_t *vp = ZTOV(zp);
 	uint64_t value;
 	int zp_is_dir = (vp->v_type == VDIR);
+	sa_bulk_attr_t bulk[5];
+	uint64_t mtime[2], ctime[2];
+	int count = 0;
 	int error;
 
-	dmu_buf_will_dirty(zp->z_dbuf, tx);
-	mutex_enter(&zp->z_lock);
-
+	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+#if 0
+	if (zp_is_dir) {
+		error = 0;
+		if (dzp->z_links >= LINK_MAX)
+			error = SET_ERROR(EMLINK);
+		return (error);
+	}
+#endif
 	if (!(flag & ZRENAMING)) {
 		if (zp->z_unlinked) {	/* no new links to unlinked zp */
 			ASSERT(!(flag & (ZNEW | ZEXISTS)));
-			mutex_exit(&zp->z_lock);
-			return (ENOENT);
+			return (SET_ERROR(ENOENT));
 		}
-		zp->z_phys->zp_links++;
-	}
-	zp->z_phys->zp_parent = dzp->z_id;	/* dzp is now zp's parent */
+#if 0
+		if (zp->z_links >= LINK_MAX) {
+			return (SET_ERROR(EMLINK));
+		}
+#endif
+		zp->z_links++;
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+		    &zp->z_links, sizeof (zp->z_links));
 
-	if (!(flag & ZNEW))
-		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
-	mutex_exit(&zp->z_lock);
-
-	dmu_buf_will_dirty(dzp->z_dbuf, tx);
-	mutex_enter(&dzp->z_lock);
-	dzp->z_phys->zp_size++;			/* one dirent added */
-	dzp->z_phys->zp_links += zp_is_dir;	/* ".." link from zp */
-	zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
-	mutex_exit(&dzp->z_lock);
+	} else {
+		ASSERT(zp->z_unlinked == 0);
+	}
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+	    &dzp->z_id, sizeof (dzp->z_id));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+
+	if (!(flag & ZNEW)) {
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    ctime, sizeof (ctime));
+		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+		    ctime, B_TRUE);
+	}
+	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+	ASSERT0(error);
+
+	dzp->z_size++;
+	dzp->z_links += zp_is_dir;
+	count = 0;
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &dzp->z_size, sizeof (dzp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+	    &dzp->z_links, sizeof (dzp->z_links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+	    mtime, sizeof (mtime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+	    ctime, sizeof (ctime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &dzp->z_pflags, sizeof (dzp->z_pflags));
+	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+	ASSERT0(error);
 
-	value = zfs_dirent(zp);
-	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
+	value = zfs_dirent(zp, zp->z_mode);
+	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
 	    8, 1, &value, tx);
-	ASSERT(error == 0);
-
-	dnlc_update(ZTOV(dzp), dl->dl_name, vp);
+	VERIFY0(error);
 
 	return (0);
 }
 
+static int
+zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag)
+{
+	int error;
+
+	if (zp->z_zfsvfs->z_norm) {
+		if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED)
+			error = zap_remove_norm(zp->z_zfsvfs->z_os,
+			    dzp->z_id, name, MT_EXACT, tx);
+		else
+			error = zap_remove_norm(zp->z_zfsvfs->z_os,
+			    dzp->z_id, name, MT_FIRST, tx);
+	} else {
+		error = zap_remove(zp->z_zfsvfs->z_os,
+		    dzp->z_id, name, tx);
+	}
+
+	return (error);
+}
+
 /*
- * Unlink zp from dl, and mark zp for deletion if this was the last link.
+ * Unlink zp from dzp, and mark zp for deletion if this was the last link.
  * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
  * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
  * If it's non-NULL, we use it to indicate whether the znode needs deletion,
  * and it's the caller's job to do it.
  */
 int
-zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
-	boolean_t *unlinkedp)
+zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag, boolean_t *unlinkedp)
 {
-	znode_t *dzp = dl->dl_dzp;
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
 	vnode_t *vp = ZTOV(zp);
 	int zp_is_dir = (vp->v_type == VDIR);
 	boolean_t unlinked = B_FALSE;
+	sa_bulk_attr_t bulk[5];
+	uint64_t mtime[2], ctime[2];
+	int count = 0;
 	int error;
 
-	dnlc_remove(ZTOV(dzp), dl->dl_name);
+	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 
 	if (!(flag & ZRENAMING)) {
-		dmu_buf_will_dirty(zp->z_dbuf, tx);
 
-		if (vn_vfswlock(vp))		/* prevent new mounts on zp */
-			return (EBUSY);
-
-		if (vn_ismntpt(vp)) {		/* don't remove mount point */
-			vn_vfsunlock(vp);
-			return (EBUSY);
+		if (zp_is_dir && !zfs_dirempty(zp)) {
+#ifdef illumos
+			return (SET_ERROR(EEXIST));
+#else
+			return (SET_ERROR(ENOTEMPTY));
+#endif
 		}
 
-		mutex_enter(&zp->z_lock);
-		if (zp_is_dir && !zfs_dirempty(zp)) {	/* dir not empty */
-			mutex_exit(&zp->z_lock);
-			vn_vfsunlock(vp);
-			return (ENOTEMPTY);
+		/*
+		 * If we get here, we are going to try to remove the object.
+		 * First try removing the name from the directory; if that
+		 * fails, return the error.
+		 */
+		error = zfs_dropname(dzp, name, zp, tx, flag);
+		if (error != 0) {
+			return (error);
 		}
-		if (zp->z_phys->zp_links <= zp_is_dir) {
+
+		if (zp->z_links <= zp_is_dir) {
 			zfs_panic_recover("zfs: link count on vnode %p is %u, "
-				"should be at least %u",
-				zp->z_vnode, (int)zp->z_phys->zp_links,
-		                zp_is_dir + 1);
-			zp->z_phys->zp_links = zp_is_dir + 1;
+			    "should be at least %u", zp->z_vnode,
+			    (int)zp->z_links,
+			    zp_is_dir + 1);
+			zp->z_links = zp_is_dir + 1;
 		}
-		if (--zp->z_phys->zp_links == zp_is_dir) {
+		if (--zp->z_links == zp_is_dir) {
 			zp->z_unlinked = B_TRUE;
-			zp->z_phys->zp_links = 0;
+			zp->z_links = 0;
 			unlinked = B_TRUE;
 		} else {
-			zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
-		}
-		mutex_exit(&zp->z_lock);
-		vn_vfsunlock(vp);
-	}
-
-	dmu_buf_will_dirty(dzp->z_dbuf, tx);
-	mutex_enter(&dzp->z_lock);
-	dzp->z_phys->zp_size--;			/* one dirent removed */
-	dzp->z_phys->zp_links -= zp_is_dir;	/* ".." link from zp */
-	zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx);
-	mutex_exit(&dzp->z_lock);
-
-	if (zp->z_zfsvfs->z_norm) {
-		if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
-		    (flag & ZCIEXACT)) ||
-		    ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
-		    !(flag & ZCILOOK)))
-			error = zap_remove_norm(zp->z_zfsvfs->z_os,
-			    dzp->z_id, dl->dl_name, MT_EXACT, tx);
-		else
-			error = zap_remove_norm(zp->z_zfsvfs->z_os,
-			    dzp->z_id, dl->dl_name, MT_FIRST, tx);
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+			    NULL, &ctime, sizeof (ctime));
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+			    NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+			zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
+			    B_TRUE);
+		}
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+		    NULL, &zp->z_links, sizeof (zp->z_links));
+		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		count = 0;
+		ASSERT0(error);
 	} else {
-		error = zap_remove(zp->z_zfsvfs->z_os,
-		    dzp->z_id, dl->dl_name, tx);
+		ASSERT(zp->z_unlinked == 0);
+		error = zfs_dropname(dzp, name, zp, tx, flag);
+		if (error != 0)
+			return (error);
 	}
-	ASSERT(error == 0);
+
+	dzp->z_size--;		/* one dirent removed */
+	dzp->z_links -= zp_is_dir;	/* ".." link from zp */
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+	    NULL, &dzp->z_links, sizeof (dzp->z_links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+	    NULL, &dzp->z_size, sizeof (dzp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+	    NULL, ctime, sizeof (ctime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+	    NULL, mtime, sizeof (mtime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+	ASSERT0(error);
 
 	if (unlinkedp != NULL)
 		*unlinkedp = unlinked;
@@ -853,14 +690,12 @@ zfs_link_destroy(zfs_dirlock_t *dl, znod
 }
 
 /*
- * Indicate whether the directory is empty.  Works with or without z_lock
- * held, but can only be consider a hint in the latter case.  Returns true
- * if only "." and ".." remain and there's no work in progress.
+ * Indicate whether the directory is empty.
  */
 boolean_t
 zfs_dirempty(znode_t *dzp)
 {
-	return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0);
+	return (dzp->z_size == 2);
 }
 
 int
@@ -872,42 +707,56 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *
 	int error;
 	zfs_acl_ids_t acl_ids;
 	boolean_t fuid_dirtied;
+	uint64_t parent;
 
 	*xvpp = NULL;
 
+	/*
+	 * In FreeBSD, access checking for creating an EA is done
+	 * in zfs_setextattr().
+	 */
+#ifndef __FreeBSD_kernel__
 	if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))
 		return (error);
+#endif
 
 	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
 	    &acl_ids)) != 0)
 		return (error);
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
-		return (EDQUOT);
+		return (SET_ERROR(EDQUOT));
 	}
 
+	getnewvnode_reserve(1);
+
 	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_bonus(tx, zp->z_id);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
-		if (error == ERESTART)
-			dmu_tx_wait(tx);
 		dmu_tx_abort(tx);
 		return (error);
 	}
-	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids);
+	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
-	ASSERT(xzp->z_phys->zp_parent == zp->z_id);
-	dmu_buf_will_dirty(zp->z_dbuf, tx);
-	zp->z_phys->zp_xattr = xzp->z_id;
+#ifdef DEBUG
+	error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (parent));
+	ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+	    sizeof (xzp->z_id), tx));
 
 	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
 	    xzp, "", NULL, acl_ids.z_fuidp, vap);
@@ -915,6 +764,8 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *
 	zfs_acl_ids_free(&acl_ids);
 	dmu_tx_commit(tx);
 
+	getnewvnode_drop_reserve();
+
 	*xvpp = ZTOV(xzp);
 
 	return (0);
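For contrast with the TXG_WAIT call now used in zfs_make_xattrdir(), the TXG_NOWAIT/ERESTART idiom that the old code implemented (and that zfs_get_xattrdir() still anticipates with its ERESTART check) looks roughly like the sketch below; assign_with_retry() is a hypothetical helper, and the holds shown are illustrative:

/* Sketch of the retired TXG_NOWAIT retry idiom, for contrast. */
static int
assign_with_retry(zfsvfs_t *zfsvfs, znode_t *zp, dmu_tx_t **txp)
{
	dmu_tx_t *tx;
	int error;
top:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error != 0) {
		if (error == ERESTART) {
			dmu_tx_wait(tx);	/* txg was full; wait it out */
			dmu_tx_abort(tx);
			goto top;		/* rebuild holds, try again */
		}
		dmu_tx_abort(tx);
		return (error);
	}
	*txp = tx;
	return (0);
}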
@@ -938,30 +789,29 @@ zfs_get_xattrdir(znode_t *zp, vnode_t **
 {
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	znode_t		*xzp;
-	zfs_dirlock_t	*dl;
 	vattr_t		va;
 	int		error;
 top:
-	error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
+	error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
 	if (error)
 		return (error);
 
 	if (xzp != NULL) {
 		*xvpp = ZTOV(xzp);
-		zfs_dirent_unlock(dl);
 		return (0);
 	}
 
-	ASSERT(zp->z_phys->zp_xattr == 0);
 
 	if (!(flags & CREATE_XATTR_DIR)) {
-		zfs_dirent_unlock(dl);
-		return (ENOENT);
+#ifdef illumos
+		return (SET_ERROR(ENOENT));
+#else
+		return (SET_ERROR(ENOATTR));
+#endif
 	}
 
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
-		zfs_dirent_unlock(dl);
-		return (EROFS);
+		return (SET_ERROR(EROFS));
 	}
 
 	/*
@@ -980,12 +830,13 @@ top:
 	zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
 
 	error = zfs_make_xattrdir(zp, &va, xvpp, cr);
-	zfs_dirent_unlock(dl);
 
 	if (error == ERESTART) {
 		/* NB: we already did dmu_tx_wait() if necessary */
 		goto top;
 	}
+	if (error == 0)
+		VOP_UNLOCK(*xvpp, 0);
 
 	return (error);
 }
@@ -1014,16 +865,16 @@ zfs_sticky_remove_access(znode_t *zdp, z
 	if (zdp->z_zfsvfs->z_replay)
 		return (0);
 
-	if ((zdp->z_phys->zp_mode & S_ISVTX) == 0)
+	if ((zdp->z_mode & S_ISVTX) == 0)
 		return (0);
 
-	downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER);
-	fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER);
+	downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
+	fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
 
 	if ((uid = crgetuid(cr)) == downer || uid == fowner ||
 	    (ZTOV(zp)->v_type == VREG &&
 	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
 		return (0);
 	else
-		return (secpolicy_vnode_remove(cr));
+		return (secpolicy_vnode_remove(ZTOV(zp), cr));
 }
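Across the zfs_dir.c rewrite, direct z_phys stores are replaced by the system-attribute (SA) idiom: update the cached field on the znode, queue each changed attribute with SA_ADD_BULK_ATTR(), and flush the batch with one sa_bulk_update() inside the tx. Reduced to a single attribute, the pattern from zfs_link_create() is:

/* Sketch: the SA bulk-update idiom above, reduced to one attribute. */
static void
bump_links(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	sa_bulk_attr_t bulk[1];
	int count = 0;

	zp->z_links++;			/* in-core copy first */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &zp->z_links, sizeof (zp->z_links));
	VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
}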
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_fm.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c	27 Feb 2010 22:31:21 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c	12 Jun 2012 05:57:28 -0000
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev.h>
@@ -709,6 +713,10 @@ zfs_ereport_start_checksum(spa_t *spa, v
 
 	if (report->zcr_ereport == NULL) {
 		report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo);
+		if (report->zcr_ckinfo != NULL) {
+			kmem_free(report->zcr_ckinfo,
+			    sizeof (*report->zcr_ckinfo));
+		}
 		kmem_free(report, sizeof (*report));
 		return;
 	}
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c,v
retrieving revision 1.3
diff -u -p -r1.3 zfs_fuid.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c	27 Feb 2010 23:43:53 -0000	1.3
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c	18 Apr 2017 00:53:30 -0000
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -376,7 +375,7 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, u
 
 	rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
 
-	if (zfsvfs->z_fuid_obj)
+	if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty)
 		domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx);
 	else
 		domain = nulldomain;
@@ -389,10 +388,8 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, u
 void
 zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
 {
-	*uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid,
-	    cr, ZFS_OWNER);
-	*gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid,
-	    cr, ZFS_GROUP);
+	*uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+	*gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP);
 }
 
 uid_t
@@ -409,6 +406,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64
 	domain = zfs_fuid_find_by_idx(zfsvfs, index);
 	ASSERT(domain != NULL);
 
+#ifdef illumos
 	if (type == ZFS_OWNER || type == ZFS_ACE_USER) {
 		(void) kidmap_getuidbysid(crgetzone(cr), domain,
 		    FUID_RID(fuid), &id);
@@ -416,6 +414,9 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64
 		(void) kidmap_getgidbysid(crgetzone(cr), domain,
 		    FUID_RID(fuid), &id);
 	}
+#else
+	id = UID_NOBODY;
+#endif
 	return (id);
 }
 
@@ -487,6 +488,11 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuid
 
 /*
  * Create a file system FUID, based on information in the users cred
+ *
+ * If cred contains KSID_OWNER then it should be used to determine
+ * the uid otherwise cred's uid will be used. By default cred's gid
+ * is used unless it's an ephemeral ID in which case KSID_GROUP will
+ * be used if it exists.
  */
 uint64_t
 zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
@@ -501,32 +507,28 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, z
 
 	VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
 
-	if (type == ZFS_OWNER)
-		id = crgetuid(cr);
-	else
-		id = crgetgid(cr);
-	
-#ifdef PORT_SOLARIS	
 	ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
-	if (ksid) {
-		id = ksid_getid(ksid);
-	} else {
-		if (type == ZFS_OWNER)
-			id = crgetuid(cr);
-		else
-			id = crgetgid(cr);
 
-		if (IS_EPHEMERAL(id)) {
-			return ((uint64_t)(type == ZFS_OWNER ?
-			    UID_NOBODY : GID_NOBODY));
-		}
+	if (!zfsvfs->z_use_fuids || (ksid == NULL)) {
+		id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);
+
+		if (IS_EPHEMERAL(id))
+			return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY);
+
+		return ((uint64_t)id);
 	}
-#endif
 
-	if (!zfsvfs->z_use_fuids || (!IS_EPHEMERAL(id)))
+	/*
+	 * ksid is present and FUID is supported
+	 */
+	id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr);
+
+	if (!IS_EPHEMERAL(id))
 		return ((uint64_t)id);
 
-#ifdef PORT_SOLARIS
+	if (type == ZFS_GROUP)
+		id = ksid_getid(ksid);
+
 	rid = ksid_getrid(ksid);
 	domain = ksid_getdomain(ksid);
 
@@ -535,9 +537,6 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, z
 	zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);
 
 	return (FUID_ENCODE(idx, rid));
-#else
-	panic(__func__);
-#endif	
 }
 
 /*
@@ -561,9 +560,9 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64
 	uint32_t fuid_idx = FUID_INDEX(id);
 	uint32_t rid;
 	idmap_stat status;
-	uint64_t idx;
+	uint64_t idx = 0;
 	zfs_fuid_t *zfuid = NULL;
-	zfs_fuid_info_t *fuidp;
+	zfs_fuid_info_t *fuidp = NULL;
 
 	/*
 	 * If POSIX ID, or entry is already a FUID then
@@ -585,10 +584,11 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64
 		 * This is most likely a result of idmap service
 		 * not being available.
 		 */
-		/* XXX NetBSD we need to define UID_NOBODY in
-		   kernel sources otherwise */
 		if (fuidp == NULL)
-			return (crgetuid(cr));
+			return (UID_NOBODY);
+
+		VERIFY3U(type, >=, ZFS_OWNER);
+		VERIFY3U(type, <=, ZFS_ACE_GROUP);
 
 		switch (type) {
 		case ZFS_ACE_USER:
@@ -606,9 +606,8 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64
 			idx = FUID_INDEX(fuidp->z_fuid_group);
 			break;
 		};
-		domain = fuidp->z_domain_table[idx -1];
+		domain = fuidp->z_domain_table[idx - 1];
 	} else {
-#ifdef PORT_SOLARIS
 		if (type == ZFS_OWNER || type == ZFS_ACE_USER)
 			status = kidmap_getsidbyuid(crgetzone(cr), id,
 			    &domain, &rid);
@@ -625,9 +624,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64
 			rid = UID_NOBODY;
 			domain = nulldomain;
 		}
-#else
-		panic(__func__);
-#endif
 	}
 
 	idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
@@ -707,12 +703,13 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuid
 boolean_t
 zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
 {
+#ifdef illumos
 	ksid_t		*ksid = crgetsid(cr, KSID_GROUP);
+	ksidlist_t	*ksidlist = crgetsidlist(cr);
+#endif
 	uid_t		gid;
 
-#ifdef PORT_SOLARIS
-	ksidlist_t	*ksidlist = crgetsidlist(cr);
-		
+#ifdef illumos
 	if (ksid && ksidlist) {
 		int 		i;
 		ksid_t		*ksid_groups;
@@ -744,7 +741,8 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64
 			}
 		}
 	}
-#endif
+#endif	/* illumos */
+
 	/*
 	 * Not found in ksidlist, check posix groups
 	 */
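The helpers used above (FUID_ENCODE(), FUID_INDEX(), FUID_RID()) treat a file-system user ID as one 64-bit value: the upper half indexes the per-pool domain table and the lower half holds the Windows RID. A sketch of that layout, assuming the 32/32 split from the upstream zfs_fuid.h; the EX_ macros are stand-ins, not the real definitions:

/* Assumed 32/32 layout; mirrors FUID_ENCODE/FUID_INDEX/FUID_RID. */
#define	EX_FUID_ENCODE(idx, rid)	(((uint64_t)(idx) << 32) | (rid))
#define	EX_FUID_INDEX(fuid)		((uint32_t)((fuid) >> 32))
#define	EX_FUID_RID(fuid)		((uint32_t)(fuid))

/*
 * EX_FUID_INDEX(id) == 0 means id is an ordinary uid_t/gid_t, which is
 * why zfs_fuid_create() can return plain POSIX IDs unchanged.
 */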
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c,v
retrieving revision 1.10
diff -u -p -r1.10 zfs_ioctl.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c	10 Apr 2015 20:55:38 -0000	1.10
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c	13 Sep 2017 07:55:04 -0000
@@ -18,18 +18,139 @@
  *
  * CDDL HEADER END
  */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright 2014 Xin Li <delphij@FreeBSD.org>. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ */
+
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * ZFS ioctls.
+ *
+ * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage
+ * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool.
+ *
+ * There are two ways that we handle ioctls: the legacy way where almost
+ * all of the logic is in the ioctl callback, and the new way where most
+ * of the marshalling is handled in the common entry point, zfsdev_ioctl().
+ *
+ * Non-legacy ioctls should be registered by calling
+ * zfs_ioctl_register() from zfs_ioctl_init().  The ioctl is invoked
+ * from userland by lzc_ioctl().
+ *
+ * The registration arguments are as follows:
+ *
+ * const char *name
+ *   The name of the ioctl.  This is used for history logging.  If the
+ *   ioctl returns successfully (the callback returns 0), and allow_log
+ *   is true, then a history log entry will be recorded with the input &
+ *   output nvlists.  The log entry can be printed with "zpool history -i".
+ *
+ * zfs_ioc_t ioc
+ *   The ioctl request number, which userland will pass to ioctl(2).
+ *   The ioctl numbers can change from release to release, because
+ *   the caller (libzfs) must be matched to the kernel.
+ *
+ * zfs_secpolicy_func_t *secpolicy
+ *   This function will be called before the zfs_ioc_func_t, to
+ *   determine if this operation is permitted.  It should return EPERM
+ *   on failure, and 0 on success.  Checks include determining if the
+ *   dataset is visible in this zone, and if the user has either all
+ *   zfs privileges in the zone (SYS_MOUNT), or has been granted permission
+ *   to do this operation on this dataset with "zfs allow".
+ *
+ * zfs_ioc_namecheck_t namecheck
+ *   This specifies what to expect in the zfs_cmd_t:zc_name -- a pool
+ *   name, a dataset name, or nothing.  If the name is not well-formed,
+ *   the ioctl will fail and the callback will not be called.
+ *   Therefore, the callback can assume that the name is well-formed
+ *   (e.g. is null-terminated, doesn't have more than one '@' character,
+ *   doesn't have invalid characters).
+ *
+ * zfs_ioc_poolcheck_t pool_check
+ *   This specifies requirements on the pool state.  If the pool does
+ *   not meet them (is suspended or is readonly), the ioctl will fail
+ *   and the callback will not be called.  If any checks are specified
+ *   (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME.
+ *   Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED |
+ *   POOL_CHECK_READONLY).
+ *
+ * boolean_t smush_outnvlist
+ *   If smush_outnvlist is true, then the output is presumed to be a
+ *   list of errors, and it will be "smushed" down to fit into the
+ *   caller's buffer, by removing some entries and replacing them with a
+ *   single "N_MORE_ERRORS" entry indicating how many were removed.  See
+ *   nvlist_smush() for details.  If smush_outnvlist is false, and the
+ *   outnvlist does not fit into the userland-provided buffer, then the
+ *   ioctl will fail with ENOMEM.
+ *
+ * zfs_ioc_func_t *func
+ *   The callback function that will perform the operation.
+ *
+ *   The callback should return 0 on success, or an error number on
+ *   failure.  If the function fails, the userland ioctl will return -1,
+ *   and errno will be set to the callback's return value.  The callback
+ *   will be called with the following arguments:
+ *
+ *   const char *name
+ *     The name of the pool or dataset to operate on, from
+ *     zfs_cmd_t:zc_name.  The 'namecheck' argument specifies the
+ *     expected type (pool, dataset, or none).
+ *
+ *   nvlist_t *innvl
+ *     The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src.  Or
+ *     NULL if no input nvlist was provided.  Changes to this nvlist are
+ *     ignored.  If the input nvlist could not be deserialized, the
+ *     ioctl will fail and the callback will not be called.
+ *
+ *   nvlist_t *outnvl
+ *     The output nvlist, initially empty.  The callback can fill it in,
+ *     and it will be returned to userland by serializing it into
+ *     zfs_cmd_t:zc_nvlist_dst.  If it is non-empty, and serialization
+ *     fails (e.g. because the caller didn't supply a large enough
+ *     buffer), then the overall ioctl will fail.  See the
+ *     'smush_nvlist' argument above for additional behaviors.
+ *     'smush_outnvlist' argument above for additional behaviors.
+ *     There are two typical uses of the output nvlist:
+ *       - To return state, e.g. property values.  In this case,
+ *         smush_outnvlist should be false.  If the buffer was not large
+ *         enough, the caller will reallocate a larger buffer and try
+ *         the ioctl again.
+ *
+ *       - To return multiple errors from an ioctl which makes on-disk
+ *         changes.  In this case, smush_outnvlist should be true.
+ *         Ioctls which make on-disk modifications should generally not
+ *         use the outnvl if they succeed, because the caller can not
+ *         distinguish between the operation failing, and
+ *         deserialization failing.
  */
+#ifdef __FreeBSD__
+#include "opt_kstack_pages.h"
+#endif
 
 #include <sys/types.h>
 #include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/open.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
 #include <sys/errno.h>
 #include <sys/uio.h>
 #include <sys/buf.h>
-#include <sys/modctl.h>
-#include <sys/open.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
 #include <sys/conf.h>
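To make the registration comment above concrete: each non-legacy ioctl is declared once from zfs_ioctl_init(). A hedged sketch of one such registration, assuming the upstream zfs_ioctl_register() argument order (name, number, callback, secpolicy, namecheck, pool checks, smush_outnvlist, allow_log); the namecheck and flag choices shown are illustrative:

zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
    zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY,
    B_TRUE,	/* smush_outnvlist: output is a per-name error list */
    B_TRUE);	/* allow_log: shows up in "zpool history -i" */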
@@ -42,51 +163,75 @@
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev.h>
-#include <sys/priv_impl.h>
 #include <sys/dmu.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dmu_objset.h>
-#include <sys/ddi.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
 #include <sys/sunddi.h>
-#include <sys/sunldi.h>
 #include <sys/policy.h>
 #include <sys/zone.h>
 #include <sys/nvpair.h>
-#include <sys/pathname.h>
 #include <sys/mount.h>
+#ifdef __FreeBSD__
+#include <sys/taskqueue.h>
+#endif
+#ifdef __NetBSD__
+#include <sys/callb.h>
+#include <sys/taskq.h>
+#endif
 #include <sys/sdt.h>
+#include <sys/varargs.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
 #include <sys/zvol.h>
-#include <sharefs/share.h>
+#include <sys/dsl_scan.h>
 #include <sys/dmu_objset.h>
-#include <sys/callb.h>
-#include <sys/taskq.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
 #include "zfs_deleg.h"
+#include "zfs_comutil.h"
+#include "zfs_ioctl_compat.h"
 
-#ifdef __NetBSD__
-static int	zfs_cmajor = -1;
-static int	zfs_bmajor = -1;
-#define	ddi_driver_major(x)	zfs_cmajor
+#ifdef __FreeBSD__
+CTASSERT(sizeof(zfs_cmd_t) < IOCPARM_MAX);
+static struct cdev *zfsdev;
 #endif
 
-extern struct modlfs zfs_modlfs;
+#ifdef __NetBSD__
+static int zfs_cmajor = -1;
+static int zfs_bmajor = -1;
+dev_info_t *zfs_dip;
 
-extern void zfs_init(void);
-extern void zfs_fini(void);
+#define ddi_driver_major(x)	zfs_cmajor
 
-ldi_ident_t zfs_li = NULL;
-dev_info_t *zfs_dip;
+#define zfs_init() /* nothing */
+#define zfs_fini() /* nothing */
+
+#define vfs_busy(x, y)	vfs_busy(x)
+#define vfs_rel(x)	vfs_rele(x)
+#endif
+
+uint_t zfs_fsyncer_key;
+extern uint_t rrw_tsd_key;
+static uint_t zfs_allow_log_key;
+extern uint_t zfs_geom_probe_vdev_key;
 
-typedef int zfs_ioc_func_t(zfs_cmd_t *);
-typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *);
+typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *);
+typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *);
+typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *);
 
 typedef enum {
 	NO_NAME,
@@ -94,12 +239,21 @@ typedef enum {
 	DATASET_NAME
 } zfs_ioc_namecheck_t;
 
+typedef enum {
+	POOL_CHECK_NONE		= 1 << 0,
+	POOL_CHECK_SUSPENDED	= 1 << 1,
+	POOL_CHECK_READONLY	= 1 << 2,
+} zfs_ioc_poolcheck_t;
+
 typedef struct zfs_ioc_vec {
+	zfs_ioc_legacy_func_t	*zvec_legacy_func;
 	zfs_ioc_func_t		*zvec_func;
 	zfs_secpolicy_func_t	*zvec_secpolicy;
 	zfs_ioc_namecheck_t	zvec_namecheck;
-	boolean_t		zvec_his_log;
-	boolean_t		zvec_pool_check;
+	boolean_t		zvec_allow_log;
+	zfs_ioc_poolcheck_t	zvec_pool_check;
+	boolean_t		zvec_smush_outnvlist;
+	const char		*zvec_name;
 } zfs_ioc_vec_t;
 
 /* This array is indexed by zfs_userquota_prop_t */
@@ -117,14 +271,21 @@ static int zfs_check_clearable(char *dat
     nvlist_t **errors);
 static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
     boolean_t *);
-int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t **);
+int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *);
+static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp);
+
+#ifdef __FreeBSD__
+static void zfsdev_close(void *data);
+#endif
+
+static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature);
 
 /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
 void
 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
 {
 	const char *newfile;
-	char buf[256];
+	char buf[512];
 	va_list adx;
 
 	/*
@@ -199,9 +360,7 @@ zfs_is_bootfs(const char *name)
 }
 
 /*
- * zfs_earlier_version
- *
- *	Return non-zero if the spa version is less than requested version.
+ * Return non-zero if the spa version is less than requested version.
  */
 static int
 zfs_earlier_version(const char *name, int version)
@@ -219,8 +378,6 @@ zfs_earlier_version(const char *name, in
 }
 
 /*
- * zpl_earlier_version
- *
  * Return TRUE if the ZPL version is less than requested version.
  */
 static boolean_t
@@ -255,7 +412,7 @@ zfs_log_history(zfs_cmd_t *zc)
 
 	if (spa_open(zc->zc_name, &spa, FTAG) == 0) {
 		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY)
-			(void) spa_history_log(spa, buf, LOG_CMD_NORMAL);
+			(void) spa_history_log(spa, buf);
 		spa_close(spa, FTAG);
 	}
 	history_str_free(buf);
@@ -267,7 +424,7 @@ zfs_log_history(zfs_cmd_t *zc)
  */
 /* ARGSUSED */
 static int
-zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	return (0);
 }
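The hunks that follow convert bare errno returns to SET_ERROR(). Upstream, SET_ERROR() is an identity wrapper carrying a static tracing probe, so the first point at which an error value arises can be observed without changing what is returned; a minimal stand-in under that assumption:

/*
 * Minimal stand-in: upstream fires an SDT probe here and then yields
 * the errno value unchanged.
 */
#define	SET_ERROR(err)	(err)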
@@ -278,67 +435,127 @@ zfs_secpolicy_none(zfs_cmd_t *zc, cred_t
  */
 /* ARGSUSED */
 static int
-zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-	if (INGLOBALZONE(curproc) ||
+	if (INGLOBALZONE(curthread) ||
 	    zone_dataset_visible(zc->zc_name, NULL))
 		return (0);
 
-	return (ENOENT);
+	return (SET_ERROR(ENOENT));
 }
 
 static int
-zfs_dozonecheck(const char *dataset, cred_t *cr)
+zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
 {
-	uint64_t zoned;
 	int writable = 1;
 
 	/*
 	 * The dataset must be visible by this zone -- check this first
 	 * so they don't see EPERM on something they shouldn't know about.
 	 */
-	if (!INGLOBALZONE(curproc) &&
+	if (!INGLOBALZONE(curthread) &&
 	    !zone_dataset_visible(dataset, &writable))
-		return (ENOENT);
-
-	if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL))
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
 
-	if (INGLOBALZONE(curproc)) {
+	if (INGLOBALZONE(curthread)) {
 		/*
 		 * If the fs is zoned, only root can access it from the
 		 * global zone.
 		 */
 		if (secpolicy_zfs(cr) && zoned)
-			return (EPERM);
+			return (SET_ERROR(EPERM));
 	} else {
 		/*
 		 * If we are in a local zone, the 'zoned' property must be set.
 		 */
 		if (!zoned)
-			return (EPERM);
+			return (SET_ERROR(EPERM));
 
 		/* must be writable by this zone */
 		if (!writable)
-			return (EPERM);
+			return (SET_ERROR(EPERM));
 	}
 	return (0);
 }
 
-int
-zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
+static int
+zfs_dozonecheck(const char *dataset, cred_t *cr)
+{
+	uint64_t zoned;
+
+#ifdef __NetBSD__
+	zoned = 0;
+#else
+	if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL))
+		return (SET_ERROR(ENOENT));
+#endif
+
+	return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
+static int
+zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
+{
+	uint64_t zoned;
+
+#ifdef __NetBSD__
+	zoned = 0;
+#else
+	if (dsl_prop_get_int_ds(ds, "jailed", &zoned))
+		return (SET_ERROR(ENOENT));
+#endif
+
+	return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
+static int
+zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
+    const char *perm, cred_t *cr)
 {
 	int error;
 
-	error = zfs_dozonecheck(name, cr);
+	error = zfs_dozonecheck_ds(name, ds, cr);
 	if (error == 0) {
 		error = secpolicy_zfs(cr);
-		if (error)
-			error = dsl_deleg_access(name, perm, cr);
+		if (error != 0)
+			error = dsl_deleg_access_impl(ds, perm, cr);
+	}
+	return (error);
+}
+
+static int
+zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
+{
+	int error;
+	dsl_dataset_t *ds;
+	dsl_pool_t *dp;
+
+	/*
+	 * First do a quick check for root in the global zone, which
+	 * is allowed to do all write_perms.  This ensures that zfs_ioc_*
+	 * will get to handle nonexistent datasets.
+	 */
+	if (INGLOBALZONE(curthread) && secpolicy_zfs(cr) == 0)
+		return (0);
+
+	error = dsl_pool_hold(name, FTAG, &dp);
+	if (error != 0)
+		return (error);
+
+	error = dsl_dataset_hold(dp, name, FTAG, &ds);
+	if (error != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (error);
 	}
+
+	error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr);
+
+	dsl_dataset_rele(ds, FTAG);
+	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
 
+#ifdef SECLABEL
 /*
  * Policy for setting the security label property.
  *
@@ -347,7 +564,6 @@ zfs_secpolicy_write_perms(const char *na
 static int
 zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr)
 {
-#ifdef PORT_SOLARIS
 	char		ds_hexsl[MAXNAMELEN];
 	bslabel_t	ds_sl, new_sl;
 	boolean_t	new_default = FALSE;
@@ -358,15 +574,15 @@ zfs_set_slabel_policy(const char *name, 
 	/* First get the existing dataset label. */
 	error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
-	if (error)
-		return (EPERM);
+	if (error != 0)
+		return (SET_ERROR(EPERM));
 
 	if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
 		new_default = TRUE;
 
 	/* The label must be translatable */
 	if (!new_default && (hexstr_to_label(strval, &new_sl) != 0))
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	/*
 	 * In a non-global zone, disallow attempts to set a label that
@@ -375,7 +591,7 @@ zfs_set_slabel_policy(const char *name, 
 	 */
 	if (!INGLOBALZONE(curproc)) {
 		if (new_default || !blequal(&new_sl, CR_SL(CRED())))
-			return (EPERM);
+			return (SET_ERROR(EPERM));
 		return (0);
 	}
 
@@ -386,10 +602,10 @@ zfs_set_slabel_policy(const char *name, 
 	 */
 	if (dsl_prop_get_integer(name,
 	    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
-		return (EPERM);
+		return (SET_ERROR(EPERM));
 	if (!zoned) {
 		if (zfs_check_global_label(name, strval) != 0)
-			return (EPERM);
+			return (SET_ERROR(EPERM));
 	}
 
 	/*
@@ -408,8 +624,8 @@ zfs_set_slabel_policy(const char *name, 
 		 */
 		error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE,
 		    setsl_tag, &os);
-		if (error)
-			return (EPERM);
+		if (error != 0)
+			return (SET_ERROR(EPERM));
 
 		dmu_objset_disown(os, setsl_tag);
 
@@ -419,7 +635,7 @@ zfs_set_slabel_policy(const char *name, 
 		}
 
 		if (hexstr_to_label(strval, &new_sl) != 0)
-			return (EPERM);
+			return (SET_ERROR(EPERM));
 
 		if (blstrictdom(&ds_sl, &new_sl))
 			needed_priv = PRIV_FILE_DOWNGRADE_SL;
@@ -435,16 +651,13 @@ out_check:
 	if (needed_priv != -1)
 		return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
 	return (0);
-#else
-	return (ENOTSUP);
-#endif
 }
+#endif	/* SECLABEL */
 
 static int
 zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
     cred_t *cr)
 {
-#ifdef PORT_SOLARIS
 	char *strval;
 
 	/*
@@ -455,30 +668,33 @@ zfs_secpolicy_setprop(const char *dsname
 		/*
 		 * Disallow setting of 'zoned' from within a local zone.
 		 */
-		if (!INGLOBALZONE(curproc))
-			return (EPERM);
+		if (!INGLOBALZONE(curthread))
+			return (SET_ERROR(EPERM));
 		break;
 
 	case ZFS_PROP_QUOTA:
-		if (!INGLOBALZONE(curproc)) {
+	case ZFS_PROP_FILESYSTEM_LIMIT:
+	case ZFS_PROP_SNAPSHOT_LIMIT:
+		if (!INGLOBALZONE(curthread)) {
 			uint64_t zoned;
-			char setpoint[MAXNAMELEN];
+			char setpoint[ZFS_MAX_DATASET_NAME_LEN];
 			/*
 			 * Unprivileged users are allowed to modify the
-			 * quota on things *under* (ie. contained by)
+			 * limit on things *under* (ie. contained by)
 			 * the thing they own.
 			 */
-			if (dsl_prop_get_integer(dsname, "zoned", &zoned,
+			if (dsl_prop_get_integer(dsname, "jailed", &zoned,
 			    setpoint))
-				return (EPERM);
+				return (SET_ERROR(EPERM));
 			if (!zoned || strlen(dsname) <= strlen(setpoint))
-				return (EPERM);
+				return (SET_ERROR(EPERM));
 		}
 		break;
 
 	case ZFS_PROP_MLSLABEL:
+#ifdef SECLABEL
 		if (!is_system_labeled())
-			return (EPERM);
+			return (SET_ERROR(EPERM));
 
 		if (nvpair_value_string(propval, &strval) == 0) {
 			int err;
@@ -487,22 +703,23 @@ zfs_secpolicy_setprop(const char *dsname
 			if (err != 0)
 				return (err);
 		}
+#else
+		return (EOPNOTSUPP);
+#endif
 		break;
 	}
 
 	return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
-#else
-	return (ENOTSUP);
-#endif
 }
 
-int
-zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr)
+/* ARGSUSED */
+static int
+zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int error;
 
 	error = zfs_dozonecheck(zc->zc_name, cr);
-	if (error)
+	if (error != 0)
 		return (error);
 
 	/*
@@ -512,78 +729,107 @@ zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_
 	return (0);
 }
 
-int
-zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr)
+/* ARGSUSED */
+static int
+zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	return (zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_ROLLBACK, cr));
 }
 
-int
-zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr)
+/* ARGSUSED */
+static int
+zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
+	char *cp;
+	int error;
+
+	/*
+	 * Generate the current snapshot name from the given objsetid, then
+	 * use that name for the secpolicy/zone checks.
+	 */
+	cp = strchr(zc->zc_name, '@');
+	if (cp == NULL)
+		return (SET_ERROR(EINVAL));
+	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+	if (error != 0)
+		return (error);
+
+	error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+	if (error != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (error);
+	}
+
+	dsl_dataset_name(ds, zc->zc_name);
+
+	error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
+	    ZFS_DELEG_PERM_SEND, cr);
+	dsl_dataset_rele(ds, FTAG);
+	dsl_pool_rele(dp, FTAG);
+
+	return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	return (zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_SEND, cr));
 }
 
+/* ARGSUSED */
 static int
-zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	vnode_t *vp;
 	int error;
 
 	if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
-		    NULL, &vp)) != 0)
+	    NO_FOLLOW, NULL, &vp)) != 0)
 		return (error);
 
 	/* Now make sure mntpnt and dataset are ZFS */
-#ifndef __NetBSD__
-	if (vp->v_vfsp->vfs_fstype != zfsfstype ||
+
+	if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
 	    (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
 	    zc->zc_name) != 0)) {
 		VN_RELE(vp);
-		return (EPERM);
+		return (SET_ERROR(EPERM));
 	}
-#endif
+
 	VN_RELE(vp);
 	return (dsl_deleg_access(zc->zc_name,
 	    ZFS_DELEG_PERM_SHARE, cr));
 }
 
 int
-zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-#ifdef __NetBSD__
-	printf("XXX zfs_secpolicy_share write me\n");
-	return EPERM;
-#else
-	if (!INGLOBALZONE(curproc))
-		return (EPERM);
+	if (!INGLOBALZONE(curthread))
+		return (SET_ERROR(EPERM));
 
 	if (secpolicy_nfs(cr) == 0) {
 		return (0);
 	} else {
-		return (zfs_secpolicy_deleg_share(zc, cr));
+		return (zfs_secpolicy_deleg_share(zc, innvl, cr));
 	}
-#endif
 }
 
 int
-zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-#ifdef __NetBSD__
-	printf("XXX zfs_secpolicy_share write me\n");
-	return EPERM;
-#else	
-	if (!INGLOBALZONE(curproc))
-		return (EPERM);
+	if (!INGLOBALZONE(curthread))
+		return (SET_ERROR(EPERM));
 
 	if (secpolicy_smb(cr) == 0) {
 		return (0);
 	} else {
-		return (zfs_secpolicy_deleg_share(zc, cr));
+		return (zfs_secpolicy_deleg_share(zc, innvl, cr));
 	}
-#endif	/* __NetBSD__ */
 }
 
 static int
@@ -601,7 +847,7 @@ zfs_get_parent(const char *datasetname, 
 	} else {
 		cp = strrchr(parent, '/');
 		if (cp == NULL)
-			return (ENOENT);
+			return (SET_ERROR(ENOENT));
 		cp[0] = '\0';
 	}
 
@@ -620,51 +866,54 @@ zfs_secpolicy_destroy_perms(const char *
 	return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr));
 }
 
+/* ARGSUSED */
 static int
-zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	return (zfs_secpolicy_destroy_perms(zc->zc_name, cr));
 }
 
 /*
  * Destroying snapshots with delegated permissions requires
- * descendent mount and destroy permissions.
- * Reassemble the full filesystem@snap name so dsl_deleg_access()
- * can do the correct permission check.
- *
- * Since this routine is used when doing a recursive destroy of snapshots
- * and destroying snapshots requires descendent permissions, a successfull
- * check of the top level snapshot applies to snapshots of all descendent
- * datasets as well.
+ * descendant mount and destroy permissions.
  */
+/* ARGSUSED */
 static int
-zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-	int error;
-	char *dsname;
-
-	dsname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value);
+	nvlist_t *snaps;
+	nvpair_t *pair, *nextpair;
+	int error = 0;
 
-	error = zfs_secpolicy_destroy_perms(dsname, cr);
+	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
+		return (SET_ERROR(EINVAL));
+	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+	    pair = nextpair) {
+		nextpair = nvlist_next_nvpair(snaps, pair);
+		error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr);
+		if (error == ENOENT) {
+			/*
+			 * Ignore any snapshots that don't exist (we consider
+			 * them "already destroyed").  Remove the name from the
+			 * nvl here in case the snapshot is created between
+			 * now and when we try to destroy it (in which case
+			 * we don't want to destroy it since we haven't
+			 * checked for permission).
+			 */
+			fnvlist_remove_nvpair(snaps, pair);
+			error = 0;
+		}
+		if (error != 0)
+			break;
+	}
 
-	strfree(dsname);
 	return (error);
 }
 
-/*
- * Must have sys_config privilege to check the iscsi permission
- */
-/* ARGSUSED */
-static int
-zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr)
-{
-	return (secpolicy_zfs(cr));
-}
-
 int
 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
 {
-	char	parentname[MAXNAMELEN];
+	char	parentname[ZFS_MAX_DATASET_NAME_LEN];
 	int	error;
 
 	if ((error = zfs_secpolicy_write_perms(from,
@@ -690,55 +939,83 @@ zfs_secpolicy_rename_perms(const char *f
 	return (error);
 }
 
+/* ARGSUSED */
 static int
-zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-	return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr));
+	char *at = NULL;
+	int error;
+
+	if ((zc->zc_cookie & 1) != 0) {
+		/*
+		 * This is recursive rename, so the starting snapshot might
+		 * not exist. Check file system or volume permission instead.
+		 */
+		at = strchr(zc->zc_name, '@');
+		if (at == NULL)
+			return (SET_ERROR(EINVAL));
+		*at = '\0';
+	}
+
+	error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr);
+
+	if (at != NULL)
+		*at = '@';
+
+	return (error);
 }
 
+/* ARGSUSED */
 static int
-zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-	char	parentname[MAXNAMELEN];
-	objset_t *clone;
+	dsl_pool_t *dp;
+	dsl_dataset_t *clone;
 	int error;
 
 	error = zfs_secpolicy_write_perms(zc->zc_name,
 	    ZFS_DELEG_PERM_PROMOTE, cr);
-	if (error)
+	if (error != 0)
+		return (error);
+
+	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+	if (error != 0)
 		return (error);
 
-	error = dmu_objset_hold(zc->zc_name, FTAG, &clone);
+	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);
 
 	if (error == 0) {
-		dsl_dataset_t *pclone = NULL;
+		char parentname[ZFS_MAX_DATASET_NAME_LEN];
+		dsl_dataset_t *origin = NULL;
 		dsl_dir_t *dd;
-		dd = clone->os_dsl_dataset->ds_dir;
+		dd = clone->ds_dir;
 
-		rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
 		error = dsl_dataset_hold_obj(dd->dd_pool,
-		    dd->dd_phys->dd_origin_obj, FTAG, &pclone);
-		rw_exit(&dd->dd_pool->dp_config_rwlock);
-		if (error) {
-			dmu_objset_rele(clone, FTAG);
+		    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
+		if (error != 0) {
+			dsl_dataset_rele(clone, FTAG);
+			dsl_pool_rele(dp, FTAG);
 			return (error);
 		}
 
-		error = zfs_secpolicy_write_perms(zc->zc_name,
+		error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone,
 		    ZFS_DELEG_PERM_MOUNT, cr);
 
-		dsl_dataset_name(pclone, parentname);
-		dmu_objset_rele(clone, FTAG);
-		dsl_dataset_rele(pclone, FTAG);
-		if (error == 0)
-			error = zfs_secpolicy_write_perms(parentname,
+		dsl_dataset_name(origin, parentname);
+		if (error == 0) {
+			error = zfs_secpolicy_write_perms_ds(parentname, origin,
 			    ZFS_DELEG_PERM_PROMOTE, cr);
+		}
+		dsl_dataset_rele(clone, FTAG);
+		dsl_dataset_rele(origin, FTAG);
 	}
+	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
 
+/* ARGSUSED */
 static int
-zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	int error;
 
@@ -761,49 +1038,141 @@ zfs_secpolicy_snapshot_perms(const char 
 	    ZFS_DELEG_PERM_SNAPSHOT, cr));
 }
 
+/*
+ * Check for permission to create each snapshot in the nvlist.
+ */
+/* ARGSUSED */
 static int
-zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
+	nvlist_t *snaps;
+	int error;
+	nvpair_t *pair;
 
-	return (zfs_secpolicy_snapshot_perms(zc->zc_name, cr));
+	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
+		return (SET_ERROR(EINVAL));
+	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(snaps, pair)) {
+		char *name = nvpair_name(pair);
+		char *atp = strchr(name, '@');
+
+		if (atp == NULL) {
+			error = SET_ERROR(EINVAL);
+			break;
+		}
+		*atp = '\0';
+		error = zfs_secpolicy_snapshot_perms(name, cr);
+		*atp = '@';
+		if (error != 0)
+			break;
+	}
+	return (error);
 }
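The "snaps" nvlist walked above is assembled in userland by libzfs_core (the new libzfs_core.so in this patch) and delivered through lzc_ioctl(). A sketch of the calling side, assuming the upstream lzc_snapshot() signature; the dataset names are illustrative:

#include <libnvpair.h>
#include <libzfs_core.h>

/* Sketch: request two snapshots atomically in one ioctl. */
static int
snap_both(void)
{
	nvlist_t *snaps, *errlist = NULL;
	int error;

	if ((error = libzfs_core_init()) != 0)
		return (error);

	/* Each name must contain '@', per the secpolicy check above. */
	snaps = fnvlist_alloc();
	fnvlist_add_boolean(snaps, "tank/home@backup");
	fnvlist_add_boolean(snaps, "tank/src@backup");

	error = lzc_snapshot(snaps, NULL, &errlist);	/* props = NULL */

	fnvlist_free(snaps);
	if (errlist != NULL)
		fnvlist_free(errlist);
	libzfs_core_fini();
	return (error);
}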
 
+/*
+ * Check for permission to create each bookmark in the nvlist.
+ */
+/* ARGSUSED */
 static int
-zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-	char	parentname[MAXNAMELEN];
-	int	error;
+	int error = 0;
 
-	if ((error = zfs_get_parent(zc->zc_name, parentname,
-	    sizeof (parentname))) != 0)
-		return (error);
+	for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+		char *name = nvpair_name(pair);
+		char *hashp = strchr(name, '#');
 
-	if (zc->zc_value[0] != '\0') {
-		if ((error = zfs_secpolicy_write_perms(zc->zc_value,
-		    ZFS_DELEG_PERM_CLONE, cr)) != 0)
-			return (error);
+		if (hashp == NULL) {
+			error = SET_ERROR(EINVAL);
+			break;
+		}
+		*hashp = '\0';
+		error = zfs_secpolicy_write_perms(name,
+		    ZFS_DELEG_PERM_BOOKMARK, cr);
+		*hashp = '#';
+		if (error != 0)
+			break;
 	}
+	return (error);
+}
 
-	if ((error = zfs_secpolicy_write_perms(parentname,
-	    ZFS_DELEG_PERM_CREATE, cr)) != 0)
-		return (error);
+/* ARGSUSED */
+static int
+zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	nvpair_t *pair, *nextpair;
+	int error = 0;
+
+	for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
+	    pair = nextpair) {
+		char *name = nvpair_name(pair);
+		char *hashp = strchr(name, '#');
+		nextpair = nvlist_next_nvpair(innvl, pair);
+
+		if (hashp == NULL) {
+			error = SET_ERROR(EINVAL);
+			break;
+		}
 
-	error = zfs_secpolicy_write_perms(parentname,
-	    ZFS_DELEG_PERM_MOUNT, cr);
+		*hashp = '\0';
+		error = zfs_secpolicy_write_perms(name,
+		    ZFS_DELEG_PERM_DESTROY, cr);
+		*hashp = '#';
+		if (error == ENOENT) {
+			/*
+			 * Ignore any filesystems that don't exist (we consider
+			 * their bookmarks "already destroyed").  Remove
+			 * the name from the nvl here in case the filesystem
+			 * is created between now and when we try to destroy
+			 * the bookmark (in which case we don't want to
+			 * destroy it since we haven't checked for permission).
+			 */
+			fnvlist_remove_nvpair(innvl, pair);
+			error = 0;
+		}
+		if (error != 0)
+			break;
+	}
 
 	return (error);
 }
 
+/* ARGSUSED */
 static int
-zfs_secpolicy_umount(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-	int error;
+	/*
+	 * Even root must have a proper TSD so that we know what pool
+	 * to log to.
+	 */
+	if (tsd_get(zfs_allow_log_key) == NULL)
+		return (SET_ERROR(EPERM));
+	return (0);
+}
 
-	error = secpolicy_fs_unmount(cr, NULL);
-	if (error) {
-		error = dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr);
-	}
-	return (error);
+static int
+zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	char	parentname[ZFS_MAX_DATASET_NAME_LEN];
+	int	error;
+	char	*origin;
+
+	if ((error = zfs_get_parent(zc->zc_name, parentname,
+	    sizeof (parentname))) != 0)
+		return (error);
+
+	if (nvlist_lookup_string(innvl, "origin", &origin) == 0 &&
+	    (error = zfs_secpolicy_write_perms(origin,
+	    ZFS_DELEG_PERM_CLONE, cr)) != 0)
+		return (error);
+
+	if ((error = zfs_secpolicy_write_perms(parentname,
+	    ZFS_DELEG_PERM_CREATE, cr)) != 0)
+		return (error);
+
+	return (zfs_secpolicy_write_perms(parentname,
+	    ZFS_DELEG_PERM_MOUNT, cr));
 }
 
 /*
@@ -812,32 +1181,49 @@ zfs_secpolicy_umount(zfs_cmd_t *zc, cred
  */
 /* ARGSUSED */
 static int
-zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	if (secpolicy_sys_config(cr, B_FALSE) != 0)
-		return (EPERM);
+		return (SET_ERROR(EPERM));
 
 	return (0);
 }
 
 /*
- * Policy for fault injection.  Requires all privileges.
+ * Policy for object to name lookups.
  */
 /* ARGSUSED */
 static int
-zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-	return (secpolicy_zinject(cr));
+	int error;
+
+	if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0)
+		return (0);
+
+	error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr);
+	return (error);
+}
+
+/*
+ * Policy for fault injection.  Requires all privileges.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	return (secpolicy_zinject(cr));
 }
 
+/* ARGSUSED */
 static int
-zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);
 
 	if (prop == ZPROP_INVAL) {
 		if (!zfs_prop_user(zc->zc_value))
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 		return (zfs_secpolicy_write_perms(zc->zc_name,
 		    ZFS_DELEG_PERM_USERPROP, cr));
 	} else {
@@ -847,14 +1233,14 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cre
 }
 
 static int
-zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-	int err = zfs_secpolicy_read(zc, cr);
+	int err = zfs_secpolicy_read(zc, innvl, cr);
 	if (err)
 		return (err);
 
 	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	if (zc->zc_value[0] == 0) {
 		/*
@@ -876,38 +1262,99 @@ zfs_secpolicy_userspace_one(zfs_cmd_t *z
 }
 
 static int
-zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-	int err = zfs_secpolicy_read(zc, cr);
+	int err = zfs_secpolicy_read(zc, innvl, cr);
 	if (err)
 		return (err);
 
 	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	return (zfs_secpolicy_write_perms(zc->zc_name,
 	    userquota_perms[zc->zc_objset_type], cr));
 }
 
+/* ARGSUSED */
 static int
-zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
 	return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION,
 	    NULL, cr));
 }
 
+/* ARGSUSED */
 static int
-zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-	return (zfs_secpolicy_write_perms(zc->zc_name,
-	    ZFS_DELEG_PERM_HOLD, cr));
+	nvpair_t *pair;
+	nvlist_t *holds;
+	int error;
+
+	error = nvlist_lookup_nvlist(innvl, "holds", &holds);
+	if (error != 0)
+		return (SET_ERROR(EINVAL));
+
+	for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(holds, pair)) {
+		char fsname[ZFS_MAX_DATASET_NAME_LEN];
+		error = dmu_fsname(nvpair_name(pair), fsname);
+		if (error != 0)
+			return (error);
+		error = zfs_secpolicy_write_perms(fsname,
+		    ZFS_DELEG_PERM_HOLD, cr);
+		if (error != 0)
+			return (error);
+	}
+	return (0);
 }
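
A minimal userland-side sketch of the "holds" innvl this policy iterates, assuming the illumos fnvlist API; the snapshot and tag names are illustrative:

	nvlist_t *innvl = fnvlist_alloc();
	nvlist_t *holds = fnvlist_alloc();
	/* each key names a snapshot to hold, each value is the hold tag */
	fnvlist_add_string(holds, "tank/fs@snap", "keepme");
	fnvlist_add_nvlist(innvl, "holds", holds);
	fnvlist_free(holds);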
 
+/* ARGSUSED */
 static int
-zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr)
+zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
 {
-	return (zfs_secpolicy_write_perms(zc->zc_name,
-	    ZFS_DELEG_PERM_RELEASE, cr));
+	nvpair_t *pair;
+	int error;
+
+	for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(innvl, pair)) {
+		char fsname[ZFS_MAX_DATASET_NAME_LEN];
+		error = dmu_fsname(nvpair_name(pair), fsname);
+		if (error != 0)
+			return (error);
+		error = zfs_secpolicy_write_perms(fsname,
+		    ZFS_DELEG_PERM_RELEASE, cr);
+		if (error != 0)
+			return (error);
+	}
+	return (0);
+}
+
+/*
+ * Policy for allowing temporary snapshots to be taken or released
+ */
+static int
+zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	/*
+	 * A temporary snapshot is the same as a snapshot, hold,
+	 * destroy, and release all rolled into one.  Delegated diff
+	 * permission alone is sufficient for us to allow this.
+	 */
+	int error;
+
+	if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+	    ZFS_DELEG_PERM_DIFF, cr)) == 0)
+		return (0);
+
+	error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr);
+	if (error == 0)
+		error = zfs_secpolicy_hold(zc, innvl, cr);
+	if (error == 0)
+		error = zfs_secpolicy_release(zc, innvl, cr);
+	if (error == 0)
+		error = zfs_secpolicy_destroy(zc, innvl, cr);
+	return (error);
 }
 
 /*
@@ -924,14 +1371,14 @@ get_nvlist(uint64_t nvl, uint64_t size, 
 	 * Read in and unpack the user-supplied nvlist.
 	 */
 	if (size == 0)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	packed = kmem_alloc(size, KM_SLEEP);
 
 	if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
 	    iflag)) != 0) {
 		kmem_free(packed, size);
-		return (error);
+		return (SET_ERROR(EFAULT));
 	}
 
 	if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) {
@@ -945,36 +1392,40 @@ get_nvlist(uint64_t nvl, uint64_t size, 
 	return (0);
 }
 
+/*
+ * Reduce the size of this nvlist until it can be serialized in 'max' bytes.
+ * Entries will be removed from the end of the nvlist, and one int32 entry
+ * named "N_MORE_ERRORS" will be added indicating how many entries were
+ * removed.
+ */
 static int
-fit_error_list(zfs_cmd_t *zc, nvlist_t **errors)
+nvlist_smush(nvlist_t *errors, size_t max)
 {
 	size_t size;
 
-	VERIFY(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0);
+	size = fnvlist_size(errors);
 
-	if (size > zc->zc_nvlist_dst_size) {
+	if (size > max) {
 		nvpair_t *more_errors;
 		int n = 0;
 
-		if (zc->zc_nvlist_dst_size < 1024)
-			return (ENOMEM);
+		if (max < 1024)
+			return (SET_ERROR(ENOMEM));
 
-		VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, 0) == 0);
-		more_errors = nvlist_prev_nvpair(*errors, NULL);
+		fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0);
+		more_errors = nvlist_prev_nvpair(errors, NULL);
 
 		do {
-			nvpair_t *pair = nvlist_prev_nvpair(*errors,
+			nvpair_t *pair = nvlist_prev_nvpair(errors,
 			    more_errors);
-			VERIFY(nvlist_remove_nvpair(*errors, pair) == 0);
+			fnvlist_remove_nvpair(errors, pair);
 			n++;
-			VERIFY(nvlist_size(*errors, &size,
-			    NV_ENCODE_NATIVE) == 0);
-		} while (size > zc->zc_nvlist_dst_size);
-
-		VERIFY(nvlist_remove_nvpair(*errors, more_errors) == 0);
-		VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, n) == 0);
-		ASSERT(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0);
-		ASSERT(size <= zc->zc_nvlist_dst_size);
+			size = fnvlist_size(errors);
+		} while (size > max);
+
+		fnvlist_remove_nvpair(errors, more_errors);
+		fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n);
+		ASSERT3U(fnvlist_size(errors), <=, max);
 	}
 
 	return (0);
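
A hedged sketch of how a caller would apply nvlist_smush() before copying an error list out to a fixed-size user buffer; the surrounding names follow this file's conventions:

	nvlist_t *errors = fnvlist_alloc();
	/* ... accumulate propname -> errno entries ... */
	if (nvlist_smush(errors, zc->zc_nvlist_dst_size) == 0)
		error = put_nvlist(zc, errors);
	nvlist_free(errors);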
@@ -984,23 +1435,31 @@ static int
 put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
 {
 	char *packed = NULL;
+	int error = 0;
 	size_t size;
-	int error;
 
-	VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0);
+	size = fnvlist_size(nvl);
 
 	if (size > zc->zc_nvlist_dst_size) {
-		error = ENOMEM;
+		/*
+		 * Solaris returns ENOMEM here, because even if an error is
+		 * returned from an ioctl(2), new zc_nvlist_dst_size will be
+		 * passed to the userland. This is not the case for FreeBSD.
+		 * We need to return 0, so the kernel will copy the
+		 * zc_nvlist_dst_size back and the userland can discover that a
+		 * bigger buffer is needed.
+		 */
+		error = 0;
 	} else {
-		packed = kmem_alloc(size, KM_SLEEP);
-		VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
-		    KM_SLEEP) == 0);
-		error = ddi_copyout(packed,
-		    (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags);
-		kmem_free(packed, size);
+		packed = fnvlist_pack(nvl, &size);
+		if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
+		    size, zc->zc_iflags) != 0)
+			error = SET_ERROR(EFAULT);
+		fnvlist_pack_free(packed, size);
 	}
 
 	zc->zc_nvlist_dst_size = size;
+	zc->zc_nvlist_dst_filled = B_TRUE;
 	return (error);
 }
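
Given the FreeBSD behavior described above, a userland caller can detect a short buffer by comparing the returned zc_nvlist_dst_size with what it allocated; a hedged sketch of the grow-and-retry loop (the initial size and the ioctl are illustrative, fd is an open descriptor on /dev/zfs):

	size_t size = 16 * 1024;
	zc.zc_nvlist_dst_size = size;
	zc.zc_nvlist_dst = (uint64_t)(uintptr_t)malloc(size);
	while (ioctl(fd, ZFS_IOC_POOL_STATS, &zc) == 0 &&
	    zc.zc_nvlist_dst_size > size) {
		/* kernel reported the real packed size; grow and retry */
		size = zc.zc_nvlist_dst_size;
		free((void *)(uintptr_t)zc.zc_nvlist_dst);
		zc.zc_nvlist_dst = (uint64_t)(uintptr_t)malloc(size);
		zc.zc_nvlist_dst_size = size;
	}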
 
@@ -1008,49 +1467,62 @@ static int
 getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
 {
 	objset_t *os;
+	vfs_t *vfsp;
 	int error;
 
 	error = dmu_objset_hold(dsname, FTAG, &os);
-	if (error)
+	if (error != 0)
 		return (error);
 	if (dmu_objset_type(os) != DMU_OST_ZFS) {
 		dmu_objset_rele(os, FTAG);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	mutex_enter(&os->os_user_ptr_lock);
 	*zfvp = dmu_objset_get_user(os);
 	if (*zfvp) {
-		VFS_HOLD((*zfvp)->z_vfs);
+		vfsp = (*zfvp)->z_vfs;
+		vfs_ref(vfsp);
 	} else {
-		error = ESRCH;
+		error = SET_ERROR(ESRCH);
 	}
 	mutex_exit(&os->os_user_ptr_lock);
 	dmu_objset_rele(os, FTAG);
+	if (error == 0) {
+		error = vfs_busy(vfsp, 0);
+		vfs_rel(vfsp);
+		if (error != 0) {
+			*zfvp = NULL;
+			error = SET_ERROR(ESRCH);
+		}
+	}
 	return (error);
 }
 
 /*
  * Find a zfsvfs_t for a mounted filesystem, or create our own, in which
  * case its z_vfs will be NULL, and it will be opened as the owner.
+ * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER,
+ * which prevents all vnode ops from running.
  */
 static int
-zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp)
+zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
 {
 	int error = 0;
 
 	if (getzfsvfs(name, zfvp) != 0)
 		error = zfsvfs_create(name, zfvp);
 	if (error == 0) {
-		rrw_enter(&(*zfvp)->z_teardown_lock, RW_READER, tag);
+		rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
+		    RW_READER, tag);
 		if ((*zfvp)->z_unmounted) {
 			/*
 			 * XXX we could probably try again, since the unmounting
 			 * thread should be just about to disassociate the
 			 * objset from the zfsvfs.
 			 */
-			rrw_exit(&(*zfvp)->z_teardown_lock, tag);
-			return (EBUSY);
+			rrm_exit(&(*zfvp)->z_teardown_lock, tag);
+			return (SET_ERROR(EBUSY));
 		}
 	}
 	return (error);
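
The writer mode is exercised later in this patch by the ZFS_PROP_VERSION path; a condensed sketch of the hold/rele pairing:

	zfsvfs_t *zfsvfs;
	/* take z_teardown_lock as RW_WRITER so no vnode ops can run */
	if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) == 0) {
		err = zfs_set_version(zfsvfs, intval);
		zfsvfs_rele(zfsvfs, FTAG);
	}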
@@ -1059,10 +1531,14 @@ zfsvfs_hold(const char *name, void *tag,
 static void
 zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
 {
-	rrw_exit(&zfsvfs->z_teardown_lock, tag);
+	rrm_exit(&zfsvfs->z_teardown_lock, tag);
 
 	if (zfsvfs->z_vfs) {
+#ifdef illumos
 		VFS_RELE(zfsvfs->z_vfs);
+#else
+		vfs_unbusy(zfsvfs->z_vfs);
+#endif
 	} else {
 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 		zfsvfs_free(zfsvfs);
@@ -1076,7 +1552,6 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
 	nvlist_t *config, *props = NULL;
 	nvlist_t *rootprops = NULL;
 	nvlist_t *zplprops = NULL;
-	char *buf;
 
 	if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
 	    zc->zc_iflags, &config))
@@ -1095,8 +1570,8 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
 
 		(void) nvlist_lookup_uint64(props,
 		    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version);
-		if (version < SPA_VERSION_INITIAL || version > SPA_VERSION) {
-			error = EINVAL;
+		if (!SPA_VERSION_IS_SUPPORTED(version)) {
+			error = SET_ERROR(EINVAL);
 			goto pool_props_bad;
 		}
 		(void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl);
@@ -1112,13 +1587,11 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
 		VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		error = zfs_fill_zplprops_root(version, rootprops,
 		    zplprops, NULL);
-		if (error)
+		if (error != 0)
 			goto pool_props_bad;
 	}
 
-	buf = history_str_get(zc);
-
-	error = spa_create(zc->zc_name, config, props, buf, zplprops);
+	error = spa_create(zc->zc_name, config, props, zplprops);
 
 	/*
 	 * Set the remaining root properties
@@ -1127,9 +1600,6 @@ zfs_ioc_pool_create(zfs_cmd_t *zc)
 	    ZPROP_SRC_LOCAL, rootprops, NULL)) != 0)
 		(void) spa_destroy(zc->zc_name);
 
-	if (buf != NULL)
-		history_str_free(buf);
-
 pool_props_bad:
 	nvlist_free(rootprops);
 	nvlist_free(zplprops);
@@ -1170,19 +1640,20 @@ zfs_ioc_pool_import(zfs_cmd_t *zc)
 
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
 	    guid != zc->zc_guid)
-		error = EINVAL;
-	else if (zc->zc_cookie)
-		error = spa_import_verbatim(zc->zc_name, config, props);
+		error = SET_ERROR(EINVAL);
 	else
-		error = spa_import(zc->zc_name, config, props);
+		error = spa_import(zc->zc_name, config, props, zc->zc_cookie);
+
+	if (zc->zc_nvlist_dst != 0) {
+		int err;
 
-	if (zc->zc_nvlist_dst != 0)
-		(void) put_nvlist(zc, config);
+		if ((err = put_nvlist(zc, config)) != 0)
+			error = err;
+	}
 
 	nvlist_free(config);
 
-	if (props)
-		nvlist_free(props);
+	nvlist_free(props);
 
 	return (error);
 }
@@ -1208,7 +1679,7 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc)
 	int error;
 
 	if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
-		return (EEXIST);
+		return (SET_ERROR(EEXIST));
 
 	error = put_nvlist(zc, configs);
 
@@ -1217,6 +1688,15 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc)
 	return (error);
 }
 
+/*
+ * inputs:
+ * zc_name		name of the pool
+ *
+ * outputs:
+ * zc_cookie		real errno
+ * zc_nvlist_dst	config nvlist
+ * zc_nvlist_dst_size	size of config nvlist
+ */
 static int
 zfs_ioc_pool_stats(zfs_cmd_t *zc)
 {
@@ -1263,7 +1743,7 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
 	nvlist_free(tryconfig);
 
 	if (config == NULL)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	error = put_nvlist(zc, config);
 	nvlist_free(config);
@@ -1271,8 +1751,13 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
 	return (error);
 }
 
+/*
+ * inputs:
+ * zc_name              name of the pool
+ * zc_cookie            scan func (pool_scan_func_t)
+ */
 static int
-zfs_ioc_pool_scrub(zfs_cmd_t *zc)
+zfs_ioc_pool_scan(zfs_cmd_t *zc)
 {
 	spa_t *spa;
 	int error;
@@ -1280,7 +1765,10 @@ zfs_ioc_pool_scrub(zfs_cmd_t *zc)
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
-	error = spa_scrub(spa, zc->zc_cookie);
+	if (zc->zc_cookie == POOL_SCAN_NONE)
+		error = spa_scan_stop(spa);
+	else
+		error = spa_scan(spa, zc->zc_cookie);
 
 	spa_close(spa, FTAG);
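
A hedged userland sketch of driving the renamed scan ioctl; the pool name is illustrative and error handling is elided:

	zfs_cmd_t zc = { 0 };
	(void) strlcpy(zc.zc_name, "tank", sizeof (zc.zc_name));
	zc.zc_cookie = POOL_SCAN_SCRUB;	/* POOL_SCAN_NONE stops a scan */
	error = ioctl(fd, ZFS_IOC_POOL_SCAN, &zc);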
 
@@ -1310,9 +1798,10 @@ zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
-	if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) {
+	if (zc->zc_cookie < spa_version(spa) ||
+	    !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) {
 		spa_close(spa, FTAG);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	spa_upgrade(spa, zc->zc_cookie);
@@ -1330,14 +1819,14 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc)
 	int error;
 
 	if ((size = zc->zc_history_len) == 0)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
 	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
 		spa_close(spa, FTAG);
-		return (ENOTSUP);
+		return (SET_ERROR(ENOTSUP));
 	}
 
 	hist_buf = kmem_alloc(size, KM_SLEEP);
@@ -1354,14 +1843,23 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc)
 }
 
 static int
-zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
+zfs_ioc_pool_reguid(zfs_cmd_t *zc)
 {
+	spa_t *spa;
 	int error;
 
-	if (error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value))
-		return (error);
+	error = spa_open(zc->zc_name, &spa, FTAG);
+	if (error == 0) {
+		error = spa_change_guid(spa);
+		spa_close(spa, FTAG);
+	}
+	return (error);
+}
 
-	return (0);
+static int
+zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
+{
+	return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value));
 }
 
 /*
@@ -1383,7 +1881,7 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc)
 		return (error);
 	if (dmu_objset_type(os) != DMU_OST_ZFS) {
 		dmu_objset_rele(os, FTAG);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 	error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
 	    sizeof (zc->zc_value));
@@ -1392,6 +1890,35 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc)
 	return (error);
 }
 
+/*
+ * inputs:
+ * zc_name		name of filesystem
+ * zc_obj		object to find
+ *
+ * outputs:
+ * zc_stat		stats on object
+ * zc_value		path to object
+ */
+static int
+zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
+{
+	objset_t *os;
+	int error;
+
+	/* XXX reading from objset not owned */
+	if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0)
+		return (error);
+	if (dmu_objset_type(os) != DMU_OST_ZFS) {
+		dmu_objset_rele(os, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+	error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
+	    sizeof (zc->zc_value));
+	dmu_objset_rele(os, FTAG);
+
+	return (error);
+}
+
 static int
 zfs_ioc_vdev_add(zfs_cmd_t *zc)
 {
@@ -1412,6 +1939,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
 	(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares);
 
+#ifdef illumos
 	/*
 	 * A root pool with concatenated devices is not supported.
 	 * Thus, can not add a device to a root pool.
@@ -1425,8 +1953,9 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
 	if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
 		nvlist_free(config);
 		spa_close(spa, FTAG);
-		return (EDOM);
+		return (SET_ERROR(EDOM));
 	}
+#endif /* illumos */
 
 	if (error == 0) {
 		error = spa_vdev_add(spa, config);
@@ -1436,6 +1965,12 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
 	return (error);
 }
 
+/*
+ * inputs:
+ * zc_name		name of the pool
+ * zc_nvlist_conf	nvlist of devices to remove
+ * zc_cookie		to stop the remove?
+ */
 static int
 zfs_ioc_vdev_remove(zfs_cmd_t *zc)
 {
@@ -1485,7 +2020,7 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
 		break;
 
 	default:
-		error = EINVAL;
+		error = SET_ERROR(EINVAL);
 	}
 	zc->zc_cookie = newstate;
 	spa_close(spa, FTAG);
@@ -1597,26 +2132,12 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
 	return (error);
 }
 
-/*
- * inputs:
- * zc_name		name of filesystem
- * zc_nvlist_dst_size	size of buffer for property nvlist
- *
- * outputs:
- * zc_objset_stats	stats
- * zc_nvlist_dst	property nvlist
- * zc_nvlist_dst_size	size of property nvlist
- */
 static int
-zfs_ioc_objset_stats(zfs_cmd_t *zc)
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
 {
-	objset_t *os = NULL;
-	int error;
+	int error = 0;
 	nvlist_t *nv;
 
-	if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
-		return (error);
-
 	dmu_objset_fast_stat(os, &zc->zc_objset_stats);
 
 	if (zc->zc_nvlist_dst != 0 &&
@@ -1629,15 +2150,44 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc)
 		 * inconsistent.  So this is a bit of a workaround...
 		 * XXX reading with out owning
 		 */
-		if (!zc->zc_objset_stats.dds_inconsistent) {
-			if (dmu_objset_type(os) == DMU_OST_ZVOL)
-				VERIFY(zvol_get_stats(os, nv) == 0);
+		if (!zc->zc_objset_stats.dds_inconsistent &&
+		    dmu_objset_type(os) == DMU_OST_ZVOL) {
+			error = zvol_get_stats(os, nv);
+			if (error == EIO)
+				return (error);
+			VERIFY0(error);
 		}
 		error = put_nvlist(zc, nv);
 		nvlist_free(nv);
 	}
 
-	dmu_objset_rele(os, FTAG);
+	return (error);
+}
+
+/*
+ * inputs:
+ * zc_name		name of filesystem
+ * zc_nvlist_dst_size	size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_objset_stats	stats
+ * zc_nvlist_dst	property nvlist
+ * zc_nvlist_dst_size	size of property nvlist
+ */
+static int
+zfs_ioc_objset_stats(zfs_cmd_t *zc)
+{
+	objset_t *os;
+	int error;
+
+	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+	if (error == 0) {
+		error = zfs_ioc_objset_stats_impl(zc, os);
+		dmu_objset_rele(os, FTAG);
+	}
+
+	if (error == ENOMEM)
+		error = 0;
 	return (error);
 }
 
@@ -1657,30 +2207,23 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc)
 static int
 zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
 {
-	objset_t *os = NULL;
-	int error;
+	int error = 0;
 	nvlist_t *nv;
 
-	if (error = dmu_objset_hold(zc->zc_name, FTAG, &os))
-		return (error);
-
 	/*
 	 * Without this check, we would return local property values if the
 	 * caller has not already received properties on or after
 	 * SPA_VERSION_RECVD_PROPS.
 	 */
-	if (!dsl_prop_get_hasrecvd(os)) {
-		dmu_objset_rele(os, FTAG);
-		return (ENOTSUP);
-	}
+	if (!dsl_prop_get_hasrecvd(zc->zc_name))
+		return (SET_ERROR(ENOTSUP));
 
 	if (zc->zc_nvlist_dst != 0 &&
-	    (error = dsl_prop_get_received(os, &nv)) == 0) {
+	    (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) {
 		error = put_nvlist(zc, nv);
 		nvlist_free(nv);
 	}
 
-	dmu_objset_rele(os, FTAG);
 	return (error);
 }
 
@@ -1739,13 +2282,13 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
 			err = put_nvlist(zc, nv);
 		nvlist_free(nv);
 	} else {
-		err = ENOENT;
+		err = SET_ERROR(ENOENT);
 	}
 	dmu_objset_rele(os, FTAG);
 	return (err);
 }
 
-static boolean_t
+boolean_t
 dataset_name_hidden(const char *name)
 {
 	/*
@@ -1757,7 +2300,7 @@ dataset_name_hidden(const char *name)
 		return (B_TRUE);
 	if (strchr(name, '%') != NULL)
 		return (B_TRUE);
-	if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL))
+	if (!INGLOBALZONE(curthread) && !zone_dataset_visible(name, NULL))
 		return (B_TRUE);
 	return (B_FALSE);
 }
@@ -1786,7 +2329,7 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
 top:
 	if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) {
 		if (error == ENOENT)
-			error = ESRCH;
+			error = SET_ERROR(ESRCH);
 		return (error);
 	}
 
@@ -1795,26 +2338,13 @@ top:
 		(void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
 	p = zc->zc_name + strlen(zc->zc_name);
 
-	/*
-	 * Pre-fetch the datasets.  dmu_objset_prefetch() always returns 0
-	 * but is not declared void because its called by dmu_objset_find().
-	 */
-	if (zc->zc_cookie == 0) {
-		uint64_t cookie = 0;
-		int len = sizeof (zc->zc_name) - (p - zc->zc_name);
-
-		while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0)
-			(void) dmu_objset_prefetch(p, NULL);
-	}
-
 	do {
 		error = dmu_dir_list_next(os,
 		    sizeof (zc->zc_name) - (p - zc->zc_name), p,
 		    NULL, &zc->zc_cookie);
 		if (error == ENOENT)
-			error = ESRCH;
-	} while (error == 0 && dataset_name_hidden(zc->zc_name) &&
-	    !(zc->zc_iflags & FKIOCTL));
+			error = SET_ERROR(ESRCH);
+	} while (error == 0 && dataset_name_hidden(zc->zc_name));
 	dmu_objset_rele(os, FTAG);
 
 	/*
@@ -1837,6 +2367,7 @@ top:
  * zc_name		name of filesystem
  * zc_cookie		zap cursor
  * zc_nvlist_dst_size	size of buffer for property nvlist
+ * zc_simple		when set, only name is requested
  *
  * outputs:
  * zc_name		name of next snapshot
@@ -1850,41 +2381,46 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc
 	objset_t *os;
 	int error;
 
-top:
-	if (zc->zc_cookie == 0)
-		(void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch,
-		    NULL, DS_FIND_SNAPSHOTS);
-
 	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
-	if (error)
+	if (error != 0) {
 		return (error == ENOENT ? ESRCH : error);
+	}
 
 	/*
 	 * A dataset name of maximum length cannot have any snapshots,
 	 * so exit immediately.
 	 */
-	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
+	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
+	    ZFS_MAX_DATASET_NAME_LEN) {
 		dmu_objset_rele(os, FTAG);
-		return (ESRCH);
+		return (SET_ERROR(ESRCH));
 	}
 
 	error = dmu_snapshot_list_next(os,
 	    sizeof (zc->zc_name) - strlen(zc->zc_name),
-	    zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL);
-	dmu_objset_rele(os, FTAG);
-	if (error == 0) {
-		error = zfs_ioc_objset_stats(zc); /* fill in the stats */
-		if (error == ENOENT)  {
-			/* We lost a race with destroy, get the next one. */
-			*strchr(zc->zc_name, '@') = '\0';
-			goto top;
+	    zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie,
+	    NULL);
+
+	if (error == 0 && !zc->zc_simple) {
+		dsl_dataset_t *ds;
+		dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
+
+		error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds);
+		if (error == 0) {
+			objset_t *ossnap;
+
+			error = dmu_objset_from_ds(ds, &ossnap);
+			if (error == 0)
+				error = zfs_ioc_objset_stats_impl(zc, ossnap);
+			dsl_dataset_rele(ds, FTAG);
 		}
 	} else if (error == ENOENT) {
-		error = ESRCH;
+		error = SET_ERROR(ESRCH);
 	}
 
+	dmu_objset_rele(os, FTAG);
 	/* if we failed, undo the @ that we tacked on to zc_name */
-	if (error)
+	if (error != 0)
 		*strchr(zc->zc_name, '@') = '\0';
 	return (error);
 }
@@ -1896,6 +2432,7 @@ zfs_prop_set_userquota(const char *dsnam
 	uint64_t *valary;
 	unsigned int vallen;
 	const char *domain;
+	char *dash;
 	zfs_userquota_prop_t type;
 	uint64_t rid;
 	uint64_t quota;
@@ -1905,22 +2442,26 @@ zfs_prop_set_userquota(const char *dsnam
 	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 		nvlist_t *attrs;
 		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
-		VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
-		    &pair) == 0);
+		if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+		    &pair) != 0)
+			return (SET_ERROR(EINVAL));
 	}
 
-	VERIFY(nvpair_value_uint64_array(pair, &valary, &vallen) == 0);
-	VERIFY(vallen == 3);
-	type = valary[0];
-	rid = valary[1];
-	quota = valary[2];
 	/*
-	 * The propname is encoded as
+	 * A correctly constructed propname is encoded as
 	 * userquota@<rid>-<domain>.
 	 */
-	domain = strchr(propname, '-') + 1;
+	if ((dash = strchr(propname, '-')) == NULL ||
+	    nvpair_value_uint64_array(pair, &valary, &vallen) != 0 ||
+	    vallen != 3)
+		return (SET_ERROR(EINVAL));
+
+	domain = dash + 1;
+	type = valary[0];
+	rid = valary[1];
+	quota = valary[2];
 
-	err = zfsvfs_hold(dsname, FTAG, &zfsvfs);
+	err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
 	if (err == 0) {
 		err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
 		zfsvfs_rele(zfsvfs, FTAG);
@@ -1934,7 +2475,7 @@ zfs_prop_set_userquota(const char *dsnam
  * return 0 on success and a positive error code on failure; otherwise if it is
  * not one of the special properties handled by this function, return -1.
  *
- * XXX: It would be better for callers of the properety interface if we handled
+ * XXX: It would be better for callers of the property interface if we handled
  * these special cases in dsl_prop.c (in the dsl layer).
  */
 static int
@@ -1944,7 +2485,7 @@ zfs_prop_set_special(const char *dsname,
 	const char *propname = nvpair_name(pair);
 	zfs_prop_t prop = zfs_name_to_prop(propname);
 	uint64_t intval;
-	int err;
+	int err = -1;
 
 	if (prop == ZPROP_INVAL) {
 		if (zfs_prop_userquota(propname))
@@ -1969,23 +2510,37 @@ zfs_prop_set_special(const char *dsname,
 		err = dsl_dir_set_quota(dsname, source, intval);
 		break;
 	case ZFS_PROP_REFQUOTA:
-		err = dsl_dataset_set_quota(dsname, source, intval);
+		err = dsl_dataset_set_refquota(dsname, source, intval);
+		break;
+	case ZFS_PROP_FILESYSTEM_LIMIT:
+	case ZFS_PROP_SNAPSHOT_LIMIT:
+		if (intval == UINT64_MAX) {
+			/* clearing the limit, just do it */
+			err = 0;
+		} else {
+			err = dsl_dir_activate_fs_ss_limit(dsname);
+		}
+		/*
+		 * Set err to -1 to force the zfs_set_prop_nvlist code down the
+		 * default path to set the value in the nvlist.
+		 */
+		if (err == 0)
+			err = -1;
 		break;
 	case ZFS_PROP_RESERVATION:
 		err = dsl_dir_set_reservation(dsname, source, intval);
 		break;
 	case ZFS_PROP_REFRESERVATION:
-		err = dsl_dataset_set_reservation(dsname, source, intval);
+		err = dsl_dataset_set_refreservation(dsname, source, intval);
 		break;
 	case ZFS_PROP_VOLSIZE:
-		err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip),
-		    intval);
+		err = zvol_set_volsize(dsname, intval);
 		break;
 	case ZFS_PROP_VERSION:
 	{
 		zfsvfs_t *zfsvfs;
 
-		if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs)) != 0)
+		if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
 			break;
 
 		err = zfs_set_version(zfsvfs, intval);
@@ -2001,7 +2556,6 @@ zfs_prop_set_special(const char *dsname,
 		}
 		break;
 	}
-
 	default:
 		err = -1;
 	}
@@ -2011,31 +2565,25 @@ zfs_prop_set_special(const char *dsname,
 
 /*
  * This function is best effort. If it fails to set any of the given properties,
- * it continues to set as many as it can and returns the first error
- * encountered. If the caller provides a non-NULL errlist, it also gives the
- * complete list of names of all the properties it failed to set along with the
- * corresponding error numbers. The caller is responsible for freeing the
- * returned errlist.
+ * it continues to set as many as it can and returns the last error
+ * encountered. If the caller provides a non-NULL errlist, it will be filled in
+ * with the list of names of all the properties that failed along with the
+ * corresponding error numbers.
  *
- * If every property is set successfully, zero is returned and the list pointed
- * at by errlist is NULL.
+ * If every property is set successfully, zero is returned and errlist is not
+ * modified.
  */
 int
 zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
-    nvlist_t **errlist)
+    nvlist_t *errlist)
 {
 	nvpair_t *pair;
 	nvpair_t *propval;
 	int rv = 0;
 	uint64_t intval;
 	char *strval;
-	nvlist_t *genericnvl;
-	nvlist_t *errors;
-	nvlist_t *retrynvl;
-
-	VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-	VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-	VERIFY(nvlist_alloc(&retrynvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	nvlist_t *genericnvl = fnvlist_alloc();
+	nvlist_t *retrynvl = fnvlist_alloc();
 
 retry:
 	pair = NULL;
@@ -2048,48 +2596,50 @@ retry:
 		propval = pair;
 		if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 			nvlist_t *attrs;
-			VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
-			VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
-			    &propval) == 0);
+			attrs = fnvpair_value_nvlist(pair);
+			if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+			    &propval) != 0)
+				err = SET_ERROR(EINVAL);
 		}
 
 		/* Validate value type */
-		if (prop == ZPROP_INVAL) {
+		if (err == 0 && prop == ZPROP_INVAL) {
 			if (zfs_prop_user(propname)) {
 				if (nvpair_type(propval) != DATA_TYPE_STRING)
-					err = EINVAL;
+					err = SET_ERROR(EINVAL);
 			} else if (zfs_prop_userquota(propname)) {
 				if (nvpair_type(propval) !=
 				    DATA_TYPE_UINT64_ARRAY)
-					err = EINVAL;
+					err = SET_ERROR(EINVAL);
+			} else {
+				err = SET_ERROR(EINVAL);
 			}
-		} else {
+		} else if (err == 0) {
 			if (nvpair_type(propval) == DATA_TYPE_STRING) {
 				if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
-					err = EINVAL;
+					err = SET_ERROR(EINVAL);
 			} else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
 				const char *unused;
 
-				VERIFY(nvpair_value_uint64(propval,
-				    &intval) == 0);
+				intval = fnvpair_value_uint64(propval);
 
 				switch (zfs_prop_get_type(prop)) {
 				case PROP_TYPE_NUMBER:
 					break;
 				case PROP_TYPE_STRING:
-					err = EINVAL;
+					err = SET_ERROR(EINVAL);
 					break;
 				case PROP_TYPE_INDEX:
 					if (zfs_prop_index_to_string(prop,
 					    intval, &unused) != 0)
-						err = EINVAL;
+						err = SET_ERROR(EINVAL);
 					break;
 				default:
 					cmn_err(CE_PANIC,
 					    "unknown property type");
 				}
 			} else {
-				err = EINVAL;
+				err = SET_ERROR(EINVAL);
 			}
 		}
 
@@ -2115,8 +2665,11 @@ retry:
 			}
 		}
 
-		if (err != 0)
-			VERIFY(nvlist_add_int32(errors, propname, err) == 0);
+		if (err != 0) {
+			if (errlist != NULL)
+				fnvlist_add_int32(errlist, propname, err);
+			rv = err;
+		}
 	}
 
 	if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
@@ -2138,44 +2691,33 @@ retry:
 			propval = pair;
 			if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 				nvlist_t *attrs;
-				VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
-				VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
-				    &propval) == 0);
+				attrs = fnvpair_value_nvlist(pair);
+				propval = fnvlist_lookup_nvpair(attrs,
+				    ZPROP_VALUE);
 			}
 
 			if (nvpair_type(propval) == DATA_TYPE_STRING) {
-				VERIFY(nvpair_value_string(propval,
-				    &strval) == 0);
-				err = dsl_prop_set(dsname, propname, source, 1,
-				    strlen(strval) + 1, strval);
+				strval = fnvpair_value_string(propval);
+				err = dsl_prop_set_string(dsname, propname,
+				    source, strval);
 			} else {
-				VERIFY(nvpair_value_uint64(propval,
-				    &intval) == 0);
-				err = dsl_prop_set(dsname, propname, source, 8,
-				    1, &intval);
+				intval = fnvpair_value_uint64(propval);
+				err = dsl_prop_set_int(dsname, propname, source,
+				    intval);
 			}
 
 			if (err != 0) {
-				VERIFY(nvlist_add_int32(errors, propname,
-				    err) == 0);
+				if (errlist != NULL) {
+					fnvlist_add_int32(errlist, propname,
+					    err);
+				}
+				rv = err;
 			}
 		}
 	}
 	nvlist_free(genericnvl);
 	nvlist_free(retrynvl);
 
-	if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
-		nvlist_free(errors);
-		errors = NULL;
-	} else {
-		VERIFY(nvpair_value_int32(pair, &rv) == 0);
-	}
-
-	if (errlist == NULL)
-		nvlist_free(errors);
-	else
-		*errlist = errors;
-
 	return (rv);
 }
 
@@ -2183,28 +2725,26 @@ retry:
  * Check that all the properties are valid user properties.
  */
 static int
-zfs_check_userprops(char *fsname, nvlist_t *nvl)
+zfs_check_userprops(const char *fsname, nvlist_t *nvl)
 {
 	nvpair_t *pair = NULL;
 	int error = 0;
 
 	while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
 		const char *propname = nvpair_name(pair);
-		char *valstr;
 
 		if (!zfs_prop_user(propname) ||
 		    nvpair_type(pair) != DATA_TYPE_STRING)
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 
 		if (error = zfs_secpolicy_write_perms(fsname,
 		    ZFS_DELEG_PERM_USERPROP, CRED()))
 			return (error);
 
 		if (strlen(propname) >= ZAP_MAXNAMELEN)
-			return (ENAMETOOLONG);
+			return (SET_ERROR(ENAMETOOLONG));
 
-		VERIFY(nvpair_value_string(pair, &valstr) == 0);
-		if (strlen(valstr) >= ZAP_MAXVALUELEN)
+		if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN)
 			return (E2BIG);
 	}
 	return (0);
@@ -2227,7 +2767,7 @@ props_skip(nvlist_t *props, nvlist_t *sk
 }
 
 static int
-clear_received_props(objset_t *os, const char *fs, nvlist_t *props,
+clear_received_props(const char *dsname, nvlist_t *props,
     nvlist_t *skipped)
 {
 	int err = 0;
@@ -2239,8 +2779,8 @@ clear_received_props(objset_t *os, const
 		 * properties at least once on or after SPA_VERSION_RECVD_PROPS.
 		 */
 		zprop_source_t flags = (ZPROP_SRC_NONE |
-		    (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0));
-		err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL);
+		    (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0));
+		err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL);
 	}
 	nvlist_free(cleared_props);
 	return (err);
@@ -2263,7 +2803,7 @@ zfs_ioc_set_prop(zfs_cmd_t *zc)
 	boolean_t received = zc->zc_cookie;
 	zprop_source_t source = (received ? ZPROP_SRC_RECEIVED :
 	    ZPROP_SRC_LOCAL);
-	nvlist_t *errors = NULL;
+	nvlist_t *errors;
 	int error;
 
 	if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
@@ -2272,23 +2812,21 @@ zfs_ioc_set_prop(zfs_cmd_t *zc)
 
 	if (received) {
 		nvlist_t *origprops;
-		objset_t *os;
-
-		if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) {
-			if (dsl_prop_get_received(os, &origprops) == 0) {
-				(void) clear_received_props(os,
-				    zc->zc_name, origprops, nvl);
-				nvlist_free(origprops);
-			}
 
-			dsl_prop_set_hasrecvd(os);
-			dmu_objset_rele(os, FTAG);
+		if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) {
+			(void) clear_received_props(zc->zc_name,
+			    origprops, nvl);
+			nvlist_free(origprops);
 		}
+
+		error = dsl_prop_set_hasrecvd(zc->zc_name);
 	}
 
-	error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, &errors);
+	errors = fnvlist_alloc();
+	if (error == 0)
+		error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors);
 
-	if (zc->zc_nvlist_dst != 0 && errors != 0) {
+	if (zc->zc_nvlist_dst != 0 && errors != NULL) {
 		(void) put_nvlist(zc, errors);
 	}
 
@@ -2327,12 +2865,12 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc)
 		 */
 		if (prop == ZPROP_INVAL) {
 			if (!zfs_prop_user(propname))
-				return (EINVAL);
+				return (SET_ERROR(EINVAL));
 
 			type = PROP_TYPE_STRING;
 		} else if (prop == ZFS_PROP_VOLSIZE ||
 		    prop == ZFS_PROP_VERSION) {
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 		} else {
 			type = zfs_prop_get_type(prop);
 		}
@@ -2349,7 +2887,7 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc)
 			break;
 		default:
 			nvlist_free(dummy);
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 		}
 
 		pair = nvlist_next_nvpair(dummy, NULL);
@@ -2365,11 +2903,11 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc)
 		 * they are not considered inheritable.
 		 */
 		if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 	}
 
-	/* the property name has been validated by zfs_secpolicy_inherit() */
-	return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL));
+	/* property name has been validated by zfs_secpolicy_inherit_prop() */
+	return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source));
 }
 
 static int
@@ -2442,56 +2980,9 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc)
 	if (error == 0 && zc->zc_nvlist_dst != 0)
 		error = put_nvlist(zc, nvp);
 	else
-		error = EFAULT;
-
-	nvlist_free(nvp);
-	return (error);
-}
-
-static int
-zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc)
-{
-	nvlist_t *nvp;
-	int error;
-	uint32_t uid;
-	uint32_t gid;
-	uint32_t *groups;
-	uint_t group_cnt;
-	cred_t	*usercred;
-
-	if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
-	    zc->zc_iflags, &nvp)) != 0) {
-		return (error);
-	}
-
-	if ((error = nvlist_lookup_uint32(nvp,
-	    ZFS_DELEG_PERM_UID, &uid)) != 0) {
-		nvlist_free(nvp);
-		return (EPERM);
-	}
-
-	if ((error = nvlist_lookup_uint32(nvp,
-	    ZFS_DELEG_PERM_GID, &gid)) != 0) {
-		nvlist_free(nvp);
-		return (EPERM);
-	}
+		error = SET_ERROR(EFAULT);
 
-	if ((error = nvlist_lookup_uint32_array(nvp, ZFS_DELEG_PERM_GROUPS,
-	    &groups, &group_cnt)) != 0) {
-		nvlist_free(nvp);
-		return (EPERM);
-	}
-	usercred = cralloc();
-	if ((crsetugid(usercred, uid, gid) != 0) ||
-	    (crsetgroups(usercred, group_cnt, (gid_t *)groups) != 0)) {
-		nvlist_free(nvp);
-		crfree(usercred);
-		return (EPERM);
-	}
 	nvlist_free(nvp);
-	error = dsl_deleg_access(zc->zc_name,
-	    zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred);
-	crfree(usercred);
 	return (error);
 }
 
@@ -2518,7 +3009,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc)
 	 */
 	if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) {
 		nvlist_free(fsaclnv);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	/*
@@ -2528,7 +3019,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc)
 	 */
 
 	error = secpolicy_zfs(CRED());
-	if (error) {
+	if (error != 0) {
 		if (zc->zc_perm_action == B_FALSE) {
 			error = dsl_deleg_can_allow(zc->zc_name,
 			    fsaclnv, CRED());
@@ -2574,9 +3065,35 @@ zfs_ioc_get_fsacl(zfs_cmd_t *zc)
 static vfs_t *
 zfs_get_vfs(const char *resource)
 {
+	vfs_t *vfsp;
+
+#ifdef __FreeBSD__
+	mtx_lock(&mountlist_mtx);
+	TAILQ_FOREACH(vfsp, &mountlist, mnt_list) {
+		if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) {
+			if (vfs_busy(vfsp, MBF_MNTLSTLOCK) != 0)
+				vfsp = NULL;
+			break;
+		}
+	}
+	if (vfsp == NULL)
+		mtx_unlock(&mountlist_mtx);
+#endif
+#ifdef __NetBSD__
+	mount_iterator_t *iter;
+
+	mountlist_iterator_init(&iter);
+	while ((vfsp = mountlist_iterator_next(iter)) != NULL) {
+		if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) {
+			if (vfs_busy(vfsp, 0) != 0)
+				vfsp = NULL;
+			break;
+		}
+	}
+	mountlist_iterator_destroy(iter);
+#endif
 
-	printf("XXX zfs_get_vfs write me\n");
-	return NULL;
+	return (vfsp);
 }
 
 /* ARGSUSED */
@@ -2592,10 +3109,10 @@ zfs_create_cb(objset_t *os, void *arg, c
 
 /*
  * inputs:
- * createprops		list of properties requested by creator
- * default_zplver	zpl version to use if unspecified in createprops
- * fuids_ok		fuids allowed in this version of the spa?
  * os			parent objset pointer (NULL if root fs)
+ * fuids_ok		fuids allowed in this version of the spa?
+ * sa_ok		SAs allowed in this version of the spa?
+ * createprops		list of properties requested by creator
  *
  * outputs:
  * zplprops	values for the zplprops we attach to the master node object
@@ -2612,8 +3129,8 @@ zfs_create_cb(objset_t *os, void *arg, c
  */
 static int
 zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
-    boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops,
-    boolean_t *is_ci)
+    boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
+    nvlist_t *zplprops, boolean_t *is_ci)
 {
 	uint64_t sense = ZFS_PROP_UNDEFINED;
 	uint64_t norm = ZFS_PROP_UNDEFINED;
@@ -2649,10 +3166,11 @@ zfs_fill_zplprops_impl(objset_t *os, uin
 	 */
 	if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) ||
 	    (zplver >= ZPL_VERSION_FUID && !fuids_ok) ||
+	    (zplver >= ZPL_VERSION_SA && !sa_ok) ||
 	    (zplver < ZPL_VERSION_NORMALIZATION &&
 	    (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
 	    sense != ZFS_PROP_UNDEFINED)))
-		return (ENOTSUP);
+		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Put the version in the zplprops
@@ -2690,11 +3208,13 @@ static int
 zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
     nvlist_t *zplprops, boolean_t *is_ci)
 {
-	boolean_t fuids_ok = B_TRUE;
+	boolean_t fuids_ok, sa_ok;
 	uint64_t zplver = ZPL_VERSION;
 	objset_t *os = NULL;
-	char parentname[MAXNAMELEN];
+	char parentname[ZFS_MAX_DATASET_NAME_LEN];
 	char *cp;
+	spa_t *spa;
+	uint64_t spa_vers;
 	int error;
 
 	(void) strlcpy(parentname, dataset, sizeof (parentname));
@@ -2702,12 +3222,15 @@ zfs_fill_zplprops(const char *dataset, n
 	ASSERT(cp != NULL);
 	cp[0] = '\0';
 
-	if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE))
-		zplver = ZPL_VERSION_USERSPACE - 1;
-	if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) {
-		zplver = ZPL_VERSION_FUID - 1;
-		fuids_ok = B_FALSE;
-	}
+	if ((error = spa_open(dataset, &spa, FTAG)) != 0)
+		return (error);
+
+	spa_vers = spa_version(spa);
+	spa_close(spa, FTAG);
+
+	zplver = zfs_zpl_version_map(spa_vers);
+	fuids_ok = (zplver >= ZPL_VERSION_FUID);
+	sa_ok = (zplver >= ZPL_VERSION_SA);
 
 	/*
 	 * Open parent object set so we can inherit zplprop values.
@@ -2715,7 +3238,7 @@ zfs_fill_zplprops(const char *dataset, n
 	if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0)
 		return (error);
 
-	error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, createprops,
+	error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops,
 	    zplprops, is_ci);
 	dmu_objset_rele(os, FTAG);
 	return (error);
@@ -2725,41 +3248,45 @@ static int
 zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
     nvlist_t *zplprops, boolean_t *is_ci)
 {
-	boolean_t fuids_ok = B_TRUE;
+	boolean_t fuids_ok;
+	boolean_t sa_ok;
 	uint64_t zplver = ZPL_VERSION;
 	int error;
 
-	if (spa_vers < SPA_VERSION_FUID) {
-		zplver = ZPL_VERSION_FUID - 1;
-		fuids_ok = B_FALSE;
-	}
+	zplver = zfs_zpl_version_map(spa_vers);
+	fuids_ok = (zplver >= ZPL_VERSION_FUID);
+	sa_ok = (zplver >= ZPL_VERSION_SA);
 
-	error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, createprops,
-	    zplprops, is_ci);
+	error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
+	    createprops, zplprops, is_ci);
 	return (error);
 }
 
 /*
- * inputs:
- * zc_objset_type	type of objset to create (fs vs zvol)
- * zc_name		name of new objset
- * zc_value		name of snapshot to clone from (may be empty)
- * zc_nvlist_src{_size}	nvlist of properties to apply
+ * innvl: {
+ *     "type" -> dmu_objset_type_t (int32)
+ *     (optional) "props" -> { prop -> value }
+ * }
  *
- * outputs: none
+ * outnvl: propname -> error code (int32)
  */
 static int
-zfs_ioc_create(zfs_cmd_t *zc)
+zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 {
-	objset_t *clone;
 	int error = 0;
-	zfs_creat_t zct;
+	zfs_creat_t zct = { 0 };
 	nvlist_t *nvprops = NULL;
 	void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
-	dmu_objset_type_t type = zc->zc_objset_type;
+	int32_t type32;
+	dmu_objset_type_t type;
+	boolean_t is_insensitive = B_FALSE;
+
+	if (nvlist_lookup_int32(innvl, "type", &type32) != 0)
+		return (SET_ERROR(EINVAL));
+	type = type32;
+	(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
 
 	switch (type) {
-
 	case DMU_OST_ZFS:
 		cbfunc = zfs_create_cb;
 		break;
@@ -2772,359 +3299,626 @@ zfs_ioc_create(zfs_cmd_t *zc)
 		cbfunc = NULL;
 		break;
 	}
-	if (strchr(zc->zc_name, '@') ||
-	    strchr(zc->zc_name, '%'))
-		return (EINVAL);
-
-	if (zc->zc_nvlist_src != 0 &&
-	    (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
-	    zc->zc_iflags, &nvprops)) != 0)
-		return (error);
+	if (strchr(fsname, '@') ||
+	    strchr(fsname, '%'))
+		return (SET_ERROR(EINVAL));
 
-	zct.zct_zplprops = NULL;
 	zct.zct_props = nvprops;
 
-	if (zc->zc_value[0] != '\0') {
-		/*
-		 * We're creating a clone of an existing snapshot.
-		 */
-		zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
-		if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) {
-			nvlist_free(nvprops);
-			return (EINVAL);
-		}
-
-		error = dmu_objset_hold(zc->zc_value, FTAG, &clone);
-		if (error) {
-			nvlist_free(nvprops);
-			return (error);
-		}
-
-		error = dmu_objset_clone(zc->zc_name, dmu_objset_ds(clone), 0);
-		dmu_objset_rele(clone, FTAG);
-		if (error) {
-			nvlist_free(nvprops);
-			return (error);
-		}
-	} else {
-		boolean_t is_insensitive = B_FALSE;
-
-		if (cbfunc == NULL) {
-			nvlist_free(nvprops);
-			return (EINVAL);
-		}
-
-		if (type == DMU_OST_ZVOL) {
-			uint64_t volsize, volblocksize;
+	if (cbfunc == NULL)
+		return (SET_ERROR(EINVAL));
 
-			if (nvprops == NULL ||
-			    nvlist_lookup_uint64(nvprops,
-			    zfs_prop_to_name(ZFS_PROP_VOLSIZE),
-			    &volsize) != 0) {
-				nvlist_free(nvprops);
-				return (EINVAL);
-			}
+	if (type == DMU_OST_ZVOL) {
+		uint64_t volsize, volblocksize;
 
-			if ((error = nvlist_lookup_uint64(nvprops,
-			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
-			    &volblocksize)) != 0 && error != ENOENT) {
-				nvlist_free(nvprops);
-				return (EINVAL);
-			}
+		if (nvprops == NULL)
+			return (SET_ERROR(EINVAL));
+		if (nvlist_lookup_uint64(nvprops,
+		    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0)
+			return (SET_ERROR(EINVAL));
+
+		if ((error = nvlist_lookup_uint64(nvprops,
+		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+		    &volblocksize)) != 0 && error != ENOENT)
+			return (SET_ERROR(EINVAL));
 
-			if (error != 0)
-				volblocksize = zfs_prop_default_numeric(
-				    ZFS_PROP_VOLBLOCKSIZE);
+		if (error != 0)
+			volblocksize = zfs_prop_default_numeric(
+			    ZFS_PROP_VOLBLOCKSIZE);
 
-			if ((error = zvol_check_volblocksize(
-			    volblocksize)) != 0 ||
-			    (error = zvol_check_volsize(volsize,
-			    volblocksize)) != 0) {
-				nvlist_free(nvprops);
-				return (error);
-			}
-		} else if (type == DMU_OST_ZFS) {
-			int error;
+		if ((error = zvol_check_volblocksize(
+		    volblocksize)) != 0 ||
+		    (error = zvol_check_volsize(volsize,
+		    volblocksize)) != 0)
+			return (error);
+	} else if (type == DMU_OST_ZFS) {
+		int error;
 
-			/*
-			 * We have to have normalization and
-			 * case-folding flags correct when we do the
-			 * file system creation, so go figure them out
-			 * now.
-			 */
-			VERIFY(nvlist_alloc(&zct.zct_zplprops,
-			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
-			error = zfs_fill_zplprops(zc->zc_name, nvprops,
-			    zct.zct_zplprops, &is_insensitive);
-			if (error != 0) {
-				nvlist_free(nvprops);
-				nvlist_free(zct.zct_zplprops);
-				return (error);
-			}
+		/*
+		 * We have to have normalization and
+		 * case-folding flags correct when we do the
+		 * file system creation, so go figure them out
+		 * now.
+		 */
+		VERIFY(nvlist_alloc(&zct.zct_zplprops,
+		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
+		error = zfs_fill_zplprops(fsname, nvprops,
+		    zct.zct_zplprops, &is_insensitive);
+		if (error != 0) {
+			nvlist_free(zct.zct_zplprops);
+			return (error);
 		}
-		error = dmu_objset_create(zc->zc_name, type,
-		    is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
-		nvlist_free(zct.zct_zplprops);
 	}
 
+	error = dmu_objset_create(fsname, type,
+	    is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct);
+	nvlist_free(zct.zct_zplprops);
+
 	/*
 	 * It would be nice to do this atomically.
 	 */
 	if (error == 0) {
-		error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL,
-		    nvprops, NULL);
+		error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
+		    nvprops, outnvl);
 		if (error != 0)
-			(void) dmu_objset_destroy(zc->zc_name, B_FALSE);
+			(void) dsl_destroy_head(fsname);
 	}
-	nvlist_free(nvprops);
+#ifdef __FreeBSD__
+	if (error == 0 && type == DMU_OST_ZVOL)
+		zvol_create_minors(fsname);
+#endif
 	return (error);
 }
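
A minimal sketch of an innvl matching the contract documented above, here for a 1 GB zvol (name and size are illustrative):

	nvlist_t *innvl = fnvlist_alloc();
	nvlist_t *props = fnvlist_alloc();
	fnvlist_add_int32(innvl, "type", DMU_OST_ZVOL);
	/* volsize is mandatory for DMU_OST_ZVOL */
	fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLSIZE),
	    1ULL << 30);
	fnvlist_add_nvlist(innvl, "props", props);
	fnvlist_free(props);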
 
 /*
- * inputs:
- * zc_name	name of filesystem
- * zc_value	short name of snapshot
- * zc_cookie	recursive flag
- * zc_nvlist_src[_size] property list
+ * innvl: {
+ *     "origin" -> name of origin snapshot
+ *     (optional) "props" -> { prop -> value }
+ * }
  *
- * outputs:
- * zc_value	short snapname (i.e. part after the '@')
+ * outnvl: propname -> error code (int32)
  */
 static int
-zfs_ioc_snapshot(zfs_cmd_t *zc)
+zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
 {
+	int error = 0;
 	nvlist_t *nvprops = NULL;
-	int error;
-	boolean_t recursive = zc->zc_cookie;
-
-	if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
-		return (EINVAL);
+	char *origin_name;
 
-	if (zc->zc_nvlist_src != 0 &&
-	    (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
-	    zc->zc_iflags, &nvprops)) != 0)
+	if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0)
+		return (SET_ERROR(EINVAL));
+	(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
+
+	if (strchr(fsname, '@') ||
+	    strchr(fsname, '%'))
+		return (SET_ERROR(EINVAL));
+
+	if (dataset_namecheck(origin_name, NULL, NULL) != 0)
+		return (SET_ERROR(EINVAL));
+	error = dmu_objset_clone(fsname, origin_name);
+	if (error != 0)
 		return (error);
 
-	error = zfs_check_userprops(zc->zc_name, nvprops);
-	if (error)
-		goto out;
-
-	if (!nvlist_empty(nvprops) &&
-	    zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) {
-		error = ENOTSUP;
-		goto out;
+	/*
+	 * It would be nice to do this atomically.
+	 */
+	if (error == 0) {
+		error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
+		    nvprops, outnvl);
+		if (error != 0)
+			(void) dsl_destroy_head(fsname);
 	}
-
-	error = dmu_objset_snapshot(zc->zc_name, zc->zc_value,
-	    nvprops, recursive);
-
-out:
-	nvlist_free(nvprops);
+#ifdef __FreeBSD__
+	if (error == 0)
+		zvol_create_minors(fsname);
+#endif
 	return (error);
 }
 
-int
-zfs_unmount_snap(const char *name, void *arg)
+/*
+ * innvl: {
+ *     "snaps" -> { snapshot1, snapshot2 }
+ *     (optional) "props" -> { prop -> value (string) }
+ * }
+ *
+ * outnvl: snapshot -> error code (int32)
+ */
+static int
+zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
 {
-	vfs_t *vfsp = NULL;
+	nvlist_t *snaps;
+	nvlist_t *props = NULL;
+	int error, poollen;
+	nvpair_t *pair;
 
-	if (arg) {
-		char *snapname = arg;
-		char *fullname = kmem_asprintf("%s@%s", name, snapname);
-		vfsp = zfs_get_vfs(fullname);
-		strfree(fullname);
-	} else if (strchr(name, '@')) {
-		vfsp = zfs_get_vfs(name);
-	}
+	(void) nvlist_lookup_nvlist(innvl, "props", &props);
+	if ((error = zfs_check_userprops(poolname, props)) != 0)
+		return (error);
 
-	if (vfsp) {
-#ifdef __NetBSD__
-		int err;
-		if ((err = dounmount(vfsp, MNT_FORCE, curlwp)) != 0)
-			return (err);
-#else
+	if (!nvlist_empty(props) &&
+	    zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS))
+		return (SET_ERROR(ENOTSUP));
+
+	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
+		return (SET_ERROR(EINVAL));
+	poollen = strlen(poolname);
+	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(snaps, pair)) {
+		const char *name = nvpair_name(pair);
+		const char *cp = strchr(name, '@');
 
 		/*
-		 * Always force the unmount for snapshots.
+		 * The snap name must contain an @, and the part after it must
+		 * contain only valid characters.
 		 */
-		int flag = MS_FORCE;
-		int err;
+		if (cp == NULL ||
+		    zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+			return (SET_ERROR(EINVAL));
 
-		if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) {
-			VFS_RELE(vfsp);
-			return (err);
+		/*
+		 * The snap must be in the specified pool.
+		 */
+		if (strncmp(name, poolname, poollen) != 0 ||
+		    (name[poollen] != '/' && name[poollen] != '@'))
+			return (SET_ERROR(EXDEV));
+
+		/* This must be the only snap of this fs. */
+		for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair);
+		    pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) {
+			if (strncmp(name, nvpair_name(pair2), cp - name + 1)
+			    == 0) {
+				return (SET_ERROR(EXDEV));
+			}
 		}
-		VFS_RELE(vfsp);
-		if ((err = dounmount(vfsp, flag, kcred)) != 0)
-			return (err);
-#endif
 	}
-	return (0);
+
+	error = dsl_dataset_snapshot(snaps, props, outnvl);
+	return (error);
 }
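
A sketch of a "snaps" innvl that passes the checks above: both entries are in the same pool and name distinct datasets (names are illustrative):

	nvlist_t *innvl = fnvlist_alloc();
	nvlist_t *snaps = fnvlist_alloc();
	fnvlist_add_boolean(snaps, "tank/fs@backup");
	fnvlist_add_boolean(snaps, "tank/vol@backup");
	fnvlist_add_nvlist(innvl, "snaps", snaps);
	fnvlist_free(snaps);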
 
 /*
- * inputs:
- * zc_name		name of filesystem
- * zc_value		short name of snapshot
- * zc_defer_destroy	mark for deferred destroy
- *
- * outputs:	none
+ * innvl: "message" -> string
  */
+/* ARGSUSED */
 static int
-zfs_ioc_destroy_snaps(zfs_cmd_t *zc)
+zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
 {
-	int err;
+	char *message;
+	spa_t *spa;
+	int error;
+	char *poolname;
 
-	if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
-		return (EINVAL);
-	err = dmu_objset_find(zc->zc_name,
-	    zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN);
-	if (err)
-		return (err);
-	return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value,
-	    zc->zc_defer_destroy));
+	/*
+	 * The poolname in the ioctl is not set; we get it from the TSD,
+	 * which was set at the end of the last successful ioctl that allows
+	 * logging.  The secpolicy func already checked that it is set.
+	 * Only one log ioctl is allowed after each successful ioctl, so
+	 * we clear the TSD here.
+	 */
+	poolname = tsd_get(zfs_allow_log_key);
+	(void) tsd_set(zfs_allow_log_key, NULL);
+	error = spa_open(poolname, &spa, FTAG);
+	strfree(poolname);
+	if (error != 0)
+		return (error);
+
+	if (nvlist_lookup_string(innvl, "message", &message) != 0)  {
+	if (nvlist_lookup_string(innvl, "message", &message) != 0) {
+		return (SET_ERROR(EINVAL));
+	}
+
+	if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
+		spa_close(spa, FTAG);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	error = spa_history_log(spa, message);
+	spa_close(spa, FTAG);
+	return (error);
 }
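
The matching innvl is a single string entry; a one-line sketch (the message text is illustrative):

	nvlist_t *innvl = fnvlist_alloc();
	fnvlist_add_string(innvl, "message", "zpool upgrade tank");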
 
-/*
- * inputs:
- * zc_name		name of dataset to destroy
- * zc_objset_type	type of objset
- * zc_defer_destroy	mark for deferred destroy
- *
- * outputs:		none
- */
+#ifdef __FreeBSD__
 static int
-zfs_ioc_destroy(zfs_cmd_t *zc)
+zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
 {
-	int err;
-	if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) {
-		err = zfs_unmount_snap(zc->zc_name, NULL);
-		if (err)
-			return (err);
-	}
+	char name[MAXNAMELEN];
+	spa_t *spa;
+	vdev_t *vd;
+	char *command;
+	uint64_t pool_guid;
+	uint64_t vdev_guid;
+	int error;
 
-	err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy);
-	if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0)
-		(void) zvol_remove_minor(zc->zc_name);
-	return (err);
+	if (nvlist_lookup_uint64(innvl,
+	    ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+		return (EINVAL);
+	if (nvlist_lookup_uint64(innvl,
+	    ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
+		return (EINVAL);
+	if (nvlist_lookup_string(innvl,
+	    "command", &command) != 0)
+		return (EINVAL);
+
+	mutex_enter(&spa_namespace_lock);
+	spa = spa_by_guid(pool_guid, vdev_guid);
+	if (spa != NULL)
+		strcpy(name, spa_name(spa));
+	mutex_exit(&spa_namespace_lock);
+	if (spa == NULL)
+		return (ENOENT);
+
+	if ((error = spa_open(name, &spa, FTAG)) != 0)
+		return (error);
+	spa_vdev_state_enter(spa, SCL_ALL);
+	vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
+	if (vd == NULL) {
+		(void) spa_vdev_state_exit(spa, NULL, ENXIO);
+		spa_close(spa, FTAG);
+		return (ENODEV);
+	}
+	error = vdev_label_write_pad2(vd, command, strlen(command));
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+	txg_wait_synced(spa->spa_dsl_pool, 0);
+	spa_close(spa, FTAG);
+	return (error);
 }
+#endif
 
 /*
- * inputs:
- * zc_name	name of dataset to rollback (to most recent snapshot)
+ * The dp_config_rwlock must not be held when calling this, because the
+ * unmount may need to write out data.
  *
- * outputs:	none
+ * This function is best-effort.  Callers must deal gracefully if it
+ * remains mounted (or is remounted after this call).
+ *
+ * Returns 0 if the argument is not a snapshot, or it is not currently a
+ * filesystem, or we were able to unmount it.  Returns error code otherwise.
  */
-static int
-zfs_ioc_rollback(zfs_cmd_t *zc)
+int
+zfs_unmount_snap(const char *snapname)
 {
-	dsl_dataset_t *ds, *clone;
-	int error;
+	vfs_t *vfsp;
 	zfsvfs_t *zfsvfs;
-	char *clone_name;
-
-	error = dsl_dataset_hold(zc->zc_name, FTAG, &ds);
-	if (error)
-		return (error);
+	int err;
 
-	/* must not be a snapshot */
-	if (dsl_dataset_is_snapshot(ds)) {
-		dsl_dataset_rele(ds, FTAG);
-		return (EINVAL);
-	}
+	if (strchr(snapname, '@') == NULL)
+		return (0);
 
-	/* must have a most recent snapshot */
-	if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
-		dsl_dataset_rele(ds, FTAG);
-		return (EINVAL);
-	}
+	vfsp = zfs_get_vfs(snapname);
+	if (vfsp == NULL)
+		return (0);
 
-	/*
-	 * Create clone of most recent snapshot.
-	 */
-	clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name);
-	error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT);
-	if (error)
-		goto out;
+	zfsvfs = vfsp->vfs_data;
+	ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
 
-	error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone);
-	if (error)
-		goto out;
+	err = vn_vfswlock(vfsp->vfs_vnodecovered);
+#ifdef illumos
+	VFS_RELE(vfsp);
+#else
+	vfs_unbusy(vfsp);
+#endif
+	if (err != 0)
+		return (SET_ERROR(err));
 
 	/*
-	 * Do clone swap.
+	 * Always force the unmount for snapshots.
 	 */
-	if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
-		error = zfs_suspend_fs(zfsvfs);
-		if (error == 0) {
-			int resume_err;
-
-			if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) {
-				error = dsl_dataset_clone_swap(clone, ds,
-				    B_TRUE);
-				dsl_dataset_disown(ds, FTAG);
-				ds = NULL;
-			} else {
-				error = EBUSY;
-			}
-			resume_err = zfs_resume_fs(zfsvfs, zc->zc_name);
-			error = error ? error : resume_err;
-		}
-		VFS_RELE(zfsvfs->z_vfs);
-	} else {
-		if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) {
-			error = dsl_dataset_clone_swap(clone, ds, B_TRUE);
-			dsl_dataset_disown(ds, FTAG);
-			ds = NULL;
-		} else {
-			error = EBUSY;
-		}
-	}
 
-	/*
-	 * Destroy clone (which also closes it).
-	 */
-	(void) dsl_dataset_destroy(clone, FTAG, B_FALSE);
+#ifdef illumos
+	(void) dounmount(vfsp, MS_FORCE, kcred);
+#else
+	vfs_ref(vfsp);
+	(void) dounmount(vfsp, MS_FORCE, curthread);
+#endif
+	return (0);
+}
 
-out:
-	strfree(clone_name);
-	if (ds)
-		dsl_dataset_rele(ds, FTAG);
-	return (error);
+/* ARGSUSED */
+static int
+zfs_unmount_snap_cb(const char *snapname, void *arg)
+{
+	return (zfs_unmount_snap(snapname));
 }
 
 /*
- * inputs:
- * zc_name	old name of dataset
- * zc_value	new name of dataset
- * zc_cookie	recursive flag (only valid for snapshots)
- *
- * outputs:	none
+ * When a clone is destroyed, its origin may also need to be destroyed,
+ * in which case it must be unmounted.  This routine will do that unmount
+ * if necessary.
  */
-static int
+void
+zfs_destroy_unmount_origin(const char *fsname)
+{
+	int error;
+	objset_t *os;
+	dsl_dataset_t *ds;
+
+	error = dmu_objset_hold(fsname, FTAG, &os);
+	if (error != 0)
+		return;
+	ds = dmu_objset_ds(os);
+	if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
+		char originname[ZFS_MAX_DATASET_NAME_LEN];
+		dsl_dataset_name(ds->ds_prev, originname);
+		dmu_objset_rele(os, FTAG);
+		(void) zfs_unmount_snap(originname);
+	} else {
+		dmu_objset_rele(os, FTAG);
+	}
+}
+
+/*
+ * innvl: {
+ *     "snaps" -> { snapshot1, snapshot2 }
+ *     (optional boolean) "defer"
+ * }
+ *
+ * outnvl: snapshot -> error code (int32)
+ *
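+ * For example (a sketch; the dataset names are hypothetical), a caller
+ * requesting deferred destruction of two snapshots could pass:
+ *
+ *     innvl = {
+ *         "snaps" -> { "tank/fs@a", "tank/fs@b" },
+ *         "defer"            (presence alone selects deferred destroy)
+ *     }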
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	int error, poollen;
+	nvlist_t *snaps;
+	nvpair_t *pair;
+	boolean_t defer;
+
+	if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
+		return (SET_ERROR(EINVAL));
+	defer = nvlist_exists(innvl, "defer");
+
+	poollen = strlen(poolname);
+	for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(snaps, pair)) {
+		const char *name = nvpair_name(pair);
+
+		/*
+		 * The snap must be in the specified pool to prevent the
+		 * invalid removal of zvol minors below.
+		 */
+		if (strncmp(name, poolname, poollen) != 0 ||
+		    (name[poollen] != '/' && name[poollen] != '@'))
+			return (SET_ERROR(EXDEV));
+
+		error = zfs_unmount_snap(name);
+		if (error != 0)
+			return (error);
+#ifdef __FreeBSD__
+		zvol_remove_minors(name);
+#endif
+	}
+
+	return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
+}
+
+/*
+ * Create bookmarks.  Bookmark names are of the form <fs>#<bmark>.
+ * All bookmarks must be in the same pool.
+ *
+ * innvl: {
+ *     bookmark1 -> snapshot1, bookmark2 -> snapshot2
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
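+ * For example (hypothetical names), creating one bookmark from an
+ * existing snapshot in the same filesystem:
+ *
+ *     innvl = { "tank/fs#jan" -> "tank/fs@2017-01-01" }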
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+		char *snap_name;
+
+		/*
+		 * Verify the snapshot argument.
+		 */
+		if (nvpair_value_string(pair, &snap_name) != 0)
+			return (SET_ERROR(EINVAL));
+
+		/* Verify that the keys (bookmarks) are unique */
+		for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair);
+		    pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) {
+			if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0)
+				return (SET_ERROR(EINVAL));
+		}
+	}
+
+	return (dsl_bookmark_create(innvl, outnvl));
+}
+
+/*
+ * innvl: {
+ *     property 1, property 2, ...
+ * }
+ *
+ * outnvl: {
+ *     bookmark name 1 -> { property 1, property 2, ... },
+ *     bookmark name 2 -> { property 1, property 2, ... }
+ * }
+ *
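+ * For example (a sketch; the property and bookmark names are
+ * hypothetical):
+ *
+ *     innvl  = { "guid", "createtxg" }
+ *     outnvl = { "jan" -> { "guid" -> ..., "createtxg" -> ... } }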
+ */
+static int
+zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	return (dsl_get_bookmarks(fsname, innvl, outnvl));
+}
+
+/*
+ * innvl: {
+ *     bookmark name 1, bookmark name 2
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
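+ * For example (hypothetical names); an empty outnvl means both
+ * bookmarks were destroyed:
+ *
+ *     innvl = { "tank/fs#jan", "tank/fs#feb" }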
+ */
+static int
+zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
+    nvlist_t *outnvl)
+{
+	int error, poollen;
+
+	poollen = strlen(poolname);
+	for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+		const char *name = nvpair_name(pair);
+		const char *cp = strchr(name, '#');
+
+		/*
+		 * The bookmark name must contain a '#', and the part after it
+		 * must contain only valid characters.
+		 */
+		if (cp == NULL ||
+		    zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+			return (SET_ERROR(EINVAL));
+
+		/*
+		 * The bookmark must be in the specified pool.
+		 */
+		if (strncmp(name, poolname, poollen) != 0 ||
+		    (name[poollen] != '/' && name[poollen] != '#'))
+			return (SET_ERROR(EXDEV));
+	}
+
+	error = dsl_bookmark_destroy(innvl, outnvl);
+	return (error);
+}
+
+/*
+ * inputs:
+ * zc_name		name of dataset to destroy
+ * zc_objset_type	type of objset
+ * zc_defer_destroy	mark for deferred destroy
+ *
+ * outputs:		none
+ */
+static int
+zfs_ioc_destroy(zfs_cmd_t *zc)
+{
+	int err;
+
+	if (zc->zc_objset_type == DMU_OST_ZFS) {
+		err = zfs_unmount_snap(zc->zc_name);
+		if (err != 0)
+			return (err);
+	}
+
+	if (strchr(zc->zc_name, '@'))
+		err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy);
+	else
+		err = dsl_destroy_head(zc->zc_name);
+	if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0)
+#ifdef __FreeBSD__
+		zvol_remove_minors(zc->zc_name);
+#else
+		(void) zvol_remove_minor(zc->zc_name);
+#endif
+	return (err);
+}
+
+/*
+ * fsname is name of dataset to rollback (to most recent snapshot)
+ *
+ * innvl is not used.
+ *
+ * outnvl: "target" -> name of most recent snapshot
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl)
+{
+	zfsvfs_t *zfsvfs;
+	int error;
+
+	if (getzfsvfs(fsname, &zfsvfs) == 0) {
+		dsl_dataset_t *ds;
+
+		ds = dmu_objset_ds(zfsvfs->z_os);
+		error = zfs_suspend_fs(zfsvfs);
+		if (error == 0) {
+			int resume_err;
+
+			error = dsl_dataset_rollback(fsname, zfsvfs, outnvl);
+			resume_err = zfs_resume_fs(zfsvfs, ds);
+			error = error ? error : resume_err;
+		}
+#ifdef illumos
+		VFS_RELE(zfsvfs->z_vfs);
+#else
+		vfs_unbusy(zfsvfs->z_vfs);
+#endif
+	} else {
+		error = dsl_dataset_rollback(fsname, NULL, outnvl);
+	}
+	return (error);
+}
+
+static int
+recursive_unmount(const char *fsname, void *arg)
+{
+	const char *snapname = arg;
+	char fullname[ZFS_MAX_DATASET_NAME_LEN];
+
+	(void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname);
+	return (zfs_unmount_snap(fullname));
+}
+
+/*
+ * inputs:
+ * zc_name	old name of dataset
+ * zc_value	new name of dataset
+ * zc_cookie	recursive flag (only valid for snapshots)
+ *
+ * outputs:	none
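+ *
+ * For example (hypothetical names), a recursive snapshot rename would
+ * pass:
+ *	zc_name   = "tank/fs@old"
+ *	zc_value  = "tank/fs@new"
+ *	zc_cookie = 1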
+ */
+static int
 zfs_ioc_rename(zfs_cmd_t *zc)
 {
 	boolean_t recursive = zc->zc_cookie & 1;
+	char *at;
+	boolean_t allow_mounted = B_TRUE;
+
+#ifdef __FreeBSD__
+	allow_mounted = (zc->zc_cookie & 2) != 0;
+#endif
 
 	zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
 	    strchr(zc->zc_value, '%'))
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
-	/*
-	 * Unmount snapshot unless we're doing a recursive rename,
-	 * in which case the dataset code figures out which snapshots
-	 * to unmount.
-	 */
-	if (!recursive && strchr(zc->zc_name, '@') != NULL &&
-	    zc->zc_objset_type == DMU_OST_ZFS) {
-		int err = zfs_unmount_snap(zc->zc_name, NULL);
-		if (err)
-			return (err);
+	at = strchr(zc->zc_name, '@');
+	if (at != NULL) {
+		/* snaps must be in same fs */
+		int error;
+
+		if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1))
+			return (SET_ERROR(EXDEV));
+		*at = '\0';
+		if (zc->zc_objset_type == DMU_OST_ZFS && !allow_mounted) {
+			error = dmu_objset_find(zc->zc_name,
+			    recursive_unmount, at + 1,
+			    recursive ? DS_FIND_CHILDREN : 0);
+			if (error != 0) {
+				*at = '@';
+				return (error);
+			}
+		}
+		error = dsl_dataset_rename_snapshot(zc->zc_name,
+		    at + 1, strchr(zc->zc_value, '@') + 1, recursive);
+		*at = '@';
+
+		return (error);
+	} else {
+#ifdef illumos
+		if (zc->zc_objset_type == DMU_OST_ZVOL)
+			(void) zvol_remove_minor(zc->zc_name);
+#endif
+		return (dsl_dir_rename(zc->zc_name, zc->zc_value));
 	}
-	if (zc->zc_objset_type == DMU_OST_ZVOL)
-		(void) zvol_remove_minor(zc->zc_name);
-	return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive));
 }
 
 static int
@@ -3159,7 +3953,7 @@ zfs_check_settable(const char *dsname, n
 				perm = ZFS_DELEG_PERM_GROUPQUOTA;
 			} else {
 				/* USERUSED and GROUPUSED are read-only */
-				return (EINVAL);
+				return (SET_ERROR(EINVAL));
 			}
 
 			if (err = zfs_secpolicy_write_perms(dsname, perm, cr))
@@ -3167,11 +3961,11 @@ zfs_check_settable(const char *dsname, n
 			return (0);
 		}
 
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	if (issnap)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
 		/*
@@ -3194,19 +3988,32 @@ zfs_check_settable(const char *dsname, n
 		 * the SPA supports it. We ignore any errors here since
 		 * we'll catch them later.
 		 */
-		if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
-		    nvpair_value_uint64(pair, &intval) == 0) {
+		if (nvpair_value_uint64(pair, &intval) == 0) {
 			if (intval >= ZIO_COMPRESS_GZIP_1 &&
 			    intval <= ZIO_COMPRESS_GZIP_9 &&
 			    zfs_earlier_version(dsname,
 			    SPA_VERSION_GZIP_COMPRESSION)) {
-				return (ENOTSUP);
+				return (SET_ERROR(ENOTSUP));
 			}
 
 			if (intval == ZIO_COMPRESS_ZLE &&
 			    zfs_earlier_version(dsname,
 			    SPA_VERSION_ZLE_COMPRESSION))
-				return (ENOTSUP);
+				return (SET_ERROR(ENOTSUP));
+
+			if (intval == ZIO_COMPRESS_LZ4) {
+				spa_t *spa;
+
+				if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+					return (err);
+
+				if (!spa_feature_is_enabled(spa,
+				    SPA_FEATURE_LZ4_COMPRESS)) {
+					spa_close(spa, FTAG);
+					return (SET_ERROR(ENOTSUP));
+				}
+				spa_close(spa, FTAG);
+			}
 
 			/*
 			 * If this is a bootable dataset then
@@ -3217,24 +4024,45 @@ zfs_check_settable(const char *dsname, n
 			 */
 			if (zfs_is_bootfs(dsname) &&
 			    !BOOTFS_COMPRESS_VALID(intval)) {
-				return (ERANGE);
+				return (SET_ERROR(ERANGE));
 			}
 		}
 		break;
 
 	case ZFS_PROP_COPIES:
 		if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS))
-			return (ENOTSUP);
+			return (SET_ERROR(ENOTSUP));
 		break;
 
-	case ZFS_PROP_DEDUP:
-		if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
-			return (ENOTSUP);
+	case ZFS_PROP_RECORDSIZE:
+		/* Record sizes above 128k need the feature to be enabled */
+		if (nvpair_value_uint64(pair, &intval) == 0 &&
+		    intval > SPA_OLD_MAXBLOCKSIZE) {
+			spa_t *spa;
+
+			/*
+			 * We don't allow setting the property above 1MB,
+			 * unless the zfs_max_recordsize tunable has been
+			 * raised.
+			 */
+			if (intval > zfs_max_recordsize ||
+			    intval > SPA_MAXBLOCKSIZE)
+				return (SET_ERROR(ERANGE));
+
+			if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+				return (err);
+
+			if (!spa_feature_is_enabled(spa,
+			    SPA_FEATURE_LARGE_BLOCKS)) {
+				spa_close(spa, FTAG);
+				return (SET_ERROR(ENOTSUP));
+			}
+			spa_close(spa, FTAG);
+		}
 		break;
 
 	case ZFS_PROP_SHARESMB:
 		if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
-			return (ENOTSUP);
+			return (SET_ERROR(ENOTSUP));
 		break;
 
 	case ZFS_PROP_ACLINHERIT:
@@ -3243,15 +4071,104 @@ zfs_check_settable(const char *dsname, n
 			if (intval == ZFS_ACL_PASSTHROUGH_X &&
 			    zfs_earlier_version(dsname,
 			    SPA_VERSION_PASSTHROUGH_X))
-				return (ENOTSUP);
+				return (SET_ERROR(ENOTSUP));
+		}
+		break;
+
+	case ZFS_PROP_CHECKSUM:
+	case ZFS_PROP_DEDUP:
+	{
+		spa_feature_t feature;
+		spa_t *spa;
+
+		/* dedup feature version checks */
+		if (prop == ZFS_PROP_DEDUP &&
+		    zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
+			return (SET_ERROR(ENOTSUP));
+
+		if (nvpair_value_uint64(pair, &intval) != 0)
+			return (SET_ERROR(EINVAL));
+
+		/* check prop value is enabled in features */
+		feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK);
+		if (feature == SPA_FEATURE_NONE)
+			break;
+
+		if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+			return (err);
+		/*
+		 * Salted checksums are not supported on root pools.
+		 */
+		if (spa_bootfs(spa) != 0 &&
+		    intval < ZIO_CHECKSUM_FUNCTIONS &&
+		    (zio_checksum_table[intval].ci_flags &
+		    ZCHECKSUM_FLAG_SALTED)) {
+			spa_close(spa, FTAG);
+			return (SET_ERROR(ERANGE));
+		}
+		if (!spa_feature_is_enabled(spa, feature)) {
+			spa_close(spa, FTAG);
+			return (SET_ERROR(ENOTSUP));
 		}
+		spa_close(spa, FTAG);
 		break;
 	}
+	}
 
 	return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
 }
 
 /*
+ * Checks for a race condition to make sure we don't increment a feature flag
+ * multiple times.
+ */
+static int
+zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	spa_feature_t *featurep = arg;
+
+	if (!spa_feature_is_active(spa, *featurep))
+		return (0);
+	else
+		return (SET_ERROR(EBUSY));
+}
+
+/*
+ * The callback invoked on feature activation in the sync task caused by
+ * zfs_prop_activate_feature.
+ */
+static void
+zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	spa_feature_t *featurep = arg;
+
+	spa_feature_incr(spa, *featurep, tx);
+}
+
+/*
+ * Activates a feature on a pool in response to a property setting. This
+ * creates a new sync task which modifies the pool to reflect the feature
+ * as being active.
+ */
+static int
+zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature)
+{
+	int err;
+
+	/* EBUSY here indicates that the feature is already active */
+	err = dsl_sync_task(spa_name(spa),
+	    zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync,
+	    &feature, 2, ZFS_SPACE_CHECK_RESERVED);
+
+	if (err != 0 && err != EBUSY)
+		return (err);
+	else
+		return (0);
+}
+
+/*
  * Removes properties from the given props list that fail permission checks
  * needed to clear them and to restore them in case of a receive error. For each
  * property, make sure we have both set and inherit permissions.
@@ -3286,7 +4203,7 @@ zfs_check_clearable(char *dataset, nvlis
 
 		(void) strcpy(zc->zc_value, nvpair_name(pair));
 		if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 ||
-		    (err = zfs_secpolicy_inherit(zc, CRED())) != 0) {
+		    (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) {
 			VERIFY(nvlist_remove_nvpair(props, pair) == 0);
 			VERIFY(nvlist_add_int32(errors,
 			    zc->zc_value, err) == 0);
@@ -3379,6 +4296,56 @@ next:
 	}
 }
 
+/*
+ * Extract properties that cannot be set PRIOR to the receipt of a dataset.
+ * For example, refquota cannot be set until after the receipt of a dataset,
+ * because in replication streams, an older/earlier snapshot may exceed the
+ * refquota.  We want to receive the older/earlier snapshot, but setting
+ * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent
+ * the older/earlier snapshot from being received (with EDQUOT).
+ *
+ * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario.
+ *
+ * libzfs will need to be judicious in handling errors encountered by
+ * props extracted by this function.
+ */
+static nvlist_t *
+extract_delay_props(nvlist_t *props)
+{
+	nvlist_t *delayprops;
+	nvpair_t *nvp, *tmp;
+	static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 };
+	int i;
+
+	VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+	for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL;
+	    nvp = nvlist_next_nvpair(props, nvp)) {
+		/*
+		 * strcmp() is safe because zfs_prop_to_name() always returns
+		 * a bounded string.
+		 */
+		for (i = 0; delayable[i] != 0; i++) {
+			if (strcmp(zfs_prop_to_name(delayable[i]),
+			    nvpair_name(nvp)) == 0) {
+				break;
+			}
+		}
+		if (delayable[i] != 0) {
+			tmp = nvlist_prev_nvpair(props, nvp);
+			VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0);
+			VERIFY(nvlist_remove_nvpair(props, nvp) == 0);
+			nvp = tmp;
+		}
+	}
+
+	if (nvlist_empty(delayprops)) {
+		nvlist_free(delayprops);
+		delayprops = NULL;
+	}
+	return (delayprops);
+}
+
 #ifdef	DEBUG
 static boolean_t zfs_ioc_recv_inject_err;
 #endif
@@ -3392,17 +4359,20 @@ static boolean_t zfs_ioc_recv_inject_err
  * zc_cookie		file descriptor to recv from
  * zc_begin_record	the BEGIN record of the stream (not byteswapped)
  * zc_guid		force flag
+ * zc_cleanup_fd	cleanup-on-exit file descriptor
+ * zc_action_handle	handle for this guid/ds mapping (or zero on first call)
+ * zc_resumable		if data is incomplete assume sender will resume
  *
  * outputs:
  * zc_cookie		number of bytes read
  * zc_nvlist_dst{_size} error for each unapplied received property
  * zc_obj		zprop_errflags_t
+ * zc_action_handle	handle for this guid/ds mapping
  */
 static int
 zfs_ioc_recv(zfs_cmd_t *zc)
 {
 	file_t *fp;
-	objset_t *os;
 	dmu_recv_cookie_t drc;
 	boolean_t force = (boolean_t)zc->zc_guid;
 	int fd;
@@ -3412,15 +4382,19 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 	offset_t off;
 	nvlist_t *props = NULL; /* sent properties */
 	nvlist_t *origprops = NULL; /* existing properties */
-	objset_t *origin = NULL;
+	nvlist_t *delayprops = NULL; /* sent properties applied post-receive */
+	char *origin = NULL;
 	char *tosnap;
-	char tofs[ZFS_MAXNAMELEN];
+	char tofs[ZFS_MAX_DATASET_NAME_LEN];
+#ifdef __FreeBSD__
+	cap_rights_t rights;
+#endif
 	boolean_t first_recvd_props = B_FALSE;
 
 	if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
 	    strchr(zc->zc_value, '@') == NULL ||
 	    strchr(zc->zc_value, '%'))
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	(void) strcpy(tofs, zc->zc_value);
 	tosnap = strchr(tofs, '@');
@@ -3432,28 +4406,45 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 		return (error);
 
 	fd = zc->zc_cookie;
-	error = fd_getvnode(fd, &fp);
-	if (error != 0) {
+#ifdef __FreeBSD__
+	fget_read(curthread, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
+#else
+	fp = getf(fd);
+#endif
+	if (fp == NULL) {
 		nvlist_free(props);
-		return (error);
+		return (SET_ERROR(EBADF));
 	}
 
-	VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+	errors = fnvlist_alloc();
 
-	if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) {
-		if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) &&
-		    !dsl_prop_get_hasrecvd(os)) {
-			first_recvd_props = B_TRUE;
-		}
+	if (zc->zc_string[0])
+		origin = zc->zc_string;
 
-		/*
-		 * If new received properties are supplied, they are to
-		 * completely replace the existing received properties, so stash
-		 * away the existing ones.
-		 */
-		if (dsl_prop_get_received(os, &origprops) == 0) {
-			nvlist_t *errlist = NULL;
-			/*
+	error = dmu_recv_begin(tofs, tosnap,
+	    &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * Set properties before we receive the stream so that they are applied
+	 * to the new data. Note that we must call dmu_recv_stream() if
+	 * dmu_recv_begin() succeeds.
+	 */
+	if (props != NULL && !drc.drc_newfs) {
+		if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >=
+		    SPA_VERSION_RECVD_PROPS &&
+		    !dsl_prop_get_hasrecvd(tofs))
+			first_recvd_props = B_TRUE;
+
+		/*
+		 * If new received properties are supplied, they are to
+		 * completely replace the existing received properties, so stash
+		 * away the existing ones.
+		 */
+		if (dsl_prop_get_received(tofs, &origprops) == 0) {
+			nvlist_t *errlist = NULL;
+			/*
 			 * Don't bother writing a property if its value won't
 			 * change (and avoid the unnecessary security checks).
 			 *
@@ -3463,97 +4454,96 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 			 */
 			if (!first_recvd_props)
 				props_reduce(props, origprops);
-			if (zfs_check_clearable(tofs, origprops,
-			    &errlist) != 0)
+			if (zfs_check_clearable(tofs, origprops, &errlist) != 0)
 				(void) nvlist_merge(errors, errlist, 0);
 			nvlist_free(errlist);
-		}
-
-		dmu_objset_rele(os, FTAG);
-	}
-
-	if (zc->zc_string[0]) {
-		error = dmu_objset_hold(zc->zc_string, FTAG, &origin);
-		if (error)
-			goto out;
-	}
-
-	error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds,
-	    &zc->zc_begin_record, force, origin, &drc);
-	if (origin)
-		dmu_objset_rele(origin, FTAG);
-	if (error)
-		goto out;
-
-	/*
-	 * Set properties before we receive the stream so that they are applied
-	 * to the new data. Note that we must call dmu_recv_stream() if
-	 * dmu_recv_begin() succeeds.
-	 */
-	if (props) {
-		nvlist_t *errlist;
 
-		if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) {
-			if (drc.drc_newfs) {
-				if (spa_version(os->os_spa) >=
-				    SPA_VERSION_RECVD_PROPS)
-					first_recvd_props = B_TRUE;
-			} else if (origprops != NULL) {
-				if (clear_received_props(os, tofs, origprops,
-				    first_recvd_props ? NULL : props) != 0)
-					zc->zc_obj |= ZPROP_ERR_NOCLEAR;
-			} else {
+			if (clear_received_props(tofs, origprops,
+			    first_recvd_props ? NULL : props) != 0)
 				zc->zc_obj |= ZPROP_ERR_NOCLEAR;
-			}
-			dsl_prop_set_hasrecvd(os);
-		} else if (!drc.drc_newfs) {
+		} else {
 			zc->zc_obj |= ZPROP_ERR_NOCLEAR;
 		}
-
-		(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
-		    props, &errlist);
-		(void) nvlist_merge(errors, errlist, 0);
-		nvlist_free(errlist);
 	}
 
-	if (fit_error_list(zc, &errors) != 0 || put_nvlist(zc, errors) != 0) {
-		/*
-		 * Caller made zc->zc_nvlist_dst less than the minimum expected
-		 * size or supplied an invalid address.
-		 */
-		props_error = EINVAL;
+	if (props != NULL) {
+		props_error = dsl_prop_set_hasrecvd(tofs);
+
+		if (props_error == 0) {
+			delayprops = extract_delay_props(props);
+			(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+			    props, errors);
+		}
 	}
 
 	off = fp->f_offset;
-	error = dmu_recv_stream(&drc, fp->f_data, &off);
+	error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd,
+	    &zc->zc_action_handle);
 
 	if (error == 0) {
 		zfsvfs_t *zfsvfs = NULL;
 
 		if (getzfsvfs(tofs, &zfsvfs) == 0) {
 			/* online recv */
+			dsl_dataset_t *ds;
 			int end_err;
 
+			ds = dmu_objset_ds(zfsvfs->z_os);
 			error = zfs_suspend_fs(zfsvfs);
 			/*
 			 * If the suspend fails, then the recv_end will
 			 * likely also fail, and clean up after itself.
 			 */
-			end_err = dmu_recv_end(&drc);
-			if (error == 0) {
-				int resume_err =
-				    zfs_resume_fs(zfsvfs, tofs);
-				error = error ? error : resume_err;
-			}
+			end_err = dmu_recv_end(&drc, zfsvfs);
+			if (error == 0)
+				error = zfs_resume_fs(zfsvfs, ds);
 			error = error ? error : end_err;
+#ifdef illumos
 			VFS_RELE(zfsvfs->z_vfs);
+#else
+			vfs_unbusy(zfsvfs->z_vfs);
+#endif
 		} else {
-			error = dmu_recv_end(&drc);
+			error = dmu_recv_end(&drc, NULL);
+		}
+
+		/* Set delayed properties now, after we're done receiving. */
+		if (delayprops != NULL && error == 0) {
+			(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+			    delayprops, errors);
 		}
 	}
 
+	if (delayprops != NULL) {
+		/*
+		 * Merge delayed props back in with initial props, in case
+		 * this is a DEBUG kernel and zfs_ioc_recv_inject_err is set
+		 * (which means we have to make sure clear_received_props()
+		 * includes the delayed properties).
+		 *
+		 * Since zfs_ioc_recv_inject_err only exists in DEBUG kernels,
+		 * using ASSERT() here behaves just like VERIFY.
+		 */
+		ASSERT(nvlist_merge(props, delayprops, 0) == 0);
+		nvlist_free(delayprops);
+	}
+
+	/*
+	 * Now that all props, initial and delayed, are set, report the prop
+	 * errors to the caller.
+	 */
+	if (zc->zc_nvlist_dst_size != 0 &&
+	    (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
+	    put_nvlist(zc, errors) != 0)) {
+		/*
+		 * Caller made zc->zc_nvlist_dst less than the minimum expected
+		 * size or supplied an invalid address.
+		 */
+		props_error = SET_ERROR(EINVAL);
+	}
+
 	zc->zc_cookie = off - fp->f_offset;
-	if (VOP_SEEK(fp->f_data, fp->f_offset, &off, NULL) == 0)
+	if (off >= 0 && off <= MAXOFFSET_T)
 		fp->f_offset = off;
 
 #ifdef	DEBUG
@@ -3562,25 +4552,25 @@ zfs_ioc_recv(zfs_cmd_t *zc)
 		error = 1;
 	}
 #endif
+
+#ifdef __FreeBSD__
+	if (error == 0)
+		zvol_create_minors(tofs);
+#endif
+
 	/*
 	 * On error, restore the original props.
 	 */
-	if (error && props) {
-		if (dmu_objset_hold(tofs, FTAG, &os) == 0) {
-			if (clear_received_props(os, tofs, props, NULL) != 0) {
-				/*
-				 * We failed to clear the received properties.
-				 * Since we may have left a $recvd value on the
-				 * system, we can't clear the $hasrecvd flag.
-				 */
-				zc->zc_obj |= ZPROP_ERR_NORESTORE;
-			} else if (first_recvd_props) {
-				dsl_prop_unset_hasrecvd(os);
-			}
-			dmu_objset_rele(os, FTAG);
-		} else if (!drc.drc_newfs) {
-			/* We failed to clear the received properties. */
+	if (error != 0 && props != NULL && !drc.drc_newfs) {
+		if (clear_received_props(tofs, props, NULL) != 0) {
+			/*
+			 * We failed to clear the received properties.
+			 * Since we may have left a $recvd value on the
+			 * system, we can't clear the $hasrecvd flag.
+			 */
 			zc->zc_obj |= ZPROP_ERR_NORESTORE;
+		} else if (first_recvd_props) {
+			dsl_prop_unset_hasrecvd(tofs);
 		}
 
 		if (origprops == NULL && !drc.drc_newfs) {
@@ -3609,7 +4599,7 @@ out:
 	nvlist_free(props);
 	nvlist_free(origprops);
 	nvlist_free(errors);
-	fd_putfile(fd);
+	releasef(fd);
 
 	if (error == 0)
 		error = props_error;
@@ -3620,60 +4610,157 @@ out:
 /*
  * inputs:
  * zc_name	name of snapshot to send
- * zc_value	short name of incremental fromsnap (may be empty)
  * zc_cookie	file descriptor to send stream to
- * zc_obj	fromorigin flag (mutually exclusive with zc_value)
+ * zc_obj	fromorigin flag (mutually exclusive with zc_fromobj)
+ * zc_sendobj	objsetid of snapshot to send
+ * zc_fromobj	objsetid of incremental fromsnap (may be zero)
+ * zc_guid	if set, estimate size of stream only.  zc_cookie is ignored.
+ *		output size in zc_objset_type.
+ * zc_flags	lzc_send_flags
  *
- * outputs: none
+ * outputs:
+ * zc_objset_type	estimated size, if zc_guid is set
  */
 static int
 zfs_ioc_send(zfs_cmd_t *zc)
 {
-	objset_t *fromsnap = NULL;
-	objset_t *tosnap;
-	file_t *fp;
 	int error;
 	offset_t off;
+	boolean_t estimate = (zc->zc_guid != 0);
+	boolean_t embedok = (zc->zc_flags & 0x1);
+	boolean_t large_block_ok = (zc->zc_flags & 0x2);
+
+	if (zc->zc_obj != 0) {
+		dsl_pool_t *dp;
+		dsl_dataset_t *tosnap;
 
-	error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap);
-	if (error)
-		return (error);
+		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+		if (error != 0)
+			return (error);
+
+		error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
+		if (error != 0) {
+			dsl_pool_rele(dp, FTAG);
+			return (error);
+		}
+
+		if (dsl_dir_is_clone(tosnap->ds_dir))
+			zc->zc_fromobj =
+			    dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
+		dsl_dataset_rele(tosnap, FTAG);
+		dsl_pool_rele(dp, FTAG);
+	}
+
+	if (estimate) {
+		dsl_pool_t *dp;
+		dsl_dataset_t *tosnap;
+		dsl_dataset_t *fromsnap = NULL;
+
+		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+		if (error != 0)
+			return (error);
 
-	if (zc->zc_value[0] != '\0') {
-		char *buf;
-		char *cp;
-
-		buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-		(void) strncpy(buf, zc->zc_name, MAXPATHLEN);
-		cp = strchr(buf, '@');
-		if (cp)
-			*(cp+1) = 0;
-		(void) strncat(buf, zc->zc_value, MAXPATHLEN);
-		error = dmu_objset_hold(buf, FTAG, &fromsnap);
-		kmem_free(buf, MAXPATHLEN);
-		if (error) {
-			dmu_objset_rele(tosnap, FTAG);
+		error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
+		if (error != 0) {
+			dsl_pool_rele(dp, FTAG);
 			return (error);
 		}
+
+		if (zc->zc_fromobj != 0) {
+			error = dsl_dataset_hold_obj(dp, zc->zc_fromobj,
+			    FTAG, &fromsnap);
+			if (error != 0) {
+				dsl_dataset_rele(tosnap, FTAG);
+				dsl_pool_rele(dp, FTAG);
+				return (error);
+			}
+		}
+
+		error = dmu_send_estimate(tosnap, fromsnap,
+		    &zc->zc_objset_type);
+
+		if (fromsnap != NULL)
+			dsl_dataset_rele(fromsnap, FTAG);
+		dsl_dataset_rele(tosnap, FTAG);
+		dsl_pool_rele(dp, FTAG);
+	} else {
+		file_t *fp;
+#ifdef __FreeBSD__
+		cap_rights_t rights;
+
+		fget_write(curthread, zc->zc_cookie,
+		    cap_rights_init(&rights, CAP_WRITE), &fp);
+#else
+		fp = getf(zc->zc_cookie);
+#endif
+		if (fp == NULL)
+			return (SET_ERROR(EBADF));
+
+		off = fp->f_offset;
+		error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
+		    zc->zc_fromobj, embedok, large_block_ok,
+#ifdef illumos
+		    zc->zc_cookie, fp->f_vnode, &off);
+#else
+		    zc->zc_cookie, fp, &off);
+#endif
+
+		if (off >= 0 && off <= MAXOFFSET_T)
+			fp->f_offset = off;
+		releasef(zc->zc_cookie);
 	}
+	return (error);
+}
+
+/*
+ * inputs:
+ * zc_name	name of snapshot on which to report progress
+ * zc_cookie	file descriptor of send stream
+ *
+ * outputs:
+ * zc_cookie	number of bytes written in send stream thus far
+ */
+static int
+zfs_ioc_send_progress(zfs_cmd_t *zc)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
+	dmu_sendarg_t *dsp = NULL;
+	int error;
+
+	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+	if (error != 0)
+		return (error);
 
-	error = fd_getvnode(zc->zc_cookie, &fp);
+	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
 	if (error != 0) {
-		dmu_objset_rele(tosnap, FTAG);
-		if (fromsnap)
-			dmu_objset_rele(fromsnap, FTAG);
+		dsl_pool_rele(dp, FTAG);
 		return (error);
 	}
 
-	off = fp->f_offset;
-	error = dmu_sendbackup(tosnap, fromsnap, zc->zc_obj, fp->f_data, &off);
+	mutex_enter(&ds->ds_sendstream_lock);
 
-	if (VOP_SEEK(fp->f_data, fp->f_offset, &off, NULL) == 0)
-		fp->f_offset = off;
-	fd_putfile(zc->zc_cookie);
-	if (fromsnap)
-		dmu_objset_rele(fromsnap, FTAG);
-	dmu_objset_rele(tosnap, FTAG);
+	/*
+	 * Iterate over all the send streams currently active on this dataset.
+	 * If there's one which matches the specified file descriptor _and_ the
+	 * stream was started by the current process, return the progress of
+	 * that stream.
+	 */
+	for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL;
+	    dsp = list_next(&ds->ds_sendstreams, dsp)) {
+		if (dsp->dsa_outfd == zc->zc_cookie &&
+		    dsp->dsa_proc == curproc)
+			break;
+	}
+
+	if (dsp != NULL)
+		zc->zc_cookie = *(dsp->dsa_off);
+	else
+		error = SET_ERROR(ENOENT);
+
+	mutex_exit(&ds->ds_sendstream_lock);
+	dsl_dataset_rele(ds, FTAG);
+	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
 
@@ -3747,7 +4834,7 @@ zfs_ioc_clear(zfs_cmd_t *zc)
 	spa = spa_lookup(zc->zc_name);
 	if (spa == NULL) {
 		mutex_exit(&spa_namespace_lock);
-		return (EIO);
+		return (SET_ERROR(EIO));
 	}
 	if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
 		/* we need to let spa_open/spa_load clear the chains */
@@ -3763,21 +4850,24 @@ zfs_ioc_clear(zfs_cmd_t *zc)
 		nvlist_t *config = NULL;
 
 		if (zc->zc_nvlist_src == 0)
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 
 		if ((error = get_nvlist(zc->zc_nvlist_src,
 		    zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
 			error = spa_open_rewind(zc->zc_name, &spa, FTAG,
 			    policy, &config);
 			if (config != NULL) {
-				(void) put_nvlist(zc, config);
+				int err;
+
+				if ((err = put_nvlist(zc, config)) != 0)
+					error = err;
 				nvlist_free(config);
 			}
 			nvlist_free(policy);
 		}
 	}
 
-	if (error)
+	if (error != 0)
 		return (error);
 
 	spa_vdev_state_enter(spa, SCL_NONE);
@@ -3789,7 +4879,7 @@ zfs_ioc_clear(zfs_cmd_t *zc)
 		if (vd == NULL) {
 			(void) spa_vdev_state_exit(spa, NULL, ENODEV);
 			spa_close(spa, FTAG);
-			return (ENODEV);
+			return (SET_ERROR(ENODEV));
 		}
 	}
 
@@ -3801,13 +4891,39 @@ zfs_ioc_clear(zfs_cmd_t *zc)
 	 * Resume any suspended I/Os.
 	 */
 	if (zio_resume(spa) != 0)
-		error = EIO;
+		error = SET_ERROR(EIO);
 
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
+static int
+zfs_ioc_pool_reopen(zfs_cmd_t *zc)
+{
+	spa_t *spa;
+	int error;
+
+	error = spa_open(zc->zc_name, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	spa_vdev_state_enter(spa, SCL_NONE);
+
+	/*
+	 * If a resilver is already in progress then set the
+	 * spa_scrub_reopen flag to B_TRUE so that we don't restart
+	 * the scan as a side effect of the reopen. Otherwise, let
+	 * vdev_open() decide if a resilver is required.
+	 */
+	spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool);
+	vdev_reopen(spa->spa_root_vdev);
+	spa->spa_scrub_reopen = B_FALSE;
+
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+	spa_close(spa, FTAG);
+	return (0);
+}
+
 /*
  * inputs:
  * zc_name	name of filesystem
@@ -3829,7 +4945,7 @@ zfs_ioc_promote(zfs_cmd_t *zc)
 	if (cp)
 		*cp = '\0';
 	(void) dmu_objset_find(zc->zc_value,
-	    zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS);
+	    zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS);
 	return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
 }
 
@@ -3852,10 +4968,10 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc)
 	int error;
 
 	if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
-	error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs);
-	if (error)
+	error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
+	if (error != 0)
 		return (error);
 
 	error = zfs_userspace_one(zfsvfs,
@@ -3880,22 +4996,24 @@ static int
 zfs_ioc_userspace_many(zfs_cmd_t *zc)
 {
 	zfsvfs_t *zfsvfs;
-	int error;
+	int bufsize = zc->zc_nvlist_dst_size;
+
+	if (bufsize <= 0)
+		return (SET_ERROR(ENOMEM));
 
-	error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs);
-	if (error)
+	int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
+	if (error != 0)
 		return (error);
 
-	int bufsize = zc->zc_nvlist_dst_size;
 	void *buf = kmem_alloc(bufsize, KM_SLEEP);
 
 	error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
 	    buf, &zc->zc_nvlist_dst_size);
 
 	if (error == 0) {
-		error = xcopyout(buf,
+		error = ddi_copyout(buf,
 		    (void *)(uintptr_t)zc->zc_nvlist_dst,
-		    zc->zc_nvlist_dst_size);
+		    zc->zc_nvlist_dst_size, zc->zc_iflags);
 	}
 	kmem_free(buf, bufsize);
 	zfsvfs_rele(zfsvfs, FTAG);
@@ -3924,17 +5042,27 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
 			 * objset needs to be closed & reopened (to grow the
 			 * objset_phys_t).  Suspend/resume the fs will do that.
 			 */
+			dsl_dataset_t *ds;
+
+			ds = dmu_objset_ds(zfsvfs->z_os);
 			error = zfs_suspend_fs(zfsvfs);
-			if (error == 0)
-				error = zfs_resume_fs(zfsvfs, zc->zc_name);
+			if (error == 0) {
+				dmu_objset_refresh_ownership(zfsvfs->z_os,
+				    zfsvfs);
+				error = zfs_resume_fs(zfsvfs, ds);
+			}
 		}
 		if (error == 0)
 			error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
+#ifdef illumos
 		VFS_RELE(zfsvfs->z_vfs);
+#else
+		vfs_unbusy(zfsvfs->z_vfs);
+#endif
 	} else {
 		/* XXX kind of reading contents without owning */
 		error = dmu_objset_hold(zc->zc_name, FTAG, &os);
-		if (error)
+		if (error != 0)
 			return (error);
 
 		error = dmu_objset_userspace_upgrade(os);
@@ -3944,6 +5072,7 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
 	return (error);
 }
 
+#ifdef illumos
 /*
  * We don't want to have a hard dependency
  * against some special symbols in sharefs
@@ -3951,17 +5080,6 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
  * the first file system is shared.
  * Neither sharefs, nfs or smbsrv are unloadable modules.
  */
-#ifdef __NetBSD__
-
-static int
-zfs_ioc_share(zfs_cmd_t *zc)
-{
-
-	return EOPNOTSUPP;
-}
-
-#else	/* __NetBSD__ */
-
 int (*znfsexport_fs)(void *arg);
 int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t);
 int (*zsmbexport_fs)(void *arg, boolean_t add_share);
@@ -3972,8 +5090,10 @@ int zfs_smbshare_inited;
 ddi_modhandle_t nfs_mod;
 ddi_modhandle_t sharefs_mod;
 ddi_modhandle_t smbsrv_mod;
+#endif	/* illumos */
 kmutex_t zfs_share_lock;
 
+#ifdef illumos
 static int
 zfs_init_sharefs()
 {
@@ -3984,19 +5104,21 @@ zfs_init_sharefs()
 	if (sharefs_mod == NULL && ((sharefs_mod =
 	    ddi_modopen("fs/sharefs",
 	    KRTLD_MODE_FIRST, &error)) == NULL)) {
-		return (ENOSYS);
+		return (SET_ERROR(ENOSYS));
 	}
 	if (zshare_fs == NULL && ((zshare_fs =
 	    (int (*)(enum sharefs_sys_op, share_t *, uint32_t))
 	    ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) {
-		return (ENOSYS);
+		return (SET_ERROR(ENOSYS));
 	}
 	return (0);
 }
+#endif	/* illumos */
 
 static int
 zfs_ioc_share(zfs_cmd_t *zc)
 {
+#ifdef illumos
 	int error;
 	int opcode;
 
@@ -4008,19 +5130,19 @@ zfs_ioc_share(zfs_cmd_t *zc)
 			if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs",
 			    KRTLD_MODE_FIRST, &error)) == NULL)) {
 				mutex_exit(&zfs_share_lock);
-				return (ENOSYS);
+				return (SET_ERROR(ENOSYS));
 			}
 			if (znfsexport_fs == NULL &&
 			    ((znfsexport_fs = (int (*)(void *))
 			    ddi_modsym(nfs_mod,
 			    "nfs_export", &error)) == NULL)) {
 				mutex_exit(&zfs_share_lock);
-				return (ENOSYS);
+				return (SET_ERROR(ENOSYS));
 			}
 			error = zfs_init_sharefs();
-			if (error) {
+			if (error != 0) {
 				mutex_exit(&zfs_share_lock);
-				return (ENOSYS);
+				return (SET_ERROR(ENOSYS));
 			}
 			zfs_nfsshare_inited = 1;
 			mutex_exit(&zfs_share_lock);
@@ -4034,25 +5156,25 @@ zfs_ioc_share(zfs_cmd_t *zc)
 			    ddi_modopen("drv/smbsrv",
 			    KRTLD_MODE_FIRST, &error)) == NULL)) {
 				mutex_exit(&zfs_share_lock);
-				return (ENOSYS);
+				return (SET_ERROR(ENOSYS));
 			}
 			if (zsmbexport_fs == NULL && ((zsmbexport_fs =
 			    (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod,
 			    "smb_server_share", &error)) == NULL)) {
 				mutex_exit(&zfs_share_lock);
-				return (ENOSYS);
+				return (SET_ERROR(ENOSYS));
 			}
 			error = zfs_init_sharefs();
-			if (error) {
+			if (error != 0) {
 				mutex_exit(&zfs_share_lock);
-				return (ENOSYS);
+				return (SET_ERROR(ENOSYS));
 			}
 			zfs_smbshare_inited = 1;
 			mutex_exit(&zfs_share_lock);
 		}
 		break;
 	default:
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	switch (zc->zc_share.z_sharetype) {
@@ -4087,6 +5209,9 @@ zfs_ioc_share(zfs_cmd_t *zc)
 
 	return (error);
 
+#else	/* !illumos */
+	return (ENOSYS);
+#endif	/* illumos */
 }
 
 ace_t full_access[] = {
@@ -4094,6 +5219,109 @@ ace_t full_access[] = {
 };
 
 /*
+ * inputs:
+ * zc_name		name of containing filesystem
+ * zc_obj		object # beyond which we want next in-use object #
+ *
+ * outputs:
+ * zc_obj		next in-use object #
+ */
+static int
+zfs_ioc_next_obj(zfs_cmd_t *zc)
+{
+	objset_t *os = NULL;
+	int error;
+
+	error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+	if (error != 0)
+		return (error);
+
+	error = dmu_object_next(os, &zc->zc_obj, B_FALSE,
+	    dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg);
+
+	dmu_objset_rele(os, FTAG);
+	return (error);
+}
+
+/*
+ * inputs:
+ * zc_name		name of filesystem
+ * zc_value		prefix name for snapshot
+ * zc_cleanup_fd	cleanup-on-exit file descriptor for calling process
+ *
+ * outputs:
+ * zc_value		short name of new snapshot
+ */
+static int
+zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
+{
+	char *snap_name;
+	char *hold_name;
+	int error;
+	minor_t minor;
+
+	error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
+	if (error != 0)
+		return (error);
+
+	snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
+	    (u_longlong_t)ddi_get_lbolt64());
+	hold_name = kmem_asprintf("%%%s", zc->zc_value);
+
+	error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor,
+	    hold_name);
+	if (error == 0)
+		(void) strcpy(zc->zc_value, snap_name);
+	strfree(snap_name);
+	strfree(hold_name);
+	zfs_onexit_fd_rele(zc->zc_cleanup_fd);
+	return (error);
+}
+
+/*
+ * inputs:
+ * zc_name		name of "to" snapshot
+ * zc_value		name of "from" snapshot
+ * zc_cookie		file descriptor to write diff data on
+ *
+ * outputs:
+ * dmu_diff_record_t's to the file descriptor
+ */
+static int
+zfs_ioc_diff(zfs_cmd_t *zc)
+{
+	file_t *fp;
+	offset_t off;
+	int error;
+
+#ifdef __FreeBSD__
+	cap_rights_t rights;
+
+	fget_write(curthread, zc->zc_cookie,
+	    cap_rights_init(&rights, CAP_WRITE), &fp);
+#else
+	fp = getf(zc->zc_cookie);
+#endif
+	if (fp == NULL)
+		return (SET_ERROR(EBADF));
+
+	off = fp->f_offset;
+
+#ifdef __FreeBSD__
+	error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off);
+#else
+	error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off);
+#endif
+
+	if (off >= 0 && off <= MAXOFFSET_T)
+		fp->f_offset = off;
+	releasef(zc->zc_cookie);
+
+	return (error);
+}
+
+#ifdef illumos
+/*
  * Remove all ACL files in shares dir
  */
 static int
@@ -4114,10 +5342,12 @@ zfs_smb_acl_purge(znode_t *dzp)
 	zap_cursor_fini(&zc);
 	return (error);
 }
+#endif	/* illumos */
 
 static int
 zfs_ioc_smb_acl(zfs_cmd_t *zc)
 {
+#ifdef illumos
 	vnode_t *vp;
 	znode_t *dzp;
 	vnode_t *resourcevp = NULL;
@@ -4130,18 +5360,18 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
 	int error = 0;
 
 	if ((error = lookupname(zc->zc_value, UIO_SYSSPACE,
-		    NULL, &vp)) != 0)
+	    NO_FOLLOW, NULL, &vp)) != 0)
 		return (error);
 
 	/* Now make sure mntpnt and dataset are ZFS */
-#ifndef __NetBSD__
-	if (vp->v_vfsp->vfs_fstype != zfsfstype ||
+
+	if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 ||
 	    (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource),
 	    zc->zc_name) != 0)) {
 		VN_RELE(vp);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
-#endif
+
 	dzp = VTOZ(vp);
 	zfsvfs = dzp->z_zfsvfs;
 	ZFS_ENTER(zfsvfs);
@@ -4158,13 +5388,13 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
 		    ZFS_SHARES_DIR);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
+		if (error != 0) {
 			dmu_tx_abort(tx);
 		} else {
 			error = zfs_create_share_dir(zfsvfs, tx);
 			dmu_tx_commit(tx);
 		}
-		if (error) {
+		if (error != 0) {
 			mutex_exit(&zfsvfs->z_lock);
 			VN_RELE(vp);
 			ZFS_EXIT(zfsvfs);
@@ -4208,6 +5438,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
 		if ((error = get_nvlist(zc->zc_nvlist_src,
 		    zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) {
 			VN_RELE(vp);
+			VN_RELE(ZTOV(sharedir));
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
@@ -4230,7 +5461,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
 		break;
 
 	default:
-		error = EINVAL;
+		error = SET_ERROR(EINVAL);
 		break;
 	}
 
@@ -4240,457 +5471,1187 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
+#else	/* !illumos */
+	return (EOPNOTSUPP);
+#endif	/* illumos */
 }
-#endif	/* __NetBSD__ */
 
 /*
- * inputs:
- * zc_name	name of filesystem
- * zc_value	short name of snap
- * zc_string	user-supplied tag for this reference
- * zc_cookie	recursive flag
- * zc_temphold	set if hold is temporary
+ * innvl: {
+ *     "holds" -> { snapname -> holdname (string), ... }
+ *     (optional) "cleanup_fd" -> fd (int32)
+ * }
  *
- * outputs:		none
+ * outnvl: {
+ *     snapname -> error value (int32)
+ *     ...
+ * }
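+ *
+ * For example (a sketch; the names and fd are hypothetical), one tagged
+ * hold whose lifetime is tied to a cleanup fd:
+ *
+ *     innvl = {
+ *         "holds" -> { "tank/fs@snap" -> "backup-job" },
+ *         "cleanup_fd" -> 7
+ *     }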
  */
+/* ARGSUSED */
 static int
-zfs_ioc_hold(zfs_cmd_t *zc)
+zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
 {
-	boolean_t recursive = zc->zc_cookie;
+	nvpair_t *pair;
+	nvlist_t *holds;
+	int cleanup_fd = -1;
+	int error;
+	minor_t minor = 0;
 
-	if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
-		return (EINVAL);
+	error = nvlist_lookup_nvlist(args, "holds", &holds);
+	if (error != 0)
+		return (SET_ERROR(EINVAL));
 
-	return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value,
-	    zc->zc_string, recursive, zc->zc_temphold));
-}
+	/* make sure the user didn't pass us any invalid (empty) tags */
+	for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+	    pair = nvlist_next_nvpair(holds, pair)) {
+		char *htag;
 
-/*
- * inputs:
- * zc_name	name of dataset from which we're releasing a user reference
- * zc_value	short name of snap
- * zc_string	user-supplied tag for this reference
- * zc_cookie	recursive flag
- *
- * outputs:		none
+		error = nvpair_value_string(pair, &htag);
+		if (error != 0)
+			return (SET_ERROR(error));
+
+		if (strlen(htag) == 0)
+			return (SET_ERROR(EINVAL));
+	}
+
+	if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
+		error = zfs_onexit_fd_hold(cleanup_fd, &minor);
+		if (error != 0)
+			return (error);
+	}
+
+	error = dsl_dataset_user_hold(holds, minor, errlist);
+	if (minor != 0)
+		zfs_onexit_fd_rele(cleanup_fd);
+	return (error);
+}
+
+/*
+ * innvl is not used.
+ *
+ * outnvl: {
+ *    holdname -> time added (uint64 seconds since epoch)
+ *    ...
+ * }
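+ *
+ * For example (hypothetical tag and timestamp):
+ *
+ *     outnvl = { "backup-job" -> 1507661695 }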
  */
+/* ARGSUSED */
 static int
-zfs_ioc_release(zfs_cmd_t *zc)
+zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
 {
-	boolean_t recursive = zc->zc_cookie;
-
-	if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0)
-		return (EINVAL);
+	return (dsl_dataset_get_holds(snapname, outnvl));
+}
 
-	return (dsl_dataset_user_release(zc->zc_name, zc->zc_value,
-	    zc->zc_string, recursive));
+/*
+ * innvl: {
+ *     snapname -> { holdname, ... }
+ *     ...
+ * }
+ *
+ * outnvl: {
+ *     snapname -> error value (int32)
+ *     ...
+ * }
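+ *
+ * For example (hypothetical names), releasing the hold placed by
+ * zfs_ioc_hold() above:
+ *
+ *     innvl = { "tank/fs@snap" -> { "backup-job" } }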
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
+{
+	return (dsl_dataset_user_release(holds, errlist));
 }
 
 /*
  * inputs:
- * zc_name		name of filesystem
+ * zc_name		name of new filesystem or snapshot
+ * zc_value		full name of old snapshot
  *
  * outputs:
- * zc_nvlist_src{_size}	nvlist of snapshot holds
+ * zc_cookie		space in bytes
+ * zc_objset_type	compressed space in bytes
+ * zc_perm_action	uncompressed space in bytes
  */
 static int
-zfs_ioc_get_holds(zfs_cmd_t *zc)
+zfs_ioc_space_written(zfs_cmd_t *zc)
 {
-	nvlist_t *nvp;
 	int error;
+	dsl_pool_t *dp;
+	dsl_dataset_t *new, *old;
 
-	if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) {
-		error = put_nvlist(zc, nvp);
-		nvlist_free(nvp);
+	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+	if (error != 0)
+		return (error);
+	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new);
+	if (error != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (error);
+	}
+	error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old);
+	if (error != 0) {
+		dsl_dataset_rele(new, FTAG);
+		dsl_pool_rele(dp, FTAG);
+		return (error);
 	}
 
+	error = dsl_dataset_space_written(old, new, &zc->zc_cookie,
+	    &zc->zc_objset_type, &zc->zc_perm_action);
+	dsl_dataset_rele(old, FTAG);
+	dsl_dataset_rele(new, FTAG);
+	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
 
 /*
- * pool create, destroy, and export don't log the history as part of
- * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export
- * do the logging of those commands.
- */
-static zfs_ioc_vec_t zfs_ioc_vec[] = {
-	{ zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_pool_destroy,	zfs_secpolicy_config, POOL_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-	    B_FALSE },
-	{ zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_pool_configs,	zfs_secpolicy_none, NO_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-	    B_TRUE },
-	{ zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_pool_upgrade,	zfs_secpolicy_config, POOL_NAME, B_TRUE,
-	    B_TRUE },
-	{ zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-	    B_TRUE },
-	{ zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-	    B_TRUE },
-	{ zfs_ioc_vdev_set_state, zfs_secpolicy_config,	POOL_NAME, B_TRUE,
-	    B_FALSE },
-	{ zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-	    B_TRUE },
-	{ zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-	    B_TRUE },
-	{ zfs_ioc_vdev_setpath,	zfs_secpolicy_config, POOL_NAME, B_FALSE,
-	    B_TRUE },
-	{ zfs_ioc_vdev_setfru,	zfs_secpolicy_config, POOL_NAME, B_FALSE,
-	    B_TRUE },
-	{ zfs_ioc_objset_stats,	zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-	    B_TRUE },
-	{ zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-	    B_TRUE },
-	{ zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-	    B_TRUE },
-	{ zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE },
-	{ zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE },
-	{ zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
-	    B_TRUE},
-	{ zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE,
-	    B_TRUE },
-	{ zfs_ioc_rename, zfs_secpolicy_rename,	DATASET_NAME, B_TRUE, B_TRUE },
-	{ zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, B_TRUE },
-	{ zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, B_FALSE },
-	{ zfs_ioc_inject_fault,	zfs_secpolicy_inject, NO_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE },
-	{ zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE,
-	    B_TRUE },
-	{ zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME,
-	    B_TRUE, B_TRUE },
-	{ zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE,
-	    B_TRUE },
-	{ zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_obj_to_path, zfs_secpolicy_config, DATASET_NAME, B_FALSE,
-	    B_TRUE },
-	{ zfs_ioc_pool_set_props, zfs_secpolicy_config,	POOL_NAME, B_TRUE,
-	    B_TRUE },
-	{ zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE,
-	    B_TRUE },
-	{ zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, DATASET_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE },
-	{ zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE,
-	    B_TRUE },
-	/*{ zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE,
-	  B_FALSE },*/
-	{ zfs_ioc_userspace_one, zfs_secpolicy_userspace_one,
-	    DATASET_NAME, B_FALSE, B_FALSE },
-	{ zfs_ioc_userspace_many, zfs_secpolicy_userspace_many,
-	    DATASET_NAME, B_FALSE, B_FALSE },
-	{ zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
-	    DATASET_NAME, B_FALSE, B_TRUE },
-	{ zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE },
-	{ zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE,
-	    B_TRUE },
-	{ zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-	    B_TRUE },
-	{ zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE,
-	    B_TRUE }
-};
-
-int
-pool_status_check(const char *name, zfs_ioc_namecheck_t type)
+ * innvl: {
+ *     "firstsnap" -> snapshot name
+ * }
+ *
+ * outnvl: {
+ *     "used" -> space in bytes
+ *     "compressed" -> compressed space in bytes
+ *     "uncompressed" -> uncompressed space in bytes
+ * }
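+ *
+ * For example (hypothetical names), with lastsnap "tank/fs@c":
+ *
+ *     innvl  = { "firstsnap" -> "tank/fs@a" }
+ *     outnvl = { "used" -> ..., "compressed" -> ..., "uncompressed" -> ... }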
+ */
+static int
+zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
 {
-	spa_t *spa;
 	int error;
+	dsl_pool_t *dp;
+	dsl_dataset_t *new, *old;
+	char *firstsnap;
+	uint64_t used, comp, uncomp;
 
-	ASSERT(type == POOL_NAME || type == DATASET_NAME);
+	if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0)
+		return (SET_ERROR(EINVAL));
 
-	error = spa_open(name, &spa, FTAG);
-	if (error == 0) {
-		if (spa_suspended(spa))
-			error = EAGAIN;
-		spa_close(spa, FTAG);
+	error = dsl_pool_hold(lastsnap, FTAG, &dp);
+	if (error != 0)
+		return (error);
+
+	error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
+	if (error == 0 && !new->ds_is_snapshot) {
+		dsl_dataset_rele(new, FTAG);
+		error = SET_ERROR(EINVAL);
+	}
+	if (error != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (error);
+	}
+	error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
+	if (error == 0 && !old->ds_is_snapshot) {
+		dsl_dataset_rele(old, FTAG);
+		error = SET_ERROR(EINVAL);
+	}
+	if (error != 0) {
+		dsl_dataset_rele(new, FTAG);
+		dsl_pool_rele(dp, FTAG);
+		return (error);
 	}
+
+	error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp);
+	dsl_dataset_rele(old, FTAG);
+	dsl_dataset_rele(new, FTAG);
+	dsl_pool_rele(dp, FTAG);
+	fnvlist_add_uint64(outnvl, "used", used);
+	fnvlist_add_uint64(outnvl, "compressed", comp);
+	fnvlist_add_uint64(outnvl, "uncompressed", uncomp);
 	return (error);
 }
 
+#ifdef __FreeBSD__
+
 static int
-zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
+zfs_ioc_jail(zfs_cmd_t *zc)
 {
-	zfs_cmd_t *zc;
-	uint_t vec;
-	int error, rc;
 
-	dprintf("zfsdev_ioctl called \n");
+	return (zone_dataset_attach(curthread->td_ucred, zc->zc_name,
+	    (int)zc->zc_jailid));
+}
 
-	if (getminor(dev) != 0)
-		return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp));
-	dprintf("zfsdev_ioctl -> zvol_ioctl\n");
-	vec = cmd - ZFS_IOC;
-	ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip));
+static int
+zfs_ioc_unjail(zfs_cmd_t *zc)
+{
 
-	if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
-		return (EINVAL);
+	return (zone_dataset_detach(curthread->td_ucred, zc->zc_name,
+	    (int)zc->zc_jailid));
+}
 
-	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+#endif
 
-	error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
-	if ((error == 0) && !(flag & FKIOCTL))
-		error = zfs_ioc_vec[vec].zvec_secpolicy(zc, cr);
+/*
+ * innvl: {
+ *     "fd" -> file descriptor to write stream to (int32)
+ *     (optional) "fromsnap" -> full snap name to send an incremental from
+ *     (optional) "largeblockok" -> (value ignored)
+ *         indicates that blocks > 128KB are permitted
+ *     (optional) "embedok" -> (value ignored)
+ *         presence indicates DRR_WRITE_EMBEDDED records are permitted
+ *     (optional) "resume_object" and "resume_offset" -> (uint64)
+ *         if present, resume send stream from specified object and offset.
+ * }
+ *
+ * outnvl is unused
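+ *
+ * For example (a sketch; the names, fd, and offsets are hypothetical),
+ * resuming an interrupted incremental send:
+ *
+ *     innvl = {
+ *         "fd" -> 8,
+ *         "fromsnap" -> "tank/fs@a",
+ *         "resume_object" -> 193,
+ *         "resume_offset" -> 65536
+ *     }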
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	int error;
+	offset_t off;
+	char *fromname = NULL;
+	int fd;
+	boolean_t largeblockok;
+	boolean_t embedok;
+	uint64_t resumeobj = 0;
+	uint64_t resumeoff = 0;
 
-	/*
-	 * Ensure that all pool/dataset names are valid before we pass down to
-	 * the lower layers.
-	 */
-	if (error == 0) {
-		dprintf("zfsdev_ioctl, zc->zc_name %s\n", zc->zc_name);
-		zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
-		zc->zc_iflags = flag & FKIOCTL;
-		switch (zfs_ioc_vec[vec].zvec_namecheck) {
-		case POOL_NAME:
-			if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
-				error = EINVAL;
-			if (zfs_ioc_vec[vec].zvec_pool_check)
-				error = pool_status_check(zc->zc_name,
-				    zfs_ioc_vec[vec].zvec_namecheck);
-			break;
+	error = nvlist_lookup_int32(innvl, "fd", &fd);
+	if (error != 0)
+		return (SET_ERROR(EINVAL));
 
-		case DATASET_NAME:
-			if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
-				error = EINVAL;
-			if (zfs_ioc_vec[vec].zvec_pool_check)
-				error = pool_status_check(zc->zc_name,
-				    zfs_ioc_vec[vec].zvec_namecheck);
-			break;
+	(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
 
-		case NO_NAME:
-			break;
-		}
-	}
+	largeblockok = nvlist_exists(innvl, "largeblockok");
+	embedok = nvlist_exists(innvl, "embedok");
 
-	dprintf("zfsdev_ioctl -> calling zfs_ioc_vec zvec_func on %d\n", vec);
-	if (error == 0)
-		error = zfs_ioc_vec[vec].zvec_func(zc);
+	(void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
+	(void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
 
-	rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
+	/* fp must be declared outside the #ifdef so both branches see it */
+	file_t *fp;
+#ifdef __FreeBSD__
+	cap_rights_t rights;
+
+	fget_write(curthread, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
+#else
+	fp = getf(fd);
+#endif
+	if (fp == NULL)
+		return (SET_ERROR(EBADF));
+
+	off = fp->f_offset;
+	error = dmu_send(snapname, fromname, embedok, largeblockok, fd,
+#ifdef illumos
+	    resumeobj, resumeoff, fp->f_vnode, &off);
+#else
+	    resumeobj, resumeoff, fp, &off);
+#endif
+
+#ifdef illumos
+	if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
+		fp->f_offset = off;
+#else
+	fp->f_offset = off;
+#endif
+
+	releasef(fd);
+	return (error);
+}
+
+/*
+ * Determine approximately how large a zfs send stream will be -- the number
+ * of bytes that will be written to the fd supplied to zfs_ioc_send_new().
+ *
+ * innvl: {
+ *     (optional) "from" -> full snap or bookmark name to send an incremental
+ *                          from
+ * }
+ *
+ * outnvl: {
+ *     "space" -> bytes of space (uint64)
+ * }
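+ *
+ * For example (hypothetical names), either form of "from" is accepted:
+ *
+ *     innvl = { "from" -> "tank/fs@a" }   (snapshot: deadlist estimate)
+ *     innvl = { "from" -> "tank/fs#jan" } (bookmark: birth-txg estimate)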
+ */
+static int
+zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *tosnap;
+	int error;
+	char *fromname;
+	uint64_t space;
+
+	error = dsl_pool_hold(snapname, FTAG, &dp);
+	if (error != 0)
+		return (error);
+
+	error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap);
+	if (error != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (error);
+	}
+
+	error = nvlist_lookup_string(innvl, "from", &fromname);
 	if (error == 0) {
-		error = rc;
-		if (zfs_ioc_vec[vec].zvec_his_log)
-			zfs_log_history(zc);
+		if (strchr(fromname, '@') != NULL) {
+			/*
+			 * If from is a snapshot, hold it and use the more
+			 * efficient dmu_send_estimate to estimate send space
+			 * size using deadlists.
+			 */
+			dsl_dataset_t *fromsnap;
+			error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
+			if (error != 0)
+				goto out;
+			error = dmu_send_estimate(tosnap, fromsnap, &space);
+			dsl_dataset_rele(fromsnap, FTAG);
+		} else if (strchr(fromname, '#') != NULL) {
+			/*
+			 * If from is a bookmark, fetch the creation TXG of the
+			 * snapshot it was created from and use that to find
+			 * blocks that were born after it.
+			 */
+			zfs_bookmark_phys_t frombm;
+
+			error = dsl_bookmark_lookup(dp, fromname, tosnap,
+			    &frombm);
+			if (error != 0)
+				goto out;
+			error = dmu_send_estimate_from_txg(tosnap,
+			    frombm.zbm_creation_txg, &space);
+		} else {
+			/*
+			 * from is not properly formatted as a snapshot or
+			 * bookmark
+			 */
+			error = SET_ERROR(EINVAL);
+			goto out;
+		}
+	} else {
+		/* If estimating the size of a full send, use dmu_send_estimate */
+		error = dmu_send_estimate(tosnap, NULL, &space);
 	}
 
-	kmem_free(zc, sizeof (zfs_cmd_t));
-	dprintf("zfsdev_ioctl %d\n", error);
+	fnvlist_add_uint64(outnvl, "space", space);
+
+out:
+	dsl_dataset_rele(tosnap, FTAG);
+	dsl_pool_rele(dp, FTAG);
 	return (error);
 }
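+
+/*
+ * Illustrative sketch (an assumption for clarity, not part of this
+ * change): user space reaches this ioctl through libzfs_core, where
+ * lzc_send_space() builds the "from" innvl and reads "space" back out
+ * of the outnvl documented above, roughly:
+ *
+ *	uint64_t space;
+ *
+ *	if (lzc_send_space("tank/fs@snap2", "tank/fs@snap1", &space) == 0)
+ *		printf("estimated stream size: %ju\n", (uintmax_t)space);
+ */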
 
-#ifdef __NetBSD__
+static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
 
-#include <sys/module.h>
-#include <uvm/uvm_extern.h>
+static void
+zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+    zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
+    boolean_t log_history, zfs_ioc_poolcheck_t pool_check)
+{
+	zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
 
-MODULE(MODULE_CLASS_DRIVER, zfs, "solaris");
+	ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
+	ASSERT3U(ioc, <, ZFS_IOC_LAST);
+	ASSERT3P(vec->zvec_legacy_func, ==, NULL);
+	ASSERT3P(vec->zvec_func, ==, NULL);
 
-static int
-nb_zvol_copen(dev_t dev, int flag, int mode, lwp_t *l)
+	vec->zvec_legacy_func = func;
+	vec->zvec_secpolicy = secpolicy;
+	vec->zvec_namecheck = namecheck;
+	vec->zvec_allow_log = log_history;
+	vec->zvec_pool_check = pool_check;
+}
+
+/*
+ * See the block comment at the beginning of this file for details on
+ * each argument to this function.
+ */
+static void
+zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func,
+    zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
+    zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist,
+    boolean_t allow_log)
+{
+	zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
+
+	ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
+	ASSERT3U(ioc, <, ZFS_IOC_LAST);
+	ASSERT3P(vec->zvec_legacy_func, ==, NULL);
+	ASSERT3P(vec->zvec_func, ==, NULL);
+
+	/* if we are logging, the name must be valid */
+	ASSERT(!allow_log || namecheck != NO_NAME);
+
+	vec->zvec_name = name;
+	vec->zvec_func = func;
+	vec->zvec_secpolicy = secpolicy;
+	vec->zvec_namecheck = namecheck;
+	vec->zvec_pool_check = pool_check;
+	vec->zvec_smush_outnvlist = smush_outnvlist;
+	vec->zvec_allow_log = allow_log;
+}
+
+static void
+zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+    zfs_secpolicy_func_t *secpolicy, boolean_t log_history,
+    zfs_ioc_poolcheck_t pool_check)
 {
+	zfs_ioctl_register_legacy(ioc, func, secpolicy,
+	    POOL_NAME, log_history, pool_check);
+}
 
-	return zvol_open(&dev, flag, OTYPCHR, kauth_cred_get());
+static void
+zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+    zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check)
+{
+	zfs_ioctl_register_legacy(ioc, func, secpolicy,
+	    DATASET_NAME, B_FALSE, pool_check);
 }
 
-static int
-nb_zvol_cclose(dev_t dev, int flag, int mode, lwp_t *l)
+static void
+zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
 {
+	zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config,
+	    POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+}
 
-	return zvol_close(dev, flag, OTYPCHR, kauth_cred_get());
+static void
+zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+    zfs_secpolicy_func_t *secpolicy)
+{
+	zfs_ioctl_register_legacy(ioc, func, secpolicy,
+	    NO_NAME, B_FALSE, POOL_CHECK_NONE);
 }
 
-static int
-nb_zvol_bopen(dev_t dev, int flag, int mode, lwp_t *l)
+static void
+zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc,
+    zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy)
+{
+	zfs_ioctl_register_legacy(ioc, func, secpolicy,
+	    DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED);
+}
+
+static void
+zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
 {
+	zfs_ioctl_register_dataset_read_secpolicy(ioc, func,
+	    zfs_secpolicy_read);
+}
 
-	return zvol_open(&dev, flag, OTYPBLK, kauth_cred_get());
+static void
+zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+    zfs_secpolicy_func_t *secpolicy)
+{
+	zfs_ioctl_register_legacy(ioc, func, secpolicy,
+	    DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
 }
 
-static int
-nb_zvol_bclose(dev_t dev, int flag, int mode, lwp_t *l)
+static void
+zfs_ioctl_init(void)
 {
+	zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
+	    zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
+	zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY,
+	    zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE);
+
+	zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS,
+	    zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME,
+	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+
+	zfs_ioctl_register("send", ZFS_IOC_SEND_NEW,
+	    zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME,
+	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+
+	zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE,
+	    zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME,
+	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+
+	zfs_ioctl_register("create", ZFS_IOC_CREATE,
+	    zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
+	zfs_ioctl_register("clone", ZFS_IOC_CLONE,
+	    zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
+	zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS,
+	    zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
+	zfs_ioctl_register("hold", ZFS_IOC_HOLD,
+	    zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+	zfs_ioctl_register("release", ZFS_IOC_RELEASE,
+	    zfs_ioc_release, zfs_secpolicy_release, POOL_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
+	zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS,
+	    zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
+	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+
+	zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
+	    zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE);
+
+	zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
+	    zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
+	zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
+	    zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
+	    POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+
+	zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
+	    zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
+	    POOL_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
+	/* IOCTLS that use the legacy function signature */
+
+	zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
+	    zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY);
+
+	zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create,
+	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
+	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN,
+	    zfs_ioc_pool_scan);
+	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE,
+	    zfs_ioc_pool_upgrade);
+	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD,
+	    zfs_ioc_vdev_add);
+	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE,
+	    zfs_ioc_vdev_remove);
+	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE,
+	    zfs_ioc_vdev_set_state);
+	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH,
+	    zfs_ioc_vdev_attach);
+	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH,
+	    zfs_ioc_vdev_detach);
+	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH,
+	    zfs_ioc_vdev_setpath);
+	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU,
+	    zfs_ioc_vdev_setfru);
+	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS,
+	    zfs_ioc_pool_set_props);
+	zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT,
+	    zfs_ioc_vdev_split);
+	zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID,
+	    zfs_ioc_pool_reguid);
+
+	zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS,
+	    zfs_ioc_pool_configs, zfs_secpolicy_none);
+	zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT,
+	    zfs_ioc_pool_tryimport, zfs_secpolicy_config);
+	zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT,
+	    zfs_ioc_inject_fault, zfs_secpolicy_inject);
+	zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT,
+	    zfs_ioc_clear_fault, zfs_secpolicy_inject);
+	zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT,
+	    zfs_ioc_inject_list_next, zfs_secpolicy_inject);
+
+	/*
+	 * Pool destroy and export don't log their history as part of
+	 * zfsdev_ioctl; rather, zfs_ioc_pool_export does the logging
+	 * of those commands.
+	 */
+	zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy,
+	    zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
+	zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export,
+	    zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE);
+
+	zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats,
+	    zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
+	zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props,
+	    zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
+
+	zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log,
+	    zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE);
+	zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME,
+	    zfs_ioc_dsobj_to_dsname,
+	    zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE);
+	zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY,
+	    zfs_ioc_pool_get_history,
+	    zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
+
+	zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import,
+	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
+
+	zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
+	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
+	zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
+	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
+
+	zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN,
+	    zfs_ioc_space_written);
+	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS,
+	    zfs_ioc_objset_recvd_props);
+	zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ,
+	    zfs_ioc_next_obj);
+	zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL,
+	    zfs_ioc_get_fsacl);
+	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS,
+	    zfs_ioc_objset_stats);
+	zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS,
+	    zfs_ioc_objset_zplprops);
+	zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT,
+	    zfs_ioc_dataset_list_next);
+	zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT,
+	    zfs_ioc_snapshot_list_next);
+	zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS,
+	    zfs_ioc_send_progress);
+
+	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF,
+	    zfs_ioc_diff, zfs_secpolicy_diff);
+	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS,
+	    zfs_ioc_obj_to_stats, zfs_secpolicy_diff);
+	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH,
+	    zfs_ioc_obj_to_path, zfs_secpolicy_diff);
+	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE,
+	    zfs_ioc_userspace_one, zfs_secpolicy_userspace_one);
+	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY,
+	    zfs_ioc_userspace_many, zfs_secpolicy_userspace_many);
+	zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND,
+	    zfs_ioc_send, zfs_secpolicy_send);
+
+	zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop,
+	    zfs_secpolicy_none);
+	zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
+	    zfs_secpolicy_destroy);
+	zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename,
+	    zfs_secpolicy_rename);
+	zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
+	    zfs_secpolicy_recv);
+	zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote,
+	    zfs_secpolicy_promote);
+	zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP,
+	    zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop);
+	zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl,
+	    zfs_secpolicy_set_fsacl);
+
+	zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share,
+	    zfs_secpolicy_share, POOL_CHECK_NONE);
+	zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl,
+	    zfs_secpolicy_smb_acl, POOL_CHECK_NONE);
+	zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE,
+	    zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+	zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT,
+	    zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+
+#ifdef __FreeBSD__
+	zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail,
+	    zfs_secpolicy_config, POOL_CHECK_NONE);
+	zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
+	    zfs_secpolicy_config, POOL_CHECK_NONE);
+	zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT,
+	    zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME,
+	    POOL_CHECK_NONE, B_FALSE, B_FALSE);
+#endif
+}
+
+int
+pool_status_check(const char *name, zfs_ioc_namecheck_t type,
+    zfs_ioc_poolcheck_t check)
+{
+	spa_t *spa;
+	int error;
 
-	return zvol_close(dev, flag, OTYPBLK, kauth_cred_get());
+	ASSERT(type == POOL_NAME || type == DATASET_NAME);
+
+	if (check & POOL_CHECK_NONE)
+		return (0);
+
+	error = spa_open(name, &spa, FTAG);
+	if (error == 0) {
+		if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa))
+			error = SET_ERROR(EAGAIN);
+		else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa))
+			error = SET_ERROR(EROFS);
+		spa_close(spa, FTAG);
+	}
+	return (error);
 }
 
-static int
-nb_zvol_read(dev_t dev, struct uio *uio, int flag)
+/*
+ * Find a free minor number.
+ */
+minor_t
+zfsdev_minor_alloc(void)
 {
+	static minor_t last_minor;
+	minor_t m;
 
-	return zvol_read(dev, uio, kauth_cred_get());
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
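+	/*
+	 * Scan forward from the most recent allocation, wrapping back
+	 * to 1 after ZFSDEV_MAX_MINOR; 0 is never handed out, so it can
+	 * double as the "no free minor" return value.
+	 */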
+	for (m = last_minor + 1; m != last_minor; m++) {
+		if (m > ZFSDEV_MAX_MINOR)
+			m = 1;
+		if (ddi_get_soft_state(zfsdev_state, m) == NULL) {
+			last_minor = m;
+			return (m);
+		}
+	}
+
+	return (0);
 }
 
+#ifdef __FreeBSD__
 static int
-nb_zvol_write(dev_t dev, struct uio *uio, int flag)
+zfs_ctldev_init(struct cdev *devp)
+#else
+static int
+zfs_ctldev_init(dev_t *devp)
+#endif
 {
+	minor_t minor;
+	zfs_soft_state_t *zs;
 
-	return zvol_write(dev, uio, kauth_cred_get());
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	minor = zfsdev_minor_alloc();
+	if (minor == 0)
+		return (SET_ERROR(ENXIO));
+
+	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS)
+		return (SET_ERROR(EAGAIN));
+
+#ifdef __FreeBSD__
+	devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close);
+#endif
+
+	zs = ddi_get_soft_state(zfsdev_state, minor);
+	zs->zss_type = ZSST_CTLDEV;
+	zfs_onexit_init((zfs_onexit_t **)&zs->zss_data);
+
+	return (0);
 }
 
+static void
+zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor)
+{
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	zfs_onexit_destroy(zo);
+	ddi_soft_state_free(zfsdev_state, minor);
+}
+
+void *
+zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which)
+{
+	zfs_soft_state_t *zp;
+
+	zp = ddi_get_soft_state(zfsdev_state, minor);
+	if (zp == NULL || zp->zss_type != which)
+		return (NULL);
+
+	return (zp->zss_data);
+}
+
+#ifdef __FreeBSD__
 static int
-nb_zfsdev_ioctl(dev_t dev, u_long cmd, void *argp, int flag, lwp_t *l)
+zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
+#endif
+#ifdef __NetBSD__
+static int
+zfsdev_open(dev_t *devp, int flag, int otyp, cred_t *cr)
+#endif
 {
-	int rval;
+	int error = 0;
 
-	return zfsdev_ioctl(dev, cmd, (intptr_t)argp, flag, kauth_cred_get(),
-	    &rval);
+#ifndef __FreeBSD__
+	if (getminor(*devp) != 0)
+		return (zvol_open(devp, flag, otyp, cr));
+#endif
+
+	/* This is the control device. Allocate a new minor if requested. */
+	if (flag & FEXCL) {
+		mutex_enter(&spa_namespace_lock);
+		error = zfs_ctldev_init(devp);
+		mutex_exit(&spa_namespace_lock);
+	}
+
+	return (error);
 }
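+
+/*
+ * Illustrative note (user-space side, an assumption for clarity): a
+ * process that wants its own cleanup context, e.g. zfs recv, obtains a
+ * fresh minor simply by opening the control node exclusively:
+ *
+ *	int cleanup_fd = open("/dev/zfs", O_RDWR | O_EXCL);
+ *
+ * O_EXCL maps to FEXCL above and triggers the clone-open.
+ */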
 
-const struct bdevsw zfs_bdevsw = {
-	.d_open = nb_zvol_bopen,
-	.d_close = nb_zvol_bclose,
-	.d_strategy = zvol_strategy,
-	.d_ioctl = nb_zfsdev_ioctl,
-	.d_dump = nodump,
-	.d_psize = nosize,
-	.d_flag = D_DISK | D_MPSAFE
-};
+#ifdef __FreeBSD__
+static void
+zfsdev_close(void *data)
+#endif
+#ifdef __NetBSD__
+static int
+zfsdev_close(dev_t dev, int flag, int otyp, cred_t *cr)
+#endif
+{
+	zfs_onexit_t *zo;
+#ifdef __FreeBSD__
+	minor_t minor = (minor_t)(uintptr_t)data;
+#endif
+#ifdef __NetBSD__
+	minor_t minor = getminor(dev);
+#endif
 
-const struct cdevsw zfs_cdevsw = {
-	.d_open = nb_zvol_copen,
-	.d_close = nb_zvol_cclose,
-	.d_read = nb_zvol_read,
-	.d_write = nb_zvol_write,
-	.d_ioctl = nb_zfsdev_ioctl,
-	.d_stop = nostop,
-	.d_tty = notty,
-	.d_poll = nopoll,
-	.d_mmap = nommap,
-	.d_kqfilter = nokqfilter,
-	.d_flag = D_DISK | D_MPSAFE
-};
+	if (minor == 0)
+#ifdef __FreeBSD__
+		return;
+#else
+		return (0);
+#endif
 
-uint_t zfs_fsyncer_key;
-extern uint_t rrw_tsd_key;
+	mutex_enter(&spa_namespace_lock);
+	zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
+	if (zo == NULL) {
+		mutex_exit(&spa_namespace_lock);
+#ifdef __FreeBSD__
+		return;
+#else
+		return (0);
+#endif
+	}
+	zfs_ctldev_destroy(zo, minor);
+	mutex_exit(&spa_namespace_lock);
 
-/* ZFS must be used on machines with at least 512Mb. */
-#define ZFS_MIN_MEGS 512 
+#ifndef __FreeBSD__
+	return (0);
+#endif
+}
 
+#ifdef __FreeBSD__
 static int
-zfs_modcmd(modcmd_t cmd, void *arg)
+zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag,
+    struct thread *td)
+#endif
+#ifdef __NetBSD__
+static int
+zfsdev_ioctl(dev_t dev, int zcmd, intptr_t iarg, int flag, cred_t *cr,
+    int *rvalp)
+#endif
 {
-	int error;
-	int active, inactive;
-	uint64_t availrmem;
-	
-	switch (cmd) {
-	case MODULE_CMD_INIT:
-		if (!rootvnode)
-			return EAGAIN;
+	zfs_cmd_t *zc;
+	uint_t vecnum;
+	int error, rc, len;
+	zfs_iocparm_t *zc_iocparm;
+	int cflag, cmd, oldvecnum;
+	boolean_t newioc, compat;
+	void *compat_zc = NULL;
+#ifdef __FreeBSD__
+	cred_t *cr = td->td_ucred;
+#endif
+	const zfs_ioc_vec_t *vec;
+	char *saved_poolname = NULL;
+	nvlist_t *innvl = NULL;
+#ifdef __NetBSD__
+	caddr_t arg = (caddr_t)iarg;
+#endif
 
-		printf("WARNING: ZFS on NetBSD is under development\n");
-		availrmem = (uint64_t)physmem * PAGE_SIZE / 1048576;
-		if (availrmem < ZFS_MIN_MEGS * 80 / 100) {
-			printf("ERROR: at least %dMB of memory required to"
-			    "use ZFS\n", ZFS_MIN_MEGS);
-			return ENOMEM;
+#if defined(illumos) || defined(__NetBSD__)
+	minor_t minor = getminor(dev);
+
+	if (minor != 0 &&
+	    zfsdev_get_soft_state(minor, ZSST_CTLDEV) == NULL)
+		return (zvol_ioctl(dev, zcmd, iarg, flag, cr, rvalp));
+#endif
+#ifdef illumos
+	vecnum = cmd - ZFS_IOC_FIRST;
+	ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip));
+#endif
+
+	cflag = ZFS_CMD_COMPAT_NONE;
+	compat = B_FALSE;
+	newioc = B_TRUE;	/* "new" style (zfs_iocparm_t) ioctl */
+	len = IOCPARM_LEN(zcmd);
+	vecnum = cmd = zcmd & 0xff;
+
+	/*
+	 * Check whether we are talking to a supported older binary
+	 * and translate the zfs_cmd if necessary.
+	 */
+	if (len != sizeof(zfs_iocparm_t)) {
+		newioc = B_FALSE;
+		compat = B_TRUE;
+
+		vecnum = cmd;
+
+		switch (len) {
+		case sizeof(zfs_cmd_zcmd_t):
+			cflag = ZFS_CMD_COMPAT_LZC;
+			break;
+		case sizeof(zfs_cmd_deadman_t):
+			cflag = ZFS_CMD_COMPAT_DEADMAN;
+			break;
+		case sizeof(zfs_cmd_v28_t):
+			cflag = ZFS_CMD_COMPAT_V28;
+			break;
+		case sizeof(zfs_cmd_v15_t):
+			cflag = ZFS_CMD_COMPAT_V15;
+			vecnum = zfs_ioctl_v15_to_v28[cmd];
+
+			/*
+			 * Return without further handling
+			 * if the command is blacklisted.
+			 */
+			if (vecnum == ZFS_IOC_COMPAT_PASS)
+				return (0);
+			else if (vecnum == ZFS_IOC_COMPAT_FAIL)
+				return (ENOTSUP);
+			break;
+		default:
+			return (EINVAL);
 		}
-		error = lwp_specific_key_create(&zfs_fsyncer_key, NULL);
-		if (error != 0) {
-			return error;
+	}
+
+	if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
+		return (SET_ERROR(EINVAL));
+	vec = &zfs_ioc_vec[vecnum];
+
+	zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
+
+#ifdef illumos
+	error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
+	if (error != 0) {
+		error = SET_ERROR(EFAULT);
+		goto out;
+	}
+#else	/* !illumos */
+
+	bzero(zc, sizeof(zfs_cmd_t));
+
+	if (newioc) {
+		zc_iocparm = (void *)arg;
+
+		switch (zc_iocparm->zfs_ioctl_version) {
+		case ZFS_IOCVER_CURRENT:
+			if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) {
+				error = SET_ERROR(EINVAL);
+				goto out;
+			}
+			break;
+		case ZFS_IOCVER_INLANES:
+			if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_inlanes_t)) {
+				error = SET_ERROR(EFAULT);
+				goto out;
+			}
+			compat = B_TRUE;
+			cflag = ZFS_CMD_COMPAT_INLANES;
+			break;
+		case ZFS_IOCVER_RESUME:
+			if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) {
+				error = SET_ERROR(EFAULT);
+				goto out;
+			}
+			compat = B_TRUE;
+			cflag = ZFS_CMD_COMPAT_RESUME;
+			break;
+		case ZFS_IOCVER_EDBP:
+			if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) {
+				error = SET_ERROR(EFAULT);
+				goto out;
+			}
+			compat = B_TRUE;
+			cflag = ZFS_CMD_COMPAT_EDBP;
+			break;
+		case ZFS_IOCVER_ZCMD:
+			if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) ||
+			    zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) {
+				error = SET_ERROR(EFAULT);
+				goto out;
+			}
+			compat = B_TRUE;
+			cflag = ZFS_CMD_COMPAT_ZCMD;
+			break;
+		default:
+			error = SET_ERROR(EINVAL);
+			goto out;
+			/* NOTREACHED */
 		}
-		error = lwp_specific_key_create(&rrw_tsd_key, NULL);
-		if (error != 0) {
-			lwp_specific_key_delete(zfs_fsyncer_key);
-			return error;
+
+		if (compat) {
+			ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
+			compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
+			bzero(compat_zc, sizeof(zfs_cmd_t));
+
+			error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
+			    compat_zc, zc_iocparm->zfs_cmd_size, flag);
+			if (error != 0) {
+				error = SET_ERROR(EFAULT);
+				goto out;
+			}
+		} else {
+			error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
+			    zc, zc_iocparm->zfs_cmd_size, flag);
+			if (error != 0) {
+				error = SET_ERROR(EFAULT);
+				goto out;
+			}
 		}
-		spa_init(FREAD | FWRITE);
-		zvol_init();
-		zfs_vfsinit(16, MOUNT_ZFS); /* I need to use well defined args. */
-		error = devsw_attach("zfs", &zfs_bdevsw, &zfs_bmajor,
-		    &zfs_cdevsw, &zfs_cmajor);
-		if (error != 0) {
-			zvol_fini();
-			zfs_vfsfini();
-			spa_fini();
-			lwp_specific_key_delete(zfs_fsyncer_key);
-			lwp_specific_key_delete(rrw_tsd_key);
+	}
+
+	if (compat) {
+		if (newioc) {
+			ASSERT(compat_zc != NULL);
+			zfs_cmd_compat_get(zc, compat_zc, cflag);
+		} else {
+			ASSERT(compat_zc == NULL);
+			zfs_cmd_compat_get(zc, arg, cflag);
 		}
-		return error;
+		oldvecnum = vecnum;
+		error = zfs_ioctl_compat_pre(zc, &vecnum, cflag);
+		if (error != 0)
+			goto out;
+		if (oldvecnum != vecnum)
+			vec = &zfs_ioc_vec[vecnum];
+	}
+#endif	/* !illumos */
 
-	case MODULE_CMD_FINI:
-		if (spa_busy() || zfs_busy() || zvol_busy() ||
-		    zio_injection_enabled)
-			return EBUSY;
-		error = devsw_detach(&zfs_bdevsw, &zfs_cdevsw);
-		zvol_fini();
-		zfs_vfsfini();
-		spa_fini();
-		lwp_specific_key_delete(zfs_fsyncer_key);
-		lwp_specific_key_delete(rrw_tsd_key);
-		return error;
+	zc->zc_iflags = flag & FKIOCTL;
+	if (zc->zc_nvlist_src_size != 0) {
+		error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+		    zc->zc_iflags, &innvl);
+		if (error != 0)
+			goto out;
+	}
 
-	case MODULE_CMD_AUTOUNLOAD:
-		/*
-		 * We don't want to be autounloaded because unlike
-		 * other subsystems, we read our own configuration
-		 * from disk and provide things that might be used
-		 * later (zvols).
-		 */
-		return EBUSY;
+	/* rewrite innvl for backwards compatibility */
+	if (compat)
+		innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag);
 
-	default:
-		return ENOTTY;
+	/*
+	 * Ensure that all pool/dataset names are valid before we pass down to
+	 * the lower layers.
+	 */
+	zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+	switch (vec->zvec_namecheck) {
+	case POOL_NAME:
+		if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
+			error = SET_ERROR(EINVAL);
+		else
+			error = pool_status_check(zc->zc_name,
+			    vec->zvec_namecheck, vec->zvec_pool_check);
+		break;
+
+	case DATASET_NAME:
+		if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
+			error = SET_ERROR(EINVAL);
+		else
+			error = pool_status_check(zc->zc_name,
+			    vec->zvec_namecheck, vec->zvec_pool_check);
+		break;
+
+	case NO_NAME:
+		break;
 	}
-}
 
-#else	/* __NetBSD__ */
+	if (error == 0)
+		error = vec->zvec_secpolicy(zc, innvl, cr);
 
-int
-_fini(void)
-{
-	int error;
+	if (error != 0)
+		goto out;
 
-	if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled)
-		return (EBUSY);
+	/* legacy ioctls can modify zc_name */
+	len = strcspn(zc->zc_name, "/@#") + 1;
+	saved_poolname = kmem_alloc(len, KM_SLEEP);
+	(void) strlcpy(saved_poolname, zc->zc_name, len);
+
+	if (vec->zvec_func != NULL) {
+		nvlist_t *outnvl;
+		int puterror = 0;
+		spa_t *spa;
+		nvlist_t *lognv = NULL;
 
-	if ((error = mod_remove(&modlinkage)) != 0)
-		return (error);
+		ASSERT(vec->zvec_legacy_func == NULL);
 
-	zvol_fini();
-	zfs_fini();
-	spa_fini();
-	if (zfs_nfsshare_inited)
-		(void) ddi_modclose(nfs_mod);
-	if (zfs_smbshare_inited)
-		(void) ddi_modclose(smbsrv_mod);
-	if (zfs_nfsshare_inited || zfs_smbshare_inited)
-		(void) ddi_modclose(sharefs_mod);
+		/*
+		 * Add the innvl to the lognv before calling the func,
+		 * in case the func changes the innvl.
+		 */
+		if (vec->zvec_allow_log) {
+			lognv = fnvlist_alloc();
+			fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL,
+			    vec->zvec_name);
+			if (!nvlist_empty(innvl)) {
+				fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL,
+				    innvl);
+			}
+		}
+
+		outnvl = fnvlist_alloc();
+		error = vec->zvec_func(zc->zc_name, innvl, outnvl);
+
+		if (error == 0 && vec->zvec_allow_log &&
+		    spa_open(zc->zc_name, &spa, FTAG) == 0) {
+			if (!nvlist_empty(outnvl)) {
+				fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL,
+				    outnvl);
+			}
+			(void) spa_history_log_nvl(spa, lognv);
+			spa_close(spa, FTAG);
+		}
+		fnvlist_free(lognv);
+
+		/* rewrite outnvl for backwards compatibility */
+		if (compat)
+			outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum,
+			    cflag);
+
+		if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) {
+			int smusherror = 0;
+			if (vec->zvec_smush_outnvlist) {
+				smusherror = nvlist_smush(outnvl,
+				    zc->zc_nvlist_dst_size);
+			}
+			if (smusherror == 0)
+				puterror = put_nvlist(zc, outnvl);
+		}
+
+		if (puterror != 0)
+			error = puterror;
+
+		nvlist_free(outnvl);
+	} else {
+		error = vec->zvec_legacy_func(zc);
+	}
 
-	tsd_destroy(&zfs_fsyncer_key);
-	ldi_ident_release(zfs_li);
-	zfs_li = NULL;
-	mutex_destroy(&zfs_share_lock);
+out:
+	nvlist_free(innvl);
+
+#ifdef illumos
+	rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
+	if (error == 0 && rc != 0)
+		error = SET_ERROR(EFAULT);
+#else
+	if (compat) {
+		zfs_ioctl_compat_post(zc, cmd, cflag);
+		if (newioc) {
+			ASSERT(compat_zc != NULL);
+			ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
+
+			zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag);
+			rc = ddi_copyout(compat_zc,
+			    (void *)(uintptr_t)zc_iocparm->zfs_cmd,
+			    zc_iocparm->zfs_cmd_size, flag);
+			if (error == 0 && rc != 0)
+				error = SET_ERROR(EFAULT);
+			kmem_free(compat_zc, sizeof (zfs_cmd_t));
+		} else {
+			zfs_cmd_compat_put(zc, arg, vecnum, cflag);
+		}
+	} else {
+		ASSERT(newioc);
+
+		rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd,
+		    sizeof (zfs_cmd_t), flag);
+		if (error == 0 && rc != 0)
+			error = SET_ERROR(EFAULT);
+	}
+#endif
+	if (error == 0 && vec->zvec_allow_log) {
+		char *s = tsd_get(zfs_allow_log_key);
+		if (s != NULL)
+			strfree(s);
+		(void) tsd_set(zfs_allow_log_key, saved_poolname);
+	} else {
+		if (saved_poolname != NULL)
+			strfree(saved_poolname);
+	}
 
+	kmem_free(zc, sizeof (zfs_cmd_t));
 	return (error);
 }
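+
+/*
+ * Illustrative sketch (an assumption, not new code): on the "new"
+ * ioctl path user space hands in a small zfs_iocparm_t that merely
+ * describes and points at the real zfs_cmd_t:
+ *
+ *	zfs_iocparm_t zp;
+ *
+ *	zp.zfs_ioctl_version = ZFS_IOCVER_CURRENT;
+ *	zp.zfs_cmd = (uint64_t)(uintptr_t)&zc;
+ *	zp.zfs_cmd_size = sizeof (zfs_cmd_t);
+ *	ioctl(fd, ZFS_IOC_POOL_STATS, &zp);
+ *
+ * This is why IOCPARM_LEN(zcmd) != sizeof (zfs_iocparm_t) above is a
+ * reliable marker of an older compat binary.
+ */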
 
+static void
+zfs_allow_log_destroy(void *arg)
+{
+	char *poolname = arg;
+	strfree(poolname);
+}
+
+#ifdef illumos
 static int
 zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 {
@@ -4752,8 +6713,8 @@ zfs_info(dev_info_t *dip, ddi_info_cmd_t
  * so most of the standard driver entry points are in zvol.c.
  */
 static struct cb_ops zfs_cb_ops = {
-	zvol_open,	/* open */
-	zvol_close,	/* close */
+	zfsdev_open,	/* open */
+	zfsdev_close,	/* close */
 	zvol_strategy,	/* strategy */
 	nodev,		/* print */
 	zvol_dump,	/* dump */
@@ -4800,10 +6761,6 @@ static struct modlinkage modlinkage = {
 	NULL
 };
 
-
-uint_t zfs_fsyncer_key;
-extern uint_t rrw_tsd_key;
-
 int
 _init(void)
 {
@@ -4812,6 +6769,7 @@ _init(void)
 	spa_init(FREAD | FWRITE);
 	zfs_init();
 	zvol_init();
+	zfs_ioctl_init();
 
 	if ((error = mod_install(&modlinkage)) != 0) {
 		zvol_fini();
@@ -4821,7 +6779,9 @@ _init(void)
 	}
 
 	tsd_create(&zfs_fsyncer_key, NULL);
-	tsd_create(&rrw_tsd_key, NULL);
+	tsd_create(&zfs_putpages_key, NULL);
+	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
+	tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
 
 	error = ldi_ident_from_mod(&modlinkage, &zfs_li);
 	ASSERT(error == 0);
@@ -4836,7 +6796,7 @@ _fini(void)
 	int error;
 
 	if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled)
-		return (EBUSY);
+		return (SET_ERROR(EBUSY));
 
 	if ((error = mod_remove(&modlinkage)) != 0)
 		return (error);
@@ -4864,4 +6824,382 @@ _info(struct modinfo *modinfop)
 {
 	return (mod_info(&modlinkage, modinfop));
 }
-#endif	/* __NetBSD__ */
+#endif	/* illumos */
+
+#ifdef __FreeBSD__
+static struct cdevsw zfs_cdevsw = {
+	.d_version =	D_VERSION,
+	.d_open =	zfsdev_open,
+	.d_ioctl =	zfsdev_ioctl,
+	.d_name =	ZFS_DEV_NAME
+};
+
+static void
+zfsdev_init(void)
+{
+	zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666,
+	    ZFS_DEV_NAME);
+}
+
+static void
+zfsdev_fini(void)
+{
+	if (zfsdev != NULL)
+		destroy_dev(zfsdev);
+}
+
+static struct root_hold_token *zfs_root_token;
+struct proc *zfsproc;
+
+static int zfs__init(void);
+static int zfs__fini(void);
+static void zfs_shutdown(void *, int);
+
+static eventhandler_tag zfs_shutdown_event_tag;
+
+#define ZFS_MIN_KSTACK_PAGES 4
+
+int
+zfs__init(void)
+{
+
+#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
+	printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
+	    "overflow panic!\nPlease consider adding "
+	    "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
+	    ZFS_MIN_KSTACK_PAGES);
+#endif
+	zfs_root_token = root_mount_hold("ZFS");
+
+	mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	spa_init(FREAD | FWRITE);
+	zfs_init();
+	zvol_init();
+	zfs_ioctl_init();
+
+	tsd_create(&zfs_fsyncer_key, NULL);
+	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
+	tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
+	tsd_create(&zfs_geom_probe_vdev_key, NULL);
+
+	printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n");
+	root_mount_rel(zfs_root_token);
+
+	zfsdev_init();
+
+	return (0);
+}
+
+int
+zfs__fini(void)
+{
+	if (spa_busy() || zfs_busy() || zvol_busy() ||
+	    zio_injection_enabled) {
+		return (EBUSY);
+	}
+
+	zfsdev_fini();
+	zvol_fini();
+	zfs_fini();
+	spa_fini();
+
+	tsd_destroy(&zfs_fsyncer_key);
+	tsd_destroy(&rrw_tsd_key);
+	tsd_destroy(&zfs_allow_log_key);
+
+	mutex_destroy(&zfs_share_lock);
+
+	return (0);
+}
+
+static void
+zfs_shutdown(void *arg __unused, int howto __unused)
+{
+
+	/*
+	 * ZFS fini routines cannot work properly in a panicked system.
+	 */
+	if (panicstr == NULL)
+		(void)zfs__fini();
+}
+
+static int
+zfs_modevent(module_t mod, int type, void *unused __unused)
+{
+	int err;
+
+	switch (type) {
+	case MOD_LOAD:
+		err = zfs__init();
+		if (err == 0)
+			zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
+			    shutdown_post_sync, zfs_shutdown, NULL,
+			    SHUTDOWN_PRI_FIRST);
+		return (err);
+	case MOD_UNLOAD:
+		err = zfs__fini();
+		if (err == 0 && zfs_shutdown_event_tag != NULL)
+			EVENTHANDLER_DEREGISTER(shutdown_post_sync,
+			    zfs_shutdown_event_tag);
+		return (err);
+	case MOD_SHUTDOWN:
+		return (0);
+	default:
+		break;
+	}
+	return (EOPNOTSUPP);
+}
+
+static moduledata_t zfs_mod = {
+	"zfsctrl",
+	zfs_modevent,
+	0
+};
+DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY);
+MODULE_VERSION(zfsctrl, 1);
+MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);
+
+#endif /* __FreeBSD__ */
+
+#ifdef __NetBSD__
+
+#include <sys/module.h>
+#include <uvm/uvm_extern.h>
+
+MODULE(MODULE_CLASS_DRIVER, zfs, "solaris");
+
+static int
+nb_zfsdev_copen(dev_t dev, int flag, int mode, lwp_t *l)
+{
+
+	return zfsdev_open(&dev, flag, OTYPCHR, kauth_cred_get());
+}
+
+static int
+nb_zfsdev_cclose(dev_t dev, int flag, int mode, lwp_t *l)
+{
+
+	return zfsdev_close(dev, flag, OTYPCHR, kauth_cred_get());
+}
+
+static int
+nb_zfsdev_bopen(dev_t dev, int flag, int mode, lwp_t *l)
+{
+
+	return zfsdev_open(&dev, flag, OTYPBLK, kauth_cred_get());
+}
+
+static int
+nb_zfsdev_bclose(dev_t dev, int flag, int mode, lwp_t *l)
+{
+
+	return zfsdev_close(dev, flag, OTYPBLK, kauth_cred_get());
+}
+
+static int
+nb_zvol_read(dev_t dev, struct uio *uio, int flag)
+{
+
+	return zvol_read(dev, uio, kauth_cred_get());
+}
+
+static int
+nb_zvol_write(dev_t dev, struct uio *uio, int flag)
+{
+
+	return zvol_write(dev, uio, kauth_cred_get());
+}
+
+static int
+nb_zfsdev_ioctl(dev_t dev, u_long cmd, void *argp, int flag, lwp_t *l)
+{
+	int rval;
+
+	return zfsdev_ioctl(dev, cmd, (intptr_t)argp, flag, kauth_cred_get(),
+	    &rval);
+}
+
+static void
+nb_zvol_strategy(struct buf *bp)
+{
+
+	(void) zvol_strategy(bp);
+}
+
+const struct bdevsw zfs_bdevsw = {
+	.d_open = nb_zfsdev_bopen,
+	.d_close = nb_zfsdev_bclose,
+	.d_strategy = nb_zvol_strategy,
+	.d_ioctl = nb_zfsdev_ioctl,
+	.d_dump = nodump,
+	.d_psize = nosize,
+	.d_flag = D_DISK | D_MPSAFE
+};
+
+const struct cdevsw zfs_cdevsw = {
+	.d_open = nb_zfsdev_copen,
+	.d_close = nb_zfsdev_cclose,
+	.d_read = nb_zvol_read,
+	.d_write = nb_zvol_write,
+	.d_ioctl = nb_zfsdev_ioctl,
+	.d_stop = nostop,
+	.d_tty = notty,
+	.d_poll = nopoll,
+	.d_mmap = nommap,
+	.d_kqfilter = nokqfilter,
+	.d_flag = D_DISK | D_MPSAFE
+};
+
+/*
+ * ZFS should only be used on systems with at least ZFS_MIN_MEGS
+ * megabytes of memory.
+ */
+#define ZFS_MIN_MEGS 512
+
+static int zfs_version_ioctl = ZFS_IOCVER_CURRENT;
+static int zfs_version_spa = SPA_VERSION;
+static struct sysctllog *zfs_sysctl_log;
+
+static void
+zfs_sysctl_init(void)
+{
+	const struct sysctlnode *rnode;
+
+	sysctl_createv(&zfs_sysctl_log, 0, NULL, &rnode,
+		       CTLFLAG_PERMANENT,
+		       CTLTYPE_NODE, "zfs",
+		       SYSCTL_DESCR("zfs"),
+		       NULL, 0, NULL, 0,
+		       CTL_VFS, CTL_CREATE, CTL_EOL);
+
+	sysctl_createv(&zfs_sysctl_log, 0, &rnode, &rnode,
+		       CTLFLAG_PERMANENT,
+		       CTLTYPE_NODE, "version",
+		       SYSCTL_DESCR("version"),
+		       NULL, 0, NULL, 0,
+		       CTL_CREATE, CTL_EOL);
+
+	sysctl_createv(&zfs_sysctl_log, 0, &rnode, NULL,
+		       CTLFLAG_PERMANENT|CTLFLAG_READONLY,
+		       CTLTYPE_INT, "ioctl",
+		       SYSCTL_DESCR("ZFS ioctl version"),
+		       NULL, 0, &zfs_version_ioctl, 0,
+		       CTL_CREATE, CTL_EOL);
+
+	sysctl_createv(&zfs_sysctl_log, 0, &rnode, NULL,
+		       CTLFLAG_PERMANENT|CTLFLAG_READONLY,
+		       CTLTYPE_INT, "spa",
+		       SYSCTL_DESCR("ZFS SPA version"),
+		       NULL, 0, &zfs_version_spa, 0,
+		       CTL_CREATE, CTL_EOL);
+}
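+
+/*
+ * With the nodes above in place, the versions surface to user space
+ * as vfs.zfs.version.ioctl and vfs.zfs.version.spa (illustrative; the
+ * names follow from the CTL_VFS/"zfs"/"version" hierarchy created
+ * above), e.g.:
+ *
+ *	$ sysctl vfs.zfs.version.spa
+ */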
+
+static void
+zfs_sysctl_fini(void)
+{
+
+	sysctl_teardown(&zfs_sysctl_log);
+}
+
+static void
+zfs_loadvnode_destroy(void *arg)
+{
+
+	if (arg != NULL)
+		panic("thread exiting with TSD loadvnode data %p", arg);
+}
+
+static int
+zfs_modcmd(modcmd_t cmd, void *arg)
+{
+	int error;
+	int active, inactive;
+	uint64_t availrmem;
+
+	extern struct vfsops zfs_vfsops;
+	extern uint_t zfs_loadvnode_key;
+	extern uint_t zfs_putpage_key;
+
+	switch (cmd) {
+	case MODULE_CMD_INIT:
+		if (!rootvnode)
+			return EAGAIN;
+
+		/* XXXNETBSD trim is not supported yet */
+		zfs_trim_enabled = B_FALSE;
+
+		printf("WARNING: ZFS on NetBSD is under development\n");
+		availrmem = (uint64_t)physmem * PAGE_SIZE / 1048576;
+		if (availrmem < ZFS_MIN_MEGS * 80 / 100) {
+			printf("ERROR: at least %dMB of memory required to "
+			    "use ZFS\n", ZFS_MIN_MEGS);
+			return ENOMEM;
+		}
+		mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
+		mutex_init(&zfs_debug_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+		tsd_create(&zfs_fsyncer_key, NULL);
+		tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
+		tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
+		tsd_create(&zfs_loadvnode_key, zfs_loadvnode_destroy);
+		tsd_create(&zfs_putpage_key, NULL);
+
+		spa_init(FREAD | FWRITE);
+		zfs_init();
+		zvol_init();
+		zfs_ioctl_init();
+		zfs_sysctl_init();
+
+		error = devsw_attach("zfs", &zfs_bdevsw, &zfs_bmajor,
+		    &zfs_cdevsw, &zfs_cmajor);
+		if (error != 0) {
+			goto attacherr;
+		}
+		(void) vfs_attach(&zfs_vfsops);
+		return error;
+
+	case MODULE_CMD_FINI:
+		if (spa_busy() || zfs_busy() || zvol_busy() ||
+		    zio_injection_enabled)
+			return EBUSY;
+
+		error = vfs_detach(&zfs_vfsops);
+		if (error)
+			return error;
+
+		(void) devsw_detach(&zfs_bdevsw, &zfs_cdevsw);
+
+attacherr:
+		zfs_sysctl_fini();
+		zvol_fini();
+		zfs_fini();
+		spa_fini();
+
+		tsd_destroy(&zfs_putpage_key);
+		tsd_destroy(&zfs_loadvnode_key);
+		tsd_destroy(&zfs_fsyncer_key);
+		tsd_destroy(&rrw_tsd_key);
+		tsd_destroy(&zfs_allow_log_key);
+
+		mutex_destroy(&zfs_debug_mtx);
+		mutex_destroy(&zfs_share_lock);
+
+		return error;
+
+	case MODULE_CMD_AUTOUNLOAD:
+		/*
+		 * We don't want to be autounloaded because unlike
+		 * other subsystems, we read our own configuration
+		 * from disk and provide things that might be used
+		 * later (zvols).
+		 */
+		return EBUSY;
+
+	default:
+		return ENOTTY;
+	}
+}
+
+#endif /* __NetBSD__ */
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_log.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c	27 Feb 2010 22:31:25 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c	3 Dec 2016 17:03:49 -0000
@@ -19,8 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/types.h>
@@ -29,7 +30,6 @@
 #include <sys/sysmacros.h>
 #include <sys/cmn_err.h>
 #include <sys/kmem.h>
-#include <sys/thread.h>
 #include <sys/file.h>
 #include <sys/vfs.h>
 #include <sys/zfs_znode.h>
@@ -39,12 +39,10 @@
 #include <sys/byteorder.h>
 #include <sys/policy.h>
 #include <sys/stat.h>
-#include <sys/mode.h>
 #include <sys/acl.h>
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/zfs_fuid.h>
-#include <sys/ddi.h>
 #include <sys/dsl_dataset.h>
 
 /*
@@ -75,7 +73,11 @@ zfs_log_create_txtype(zil_create_t type,
 		if (vsecp == NULL && !isxvattr)
 			return (TX_CREATE);
 		if (vsecp && isxvattr)
+#ifdef TODO
 			return (TX_CREATE_ACL_ATTR);
+#else
+			panic("%s:%u: unsupported condition", __func__, __LINE__);
+#endif
 		if (vsecp)
 			return (TX_CREATE_ACL);
 		else
@@ -85,7 +87,11 @@ zfs_log_create_txtype(zil_create_t type,
 		if (vsecp == NULL && !isxvattr)
 			return (TX_MKDIR);
 		if (vsecp && isxvattr)
+#ifdef TODO
 			return (TX_MKDIR_ACL_ATTR);
+#else
+			panic("%s:%u: unsupported condition", __func__, __LINE__);
+#endif
 		if (vsecp)
 			return (TX_MKDIR_ACL);
 		else
@@ -170,6 +176,12 @@ zfs_log_xvattr(lr_attr_t *lrattr, xvattr
 	if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
 		*attrs |= (xoap->xoa_reparse == 0) ? 0 :
 		    XAT0_REPARSE;
+	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+		*attrs |= (xoap->xoa_offline == 0) ? 0 :
+		    XAT0_OFFLINE;
+	if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+		*attrs |= (xoap->xoa_sparse == 0) ? 0 :
+		    XAT0_SPARSE;
 }
 
 static void *
@@ -206,9 +218,8 @@ zfs_log_fuid_domains(zfs_fuid_info_t *fu
 }
 
 /*
- * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR,
- * TX_MKDIR_ATTR and TX_MKXATTR
- * transactions.
+ * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and
+ * TX_MKXATTR transactions.
  *
  * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
  * domain information appended prior to the name.  In this case the
@@ -231,10 +242,9 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t 
     zfs_fuid_info_t *fuidp, vattr_t *vap)
 {
 	itx_t *itx;
-	uint64_t seq;
 	lr_create_t *lr;
 	lr_acl_create_t *lracl;
-	size_t aclsize;
+	size_t aclsize = (vsecp != NULL) ? vsecp->vsa_aclentsz : 0;
 	size_t xvatsize = 0;
 	size_t txsize;
 	xvattr_t *xvap = (xvattr_t *)vap;
@@ -264,7 +274,6 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t 
 		txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
 		lrsize = sizeof (*lr);
 	} else {
-		aclsize = (vsecp) ? vsecp->vsa_aclentsz : 0;
 		txsize =
 		    sizeof (lr_acl_create_t) + namesize + fuidsz +
 		    ZIL_ACE_LENGTH(aclsize) + xvatsize;
@@ -276,21 +285,25 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t 
 	lr = (lr_create_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
 	lr->lr_foid = zp->z_id;
-	lr->lr_mode = zp->z_phys->zp_mode;
-	if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) {
-		lr->lr_uid = (uint64_t)zp->z_phys->zp_uid;
+	lr->lr_mode = zp->z_mode;
+	if (!IS_EPHEMERAL(zp->z_uid)) {
+		lr->lr_uid = (uint64_t)zp->z_uid;
 	} else {
 		lr->lr_uid = fuidp->z_fuid_owner;
 	}
-	if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) {
-		lr->lr_gid = (uint64_t)zp->z_phys->zp_gid;
+	if (!IS_EPHEMERAL(zp->z_gid)) {
+		lr->lr_gid = (uint64_t)zp->z_gid;
 	} else {
 		lr->lr_gid = fuidp->z_fuid_group;
 	}
-	lr->lr_gen = zp->z_phys->zp_gen;
-	lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
-	lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
-	lr->lr_rdev = zp->z_phys->zp_rdev;
+	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
+	    sizeof (uint64_t));
+	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+	    lr->lr_crtime, sizeof (uint64_t) * 2);
+
+	if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev,
+	    sizeof (lr->lr_rdev)) != 0)
+		lr->lr_rdev = 0;
 
 	/*
 	 * Fill in xvattr info if any
@@ -329,20 +342,17 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t 
 	 */
 	bcopy(name, end, namesize);
 
-	seq = zil_itx_assign(zilog, itx, tx);
-	dzp->z_last_itx = seq;
-	zp->z_last_itx = seq;
+	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
- * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions.
+ * Handles both TX_REMOVE and TX_RMDIR transactions.
  */
 void
 zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-	znode_t *dzp, char *name)
+    znode_t *dzp, char *name, uint64_t foid)
 {
 	itx_t *itx;
-	uint64_t seq;
 	lr_remove_t *lr;
 	size_t namesize = strlen(name) + 1;
 
@@ -354,19 +364,19 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t 
 	lr->lr_doid = dzp->z_id;
 	bcopy(name, (char *)(lr + 1), namesize);
 
-	seq = zil_itx_assign(zilog, itx, tx);
-	dzp->z_last_itx = seq;
+	itx->itx_oid = foid;
+
+	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
- * zfs_log_link() handles TX_LINK transactions.
+ * Handles TX_LINK transactions.
  */
 void
 zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-	znode_t *dzp, znode_t *zp, char *name)
+    znode_t *dzp, znode_t *zp, char *name)
 {
 	itx_t *itx;
-	uint64_t seq;
 	lr_link_t *lr;
 	size_t namesize = strlen(name) + 1;
 
@@ -379,20 +389,17 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *t
 	lr->lr_link_obj = zp->z_id;
 	bcopy(name, (char *)(lr + 1), namesize);
 
-	seq = zil_itx_assign(zilog, itx, tx);
-	dzp->z_last_itx = seq;
-	zp->z_last_itx = seq;
+	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
- * zfs_log_symlink() handles TX_SYMLINK transactions.
+ * Handles TX_SYMLINK transactions.
  */
 void
 zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, znode_t *zp, char *name, char *link)
 {
 	itx_t *itx;
-	uint64_t seq;
 	lr_create_t *lr;
 	size_t namesize = strlen(name) + 1;
 	size_t linksize = strlen(link) + 1;
@@ -404,29 +411,27 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t
 	lr = (lr_create_t *)&itx->itx_lr;
 	lr->lr_doid = dzp->z_id;
 	lr->lr_foid = zp->z_id;
-	lr->lr_mode = zp->z_phys->zp_mode;
-	lr->lr_uid = zp->z_phys->zp_uid;
-	lr->lr_gid = zp->z_phys->zp_gid;
-	lr->lr_gen = zp->z_phys->zp_gen;
-	lr->lr_crtime[0] = zp->z_phys->zp_crtime[0];
-	lr->lr_crtime[1] = zp->z_phys->zp_crtime[1];
+	lr->lr_uid = zp->z_uid;
+	lr->lr_gid = zp->z_gid;
+	lr->lr_mode = zp->z_mode;
+	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
+	    sizeof (uint64_t));
+	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+	    lr->lr_crtime, sizeof (uint64_t) * 2);
 	bcopy(name, (char *)(lr + 1), namesize);
 	bcopy(link, (char *)(lr + 1) + namesize, linksize);
 
-	seq = zil_itx_assign(zilog, itx, tx);
-	dzp->z_last_itx = seq;
-	zp->z_last_itx = seq;
+	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
- * zfs_log_rename() handles TX_RENAME transactions.
+ * Handles TX_RENAME transactions.
  */
 void
 zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-	znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
+    znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
 {
 	itx_t *itx;
-	uint64_t seq;
 	lr_rename_t *lr;
 	size_t snamesize = strlen(sname) + 1;
 	size_t dnamesize = strlen(dname) + 1;
@@ -440,36 +445,36 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t 
 	lr->lr_tdoid = tdzp->z_id;
 	bcopy(sname, (char *)(lr + 1), snamesize);
 	bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
+	itx->itx_oid = szp->z_id;
 
-	seq = zil_itx_assign(zilog, itx, tx);
-	sdzp->z_last_itx = seq;
-	tdzp->z_last_itx = seq;
-	szp->z_last_itx = seq;
+	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
- * zfs_log_write() handles TX_WRITE transactions.
+ * Handles TX_WRITE transactions.
  */
 ssize_t zfs_immediate_write_sz = 32768;
+#ifdef _KERNEL
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_LONG(_vfs_zfs, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN,
+    &zfs_immediate_write_sz, 0, "Minimal size for indirect log write");
+#endif
 
 void
 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-	znode_t *zp, offset_t off, ssize_t resid, int ioflag)
+    znode_t *zp, offset_t off, ssize_t resid, int ioflag)
 {
+	uint32_t blocksize = zp->z_blksz;
 	itx_wr_state_t write_state;
-	boolean_t slogging;
 	uintptr_t fsync_cnt;
-	ssize_t immediate_write_sz;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
-	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
-	    ? 0 : zfs_immediate_write_sz;
-
-	slogging = spa_has_slogs(zilog->zl_spa) &&
-	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
-	if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
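+	/*
+	 * Pick the initial write state: indirect when biased for
+	 * throughput, or for large writes when no separate log device
+	 * is present; copied for synchronous writes; otherwise defer
+	 * the copy decision to commit time.
+	 */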
+	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+		write_state = WR_INDIRECT;
+	else if (!spa_has_slogs(zilog->zl_spa) &&
+	    resid >= zfs_immediate_write_sz)
 		write_state = WR_INDIRECT;
 	else if (ioflag & (FSYNC | FDSYNC))
 		write_state = WR_COPIED;
@@ -483,30 +488,26 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *
 	while (resid) {
 		itx_t *itx;
 		lr_write_t *lr;
-		ssize_t len;
+		itx_wr_state_t wr_state = write_state;
+		ssize_t len = resid;
 
-		/*
-		 * If the write would overflow the largest block then split it.
-		 */
-		if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
-			len = SPA_MAXBLOCKSIZE >> 1;
-		else
-			len = resid;
+		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+			wr_state = WR_NEED_COPY;
+		else if (wr_state == WR_INDIRECT)
+			len = MIN(blocksize - P2PHASE(off, blocksize), resid);
 
 		itx = zil_itx_create(txtype, sizeof (*lr) +
-		    (write_state == WR_COPIED ? len : 0));
+		    (wr_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
-		if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
+		if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
 		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(txtype, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
-			write_state = WR_NEED_COPY;
+			wr_state = WR_NEED_COPY;
 		}
 
-		itx->itx_wr_state = write_state;
-		if (write_state == WR_NEED_COPY)
-			itx->itx_sod += len;
+		itx->itx_wr_state = wr_state;
 		lr->lr_foid = zp->z_id;
 		lr->lr_offset = off;
 		lr->lr_length = len;
@@ -515,13 +516,11 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *
 
 		itx->itx_private = zp->z_zfsvfs;
 
-		if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) ||
-		    (ioflag & (FSYNC | FDSYNC)))
-			itx->itx_sync = B_TRUE;
-		else
+		if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) &&
+		    (fsync_cnt == 0))
 			itx->itx_sync = B_FALSE;
 
-		zp->z_last_itx = zil_itx_assign(zilog, itx, tx);
+		zil_itx_assign(zilog, itx, tx);
 
 		off += len;
 		resid -= len;
@@ -529,14 +528,13 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *
 }
 
 /*
- * zfs_log_truncate() handles TX_TRUNCATE transactions.
+ * Handles TX_TRUNCATE transactions.
  */
 void
 zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-	znode_t *zp, uint64_t off, uint64_t len)
+    znode_t *zp, uint64_t off, uint64_t len)
 {
 	itx_t *itx;
-	uint64_t seq;
 	lr_truncate_t *lr;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked)
@@ -549,19 +547,17 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_
 	lr->lr_length = len;
 
 	itx->itx_sync = (zp->z_sync_cnt != 0);
-	seq = zil_itx_assign(zilog, itx, tx);
-	zp->z_last_itx = seq;
+	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
- * zfs_log_setattr() handles TX_SETATTR transactions.
+ * Handles TX_SETATTR transactions.
  */
 void
 zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-	znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
+    znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
 {
 	itx_t		*itx;
-	uint64_t	seq;
 	lr_setattr_t	*lr;
 	xvattr_t	*xvap = (xvattr_t *)vap;
 	size_t		recsize = sizeof (lr_setattr_t);
@@ -613,19 +609,17 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t
 		(void) zfs_log_fuid_domains(fuidp, start);
 
 	itx->itx_sync = (zp->z_sync_cnt != 0);
-	seq = zil_itx_assign(zilog, itx, tx);
-	zp->z_last_itx = seq;
+	zil_itx_assign(zilog, itx, tx);
 }
 
 /*
- * zfs_log_acl() handles TX_ACL transactions.
+ * Handles TX_ACL transactions.
  */
 void
 zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
     vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
 {
 	itx_t *itx;
-	uint64_t seq;
 	lr_acl_v0_t *lrv0;
 	lr_acl_t *lr;
 	int txtype;
@@ -681,6 +675,5 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx
 	}
 
 	itx->itx_sync = (zp->z_sync_cnt != 0);
-	seq = zil_itx_assign(zilog, itx, tx);
-	zp->z_last_itx = seq;
+	zil_itx_assign(zilog, itx, tx);
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_onexit.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_onexit.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_onexit.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_onexit.c	5 May 2017 22:07:28 -0000
@@ -0,0 +1,265 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/sunddi.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+
+/*
+ * ZFS kernel routines may add/delete callback routines to be invoked
+ * upon process exit (triggered via the close operation from the /dev/zfs
+ * driver).
+ *
+ * These cleanup callbacks are intended to allow for the accumulation
+ * of kernel state across multiple ioctls.  User processes participate
+ * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a
+ * clone-open, generating a unique minor number. The process then passes
+ * along that file descriptor to each ioctl that might have a cleanup operation.
+ *
+ * Consumers of the onexit routines should call zfs_onexit_fd_hold() early
+ * on to validate the given fd and add a reference to its file table entry.
+ * This allows the consumer to do its work and then add a callback, knowing
+ * that zfs_onexit_add_cb() won't fail with EBADF.  When finished, consumers
+ * should call zfs_onexit_fd_rele().
+ *
+ * A simple example is zfs_ioc_recv(), where we might create an AVL tree
+ * with dataset/GUID mappings and then reuse that tree on subsequent
+ * zfs_ioc_recv() calls.
+ *
+ * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc()
+ * the AVL tree and pass it along with a callback function to
+ * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the
+ * callback and return an action handle.
+ *
+ * The action handle is then passed from user space to subsequent
+ * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree
+ * by calling zfs_onexit_cb_data() with the device minor number and
+ * action handle.
+ *
+ * If the user process exits abnormally, the callback is invoked implicitly
+ * as part of the driver close operation.  Once the user space process is
+ * finished with the accumulated kernel state, it can also just call close(2)
+ * on the cleanup fd to trigger the cleanup callback.
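+ *
+ * A minimal consumer sketch (illustrative; my_cleanup_func and
+ * my_state are hypothetical):
+ *
+ *	minor_t minor;
+ *	uint64_t action_handle;
+ *
+ *	error = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ *	if (error != 0)
+ *		return (error);
+ *	error = zfs_onexit_add_cb(minor, my_cleanup_func, my_state,
+ *	    &action_handle);
+ *	zfs_onexit_fd_rele(cleanup_fd);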
+ */
+
+void
+zfs_onexit_init(zfs_onexit_t **zop)
+{
+	zfs_onexit_t *zo;
+
+	zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP);
+	mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t),
+	    offsetof(zfs_onexit_action_node_t, za_link));
+}
+
+void
+zfs_onexit_destroy(zfs_onexit_t *zo)
+{
+	zfs_onexit_action_node_t *ap;
+
+	mutex_enter(&zo->zo_lock);
+	while ((ap = list_head(&zo->zo_actions)) != NULL) {
+		list_remove(&zo->zo_actions, ap);
+		mutex_exit(&zo->zo_lock);
+		ap->za_func(ap->za_data);
+		kmem_free(ap, sizeof (zfs_onexit_action_node_t));
+		mutex_enter(&zo->zo_lock);
+	}
+	mutex_exit(&zo->zo_lock);
+
+	list_destroy(&zo->zo_actions);
+	mutex_destroy(&zo->zo_lock);
+	kmem_free(zo, sizeof (zfs_onexit_t));
+}
+
+static int
+zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
+{
+	*zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV);
+	if (*zo == NULL)
+		return (SET_ERROR(EBADF));
+
+	return (0);
+}
+
+/*
+ * Consumers might need to operate by minor number instead of fd, since
+ * they might be running in another thread (e.g. txg_sync_thread). Callers
+ * of this function must call zfs_onexit_fd_rele() when they're finished
+ * using the minor number.
+ */
+int
+zfs_onexit_fd_hold(int fd, minor_t *minorp)
+{
+	file_t *fp;
+	zfs_onexit_t *zo;
+
+#ifdef __FreeBSD__
+	file_t *tmpfp;
+	cap_rights_t rights;
+	void *data;
+	int error;
+
+	fp = getf(fd, cap_rights_init(&rights));
+	if (fp == NULL)
+		return (SET_ERROR(EBADF));
+
+	tmpfp = curthread->td_fpop;
+	curthread->td_fpop = fp;
+	error = devfs_get_cdevpriv(&data);
+	if (error == 0)
+		*minorp = (minor_t)(uintptr_t)data;
+	curthread->td_fpop = tmpfp;
+	if (error != 0)
+		return (SET_ERROR(EBADF));
+#else
+	fp = getf(fd);
+	if (fp == NULL)
+		return (SET_ERROR(EBADF));
+
+	*minorp = getminor(fp->f_vnode->v_rdev);
+#endif
+
+	return (zfs_onexit_minor_to_state(*minorp, &zo));
+}
+
+void
+zfs_onexit_fd_rele(int fd)
+{
+	releasef(fd);
+}
+
+/*
+ * Add a callback to be invoked when the calling process exits.
+ */
+int
+zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+    uint64_t *action_handle)
+{
+	zfs_onexit_t *zo;
+	zfs_onexit_action_node_t *ap;
+	int error;
+
+	error = zfs_onexit_minor_to_state(minor, &zo);
+	if (error)
+		return (error);
+
+	ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP);
+	list_link_init(&ap->za_link);
+	ap->za_func = func;
+	ap->za_data = data;
+
+	mutex_enter(&zo->zo_lock);
+	list_insert_tail(&zo->zo_actions, ap);
+	mutex_exit(&zo->zo_lock);
+	if (action_handle)
+		*action_handle = (uint64_t)(uintptr_t)ap;
+
+	return (0);
+}
+
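+/*
+ * The action handle returned by zfs_onexit_add_cb() is just the node
+ * pointer, so validate it by searching the action list instead of
+ * blindly dereferencing a caller-supplied value.
+ */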
+static zfs_onexit_action_node_t *
+zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle)
+{
+	zfs_onexit_action_node_t *match;
+	zfs_onexit_action_node_t *ap;
+	list_t *l;
+
+	ASSERT(MUTEX_HELD(&zo->zo_lock));
+
+	match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle;
+	l = &zo->zo_actions;
+	for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) {
+		if (match == ap)
+			break;
+	}
+	return (ap);
+}
+
+/*
+ * Delete the callback, triggering it first if 'fire' is set.
+ */
+int
+zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
+{
+	zfs_onexit_t *zo;
+	zfs_onexit_action_node_t *ap;
+	int error;
+
+	error = zfs_onexit_minor_to_state(minor, &zo);
+	if (error)
+		return (error);
+
+	mutex_enter(&zo->zo_lock);
+	ap = zfs_onexit_find_cb(zo, action_handle);
+	if (ap != NULL) {
+		list_remove(&zo->zo_actions, ap);
+		mutex_exit(&zo->zo_lock);
+		if (fire)
+			ap->za_func(ap->za_data);
+		kmem_free(ap, sizeof (zfs_onexit_action_node_t));
+	} else {
+		mutex_exit(&zo->zo_lock);
+		error = SET_ERROR(ENOENT);
+	}
+
+	return (error);
+}
+
+/*
+ * Return the data associated with this callback.  This allows consumers
+ * of the cleanup-on-exit interfaces to stash kernel data across system
+ * calls, knowing that it will be cleaned up if the calling process exits.
+ */
+int
+zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
+{
+	zfs_onexit_t *zo;
+	zfs_onexit_action_node_t *ap;
+	int error;
+
+	*data = NULL;
+
+	error = zfs_onexit_minor_to_state(minor, &zo);
+	if (error)
+		return (error);
+
+	mutex_enter(&zo->zo_lock);
+	ap = zfs_onexit_find_cb(zo, action_handle);
+	if (ap != NULL)
+		*data = ap->za_data;
+	else
+		error = SET_ERROR(ENOENT);
+	mutex_exit(&zo->zo_lock);
+
+	return (error);
+}
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c,v
retrieving revision 1.10
diff -u -p -r1.10 zfs_replay.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c	7 Feb 2014 15:29:20 -0000	1.10
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c	12 May 2017 21:45:09 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -54,9 +54,9 @@
 
 static void
 zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
-	uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
+    uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
 {
-	vattr_null(vap);
+	VATTR_NULL(vap);
 	vap->va_mask = (uint_t)mask;
 	if (mask & AT_TYPE)
 		vap->va_type = IFTOVT(mode);
@@ -74,7 +74,7 @@ zfs_init_vattr(vattr_t *vap, uint64_t ma
 static int
 zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap)
 {
-	return (ENOTSUP);
+	return (SET_ERROR(ENOTSUP));
 }
 
 static void
@@ -132,6 +132,10 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xva
 		bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
 	if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
 		xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0);
+	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+		xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0);
+	if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+		xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0);
 }
 
 static int
@@ -321,7 +325,6 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs,
 
 	if (lr->lr_common.lrc_txtype & TX_CI)
 		vflg |= FIGNORECASE;
-	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
 	switch (txtype) {
 	case TX_CREATE_ACL:
 		aclstart = (caddr_t)(lracl + 1);
@@ -394,9 +397,8 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs,
 #endif
 		break;
 	default:
-		error = ENOTSUP;
+		error = SET_ERROR(ENOTSUP);
 	}
-	VOP_UNLOCK(ZTOV(dzp));
 
 bail:
 	if (error == 0 && vp != NULL)
@@ -476,7 +478,10 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_c
 	}
 
 	cn.cn_cred = kcred;
-	cn.cn_flags = 0;
+#ifndef __NetBSD__
+	cn.cn_thread = curthread;
+	cn.cn_flags = SAVENAME;
+#endif
 
 	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
 	switch (txtype) {
@@ -526,13 +531,17 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_c
 		error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &xva.xva_vattr, link /*,vflg*/);
 		break;
 	default:
-		error = ENOTSUP;
+		error = SET_ERROR(ENOTSUP);
 	}
-	VOP_UNLOCK(ZTOV(dzp));
+	VOP_UNLOCK(ZTOV(dzp), 0);
 
 out:
 	if (error == 0 && vp != NULL)
+#ifdef __NetBSD__
 		VN_RELE(vp);
+#else
+		VN_URELE(vp);
+#endif
 
 	VN_RELE(ZTOV(dzp));
 
@@ -563,21 +572,21 @@ zfs_replay_remove(zfsvfs_t *zfsvfs, lr_r
 	cn.cn_nameptr = name;
 	cn.cn_namelen = strlen(name);
 	cn.cn_nameiop = DELETE;
-	cn.cn_flags = ISLASTCN;
-	//cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
+	cn.cn_flags = ISLASTCN | SAVENAME;
 	cn.cn_cred = kcred;
+#ifndef __NetBSD__
+	cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
+	cn.cn_thread = curthread;
+#endif
 	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn);
 	if (error != 0) {
-		VOP_UNLOCK(ZTOV(dzp));
-		goto fail;
-	}
-	error = vn_lock(vp, LK_EXCLUSIVE);
-	if (error != 0) {
-		VOP_UNLOCK(ZTOV(dzp));
-		vrele(vp);
+		VOP_UNLOCK(ZTOV(dzp), 0);
 		goto fail;
 	}
+#ifdef __NetBSD__
+	VOP_UNLOCK(vp, 0);
+#endif
 
 	switch ((int)lr->lr_common.lrc_txtype) {
 	case TX_REMOVE:
@@ -587,10 +596,15 @@ zfs_replay_remove(zfsvfs_t *zfsvfs, lr_r
 		error = VOP_RMDIR(ZTOV(dzp), vp, &cn /*,vflg*/);
 		break;
 	default:
-		error = ENOTSUP;
+		error = SET_ERROR(ENOTSUP);
 	}
+#ifdef __NetBSD__
+	vrele(vp);
+#else
 	vput(vp);
-	VOP_UNLOCK(ZTOV(dzp));
+#endif
+	VOP_UNLOCK(ZTOV(dzp), 0);
+
 fail:
 	VN_RELE(ZTOV(dzp));
 
@@ -619,15 +633,23 @@ zfs_replay_link(zfsvfs_t *zfsvfs, lr_lin
 
 	if (lr->lr_common.lrc_txtype & TX_CI)
 		vflg |= FIGNORECASE;
+
 	cn.cn_nameptr = name;
 	cn.cn_cred = kcred;
-	cn.cn_flags = 0;
+#ifndef __NetBSD__
+	cn.cn_thread = curthread;
+	cn.cn_flags = SAVENAME;
+#endif
 
 	vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY);
+#ifndef __NetBSD__
 	vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
+#endif
 	error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn /*,vflg*/);
-	VOP_UNLOCK(ZTOV(zp));
-	VOP_UNLOCK(ZTOV(dzp));
+#ifndef __NetBSD__
+	VOP_UNLOCK(ZTOV(zp), 0);
+#endif
+	VOP_UNLOCK(ZTOV(dzp), 0);
 
 	VN_RELE(ZTOV(zp));
 	VN_RELE(ZTOV(dzp));
@@ -665,36 +687,39 @@ zfs_replay_rename(zfsvfs_t *zfsvfs, lr_r
 	scn.cn_nameptr = sname;
 	scn.cn_namelen = strlen(sname);
 	scn.cn_nameiop = DELETE;
-	scn.cn_flags = ISLASTCN;
-//	scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
+	scn.cn_flags = ISLASTCN | SAVENAME;
 	scn.cn_cred = kcred;
+#ifndef __NetBSD__
+	scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
+	scn.cn_thread = td;
+#endif
 	vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn);
-	VOP_UNLOCK(ZTOV(sdzp));
+	VOP_UNLOCK(ZTOV(sdzp), 0);
 	if (error != 0)
 		goto fail;
+	VOP_UNLOCK(svp, 0);
 
 	tcn.cn_nameptr = tname;
 	tcn.cn_namelen = strlen(tname);
 	tcn.cn_nameiop = RENAME;
-	tcn.cn_flags = ISLASTCN;
-//	tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
+	tcn.cn_flags = ISLASTCN | SAVENAME;
 	tcn.cn_cred = kcred;
+#ifndef __NetBSD__
+	tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
+	tcn.cn_thread = td;
+#endif
 	vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn);
 	if (error == EJUSTRETURN)
 		tvp = NULL;
 	else if (error != 0) {
-		VOP_UNLOCK(ZTOV(tdzp));
+		VOP_UNLOCK(ZTOV(tdzp), 0);
 		goto fail;
-	} else {
-		error = vn_lock(tvp, LK_EXCLUSIVE);
-		if (error != 0) {
-			VOP_UNLOCK(ZTOV(tdzp));
-			vrele(tvp);
-			goto fail;
-		}
 	}
+#ifdef __NetBSD__
+	if (tvp != NULL)
+		vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
+#endif
 
 	error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn /*,vflg*/);
 	return (error);
@@ -716,7 +741,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_wr
 	znode_t	*zp;
 	int error;
 	ssize_t resid;
-	uint64_t orig_eof, eod, offset, length;
+	uint64_t eod, offset, length;
 
 	if (byteswap)
 		byteswap_uint64_array(lr, sizeof (*lr));
@@ -734,9 +759,20 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_wr
 
 	offset = lr->lr_offset;
 	length = lr->lr_length;
-	eod = offset + length;		/* end of data for this write */
+	eod = offset + length;	/* end of data for this write */
+
+	/*
+	 * This may be a write from a dmu_sync() for a whole block,
+	 * and may extend beyond the current end of the file.
+	 * We can't just replay what was written for this TX_WRITE as
+	 * a future TX_WRITE2 may extend the eof and the data for that
+	 * write needs to be there. So we write the whole block and
+	 * reduce the eof. This needs to be done within the single dmu
+	 * transaction created within vn_rdwr -> zfs_write. So a possible
+	 * new end of file is passed through in zfsvfs->z_replay_eof.
+	 */
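+	/*
+	 * For illustration: a dmu_sync() record for a 128K block with
+	 * lr_offset = 96K and lr_length = 4K has eod = 100K.  The whole
+	 * [0, 128K) block is rewritten, which on its own would leave
+	 * z_size at 128K; passing z_replay_eof = 100K lets zfs_write()
+	 * set the size to the real end of data in the same transaction.
+	 */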
 
-	orig_eof = zp->z_phys->zp_size;
+	zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */
 
 	/* If it's a dmu_sync() block, write the whole block */
 	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
@@ -745,23 +781,15 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_wr
 			offset -= offset % blocksize;
 			length = blocksize;
 		}
+		if (zp->z_size < eod)
+			zfsvfs->z_replay_eof = eod;
 	}
 
 	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
 	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 
-	/*
-	 * This may be a write from a dmu_sync() for a whole block,
-	 * and may extend beyond the current end of the file.
-	 * We can't just replay what was written for this TX_WRITE as
-	 * a future TX_WRITE2 may extend the eof and the data for that
-	 * write needs to be there. So we write the whole block and
-	 * reduce the eof.
-	 */
-	if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */
-		zp->z_phys->zp_size = eod;
-
 	VN_RELE(ZTOV(zp));
+	zfsvfs->z_replay_eof = 0;	/* safety */
 
 	return (error);
 }
@@ -785,10 +813,31 @@ zfs_replay_write2(zfsvfs_t *zfsvfs, lr_w
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
 
+top:
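+	/*
+	 * Grow the recorded file size in its own tx.  On ERESTART, wait
+	 * for the next txg and retry from 'top'; any other assignment
+	 * error fails the replay of this record.
+	 */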
 	end = lr->lr_offset + lr->lr_length;
-	if (end > zp->z_phys->zp_size) {
-		ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz);
-		zp->z_phys->zp_size = end;
+	if (end > zp->z_size) {
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+		zp->z_size = end;
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			VN_RELE(ZTOV(zp));
+			if (error == ERESTART) {
+				dmu_tx_wait(tx);
+				dmu_tx_abort(tx);
+				goto top;
+			}
+			dmu_tx_abort(tx);
+			return (error);
+		}
+		(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+		    (void *)&zp->z_size, sizeof (uint64_t), tx);
+
+		/* Ensure the replayed seq is updated */
+		(void) zil_replaying(zfsvfs->z_log, tx);
+
+		dmu_tx_commit(tx);
 	}
 
 	VN_RELE(ZTOV(zp));
@@ -799,11 +848,14 @@ zfs_replay_write2(zfsvfs_t *zfsvfs, lr_w
 static int
 zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
 {
+#ifdef illumos
+	znode_t *zp;
+	flock64_t fl;
+	int error;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
 
-#ifdef __NetBSD__
-	ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
-	return (EOPNOTSUPP);
-#else
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
 
@@ -819,6 +871,9 @@ zfs_replay_truncate(zfsvfs_t *zfsvfs, lr
 	VN_RELE(ZTOV(zp));
 
 	return (error);
+#else
+	ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org");
+	return (EOPNOTSUPP);
 #endif
 }
 
@@ -869,7 +924,7 @@ zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_
 	vp = ZTOV(zp);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_SETATTR(vp, vap, kcred);
-	VOP_UNLOCK(vp);
+	VOP_UNLOCK(vp, 0);
 
 	zfs_fuid_info_free(zfsvfs->z_fuid_replay);
 	zfsvfs->z_fuid_replay = NULL;
@@ -878,11 +933,15 @@ zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_
 	return (error);
 }
 
+extern int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
+    caller_context_t *ct);
+
 static int
 zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap)
 {
 	ace_t *ace = (ace_t *)(lr + 1);	/* ace array follows lr_acl_t */
 	vsecattr_t vsa;
+	vnode_t *vp;
 	znode_t *zp;
 	int error;
 
@@ -901,13 +960,12 @@ zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_a
 	vsa.vsa_aclflags = 0;
 	vsa.vsa_aclentp = ace;
 
-#ifdef TODO
-	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
-#else
-	panic("%s:%u: unsupported condition", __func__, __LINE__);
-#endif
+	vp = ZTOV(zp);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL);
+	VOP_UNLOCK(vp, 0);
 
-	VN_RELE(ZTOV(zp));
+	VN_RELE(vp);
 
 	return (error);
 }
@@ -932,6 +990,7 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_
 	ace_t *ace = (ace_t *)(lr + 1);
 	vsecattr_t vsa;
 	znode_t *zp;
+	vnode_t *vp;
 	int error;
 
 	if (byteswap) {
@@ -947,7 +1006,6 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_
 	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 		return (error);
 
-#ifdef TODO
 	bzero(&vsa, sizeof (vsa));
 	vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
 	vsa.vsa_aclcnt = lr->lr_aclcnt;
@@ -964,16 +1022,16 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_
 		    lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
 	}
 
-	error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
+	vp = ZTOV(zp);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL);
+	VOP_UNLOCK(vp, 0);
 
 	if (zfsvfs->z_fuid_replay)
 		zfs_fuid_info_free(zfsvfs->z_fuid_replay);
-#else
-	error = EOPNOTSUPP;
-#endif
 
 	zfsvfs->z_fuid_replay = NULL;
-	VN_RELE(ZTOV(zp));
+	VN_RELE(vp);
 
 	return (error);
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c,v
retrieving revision 1.5
diff -u -p -r1.5 zfs_rlock.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c	29 Feb 2016 16:18:37 -0000	1.5
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c	19 Jun 2013 13:06:07 -0000
@@ -19,13 +19,16 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
 
 /*
  * This file contains the code to implement file range locking in
- * ZFS, although there isn't much specific to ZFS (all that comes to mind
+ * ZFS, although there isn't much specific to ZFS (all that comes to mind is
  * support for growing the blocksize).
  *
  * Interface
@@ -94,36 +97,6 @@
 
 #include <sys/zfs_rlock.h>
 
-static int
-zfs_range_lock_hold(rl_t *rl)
-{
-
-	KASSERT(rl->r_zp != NULL);
-	KASSERT(0 < rl->r_refcnt);
-	KASSERT(mutex_owned(&rl->r_zp->z_range_lock));
-
-	if (rl->r_refcnt >= ULONG_MAX)
-		return (ENFILE); /* XXX What to do?  */
-
-	rl->r_refcnt++;
-	return (0);
-}
-
-static void
-zfs_range_lock_rele(rl_t *rl)
-{
-
-	KASSERT(rl->r_zp != NULL);
-	KASSERT(0 < rl->r_refcnt);
-	KASSERT(mutex_owned(&rl->r_zp->z_range_lock));
-
-	if (--rl->r_refcnt == 0) {
-		cv_destroy(&rl->r_wr_cv);
-		cv_destroy(&rl->r_rd_cv);
-		kmem_free(rl, sizeof (rl_t));
-	}
-}
-
 /*
  * Check if a write lock can be grabbed, or wait and recheck until available.
  */
@@ -142,7 +115,7 @@ zfs_range_lock_writer(znode_t *zp, rl_t 
 		 * Range locking is also used by zvol and uses a
 		 * dummied up znode. However, for zvol, we don't need to
 		 * append or grow blocksize, and besides we don't have
-		 * a z_phys or z_zfsvfs - so skip that processing.
+		 * SA data or a z_zfsvfs - so skip that processing.
 		 *
 		 * Yes, this is ugly, and would be solved by not handling
 		 * grow or append in range lock code. If that was done then
@@ -155,14 +128,14 @@ zfs_range_lock_writer(znode_t *zp, rl_t 
 			 * This is done under z_range_lock to avoid races.
 			 */
 			if (new->r_type == RL_APPEND)
-				new->r_off = zp->z_phys->zp_size;
+				new->r_off = zp->z_size;
 
 			/*
 			 * If we need to grow the block size then grab the whole
 			 * file range. This is also done under z_range_lock to
 			 * avoid races.
 			 */
-			end_size = MAX(zp->z_phys->zp_size, new->r_off + len);
+			end_size = MAX(zp->z_size, new->r_off + len);
 			if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
 			    zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
 				new->r_off = 0;
@@ -187,12 +160,10 @@ zfs_range_lock_writer(znode_t *zp, rl_t 
 			goto wait; /* already locked at same offset */
 
 		rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
-		KASSERT(0 < rl->r_refcnt);
 		if (rl && (rl->r_off < new->r_off + new->r_len))
 			goto wait;
 
 		rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
-		KASSERT(0 < rl->r_refcnt);
 		if (rl && rl->r_off + rl->r_len > new->r_off)
 			goto wait;
 
@@ -201,12 +172,10 @@ zfs_range_lock_writer(znode_t *zp, rl_t 
 		return;
 wait:
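+		/*
+		 * Waiter condvars are created lazily, when the first
+		 * waiter shows up, and destroyed again once the lock is
+		 * removed and its waiters have been woken.
+		 */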
 		if (!rl->r_write_wanted) {
+			cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
 			rl->r_write_wanted = B_TRUE;
 		}
-		if (zfs_range_lock_hold(rl) != 0)
-			panic("too many waiters on zfs range lock %p", rl);
 		cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
-		zfs_range_lock_rele(rl);
 
 		/* reset to original */
 		new->r_off = off;
@@ -234,17 +203,13 @@ zfs_range_proxify(avl_tree_t *tree, rl_t
 
 	/* create a proxy range lock */
 	proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-	proxy->r_zp = rl->r_zp;
 	proxy->r_off = rl->r_off;
 	proxy->r_len = rl->r_len;
 	proxy->r_cnt = 1;
 	proxy->r_type = RL_READER;
 	proxy->r_proxy = B_TRUE;
-	cv_init(&proxy->r_wr_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&proxy->r_rd_cv, NULL, CV_DEFAULT, NULL);
 	proxy->r_write_wanted = B_FALSE;
 	proxy->r_read_wanted = B_FALSE;
-	proxy->r_refcnt = 1;
 	avl_add(tree, proxy);
 
 	return (proxy);
@@ -267,15 +232,11 @@ zfs_range_split(avl_tree_t *tree, rl_t *
 
 	/* create the rear proxy range lock */
 	rear = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-	rear->r_zp = rl->r_zp;
 	rear->r_off = off;
 	rear->r_len = rl->r_off + rl->r_len - off;
 	rear->r_cnt = rl->r_cnt;
 	rear->r_type = RL_READER;
 	rear->r_proxy = B_TRUE;
-	cv_init(&rear->r_wr_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&rear->r_rd_cv, NULL, CV_DEFAULT, NULL);
-	rear->r_refcnt = 1;
 	rear->r_write_wanted = B_FALSE;
 	rear->r_read_wanted = B_FALSE;
 
@@ -290,30 +251,25 @@ zfs_range_split(avl_tree_t *tree, rl_t *
  * Create and add a new proxy range lock for the supplied range.
  */
 static void
-zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len, znode_t *zp)
+zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
 {
 	rl_t *rl;
 
 	ASSERT(len);
 	rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
-	rl->r_zp = zp;
 	rl->r_off = off;
 	rl->r_len = len;
 	rl->r_cnt = 1;
 	rl->r_type = RL_READER;
 	rl->r_proxy = B_TRUE;
-	cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&rl->r_rd_cv, NULL, CV_DEFAULT, NULL);
 	rl->r_write_wanted = B_FALSE;
 	rl->r_read_wanted = B_FALSE;
-	rl->r_refcnt = 1;
 	avl_add(tree, rl);
 }
 
 static void
 zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
 {
-	znode_t *zp = new->r_zp;
 	rl_t *next;
 	uint64_t off = new->r_off;
 	uint64_t len = new->r_len;
@@ -350,10 +306,9 @@ zfs_range_add_reader(avl_tree_t *tree, r
 		return;
 	}
 
-	KASSERT(0 < next->r_refcnt);
 	if (off < next->r_off) {
 		/* Add a proxy for initial range before the overlap */
-		zfs_range_new_proxy(tree, off, next->r_off - off, zp);
+		zfs_range_new_proxy(tree, off, next->r_off - off);
 	}
 
 	new->r_cnt = 0; /* will use proxies in tree */
@@ -370,31 +325,28 @@ zfs_range_add_reader(avl_tree_t *tree, r
 			/* there's a gap */
 			ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
 			zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
-			    next->r_off - (prev->r_off + prev->r_len), zp);
+			    next->r_off - (prev->r_off + prev->r_len));
 		}
 		if (off + len == next->r_off + next->r_len) {
 			/* exact overlap with end */
 			next = zfs_range_proxify(tree, next);
-			KASSERT(0 < next->r_refcnt);
 			next->r_cnt++;
 			return;
 		}
 		if (off + len < next->r_off + next->r_len) {
 			/* new range ends in the middle of this block */
 			next = zfs_range_split(tree, next, off + len);
-			KASSERT(0 < next->r_refcnt);
 			next->r_cnt++;
 			return;
 		}
 		ASSERT3U(off + len, >, next->r_off + next->r_len);
 		next = zfs_range_proxify(tree, next);
-		KASSERT(0 < next->r_refcnt);
 		next->r_cnt++;
 	}
 
 	/* Add the remaining end range. */
 	zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
-	    (off + len) - (prev->r_off + prev->r_len), zp);
+	    (off + len) - (prev->r_off + prev->r_len));
 }
 
 /*
@@ -423,13 +375,10 @@ retry:
 	if (prev && (off < prev->r_off + prev->r_len)) {
 		if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
 			if (!prev->r_read_wanted) {
+				cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
 				prev->r_read_wanted = B_TRUE;
 			}
-			if (zfs_range_lock_hold(prev) != 0)
-				panic("too many waiters on zfs range lock %p",
-				    prev);
 			cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
-			zfs_range_lock_rele(prev);
 			goto retry;
 		}
 		if (off + len < prev->r_off + prev->r_len)
@@ -449,13 +398,10 @@ retry:
 			goto got_lock;
 		if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
 			if (!next->r_read_wanted) {
+				cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
 				next->r_read_wanted = B_TRUE;
 			}
-			if (zfs_range_lock_hold(next) != 0)
-				panic("too many waiters on zfs range lock %p",
-				    next);
 			cv_wait(&next->r_rd_cv, &zp->z_range_lock);
-			zfs_range_lock_rele(next);
 			goto retry;
 		}
 		if (off + len <= next->r_off + next->r_len)
@@ -492,11 +438,8 @@ zfs_range_lock(znode_t *zp, uint64_t off
 	new->r_cnt = 1; /* assume it's going to be in the tree */
 	new->r_type = type;
 	new->r_proxy = B_FALSE;
-	cv_init(&new->r_wr_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&new->r_rd_cv, NULL, CV_DEFAULT, NULL);
 	new->r_write_wanted = B_FALSE;
 	new->r_read_wanted = B_FALSE;
-	new->r_refcnt = 1;
 
 	mutex_enter(&zp->z_range_lock);
 	if (type == RL_READER) {
@@ -507,9 +450,8 @@ zfs_range_lock(znode_t *zp, uint64_t off
 			avl_add(&zp->z_range_avl, new);
 		else
 			zfs_range_lock_reader(zp, new);
-	} else {
+	} else
 		zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
-	}
 	mutex_exit(&zp->z_range_lock);
 	return (new);
 }
@@ -521,7 +463,7 @@ static void
 zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
 {
 	avl_tree_t *tree = &zp->z_range_avl;
-	rl_t *rl, *next;
+	rl_t *rl, *next = NULL;
 	uint64_t len;
 
 	/*
@@ -535,14 +477,16 @@ zfs_range_unlock_reader(znode_t *zp, rl_
 		avl_remove(tree, remove);
 		if (remove->r_write_wanted) {
 			cv_broadcast(&remove->r_wr_cv);
+			cv_destroy(&remove->r_wr_cv);
 		}
 		if (remove->r_read_wanted) {
 			cv_broadcast(&remove->r_rd_cv);
+			cv_destroy(&remove->r_rd_cv);
 		}
 	} else {
-		ASSERT3U(remove->r_cnt, ==, 0);
-		ASSERT3U(remove->r_write_wanted, ==, 0);
-		ASSERT3U(remove->r_read_wanted, ==, 0);
+		ASSERT0(remove->r_cnt);
+		ASSERT0(remove->r_write_wanted);
+		ASSERT0(remove->r_read_wanted);
 		/*
 		 * Find start proxy representing this reader lock,
 		 * then decrement ref count on all proxies
@@ -566,15 +510,17 @@ zfs_range_unlock_reader(znode_t *zp, rl_
 				avl_remove(tree, rl);
 				if (rl->r_write_wanted) {
 					cv_broadcast(&rl->r_wr_cv);
+					cv_destroy(&rl->r_wr_cv);
 				}
 				if (rl->r_read_wanted) {
 					cv_broadcast(&rl->r_rd_cv);
+					cv_destroy(&rl->r_rd_cv);
 				}
-				zfs_range_lock_rele(rl);
+				kmem_free(rl, sizeof (rl_t));
 			}
 		}
 	}
-	zfs_range_lock_rele(remove);
+	kmem_free(remove, sizeof (rl_t));
 }
 
 /*
@@ -593,14 +539,16 @@ zfs_range_unlock(rl_t *rl)
 	if (rl->r_type == RL_WRITER) {
 		/* writer locks can't be shared or split */
 		avl_remove(&zp->z_range_avl, rl);
+		mutex_exit(&zp->z_range_lock);
 		if (rl->r_write_wanted) {
 			cv_broadcast(&rl->r_wr_cv);
+			cv_destroy(&rl->r_wr_cv);
 		}
 		if (rl->r_read_wanted) {
 			cv_broadcast(&rl->r_rd_cv);
+			cv_destroy(&rl->r_rd_cv);
 		}
-		zfs_range_lock_rele(rl);
-		mutex_exit(&zp->z_range_lock);
+		kmem_free(rl, sizeof (rl_t));
 	} else {
 		/*
 		 * lock may be shared, let zfs_range_unlock_reader()
@@ -632,11 +580,11 @@ zfs_range_reduce(rl_t *rl, uint64_t off,
 	mutex_enter(&zp->z_range_lock);
 	rl->r_off = off;
 	rl->r_len = len;
+	mutex_exit(&zp->z_range_lock);
 	if (rl->r_write_wanted)
 		cv_broadcast(&rl->r_wr_cv);
 	if (rl->r_read_wanted)
 		cv_broadcast(&rl->r_rd_cv);
-	mutex_exit(&zp->z_range_lock);
 }
 
 /*
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_sa.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_sa.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_sa.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_sa.c	10 Oct 2016 11:09:56 -0000
@@ -0,0 +1,327 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/vnode.h>
+#include <sys/sa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_sa.h>
+
+/*
+ * ZPL attribute registration table.
+ * The order of attributes doesn't matter; a unique value will be
+ * assigned for each attribute that is file system specific.
+ *
+ * This is just the set of ZPL attributes that this
+ * version of ZFS deals with natively.  The file system
+ * could have other attributes stored in files, but they will be
+ * ignored.  The SA framework will preserve them, just that
+ * this version of ZFS won't change or delete them.
+ */
+
+sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
+	{"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+	{"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+	{"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+	{"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+	{"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+	{"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+	{"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+	{"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+	{"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+	{"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+	{"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+	{"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+	{"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+	{"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+	{"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+	{"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+	{"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0},
+	{"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0},
+	{"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0},
+	{"ZPL_DACL_ACES", 0, SA_ACL, 0},
+	{NULL, 0, 0, 0}
+};
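+
+/*
+ * Illustrative sketch (not part of this change): once registered, an
+ * attribute is read through its SA_ZPL_* accessor, e.g.:
+ *
+ *	uint64_t size;
+ *	int error;
+ *
+ *	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ *	    &size, sizeof (size));
+ */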
+
+#ifdef _KERNEL
+
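+/*
+ * Short symlink targets live in the bonus buffer, immediately after the
+ * old znode_phys area; longer targets live in the file's first data block.
+ */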
+int
+zfs_sa_readlink(znode_t *zp, uio_t *uio)
+{
+	dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+	size_t bufsz;
+	int error;
+
+	bufsz = zp->z_size;
+	if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) {
+		error = uiomove((caddr_t)db->db_data +
+		    ZFS_OLD_ZNODE_PHYS_SIZE,
+		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+	} else {
+		dmu_buf_t *dbp;
+		if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id,
+		    0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) {
+			error = uiomove(dbp->db_data,
+			    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
+			dmu_buf_rele(dbp, FTAG);
+		}
+	}
+	return (error);
+}
+
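+/*
+ * Mirror of zfs_sa_readlink(): store short targets in the bonus buffer
+ * and longer ones in the first (and only) data block.
+ */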
+void
+zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
+{
+	dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+
+	if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) {
+		VERIFY(dmu_set_bonus(db,
+		    len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0);
+		if (len) {
+			bcopy(link, (caddr_t)db->db_data +
+			    ZFS_OLD_ZNODE_PHYS_SIZE, len);
+		}
+	} else {
+		dmu_buf_t *dbp;
+
+		zfs_grow_blocksize(zp, len, tx);
+		VERIFY(0 == dmu_buf_hold(zp->z_zfsvfs->z_os,
+		    zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH));
+
+		dmu_buf_will_dirty(dbp, tx);
+
+		ASSERT3U(len, <=, dbp->db_size);
+		bcopy(link, dbp->db_data, len);
+		dmu_buf_rele(dbp, FTAG);
+	}
+}
+
+void
+zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	xoptattr_t *xoap;
+
+	ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+	VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+	if (zp->z_is_sa) {
+		if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+		    &xoap->xoa_av_scanstamp,
+		    sizeof (xoap->xoa_av_scanstamp)) != 0)
+			return;
+	} else {
+		dmu_object_info_t doi;
+		dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+		int len;
+
+		if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP))
+			return;
+
+		sa_object_info(zp->z_sa_hdl, &doi);
+		len = sizeof (xoap->xoa_av_scanstamp) +
+		    ZFS_OLD_ZNODE_PHYS_SIZE;
+
+		if (len <= doi.doi_bonus_size) {
+			(void) memcpy(xoap->xoa_av_scanstamp,
+			    (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+			    sizeof (xoap->xoa_av_scanstamp));
+		}
+	}
+	XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+}
+
+void
+zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	xoptattr_t *xoap;
+
+	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+	VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+	if (zp->z_is_sa)
+		VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+		    &xoap->xoa_av_scanstamp,
+		    sizeof (xoap->xoa_av_scanstamp), tx));
+	else {
+		dmu_object_info_t doi;
+		dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+		int len;
+
+		sa_object_info(zp->z_sa_hdl, &doi);
+		len = sizeof (xoap->xoa_av_scanstamp) +
+		    ZFS_OLD_ZNODE_PHYS_SIZE;
+		if (len > doi.doi_bonus_size)
+			VERIFY(dmu_set_bonus(db, len, tx) == 0);
+		(void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+		    xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp));
+
+		zp->z_pflags |= ZFS_BONUS_SCANSTAMP;
+		VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+		    &zp->z_pflags, sizeof (uint64_t), tx));
+	}
+}
+
+/*
+ * I'm not convinced we should do any of this upgrade, since the
+ * SA code can read both old/new znode formats with probably little
+ * to no performance difference.
+ *
+ * All new files will be created with the new format.
+ */
+
+void
+zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+	dmu_buf_t *db = sa_get_db(hdl);
+	znode_t *zp = sa_get_userdata(hdl);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	sa_bulk_attr_t bulk[20];
+	int count = 0;
+	sa_bulk_attr_t sa_attrs[20] = { 0 };
+	zfs_acl_locator_cb_t locate = { 0 };
+	uint64_t uid, gid, mode, rdev, xattr, parent;
+	uint64_t crtime[2], mtime[2], ctime[2];
+	zfs_acl_phys_t znode_acl;
+	char scanstamp[AV_SCANSTAMP_SZ];
+
+	/*
+	 * No upgrade if the ACL isn't cached, since we won't know
+	 * which locks are held, and reading the ACL would require
+	 * special "locked" interfaces that would be messy.
+	 */
+	if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK)
+		return;
+
+	/*
+	 * If the vnode lock is held and we aren't the owner
+	 * then just return since we don't want to deadlock
+	 * trying to update the status of z_is_sa.  This
+	 * file can then be upgraded at a later time.
+	 *
+	 * Otherwise, we know we are doing the
+	 * sa_update() that caused us to enter this function.
+	 */
+	if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0)
+		return;
+
+	/* First do a bulk query of the attributes that aren't cached */
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+	    &znode_acl, 88);
+
+	if (sa_bulk_lookup_locked(hdl, bulk, count) != 0)
+		goto done;
+
+	/*
+	 * While the order here doesn't matter, it's best to try and
+	 * organize it in such a way as to pick up an already existing
+	 * layout number.
+	 */
+	count = 0;
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &zp->z_size, 8);
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs),
+	    NULL, &zp->z_gen, 8);
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs),
+	    NULL, &parent, 8);
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, 8);
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL,
+	    zp->z_atime, 16);
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL,
+	    &mtime, 16);
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL,
+	    &ctime, 16);
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+	    &crtime, 16);
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL,
+	    &zp->z_links, 8);
+	if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR)
+		SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL,
+		    &rdev, 8);
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+	    &zp->z_acl_cached->z_acl_count, 8);
+
+	if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
+		zfs_acl_xform(zp, zp->z_acl_cached, CRED());
+
+	locate.cb_aclp = zp->z_acl_cached;
+	SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
+	    zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes);
+
+	if (xattr)
+		SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs),
+		    NULL, &xattr, 8);
+
+	/* if scanstamp then add scanstamp */
+
+	if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
+		bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+		    scanstamp, AV_SCANSTAMP_SZ);
+		SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs),
+		    NULL, scanstamp, AV_SCANSTAMP_SZ);
+		zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
+	}
+
+	VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
+	VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs,
+	    count, tx) == 0);
+	if (znode_acl.z_acl_extern_obj)
+		VERIFY(0 == dmu_object_free(zfsvfs->z_os,
+		    znode_acl.z_acl_extern_obj, tx));
+
+	zp->z_is_sa = B_TRUE;
+done:
+	VOP_UNLOCK(ZTOV(zp), 0);
+}
+
+void
+zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp)
+{
+	if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa)
+		return;
+
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+	if (zfs_external_acl(zp)) {
+		dmu_tx_hold_free(tx, zfs_external_acl(zp), 0,
+		    DMU_OBJECT_END);
+	}
+}
+
+#endif
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c,v
retrieving revision 1.15
diff -u -p -r1.15 zfs_vfsops.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c	17 Feb 2017 08:31:23 -0000	1.15
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c	16 Jun 2017 16:35:57 -0000
@@ -19,19 +19,24 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/kernel.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
-#include <sys/pathname.h>
+#include <sys/acl.h>
 #include <sys/vnode.h>
 #include <sys/vfs.h>
-#include <sys/vfs_opreg.h>
 #include <sys/mntent.h>
 #include <sys/mount.h>
 #include <sys/cmn_err.h>
@@ -45,11 +50,11 @@
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
 #include <sys/varargs.h>
 #include <sys/policy.h>
 #include <sys/atomic.h>
-#include <sys/mkdev.h>
-#include <sys/modctl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
@@ -57,36 +62,89 @@
 #include <sys/dnlc.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa_boot.h>
+#include "zfs_comutil.h"
+
+#ifdef __FreeBSD_kernel__
+
+#include <sys/jail.h>
+
+struct mtx zfs_debug_mtx;
+MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
+
+SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
+
+int zfs_super_owner;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
+    "File system owner can perform privileged operation on his file systems");
+
+int zfs_debug_level;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
+    "Debug level");
+
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
+static int zfs_version_acl = ZFS_ACL_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
+    "ZFS_ACL_VERSION");
+static int zfs_version_spa = SPA_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
+    "SPA_VERSION");
+static int zfs_version_zpl = ZPL_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
+    "ZPL_VERSION");
+
+static int zfs_mount(vfs_t *vfsp);
+static int zfs_umount(vfs_t *vfsp, int fflag);
+static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
+static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
+static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
+static int zfs_sync(vfs_t *vfsp, int waitfor);
+static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
+    struct ucred **credanonp, int *numsecflavors, int **secflavors);
+static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
+static void zfs_objset_close(zfsvfs_t *zfsvfs);
+static void zfs_freevfs(vfs_t *vfsp);
+
+struct vfsops zfs_vfsops = {
+	.vfs_mount =		zfs_mount,
+	.vfs_unmount =		zfs_umount,
+	.vfs_root =		zfs_root,
+	.vfs_statfs =		zfs_statfs,
+	.vfs_vget =		zfs_vget,
+	.vfs_sync =		zfs_sync,
+	.vfs_checkexp =		zfs_checkexp,
+	.vfs_fhtovp =		zfs_fhtovp,
+};
+
+VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
+
+#endif /* __FreeBSD_kernel__ */
 
 #ifdef __NetBSD__
-/* include ddi_name_to_major function is there better place for it ?*/
-#include <sys/ddi.h>
-#include <sys/systm.h>
-#endif
 
-int zfsfstype;
-vfsops_t *zfs_vfsops = NULL;
-static major_t zfs_major;
-static minor_t zfs_minor;
-static kmutex_t	zfs_dev_mtx;
+#include <sys/mkdev.h>
+#include <miscfs/genfs/genfs.h>
 
 int zfs_debug_level;
 kmutex_t zfs_debug_mtx;
 
-/* XXX NetBSD static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);*/
+#define	DROP_GIANT()	/* nothing */
+#define	PICKUP_GIANT()	/* nothing */
+#define	vfs_stdsync(a, b)	0
+
 static int zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len);
 static int zfs_umount(vfs_t *vfsp, int fflag);
-static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
+static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
+static int zfs_netbsd_root(vfs_t *vfsp, vnode_t **vpp);
 static int zfs_statvfs(vfs_t *vfsp, struct statvfs *statp);
 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
 static int zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp);
-static int zfs_start(vfs_t *vfsp, int flags);
+static int zfs_sync(vfs_t *vfsp, int waitfor);
+static int zfs_netbsd_sync(vfs_t *vfsp, int waitfor, cred_t *cr);
 static void zfs_freevfs(vfs_t *vfsp);
 
 void zfs_init(void);
 void zfs_fini(void);
 
-
 extern const struct vnodeopv_desc zfs_vnodeop_opv_desc;
 
 static const struct vnodeopv_desc * const zfs_vnodeop_descs[] = {
@@ -94,23 +152,23 @@ static const struct vnodeopv_desc * cons
 	NULL,
 };
 
-static struct vfsops zfs_vfsops_template = {
+struct vfsops zfs_vfsops = {
 	.vfs_name = MOUNT_ZFS,
 	.vfs_min_mount_data = sizeof(struct zfs_args),
 	.vfs_opv_descs = zfs_vnodeop_descs,
 	.vfs_mount = zfs_mount,
 	.vfs_unmount = zfs_umount,
-	.vfs_root = zfs_root,
+	.vfs_root = zfs_netbsd_root,
 	.vfs_statvfs = zfs_statvfs,
-	.vfs_sync = zfs_sync,
+	.vfs_sync = zfs_netbsd_sync,
 	.vfs_vget = zfs_vget,
 	.vfs_loadvnode = zfs_loadvnode,
 	.vfs_fhtovp = zfs_fhtovp,
 	.vfs_init = zfs_init,
 	.vfs_done = zfs_fini,
-	.vfs_start = zfs_start,
-	.vfs_renamelock_enter = (void*)nullop,
-	.vfs_renamelock_exit = (void*)nullop,
+	.vfs_start = (void *)nullop,
+	.vfs_renamelock_enter = genfs_renamelock_enter,
+	.vfs_renamelock_exit = genfs_renamelock_exit,
 	.vfs_reinit = (void *)nullop,
 	.vfs_vptofh = (void *)eopnotsupp,
 	.vfs_fhtovp = (void *)eopnotsupp,
@@ -121,34 +179,6 @@ static struct vfsops zfs_vfsops_template
 	.vfs_fsync = (void *)eopnotsupp,
 };
 
-/*
- * We need to keep a count of active fs's.
- * This is necessary to prevent our module
- * from being unloaded after a umount -f
- */
-static uint32_t	zfs_active_fs_count = 0;
-
-static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
-static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
-static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
-static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
-
-/*
- * MO_DEFAULT is not used since the default value is determined
- * by the equivalent property.
- */
-static mntopt_t mntopts[] = {
-	{ MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
-	{ MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
-	{ MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
-	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
-};
-
-static mntopts_t zfs_mntopts = {
-	sizeof (mntopts) / sizeof (mntopt_t),
-	mntopts
-};
-
 static bool
 zfs_sync_selector(void *cl, struct vnode *vp)
 {
@@ -163,26 +193,12 @@ zfs_sync_selector(void *cl, struct vnode
 	    && !zp->z_unlinked;
 }
 
-/*ARGSUSED*/
-int
-zfs_sync(vfs_t *vfsp, int flag, cred_t *cr)
+static int
+zfs_netbsd_sync(vfs_t *vfsp, int waitfor, cred_t *cr)
 {
+	struct vnode_iterator *marker;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	znode_t *zp;
 	vnode_t *vp;
-	struct vnode_iterator *marker;
-	dmu_tx_t *tx;
-	int error;
-	
-	
-	error = 0;
-	
-        /*
-	 * Data integrity is job one.  We don't want a compromised kernel
-	 * writing to the storage pool, so we never sync during panic.
-	 */
-	if (panicstr)
-		return (0);
 
 	/*
 	 * On NetBSD, we need to push out atime updates.  Solaris does
@@ -192,35 +208,73 @@ zfs_sync(vfs_t *vfsp, int flag, cred_t *
 	vfs_vnode_iterator_init(vfsp, &marker);
 	while ((vp = vfs_vnode_iterator_next(marker, zfs_sync_selector, NULL)))
 	{
+		znode_t *zp;
+		dmu_buf_t *dbp;
+		dmu_tx_t *tx;
+		int error;
+
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error) {
-			vrele(vp);
+			VN_RELE(vp);
 			continue;
 		}
+		ZFS_ENTER(zfsvfs);
 		zp = VTOZ(vp);
 		tx = dmu_tx_create(zfsvfs->z_os);
-		dmu_tx_hold_bonus(tx, zp->z_id);
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 		} else {
-			dmu_buf_will_dirty(zp->z_dbuf, tx);
-			mutex_enter(&zp->z_lock);
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
 			zp->z_atime_dirty = 0;
-			mutex_exit(&zp->z_lock);
 			dmu_tx_commit(tx);
 		}
+		ZFS_EXIT(zfsvfs);
 		vput(vp);
 	}
 	vfs_vnode_iterator_destroy(marker);
 
 	/*
-	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
-	 * to sync metadata, which they would otherwise cache indefinitely.
-	 * Semantically, the only requirement is that the sync be initiated.
-	 * The DMU syncs out txgs frequently, so there's nothing to do.
+	 * Then do the regular ZFS stuff.
+	 */
+	return zfs_sync(vfsp, waitfor);
+}
+
+static int
+zfs_netbsd_root(vfs_t *vfsp, vnode_t **vpp)
+{
+
+	return zfs_root(vfsp, LK_EXCLUSIVE | LK_RETRY, vpp);
+}
+
+#endif /* __NetBSD__ */
+
+/*
+ * We need to keep a count of active fs's.
+ * This is necessary to prevent our module
+ * from being unloaded after a umount -f
+ */
+static uint32_t	zfs_active_fs_count = 0;
+
+/*ARGSUSED*/
+static int
+zfs_sync(vfs_t *vfsp, int waitfor)
+{
+	/*
+	 * Data integrity is job one.  We don't want a compromised kernel
+	 * writing to the storage pool, so we never sync during panic.
+	 */
+	if (panicstr)
+		return (0);
+
+	/*
+	 * Ignore the system syncer.  ZFS already commits async data
+	 * at zfs_txg_timeout intervals.
 	 */
-	if ((flag & MNT_LAZY) != 0)
+	if (waitfor == MNT_LAZY)
 		return (0);
 	
 	if (vfsp != NULL) {
@@ -229,6 +283,11 @@ zfs_sync(vfs_t *vfsp, int flag, cred_t *
 		 */
 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
 		dsl_pool_t *dp;
+		int error;
+
+		error = vfs_stdsync(vfsp, waitfor);
+		if (error != 0)
+			return (error);
 
 		ZFS_ENTER(zfsvfs);
 		dp = dmu_objset_pool(zfsvfs->z_os);
@@ -243,9 +302,8 @@ zfs_sync(vfs_t *vfsp, int flag, cred_t *
 		}
 
 		if (zfsvfs->z_log != NULL)
-			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
-		else
-			txg_wait_synced(dp, 0);
+			zil_commit(zfsvfs->z_log, 0);
+
 		ZFS_EXIT(zfsvfs);
 	} else {
 		/*
@@ -259,17 +317,18 @@ zfs_sync(vfs_t *vfsp, int flag, cred_t *
 	return (0);
 }
 
+#ifdef illumos
 static int
 zfs_create_unique_device(dev_t *dev)
 {
 	major_t new_major;
 
 	do {
-		ASSERT3U(zfs_minor, <=, MAXMIN);
+		ASSERT3U(zfs_minor, <=, MAXMIN32);
 		minor_t start = zfs_minor;
 		do {
 			mutex_enter(&zfs_dev_mtx);
-			if (zfs_minor >= MAXMIN) {
+			if (zfs_minor >= MAXMIN32) {
 				/*
 				 * If we're still using the real major
 				 * keep out of /dev/zfs and /dev/zvol minor
@@ -286,8 +345,7 @@ zfs_create_unique_device(dev_t *dev)
 			*dev = makedevice(zfs_major, zfs_minor);
 			mutex_exit(&zfs_dev_mtx);
 		} while (vfs_devismounted(*dev) && zfs_minor != start);
-		break;
-#ifndef __NetBSD__
+#ifdef illumos
 		if (zfs_minor == start) {
 			/*
 			 * We are using all ~262,000 minor numbers for the
@@ -313,6 +371,8 @@ zfs_create_unique_device(dev_t *dev)
 
 	return (0);
 }
+#endif	/* illumos */
+
 
 static void
 atime_changed_cb(void *arg, uint64_t newval)
@@ -321,10 +381,12 @@ atime_changed_cb(void *arg, uint64_t new
 
 	if (newval == TRUE) {
 		zfsvfs->z_atime = TRUE;
+		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 	} else {
 		zfsvfs->z_atime = FALSE;
+		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 	}
@@ -339,14 +401,14 @@ xattr_changed_cb(void *arg, uint64_t new
 		/* XXX locking on vfs_flag? */
 #ifdef TODO
 		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
-#endif		
+#endif
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
 	} else {
 		/* XXX locking on vfs_flag? */
 #ifdef TODO
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
-#endif		
+#endif
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 	}
@@ -356,13 +418,12 @@ static void
 blksz_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
-
-	if (newval < SPA_MINBLOCKSIZE ||
-	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
-		newval = SPA_MAXBLOCKSIZE;
+	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+	ASSERT(ISP2(newval));
 
 	zfsvfs->z_max_blksz = newval;
-	zfsvfs->z_vfs->vfs_bsize = newval;
+	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
 }
 
 static void
@@ -384,22 +445,6 @@ readonly_changed_cb(void *arg, uint64_t 
 }
 
 static void
-devices_changed_cb(void *arg, uint64_t newval)
-{
-	zfsvfs_t *zfsvfs = arg;
-
-	if (newval == FALSE) {
-		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
-		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
-		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
-	} else {
-		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
-		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
-		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
-	}
-}
-
-static void
 setuid_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
@@ -491,12 +536,20 @@ zfs_register_callbacks(vfs_t *vfsp)
 	objset_t *os = NULL;
 	zfsvfs_t *zfsvfs = NULL;
 	uint64_t nbmand;
-	int readonly, do_readonly = B_FALSE;
-	int setuid, do_setuid = B_FALSE;
-	int exec, do_exec = B_FALSE;
-	int devices, do_devices = B_FALSE;
-	int xattr, do_xattr = B_FALSE;
-	int atime, do_atime = B_FALSE;
+	boolean_t readonly = B_FALSE;
+	boolean_t do_readonly = B_FALSE;
+	boolean_t setuid = B_FALSE;
+	boolean_t do_setuid = B_FALSE;
+	boolean_t exec = B_FALSE;
+	boolean_t do_exec = B_FALSE;
+#ifdef illumos
+	boolean_t devices = B_FALSE;
+	boolean_t do_devices = B_FALSE;
+#endif
+	boolean_t xattr = B_FALSE;
+	boolean_t do_xattr = B_FALSE;
+	boolean_t atime = B_FALSE;
+	boolean_t do_atime = B_FALSE;
 	int error = 0;
 
 	ASSERT(vfsp);
@@ -505,12 +558,20 @@ zfs_register_callbacks(vfs_t *vfsp)
 	os = zfsvfs->z_os;
 
 	/*
+	 * This function can be called for a snapshot when we update the
+	 * snapshot's mount point, which isn't really supported.
+	 */
+	if (dmu_objset_is_snapshot(os))
+		return (EOPNOTSUPP);
+
+	/*
 	 * The act of registering our callbacks will destroy any mount
 	 * options we may have.  In order to enable temporary overrides
 	 * of mount options, we stash away the current values and
 	 * restore them after we register the callbacks.
 	 */
-	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
+	    !spa_writeable(dmu_objset_spa(os))) {
 		readonly = B_TRUE;
 		do_readonly = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
@@ -518,19 +579,9 @@ zfs_register_callbacks(vfs_t *vfsp)
 		do_readonly = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
-		devices = B_FALSE;
 		setuid = B_FALSE;
-		do_devices = B_TRUE;
 		do_setuid = B_TRUE;
 	} else {
-		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
-			devices = B_FALSE;
-			do_devices = B_TRUE;
-		} else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
-			devices = B_TRUE;
-			do_devices = B_TRUE;
-		}
-
 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 			setuid = B_FALSE;
 			do_setuid = B_TRUE;
@@ -562,6 +613,19 @@ zfs_register_callbacks(vfs_t *vfsp)
 	}
 
 	/*
+	 * We need to enter pool configuration here, so that we can use
+	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
+	 * dsl_prop_get_integer() cannot be used, because it has to acquire
+	 * spa_namespace_lock and we cannot do that because we already hold
+	 * z_teardown_lock.  The problem is that spa_config_sync() is called
+	 * with spa_namespace_lock held and the function calls ZFS vnode
+	 * operations to write the cache file and thus z_teardown_lock is
+	 * acquired after spa_namespace_lock.
+	 */
+	ds = dmu_objset_ds(os);
+	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+
+	/*
 	 * nbmand is a special property.  It can only be changed at
 	 * mount time.
 	 *
@@ -572,14 +636,9 @@ zfs_register_callbacks(vfs_t *vfsp)
 		nbmand = B_FALSE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 		nbmand = B_TRUE;
-	} else {
-		char osname[MAXNAMELEN];
-
-		dmu_objset_name(os, osname);
-		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
-		    NULL)) {
-			return (error);
-		}
+	} else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) {
+		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+		return (error);
 	}
 
 	/*
@@ -589,28 +648,32 @@ zfs_register_callbacks(vfs_t *vfsp)
 	 * the first prop_register(), but I guess I like to go
 	 * overboard...
 	 */
-	ds = dmu_objset_ds(os);
-	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
+	error = dsl_prop_register(ds,
+	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
-	    "xattr", xattr_changed_cb, zfsvfs);
+	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
-	    "recordsize", blksz_changed_cb, zfsvfs);
+	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
-	    "readonly", readonly_changed_cb, zfsvfs);
+	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
+#ifdef illumos
 	error = error ? error : dsl_prop_register(ds,
-	    "devices", devices_changed_cb, zfsvfs);
+	    zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
+#endif
 	error = error ? error : dsl_prop_register(ds,
-	    "setuid", setuid_changed_cb, zfsvfs);
+	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
-	    "exec", exec_changed_cb, zfsvfs);
+	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
-	    "snapdir", snapdir_changed_cb, zfsvfs);
+	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
-	    "aclmode", acl_mode_changed_cb, zfsvfs);
+	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
-	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
+	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
+	    zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
-	    "vscan", vscan_changed_cb, zfsvfs);
+	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 	if (error)
 		goto unregister;
 
@@ -623,8 +686,6 @@ zfs_register_callbacks(vfs_t *vfsp)
 		setuid_changed_cb(zfsvfs, setuid);
 	if (do_exec)
 		exec_changed_cb(zfsvfs, exec);
-	if (do_devices)
-		devices_changed_cb(zfsvfs, devices);
 	if (do_xattr)
 		xattr_changed_cb(zfsvfs, xattr);
 	if (do_atime)
@@ -635,64 +696,70 @@ zfs_register_callbacks(vfs_t *vfsp)
 	return (0);
 
 unregister:
-	/*
-	 * We may attempt to unregister some callbacks that are not
-	 * registered, but this is OK; it will simply return ENOMSG,
-	 * which we will ignore.
-	 */
-	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
-	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
-	    zfsvfs);
-	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
+	dsl_prop_unregister_all(ds, zfsvfs);
 	return (error);
-
 }
 
-static void
-uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
-    int64_t delta, dmu_tx_t *tx)
+static int
+zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
+    uint64_t *userp, uint64_t *groupp)
 {
-	uint64_t used = 0;
-	char buf[32];
-	int err;
-	uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
+	/*
+	 * Is it a valid type of object to track?
+	 */
+	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
+		return (SET_ERROR(ENOENT));
 
-	if (delta == 0)
-		return;
+	/*
+	 * If we have a NULL data pointer
+	 * then assume the IDs aren't changing and
+	 * return EEXIST to the DMU to let it know to
+	 * use the same IDs.
+	 */
+	if (data == NULL)
+		return (SET_ERROR(EEXIST));
 
-	(void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
-	err = zap_lookup(os, obj, buf, 8, 1, &used);
-	ASSERT(err == 0 || err == ENOENT);
-	/* no underflow/overflow */
-	ASSERT(delta > 0 || used >= -delta);
-	ASSERT(delta < 0 || used + delta > used);
-	used += delta;
-	if (used == 0)
-		err = zap_remove(os, obj, buf, tx);
-	else
-		err = zap_update(os, obj, buf, 8, 1, &used, tx);
-	ASSERT(err == 0);
-}
+	if (bonustype == DMU_OT_ZNODE) {
+		znode_phys_t *znp = data;
+		*userp = znp->zp_uid;
+		*groupp = znp->zp_gid;
+	} else {
+		int hdrsize;
+		sa_hdr_phys_t *sap = data;
+		sa_hdr_phys_t sa = *sap;
+		boolean_t swap = B_FALSE;
 
-static int
-zfs_space_delta_cb(dmu_object_type_t bonustype, void *bonus,
-    uint64_t *userp, uint64_t *groupp)
-{
-	znode_phys_t *znp = bonus;
+		ASSERT(bonustype == DMU_OT_SA);
 
-	if (bonustype != DMU_OT_ZNODE)
-		return (ENOENT);
+		if (sa.sa_magic == 0) {
+			/*
+			 * This should only happen for newly created
+			 * files that haven't had the znode data filled
+			 * in yet.
+			 */
+			*userp = 0;
+			*groupp = 0;
+			return (0);
+		}
+		if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
+			sa.sa_magic = SA_MAGIC;
+			sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
+			swap = B_TRUE;
+		} else {
+			VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
+		}
 
-	*userp = znp->zp_uid;
-	*groupp = znp->zp_gid;
+		hdrsize = sa_hdrsize(&sa);
+		VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
+		*userp = *((uint64_t *)((uintptr_t)data + hdrsize +
+		    SA_UID_OFFSET));
+		*groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
+		    SA_GID_OFFSET));
+		if (swap) {
+			*userp = BSWAP_64(*userp);
+			*groupp = BSWAP_64(*groupp);
+		}
+	}
 	return (0);
 }
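The DMU_OT_SA branch above detects a foreign-endian bonus buffer by checking whether sa_magic reads back byte-swapped, and then swaps the UID/GID words it extracts to match. A self-contained sketch of that detection; the SA_MAGIC value mirrors the SA framework's constant, but the buffer contents here are fabricated:

#include <stdint.h>
#include <stdio.h>

#define SA_MAGIC	0x2F505A	/* mirrors the SA framework constant */

static uint32_t
bswap32(uint32_t x)
{
	return ((x >> 24) | ((x >> 8) & 0xff00) |
	    ((x & 0xff00) << 8) | (x << 24));
}

static uint64_t
bswap64(uint64_t x)
{
	return (((uint64_t)bswap32((uint32_t)x) << 32) |
	    bswap32((uint32_t)(x >> 32)));
}

int
main(void)
{
	/* Simulate a header written on a host of the opposite endianness. */
	uint32_t sa_magic = bswap32(SA_MAGIC);
	uint64_t uid_on_disk = bswap64(1000);	/* stored byte-swapped */

	if (sa_magic == bswap32(SA_MAGIC)) {
		/* Foreign endianness: swap everything pulled out. */
		uid_on_disk = bswap64(uid_on_disk);
	}
	printf("uid = %llu\n", (unsigned long long)uid_on_disk);  /* 1000 */
	return (0);
}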
 
@@ -740,7 +807,7 @@ zfs_userspace_many(zfsvfs_t *zfsvfs, zfs
 	uint64_t obj;
 
 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
-		return (ENOTSUP);
+		return (SET_ERROR(ENOTSUP));
 
 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 	if (obj == 0) {
@@ -776,7 +843,7 @@ zfs_userspace_many(zfsvfs_t *zfsvfs, zfs
  */
 static int
 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
-    char *buf, size_t buflen, boolean_t addok)
+    char *buf, boolean_t addok)
 {
 	uint64_t fuid;
 	int domainid = 0;
@@ -784,10 +851,10 @@ id_to_fuidstr(zfsvfs_t *zfsvfs, const ch
 	if (domain && domain[0]) {
 		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
 		if (domainid == -1)
-			return (ENOENT);
+			return (SET_ERROR(ENOENT));
 	}
 	fuid = FUID_ENCODE(domainid, rid);
-	(void) snprintf(buf, buflen, "%llx", (longlong_t)fuid);
+	(void) sprintf(buf, "%llx", (longlong_t)fuid);
 	return (0);
 }
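For context, the fuid formatted here packs the domain-table index into the upper 32 bits and the rid into the lower 32 (the macros below mirror zfs_fuid.h). Sixteen hex digits plus the terminator always fit in the callers' 32-byte buffers, which is why the bounded-length parameter could be dropped in favor of sprintf(). A sketch of the encoding and the resulting ZAP key:

#include <stdint.h>
#include <stdio.h>

#define FUID_ENCODE(idx, rid)	(((uint64_t)(idx) << 32) | (rid))
#define FUID_INDEX(fuid)	((fuid) >> 32)
#define FUID_RID(fuid)		((fuid) & UINT32_MAX)

int
main(void)
{
	char buf[32];
	uint64_t fuid = FUID_ENCODE(3, 1001);	/* domain #3, rid 1001 */

	(void) snprintf(buf, sizeof (buf), "%llx", (long long)fuid);
	printf("zap key: %s (idx=%llu rid=%llu)\n", buf,
	    (unsigned long long)FUID_INDEX(fuid),
	    (unsigned long long)FUID_RID(fuid));	/* 3000003e9 (3, 1001) */
	return (0);
}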
 
@@ -802,13 +869,13 @@ zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_
 	*valp = 0;
 
 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
-		return (ENOTSUP);
+		return (SET_ERROR(ENOTSUP));
 
 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 	if (obj == 0)
 		return (0);
 
-	err = id_to_fuidstr(zfsvfs, domain, rid, buf, sizeof(buf), FALSE);
+	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
 	if (err)
 		return (err);
 
@@ -829,15 +896,15 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_
 	boolean_t fuid_dirtied;
 
 	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
-		return (ENOTSUP);
+		return (SET_ERROR(ENOTSUP));
 
 	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
 	    &zfsvfs->z_groupquota_obj;
 
-	err = id_to_fuidstr(zfsvfs, domain, rid, buf, sizeof(buf), B_TRUE);
+	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
 	if (err)
 		return (err);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
@@ -880,7 +947,7 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_
 }
 
 boolean_t
-zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
+zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
 {
 	char buf[32];
 	uint64_t used, quota, usedobj, quotaobj;
@@ -892,7 +959,7 @@ zfs_usergroup_overquota(zfsvfs_t *zfsvfs
 	if (quotaobj == 0 || zfsvfs->z_replay)
 		return (B_FALSE);
 
-	(void) snprintf(buf, sizeof(buf), "%llx", (longlong_t)fuid);
+	(void) sprintf(buf, "%llx", (longlong_t)fuid);
 	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
 	if (err != 0)
 		return (B_FALSE);
@@ -903,60 +970,62 @@ zfs_usergroup_overquota(zfsvfs_t *zfsvfs
 	return (used >= quota);
 }
 
-int
-zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
+boolean_t
+zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
 {
-	objset_t *os;
-	zfsvfs_t *zfsvfs;
-	uint64_t zval;
-	int i, error;
+	uint64_t fuid;
+	uint64_t quotaobj;
 
-	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 
-	/*
-	 * We claim to always be readonly so we can open snapshots;
-	 * other ZPL code will prevent us from writing to snapshots.
-	 */
-	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
-	if (error) {
-		kmem_free(zfsvfs, sizeof (zfsvfs_t));
-		return (error);
-	}
+	fuid = isgroup ? zp->z_gid : zp->z_uid;
 
-	/*
-	 * Initialize the zfs-specific filesystem structure.
-	 * Should probably make this a kmem cache, shuffle fields,
-	 * and just bzero up to z_hold_mtx[].
-	 */
-	zfsvfs->z_vfs = NULL;
-	zfsvfs->z_parent = zfsvfs;
-	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+	if (quotaobj == 0 || zfsvfs->z_replay)
+		return (B_FALSE);
+
+	return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
+}
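A minimal model of the over-quota test factored into zfs_fuid_overquota() above: the used and quota values live in ZAP objects keyed by the fuid rendered in hex, a missing quota entry means no quota at all, and over quota simply means used >= quota. The lookups below are stubs standing in for zap_lookup(), and all values are fabricated:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stubs standing in for zap_lookup() on the quota and used objects. */
static int
quota_lookup(uint64_t fuid, uint64_t *valp)
{
	(void)fuid;
	*valp = 1ULL << 20;	/* quota: 1 MiB */
	return (0);
}

static int
used_lookup(uint64_t fuid, uint64_t *valp)
{
	(void)fuid;
	*valp = 2ULL << 20;	/* used: 2 MiB */
	return (0);
}

static bool
fuid_overquota(uint64_t fuid)
{
	uint64_t used, quota;

	if (quota_lookup(fuid, &quota) != 0)
		return (false);		/* no quota set: never over */
	if (used_lookup(fuid, &used) != 0)
		used = 0;		/* nothing charged yet */
	return (used >= quota);
}

int
main(void)
{
	printf("over quota: %d\n", fuid_overquota(1000));	/* prints 1 */
	return (0);
}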
+
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
+ */
+static int
+zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
+{
+	int error;
+	uint64_t val;
+
+	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 	zfsvfs->z_os = os;
 
 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
-	if (error) {
-		goto out;
-	} else if (zfsvfs->z_version > ZPL_VERSION) {
-		(void) printf("Mismatched versions:  File system "
-		    "is version %llu on-disk format, which is "
-		    "incompatible with this software version %lld!",
-		    (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
-		error = ENOTSUP;
-		goto out;
+	if (error != 0)
+		return (error);
+	if (zfsvfs->z_version >
+	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+		(void) printf("Can't mount a version %lld file system "
+		    "on a version %lld pool. Pool must be upgraded to mount "
+		    "this file system.\n", (u_longlong_t)zfsvfs->z_version,
+		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
+		return (SET_ERROR(ENOTSUP));
 	}
+	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_norm = (int)val;
 
-	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
-		goto out;
-	zfsvfs->z_norm = (int)zval;
-
-	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
-		goto out;
-	zfsvfs->z_utf8 = (zval != 0);
+	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_utf8 = (val != 0);
 
-	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
-		goto out;
-	zfsvfs->z_case = (uint_t)zval;
+	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_case = (uint_t)val;
 
 	/*
 	 * Fold case on file systems that are always or sometimes case
@@ -967,58 +1036,138 @@ zfsvfs_create(const char *osname, zfsvfs
 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 
 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+	uint64_t sa_obj = 0;
+	if (zfsvfs->z_use_sa) {
+		/* should either have both of these objects or none */
+		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+		    &sa_obj);
+		if (error != 0)
+			return (error);
+	}
+
+	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+	    &zfsvfs->z_attr_table);
+	if (error != 0)
+		return (error);
+
+	if (zfsvfs->z_version >= ZPL_VERSION_SA)
+		sa_register_update_callback(os, zfs_sa_upgrade);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 	    &zfsvfs->z_root);
-	if (error)
-		goto out;
+	if (error != 0)
+		return (error);
 	ASSERT(zfsvfs->z_root != 0);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 	    &zfsvfs->z_unlinkedobj);
-	if (error)
-		goto out;
+	if (error != 0)
+		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 	    8, 1, &zfsvfs->z_userquota_obj);
-	if (error && error != ENOENT)
-		goto out;
+	if (error == ENOENT)
+		zfsvfs->z_userquota_obj = 0;
+	else if (error != 0)
+		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 	    8, 1, &zfsvfs->z_groupquota_obj);
-	if (error && error != ENOENT)
-		goto out;
+	if (error == ENOENT)
+		zfsvfs->z_groupquota_obj = 0;
+	else if (error != 0)
+		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 	    &zfsvfs->z_fuid_obj);
-	if (error && error != ENOENT)
-		goto out;
+	if (error == ENOENT)
+		zfsvfs->z_fuid_obj = 0;
+	else if (error != 0)
+		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 	    &zfsvfs->z_shares_dir);
-	if (error && error != ENOENT)
-		goto out;
+	if (error == ENOENT)
+		zfsvfs->z_shares_dir = 0;
+	else if (error != 0)
+		return (error);
+
+	/*
+	 * Only use the name cache if we are looking for a
+	 * name on a file system that does not require normalization
+	 * or case folding.  We can also look there if we happen to be
+	 * on a non-normalizing, mixed-sensitivity file system IF we
+	 * are looking for the exact name (which is always the case on
+	 * FreeBSD).
+	 */
+	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
+	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
+	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
+
+	return (0);
+}
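The z_use_namecache predicate introduced above reduces to: cache whenever lookups are byte-exact, that is, either no normalization at all, or a mixed-sensitivity file system whose only transform is case folding. A standalone evaluation of the expression; the two constants are stand-ins, not the kernel's values:

#include <stdbool.h>
#include <stdio.h>

#define ZFS_CASE_MIXED		2	/* stand-in for the zfs enum value */
#define U8_TEXTPREP_TOUPPER	0x4	/* stand-in for the u8 flag */

/* Mirror of the predicate: cache only when lookups are byte-exact. */
static bool
use_namecache(int norm, int zcase)
{
	return (!norm ||
	    (zcase == ZFS_CASE_MIXED && !(norm & ~U8_TEXTPREP_TOUPPER)));
}

int
main(void)
{
	printf("%d\n", use_namecache(0, 0));		/* 1: no normalization */
	printf("%d\n", use_namecache(U8_TEXTPREP_TOUPPER,
	    ZFS_CASE_MIXED));				/* 1: folding only */
	printf("%d\n", use_namecache(0x1, 0));		/* 0: normalizing fs */
	return (0);
}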
+
+int
+zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
+{
+	objset_t *os;
+	zfsvfs_t *zfsvfs;
+	int error;
+
+	/*
+	 * XXX: Fix struct statfs so this isn't necessary!
+	 *
+	 * The 'osname' is used as the filesystem's special node, which means
+	 * it must fit in statfs.f_mntfromname, or else it can't be
+	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
+	 * 'zfs unmount' to think it's not mounted when it is.
+	 */
+	if (strlen(osname) >= MNAMELEN)
+		return (SET_ERROR(ENAMETOOLONG));
+
+	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+	/*
+	 * We claim to always be readonly so we can open snapshots;
+	 * other ZPL code will prevent us from writing to snapshots.
+	 */
+	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
+	if (error) {
+		kmem_free(zfsvfs, sizeof (zfsvfs_t));
+		return (error);
+	}
+
+	zfsvfs->z_vfs = NULL;
+	zfsvfs->z_parent = zfsvfs;
 
 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
-	rrw_init(&zfsvfs->z_teardown_lock);
+#ifdef DIAGNOSTIC
+	rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
+#else
+	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
+#endif
 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
-	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
+	error = zfsvfs_init(zfsvfs, os);
+	if (error != 0) {
+		dmu_objset_disown(os, zfsvfs);
+		*zfvp = NULL;
+		kmem_free(zfsvfs, sizeof (zfsvfs_t));
+		return (error);
+	}
+
 	*zfvp = zfsvfs;
 	return (0);
-
-out:
-	dmu_objset_disown(os, zfsvfs);
-	*zfvp = NULL;
-	kmem_free(zfsvfs, sizeof (zfsvfs_t));
-	return (error);
 }
 
 static int
@@ -1030,18 +1179,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t
 	if (error)
 		return (error);
 
-	/*
-	 * Set the objset user_ptr to track its zfsvfs.
-	 */
-	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
-	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
-	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
-
 	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
-	if (zil_disable) {
-		zil_destroy(zfsvfs->z_log, B_FALSE);
-		zfsvfs->z_log = NULL;
-	}
 
 	/*
 	 * If we are not mounting (ie: online recv), then we don't
@@ -1061,49 +1199,62 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t
 		else
 			zfs_unlinked_drain(zfsvfs);
 
-		if (zfsvfs->z_log) {
-			/*
-			 * Parse and replay the intent log.
-			 *
-			 * Because of ziltest, this must be done after
-			 * zfs_unlinked_drain().  (Further note: ziltest
-			 * doesn't use readonly mounts, where
-			 * zfs_unlinked_drain() isn't called.)  This is because
-			 * ziltest causes spa_sync() to think it's committed,
-			 * but actually it is not, so the intent log contains
-			 * many txg's worth of changes.
-			 *
-			 * In particular, if object N is in the unlinked set in
-			 * the last txg to actually sync, then it could be
-			 * actually freed in a later txg and then reallocated
-			 * in a yet later txg.  This would write a "create
-			 * object N" record to the intent log.  Normally, this
-			 * would be fine because the spa_sync() would have
-			 * written out the fact that object N is free, before
-			 * we could write the "create object N" intent log
-			 * record.
-			 *
-			 * But when we are in ziltest mode, we advance the "open
-			 * txg" without actually spa_sync()-ing the changes to
-			 * disk.  So we would see that object N is still
-			 * allocated and in the unlinked set, and there is an
-			 * intent log record saying to allocate it.
-			 */
-			zfsvfs->z_replay = B_TRUE;
-			zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
-			zfsvfs->z_replay = B_FALSE;
+		/*
+		 * Parse and replay the intent log.
+		 *
+		 * Because of ziltest, this must be done after
+		 * zfs_unlinked_drain().  (Further note: ziltest
+		 * doesn't use readonly mounts, where
+		 * zfs_unlinked_drain() isn't called.)  This is because
+		 * ziltest causes spa_sync() to think it's committed,
+		 * but actually it is not, so the intent log contains
+		 * many txg's worth of changes.
+		 *
+		 * In particular, if object N is in the unlinked set in
+		 * the last txg to actually sync, then it could be
+		 * actually freed in a later txg and then reallocated
+		 * in a yet later txg.  This would write a "create
+		 * object N" record to the intent log.  Normally, this
+		 * would be fine because the spa_sync() would have
+		 * written out the fact that object N is free, before
+		 * we could write the "create object N" intent log
+		 * record.
+		 *
+		 * But when we are in ziltest mode, we advance the "open
+		 * txg" without actually spa_sync()-ing the changes to
+		 * disk.  So we would see that object N is still
+		 * allocated and in the unlinked set, and there is an
+		 * intent log record saying to allocate it.
+		 */
+		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+			if (zil_replay_disable) {
+				zil_destroy(zfsvfs->z_log, B_FALSE);
+			} else {
+				zfsvfs->z_replay = B_TRUE;
+				zil_replay(zfsvfs->z_os, zfsvfs,
+				    zfs_replay_vector);
+				zfsvfs->z_replay = B_FALSE;
+			}
 		}
 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
 	}
 
+	/*
+	 * Set the objset user_ptr to track its zfsvfs.
+	 */
+	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+
 	return (0);
 }
 
+extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
+
 void
 zfsvfs_free(zfsvfs_t *zfsvfs)
 {
 	int i;
-	extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
 
 	/*
 	 * This is a barrier to prevent the filesystem from going away in
@@ -1115,10 +1266,11 @@ zfsvfs_free(zfsvfs_t *zfsvfs)
 	rw_exit(&zfsvfs_lock);
 
 	zfs_fuid_destroy(zfsvfs);
+
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 	mutex_destroy(&zfsvfs->z_lock);
 	list_destroy(&zfsvfs->z_all_znodes);
-	rrw_destroy(&zfsvfs->z_teardown_lock);
+	rrm_destroy(&zfsvfs->z_teardown_lock);
 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
 	rw_destroy(&zfsvfs->z_fuid_lock);
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1130,23 +1282,33 @@ static void
 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
 {
 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
-	if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) {
-		vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
-		vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
-		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
-		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
-		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
-		vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+	if (zfsvfs->z_vfs) {
+		if (zfsvfs->z_use_fuids) {
+			vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+			vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+			vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+		} else {
+			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+		}
 	}
+	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 }
 
 static int
 zfs_domount(vfs_t *vfsp, char *osname)
 {
-	dev_t mount_dev;
 	uint64_t recordsize, fsid_guid;
 	int error = 0;
 	zfsvfs_t *zfsvfs;
+	vnode_t *vp;
 
 	ASSERT(vfsp);
 	ASSERT(osname);
@@ -1155,26 +1317,37 @@ zfs_domount(vfs_t *vfsp, char *osname)
 	if (error)
 		return (error);
 	zfsvfs->z_vfs = vfsp;
-	zfsvfs->z_parent = zfsvfs;
-	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
-	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 
+#ifdef illumos
 	/* Initialize the generic filesystem structure. */
+	vfsp->vfs_bcount = 0;
 	vfsp->vfs_data = NULL;
 
 	if (zfs_create_unique_device(&mount_dev) == -1) {
-		error = ENODEV;
+		error = SET_ERROR(ENODEV);
 		goto out;
 	}
 	ASSERT(vfs_devismounted(mount_dev) == 0);
+#endif
 
 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
 	    NULL))
-	    goto out;
+		goto out;
+	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
+	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
 
-	vfsp->vfs_bsize = DEV_BSIZE;
-	vfsp->vfs_flag |= VFS_NOTRUNC;
 	vfsp->vfs_data = zfsvfs;
+#ifdef __FreeBSD_kernel__
+	vfsp->mnt_flag |= MNT_LOCAL;
+	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
+	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
+	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
+	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
+#endif
+#ifdef __NetBSD__
+	vfsp->mnt_flag |= MNT_LOCAL;
+	vfsp->mnt_iflag |= IMNT_MPSAFE;
+#endif
 
 	/*
 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
@@ -1186,11 +1359,16 @@ zfs_domount(vfs_t *vfsp, char *osname)
 	 */
 	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
+#ifdef __FreeBSD_kernel__
+	vfsp->vfs_fsid.val[0] = fsid_guid;
+	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
+	    vfsp->mnt_vfc->vfc_typenum & 0xFF;
+#endif
+#ifdef __NetBSD__
 	vfsp->mnt_stat.f_fsidx.__fsid_val[0] = fsid_guid;
-	vfsp->mnt_stat.f_fsidx.__fsid_val[1] = ((fsid_guid>>32) << 8) |
-	    zfsfstype & 0xFF;
-	
-	dprintf("zfs_domount vrele after vfsp->vfs_count %d\n", vfsp->vfs_count);
+	vfsp->mnt_stat.f_fsidx.__fsid_val[1] = fsid_guid >> 32;
+#endif
+
 	/*
 	 * Set features for file system.
 	 */
@@ -1214,6 +1392,7 @@ zfs_domount(vfs_t *vfsp, char *osname)
 			goto out;
 		xattr_changed_cb(zfsvfs, pval);
 		zfsvfs->z_issnap = B_TRUE;
+		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
 
 		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
 		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
@@ -1222,9 +1401,13 @@ zfs_domount(vfs_t *vfsp, char *osname)
 		error = zfsvfs_setup(zfsvfs, B_TRUE);
 	}
 
-	dprintf("zfs_vfsops.c zfs_domount called\n");	
-	dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count);
-	
+#ifdef __FreeBSD_kernel__
+	vfs_mountedfrom(vfsp, osname);
+#endif
+#ifdef __NetBSD__
+	set_statvfs_info("on-name", UIO_SYSSPACE, osname, UIO_SYSSPACE, "zfs", vfsp, curlwp);
+#endif
+
 	if (!zfsvfs->z_issnap)
 		zfsctl_create(zfsvfs);
 out:
@@ -1232,8 +1415,9 @@ out:
 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 		zfsvfs_free(zfsvfs);
 	} else {
-		atomic_add_32(&zfs_active_fs_count, 1);
+		atomic_inc_32(&zfs_active_fs_count);
 	}
+
 	return (error);
 }
 
@@ -1241,48 +1425,12 @@ void
 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
 {
 	objset_t *os = zfsvfs->z_os;
-	struct dsl_dataset *ds;
-
-	/*
-	 * Unregister properties.
-	 */
-	if (!dmu_objset_is_snapshot(os)) {
-		ds = dmu_objset_ds(os);
-		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
-		    zfsvfs) == 0);
 
-		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
-		    zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "aclinherit",
-		    acl_inherit_changed_cb, zfsvfs) == 0);
-
-		VERIFY(dsl_prop_unregister(ds, "vscan",
-		    vscan_changed_cb, zfsvfs) == 0);
-	}
+	if (!dmu_objset_is_snapshot(os))
+		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
 }
 
+#ifdef SECLABEL
 /*
  * Convert a decimal digit string to a uint64_t integer.
  */
@@ -1293,7 +1441,7 @@ str_to_uint64(char *str, uint64_t *objnu
 
 	while (*str) {
 		if (*str < '0' || *str > '9')
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 
 		num = num*10 + *str++ - '0';
 	}
@@ -1315,7 +1463,7 @@ zfs_parse_bootfs(char *bpath, char *outp
 	int error;
 
 	if (*bpath == 0 || *bpath == '/')
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	(void) strcpy(outpath, bpath);
 
@@ -1338,20 +1486,17 @@ zfs_parse_bootfs(char *bpath, char *outp
 	return (error);
 }
 
-
 /*
- * zfs_check_global_label:
- *	Check that the hex label string is appropriate for the dataset
- *	being mounted into the global_zone proper.
+ * Check that the hex label string is appropriate for the dataset being
+ * mounted into the global_zone proper.
  *
- *	Return an error if the hex label string is not default or
- *	admin_low/admin_high.  For admin_low labels, the corresponding
- *	dataset must be readonly.
+ * Return an error if the hex label string is not default or
+ * admin_low/admin_high.  For admin_low labels, the corresponding
+ * dataset must be readonly.
  */
 int
 zfs_check_global_label(const char *dsname, const char *hexsl)
 {
-#ifdef PORT_SOLARIS
 	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
 		return (0);
 	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
@@ -1362,30 +1507,23 @@ zfs_check_global_label(const char *dsnam
 
 		if (dsl_prop_get_integer(dsname,
 		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
-			return (EACCES);
+			return (SET_ERROR(EACCES));
 		return (rdonly ? 0 : EACCES);
 	}
-	return (EACCES);
-#else
-	return 0;
-#endif	
+	return (SET_ERROR(EACCES));
 }
 
 /*
- * zfs_mount_label_policy:
- *	Determine whether the mount is allowed according to MAC check.
- *	by comparing (where appropriate) label of the dataset against
- *	the label of the zone being mounted into.  If the dataset has
- *	no label, create one.
+ * Determine whether the mount is allowed according to MAC check,
+ * by comparing (where appropriate) the label of the dataset against
+ * the label of the zone being mounted into.  If the dataset has
+ * no label, create one.
  *
- *	Returns:
- *		 0 :	access allowed
- *		>0 :	error code, such as EACCES
+ * Returns 0 if access is allowed, an error otherwise (e.g., EACCES).
  */
 static int
 zfs_mount_label_policy(vfs_t *vfsp, char *osname)
 {
-#ifdef PORT_SOLARIS	
 	int		error, retv;
 	zone_t		*mntzone = NULL;
 	ts_label_t	*mnt_tsl;
@@ -1401,7 +1539,7 @@ zfs_mount_label_policy(vfs_t *vfsp, char
 	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
 	if (error)
-		return (EACCES);
+		return (SET_ERROR(EACCES));
 
 	/*
 	 * If labeling is NOT enabled, then disallow the mount of datasets
@@ -1411,7 +1549,7 @@ zfs_mount_label_policy(vfs_t *vfsp, char
 	if (!is_system_labeled()) {
 		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
 			return (0);
-		return (EACCES);
+		return (SET_ERROR(EACCES));
 	}
 
 	/*
@@ -1428,7 +1566,7 @@ zfs_mount_label_policy(vfs_t *vfsp, char
 
 		if (dsl_prop_get_integer(osname,
 		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
-			return (EACCES);
+			return (SET_ERROR(EACCES));
 		if (!zoned)
 			return (zfs_check_global_label(osname, ds_hexsl));
 		else
@@ -1452,8 +1590,9 @@ zfs_mount_label_policy(vfs_t *vfsp, char
 		char *str = NULL;
 
 		if (l_to_str_internal(mnt_sl, &str) == 0 &&
-		    dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
-		    ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0)
+		    dsl_prop_set_string(osname,
+		    zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+		    ZPROP_SRC_LOCAL, str) == 0)
 			retv = 0;
 		if (str != NULL)
 			kmem_free(str, strlen(str) + 1);
@@ -1475,12 +1614,10 @@ zfs_mount_label_policy(vfs_t *vfsp, char
 	label_rele(mnt_tsl);
 	zone_rele(mntzone);
 	return (retv);
-#else   /* PORT_SOLARIS */
-	return (0);
-#endif
 }
+#endif	/* SECLABEL */
 
-#ifndef __NetBSD__
+#ifdef OPENSOLARIS_MOUNTROOT
 static int
 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
 {
@@ -1501,7 +1638,7 @@ zfs_mountroot(vfs_t *vfsp, enum whymount
 	 */
 	if (why == ROOT_INIT) {
 		if (zfsrootdone++)
-			return (EBUSY);
+			return (SET_ERROR(EBUSY));
 		/*
 		 * the process of doing a spa_load will require the
 		 * clock to be set before we could (for example) do
@@ -1513,7 +1650,7 @@ zfs_mountroot(vfs_t *vfsp, enum whymount
 		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
 			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
 			    "bootfs name");
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 		}
 		zfs_devid = spa_get_bootprop("diskdevid");
 		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
@@ -1582,39 +1719,60 @@ out:
 	 * if "why" is equal to anything else other than ROOT_INIT,
 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
 	 */
-	return (ENOTSUP);
+	return (SET_ERROR(ENOTSUP));
+}
+#endif	/* OPENSOLARIS_MOUNTROOT */
+
+static int
+getpoolname(const char *osname, char *poolname)
+{
+	char *p;
+
+	p = strchr(osname, '/');
+	if (p == NULL) {
+		if (strlen(osname) >= MAXNAMELEN)
+			return (ENAMETOOLONG);
+		(void) strcpy(poolname, osname);
+	} else {
+		if (p - osname >= MAXNAMELEN)
+			return (ENAMETOOLONG);
+		(void) strncpy(poolname, osname, p - osname);
+		poolname[p - osname] = '\0';
+	}
+	return (0);
 }
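getpoolname() above is pure string handling, so it can be exercised verbatim in userland; this sketch substitutes a local MAXNAMELEN for the kernel's constant:

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define MAXNAMELEN	256

static int
getpoolname(const char *osname, char *poolname)
{
	const char *p = strchr(osname, '/');

	if (p == NULL) {
		if (strlen(osname) >= MAXNAMELEN)
			return (ENAMETOOLONG);
		(void) strcpy(poolname, osname);
	} else {
		if (p - osname >= MAXNAMELEN)
			return (ENAMETOOLONG);
		(void) strncpy(poolname, osname, p - osname);
		poolname[p - osname] = '\0';
	}
	return (0);
}

int
main(void)
{
	char pool[MAXNAMELEN];

	if (getpoolname("tank/home/user", pool) == 0)
		printf("%s\n", pool);	/* prints "tank" */
	return (0);
}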
-#endif /*__NetBSD__ */
 
 /*ARGSUSED*/
+#ifdef illumos
+static int
+zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+#endif
+#ifdef __FreeBSD_kernel__
+static int
+zfs_mount(vfs_t *vfsp)
+#endif
+#ifdef __NetBSD__
 static int
 zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len)
+#endif
 {
+	vnode_t		*mvp = vfsp->mnt_vnodecovered;
 	char		*osname;
-	pathname_t	spn;
-	vnode_t         *mvp = vfsp->mnt_vnodecovered;
-	struct mounta   *uap = data;
 	int		error = 0;
 	int		canwrite;
-	cred_t          *cr;
 
-	crget(cr);
-	dprintf("zfs_vfsops.c zfs_mount called\n");	
-	dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count);
+#ifdef illumos
 	if (mvp->v_type != VDIR)
-		return (ENOTDIR);
-
-	if (uap == NULL)
-		return (EINVAL);
+		return (SET_ERROR(ENOTDIR));
 
-	mutex_enter(mvp->v_interlock);
+	mutex_enter(&mvp->v_lock);
 	if ((uap->flags & MS_REMOUNT) == 0 &&
 	    (uap->flags & MS_OVERLAY) == 0 &&
 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
-		mutex_exit(mvp->v_interlock);
-		return (EBUSY);
+		mutex_exit(&mvp->v_lock);
+		return (SET_ERROR(EBUSY));
 	}
-	mutex_exit(mvp->v_interlock);
+	mutex_exit(&mvp->v_lock);
 
 	/*
 	 * ZFS does not support passing unparsed data in via MS_DATA.
@@ -1623,11 +1781,52 @@ zfs_mount(vfs_t *vfsp, const char *path,
 	 * can be interrogated.
 	 */
 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
+#endif /* illumos */
 
-	osname = PNBUF_GET();
+#ifdef __FreeBSD_kernel__
+	kthread_t	*td = curthread;
+	cred_t		*cr = td->td_ucred;
+
+	if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS))
+		return (SET_ERROR(EPERM));
 
+	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
+		return (SET_ERROR(EINVAL));
+
+	/*
+	 * If full-owner-access is enabled and delegated administration is
+	 * turned on, we must set nosuid.
+	 */
+	if (zfs_super_owner &&
+	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
+		secpolicy_fs_mount_clearopts(cr, vfsp);
+	}
+
+#endif /* __FreeBSD_kernel__ */
+
+#ifdef __NetBSD__
+	cred_t		*cr = CRED();
+	struct mounta	*uap = data;
+
+	if (uap == NULL)
+		return (SET_ERROR(EINVAL));
+
+	if (mvp->v_type != VDIR)
+		return (SET_ERROR(ENOTDIR));
+
+	mutex_enter(mvp->v_interlock);
+	if ((uap->flags & MS_REMOUNT) == 0 &&
+	    (uap->flags & MS_OVERLAY) == 0 &&
+	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+		mutex_exit(mvp->v_interlock);
+		return (SET_ERROR(EBUSY));
+	}
+	mutex_exit(mvp->v_interlock);
+
+	osname = PNBUF_GET();
 	strlcpy(osname, uap->fspec, strlen(uap->fspec) + 1);
+#endif /* __NetBSD__ */
 
 	/*
 	 * Check for mount privilege?
@@ -1637,8 +1836,10 @@ zfs_mount(vfs_t *vfsp, const char *path,
 	 */
 	error = secpolicy_fs_mount(cr, mvp, vfsp);
 	if (error) {
-		error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
-		if (error == 0) {
+		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
+			goto out;
+
+		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
 			vattr_t		vattr;
 
 			/*
@@ -1648,52 +1849,111 @@ zfs_mount(vfs_t *vfsp, const char *path,
 
 			vattr.va_mask = AT_UID;
 
-			if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
+#ifdef __FreeBSD_kernel__
+			vn_lock(mvp, LK_SHARED | LK_RETRY);
+			if (VOP_GETATTR(mvp, &vattr, cr)) {
+				VOP_UNLOCK(mvp, 0);
 				goto out;
 			}
 
-			if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
-			    VOP_ACCESS(mvp, VWRITE, cr) != 0) {
-				error = EPERM;
+			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
+			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
+				VOP_UNLOCK(mvp, 0);
+				goto out;
+			}
+			VOP_UNLOCK(mvp, 0);
+#endif
+#ifdef __NetBSD__
+			vn_lock(mvp, LK_SHARED | LK_RETRY);
+			if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
+				VOP_UNLOCK(mvp, 0);
 				goto out;
 			}
 
-/* XXX NetBSD			secpolicy_fs_mount_clearopts(cr, vfsp);*/
-		} else {
-			goto out;
+			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
+			    VOP_ACCESS(mvp, VWRITE, cr) != 0) {
+				VOP_UNLOCK(mvp, 0);
+				goto out;
+			}
+			VOP_UNLOCK(mvp, 0);
+#endif
 		}
+
+		secpolicy_fs_mount_clearopts(cr, vfsp);
 	}
 
 	/*
 	 * Refuse to mount a filesystem if we are in a local zone and the
 	 * dataset is not visible.
 	 */
-	if (!INGLOBALZONE(curproc) &&
+	if (!INGLOBALZONE(curthread) &&
 	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
-		error = EPERM;
+		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
+#ifdef SECLABEL
 	error = zfs_mount_label_policy(vfsp, osname);
 	if (error)
 		goto out;
+#endif
+
+#ifdef __FreeBSD_kernel__
+	vfsp->vfs_flag |= MNT_NFS4ACLS;
+#endif
+#ifdef __NetBSD__
+	vfsp->mnt_iflag |= IMNT_MPSAFE;
+#endif
 
 	/*
 	 * When doing a remount, we simply refresh our temporary properties
 	 * according to those options set in the current VFS options.
 	 */
-	if (uap->flags & MS_REMOUNT) {
-		/* refresh mount options */
-		zfs_unregister_callbacks(vfsp->vfs_data);
+	if (vfsp->vfs_flag & MS_REMOUNT) {
+		zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+		/*
+		 * Refresh mount options with z_teardown_lock blocking I/O while
+		 * the filesystem is in an inconsistent state.
+		 * The lock also serializes this code with filesystem
+		 * manipulations between entry to zfs_suspend_fs() and return
+		 * from zfs_resume_fs().
+		 */
+		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+		zfs_unregister_callbacks(zfsvfs);
 		error = zfs_register_callbacks(vfsp);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		goto out;
 	}
 
-	/* Mark ZFS as MP SAFE */
-	vfsp->mnt_iflag |= IMNT_MPSAFE;
-	
+#ifdef __FreeBSD_kernel__
+	/* Initial root mount: try hard to import the requested root pool. */
+	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
+	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
+		char pname[MAXNAMELEN];
+
+		error = getpoolname(osname, pname);
+		if (error == 0)
+			error = spa_import_rootpool(pname);
+		if (error)
+			goto out;
+	}
+#endif
+
+	DROP_GIANT();
 	error = zfs_domount(vfsp, osname);
+	PICKUP_GIANT();
+
+#ifdef illumos
+	/*
+	 * Add an extra VFS_HOLD on our parent vfs so that it can't
+	 * disappear due to a forced unmount.
+	 */
+	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
+		VFS_HOLD(mvp->v_vfsp);
+#endif
 
+#ifdef __NetBSD__
 	vfs_getnewfsid(vfsp);
 
 	/* setup zfs mount info */
@@ -1701,26 +1961,28 @@ zfs_mount(vfs_t *vfsp, const char *path,
 	    sizeof(vfsp->mnt_stat.f_mntfromname));
 	set_statvfs_info(path, UIO_USERSPACE, vfsp->mnt_stat.f_mntfromname,
 	    UIO_SYSSPACE, vfsp->mnt_op->vfs_name, vfsp, curlwp);
-
-	/*
-	 * Add an extra VFS_HOLD on our parent vfs so that it can't
-	 * disappear due to a forced unmount.
-	 */
-	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
-		VFS_HOLD(mvp->v_vfsp);
+#endif
 
 out:
-	PNBUF_PUT(osname);
 	return (error);
 }
 
+#ifdef __FreeBSD_kernel__
+static int
+zfs_statfs(vfs_t *vfsp, struct statfs *statp)
+#endif
+#ifdef __NetBSD__
 static int
 zfs_statvfs(vfs_t *vfsp, struct statvfs *statp)
+#endif
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	dev_t dev;
 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
 
+#ifdef __FreeBSD_kernel__
+	statp->f_version = STATFS_VERSION;
+#endif
+
 	ZFS_ENTER(zfsvfs);
 
 	dmu_objset_space(zfsvfs->z_os,
@@ -1731,8 +1993,11 @@ zfs_statvfs(vfs_t *vfsp, struct statvfs 
 	 * We report the fragsize as the smallest block size we support,
 	 * and we report our blocksize as the filesystem's maximum blocksize.
 	 */
-	statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
-	statp->f_bsize = zfsvfs->z_max_blksz;
+	statp->f_bsize = SPA_MINBLOCKSIZE;
+#ifdef __NetBSD__
+	statp->f_frsize = SPA_MINBLOCKSIZE;
+#endif
+	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
 
 	/*
 	 * The following report "total" blocks of various kinds in the
@@ -1741,7 +2006,7 @@ zfs_statvfs(vfs_t *vfsp, struct statvfs 
 	 */
 
 	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
-	statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
+	statp->f_bfree = availbytes / statp->f_bsize;
 	statp->f_bavail = statp->f_bfree; /* no root reservation */
 
 	/*
@@ -1753,51 +2018,62 @@ zfs_statvfs(vfs_t *vfsp, struct statvfs 
 	 * and the number of blocks (each object will take at least a block).
 	 */
 	statp->f_ffree = MIN(availobjs, statp->f_bfree);
+#ifndef __FreeBSD__
 	statp->f_favail = statp->f_ffree;	/* no "root reservation" */
+#endif
 	statp->f_files = statp->f_ffree + usedobjs;
 
+#ifdef __FreeBSD__
+	(void) cmpldev(&d32, vfsp->vfs_dev);
+	statp->f_fsid = d32;
+#endif
+#ifdef __NetBSD__
 	statp->f_fsid = vfsp->mnt_stat.f_fsidx.__fsid_val[0];
-	
+#endif
+
 	/*
 	 * We're a zfs filesystem.
 	 */
 	(void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
-	(void) strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
+
+	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
 	    sizeof(statp->f_mntfromname));
-	(void) strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
+	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
 	    sizeof(statp->f_mntonname));
 
+#ifdef __FreeBSD_kernel__
+	statp->f_namemax = MAXNAMELEN - 1;
+#endif
+#ifdef __NetBSD__
 	statp->f_namemax = ZFS_MAXNAMELEN;
-
-	/*
-	 * We have all of 32 characters to stuff a string here.
-	 * Is there anything useful we could/should provide?
-	 */
-#ifndef __NetBSD__	
-	bzero(statp->f_fstr, sizeof (statp->f_fstr));
 #endif
+
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
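To make the unit change above concrete: every count is now reported in SPA_MINBLOCKSIZE (512-byte) units, so f_bfree = availbytes / f_bsize agrees with the old right shift by SPA_MINBLOCKSHIFT. A worked example with fabricated space figures:

#include <stdint.h>
#include <stdio.h>

#define SPA_MINBLOCKSHIFT	9
#define SPA_MINBLOCKSIZE	(1ULL << SPA_MINBLOCKSHIFT)

int
main(void)
{
	uint64_t refdbytes  = 3ULL << 30;	/* 3 GiB referenced */
	uint64_t availbytes = 5ULL << 30;	/* 5 GiB available */
	uint64_t availobjs  = 1000000;

	uint64_t f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
	uint64_t f_bfree  = availbytes / SPA_MINBLOCKSIZE;
	uint64_t f_ffree  = (availobjs < f_bfree) ? availobjs : f_bfree;

	/* 8 GiB => 16777216 half-KiB blocks; 5 GiB => 10485760 free */
	printf("blocks=%llu bfree=%llu ffree=%llu\n",
	    (unsigned long long)f_blocks, (unsigned long long)f_bfree,
	    (unsigned long long)f_ffree);
	return (0);
}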
 
 static int
-zfs_root(vfs_t *vfsp, vnode_t **vpp)
+zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	znode_t *rootzp;
 	int error;
 
 	ZFS_ENTER(zfsvfs);
-	dprintf("zfs_root called\n");
+
 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
 	if (error == 0)
 		*vpp = ZTOV(rootzp);
-	dprintf("vpp -> %d, error %d -- %p\n", (*vpp)->v_type, error, *vpp);
+
 	ZFS_EXIT(zfsvfs);
-	if (error == 0)
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
-	KASSERT((error != 0) || (*vpp != NULL));
-	KASSERT((error != 0) || (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE));
+
+	if (error == 0) {
+		error = vn_lock(*vpp, flags);
+		if (error != 0) {
+			VN_RELE(*vpp);
+			*vpp = NULL;
+		}
+	}
 	return (error);
 }
 
@@ -1812,7 +2088,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea
 {
 	znode_t	*zp;
 
-	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 
 	if (!unmounting) {
 		/*
@@ -1822,6 +2098,9 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea
 		 * 'z_parent' is self referential for non-snapshots.
 		 */
 		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
+#ifdef FREEBSD_NAMECACHE
+		cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
+#endif
 	}
 
 	/*
@@ -1842,8 +2121,8 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea
 	 */
 	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
-		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
-		return (EIO);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+		return (SET_ERROR(EIO));
 	}
 
 	/*
@@ -1856,8 +2135,8 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
 	    zp = list_next(&zfsvfs->z_all_znodes, zp))
-		if (zp->z_dbuf) {
-			ASSERT(ZTOV(zp)->v_count > 0);
+		if (zp->z_sa_hdl) {
+			ASSERT(ZTOV(zp)->v_count >= 0);
 			zfs_znode_dmu_fini(zp);
 		}
 	mutex_exit(&zfsvfs->z_znodes_lock);
@@ -1869,7 +2148,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea
 	 */
 	if (unmounting) {
 		zfsvfs->z_unmounted = B_TRUE;
-		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 	}
 
@@ -1889,10 +2168,10 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea
 	/*
 	 * Evict cached data
 	 */
-	if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
+	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
+	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
-		(void) dmu_objset_evict_dbufs(zfsvfs->z_os);
-	}
+	dmu_objset_evict_dbufs(zfsvfs->z_os);
 
 	return (0);
 }
@@ -1903,32 +2182,22 @@ zfs_umount(vfs_t *vfsp, int fflag)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	objset_t *os;
-	int ret, flags = 0;
-	cred_t *cr;
-
-	vnode_t *vpp;
-	int counter;
-
-	counter = 0;
-	
-	dprintf("ZFS_UMOUNT called\n");
+	int ret;
+#ifdef __FreeBSD_kernel__
+	kthread_t *td = curthread;
+	cred_t *cr = td->td_ucred;
+#endif
+#ifdef __NetBSD__
+	cred_t *cr = CRED();
+#endif
 
-	/*TAILQ_FOREACH(vpp, &vfsp->mnt_vnodelist, v_mntvnodes) {
-		printf("vnode list vnode number %d -- vnode address %p\n", counter, vpp);
-		vprint("ZFS vfsp vnode list", vpp);
-		counter++;
-		} */
-	
-	crget(cr);
-#ifdef TODO	
 	ret = secpolicy_fs_unmount(cr, vfsp);
 	if (ret) {
-		ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
-		    ZFS_DELEG_PERM_MOUNT, cr);
-		if (ret)
+		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
+		    ZFS_DELEG_PERM_MOUNT, cr))
 			return (ret);
 	}
-#endif
+
 	/*
 	 * We purge the parent filesystem's vfsp as the parent filesystem
 	 * and all of its snapshots have their vnode's v_vfsp set to the
@@ -1941,12 +2210,35 @@ zfs_umount(vfs_t *vfsp, int fflag)
 	 * Unmount any snapshots mounted under .zfs before unmounting the
 	 * dataset itself.
 	 */
-	if (zfsvfs->z_ctldir != NULL &&
-	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
-		return (ret);
+	if (zfsvfs->z_ctldir != NULL) {
+		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
+			return (ret);
 	}
 
-#if 0
+	if (fflag & MS_FORCE) {
+		/*
+		 * Mark file system as unmounted before calling
+		 * vflush(FORCECLOSE). This way we ensure no future vnops
+		 * will be called and risk operating on DOOMED vnodes.
+		 */
+		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+		zfsvfs->z_unmounted = B_TRUE;
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+	}
+
+	/*
+	 * Flush all the files.
+	 */
+#ifdef __FreeBSD_kernel__
+	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
+#endif
+#ifdef __NetBSD__
+	ret = vflush(vfsp, NULL, (fflag & MS_FORCE) ? FORCECLOSE : 0);
+#endif
+	if (ret != 0)
+		return (ret);
+
+#ifdef illumos
 	if (!(fflag & MS_FORCE)) {
 		/*
 		 * Check the number of active vnodes in the file system.
@@ -1959,21 +2251,15 @@ zfs_umount(vfs_t *vfsp, int fflag)
 		 * reflected in the vnode count.
 		 */
 		if (zfsvfs->z_ctldir == NULL) {
-			if (vfsp->vfs_count > 1){
-				return (EBUSY);
-			}
+			if (vfsp->vfs_count > 1)
+				return (SET_ERROR(EBUSY));
 		} else {
 			if (vfsp->vfs_count > 2 ||
-			    zfsvfs->z_ctldir->v_count > 1) {
-				return (EBUSY);
-			}
+			    zfsvfs->z_ctldir->v_count > 1)
+				return (SET_ERROR(EBUSY));
 		}
 	}
 #endif
-	ret = vflush(vfsp, NULL, (ISSET(fflag, MS_FORCE)? FORCECLOSE : 0));
-	if (ret != 0)
-		return ret;
-	vfsp->vfs_flag |= VFS_UNMOUNTED;
 
 	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
 	os = zfsvfs->z_os;
@@ -2001,59 +2287,111 @@ zfs_umount(vfs_t *vfsp, int fflag)
 	 */
 	if (zfsvfs->z_ctldir != NULL)
 		zfsctl_destroy(zfsvfs);
+	zfs_freevfs(vfsp);
 
 	return (0);
 }
 
+#ifdef __FreeBSD_kernel__
+static int
+zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
+#endif
+#ifdef __NetBSD__
 static int
 zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp)
+#endif
 {
-	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	znode_t *zp;
-	int err;
+	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
+	znode_t		*zp;
+	int 		err;
+#ifdef __NetBSD__
+	int		flags = LK_EXCLUSIVE;
+#endif
+
+	/*
+	 * zfs_zget() can't operate on virtual entries like the .zfs/ or
+	 * .zfs/snapshot/ directories; that's why we return EOPNOTSUPP.
+	 * This will make NFS switch to LOOKUP instead of using VGET.
+	 */
+	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
+	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
+		return (EOPNOTSUPP);
 
-	dprintf("zfs_vget called\n");
-	dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count);
-	
 	ZFS_ENTER(zfsvfs);
 	err = zfs_zget(zfsvfs, ino, &zp);
 	if (err == 0 && zp->z_unlinked) {
 		VN_RELE(ZTOV(zp));
 		err = EINVAL;
 	}
-	if (err != 0)
-		*vpp = NULL;
-	else {
+	if (err == 0)
 		*vpp = ZTOV(zp);
-		/* XXX NetBSD how to get flags for vn_lock ? */
-		vn_lock(*vpp, 0);
-	}
 	ZFS_EXIT(zfsvfs);
+	if (err == 0)
+		err = vn_lock(*vpp, flags);
+	if (err != 0)
+		*vpp = NULL;
+
 	return (err);
 }
 
+#ifdef __FreeBSD_kernel__
 static int
-zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
+zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
+    struct ucred **credanonp, int *numsecflavors, int **secflavors)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
-	znode_t *zp;
-	uint64_t object = 0;
-	uint64_t fid_gen = 0;
-	uint64_t gen_mask;
-	uint64_t zp_gen;
-	int i, err;
+
+	/*
+	 * If this is a regular file system, vfsp is the same as
+	 * zfsvfs->z_parent->z_vfs; but if it is a snapshot,
+	 * zfsvfs->z_parent->z_vfs represents the parent file system,
+	 * which we have to use here because only that file system
+	 * has mnt_export configured.
+	 */
+	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
+	    credanonp, numsecflavors, secflavors));
+}
+
+CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
+CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
+#endif
+
+#ifdef __FreeBSD_kernel__
+static int
+zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
+#endif
+#ifdef __NetBSD__
+static int
+zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
+#endif
+{
+	struct componentname cn;
+	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
+	znode_t		*zp;
+	vnode_t		*dvp;
+	uint64_t	object = 0;
+	uint64_t	fid_gen = 0;
+	uint64_t	gen_mask;
+	uint64_t	zp_gen;
+	int 		i, err;
 
 	*vpp = NULL;
 
-	dprintf("zfs_fhtovp called\n");
-	dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count);
-	
+#ifdef __NetBSD__
+	return (SET_ERROR(ENOTSUP));
+#endif
+
 	ZFS_ENTER(zfsvfs);
 
-	if (fidp->fid_len == LONG_FID_LEN) {
-		zfid_long_t *zlfid = (zfid_long_t *)fidp;
-		uint64_t objsetid = 0;
-		uint64_t setgen = 0;
+#ifdef __FreeBSD_kernel__
+	/*
+	 * On FreeBSD we can get snapshot's mount point or its parent file
+	 * system mount point depending if snapshot is already mounted or not.
+	 */
+	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
+		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
+		uint64_t	objsetid = 0;
+		uint64_t	setgen = 0;
 
 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
@@ -2065,12 +2403,12 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vno
 
 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
 		if (err)
-			return (EINVAL);
+			return (SET_ERROR(EINVAL));
 		ZFS_ENTER(zfsvfs);
 	}
 
 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
-		zfid_short_t *zfid = (zfid_short_t *)fidp;
+		zfid_short_t	*zfid = (zfid_short_t *)fidp;
 
 		for (i = 0; i < sizeof (zfid->zf_object); i++)
 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
@@ -2079,26 +2417,46 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vno
 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
 	} else {
 		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
-	/* A zero fid_gen means we are in the .zfs control directories */
-	if (fid_gen == 0 &&
-	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
-		*vpp = zfsvfs->z_ctldir;
-		ASSERT(*vpp != NULL);
+	/*
+	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
+	 * directory tree. If the object == zfsvfs->z_shares_dir, then
+	 * we are in the .zfs/shares directory tree.
+	 */
+	if ((fid_gen == 0 &&
+	     (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
+	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
+		ZFS_EXIT(zfsvfs);
+		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
 		if (object == ZFSCTL_INO_SNAPDIR) {
-			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
-				0, NULL, NULL, NULL, NULL, NULL) == 0);
+			cn.cn_nameptr = "snapshot";
+			cn.cn_namelen = strlen(cn.cn_nameptr);
+			cn.cn_nameiop = LOOKUP;
+			cn.cn_flags = ISLASTCN | LOCKLEAF;
+			cn.cn_lkflags = flags;
+			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+			vput(dvp);
+		} else if (object == zfsvfs->z_shares_dir) {
+			/*
+			 * XXX This branch must not be taken;
+			 * if it is, the lookup below will
+			 * explode.
+			 */
+			cn.cn_nameptr = "shares";
+			cn.cn_namelen = strlen(cn.cn_nameptr);
+			cn.cn_nameiop = LOOKUP;
+			cn.cn_flags = ISLASTCN;
+			cn.cn_lkflags = flags;
+			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+			vput(dvp);
 		} else {
-			VN_HOLD(*vpp);
+			*vpp = dvp;
 		}
-		ZFS_EXIT(zfsvfs);
-		/* XXX: LK_RETRY? */
-		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
-		return (0);
+		return (0);
 	}
-
+#endif
 	gen_mask = -1ULL >> (64 - 8 * i);
 
 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
@@ -2106,28 +2464,37 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vno
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
-	zp_gen = zp->z_phys->zp_gen & gen_mask;
+	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+	    sizeof (uint64_t));
+	zp_gen = zp_gen & gen_mask;
 	if (zp_gen == 0)
 		zp_gen = 1;
 	if (zp->z_unlinked || zp_gen != fid_gen) {
 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
 		VN_RELE(ZTOV(zp));
 		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	*vpp = ZTOV(zp);
-	/* XXX: LK_RETRY? */
-	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 	ZFS_EXIT(zfsvfs);
-	return (0);
+#ifdef __FreeBSD_kernel__
+	err = vn_lock(*vpp, flags);
+	if (err == 0)
+		vnode_create_vobject(*vpp, zp->z_size, curthread);
+	else
+		*vpp = NULL;
+#endif
+	return (err);
 }
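The fid decoding loops in zfs_fhtovp() above rebuild the object number and generation a byte at a time because the file handle stores them as little-endian byte arrays, making the handle layout independent of host endianness; gen_mask then covers exactly the bytes the handle can carry. A standalone run of the same loops, with field sizes mirroring zfid_short_t (6 object bytes + 4 gen bytes) and fabricated handle contents:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint8_t zf_object[6] = { 0x39, 0x30, 0, 0, 0, 0 };	/* 12345 */
	uint8_t zf_gen[4]    = { 0x07, 0, 0, 0 };		/* gen 7 */
	uint64_t object = 0, fid_gen = 0, gen_mask;
	unsigned i;

	for (i = 0; i < sizeof (zf_object); i++)
		object |= ((uint64_t)zf_object[i]) << (8 * i);
	for (i = 0; i < sizeof (zf_gen); i++)
		fid_gen |= ((uint64_t)zf_gen[i]) << (8 * i);

	/* i == 4 after the gen loop, so the mask is 0xffffffff */
	gen_mask = -1ULL >> (64 - 8 * i);

	printf("object=%llu gen=%llu mask=%llx\n",
	    (unsigned long long)object, (unsigned long long)fid_gen,
	    (unsigned long long)gen_mask);	/* object=12345 gen=7 */
	return (0);
}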
 
 /*
  * Block out VOPs and close zfsvfs_t::z_os
  *
  * Note, if successful, then we return with the 'z_teardown_lock' and
- * 'z_teardown_inactive_lock' write held.
+ * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
  */
 int
 zfs_suspend_fs(zfsvfs_t *zfsvfs)
@@ -2136,57 +2503,70 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs)
 
 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
 		return (error);
-	dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 
 	return (0);
 }
 
 /*
- * Reopen zfsvfs_t::z_os and release VOPs.
+ * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
+ * is an invariant across any of the operations that can be performed while the
+ * filesystem was suspended.  Whether it succeeded or failed, the preconditions
+ * are the same: the relevant objset and associated dataset are owned by
+ * zfsvfs, held, and long held on entry.
  */
 int
-zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
+zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
 {
 	int err;
+	znode_t *zp;
 
-	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
+	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
 	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
 
-	err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs,
-	    &zfsvfs->z_os);
-	if (err) {
-		zfsvfs->z_os = NULL;
-	} else {
-		znode_t *zp;
+	/*
+	 * We already own this, so just update the objset_t, as the one we
+	 * had before may have been evicted.
+	 */
+	objset_t *os;
+	VERIFY3P(ds->ds_owner, ==, zfsvfs);
+	VERIFY(dsl_dataset_long_held(ds));
+	VERIFY0(dmu_objset_from_ds(ds, &os));
 
-		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+	err = zfsvfs_init(zfsvfs, os);
+	if (err != 0)
+		goto bail;
 
-		/*
-		 * Attempt to re-establish all the active znodes with
-		 * their dbufs.  If a zfs_rezget() fails, then we'll let
-		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
-		 * when they try to use their znode.
-		 */
-		mutex_enter(&zfsvfs->z_znodes_lock);
-		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
-		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
-			(void) zfs_rezget(zp);
-		}
-		mutex_exit(&zfsvfs->z_znodes_lock);
+	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
 
+	zfs_set_fuid_feature(zfsvfs);
+
+	/*
+	 * Attempt to re-establish all the active znodes with
+	 * their dbufs.  If a zfs_rezget() fails, then we'll let
+	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
+	 * when they try to use their znode.
+	 */
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+		(void) zfs_rezget(zp);
 	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
 
+bail:
 	/* release the VOPs */
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
-	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 
 	if (err) {
 		/*
-		 * Since we couldn't reopen zfsvfs::z_os, force
+		 * Since we couldn't setup the sa framework, try to force
 		 * unmount this file system.
 		 */
-		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
-			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curlwp);
+		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
+			vfs_ref(zfsvfs->z_vfs);
+			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
+		}
 	}
 	return (err);
 }
@@ -2196,76 +2576,76 @@ zfs_freevfs(vfs_t *vfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 
+#ifdef illumos
 	/*
 	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
-	 * from zfs_mount().  Release it here.
+	 * from zfs_mount().  Release it here.  If we came through
+	 * zfs_mountroot() instead, we didn't grab an extra hold, so
+	 * skip the VFS_RELE for rootvfs.
 	 */
-	if (zfsvfs->z_issnap)
+	if (zfsvfs->z_issnap && (vfsp != rootvfs))
 		VFS_RELE(zfsvfs->z_parent->z_vfs);
+#endif
 
 	zfsvfs_free(zfsvfs);
 
-	atomic_add_32(&zfs_active_fs_count, -1);
+	atomic_dec_32(&zfs_active_fs_count);
 }
 
-/*
- * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
- * so we can't safely do any non-idempotent initialization here.
- * Leave that to zfs_init() and zfs_fini(), which are called
- * from the module's _init() and _fini() entry points.
- */
-/*ARGSUSED*/
-int
-zfs_vfsinit(int fstype, char *name)
-{
-	int error;
-
-	zfsfstype = fstype;
+#ifdef __FreeBSD_kernel__
+#ifdef __i386__
+static int desiredvnodes_backup;
+#endif
 
-	/*
-	 * Setup vfsops and vnodeops tables.
-	 */
-	error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
+static void
+zfs_vnodes_adjust(void)
+{
+#ifdef __i386__
+	int newdesiredvnodes;
 
-	error = zfs_create_op_tables();
-	if (error) {
-		zfs_remove_op_tables();
-		cmn_err(CE_WARN, "zfs: bad vnode ops template");
-		vfs_freevfsops_by_type(zfsfstype);
-		return (error);
-	}
+	desiredvnodes_backup = desiredvnodes;
 
-	mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&zfs_debug_mtx, NULL, MUTEX_DEFAULT, NULL);
-	
 	/*
-	 * Unique major number for all zfs mounts.
-	 * If we run out of 32-bit minors, we'll getudev() another major.
+	 * We calculate newdesiredvnodes the same way it is done in
+	 * vntblinit(). If it is equal to desiredvnodes, it means that
+	 * it wasn't tuned by the administrator and we can tune it down.
 	 */
-	zfs_major = ddi_name_to_major(ZFS_DRIVER);
-	zfs_minor = ZFS_MIN_MINOR;
-
-	return (0);
+	newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
+	    vm_kmem_size / (5 * (sizeof(struct vm_object) +
+	    sizeof(struct vnode))));
+	if (newdesiredvnodes == desiredvnodes)
+		desiredvnodes = (3 * newdesiredvnodes) / 4;
+#endif
 }
 
-int
-zfs_vfsfini(void)
+static void
+zfs_vnodes_adjust_back(void)
 {
-	int err;
 
-	err = vfs_detach(&zfs_vfsops_template);
-	if (err != 0)
-		return err;
+#ifdef __i386__
+	desiredvnodes = desiredvnodes_backup;
+#endif
+}
+#endif /* __FreeBSD_kernel__ */
 
-	mutex_destroy(&zfs_debug_mtx);
-	mutex_destroy(&zfs_dev_mtx);	
+#ifdef __NetBSD__
+static void
+zfs_vnodes_adjust(void)
+{
+}
 
-	return 0;
+static void
+zfs_vnodes_adjust_back(void)
+{
 }
+#endif
 
 void
 zfs_init(void)
 {
+
+	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
+
 	/*
 	 * Initialize .zfs directory structures
 	 */
@@ -2276,6 +2656,13 @@ zfs_init(void)
 	 */
 	zfs_znode_init();
 
+	/*
+	 * Reduce number of vnodes. Originally number of vnodes is calculated
+	 * with UFS inode in mind. We reduce it here, because it's too big for
+	 * ZFS/i386.
+	 */
+	zfs_vnodes_adjust();
+
 	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
 }
 
@@ -2284,6 +2671,7 @@ zfs_fini(void)
 {
 	zfsctl_fini();
 	zfs_znode_fini();
+	zfs_vnodes_adjust_back();
 }
 
 int
@@ -2300,18 +2688,28 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64
 	dmu_tx_t *tx;
 
 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	if (newvers < zfsvfs->z_version)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
+
+	if (zfs_spa_version_map(newvers) >
+	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
+		return (SET_ERROR(ENOTSUP));
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+		    ZFS_SA_ATTRS);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+	}
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
+
 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
 	    8, 1, &newvers, tx);
 
@@ -2320,17 +2718,30 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64
 		return (error);
 	}
 
-	spa_history_internal_log(LOG_DS_UPGRADE,
-	    dmu_objset_spa(os), tx, CRED(),
-	    "oldver=%llu newver=%llu dataset = %llu",
-	    zfsvfs->z_version, newvers, dmu_objset_id(os));
+	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+		uint64_t sa_obj;
+
+		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+		    SPA_VERSION_SA);
+		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+		    DMU_OT_NONE, 0, tx);
+
+		error = zap_add(os, MASTER_NODE_OBJ,
+		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+		ASSERT0(error);
+
+		VERIFY(0 == sa_set_sa_object(os, sa_obj));
+		sa_register_update_callback(os, zfs_sa_upgrade);
+	}
+
+	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
+	    "from %llu to %llu", zfsvfs->z_version, newvers);
 
 	dmu_tx_commit(tx);
 
 	zfsvfs->z_version = newvers;
 
-	if (zfsvfs->z_version >= ZPL_VERSION_FUID)
-		zfs_set_fuid_feature(zfsvfs);
+	zfs_set_fuid_feature(zfsvfs);
 
 	return (0);
 }
@@ -2377,25 +2788,36 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t
 	return (error);
 }
 
-static int
-zfs_start(vfs_t *vfsp, int flags)
+#ifdef __FreeBSD_kernel__
+#ifdef _KERNEL
+void
+zfsvfs_update_fromname(const char *oldname, const char *newname)
 {
-
-	return (0);
+	char tmpbuf[MAXPATHLEN];
+	struct mount *mp;
+	char *fromname;
+	size_t oldlen;
+
+	oldlen = strlen(oldname);
+
+	mtx_lock(&mountlist_mtx);
+	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+		fromname = mp->mnt_stat.f_mntfromname;
+		if (strcmp(fromname, oldname) == 0) {
+			(void)strlcpy(fromname, newname,
+			    sizeof(mp->mnt_stat.f_mntfromname));
+			continue;
+		}
+		if (strncmp(fromname, oldname, oldlen) == 0 &&
+		    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
+			(void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s",
+			    newname, fromname + oldlen);
+			(void)strlcpy(fromname, tmpbuf,
+			    sizeof(mp->mnt_stat.f_mntfromname));
+			continue;
+		}
+	}
+	mtx_unlock(&mountlist_mtx);
 }
-
-
-#ifdef TODO
-static vfsdef_t vfw = {
-	VFSDEF_VERSION,
-	MNTTYPE_ZFS,
-	zfs_vfsinit,
-	VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
-	    VSW_XID,
-	&zfs_mntopts
-};
-
-struct modlfs zfs_modlfs = {
-	&mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
-};
+#endif
 #endif
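
The rewrite above distinguishes an exact dataset match from a prefix match,
and a prefix only counts when the next character is '/' (child dataset) or
'@' (snapshot), so renaming tank/a leaves tank/ab untouched.  A
self-contained userland sketch of the same f_mntfromname rewrite, using
hypothetical dataset names (strlcpy() as in the BSD libc):

#include <stdio.h>
#include <string.h>

#define	MAXPATHLEN	1024

static void
update_fromname(char *fromname, size_t fromsize,
    const char *oldname, const char *newname)
{
	char tmpbuf[MAXPATHLEN];
	size_t oldlen = strlen(oldname);

	if (strcmp(fromname, oldname) == 0) {
		(void)strlcpy(fromname, newname, fromsize);
		return;
	}
	if (strncmp(fromname, oldname, oldlen) == 0 &&
	    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
		(void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s",
		    newname, fromname + oldlen);
		(void)strlcpy(fromname, tmpbuf, fromsize);
	}
}

int
main(void)
{
	char names[3][MAXPATHLEN] = { "tank/a", "tank/a@snap", "tank/ab" };

	for (int i = 0; i < 3; i++) {
		update_fromname(names[i], sizeof(names[i]), "tank/a", "tank/b");
		printf("%s\n", names[i]);  /* tank/b, tank/b@snap, tank/ab */
	}
	return (0);
}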
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c,v
retrieving revision 1.26
diff -u -p -r1.26 zfs_vnops.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c	26 May 2017 14:20:59 -0000	1.26
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c	13 Sep 2017 07:55:33 -0000
@@ -19,11 +19,14 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/types.h>
 #include <sys/param.h>
@@ -32,6 +35,7 @@
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/vfs.h>
+#include <sys/vm.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/stat.h>
@@ -48,28 +52,39 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
+#include <sys/dmu_objset.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
+#include <sys/sa.h>
 #include <sys/dirent.h>
 #include <sys/policy.h>
 #include <sys/sunddi.h>
 #include <sys/filio.h>
+#include <sys/sid.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
-#include <sys/zfs_vfsops.h>
+#include <sys/zfs_sa.h>
 #include <sys/dnlc.h>
 #include <sys/zfs_rlock.h>
-#include <sys/extdirent.h>
-#include <sys/kidmap.h>
 #include <sys/buf.h>
 #include <sys/sched.h>
 #include <sys/acl.h>
-#include <sys/extattr.h>
+#include <sys/extdirent.h>
+
+#ifdef __FreeBSD__
+#include <sys/kidmap.h>
+#include <sys/bio.h>
+#include <vm/vm_param.h>
+#endif
 
 #ifdef __NetBSD__
 #include <miscfs/genfs/genfs.h>
+#include <miscfs/genfs/genfs_node.h>
+#include <uvm/uvm_extern.h>
+
+uint_t zfs_putpage_key;
 #endif
 
 /*
@@ -83,11 +98,11 @@
  * The ordering of events is important to avoid deadlocks and references
  * to freed memory.  The example below illustrates the following Big Rules:
  *
- *  (1) A check must be made in each zfs thread for a mounted file system.
+ *  (1)	A check must be made in each zfs thread for a mounted file system.
  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
- *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
- *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
- *      can return EIO from the calling function.
+ *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
+ *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
+ *	can return EIO from the calling function.
  *
  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
@@ -102,11 +117,18 @@
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
  *
- *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
- *	This is critical because we don't want to block while holding locks.
- *	Note, in particular, that if a lock is sometimes acquired before
- *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
- *	use a non-blocking assign can deadlock the system.  The scenario:
+ *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
+ *      dmu_tx_assign().  This is critical because we don't want to block
+ *      while holding locks.
+ *
+ *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
+ *	reduces lock contention and CPU usage when we must wait (note that if
+ *	throughput is constrained by the storage, nearly every transaction
+ *	must wait).
+ *
+ *      Note, in particular, that if a lock is sometimes acquired before
+ *      the tx assigns, and sometimes after (e.g. z_lock), then failing
+ *      to use a non-blocking assign can deadlock the system.  The scenario:
  *
  *	Thread A has grabbed a lock before calling dmu_tx_assign().
  *	Thread B is in an already-assigned tx, and blocks for this lock.
@@ -114,34 +136,39 @@
  *	forever, because the previous txg can't quiesce until B's tx commits.
  *
  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
- *	then drop all locks, call dmu_tx_wait(), and try again.
+ *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
+ *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
+ *	to indicate that this operation has already called dmu_tx_wait().
+ *	This will ensure that we don't retry forever, waiting a short bit
+ *	each time.
  *
  *  (5)	If the operation succeeded, generate the intent log entry for it
  *	before dropping locks.  This ensures that the ordering of events
  *	in the intent log matches the order in which they actually occurred.
- *      During ZIL replay the zfs_log_* functions will update the sequence
+ *	During ZIL replay the zfs_log_* functions will update the sequence
  *	number to indicate the zil transaction has replayed.
  *
  *  (6)	At the end of each vnode op, the DMU tx must always commit,
  *	regardless of whether there were any errors.
  *
- *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
+ *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
  *	to ensure that synchronous semantics are provided when necessary.
  *
  * In general, this is how things should be ordered in each vnode op:
  *
  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
  * top:
- *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
+ *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
- *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
+ *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
  *	if (error) {
  *		rw_exit(...);		// drop locks
- *		zfs_dirent_unlock(...);	// unlock directory entry
+ *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		VN_RELE(...);		// release held vnodes
  *		if (error == ERESTART) {
+ *			waited = B_TRUE;
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
@@ -155,9 +182,9 @@
  *		zfs_log_*(...);		// on success, make ZIL entry
  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
  *	rw_exit(...);			// drop locks
- *	zfs_dirent_unlock(dl, 0);	// unlock directory entry
+ *	zfs_dirent_unlock(dl);		// unlock directory entry
  *	VN_RELE(...);			// release held vnodes
- *	zil_commit(zilog, seq, foid);	// synchronous when necessary
+ *	zil_commit(zilog, foid);	// synchronous when necessary
  *	ZFS_EXIT(zfsvfs);		// finished in zfs
  *	return (error);			// done, report error
  */
@@ -167,23 +194,31 @@ static int
 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(*vpp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
-	if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 	    ((flag & FAPPEND) == 0)) {
-		return (EPERM);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
 	}
 
 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 	    ZTOV(zp)->v_type == VREG &&
-	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
-	    zp->z_phys->zp_size > 0)
-		if (fs_vscan(*vpp, cr, 0) != 0)
-			return (EACCES);
+	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
+		if (fs_vscan(*vpp, cr, 0) != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EACCES));
+		}
+	}
 
 	/* Keep a count of the synchronous opens in the znode */
 	if (flag & (FSYNC | FDSYNC))
 		atomic_inc_32(&zp->z_sync_cnt);
 
+	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
@@ -204,22 +239,19 @@ zfs_close(vnode_t *vp, int flag, int cou
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
-	dprintf("zfs_close called \n");
 	/* Decrement the synchronous opens in the znode */
 	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
 		atomic_dec_32(&zp->z_sync_cnt);
 
 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 	    ZTOV(zp)->v_type == VREG &&
-	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
-	    zp->z_phys->zp_size > 0)
+	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
 		VERIFY(fs_vscan(vp, cr, 1) == 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
-#ifdef PORT_NETBSD
 /*
  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
@@ -233,9 +265,9 @@ zfs_holey(vnode_t *vp, u_long cmd, offse
 	int error;
 	boolean_t hole;
 
-	file_sz = zp->z_phys->zp_size;
+	file_sz = zp->z_size;
 	if (noff >= file_sz)  {
-		return (ENXIO);
+		return (SET_ERROR(ENXIO));
 	}
 
 	if (cmd == _FIO_SEEK_HOLE)
@@ -245,16 +277,19 @@ zfs_holey(vnode_t *vp, u_long cmd, offse
 
 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
 
-	/* end of file? */
-	if ((error == ESRCH) || (noff > file_sz)) {
-		/*
-		 * Handle the virtual hole at the end of file.
-		 */
-		if (hole) {
-			*off = file_sz;
-			return (0);
-		}
-		return (ENXIO);
+	if (error == ESRCH)
+		return (SET_ERROR(ENXIO));
+
+	/*
+	 * We could find a hole that begins after the logical end-of-file,
+	 * because dmu_offset_next() only works on whole blocks.  If the
+	 * EOF falls mid-block, then indicate that the "virtual hole"
+	 * at the end of the file begins at the logical EOF, rather than
+	 * at the end of the last block.
+	 */
+	if (noff > file_sz) {
+		ASSERT(hole);
+		noff = file_sz;
 	}
 
 	if (noff < *off)
@@ -262,54 +297,220 @@ zfs_holey(vnode_t *vp, u_long cmd, offse
 	*off = noff;
 	return (error);
 }
-#endif /* PORT_NETBSD */
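
On platforms that expose the equivalent lseek(2) interface (SEEK_HOLE and
SEEK_DATA exist on illumos and FreeBSD; availability elsewhere varies), the
behaviour of zfs_holey() above, including the virtual hole at the logical
EOF, can be observed from userland.  A minimal sketch that walks a file's
data regions:

#include <sys/types.h>

#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	off_t off = 0, hole;
	int fd;

	if (argc != 2)
		errx(1, "usage: %s file", argv[0]);
	if ((fd = open(argv[1], O_RDONLY)) == -1)
		err(1, "open");

	/* lseek() fails with ENXIO once no data remains past off. */
	while ((off = lseek(fd, off, SEEK_DATA)) != -1) {
		if ((hole = lseek(fd, off, SEEK_HOLE)) == -1)
			break;
		printf("data: %jd..%jd\n", (intmax_t)off, (intmax_t)hole);
		off = hole;
	}
	close(fd);
	return (0);
}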
 
+/* ARGSUSED */
 static int
 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
     int *rvalp, caller_context_t *ct)
 {
 	offset_t off;
+	offset_t ndata;
+	dmu_object_info_t doi;
 	int error;
 	zfsvfs_t *zfsvfs;
 	znode_t *zp;
-	
+
 	switch (com) {
 	case _FIOFFS:
+	{
 		return (0);
-		
+
 		/*
 		 * The following two ioctls are used by bfu.  Faking out,
 		 * necessary to avoid bfu errors.
 		 */
+	}
 	case _FIOGDIO:
 	case _FIOSDIO:
+	{
 		return (0);
-#ifdef PORT_NETBSD /* XXX NetBSD Do we support holes in files ? */
+	}
+
 	case _FIO_SEEK_DATA:
 	case _FIO_SEEK_HOLE:
+	{
+#ifdef illumos
 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
-			return (EFAULT);
-		
+			return (SET_ERROR(EFAULT));
+#else
+		off = *(offset_t *)data;
+#endif
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
-		
+
 		/* offset parameter is in/out */
 		error = zfs_holey(vp, com, &off);
 		ZFS_EXIT(zfsvfs);
 		if (error)
 			return (error);
+#ifdef illumos
 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
-			return (EFAULT);
+			return (SET_ERROR(EFAULT));
+#else
+		*(offset_t *)data = off;
+#endif
+		return (0);
+	}
+#ifdef illumos
+	case _FIO_COUNT_FILLED:
+	{
+		/*
+		 * _FIO_COUNT_FILLED adds a new ioctl command which
+		 * exposes the number of filled blocks in a
+		 * ZFS object.
+		 */
+		zp = VTOZ(vp);
+		zfsvfs = zp->z_zfsvfs;
+		ZFS_ENTER(zfsvfs);
+		ZFS_VERIFY_ZP(zp);
+
+		/*
+		 * Wait for all dirty blocks for this object
+		 * to get synced out to disk, and the DMU info
+		 * updated.
+		 */
+		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
+		if (error) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+
+		/*
+		 * Retrieve fill count from DMU object.
+		 */
+		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
+		if (error) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+
+		ndata = doi.doi_fill_count;
+
+		ZFS_EXIT(zfsvfs);
+		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
+			return (SET_ERROR(EFAULT));
 		return (0);
+	}
 #endif
 	}
-	
-	return (ENOTTY);
+	return (SET_ERROR(ENOTTY));
+}
+
+#ifdef __FreeBSD__
+static vm_page_t
+page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
+{
+	vm_object_t obj;
+	vm_page_t pp;
+	int64_t end;
+
+	/*
+	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
+	 * aligned boundaries, if the range is not aligned.  As a result a
+	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
+	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
+	 * the whole page would be considered clean despite having dirty data.
+	 * For this reason we should shrink the range to DEV_BSIZE aligned
+	 * boundaries before calling vm_page_clear_dirty.
+	 */
+	end = rounddown2(off + nbytes, DEV_BSIZE);
+	off = roundup2(off, DEV_BSIZE);
+	nbytes = end - off;
+
+	obj = vp->v_object;
+	zfs_vmobject_assert_wlocked(obj);
+
+	for (;;) {
+		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+		    pp->valid) {
+			if (vm_page_xbusied(pp)) {
+				/*
+				 * Reference the page before unlocking and
+				 * sleeping so that the page daemon is less
+				 * likely to reclaim it.
+				 */
+				vm_page_reference(pp);
+				vm_page_lock(pp);
+				zfs_vmobject_wunlock(obj);
+				vm_page_busy_sleep(pp, "zfsmwb", true);
+				zfs_vmobject_wlock(obj);
+				continue;
+			}
+			vm_page_sbusy(pp);
+		} else if (pp != NULL) {
+			ASSERT(!pp->valid);
+			pp = NULL;
+		}
+
+		if (pp != NULL) {
+			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+			vm_object_pip_add(obj, 1);
+			pmap_remove_write(pp);
+			if (nbytes != 0)
+				vm_page_clear_dirty(pp, off, nbytes);
+		}
+		break;
+	}
+	return (pp);
+}
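
The shrink-to-DEV_BSIZE step above is pure arithmetic; a tiny userland
sketch with the usual power-of-two rounding macros shows which subrange
survives, and why a partially covered 512-byte subrange at either edge is
excluded rather than risk vm_page_clear_dirty() marking it fully clean:

#include <stdio.h>

#define	DEV_BSIZE	512
#define	rounddown2(x, y)	((x) & ~((y) - 1))
#define	roundup2(x, y)		(((x) + ((y) - 1)) & ~((y) - 1))

int
main(void)
{
	long long off = 100, nbytes = 1000;	/* bytes 100..1100 of a page */
	long long end = rounddown2(off + nbytes, DEV_BSIZE);

	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	/* prints 512..1024: only the fully covered subrange is cleared */
	printf("clear %lld..%lld (%lld bytes)\n", off, off + nbytes, nbytes);
	return (0);
}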
+
+static void
+page_unbusy(vm_page_t pp)
+{
+
+	vm_page_sunbusy(pp);
+	vm_object_pip_subtract(pp->object, 1);
+}
+
+static vm_page_t
+page_hold(vnode_t *vp, int64_t start)
+{
+	vm_object_t obj;
+	vm_page_t pp;
+
+	obj = vp->v_object;
+	zfs_vmobject_assert_wlocked(obj);
+
+	for (;;) {
+		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+		    pp->valid) {
+			if (vm_page_xbusied(pp)) {
+				/*
+				 * Reference the page before unlocking and
+				 * sleeping so that the page daemon is less
+				 * likely to reclaim it.
+				 */
+				vm_page_reference(pp);
+				vm_page_lock(pp);
+				zfs_vmobject_wunlock(obj);
+				vm_page_busy_sleep(pp, "zfsmwb", true);
+				zfs_vmobject_wlock(obj);
+				continue;
+			}
+
+			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+			vm_page_lock(pp);
+			vm_page_hold(pp);
+			vm_page_unlock(pp);
+
+		} else
+			pp = NULL;
+		break;
+	}
+	return (pp);
+}
+
+static void
+page_unhold(vm_page_t pp)
+{
+
+	vm_page_lock(pp);
+	vm_page_unhold(pp);
+	vm_page_unlock(pp);
 }
 
-#ifdef PORT_NETBSD
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
@@ -318,33 +519,109 @@ zfs_ioctl(vnode_t *vp, u_long com, intpt
  *		the page and the dmu buffer.
  */
 static void
-update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
+update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
+    int segflg, dmu_tx_t *tx)
 {
-	int64_t	off;
+	vm_object_t obj;
+	struct sf_buf *sf;
+	caddr_t va;
+	int off;
+
+	ASSERT(segflg != UIO_NOCOPY);
+	ASSERT(vp->v_mount != NULL);
+	obj = vp->v_object;
+	ASSERT(obj != NULL);
 
 	off = start & PAGEOFFSET;
-	dirbytes = 0;
-	VM_OBJECT_LOCK(obj);
+	zfs_vmobject_wlock(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
-		page_t *pp;
-		uint64_t nbytes = MIN(PAGESIZE - off, len);
+		vm_page_t pp;
+		int nbytes = imin(PAGESIZE - off, len);
 
-		if (pp = page_lookup(vp, start, SE_SHARED)) {
-			caddr_t va;
+		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
+			zfs_vmobject_wunlock(obj);
 
-			va = zfs_map_page(pp, S_WRITE);
-			(void) dmu_read(os, oid, start+off, nbytes, va+off,
-			    DMU_READ_PREFETCH);
-			zfs_unmap_page(pp, va);
-			page_unlock(pp);
+			va = zfs_map_page(pp, &sf);
+			(void) dmu_read(os, oid, start+off, nbytes,
+			    va+off, DMU_READ_PREFETCH);
+			zfs_unmap_page(sf);
+
+			zfs_vmobject_wlock(obj);
+			page_unbusy(pp);
 		}
 		len -= nbytes;
 		off = 0;
 	}
+	vm_object_pip_wakeupn(obj, 0);
+	zfs_vmobject_wunlock(obj);
+}
+
+/*
+ * Read with UIO_NOCOPY flag means that sendfile(2) requests
+ * ZFS to populate a range of page cache pages with data.
+ *
+ * NOTE: this function could be optimized to pre-allocate
+ * all pages in advance, drain exclusive busy on all of them,
+ * map them into a contiguous KVA region and populate them
+ * in a single dmu_read() call.
+ */
+static int
+mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
+{
+	znode_t *zp = VTOZ(vp);
+	objset_t *os = zp->z_zfsvfs->z_os;
+	struct sf_buf *sf;
+	vm_object_t obj;
+	vm_page_t pp;
+	int64_t start;
+	caddr_t va;
+	int len = nbytes;
+	int off;
+	int error = 0;
 
-	VM_OBJECT_UNLOCK(obj);
-	if (error == 0 && dirbytes > 0)
-		error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
+	ASSERT(uio->uio_segflg == UIO_NOCOPY);
+	ASSERT(vp->v_mount != NULL);
+	obj = vp->v_object;
+	ASSERT(obj != NULL);
+	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
+
+	zfs_vmobject_wlock(obj);
+	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
+		int bytes = MIN(PAGESIZE, len);
+
+		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
+		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
+		if (pp->valid == 0) {
+			zfs_vmobject_wunlock(obj);
+			va = zfs_map_page(pp, &sf);
+			error = dmu_read(os, zp->z_id, start, bytes, va,
+			    DMU_READ_PREFETCH);
+			if (bytes != PAGESIZE && error == 0)
+				bzero(va + bytes, PAGESIZE - bytes);
+			zfs_unmap_page(sf);
+			zfs_vmobject_wlock(obj);
+			vm_page_sunbusy(pp);
+			vm_page_lock(pp);
+			if (error) {
+				if (pp->wire_count == 0 && pp->valid == 0 &&
+				    !vm_page_busied(pp))
+					vm_page_free(pp);
+			} else {
+				pp->valid = VM_PAGE_BITS_ALL;
+				vm_page_activate(pp);
+			}
+			vm_page_unlock(pp);
+		} else {
+			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+			vm_page_sunbusy(pp);
+		}
+		if (error)
+			break;
+		uio->uio_resid -= bytes;
+		uio->uio_offset += bytes;
+		len -= bytes;
+	}
+	zfs_vmobject_wunlock(obj);
 	return (error);
 }
 
@@ -356,21 +633,18 @@ update_pages(vnode_t *vp, int64_t start,
  *		else we default from the dmu buffer.
  *
  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- *	the file is memory mapped.
+ *	 the file is memory mapped.
  */
 static int
 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 {
 	znode_t *zp = VTOZ(vp);
-	objset_t *os = zp->z_zfsvfs->z_os;
 	vm_object_t obj;
-	vm_page_t m;
-	struct sf_buf *sf;
-	int64_t start, off;
+	int64_t start;
 	caddr_t va;
 	int len = nbytes;
+	int off;
 	int error = 0;
-	uint64_t dirbytes;
 
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
@@ -378,77 +652,161 @@ mappedread(vnode_t *vp, int nbytes, uio_
 
 	start = uio->uio_loffset;
 	off = start & PAGEOFFSET;
-	dirbytes = 0;
-	VM_OBJECT_LOCK(obj);
+	zfs_vmobject_wlock(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+		vm_page_t pp;
 		uint64_t bytes = MIN(PAGESIZE - off, len);
 
-again:
-		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
-		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
-			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
-				goto again;
-			vm_page_busy(m);
-			VM_OBJECT_UNLOCK(obj);
-			if (dirbytes > 0) {
-				error = dmu_read_uio(os, zp->z_id, uio,
-				    dirbytes);
-				dirbytes = 0;
-			}
-			if (error == 0) {
-				sched_pin();
-				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
-				va = (caddr_t)sf_buf_kva(sf);
-				error = uiomove(va + off, bytes, UIO_READ, uio);
-				sf_buf_free(sf);
-				sched_unpin();
-			}
-			VM_OBJECT_LOCK(obj);
-			vm_page_wakeup(m);
-		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
-			/*
-			 * The code below is here to make sendfile(2) work
-			 * correctly with ZFS. As pointed out by ups@
-			 * sendfile(2) should be changed to use VOP_GETPAGES(),
-			 * but it pessimize performance of sendfile/UFS, that's
-			 * why I handle this special case in ZFS code.
-			 */
-			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
-				goto again;
-			vm_page_busy(m);
-			VM_OBJECT_UNLOCK(obj);
-			if (dirbytes > 0) {
-				error = dmu_read_uio(os, zp->z_id, uio,
-				    dirbytes);
-				dirbytes = 0;
-			}
-			if (error == 0) {
-				sched_pin();
-				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
-				va = (caddr_t)sf_buf_kva(sf);
-				error = dmu_read(os, zp->z_id, start + off,
-				    bytes, (void *)(va + off));
-				sf_buf_free(sf);
-				sched_unpin();
-			}
-			VM_OBJECT_LOCK(obj);
-			vm_page_wakeup(m);
-			if (error == 0)
-				uio->uio_resid -= bytes;
+		if (pp = page_hold(vp, start)) {
+			struct sf_buf *sf;
+			caddr_t va;
+
+			zfs_vmobject_wunlock(obj);
+			va = zfs_map_page(pp, &sf);
+#ifdef illumos
+			error = uiomove(va + off, bytes, UIO_READ, uio);
+#else
+			error = vn_io_fault_uiomove(va + off, bytes, uio);
+#endif
+			zfs_unmap_page(sf);
+			zfs_vmobject_wlock(obj);
+			page_unhold(pp);
+		} else {
+			zfs_vmobject_wunlock(obj);
+			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, bytes);
+			zfs_vmobject_wlock(obj);
+		}
+		len -= bytes;
+		off = 0;
+		if (error)
+			break;
+	}
+	zfs_vmobject_wunlock(obj);
+	return (error);
+}
+#endif /* __FreeBSD__ */
+
+#ifdef __NetBSD__
+
+caddr_t
+zfs_map_page(page_t *pp, enum seg_rw rw)
+{
+	vaddr_t va;
+	int flags;
+
+#ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS
+	if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va))
+		return (caddr_t)va;
+#endif
+
+	flags = UVMPAGER_MAPIN_WAITOK |
+		(rw == S_READ ? UVMPAGER_MAPIN_WRITE : UVMPAGER_MAPIN_READ);
+	va = uvm_pagermapin(&pp, 1, flags);
+	return (caddr_t)va;
+}
+
+void
+zfs_unmap_page(page_t *pp, caddr_t addr)
+{
+
+#ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS
+	vaddr_t va;
+
+	if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va))
+		return;
+#endif
+	uvm_pagermapout((vaddr_t)addr, 1);
+}
+
+static int
+mappedread(vnode_t *vp, int nbytes, uio_t *uio)
+{
+	znode_t *zp = VTOZ(vp);
+	struct uvm_object *uobj = &vp->v_uobj;
+	kmutex_t *mtx = uobj->vmobjlock;
+	int64_t start;
+	caddr_t va;
+	size_t len = nbytes;
+	int off;
+	int error = 0;
+	int npages, found;
+
+	start = uio->uio_loffset;
+	off = start & PAGEOFFSET;
+
+	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+		page_t *pp;
+		uint64_t bytes = MIN(PAGESIZE - off, len);
+
+		pp = NULL;
+		npages = 1;
+		mutex_enter(mtx);
+		found = uvn_findpages(uobj, start, &npages, &pp, UFP_NOALLOC);
+		mutex_exit(mtx);
+
+		/* XXXNETBSD shouldn't access userspace with the page busy */
+		if (found) {
+			va = zfs_map_page(pp, S_READ);
+			error = uiomove(va + off, bytes, UIO_READ, uio);
+			zfs_unmap_page(pp, va);
 		} else {
-			dirbytes += bytes;
+			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, bytes);
 		}
+
+		mutex_enter(mtx);
+		uvm_page_unbusy(&pp, 1);
+		mutex_exit(mtx);
+
 		len -= bytes;
 		off = 0;
 		if (error)
 			break;
 	}
-	VM_OBJECT_UNLOCK(obj);
-	if (error == 0 && dirbytes > 0)
-		error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
 	return (error);
 }
-#endif  /* PORT_NETBSD */
+
+static void
+update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
+    int segflg, dmu_tx_t *tx)
+{
+	struct uvm_object *uobj = &vp->v_uobj;
+	kmutex_t *mtx = uobj->vmobjlock;
+	caddr_t va;
+	int off;
+
+	ASSERT(vp->v_mount != NULL);
+
+	mutex_enter(mtx);
+
+	off = start & PAGEOFFSET;
+	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+		page_t *pp;
+		int nbytes = MIN(PAGESIZE - off, len);
+		int npages, found;
+
+		pp = NULL;
+		npages = 1;
+		found = uvn_findpages(uobj, start, &npages, &pp, UFP_NOALLOC);
+		if (found) {
+			mutex_exit(mtx);
+
+			va = zfs_map_page(pp, S_WRITE);
+			(void) dmu_read(os, oid, start + off, nbytes,
+			    va + off, DMU_READ_PREFETCH);
+			zfs_unmap_page(pp, va);
+
+			mutex_enter(mtx);
+			uvm_page_unbusy(&pp, 1);
+		}
+		len -= nbytes;
+		off = 0;
+	}
+	mutex_exit(mtx);
+}
+#endif /* __NetBSD__ */
+
 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 
 /*
@@ -463,8 +821,7 @@ offset_t zfs_read_chunk_size = 1024 * 10
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *
- *	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure.
  *
  * Side Effects:
  *	vp - atime updated if byte count > 0
@@ -475,21 +832,17 @@ zfs_read(vnode_t *vp, uio_t *uio, int io
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	objset_t	*os;
 	ssize_t		n, nbytes;
-	int		error;
+	int		error = 0;
 	rl_t		*rl;
 	xuio_t		*xuio = NULL;
 
-	dprintf("zfs_read called\n");
-	
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
-	os = zfsvfs->z_os;
 
-	if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
+	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
 		ZFS_EXIT(zfsvfs);
-		return (EACCES);
+		return (SET_ERROR(EACCES));
 	}
 
 	/*
@@ -497,7 +850,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int io
 	 */
 	if (uio->uio_loffset < (offset_t)0) {
 		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	/*
@@ -511,7 +864,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int io
 	/*
 	 * Check for mandatory locks
 	 */
-	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
+	if (MANDMODE(zp->z_mode)) {
 		if (error = chklock(vp, FREAD,
 		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
 			ZFS_EXIT(zfsvfs);
@@ -522,8 +875,9 @@ zfs_read(vnode_t *vp, uio_t *uio, int io
 	/*
 	 * If we're in FRSYNC mode, sync out this znode before reading it.
 	 */
-	if (ioflag & FRSYNC)
-		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
+	if (zfsvfs->z_log &&
+	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
+		zil_commit(zfsvfs->z_log, zp->z_id);
 
 	/*
 	 * Lock the range against changes.
@@ -534,14 +888,15 @@ zfs_read(vnode_t *vp, uio_t *uio, int io
 	 * If we are reading past end-of-file we can skip
 	 * to the end; but we might still need to set atime.
 	 */
-	if (uio->uio_loffset >= zp->z_phys->zp_size) {
+	if (uio->uio_loffset >= zp->z_size) {
 		error = 0;
 		goto out;
 	}
 
-	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
-	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
-#ifdef PORT_SOLARIS
+	ASSERT(uio->uio_loffset < zp->z_size);
+	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+
+#ifdef illumos
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 		int nblk;
@@ -565,24 +920,32 @@ zfs_read(vnode_t *vp, uio_t *uio, int io
 			 */
 			while (--nblk >= 0) {
 				(void) dmu_xuio_add(xuio,
-				    dmu_request_arcbuf(zp->z_dbuf, blksz),
-				    0, blksz);
+				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+				    blksz), 0, blksz);
 			}
 		}
 	}
-#endif
+#endif	/* illumos */
+
 	while (n > 0) {
 		nbytes = MIN(n, zfs_read_chunk_size -
 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 
-//		if (vn_has_cached_data(vp))
-//			error = mappedread(vp, nbytes, uio);
-//		else
-			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
+#ifdef __FreeBSD__
+		if (uio->uio_segflg == UIO_NOCOPY)
+			error = mappedread_sf(vp, nbytes, uio);
+		else
+#endif /* __FreeBSD__ */
+		if (vn_has_cached_data(vp)) {
+			error = mappedread(vp, nbytes, uio);
+		} else {
+			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, nbytes);
+		}
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
-				error = EIO;
+				error = SET_ERROR(EIO);
 			break;
 		}
 
@@ -597,70 +960,24 @@ out:
 }
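
zfs_read() above chops each request at zfs_read_chunk_size boundaries;
P2PHASE(x, align) is x modulo a power-of-two alignment, so the first chunk
only runs to the next boundary and later chunks are full-sized.  A small
sketch of the chunking arithmetic:

#include <stdio.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

int
main(void)
{
	long long chunk = 1024 * 1024;		/* zfs_read_chunk_size */
	long long off = 3 * chunk / 2;		/* start mid-chunk, at 1.5 MB */
	long long n = 3 * chunk;		/* bytes left to read */

	while (n > 0) {
		long long nbytes = MIN(n, chunk - P2PHASE(off, chunk));

		/* prints 0.5 MB, 1 MB, 1 MB, 0.5 MB */
		printf("read %lld bytes at offset %lld\n", nbytes, off);
		off += nbytes;
		n -= nbytes;
	}
	return (0);
}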
 
 /*
- * Fault in the pages of the first n bytes specified by the uio structure.
- * 1 byte in each page is touched and the uio struct is unmodified.
- * Any error will exit this routine as this is only a best
- * attempt to get the pages resident. This is a copy of ufs_trans_touch().
- */
-static void
-zfs_prefault_write(ssize_t n, struct uio *uio)
-{
-	struct iovec *iov;
-	ulong_t cnt, incr;
-	caddr_t p;
-	
-	if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
-		return;
-
-	iov = uio->uio_iov;
-
-	while (n) {
-		cnt = MIN(iov->iov_len, n);
-		if (cnt == 0) {
-			/* empty iov entry */
-			iov++;
-			continue;
-		}
-		n -= cnt;
-		/*
-		 * touch each page in this segment.
-		 */
-		p = iov->iov_base;
-		while (cnt) {
-			if (fubyte(p) == -1)
-				return;
-			incr = MIN(cnt, PAGESIZE);
-			p += incr;
-			cnt -= incr;
-		}
-		/*
-		 * touch the last byte in case it straddles a page.
-		 */
-		p--;
-		if (fubyte(p) == -1)
-			return;
-		iov++;
-	}
-}
-
-/*
  * Write the bytes to a file.
  *
  *	IN:	vp	- vnode of file to be written to.
  *		uio	- structure supplying write location, range info,
  *			  and data buffer.
- *		ioflag	- IO_APPEND flag set if in append mode.
+ *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
+ *			  set if in append mode.
  *		cr	- credentials of caller.
  *		ct	- caller context (NFS/CIFS fem monitor only)
  *
  *	OUT:	uio	- updated offset and range.
  *
- *	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - ctime|mtime updated if byte count > 0
  */
+
 /* ARGSUSED */
 static int
 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
@@ -677,17 +994,25 @@ zfs_write(vnode_t *vp, uio_t *uio, int i
 	ssize_t		n, nbytes;
 	rl_t		*rl;
 	int		max_blksz = zfsvfs->z_max_blksz;
-	uint64_t	pflags;
-	int		error;
+	int		error = 0;
 	arc_buf_t	*abuf;
-	iovec_t		*aiov;
+	iovec_t		*aiov = NULL;
 	xuio_t		*xuio = NULL;
 	int		i_iov = 0;
 	int		iovcnt = uio->uio_iovcnt;
 	iovec_t		*iovp = uio->uio_iov;
 	int		write_eof;
+	int		count = 0;
+	sa_bulk_attr_t	bulk[4];
+	uint64_t	mtime[2], ctime[2];
+	int		segflg;
 
-	dprintf("zfs_write called\n");
+#ifdef __NetBSD__
+	segflg = VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ?
+		UIO_SYSSPACE : UIO_USERSPACE;
+#else
+	segflg = uio->uio_segflg;
+#endif
 
 	/*
 	 * Fasttrack empty write
@@ -702,49 +1027,71 @@ zfs_write(vnode_t *vp, uio_t *uio, int i
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &zp->z_size, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, 8);
+
 	/*
-	 * If immutable or not appending then return EPERM
+	 * In the case where vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots),
+	 * our callers might not be able to properly detect that we are
+	 * read-only, so check it explicitly here.
 	 */
-	pflags = zp->z_phys->zp_flags;
-	if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
-	    ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
-	    (uio->uio_loffset < zp->z_phys->zp_size))) {
+	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		ZFS_EXIT(zfsvfs);
-		return (EPERM);
+		return (SET_ERROR(EROFS));
 	}
 
-	zilog = zfsvfs->z_log;
-
+	/*
+	 * If immutable or not appending then return EPERM
+	 */
+	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
+	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
+	    (uio->uio_loffset < zp->z_size))) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	zilog = zfsvfs->z_log;
+
 	/*
 	 * Validate file offset
 	 */
-	woff = ioflag & FAPPEND ? zp->z_phys->zp_size : uio->uio_loffset;
+	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
 	if (woff < 0) {
 		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Check for mandatory locks before calling zfs_range_lock()
 	 * in order to prevent a deadlock with locks set via fcntl().
 	 */
-	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
+	if (MANDMODE((mode_t)zp->z_mode) &&
 	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
+#ifdef illumos
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
 	 * Skip this if uio contains loaned arc_buf.
 	 */
-	zfs_prefault_write(n, uio);
+	if ((uio->uio_extflg == UIO_XUIO) &&
+	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
+		xuio = (xuio_t *)uio;
+	else
+		uio_prefaultpages(MIN(n, max_blksz), uio);
+#endif
 
 	/*
 	 * If in append mode, set the io offset pointer to eof.
 	 */
-	if (ioflag & IO_APPEND) {
+	if (ioflag & FAPPEND) {
 		/*
 		 * Obtain an appending range lock to guarantee file append
 		 * semantics.  We reset the write offset once we have the lock.
@@ -757,7 +1104,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int i
 			 * the file block size to increase.
 			 * Note that zp_size cannot change with this lock held.
 			 */
-			woff = zp->z_phys->zp_size;
+			woff = zp->z_size;
 		}
 		uio->uio_loffset = woff;
 	} else {
@@ -769,19 +1116,32 @@ zfs_write(vnode_t *vp, uio_t *uio, int i
 		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
 	}
 
+#ifdef illumos
 	if (woff >= limit) {
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
-		return (EFBIG);
+		return (SET_ERROR(EFBIG));
+	}
+
+#endif
+#ifdef __FreeBSD__
+	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
+		zfs_range_unlock(rl);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EFBIG));
 	}
+#endif
+#ifdef __NetBSD__
+	/* XXXNETBSD we might eventually need vn_rlimit_fsize() here too */
+#endif
 
 	if ((woff + n) > limit || woff > (limit - n))
 		n = limit - woff;
 
 	/* Will this write extend the file length? */
-	write_eof = (woff + n > zp->z_phys->zp_size);
+	write_eof = (woff + n > zp->z_size);
 
-	end_size = MAX(zp->z_phys->zp_size, woff + n);
+	end_size = MAX(zp->z_size, woff + n);
 
 	/*
 	 * Write the file in reasonable size chunks.  Each chunk is written
@@ -791,14 +1151,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int i
 	while (n > 0) {
 		abuf = NULL;
 		woff = uio->uio_loffset;
-again:
-		if (zfs_usergroup_overquota(zfsvfs,
-		    B_FALSE, zp->z_phys->zp_uid) ||
-		    zfs_usergroup_overquota(zfsvfs,
-		    B_TRUE, zp->z_phys->zp_gid)) {
+		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 			if (abuf != NULL)
 				dmu_return_arcbuf(abuf);
-			error = EDQUOT;
+			error = SET_ERROR(EDQUOT);
 			break;
 		}
 
@@ -814,7 +1171,7 @@ again:
 			    aiov->iov_len == arc_buf_size(abuf)));
 			i_iov++;
 		} else if (abuf == NULL && n >= max_blksz &&
-		    woff >= zp->z_phys->zp_size &&
+		    woff >= zp->z_size &&
 		    P2PHASE(woff, max_blksz) == 0 &&
 		    zp->z_blksz == max_blksz) {
 			/*
@@ -824,32 +1181,55 @@ again:
 			 * holding up the transaction if the data copy hangs
 			 * up on a pagefault (e.g., from an NFS server mapping).
 			 */
+#ifdef illumos
 			size_t cbytes;
+#endif
 
-			abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
+			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+			    max_blksz);
 			ASSERT(abuf != NULL);
 			ASSERT(arc_buf_size(abuf) == max_blksz);
+#ifdef illumos
 			if (error = uiocopy(abuf->b_data, max_blksz,
 			    UIO_WRITE, uio, &cbytes)) {
 				dmu_return_arcbuf(abuf);
 				break;
 			}
 			ASSERT(cbytes == max_blksz);
+#endif
+#ifdef __FreeBSD__
+			ssize_t resid = uio->uio_resid;
+
+			error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio);
+			if (error != 0) {
+				uio->uio_offset -= resid - uio->uio_resid;
+				uio->uio_resid = resid;
+				dmu_return_arcbuf(abuf);
+				break;
+			}
+#endif
+#ifdef __NetBSD__
+			ssize_t resid = uio->uio_resid;
+
+			error = uiomove(abuf->b_data, max_blksz, UIO_WRITE, uio);
+			if (error != 0) {
+				uio->uio_offset -= resid - uio->uio_resid;
+				uio->uio_resid = resid;
+				dmu_return_arcbuf(abuf);
+				break;
+			}
+#endif
 		}
 
 		/*
 		 * Start a transaction.
 		 */
 		tx = dmu_tx_create(zfsvfs->z_os);
-		dmu_tx_hold_bonus(tx, zp->z_id);
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
-		error = dmu_tx_assign(tx, TXG_NOWAIT);
+		zfs_sa_upgrade_txholds(tx, zp);
+		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
-			if (error == ERESTART) {
-				dmu_tx_wait(tx);
-				dmu_tx_abort(tx);
-				goto again;
-			}
 			dmu_tx_abort(tx);
 			if (abuf != NULL)
 				dmu_return_arcbuf(abuf);
@@ -866,8 +1246,14 @@ again:
 			uint64_t new_blksz;
 
 			if (zp->z_blksz > max_blksz) {
+				/*
+				 * File's blocksize is already larger than the
+				 * "recordsize" property.  Only let it grow to
+				 * the next power of 2.
+				 */
 				ASSERT(!ISP2(zp->z_blksz));
-				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+				new_blksz = MIN(end_size,
+				    1 << highbit64(zp->z_blksz));
 			} else {
 				new_blksz = MIN(end_size, max_blksz);
 			}
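
A file's block size can already exceed the dataset's "recordsize" if the
property was lowered after the file grew, in which case it is a legacy
non-power-of-two size.  The hunk above then only lets it grow to the next
power of two: highbit64() is the 1-based index of the highest set bit, so
1 << highbit64(x) is the next power of two above a non-power-of-two x.  A
sketch with a portable stand-in for highbit64():

#include <stdint.h>
#include <stdio.h>

static int
highbit64(uint64_t i)
{
	int h = 0;

	while (i != 0) {
		h++;
		i >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t blksz = 2560;	/* e.g. a legacy 5 * 512-byte block size */

	/* prints 2560 -> 4096: grow to the next power of two only */
	printf("%ju -> %ju\n", (uintmax_t)blksz,
	    (uintmax_t)(UINT64_C(1) << highbit64(blksz)));
	return (0);
}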
@@ -881,10 +1267,13 @@ again:
 		 */
 		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 
+		if (woff + nbytes > zp->z_size)
+			vnode_pager_setsize(vp, woff + nbytes);
+
 		if (abuf == NULL) {
 			tx_bytes = uio->uio_resid;
-			error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio,
-			    nbytes, tx);
+			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, nbytes, tx);
 			tx_bytes -= uio->uio_resid;
 		} else {
 			tx_bytes = nbytes;
@@ -904,22 +1293,26 @@ again:
 				xuio_stat_wbuf_copied();
 			} else {
 				ASSERT(xuio || tx_bytes == max_blksz);
-				dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
+				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
+				    woff, abuf, tx);
 			}
+#ifdef illumos
 			ASSERT(tx_bytes <= uio->uio_resid);
 			uioskip(uio, tx_bytes);
+#endif
 		}
-#ifdef PORT_SOLARIS
 		if (tx_bytes && vn_has_cached_data(vp)) {
-			update_pages(vp, woff,
-			    tx_bytes, zfsvfs->z_os, zp->z_id);
+			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
+			    zp->z_id, segflg, tx);
 		}
-#endif
+
 		/*
 		 * If we made no progress, we're done.  If we made even
 		 * partial progress, update the znode and ZIL accordingly.
 		 */
 		if (tx_bytes == 0) {
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+			    (void *)&zp->z_size, sizeof (uint64_t), tx);
 			dmu_tx_commit(tx);
 			ASSERT(error != 0);
 			break;
@@ -937,27 +1330,48 @@ again:
 		 * user 0 is not an ephemeral uid.
 		 */
 		mutex_enter(&zp->z_acl_lock);
-		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
+		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
 		    (S_IXUSR >> 6))) != 0 &&
-		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
-		    secpolicy_vnode_setid_retain(cr, (zp->z_phys->zp_mode & S_ISUID) != 0 && zp->z_phys->zp_uid == 0) != 0) {
-			zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
+		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
+		    secpolicy_vnode_setid_retain(vp, cr,
+		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
+			uint64_t newmode;
+			zp->z_mode &= ~(S_ISUID | S_ISGID);
+			newmode = zp->z_mode;
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+			    (void *)&newmode, sizeof (uint64_t), tx);
 		}
 		mutex_exit(&zp->z_acl_lock);
 
-		/*
-		 * Update time stamp.  NOTE: This marks the bonus buffer as
-		 * dirty, so we don't have to do it again for zp_size.
-		 */
-		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+		    B_TRUE);
 
 		/*
 		 * Update the file size (zp_size) if it has changed;
 		 * account for possible concurrent updates.
 		 */
-		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
-			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
+		while ((end_size = zp->z_size) < uio->uio_loffset) {
+			(void) atomic_cas_64(&zp->z_size, end_size,
 			    uio->uio_loffset);
+#ifdef illumos
+			ASSERT(error == 0);
+#else
+			ASSERT(error == 0 || error == EFAULT);
+#endif
+		}
+		/*
+		 * If we are replaying and eof is non zero then force
+		 * the file size to the specified eof. Note, there's no
+		 * concurrency during replay.
+		 */
+		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+			zp->z_size = zfsvfs->z_replay_eof;
+
+		if (error == 0)
+			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		else
+			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 		dmu_tx_commit(tx);
 
@@ -965,6 +1379,11 @@ again:
 			break;
 		ASSERT(tx_bytes == nbytes);
 		n -= nbytes;
+
+#ifdef illumos
+		if (!xuio && n > 0)
+			uio_prefaultpages(MIN(n, max_blksz), uio);
+#endif
 	}
 
 	zfs_range_unlock(rl);
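
The mode test in the setid-stripping block above leans on S_IXGRP and
S_IXOTH being S_IXUSR shifted right by 3 and 6 bits, so the first term
simply means "any execute bit is set".  A userland sketch of the rule, with
the secpolicy privilege check stubbed out:

#include <sys/stat.h>

#include <stdio.h>

static int
retain_setid_allowed(void)
{
	return (0);			/* pretend: unprivileged writer */
}

static mode_t
mode_after_write(mode_t mode)
{
	if ((mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 &&
	    (mode & (S_ISUID | S_ISGID)) != 0 &&
	    !retain_setid_allowed())
		mode &= ~(S_ISUID | S_ISGID);
	return (mode);
}

int
main(void)
{
	/* prints 0755: the setuid bit of 04755 is stripped by the write */
	printf("%04o\n", (unsigned)mode_after_write(04755));
	return (0);
}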
@@ -978,11 +1397,22 @@ again:
 		return (error);
 	}
 
-	if (ioflag & (FSYNC | FDSYNC))
-		zil_commit(zilog, zp->z_last_itx, zp->z_id);
+#ifdef __FreeBSD__
+	/*
+	 * EFAULT means that at least one page of the source buffer was not
+	 * available.  VFS will re-try remaining I/O upon this error.
+	 */
+	if (error == EFAULT) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+#endif
+
+	if (ioflag & (FSYNC | FDSYNC) ||
+	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, zp->z_id);
 
 	ZFS_EXIT(zfsvfs);
-
 	return (0);
 }
 
@@ -1001,7 +1431,7 @@ zfs_get_done(zgd_t *zgd, int error)
 	 * Release the vnode asynchronously as we currently have the
 	 * txg stopped from syncing.
 	 */
-	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
+	VN_RELE_CLEANER(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
 
 	if (error == 0 && zgd->zgd_bp)
 		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
@@ -1036,16 +1466,16 @@ zfs_get_data(void *arg, lr_write_t *lr, 
 	/*
 	 * Nothing to do if the file has been removed
 	 */
-	if (zfs_zget(zfsvfs, object, &zp) != 0)
-		return (ENOENT);
+	if (zfs_zget_cleaner(zfsvfs, object, &zp) != 0)
+		return (SET_ERROR(ENOENT));
 	if (zp->z_unlinked) {
 		/*
 		 * Release the vnode asynchronously as we currently have the
 		 * txg stopped from syncing.
 		 */
-		VN_RELE_ASYNC(ZTOV(zp),
+		VN_RELE_CLEANER(ZTOV(zp),
 		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
-		return (ENOENT);
+		return (SET_ERROR(ENOENT));
 	}
 
 	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
@@ -1062,8 +1492,8 @@ zfs_get_data(void *arg, lr_write_t *lr, 
 	if (buf != NULL) { /* immediate write */
 		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
 		/* test for truncation needs to be done while range locked */
-		if (offset >= zp->z_phys->zp_size) {
-			error = ENOENT;
+		if (offset >= zp->z_size) {
+			error = SET_ERROR(ENOENT);
 		} else {
 			error = dmu_read(os, object, offset, size, buf,
 			    DMU_READ_NO_PREFETCH);
@@ -1089,18 +1519,25 @@ zfs_get_data(void *arg, lr_write_t *lr, 
 			zfs_range_unlock(zgd->zgd_rl);
 		}
 		/* test for truncation needs to be done while range locked */
-		if (lr->lr_offset >= zp->z_phys->zp_size)
-			error = ENOENT;
+		if (lr->lr_offset >= zp->z_size)
+			error = SET_ERROR(ENOENT);
 #ifdef DEBUG
 		if (zil_fault_io) {
-			error = EIO;
+			error = SET_ERROR(EIO);
 			zil_fault_io = 0;
 		}
 #endif
 		if (error == 0)
-			error = dmu_buf_hold(os, object, offset, zgd, &db);
+			error = dmu_buf_hold(os, object, offset, zgd, &db,
+			    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
+			blkptr_t *obp = dmu_buf_get_blkptr(db);
+			if (obp) {
+				ASSERT(BP_IS_HOLE(bp));
+				*bp = *obp;
+			}
+
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 
@@ -1153,6 +1590,310 @@ zfs_access(vnode_t *vp, int mode, int fl
 	return (error);
 }
 
+#ifdef __FreeBSD__
+static int
+zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
+{
+	int error;
+
+	*vpp = arg;
+	error = vn_lock(*vpp, lkflags);
+	if (error != 0)
+		vrele(*vpp);
+	return (error);
+}
+
+static int
+zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
+{
+	znode_t *zdp = VTOZ(dvp);
+	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+	int error;
+	int ltype;
+
+	ASSERT_VOP_LOCKED(dvp, __func__);
+#ifdef DIAGNOSTIC
+	if ((zdp->z_pflags & ZFS_XATTR) == 0)
+		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+#endif
+
+	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+		ASSERT3P(dvp, ==, vp);
+		vref(dvp);
+		ltype = lkflags & LK_TYPE_MASK;
+		if (ltype != VOP_ISLOCKED(dvp)) {
+			if (ltype == LK_EXCLUSIVE)
+				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+			else /* if (ltype == LK_SHARED) */
+				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+			/*
+			 * Relock for the "." case could leave us with
+			 * reclaimed vnode.
+			 */
+			if (dvp->v_iflag & VI_DOOMED) {
+				vrele(dvp);
+				return (SET_ERROR(ENOENT));
+			}
+		}
+		return (0);
+	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+		/*
+		 * Note that in this case, dvp is the child vnode, and we
+		 * are looking up the parent vnode - exactly reverse from
+		 * normal operation.  Unlocking dvp requires some rather
+		 * tricky unlock/relock dance to prevent mp from being freed;
+		 * use vn_vget_ino_gen() which takes care of all that.
+		 *
+		 * XXX Note that there is a time window when both vnodes are
+		 * unlocked.  It is possible, although highly unlikely, that
+		 * during that window the parent-child relationship between
+		 * the vnodes may change, for example, get reversed.
+		 * In that case we would have a wrong lock order for the vnodes.
+		 * All other filesystems seem to ignore this problem, so we
+		 * do the same here.
+		 * A potential solution could be implemented as follows:
+		 * - using LK_NOWAIT when locking the second vnode and retrying
+		 *   if necessary
+		 * - checking that the parent-child relationship still holds
+		 *   after locking both vnodes and retrying if it doesn't
+		 */
+		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
+		return (error);
+	} else {
+		error = vn_lock(vp, lkflags);
+		if (error != 0)
+			vrele(vp);
+		return (error);
+	}
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ *	IN:	dvp	- vnode of directory to search.
+ *		nm	- name of entry to lookup.
+ *		pnp	- full pathname to lookup [UNUSED].
+ *		flags	- LOOKUP_XATTR set if looking for an attribute.
+ *		rdir	- root directory vnode [UNUSED].
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *
+ *	OUT:	vpp	- vnode of located entry, NULL if not found.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	NA
+ */
+/* ARGSUSED */
+static int
+zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
+    int nameiop, cred_t *cr, kthread_t *td, int flags)
+{
+	znode_t *zdp = VTOZ(dvp);
+	znode_t *zp;
+	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+	int	error = 0;
+
+	/* fast path (should be redundant with vfs namecache) */
+	if (!(flags & LOOKUP_XATTR)) {
+		if (dvp->v_type != VDIR) {
+			return (SET_ERROR(ENOTDIR));
+		} else if (zdp->z_sa_hdl == NULL) {
+			return (SET_ERROR(EIO));
+		}
+	}
+
+	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zdp);
+
+	*vpp = NULL;
+
+	if (flags & LOOKUP_XATTR) {
+#ifdef TODO
+		/*
+		 * If the xattr property is off, refuse the lookup request.
+		 */
+		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EINVAL));
+		}
+#endif
+
+		/*
+		 * We don't allow recursive attributes.
+		 * Maybe someday we will.
+		 */
+		if (zdp->z_pflags & ZFS_XATTR) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EINVAL));
+		}
+
+		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+
+		/*
+		 * Do we have permission to get into attribute directory?
+		 */
+		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
+		    B_FALSE, cr)) {
+			vrele(*vpp);
+			*vpp = NULL;
+		}
+
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Check accessibility of directory.
+	 */
+	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	/*
+	 * First handle the special cases.
+	 */
+	if ((cnp->cn_flags & ISDOTDOT) != 0) {
+		/*
+		 * If we are a snapshot mounted under .zfs, return
+		 * the vp for the snapshot directory.
+		 */
+		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
+			struct componentname cn;
+			vnode_t *zfsctl_vp;
+			int ltype;
+
+			ZFS_EXIT(zfsvfs);
+			ltype = VOP_ISLOCKED(dvp);
+			VOP_UNLOCK(dvp, 0);
+			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
+			    &zfsctl_vp);
+			if (error == 0) {
+				cn.cn_nameptr = "snapshot";
+				cn.cn_namelen = strlen(cn.cn_nameptr);
+				cn.cn_nameiop = cnp->cn_nameiop;
+				cn.cn_flags = cnp->cn_flags;
+				cn.cn_lkflags = cnp->cn_lkflags;
+				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
+				vput(zfsctl_vp);
+			}
+			vn_lock(dvp, ltype | LK_RETRY);
+			return (error);
+		}
+	}
+	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
+		ZFS_EXIT(zfsvfs);
+		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+			return (SET_ERROR(ENOTSUP));
+		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
+		return (error);
+	}
+
+	/*
+	 * The loop retries the lookup if the parent-child relationship
+	 * changes during the dot-dot locking complexities.
+	 */
+	for (;;) {
+		uint64_t parent;
+
+		error = zfs_dirlook(zdp, nm, &zp);
+		if (error == 0)
+			*vpp = ZTOV(zp);
+
+		ZFS_EXIT(zfsvfs);
+		if (error != 0)
+			break;
+
+		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+		if (error != 0) {
+			/*
+			 * If we've got a locking error, then the vnode
+			 * got reclaimed because of a force unmount.
+			 * We never enter doomed vnodes into the name cache.
+			 */
+			*vpp = NULL;
+			return (error);
+		}
+
+		if ((cnp->cn_flags & ISDOTDOT) == 0)
+			break;
+
+		ZFS_ENTER(zfsvfs);
+		if (zdp->z_sa_hdl == NULL) {
+			error = SET_ERROR(EIO);
+		} else {
+			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+			    &parent, sizeof (parent));
+		}
+		if (error != 0) {
+			ZFS_EXIT(zfsvfs);
+			vput(ZTOV(zp));
+			break;
+		}
+		if (zp->z_id == parent) {
+			ZFS_EXIT(zfsvfs);
+			break;
+		}
+		vput(ZTOV(zp));
+	}
+
+out:
+	if (error != 0)
+		*vpp = NULL;
+
+	/* Translate errors and add SAVENAME when needed. */
+	if (cnp->cn_flags & ISLASTCN) {
+		switch (nameiop) {
+		case CREATE:
+		case RENAME:
+			if (error == ENOENT) {
+				error = EJUSTRETURN;
+				cnp->cn_flags |= SAVENAME;
+				break;
+			}
+			/* FALLTHROUGH */
+		case DELETE:
+			if (error == 0)
+				cnp->cn_flags |= SAVENAME;
+			break;
+		}
+	}
+
+	/* Insert name into cache (as non-existent) if appropriate. */
+	if (zfsvfs->z_use_namecache &&
+	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
+		cache_enter(dvp, NULL, cnp);
+
+	/* Insert name into cache if appropriate. */
+	if (zfsvfs->z_use_namecache &&
+	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
+		if (!(cnp->cn_flags & ISLASTCN) ||
+		    (nameiop != DELETE && nameiop != RENAME)) {
+			cache_enter(dvp, *vpp, cnp);
+		}
+	}
+
+	return (error);
+}
+#endif /* __FreeBSD__ */
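
The name-cache policy at the tail of the FreeBSD zfs_lookup() above,
assuming z_use_namecache is on: cache a negative entry for an ENOENT lookup
that asked for caching, and a positive entry unless this is the final
component of a DELETE or RENAME, whose next step would invalidate the entry
immediately anyway.  A compact sketch of just that decision:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum nameiop { LOOKUP, CREATE, DELETE, RENAME };

static bool
should_cache(int error, bool makeentry, bool islastcn, enum nameiop op)
{
	if (!makeentry)
		return (false);
	if (error == ENOENT)			/* negative entry */
		return (true);
	if (error != 0)
		return (false);
	return (!islastcn || (op != DELETE && op != RENAME));
}

int
main(void)
{
	printf("%d %d %d\n",
	    should_cache(ENOENT, true, true, LOOKUP),	/* 1: negative */
	    should_cache(0, true, true, DELETE),	/* 0: going away */
	    should_cache(0, true, false, RENAME));	/* 1: not last */
	return (0);
}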
+
+#ifdef __NetBSD__
 /*
  * If vnode is for a device return a specfs vnode instead.
  */
@@ -1173,7 +1914,6 @@ specvp_check(vnode_t **vpp, cred_t *cr)
 	return (error);
 }
 
-
 /*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held vnode reference for it.
@@ -1199,20 +1939,20 @@ specvp_check(vnode_t **vpp, cred_t *cr)
 /* ARGSUSED */
 static int
 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
-    int flags, vnode_t *rdir, cred_t *cr,  caller_context_t *ct,
+    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
     int *direntflags, pathname_t *realpnp)
 {
 	znode_t *zdp = VTOZ(dvp);
+	znode_t *zp;
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 	int	error = 0;
 
 	/* fast path */
-	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
-
+	if (!(flags & LOOKUP_XATTR)) {
 		if (dvp->v_type != VDIR) {
 			return (ENOTDIR);
-		} else if (zdp->z_dbuf == NULL) {
-			return (EIO);
+		} else if (zdp->z_sa_hdl == NULL) {
+			return (SET_ERROR(EIO));
 		}
 
 		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
@@ -1265,7 +2005,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode
 		 * We don't allow recursive attributes.
 		 * Maybe someday we will.
 		 */
-		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
+		if (zdp->z_pflags & ZFS_XATTR) {
 			ZFS_EXIT(zfsvfs);
 			return (EINVAL);
 		}
@@ -1307,13 +2047,16 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode
 		return (EILSEQ);
 	}
 
-	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
-	if (error == 0)
-		error = specvp_check(vpp, cr);
+	error = zfs_dirlook(zdp, nm, &zp);
+	if (error == 0) {
+		*vpp = ZTOV(zp);
+		error = specvp_check(vpp, cr);
+	}
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
+#endif
 
 /*
  * Attempt to create a new entry in a directory.  If the entry
@@ -1328,12 +2071,11 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode
  *		cr	- credentials of caller.
  *		flag	- large file flag [UNUSED].
  *		ct	- caller context
- *		vsecp 	- ACL to be set
+ *		vsecp	- ACL to be set
  *
  *	OUT:	vpp	- vnode of created or trunc'd entry.
  *
- *	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated if new entry created
@@ -1343,30 +2085,38 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode
 /* ARGSUSED */
 static int
 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
-    vnode_t **vpp, cred_t *cr)
+    vnode_t **vpp, cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	objset_t	*os;
-	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
+	ksid_t		*ksid;
+	uid_t		uid;
+	gid_t		gid = crgetgid(cr);
+	zfs_acl_ids_t   acl_ids;
+	boolean_t	fuid_dirtied;
 	void		*vsecp = NULL;
 	int		flag = 0;
-	zfs_acl_ids_t	acl_ids;
-	boolean_t	fuid_dirtied;
+	uint64_t	txtype;
 
-	dprintf("zfs_create called\n");
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
+	ksid = crgetsid(cr, KSID_OWNER);
+	if (ksid)
+		uid = ksid_getid(ksid);
+	else
+		uid = crgetuid(cr);
+
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || (vap->va_mask & AT_XVATTR) ||
-	    IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
-		return (EINVAL);
+	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
@@ -1376,172 +2126,105 @@ zfs_create(vnode_t *dvp, char *name, vat
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
-		return (EILSEQ);
+		return (SET_ERROR(EILSEQ));
 	}
 
 	if (vap->va_mask & AT_XVATTR) {
-		if ((error = secpolicy_xvattr((xvattr_t *)vap,
+		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_type)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
-top:
+
 	*vpp = NULL;
 
 	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
 		vap->va_mode &= ~S_ISVTX;
 
-	if (*name == '\0') {
-		/*
-		 * Null component name refers to the directory itself.
-		 */
-		VN_HOLD(dvp);
-		zp = dzp;
-		dl = NULL;
-		error = 0;
-	} else {
-		/* possible VN_HOLD(zp) */
-		int zflg = 0;
-
-		if (flag & FIGNORECASE)
-			zflg |= ZCILOOK;
+	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+	if (error) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	ASSERT3P(zp, ==, NULL);
 
-		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
-		    NULL, NULL);
-		if (error) {
-			if (strcmp(name, "..") == 0)
-				error = EISDIR;
-			ZFS_EXIT(zfsvfs);
-			return (error);
-		}
+	/*
+	 * Create a new file object and update the directory
+	 * to reference it.
+	 */
+	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+		goto out;
 	}
-	if (zp == NULL) {
-		uint64_t txtype;
 
-		/*
-		 * Create a new file object and update the directory
-		 * to reference it.
-		 */
-		error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr);
-		if (error) {
-			goto out;
-		}
+	/*
+	 * We only support the creation of regular files in
+	 * extended attribute directories.
+	 */
 
-		/*
-		 * We only support the creation of regular files in
-		 * extended attribute directories.
-		 */
-		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
-		    (vap->va_type != VREG)) {
-			error = EINVAL;
-			goto out;
-		}
+	if ((dzp->z_pflags & ZFS_XATTR) &&
+	    (vap->va_type != VREG)) {
+		error = SET_ERROR(EINVAL);
+		goto out;
+	}
 
-		if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
-		    &acl_ids)) != 0)
-			goto out;
-		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
-			zfs_acl_ids_free(&acl_ids);
-			error = EDQUOT;
-			goto out;
-		}
+	if ((error = zfs_acl_ids_create(dzp, 0, vap,
+	    cr, vsecp, &acl_ids)) != 0)
+		goto out;
 
-		tx = dmu_tx_create(os);
-		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-		fuid_dirtied = zfsvfs->z_fuid_dirty;
-		if (fuid_dirtied)
-			zfs_fuid_txhold(zfsvfs, tx);
-		dmu_tx_hold_bonus(tx, dzp->z_id);
-		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-		if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
-			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
-			    0, SPA_MAXBLOCKSIZE);
-		}
-		error = dmu_tx_assign(tx, TXG_NOWAIT);
-		if (error) {
-			zfs_acl_ids_free(&acl_ids);
-			zfs_dirent_unlock(dl);
-			if (error == ERESTART) {
-				dmu_tx_wait(tx);
-				dmu_tx_abort(tx);
-				goto top;
-			}
-			dmu_tx_abort(tx);
-			ZFS_EXIT(zfsvfs);
-			return (error);
-		}
-		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+		zfs_acl_ids_free(&acl_ids);
+		error = SET_ERROR(EDQUOT);
+		goto out;
+	}
 
-		if (fuid_dirtied)
-			zfs_fuid_sync(zfsvfs, tx);
+	getnewvnode_reserve(1);
 
-		(void) zfs_link_create(dl, zp, tx, ZNEW);
+	tx = dmu_tx_create(os);
 
-		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
-		if (flag & FIGNORECASE)
-			txtype |= TX_CI;
-		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
-		    vsecp, acl_ids.z_fuidp, vap);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+	if (!zfsvfs->z_use_sa &&
+	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+		    0, acl_ids.z_aclp->z_acl_bytes);
+	}
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
 		zfs_acl_ids_free(&acl_ids);
-		dmu_tx_commit(tx);
-	} else {
-		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
+		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
-		/*
-		 * A directory entry already exists for this name.
-		 */
-		/*
-		 * Can't truncate an existing file if in exclusive mode.
-		 */
-		if (excl == EXCL) {
-			error = EEXIST;
-			goto out;
-		}
-		/*
-		 * Can't open a directory for writing.
-		 */
-		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
-			error = EISDIR;
-			goto out;
-		}
-		/*
-		 * Verify requested access to file.
-		 */
-		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
-			goto out;
-		}
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
 
-		mutex_enter(&dzp->z_lock);
-		dzp->z_seq++;
-		mutex_exit(&dzp->z_lock);
+	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+	    vsecp, acl_ids.z_fuidp, vap);
+	zfs_acl_ids_free(&acl_ids);
+	dmu_tx_commit(tx);
 
-		/*
-		 * Truncate regular files if requested.
-		 */
-		if ((ZTOV(zp)->v_type == VREG) &&
-		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
-			/* we can't hold any locks when calling zfs_freesp() */
-			zfs_dirent_unlock(dl);
-			dl = NULL;
-			error = zfs_freesp(zp, 0, 0, mode, TRUE);
-			if (error == 0) {
-				vnevent_create(ZTOV(zp), NULL);
-			}
-		}
-	}
-out:
-	if (dl)
-		zfs_dirent_unlock(dl);
+	getnewvnode_drop_reserve();
 
-	if (error) {
-		if (zp)
-			VN_RELE(ZTOV(zp));
-	} else {
+out:
+	if (error == 0) {
 		*vpp = ZTOV(zp);
-		error = specvp_check(vpp, cr);
 	}
 
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
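
Both zfs_create() hunks above retire the TXG_NOWAIT/ERESTART retry loop (the
old top: label) in favor of a blocking assignment.  A minimal sketch of the
pattern the new code uses throughout this file; the holds shown are
illustrative, not copied from any one call site:

	tx = dmu_tx_create(os);
	/* declare every object the operation may dirty */
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);

	error = dmu_tx_assign(tx, TXG_WAIT);	/* sleeps; never ERESTART */
	if (error) {
		dmu_tx_abort(tx);		/* nothing modified yet */
		return (error);
	}
	/* modify only what was held above, then: */
	dmu_tx_commit(tx);

With TXG_WAIT, dmu_tx_assign() blocks until the transaction fits in an open
txg, so the caller never sees ERESTART and the goto-driven re-lookup of the
directory entry disappears.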
@@ -1555,59 +2238,37 @@ out:
  *		ct	- caller context
  *		flags	- case flags
  *
- *	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime
  *	 vp - ctime (if nlink > 0)
  */
 /*ARGSUSED*/
 static int
-zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
-    int flags)
+zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
 {
-	znode_t		*zp, *dzp = VTOZ(dvp);
-	znode_t		*xzp = NULL;
-	vnode_t		*vp;
+	znode_t		*dzp = VTOZ(dvp);
+	znode_t		*zp = VTOZ(vp);
+	znode_t		*xzp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	acl_obj, xattr_obj;
-	zfs_dirlock_t	*dl;
+	uint64_t	obj = 0;
 	dmu_tx_t	*tx;
-	boolean_t	may_delete_now, delete_now = FALSE;
 	boolean_t	unlinked, toobig = FALSE;
 	uint64_t	txtype;
-	pathname_t	*realnmp = NULL;
-	pathname_t	realnm;
 	int		error;
-	int		zflg = ZEXISTS;
 
-	dprintf("zfs_remove called\n");
-	
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
+	ZFS_VERIFY_ZP(zp);
 	zilog = zfsvfs->z_log;
 
-	if (flags & FIGNORECASE) {
-		zflg |= ZCILOOK;
-		pn_alloc(&realnm);
-		realnmp = &realnm;
-	}
-
-top:
-	/*
-	 * Attempt to lock directory; fail if entry doesn't exist.
-	 */
-	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
-	    NULL, realnmp)) {
-		if (realnmp)
-			pn_free(realnmp);
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
-
-	vp = ZTOV(zp);
+	xattr_obj = 0;
+	xzp = NULL;
 
 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
 		goto out;
@@ -1617,18 +2278,21 @@ top:
 	 * Need to use rmdir for removing directories.
 	 */
 	if (vp->v_type == VDIR) {
-		error = EPERM;
+		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 	vnevent_remove(vp, dvp, name, ct);
 
-	if (realnmp)
-		dnlc_remove(dvp, realnmp->pn_buf);
-	else
-		dnlc_remove(dvp, name);
+	obj = zp->z_id;
 
-	may_delete_now = FALSE;
+	/* are there any extended attributes? */
+	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+	    &xattr_obj, sizeof (xattr_obj));
+	if (error == 0 && xattr_obj) {
+		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+		ASSERT0(error);
+	}
 
 	/*
 	 * We may delete the znode now, or we may put it in the unlinked set;
@@ -1638,40 +2302,25 @@ top:
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
-	dmu_tx_hold_bonus(tx, zp->z_id);
-	if (may_delete_now) {
-		toobig =
-		    zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
-		/* if the file is too big, only hold_free a token amount */
-		dmu_tx_hold_free(tx, zp->z_id, 0,
-		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
-	}
-
-	/* are there any extended attributes? */
-	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
-		/* XXX - do we need this if we are deleting? */
-		dmu_tx_hold_bonus(tx, xattr_obj);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	zfs_sa_upgrade_txholds(tx, dzp);
+
+	if (xzp) {
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 	}
 
-	/* are there any additional acls */
-	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
-	    may_delete_now)
-		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
-
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	/*
+	 * Mark this transaction as typically resulting in a net free of space
+	 */
+	dmu_tx_mark_netfree(tx);
+
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		zfs_dirent_unlock(dl);
-		VN_RELE(vp);
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
-		if (realnmp)
-			pn_free(realnmp);
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
@@ -1680,66 +2329,29 @@ top:
 	/*
 	 * Remove the directory entry.
 	 */
-	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
+	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
 
 	if (error) {
 		dmu_tx_commit(tx);
 		goto out;
 	}
 
-	if (0 && unlinked) {
-		KASSERT(0);	/* NetBSD: must not happen now */
-		VI_LOCK(vp);
-		delete_now = may_delete_now && !toobig &&
-		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
-		    zp->z_phys->zp_xattr == xattr_obj &&
-		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
-		VI_UNLOCK(vp);
-		}
-
-	if (delete_now) {
-		KASSERT(0);	/* NetBSD: must not happen now */
-		if (zp->z_phys->zp_xattr) {
-			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
-			ASSERT3U(error, ==, 0);
-			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
-			dmu_buf_will_dirty(xzp->z_dbuf, tx);
-			mutex_enter(&xzp->z_lock);
-			xzp->z_unlinked = 1;
-			xzp->z_phys->zp_links = 0;
-			mutex_exit(&xzp->z_lock);
-			zfs_unlinked_add(xzp, tx);
-			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
-		}
-		mutex_enter(&zp->z_lock);
-		VI_LOCK(vp);
-		vp->v_count--;
-		ASSERT3U(vp->v_count, ==, 0);
-		VI_UNLOCK(vp);
-		mutex_exit(&zp->z_lock);
-		zfs_znode_delete(zp, tx);
-	} else if (unlinked) {
+	if (unlinked) {
 		zfs_unlinked_add(zp, tx);
+		vp->v_vflag |= VV_NOSYNC;
 	}
 
 	txtype = TX_REMOVE;
-	if (flags & FIGNORECASE)
-		txtype |= TX_CI;
-	zfs_log_remove(zilog, tx, txtype, dzp, name);
+	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
 
 	dmu_tx_commit(tx);
 out:
-	if (realnmp)
-		pn_free(realnmp);
 
-	zfs_dirent_unlock(dl);
+	if (xzp)
+		vrele(ZTOV(xzp));
 
-	if (!delete_now) {
-		VN_RELE(vp);
-	} else if (xzp) {
-		/* this rele is delayed to prevent nesting transactions */
-		VN_RELE(ZTOV(xzp));
-	}
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
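
The extended-attribute probe above is typical of the z_phys retirement in
this patch: single-attribute reads now go through the znode's system
attribute (SA) handle instead of dereferencing the bonus buffer.  A sketch
of the idiom, assuming the standard SA_ZPL_* registrations:

	uint64_t xattr_obj = 0;

	/* replaces the old zp->z_phys->zp_xattr dereference */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj != 0)
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);

sa_lookup() copies the value out under the SA code's own locking, which is
part of why the old z_lock acquisitions around attribute reads can go away.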
@@ -1754,12 +2366,12 @@ out:
  *		vap	- attributes of new directory.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		vsecp	- ACL to be set
  *
  *	OUT:	vpp	- vnode of created directory.
  *
- *	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
@@ -1767,18 +2379,18 @@ out:
  */
 /*ARGSUSED*/
 static int
-zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
-    caller_context_t *ct, int flags, vsecattr_t *vsecp)
+zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
-	zfs_dirlock_t	*dl;
 	uint64_t	txtype;
 	dmu_tx_t	*tx;
 	int		error;
-	int		zf = ZNEW;
-	zfs_acl_ids_t	acl_ids;
+	ksid_t		*ksid;
+	uid_t		uid;
+	gid_t		gid = crgetgid(cr);
+	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 
 	ASSERT(vap->va_type == VDIR);
@@ -1788,88 +2400,96 @@ zfs_mkdir(vnode_t *dvp, char *dirname, v
 	 * make sure file system is at proper version
 	 */
 
+	ksid = crgetsid(cr, KSID_OWNER);
+	if (ksid)
+		uid = ksid_getid(ksid);
+	else
+		uid = crgetuid(cr);
 	if (zfsvfs->z_use_fuids == B_FALSE &&
-	    (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))||
-	    IS_EPHEMERAL(crgetgid(cr))))
-		return (EINVAL);
+	    ((vap->va_mask & AT_XVATTR) ||
+	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
-	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
+	if (dzp->z_pflags & ZFS_XATTR) {
 		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(dirname,
 	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
-		return (EILSEQ);
+		return (SET_ERROR(EILSEQ));
 	}
-	if (flags & FIGNORECASE)
-		zf |= ZCILOOK;
 
-	if (vap->va_mask & AT_XVATTR)
-		if ((error = secpolicy_xvattr((xvattr_t *)vap,
+	if (vap->va_mask & AT_XVATTR) {
+		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_type)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
+	}
+
+	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+	    NULL, &acl_ids)) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
 
 	/*
 	 * First make sure the new directory doesn't exist.
+	 *
+	 * Existence is checked first to make sure we don't return
+	 * EACCES instead of EEXIST, which can cause some applications
+	 * to fail.
 	 */
-top:
 	*vpp = NULL;
 
-	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
-	    NULL, NULL)) {
+	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
+		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
+	ASSERT3P(zp, ==, NULL);
 
 	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
-		zfs_dirent_unlock(dl);
+		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
-	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
-	    &acl_ids)) != 0) {
-		zfs_dirent_unlock(dl);
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
-		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
-		return (EDQUOT);
+		return (SET_ERROR(EDQUOT));
 	}
 
 	/*
 	 * Add a new entry to the directory.
 	 */
+	getnewvnode_reserve(1);
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
-	if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
-		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
-		    0, SPA_MAXBLOCKSIZE);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+		    acl_ids.z_aclp->z_acl_bytes);
+	}
+
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
-		zfs_dirent_unlock(dl);
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
@@ -1877,27 +2497,30 @@ top:
 	/*
 	 * Create new node.
 	 */
-	zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
+
 	/*
 	 * Now put new name in parent dir.
 	 */
-	(void) zfs_link_create(dl, zp, tx, ZNEW);
+	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
 
 	*vpp = ZTOV(zp);
 
-	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
-	if (flags & FIGNORECASE)
-		txtype |= TX_CI;
-	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
+	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
+	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
 	    acl_ids.z_fuidp, vap);
 
 	zfs_acl_ids_free(&acl_ids);
+
 	dmu_tx_commit(tx);
 
-	zfs_dirent_unlock(dl);
+	getnewvnode_drop_reserve();
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
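
The getnewvnode_reserve(1)/getnewvnode_drop_reserve() bracket now surrounds
every node-creating operation.  This is a FreeBSD KPI (assumed here to be
supplied by the compat layer in this port): it pre-reserves a vnode so
zfs_mknode() cannot sleep in vnode allocation while the transaction is
assigned.  Sketched:

	getnewvnode_reserve(1);			/* may sleep; do before the tx */
	tx = dmu_tx_create(zfsvfs->z_os);
	/* declare holds as usual */
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();	/* always pair the reserve */
		return (error);
	}
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);	/* consumes the vnode */
	dmu_tx_commit(tx);
	getnewvnode_drop_reserve();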
@@ -1915,116 +2538,68 @@ top:
  *		ct	- caller context
  *		flags	- case flags
  *
- *	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  */
 /*ARGSUSED*/
 static int
-zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
-    caller_context_t *ct, int flags)
+zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
 {
 	znode_t		*dzp = VTOZ(dvp);
-	znode_t		*zp;
-	vnode_t		*vp;
+	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
-	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
-	int		zflg = ZEXISTS;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
+	ZFS_VERIFY_ZP(zp);
 	zilog = zfsvfs->z_log;
 
-	if (flags & FIGNORECASE)
-		zflg |= ZCILOOK;
-top:
-	zp = NULL;
-
-	/*
-	 * Attempt to lock directory; fail if entry doesn't exist.
-	 */
-	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
-	    NULL, NULL)) {
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
-
-	vp = ZTOV(zp);
 
 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
 		goto out;
 	}
 
 	if (vp->v_type != VDIR) {
-		error = ENOTDIR;
-		goto out;
-	}
-
-	if (vp == cwd) {
-		error = EINVAL;
+		error = SET_ERROR(ENOTDIR);
 		goto out;
 	}
 
 	vnevent_rmdir(vp, dvp, name, ct);
 
-	/*
-	 * Grab a lock on the directory to make sure that noone is
-	 * trying to add (or lookup) entries while we are removing it.
-	 */
-	rw_enter(&zp->z_name_lock, RW_WRITER);
-
-	/*
-	 * Grab a lock on the parent pointer to make sure we play well
-	 * with the treewalk and directory rename code.
-	 */
-	rw_enter(&zp->z_parent_lock, RW_WRITER);
-
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
-	dmu_tx_hold_bonus(tx, zp->z_id);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	zfs_sa_upgrade_txholds(tx, zp);
+	zfs_sa_upgrade_txholds(tx, dzp);
+	dmu_tx_mark_netfree(tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		rw_exit(&zp->z_parent_lock);
-		rw_exit(&zp->z_name_lock);
-		zfs_dirent_unlock(dl);
-		VN_RELE(vp);
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
-	/* Purge cache entries, while still holding locks. */
 	cache_purge(dvp);
-	cache_purge(vp);
 
-	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
+	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
 
 	if (error == 0) {
 		uint64_t txtype = TX_RMDIR;
-		if (flags & FIGNORECASE)
-			txtype |= TX_CI;
-		zfs_log_remove(zilog, tx, txtype, dzp, name);
+		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
 	}
 
 	dmu_tx_commit(tx);
 
-	rw_exit(&zp->z_parent_lock);
-	rw_exit(&zp->z_name_lock);
+	cache_purge(vp);
 out:
-	zfs_dirent_unlock(dl);
-
-	VN_RELE(vp);
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
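
zfs_remove() and zfs_rmdir() now share the same transaction setup.  The
piece worth noting is dmu_tx_mark_netfree(): it tells the DMU this
transaction should result in a net free of space, so it is allowed to
proceed even on a nearly full pool, where an unlink would otherwise be
refused with ENOSPC.  The shared shape, as a sketch:

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);	/* directory entry */
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);		/* room for an SA layout upgrade */
	zfs_sa_upgrade_txholds(tx, dzp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);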
@@ -2033,7 +2608,7 @@ out:
 /*
  * Read as many directory entries as will fit into the provided
  * buffer from the given directory cursor position (specified in
- * the uio structure.
+ * the uio structure).
  *
  *	IN:	vp	- vnode of directory to read.
  *		uio	- structure supplying read location, range info,
@@ -2045,8 +2620,7 @@ out:
  *	OUT:	uio	- updated offset and range, buffer filled.
  *		eofp	- set to true if end-of-file detected.
  *
- *	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - atime updated
@@ -2058,7 +2632,7 @@ out:
  */
 /* ARGSUSED */
 static int
-zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
+zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, off_t **cookies)
 {
 	znode_t		*zp = VTOZ(vp);
 	iovec_t		*iovp;
@@ -2072,21 +2646,32 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 	zap_attribute_t	zap;
 	uint_t		bytes_wanted;
 	uint64_t	offset; /* must be unsigned; checks for < 1 */
+	uint64_t	parent;
 	int		local_eof;
 	int		outcount;
 	int		error;
 	uint8_t		prefetch;
 	boolean_t	check_sysattrs;
 	uint8_t		type;
-	int		ncooks;
-	u_long		*cooks = NULL;
+	int		ncooks = 0;
+	off_t		*cooks = NULL;
 	int		flags = 0;
+#ifdef __FreeBSD__
+	boolean_t user = uio->uio_segflg != UIO_SYSSPACE;
+#endif
+#ifdef __NetBSD__
+	boolean_t user = !VMSPACE_IS_KERNEL_P(uio->uio_vmspace);
+#endif
 
-	dprintf("zfs_readdir called\n");
-	
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (parent))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
 	/*
 	 * If we are not given an eof variable,
 	 * use a local one.
@@ -2099,7 +2684,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 	 */
 	if (uio->uio_iov->iov_len <= 0) {
 		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	/*
@@ -2114,7 +2699,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 	os = zfsvfs->z_os;
 	offset = uio->uio_loffset;
 	prefetch = zp->z_zn_prefetch;
-	
+
 	/*
 	 * Initialize the iterator cursor.
 	 */
@@ -2135,13 +2720,13 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 	 */
 	iovp = uio->uio_iov;
 	bytes_wanted = iovp->iov_len;
-	if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_iovcnt != 1) {
+	if (user || uio->uio_iovcnt != 1) {
 		bufsize = bytes_wanted;
 		outbuf = kmem_alloc(bufsize, KM_SLEEP);
-		memset(outbuf, 0, bufsize);
 		odp = (struct dirent64 *)outbuf;
 	} else {
 		bufsize = bytes_wanted;
+		outbuf = NULL;
 		odp = (struct dirent64 *)iovp->iov_base;
 	}
 	eodp = (struct edirent *)odp;
@@ -2150,11 +2735,14 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 		/*
 		 * Minimum entry size is dirent size and 1 byte for a file name.
 		 */
+#ifdef __FreeBSD__
+		ncooks = uio->uio_resid / (sizeof(struct dirent) -
+		    sizeof(((struct dirent *)NULL)->d_name) + 1);
+		cooks = malloc(ncooks * sizeof(off_t), M_TEMP, M_WAITOK);
+#endif
+#ifdef __NetBSD__
 		ncooks = uio->uio_resid / _DIRENT_MINSIZE(odp);
-//		    sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); 
-		cooks = kmem_alloc(ncooks * sizeof(u_long), KM_SLEEP);
-		
-		memset(cooks, 0, ncooks * sizeof(u_long));
+		cooks = kmem_alloc(ncooks * sizeof(off_t), KM_SLEEP);
+#endif
 		*cookies = cooks;
 		*ncookies = ncooks;
 	}
@@ -2180,7 +2768,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 	while (outcount < bytes_wanted) {
 		ino64_t objnum;
 		ushort_t reclen;
-		off64_t *next;
+		off64_t *next = NULL;
 
 		/*
 		 * Special case `.', `..', and `.zfs'.
@@ -2193,7 +2781,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 		} else if (offset == 1) {
 			(void) strcpy(zap.za_name, "..");
 			zap.za_normalization_conflict = 0;
-			objnum = zp->z_phys->zp_parent;
+			objnum = parent;
 			type = DT_DIR;
 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
 			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
@@ -2217,7 +2805,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 				    "entry, obj = %lld, offset = %lld\n",
 				    (u_longlong_t)zp->z_id,
 				    (u_longlong_t)offset);
-				error = ENXIO;
+				error = SET_ERROR(ENXIO);
 				goto update;
 			}
 
@@ -2247,16 +2835,16 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
 				goto skip_entry;
 			if (!zfs_has_access(ezp, cr)) {
-				VN_RELE(ZTOV(ezp));
+				vrele(ZTOV(ezp));
 				goto skip_entry;
 			}
-			VN_RELE(ZTOV(ezp));
+			vrele(ZTOV(ezp));
 		}
 
 		if (flags & V_RDDIR_ENTFLAGS)
 			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
 		else
-			reclen = _DIRENT_RECLEN(odp, strlen(zap.za_name));
+			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
 
 		/*
 		 * Will this entry fit in the buffer?
@@ -2266,7 +2854,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 			 * Did we manage to fit anything in the buffer?
 			 */
 			if (!outcount) {
-				error = EINVAL;
+				error = SET_ERROR(EINVAL);
 				goto update;
 			}
 			break;
@@ -2297,11 +2885,12 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 		}
 		outcount += reclen;
 
-		KASSERT(outcount <= bufsize);
+		ASSERT(outcount <= bufsize);
 
 		/* Prefetch znode */
 		if (prefetch)
-			dmu_prefetch(os, objnum, 0, 0);
+			dmu_prefetch(os, objnum, 0, 0, 0,
+			    ZIO_PRIORITY_SYNC_READ);
 
 	skip_entry:
 		/*
@@ -2317,7 +2906,12 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 		if (cooks != NULL) {
 			*cooks++ = offset;
 			ncooks--;
-			KASSERT(ncooks >= 0);
+#ifdef __FreeBSD__
+			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
+#endif
+#ifdef __NetBSD__
+			KASSERTMSG(ncooks >= 0, "ncooks=%d", ncooks);
+#endif
 		}
 	}
 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
@@ -2326,11 +2920,11 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 	if (ncookies != NULL)
 		*ncookies -= ncooks;
 
-	if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace) && uio->uio_iovcnt == 1) {
+	if (!user && uio->uio_iovcnt == 1) {
 		iovp->iov_base += outcount;
 		iovp->iov_len -= outcount;
 		uio->uio_resid -= outcount;
-	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
+	} else if (error = uiomove(outbuf, (size_t)outcount, UIO_READ, uio)) {
 		/*
 		 * Reset the pointer.
 		 */
@@ -2339,7 +2933,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre
 
 update:
 	zap_cursor_fini(&zc);
-	if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_iovcnt != 1)
+	if (user || uio->uio_iovcnt != 1)
 		kmem_free(outbuf, bufsize);
 
 	if (error == ENOENT)
@@ -2350,7 +2944,12 @@ update:
 	uio->uio_loffset = offset;
 	ZFS_EXIT(zfsvfs);
 	if (error != 0 && cookies != NULL) {
-		kmem_free(*cookies, ncooks * sizeof(u_long));
+#ifdef __FreeBSD__
+		free(*cookies, M_TEMP);
+#endif
+#ifdef __NetBSD__
+		kmem_free(*cookies, ncooks * sizeof(off_t));
+#endif
 		*cookies = NULL;
 		*ncookies = 0;
 	}
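
The cookie buffer is allocated with a different allocator per platform, and
the error path above must mirror the allocation exactly: NetBSD's
kmem_free() wants the original size, FreeBSD's free() does not.
Hypothetical helpers (not in the patch) that would keep the pairing in one
place, using off_t throughout to match the new prototype:

	static off_t *
	dircookies_alloc(int ncooks)
	{
	#ifdef __FreeBSD__
		return (malloc(ncooks * sizeof(off_t), M_TEMP, M_WAITOK));
	#else /* __NetBSD__ */
		return (kmem_alloc(ncooks * sizeof(off_t), KM_SLEEP));
	#endif
	}

	static void
	dircookies_free(off_t *cooks, int ncooks)
	{
	#ifdef __FreeBSD__
		free(cooks, M_TEMP);
	#else /* __NetBSD__ */
		kmem_free(cooks, ncooks * sizeof(off_t));
	#endif
	}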
@@ -2364,44 +2963,19 @@ zfs_fsync(vnode_t *vp, int syncflag, cre
 {
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	int error;
 
-	error = 0;
-
-	dprintf("zfs_fsync called vp %p -- zfsvfs %p\n", vp, zfsvfs);
 	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
 
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-	/*
-	 * NetBSD: if the sync is from reclaim or from ioflush,
-	 * push dirty atime now.  No need to lock: in the reclaim
-	 * case, everything is single threaded and for ioflush this
-	 * is a lazy writeback.
-	 *
-	 * XXXNETBSD: in the ioflush case, we don't want to push anything
-	 * to disk immediately.  We just want to queue the update so it
-	 * will happen "soon".  Check this is the case otherwise zfs will
-	 * perform poorly.
-	 */
-	if (zp->z_atime_dirty && zp->z_unlinked == 0 &&
-	    (syncflag & (FSYNC_RECLAIM | FSYNC_LAZY)) != 0) {
-		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+		ZFS_ENTER(zfsvfs);
+		ZFS_VERIFY_ZP(zp);
 
-		dmu_tx_hold_bonus(tx, zp->z_id);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			dmu_tx_abort(tx);
-		} else {
-			dmu_buf_will_dirty(zp->z_dbuf, tx);
-			mutex_enter(&zp->z_lock);
-			zp->z_atime_dirty = 0;
-			mutex_exit(&zp->z_lock);
-			dmu_tx_commit(tx);
-		}
+#ifdef __NetBSD__
+		if (!zp->z_unlinked)
+#endif
+		zil_commit(zfsvfs->z_log, zp->z_id);
+		ZFS_EXIT(zfsvfs);
 	}
-	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
-	ZFS_EXIT(zfsvfs);
 	return (0);
 }
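
zfs_fsync() is now a thin wrapper around the intent log, steered by the
dataset's sync property; the NetBSD-specific dirty-atime push that used to
live here is gone.  A sketch of the decision the new body encodes, with the
property values from sys/fs/zfs.h:

	switch (zfsvfs->z_os->os_sync) {
	case ZFS_SYNC_DISABLED:
		break;				/* fsync(2) becomes a no-op */
	case ZFS_SYNC_STANDARD:
	case ZFS_SYNC_ALWAYS:
		/* write out and flush every itx logged against this object */
		zil_commit(zfsvfs->z_log, zp->z_id);
		break;
	}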
 
@@ -2419,7 +2993,7 @@ zfs_fsync(vnode_t *vp, int syncflag, cre
  *
  *	OUT:	vap	- attribute values.
  *
- *	RETURN:	0 (always succeeds)
+ *	RETURN:	0 (always succeeds).
  */
 /* ARGSUSED */
 static int
@@ -2428,28 +3002,41 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, i
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	znode_phys_t *pzp;
 	int	error = 0;
 	uint32_t blksize;
 	u_longlong_t nblocks;
 	uint64_t links;
+	uint64_t mtime[2], ctime[2], crtime[2], rdev;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t *xoap = NULL;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+	sa_bulk_attr_t bulk[4];
+	int count = 0;
 
-	dprintf("zfs_getattr called\n");
-	
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
-	pzp = zp->z_phys;
+
+	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+	if (vp->v_type == VBLK || vp->v_type == VCHR)
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
+		    &rdev, 8);
+
+	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
 
 	/*
 	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
 	 * Also, if we are the owner don't bother, since owner should
 	 * always be allowed to read basic attributes of file.
 	 */
-	if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
-	    (pzp->zp_uid != crgetuid(cr))) {
+	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
+	    (vap->va_uid != crgetuid(cr))) {
 		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
 		    skipaclchk, cr)) {
 			ZFS_EXIT(zfsvfs);
@@ -2461,22 +3048,34 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, i
 	 * Return all attributes.  It's cheaper to provide the answer
 	 * than to determine whether we were asked the question.
 	 */
-	mutex_enter(&zp->z_lock);
-	vap->va_type = IFTOVT(pzp->zp_mode);
-	vap->va_mode = pzp->zp_mode & ~S_IFMT;
-	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+
+	vap->va_type = IFTOVT(zp->z_mode);
+	vap->va_mode = zp->z_mode & ~S_IFMT;
+#ifdef illumos
+	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
+#endif
+#ifdef __FreeBSD__
+	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+#endif
+#ifdef __NetBSD__
+	vap->va_fsid = vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0];
+#endif
 	vap->va_nodeid = zp->z_id;
 	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
-		links = pzp->zp_links + 1;
+		links = zp->z_links + 1;
 	else
-		links = pzp->zp_links;
-	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
-	vap->va_size = pzp->zp_size;
-	vap->va_fsid = vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0];
-//	vap->va_fsid = 0;
-	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
+		links = zp->z_links;
+	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
+	vap->va_size = zp->z_size;
+#ifdef illumos
+	vap->va_rdev = vp->v_rdev;
+#else
+	if (vp->v_type == VBLK || vp->v_type == VCHR)
+		vap->va_rdev = zfs_cmpldev(rdev);
+#endif
 	vap->va_seq = zp->z_seq;
 	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
+	vap->va_filerev = zp->z_seq;
 
 	/*
 	 * Add in any requested optional attributes and the create time.
@@ -2485,116 +3084,104 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, i
 	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
 		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
 			xoap->xoa_archive =
-			    ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
+			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
 			XVA_SET_RTN(xvap, XAT_ARCHIVE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
 			xoap->xoa_readonly =
-			    ((pzp->zp_flags & ZFS_READONLY) != 0);
+			    ((zp->z_pflags & ZFS_READONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_READONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
 			xoap->xoa_system =
-			    ((pzp->zp_flags & ZFS_SYSTEM) != 0);
+			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
 			XVA_SET_RTN(xvap, XAT_SYSTEM);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
 			xoap->xoa_hidden =
-			    ((pzp->zp_flags & ZFS_HIDDEN) != 0);
+			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
 			XVA_SET_RTN(xvap, XAT_HIDDEN);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			xoap->xoa_nounlink =
-			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
+			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
 			XVA_SET_RTN(xvap, XAT_NOUNLINK);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			xoap->xoa_immutable =
-			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
+			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
 			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			xoap->xoa_appendonly =
-			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
+			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_APPENDONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			xoap->xoa_nodump =
-			    ((pzp->zp_flags & ZFS_NODUMP) != 0);
+			    ((zp->z_pflags & ZFS_NODUMP) != 0);
 			XVA_SET_RTN(xvap, XAT_NODUMP);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
 			xoap->xoa_opaque =
-			    ((pzp->zp_flags & ZFS_OPAQUE) != 0);
+			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
 			XVA_SET_RTN(xvap, XAT_OPAQUE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			xoap->xoa_av_quarantined =
-			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
+			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			xoap->xoa_av_modified =
-			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
+			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
-		    vp->v_type == VREG &&
-		    (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
-			size_t len;
-			dmu_object_info_t doi;
-
-			/*
-			 * Only VREG files have anti-virus scanstamps, so we
-			 * won't conflict with symlinks in the bonus buffer.
-			 */
-			dmu_object_info_from_db(zp->z_dbuf, &doi);
-			len = sizeof (xoap->xoa_av_scanstamp) +
-			    sizeof (znode_phys_t);
-			if (len <= doi.doi_bonus_size) {
-				/*
-				 * pzp points to the start of the
-				 * znode_phys_t. pzp + 1 points to the
-				 * first byte after the znode_phys_t.
-				 */
-				(void) memcpy(xoap->xoa_av_scanstamp,
-				    pzp + 1,
-				    sizeof (xoap->xoa_av_scanstamp));
-				XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
-			}
-		}
-
-		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
-			ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
-			XVA_SET_RTN(xvap, XAT_CREATETIME);
+		    vp->v_type == VREG) {
+			zfs_sa_get_scanstamp(zp, xvap);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
-			xoap->xoa_reparse =
-			    ((pzp->zp_flags & ZFS_REPARSE) != 0);
+			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_REPARSE);
 		}
-	}
+		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
+			xoap->xoa_generation = zp->z_gen;
+			XVA_SET_RTN(xvap, XAT_GEN);
+		}
 
-	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
-	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
-	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
-	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
+		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+			xoap->xoa_offline =
+			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
+			XVA_SET_RTN(xvap, XAT_OFFLINE);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+			xoap->xoa_sparse =
+			    ((zp->z_pflags & ZFS_SPARSE) != 0);
+			XVA_SET_RTN(xvap, XAT_SPARSE);
+		}
+	}
 
-	mutex_exit(&zp->z_lock);
+	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
+	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
+	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
+	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
 
-	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
+	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
 	vap->va_blksize = blksize;
 	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
 
@@ -2621,8 +3208,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, i
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
- *	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - ctime updated, mtime updated if size changed.
@@ -2630,42 +3216,43 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, i
 /* ARGSUSED */
 static int
 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
-	caller_context_t *ct)
+    caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
-	znode_phys_t	*pzp;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
 	xvattr_t	tmpxvattr;
 	uint_t		mask = vap->va_mask;
-	uint_t		saved_mask;
+	uint_t		saved_mask = 0;
+	uint64_t	saved_mode;
 	int		trim_mask = 0;
 	uint64_t	new_mode;
 	uint64_t	new_uid, new_gid;
+	uint64_t	xattr_obj;
+	uint64_t	mtime[2], ctime[2];
 	znode_t		*attrzp;
 	int		need_policy = FALSE;
-	int		err;
+	int		err, err2;
 	zfs_fuid_info_t *fuidp = NULL;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t	*xoap;
-	zfs_acl_t	*aclp = NULL;
+	zfs_acl_t	*aclp;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
-	boolean_t fuid_dirtied = B_FALSE;
-
-	dprintf("zfs_setattr called\n");
+	boolean_t	fuid_dirtied = B_FALSE;
+	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
+	int		count = 0, xattr_count = 0;
 
 	if (mask == 0)
 		return (0);
 
 	if (mask & AT_NOSET)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
-	pzp = zp->z_phys;
 	zilog = zfsvfs->z_log;
 
 	/*
@@ -2678,17 +3265,17 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, i
 	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
 	    (mask & AT_XVATTR))) {
 		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	if (mask & AT_SIZE && vp->v_type == VDIR) {
 		ZFS_EXIT(zfsvfs);
-		return (EISDIR);
+		return (SET_ERROR(EISDIR));
 	}
 
 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
 		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
 	/*
@@ -2702,16 +3289,16 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, i
 	/*
 	 * Immutable files can only alter immutable bit and atime
 	 */
-	if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
+	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
 	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
 	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
 		ZFS_EXIT(zfsvfs);
-		return (EPERM);
+		return (SET_ERROR(EPERM));
 	}
 
-	if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
+	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
 		ZFS_EXIT(zfsvfs);
-		return (EPERM);
+		return (SET_ERROR(EPERM));
 	}
 
 	/*
@@ -2724,28 +3311,29 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, i
 		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
 		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
 			ZFS_EXIT(zfsvfs);
-			return (EOVERFLOW);
+			return (SET_ERROR(EOVERFLOW));
 		}
 	}
+	if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
+	    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EOVERFLOW));
+	}
 
-top:
 	attrzp = NULL;
+	aclp = NULL;
 
 	/* Can this be moved to before the top label? */
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		ZFS_EXIT(zfsvfs);
-		return (EROFS);
+		return (SET_ERROR(EROFS));
 	}
 
 	/*
 	 * First validate permissions
 	 */
+
 	if (mask & AT_SIZE) {
-		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
-		if (err) {
-			ZFS_EXIT(zfsvfs);
-			return (err);
-		}
 		/*
 		 * XXX - Note, we are not providing any open
 		 * mode flags here (like FNDELAY), so we may
@@ -2759,15 +3347,18 @@ top:
 			return (err);
 		}
 	}
-	
+
 	if (mask & (AT_ATIME|AT_MTIME) ||
 	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
 	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
 	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
 	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
-	    XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
+	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
 		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
 		    skipaclchk, cr);
+	}
 
 	if (mask & (AT_UID|AT_GID)) {
 		int	idmask = (mask & (AT_UID|AT_GID));
@@ -2780,7 +3371,7 @@ top:
 		 */
 
 		if (!(mask & AT_MODE))
-			vap->va_mode = pzp->zp_mode;
+			vap->va_mode = zp->z_mode;
 
 		/*
 		 * Take ownership or chgrp to group we are a member of
@@ -2798,7 +3389,7 @@ top:
 		 * Otherwise, send the check through secpolicy_vnode_setattr()
 		 *
 		 */
-		
+
 		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
 		    ((idmask == AT_UID) && take_owner) ||
 		    ((idmask == AT_GID) && take_group)) {
@@ -2807,7 +3398,7 @@ top:
 				/*
 				 * Remove setuid/setgid for non-privileged users
 				 */
-				secpolicy_setid_clear(vap, cr);
+				secpolicy_setid_clear(vap, vp, cr);
 				trim_mask = (mask & (AT_UID|AT_GID));
 			} else {
 				need_policy =  TRUE;
@@ -2817,8 +3408,7 @@ top:
 		}
 	}
 
-	mutex_enter(&zp->z_lock);
-	oldva.va_mode = pzp->zp_mode;
+	oldva.va_mode = zp->z_mode;
 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
 	if (mask & AT_XVATTR) {
 		/*
@@ -2830,7 +3420,7 @@ top:
 		 */
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			if (xoap->xoa_appendonly !=
-			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
+			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
@@ -2840,7 +3430,7 @@ top:
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			if (xoap->xoa_nounlink !=
-			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
+			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
@@ -2850,7 +3440,7 @@ top:
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			if (xoap->xoa_immutable !=
-			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
+			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
@@ -2860,7 +3450,7 @@ top:
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			if (xoap->xoa_nodump !=
-			    ((pzp->zp_flags & ZFS_NODUMP) != 0)) {
+			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NODUMP);
@@ -2870,7 +3460,7 @@ top:
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			if (xoap->xoa_av_modified !=
-			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
+			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
@@ -2882,7 +3472,7 @@ top:
 			if ((vp->v_type != VREG &&
 			    xoap->xoa_av_quarantined) ||
 			    xoap->xoa_av_quarantined !=
-			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
+			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
@@ -2891,9 +3481,8 @@ top:
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
-			mutex_exit(&zp->z_lock);
 			ZFS_EXIT(zfsvfs);
-			return (EPERM);
+			return (SET_ERROR(EPERM));
 		}
 
 		if (need_policy == FALSE &&
@@ -2903,8 +3492,6 @@ top:
 		}
 	}
 
-	mutex_exit(&zp->z_lock);
-
 	if (mask & AT_MODE) {
 		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
 			err = secpolicy_setid_setsticky_clear(vp, vap,
@@ -2931,6 +3518,13 @@ top:
 		if (trim_mask) {
 			saved_mask = vap->va_mask;
 			vap->va_mask &= ~trim_mask;
+			if (trim_mask & AT_MODE) {
+				/*
+				 * Save the mode, as secpolicy_vnode_setattr()
+				 * will overwrite it with ova.va_mode.
+				 */
+				saved_mode = vap->va_mode;
+			}
 		}
 		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
 		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
@@ -2939,97 +3533,122 @@ top:
 			return (err);
 		}
 
-		if (trim_mask)
+		if (trim_mask) {
 			vap->va_mask |= saved_mask;
+			if (trim_mask & AT_MODE) {
+				/*
+				 * Recover the mode after
+				 * secpolicy_vnode_setattr().
+				 */
+				vap->va_mode = saved_mode;
+			}
+		}
 	}
+
 	/*
 	 * secpolicy_vnode_setattr, or take ownership may have
 	 * changed va_mask
 	 */
 	mask = vap->va_mask;
 
+	if ((mask & (AT_UID | AT_GID))) {
+		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+		    &xattr_obj, sizeof (xattr_obj));
+
+		if (err == 0 && xattr_obj) {
+			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
+			if (err == 0) {
+				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
+				if (err != 0)
+					vrele(ZTOV(attrzp));
+			}
+			if (err)
+				goto out2;
+		}
+		if (mask & AT_UID) {
+			new_uid = zfs_fuid_create(zfsvfs,
+			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+			if (new_uid != zp->z_uid &&
+			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
+				if (attrzp)
+					vput(ZTOV(attrzp));
+				err = SET_ERROR(EDQUOT);
+				goto out2;
+			}
+		}
+
+		if (mask & AT_GID) {
+			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+			    cr, ZFS_GROUP, &fuidp);
+			if (new_gid != zp->z_gid &&
+			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
+				if (attrzp)
+					vput(ZTOV(attrzp));
+				err = SET_ERROR(EDQUOT);
+				goto out2;
+			}
+		}
+	}
 	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_bonus(tx, zp->z_id);
 
 	if (mask & AT_MODE) {
-		uint64_t pmode = pzp->zp_mode;
-
+		uint64_t pmode = zp->z_mode;
+		uint64_t acl_obj;
 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
+		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
+		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
+			err = SET_ERROR(EPERM);
+			goto out;
+		}
+
 		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
 			goto out;
-		if (pzp->zp_acl.z_acl_extern_obj) {
-			/* Are we upgrading ACL from old V0 format to new V1 */
-			if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
-			    pzp->zp_acl.z_acl_version ==
+
+		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
+			/*
+			 * Are we upgrading ACL from old V0 format
+			 * to V1 format?
+			 */
+			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+			    zfs_znode_acl_version(zp) ==
 			    ZFS_ACL_VERSION_INITIAL) {
-				dmu_tx_hold_free(tx,
-				    pzp->zp_acl.z_acl_extern_obj, 0,
+				dmu_tx_hold_free(tx, acl_obj, 0,
 				    DMU_OBJECT_END);
 				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 				    0, aclp->z_acl_bytes);
 			} else {
-				dmu_tx_hold_write(tx,
-				    pzp->zp_acl.z_acl_extern_obj, 0,
+				dmu_tx_hold_write(tx, acl_obj, 0,
 				    aclp->z_acl_bytes);
 			}
-		} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, aclp->z_acl_bytes);
 		}
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+	} else {
+		if ((mask & AT_XVATTR) &&
+		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+		else
+			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	}
 
-	if (mask & (AT_UID | AT_GID)) {
-		if (pzp->zp_xattr) {
-			err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
-			if (err)
-				goto out;
-			dmu_tx_hold_bonus(tx, attrzp->z_id);
-		}
-		if (mask & AT_UID) {
-			new_uid = zfs_fuid_create(zfsvfs,
-			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
-			if (new_uid != pzp->zp_uid &&
-			    zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) {
-				err = EDQUOT;
-				goto out;
-			}
-		}
-
-		if (mask & AT_GID) {
-			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
-			    cr, ZFS_GROUP, &fuidp);
-			if (new_gid != pzp->zp_gid &&
-			    zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) {
-				err = EDQUOT;
-				goto out;
-			}
-		}
-		fuid_dirtied = zfsvfs->z_fuid_dirty;
-		if (fuid_dirtied) {
-			if (zfsvfs->z_fuid_obj == 0) {
-				dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
-				dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
-				    FUID_SIZE_ESTIMATE(zfsvfs));
-				dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
-				    FALSE, NULL);
-			} else {
-				dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
-				dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
-				    FUID_SIZE_ESTIMATE(zfsvfs));
-			}
-		}
+	if (attrzp) {
+		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
 	}
 
-	err = dmu_tx_assign(tx, TXG_NOWAIT);
-	if (err) {
-		if (err == ERESTART)
-			dmu_tx_wait(tx);
-		goto out;
-	}
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+
+	zfs_sa_upgrade_txholds(tx, zp);
 
-	dmu_buf_will_dirty(zp->z_dbuf, tx);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err)
+		goto out;
 
+	count = 0;
 	/*
 	 * Set each attribute requested.
 	 * We group settings according to the locks they need to acquire.
@@ -3038,47 +3657,105 @@ top:
 	 * updated as a side-effect of calling this function.
 	 */
 
-	mutex_enter(&zp->z_lock);
+	if (mask & (AT_UID|AT_GID|AT_MODE))
+		mutex_enter(&zp->z_acl_lock);
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+
+	if (attrzp) {
+		if (mask & (AT_UID|AT_GID|AT_MODE))
+			mutex_enter(&attrzp->z_acl_lock);
+		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+		    sizeof (attrzp->z_pflags));
+	}
+
+	if (mask & (AT_UID|AT_GID)) {
+
+		if (mask & AT_UID) {
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+			    &new_uid, sizeof (new_uid));
+			zp->z_uid = new_uid;
+			if (attrzp) {
+				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+				    sizeof (new_uid));
+				attrzp->z_uid = new_uid;
+			}
+		}
+
+		if (mask & AT_GID) {
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+			    NULL, &new_gid, sizeof (new_gid));
+			zp->z_gid = new_gid;
+			if (attrzp) {
+				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+				    sizeof (new_gid));
+				attrzp->z_gid = new_gid;
+			}
+		}
+		if (!(mask & AT_MODE)) {
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+			    NULL, &new_mode, sizeof (new_mode));
+			new_mode = zp->z_mode;
+		}
+		err = zfs_acl_chown_setattr(zp);
+		ASSERT(err == 0);
+		if (attrzp) {
+			err = zfs_acl_chown_setattr(attrzp);
+			ASSERT(err == 0);
+		}
+	}
 
 	if (mask & AT_MODE) {
-		mutex_enter(&zp->z_acl_lock);
-		zp->z_phys->zp_mode = new_mode;
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+		    &new_mode, sizeof (new_mode));
+		zp->z_mode = new_mode;
+		ASSERT3U((uintptr_t)aclp, !=, 0);
 		err = zfs_aclset_common(zp, aclp, cr, tx);
-		ASSERT3U(err, ==, 0);
+		ASSERT0(err);
+		if (zp->z_acl_cached)
+			zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = aclp;
 		aclp = NULL;
-		mutex_exit(&zp->z_acl_lock);
 	}
 
-	if (attrzp)
-		mutex_enter(&attrzp->z_lock);
 
-	if (mask & AT_UID) {
-		pzp->zp_uid = new_uid;
-		if (attrzp)
-			attrzp->z_phys->zp_uid = new_uid;
+	if (mask & AT_ATIME) {
+		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+		    &zp->z_atime, sizeof (zp->z_atime));
 	}
 
-	if (mask & AT_GID) {
-		pzp->zp_gid = new_gid;
-		if (attrzp)
-			attrzp->z_phys->zp_gid = new_gid;
+	if (mask & AT_MTIME) {
+		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+		    mtime, sizeof (mtime));
 	}
 
-	if (attrzp)
-		mutex_exit(&attrzp->z_lock);
-
-	if (mask & AT_ATIME)
-		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
-
-	if (mask & AT_MTIME)
-		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
-
 	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
-	if (mask & AT_SIZE)
-		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
-	else if (mask != 0)
-		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
+	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+		    NULL, mtime, sizeof (mtime));
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    &ctime, sizeof (ctime));
+		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+		    B_TRUE);
+	} else if (mask != 0) {
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    &ctime, sizeof (ctime));
+		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
+		    B_TRUE);
+		if (attrzp) {
+			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+			    SA_ZPL_CTIME(zfsvfs), NULL,
+			    &ctime, sizeof (ctime));
+			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
+			    mtime, ctime, B_TRUE);
+		}
+	}
 	/*
 	 * Do this after setting timestamps to prevent timestamp
 	 * update from toggling bit
@@ -3086,6 +3763,8 @@ top:
 
 	if (xoap && (mask & AT_XVATTR)) {
 
+		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+			xoap->xoa_createtime = vap->va_birthtime;
 		/*
 		 * restore trimmed off masks
 		 * so that return masks can be set for caller.
@@ -3110,20 +3789,10 @@ top:
 			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
 		}
 
-		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
-			size_t len;
-			dmu_object_info_t doi;
-
+		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			ASSERT(vp->v_type == VREG);
 
-			/* Grow the bonus buffer if necessary. */
-			dmu_object_info_from_db(zp->z_dbuf, &doi);
-			len = sizeof (xoap->xoa_av_scanstamp) +
-			    sizeof (znode_phys_t);
-			if (len > doi.doi_bonus_size)
-				VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
-		}
-		zfs_xvattr_set(zp, xvap);
+		zfs_xvattr_set(zp, xvap, tx);
 	}
 
 	if (fuid_dirtied)
@@ -3132,11 +3801,22 @@ top:
 	if (mask != 0)
 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
 
-	mutex_exit(&zp->z_lock);
+	if (mask & (AT_UID|AT_GID|AT_MODE))
+		mutex_exit(&zp->z_acl_lock);
 
+	if (attrzp) {
+		if (mask & (AT_UID|AT_GID|AT_MODE))
+			mutex_exit(&attrzp->z_acl_lock);
+	}
 out:
+	if (err == 0 && attrzp) {
+		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+		    xattr_count, tx);
+		ASSERT(err2 == 0);
+	}
+
 	if (attrzp)
-		VN_RELE(ZTOV(attrzp));
+		vput(ZTOV(attrzp));
 
 	if (aclp)
 		zfs_acl_free(aclp);
@@ -3146,112 +3826,299 @@ out:
 		fuidp = NULL;
 	}
 
-	if (err)
+	if (err) {
 		dmu_tx_abort(tx);
-	else
+	} else {
+		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		dmu_tx_commit(tx);
+	}
 
-	if (err == ERESTART)
-		goto top;
+out2:
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (err);
 }
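
zfs_setattr() is the write side of the same idiom: every changed attribute
is staged into bulk[] (and xattr_bulk[] for the attribute directory) and
written with one sa_bulk_update() inside the assigned transaction.  A
sketch with the error handling simplified:

	sa_bulk_attr_t bulk[2];
	int count = 0;

	/* the staged pointers must stay valid until the update runs */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	    &new_uid, sizeof (new_uid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    &ctime, sizeof (ctime));

	err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	if (err == 0)
		dmu_tx_commit(tx);
	else
		dmu_tx_abort(tx);

In the function above the bulk update only runs when err is still zero, so
a failure anywhere earlier leaves the on-disk attributes untouched.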
 
-typedef struct zfs_zlock {
-	krwlock_t	*zl_rwlock;	/* lock we acquired */
-	znode_t		*zl_znode;	/* znode we held */
-	struct zfs_zlock *zl_next;	/* next in list */
-} zfs_zlock_t;
-
 /*
- * Drop locks and release vnodes that were held by zfs_rename_lock().
+ * We acquire all but the sdvp lock using non-blocking acquisitions.  If we
+ * fail to acquire any lock in the path we will drop all held locks,
+ * acquire the new lock in a blocking fashion, and then release it and
+ * restart the rename.  This acquire/release step ensures that we do not
+ * spin on a lock waiting for release.  On error release all vnode locks
+ * and decrement references the way tmpfs_rename() would do.
  */
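
The comment describes a classic try-lock back-off.  Its generic two-vnode
form, as a sketch (the real code below juggles up to four vnodes plus the
znode liveness checks):

	relock:
		error = vn_lock(a, LK_EXCLUSIVE);
		if (error != 0)
			return (error);
		error = vn_lock(b, LK_EXCLUSIVE | LK_NOWAIT);
		if (error != 0) {
			VOP_UNLOCK(a, 0);
			if (error != EBUSY)
				return (error);
			error = vn_lock(b, LK_EXCLUSIVE);	/* wait the holder out */
			if (error != 0)
				return (error);
			VOP_UNLOCK(b, 0);	/* drop; retake in the right order */
			goto relock;
		}
		/* both locks held */

Blocking on b once and then releasing it immediately guarantees forward
progress without ever holding both locks in the wrong order.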
-static void
-zfs_rename_unlock(zfs_zlock_t **zlpp)
+static int
+zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
+    struct vnode *tdvp, struct vnode **tvpp,
+    const struct componentname *scnp, const struct componentname *tcnp)
 {
-	zfs_zlock_t *zl;
+	zfsvfs_t	*zfsvfs;
+	struct vnode	*nvp, *svp, *tvp;
+	znode_t		*sdzp, *tdzp, *szp, *tzp;
+	const char	*snm = scnp->cn_nameptr;
+	const char	*tnm = tcnp->cn_nameptr;
+	int error;
+
+#ifdef __FreeBSD__
+	VOP_UNLOCK(tdvp, 0);
+	if (*tvpp != NULL && *tvpp != tdvp)
+		VOP_UNLOCK(*tvpp, 0);
+#endif
+
+relock:
+	error = vn_lock(sdvp, LK_EXCLUSIVE);
+	if (error)
+		goto out;
+	sdzp = VTOZ(sdvp);
+
+#ifdef __NetBSD__
+	if (tdvp == sdvp) {
+	} else {
+#endif
+	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
+	if (error != 0) {
+		VOP_UNLOCK(sdvp, 0);
+		if (error != EBUSY)
+			goto out;
+		error = vn_lock(tdvp, LK_EXCLUSIVE);
+		if (error)
+			goto out;
+		VOP_UNLOCK(tdvp, 0);
+		goto relock;
+	}
+#ifdef __NetBSD__
+	} /* end if (tdvp == sdvp) */
+#endif
+
+	tdzp = VTOZ(tdvp);
+
+	/*
+	 * Before using sdzp and tdzp we must ensure that they are live.
+	 * about.  One, typical for FreeBSD, is that the vnode has not
+	 * been reclaimed (doomed).  The other is that the znode is live.
+	 * not reclaimed (doomed).  The other is that the znode is live.
+	 * The current code can invalidate the znode without acquiring the
+	 * corresponding vnode lock if the object represented by the znode
+	 * and vnode is no longer valid after a rollback or receive operation.
+	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
+	 * that protects the znodes from the invalidation.
+	 */
+	zfsvfs = sdzp->z_zfsvfs;
+	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
+	ZFS_ENTER(zfsvfs);
 
-	while ((zl = *zlpp) != NULL) {
-		if (zl->zl_znode != NULL)
-			VN_RELE(ZTOV(zl->zl_znode));
-		rw_exit(zl->zl_rwlock);
-		*zlpp = zl->zl_next;
-		kmem_free(zl, sizeof (*zl));
+	/*
+	 * We cannot use ZFS_VERIFY_ZP() here because it could directly return,
+	 * bypassing the cleanup code in the case of an error.
+	 */
+	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+		ZFS_EXIT(zfsvfs);
+		VOP_UNLOCK(sdvp, 0);
+#ifdef __NetBSD__
+		if (tdvp != sdvp)
+#endif
+		VOP_UNLOCK(tdvp, 0);
+		error = SET_ERROR(EIO);
+		goto out;
 	}
-}
 
-/*
- * Search back through the directory tree, using the ".." entries.
- * Lock each directory in the chain to prevent concurrent renames.
- * Fail any attempt to move a directory into one of its own descendants.
- * XXX - z_parent_lock can overlap with map or grow locks
- */
-static int
-zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
-{
-	zfs_zlock_t	*zl;
-	znode_t		*zp = tdzp;
-	uint64_t	rootid = zp->z_zfsvfs->z_root;
-	uint64_t	*oidp = &zp->z_id;
-	krwlock_t	*rwlp = &szp->z_parent_lock;
-	krw_t		rw = RW_WRITER;
+	/*
+	 * Re-resolve svp to be certain it still exists and fetch the
+	 * correct vnode.
+	 */
+	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
+	if (error != 0) {
+		/* Source entry invalid or not there. */
+		ZFS_EXIT(zfsvfs);
+		VOP_UNLOCK(sdvp, 0);
+#ifdef __NetBSD__
+		if (tdvp != sdvp)
+#endif
+		VOP_UNLOCK(tdvp, 0);
+		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
+		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
+			error = SET_ERROR(EINVAL);
+		goto out;
+	}
+	svp = ZTOV(szp);
 
 	/*
-	 * First pass write-locks szp and compares to zp->z_id.
-	 * Later passes read-lock zp and compare to zp->z_parent.
+	 * Re-resolve tvp; if it disappeared, we just carry on.
 	 */
-	do {
-		if (!rw_tryenter(rwlp, rw)) {
-			/*
-			 * Another thread is renaming in this path.
-			 * Note that if we are a WRITER, we don't have any
-			 * parent_locks held yet.
-			 */
-			if (rw == RW_READER && zp->z_id > szp->z_id) {
-				/*
-				 * Drop our locks and restart
-				 */
-				zfs_rename_unlock(&zl);
-				*zlpp = NULL;
-				zp = tdzp;
-				oidp = &zp->z_id;
-				rwlp = &szp->z_parent_lock;
-				rw = RW_WRITER;
-				continue;
-			} else {
-				/*
-				 * Wait for other thread to drop its locks
-				 */
-				rw_enter(rwlp, rw);
+	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		VOP_UNLOCK(sdvp, 0);
+#ifdef __NetBSD__
+		if (tdvp != sdvp)
+#endif
+		VOP_UNLOCK(tdvp, 0);
+		vrele(svp);
+		if ((tcnp->cn_flags & ISDOTDOT) != 0)
+			error = SET_ERROR(EINVAL);
+		goto out;
+	}
+	if (tzp != NULL)
+		tvp = ZTOV(tzp);
+	else
+		tvp = NULL;
+
+	/*
+	 * At present the vnode locks must be acquired before z_teardown_lock,
+	 * although it would be more logical to use the opposite order.
+	 */
+	ZFS_EXIT(zfsvfs);
+
+	/*
+	 * Now try to acquire the locks on svp and tvp.
+	 */
+	nvp = svp;
+	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+	if (error != 0) {
+		VOP_UNLOCK(sdvp, 0);
+#ifdef __NetBSD__
+		if (tdvp != sdvp)
+#endif
+		VOP_UNLOCK(tdvp, 0);
+		if (tvp != NULL)
+			vrele(tvp);
+		if (error != EBUSY) {
+			vrele(nvp);
+			goto out;
+		}
+		error = vn_lock(nvp, LK_EXCLUSIVE);
+		if (error != 0) {
+			vrele(nvp);
+			goto out;
+		}
+		VOP_UNLOCK(nvp, 0);
+		/*
+		 * Concurrent rename race.
+		 * XXX ?
+		 */
+		if (nvp == tdvp) {
+			vrele(nvp);
+			error = SET_ERROR(EINVAL);
+			goto out;
+		}
+#ifdef __NetBSD__
+		if (*svpp != NULL)
+#endif
+		vrele(*svpp);
+		*svpp = nvp;
+		goto relock;
+	}
+#ifdef __NetBSD__
+	if (*svpp != NULL)
+#endif
+	vrele(*svpp);
+	*svpp = nvp;
+
+	if (*tvpp != NULL)
+		vrele(*tvpp);
+	*tvpp = NULL;
+	if (tvp != NULL) {
+		nvp = tvp;
+
+#ifdef __NetBSD__
+		if (tvp == svp || tvp == sdvp) {
+		} else {
+#endif
+		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+		if (error != 0) {
+			VOP_UNLOCK(sdvp, 0);
+#ifdef __NetBSD__
+			if (tdvp != sdvp)
+#endif
+			VOP_UNLOCK(tdvp, 0);
+#ifdef __NetBSD__
+			if (*svpp != tdvp)
+#endif
+			VOP_UNLOCK(*svpp, 0);
+			if (error != EBUSY) {
+				vrele(nvp);
+				goto out;
+			}
+			error = vn_lock(nvp, LK_EXCLUSIVE);
+			if (error != 0) {
+				vrele(nvp);
+				goto out;
 			}
+			vput(nvp);
+			goto relock;
 		}
+#ifdef __NetBSD__
+		} /* end if (tvp == svp || tvp == sdvp) */
+#endif
+
+		*tvpp = nvp;
+	}
 
-		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
-		zl->zl_rwlock = rwlp;
-		zl->zl_znode = NULL;
-		zl->zl_next = *zlpp;
-		*zlpp = zl;
+	KASSERT(VOP_ISLOCKED(sdvp) == LK_EXCLUSIVE);
+	KASSERT(VOP_ISLOCKED(*svpp) == LK_EXCLUSIVE);
+	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
+	KASSERT(*tvpp == NULL || VOP_ISLOCKED(*tvpp) == LK_EXCLUSIVE);
 
-		if (*oidp == szp->z_id)		/* We're a descendant of szp */
-			return (EINVAL);
+	return (0);
 
-		if (*oidp == rootid)		/* We've hit the top */
-			return (0);
+out:
+	return (error);
+}
 
-		if (rw == RW_READER) {		/* i.e. not the first pass */
-			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
-			if (error)
-				return (error);
-			zl->zl_znode = zp;
-		}
-		oidp = &zp->z_phys->zp_parent;
-		rwlp = &zp->z_parent_lock;
-		rw = RW_READER;
+/*
+ * Note that we must use VRELE_ASYNC in this function as it walks
+ * up the directory tree: vrele may need to acquire an exclusive
+ * vnode lock when the last reference to a vnode is dropped.
+ */
+static int
+zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
+{
+	zfsvfs_t	*zfsvfs;
+	znode_t		*zp, *zp1;
+	uint64_t	parent;
+	int		error;
+
+	zfsvfs = tdzp->z_zfsvfs;
+	if (tdzp == szp)
+		return (SET_ERROR(EINVAL));
+	if (tdzp == sdzp)
+		return (0);
+	if (tdzp->z_id == zfsvfs->z_root)
+		return (0);
+	zp = tdzp;
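+	/*
+	 * Walk from tdzp towards the root via the SA parent pointers,
+	 * stopping at sdzp or the root.  Finding szp on the way means
+	 * tdzp is a descendant of szp, which is invalid.
+	 */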
+	for (;;) {
+		ASSERT(!zp->z_unlinked);
+		if ((error = sa_lookup(zp->z_sa_hdl,
+		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+			break;
 
-	} while (zp->z_id != sdzp->z_id);
+		if (parent == szp->z_id) {
+			error = SET_ERROR(EINVAL);
+			break;
+		}
+		if (parent == zfsvfs->z_root)
+			break;
+		if (parent == sdzp->z_id)
+			break;
 
-	return (0);
+		error = zfs_zget(zfsvfs, parent, &zp1);
+		if (error != 0)
+			break;
+
+		if (zp != tdzp)
+			VN_RELE_ASYNC(ZTOV(zp),
+			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+		zp = zp1;
+	}
+
+	if (error == ENOTDIR)
+		panic("checkpath: .. not a directory\n");
+	if (zp != tdzp)
+		VN_RELE_ASYNC(ZTOV(zp),
+		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+	return (error);
 }
 
 /*
@@ -3266,213 +4133,100 @@ zfs_rename_lock(znode_t *szp, znode_t *t
  *		ct	- caller context
  *		flags	- case flags
  *
- *	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	sdvp,tdvp - ctime|mtime updated
  */
-/* XXX NetBSD There is significant problem with dirent locking during rename
- * of files which are in a same dir. zfs_dirent_lock is then called twice on
- * same lock which panics LOCKDEBUG kernel. Locking twice is not needed. 
- * Proper solution for this is add new flag to zfs_dirent_lock which will
- * disable rw_enter in it. Renaming of files in same dir is considered as broken
- * on LOCKDEBUG kernels on NetBSD for now.
- */
 /*ARGSUSED*/
 static int
-zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
-    caller_context_t *ct, int flags)
+zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
+    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
+    cred_t *cr)
 {
-	znode_t		*tdzp, *szp, *tzp;
-	znode_t		*sdzp = VTOZ(sdvp);
-	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
-	zilog_t		*zilog;
-	vnode_t		*realvp;
-	zfs_dirlock_t	*sdl, *tdl;
+	zfsvfs_t	*zfsvfs;
+	znode_t		*sdzp, *tdzp, *szp, *tzp;
+	zilog_t		*zilog = NULL;
 	dmu_tx_t	*tx;
-	zfs_zlock_t	*zl;
-	int		cmp, serr, terr;
+	char		*snm = __UNCONST(scnp->cn_nameptr);
+	char		*tnm = __UNCONST(tcnp->cn_nameptr);
 	int		error = 0;
-	int		zflg = 0;
-	int		samedir = 0;
-	
-	tdl = NULL;
-	sdl = NULL;
-	
-	dprintf("zfs_rename called\n");
-	
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(sdzp);
-	zilog = zfsvfs->z_log;
 
-	/*
-	 * Make sure we have the real vp for the target directory.
-	 */
-	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
-		tdvp = realvp;
-
-	if (tdvp->v_vfsp != sdvp->v_vfsp) {
-		ZFS_EXIT(zfsvfs);
-		return (EXDEV);
+	/* Reject renames across filesystems. */
+	if (((*svpp) != NULL && (*svpp)->v_mount != tdvp->v_mount) ||
+	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
+		error = SET_ERROR(EXDEV);
+		goto out;
 	}
 
-	tdzp = VTOZ(tdvp);
-	ZFS_VERIFY_ZP(tdzp);
-	if (zfsvfs->z_utf8 && u8_validate(tnm,
-	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
-		ZFS_EXIT(zfsvfs);
-		return (EILSEQ);
+	if (zfsctl_is_node(tdvp)) {
+		error = SET_ERROR(EXDEV);
+		goto out;
 	}
 
-	if (flags & FIGNORECASE)
-		zflg |= ZCILOOK;
-
-top:
-	szp = NULL;
-	tzp = NULL;
-	zl = NULL;
-
 	/*
-	 * This is to prevent the creation of links into attribute space
-	 * by renaming a linked file into/outof an attribute directory.
-	 * See the comment in zfs_link() for why this is considered bad.
+	 * Lock all four vnodes to ensure safety and semantics of renaming.
 	 */
-	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
-	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
-		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
+	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
+	if (error != 0) {
+		/* no vnodes are locked in the case of error here */
+		return (error);
 	}
 
+	tdzp = VTOZ(tdvp);
+	sdzp = VTOZ(sdvp);
+	zfsvfs = tdzp->z_zfsvfs;
+	zilog = zfsvfs->z_log;
+
 	/*
-	 * Lock source and target directory entries.  To prevent deadlock,
-	 * a lock ordering must be defined.  We lock the directory with
-	 * the smallest object id first, or if it's a tie, the one with
-	 * the lexically first name.
+	 * After re-entering the filesystem via ZFS_ENTER() below we must
+	 * revalidate all of the znodes involved.
 	 */
-	if (sdzp->z_id < tdzp->z_id) {
-		cmp = -1;
-	} else if (sdzp->z_id > tdzp->z_id) {
-		cmp = 1;
-	} else {
-		/*
-		 * First compare the two name arguments without
-		 * considering any case folding.
-		 */
-		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
+	ZFS_ENTER(zfsvfs);
 
-		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
-		ASSERT(error == 0 || !zfsvfs->z_utf8);
-		if (cmp == 0) {
-			/*
-			 * POSIX: "If the old argument and the new argument
-			 * both refer to links to the same existing file,
-			 * the rename() function shall return successfully
-			 * and perform no other action."
-			 */
-			ZFS_EXIT(zfsvfs);
-			return (0);
-		}
-		/*
-		 * If the file system is case-folding, then we may
-		 * have some more checking to do.  A case-folding file
-		 * system is either supporting mixed case sensitivity
-		 * access or is completely case-insensitive.  Note
-		 * that the file system is always case preserving.
-		 *
-		 * In mixed sensitivity mode case sensitive behavior
-		 * is the default.  FIGNORECASE must be used to
-		 * explicitly request case insensitive behavior.
-		 *
-		 * If the source and target names provided differ only
-		 * by case (e.g., a request to rename 'tim' to 'Tim'),
-		 * we will treat this as a special case in the
-		 * case-insensitive mode: as long as the source name
-		 * is an exact match, we will allow this to proceed as
-		 * a name-change request.
-		 */
-		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
-		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
-		    flags & FIGNORECASE)) &&
-		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
-		    &error) == 0) {
-			/*
-			 * case preserving rename request, require exact
-			 * name matches
-			 */
-			zflg |= ZCIEXACT;
-			zflg &= ~ZCILOOK;
-		}
+	if (zfsvfs->z_utf8 && u8_validate(tnm,
+	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		error = SET_ERROR(EILSEQ);
+		goto unlockout;
 	}
 
-	/*
-	 * If the source and destination directories are the same, we should
-	 * grab the z_name_lock of that directory only once.
-	 */
-	if (sdzp == tdzp) {
-		zflg |= ZHAVELOCK;
-		rw_enter(&sdzp->z_name_lock, RW_READER);
+	/* If source and target are the same file, there is nothing to do. */
+	if ((*svpp) == (*tvpp)) {
+		error = 0;
+		goto unlockout;
 	}
 
-	if (cmp < 0) {
-
-		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
-		    ZEXISTS | zflg, NULL, NULL);
-		if ((serr == 0) && (sdzp == tdzp)) {
-			/*
-			 * If renaming within the one directory we must
-			 * be careful not to recursively acquire locks.
-			 */
-			zflg |= ZHAVELOCK;
-		}
-		terr = zfs_dirent_lock(&tdl,
-		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
-	} else {
-		terr = zfs_dirent_lock(&tdl,
-		    tdzp, tnm, &tzp, zflg, NULL, NULL);
-
-		if ((terr == 0) && (sdzp == tdzp)) {
-			/*
-			 * If renaming within the one directory we must
-			 * be careful not to recursively acquire locks.
-			 */
-			zflg |= ZHAVELOCK;
-		}
-		serr = zfs_dirent_lock(&sdl,
-		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
-		    NULL, NULL);
+	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
+	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
+	    (*tvpp)->v_mountedhere != NULL)) {
+		error = SET_ERROR(EXDEV);
+		goto unlockout;
 	}
 
-	if (serr) {
-		/*
-		 * Source entry invalid or not there.
-		 */
-		if (!terr) {
-			zfs_dirent_unlock(tdl);
-			if (tzp)
-				VN_RELE(ZTOV(tzp));
-		}
-
-		if (sdzp == tdzp)
-			rw_exit(&sdzp->z_name_lock);
-
-		if (strcmp(snm, "..") == 0)
-			serr = EINVAL;
-		ZFS_EXIT(zfsvfs);
-		return (serr);
+	/*
+	 * We cannot use ZFS_VERIFY_ZP() here because it could return
+	 * directly, bypassing the cleanup code, in the case of an error.
+	 */
+	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+		error = SET_ERROR(EIO);
+		goto unlockout;
 	}
-	if (terr) {
-		if (sdl != NULL)
-			zfs_dirent_unlock(sdl);
-		VN_RELE(ZTOV(szp));
 
-		if (sdzp == tdzp)
-			rw_exit(&sdzp->z_name_lock);
+	szp = VTOZ(*svpp);
+	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
+	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
+		error = SET_ERROR(EIO);
+		goto unlockout;
+	}
 
-		if (strcmp(tnm, "..") == 0)
-			terr = EINVAL;
-		ZFS_EXIT(zfsvfs);
-		return (terr);
+	/*
+	 * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/out of an attribute directory.
+	 * See the comment in zfs_link() for why this is considered bad.
+	 */
+	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+		error = SET_ERROR(EINVAL);
+		goto unlockout;
 	}
 
 	/*
@@ -3481,17 +4235,26 @@ top:
 	 * Note that if target and source are the same, this can be
 	 * done in a single check.
 	 */
-
 	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
-		goto out;
+		goto unlockout;
+
+	if ((*svpp)->v_type == VDIR) {
+		/*
+		 * Avoid ".", "..", and aliases of "." for obvious reasons.
+		 */
+		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
+		    sdzp == szp ||
+		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
+			error = SET_ERROR(EINVAL);
+			goto unlockout;
+		}
 
-	if (ZTOV(szp)->v_type == VDIR) {
 		/*
 		 * Check to make sure rename is valid.
 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
 		 */
-		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
-			goto out;
+		if (error = zfs_rename_check(szp, sdzp, tdzp))
+			goto unlockout;
 	}
 
 	/*
@@ -3501,17 +4264,22 @@ top:
 		/*
 		 * Source and target must be the same type.
 		 */
-		if (ZTOV(szp)->v_type == VDIR) {
-			if (ZTOV(tzp)->v_type != VDIR) {
-				error = ENOTDIR;
-				goto out;
+		if ((*svpp)->v_type == VDIR) {
+			if ((*tvpp)->v_type != VDIR) {
+				error = SET_ERROR(ENOTDIR);
+				goto unlockout;
+			} else {
+				cache_purge(tdvp);
+				if (sdvp != tdvp)
+					cache_purge(sdvp);
 			}
 		} else {
-			if (ZTOV(tzp)->v_type == VDIR) {
-				error = EISDIR;
-				goto out;
+			if ((*tvpp)->v_type == VDIR) {
+				error = SET_ERROR(EISDIR);
+				goto unlockout;
 			}
 		}
+
 		/*
 		 * POSIX dictates that when the source and target
 		 * entries refer to the same file object, rename
@@ -3528,14 +4296,14 @@ top:
 		 */
 		if (szp->z_id == tzp->z_id) {
 			error = 0;
-			goto out;
+			goto unlockout;
 		}
 #endif
 	}
 
-	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
+	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
 	if (tzp)
-		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
+		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
 
 	/*
 	 * notify the target directory if it is not the same
@@ -3546,87 +4314,97 @@ top:
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
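+	/*
+	 * Declare everything the rename may dirty: the SAs of the znodes
+	 * involved, both directories' ZAP entries, and the unlinked set
+	 * in case an existing target has to be destroyed.
+	 */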
-	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
-	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
+	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
-	if (sdzp != tdzp)
-		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
-	if (tzp)
-		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
+	if (sdzp != tdzp) {
+		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, tdzp);
+	}
+	if (tzp) {
+		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, tzp);
+	}
+
+	zfs_sa_upgrade_txholds(tx, szp);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		if (zl != NULL)
-			zfs_rename_unlock(&zl);
-
-		zfs_dirent_unlock(sdl);
-		zfs_dirent_unlock(tdl);
-
-		if (sdzp == tdzp)
-			rw_exit(&sdzp->z_name_lock);
-
-		VN_RELE(ZTOV(szp));
-		if (tzp)
-			VN_RELE(ZTOV(tzp));
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		dmu_tx_abort(tx);
-		ZFS_EXIT(zfsvfs);
-		return (error);
+		goto unlockout;
 	}
 
+
 	if (tzp && (tzp->z_id != szp->z_id))
 		/* Attempt to remove the existing target */
-		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
 
 	if (error == 0) {
 		if (!tzp || (tzp->z_id != szp->z_id))
-			error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+			error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
 		if (error == 0) {
-			szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
+			szp->z_pflags |= ZFS_AV_MODIFIED;
 
-			error = zfs_link_destroy(sdl, szp, tx,
-			    /* Kludge for BSD rename semantics.  */
-			    ((tzp && (tzp->z_id == szp->z_id)) ?
-				zflg : ZRENAMING), NULL);
-			ASSERT(error == 0);
+			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+			ASSERT0(error);
 
-			zfs_log_rename(zilog, tx,
-			    TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
-			    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
+			error = zfs_link_destroy(sdzp, snm, szp, tx,
+			    /* Kludge for BSD rename semantics.  */
+			    tzp && tzp->z_id == szp->z_id ? 0 : ZRENAMING, NULL);
+			if (error == 0) {
+				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
+				    snm, tdzp, tnm, szp);
 
-			/* Update path information for the target vnode */
-			vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
+				/*
+				 * Update path information for the target vnode
+				 */
+				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
+			} else {
+				/*
+				 * At this point, we have successfully created
+				 * the target name, but have failed to remove
+				 * the source name.  Since the create was done
+				 * with the ZRENAMING flag, there are
+				 * complications; for one, the link count is
+				 * wrong.  The easiest way to deal with this
+				 * is to remove the newly created target, and
+				 * return the original error.  This must
+				 * succeed; fortunately, it is very unlikely to
+				 * fail, since we just created it.
+				 */
+				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
+				    ZRENAMING, NULL), ==, 0);
+			}
 		}
 		if (error == 0) {
-			/* Purge cache entries, while still holding locks. */
-			cache_purge(sdvp);
-			cache_purge(tdvp);
+			cache_purge(*svpp);
+			if (*tvpp != NULL)
+				cache_purge(*tvpp);
+			cache_purge_negative(tdvp);
 		}
 	}
 
 	dmu_tx_commit(tx);
-out:
-	if (zl != NULL)
-		zfs_rename_unlock(&zl);
 
-	zfs_dirent_unlock(sdl);
-	zfs_dirent_unlock(tdl);
-
-	if (sdzp == tdzp)
-		rw_exit(&sdzp->z_name_lock);
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
 
+unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
+	ZFS_EXIT(zfsvfs);
 
-	VN_RELE(ZTOV(szp));
-	if (tzp)
-		VN_RELE(ZTOV(tzp));
+	VOP_UNLOCK(*svpp, 0);
+	VOP_UNLOCK(sdvp, 0);
 
-	ZFS_EXIT(zfsvfs);
+	if (*tvpp != NULL && *tvpp != sdvp && *tvpp != *svpp)
+		VOP_UNLOCK(*tvpp, 0);
+	if (tdvp != sdvp && tdvp != *svpp && tdvp != *tvpp)
+		VOP_UNLOCK(tdvp, 0);
 
+out:
 	return (error);
 }
 
@@ -3636,13 +4414,11 @@ out:
  *	IN:	dvp	- Directory to contain new symbolic link.
  *		link	- Name for new symlink entry.
  *		vap	- Attributes of new entry.
- *		target	- Target path of new symlink.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *
- *	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
@@ -3650,18 +4426,18 @@ out:
 /*ARGSUSED*/
 static int
 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
-    cred_t *cr, int flags)
+    cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
-	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
-	int		len = strlen(link);
+	uint64_t	len = strlen(link);
 	int		error;
-	int		zflg = ZNEW;
 	zfs_acl_ids_t	acl_ids;
 	boolean_t	fuid_dirtied;
+	uint64_t	txtype = TX_SYMLINK;
+	int		flags = 0;
 
 	ASSERT(vap->va_type == VLNK);
 
@@ -3672,111 +4448,99 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp,
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
-		return (EILSEQ);
+		return (SET_ERROR(EILSEQ));
 	}
-	if (flags & FIGNORECASE)
-		zflg |= ZCILOOK;
-top:
-	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+
+	if (len > MAXPATHLEN) {
 		ZFS_EXIT(zfsvfs);
-		return (error);
+		return (SET_ERROR(ENAMETOOLONG));
 	}
 
-	if (len > MAXPATHLEN) {
+	if ((error = zfs_acl_ids_create(dzp, 0,
+	    vap, cr, NULL, &acl_ids)) != 0) {
 		ZFS_EXIT(zfsvfs);
-		return (ENAMETOOLONG);
+		return (error);
 	}
 
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
-	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
+	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
 	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
-	VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
-		zfs_dirent_unlock(dl);
 		ZFS_EXIT(zfsvfs);
-		return (EDQUOT);
+		return (SET_ERROR(EDQUOT));
 	}
+
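+	/*
+	 * Reserve a vnode up front so that allocating it inside the
+	 * open transaction cannot block or recurse back into the
+	 * filesystem; paired with getnewvnode_drop_reserve() below.
+	 */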
+	getnewvnode_reserve(1);
 	tx = dmu_tx_create(zfsvfs->z_os);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
-	dmu_tx_hold_bonus(tx, dzp->z_id);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-	if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
-		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE + len);
+	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+		    acl_ids.z_aclp->z_acl_bytes);
+	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
-		zfs_dirent_unlock(dl);
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
-	dmu_buf_will_dirty(dzp->z_dbuf, tx);
-
 	/*
 	 * Create a new object for the symlink.
-	 * Put the link content into bonus buffer if it will fit;
-	 * otherwise, store it just like any other file data.
+	 * For version 4 ZPL datasets the symlink will be an SA attribute.
 	 */
-	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
-		zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids);
-		if (len != 0)
-			bcopy(link, zp->z_phys + 1, len);
-	} else {
-		dmu_buf_t *dbp;
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
-		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
-
-		if (fuid_dirtied)
-			zfs_fuid_sync(zfsvfs, tx);
-		/*
-		 * Nothing can access the znode yet so no locking needed
-		 * for growing the znode's blocksize.
-		 */
-		zfs_grow_blocksize(zp, len, tx);
-
-		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
-		    zp->z_id, 0, FTAG, &dbp));
-		dmu_buf_will_dirty(dbp, tx);
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
 
-		ASSERT3U(len, <=, dbp->db_size);
-		bcopy(link, dbp->db_data, len);
-		dmu_buf_rele(dbp, FTAG);
-	}
-	zp->z_phys->zp_size = len;
+	if (zp->z_is_sa)
+		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+		    link, len, tx);
+	else
+		zfs_sa_symlink(zp, link, len, tx);
 
+	zp->z_size = len;
+	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+	    &zp->z_size, sizeof (zp->z_size), tx);
 	/*
 	 * Insert the new object into the directory.
 	 */
-	(void) zfs_link_create(dl, zp, tx, ZNEW);
-	if (error == 0) {
-		uint64_t txtype = TX_SYMLINK;
-		if (flags & FIGNORECASE)
-			txtype |= TX_CI;
-		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
-		*vpp = ZTOV(zp);
-	}
+	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+
+	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+	*vpp = ZTOV(zp);
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
-	zfs_dirent_unlock(dl);
+	getnewvnode_drop_reserve();
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
@@ -3787,14 +4551,13 @@ top:
  * the symbolic path referred to by vp.
  *
  *	IN:	vp	- vnode of symbolic link.
- *		uoip	- structure to contain the link path.
+ *		uio	- structure to contain the link path.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
- *	OUT:	uio	- structure to contain the link path.
+ *	OUT:	uio	- structure containing the link path.
  *
- *	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - atime updated
@@ -3805,29 +4568,19 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cr
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	size_t		bufsz;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
-	bufsz = (size_t)zp->z_phys->zp_size;
-	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
-		error = uiomove(zp->z_phys + 1,
-		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
-	} else {
-		dmu_buf_t *dbp;
-		error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
-		if (error) {
-			ZFS_EXIT(zfsvfs);
-			return (error);
-		}
-		error = uiomove(dbp->db_data,
-		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
-		dmu_buf_rele(dbp, FTAG);
-	}
+	if (zp->z_is_sa)
+		error = sa_lookup_uio(zp->z_sa_hdl,
+		    SA_ZPL_SYMLINK(zfsvfs), uio);
+	else
+		error = zfs_sa_readlink(zp, uio);
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
@@ -3841,8 +4594,7 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cr
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
- *	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	tdvp - ctime|mtime updated
@@ -3857,11 +4609,9 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, ch
 	znode_t		*tzp, *szp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
-	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
-	vnode_t		*realvp;
 	int		error;
-	int		zf = ZNEW;
+	uint64_t	parent;
 	uid_t		owner;
 
 	ASSERT(tdvp->v_type == VDIR);
@@ -3870,51 +4620,57 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, ch
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
-	if (VOP_REALVP(svp, &realvp, ct) == 0)
-		svp = realvp;
-
-	if (svp->v_vfsp != tdvp->v_vfsp) {
+	/*
+	 * POSIX dictates that we return EPERM here.
+	 * Better choices include ENOTSUP or EISDIR.
+	 */
+	if (svp->v_type == VDIR) {
 		ZFS_EXIT(zfsvfs);
-		return (EXDEV);
+		return (SET_ERROR(EPERM));
 	}
+
 	szp = VTOZ(svp);
 	ZFS_VERIFY_ZP(szp);
 
+	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	/* Prevent links to .zfs/shares files */
+
+	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (uint64_t))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	if (parent == zfsvfs->z_shares_dir) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
 	if (zfsvfs->z_utf8 && u8_validate(name,
 	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
-		return (EILSEQ);
+		return (SET_ERROR(EILSEQ));
 	}
-	if (flags & FIGNORECASE)
-		zf |= ZCILOOK;
 
-top:
 	/*
 	 * We do not support links between attributes and non-attributes
 	 * because of the potential security risk of creating links
 	 * into "normal" file space in order to circumvent restrictions
 	 * imposed in attribute space.
 	 */
-	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
-	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
+	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
 		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
-	/*
-	 * POSIX dictates that we return EPERM here.
-	 * Better choices include ENOTSUP or EISDIR.
-	 */
-	if (svp->v_type == VDIR) {
-		ZFS_EXIT(zfsvfs);
-		return (EPERM);
-	}
 
-	owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
-	if (owner != crgetuid(cr) &&
-	    secpolicy_basic_link(cr) != 0) {
+	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
+	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
 		ZFS_EXIT(zfsvfs);
-		return (EPERM);
+		return (SET_ERROR(EPERM));
 	}
 
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
@@ -3925,73 +4681,131 @@ top:
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
-	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
+	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
 	if (error) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_bonus(tx, szp->z_id);
+	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	zfs_sa_upgrade_txholds(tx, szp);
+	zfs_sa_upgrade_txholds(tx, dzp);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		zfs_dirent_unlock(dl);
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
-	error = zfs_link_create(dl, szp, tx, 0);
+	error = zfs_link_create(dzp, name, szp, tx, 0);
 
 	if (error == 0) {
 		uint64_t txtype = TX_LINK;
-		if (flags & FIGNORECASE)
-			txtype |= TX_CI;
 		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
 	}
 
 	dmu_tx_commit(tx);
 
-	zfs_dirent_unlock(dl);
-
 	if (error == 0) {
 		vnevent_link(svp, ct);
 	}
 
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
+
 /*ARGSUSED*/
+void
+zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+	if (zp->z_sa_hdl == NULL) {
+		/*
+		 * The fs has been unmounted, or we did a
+		 * suspend/resume and this file no longer exists.
+		 */
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		vrecycle(vp);
+		return;
+	}
+
+	if (zp->z_unlinked) {
+		/*
+		 * Fast path to recycle a vnode of a removed file.
+		 */
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		vrecycle(vp);
+		return;
+	}
+
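+	/*
+	 * If only the atime is dirty, write it back now so the update
+	 * is not lost when the vnode is recycled.
+	 */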
+	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, zp);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+		} else {
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
+			zp->z_atime_dirty = 0;
+			dmu_tx_commit(tx);
+		}
+	}
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+}
 
-/* CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); */
-/* CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); */
+
+#ifdef __FreeBSD__
+CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
+CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
+#endif
 
 /*ARGSUSED*/
 static int
 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
 {
-	/* XXX This should bre reviewed maybe Opensolaris version of zfs_fid can
-	   be used for NetBSD */
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	uint32_t	gen;
+	uint64_t	gen64;
 	uint64_t	object = zp->z_id;
 	zfid_short_t	*zfid;
-	int		size, i;
+	int		size, i, error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
-	gen = (uint32_t)zp->z_gen;
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+	    &gen64, sizeof (uint64_t))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	gen = (uint32_t)gen64;
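+
+	/*
+	 * Snapshot mounts (z_parent != zfsvfs) need the long fid, which
+	 * additionally encodes the objset id.
+	 */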
 
 	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
+
+#ifdef illumos
+	if (fidp->fid_len < size) {
+		fidp->fid_len = size;
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENOSPC));
+	}
+#else
 	fidp->fid_len = size;
+#endif
 
 	zfid = (zfid_short_t *)fidp;
 
@@ -4024,656 +4838,14 @@ zfs_fid(vnode_t *vp, fid_t *fidp, caller
 	return (0);
 }
 
-/*
- * Copy the portion of the file indicated from pages into the file.
- * The pages are stored in a page list attached to the files vnode.
- *
- *	IN:	vp	- vnode of file to push page data to.
- *		off	- position in file to put data.
- *		len	- amount of data to write.
- *		flags	- flags to control the operation.
- *		cr	- credentials of caller.
- *		ct	- caller context.
- *
- *	RETURN:	0 if success
- *		error code if failure
- *
- * Timestamps:
- *	vp - ctime|mtime updated
- */
-/*ARGSUSED*/
-#ifdef PORT_SOLARIS
 static int
-zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
+zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
     caller_context_t *ct)
 {
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	page_t		*pp;
-	size_t		io_len;
-	u_offset_t	io_off;
-	uint_t		blksz;
-	rl_t		*rl;
-	int		error = 0;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
+	znode_t		*zp, *xzp;
+	zfsvfs_t	*zfsvfs;
+	int		error;
 
-	/*
-	 * Align this request to the file block size in case we kluster.
-	 * XXX - this can result in pretty aggresive locking, which can
-	 * impact simultanious read/write access.  One option might be
-	 * to break up long requests (len == 0) into block-by-block
-	 * operations to get narrower locking.
-	 */
-	blksz = zp->z_blksz;
-	if (ISP2(blksz))
-		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
-	else
-		io_off = 0;
-	if (len > 0 && ISP2(blksz))
-		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
-	else
-		io_len = 0;
-
-	if (io_len == 0) {
-		/*
-		 * Search the entire vp list for pages >= io_off.
-		 */
-		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
-		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
-		goto out;
-	}
-	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
-
-	if (off > zp->z_phys->zp_size) {
-		/* past end of file */
-		zfs_range_unlock(rl);
-		ZFS_EXIT(zfsvfs);
-		return (0);
-	}
-
-	len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off);
-
-	for (off = io_off; io_off < off + len; io_off += io_len) {
-		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
-			pp = page_lookup(vp, io_off,
-			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
-		} else {
-			pp = page_lookup_nowait(vp, io_off,
-			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
-		}
-
-		if (pp != NULL && pvn_getdirty(pp, flags)) {
-			int err;
-
-			/*
-			 * Found a dirty page to push
-			 */
-			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
-			if (err)
-				error = err;
-		} else {
-			io_len = PAGESIZE;
-		}
-	}
-out:
-	zfs_range_unlock(rl);
-	if ((flags & B_ASYNC) == 0)
-		zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id);
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-
-/*ARGSUSED*/
-void
-zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
-{
-	znode_t	*zp = VTOZ(vp);
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	int error;
-
-	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
-	if (zp->z_dbuf == NULL) {
-		/*
-		 * The fs has been unmounted, or we did a
-		 * suspend/resume and this file no longer exists.
-		 */
-		if (vn_has_cached_data(vp)) {
-			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
-			    B_INVAL, cr);
-		}
-
-		mutex_enter(&zp->z_lock);
-		mutex_enter(&vp->v_lock);
-		ASSERT(vp->v_count == 1);
-		vp->v_count = 0;
-		mutex_exit(&vp->v_lock);
-		mutex_exit(&zp->z_lock);
-		rw_exit(&zfsvfs->z_teardown_inactive_lock);
-		zfs_znode_free(zp);
-		return;
-	}
-
-	/*
-	 * Attempt to push any data in the page cache.  If this fails
-	 * we will get kicked out later in zfs_zinactive().
-	 */
-	if (vn_has_cached_data(vp)) {
-		(void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
-		    cr);
-	}
-
-	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
-		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
-
-		dmu_tx_hold_bonus(tx, zp->z_id);
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		if (error) {
-			dmu_tx_abort(tx);
-		} else {
-			dmu_buf_will_dirty(zp->z_dbuf, tx);
-			mutex_enter(&zp->z_lock);
-			zp->z_atime_dirty = 0;
-			mutex_exit(&zp->z_lock);
-			dmu_tx_commit(tx);
-		}
-	}
-
-	zfs_zinactive(zp);
-	rw_exit(&zfsvfs->z_teardown_inactive_lock);
-}
-#endif /* PORT_SOLARIS */
-
-/*
- * Bounds-check the seek operation.
- *
- *	IN:	vp	- vnode seeking within
- *		ooff	- old file offset
- *		noffp	- pointer to new file offset
- *		ct	- caller context
- *
- *	RETURN:	0 if success
- *		EINVAL if new offset invalid
- */
-/* ARGSUSED */
-static int
-zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
-    caller_context_t *ct)
-{
-	if (vp->v_type == VDIR)
-		return (0);
-	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
-}
-
-#ifdef PORT_SOLARIS
-/*
- * Pre-filter the generic locking function to trap attempts to place
- * a mandatory lock on a memory mapped file.
- */
-static int
-zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
-    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
-{
-	znode_t *zp = VTOZ(vp);
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	/*
-	 * We are following the UFS semantics with respect to mapcnt
-	 * here: If we see that the file is mapped already, then we will
-	 * return an error, but we don't worry about races between this
-	 * function and zfs_map().
-	 */
-	if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) {
-		ZFS_EXIT(zfsvfs);
-		return (EAGAIN);
-	}
-	ZFS_EXIT(zfsvfs);
-	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
-}
-
-
-/*
- * If we can't find a page in the cache, we will create a new page
- * and fill it with file data.  For efficiency, we may try to fill
- * multiple pages at once (klustering) to fill up the supplied page
- * list.  Note that the pages to be filled are held with an exclusive
- * lock to prevent access by other threads while they are being filled.
- */
-static int
-zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
-    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
-{
-	znode_t *zp = VTOZ(vp);
-	page_t *pp, *cur_pp;
-	objset_t *os = zp->z_zfsvfs->z_os;
-	u_offset_t io_off, total;
-	size_t io_len;
-	int err;
-
-	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
-		/*
-		 * We only have a single page, don't bother klustering
-		 */
-		io_off = off;
-		io_len = PAGESIZE;
-		pp = page_create_va(vp, io_off, io_len,
-		    PG_EXCL | PG_WAIT, seg, addr);
-	} else {
-		/*
-		 * Try to find enough pages to fill the page list
-		 */
-		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
-		    &io_len, off, plsz, 0);
-	}
-	if (pp == NULL) {
-		/*
-		 * The page already exists, nothing to do here.
-		 */
-		*pl = NULL;
-		return (0);
-	}
-
-	/*
-	 * Fill the pages in the kluster.
-	 */
-	cur_pp = pp;
-	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
-		caddr_t va;
-
-		ASSERT3U(io_off, ==, cur_pp->p_offset);
-		va = zfs_map_page(cur_pp, S_WRITE);
-		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
-		    DMU_READ_PREFETCH);
-		zfs_unmap_page(cur_pp, va);
-		if (err) {
-			/* On error, toss the entire kluster */
-			pvn_read_done(pp, B_ERROR);
-			/* convert checksum errors into IO errors */
-			if (err == ECKSUM)
-				err = EIO;
-			return (err);
-		}
-		cur_pp = cur_pp->p_next;
-	}
-
-	/*
-	 * Fill in the page list array from the kluster starting
-	 * from the desired offset `off'.
-	 * NOTE: the page list will always be null terminated.
-	 */
-	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
-	ASSERT(pl == NULL || (*pl)->p_offset == off);
-
-	return (0);
-}
-
-/*
- * Return pointers to the pages for the file region [off, off + len]
- * in the pl array.  If plsz is greater than len, this function may
- * also return page pointers from after the specified region
- * (i.e. the region [off, off + plsz]).  These additional pages are
- * only returned if they are already in the cache, or were created as
- * part of a klustered read.
- *
- *	IN:	vp	- vnode of file to get data from.
- *		off	- position in file to get data from.
- *		len	- amount of data to retrieve.
- *		plsz	- length of provided page list.
- *		seg	- segment to obtain pages for.
- *		addr	- virtual address of fault.
- *		rw	- mode of created pages.
- *		cr	- credentials of caller.
- *		ct	- caller context.
- *
- *	OUT:	protp	- protection mode of created pages.
- *		pl	- list of pages created.
- *
- *	RETURN:	0 if success
- *		error code if failure
- *
- * Timestamps:
- *	vp - atime updated
- */
-/* ARGSUSED */
-static int
-zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
-	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
-	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	page_t		**pl0 = pl;
-	int		err = 0;
-
-	/* we do our own caching, faultahead is unnecessary */
-	if (pl == NULL)
-		return (0);
-	else if (len > plsz)
-		len = plsz;
-	else
-		len = P2ROUNDUP(len, PAGESIZE);
-	ASSERT(plsz >= len);
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	if (protp)
-		*protp = PROT_ALL;
-
-	/*
-	 * Loop through the requested range [off, off + len) looking
-	 * for pages.  If we don't find a page, we will need to create
-	 * a new page and fill it with data from the file.
-	 */
-	while (len > 0) {
-		if (*pl = page_lookup(vp, off, SE_SHARED))
-			*(pl+1) = NULL;
-		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
-			goto out;
-		while (*pl) {
-			ASSERT3U((*pl)->p_offset, ==, off);
-			off += PAGESIZE;
-			addr += PAGESIZE;
-			if (len > 0) {
-				ASSERT3U(len, >=, PAGESIZE);
-				len -= PAGESIZE;
-			}
-			ASSERT3U(plsz, >=, PAGESIZE);
-			plsz -= PAGESIZE;
-			pl++;
-		}
-	}
-
-	/*
-	 * Fill out the page array with any pages already in the cache.
-	 */
-	while (plsz > 0 &&
-	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
-			off += PAGESIZE;
-			plsz -= PAGESIZE;
-	}
-out:
-	if (err) {
-		/*
-		 * Release any pages we have previously locked.
-		 */
-		while (pl > pl0)
-			page_unlock(*--pl);
-	} else {
-		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
-	}
-
-	*pl = NULL;
-
-	ZFS_EXIT(zfsvfs);
-	return (err);
-}
-
-/*
- * Request a memory map for a section of a file.  This code interacts
- * with common code and the VM system as follows:
- *
- *	common code calls mmap(), which ends up in smmap_common()
- *
- *	this calls VOP_MAP(), which takes you into (say) zfs
- *
- *	zfs_map() calls as_map(), passing segvn_create() as the callback
- *
- *	segvn_create() creates the new segment and calls VOP_ADDMAP()
- *
- *	zfs_addmap() updates z_mapcnt
- */
-/*ARGSUSED*/
-static int
-zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
-    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
-    caller_context_t *ct)
-{
-	znode_t *zp = VTOZ(vp);
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	segvn_crargs_t	vn_a;
-	int		error;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	if ((prot & PROT_WRITE) &&
-	    (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_READONLY |
-	    ZFS_APPENDONLY))) {
-		ZFS_EXIT(zfsvfs);
-		return (EPERM);
-	}
-
-	if ((prot & (PROT_READ | PROT_EXEC)) &&
-	    (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED)) {
-		ZFS_EXIT(zfsvfs);
-		return (EACCES);
-	}
-
-	if (vp->v_flag & VNOMAP) {
-		ZFS_EXIT(zfsvfs);
-		return (ENOSYS);
-	}
-
-	if (off < 0 || len > MAXOFFSET_T - off) {
-		ZFS_EXIT(zfsvfs);
-		return (ENXIO);
-	}
-
-	if (vp->v_type != VREG) {
-		ZFS_EXIT(zfsvfs);
-		return (ENODEV);
-	}
-
-	/*
-	 * If file is locked, disallow mapping.
-	 */
-	if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) {
-		ZFS_EXIT(zfsvfs);
-		return (EAGAIN);
-	}
-
-	as_rangelock(as);
-	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
-	if (error != 0) {
-		as_rangeunlock(as);
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
-
-	vn_a.vp = vp;
-	vn_a.offset = (u_offset_t)off;
-	vn_a.type = flags & MAP_TYPE;
-	vn_a.prot = prot;
-	vn_a.maxprot = maxprot;
-	vn_a.cred = cr;
-	vn_a.amp = NULL;
-	vn_a.flags = flags & ~MAP_TYPE;
-	vn_a.szc = 0;
-	vn_a.lgrp_mem_policy_flags = 0;
-
-	error = as_map(as, *addrp, len, segvn_create, &vn_a);
-
-	as_rangeunlock(as);
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-
-/* ARGSUSED */
-static int
-zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
-    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
-    caller_context_t *ct)
-{
-	uint64_t pages = btopr(len);
-
-	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
-	return (0);
-}
-
-/*
- * The reason we push dirty pages as part of zfs_delmap() is so that we get a
- * more accurate mtime for the associated file.  Since we don't have a way of
- * detecting when the data was actually modified, we have to resort to
- * heuristics.  If an explicit msync() is done, then we mark the mtime when the
- * last page is pushed.  The problem occurs when the msync() call is omitted,
- * which by far the most common case:
- *
- * 	open()
- * 	mmap()
- * 	<modify memory>
- * 	munmap()
- * 	close()
- * 	<time lapse>
- * 	putpage() via fsflush
- *
- * If we wait until fsflush to come along, we can have a modification time that
- * is some arbitrary point in the future.  In order to prevent this in the
- * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
- * torn down.
- */
-/* ARGSUSED */
-static int
-zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
-    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
-    caller_context_t *ct)
-{
-	uint64_t pages = btopr(len);
-
-	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
-	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
-
-	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
-	    vn_has_cached_data(vp))
-		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
-
-	return (0);
-}
-
-/*
- * Free or allocate space in a file.  Currently, this function only
- * supports the `F_FREESP' command.  However, this command is somewhat
- * misnamed, as its functionality includes the ability to allocate as
- * well as free space.
- *
- *	IN:	vp	- vnode of file to free data in.
- *		cmd	- action to take (only F_FREESP supported).
- *		bfp	- section of file to free/alloc.
- *		flag	- current file open mode flags.
- *		offset	- current file offset.
- *		cr	- credentials of caller [UNUSED].
- *		ct	- caller context.
- *
- *	RETURN:	0 if success
- *		error code if failure
- *
- * Timestamps:
- *	vp - ctime|mtime updated
- */
-/* ARGSUSED */
-static int
-zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
-    offset_t offset, cred_t *cr, caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	uint64_t	off, len;
-	int		error;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-
-	if (cmd != F_FREESP) {
-		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
-	}
-
-	if (error = convoff(vp, bfp, 0, offset)) {
-		ZFS_EXIT(zfsvfs);
-		return (error);
-	}
-
-	if (bfp->l_len < 0) {
-		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
-	}
-
-	off = bfp->l_start;
-	len = bfp->l_len; /* 0 means from off to end of file */
-
-	error = zfs_freesp(zp, off, len, flag, TRUE);
-
-	ZFS_EXIT(zfsvfs);
-	return (error);
-}
-
-/*ARGSUSED*/
-static int
-zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
-{
-	znode_t		*zp = VTOZ(vp);
-	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
-	uint32_t	gen;
-	uint64_t	object = zp->z_id;
-	zfid_short_t	*zfid;
-	int		size, i;
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-	gen = (uint32_t)zp->z_gen;
-
-	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
-	if (fidp->fid_len < size) {
-		fidp->fid_len = size;
-		ZFS_EXIT(zfsvfs);
-		return (ENOSPC);
-	}
-
-	zfid = (zfid_short_t *)fidp;
-
-	zfid->zf_len = size;
-
-	for (i = 0; i < sizeof (zfid->zf_object); i++)
-		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
-
-	/* Must have a non-zero generation number to distinguish from .zfs */
-	if (gen == 0)
-		gen = 1;
-	for (i = 0; i < sizeof (zfid->zf_gen); i++)
-		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
-
-	if (size == LONG_FID_LEN) {
-		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
-		zfid_long_t	*zlfid;
-
-		zlfid = (zfid_long_t *)fidp;
-
-		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
-			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
-
-		/* XXX - this should be the generation number for the objset */
-		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
-			zlfid->zf_setgen[i] = 0;
-	}
-
-	ZFS_EXIT(zfsvfs);
-	return (0);
-}
-#endif /* PORT_SOLARIS */
-
-static int
-zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
-    caller_context_t *ct)
-{
-	znode_t		*zp, *xzp;
-	zfsvfs_t	*zfsvfs;
-	zfs_dirlock_t	*dl;
-	int		error;
-	
 	switch (cmd) {
 	case _PC_LINK_MAX:
 		*valp = INT_MAX;
@@ -4682,21 +4854,19 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong
 	case _PC_FILESIZEBITS:
 		*valp = 64;
 		return (0);
-
-#if 0
+#ifdef illumos
 	case _PC_XATTR_EXISTS:
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 		*valp = 0;
-		error = zfs_dirent_lock(&dl, zp, "", &xzp,
-		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
+		error = zfs_dirent_lookup(zp, "", &xzp,
+		    ZXATTR | ZEXISTS | ZSHARED);
 		if (error == 0) {
-			zfs_dirent_unlock(dl);
 			if (!zfs_dirempty(xzp))
 				*valp = 1;
-			VN_RELE(ZTOV(xzp));
+			vrele(ZTOV(xzp));
 		} else if (error == ENOENT) {
 			/*
 			 * If there aren't extended attributes, it's the
@@ -4706,7 +4876,7 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong
 		}
 		ZFS_EXIT(zfsvfs);
 		return (error);
-#endif
+
 	case _PC_SATTR_ENABLED:
 	case _PC_SATTR_EXISTS:
 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
@@ -4721,21 +4891,93 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong
 	case _PC_ACL_ENABLED:
 		*valp = _ACL_ACE_ENABLED;
 		return (0);
-
+#endif	/* illumos */
 	case _PC_MIN_HOLE_SIZE:
 		*valp = (int)SPA_MINBLOCKSIZE;
 		return (0);
-
+#ifdef illumos
 	case _PC_TIMESTAMP_RESOLUTION:
 		/* nanosecond timestamp resolution */
 		*valp = 1L;
 		return (0);
+#endif
+	case _PC_ACL_EXTENDED:
+		*valp = 0;
+		return (0);
+
+#ifndef __NetBSD__
+	case _PC_ACL_NFS4:
+		*valp = 1;
+		return (0);
+
+	case _PC_ACL_PATH_MAX:
+		*valp = ACL_MAX_ENTRIES;
+		return (0);
+#endif
+
+	default:
+		return (EOPNOTSUPP);
+	}
+}
+
+/*ARGSUSED*/
+static int
+zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
+    caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
+    caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+	zilog_t	*zilog = zfsvfs->z_log;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
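+/*
+ * Map vnode-layer IO_* flags onto the FAPPEND/FNONBLOCK/FSYNC-style
+ * flags that the shared ZFS read/write paths expect.
+ */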
+static int
+ioflags(int ioflags)
+{
+	int flags = 0;
 
-	default:
-		return (EOPNOTSUPP);
-	}
+	if (ioflags & IO_APPEND)
+		flags |= FAPPEND;
+	if (ioflags & IO_NDELAY)
+		flags |= FNONBLOCK;
+	if (ioflags & IO_SYNC)
+		flags |= (FSYNC | FDSYNC | FRSYNC);
+
+	return (flags);
 }
 
+#ifdef __NetBSD__
+
 static int
 zfs_netbsd_open(void *v)
 {
@@ -4767,7 +5009,7 @@ zfs_netbsd_read(void *v)
 {
 	struct vop_read_args *ap = v;
 
-	return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
+	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL));
 }
 
 static int
@@ -4775,7 +5017,7 @@ zfs_netbsd_write(void *v)
 {
 	struct vop_write_args *ap = v;
 
-	return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
+	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL));
 }
 
 static int
@@ -4824,15 +5066,11 @@ zfs_netbsd_lookup(void *v)
 	struct componentname *cnp = ap->a_cnp;
 	char nm[NAME_MAX + 1];
 	int error;
+	int iswhiteout;
 
-	KASSERT(dvp != NULL);
-	KASSERT(vpp != NULL);
-	KASSERT(cnp != NULL);
-	KASSERT(cnp->cn_nameptr != NULL);
 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
 	KASSERT(cnp->cn_namelen < sizeof nm);
 
-#if 0				/* Namecache too scary to contemplate.  */
 	*vpp = NULL;
 
 	/*
@@ -4849,9 +5087,13 @@ zfs_netbsd_lookup(void *v)
 	 * Check the namecache before entering zfs_lookup.
 	 * cache_lookup does the locking dance for us.
 	 */
-	if ((error = cache_lookup(dvp, vpp, cnp)) >= 0)
-		goto out;
-#endif
+	if (cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
+	    cnp->cn_nameiop, cnp->cn_flags, &iswhiteout, vpp)) {
+		if (iswhiteout) {
+			cnp->cn_flags |= ISWHITEOUT;
+		}
+		return (*vpp == NULL ? ENOENT : 0);
+	}
 
 	/*
 	 * zfs_lookup wants a null-terminated component name, but namei
@@ -4889,7 +5131,7 @@ zfs_netbsd_lookup(void *v)
 		KASSERT(*vpp == NULL);
 		goto out;
 	}
-	KASSERT(*vpp != NULL);	/* XXX Correct?  */
+	KASSERT(*vpp != NULL);
 
 	if ((cnp->cn_namelen == 1) && (cnp->cn_nameptr[0] == '.')) {
 		KASSERT(!(cnp->cn_flags & ISDOTDOT));
@@ -4905,33 +5147,13 @@ zfs_netbsd_lookup(void *v)
 out:
 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
 
-#if 0				/* Namecache too scary to contemplate.  */
 	/*
 	 * Insert name into cache if appropriate.
-	 *
-	 * XXX This seems like a lot of code.  Is it all necessary?
-	 * Can we just do a single cache_enter call?
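+	 * Negative results (ENOENT) are cached as well, except for
+	 * CREATE lookups, where the name is about to come into existence.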
 	 */
-#if 0
-	cache_enter(dvp, *vpp, cnp);
-#endif
 
-	if ((cnp->cn_flags & MAKEENTRY) == 0)
-		return (error);
-
-	switch (error) {
-	case 0:
-		cache_enter(dvp, *vpp, cnp);
-		break;
-	case ENOENT:
-		KASSERT(*vpp == NULL);
-		if (cnp->cn_nameiop != CREATE)
-			cache_enter(dvp, NULL, cnp);
-		break;
-	default:
-		break;
-	}
-#endif
+	if (error == 0 || (error == ENOENT && cnp->cn_nameiop != CREATE))
+		cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen,
+		    cnp->cn_flags);
 
 	return (error);
 }
@@ -4952,11 +5174,6 @@ zfs_netbsd_create(void *v)
 	int mode;
 	int error;
 
-	KASSERT(dvp != NULL);
-	KASSERT(vpp != NULL);
-	KASSERT(cnp != NULL);
-	KASSERT(vap != NULL);
-	KASSERT(cnp->cn_nameptr != NULL);
 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
 
 	vattr_init_mask(vap);
@@ -4964,10 +5181,11 @@ zfs_netbsd_create(void *v)
 
 	/* XXX !EXCL is wrong here...  */
 	error = zfs_create(dvp, __UNCONST(cnp->cn_nameptr), vap, !EXCL, mode,
-	    vpp, cnp->cn_cred);
+	    vpp, cnp->cn_cred, NULL);
 
 	KASSERT((error == 0) == (*vpp != NULL));
 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+	if (*vpp != NULL)
+		VOP_UNLOCK(*vpp, 0);
 
 	return (error);
 }
@@ -4985,22 +5203,11 @@ zfs_netbsd_remove(void *v)
 	struct componentname *cnp = ap->a_cnp;
 	int error;
 
-	KASSERT(dvp != NULL);
-	KASSERT(vp != NULL);
-	KASSERT(cnp != NULL);
-	KASSERT(cnp->cn_nameptr != NULL);
 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
 	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
 
-	/*
-	 * zfs_remove will look up the entry again itself, so discard vp.
-	 */
-	VOP_UNLOCK(vp);
-	VN_RELE(vp);
-
-	error = zfs_remove(dvp, __UNCONST(cnp->cn_nameptr), cnp->cn_cred, NULL,
-	    0);
-
+	error = zfs_remove(dvp, vp, __UNCONST(cnp->cn_nameptr), cnp->cn_cred);
+	vput(vp);
 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
 	return (error);
 }
@@ -5020,20 +5227,16 @@ zfs_netbsd_mkdir(void *v)
 	struct vattr *vap = ap->a_vap;
 	int error;
 
-	KASSERT(dvp != NULL);
-	KASSERT(vpp != NULL);
-	KASSERT(cnp != NULL);
-	KASSERT(vap != NULL);
-	KASSERT(cnp->cn_nameptr != NULL);
 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
 
 	vattr_init_mask(vap);
 
 	error = zfs_mkdir(dvp, __UNCONST(cnp->cn_nameptr), vap, vpp,
-	    cnp->cn_cred, NULL, 0, NULL);
+	    cnp->cn_cred);
 
 	KASSERT((error == 0) == (*vpp != NULL));
 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+	if (error == 0)
+		VOP_UNLOCK(*vpp, 0);
 
 	return (error);
 }
@@ -5051,22 +5254,11 @@ zfs_netbsd_rmdir(void *v)
 	struct componentname *cnp = ap->a_cnp;
 	int error;
 
-	KASSERT(dvp != NULL);
-	KASSERT(vp != NULL);
-	KASSERT(cnp != NULL);
-	KASSERT(cnp->cn_nameptr != NULL);
 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
 	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
 
-	/*
-	 * zfs_rmdir will look up the entry again itself, so discard vp.
-	 */
-	VOP_UNLOCK(vp);
-	VN_RELE(vp);
-
-	error = zfs_rmdir(dvp, __UNCONST(cnp->cn_nameptr), NULL, cnp->cn_cred,
-	    NULL, 0);
-
+	error = zfs_rmdir(dvp, vp, __UNCONST(cnp->cn_nameptr), cnp->cn_cred);
+	vput(vp);
 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
 	return error;
 }
@@ -5077,7 +5269,7 @@ zfs_netbsd_readdir(void *v)
 	struct vop_readdir_args *ap = v;
 
 	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
-		ap->a_ncookies, (u_long **)ap->a_cookies));
+		ap->a_ncookies, ap->a_cookies));
 }
 
 static int
@@ -5150,7 +5342,7 @@ zfs_netbsd_setattr(void *v)
 	xva_init(&xvap);
 	xvap.xva_vattr = *vap;
 
-	zflags = VTOZ(vp)->z_phys->zp_flags;
+	zflags = VTOZ(vp)->z_pflags;
 
 	if (vap->va_flags != VNOVAL) {
 		int error;
@@ -5229,17 +5421,8 @@ zfs_netbsd_rename(void *v)
 	kauth_cred_t cred;
 	int error;
 
-	KASSERT(fdvp != NULL);
-	KASSERT(fvp != NULL);
-	KASSERT(fcnp != NULL);
-	KASSERT(fcnp->cn_nameptr != NULL);
-	KASSERT(tdvp != NULL);
-	KASSERT(tcnp != NULL);
-	KASSERT(fcnp->cn_nameptr != NULL);
-	/* KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
-	/* KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
 	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
-	KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
+	KASSERT(tvp == NULL || VOP_ISLOCKED(tvp) == LK_EXCLUSIVE);
 	KASSERT(fdvp->v_type == VDIR);
 	KASSERT(tdvp->v_type == VDIR);
 
@@ -5255,9 +5438,9 @@ zfs_netbsd_rename(void *v)
 	/*
 	 * Drop the insane locks.
 	 */
-	VOP_UNLOCK(tdvp);
-	if ((tvp != NULL) && (tvp != tdvp))
-		VOP_UNLOCK(tvp);
+	VOP_UNLOCK(tdvp, 0);
+	if (tvp != NULL && tvp != tdvp)
+		VOP_UNLOCK(tvp, 0);
 
 	/*
 	 * Release the source and target nodes; zfs_rename will look
@@ -5266,19 +5449,24 @@ zfs_netbsd_rename(void *v)
 	VN_RELE(fvp);
 	if (tvp != NULL)
 		VN_RELE(tvp);
+	fvp = NULL;
+	tvp = NULL;
 
 	/*
 	 * Do the rename ZFSly.
 	 */
-	error = zfs_rename(fdvp, __UNCONST(fcnp->cn_nameptr), tdvp,
-	    __UNCONST(tcnp->cn_nameptr), cred, NULL, 0);
+	error = zfs_rename(fdvp, &fvp, fcnp, tdvp, &tvp, tcnp, cred);
 
 	/*
 	 * Release the directories now too, because the VOP_RENAME
 	 * protocol is insane.
 	 */
+
 	VN_RELE(fdvp);
 	VN_RELE(tdvp);
+	VN_RELE(fvp);
+	if (tvp != NULL)
+		VN_RELE(tvp);
 
 	return (error);
 }
@@ -5300,12 +5488,6 @@ zfs_netbsd_symlink(void *v)
 	char *target = ap->a_target;
 	int error;
 
-	KASSERT(dvp != NULL);
-	KASSERT(vpp != NULL);
-	KASSERT(cnp != NULL);
-	KASSERT(vap != NULL);
-	KASSERT(target != NULL);
-	KASSERT(cnp->cn_nameptr != NULL);
 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
 
 	vap->va_type = VLNK;	/* Netbsd: Syscall only sets va_mode. */
@@ -5316,176 +5498,11 @@ zfs_netbsd_symlink(void *v)
 
 	KASSERT((error == 0) == (*vpp != NULL));
 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
+	if (error == 0)
+		VOP_UNLOCK(*vpp, 0);
 
 	return (error);
 }
 
-#ifdef PORT_SOLARIS
-/*
- * Tunable, both must be a power of 2.
- *
- * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
- * zcr_blksz_max: if set to less than the file block size, allow loaning out of
- *                an arcbuf for a partial block read
- */
-int zcr_blksz_min = (1 << 10);	/* 1K */
-int zcr_blksz_max = (1 << 17);	/* 128K */
-
-/*ARGSUSED*/
-static int
-zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
-    caller_context_t *ct)
-{
-	znode_t	*zp = VTOZ(vp);
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	int max_blksz = zfsvfs->z_max_blksz;
-	uio_t *uio = &xuio->xu_uio;
-	ssize_t size = uio->uio_resid;
-	offset_t offset = uio->uio_loffset;
-	int blksz;
-	int fullblk, i;
-	arc_buf_t *abuf;
-	ssize_t maxsize;
-	int preamble, postamble;
-
-	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
-		return (EINVAL);
-
-	ZFS_ENTER(zfsvfs);
-	ZFS_VERIFY_ZP(zp);
-	switch (ioflag) {
-	case UIO_WRITE:
-		/*
-		 * Loan out an arc_buf for write if write size is bigger than
-		 * max_blksz, and the file's block size is also max_blksz.
-		 */
-		blksz = max_blksz;
-		if (size < blksz || zp->z_blksz != blksz) {
-			ZFS_EXIT(zfsvfs);
-			return (EINVAL);
-		}
-		/*
-		 * Caller requests buffers for write before knowing where the
-		 * write offset might be (e.g. NFS TCP write).
-		 */
-		if (offset == -1) {
-			preamble = 0;
-		} else {
-			preamble = P2PHASE(offset, blksz);
-			if (preamble) {
-				preamble = blksz - preamble;
-				size -= preamble;
-			}
-		}
-
-		postamble = P2PHASE(size, blksz);
-		size -= postamble;
-
-		fullblk = size / blksz;
-		(void) dmu_xuio_init(xuio,
-		    (preamble != 0) + fullblk + (postamble != 0));
-		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
-		    int, postamble, int,
-		    (preamble != 0) + fullblk + (postamble != 0));
-
-		/*
-		 * Have to fix iov base/len for partial buffers.  They
-		 * currently represent full arc_buf's.
-		 */
-		if (preamble) {
-			/* data begins in the middle of the arc_buf */
-			abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
-			ASSERT(abuf);
-			(void) dmu_xuio_add(xuio, abuf,
-			    blksz - preamble, preamble);
-		}
-
-		for (i = 0; i < fullblk; i++) {
-			abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
-			ASSERT(abuf);
-			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
-		}
-
-		if (postamble) {
-			/* data ends in the middle of the arc_buf */
-			abuf = dmu_request_arcbuf(zp->z_dbuf, blksz);
-			ASSERT(abuf);
-			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
-		}
-		break;
-	case UIO_READ:
-		/*
-		 * Loan out an arc_buf for read if the read size is larger than
-		 * the current file block size.  Block alignment is not
-		 * considered.  Partial arc_buf will be loaned out for read.
-		 */
-		blksz = zp->z_blksz;
-		if (blksz < zcr_blksz_min)
-			blksz = zcr_blksz_min;
-		if (blksz > zcr_blksz_max)
-			blksz = zcr_blksz_max;
-		/* avoid potential complexity of dealing with it */
-		if (blksz > max_blksz) {
-			ZFS_EXIT(zfsvfs);
-			return (EINVAL);
-		}
-
-		maxsize = zp->z_phys->zp_size - uio->uio_loffset;
-		if (size > maxsize)
-			size = maxsize;
-
-		if (size < blksz || vn_has_cached_data(vp)) {
-			ZFS_EXIT(zfsvfs);
-			return (EINVAL);
-		}
-		break;
-	default:
-		ZFS_EXIT(zfsvfs);
-		return (EINVAL);
-	}
-
-	uio->uio_extflg = UIO_XUIO;
-	XUIO_XUZC_RW(xuio) = ioflag;
-	ZFS_EXIT(zfsvfs);
-	return (0);
-}
-
-/*ARGSUSED*/
-static int
-zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
-{
-	int i;
-	arc_buf_t *abuf;
-	int ioflag = XUIO_XUZC_RW(xuio);
-
-	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
-
-	i = dmu_xuio_cnt(xuio);
-	while (i-- > 0) {
-		abuf = dmu_xuio_arcbuf(xuio, i);
-		/*
-		 * if abuf == NULL, it must be a write buffer
-		 * that has been returned in zfs_write().
-		 */
-		if (abuf)
-			dmu_return_arcbuf(abuf);
-		ASSERT(abuf || ioflag == UIO_WRITE);
-	}
-
-	dmu_xuio_fini(xuio);
-	return (0);
-}
-
-/*
- * Predeclare these here so that the compiler assumes that
- * this is an "old style" function declaration that does
- * not include arguments => we won't get type mismatch errors
- * in the initializations that follow.
- */
-static int zfs_inval();
-static int zfs_isdir();
-#endif
-
 static int
 zfs_netbsd_readlink(void *v)
 {
@@ -5507,14 +5524,13 @@ zfs_netbsd_link(void *v)
 	struct componentname *cnp = ap->a_cnp;
 	int error;
 
-	KASSERT(dvp != NULL);
-	KASSERT(vp != NULL);
-	KASSERT(cnp != NULL);
-	KASSERT(cnp->cn_nameptr != NULL);
 	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
 
-	return (zfs_link(dvp, vp, __UNCONST(cnp->cn_nameptr), cnp->cn_cred,
-		NULL, 0));
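+	/* VOP_LINK passes vp unlocked, but zfs_link() wants it locked. */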
+	vn_lock(vp, LK_EXCLUSIVE);
+	error = zfs_link(dvp, vp, __UNCONST(cnp->cn_nameptr), cnp->cn_cred,
+	    NULL, 0);
+	VOP_UNLOCK(vp, 0);
+	return error;
 }
 
 static int
@@ -5545,55 +5561,42 @@ zfs_netbsd_reclaim(void *v)
 	zfsvfs_t *zfsvfs;
 	int error;
 
-	KASSERT(vp != NULL);
-	VOP_UNLOCK(vp);
+	VOP_UNLOCK(vp, 0);
 	zp = VTOZ(vp);
-	KASSERT(zp != NULL);
 	zfsvfs = zp->z_zfsvfs;
-	KASSERT(zfsvfs != NULL);
 
-	/* Not until we get uvm and zfs talking to one another.  */
-	KASSERT(!vn_has_cached_data(vp));
+	KASSERTMSG(!vn_has_cached_data(vp), "vp %p", vp);
 
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
-	if (zp->z_dbuf == NULL) {
-		/*
-		 * The fs has been unmounted, or we did a
-		 * suspend/resume and this file no longer exists.
-		 */
-		if (vn_has_cached_data(vp))
-			/* zfs and uvm are hosed.  Should not happen.  */
-			panic("zfs vnode has cached data (0): %p", vp);
-		rw_exit(&zfsvfs->z_teardown_inactive_lock);
-		zfs_znode_free(zp);
-		return (0);
-	}
 
 	/*
-	 * Attempt to push any data in the page cache.  If this fails
-	 * we will get kicked out later in zfs_zinactive().
+	 * Process a deferred atime update.
+	 *
+	 * XXXNETBSD I don't think this actually works.
+	 * We are dirtying the znode again after the vcache layer cleaned it,
+	 * so we would need to zil_commit() again here.
 	 */
-	if (vn_has_cached_data(vp))
-		/* zfs and uvm are hosed.  Should not happen.  */
-		panic("zfs vnode has cached data (1): %p", vp);
-
 	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 
-		dmu_tx_hold_bonus(tx, zp->z_id);
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 		} else {
-			dmu_buf_will_dirty(zp->z_dbuf, tx);
-			mutex_enter(&zp->z_lock);
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
 			zp->z_atime_dirty = 0;
-			mutex_exit(&zp->z_lock);
 			dmu_tx_commit(tx);
 		}
 	}
 
-	zfs_zinactive(zp);
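+	/*
+	 * If the file system was unmounted, or a suspend/resume left
+	 * this file without backing store, there is no SA handle and
+	 * the znode can simply be freed.
+	 */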
+	if (zp->z_sa_hdl == NULL)
+		zfs_znode_free(zp);
+	else
+		zfs_zinactive(zp);
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 	return 0;
 }
@@ -5653,10 +5656,6 @@ zfs_netbsd_pathconf(void *v)
 	return (error);
 }
 
-#define	zfs_netbsd_lock		genfs_lock
-#define	zfs_netbsd_unlock	genfs_unlock
-#define	zfs_netbsd_islocked	genfs_islocked
-
 static int
 zfs_netbsd_advlock(void *v)
 {
@@ -5673,66 +5672,321 @@ zfs_netbsd_advlock(void *v)
 	int error;
 
 	vp = ap->a_vp;
-	KASSERT(vp != NULL);
 	zp = VTOZ(vp);
-	KASSERT(zp != NULL);
 	zfsvfs = zp->z_zfsvfs;
-	KASSERT(zfsvfs != NULL);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
-	error = lf_advlock(ap, &zp->z_lockf, zp->z_phys->zp_size);
+	error = lf_advlock(ap, &zp->z_lockf, zp->z_size);
 	ZFS_EXIT(zfsvfs);
 
 	return error;
 }
 
-/*
-int
+static int
 zfs_netbsd_getpages(void *v)
 {
-	struct vnode *vp = ((struct vop_getpages_args *)v)->a_vp;
-	voff_t offset = ((struct vop_getpages_args *)v)->a_offset;
-	struct vm_page **m = ((struct vop_getpages_args *)v)->a_m;
-	int *count = ((struct vop_getpages_args *)v)->a_count;
-	int centeridx = ((struct vop_getpages_args *)v)->a_centeridx;
-	vm_prot_t access_type = ((struct vop_getpages_args *)v)->a_access_type;
-	int advice = ((struct vop_getpages_args *)v)->a_advice;
-	int flags = ((struct vop_getpages_args *)v)->a_flags;
-	
-	int error;
+	struct vop_getpages_args /* {
+		struct vnode *a_vp;
+		voff_t a_offset;
+		struct vm_page **a_m;
+		int *a_count;
+		int a_centeridx;
+		vm_prot_t a_access_type;
+		int a_advice;
+		int a_flags;
+	} */ * const ap = v;
 
-	error = 0;
-	
-	KASSERT(!vn_has_cached_data(vp));
-	mutex_exit(&vp->v_interlock);
+	vnode_t *const vp = ap->a_vp;
+	off_t offset = ap->a_offset + (ap->a_centeridx << PAGE_SHIFT);
+	const int flags = ap->a_flags;
+	const bool async = (flags & PGO_SYNCIO) == 0;
+	const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
 
-	return error;
+	struct uvm_object * const uobj = &vp->v_uobj;
+	kmutex_t * const mtx = uobj->vmobjlock;
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	struct vm_page *pg;
+	caddr_t va;
+	int npages, found, err = 0;
+
+	if (flags & PGO_LOCKED) {
+		*ap->a_count = 0;
+		ap->a_m[ap->a_centeridx] = NULL;
+		return EBUSY;
+	}
+	mutex_exit(mtx);
+
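+	/*
+	 * This pager only handles the synchronous, single-page case;
+	 * asynchronous read-ahead requests are simply ignored.
+	 */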
+	if (async) {
+		return 0;
+	}
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	mutex_enter(mtx);
+	npages = 1;
+	pg = NULL;
+	uvn_findpages(uobj, offset, &npages, &pg, UFP_ALL);
+
+	if (pg->flags & PG_FAKE) {
+		mutex_exit(mtx);
+
+		va = zfs_map_page(pg, S_WRITE);
+		err = dmu_read(zfsvfs->z_os, zp->z_id, offset, PAGE_SIZE,
+		    va, DMU_READ_PREFETCH);
+		zfs_unmap_page(pg, va);
+
+		mutex_enter(mtx);
+		pg->flags &= ~(PG_FAKE);
+		pmap_clear_modify(pg);
+	}
+
+	if (memwrite) {
+		if ((vp->v_iflag & VI_ONWORKLST) == 0) {
+			vn_syncer_add_to_worklist(vp, filedelay);
+		}
+		if ((vp->v_iflag & (VI_WRMAP|VI_WRMAPDIRTY)) == VI_WRMAP) {
+			vp->v_iflag |= VI_WRMAPDIRTY;
+		}
+	}
+	mutex_exit(mtx);
+	ap->a_m[ap->a_centeridx] = pg;
+
+	ZFS_EXIT(zfsvfs);
+
+	return (err);
 }
-*/
 
-int
+static int
+zfs_putapage(vnode_t *vp, page_t **pp, int count, int flags)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	dmu_tx_t	*tx;
+	voff_t		off, koff;
+	voff_t		len, klen;
+	int		err;
+
+	bool async = (flags & PGO_SYNCIO) == 0;
+	bool *cleanedp;
+	struct uvm_object *uobj = &vp->v_uobj;
+	kmutex_t *mtx = uobj->vmobjlock;
+
+	off = pp[0]->offset;
+	len = count * PAGESIZE;
+	KASSERT(off + len <= round_page(zp->z_size));
+
+	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
+		err = SET_ERROR(EDQUOT);
+		goto out;
+	}
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_write(tx, zp->z_id, off, len);
+
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		goto out;
+	}
+
+	if (zp->z_blksz <= PAGESIZE) {
+		KASSERTMSG(count == 1, "vp %p pp %p count %d", vp, pp, count);
+		caddr_t va = zfs_map_page(*pp, S_READ);
+		ASSERT3U(len, <=, PAGESIZE);
+		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
+		zfs_unmap_page(*pp, va);
+	} else {
+		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
+	}
+	cleanedp = tsd_get(zfs_putpage_key);
+	*cleanedp = true;
+
+	if (err == 0) {
+		uint64_t mtime[2], ctime[2];
+		sa_bulk_attr_t bulk[3];
+		int count = 0;
+
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+		    &mtime, 16);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    &ctime, 16);
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+		    &zp->z_pflags, 8);
+		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+		    B_TRUE);
+		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		ASSERT0(err);
+		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
+	}
+	dmu_tx_commit(tx);
+
+	if (async) {
+		mutex_enter(mtx);
+		mutex_enter(&uvm_pageqlock);
+		uvm_page_unbusy(pp, count);
+		mutex_exit(&uvm_pageqlock);
+		mutex_exit(mtx);
+	}
+
+out:
+	return (err);
+}
+
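+/*
+ * genfs gop_markupdate hook: a page was modified through a mapping,
+ * so bump the file's change timestamps in a fresh transaction.
+ */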
+static void
+zfs_netbsd_gop_markupdate(vnode_t *vp, int flags)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	dmu_tx_t	*tx;
+	sa_bulk_attr_t	bulk[2];
+	uint64_t	mtime[2], ctime[2];
+	int		count = 0, err;
+
+	KASSERT(flags == GOP_UPDATE_MODIFIED);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err != 0) {
+		dmu_tx_abort(tx);
+		return;
+	}
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+	dmu_tx_commit(tx);
+}
+
+static int
 zfs_netbsd_putpages(void *v)
 {
-	struct vnode *vp = ((struct vop_putpages_args *)v)->a_vp;
-	voff_t offlo = ((struct vop_putpages_args *)v)->a_offlo;
-	voff_t offhi = ((struct vop_putpages_args *)v)->a_offhi;
-	int flags = ((struct vop_putpages_args *)v)->a_flags;
+	struct vop_putpages_args /* {
+		struct vnode *a_vp;
+		voff_t a_offlo;
+		voff_t a_offhi;
+		int a_flags;
+	} */ * const ap = v;
+
+	struct vnode *vp = ap->a_vp;
+	voff_t offlo = ap->a_offlo;
+	voff_t offhi = ap->a_offhi;
+	int flags = ap->a_flags;
+
 	znode_t *zp = VTOZ(vp);
-	
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	rl_t *rl = NULL;
 	int error;
+	bool cleaned = false;
+
+	bool async = (flags & PGO_SYNCIO) == 0;
+	bool cleaning = (flags & PGO_CLEANIT) != 0;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
 
-	dprintf("putpages entry %p -- zfsvfs %p\n", vp, zp->z_zfsvfs);
+	if (cleaning) {
+		rl = zfs_range_lock(zp, offlo, offhi, RL_WRITER);
+		tsd_set(zfs_putpage_key, &cleaned);
+	}
 	error = genfs_putpages(v);
-	dprintf("putpages exit %p -- zfsvfs %p\n", vp, zp->z_zfsvfs);
-	
+	if (rl) {
+		tsd_set(zfs_putpage_key, NULL);
+		zfs_range_unlock(rl);
+	}
+
+	/*
+	 * Only zil_commit() if we cleaned something.
+	 * This avoids deadlock if we're called from zfs_netbsd_setsize().
+	 */
+
+	if (cleaned &&
+	    (!async || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
+		zil_commit(zfsvfs->z_log, zp->z_id);
+	ZFS_EXIT(zfsvfs);
 	return error;
 }
 
-#define zfs_netbsd_seek genfs_seek
-#define zfs_netbsd_mmap genfs_eopnotsupp
-#define zfs_netbsd_getpages genfs_compat_getpages
-//#define zfs_netbsd_putpages genfs_putpages
+/*
+ * Restrict the putpages range to the ZFS block containing the offset.
+ */
+static void
+zfs_netbsd_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
+{
+	znode_t *zp = VTOZ(vp);
+
+	*lop = trunc_page(rounddown2(off, zp->z_blksz));
+	*hip = round_page(*lop + zp->z_blksz);
+}
+
+void
+zfs_netbsd_setsize(vnode_t *vp, off_t size)
+{
+	struct uvm_object *uobj = &vp->v_uobj;
+	kmutex_t *mtx = uobj->vmobjlock;
+	page_t *pg;
+	int count, pgoff;
+	caddr_t va;
+	off_t tsize;
+
+	uvm_vnp_setsize(vp, size);
+	if (!vn_has_cached_data(vp))
+		return;
+
+	tsize = trunc_page(size);
+	if (tsize == size)
+		return;
+
+	/*
+	 * If there's a partial page, we need to zero the tail.
+	 */
+
+	mutex_enter(mtx);
+	count = 1;
+	pg = NULL;
+	if (uvn_findpages(uobj, tsize, &count, &pg, UFP_NOALLOC)) {
+		va = zfs_map_page(pg, S_WRITE);
+		pgoff = size - tsize;
+		memset(va + pgoff, 0, PAGESIZE - pgoff);
+		zfs_unmap_page(pg, va);
+		uvm_page_unbusy(&pg, 1);
+	}
+
+	mutex_exit(mtx);
+}
+
+static int
+zfs_netbsd_print(void *v)
+{
+	struct vop_print_args /* {
+		struct vnode	*a_vp;
+	} */ *ap = v;
+	vnode_t	*vp;
+	znode_t	*zp;
+
+	vp = ap->a_vp;
+	zp = VTOZ(vp);
+
+	printf("\tino %" PRIu64 " size %" PRIu64 "\n",
+	       zp->z_id, zp->z_size);
+	return 0;
+}
+
+const struct genfs_ops zfs_genfsops = {
+	.gop_write = zfs_putapage,
+	.gop_markupdate = zfs_netbsd_gop_markupdate,
+	.gop_putrange = zfs_netbsd_gop_putrange,
+};
+
+#define	zfs_netbsd_lock		genfs_lock
+#define	zfs_netbsd_unlock	genfs_unlock
+#define	zfs_netbsd_islocked	genfs_islocked
+#define zfs_netbsd_seek		genfs_seek
+#define zfs_netbsd_mmap		genfs_mmap
+#define zfs_netbsd_fcntl	genfs_fcntl
 
 int (**zfs_vnodeop_p)(void *);
 const struct vnodeopv_entry_desc zfs_vnodeop_entries[] = {
@@ -5767,15 +6021,12 @@ const struct vnodeopv_entry_desc zfs_vno
 	{ &vop_mmap_desc,		zfs_netbsd_mmap },
 	{ &vop_islocked_desc,		zfs_netbsd_islocked },
 	{ &vop_advlock_desc,		zfs_netbsd_advlock },
-#ifdef notyet
-	{ &vop_fcntl_desc,		zfs_netbsd_fcntl },
-	{ &vop_bmap_desc,		zfs_netbsd_bmap },
-	{ &vop_strategy_desc,		zfs_netbsd_strategy },		
 	{ &vop_print_desc,		zfs_netbsd_print },
-	{ &vop_bwrite_desc,		zfs_netbsd_bwrite },
-#endif		
+	{ &vop_fcntl_desc,		zfs_netbsd_fcntl },
 	{ NULL, NULL }
 };
 
 const struct vnodeopv_desc zfs_vnodeop_opv_desc =
 	{ &zfs_vnodeop_p, zfs_vnodeop_entries };
+
+#endif /* __NetBSD__ */
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c,v
retrieving revision 1.19
diff -u -p -r1.19 zfs_znode.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c	20 Aug 2016 12:37:06 -0000	1.19
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c	10 Jul 2017 20:29:31 -0000
@@ -19,11 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
 
 #ifdef _KERNEL
 #include <sys/types.h>
@@ -47,29 +49,46 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_rlock.h>
 #include <sys/zfs_fuid.h>
+#include <sys/dnode.h>
 #include <sys/fs/zfs.h>
 #include <sys/kidmap.h>
+
+#ifdef __NetBSD__
+#include <miscfs/specfs/specdev.h>
+
+extern int (**zfs_vnodeop_p)(void *);
+extern int (**zfs_fifoop_p)(void *);
+extern int (**zfs_specop_p)(void *);
+
+struct zfs_loadvnode_args {
+	dmu_buf_t		*db;
+	int			blksz;
+	dmu_object_type_t	obj_type;
+	void			*sa_hdl;
+};
+
+uint_t zfs_loadvnode_key;
+
+#endif
 #endif /* _KERNEL */
 
 #include <sys/dmu.h>
+#include <sys/dmu_objset.h>
 #include <sys/refcount.h>
 #include <sys/stat.h>
 #include <sys/zap.h>
 #include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
 
 #include "zfs_prop.h"
+#include "zfs_comutil.h"
 
-#if defined(_KERNEL) && defined(__NetBSD__)
-#include <miscfs/specfs/specdev.h>
-static const struct genfs_ops zfs_genfsops = {
-	.gop_write = genfs_compat_gop_write,
-};
-
-#endif
-
-extern int (**zfs_vnodeop_p)(void *);
-extern int (**zfs_fifoop_p)(void *);
-extern int (**zfs_specop_p)(void *);
+/* Used by fstat(1). */
+SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
+    SYSCTL_NULL_INT_PTR, sizeof(znode_t), "sizeof(znode_t)");
 
 /*
  * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
@@ -85,9 +104,6 @@ extern int (**zfs_specop_p)(void *);
 #define	ZNODE_STAT_ADD(stat)			/* nothing */
 #endif	/* ZNODE_STATS */
 
-#define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
-#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
-
 /*
  * Functions needed for userland (ie: libzpool) are not put under
  * #ifdef_KERNEL; the rest of the functions have dependencies
@@ -113,30 +129,31 @@ znode_evict_error(dmu_buf_t *dbuf, void 
 	panic("evicting znode %p\n", user_ptr);
 }
 
-/*ARGSUSED*/
+extern struct vop_vector zfs_vnodeops;
+extern struct vop_vector zfs_fifoops;
+extern struct vop_vector zfs_shareops;
+
 static int
 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 {
-	znode_t *zp = arg;
+	znode_t *zp = buf;
 
-	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
-
-	zp->z_vnode = NULL;
+#ifdef __NetBSD__
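+	/* The NetBSD kmem cache hands the object in through 'arg'. */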
+	zp = arg;
+#endif
+	POINTER_INVALIDATE(&zp->z_zfsvfs);
 
 	list_link_init(&zp->z_link_node);
 
-	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
-	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
-	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&zp->z_range_avl, zfs_range_compare,
 	    sizeof (rl_t), offsetof(rl_t, r_node));
 
-	zp->z_dbuf = NULL;
-	zp->z_dirlocks = NULL;
 	zp->z_acl_cached = NULL;
+	zp->z_vnode = NULL;
+	zp->z_moved = 0;
 	return (0);
 }
 
@@ -144,21 +161,21 @@ zfs_znode_cache_constructor(void *buf, v
 static void
 zfs_znode_cache_destructor(void *buf, void *arg)
 {
-	znode_t *zp = arg;
+	znode_t *zp = buf;
 
+#ifdef __NetBSD__
+	zp = arg;
+#endif
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
 	ASSERT(ZTOV(zp) == NULL);
-		
+#ifndef __NetBSD__
+	vn_free(ZTOV(zp));
+#endif
 	ASSERT(!list_link_active(&zp->z_link_node));
-	mutex_destroy(&zp->z_lock);
-	rw_destroy(&zp->z_parent_lock);
-	rw_destroy(&zp->z_name_lock);
 	mutex_destroy(&zp->z_acl_lock);
 	avl_destroy(&zp->z_range_avl);
 	mutex_destroy(&zp->z_range_lock);
 
-	ASSERT(zp->z_dbuf == NULL);
-	ASSERT(zp->z_dirlocks == NULL);
 	ASSERT(zp->z_acl_cached == NULL);
 }
 
@@ -174,6 +191,7 @@ static struct {
 } znode_move_stats;
 #endif	/* ZNODE_STATS */
 
+#ifdef illumos
 static void
 zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
 {
@@ -198,11 +216,17 @@ zfs_znode_move_impl(znode_t *ozp, znode_
 	nzp->z_blksz = ozp->z_blksz;
 	nzp->z_seq = ozp->z_seq;
 	nzp->z_mapcnt = ozp->z_mapcnt;
-	nzp->z_last_itx = ozp->z_last_itx;
 	nzp->z_gen = ozp->z_gen;
 	nzp->z_sync_cnt = ozp->z_sync_cnt;
-	nzp->z_phys = ozp->z_phys;
-	nzp->z_dbuf = ozp->z_dbuf;
+	nzp->z_is_sa = ozp->z_is_sa;
+	nzp->z_sa_hdl = ozp->z_sa_hdl;
+	bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
+	nzp->z_links = ozp->z_links;
+	nzp->z_size = ozp->z_size;
+	nzp->z_pflags = ozp->z_pflags;
+	nzp->z_uid = ozp->z_uid;
+	nzp->z_gid = ozp->z_gid;
+	nzp->z_mode = ozp->z_mode;
 
 	/*
 	 * Since this is just an idle znode and kmem is already dealing with
@@ -213,9 +237,7 @@ zfs_znode_move_impl(znode_t *ozp, znode_
 		ozp->z_acl_cached = NULL;
 	}
 
-	/* Update back pointers. */
-	(void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
-	    znode_evict_error);
+	sa_set_userp(nzp->z_sa_hdl, nzp);
 
 	/*
 	 * Invalidate the original znode by clearing fields that provide a
@@ -223,11 +245,16 @@ zfs_znode_move_impl(znode_t *ozp, znode_
 	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
 	 * subsequent callback.
 	 */
-	ozp->z_dbuf = NULL;
+	ozp->z_sa_hdl = NULL;
 	POINTER_INVALIDATE(&ozp->z_zfsvfs);
+
+	/*
+	 * Mark the znode.
+	 */
+	nzp->z_moved = 1;
+	ozp->z_moved = (uint8_t)-1;
 }
 
-#ifndef __NetBSD__
 /*ARGSUSED*/
 static kmem_cbrc_t
 zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
@@ -269,7 +296,7 @@ zfs_znode_move(void *buf, void *newbuf, 
 	 * can safely ensure that the filesystem is not and will not be
 	 * unmounted. The next statement is equivalent to ZFS_ENTER().
 	 */
-	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
+	rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
 	if (zfsvfs->z_unmounted) {
 		ZFS_EXIT(zfsvfs);
 		rw_exit(&zfsvfs_lock);
@@ -335,7 +362,7 @@ zfs_znode_move(void *buf, void *newbuf, 
 
 	return (KMEM_CBRC_YES);
 }
-#endif	/* !__NetBSD__ */
+#endif /* illumos */
 
 void
 zfs_znode_init(void)
@@ -348,11 +375,18 @@ zfs_znode_init(void)
 	znode_cache = kmem_cache_create("zfs_znode_cache",
 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
 	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
+	kmem_cache_set_move(znode_cache, zfs_znode_move);
 }
 
 void
 zfs_znode_fini(void)
 {
+#ifdef illumos
+	/*
+	 * Cleanup vfs & vnode ops
+	 */
+	zfs_remove_op_tables();
+#endif
 
 	/*
 	 * Cleanup zcache
@@ -363,19 +397,17 @@ zfs_znode_fini(void)
 	rw_destroy(&zfsvfs_lock);
 }
 
-#ifndef __NetBSD__
+#ifdef illumos
 struct vnodeops *zfs_dvnodeops;
 struct vnodeops *zfs_fvnodeops;
 struct vnodeops *zfs_symvnodeops;
 struct vnodeops *zfs_xdvnodeops;
 struct vnodeops *zfs_evnodeops;
 struct vnodeops *zfs_sharevnodeops;
-#endif
 
 void
 zfs_remove_op_tables()
 {
-#ifndef __NetBSD__
 	/*
 	 * Remove vfs ops
 	 */
@@ -405,22 +437,18 @@ zfs_remove_op_tables()
 	zfs_xdvnodeops = NULL;
 	zfs_evnodeops = NULL;
 	zfs_sharevnodeops = NULL;
-#endif
 }
 
-#ifndef __NetBSD__
 extern const fs_operation_def_t zfs_dvnodeops_template[];
 extern const fs_operation_def_t zfs_fvnodeops_template[];
 extern const fs_operation_def_t zfs_xdvnodeops_template[];
 extern const fs_operation_def_t zfs_symvnodeops_template[];
 extern const fs_operation_def_t zfs_evnodeops_template[];
 extern const fs_operation_def_t zfs_sharevnodeops_template[];
-#endif
 
 int
 zfs_create_op_tables()
 {
-#ifndef __NetBSD__
 	int error;
 
 	/*
@@ -460,9 +488,8 @@ zfs_create_op_tables()
 	    &zfs_sharevnodeops);
 
 	return (error);
-#endif
-	return 0;
 }
+#endif	/* illumos */
 
 int
 zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
@@ -480,26 +507,24 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, d
 	vattr.va_gid = crgetgid(kcred);
 
 	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+	ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
+	sharezp->z_moved = 0;
 	sharezp->z_unlinked = 0;
 	sharezp->z_atime_dirty = 0;
 	sharezp->z_zfsvfs = zfsvfs;
+	sharezp->z_is_sa = zfsvfs->z_use_sa;
 
 	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
 	    kcred, NULL, &acl_ids));
-	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
-	    &zp, 0, &acl_ids);
+	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
 	ASSERT3P(zp, ==, sharezp);
-#ifndef __NetBSD__
-	ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
-#endif
 	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
 	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
 	zfsvfs->z_shares_dir = sharezp->z_id;
 
 	zfs_acl_ids_free(&acl_ids);
-	dmu_buf_rele(sharezp->z_dbuf, NULL);
-	sharezp->z_dbuf = NULL;
+	sa_handle_destroy(sharezp->z_sa_hdl);
 	kmem_cache_free(znode_cache, sharezp);
 
 	return (error);
@@ -530,10 +555,8 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, d
 static uint64_t
 zfs_expldev(dev_t dev)
 {
-	return ((uint64_t)major(dev) << NBITSMINOR64) |
-	    (minor_t)minor(dev);
+	return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
 }
-
 /*
  * Special cmpldev for ZFS private use.
  * Can't use standard cmpldev since it takes
@@ -544,58 +567,209 @@ zfs_expldev(dev_t dev)
 dev_t
 zfs_cmpldev(uint64_t dev)
 {
-	minor_t minor = (minor_t)dev & MAXMIN64;
-	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
-
-	return makedev(minor, major);
+	return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
 }
 
 static void
-zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
 {
-	znode_t		*nzp;
-
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
 
-	mutex_enter(&zp->z_lock);
-
-	ASSERT(zp->z_dbuf == NULL);
+	ASSERT(zp->z_sa_hdl == NULL);
 	ASSERT(zp->z_acl_cached == NULL);
-	zp->z_dbuf = db;
-	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
+	if (sa_hdl == NULL) {
+		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+		    SA_HDL_SHARED, &zp->z_sa_hdl));
+	} else {
+		zp->z_sa_hdl = sa_hdl;
+		sa_set_userp(sa_hdl, zp);
+	}
 
-	/*
-	 * there should be no
-	 * concurrent zgets on this object.
-	 */
-	if (nzp != NULL)
-		panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);
+	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
 
 	/*
-	 * Slap on VROOT if we are the root znode
+	 * Slap on VROOT if we are the root znode unless we are the root
+	 * node of a snapshot mounted under .zfs.
 	 */
-	if (zp->z_id == zfsvfs->z_root)
+	if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs)
 		ZTOV(zp)->v_flag |= VROOT;
 
-	mutex_exit(&zp->z_lock);
 	vn_exists(ZTOV(zp));
 }
 
 void
 zfs_znode_dmu_fini(znode_t *zp)
 {
-	dmu_buf_t *db = zp->z_dbuf;
 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
 	    zp->z_unlinked ||
 	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
-	ASSERT(zp->z_dbuf != NULL);
-	zp->z_dbuf = NULL;
-	VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
-	dmu_buf_rele(db, NULL);
+
+	sa_handle_destroy(zp->z_sa_hdl);
+	zp->z_sa_hdl = NULL;
+}
+
+#ifdef __FreeBSD__
+static void
+zfs_vnode_forget(vnode_t *vp)
+{
+
+	/* copied from insmntque_stddtr */
+	vp->v_data = NULL;
+	vp->v_op = &dead_vnodeops;
+	vgone(vp);
+	vput(vp);
 }
 
 /*
+ * Construct a new znode/vnode and initialize it.
+ *
+ * This does not call dmu_set_user(); that is up to the
+ * caller to do, in case you don't want to return the znode.
+ */
+static znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+    dmu_object_type_t obj_type, sa_handle_t *hdl)
+{
+	znode_t	*zp;
+	vnode_t *vp;
+	uint64_t mode;
+	uint64_t parent;
+	sa_bulk_attr_t bulk[9];
+	int count = 0;
+	int error;
+
+	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+
+	KASSERT(curthread->td_vp_reserv > 0,
+	    ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
+	error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);
+	if (error != 0) {
+		kmem_cache_free(znode_cache, zp);
+		return (NULL);
+	}
+	zp->z_vnode = vp;
+	vp->v_data = zp;
+
+	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+	zp->z_moved = 0;
+
+	/*
+	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
+	 * the zfs_znode_move() callback.
+	 */
+	zp->z_sa_hdl = NULL;
+	zp->z_unlinked = 0;
+	zp->z_atime_dirty = 0;
+	zp->z_mapcnt = 0;
+	zp->z_id = db->db_object;
+	zp->z_blksz = blksz;
+	zp->z_seq = 0x7A4653;
+	zp->z_sync_cnt = 0;
+
+	vp = ZTOV(zp);
+
+	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &zp->z_size, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+	    &zp->z_links, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+	    &zp->z_atime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+	    &zp->z_uid, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+	    &zp->z_gid, 8);
+
+	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
+		if (hdl == NULL)
+			sa_handle_destroy(zp->z_sa_hdl);
+		zfs_vnode_forget(vp);
+		zp->z_vnode = NULL;
+		kmem_cache_free(znode_cache, zp);
+		return (NULL);
+	}
+
+	zp->z_mode = mode;
+
+	vp->v_type = IFTOVT((mode_t)mode);
+
+	switch (vp->v_type) {
+	case VDIR:
+		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
+		break;
+#ifdef illumos
+	case VBLK:
+	case VCHR:
+		{
+			uint64_t rdev;
+			VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
+			    &rdev, sizeof (rdev)) == 0);
+
+			vp->v_rdev = zfs_cmpldev(rdev);
+		}
+		break;
+#endif
+	case VFIFO:
+#ifdef illumos
+	case VSOCK:
+	case VDOOR:
+#endif
+		vp->v_op = &zfs_fifoops;
+		break;
+	case VREG:
+		if (parent == zfsvfs->z_shares_dir) {
+			ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
+			vp->v_op = &zfs_shareops;
+		}
+		break;
+#ifdef illumos
+	case VLNK:
+		vn_setops(vp, zfs_symvnodeops);
+		break;
+	default:
+		vn_setops(vp, zfs_evnodeops);
+		break;
+#endif
+	}
+
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	list_insert_tail(&zfsvfs->z_all_znodes, zp);
+	membar_producer();
+	/*
+	 * Everything else must be valid before assigning z_zfsvfs makes the
+	 * znode eligible for zfs_znode_move().
+	 */
+	zp->z_zfsvfs = zfsvfs;
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	/*
+	 * Acquire vnode lock before making it available to the world.
+	 */
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	VN_LOCK_AREC(vp);
+	if (vp->v_type != VFIFO)
+		VN_LOCK_ASHARE(vp);
+
+#ifdef illumos
+	VFS_HOLD(zfsvfs->z_vfs);
+#endif
+	return (zp);
+}
+#endif /* __FreeBSD__ */
+
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
+/*
  * Create a new DMU object to hold a zfs znode.
  *
  *	IN:	dzp	- parent directory for new znode
@@ -614,14 +788,23 @@ zfs_znode_dmu_fini(znode_t *zp)
  */
 void
 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
-    uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
+    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
 {
-	dmu_buf_t	*db;
-	znode_phys_t	*pzp;
+	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
+	uint64_t	mode, size, links, parent, pflags;
+	uint64_t	dzp_pflags = 0;
+	uint64_t	rdev = 0;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	dmu_buf_t	*db;
 	timestruc_t	now;
 	uint64_t	gen, obj;
 	int		err;
+	int		bonuslen;
+	sa_handle_t	*sa_hdl;
+	dmu_object_type_t obj_type;
+	sa_bulk_attr_t	sa_attrs[ZPL_END];
+	int		cnt = 0;
+	zfs_acl_locator_cb_t locate = { 0 };
 
 	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
 
@@ -631,128 +814,253 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, d
 		gen = vap->va_nblocks;		/* ditto */
 	} else {
 		obj = 0;
-		gethrestime(&now);
+		vfs_timestamp(&now);
 		gen = dmu_tx_get_txg(tx);
 	}
 
+	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+	bonuslen = (obj_type == DMU_OT_SA) ?
+	    DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE;
+
 	/*
 	 * Create a new DMU object.
 	 */
 	/*
 	 * There's currently no mechanism for pre-reading the blocks that will
-	 * be to needed allocate a new object, so we accept the small chance
+	 * be needed to allocate a new object, so we accept the small chance
 	 * that there will be an i/o error and we will fail one of the
 	 * assertions below.
 	 */
 	if (vap->va_type == VDIR) {
 		if (zfsvfs->z_replay) {
-			err = zap_create_claim_norm(zfsvfs->z_os, obj,
+			VERIFY0(zap_create_claim_norm(zfsvfs->z_os, obj,
 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
-			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
-			ASSERT3U(err, ==, 0);
+			    obj_type, bonuslen, tx));
 		} else {
 			obj = zap_create_norm(zfsvfs->z_os,
 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
-			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+			    obj_type, bonuslen, tx);
 		}
 	} else {
 		if (zfsvfs->z_replay) {
-			err = dmu_object_claim(zfsvfs->z_os, obj,
+			VERIFY0(dmu_object_claim(zfsvfs->z_os, obj,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
-			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
-			ASSERT3U(err, ==, 0);
+			    obj_type, bonuslen, tx));
 		} else {
 			obj = dmu_object_alloc(zfsvfs->z_os,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
-			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
+			    obj_type, bonuslen, tx);
 		}
 	}
 
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
-	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
-	dmu_buf_will_dirty(db, tx);
-
-	/*
-	 * Initialize the znode physical data to zero.
-	 */
-	ASSERT(db->db_size >= sizeof (znode_phys_t));
-	bzero(db->db_data, db->db_size);
-	pzp = db->db_data;
+	VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
 
 	/*
 	 * If this is the root, fix up the half-initialized parent pointer
 	 * to reference the just-allocated physical data area.
 	 */
 	if (flag & IS_ROOT_NODE) {
-		dzp->z_dbuf = db;
-		dzp->z_phys = pzp;
 		dzp->z_id = obj;
+	} else {
+		dzp_pflags = dzp->z_pflags;
 	}
 
 	/*
 	 * If parent is an xattr, so am I.
 	 */
-	if (dzp->z_phys->zp_flags & ZFS_XATTR)
+	if (dzp_pflags & ZFS_XATTR) {
 		flag |= IS_XATTR;
-
-	if (vap->va_type == VBLK || vap->va_type == VCHR) {
-		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
 	}
 
 	if (zfsvfs->z_use_fuids)
-		pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+	else
+		pflags = 0;
 
 	if (vap->va_type == VDIR) {
-		pzp->zp_size = 2;		/* contents ("." and "..") */
-		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+		size = 2;		/* contents ("." and "..") */
+		links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+	} else {
+		size = links = 0;
+	}
+
+	if (vap->va_type == VBLK || vap->va_type == VCHR) {
+		rdev = zfs_expldev(vap->va_rdev);
 	}
 
-	pzp->zp_parent = dzp->z_id;
+	parent = dzp->z_id;
+	mode = acl_ids->z_mode;
 	if (flag & IS_XATTR)
-		pzp->zp_flags |= ZFS_XATTR;
+		pflags |= ZFS_XATTR;
 
-	pzp->zp_gen = gen;
+	/*
+	 * No execs denied will be determined when zfs_mode_compute() is called.
+	 */
+	pflags |= acl_ids->z_aclp->z_hints &
+	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
 
-	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
-	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
+	ZFS_TIME_ENCODE(&now, crtime);
+	ZFS_TIME_ENCODE(&now, ctime);
 
 	if (vap->va_mask & AT_ATIME) {
-		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
+		ZFS_TIME_ENCODE(&vap->va_atime, atime);
 	} else {
-		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
+		ZFS_TIME_ENCODE(&now, atime);
 	}
 
 	if (vap->va_mask & AT_MTIME) {
-		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
+		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 	} else {
-		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
+		ZFS_TIME_ENCODE(&now, mtime);
 	}
-	pzp->zp_uid = acl_ids->z_fuid;
-	pzp->zp_gid = acl_ids->z_fgid;
-	pzp->zp_mode = acl_ids->z_mode;
+
+	/* Now add in all of the "SA" attributes */
+	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+	    &sa_hdl));
+
+	/*
+	 * Set up the array of attributes to be replaced/set on the new file.
+	 *
+	 * The order for DMU_OT_ZNODE is critical since it needs to be
+	 * constructed in the old znode_phys_t format.  Don't change this
+	 * ordering.
+	 */
+
+	if (obj_type == DMU_OT_ZNODE) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+		    NULL, &atime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+		    NULL, &mtime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+		    NULL, &ctime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+		    NULL, &crtime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+		    NULL, &gen, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+		    NULL, &mode, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+		    NULL, &size, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+		    NULL, &parent, 8);
+	} else {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+		    NULL, &mode, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+		    NULL, &size, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+		    NULL, &gen, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+		    &acl_ids->z_fuid, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+		    &acl_ids->z_fgid, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+		    NULL, &parent, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+		    NULL, &pflags, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+		    NULL, &atime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+		    NULL, &mtime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+		    NULL, &ctime, 16);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+		    NULL, &crtime, 16);
+	}
+
+	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+	if (obj_type == DMU_OT_ZNODE) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+		    &empty_xattr, 8);
+	}
+	if (obj_type == DMU_OT_ZNODE ||
+	    (vap->va_type == VBLK || vap->va_type == VCHR)) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+		    NULL, &rdev, 8);
+
+	}
+	if (obj_type == DMU_OT_ZNODE) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+		    NULL, &pflags, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+		    &acl_ids->z_fuid, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+		    &acl_ids->z_fgid, 8);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+		    sizeof (uint64_t) * 4);
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+		    &acl_phys, sizeof (zfs_acl_phys_t));
+	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+		    &acl_ids->z_aclp->z_acl_count, 8);
+		locate.cb_aclp = acl_ids->z_aclp;
+		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+		    zfs_acl_data_locator, &locate,
+		    acl_ids->z_aclp->z_acl_bytes);
+		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+		    acl_ids->z_fuid, acl_ids->z_fgid);
+	}
+
+	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
 	if (!(flag & IS_ROOT_NODE)) {
-		struct vnode *vp;
+#ifdef __NetBSD__
+		vnode_t *vp;
+		struct zfs_loadvnode_args args = { db, 0, obj_type, sa_hdl };
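+		/*
+		 * Pass the already-held SA handle and dbuf to
+		 * zfs_loadvnode() through thread-specific data.
+		 */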
 
+		tsd_set(zfs_loadvnode_key, &args);
 		err = vcache_get(zfsvfs->z_vfs, &obj, sizeof(obj), &vp);
+		tsd_set(zfs_loadvnode_key, NULL);
+
 		ASSERT3U(err, ==, 0);
 		*zpp = VTOZ(vp);
-		dmu_buf_rele(db, NULL);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#else
+		*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+#endif
+		ASSERT(*zpp != NULL);
 	} else {
 		/*
 		 * If we are creating the root node, the "parent" we
 		 * passed in is the znode for the root.
 		 */
 		*zpp = dzp;
+
+		(*zpp)->z_sa_hdl = sa_hdl;
 	}
-	VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+
+	(*zpp)->z_pflags = pflags;
+	(*zpp)->z_mode = mode;
+
 	if (vap->va_mask & AT_XVATTR)
-		zfs_xvattr_set(*zpp, (xvattr_t *)vap);
+		zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
 
+	if (obj_type == DMU_OT_ZNODE ||
+	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+	}
+#ifndef __NetBSD__
+	if (!(flag & IS_ROOT_NODE)) {
+		vnode_t *vp;
+
+		vp = ZTOV(*zpp);
+		vp->v_vflag |= VV_FORCEINSMQ;
+		err = insmntque(vp, zfsvfs->z_vfs);
+		vp->v_vflag &= ~VV_FORCEINSMQ;
+		KASSERT(err == 0, ("insmntque() failed: error %d", err));
+	}
+#endif
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
 }
 
+/*
+ * Update in-core attributes.  It is assumed the caller will be doing an
+ * sa_bulk_update to push the changes out.
+ */
 void
-zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
 {
 	xoptattr_t *xoap;
 
@@ -760,139 +1068,279 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xv
 	ASSERT(xoap);
 
 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
-		ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
+		uint64_t times[2];
+		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+		    &times, sizeof (times), tx);
 		XVA_SET_RTN(xvap, XAT_CREATETIME);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
-		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
+		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_READONLY);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
-		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
+		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_HIDDEN);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
-		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
+		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_SYSTEM);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
-		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
+		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_ARCHIVE);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
-		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
+		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
-		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
+		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_NOUNLINK);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
-		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
+		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_APPENDONLY);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
-		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
+		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_NODUMP);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
-		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
+		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_OPAQUE);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
-		    xoap->xoa_av_quarantined);
+		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
-		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
+		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
-		(void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
-		    sizeof (xoap->xoa_av_scanstamp));
-		zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
+		zfs_sa_set_scanstamp(zp, xvap, tx);
 		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
 	}
 	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
-		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse);
+		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+		    zp->z_pflags, tx);
 		XVA_SET_RTN(xvap, XAT_REPARSE);
 	}
+	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_OFFLINE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_SPARSE);
+	}
+}
+
+#ifdef __NetBSD__
+
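+/*
+ * Translate an object number into a znode through the given vnode
+ * cache accessor.
+ */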
+static inline int
+zfs_do_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp,
+    int (*get)(struct mount *, const void *, size_t, struct vnode **))
+{
+	struct vnode *vp;
+	int err;
+
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+	err = (*get)(zfsvfs->z_vfs, &obj_num, sizeof(obj_num), &vp);
+	if (err)
+		*zpp = NULL;
+	else
+		*zpp = VTOZ(vp);
+
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+	return (err);
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+
+	return zfs_do_zget(zfsvfs, obj_num, zpp, vcache_get);
+}
+
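+/*
+ * Variant of zfs_zget() that bypasses the vnode cache and returns the
+ * znode already attached to the object's SA handle.
+ */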
+int
+zfs_zget_cleaner(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+	dmu_buf_t *db;
+	sa_handle_t *hdl;
+	dmu_object_info_t doi;
+	znode_t *zp;
+	int err;
+
+	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+	if (err) {
+		return (SET_ERROR(err));
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	if (doi.doi_bonus_type != DMU_OT_SA &&
+	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
+	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
+	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+		sa_buf_rele(db, NULL);
+		return (SET_ERROR(EINVAL));
+	}
+	hdl = dmu_buf_get_user(db);
+	KASSERT(hdl != NULL);
+	zp = sa_get_userdata(hdl);
+	*zpp = zp;
+	return (0);
 }
 
+/*
+ * Callback from vcache to set up the znode.
+ * This is largely copied from zfs_znode_alloc().
+ */
+
 int
 zfs_loadvnode(struct mount *mp, struct vnode *vp,
     const void *key, size_t key_len, const void **new_key)
 {
-	uint64_t obj_num;
+	znode_t *zp;
+	uint64_t mode;
+	uint64_t parent;
+	sa_bulk_attr_t bulk[9];
+	int count = 0;
+	int err;
+
+	uint64_t obj_num, rdev;
 	zfsvfs_t *zfsvfs;
 	dmu_object_info_t doi;
+
+	/* args to zfs_zvnode_alloc() */
+	struct zfs_loadvnode_args *args;
 	dmu_buf_t *db;
-	znode_t *zp;
-	int err;
+	int blksz;
+	dmu_object_type_t obj_type;
+	sa_handle_t *hdl;
 
 	KASSERT(key_len == sizeof(obj_num));
 	memcpy(&obj_num, key, key_len);
 
 	zfsvfs = mp->mnt_data;
 
-	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
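+	/*
+	 * For a znode created by zfs_mknode() the dbuf and SA handle are
+	 * already held and handed over through thread-specific data.
+	 */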
+	args = tsd_get(zfs_loadvnode_key);
+	if (args) {
+		db = args->db;
+		blksz = args->blksz;
+		obj_type = args->obj_type;
+		hdl = args->sa_hdl;
+		goto skip_lookup;
+	}
+
+	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
 	if (err) {
-		return err;
+		return (SET_ERROR(err));
 	}
 
 	dmu_object_info_from_db(db, &doi);
-	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
-	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
-		dmu_buf_rele(db, NULL);
-		return EINVAL;
-	}
+	if (doi.doi_bonus_type != DMU_OT_SA &&
+	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
+	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
+	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+		sa_buf_rele(db, NULL);
+		return (SET_ERROR(EINVAL));
+	}
+	blksz = doi.doi_data_block_size;
+	obj_type = doi.doi_bonus_type;
+	hdl = dmu_buf_get_user(db);
 
-	KASSERT(dmu_buf_get_user(db) == NULL);
+	if (hdl != NULL) {
+		zp = sa_get_userdata(hdl);
 
-	/*
-	 * There is a small window where zfs_vget() could
-	 * find this object while a file create is still in
-	 * progress.  Since a gen number can never be zero
-	 * we will check that to determine if its an allocated
-	 * file.
-	 */
+		/*
+		 * Since "SA" does immediate eviction we
+		 * should never find a sa handle that doesn't
+		 * know about the znode.
+		 */
+		ASSERT3P(zp, !=, NULL);
+		ASSERT3U(zp->z_id, ==, obj_num);
 
-	if (((znode_phys_t *)db->db_data)->zp_gen == 0) {
-		dmu_buf_rele(db, NULL);
-		return ENOENT;
+		sa_buf_rele(db, NULL);
+		VFS_HOLD(zfsvfs->z_vfs);
+		*new_key = &zp->z_id;
+		return (0);
 	}
 
+skip_lookup:
+	vp->v_op = zfs_vnodeop_p;
+	vp->v_tag = VT_ZFS;
+
 	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+	zp->z_vnode = vp;
+	vp->v_data = zp;
+
+	extern const struct genfs_ops zfs_genfsops;
+	genfs_node_init(vp, &zfs_genfsops);
 
-	ASSERT(zp->z_dirlocks == NULL);
-	ASSERT(zp->z_dbuf == NULL);
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+	zp->z_moved = 0;
 
 	/*
 	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
 	 * the zfs_znode_move() callback.
 	 */
-	zp->z_phys = NULL;
+	zp->z_sa_hdl = NULL;
 	zp->z_unlinked = 0;
 	zp->z_atime_dirty = 0;
 	zp->z_mapcnt = 0;
-	zp->z_last_itx = 0;
 	zp->z_id = db->db_object;
-	zp->z_blksz = doi.doi_data_block_size;
+	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
-	zp->z_vnode = vp;
 
-	zfs_znode_dmu_init(zfsvfs, zp, db);
+	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
 
-	zp->z_gen = zp->z_phys->zp_gen;
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &zp->z_size, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+	    &zp->z_links, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+	    &zp->z_atime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+	    &zp->z_uid, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+	    &zp->z_gid, 8);
+
+	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
+		if (hdl == NULL)
+			sa_handle_destroy(zp->z_sa_hdl);
+		zp->z_vnode = NULL;
+		kmem_cache_free(znode_cache, zp);
+		sa_buf_rele(db, NULL);
+		return (SET_ERROR(ENOENT));
+	}
+
+	zp->z_mode = mode;
+
+	vp->v_type = IFTOVT((mode_t)zp->z_mode);
 
-	vp->v_op = zfs_vnodeop_p;
-	vp->v_tag = VT_ZFS;
-	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
-	vp->v_data = zp;
-	genfs_node_init(vp, &zfs_genfsops);
 	switch (vp->v_type) {
 	case VDIR:
 		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
@@ -900,21 +1348,23 @@ zfs_loadvnode(struct mount *mp, struct v
 	case VBLK:
 	case VCHR:
 	/* XXX NetBSD	vp->v_op = zfs_specop_p; */
-		spec_node_init(vp, zfs_cmpldev(zp->z_phys->zp_rdev));
+		(void) sa_lookup(hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
+		    sizeof (rdev));
+		spec_node_init(vp, zfs_cmpldev(rdev));
 		break;
 	case VFIFO:
 		/* XXX NetBSD vp->v_op = zfs_fifoop_p; */
 		break;
 	}
 
+	uvm_vnp_setsize(vp, zp->z_size);
 	dprintf("zfs_loadvnode znode %p -- vnode %p\n", zp, vp);
 	dprintf("zfs_loadvnode z_id %ld\n", zp->z_id);
-	
-	uvm_vnp_setsize(vp, zp->z_phys->zp_size);
 
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
 	membar_producer();
+
 	/*
 	 * Everything else must be valid before assigning z_zfsvfs makes the
 	 * znode eligible for zfs_znode_move().
@@ -923,76 +1373,249 @@ zfs_loadvnode(struct mount *mp, struct v
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	VFS_HOLD(zfsvfs->z_vfs);
-
 	*new_key = &zp->z_id;
-
-	return 0;
+	return (0);
 }
 
+#else /* __NetBSD__ */
+
 int
 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
 {
-	struct vnode *vp;
-	int error;
+	dmu_object_info_t doi;
+	dmu_buf_t	*db;
+	znode_t		*zp;
+	vnode_t		*vp;
+	sa_handle_t	*hdl;
+	struct thread	*td;
+	int locked;
+	int err;
 
+	td = curthread;
+	getnewvnode_reserve(1);
+again:
+	*zpp = NULL;
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
 
-	error = vcache_get(zfsvfs->z_vfs, &obj_num, sizeof(obj_num), &vp);
-	if (error == 0 && VTOZ(vp)->z_unlinked) {
-		vrele(vp);
-		error = ENOENT;
+	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+	if (err) {
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		getnewvnode_drop_reserve();
+		return (err);
 	}
-	if (error)
-		*zpp = NULL;
-	else
-		*zpp = VTOZ(vp);
 
-	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+	dmu_object_info_from_db(db, &doi);
+	if (doi.doi_bonus_type != DMU_OT_SA &&
+	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
+	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
+	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+		sa_buf_rele(db, NULL);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+#ifdef __FreeBSD__
+		getnewvnode_drop_reserve();
+#endif
+		return (SET_ERROR(EINVAL));
+	}
+
+	hdl = dmu_buf_get_user(db);
+	if (hdl != NULL) {
+		zp  = sa_get_userdata(hdl);
+
+		/*
+		 * Since "SA" does immediate eviction, we
+		 * should never find an SA handle that doesn't
+		 * know about the znode.
+		 */
+		ASSERT3P(zp, !=, NULL);
+		ASSERT3U(zp->z_id, ==, obj_num);
+		*zpp = zp;
+		vp = ZTOV(zp);
+
+		/* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */
+		VN_HOLD(vp);
 
-	return error;
+		sa_buf_rele(db, NULL);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+		locked = VOP_ISLOCKED(vp);
+		VI_LOCK(vp);
+		if ((vp->v_iflag & VI_DOOMED) != 0 &&
+		    locked != LK_EXCLUSIVE) {
+			/*
+			 * The vnode is doomed and this thread doesn't
+			 * hold the exclusive lock on it, so the vnode
+			 * must be being reclaimed by another thread.
+			 * Otherwise the doomed vnode is being reclaimed
+			 * by this thread and zfs_zget is called from
+			 * ZIL internals.
+			 */
+			VI_UNLOCK(vp);
+
+			/*
+			 * XXX vrele() locks the vnode when the last reference
+			 * is dropped.  Although in this case the vnode is
+			 * doomed / dead and so no inactivation is required,
+			 * the vnode lock is still acquired.  That could result
+			 * in a LOR with z_teardown_lock if another thread holds
+			 * the vnode's lock and tries to take z_teardown_lock.
+			 * But that is only possible if the other thread performs
+			 * a ZFS vnode operation on the vnode.  That either
+			 * should not happen if the vnode is dead or the thread
+			 * should also have a reference to the vnode and thus
+			 * our reference is not last.
+			 */
+			VN_RELE(vp);
+			goto again;
+		}
+		VI_UNLOCK(vp);
+		getnewvnode_drop_reserve();
+		return (0);
+	}
+
+	/*
+	 * Not found; create a new znode/vnode,
+	 * but only if the file exists.
+	 *
+	 * There is a small window where zfs_vget() could
+	 * find this object while a file create is still in
+	 * progress.  This is checked for in zfs_znode_alloc().
+	 *
+	 * If zfs_znode_alloc() fails it will drop the hold on the
+	 * bonus buffer.
+	 */
+	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+	    doi.doi_bonus_type, NULL);
+	if (zp == NULL) {
+		err = SET_ERROR(ENOENT);
+	} else {
+		*zpp = zp;
+	}
+	if (err == 0) {
+		vnode_t *vp = ZTOV(zp);
+
+		err = insmntque(vp, zfsvfs->z_vfs);
+		if (err == 0) {
+			vp->v_hash = obj_num;
+			VOP_UNLOCK(vp, 0);
+		} else {
+			zp->z_vnode = NULL;
+			zfs_znode_dmu_fini(zp);
+			zfs_znode_free(zp);
+			*zpp = NULL;
+		}
+	}
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+	getnewvnode_drop_reserve();
+	return (err);
 }
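
For reference, the zfs_zget() contract on the caller side: a successful
return yields a held znode (VN_HOLD on an existing vnode, or a freshly
allocated one), and the caller must drop that reference itself.  A sketch,
with an invented function name:

	static int
	example_with_object(zfsvfs_t *zfsvfs, uint64_t obj_num)
	{
		znode_t *zp;
		int err;

		err = zfs_zget(zfsvfs, obj_num, &zp);
		if (err != 0)
			return (err);

		/* ... operate on zp ... */

		VN_RELE(ZTOV(zp));	/* drop the hold taken by zfs_zget() */
		return (0);
	}
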
 
+#endif /* __NetBSD__ */
+
 int
 zfs_rezget(znode_t *zp)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	dmu_object_info_t doi;
 	dmu_buf_t *db;
+	vnode_t *vp;
 	uint64_t obj_num = zp->z_id;
+	uint64_t mode, size;
+	sa_bulk_attr_t bulk[8];
 	int err;
+	int count = 0;
+	uint64_t gen;
 
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
 
-	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+	mutex_enter(&zp->z_acl_lock);
+	if (zp->z_acl_cached) {
+		zfs_acl_free(zp->z_acl_cached);
+		zp->z_acl_cached = NULL;
+	}
+
+	mutex_exit(&zp->z_acl_lock);
+	ASSERT(zp->z_sa_hdl == NULL);
+	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
 	if (err) {
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (err);
 	}
 
 	dmu_object_info_from_db(db, &doi);
-	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
-	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
-		dmu_buf_rele(db, NULL);
+	if (doi.doi_bonus_type != DMU_OT_SA &&
+	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
+	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
+	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+		sa_buf_rele(db, NULL);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 
-	if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
-		dmu_buf_rele(db, NULL);
+	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+	size = zp->z_size;
+
+	/* reload cached values */
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+	    &gen, sizeof (gen));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &zp->z_size, sizeof (zp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+	    &zp->z_links, sizeof (zp->z_links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+	    &zp->z_atime, sizeof (zp->z_atime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+	    &zp->z_uid, sizeof (zp->z_uid));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+	    &zp->z_gid, sizeof (zp->z_gid));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+	    &mode, sizeof (mode));
+
+	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+		zfs_znode_dmu_fini(zp);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
-		return (EIO);
+		return (SET_ERROR(EIO));
 	}
 
-	mutex_enter(&zp->z_acl_lock);
-	if (zp->z_acl_cached) {
-		zfs_acl_free(zp->z_acl_cached);
-		zp->z_acl_cached = NULL;
+	zp->z_mode = mode;
+
+	if (gen != zp->z_gen) {
+		zfs_znode_dmu_fini(zp);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (SET_ERROR(EIO));
+	}
+
+	/*
+	 * It is highly improbable but still quite possible that two
+	 * objects in different datasets are created with the same
+	 * object numbers and in transaction groups with the same
+	 * numbers.  znodes corresponding to those objects would
+	 * have the same z_id and z_gen, but their other attributes
+	 * may be different.
+	 * zfs recv -F may replace one such object with the other.
+	 * As a result, file properties recorded in the replaced
+	 * object's vnode may no longer match the received object's
+	 * properties.  At present the only cached property is the
+	 * file's type recorded in v_type.
+	 * So, handle this case by leaving the old vnode and znode
+	 * disassociated from the actual object.  A new vnode and a
+	 * znode will be created if the object is accessed
+	 * (e.g. via a look-up).  The old vnode and znode will be
+	 * recycled when the last vnode reference is dropped.
+	 */
+	vp = ZTOV(zp);
+	if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
+		zfs_znode_dmu_fini(zp);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (EIO);
 	}
-	mutex_exit(&zp->z_acl_lock);
 
-	zfs_znode_dmu_init(zfsvfs, zp, db);
-	zp->z_unlinked = (zp->z_phys->zp_links == 0);
+	zp->z_unlinked = (zp->z_links == 0);
 	zp->z_blksz = doi.doi_data_block_size;
+	vn_pages_remove(vp, 0, 0);
+	if (zp->z_size != size)
+		vnode_pager_setsize(vp, zp->z_size);
 
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 
@@ -1005,11 +1628,13 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	objset_t *os = zfsvfs->z_os;
 	uint64_t obj = zp->z_id;
-	uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
+	uint64_t acl_obj = zfs_external_acl(zp);
 
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
-	if (acl_obj)
+	if (acl_obj) {
+		VERIFY(!zp->z_is_sa);
 		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+	}
 	VERIFY(0 == dmu_object_free(os, obj, tx));
 	zfs_znode_dmu_fini(zp);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
@@ -1019,30 +1644,26 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *
 void
 zfs_zinactive(znode_t *zp)
 {
-	vnode_t	*vp = ZTOV(zp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	uint64_t z_id = zp->z_id;
 
-	ASSERT(zp->z_dbuf && zp->z_phys);
+	ASSERT(zp->z_sa_hdl);
 
 	/*
 	 * Don't allow a zfs_zget() while we're trying to release this znode
 	 */
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
 
-	mutex_enter(&zp->z_lock);
 	/*
 	 * If this was the last reference to a file with no links,
 	 * remove the file from the file system.
 	 */
 	if (zp->z_unlinked) {
-		mutex_exit(&zp->z_lock);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 		zfs_rmnode(zp);
 		return;
 	}
 
-	mutex_exit(&zp->z_lock);
 	zfs_znode_dmu_fini(zp);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
 	zfs_znode_free(zp);
@@ -1052,23 +1673,22 @@ void
 zfs_znode_free(znode_t *zp)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	struct vnode *vp;
 
-	mutex_enter(&zp->z_lock);
-	vp = ZTOV(zp);
-	if (vp != NULL) {
-		genfs_node_destroy(vp);
-		/*
-		 * To interlock with zfs_sync().
-		 */
-        	mutex_enter(vp->v_interlock);
-		vp->v_data = NULL;
-		mutex_exit(vp->v_interlock);
-	}
-	mutex_exit(&zp->z_lock);
+#ifdef __NetBSD__
+	struct vnode *vp = ZTOV(zp);
 
-	dprintf("destroying znode %p\n", zp);
-	//cpu_Debugger();
+	genfs_node_destroy(vp);
+
+	/*
+	 * Interlock with zfs_sync().
+	 */
+	mutex_enter(vp->v_interlock);
+	vp->v_data = NULL;
+	mutex_exit(vp->v_interlock);
+#endif
+
+	ASSERT(zp->z_sa_hdl == NULL);
+	zp->z_vnode = NULL;
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	POINTER_INVALIDATE(&zp->z_zfsvfs);
 	list_remove(&zfsvfs->z_all_znodes, zp);
@@ -1081,63 +1701,46 @@ zfs_znode_free(znode_t *zp)
 
 	kmem_cache_free(znode_cache, zp);
 
+#ifdef illumos
 	VFS_RELE(zfsvfs->z_vfs);
+#endif
 }
 
 void
-zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+    uint64_t ctime[2], boolean_t have_tx)
 {
 	timestruc_t	now;
 
-	ASSERT(MUTEX_HELD(&zp->z_lock));
-
-	gethrestime(&now);
+	vfs_timestamp(&now);
 
-	if (tx) {
-		dmu_buf_will_dirty(zp->z_dbuf, tx);
+	if (have_tx) {	/* will sa_bulk_update happen really soon? */
 		zp->z_atime_dirty = 0;
 		zp->z_seq++;
 	} else {
 		zp->z_atime_dirty = 1;
 	}
 
-	if (flag & AT_ATIME)
-		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
+	if (flag & AT_ATIME) {
+		ZFS_TIME_ENCODE(&now, zp->z_atime);
+	}
 
 	if (flag & AT_MTIME) {
-		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
-		if (zp->z_zfsvfs->z_use_fuids)
-			zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
+		ZFS_TIME_ENCODE(&now, mtime);
+		if (zp->z_zfsvfs->z_use_fuids) {
+			zp->z_pflags |= (ZFS_ARCHIVE |
+			    ZFS_AV_MODIFIED);
+		}
 	}
 
 	if (flag & AT_CTIME) {
-		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
+		ZFS_TIME_ENCODE(&now, ctime);
 		if (zp->z_zfsvfs->z_use_fuids)
-			zp->z_phys->zp_flags |= ZFS_ARCHIVE;
+			zp->z_pflags |= ZFS_ARCHIVE;
 	}
 }
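
With this change zfs_tstamp_update_setup() only encodes the new times into
caller-supplied arrays; the caller is responsible for pushing them to disk
via sa_bulk_update() inside an assigned transaction (see zfs_freesp() below
for the real thing).  A condensed sketch of that calling convention,
assuming zp, zfsvfs and an assigned tx are in scope:

	uint64_t mtime[2], ctime[2];
	sa_bulk_attr_t bulk[3];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	/* Encode the new timestamps, then write them out in one update. */
	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
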
 
 /*
- * Update the requested znode timestamps with the current time.
- * If we are in a transaction, then go ahead and mark the znode
- * dirty in the transaction so the timestamps will go to disk.
- * Otherwise, we will get pushed next time the znode is updated
- * in a transaction, or when this znode eventually goes inactive.
- *
- * Why is this OK?
- *  1 - Only the ACCESS time is ever updated outside of a transaction.
- *  2 - Multiple consecutive updates will be collapsed into a single
- *	znode update by the transaction grouping semantics of the DMU.
- */
-void
-zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
-{
-	mutex_enter(&zp->z_lock);
-	zfs_time_stamper_locked(zp, flag, tx);
-	mutex_exit(&zp->z_lock);
-}
-
-/*
  * Grow the block size for a file.
  *
  *	IN:	zp	- znode of file to free data in.
@@ -1159,27 +1762,43 @@ zfs_grow_blocksize(znode_t *zp, uint64_t
 	 * we will not grow.  If there is more than one block in a file,
 	 * the blocksize cannot change.
 	 */
-	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
+	if (zp->z_blksz && zp->z_size > zp->z_blksz)
 		return;
 
 	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
 	    size, 0, tx);
+
 	if (error == ENOTSUP)
 		return;
-	ASSERT3U(error, ==, 0);
+	ASSERT0(error);
 
 	/* What blocksize did we actually get? */
-	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
+	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
 }
 
+#ifdef illumos
+/*
+ * This is a dummy interface used when pvn_vplist_dirty() should *not*
+ * be calling back into the fs for a putpage().  E.g.: when truncating
+ * a file, the pages being "thrown away" don't need to be written out.
+ */
+/* ARGSUSED */
+static int
+zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
+    int flags, cred_t *cr)
+{
+	ASSERT(0);
+	return (0);
+}
+#endif
+
 /*
  * Increase the file length
  *
  *	IN:	zp	- znode of file to free data in.
  *		end	- new end-of-file
  *
- * 	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure
  */
 static int
 zfs_extend(znode_t *zp, uint64_t end)
@@ -1198,21 +1817,26 @@ zfs_extend(znode_t *zp, uint64_t end)
 	/*
 	 * Nothing to do if file already at desired length.
 	 */
-	if (end <= zp->z_phys->zp_size) {
+	if (end <= zp->z_size) {
 		zfs_range_unlock(rl);
 		return (0);
 	}
-top:
 	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_bonus(tx, zp->z_id);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
 	if (end > zp->z_blksz &&
 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
 		/*
 		 * We are growing the file past the current block size.
 		 */
 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+			/*
+			 * File's blocksize is already larger than the
+			 * "recordsize" property.  Only let it grow to
+			 * the next power of 2.
+			 */
 			ASSERT(!ISP2(zp->z_blksz));
-			newblksz = MIN(end, SPA_MAXBLOCKSIZE);
+			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
 		} else {
 			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
 		}
@@ -1221,30 +1845,27 @@ top:
 		newblksz = 0;
 	}
 
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		dmu_tx_abort(tx);
 		zfs_range_unlock(rl);
 		return (error);
 	}
-	dmu_buf_will_dirty(zp->z_dbuf, tx);
 
 	if (newblksz)
 		zfs_grow_blocksize(zp, newblksz, tx);
 
-	zp->z_phys->zp_size = end;
+	zp->z_size = end;
+
+	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
+	    &zp->z_size, sizeof (zp->z_size), tx));
+
+	vnode_pager_setsize(ZTOV(zp), end);
 
 	zfs_range_unlock(rl);
 
 	dmu_tx_commit(tx);
 
-	uvm_vnp_setsize(ZTOV(zp), end);
-
 	return (0);
 }
 
@@ -1255,8 +1876,7 @@ top:
  *		off	- start of section to free.
  *		len	- length of section to free.
  *
- * 	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure
  */
 static int
 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
@@ -1273,22 +1893,23 @@ zfs_free_range(znode_t *zp, uint64_t off
 	/*
 	 * Nothing to do if file already at desired length.
 	 */
-	if (off >= zp->z_phys->zp_size) {
+	if (off >= zp->z_size) {
 		zfs_range_unlock(rl);
 		return (0);
 	}
 
-	if (off + len > zp->z_phys->zp_size)
-		len = zp->z_phys->zp_size - off;
+	if (off + len > zp->z_size)
+		len = zp->z_size - off;
 
 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
 
 	if (error == 0) {
 		/*
-		 * In NetBSD we cannot free block in the middle of a file,
-		 * but only at the end of a file.
+		 * In FreeBSD we cannot free a block in the middle of a file,
+		 * but only at the end of a file, so this code path should
+		 * never happen.
 		 */
-		uvm_vnp_setsize(ZTOV(zp), off);
+		vnode_pager_setsize(ZTOV(zp), off);
 	}
 
 	zfs_range_unlock(rl);
@@ -1302,8 +1923,7 @@ zfs_free_range(znode_t *zp, uint64_t off
  *	IN:	zp	- znode of file to free data in.
  *		end	- new end-of-file.
  *
- * 	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure
  */
 static int
 zfs_trunc(znode_t *zp, uint64_t end)
@@ -1313,6 +1933,8 @@ zfs_trunc(znode_t *zp, uint64_t end)
 	dmu_tx_t *tx;
 	rl_t *rl;
 	int error;
+	sa_bulk_attr_t bulk[2];
+	int count = 0;
 
 	/*
 	 * We will change zp_size, lock the whole file.
@@ -1322,7 +1944,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
 	/*
 	 * Nothing to do if file already at desired length.
 	 */
-	if (end >= zp->z_phys->zp_size) {
+	if (end >= zp->z_size) {
 		zfs_range_unlock(rl);
 		return (0);
 	}
@@ -1332,36 +1954,39 @@ zfs_trunc(znode_t *zp, uint64_t end)
 		zfs_range_unlock(rl);
 		return (error);
 	}
-top:
 	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_bonus(tx, zp->z_id);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	dmu_tx_mark_netfree(tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto top;
-		}
 		dmu_tx_abort(tx);
 		zfs_range_unlock(rl);
 		return (error);
 	}
-	dmu_buf_will_dirty(zp->z_dbuf, tx);
 
-	zp->z_phys->zp_size = end;
+	zp->z_size = end;
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+	    NULL, &zp->z_size, sizeof (zp->z_size));
+
+	if (end == 0) {
+		zp->z_pflags &= ~ZFS_SPARSE;
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+		    NULL, &zp->z_pflags, 8);
+	}
+	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
 
 	dmu_tx_commit(tx);
 
-	zfs_range_unlock(rl);
-
 	/*
 	 * Clear any mapped pages in the truncated region.  This has to
 	 * happen outside of the transaction to avoid the possibility of
 	 * a deadlock with someone trying to push a page that we are
 	 * about to invalidate.
 	 */
+	vnode_pager_setsize(vp, end);
 
-	uvm_vnp_setsize(vp, end);
+	zfs_range_unlock(rl);
 
 	return (0);
 }
@@ -1375,8 +2000,7 @@ top:
  *		flag	- current file open mode flags.
  *		log	- TRUE if this action should be logged
  *
- * 	RETURN:	0 if success
- *		error code if failure
+ *	RETURN:	0 on success, error code on failure
  */
 int
 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
@@ -1385,9 +2009,17 @@ zfs_freesp(znode_t *zp, uint64_t off, ui
 	dmu_tx_t *tx;
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	zilog_t *zilog = zfsvfs->z_log;
+	uint64_t mode;
+	uint64_t mtime[2], ctime[2];
+	sa_bulk_attr_t bulk[3];
+	int count = 0;
 	int error;
 
-	if (off > zp->z_phys->zp_size) {
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+	    sizeof (mode))) != 0)
+		return (error);
+
+	if (off > zp->z_size) {
 		error =  zfs_extend(zp, off+len);
 		if (error == 0 && log)
 			goto log;
@@ -1395,30 +2027,43 @@ zfs_freesp(znode_t *zp, uint64_t off, ui
 			return (error);
 	}
 
+	/*
+	 * Check for any locks in the region to be freed.
+	 */
+
+	if (MANDLOCK(vp, (mode_t)mode)) {
+		uint64_t length = (len ? len : zp->z_size - off);
+		if (error = chklock(vp, FWRITE, off, length, flag, NULL))
+			return (error);
+	}
+
 	if (len == 0) {
 		error = zfs_trunc(zp, off);
 	} else {
 		if ((error = zfs_free_range(zp, off, len)) == 0 &&
-		    off + len > zp->z_phys->zp_size)
+		    off + len > zp->z_size)
 			error = zfs_extend(zp, off+len);
 	}
 	if (error || !log)
 		return (error);
 log:
 	tx = dmu_tx_create(zfsvfs->z_os);
-	dmu_tx_hold_bonus(tx, zp->z_id);
-	error = dmu_tx_assign(tx, TXG_NOWAIT);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
-		if (error == ERESTART) {
-			dmu_tx_wait(tx);
-			dmu_tx_abort(tx);
-			goto log;
-		}
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
-	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+	    NULL, &zp->z_pflags, 8);
+	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
+	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+	ASSERT(error == 0);
+
 	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
 
 	dmu_tx_commit(tx);
@@ -1428,14 +2073,14 @@ log:
 void
 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 {
-	zfsvfs_t	zfsvfs;
-	uint64_t	moid, obj, version;
+	uint64_t	moid, obj, sa_obj, version;
 	uint64_t	sense = ZFS_CASE_SENSITIVE;
 	uint64_t	norm = 0;
 	nvpair_t	*elem;
 	int		error;
 	int		i;
 	znode_t		*rootzp = NULL;
+	zfsvfs_t	*zfsvfs;
 	vattr_t		vattr;
 	znode_t		*zp;
 	zfs_acl_ids_t	acl_ids;
@@ -1455,12 +2100,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, 
 	/*
 	 * Set starting attributes.
 	 */
-	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
-		version = ZPL_VERSION;
-	else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
-		version = ZPL_VERSION_USERSPACE - 1;
-	else
-		version = ZPL_VERSION_FUID - 1;
+	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
 		/* For the moment we expect all zpl props to be uint64_ts */
@@ -1486,6 +2126,18 @@ zfs_create_fs(objset_t *os, cred_t *cr, 
 	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
 
 	/*
+	 * Create zap object used for SA attribute registration
+	 */
+
+	if (version >= ZPL_VERSION_SA) {
+		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+		    DMU_OT_NONE, 0, tx);
+		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+		ASSERT(error == 0);
+	} else {
+		sa_obj = 0;
+	}
+	/*
 	 * Create a delete queue.
 	 */
 	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
@@ -1497,105 +2149,217 @@ zfs_create_fs(objset_t *os, cred_t *cr, 
 	 * Create root znode.  Create minimal znode/vnode/zfsvfs
 	 * to allow zfs_mknode to work.
 	 */
-	vattr_null(&vattr);
+	VATTR_NULL(&vattr);
 	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
 	vattr.va_type = VDIR;
 	vattr.va_mode = S_IFDIR|0755;
 	vattr.va_uid = crgetuid(cr);
 	vattr.va_gid = crgetgid(cr);
 
+	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
 	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
+	rootzp->z_moved = 0;
 	rootzp->z_unlinked = 0;
 	rootzp->z_atime_dirty = 0;
+	rootzp->z_is_sa = USE_SA(version, os);
 
-	bzero(&zfsvfs, sizeof (zfsvfs_t));
+	zfsvfs->z_os = os;
+	zfsvfs->z_parent = zfsvfs;
+	zfsvfs->z_version = version;
+	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
+	zfsvfs->z_use_sa = USE_SA(version, os);
+	zfsvfs->z_norm = norm;
+
+	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+	    &zfsvfs->z_attr_table);
+
+	ASSERT(error == 0);
 
-	zfsvfs.z_os = os;
-	zfsvfs.z_parent = &zfsvfs;
-	zfsvfs.z_version = version;
-	zfsvfs.z_use_fuids = USE_FUIDS(version, os);
-	zfsvfs.z_norm = norm;
 	/*
 	 * Fold case on file systems that are always or sometimes case
 	 * insensitive.
 	 */
 	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
-		zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;
+		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 
-	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
-	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
+	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
-		mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
-	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
-	rootzp->z_zfsvfs = &zfsvfs;
+	rootzp->z_zfsvfs = zfsvfs;
 	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
 	    cr, NULL, &acl_ids));
-	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
+	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
 	ASSERT3P(zp, ==, rootzp);
 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
 	ASSERT(error == 0);
 	zfs_acl_ids_free(&acl_ids);
 	POINTER_INVALIDATE(&rootzp->z_zfsvfs);
 
-	dmu_buf_rele(rootzp->z_dbuf, NULL);
-	rootzp->z_dbuf = NULL;
+	sa_handle_destroy(rootzp->z_sa_hdl);
 	kmem_cache_free(znode_cache, rootzp);
 
 	/*
 	 * Create shares directory
 	 */
 
-	error = zfs_create_share_dir(&zfsvfs, tx);
+	error = zfs_create_share_dir(zfsvfs, tx);
 
 	ASSERT(error == 0);
 
-	mutex_destroy(&zfsvfs.z_znodes_lock);
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
-		mutex_destroy(&zfsvfs.z_hold_mtx[i]);
+		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+	mutex_destroy(&zfsvfs->z_znodes_lock);
+	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 }
-
 #endif /* _KERNEL */
+
+static int
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+	uint64_t sa_obj = 0;
+	int error;
+
+	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+	if (error != 0 && error != ENOENT)
+		return (error);
+
+	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+	return (error);
+}
+
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+    dmu_buf_t **db, void *tag)
+{
+	dmu_object_info_t doi;
+	int error;
+
+	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
+		return (error);
+
+	dmu_object_info_from_db(*db, &doi);
+	if ((doi.doi_bonus_type != DMU_OT_SA &&
+	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
+	    doi.doi_bonus_type == DMU_OT_ZNODE &&
+	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
+		sa_buf_rele(*db, tag);
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+	if (error != 0) {
+		sa_buf_rele(*db, tag);
+		return (error);
+	}
+
+	return (0);
+}
+
+void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
+{
+	sa_handle_destroy(hdl);
+	sa_buf_rele(db, tag);
+}
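
zfs_grab_sa_handle() and zfs_release_sa_handle() form a hold/release pair:
the former takes both a private SA handle and the underlying bonus-buffer
hold, the latter drops both.  An illustrative sketch of the pairing (the
function name is invented; the pattern mirrors the callers below):

	static int
	example_read_mode(objset_t *osp, uint64_t obj,
	    sa_attr_type_t *sa_table, uint64_t *mode)
	{
		sa_handle_t *hdl;
		dmu_buf_t *db;
		int error;

		error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
		if (error != 0)
			return (error);

		error = sa_lookup(hdl, sa_table[ZPL_MODE], mode,
		    sizeof (*mode));

		/* Always release the handle and the buffer hold together. */
		zfs_release_sa_handle(hdl, db, FTAG);
		return (error);
	}
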
+
 /*
  * Given an object number, return its parent object number and whether
  * or not the object is an extended attribute directory.
  */
 static int
-zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
+zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
+    uint64_t *pobjp, int *is_xattrdir)
 {
-	dmu_buf_t *db;
-	dmu_object_info_t doi;
-	znode_phys_t *zp;
+	uint64_t parent;
+	uint64_t pflags;
+	uint64_t mode;
+	uint64_t parent_mode;
+	sa_bulk_attr_t bulk[3];
+	sa_handle_t *sa_hdl;
+	dmu_buf_t *sa_db;
+	int count = 0;
 	int error;
 
-	if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+	    &parent, sizeof (parent));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+	    &pflags, sizeof (pflags));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+	    &mode, sizeof (mode));
+
+	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
 		return (error);
 
-	dmu_object_info_from_db(db, &doi);
-	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
-	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
-		dmu_buf_rele(db, FTAG);
-		return (EINVAL);
-	}
+	/*
+	 * When a link is removed, its parent pointer is not changed and will
+	 * be invalid.  There are two cases where a link is removed but the
+	 * file stays around: when it goes to the delete queue and when there
+	 * are additional links.
+	 */
+	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
+	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+	if (error != 0)
+		return (error);
 
-	zp = db->db_data;
-	*pobjp = zp->zp_parent;
-	*is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
-	    S_ISDIR(zp->zp_mode);
-	dmu_buf_rele(db, FTAG);
+	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+	/*
+	 * Extended attributes can be applied to files, directories, etc.
+	 * Otherwise the parent must be a directory.
+	 */
+	if (!*is_xattrdir && !S_ISDIR(parent_mode))
+		return (SET_ERROR(EINVAL));
+
+	*pobjp = parent;
 
 	return (0);
 }
 
-int
-zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+/*
+ * Given an object number, return some zpl level statistics
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+    zfs_stat_t *sb)
+{
+	sa_bulk_attr_t bulk[4];
+	int count = 0;
+
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+	    &sb->zs_mode, sizeof (sb->zs_mode));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+	    &sb->zs_gen, sizeof (sb->zs_gen));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+	    &sb->zs_links, sizeof (sb->zs_links));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+	    &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+	return (sa_bulk_lookup(hdl, bulk, count));
+}
+
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+    sa_attr_type_t *sa_table, char *buf, int len)
 {
+	sa_handle_t *sa_hdl;
+	sa_handle_t *prevhdl = NULL;
+	dmu_buf_t *prevdb = NULL;
+	dmu_buf_t *sa_db = NULL;
 	char *path = buf + len - 1;
 	int error;
 
 	*path = '\0';
+	sa_hdl = hdl;
 
 	for (;;) {
 		uint64_t pobj;
@@ -1603,7 +2367,10 @@ zfs_obj_to_path(objset_t *osp, uint64_t 
 		size_t complen;
 		int is_xattrdir;
 
-		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
+		if (prevdb)
+			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+
+		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
 		    &is_xattrdir)) != 0)
 			break;
 
@@ -1615,8 +2382,7 @@ zfs_obj_to_path(objset_t *osp, uint64_t 
 
 		component[0] = '/';
 		if (is_xattrdir) {
-			(void) snprintf(component + 1, sizeof(component) - 1,
-			    "<xattrdir>");
+			(void) sprintf(component + 1, "<xattrdir>");
 		} else {
 			error = zap_value_search(osp, pobj, obj,
 			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
@@ -1629,9 +2395,112 @@ zfs_obj_to_path(objset_t *osp, uint64_t 
 		ASSERT(path >= buf);
 		bcopy(component, path, complen);
 		obj = pobj;
+
+		if (sa_hdl != hdl) {
+			prevhdl = sa_hdl;
+			prevdb = sa_db;
+		}
+		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+		if (error != 0) {
+			sa_hdl = prevhdl;
+			sa_db = prevdb;
+			break;
+		}
+	}
+
+	if (sa_hdl != NULL && sa_hdl != hdl) {
+		ASSERT(sa_db != NULL);
+		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
 	}
 
 	if (error == 0)
 		(void) memmove(buf, path, buf + len - path);
+
 	return (error);
 }
+
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+	sa_attr_type_t *sa_table;
+	sa_handle_t *hdl;
+	dmu_buf_t *db;
+	int error;
+
+	error = zfs_sa_setup(osp, &sa_table);
+	if (error != 0)
+		return (error);
+
+	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+	zfs_release_sa_handle(hdl, db, FTAG);
+	return (error);
+}
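
Note the buffer convention in zfs_obj_to_path_impl(): the path is assembled
right to left starting at buf + len - 1 and moved to the front only on
success, so callers pass the full buffer size.  A usage sketch, assuming a
MAXPATHLEN-sized buffer:

	char buf[MAXPATHLEN];
	int error;

	error = zfs_obj_to_path(osp, obj, buf, sizeof (buf));
	if (error == 0)
		printf("object %ju: %s\n", (uintmax_t)obj, buf);
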
+
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+    char *buf, int len)
+{
+	char *path = buf + len - 1;
+	sa_attr_type_t *sa_table;
+	sa_handle_t *hdl;
+	dmu_buf_t *db;
+	int error;
+
+	*path = '\0';
+
+	error = zfs_sa_setup(osp, &sa_table);
+	if (error != 0)
+		return (error);
+
+	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+	if (error != 0) {
+		zfs_release_sa_handle(hdl, db, FTAG);
+		return (error);
+	}
+
+	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+	zfs_release_sa_handle(hdl, db, FTAG);
+	return (error);
+}
+
+#ifdef _KERNEL
+int
+zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint64_t parent;
+	int is_xattrdir;
+	int err;
+
+	/* Extended attributes should not be visible as regular files. */
+	if ((zp->z_pflags & ZFS_XATTR) != 0)
+		return (SET_ERROR(EINVAL));
+
+	err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table,
+	    &parent, &is_xattrdir);
+	if (err != 0)
+		return (err);
+	ASSERT0(is_xattrdir);
+
+	/* No name as this is a root object. */
+	if (parent == zp->z_id)
+		return (SET_ERROR(EINVAL));
+
+	err = zap_value_search(zfsvfs->z_os, parent, zp->z_id,
+	    ZFS_DIRENT_OBJ(-1ULL), buf);
+	if (err != 0)
+		return (err);
+	err = zfs_zget(zfsvfs, parent, dzpp);
+	return (err);
+}
+#endif /* _KERNEL */
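
A caller-side sketch for zfs_znode_parent_and_name(): on success *dzpp is a
held znode for the parent directory (obtained via zfs_zget()) and buf holds
the component name, so the caller must drop the parent reference.  The
MAXNAMELEN buffer size is an assumption for illustration:

	char name[MAXNAMELEN];
	znode_t *dzp;
	int err;

	err = zfs_znode_parent_and_name(zp, &dzp, name);
	if (err == 0) {
		/* ... use dzp and name ... */
		VN_RELE(ZTOV(dzp));
	}
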
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zil.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zil.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zil.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zil.c	27 Feb 2010 22:31:33 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zil.c	12 May 2017 23:57:15 -0000
@@ -19,10 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/dmu.h>
@@ -33,8 +36,9 @@
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/dsl_dataset.h>
-#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
 #include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
 
 /*
  * The zfs intent log (ZIL) saves transaction records of system calls
@@ -63,9 +67,12 @@
  */
 
 /*
- * This global ZIL switch affects all pools
+ * Disable intent logging replay.  This global ZIL switch affects all pools.
  */
-int zil_disable = 0;	/* disable intent logging */
+int zil_replay_disable = 0;
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN,
+    &zil_replay_disable, 0, "Disable intent logging replay");
 
 /*
  * Tunable parameter for debugging or performance analysis.  Setting
@@ -73,15 +80,37 @@ int zil_disable = 0;	/* disable intent l
  * out-of-order write cache is enabled.
  */
 boolean_t zfs_nocacheflush = B_FALSE;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN,
+    &zfs_nocacheflush, 0, "Disable cache flush");
+boolean_t zfs_trim_enabled = B_TRUE;
+SYSCTL_DECL(_vfs_zfs_trim);
+SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
+    "Enable ZFS TRIM");
 
-static kmem_cache_t *zil_lwb_cache;
+/*
+ * Limit SLOG write size per commit executed with synchronous priority.
+ * Any writes above that are executed with lower (asynchronous) priority
+ * to limit potential SLOG device abuse by a single active ZIL writer.
+ */
+uint64_t zil_slog_limit = 768 * 1024;
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_limit, CTLFLAG_RWTUN,
+    &zil_slog_limit, 0, "Maximal SLOG commit size with sync priority");
 
-static boolean_t zil_empty(zilog_t *zilog);
+static kmem_cache_t *zil_lwb_cache;
 
 #define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
     sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
 
 
+/*
+ * ziltest is by and large an ugly hack, but very useful in
+ * checking replay without tedious work.
+ * When running ziltest we want to keep all itx's and so maintain
+ * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG.
+ * We subtract TXG_CONCURRENT_STATES to allow for common code.
+ */
+#define	ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
+
 static int
 zil_bp_compare(const void *x1, const void *x2)
 {
@@ -125,12 +154,17 @@ int
 zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
 {
 	avl_tree_t *t = &zilog->zl_bp_tree;
-	const dva_t *dva = BP_IDENTITY(bp);
+	const dva_t *dva;
 	zil_bp_node_t *zn;
 	avl_index_t where;
 
+	if (BP_IS_EMBEDDED(bp))
+		return (0);
+
+	dva = BP_IDENTITY(bp);
+
 	if (avl_find(t, dva, &where) != NULL)
-		return (EEXIST);
+		return (SET_ERROR(EEXIST));
 
 	zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
 	zn->zn_dva = *dva;
@@ -164,9 +198,9 @@ zil_read_log_block(zilog_t *zilog, const
     char **end)
 {
 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
-	uint32_t aflags = ARC_WAIT;
+	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf = NULL;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 	int error;
 
 	if (zilog->zl_header->zh_claim_txg == 0)
@@ -178,7 +212,7 @@ zil_read_log_block(zilog_t *zilog, const
 	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
-	error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
@@ -201,8 +235,9 @@ zil_read_log_block(zilog_t *zilog, const
 
 			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
-				error = ECKSUM;
+				error = SET_ERROR(ECKSUM);
 			} else {
+				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
 				bcopy(lr, dst, len);
 				*end = (char *)dst + len;
 				*nbp = zilc->zc_next_blk;
@@ -215,15 +250,17 @@ zil_read_log_block(zilog_t *zilog, const
 			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
 			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
-				error = ECKSUM;
+				error = SET_ERROR(ECKSUM);
 			} else {
+				ASSERT3U(zilc->zc_nused, <=,
+				    SPA_OLD_MAXBLOCKSIZE);
 				bcopy(lr, dst, zilc->zc_nused);
 				*end = (char *)dst + zilc->zc_nused;
 				*nbp = zilc->zc_next_blk;
 			}
 		}
 
-		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
@@ -237,9 +274,9 @@ zil_read_log_data(zilog_t *zilog, const 
 {
 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
 	const blkptr_t *bp = &lr->lr_blkptr;
-	uint32_t aflags = ARC_WAIT;
+	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf = NULL;
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
 	int error;
 
 	if (BP_IS_HOLE(bp)) {
@@ -254,13 +291,13 @@ zil_read_log_data(zilog_t *zilog, const 
 	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
 	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 
-	error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		if (wbuf != NULL)
 			bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
-		(void) arc_buf_remove_ref(abuf, &abuf);
+		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
@@ -300,7 +337,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_
 	 * If the log has been claimed, stop if we encounter a sequence
 	 * number greater than the highest claimed sequence number.
 	 */
-	lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+	lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
 	zil_bp_tree_init(zilog);
 
 	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
@@ -320,7 +357,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_
 			break;
 
 		error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
-		if (error)
+		if (error != 0)
 			break;
 
 		for (lrp = lrbuf; lrp < end; lrp += reclen) {
@@ -347,7 +384,7 @@ done:
 	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
 
 	zil_bp_tree_fini(zilog);
-	zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
+	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
 
 	return (error);
 }
@@ -359,7 +396,8 @@ zil_claim_log_block(zilog_t *zilog, blkp
 	 * Claim log block if not already committed and not already claimed.
 	 * If tx == NULL, just verify that the block is claimable.
 	 */
-	if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
+	if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
+	    zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
 	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
@@ -409,20 +447,22 @@ zil_free_log_record(zilog_t *zilog, lr_t
 	 * If we previously claimed it, we need to free it.
 	 */
 	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
-	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
+	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
+	    !BP_IS_HOLE(bp))
 		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 
 	return (0);
 }
 
 static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
 {
 	lwb_t *lwb;
 
 	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 	lwb->lwb_zilog = zilog;
 	lwb->lwb_blk = *bp;
+	lwb->lwb_slog = slog;
 	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
 	lwb->lwb_max_txg = txg;
 	lwb->lwb_zio = NULL;
@@ -443,6 +483,58 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *
 }
 
 /*
+ * Called when we create in-memory log transactions so that we know
+ * to clean up the itxs at the end of spa_sync().
+ */
+void
+zilog_dirty(zilog_t *zilog, uint64_t txg)
+{
+	dsl_pool_t *dp = zilog->zl_dmu_pool;
+	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+
+	if (ds->ds_is_snapshot)
+		panic("dirtying snapshot!");
+
+	if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
+		/* up the hold count until we can be written out */
+		dmu_buf_add_ref(ds->ds_dbuf, zilog);
+	}
+}
+
+/*
+ * Determine if the zil is dirty in the specified txg. Callers wanting to
+ * ensure that the dirty state does not change must hold the itxg_lock for
+ * the specified txg. Holding the lock will ensure that the zil cannot be
+ * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
+ * state.
+ */
+boolean_t
+zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
+{
+	dsl_pool_t *dp = zilog->zl_dmu_pool;
+
+	if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
+/*
+ * Determine if the zil is dirty. The zil is considered dirty if it has
+ * any pending itx records that have not been cleaned by zil_clean().
+ */
+boolean_t
+zilog_is_dirty(zilog_t *zilog)
+{
+	dsl_pool_t *dp = zilog->zl_dmu_pool;
+
+	for (int t = 0; t < TXG_SIZE; t++) {
+		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
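
Per the comment above zilog_is_dirty_in_txg(), a stable answer requires
holding the itxg_lock for the txg being checked.  A sketch of the locking
rule, assuming zilog and txg are in scope:

	itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];

	mutex_enter(&itxg->itxg_lock);
	if (zilog_is_dirty_in_txg(zilog, txg)) {
		/* itxs for this txg cannot be cleaned while the lock is held. */
	}
	mutex_exit(&itxg->itxg_lock);
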
+
+/*
  * Create an on-disk intent log.
  */
 static lwb_t *
@@ -454,6 +546,7 @@ zil_create(zilog_t *zilog)
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
+	boolean_t slog = FALSE;
 
 	/*
 	 * Wait for any previous destroy to complete.
@@ -482,7 +575,7 @@ zil_create(zilog_t *zilog)
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
-		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+		    ZIL_MIN_BLKSZ, &slog);
 
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
@@ -492,7 +585,7 @@ zil_create(zilog_t *zilog)
 	 * Allocate a log write buffer (lwb) for the first log block.
 	 */
 	if (error == 0)
-		lwb = zil_alloc_lwb(zilog, &blk, txg);
+		lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
@@ -549,7 +642,7 @@ zil_destroy(zilog_t *zilog, boolean_t ke
 
 	if (!list_is_empty(&zilog->zl_lwb_list)) {
 		ASSERT(zh->zh_claim_txg == 0);
-		ASSERT(!keep_first);
+		VERIFY(!keep_first);
 		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
 			list_remove(&zilog->zl_lwb_list, lwb);
 			if (lwb->lwb_buf != NULL)
@@ -558,16 +651,23 @@ zil_destroy(zilog_t *zilog, boolean_t ke
 			kmem_cache_free(zil_lwb_cache, lwb);
 		}
 	} else if (!keep_first) {
-		(void) zil_parse(zilog, zil_free_log_block,
-		    zil_free_log_record, tx, zh->zh_claim_txg);
+		zil_destroy_sync(zilog, tx);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	dmu_tx_commit(tx);
 }
 
+void
+zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+	ASSERT(list_is_empty(&zilog->zl_lwb_list));
+	(void) zil_parse(zilog, zil_free_log_block,
+	    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg);
+}
+
 int
-zil_claim(const char *osname, void *txarg)
+zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
 {
 	dmu_tx_t *tx = txarg;
 	uint64_t first_txg = dmu_tx_get_txg(tx);
@@ -576,9 +676,17 @@ zil_claim(const char *osname, void *txar
 	objset_t *os;
 	int error;
 
-	error = dmu_objset_hold(osname, FTAG, &os);
-	if (error) {
-		cmn_err(CE_WARN, "can't open objset for %s", osname);
+	error = dmu_objset_own_obj(dp, ds->ds_object,
+	    DMU_OST_ANY, B_FALSE, FTAG, &os);
+	if (error != 0) {
+		/*
+		 * EBUSY indicates that the objset is inconsistent, in which
+		 * case it cannot have a ZIL.
+		 */
+		if (error != EBUSY) {
+			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
+			    (unsigned long long)ds->ds_object, error);
+		}
 		return (0);
 	}
 
@@ -590,7 +698,7 @@ zil_claim(const char *osname, void *txar
 			zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log);
 		BP_ZERO(&zh->zh_log);
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
-		dmu_objset_rele(os, FTAG);
+		dmu_objset_disown(os, FTAG);
 		return (0);
 	}
 
@@ -615,7 +723,7 @@ zil_claim(const char *osname, void *txar
 	}
 
 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
-	dmu_objset_rele(os, FTAG);
+	dmu_objset_disown(os, FTAG);
 	return (0);
 }
 
@@ -624,22 +732,46 @@ zil_claim(const char *osname, void *txar
  * Checksum errors are ok as they indicate the end of the chain.
  * Any other error (no device or read failure) returns an error.
  */
+/* ARGSUSED */
 int
-zil_check_log_chain(const char *osname, void *tx)
+zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
 {
 	zilog_t *zilog;
 	objset_t *os;
+	blkptr_t *bp;
 	int error;
 
 	ASSERT(tx == NULL);
 
-	error = dmu_objset_hold(osname, FTAG, &os);
-	if (error) {
-		cmn_err(CE_WARN, "can't open objset for %s", osname);
+	error = dmu_objset_from_ds(ds, &os);
+	if (error != 0) {
+		cmn_err(CE_WARN, "can't open objset %llu, error %d",
+		    (unsigned long long)ds->ds_object, error);
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
+	bp = (blkptr_t *)&zilog->zl_header->zh_log;
+
+	/*
+	 * Check the first block and determine if it's on a log device
+	 * which may have been removed or faulted prior to loading this
+	 * pool.  If so, there's no point in checking the rest of the log
+	 * as its content should have already been synced to the pool.
+	 */
+	if (!BP_IS_HOLE(bp)) {
+		vdev_t *vd;
+		boolean_t valid = B_TRUE;
+
+		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
+		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+		if (vd->vdev_islog && vdev_is_dead(vd))
+			valid = vdev_log_state_valid(vd);
+		spa_config_exit(os->os_spa, SCL_STATE, FTAG);
+
+		if (!valid)
+			return (0);
+	}
 
 	/*
 	 * Because tx == NULL, zil_claim_log_block() will not actually claim
@@ -651,16 +783,14 @@ zil_check_log_chain(const char *osname, 
 	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
 	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
 
-	dmu_objset_rele(os, FTAG);
-
 	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
 
 static int
 zil_vdev_compare(const void *x1, const void *x2)
 {
-	uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
-	uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
+	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
+	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
 
 	if (v1 < v2)
 		return (-1);
@@ -701,14 +831,14 @@ zil_add_block(zilog_t *zilog, const blkp
 	mutex_exit(&zilog->zl_vdev_lock);
 }
 
-void
+static void
 zil_flush_vdevs(zilog_t *zilog)
 {
 	spa_t *spa = zilog->zl_spa;
 	avl_tree_t *t = &zilog->zl_vdev_tree;
 	void *cookie = NULL;
 	zil_vdev_node_t *zv;
-	zio_t *zio;
+	zio_t *zio = NULL;
 
 	ASSERT(zilog->zl_writer);
 
@@ -721,12 +851,13 @@ zil_flush_vdevs(zilog_t *zilog)
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
-	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
 	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
 		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
-		if (vd != NULL)
+		if (vd != NULL && !vd->vdev_nowritecache) {
+			if (zio == NULL)
+				zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 			zio_flush(zio, vd);
+		}
 		kmem_free(zv, sizeof (*zv));
 	}
 
@@ -734,7 +865,8 @@ zil_flush_vdevs(zilog_t *zilog)
 	 * Wait for all the flushes to complete.  Not all devices actually
 	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
 	 */
-	(void) zio_wait(zio);
+	if (zio)
+		(void) zio_wait(zio);
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 }
@@ -755,7 +887,7 @@ zil_lwb_write_done(zio_t *zio)
 	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
 	ASSERT(!BP_IS_GANG(zio->io_bp));
 	ASSERT(!BP_IS_HOLE(zio->io_bp));
-	ASSERT(zio->io_bp->blk_fill == 0);
+	ASSERT(BP_GET_FILL(zio->io_bp) == 0);
 
 	/*
 	 * Ensure the lwb buffer pointer is cleared before releasing
@@ -785,7 +917,8 @@ zil_lwb_write_done(zio_t *zio)
 static void
 zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
 {
-	zbookmark_t zb;
+	zbookmark_phys_t zb;
+	zio_priority_t prio;
 
 	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -796,18 +929,23 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t
 		    ZIO_FLAG_CANFAIL);
 	}
 	if (lwb->lwb_zio == NULL) {
+		if (zilog->zl_cur_used <= zil_slog_limit || !lwb->lwb_slog)
+			prio = ZIO_PRIORITY_SYNC_WRITE;
+		else
+			prio = ZIO_PRIORITY_ASYNC_WRITE;
 		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
 		    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
-		    zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
+		    zil_lwb_write_done, lwb, prio,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
 	}
 }
 
 /*
  * Define a limited set of intent log block sizes.
+ *
  * These must be a multiple of 4KB. Note only the amount used (again
  * aligned to 4KB) actually gets written. However, we can't always just
- * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
  */
 uint64_t zil_block_buckets[] = {
     4096,		/* non TX_WRITE */
@@ -817,21 +955,11 @@ uint64_t zil_block_buckets[] = {
 };
 
 /*
- * Use the slog as long as the logbias is 'latency' and the current commit size
- * is less than the limit or the total list size is less than 2X the limit.
- * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
- */
-uint64_t zil_slog_limit = 1024 * 1024;
-#define	USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
-	(((zilog)->zl_cur_used < zil_slog_limit) || \
-	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
-
-/*
  * Start a log block write and advance to the next log block.
  * Calls are serialized.
  */
 static lwb_t *
-zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
+zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb, boolean_t last)
 {
 	lwb_t *nlwb = NULL;
 	zil_chain_t *zilc;
@@ -839,8 +967,9 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
 	blkptr_t *bp;
 	dmu_tx_t *tx;
 	uint64_t txg;
-	uint64_t zil_blksz;
+	uint64_t zil_blksz, wsz;
 	int i, error;
+	boolean_t slog;
 
 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
 		zilc = (zil_chain_t *)lwb->lwb_buf;
@@ -889,7 +1018,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
 		continue;
 	zil_blksz = zil_block_buckets[i];
 	if (zil_blksz == UINT64_MAX)
-		zil_blksz = SPA_MAXBLOCKSIZE;
+		zil_blksz = SPA_OLD_MAXBLOCKSIZE;
 	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
 	for (i = 0; i < ZIL_PREV_BLKS; i++)
 		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
@@ -897,9 +1026,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
 
 	BP_ZERO(bp);
 	/* pass the old blkptr in order to spread log blocks across devs */
-	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
-	    USE_SLOG(zilog));
-	if (!error) {
+	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
+	if (error == 0) {
 		ASSERT3U(bp->blk_birth, ==, txg);
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
 		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
@@ -907,25 +1035,33 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
 		/*
 		 * Allocate a new log write buffer (lwb).
 		 */
-		nlwb = zil_alloc_lwb(zilog, bp, txg);
+		nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
 
 		/* Record the block for later vdev flushing */
 		zil_add_block(zilog, &lwb->lwb_blk);
 	}
 
 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
-		uint64_t len;
-
 		/* For Slim ZIL only write what is used. */
-		len = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
-		ASSERT3U(len, <=, lwb->lwb_sz);
-		zio_shrink(lwb->lwb_zio, len);
+		wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
+		ASSERT3U(wsz, <=, lwb->lwb_sz);
+		zio_shrink(lwb->lwb_zio, wsz);
 
+	} else {
+		wsz = lwb->lwb_sz;
 	}
+
 	zilc->zc_pad = 0;
 	zilc->zc_nused = lwb->lwb_nused;
 	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
 
+	/*
+	 * clear unused data for security
+	 */
+	bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
+
+	if (last)
+		lwb->lwb_zio->io_pipeline &= ~ZIO_STAGE_ISSUE_ASYNC;
 	zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */
 
 	/*
@@ -938,12 +1074,13 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
 static lwb_t *
 zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 {
-	lr_t *lrc = &itx->itx_lr; /* common log record */
-	lr_write_t *lrw = (lr_write_t *)lrc;
+	lr_t *lrcb, *lrc = &itx->itx_lr; /* common log record */
+	lr_write_t *lrwb, *lrw = (lr_write_t *)lrc;
 	char *lr_buf;
 	uint64_t txg = lrc->lrc_txg;
 	uint64_t reclen = lrc->lrc_reclen;
 	uint64_t dlen = 0;
+	uint64_t dnow, lwb_sp;
 
 	if (lwb == NULL)
 		return (NULL);
@@ -958,25 +1095,30 @@ zil_lwb_commit(zilog_t *zilog, itx_t *it
 
 	zil_lwb_write_init(zilog, lwb);
 
+cont:
 	/*
 	 * If this record won't fit in the current log block, start a new one.
+	 * For WR_NEED_COPY, optimize the layout for a minimal number of chunks,
+	 * but try to keep wasted space within a reasonable range (12%).
 	 */
-	if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
-		lwb = zil_lwb_write_start(zilog, lwb);
+	lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
+	    lwb_sp < ZIL_MAX_LOG_DATA / 8 && (dlen % ZIL_MAX_LOG_DATA == 0 ||
+	    lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
+		lwb = zil_lwb_write_start(zilog, lwb, B_FALSE);
 		if (lwb == NULL)
 			return (NULL);
 		zil_lwb_write_init(zilog, lwb);
 		ASSERT(LWB_EMPTY(lwb));
-		if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
-			txg_wait_synced(zilog->zl_dmu_pool, txg);
-			return (lwb);
-		}
+		lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+		ASSERT3U(reclen + MIN(dlen, sizeof(uint64_t)), <=, lwb_sp);
 	}
 
+	dnow = MIN(dlen, lwb_sp - reclen);
 	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
 	bcopy(lrc, lr_buf, reclen);
-	lrc = (lr_t *)lr_buf;
-	lrw = (lr_write_t *)lrc;
+	lrcb = (lr_t *)lr_buf;
+	lrwb = (lr_write_t *)lrcb;
 
 	/*
 	 * If it's a write, fetch the data or get its blkptr as appropriate.
@@ -988,21 +1130,24 @@ zil_lwb_commit(zilog_t *zilog, itx_t *it
 			char *dbuf;
 			int error;
 
-			if (dlen) {
-				ASSERT(itx->itx_wr_state == WR_NEED_COPY);
+			if (itx->itx_wr_state == WR_NEED_COPY) {
 				dbuf = lr_buf + reclen;
-				lrw->lr_common.lrc_reclen += dlen;
+				lrcb->lrc_reclen += dnow;
+				if (lrwb->lr_length > dnow)
+					lrwb->lr_length = dnow;
+				lrw->lr_offset += dnow;
+				lrw->lr_length -= dnow;
 			} else {
 				ASSERT(itx->itx_wr_state == WR_INDIRECT);
 				dbuf = NULL;
 			}
 			error = zilog->zl_get_data(
-			    itx->itx_private, lrw, dbuf, lwb->lwb_zio);
+			    itx->itx_private, lrwb, dbuf, lwb->lwb_zio);
 			if (error == EIO) {
 				txg_wait_synced(zilog->zl_dmu_pool, txg);
 				return (lwb);
 			}
-			if (error) {
+			if (error != 0) {
 				ASSERT(error == ENOENT || error == EEXIST ||
 				    error == EALREADY);
 				return (lwb);
@@ -1016,11 +1161,17 @@ zil_lwb_commit(zilog_t *zilog, itx_t *it
 	 * equal to the itx sequence number because not all transactions
 	 * are synchronous, and sometimes spa_sync() gets there first.
 	 */
-	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
-	lwb->lwb_nused += reclen + dlen;
+	lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
+	lwb->lwb_nused += reclen + dnow;
 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
-	ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0);
+	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
+
+	dlen -= dnow;
+	if (dlen > 0) {
+		zilog->zl_cur_used += reclen;
+		goto cont;
+	}
 
 	return (lwb);
 }
@@ -1035,8 +1186,8 @@ zil_itx_create(uint64_t txtype, size_t l
 	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = lrsize;
-	itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
+	itx->itx_sync = B_TRUE;		/* default is synchronous */
 
 	return (itx);
 }
@@ -1047,190 +1198,380 @@ zil_itx_destroy(itx_t *itx)
 	kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen);
 }
 
-uint64_t
-zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+/*
+ * Free up the sync and async itxs. The itxs_t has already been detached
+ * so no locks are needed.
+ */
+static void
+zil_itxg_clean(itxs_t *itxs)
 {
-	uint64_t seq;
+	itx_t *itx;
+	list_t *list;
+	avl_tree_t *t;
+	void *cookie;
+	itx_async_node_t *ian;
+
+	list = &itxs->i_sync_list;
+	while ((itx = list_head(list)) != NULL) {
+		list_remove(list, itx);
+		kmem_free(itx, offsetof(itx_t, itx_lr) +
+		    itx->itx_lr.lrc_reclen);
+	}
+
+	cookie = NULL;
+	t = &itxs->i_async_tree;
+	while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+		list = &ian->ia_list;
+		while ((itx = list_head(list)) != NULL) {
+			list_remove(list, itx);
+			kmem_free(itx, offsetof(itx_t, itx_lr) +
+			    itx->itx_lr.lrc_reclen);
+		}
+		list_destroy(list);
+		kmem_free(ian, sizeof (itx_async_node_t));
+	}
+	avl_destroy(t);
 
-	ASSERT(itx->itx_lr.lrc_seq == 0);
-	ASSERT(!zilog->zl_replay);
+	kmem_free(itxs, sizeof (itxs_t));
+}
 
-	mutex_enter(&zilog->zl_lock);
-	list_insert_tail(&zilog->zl_itx_list, itx);
-	zilog->zl_itx_list_sz += itx->itx_sod;
-	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
-	itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq;
-	mutex_exit(&zilog->zl_lock);
+static int
+zil_aitx_compare(const void *x1, const void *x2)
+{
+	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
+	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
 
-	return (seq);
+	if (o1 < o2)
+		return (-1);
+	if (o1 > o2)
+		return (1);
+
+	return (0);
 }
 
 /*
- * Free up all in-memory intent log transactions that have now been synced.
+ * Remove all async itxs with the given oid.
  */
 static void
-zil_itx_clean(zilog_t *zilog)
+zil_remove_async(zilog_t *zilog, uint64_t oid)
 {
-	uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa);
-	uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa);
+	uint64_t otxg, txg;
+	itx_async_node_t *ian;
+	avl_tree_t *t;
+	avl_index_t where;
 	list_t clean_list;
 	itx_t *itx;
 
+	ASSERT(oid != 0);
 	list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
 
-	mutex_enter(&zilog->zl_lock);
-	/* wait for a log writer to finish walking list */
-	while (zilog->zl_writer) {
-		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-	}
+	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+		otxg = ZILTEST_TXG;
+	else
+		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
-	/*
-	 * Move the sync'd log transactions to a separate list so we can call
-	 * kmem_free without holding the zl_lock.
-	 *
-	 * There is no need to set zl_writer as we don't drop zl_lock here
-	 */
-	while ((itx = list_head(&zilog->zl_itx_list)) != NULL &&
-	    itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) {
-		list_remove(&zilog->zl_itx_list, itx);
-		zilog->zl_itx_list_sz -= itx->itx_sod;
-		list_insert_tail(&clean_list, itx);
-	}
-	cv_broadcast(&zilog->zl_cv_writer);
-	mutex_exit(&zilog->zl_lock);
+	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+		mutex_enter(&itxg->itxg_lock);
+		if (itxg->itxg_txg != txg) {
+			mutex_exit(&itxg->itxg_lock);
+			continue;
+		}
 
-	/* destroy sync'd log transactions */
+		/*
+		 * Locate the object node and append its list.
+		 */
+		t = &itxg->itxg_itxs->i_async_tree;
+		ian = avl_find(t, &oid, &where);
+		if (ian != NULL)
+			list_move_tail(&clean_list, &ian->ia_list);
+		mutex_exit(&itxg->itxg_lock);
+	}
 	while ((itx = list_head(&clean_list)) != NULL) {
 		list_remove(&clean_list, itx);
-		zil_itx_destroy(itx);
+		kmem_free(itx, offsetof(itx_t, itx_lr) +
+		    itx->itx_lr.lrc_reclen);
 	}
 	list_destroy(&clean_list);
 }
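The scan pattern above (also used by zil_get_commit_list() and
zil_async_to_sync() below) only needs to visit the TXG_CONCURRENT_STATES
txgs that can still hold live itxgs, mapping each txg onto a slot of the
zl_itxg[] ring with txg & TXG_MASK.  A toy illustration, assuming the
customary TXG_SIZE of 4 rather than pulling it from the headers:

    #include <stdio.h>

    #define TXG_SIZE                4
    #define TXG_MASK                (TXG_SIZE - 1)
    #define TXG_CONCURRENT_STATES   3

    int
    main(void)
    {
        unsigned long long otxg = 1001;   /* last synced txg + 1 */

        for (unsigned long long txg = otxg;
            txg < otxg + TXG_CONCURRENT_STATES; txg++)
            printf("txg %llu -> zl_itxg[%llu]\n", txg, txg & TXG_MASK);
        /* Stale slots are detected by comparing itxg_txg against txg. */
        return 0;
    }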
 
+void
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+	uint64_t txg;
+	itxg_t *itxg;
+	itxs_t *itxs, *clean = NULL;
+
+	/*
+	 * Object ids can be re-instantiated in the next txg so
+	 * remove any async transactions to avoid future leaks.
+	 * This can happen if an fsync occurs on the re-instantiated
+	 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
+	 * the new file data and flushes a write record for the old object.
+	 */
+	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
+		zil_remove_async(zilog, itx->itx_oid);
+
+	/*
+	 * Ensure the data of a renamed file is committed before the rename.
+	 */
+	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
+		zil_async_to_sync(zilog, itx->itx_oid);
+
+	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
+		txg = ZILTEST_TXG;
+	else
+		txg = dmu_tx_get_txg(tx);
+
+	itxg = &zilog->zl_itxg[txg & TXG_MASK];
+	mutex_enter(&itxg->itxg_lock);
+	itxs = itxg->itxg_itxs;
+	if (itxg->itxg_txg != txg) {
+		if (itxs != NULL) {
+			/*
+			 * The zil_clean callback hasn't got around to cleaning
+			 * this itxg. Save the itxs for release below.
+			 * This should be rare.
+			 */
+			clean = itxg->itxg_itxs;
+		}
+		itxg->itxg_txg = txg;
+		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
+
+		list_create(&itxs->i_sync_list, sizeof (itx_t),
+		    offsetof(itx_t, itx_node));
+		avl_create(&itxs->i_async_tree, zil_aitx_compare,
+		    sizeof (itx_async_node_t),
+		    offsetof(itx_async_node_t, ia_node));
+	}
+	if (itx->itx_sync) {
+		list_insert_tail(&itxs->i_sync_list, itx);
+	} else {
+		avl_tree_t *t = &itxs->i_async_tree;
+		uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
+		itx_async_node_t *ian;
+		avl_index_t where;
+
+		ian = avl_find(t, &foid, &where);
+		if (ian == NULL) {
+			ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
+			list_create(&ian->ia_list, sizeof (itx_t),
+			    offsetof(itx_t, itx_node));
+			ian->ia_foid = foid;
+			avl_insert(t, ian, where);
+		}
+		list_insert_tail(&ian->ia_list, itx);
+	}
+
+	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+	zilog_dirty(zilog, txg);
+	mutex_exit(&itxg->itxg_lock);
+
+	/* Release the old itxs now we've dropped the lock */
+	if (clean != NULL)
+		zil_itxg_clean(clean);
+}
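A hedged sketch of a caller feeding this function: the helper below is
an invented stand-in for what zfs_log_write() does, using only the
signatures introduced in this patch; the data payload, itx_wr_state
handling, and TX_CI details are deliberately omitted.

    static void
    example_log_write(zilog_t *zilog, dmu_tx_t *tx, uint64_t foid,
        uint64_t off, uint64_t len, boolean_t sync)
    {
        itx_t *itx = zil_itx_create(TX_WRITE, sizeof (lr_write_t));
        lr_write_t *lr = (lr_write_t *)&itx->itx_lr;

        lr->lr_foid = foid;
        lr->lr_offset = off;
        lr->lr_length = len;
        itx->itx_sync = sync;   /* B_FALSE routes it to the async tree */
        zil_itx_assign(zilog, itx, tx);
    }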
+
 /*
  * If there are any in-memory intent log transactions which have now been
- * synced then start up a taskq to free them.
+ * synced then start up a taskq to free them. We should only do this after we
+ * have written out the uberblocks (i.e. the txg has been committed) so
+ * that we don't inadvertently clean out in-memory log records that would
+ * be required by zil_commit().
  */
 void
-zil_clean(zilog_t *zilog)
+zil_clean(zilog_t *zilog, uint64_t synced_txg)
 {
-	itx_t *itx;
+	itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
+	itxs_t *clean_me;
 
-	mutex_enter(&zilog->zl_lock);
-	itx = list_head(&zilog->zl_itx_list);
-	if ((itx != NULL) &&
-	    (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
-		(void) taskq_dispatch(zilog->zl_clean_taskq,
-		    (task_func_t *)zil_itx_clean, zilog, TQ_NOSLEEP);
+	mutex_enter(&itxg->itxg_lock);
+	if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
+		mutex_exit(&itxg->itxg_lock);
+		return;
+	}
+	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
+	ASSERT(itxg->itxg_txg != 0);
+	ASSERT(zilog->zl_clean_taskq != NULL);
+	clean_me = itxg->itxg_itxs;
+	itxg->itxg_itxs = NULL;
+	itxg->itxg_txg = 0;
+	mutex_exit(&itxg->itxg_lock);
+	/*
+	 * Preferably start a task queue to free up the old itxs, but
+	 * if taskq_dispatch can't allocate resources to do that then
+	 * free them in-line. This should be rare. Note that using
+	 * TQ_SLEEP here was found to cause a bad performance problem.
+	 */
+	if (taskq_dispatch(zilog->zl_clean_taskq,
+	    (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0)
+		zil_itxg_clean(clean_me);
+}
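The dispatch-or-inline fallback above is a general pattern worth
isolating; a hedged restatement with an invented helper name:

    static void
    dispatch_or_inline(taskq_t *tq, void (*func)(void *), void *arg)
    {
        /*
         * TQ_NOSLEEP may fail under memory pressure; falling back to
         * an in-line call keeps forward progress without sleeping in
         * taskq_dispatch() itself.
         */
        if (taskq_dispatch(tq, func, arg, TQ_NOSLEEP) == 0)
            func(arg);
    }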
+
+/*
+ * Get the list of itxs to commit into zl_itx_commit_list.
+ */
+static void
+zil_get_commit_list(zilog_t *zilog)
+{
+	uint64_t otxg, txg;
+	list_t *commit_list = &zilog->zl_itx_commit_list;
+
+	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+		otxg = ZILTEST_TXG;
+	else
+		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+	/*
+	 * This is inherently racy, since there is nothing to prevent
+	 * the last synced txg from changing. That's okay since we'll
+	 * only commit things in the future.
+	 */
+	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+		mutex_enter(&itxg->itxg_lock);
+		if (itxg->itxg_txg != txg) {
+			mutex_exit(&itxg->itxg_lock);
+			continue;
+		}
+
+		/*
+		 * If we're adding itx records to the zl_itx_commit_list,
+		 * then the zil had better be dirty in this "txg". We can assert
+		 * that here since we're holding the itxg_lock which will
+		 * prevent spa_sync from cleaning it. Once we add the itxs
+		 * to the zl_itx_commit_list we must commit it to disk even
+		 * if it's unnecessary (i.e. the txg was synced).
+		 */
+		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
+		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
+		list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
+
+		mutex_exit(&itxg->itxg_lock);
+	}
+}
+
+/*
+ * Move the async itxs for a specified object to commit into sync lists.
+ */
+void
+zil_async_to_sync(zilog_t *zilog, uint64_t foid)
+{
+	uint64_t otxg, txg;
+	itx_async_node_t *ian;
+	avl_tree_t *t;
+	avl_index_t where;
+
+	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+		otxg = ZILTEST_TXG;
+	else
+		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+	/*
+	 * This is inherently racy, since there is nothing to prevent
+	 * the last synced txg from changing.
+	 */
+	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+		mutex_enter(&itxg->itxg_lock);
+		if (itxg->itxg_txg != txg) {
+			mutex_exit(&itxg->itxg_lock);
+			continue;
+		}
+
+		/*
+		 * If a foid is specified then find that node and append its
+		 * list. Otherwise walk the tree appending all the lists
+		 * to the sync list. We add to the end rather than the
+		 * beginning to ensure the create has happened.
+		 */
+		t = &itxg->itxg_itxs->i_async_tree;
+		if (foid != 0) {
+			ian = avl_find(t, &foid, &where);
+			if (ian != NULL) {
+				list_move_tail(&itxg->itxg_itxs->i_sync_list,
+				    &ian->ia_list);
+			}
+		} else {
+			void *cookie = NULL;
+
+			while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+				list_move_tail(&itxg->itxg_itxs->i_sync_list,
+				    &ian->ia_list);
+				list_destroy(&ian->ia_list);
+				kmem_free(ian, sizeof (itx_async_node_t));
+			}
+		}
+		mutex_exit(&itxg->itxg_lock);
 	}
-	mutex_exit(&zilog->zl_lock);
 }
 
 static void
-zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
+zil_commit_writer(zilog_t *zilog)
 {
 	uint64_t txg;
-	uint64_t commit_seq = 0;
-	itx_t *itx, *itx_next;
+	itx_t *itx;
 	lwb_t *lwb;
-	spa_t *spa;
+	spa_t *spa = zilog->zl_spa;
 	int error = 0;
 
-	zilog->zl_writer = B_TRUE;
 	ASSERT(zilog->zl_root_zio == NULL);
-	spa = zilog->zl_spa;
+
+	mutex_exit(&zilog->zl_lock);
+
+	zil_get_commit_list(zilog);
+
+	/*
+	 * Return if there's nothing to commit before we dirty the fs by
+	 * calling zil_create().
+	 */
+	if (list_head(&zilog->zl_itx_commit_list) == NULL) {
+		mutex_enter(&zilog->zl_lock);
+		return;
+	}
 
 	if (zilog->zl_suspend) {
 		lwb = NULL;
 	} else {
 		lwb = list_tail(&zilog->zl_lwb_list);
-		if (lwb == NULL) {
-			/*
-			 * Return if there's nothing to flush before we
-			 * dirty the fs by calling zil_create()
-			 */
-			if (list_is_empty(&zilog->zl_itx_list)) {
-				zilog->zl_writer = B_FALSE;
-				return;
-			}
-			mutex_exit(&zilog->zl_lock);
+		if (lwb == NULL)
 			lwb = zil_create(zilog);
-			mutex_enter(&zilog->zl_lock);
-		}
 	}
-	ASSERT(lwb == NULL || lwb->lwb_zio == NULL);
 
-	/* Loop through in-memory log transactions filling log blocks. */
 	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
-
-	for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) {
-		/*
-		 * Save the next pointer.  Even though we drop zl_lock below,
-		 * all threads that can remove itx list entries (other writers
-		 * and zil_itx_clean()) can't do so until they have zl_writer.
-		 */
-		itx_next = list_next(&zilog->zl_itx_list, itx);
+	while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
+		txg = itx->itx_lr.lrc_txg;
+		ASSERT3U(txg, !=, 0);
 
 		/*
-		 * Determine whether to push this itx.
-		 * Push all transactions related to specified foid and
-		 * all other transactions except those that can be logged
-		 * out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL)
-		 * for all other files.
-		 *
-		 * If foid == 0 (meaning "push all foids") or
-		 * itx->itx_sync is set (meaning O_[D]SYNC), push regardless.
+		 * This is inherently racy and may result in us writing
+		 * out a log block for a txg that was just synced. This is
+		 * okay since we'll end up cleaning that log block the next
+		 * time we call zil_sync().
 		 */
-		if (foid != 0 && !itx->itx_sync &&
-		    TX_OOO(itx->itx_lr.lrc_txtype) &&
-		    ((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid)
-			continue; /* skip this record */
-
-		if ((itx->itx_lr.lrc_seq > seq) &&
-		    ((lwb == NULL) || (LWB_EMPTY(lwb)) ||
-		    (lwb->lwb_nused + itx->itx_sod > lwb->lwb_sz)))
-			break;
-
-		list_remove(&zilog->zl_itx_list, itx);
-		zilog->zl_itx_list_sz -= itx->itx_sod;
-
-		mutex_exit(&zilog->zl_lock);
-
-		txg = itx->itx_lr.lrc_txg;
-		ASSERT(txg);
-
-		if (txg > spa_last_synced_txg(spa) ||
-		    txg > spa_freeze_txg(spa))
+		if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
 			lwb = zil_lwb_commit(zilog, itx, lwb);
-
-		zil_itx_destroy(itx);
-
-		mutex_enter(&zilog->zl_lock);
+		list_remove(&zilog->zl_itx_commit_list, itx);
+		kmem_free(itx, offsetof(itx_t, itx_lr)
+		    + itx->itx_lr.lrc_reclen);
 	}
 	DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
-	/* determine commit sequence number */
-	itx = list_head(&zilog->zl_itx_list);
-	if (itx)
-		commit_seq = itx->itx_lr.lrc_seq - 1;
-	else
-		commit_seq = zilog->zl_itx_seq;
-	mutex_exit(&zilog->zl_lock);
 
 	/* write the last block out */
 	if (lwb != NULL && lwb->lwb_zio != NULL)
-		lwb = zil_lwb_write_start(zilog, lwb);
+		lwb = zil_lwb_write_start(zilog, lwb, B_TRUE);
 
-	zilog->zl_prev_used = zilog->zl_cur_used;
 	zilog->zl_cur_used = 0;
 
 	/*
 	 * Wait if necessary for the log blocks to be on stable storage.
 	 */
 	if (zilog->zl_root_zio) {
-		DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
 		error = zio_wait(zilog->zl_root_zio);
 		zilog->zl_root_zio = NULL;
-		DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
 		zil_flush_vdevs(zilog);
 	}
 
@@ -1238,10 +1579,6 @@ zil_commit_writer(zilog_t *zilog, uint64
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	mutex_enter(&zilog->zl_lock);
-	zilog->zl_writer = B_FALSE;
-
-	ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
-	zilog->zl_commit_seq = commit_seq;
 
 	/*
 	 * Remember the highest committed log sequence number for ztest.
@@ -1253,58 +1590,61 @@ zil_commit_writer(zilog_t *zilog, uint64
 }
 
 /*
- * Push zfs transactions to stable storage up to the supplied sequence number.
+ * Commit zfs transactions to stable storage.
  * If foid is 0 push out all transactions, otherwise push only those
- * for that file or might have been used to create that file.
+ * for that object or that might reference that object.
+ *
+ * itxs are committed in batches. In a heavily stressed zil there will be
+ * a commit writer thread that is writing out a bunch of itxs to the log
+ * for a set of committing threads (cthreads) in the same batch as the writer.
+ * Those cthreads are all waiting on the same cv for that batch.
+ *
+ * There will also be a different and growing batch of threads that are
+ * waiting to commit (qthreads). When the committing batch completes,
+ * a transition occurs such that the cthreads exit and the qthreads become
+ * cthreads. One of the new cthreads becomes the writer thread for the
+ * batch. Any new threads arriving become new qthreads.
+ *
+ * Only two condition variables are needed; no hand-off between the
+ * two cvs is required, since batches simply flip-flop between them,
+ * one cv serving the qthreads and the other the cthreads.
+ *
+ * Using this scheme we can efficiently wake up only those threads
+ * whose itxs have been committed.
  */
 void
-zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
+zil_commit(zilog_t *zilog, uint64_t foid)
 {
-	if (zilog == NULL || seq == 0)
-		return;
+	uint64_t mybatch;
 
-	mutex_enter(&zilog->zl_lock);
+	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+		return;
 
-	seq = MIN(seq, zilog->zl_itx_seq);	/* cap seq at largest itx seq */
+	/* move the async itxs for the foid to the sync queues */
+	zil_async_to_sync(zilog, foid);
 
+	mutex_enter(&zilog->zl_lock);
+	mybatch = zilog->zl_next_batch;
 	while (zilog->zl_writer) {
-		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
-		if (seq <= zilog->zl_commit_seq) {
+		cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock);
+		if (mybatch <= zilog->zl_com_batch) {
 			mutex_exit(&zilog->zl_lock);
 			return;
 		}
 	}
-	zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
-	/* wake up others waiting on the commit */
-	cv_broadcast(&zilog->zl_cv_writer);
-	mutex_exit(&zilog->zl_lock);
-}
-
-/*
- * Report whether all transactions are committed.
- */
-static boolean_t
-zil_is_committed(zilog_t *zilog)
-{
-	lwb_t *lwb;
-	boolean_t committed;
 
-	mutex_enter(&zilog->zl_lock);
-
-	while (zilog->zl_writer)
-		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+	zilog->zl_next_batch++;
+	zilog->zl_writer = B_TRUE;
+	zil_commit_writer(zilog);
+	zilog->zl_com_batch = mybatch;
+	zilog->zl_writer = B_FALSE;
+	mutex_exit(&zilog->zl_lock);
 
-	if (!list_is_empty(&zilog->zl_itx_list))
-		committed = B_FALSE;		/* unpushed transactions */
-	else if ((lwb = list_head(&zilog->zl_lwb_list)) == NULL)
-		committed = B_TRUE;		/* intent log never used */
-	else if (list_next(&zilog->zl_lwb_list, lwb) != NULL)
-		committed = B_FALSE;		/* zil_sync() not done yet */
-	else
-		committed = B_TRUE;		/* everything synced */
+	/* wake up one thread to become the next writer */
+	cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]);
 
-	mutex_exit(&zilog->zl_lock);
-	return (committed);
+	/* wake up all threads waiting for this batch to be committed */
+	cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]);
 }
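A userland sketch of the flip-flop batching described in the comment
above, using pthreads in place of the kernel cv_/mutex_ API; all names
are invented and zil_commit_writer() is reduced to a stub:

    #include <pthread.h>

    static pthread_mutex_t zl_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t zl_cv_batch[2] = {
        PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER
    };
    static unsigned long long zl_next_batch = 1, zl_com_batch;
    static int zl_writer;

    static void
    write_out_batch(void)
    {
        /* stand-in for zil_commit_writer() */
    }

    void
    commit(void)
    {
        unsigned long long mybatch;

        pthread_mutex_lock(&zl_lock);
        mybatch = zl_next_batch;
        while (zl_writer) {
            pthread_cond_wait(&zl_cv_batch[mybatch & 1], &zl_lock);
            if (mybatch <= zl_com_batch) {  /* our batch already done */
                pthread_mutex_unlock(&zl_lock);
                return;
            }
        }
        zl_next_batch++;                    /* we become the writer */
        zl_writer = 1;
        pthread_mutex_unlock(&zl_lock);
        write_out_batch();
        pthread_mutex_lock(&zl_lock);
        zl_com_batch = mybatch;
        zl_writer = 0;
        pthread_mutex_unlock(&zl_lock);

        /* one qthread becomes the next writer ... */
        pthread_cond_signal(&zl_cv_batch[(mybatch + 1) & 1]);
        /* ... and every cthread in our batch is released */
        pthread_cond_broadcast(&zl_cv_batch[mybatch & 1]);
    }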
 
 /*
@@ -1392,6 +1732,12 @@ zil_fini(void)
 }
 
 void
+zil_set_sync(zilog_t *zilog, uint64_t sync)
+{
+	zilog->zl_sync = sync;
+}
+
+void
 zil_set_logbias(zilog_t *zilog, uint64_t logbias)
 {
 	zilog->zl_logbias = logbias;
@@ -1410,15 +1756,22 @@ zil_alloc(objset_t *os, zil_header_t *zh
 	zilog->zl_dmu_pool = dmu_objset_pool(os);
 	zilog->zl_destroy_txg = TXG_INITIAL - 1;
 	zilog->zl_logbias = dmu_objset_logbias(os);
+	zilog->zl_sync = dmu_objset_syncprop(os);
+	zilog->zl_next_batch = 1;
 
 	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
 
-	list_create(&zilog->zl_itx_list, sizeof (itx_t),
-	    offsetof(itx_t, itx_node));
+	for (int i = 0; i < TXG_SIZE; i++) {
+		mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
+		    MUTEX_DEFAULT, NULL);
+	}
 
 	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
 	    offsetof(lwb_t, lwb_node));
 
+	list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
+	    offsetof(itx_t, itx_node));
+
 	mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	avl_create(&zilog->zl_vdev_tree, zil_vdev_compare,
@@ -1426,6 +1779,8 @@ zil_alloc(objset_t *os, zil_header_t *zh
 
 	cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL);
 	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
+	cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL);
+	cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL);
 
 	return (zilog);
 }
@@ -1433,27 +1788,39 @@ zil_alloc(objset_t *os, zil_header_t *zh
 void
 zil_free(zilog_t *zilog)
 {
-	lwb_t *lwb;
-
 	zilog->zl_stop_sync = 1;
 
-	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
-		list_remove(&zilog->zl_lwb_list, lwb);
-		if (lwb->lwb_buf != NULL)
-			zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
-		kmem_cache_free(zil_lwb_cache, lwb);
-	}
+	ASSERT0(zilog->zl_suspend);
+	ASSERT0(zilog->zl_suspending);
+
+	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	list_destroy(&zilog->zl_lwb_list);
 
 	avl_destroy(&zilog->zl_vdev_tree);
 	mutex_destroy(&zilog->zl_vdev_lock);
 
-	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
-	list_destroy(&zilog->zl_itx_list);
+	ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
+	list_destroy(&zilog->zl_itx_commit_list);
+
+	for (int i = 0; i < TXG_SIZE; i++) {
+		/*
+		 * It's possible for an itx to be generated that doesn't dirty
+		 * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
+		 * callback to remove the entry. We remove those here.
+		 *
+		 * Also free up the ziltest itxs.
+		 */
+		if (zilog->zl_itxg[i].itxg_itxs)
+			zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
+		mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
+	}
+
 	mutex_destroy(&zilog->zl_lock);
 
 	cv_destroy(&zilog->zl_cv_writer);
 	cv_destroy(&zilog->zl_cv_suspend);
+	cv_destroy(&zilog->zl_cv_batch[0]);
+	cv_destroy(&zilog->zl_cv_batch[1]);
 
 	kmem_free(zilog, sizeof (zilog_t));
 }
@@ -1466,6 +1833,10 @@ zil_open(objset_t *os, zil_get_data_t *g
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 
+	ASSERT(zilog->zl_clean_taskq == NULL);
+	ASSERT(zilog->zl_get_data == NULL);
+	ASSERT(list_is_empty(&zilog->zl_lwb_list));
+
 	zilog->zl_get_data = get_data;
 	zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri,
 	    2, 2, TASKQ_PREPOPULATE);
@@ -1479,67 +1850,146 @@ zil_open(objset_t *os, zil_get_data_t *g
 void
 zil_close(zilog_t *zilog)
 {
+	lwb_t *lwb;
+	uint64_t txg = 0;
+
+	zil_commit(zilog, 0); /* commit all itx */
+
 	/*
-	 * If the log isn't already committed, mark the objset dirty
-	 * (so zil_sync() will be called) and wait for that txg to sync.
+	 * The lwb_max_txg for the stubby lwb will reflect the last activity
+	 * for the zil.  After a txg_wait_synced() on the txg we know all the
+	 * callbacks have occurred that may clean the zil.  Only then can we
+	 * destroy the zl_clean_taskq.
 	 */
-	if (!zil_is_committed(zilog)) {
-		uint64_t txg;
-		dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
-		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
-		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
-		txg = dmu_tx_get_txg(tx);
-		dmu_tx_commit(tx);
+	mutex_enter(&zilog->zl_lock);
+	lwb = list_tail(&zilog->zl_lwb_list);
+	if (lwb != NULL)
+		txg = lwb->lwb_max_txg;
+	mutex_exit(&zilog->zl_lock);
+	if (txg)
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
-	}
+
+	if (zilog_is_dirty(zilog))
+		zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
+	VERIFY(!zilog_is_dirty(zilog));
 
 	taskq_destroy(zilog->zl_clean_taskq);
 	zilog->zl_clean_taskq = NULL;
 	zilog->zl_get_data = NULL;
 
-	zil_itx_clean(zilog);
-	ASSERT(list_head(&zilog->zl_itx_list) == NULL);
+	/*
+	 * We should have only one LWB left on the list; remove it now.
+	 */
+	mutex_enter(&zilog->zl_lock);
+	lwb = list_head(&zilog->zl_lwb_list);
+	if (lwb != NULL) {
+		ASSERT(lwb == list_tail(&zilog->zl_lwb_list));
+		list_remove(&zilog->zl_lwb_list, lwb);
+		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+		kmem_cache_free(zil_lwb_cache, lwb);
+	}
+	mutex_exit(&zilog->zl_lock);
 }
 
+static char *suspend_tag = "zil suspending";
+
 /*
  * Suspend an intent log.  While in suspended mode, we still honor
  * synchronous semantics, but we rely on txg_wait_synced() to do it.
- * We suspend the log briefly when taking a snapshot so that the snapshot
- * contains all the data it's supposed to, and has an empty intent log.
+ * On old version pools, we suspend the log briefly when taking a
+ * snapshot so that it will have an empty intent log.
+ *
+ * Long holds are not really intended to be used the way we do here --
+ * held for such a short time.  A concurrent caller of dsl_dataset_long_held()
+ * could fail.  Therefore we take pains to only put a long hold if it is
+ * actually necessary.  Fortunately, it will only be necessary if the
+ * objset is currently mounted (or the ZVOL equivalent).  In that case it
+ * will already have a long hold, so we are not really making things any worse.
+ *
+ * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
+ * zvol_state_t), and use their mechanism to prevent their hold from being
+ * dropped (e.g. VFS_HOLD()).  However, that would be even more pain for
+ * very little gain.
+ *
+ * If cookiep == NULL, this does both the suspend and the resume.
+ * Otherwise, it returns with the dataset "long held", and the cookie
+ * should be passed into zil_resume().
  */
 int
-zil_suspend(zilog_t *zilog)
+zil_suspend(const char *osname, void **cookiep)
 {
-	const zil_header_t *zh = zilog->zl_header;
+	objset_t *os;
+	zilog_t *zilog;
+	const zil_header_t *zh;
+	int error;
+
+	error = dmu_objset_hold(osname, suspend_tag, &os);
+	if (error != 0)
+		return (error);
+	zilog = dmu_objset_zil(os);
 
 	mutex_enter(&zilog->zl_lock);
+	zh = zilog->zl_header;
+
 	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
 		mutex_exit(&zilog->zl_lock);
-		return (EBUSY);
+		dmu_objset_rele(os, suspend_tag);
+		return (SET_ERROR(EBUSY));
+	}
+
+	/*
+	 * Don't put a long hold in the cases where we can avoid it.  This
+	 * is when there is no cookie, so we are doing a suspend & resume
+	 * (i.e. called from zil_vdev_offline()), and there's nothing to do
+	 * for the suspend: the log is already suspended, or there is no ZIL.
+	 */
+	if (cookiep == NULL && !zilog->zl_suspending &&
+	    (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
+		mutex_exit(&zilog->zl_lock);
+		dmu_objset_rele(os, suspend_tag);
+		return (0);
 	}
-	if (zilog->zl_suspend++ != 0) {
+
+	dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
+	dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
+
+	zilog->zl_suspend++;
+
+	if (zilog->zl_suspend > 1) {
 		/*
-		 * Someone else already began a suspend.
+		 * Someone else is already suspending it.
 		 * Just wait for them to finish.
 		 */
+
 		while (zilog->zl_suspending)
 			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
 		mutex_exit(&zilog->zl_lock);
+
+		if (cookiep == NULL)
+			zil_resume(os);
+		else
+			*cookiep = os;
 		return (0);
 	}
-	zilog->zl_suspending = B_TRUE;
-	mutex_exit(&zilog->zl_lock);
-
-	zil_commit(zilog, UINT64_MAX, 0);
 
 	/*
-	 * Wait for any in-flight log writes to complete.
+	 * If there is no pointer to an on-disk block, this ZIL must not
+	 * be active (e.g. filesystem not mounted), so there's nothing
+	 * to clean up.
 	 */
-	mutex_enter(&zilog->zl_lock);
-	while (zilog->zl_writer)
-		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+	if (BP_IS_HOLE(&zh->zh_log)) {
+		ASSERT(cookiep != NULL); /* fast path already handled */
+
+		*cookiep = os;
+		mutex_exit(&zilog->zl_lock);
+		return (0);
+	}
+
+	zilog->zl_suspending = B_TRUE;
 	mutex_exit(&zilog->zl_lock);
 
+	zil_commit(zilog, 0);
+
 	zil_destroy(zilog, B_FALSE);
 
 	mutex_enter(&zilog->zl_lock);
@@ -1547,16 +1997,25 @@ zil_suspend(zilog_t *zilog)
 	cv_broadcast(&zilog->zl_cv_suspend);
 	mutex_exit(&zilog->zl_lock);
 
+	if (cookiep == NULL)
+		zil_resume(os);
+	else
+		*cookiep = os;
 	return (0);
 }
 
 void
-zil_resume(zilog_t *zilog)
+zil_resume(void *cookie)
 {
+	objset_t *os = cookie;
+	zilog_t *zilog = dmu_objset_zil(os);
+
 	mutex_enter(&zilog->zl_lock);
 	ASSERT(zilog->zl_suspend != 0);
 	zilog->zl_suspend--;
 	mutex_exit(&zilog->zl_lock);
+	dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
+	dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
 }
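A hedged usage sketch of the new cookie-based pairing (error handling
abbreviated; osname is whatever dataset name applies):

    void *cookie;
    int error;

    error = zil_suspend(osname, &cookie);
    if (error == 0) {
        /* ... take the snapshot, or similar ... */
        zil_resume(cookie);
    }

For a one-shot suspend and resume, as zil_vdev_offline() now does, pass
a NULL cookiep and no zil_resume() call is needed.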
 
 typedef struct zil_replay_arg {
@@ -1569,7 +2028,7 @@ typedef struct zil_replay_arg {
 static int
 zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
 {
-	char name[MAXNAMELEN];
+	char name[ZFS_MAX_DATASET_NAME_LEN];
 
 	zilog->zl_replaying_seq--;	/* didn't actually replay this one */
 
@@ -1629,7 +2088,7 @@ zil_replay_log_record(zilog_t *zilog, lr
 	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
 		error = zil_read_log_data(zilog, (lr_write_t *)lr,
 		    zr->zr_lr + reclen);
-		if (error)
+		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 
@@ -1650,7 +2109,7 @@ zil_replay_log_record(zilog_t *zilog, lr
 	 * is updated if we are in replay mode.
 	 */
 	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
-	if (error) {
+	if (error != 0) {
 		/*
 		 * The DMU's dnode layer doesn't see removes until the txg
 		 * commits, so a subsequent claim can spuriously fail with
@@ -1660,7 +2119,7 @@ zil_replay_log_record(zilog_t *zilog, lr
 		 */
 		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
 		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
-		if (error)
+		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 	return (0);
@@ -1715,7 +2174,7 @@ zil_replay(objset_t *os, void *arg, zil_
 boolean_t
 zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
 {
-	if (zilog == NULL)
+	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return (B_TRUE);
 
 	if (zilog->zl_replay) {
@@ -1732,19 +2191,10 @@ zil_replaying(zilog_t *zilog, dmu_tx_t *
 int
 zil_vdev_offline(const char *osname, void *arg)
 {
-	objset_t *os;
-	zilog_t *zilog;
 	int error;
 
-	error = dmu_objset_hold(osname, FTAG, &os);
-	if (error)
-		return (error);
-
-	zilog = dmu_objset_zil(os);
-	if (zil_suspend(zilog) != 0)
-		error = EEXIST;
-	else
-		zil_resume(zilog);
-	dmu_objset_rele(os, FTAG);
-	return (error);
+	error = zil_suspend(osname, NULL);
+	if (error != 0)
+		return (SET_ERROR(EEXIST));
+	return (0);
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zio.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zio.c,v
retrieving revision 1.4
diff -u -p -r1.4 zio.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zio.c	27 Mar 2014 15:50:48 -0000	1.4
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zio.c	7 May 2017 14:10:06 -0000
@@ -19,10 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
+#include <sys/sysmacros.h>
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
@@ -35,36 +38,55 @@
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/ddt.h>
+#include <sys/trim_map.h>
+#include <sys/blkptr.h>
+#include <sys/zfeature.h>
+#include <sys/metaslab_impl.h>
 
-/*
- * ==========================================================================
- * I/O priority table
- * ==========================================================================
- */
-uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
-	0,	/* ZIO_PRIORITY_NOW		*/
-	0,	/* ZIO_PRIORITY_SYNC_READ	*/
-	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
-	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
-	1,	/* ZIO_PRIORITY_CACHE_FILL	*/
-	1,	/* ZIO_PRIORITY_AGG		*/
-	4,	/* ZIO_PRIORITY_FREE		*/
-	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
-	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
-	10,	/* ZIO_PRIORITY_RESILVER	*/
-	20,	/* ZIO_PRIORITY_SCRUB		*/
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
+#ifdef __NetBSD__
+const int zio_use_uma = 0;
+#else
+#if defined(__amd64__)
+static int zio_use_uma = 1;
+#else
+static int zio_use_uma = 0;
+#endif
+#endif
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
+    "Use uma(9) for ZIO allocations");
+static int zio_exclude_metadata = 0;
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
+    "Exclude metadata buffers from dumps as well");
+
+zio_trim_stats_t zio_trim_stats = {
+	{ "bytes",		KSTAT_DATA_UINT64,
+	  "Number of bytes successfully TRIMmed" },
+	{ "success",		KSTAT_DATA_UINT64,
+	  "Number of successful TRIM requests" },
+	{ "unsupported",	KSTAT_DATA_UINT64,
+	  "Number of TRIM requests that failed because TRIM is not supported" },
+	{ "failed",		KSTAT_DATA_UINT64,
+	  "Number of TRIM requests that failed for reasons other than not supported" },
 };
 
+static kstat_t *zio_trim_ksp;
+
 /*
  * ==========================================================================
  * I/O type descriptions
  * ==========================================================================
  */
-char *zio_type_name[ZIO_TYPES] = {
+const char *zio_type_name[ZIO_TYPES] = {
 	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
 	"zio_ioctl"
 };
 
+boolean_t zio_dva_throttle_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN,
+    &zio_dva_throttle_enabled, 0, "");
+
 /*
  * ==========================================================================
  * I/O kmem caches
@@ -75,10 +97,38 @@ kmem_cache_t *zio_link_cache;
 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 
-#if defined(_KERNEL) && !defined(__NetBSD__)
+#ifdef _KERNEL
 extern vmem_t *zio_alloc_arena;
 #endif
 
+#define	ZIO_PIPELINE_CONTINUE		0x100
+#define	ZIO_PIPELINE_STOP		0x101
+
+#define	BP_SPANB(indblkshift, level) \
+	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
+#define	COMPARE_META_LEVEL	0x80000000ul
+/*
+ * The following actions directly affect the spa's sync-to-convergence logic.
+ * The values below define the sync pass when we start performing the action.
+ * Care should be taken when changing these values as they directly impact
+ * spa_sync() performance. Tuning these values may introduce subtle performance
+ * pathologies and should only be done in the context of performance analysis.
+ * These tunables will eventually be removed and replaced with #defines once
+ * enough analysis has been done to determine optimal values.
+ *
+ * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
+ * regular blocks are not deferred.
+ */
+int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
+SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
+    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
+int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
+SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
+    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
+int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
+SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
+    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
+
 /*
  * An allocating zio is one that either currently has the DVA allocate
  * stage set or will have it later in its lifetime.
@@ -87,62 +137,76 @@ extern vmem_t *zio_alloc_arena;
 
 boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;
 
+#ifdef illumos
 #ifdef ZFS_DEBUG
 int zio_buf_debug_limit = 16384;
 #else
 int zio_buf_debug_limit = 0;
 #endif
+#endif
+
+static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
 
 void
 zio_init(void)
 {
 	size_t c;
-	vmem_t *data_alloc_arena = NULL;
-
-#if defined(_KERNEL) && !defined(__NetBSD__)
-	data_alloc_arena = zio_alloc_arena;
-#endif
 	zio_cache = kmem_cache_create("zio_cache",
 	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 	zio_link_cache = kmem_cache_create("zio_link_cache",
 	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 #ifndef __NetBSD__
+	if (!zio_use_uma)
+		goto out;
+
 	/*
 	 * For small buffers, we want a cache for each multiple of
-	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
-	 * for each quarter-power of 2.  For large buffers, we want
-	 * a cache for each multiple of PAGESIZE.
+	 * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
+	 * for each quarter-power of 2.
 	 */
 	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 		size_t p2 = size;
 		size_t align = 0;
+		int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0;
 
-		while (p2 & (p2 - 1))
+		while (!ISP2(p2))
 			p2 &= p2 - 1;
 
+#ifdef illumos
+#ifndef _KERNEL
+		/*
+		 * If we are using watchpoints, put each buffer on its own page,
+		 * to eliminate the performance overhead of trapping to the
+		 * kernel when modifying a non-watched buffer that shares the
+		 * page with a watched buffer.
+		 */
+		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
+			continue;
+#endif
+#endif /* illumos */
 		if (size <= 4 * SPA_MINBLOCKSIZE) {
 			align = SPA_MINBLOCKSIZE;
-		} else if (P2PHASE(size, PAGESIZE) == 0) {
-			align = PAGESIZE;
-		} else if (P2PHASE(size, p2 >> 2) == 0) {
-			align = p2 >> 2;
+		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
+			align = MIN(p2 >> 2, PAGESIZE);
 		}
 
 		if (align != 0) {
 			char name[36];
-			(void) snprintf(name, sizeof(name), "zio_buf_%lu",
-			    (ulong_t)size);
+			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 			zio_buf_cache[c] = kmem_cache_create(name, size,
-			    align, NULL, NULL, NULL, NULL, NULL,
-			    size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
+			    align, NULL, NULL, NULL, NULL, NULL, cflags);
 
-			(void) snprintf(name, sizeof(name), "zio_data_buf_%lu",
-			    (ulong_t)size);
+			/*
+			 * Since zio_data bufs do not appear in crash dumps, we
+			 * pass KMC_NOTOUCH so that no allocator metadata is
+			 * stored with the buffers.
+			 */
+			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 			zio_data_buf_cache[c] = kmem_cache_create(name, size,
-			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
-			    size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
+			    align, NULL, NULL, NULL, NULL, NULL,
+			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
 		}
 	}
 
@@ -155,8 +219,20 @@ zio_init(void)
 		if (zio_data_buf_cache[c - 1] == NULL)
 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}
+out:
 #endif /* __NetBSD__ */
+
 	zio_inject_init();
+
+	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
+	    KSTAT_TYPE_NAMED,
+	    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (zio_trim_ksp != NULL) {
+		zio_trim_ksp->ks_data = &zio_trim_stats;
+		kstat_install(zio_trim_ksp);
+	}
 }
 
 void
@@ -186,6 +262,11 @@ zio_fini(void)
 	kmem_cache_destroy(zio_cache);
 
 	zio_inject_fini();
+
+	if (zio_trim_ksp != NULL) {
+		kstat_delete(zio_trim_ksp);
+		zio_trim_ksp = NULL;
+	}
 }
 
 /*
@@ -200,16 +281,33 @@ zio_fini(void)
  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
  * excess / transient data in-core during a crashdump.
  */
+static void *
+zio_buf_alloc_impl(size_t size, boolean_t canwait)
+{
+	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
+
+	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+	if (zio_use_uma) {
+		return (kmem_cache_alloc(zio_buf_cache[c],
+		    canwait ? KM_PUSHPAGE : KM_NOSLEEP));
+	} else {
+		return (kmem_alloc(size,
+		    (canwait ? KM_SLEEP : KM_NOSLEEP) | flags));
+	}
+}
+
 void *
 zio_buf_alloc(size_t size)
 {
-	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
-	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-#ifdef __NetBSD__
-	return (kmem_alloc(size, KM_SLEEP));
-#else
-	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
-#endif	
+	return (zio_buf_alloc_impl(size, B_TRUE));
+}
+
+void *
+zio_buf_alloc_nowait(size_t size)
+{
+	return (zio_buf_alloc_impl(size, B_FALSE));
 }
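A hedged sketch of choosing between the two entry points; the helper
name and the cansleep policy are invented for illustration:

    static void *
    alloc_buf_maybe_sleep(size_t size, boolean_t cansleep)
    {
        if (cansleep)
            return (zio_buf_alloc(size));       /* sleeps until it succeeds */
        return (zio_buf_alloc_nowait(size));    /* may return NULL */
    }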
 
 /*
@@ -223,12 +321,12 @@ zio_data_buf_alloc(size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
-	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
-#ifdef __NetBSD__
-	return (kmem_alloc(size, KM_SLEEP));
-#else
-	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
-#endif	
+	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+	if (zio_use_uma)
+		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
+	else
+		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
 }
 
 void
@@ -236,13 +334,12 @@ zio_buf_free(void *buf, size_t size)
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
-	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
-#ifdef __NetBSD__
-	kmem_free(buf, size);
-#else	
-	kmem_cache_free(zio_buf_cache[c], buf);
-#endif	
+	if (zio_use_uma)
+		kmem_cache_free(zio_buf_cache[c], buf);
+	else
+		kmem_free(buf, size);
 }
 
 void
@@ -250,13 +347,12 @@ zio_data_buf_free(void *buf, size_t size
 {
 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
-	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+	VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
-#ifdef __NetBSD__
-	kmem_free(buf, size);
-#else	
-	kmem_cache_free(zio_data_buf_cache[c], buf);
-#endif	
+	if (zio_use_uma)
+		kmem_cache_free(zio_data_buf_cache[c], buf);
+	else
+		kmem_free(buf, size);
 }
 
 /*
@@ -264,9 +360,9 @@ zio_data_buf_free(void *buf, size_t size
  * Push and pop I/O transform buffers
  * ==========================================================================
  */
-static void
+void
 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
-	zio_transform_func_t *transform)
+    zio_transform_func_t *transform)
 {
 	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 
@@ -282,7 +378,7 @@ zio_push_transform(zio_t *zio, void *dat
 	zio->io_size = size;
 }
 
-static void
+void
 zio_pop_transforms(zio_t *zio)
 {
 	zio_transform_t *zt;
@@ -323,7 +419,7 @@ zio_decompress(zio_t *zio, void *data, u
 	if (zio->io_error == 0 &&
 	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 	    zio->io_data, data, zio->io_size, size) != 0)
-		zio->io_error = EIO;
+		zio->io_error = SET_ERROR(EIO);
 }
 
 /*
@@ -331,52 +427,39 @@ zio_decompress(zio_t *zio, void *data, u
  * I/O parent/child relationships and pipeline interlocks
  * ==========================================================================
  */
-/*
- * NOTE - Callers to zio_walk_parents() and zio_walk_children must
- *        continue calling these functions until they return NULL.
- *        Otherwise, the next caller will pick up the list walk in
- *        some indeterminate state.  (Otherwise every caller would
- *        have to pass in a cookie to keep the state represented by
- *        io_walk_link, which gets annoying.)
- */
 zio_t *
-zio_walk_parents(zio_t *cio)
+zio_walk_parents(zio_t *cio, zio_link_t **zl)
 {
-	zio_link_t *zl = cio->io_walk_link;
 	list_t *pl = &cio->io_parent_list;
 
-	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
-	cio->io_walk_link = zl;
-
-	if (zl == NULL)
+	*zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
+	if (*zl == NULL)
 		return (NULL);
 
-	ASSERT(zl->zl_child == cio);
-	return (zl->zl_parent);
+	ASSERT((*zl)->zl_child == cio);
+	return ((*zl)->zl_parent);
 }
 
 zio_t *
-zio_walk_children(zio_t *pio)
+zio_walk_children(zio_t *pio, zio_link_t **zl)
 {
-	zio_link_t *zl = pio->io_walk_link;
 	list_t *cl = &pio->io_child_list;
 
-	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
-	pio->io_walk_link = zl;
-
-	if (zl == NULL)
+	*zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
+	if (*zl == NULL)
 		return (NULL);
 
-	ASSERT(zl->zl_parent == pio);
-	return (zl->zl_child);
+	ASSERT((*zl)->zl_parent == pio);
+	return ((*zl)->zl_child);
 }
 
 zio_t *
 zio_unique_parent(zio_t *cio)
 {
-	zio_t *pio = zio_walk_parents(cio);
+	zio_link_t *zl = NULL;
+	zio_t *pio = zio_walk_parents(cio, &zl);
 
-	VERIFY(zio_walk_parents(cio) == NULL);
+	VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
 	return (pio);
 }
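With io_walk_link gone, each walker now owns its cursor on the stack, so
an abandoned or concurrent walk can no longer leave the zio in an
indeterminate state (the removed NOTE above).  A sketch with an invented
helper:

    static int
    count_parents(zio_t *cio)
    {
        zio_link_t *zl = NULL;  /* caller-owned cursor, was io_walk_link */
        int n = 0;

        while (zio_walk_parents(cio, &zl) != NULL)
            n++;
        return (n);
    }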
 
@@ -445,6 +528,7 @@ zio_wait_for_children(zio_t *zio, enum z
 	ASSERT(zio->io_stall == NULL);
 	if (*countp != 0) {
 		zio->io_stage >>= 1;
+		ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
 		zio->io_stall = countp;
 		waiting = B_TRUE;
 	}
@@ -464,10 +548,22 @@ zio_notify_parent(zio_t *pio, zio_t *zio
 		*errorp = zio_worst_error(*errorp, zio->io_error);
 	pio->io_reexecute |= zio->io_reexecute;
 	ASSERT3U(*countp, >, 0);
-	if (--*countp == 0 && pio->io_stall == countp) {
+
+	(*countp)--;
+
+	if (*countp == 0 && pio->io_stall == countp) {
+		zio_taskq_type_t type =
+		    pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
+		    ZIO_TASKQ_INTERRUPT;
 		pio->io_stall = NULL;
 		mutex_exit(&pio->io_lock);
-		zio_execute(pio);
+		/*
+		 * Dispatch the parent zio in its own taskq so that
+		 * the child can continue to make progress. This also
+		 * prevents overflowing the stack when we have deeply nested
+		 * parent-child relationships.
+		 */
+		zio_taskq_dispatch(pio, type, B_FALSE);
 	} else {
 		mutex_exit(&pio->io_lock);
 	}
@@ -480,6 +576,45 @@ zio_inherit_child_errors(zio_t *zio, enu
 		zio->io_error = zio->io_child_error[c];
 }
 
+int
+zio_timestamp_compare(const void *x1, const void *x2)
+{
+	const zio_t *z1 = x1;
+	const zio_t *z2 = x2;
+
+	if (z1->io_queued_timestamp < z2->io_queued_timestamp)
+		return (-1);
+	if (z1->io_queued_timestamp > z2->io_queued_timestamp)
+		return (1);
+
+	if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
+		return (-1);
+	if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
+		return (1);
+
+	if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
+		return (-1);
+	if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
+		return (1);
+
+	if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
+		return (-1);
+	if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
+		return (1);
+
+	if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
+		return (-1);
+	if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
+		return (1);
+
+	if (z1 < z2)
+		return (-1);
+	if (z1 > z2)
+		return (1);
+
+	return (0);
+}
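This comparator yields a total order (queue time, then bookmark, then
the zio's address as a final tie-break), which an AVL tree requires.  A
hedged sketch of attaching it to a tree; the io_alloc_node linkage field
is an assumption here, not shown in this hunk:

    avl_tree_t alloc_queue;

    avl_create(&alloc_queue, zio_timestamp_compare,
        sizeof (zio_t), offsetof(zio_t, io_alloc_node));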
+
 /*
  * ==========================================================================
  * Create the various types of I/O (read, write, free, etc)
@@ -488,13 +623,13 @@ zio_inherit_child_errors(zio_t *zio, enu
 static zio_t *
 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
-    zio_type_t type, int priority, enum zio_flag flags,
-    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
+    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
+    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
     enum zio_stage stage, enum zio_stage pipeline)
 {
 	zio_t *zio;
 
-	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
 	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 
@@ -512,6 +647,7 @@ zio_create(zio_t *pio, spa_t *spa, uint6
 	    offsetof(zio_link_t, zl_parent_node));
 	list_create(&zio->io_child_list, sizeof (zio_link_t),
 	    offsetof(zio_link_t, zl_child_node));
+	metaslab_trace_init(&zio->io_alloc_list);
 
 	if (vd != NULL)
 		zio->io_child_type = ZIO_CHILD_VDEV;
@@ -548,6 +684,7 @@ zio_create(zio_t *pio, spa_t *spa, uint6
 	zio->io_orig_flags = zio->io_flags = flags;
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
+	zio->io_pipeline_trace = ZIO_STAGE_OPEN;
 
 	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
 	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
@@ -569,6 +706,7 @@ zio_create(zio_t *pio, spa_t *spa, uint6
 static void
 zio_destroy(zio_t *zio)
 {
+	metaslab_trace_fini(&zio->io_alloc_list);
 	list_destroy(&zio->io_parent_list);
 	list_destroy(&zio->io_child_list);
 	mutex_destroy(&zio->io_lock);
@@ -595,13 +733,97 @@ zio_root(spa_t *spa, zio_done_func_t *do
 	return (zio_null(NULL, spa, NULL, done, private, flags));
 }
 
+void
+zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
+{
+	if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
+		zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
+		    bp, (longlong_t)BP_GET_TYPE(bp));
+	}
+	if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
+	    BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
+		zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
+		    bp, (longlong_t)BP_GET_CHECKSUM(bp));
+	}
+	if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
+	    BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
+		zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
+		    bp, (longlong_t)BP_GET_COMPRESS(bp));
+	}
+	if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
+		zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
+		    bp, (longlong_t)BP_GET_LSIZE(bp));
+	}
+	if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
+		zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
+		    bp, (longlong_t)BP_GET_PSIZE(bp));
+	}
+
+	if (BP_IS_EMBEDDED(bp)) {
+		if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
+			zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
+			    bp, (longlong_t)BPE_GET_ETYPE(bp));
+		}
+	}
+
+	/*
+	 * Pool-specific checks.
+	 *
+	 * Note: it would be nice to verify that the blk_birth and
+	 * BP_PHYSICAL_BIRTH() are not too large.  However, spa_freeze()
+	 * allows the birth time of log blocks (and dmu_sync()-ed blocks
+	 * that are in the log) to be arbitrarily large.
+	 */
+	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+		uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
+		if (vdevid >= spa->spa_root_vdev->vdev_children) {
+			zfs_panic_recover("blkptr at %p DVA %u has invalid "
+			    "VDEV %llu",
+			    bp, i, (longlong_t)vdevid);
+			continue;
+		}
+		vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
+		if (vd == NULL) {
+			zfs_panic_recover("blkptr at %p DVA %u has invalid "
+			    "VDEV %llu",
+			    bp, i, (longlong_t)vdevid);
+			continue;
+		}
+		if (vd->vdev_ops == &vdev_hole_ops) {
+			zfs_panic_recover("blkptr at %p DVA %u has hole "
+			    "VDEV %llu",
+			    bp, i, (longlong_t)vdevid);
+			continue;
+		}
+		if (vd->vdev_ops == &vdev_missing_ops) {
+			/*
+			 * "missing" vdevs are valid during import, but we
+			 * don't have their detailed info (e.g. asize), so
+			 * we can't perform any more checks on them.
+			 */
+			continue;
+		}
+		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
+		uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
+		if (BP_IS_GANG(bp))
+			asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+		if (offset + asize > vd->vdev_asize) {
+			zfs_panic_recover("blkptr at %p DVA %u has invalid "
+			    "OFFSET %llu",
+			    bp, i, (longlong_t)offset);
+		}
+	}
+}
+
 zio_t *
 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, const zbookmark_t *zb)
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
+	zfs_blkptr_verify(spa, bp);
+
 	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 	    data, size, done, private,
 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
@@ -614,8 +836,10 @@ zio_read(zio_t *pio, spa_t *spa, const b
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     void *data, uint64_t size, const zio_prop_t *zp,
-    zio_done_func_t *ready, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, const zbookmark_t *zb)
+    zio_done_func_t *ready, zio_done_func_t *children_ready,
+    zio_done_func_t *physdone, zio_done_func_t *done,
+    void *private, zio_priority_t priority, enum zio_flag flags,
+    const zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
@@ -623,12 +847,10 @@ zio_write(zio_t *pio, spa_t *spa, uint64
 	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
 	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
-	    zp->zp_type < DMU_OT_NUMTYPES &&
+	    DMU_OT_IS_VALID(zp->zp_type) &&
 	    zp->zp_level < 32 &&
 	    zp->zp_copies > 0 &&
-	    zp->zp_copies <= spa_max_replication(spa) &&
-	    zp->zp_dedup <= 1 &&
-	    zp->zp_dedup_verify <= 1);
+	    zp->zp_copies <= spa_max_replication(spa));
 
 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
@@ -636,33 +858,52 @@ zio_write(zio_t *pio, spa_t *spa, uint64
 	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 
 	zio->io_ready = ready;
+	zio->io_children_ready = children_ready;
+	zio->io_physdone = physdone;
 	zio->io_prop = *zp;
 
+	/*
+	 * Data can be NULL if we are going to call zio_write_override() to
+	 * provide the already-allocated BP.  But we may need the data to
+	 * verify a dedup hit (if requested).  In this case, don't try to
+	 * dedup (just take the already-allocated BP verbatim).
+	 */
+	if (data == NULL && zio->io_prop.zp_dedup_verify) {
+		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
+	}
+
 	return (zio);
 }
 
 zio_t *
 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
-    uint64_t size, zio_done_func_t *done, void *private, int priority,
-    enum zio_flag flags, zbookmark_t *zb)
+    uint64_t size, zio_done_func_t *done, void *private,
+    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
 {
 	zio_t *zio;
 
 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
-	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
+	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 
 	return (zio);
 }
 
 void
-zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 {
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 
+	/*
+	 * We must reset the io_prop to match the values that existed
+	 * when the bp was first written by dmu_sync() keeping in mind
+	 * that nopwrite and dedup are mutually exclusive.
+	 */
+	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
+	zio->io_prop.zp_nopwrite = nopwrite;
 	zio->io_prop.zp_copies = copies;
 	zio->io_bp_override = bp;
 }
@@ -670,22 +911,65 @@ zio_write_override(zio_t *zio, blkptr_t 
 void
 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 {
-	bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+
+	/*
+	 * The check for EMBEDDED is a performance optimization.  We
+	 * process the free here (by ignoring it) rather than
+	 * putting it on the list and then processing it in zio_free_sync().
+	 */
+	if (BP_IS_EMBEDDED(bp))
+		return;
+	metaslab_check_free(spa, bp);
+
+	/*
+	 * Frees that are for the currently-syncing txg, are not going to be
+	 * deferred, and which will not need to do a read (i.e. not GANG or
+	 * DEDUP), can be processed immediately.  Otherwise, put them on the
+	 * in-memory list for later processing.
+	 */
+	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
+	    txg != spa->spa_syncing_txg ||
+	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
+		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+	} else {
+		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
+		    BP_GET_PSIZE(bp), 0)));
+	}
 }
 
 zio_t *
 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
-    enum zio_flag flags)
+    uint64_t size, enum zio_flag flags)
 {
 	zio_t *zio;
+	enum zio_stage stage = ZIO_FREE_PIPELINE;
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(spa_syncing_txg(spa) == txg);
-	ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
+	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 
-	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
-	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
-	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
+	if (BP_IS_EMBEDDED(bp))
+		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
+	metaslab_check_free(spa, bp);
+	arc_freed(spa, bp);
+
+	if (zfs_trim_enabled)
+		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
+		    ZIO_STAGE_VDEV_IO_ASSESS;
+	/*
+	 * GANG and DEDUP blocks can induce a read (for the gang block header,
+	 * or the DDT), so issue them asynchronously so that this thread is
+	 * not tied up.
+	 */
+	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
+		stage |= ZIO_STAGE_ISSUE_ASYNC;
+
+	flags |= ZIO_FLAG_DONT_QUEUE;
+
+	zio = zio_create(pio, spa, txg, bp, NULL, size,
+	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
+	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
 
 	return (zio);
 }
@@ -696,6 +980,11 @@ zio_claim(zio_t *pio, spa_t *spa, uint64
 {
 	zio_t *zio;
 
+	dprintf_bp(bp, "claiming in txg %llu", txg);
+
+	if (BP_IS_EMBEDDED(bp))
+		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
 	/*
 	 * A claim is an allocation of a specific block.  Claims are needed
 	 * to support immediate writes in the intent log.  The issue is that
@@ -715,20 +1004,22 @@ zio_claim(zio_t *pio, spa_t *spa, uint64
 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+	ASSERT0(zio->io_queued_timestamp);
 
 	return (zio);
 }
 
 zio_t *
-zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
+    uint64_t size, zio_done_func_t *done, void *private,
+    zio_priority_t priority, enum zio_flag flags)
 {
 	zio_t *zio;
 	int c;
 
 	if (vd->vdev_children == 0) {
-		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
-		    ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
+		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
+		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 
 		zio->io_cmd = cmd;
@@ -737,7 +1028,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t
 
 		for (c = 0; c < vd->vdev_children; c++)
 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
-			    done, private, priority, flags));
+			    offset, size, done, private, priority, flags));
 	}
 
 	return (zio);
@@ -746,7 +1037,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t
 zio_t *
 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     void *data, int checksum, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, boolean_t labels)
+    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 {
 	zio_t *zio;
 
@@ -756,8 +1047,8 @@ zio_read_phys(zio_t *pio, vdev_t *vd, ui
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
-	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
-	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
+	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 
 	zio->io_prop.zp_checksum = checksum;
 
@@ -767,7 +1058,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, ui
 zio_t *
 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
     void *data, int checksum, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, boolean_t labels)
+    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
 {
 	zio_t *zio;
 
@@ -777,12 +1068,12 @@ zio_write_phys(zio_t *pio, vdev_t *vd, u
 	ASSERT3U(offset + size, <=, vd->vdev_psize);
 
 	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
-	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
-	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
+	    NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 
 	zio->io_prop.zp_checksum = checksum;
 
-	if (zio_checksum_table[checksum].ci_eck) {
+	if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		/*
 		 * zec checksums are necessarily destructive -- they modify
 		 * the end of the write buffer to hold the verifier/checksum.
@@ -802,8 +1093,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, u
  */
 zio_t *
 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
-	void *data, uint64_t size, int type, int priority, enum zio_flag flags,
-	zio_done_func_t *done, void *private)
+    void *data, uint64_t size, int type, zio_priority_t priority,
+    enum zio_flag flags, zio_done_func_t *done, void *private)
 {
 	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 	zio_t *zio;
@@ -822,6 +1113,10 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *
 		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 	}
 
+	/* Not all I/O types require the vdev I/O done stage, e.g. free. */
+	if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
+		pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;
+
 	if (vd->vdev_children == 0)
 		offset += VDEV_LABEL_START_SIZE;
 
@@ -834,17 +1129,42 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *
 	if (flags & ZIO_FLAG_IO_REPAIR)
 		flags &= ~ZIO_FLAG_SPECULATIVE;
 
+	/*
+	 * If we're creating a child I/O that is not associated with a
+	 * top-level vdev, then the child zio is not an allocating I/O.
+	 * If this is a retried I/O then we ignore it since we will
+	 * have already processed the original allocating I/O.
+	 */
+	if (flags & ZIO_FLAG_IO_ALLOCATING &&
+	    (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
+		metaslab_class_t *mc = spa_normal_class(pio->io_spa);
+
+		ASSERT(mc->mc_alloc_throttle_enabled);
+		ASSERT(type == ZIO_TYPE_WRITE);
+		ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
+		ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
+		ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
+		    pio->io_child_type == ZIO_CHILD_GANG);
+
+		flags &= ~ZIO_FLAG_IO_ALLOCATING;
+	}
+
 	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
 	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
+	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+
+	zio->io_physdone = pio->io_physdone;
+	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
+		zio->io_logical->io_phys_children++;
 
 	return (zio);
 }
 
 zio_t *
 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
-	int type, int priority, enum zio_flag flags,
-	zio_done_func_t *done, void *private)
+    int type, zio_priority_t priority, enum zio_flag flags,
+    zio_done_func_t *done, void *private)
 {
 	zio_t *zio;
 
@@ -852,7 +1172,7 @@ zio_vdev_delegated_io(vdev_t *vd, uint64
 
 	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
 	    data, size, done, private, type, priority,
-	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
+	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
 	    vd, offset, NULL,
 	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 
@@ -862,11 +1182,23 @@ zio_vdev_delegated_io(vdev_t *vd, uint64
 void
 zio_flush(zio_t *zio, vdev_t *vd)
 {
-	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
+	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
 	    NULL, NULL, ZIO_PRIORITY_NOW,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 }
 
+zio_t *
+zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
+{
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+	return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
+	    ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
+	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
+}
+
 void
 zio_shrink(zio_t *zio, uint64_t size)
 {
@@ -898,13 +1230,21 @@ zio_read_bp_init(zio_t *zio)
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    !(zio->io_flags & ZIO_FLAG_RAW)) {
-		uint64_t psize = BP_GET_PSIZE(bp);
+		uint64_t psize =
+		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
 		void *cbuf = zio_buf_alloc(psize);
 
 		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
 	}
 
-	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
+	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+		decode_embedded_bp_compressed(bp, zio->io_data);
+	} else {
+		ASSERT(!BP_IS_EMBEDDED(bp));
+	}
+
+	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 
 	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
@@ -919,50 +1259,98 @@ zio_read_bp_init(zio_t *zio)
 static int
 zio_write_bp_init(zio_t *zio)
 {
-	spa_t *spa = zio->io_spa;
-	zio_prop_t *zp = &zio->io_prop;
-	enum zio_compress compress = zp->zp_compress;
-	blkptr_t *bp = zio->io_bp;
-	uint64_t lsize = zio->io_size;
-	uint64_t psize = lsize;
-	int pass = 1;
-
-	/*
-	 * If our children haven't all reached the ready stage,
-	 * wait for them and then repeat this pipeline stage.
-	 */
-	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
-	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
-		return (ZIO_PIPELINE_STOP);
-
 	if (!IO_IS_ALLOCATING(zio))
 		return (ZIO_PIPELINE_CONTINUE);
 
 	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 
 	if (zio->io_bp_override) {
+		blkptr_t *bp = zio->io_bp;
+		zio_prop_t *zp = &zio->io_prop;
+
 		ASSERT(bp->blk_birth != zio->io_txg);
 		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
 
 		*bp = *zio->io_bp_override;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+		if (BP_IS_EMBEDDED(bp))
+			return (ZIO_PIPELINE_CONTINUE);
+
+		/*
+		 * If we've been overridden and nopwrite is set then
+		 * set the flag accordingly to indicate that a nopwrite
+		 * has already occurred.
+		 */
+		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
+			ASSERT(!zp->zp_dedup);
+			ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
+			zio->io_flags |= ZIO_FLAG_NOPWRITE;
+			return (ZIO_PIPELINE_CONTINUE);
+		}
+
+		ASSERT(!zp->zp_nopwrite);
+
 		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
 			return (ZIO_PIPELINE_CONTINUE);
 
-		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
-		    zp->zp_dedup_verify);
+		ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
+		    ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
 
 		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
 			BP_SET_DEDUP(bp, 1);
 			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
 			return (ZIO_PIPELINE_CONTINUE);
 		}
+
+		/*
+		 * We were unable to handle this as an override bp; treat
+		 * it as a regular write I/O.
+		 */
 		zio->io_bp_override = NULL;
-		BP_ZERO(bp);
+		*bp = zio->io_bp_orig;
+		zio->io_pipeline = zio->io_orig_pipeline;
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_write_compress(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	zio_prop_t *zp = &zio->io_prop;
+	enum zio_compress compress = zp->zp_compress;
+	blkptr_t *bp = zio->io_bp;
+	uint64_t lsize = zio->io_size;
+	uint64_t psize = lsize;
+	int pass = 1;
+
+	/*
+	 * If our children haven't all reached the ready stage,
+	 * wait for them and then repeat this pipeline stage.
+	 */
+	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
+	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
+		return (ZIO_PIPELINE_STOP);
+
+	if (!IO_IS_ALLOCATING(zio))
+		return (ZIO_PIPELINE_CONTINUE);
+
+	if (zio->io_children_ready != NULL) {
+		/*
+		 * Now that all our children are ready, run the callback
+		 * associated with this zio in case it wants to modify the
+		 * data to be written.
+		 */
+		ASSERT3U(zp->zp_level, >, 0);
+		zio->io_children_ready(zio);
 	}
 
-	if (bp->blk_birth == zio->io_txg) {
+	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+	ASSERT(zio->io_bp_override == NULL);
+
+	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
 		/*
 		 * We're rewriting an existing block, which means we're
 		 * working on behalf of spa_sync().  For spa_sync() to
@@ -978,11 +1366,11 @@ zio_write_bp_init(zio_t *zio)
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 		ASSERT(!BP_GET_DEDUP(bp));
 
-		if (pass > SYNC_PASS_DONT_COMPRESS)
+		if (pass >= zfs_sync_pass_dont_compress)
 			compress = ZIO_COMPRESS_OFF;
 
 		/* Make sure someone doesn't change their mind on overwrites */
-		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
+		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
 		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
 	}
 
@@ -992,10 +1380,51 @@ zio_write_bp_init(zio_t *zio)
 		if (psize == 0 || psize == lsize) {
 			compress = ZIO_COMPRESS_OFF;
 			zio_buf_free(cbuf, lsize);
+		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
+		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
+		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
+			encode_embedded_bp_compressed(bp,
+			    cbuf, compress, lsize, psize);
+			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
+			BP_SET_TYPE(bp, zio->io_prop.zp_type);
+			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
+			zio_buf_free(cbuf, lsize);
+			bp->blk_birth = zio->io_txg;
+			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+			ASSERT(spa_feature_is_active(spa,
+			    SPA_FEATURE_EMBEDDED_DATA));
+			return (ZIO_PIPELINE_CONTINUE);
 		} else {
-			ASSERT(psize < lsize);
-			zio_push_transform(zio, cbuf, psize, lsize, NULL);
+			/*
+			 * Round the compressed size up to the ashift
+			 * of the smallest-ashift device, and zero the tail.
+			 * This ensures that the compressed size of the BP
+			 * (and thus the compressratio property) is correct,
+			 * in that we charge for the padding used to fill out
+			 * the last sector.
+			 */
+			ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+			size_t rounded = (size_t)P2ROUNDUP(psize,
+			    1ULL << spa->spa_min_ashift);
+			if (rounded >= lsize) {
+				compress = ZIO_COMPRESS_OFF;
+				zio_buf_free(cbuf, lsize);
+				psize = lsize;
+			} else {
+				bzero((char *)cbuf + psize, rounded - psize);
+				psize = rounded;
+				zio_push_transform(zio, cbuf,
+				    psize, lsize, NULL);
+			}
 		}
+
+		/*
+		 * We were unable to handle this as an override bp; treat
+		 * it as a regular write I/O.
+		 */
+		zio->io_bp_override = NULL;
+		*bp = zio->io_bp_orig;
+		zio->io_pipeline = zio->io_orig_pipeline;
 	}
 
 	/*
@@ -1006,8 +1435,9 @@ zio_write_bp_init(zio_t *zio)
 	 * spa_sync() to allocate new blocks, but force rewrites after that.
 	 * There should only be a handful of blocks after pass 1 in any case.
 	 */
-	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
-	    pass > SYNC_PASS_REWRITE) {
+	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
+	    BP_GET_PSIZE(bp) == psize &&
+	    pass >= zfs_sync_pass_rewrite) {
 		ASSERT(psize != 0);
 		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
 		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
@@ -1018,15 +1448,22 @@ zio_write_bp_init(zio_t *zio)
 	}
 
 	if (psize == 0) {
+		if (zio->io_bp_orig.blk_birth != 0 &&
+		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+			BP_SET_LSIZE(bp, lsize);
+			BP_SET_TYPE(bp, zp->zp_type);
+			BP_SET_LEVEL(bp, zp->zp_level);
+			BP_SET_BIRTH(bp, zio->io_txg, 0);
+		}
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 	} else {
 		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
 		BP_SET_LSIZE(bp, lsize);
+		BP_SET_TYPE(bp, zp->zp_type);
+		BP_SET_LEVEL(bp, zp->zp_level);
 		BP_SET_PSIZE(bp, psize);
 		BP_SET_COMPRESS(bp, compress);
 		BP_SET_CHECKSUM(bp, zp->zp_checksum);
-		BP_SET_TYPE(bp, zp->zp_type);
-		BP_SET_LEVEL(bp, zp->zp_level);
 		BP_SET_DEDUP(bp, zp->zp_dedup);
 		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
 		if (zp->zp_dedup) {
@@ -1034,8 +1471,12 @@ zio_write_bp_init(zio_t *zio)
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
 		}
+		if (zp->zp_nopwrite) {
+			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
+		}
 	}
-
 	return (ZIO_PIPELINE_CONTINUE);
 }
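
As a worked example of the compressed-size round-up in zio_write_compress
above, take invented numbers: lsize 8192, a compressed psize of 3000, and a
minimum ashift of 12. The standalone sketch below uses the P2ROUNDUP
definition from the system macros:

#include <stdio.h>
#include <stdint.h>

/* P2ROUNDUP as in sys/sysmacros.h: round x up to a power-of-2 boundary. */
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	uint64_t lsize = 8192;		/* logical size of the block */
	uint64_t psize = 3000;		/* hypothetical compressed size */
	uint64_t min_ashift = 12;	/* smallest-ashift vdev: 4K sectors */
	uint64_t rounded = P2ROUNDUP(psize, 1ULL << min_ashift);

	if (rounded >= lsize) {
		/* Sector padding would eat the savings: store raw. */
		printf("compress off, psize = %llu\n",
		    (unsigned long long)lsize);
	} else {
		/* Tail bytes [psize, rounded) are zeroed before the write. */
		printf("compressed, psize = %llu\n",
		    (unsigned long long)rounded);
	}
	return (0);
}

Here rounded is 4096, which is still less than lsize, so the block stays
compressed and is charged 4096 bytes; had psize been 7000, rounding would
have reached 8192 and the write would fall back to ZIO_COMPRESS_OFF.
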
 
@@ -1047,8 +1488,6 @@ zio_free_bp_init(zio_t *zio)
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
 		if (BP_GET_DEDUP(bp))
 			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
-		else
-			arc_free(zio->io_spa, bp);
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
@@ -1061,11 +1500,13 @@ zio_free_bp_init(zio_t *zio)
  */
 
 static void
-zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
+zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
 {
 	spa_t *spa = zio->io_spa;
 	zio_type_t t = zio->io_type;
-	int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);
+	int flags = (cutinline ? TQ_FRONT : 0);
+
+	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);
 
 	/*
 	 * If we're a config writer or a probe, the normal issue and
@@ -1082,26 +1523,43 @@ zio_taskq_dispatch(zio_t *zio, enum zio_
 		t = ZIO_TYPE_NULL;
 
 	/*
-	 * If this is a high priority I/O, then use the high priority taskq.
+	 * If this is a high priority I/O, then use the high priority taskq if
+	 * available.
 	 */
 	if (zio->io_priority == ZIO_PRIORITY_NOW &&
-	    spa->spa_zio_taskq[t][q + 1] != NULL)
+	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
 		q++;
 
 	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
-	(void) taskq_dispatch(spa->spa_zio_taskq[t][q],
-	    (task_func_t *)zio_execute, zio, flags);
+
+	/*
+	 * NB: We are assuming that the zio can only be dispatched
+	 * to a single taskq at a time.  It would be a grievous error
+	 * to dispatch the zio to another taskq at the same time.
+	 */
+#if defined(illumos) || defined(__NetBSD__) || !defined(_KERNEL)
+	ASSERT(zio->io_tqent.tqent_next == NULL);
+#else
+	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
+#endif
+	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
+	    flags, &zio->io_tqent);
 }
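
A toy of the queue-selection logic above (all names invented): the real code
indexes spa_zio_taskq by I/O type and taskq level, bumping ZIO_PRIORITY_NOW
I/Os to the high-priority queue when one exists:

#include <stdio.h>

enum { TQ_ISSUE, TQ_ISSUE_HIGH, TQ_COUNT };

int
main(void)
{
	const char *tq_name[TQ_COUNT] = { "issue", "issue_high" };
	int q = TQ_ISSUE;
	int is_priority_now = 1;	/* stands in for ZIO_PRIORITY_NOW */
	int high_tq_exists = 1;		/* stands in for stqs_count != 0 */

	if (is_priority_now && high_tq_exists)
		q++;			/* the same bump as q++ above */
	printf("dispatch to the %s taskq\n", tq_name[q]);
	return (0);
}
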
 
 static boolean_t
-zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
+zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
 {
 	kthread_t *executor = zio->io_executor;
 	spa_t *spa = zio->io_spa;
 
-	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
-		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
-			return (B_TRUE);
+	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
+		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+		uint_t i;
+		for (i = 0; i < tqs->stqs_count; i++) {
+			if (taskq_member(tqs->stqs_taskq[i], executor))
+				return (B_TRUE);
+		}
+	}
 
 	return (B_FALSE);
 }
@@ -1120,15 +1578,71 @@ zio_interrupt(zio_t *zio)
 	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
 }
 
+void
+zio_delay_interrupt(zio_t *zio)
+{
+	/*
+	 * The timeout_generic() function isn't defined in userspace, so
+	 * rather than trying to implement the function, the zio delay
+	 * functionality has been disabled for userspace builds.
+	 */
+
+#ifndef __NetBSD__
+	/* XXXNETBSD implement timeout_generic() with a callout_t in zio_t */
+	/*
+	 * If io_target_timestamp is zero, then no delay has been registered
+	 * for this IO, so jump to the end of this function and "skip" the
+	 * delay, issuing it directly to the zio layer.
+	 */
+	if (zio->io_target_timestamp != 0) {
+		hrtime_t now = gethrtime();
+
+		if (now >= zio->io_target_timestamp) {
+			/*
+			 * This IO has already taken longer than the target
+			 * delay to complete, so we don't want to delay it
+			 * any longer; we "miss" the delay and issue it
+			 * directly to the zio layer. This is likely due to
+			 * the target latency being set to a value less than
+			 * the underlying hardware can satisfy (e.g. delay
+			 * set to 1ms, but the disks take 10ms to complete an
+			 * IO request).
+			 */
+
+			DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
+			    hrtime_t, now);
+
+			zio_interrupt(zio);
+		} else {
+			hrtime_t diff = zio->io_target_timestamp - now;
+
+			DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
+			    hrtime_t, now, hrtime_t, diff);
+
+			(void) timeout_generic(CALLOUT_NORMAL,
+			    (void (*)(void *))zio_interrupt, zio, diff, 1, 0);
+		}
+
+		return;
+	}
+#endif
+
+	DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
+	zio_interrupt(zio);
+}
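
The hit/miss decision above reduces to a single comparison against the
current hrtime; a standalone sketch with invented timestamps:

#include <stdio.h>
#include <stdint.h>

typedef int64_t hrtime_t;	/* nanosecond clock, as in the kernel */

int
main(void)
{
	hrtime_t target = 5000000;	/* io_target_timestamp: now + 5 ms */
	hrtime_t now = 7000000;		/* the device already took 7 ms */

	if (now >= target)
		printf("miss: hand straight to zio_interrupt()\n");
	else
		printf("hit: arm a timeout for %lld ns\n",
		    (long long)(target - now));
	return (0);
}
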
+
 /*
  * Execute the I/O pipeline until one of the following occurs:
- * (1) the I/O completes; (2) the pipeline stalls waiting for
- * dependent child I/Os; (3) the I/O issues, so we're waiting
- * for an I/O completion interrupt; (4) the I/O is delegated by
- * vdev-level caching or aggregation; (5) the I/O is deferred
- * due to vdev-level queueing; (6) the I/O is handed off to
- * another thread.  In all cases, the pipeline stops whenever
- * there's no CPU work; it never burns a thread in cv_wait().
+ *
+ *	(1) the I/O completes
+ *	(2) the pipeline stalls waiting for dependent child I/Os
+ *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
+ *	(4) the I/O is delegated by vdev-level caching or aggregation
+ *	(5) the I/O is deferred due to vdev-level queueing
+ *	(6) the I/O is handed off to another thread.
+ *
+ * In all cases, the pipeline stops whenever there's no CPU work; it never
+ * burns a thread in cv_wait().
  *
  * There's no locking on io_stage because there's no legitimate way
  * for multiple threads to be attempting to process the same I/O.
@@ -1140,6 +1654,8 @@ zio_execute(zio_t *zio)
 {
 	zio->io_executor = curthread;
 
+	ASSERT3U(zio->io_queued_timestamp, >, 0);
+
 	while (zio->io_stage < ZIO_STAGE_DONE) {
 		enum zio_stage pipeline = zio->io_pipeline;
 		enum zio_stage stage = zio->io_stage;
@@ -1173,7 +1689,8 @@ zio_execute(zio_t *zio)
 		}
 
 		zio->io_stage = stage;
-		rv = zio_pipeline[highbit(stage) - 1](zio);
+		zio->io_pipeline_trace |= zio->io_stage;
+		rv = zio_pipeline[highbit64(stage) - 1](zio);
 
 		if (rv == ZIO_PIPELINE_STOP)
 			return;
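
Because pipeline stages are one-hot bits, highbit64(stage) - 1 in the loop
above maps the active stage straight to its slot in zio_pipeline[]. A
self-contained illustration (the stage value is invented):

#include <stdio.h>
#include <stdint.h>

/* highbit64(x): 1-based index of the highest set bit, 0 if x == 0. */
static int
highbit64(uint64_t x)
{
	int h = 0;

	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t stage = 1ULL << 5;	/* a hypothetical sixth stage bit */

	printf("zio_pipeline index: %d\n", highbit64(stage) - 1);
	return (0);
}
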
@@ -1196,6 +1713,8 @@ zio_wait(zio_t *zio)
 	ASSERT(zio->io_executor == NULL);
 
 	zio->io_waiter = curthread;
+	ASSERT0(zio->io_queued_timestamp);
+	zio->io_queued_timestamp = gethrtime();
 
 	zio_execute(zio);
 
@@ -1224,9 +1743,11 @@ zio_nowait(zio_t *zio)
 		 */
 		spa_t *spa = zio->io_spa;
 
-		zio_add_child(spa->spa_async_zio_root, zio);
+		zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
 	}
 
+	ASSERT0(zio->io_queued_timestamp);
+	zio->io_queued_timestamp = gethrtime();
 	zio_execute(zio);
 }
 
@@ -1250,6 +1771,8 @@ zio_reexecute(zio_t *pio)
 	pio->io_stage = pio->io_orig_stage;
 	pio->io_pipeline = pio->io_orig_pipeline;
 	pio->io_reexecute = 0;
+	pio->io_flags |= ZIO_FLAG_REEXECUTED;
+	pio->io_pipeline_trace = 0;
 	pio->io_error = 0;
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 		pio->io_state[w] = 0;
@@ -1266,8 +1789,9 @@ zio_reexecute(zio_t *pio)
 	 * the remainder of pio's io_child_list, from 'cio_next' onward,
 	 * cannot be affected by any side effects of reexecuting 'cio'.
 	 */
-	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
-		cio_next = zio_walk_children(pio);
+	zio_link_t *zl = NULL;
+	for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+		cio_next = zio_walk_children(pio, &zl);
 		mutex_enter(&pio->io_lock);
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 			pio->io_children[cio->io_child_type][w]++;
@@ -1280,8 +1804,10 @@ zio_reexecute(zio_t *pio)
 	 * We don't reexecute "The Godfather" I/O here as it's the
 	 * responsibility of the caller to wait on him.
 	 */
-	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
+	if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
+		pio->io_queued_timestamp = gethrtime();
 		zio_execute(pio);
+	}
 }
 
 void
@@ -1465,6 +1991,7 @@ zio_t *
 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
 {
 	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
+	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
 	    ZIO_GANG_CHILD_FLAGS(pio)));
 }
 
@@ -1597,7 +2124,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang
 		}
 	}
 
-	if (gn == gio->io_gang_tree)
+	if (gn == gio->io_gang_tree && gio->io_data != NULL)
 		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
 
 	if (zio != pio)
@@ -1674,6 +2201,7 @@ static int
 zio_write_gang_block(zio_t *pio)
 {
 	spa_t *spa = pio->io_spa;
+	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = pio->io_bp;
 	zio_t *gio = pio->io_gang_leader;
 	zio_t *zio;
@@ -1687,10 +2215,44 @@ zio_write_gang_block(zio_t *pio)
 	zio_prop_t zp;
 	int error;
 
-	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
-	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
-	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
+	int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
+	if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+		ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+		ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+		flags |= METASLAB_ASYNC_ALLOC;
+		VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
+
+		/*
+		 * The logical zio has already placed a reservation for
+		 * 'copies' allocation slots but gang blocks may require
+		 * additional copies. These additional copies
+		 * (i.e. gbh_copies - copies) are guaranteed to succeed
+		 * since metaslab_class_throttle_reserve() always allows
+		 * additional reservations for gang blocks.
+		 */
+		VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
+		    pio, flags));
+	}
+
+	error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
+	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
+	    &pio->io_alloc_list, pio);
 	if (error) {
+		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+			ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+			/*
+			 * If we failed to allocate the gang block header then
+			 * we remove any additional allocation reservations that
+			 * we placed here. The original reservation will
+			 * be removed when the logical I/O goes to the ready
+			 * stage.
+			 */
+			metaslab_class_throttle_unreserve(mc,
+			    gbh_copies - copies, pio);
+		}
 		pio->io_error = error;
 		return (ZIO_PIPELINE_CONTINUE);
 	}
@@ -1725,14 +2287,29 @@ zio_write_gang_block(zio_t *pio)
 		zp.zp_type = DMU_OT_NONE;
 		zp.zp_level = 0;
 		zp.zp_copies = gio->io_prop.zp_copies;
-		zp.zp_dedup = 0;
-		zp.zp_dedup_verify = 0;
+		zp.zp_dedup = B_FALSE;
+		zp.zp_dedup_verify = B_FALSE;
+		zp.zp_nopwrite = B_FALSE;
 
-		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
+		zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
 		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
-		    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
-		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
-		    &pio->io_bookmark));
+		    zio_write_gang_member_ready, NULL, NULL, NULL,
+		    &gn->gn_child[g], pio->io_priority,
+		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+
+		if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+			ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+			ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+			/*
+			 * Gang children won't throttle but we should
+			 * account for their work, so reserve an allocation
+			 * slot for them here.
+			 */
+			VERIFY(metaslab_class_throttle_reserve(mc,
+			    zp.zp_copies, cio, flags));
+		}
+		zio_nowait(cio);
 	}
 
 	/*
@@ -1746,6 +2323,74 @@ zio_write_gang_block(zio_t *pio)
 }
 
 /*
+ * The zio_nop_write stage in the pipeline determines if allocating a
+ * new bp is necessary.  The nopwrite feature can handle writes in
+ * either syncing or open context (i.e. zil writes) and as a result is
+ * mutually exclusive with dedup.
+ *
+ * By leveraging a cryptographically secure checksum, such as SHA256, we
+ * can compare the checksums of the new data and the old to determine if
+ * allocating a new block is required.  Note that our requirements for
+ * cryptographic strength are fairly weak: there can't be any accidental
+ * hash collisions, but we don't need to be secure against intentional
+ * (malicious) collisions.  To trigger a nopwrite, you have to be able
+ * to write the file to begin with, and triggering an incorrect (hash
+ * collision) nopwrite is no worse than simply writing to the file.
+ * That said, there are no known attacks against the checksum algorithms
+ * used for nopwrite, assuming that the salt and the checksums
+ * themselves remain secret.
+ */
+static int
+zio_nop_write(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	blkptr_t *bp_orig = &zio->io_bp_orig;
+	zio_prop_t *zp = &zio->io_prop;
+
+	ASSERT(BP_GET_LEVEL(bp) == 0);
+	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+	ASSERT(zp->zp_nopwrite);
+	ASSERT(!zp->zp_dedup);
+	ASSERT(zio->io_bp_override == NULL);
+	ASSERT(IO_IS_ALLOCATING(zio));
+
+	/*
+	 * Check to see if the original bp and the new bp have matching
+	 * characteristics (i.e. same checksum, compression algorithms, etc).
+	 * If they don't then just continue with the pipeline which will
+	 * allocate a new bp.
+	 */
+	if (BP_IS_HOLE(bp_orig) ||
+	    !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
+	    ZCHECKSUM_FLAG_NOPWRITE) ||
+	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
+	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
+	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
+	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
+		return (ZIO_PIPELINE_CONTINUE);
+
+	/*
+	 * If the checksums match then reset the pipeline so that we
+	 * avoid allocating a new bp and issuing any I/O.
+	 */
+	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
+		ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
+		    ZCHECKSUM_FLAG_NOPWRITE);
+		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
+		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
+		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
+		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
+		    sizeof (uint64_t)) == 0);
+
+		*bp = *bp_orig;
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+		zio->io_flags |= ZIO_FLAG_NOPWRITE;
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
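
The decisive test in zio_nop_write above is ZIO_CHECKSUM_EQUAL on the
256-bit checksum words; below is a reduced, standalone sketch in which the
struct and macro are simplified stand-ins, not the kernel definitions:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

typedef struct {
	uint64_t zc_word[4];	/* 256 bits, as in zio_cksum_t */
} cksum_t;

#define	CKSUM_EQUAL(a, b) \
	(memcmp((a).zc_word, (b).zc_word, sizeof ((a).zc_word)) == 0)

int
main(void)
{
	cksum_t on_disk = { { 0x1111, 0x2222, 0x3333, 0x4444 } };
	cksum_t incoming = { { 0x1111, 0x2222, 0x3333, 0x4444 } };

	/* Equal strong checksums: skip the allocation and the write. */
	printf("nopwrite: %s\n",
	    CKSUM_EQUAL(on_disk, incoming) ? "yes" : "no");
	return (0);
}
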
+
+/*
  * ==========================================================================
  * Dedup
  * ==========================================================================
@@ -1875,7 +2520,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt
 
 		if (ddp->ddp_phys_birth != 0) {
 			arc_buf_t *abuf = NULL;
-			uint32_t aflags = ARC_WAIT;
+			arc_flags_t aflags = ARC_FLAG_WAIT;
 			blkptr_t blk = *zio->io_bp;
 			int error;
 
@@ -1883,7 +2528,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt
 
 			ddt_exit(ddt);
 
-			error = arc_read_nolock(NULL, spa, &blk,
+			error = arc_read(NULL, spa, &blk,
 			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    &aflags, &zio->io_bookmark);
@@ -1892,8 +2537,8 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt
 				if (arc_buf_size(abuf) != zio->io_orig_size ||
 				    bcmp(abuf->b_data, zio->io_orig_data,
 				    zio->io_orig_size) != 0)
-					error = EEXIST;
-				VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
+					error = SET_ERROR(EEXIST);
+				arc_buf_destroy(abuf, &abuf);
 			}
 
 			ddt_enter(ddt);
@@ -1922,7 +2567,8 @@ zio_ddt_child_write_ready(zio_t *zio)
 
 	ddt_phys_fill(ddp, zio->io_bp);
 
-	while ((pio = zio_walk_parents(zio)) != NULL)
+	zio_link_t *zl = NULL;
+	while ((pio = zio_walk_parents(zio, &zl)) != NULL)
 		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
 
 	ddt_exit(ddt);
@@ -1943,7 +2589,8 @@ zio_ddt_child_write_done(zio_t *zio)
 	dde->dde_lead_zio[p] = NULL;
 
 	if (zio->io_error == 0) {
-		while (zio_walk_parents(zio) != NULL)
+		zio_link_t *zl = NULL;
+		while (zio_walk_parents(zio, &zl) != NULL)
 			ddt_phys_addref(ddp);
 	} else {
 		ddt_phys_clear(ddp);
@@ -2011,13 +2658,14 @@ zio_ddt_write(zio_t *zio)
 		 * we can't resolve it, so just convert to an ordinary write.
 		 * (And automatically e-mail a paper to Nature?)
 		 */
-		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+		if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
+		    ZCHECKSUM_FLAG_DEDUP)) {
 			zp->zp_checksum = spa_dedup_checksum(spa);
 			zio_pop_transforms(zio);
 			zio->io_stage = ZIO_STAGE_OPEN;
 			BP_ZERO(bp);
 		} else {
-			zp->zp_dedup = 0;
+			zp->zp_dedup = B_FALSE;
 		}
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 		ddt_exit(ddt);
@@ -2051,8 +2699,8 @@ zio_ddt_write(zio_t *zio)
 		}
 
 		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
-		    zio->io_orig_size, &czp, NULL,
-		    zio_ddt_ditto_write_done, dde, zio->io_priority,
+		    zio->io_orig_size, &czp, NULL, NULL,
+		    NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
 		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
@@ -2073,7 +2721,8 @@ zio_ddt_write(zio_t *zio)
 		ddt_phys_addref(ddp);
 	} else {
 		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
-		    zio->io_orig_size, zp, zio_ddt_child_write_ready,
+		    zio->io_orig_size, zp,
+		    zio_ddt_child_write_ready, NULL, NULL,
 		    zio_ddt_child_write_done, dde, zio->io_priority,
 		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
 
@@ -2091,6 +2740,8 @@ zio_ddt_write(zio_t *zio)
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
+ddt_entry_t *freedde; /* for debugging */
+
 static int
 zio_ddt_free(zio_t *zio)
 {
@@ -2104,7 +2755,7 @@ zio_ddt_free(zio_t *zio)
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
 	ddt_enter(ddt);
-	dde = ddt_lookup(ddt, bp, B_TRUE);
+	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
 	ddp = ddt_phys_select(dde, bp);
 	ddt_phys_decref(ddp);
 	ddt_exit(ddt);
@@ -2117,6 +2768,97 @@ zio_ddt_free(zio_t *zio)
  * Allocate and free blocks
  * ==========================================================================
  */
+
+static zio_t *
+zio_io_to_allocate(spa_t *spa)
+{
+	zio_t *zio;
+
+	ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
+
+	zio = avl_first(&spa->spa_alloc_tree);
+	if (zio == NULL)
+		return (NULL);
+
+	ASSERT(IO_IS_ALLOCATING(zio));
+
+	/*
+	 * Try to place a reservation for this zio. If we're unable to
+	 * reserve then we throttle.
+	 */
+	if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
+	    zio->io_prop.zp_copies, zio, 0)) {
+		return (NULL);
+	}
+
+	avl_remove(&spa->spa_alloc_tree, zio);
+	ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
+
+	return (zio);
+}
+
+static int
+zio_dva_throttle(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+	zio_t *nio;
+
+	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
+	    !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
+	    zio->io_child_type == ZIO_CHILD_GANG ||
+	    zio->io_flags & ZIO_FLAG_NODATA) {
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
+	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+	ASSERT3U(zio->io_queued_timestamp, >, 0);
+	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+
+	mutex_enter(&spa->spa_alloc_lock);
+
+	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+	avl_add(&spa->spa_alloc_tree, zio);
+
+	nio = zio_io_to_allocate(zio->io_spa);
+	mutex_exit(&spa->spa_alloc_lock);
+
+	if (nio == zio)
+		return (ZIO_PIPELINE_CONTINUE);
+
+	if (nio != NULL) {
+		ASSERT3U(nio->io_queued_timestamp, <=,
+		    zio->io_queued_timestamp);
+		ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+		/*
+		 * We are passing control to a new zio so make sure that
+		 * it is processed by a different thread. We do this to
+		 * avoid stack overflows that can occur when parents are
+		 * throttled and children are making progress. We allow
+		 * it to go to the head of the taskq since it's already
+		 * been waiting.
+		 */
+		zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
+	}
+	return (ZIO_PIPELINE_STOP);
+}
+
+void
+zio_allocate_dispatch(spa_t *spa)
+{
+	zio_t *zio;
+
+	mutex_enter(&spa->spa_alloc_lock);
+	zio = zio_io_to_allocate(spa);
+	mutex_exit(&spa->spa_alloc_lock);
+	if (zio == NULL)
+		return;
+
+	ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
+	ASSERT0(zio->io_error);
+	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
+}
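
The throttle pattern above is reserve-or-queue: a write either takes
allocation slots and proceeds, or stays on spa_alloc_tree until a later
zio_allocate_dispatch() frees a slot. A toy model with an invented slot
budget:

#include <stdio.h>

static int slots_free = 2;	/* stand-in for mc_alloc_slots headroom */

static int
throttle_reserve(int copies)
{
	if (slots_free < copies)
		return (0);	/* caller stays queued for later dispatch */
	slots_free -= copies;
	return (1);
}

int
main(void)
{
	/* Three single-copy writers against a budget of two slots. */
	for (int i = 0; i < 3; i++)
		printf("writer %d: %s\n", i,
		    throttle_reserve(1) ? "issue" : "throttled");
	return (0);
}
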
+
 static int
 zio_dva_allocate(zio_t *zio)
 {
@@ -2124,6 +2866,7 @@ zio_dva_allocate(zio_t *zio)
 	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = zio->io_bp;
 	int error;
+	int flags = 0;
 
 	if (zio->io_gang_leader == NULL) {
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
@@ -2131,15 +2874,29 @@ zio_dva_allocate(zio_t *zio)
 	}
 
 	ASSERT(BP_IS_HOLE(bp));
-	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
+	ASSERT0(BP_GET_NDVAS(bp));
 	ASSERT3U(zio->io_prop.zp_copies, >, 0);
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
+	if (zio->io_flags & ZIO_FLAG_NODATA) {
+		flags |= METASLAB_DONT_THROTTLE;
+	}
+	if (zio->io_flags & ZIO_FLAG_GANG_CHILD) {
+		flags |= METASLAB_GANG_CHILD;
+	}
+	if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) {
+		flags |= METASLAB_ASYNC_ALLOC;
+	}
+
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
-	    zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
+	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
+	    &zio->io_alloc_list, zio);
 
-	if (error) {
+	if (error != 0) {
+		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
+		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
+		    error);
 		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
 			return (zio_write_gang_block(zio));
 		zio->io_error = error;
@@ -2195,19 +2952,26 @@ zio_dva_unallocate(zio_t *zio, zio_gang_
  */
 int
 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
-    uint64_t size, boolean_t use_slog)
+    uint64_t size, boolean_t *slog)
 {
 	int error = 1;
+	zio_alloc_list_t io_alloc_list;
 
 	ASSERT(txg > spa_syncing_txg(spa));
 
-	if (use_slog)
-		error = metaslab_alloc(spa, spa_log_class(spa), size,
-		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
-
-	if (error)
+	metaslab_trace_init(&io_alloc_list);
+	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
+	    txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL);
+	if (error == 0) {
+		*slog = TRUE;
+	} else {
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
-		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
+		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
+		    &io_alloc_list, NULL);
+		if (error == 0)
+			*slog = FALSE;
+	}
+	metaslab_trace_fini(&io_alloc_list);
 
 	if (error == 0) {
 		BP_SET_LSIZE(new_bp, size);
@@ -2239,15 +3003,28 @@ zio_free_zil(spa_t *spa, uint64_t txg, b
 
 /*
  * ==========================================================================
- * Read and write to physical devices
+ * Read, write, and delete on physical devices
  * ==========================================================================
  */
+
+
+/*
+ * Issue an I/O to the underlying vdev. Typically the issue pipeline
+ * stops after this stage and will resume upon I/O completion.
+ * However, there are instances where the vdev layer may need to
+ * continue the pipeline when an I/O was not issued. Since the I/O
+ * that was sent to the vdev layer might be different than the one
+ * currently active in the pipeline (see vdev_queue_io()), we explicitly
+ * force the underlying vdev layers to call either zio_execute() or
+ * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
+ */
 static int
 zio_vdev_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	uint64_t align;
 	spa_t *spa = zio->io_spa;
+	int ret;
 
 	ASSERT(zio->io_error == 0);
 	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
@@ -2259,25 +3036,76 @@ zio_vdev_io_start(zio_t *zio)
 		/*
 		 * The mirror_ops handle multiple DVAs in a single BP.
 		 */
-		return (vdev_mirror_ops.vdev_op_io_start(zio));
+		vdev_mirror_ops.vdev_op_io_start(zio);
+		return (ZIO_PIPELINE_STOP);
+	}
+
+	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
+	    zio->io_priority == ZIO_PRIORITY_NOW) {
+		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
+		return (ZIO_PIPELINE_CONTINUE);
+	}
+
+	ASSERT3P(zio->io_logical, !=, zio);
+
+	/*
+	 * We keep track of time-sensitive I/Os so that the scan thread
+	 * can quickly react to certain workloads.  In particular, we care
+	 * about non-scrubbing, top-level reads and writes with the following
+	 * characteristics:
+	 *	- synchronous writes of user data to non-slog devices
+	 *	- any reads of user data
+	 * When these conditions are met, adjust the timestamp of spa_last_io
+	 * which allows the scan thread to adjust its workload accordingly.
+	 */
+	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
+	    vd == vd->vdev_top && !vd->vdev_islog &&
+	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
+	    zio->io_txg != spa_syncing_txg(spa)) {
+		uint64_t old = spa->spa_last_io;
+		uint64_t new = ddi_get_lbolt64();
+		if (old != new)
+			(void) atomic_cas_64(&spa->spa_last_io, old, new);
 	}
 
 	align = 1ULL << vd->vdev_top->vdev_ashift;
 
-	if (P2PHASE(zio->io_size, align) != 0) {
+	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+	    P2PHASE(zio->io_size, align) != 0) {
+		/* Transform logical writes to be a full physical block size. */
 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
-		char *abuf = zio_buf_alloc(asize);
+		char *abuf = NULL;
+		if (zio->io_type == ZIO_TYPE_READ ||
+		    zio->io_type == ZIO_TYPE_WRITE)
+			abuf = zio_buf_alloc(asize);
 		ASSERT(vd == vd->vdev_top);
 		if (zio->io_type == ZIO_TYPE_WRITE) {
 			bcopy(zio->io_data, abuf, zio->io_size);
 			bzero(abuf + zio->io_size, asize - zio->io_size);
 		}
-		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
+		zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
+		    zio_subblock);
 	}
 
-	ASSERT(P2PHASE(zio->io_offset, align) == 0);
-	ASSERT(P2PHASE(zio->io_size, align) == 0);
-	ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+	/*
+	 * If this is not a physical io, make sure that it is properly aligned
+	 * before proceeding.
+	 */
+	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
+		ASSERT0(P2PHASE(zio->io_offset, align));
+		ASSERT0(P2PHASE(zio->io_size, align));
+	} else {
+		/*
+		 * For the physical io we allow alignment
+		 * to a logical block size.
+		 */
+		uint64_t log_align =
+		    1ULL << vd->vdev_top->vdev_logical_ashift;
+		ASSERT0(P2PHASE(zio->io_offset, log_align));
+		ASSERT0(P2PHASE(zio->io_size, log_align));
+	}
+
+	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
 
 	/*
 	 * If this is a repair I/O, and there's no self-healing involved --
@@ -2301,23 +3129,37 @@ zio_vdev_io_start(zio_t *zio)
 		return (ZIO_PIPELINE_CONTINUE);
 	}
 
-	if (vd->vdev_ops->vdev_op_leaf &&
-	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
-
-		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
-			return (ZIO_PIPELINE_CONTINUE);
-
-		if ((zio = vdev_queue_io(zio)) == NULL)
-			return (ZIO_PIPELINE_STOP);
+	if (vd->vdev_ops->vdev_op_leaf) {
+		switch (zio->io_type) {
+		case ZIO_TYPE_READ:
+			if (vdev_cache_read(zio))
+				return (ZIO_PIPELINE_CONTINUE);
+			/* FALLTHROUGH */
+		case ZIO_TYPE_WRITE:
+		case ZIO_TYPE_FREE:
+			if ((zio = vdev_queue_io(zio)) == NULL)
+				return (ZIO_PIPELINE_STOP);
 
-		if (!vdev_accessible(vd, zio)) {
-			zio->io_error = ENXIO;
-			zio_interrupt(zio);
-			return (ZIO_PIPELINE_STOP);
+			if (!vdev_accessible(vd, zio)) {
+				zio->io_error = SET_ERROR(ENXIO);
+				zio_interrupt(zio);
+				return (ZIO_PIPELINE_STOP);
+			}
+			break;
 		}
+		/*
+		 * Note that we ignore repair writes for TRIM because they can
+		 * conflict with normal writes. This isn't an issue because, by
+		 * definition, we only repair blocks that aren't freed.
+		 */
+		if (zio->io_type == ZIO_TYPE_WRITE &&
+		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+		    !trim_map_write_start(zio))
+			return (ZIO_PIPELINE_STOP);
 	}
 
-	return (vd->vdev_ops->vdev_op_io_start(zio));
+	vd->vdev_ops->vdev_op_io_start(zio);
+	return (ZIO_PIPELINE_STOP);
 }
 
 static int
@@ -2330,9 +3172,16 @@ zio_vdev_io_done(zio_t *zio)
 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
-	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+	ASSERT(zio->io_type == ZIO_TYPE_READ ||
+	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
 
-	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
+	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
+	    zio->io_type == ZIO_TYPE_FREE)) {
+
+		if (zio->io_type == ZIO_TYPE_WRITE &&
+		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
+			trim_map_write_done(zio);
 
 		vdev_queue_io_done(zio);
 
@@ -2347,8 +3196,11 @@ zio_vdev_io_done(zio_t *zio)
 			zio->io_error = zio_handle_label_injection(zio, EIO);
 
 		if (zio->io_error) {
-			if (!vdev_accessible(vd, zio)) {
-				zio->io_error = ENXIO;
+			if (zio->io_error == ENOTSUP &&
+			    zio->io_type == ZIO_TYPE_FREE) {
+				/* Not all devices support TRIM. */
+			} else if (!vdev_accessible(vd, zio)) {
+				zio->io_error = SET_ERROR(ENXIO);
 			} else {
 				unexpected_error = B_TRUE;
 			}
@@ -2408,6 +3260,22 @@ zio_vdev_io_assess(zio_t *zio)
 	if (zio_injection_enabled && zio->io_error == 0)
 		zio->io_error = zio_handle_fault_injection(zio, EIO);
 
+	if (zio->io_type == ZIO_TYPE_FREE &&
+	    zio->io_priority != ZIO_PRIORITY_NOW) {
+		switch (zio->io_error) {
+		case 0:
+			ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
+			ZIO_TRIM_STAT_BUMP(success);
+			break;
+		case EOPNOTSUPP:
+			ZIO_TRIM_STAT_BUMP(unsupported);
+			break;
+		default:
+			ZIO_TRIM_STAT_BUMP(failed);
+			break;
+		}
+	}
+
 	/*
 	 * If the I/O failed, determine whether we should attempt to retry it.
 	 *
@@ -2433,19 +3301,27 @@ zio_vdev_io_assess(zio_t *zio)
 	 */
 	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
 	    !vdev_accessible(vd, zio))
-		zio->io_error = ENXIO;
+		zio->io_error = SET_ERROR(ENXIO);
 
 	/*
 	 * If we can't write to an interior vdev (mirror or RAID-Z),
 	 * set vdev_cant_write so that we stop trying to allocate from it.
 	 */
 	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
-	    vd != NULL && !vd->vdev_ops->vdev_op_leaf)
+	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
 		vd->vdev_cant_write = B_TRUE;
+	}
 
 	if (zio->io_error)
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+	    zio->io_physdone != NULL) {
+		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
+		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
+		zio->io_physdone(zio->io_logical);
+	}
+
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
@@ -2534,7 +3410,8 @@ zio_checksum_verify(zio_t *zio)
 
 	if ((error = zio_checksum_error(zio, &info)) != 0) {
 		zio->io_error = error;
-		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+		if (error == ECKSUM &&
+		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 			zfs_ereport_start_checksum(zio->io_spa,
 			    zio->io_vd, zio, zio->io_offset,
 			    zio->io_size, NULL, &info);
@@ -2556,7 +3433,7 @@ zio_checksum_verified(zio_t *zio)
 /*
  * ==========================================================================
 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
- * An error of 0 indictes success.  ENXIO indicates whole-device failure,
+ * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
  * indicate errors that are specific to one I/O, and most likely permanent.
  * Any other error is presumed to be worse because we weren't expecting it.
@@ -2589,6 +3466,7 @@ zio_ready(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	zio_t *pio, *pio_next;
+	zio_link_t *zl = NULL;
 
 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
 	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
@@ -2596,7 +3474,8 @@ zio_ready(zio_t *zio)
 
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
-		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
+		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
 
 		zio->io_ready(zio);
@@ -2605,12 +3484,26 @@ zio_ready(zio_t *zio)
 	if (bp != NULL && bp != &zio->io_bp_copy)
 		zio->io_bp_copy = *bp;
 
-	if (zio->io_error)
+	if (zio->io_error != 0) {
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+			ASSERT(IO_IS_ALLOCATING(zio));
+			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+			/*
+			 * We were unable to allocate anything, unreserve and
+			 * issue the next I/O to allocate.
+			 */
+			metaslab_class_throttle_unreserve(
+			    spa_normal_class(zio->io_spa),
+			    zio->io_prop.zp_copies, zio);
+			zio_allocate_dispatch(zio->io_spa);
+		}
+	}
+
 	mutex_enter(&zio->io_lock);
 	zio->io_state[ZIO_WAIT_READY] = 1;
-	pio = zio_walk_parents(zio);
+	pio = zio_walk_parents(zio, &zl);
 	mutex_exit(&zio->io_lock);
 
 	/*
@@ -2621,7 +3514,7 @@ zio_ready(zio_t *zio)
 	 * all parents must wait for us to be done before they can be done.
 	 */
 	for (; pio != NULL; pio = pio_next) {
-		pio_next = zio_walk_parents(zio);
+		pio_next = zio_walk_parents(zio, &zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
 	}
 
@@ -2641,6 +3534,66 @@ zio_ready(zio_t *zio)
 	return (ZIO_PIPELINE_CONTINUE);
 }
 
+/*
+ * Update the allocation throttle accounting.
+ */
+static void
+zio_dva_throttle_done(zio_t *zio)
+{
+	zio_t *lio = zio->io_logical;
+	zio_t *pio = zio_unique_parent(zio);
+	vdev_t *vd = zio->io_vd;
+	int flags = METASLAB_ASYNC_ALLOC;
+
+	ASSERT3P(zio->io_bp, !=, NULL);
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+	ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
+	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+	ASSERT(vd != NULL);
+	ASSERT3P(vd, ==, vd->vdev_top);
+	ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
+	ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
+	ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
+	ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
+
+	/*
+	 * Parents of gang children can have two flavors -- ones that
+	 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
+	 * and ones that allocated the constituent blocks. The allocation
+	 * throttle needs to know the allocating parent zio so we must find
+	 * it here.
+	 */
+	if (pio->io_child_type == ZIO_CHILD_GANG) {
+		/*
+		 * If our parent is a rewrite gang child then our grandparent
+		 * would have been the one that performed the allocation.
+		 */
+		if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
+			pio = zio_unique_parent(pio);
+		flags |= METASLAB_GANG_CHILD;
+	}
+
+	ASSERT(IO_IS_ALLOCATING(pio));
+	ASSERT3P(zio, !=, zio->io_logical);
+	ASSERT(zio->io_logical != NULL);
+	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
+	ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
+
+	mutex_enter(&pio->io_lock);
+	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
+	mutex_exit(&pio->io_lock);
+
+	metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
+	    1, pio);
+
+	/*
+	 * Call into the pipeline to see if there is more work that
+	 * needs to be done. If there is work to be done it will be
+	 * dispatched to another taskq thread.
+	 */
+	zio_allocate_dispatch(zio->io_spa);
+}
+
 static int
 zio_done(zio_t *zio)
 {
@@ -2650,6 +3603,8 @@ zio_done(zio_t *zio)
 	vdev_t *vd = zio->io_vd;
 	uint64_t psize = zio->io_size;
 	zio_t *pio, *pio_next;
+	metaslab_class_t *mc = spa_normal_class(spa);
+	zio_link_t *zl = NULL;
 
 	/*
 	 * If our children haven't all completed,
@@ -2661,11 +3616,35 @@ zio_done(zio_t *zio)
 	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
 		return (ZIO_PIPELINE_STOP);
 
+	/*
+	 * If the allocation throttle is enabled, then update the accounting.
+	 * We only track child I/Os that are part of an allocating async
+	 * write. We must do this since the allocation is performed
+	 * by the logical I/O but the actual write is done by child I/Os.
+	 */
+	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
+	    zio->io_child_type == ZIO_CHILD_VDEV) {
+		ASSERT(mc->mc_alloc_throttle_enabled);
+		zio_dva_throttle_done(zio);
+	}
+
+	/*
+	 * If the allocation throttle is enabled, verify that
+	 * we have decremented the refcounts for every I/O that was throttled.
+	 */
+	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+		ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+		ASSERT(bp != NULL);
+		metaslab_group_alloc_verify(spa, zio->io_bp, zio);
+		VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio));
+	}
+
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 			ASSERT(zio->io_children[c][w] == 0);
 
-	if (bp != NULL) {
+	if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
 		ASSERT(bp->blk_pad[0] == 0);
 		ASSERT(bp->blk_pad[1] == 0);
 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
@@ -2678,6 +3657,8 @@ zio_done(zio_t *zio)
 			ASSERT(BP_COUNT_GANG(bp) == 0 ||
 			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
 		}
+		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
+			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
 	}
 
 	/*
@@ -2759,6 +3740,7 @@ zio_done(zio_t *zio)
 
 		if ((zio->io_type == ZIO_TYPE_READ ||
 		    zio->io_type == ZIO_TYPE_FREE) &&
+		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
 		    zio->io_error == ENXIO &&
 		    spa_load_state(spa) == SPA_LOAD_NONE &&
 		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
@@ -2786,7 +3768,7 @@ zio_done(zio_t *zio)
 
 	if ((zio->io_error || zio->io_reexecute) &&
 	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
-	    !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
+	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
 		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
 
 	zio_gang_tree_free(&zio->io_gang_tree);
@@ -2827,13 +3809,15 @@ zio_done(zio_t *zio)
 		 * trouble (e.g. suspended). This allows "The Godfather"
 		 * I/O to return status without blocking.
 		 */
-		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
-			zio_link_t *zl = zio->io_walk_link;
-			pio_next = zio_walk_parents(zio);
+		zl = NULL;
+		for (pio = zio_walk_parents(zio, &zl); pio != NULL;
+		    pio = pio_next) {
+			zio_link_t *remove_zl = zl;
+			pio_next = zio_walk_parents(zio, &zl);
 
 			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
 			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
-				zio_remove_child(pio, zio, zl);
+				zio_remove_child(pio, zio, remove_zl);
 				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 			}
 		}
@@ -2858,9 +3842,14 @@ zio_done(zio_t *zio)
 			 * Reexecution is potentially a huge amount of work.
 			 * Hand it off to the otherwise-unused claim taskq.
 			 */
-			(void) taskq_dispatch(
-			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
-			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
+#if defined(illumos) || defined(__NetBSD__) || !defined(_KERNEL)
+			ASSERT(zio->io_tqent.tqent_next == NULL);
+#else
+			ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
+#endif
+			spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
+			    ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
+			    0, &zio->io_tqent);
 		}
 		return (ZIO_PIPELINE_STOP);
 	}
@@ -2892,10 +3881,11 @@ zio_done(zio_t *zio)
 	zio->io_state[ZIO_WAIT_DONE] = 1;
 	mutex_exit(&zio->io_lock);
 
-	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
-		zio_link_t *zl = zio->io_walk_link;
-		pio_next = zio_walk_parents(zio);
-		zio_remove_child(pio, zio, zl);
+	zl = NULL;
+	for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
+		zio_link_t *remove_zl = zl;
+		pio_next = zio_walk_parents(zio, &zl);
+		zio_remove_child(pio, zio, remove_zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 	}
 
@@ -2919,16 +3909,19 @@ zio_done(zio_t *zio)
 static zio_pipe_stage_t *zio_pipeline[] = {
 	NULL,
 	zio_read_bp_init,
+	zio_write_bp_init,
 	zio_free_bp_init,
 	zio_issue_async,
-	zio_write_bp_init,
+	zio_write_compress,
 	zio_checksum_generate,
+	zio_nop_write,
 	zio_ddt_read_start,
 	zio_ddt_read_done,
 	zio_ddt_write,
 	zio_ddt_free,
 	zio_gang_assemble,
 	zio_gang_issue,
+	zio_dva_throttle,
 	zio_dva_allocate,
 	zio_dva_free,
 	zio_dva_claim,
@@ -2939,3 +3932,128 @@ static zio_pipe_stage_t *zio_pipeline[] 
 	zio_checksum_verify,
 	zio_done
 };
+
+
+
+
+/*
+ * Compare two zbookmark_phys_t's to see which we would reach first in a
+ * pre-order traversal of the object tree.
+ *
+ * This is simple in every case aside from the meta-dnode object. For all other
+ * objects, we traverse them in order (object 1 before object 2, and so on).
+ * However, all of these objects are traversed while traversing object 0, since
+ * the data it points to is the list of objects.  Thus, we need to convert to a
+ * canonical representation so we can compare meta-dnode bookmarks to
+ * non-meta-dnode bookmarks.
+ *
+ * We do this by calculating "equivalents" for each field of the zbookmark.
+ * zbookmarks outside of the meta-dnode use their own object and level, and
+ * calculate the level 0 equivalent (the first L0 blkid that is contained in the
+ * blocks this bookmark refers to) by multiplying their blkid by their span
+ * (the number of L0 blocks contained within one block at their level).
+ * zbookmarks inside the meta-dnode calculate their object equivalent
+ * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
+ * level + 1<<31 (any value larger than a level could ever be) for their level.
+ * This causes them to always compare before a bookmark in their object
+ * equivalent, compare appropriately to bookmarks in other objects, and to
+ * compare appropriately to other bookmarks in the meta-dnode.
+ */
+int
+zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
+    const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
+{
+	/*
+	 * These variables represent the "equivalent" values for the zbookmark,
+	 * after converting zbookmarks inside the meta dnode to their
+	 * normal-object equivalents.
+	 */
+	uint64_t zb1obj, zb2obj;
+	uint64_t zb1L0, zb2L0;
+	uint64_t zb1level, zb2level;
+
+	if (zb1->zb_object == zb2->zb_object &&
+	    zb1->zb_level == zb2->zb_level &&
+	    zb1->zb_blkid == zb2->zb_blkid)
+		return (0);
+
+	/*
+	 * BP_SPANB calculates the span in blocks.
+	 */
+	zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
+	zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
+
+	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+		zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+		zb1L0 = 0;
+		zb1level = zb1->zb_level + COMPARE_META_LEVEL;
+	} else {
+		zb1obj = zb1->zb_object;
+		zb1level = zb1->zb_level;
+	}
+
+	if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
+		zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+		zb2L0 = 0;
+		zb2level = zb2->zb_level + COMPARE_META_LEVEL;
+	} else {
+		zb2obj = zb2->zb_object;
+		zb2level = zb2->zb_level;
+	}
+
+	/* Now that we have a canonical representation, do the comparison. */
+	if (zb1obj != zb2obj)
+		return (zb1obj < zb2obj ? -1 : 1);
+	else if (zb1L0 != zb2L0)
+		return (zb1L0 < zb2L0 ? -1 : 1);
+	else if (zb1level != zb2level)
+		return (zb1level > zb2level ? -1 : 1);
+	/*
+	 * This can (theoretically) happen if the bookmarks have the same object
+	 * and level, but different blkids, if the block sizes are not the same.
+	 * There is presently no way to change the indirect block sizes.
+	 */
+	return (0);
+}
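
A worked instance of the "equivalents" math above, using the BP_SPANB
definition from the headers; the indirect block shift and blkid are
invented:

#include <stdio.h>
#include <stdint.h>

#define	SPA_BLKPTRSHIFT	7	/* 128-byte block pointers */
#define	BP_SPANB(indblkshift, level) \
	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))

int
main(void)
{
	/* 128K indirect blocks (shift 17), a level-2 bookmark, blkid 3. */
	uint64_t span = BP_SPANB(17, 2);	/* 2^20 L0 blocks per blkid */
	uint64_t blkid = 3;

	printf("L0 equivalent: %llu\n",
	    (unsigned long long)(blkid * span));	/* 3145728 */
	return (0);
}
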
+
+/*
+ *  This function checks the following: given that last_block is the place
+ *  that our traversal stopped last time, does that guarantee that we've
+ *  visited every node under subtree_root?  We can't answer that with the
+ *  raw output of zbookmark_compare; instead we pass in a modified version
+ *  of subtree_root: by incrementing its block id and then checking whether
+ *  last_block is before or equal to that, we can tell whether having
+ *  visited last_block implies that all of subtree_root's children have
+ *  been visited.
+ */
+boolean_t
+zbookmark_subtree_completed(const dnode_phys_t *dnp,
+    const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
+{
+	zbookmark_phys_t mod_zb = *subtree_root;
+	mod_zb.zb_blkid++;
+	ASSERT(last_block->zb_level == 0);
+
+	/* The objset_phys_t isn't before anything. */
+	if (dnp == NULL)
+		return (B_FALSE);
+
+	/*
+	 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
+	 * data block size in sectors, because that variable is only used if
+	 * the bookmark refers to a block in the meta-dnode.  Since we can't
+	 * tell which object it refers to without examining it, and there's
+	 * no harm in passing this value in other cases, we always pass it in.
+	 *
+	 * We pass in 0 for the indirect block size shift because zb2 must be
+	 * level 0.  The indirect block size is only used to calculate the span
+	 * of the bookmark, but since the bookmark must be level 0, the span is
+	 * always 1, so the math works out.
+	 *
+	 * If you make changes to how the zbookmark_compare code works, make
+	 * sure that this code still works afterwards.
+	 */
+	return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
+	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
+	    last_block) <= 0);
+}
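
Editor's note, not part of the patch: a small numeric check of the blkid+1
trick. BP_SPANB is re-declared here from its definition (a sketch, assuming
128-byte block pointers, i.e. SPA_BLKPTRSHIFT == 7), not quoted from the tree:

#include <assert.h>
#include <stdint.h>

#define	SPA_BLKPTRSHIFT	7	/* 128-byte block pointers */
#define	BP_SPANB(indblkshift, level) \
	(1ULL << (((uint64_t)(level)) * ((indblkshift) - SPA_BLKPTRSHIFT)))

int
main(void)
{
	/*
	 * With 8K indirect blocks (indblkshift 13), each indirect block
	 * holds 64 block pointers, so a level-2 bookmark spans 64*64 =
	 * 4096 L0 blocks.  A level-2 subtree_root with blkid 5 covers L0
	 * blkids [20480, 24576); bumping the blkid to 6 and comparing
	 * asks "has last_block reached L0 blkid 24576?", which is exactly
	 * "has every node under the subtree been visited?".
	 */
	assert(BP_SPANB(13, 2) == 4096);
	assert(6 * BP_SPANB(13, 2) == 24576);
	return (0);
}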
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zio_checksum.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zio_checksum.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zio_checksum.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zio_checksum.c	27 Feb 2010 22:31:36 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zio_checksum.c	28 Jun 2017 00:04:54 -0000
@@ -19,15 +19,19 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
+#include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zil.h>
+#include <zfs_fletcher.h>
 
 /*
  * Checksum vectors.
@@ -57,28 +61,109 @@
  * checksum function of the appropriate strength.  When reading a block,
  * we compare the expected checksum against the actual checksum, which we
  * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
+ *
+ * SALTED CHECKSUMS
+ *
+ * To enable the use of less secure hash algorithms with dedup, we
+ * introduce the notion of salted checksums (MACs, really).  A salted
+ * checksum is fed both a random 256-bit value (the salt) and the data
+ * to be checksummed.  This salt is kept secret (stored on the pool, but
+ * never shown to the user).  Thus even if an attacker knew of collision
+ * weaknesses in the hash algorithm, they would not be able to mount a known
+ * plaintext attack on the DDT, since the actual hash value cannot be
+ * known ahead of time.  How the salt is used is algorithm-specific
+ * (some might simply prefix it to the data block, others might need to
+ * utilize a full-blown HMAC).  On disk the salt is stored in a ZAP
+ * object in the MOS (DMU_POOL_CHECKSUM_SALT).
+ *
+ * CONTEXT TEMPLATES
+ *
+ * Some hashing algorithms need to perform a substantial amount of
+ * initialization work (e.g. salted checksums above may need to pre-hash
+ * the salt) before being able to process data.  Performing this
+ * redundant work for each block would be wasteful, so we instead allow
+ * a checksum algorithm to do the work once (the first time it's used)
+ * and then keep this pre-initialized context as a template inside the
+ * spa_t (spa_cksum_tmpls).  If the zio_checksum_info_t contains
+ * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
+ * construct and destruct the pre-initialized checksum context.  The
+ * pre-initialized context is then reused during each checksum
+ * invocation and passed to the checksum function.
  */
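
Editor's note, not part of the patch: a sketch of the ci_tmpl_init/ci_tmpl_free
contract described above, for a hypothetical checksum "xhash" (xhash_ctx_t and
xhash_absorb() are invented for illustration; zio_cksum_salt_t carries the
pool's 256-bit salt):

typedef struct xhash_ctx {
	uint8_t	xc_presalted[32];	/* salt absorbed once, up front */
} xhash_ctx_t;

static void *
xhash_tmpl_init(const zio_cksum_salt_t *salt)
{
	xhash_ctx_t *ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);

	/* hypothetical helper: pre-hash the salt into the context */
	xhash_absorb(ctx->xc_presalted, salt->zcs_bytes,
	    sizeof (salt->zcs_bytes));
	return (ctx);
}

static void
xhash_tmpl_free(void *ctx_template)
{
	kmem_free(ctx_template, sizeof (xhash_ctx_t));
}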
 
 /*ARGSUSED*/
 static void
-zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
+zio_checksum_off(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 }
 
 zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
-	{{NULL,			NULL},			0, 0, 0, "inherit"},
-	{{NULL,			NULL},			0, 0, 0, "on"},
-	{{zio_checksum_off,	zio_checksum_off},	0, 0, 0, "off"},
-	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "label"},
-	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "gang_header"},
-	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1, 0, "zilog"},
-	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
-	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
-	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "sha256"},
-	{{fletcher_4_native,	fletcher_4_byteswap},	0, 1, 0, "zilog2"},
+	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
+	{{NULL, NULL}, NULL, NULL, 0, "on"},
+	{{zio_checksum_off,		zio_checksum_off},
+	    NULL, NULL, 0, "off"},
+	{{zio_checksum_SHA256,		zio_checksum_SHA256},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+	    "label"},
+	{{zio_checksum_SHA256,		zio_checksum_SHA256},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+	    "gang_header"},
+	{{fletcher_2_native,		fletcher_2_byteswap},
+	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
+	{{fletcher_2_native,		fletcher_2_byteswap},
+	    NULL, NULL, 0, "fletcher2"},
+	{{fletcher_4_native,		fletcher_4_byteswap},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
+	{{zio_checksum_SHA256,		zio_checksum_SHA256},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+	    ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
+	{{fletcher_4_native,		fletcher_4_byteswap},
+	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
+	{{zio_checksum_off,		zio_checksum_off},
+	    NULL, NULL, 0, "noparity"},
+#ifndef __NetBSD__
+	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+	    ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
+	{{zio_checksum_skein_native,	zio_checksum_skein_byteswap},
+	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
+	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
+#endif
+#ifdef illumos
+	{{zio_checksum_edonr_native,	zio_checksum_edonr_byteswap},
+	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
+	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
+	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
+#endif
 };
 
+/*
+ * The flag corresponding to the "verify" in dedup=[checksum,]verify
+ * must be cleared first, so callers should use ZIO_CHECKSUM_MASK.
+ */
+spa_feature_t
+zio_checksum_to_feature(enum zio_checksum cksum)
+{
+	VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
+
+	switch (cksum) {
+#ifndef __NetBSD__
+	case ZIO_CHECKSUM_SHA512:
+		return (SPA_FEATURE_SHA512);
+	case ZIO_CHECKSUM_SKEIN:
+		return (SPA_FEATURE_SKEIN);
+#endif
+#ifdef illumos
+	case ZIO_CHECKSUM_EDONR:
+		return (SPA_FEATURE_EDONR);
+#endif
+	}
+	return (SPA_FEATURE_NONE);
+}
+
 enum zio_checksum
 zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
 {
@@ -112,7 +197,8 @@ zio_checksum_dedup_select(spa_t *spa, en
 	if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
 		return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
 
-	ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup ||
+	ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
+	    ZCHECKSUM_FLAG_DEDUP) ||
 	    (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
 
 	return (child);
@@ -145,21 +231,48 @@ zio_checksum_label_verifier(zio_cksum_t 
 }
 
 /*
+ * Calls the template init function of a checksum which supports context
+ * templates and installs the template into the spa_t.
+ */
+static void
+zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
+{
+	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+	if (ci->ci_tmpl_init == NULL)
+		return;
+	if (spa->spa_cksum_tmpls[checksum] != NULL)
+		return;
+
+	VERIFY(ci->ci_tmpl_free != NULL);
+	mutex_enter(&spa->spa_cksum_tmpls_lock);
+	if (spa->spa_cksum_tmpls[checksum] == NULL) {
+		spa->spa_cksum_tmpls[checksum] =
+		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
+		VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
+	}
+	mutex_exit(&spa->spa_cksum_tmpls_lock);
+}
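
Editor's note, not part of the patch: the unlocked check followed by a
re-check under spa_cksum_tmpls_lock is the classic double-checked
initialization pattern. Reduced to a standalone userland sketch (generic
names; the kernel code additionally relies on aligned pointer stores being
atomic):

#include <pthread.h>
#include <stddef.h>

static void *slot;
static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

void *
get_once(void *(*init_once)(void))
{
	if (slot != NULL)		/* fast path: no lock once set */
		return (slot);
	pthread_mutex_lock(&slot_lock);
	if (slot == NULL)		/* recheck: we may have lost a race */
		slot = init_once();
	pthread_mutex_unlock(&slot_lock);
	return (slot);
}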
+
+/*
  * Generate the checksum.
  */
 void
 zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
-	void *data, uint64_t size)
+    void *data, uint64_t size)
 {
 	blkptr_t *bp = zio->io_bp;
 	uint64_t offset = zio->io_offset;
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 	zio_cksum_t cksum;
+	spa_t *spa = zio->io_spa;
 
 	ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT(ci->ci_func[0] != NULL);
 
-	if (ci->ci_eck) {
+	zio_checksum_template_init(checksum, spa);
+
+	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		zio_eck_t *eck;
 
 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
@@ -178,33 +291,31 @@ zio_checksum_compute(zio_t *zio, enum zi
 		else
 			bp->blk_cksum = eck->zec_cksum;
 		eck->zec_magic = ZEC_MAGIC;
-		ci->ci_func[0](data, size, &cksum);
+		ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+		    &cksum);
 		eck->zec_cksum = cksum;
 	} else {
-		ci->ci_func[0](data, size, &bp->blk_cksum);
+		ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+		    &bp->blk_cksum);
 	}
 }
 
 int
-zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
+zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
+    void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
 {
-	blkptr_t *bp = zio->io_bp;
-	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
-	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
-	int byteswap;
-	int error;
-	uint64_t size = (bp == NULL ? zio->io_size :
-	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
-	uint64_t offset = zio->io_offset;
-	void *data = zio->io_data;
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
-	zio_cksum_t actual_cksum, expected_cksum, verifier;
+	zio_cksum_t actual_cksum, expected_cksum;
+	int byteswap;
 
 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
-	if (ci->ci_eck) {
+	zio_checksum_template_init(checksum, spa);
+
+	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		zio_eck_t *eck;
+		zio_cksum_t verifier;
 
 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
 			zil_chain_t *zilc = data;
@@ -216,10 +327,10 @@ zio_checksum_error(zio_t *zio, zio_bad_c
 			else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
 				nused = BSWAP_64(zilc->zc_nused);
 			else
-				return (ECKSUM);
+				return (SET_ERROR(ECKSUM));
 
 			if (nused > size)
-				return (ECKSUM);
+				return (SET_ERROR(ECKSUM));
 
 			size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
 		} else {
@@ -240,35 +351,76 @@ zio_checksum_error(zio_t *zio, zio_bad_c
 
 		expected_cksum = eck->zec_cksum;
 		eck->zec_cksum = verifier;
-		ci->ci_func[byteswap](data, size, &actual_cksum);
+		ci->ci_func[byteswap](data, size,
+		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 		eck->zec_cksum = expected_cksum;
 
-		if (byteswap)
+		if (byteswap) {
 			byteswap_uint64_array(&expected_cksum,
 			    sizeof (zio_cksum_t));
+		}
 	} else {
-		ASSERT(!BP_IS_GANG(bp));
 		byteswap = BP_SHOULD_BYTESWAP(bp);
 		expected_cksum = bp->blk_cksum;
-		ci->ci_func[byteswap](data, size, &actual_cksum);
+		ci->ci_func[byteswap](data, size,
+		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 	}
 
-	info->zbc_expected = expected_cksum;
-	info->zbc_actual = actual_cksum;
-	info->zbc_checksum_name = ci->ci_name;
-	info->zbc_byteswapped = byteswap;
-	info->zbc_injected = 0;
-	info->zbc_has_cksum = 1;
+	if (info != NULL) {
+		info->zbc_expected = expected_cksum;
+		info->zbc_actual = actual_cksum;
+		info->zbc_checksum_name = ci->ci_name;
+		info->zbc_byteswapped = byteswap;
+		info->zbc_injected = 0;
+		info->zbc_has_cksum = 1;
+	}
 
 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
-		return (ECKSUM);
+		return (SET_ERROR(ECKSUM));
 
-	if (zio_injection_enabled && !zio->io_error &&
+	return (0);
+}
+
+int
+zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
+{
+	blkptr_t *bp = zio->io_bp;
+	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+	int error;
+	uint64_t size = (bp == NULL ? zio->io_size :
+	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
+	uint64_t offset = zio->io_offset;
+	void *data = zio->io_data;
+	spa_t *spa = zio->io_spa;
+
+	error = zio_checksum_error_impl(spa, bp, checksum, data, size,
+	    offset, info);
+	if (error != 0 && zio_injection_enabled && !zio->io_error &&
 	    (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
 
 		info->zbc_injected = 1;
 		return (error);
 	}
+	return (error);
+}
 
-	return (0);
+/*
+ * Called by a spa_t that's about to be deallocated. This steps through
+ * all of the checksum context templates and deallocates any that were
+ * initialized using the algorithm-specific template init function.
+ */
+void
+zio_checksum_templates_free(spa_t *spa)
+{
+	for (enum zio_checksum checksum = 0;
+	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
+		if (spa->spa_cksum_tmpls[checksum] != NULL) {
+			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+			VERIFY(ci->ci_tmpl_free != NULL);
+			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
+			spa->spa_cksum_tmpls[checksum] = NULL;
+		}
+	}
 }
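
Editor's note, not part of the patch: the refactor above splits
zio_checksum_error_impl() out of zio_checksum_error(), which lets a caller
verify a buffer against a block pointer without a zio in hand; passing NULL
for info skips the error-report fill-in. A hedged usage sketch (verify_buf()
is hypothetical):

static int
verify_buf(spa_t *spa, blkptr_t *bp, void *buf, uint64_t psize)
{
	/*
	 * NULL info: we only care whether the checksum matches.  The
	 * offset argument only matters for embedded (ZIL/label-style)
	 * checksums, so 0 is fine for ordinary block pointers.
	 */
	return (zio_checksum_error_impl(spa, bp, BP_GET_CHECKSUM(bp),
	    buf, psize, 0, NULL));
}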
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zio_compress.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zio_compress.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zio_compress.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zio_compress.c	27 Feb 2010 22:31:36 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zio_compress.c	30 Aug 2015 02:17:54 -0000
@@ -23,13 +23,41 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
 
 #include <sys/zfs_context.h>
 #include <sys/compress.h>
+#include <sys/kstat.h>
 #include <sys/spa.h>
+#include <sys/zfeature.h>
 #include <sys/zio.h>
 #include <sys/zio_compress.h>
 
+typedef struct zcomp_stats {
+	kstat_named_t zcompstat_attempts;
+	kstat_named_t zcompstat_empty;
+	kstat_named_t zcompstat_skipped_insufficient_gain;
+} zcomp_stats_t;
+
+static zcomp_stats_t zcomp_stats = {
+	{ "attempts",			KSTAT_DATA_UINT64 },
+	{ "empty",			KSTAT_DATA_UINT64 },
+	{ "skipped_insufficient_gain",	KSTAT_DATA_UINT64 }
+};
+
+#define	ZCOMPSTAT_INCR(stat, val) \
+	atomic_add_64(&zcomp_stats.stat.value.ui64, (val));
+
+#define	ZCOMPSTAT_BUMP(stat)		ZCOMPSTAT_INCR(stat, 1);
+
+kstat_t		*zcomp_ksp;
+
 /*
  * Compression vectors.
  */
@@ -50,34 +78,45 @@ zio_compress_info_t zio_compress_table[Z
 	{gzip_compress,		gzip_decompress,	8,	"gzip-8"},
 	{gzip_compress,		gzip_decompress,	9,	"gzip-9"},
 	{zle_compress,		zle_decompress,		64,	"zle"},
+	{lz4_compress,		lz4_decompress,		0,	"lz4"},
 };
 
 enum zio_compress
-zio_compress_select(enum zio_compress child, enum zio_compress parent)
+zio_compress_select(spa_t *spa, enum zio_compress child,
+    enum zio_compress parent)
 {
+	enum zio_compress result;
+
 	ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
 	ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
-	ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON);
-
-	if (child == ZIO_COMPRESS_INHERIT)
-		return (parent);
+	ASSERT(parent != ZIO_COMPRESS_INHERIT);
 
-	if (child == ZIO_COMPRESS_ON)
-		return (ZIO_COMPRESS_ON_VALUE);
+	result = child;
+	if (result == ZIO_COMPRESS_INHERIT)
+		result = parent;
+
+	if (result == ZIO_COMPRESS_ON) {
+		if (spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS))
+			result = ZIO_COMPRESS_LZ4_ON_VALUE;
+		else
+			result = ZIO_COMPRESS_LEGACY_ON_VALUE;
+	}
 
-	return (child);
+	return (result);
 }
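
Editor's note, not part of the patch: the selection now resolves in two
steps, first collapsing INHERIT to the parent's value and then mapping ON to
a concrete algorithm based on the LZ4 feature flag. A simplified standalone
mirror of that logic (enum names invented for the sketch):

#include <assert.h>
#include <stdbool.h>

enum comp { COMP_INHERIT, COMP_ON, COMP_LZJB, COMP_LZ4 };

static enum comp
comp_select(bool lz4_active, enum comp child, enum comp parent)
{
	enum comp result = (child == COMP_INHERIT) ? parent : child;

	/* "on" maps to lz4 when the pool feature is active, else lzjb */
	if (result == COMP_ON)
		result = lz4_active ? COMP_LZ4 : COMP_LZJB;
	return (result);
}

int
main(void)
{
	/* child inherits "on" from the parent; the feature picks the codec */
	assert(comp_select(true, COMP_INHERIT, COMP_ON) == COMP_LZ4);
	assert(comp_select(false, COMP_INHERIT, COMP_ON) == COMP_LZJB);
	return (0);
}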
 
 size_t
 zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
 {
 	uint64_t *word, *word_end;
-	size_t c_len, d_len, r_len;
+	size_t c_len, d_len;
 	zio_compress_info_t *ci = &zio_compress_table[c];
 
 	ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
 	ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
 
+	ZCOMPSTAT_BUMP(zcompstat_attempts);
+
 	/*
 	 * If the data is all zeroes, we don't even need to allocate
 	 * a block for it.  We indicate this by returning zero size.
@@ -87,35 +126,24 @@ zio_compress_data(enum zio_compress c, v
 		if (*word != 0)
 			break;
 
-	if (word == word_end)
-		return (0);
+	if (word == word_end) {
+		ZCOMPSTAT_BUMP(zcompstat_empty);
+		return (0);
+	}
 
 	if (c == ZIO_COMPRESS_EMPTY)
 		return (s_len);
 
 	/* Compress at least 12.5% */
-	d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE);
-	if (d_len == 0)
-		return (s_len);
-
+	d_len = s_len - (s_len >> 3);
 	c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
 
-	if (c_len > d_len)
+	if (c_len > d_len) {
+		ZCOMPSTAT_BUMP(zcompstat_skipped_insufficient_gain);
 		return (s_len);
-
-	/*
-	 * Cool.  We compressed at least as much as we were hoping to.
-	 * For both security and repeatability, pad out the last sector.
-	 */
-	r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE);
-	if (r_len > c_len) {
-		bzero((char *)dst + c_len, r_len - c_len);
-		c_len = r_len;
 	}
 
 	ASSERT3U(c_len, <=, d_len);
-	ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0);
-
 	return (c_len);
 }
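
Editor's note, not part of the patch: the patch also stops sector-padding the
compressed result (the removed P2ROUNDUP/bzero block) and reduces the
"compress at least 12.5%" bound to plain arithmetic. Concretely:

#include <assert.h>
#include <stddef.h>

int
main(void)
{
	size_t s_len = 131072;			/* 128K source buffer */
	size_t d_len = s_len - (s_len >> 3);	/* must shave off >= 1/8 */

	/* the compressed block must fit in 112K or be stored as-is */
	assert(d_len == 114688);
	return (0);
}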
 
@@ -126,7 +154,30 @@ zio_decompress_data(enum zio_compress c,
 	zio_compress_info_t *ci = &zio_compress_table[c];
 
 	if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
 }
+
+void
+zio_compress_init(void)
+{
+
+	zcomp_ksp = kstat_create("zfs", 0, "zcompstats", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (zcomp_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (zcomp_ksp != NULL) {
+		zcomp_ksp->ks_data = &zcomp_stats;
+		kstat_install(zcomp_ksp);
+	}
+}
+
+void
+zio_compress_fini(void)
+{
+	if (zcomp_ksp != NULL) {
+		kstat_delete(zcomp_ksp);
+		zcomp_ksp = NULL;
+	}
+}
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zio_inject.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zio_inject.c,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zio_inject.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zio_inject.c	27 Feb 2010 22:31:36 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zio_inject.c	27 Mar 2016 02:52:21 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 /*
@@ -49,22 +49,60 @@
 
 uint32_t zio_injection_enabled;
 
+/*
+ * Data describing each zinject handler registered on the system; it also
+ * contains the list node linking the handler into the global zinject
+ * handler list.
+ */
 typedef struct inject_handler {
 	int			zi_id;
 	spa_t			*zi_spa;
 	zinject_record_t	zi_record;
+	uint64_t		*zi_lanes;
+	int			zi_next_lane;
 	list_node_t		zi_link;
 } inject_handler_t;
 
+/*
+ * List of all zinject handlers registered on the system, protected by
+ * the inject_lock defined below.
+ */
 static list_t inject_handlers;
+
+/*
+ * This protects insertion into, and traversal of, the inject handler
+ * list defined above, as well as the inject_delay_count. Any time a
+ * handler is inserted or removed from the list, this lock should be
+ * taken as a RW_WRITER; and any time traversal is done over the list
+ * (without modification to it) this lock should be taken as a RW_READER.
+ */
 static krwlock_t inject_lock;
+
+/*
+ * This holds the number of zinject delay handlers that have been
+ * registered on the system. It is protected by the inject_lock defined
+ * above. Thus modifications to this count must be made while holding the
+ * inject_lock as RW_WRITER, and reads of this count must hold it as (at
+ * least) RW_READER.
+ */
+static int inject_delay_count = 0;
+
+/*
+ * This lock is used only in zio_handle_io_delay(), refer to the comment
+ * in that function for more details.
+ */
+static kmutex_t inject_delay_mtx;
+
+/*
+ * Used to assign unique identifying numbers to each new zinject handler.
+ */
 static int inject_next_id = 1;
 
 /*
  * Returns true if the given record matches the I/O in progress.
  */
 static boolean_t
-zio_match_handler(zbookmark_t *zb, uint64_t type,
+zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
     zinject_record_t *record, int error)
 {
 	/*
@@ -148,14 +186,8 @@ zio_handle_fault_injection(zio_t *zio, i
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
-		/* Ignore errors not destined for this pool */
-		if (zio->io_spa != handler->zi_spa)
-			continue;
-
-		/* Ignore device errors and panic injection */
-		if (handler->zi_record.zi_guid != 0 ||
-		    handler->zi_record.zi_func[0] != '\0' ||
-		    handler->zi_record.zi_duration != 0)
+		if (zio->io_spa != handler->zi_spa ||
+		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
 			continue;
 
 		/* If this handler matches, return EIO */
@@ -198,10 +230,7 @@ zio_handle_label_injection(zio_t *zio, i
 		uint64_t start = handler->zi_record.zi_start;
 		uint64_t end = handler->zi_record.zi_end;
 
-		/* Ignore device only faults or panic injection */
-		if (handler->zi_record.zi_start == 0 ||
-		    handler->zi_record.zi_func[0] != '\0' ||
-		    handler->zi_record.zi_duration != 0)
+		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
 			continue;
 
 		/*
@@ -239,7 +268,7 @@ zio_handle_device_injection(vdev_t *vd, 
 
 		if (offset < VDEV_LABEL_START_SIZE ||
 		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
-		return (0);
+			return (0);
 	}
 
 	rw_enter(&inject_lock, RW_READER);
@@ -247,13 +276,7 @@ zio_handle_device_injection(vdev_t *vd, 
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
-		/*
-		 * Ignore label specific faults, panic injection
-		 * or fake writes
-		 */
-		if (handler->zi_record.zi_start != 0 ||
-		    handler->zi_record.zi_func[0] != '\0' ||
-		    handler->zi_record.zi_duration != 0)
+		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
 			continue;
 
 		if (vd->vdev_guid == handler->zi_record.zi_guid) {
@@ -277,11 +300,21 @@ zio_handle_device_injection(vdev_t *vd, 
 				if (error == ENXIO)
 					vd->vdev_stat.vs_aux =
 					    VDEV_AUX_OPEN_FAILED;
+
+				/*
+				 * Treat these errors as if they had been
+				 * retried so that all the appropriate stats
+				 * and FMA events are generated.
+				 */
+				if (!handler->zi_record.zi_failfast &&
+				    zio != NULL)
+					zio->io_flags |= ZIO_FLAG_IO_RETRY;
+
 				ret = error;
 				break;
 			}
 			if (handler->zi_record.zi_error == ENXIO) {
-				ret = EIO;
+				ret = SET_ERROR(EIO);
 				break;
 			}
 		}
@@ -307,10 +340,8 @@ zio_handle_ignored_writes(zio_t *zio)
 	    handler = list_next(&inject_handlers, handler)) {
 
 		/* Ignore errors not destined for this pool */
-		if (zio->io_spa != handler->zi_spa)
-			continue;
-
-		if (handler->zi_record.zi_duration == 0)
+		if (zio->io_spa != handler->zi_spa ||
+		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
 			continue;
 
 		/*
@@ -346,11 +377,8 @@ spa_handle_ignored_writes(spa_t *spa)
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
-		/* Ignore errors not destined for this pool */
-		if (spa != handler->zi_spa)
-			continue;
-
-		if (handler->zi_record.zi_duration == 0)
+		if (spa != handler->zi_spa ||
+		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
 			continue;
 
 		if (handler->zi_record.zi_duration > 0) {
@@ -370,6 +398,166 @@ spa_handle_ignored_writes(spa_t *spa)
 	rw_exit(&inject_lock);
 }
 
+hrtime_t
+zio_handle_io_delay(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	inject_handler_t *min_handler = NULL;
+	hrtime_t min_target = 0;
+
+	rw_enter(&inject_lock, RW_READER);
+
+	/*
+	 * inject_delay_count is a subset of zio_injection_enabled that
+	 * is only incremented for delay handlers. These checks are
+	 * mainly added to remind the reader why we're not explicitly
+	 * checking zio_injection_enabled like the other functions.
+	 */
+	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
+	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
+
+	/*
+	 * If there aren't any inject delay handlers registered, then we
+	 * can short circuit and simply return 0 here. A value of zero
+	 * informs zio_delay_interrupt() that this request should not be
+	 * delayed. This short circuit keeps us from acquiring the
+	 * inject_delay_mutex unnecessarily.
+	 */
+	if (inject_delay_count == 0) {
+		rw_exit(&inject_lock);
+		return (0);
+	}
+
+	/*
+	 * Each inject handler has a number of "lanes" associated with
+	 * it. Each lane is able to handle requests independently of one
+	 * another, and at a latency defined by the inject handler
+	 * record's zi_timer field. Thus if a handler is configured with
+	 * a single lane with a 10ms latency, it will delay requests
+	 * such that only a single request is completed every 10ms. So,
+	 * if more than one request is attempted in each 10ms interval,
+	 * the average latency of the requests will be greater than
+	 * 10ms; but if only a single request is submitted each 10ms
+	 * interval the average latency will be 10ms.
+	 *
+	 * We need to acquire this mutex to prevent multiple concurrent
+	 * threads being assigned to the same lane of a given inject
+	 * handler. The mutex allows us to perform the following two
+	 * operations atomically:
+	 *
+	 *	1. determine the minimum handler and minimum target
+	 *	   value of all the possible handlers
+	 *	2. update that minimum handler's lane array
+	 *
+	 * Without atomicity, two (or more) threads could pick the same
+	 * lane in step (1), and then conflict with each other in step
+	 * (2). This could allow a single lane handler to process
+	 * multiple requests simultaneously, which shouldn't be possible.
+	 */
+	mutex_enter(&inject_delay_mtx);
+
+	for (inject_handler_t *handler = list_head(&inject_handlers);
+	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
+		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
+			continue;
+
+		if (vd->vdev_guid != handler->zi_record.zi_guid)
+			continue;
+
+		/*
+		 * Defensive; should never happen as the array allocation
+		 * occurs prior to inserting this handler on the list.
+		 */
+		ASSERT3P(handler->zi_lanes, !=, NULL);
+
+		/*
+		 * This should never happen, the zinject command should
+		 * prevent a user from setting an IO delay with zero lanes.
+		 */
+		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
+
+		ASSERT3U(handler->zi_record.zi_nlanes, >,
+		    handler->zi_next_lane);
+
+		/*
+		 * We want to issue this IO to the lane that will become
+		 * idle the soonest, so we compare the soonest this
+		 * specific handler can complete the IO with all other
+		 * handlers, to find the lowest value of all possible
+		 * lanes. We then use this lane to submit the request.
+		 *
+		 * Since each handler has a constant value for its
+		 * delay, we can just use the "next" lane for that
+		 * handler; as it will always be the lane with the
+		 * lowest value for that particular handler (i.e. the
+		 * lane that will become idle the soonest). This saves a
+		 * scan of each handler's lanes array.
+		 *
+		 * There are two cases to consider when determining when
+		 * this specific IO request should complete. If this
+		 * lane is idle, we want to "submit" the request now so
+		 * it will complete after zi_timer milliseconds. Thus,
+		 * we set the target to now + zi_timer.
+		 *
+		 * If the lane is busy, we want this request to complete
+		 * zi_timer milliseconds after the lane becomes idle.
+		 * Since the 'zi_lanes' array holds the time at which
+		 * each lane will become idle, we use that value to
+		 * determine when this request should complete.
+		 */
+		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
+		hrtime_t busy = handler->zi_record.zi_timer +
+		    handler->zi_lanes[handler->zi_next_lane];
+		hrtime_t target = MAX(idle, busy);
+
+		if (min_handler == NULL) {
+			min_handler = handler;
+			min_target = target;
+			continue;
+		}
+
+		ASSERT3P(min_handler, !=, NULL);
+		ASSERT3U(min_target, !=, 0);
+
+		/*
+		 * We don't yet increment the "next lane" variable since
+		 * we still might find a lower value lane in another
+		 * handler during any remaining iterations. Once we're
+		 * sure we've selected the absolute minimum, we'll claim
+		 * the lane and increment the handler's "next lane"
+		 * field below.
+		 */
+
+		if (target < min_target) {
+			min_handler = handler;
+			min_target = target;
+		}
+	}
+
+	/*
+	 * 'min_handler' will be NULL if no IO delays are registered for
+	 * this vdev, otherwise it will point to the handler containing
+	 * the lane that will become idle the soonest.
+	 */
+	if (min_handler != NULL) {
+		ASSERT3U(min_target, !=, 0);
+		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
+
+		/*
+		 * If we've used all possible lanes for this handler,
+		 * loop back and start using the first lane again;
+		 * otherwise, just increment the lane index.
+		 */
+		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
+		    min_handler->zi_record.zi_nlanes;
+	}
+
+	mutex_exit(&inject_delay_mtx);
+	rw_exit(&inject_lock);
+
+	return (min_target);
+}
+
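Editor's note, not part of the patch: the idle/busy target computation above,
traced for one handler with a single lane and a 10ms zi_timer. Two requests
issued back to back at t=0 get completion targets of 10ms and 20ms, i.e. the
lane drains one request per 10ms:

#include <assert.h>
#include <stdint.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	int64_t timer = 10;	/* zi_timer (ms, for the example) */
	int64_t lane = 0;	/* time at which the lane goes idle */
	int64_t now = 0;

	int64_t t1 = MAX(now + timer, lane + timer);	/* idle vs. busy */
	lane = t1;		/* first request claims the lane */
	int64_t t2 = MAX(now + timer, lane + timer);

	assert(t1 == 10 && t2 == 20);
	return (0);
}
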
 /*
  * Create a new handler for the given record.  We add it to the list, adding
  * a reference to the spa_t in the process.  We increment zio_injection_enabled,
@@ -391,6 +579,24 @@ zio_inject_fault(char *name, int flags, 
 		if ((error = spa_reset(name)) != 0)
 			return (error);
 
+	if (record->zi_cmd == ZINJECT_DELAY_IO) {
+		/*
+		 * A value of zero for the number of lanes or for the
+		 * delay time doesn't make sense.
+		 */
+		if (record->zi_timer == 0 || record->zi_nlanes == 0)
+			return (SET_ERROR(EINVAL));
+
+		/*
+		 * The number of lanes is directly mapped to the size of
+		 * an array used by the handler. Thus, to ensure the
+		 * user doesn't trigger an allocation that's "too large"
+		 * we cap the number of lanes here.
+		 */
+		if (record->zi_nlanes >= UINT16_MAX)
+			return (SET_ERROR(EINVAL));
+	}
+
 	if (!(flags & ZINJECT_NULL)) {
 		/*
 		 * spa_inject_ref() will add an injection reference, which will
@@ -398,17 +604,40 @@ zio_inject_fault(char *name, int flags, 
 		 * still allowing it to be unloaded.
 		 */
 		if ((spa = spa_inject_addref(name)) == NULL)
-			return (ENOENT);
+			return (SET_ERROR(ENOENT));
 
 		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
 
+		handler->zi_spa = spa;
+		handler->zi_record = *record;
+
+		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+			handler->zi_lanes = kmem_zalloc(
+			    sizeof (*handler->zi_lanes) *
+			    handler->zi_record.zi_nlanes, KM_SLEEP);
+			handler->zi_next_lane = 0;
+		} else {
+			handler->zi_lanes = NULL;
+			handler->zi_next_lane = 0;
+		}
+
 		rw_enter(&inject_lock, RW_WRITER);
 
+		/*
+		 * We can't move this increment into the conditional
+		 * above because we need to hold the RW_WRITER lock of
+		 * inject_lock, and we don't want to hold that while
+		 * allocating the handler's zi_lanes array.
+		 */
+		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+			ASSERT3S(inject_delay_count, >=, 0);
+			inject_delay_count++;
+			ASSERT3S(inject_delay_count, >, 0);
+		}
+
 		*id = handler->zi_id = inject_next_id++;
-		handler->zi_spa = spa;
-		handler->zi_record = *record;
 		list_insert_tail(&inject_handlers, handler);
-		atomic_add_32(&zio_injection_enabled, 1);
+		atomic_inc_32(&zio_injection_enabled);
 
 		rw_exit(&inject_lock);
 	}
@@ -420,7 +649,11 @@ zio_inject_fault(char *name, int flags, 
 	 * fault injection isn't a performance critical path.
 	 */
 	if (flags & ZINJECT_FLUSH_ARC)
-		arc_flush(NULL);
+		/*
+		 * We must use FALSE to ensure arc_flush returns, since
+		 * we're not preventing concurrent ARC insertions.
+		 */
+		arc_flush(NULL, FALSE);
 
 	return (0);
 }
@@ -450,7 +683,7 @@ zio_inject_list_next(int *id, char *name
 		(void) strncpy(name, spa_name(handler->zi_spa), buflen);
 		ret = 0;
 	} else {
-		ret = ENOENT;
+		ret = SET_ERROR(ENOENT);
 	}
 
 	rw_exit(&inject_lock);
@@ -467,7 +700,6 @@ int
 zio_clear_fault(int id)
 {
 	inject_handler_t *handler;
-	int ret;
 
 	rw_enter(&inject_lock, RW_WRITER);
 
@@ -477,24 +709,39 @@ zio_clear_fault(int id)
 			break;
 
 	if (handler == NULL) {
-		ret = ENOENT;
-	} else {
-		list_remove(&inject_handlers, handler);
-		spa_inject_delref(handler->zi_spa);
-		kmem_free(handler, sizeof (inject_handler_t));
-		atomic_add_32(&zio_injection_enabled, -1);
-		ret = 0;
+		rw_exit(&inject_lock);
+		return (SET_ERROR(ENOENT));
+	}
+
+	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+		ASSERT3S(inject_delay_count, >, 0);
+		inject_delay_count--;
+		ASSERT3S(inject_delay_count, >=, 0);
 	}
 
+	list_remove(&inject_handlers, handler);
 	rw_exit(&inject_lock);
 
-	return (ret);
+	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+		ASSERT3P(handler->zi_lanes, !=, NULL);
+		kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
+		    handler->zi_record.zi_nlanes);
+	} else {
+		ASSERT3P(handler->zi_lanes, ==, NULL);
+	}
+
+	spa_inject_delref(handler->zi_spa);
+	kmem_free(handler, sizeof (inject_handler_t));
+	atomic_dec_32(&zio_injection_enabled);
+
+	return (0);
 }
 
 void
 zio_inject_init(void)
 {
 	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
+	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&inject_handlers, sizeof (inject_handler_t),
 	    offsetof(inject_handler_t, zi_link));
 }
@@ -503,5 +750,6 @@ void
 zio_inject_fini(void)
 {
 	list_destroy(&inject_handlers);
+	mutex_destroy(&inject_delay_mtx);
 	rw_destroy(&inject_lock);
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zrlock.c
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/zrlock.c
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/zrlock.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zrlock.c	3 Dec 2016 17:03:49 -0000
@@ -0,0 +1,187 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 The MathWorks, Inc. All rights reserved.
+ */
+
+/*
+ * A Zero Reference Lock (ZRL) is a reference count that can lock out new
+ * references only when the count is zero and only without waiting if the count
+ * is not already zero. It is similar to a read-write lock in that it allows
+ * multiple readers and only a single writer, but it does not allow a writer to
+ * block while waiting for readers to exit, and therefore the question of
+ * reader/writer priority is moot (no WRWANT bit). Since the equivalent of
+ * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it
+ * is perfectly safe for the same reader to acquire the same lock multiple
+ * times. The fact that a ZRL is reentrant for readers (through multiple calls
+ * to zrl_add()) makes it convenient for determining whether something is
+ * actively referenced without the fuss of flagging lock ownership across
+ * function calls.
+ */
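+
+Editor's note, not part of the patch: the intended usage shape, with a
+hypothetical object embedding a zrlock_t (zrl_add_impl() is normally reached
+through a zrl_add() wrapper that passes the caller's name):
+
+	struct obj {
+		zrlock_t	o_lock;
+		/* ... */
+	};
+
+	static void
+	use_obj(struct obj *o)
+	{
+		zrl_add_impl(&o->o_lock, __func__);	/* reader: take a ref */
+		/* ... o cannot be locked out from under us here ... */
+		zrl_remove(&o->o_lock);
+	}
+
+	static int
+	try_retire(struct obj *o)
+	{
+		if (!zrl_tryenter(&o->o_lock))	/* fails while refs are held */
+			return (0);
+		/* ... no new references can be taken here ... */
+		zrl_exit(&o->o_lock);
+		return (1);
+	}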
+#include <sys/zrlock.h>
+
+/*
+ * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
+ * treated as zero references.
+ */
+#define	ZRL_LOCKED	-1
+#define	ZRL_DESTROYED	-2
+
+void
+zrl_init(zrlock_t *zrl)
+{
+	mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL);
+	zrl->zr_refcount = 0;
+	cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL);
+#ifdef	ZFS_DEBUG
+	zrl->zr_owner = NULL;
+	zrl->zr_caller = NULL;
+#endif
+}
+
+void
+zrl_destroy(zrlock_t *zrl)
+{
+	ASSERT0(zrl->zr_refcount);
+
+	mutex_destroy(&zrl->zr_mtx);
+	zrl->zr_refcount = ZRL_DESTROYED;
+	cv_destroy(&zrl->zr_cv);
+}
+
+void
+zrl_add_impl(zrlock_t *zrl, const char *zc)
+{
+	for (;;) {
+		uint32_t n = (uint32_t)zrl->zr_refcount;
+		while (n != ZRL_LOCKED) {
+			uint32_t cas = atomic_cas_32(
+			    (uint32_t *)&zrl->zr_refcount, n, n + 1);
+			if (cas == n) {
+				ASSERT3S((int32_t)n, >=, 0);
+#ifdef	ZFS_DEBUG
+				if (zrl->zr_owner == curthread) {
+					DTRACE_PROBE2(zrlock__reentry,
+					    zrlock_t *, zrl, uint32_t, n);
+				}
+				zrl->zr_owner = curthread;
+				zrl->zr_caller = zc;
+#endif
+				return;
+			}
+			n = cas;
+		}
+
+		mutex_enter(&zrl->zr_mtx);
+		while (zrl->zr_refcount == ZRL_LOCKED) {
+			cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+		}
+		mutex_exit(&zrl->zr_mtx);
+	}
+}
+
+void
+zrl_remove(zrlock_t *zrl)
+{
+	uint32_t n;
+
+#ifdef	ZFS_DEBUG
+	if (zrl->zr_owner == curthread) {
+		zrl->zr_owner = NULL;
+		zrl->zr_caller = NULL;
+	}
+#endif
+	n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+	ASSERT3S((int32_t)n, >=, 0);
+}
+
+int
+zrl_tryenter(zrlock_t *zrl)
+{
+	uint32_t n = (uint32_t)zrl->zr_refcount;
+
+	if (n == 0) {
+		uint32_t cas = atomic_cas_32(
+		    (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
+		if (cas == 0) {
+#ifdef	ZFS_DEBUG
+			ASSERT3P(zrl->zr_owner, ==, NULL);
+			zrl->zr_owner = curthread;
+#endif
+			return (1);
+		}
+	}
+
+	ASSERT3S((int32_t)n, >, ZRL_DESTROYED);
+
+	return (0);
+}
+
+void
+zrl_exit(zrlock_t *zrl)
+{
+	ASSERT3S(zrl->zr_refcount, ==, ZRL_LOCKED);
+
+	mutex_enter(&zrl->zr_mtx);
+#ifdef	ZFS_DEBUG
+	ASSERT3P(zrl->zr_owner, ==, curthread);
+	zrl->zr_owner = NULL;
+	membar_producer();	/* make sure the owner store happens first */
+#endif
+	zrl->zr_refcount = 0;
+	cv_broadcast(&zrl->zr_cv);
+	mutex_exit(&zrl->zr_mtx);
+}
+
+int
+zrl_refcount(zrlock_t *zrl)
+{
+	ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
+
+	int n = (int)zrl->zr_refcount;
+	return (n <= 0 ? 0 : n);
+}
+
+int
+zrl_is_zero(zrlock_t *zrl)
+{
+	ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
+
+	return (zrl->zr_refcount <= 0);
+}
+
+int
+zrl_is_locked(zrlock_t *zrl)
+{
+	ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
+
+	return (zrl->zr_refcount == ZRL_LOCKED);
+}
+
+#ifdef	ZFS_DEBUG
+kthread_t *
+zrl_owner(zrlock_t *zrl)
+{
+	return (zrl->zr_owner);
+}
+#endif
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zvol.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zvol.c,v
retrieving revision 1.7
diff -u -p -r1.7 zvol.c
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/zvol.c	29 Feb 2016 16:19:20 -0000	1.7
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zvol.c	28 Jun 2017 00:31:17 -0000
@@ -19,10 +19,21 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions Copyright 2010 Robert Milkowski
+ *
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
+/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
+
 /*
  * ZFS volume emulation driver.
  *
@@ -35,14 +46,18 @@
  * These links are created by the /dev filesystem (sdev_zvolops.c).
  * Volumes are persistent through reboot.  No user command needs to be
  * run before opening and using a device.
+ *
+ * FreeBSD notes.
+ * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
+ * in the system.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
+#include <sys/kernel.h>
 #include <sys/errno.h>
 #include <sys/uio.h>
 #include <sys/buf.h>
-#include <sys/modctl.h>
 #include <sys/open.h>
 #include <sys/kmem.h>
 #include <sys/conf.h>
@@ -50,59 +65,96 @@
 #include <sys/stat.h>
 #include <sys/zap.h>
 #include <sys/spa.h>
+#include <sys/spa_impl.h>
 #include <sys/zio.h>
+#include <sys/disk.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dnode.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dkio.h>
-#include <sys/efi_partition.h>
 #include <sys/byteorder.h>
-#include <sys/pathname.h>
-#include <sys/ddi.h>
 #include <sys/sunddi.h>
-#include <sys/crc32.h>
 #include <sys/dirent.h>
 #include <sys/policy.h>
+#include <sys/queue.h>
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ioctl.h>
-#include <sys/mkdev.h>
 #include <sys/zil.h>
 #include <sys/refcount.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_rlock.h>
-#include <sys/vdev_disk.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
 #include <sys/zvol.h>
-#include <sys/disk.h>
-#include <sys/dkio.h>
-#include <sys/disklabel.h>
-
-#ifdef __NetBSD__
-#include <prop/proplib.h>
-#endif
 #include <sys/zil_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
+#include <sys/filio.h>
 
 #include "zfs_namecheck.h"
 
-static void *zvol_state;
-static char *zvol_tag = "zvol_tag";
+#ifdef __FreeBSD__
+#include <sys/bio.h>
+#include <geom/geom.h>
+
+struct g_class zfs_zvol_class = {
+	.name = "ZFS::ZVOL",
+	.version = G_VERSION,
+};
 
-#define	ZVOL_DUMPSIZE		"dumpsize"
+DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
+#endif
+
+#ifdef __NetBSD__
+#include <sys/pathname.h>
+#include <prop/proplib.h>
+
+#define	DROP_GIANT()	/* nothing */
+#define	PICKUP_GIANT()	/* nothing */
 
 void	zvol_minphys(struct buf *);
+static struct dkdriver zvol_dkdriver = { zvol_strategy, zvol_minphys };
+
+#define	bioerror(bp, er)	((bp)->b_error = (er))
+#define	b_edev			b_dev
+#endif
 
-static struct	dkdriver zvol_dkdriver = { zvol_strategy, zvol_minphys };
+void *zfsdev_state;
+static char *zvol_tag = "zvol_tag";
+
+#define	ZVOL_DUMPSIZE		"dumpsize"
 
+#ifdef __FreeBSD__
+/*
+ * In FreeBSD we've replaced the upstream zfsdev_state_lock with the
+ * spa_namespace_lock in the ZVOL code.
+ */
+#define zfsdev_state_lock spa_namespace_lock
+#else
 /*
- * This lock protects the zvol_state structure from being modified
+ * This lock protects the zfsdev_state structure from being modified
  * while it's being used, e.g. an open that comes in before a create
  * finishes.  It also protects temporary opens of the dataset so that,
  * e.g., an open doesn't get a spurious EBUSY.
  */
-static kmutex_t zvol_state_lock;
+kmutex_t zfsdev_state_lock;
+#endif
 static uint32_t zvol_minors;
 
+#ifndef illumos
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
+static int	volmode = ZFS_VOLMODE_GEOM;
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0,
+    "Expose as GEOM providers (1), device files (2) or neither");
+static boolean_t zpool_on_zvol = B_FALSE;
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
+    "Allow zpools to use zvols as vdevs (DANGEROUS)");
+
+#endif
 typedef struct zvol_extent {
 	list_node_t	ze_node;
 	dva_t		ze_dva;		/* dva associated with this extent */
@@ -113,24 +165,45 @@ typedef struct zvol_extent {
  * The in-core state of each volume.
  */
 typedef struct zvol_state {
+#ifndef illumos
+	LIST_ENTRY(zvol_state)	zv_links;
+#endif
 	char		zv_name[MAXPATHLEN]; /* pool/dd name */
 	uint64_t	zv_volsize;	/* amount of space we advertise */
 	uint64_t	zv_volblocksize; /* volume block size */
+#ifdef __FreeBSD__
+	struct cdev	*zv_dev;	/* non-GEOM device */
+	struct g_provider *zv_provider;	/* GEOM provider */
+#else
 	minor_t		zv_minor;	/* minor number */
+#endif
 	uint8_t		zv_min_bs;	/* minimum addressable block shift */
 	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
 	objset_t	*zv_objset;	/* objset handle */
+#if defined(illumos) || defined(__NetBSD__)
 	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
+#endif
 	uint32_t	zv_total_opens;	/* total open count */
+	uint32_t	zv_sync_cnt;	/* synchronous open count */
 	zilog_t		*zv_zilog;	/* ZIL handle */
 	list_t		zv_extents;	/* List of extents for dump */
 	znode_t		zv_znode;	/* for range locking */
+	dmu_buf_t	*zv_dbuf;	/* bonus handle */
+#ifdef __FreeBSD__
+	int		zv_state;
+	int		zv_volmode;	/* Provide GEOM or cdev */
+	struct bio_queue_head zv_queue;
+	struct mtx	zv_queue_mtx;	/* zv_queue mutex */
+#endif
 #ifdef __NetBSD__
 	struct disk	zv_dk;		/* disk statistics */
 	kmutex_t	zv_dklock;	/* disk statistics */
 #endif
 } zvol_state_t;
 
+#ifndef illumos
+static LIST_HEAD(, zvol_state) all_zvols;
+#endif
 /*
  * zvol specific flags
  */
@@ -144,8 +217,59 @@ typedef struct zvol_state {
  */
 int zvol_maxphys = DMU_MAX_ACCESS/2;
 
+/*
+ * Toggle unmap functionality.
+ */
+boolean_t zvol_unmap_enabled = B_TRUE;
+
+/*
+ * If true, unmaps requested as synchronous are executed synchronously,
+ * otherwise all unmaps are asynchronous.
+ */
+boolean_t zvol_unmap_sync_enabled = B_FALSE;
+
+#ifdef __FreeBSD__
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
+    &zvol_unmap_enabled, 0,
+    "Enable UNMAP functionality");
+
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_sync_enabled, CTLFLAG_RWTUN,
+    &zvol_unmap_sync_enabled, 0,
+    "UNMAPs requested as sync are executed synchronously");
+
+static d_open_t		zvol_d_open;
+static d_close_t	zvol_d_close;
+static d_read_t		zvol_read;
+static d_write_t	zvol_write;
+static d_ioctl_t	zvol_d_ioctl;
+static d_strategy_t	zvol_strategy;
+
+static struct cdevsw zvol_cdevsw = {
+	.d_version =	D_VERSION,
+	.d_open =	zvol_d_open,
+	.d_close =	zvol_d_close,
+	.d_read =	zvol_read,
+	.d_write =	zvol_write,
+	.d_ioctl =	zvol_d_ioctl,
+	.d_strategy =	zvol_strategy,
+	.d_name =	"zvol",
+	.d_flags =	D_DISK | D_TRACKCLOSE,
+};
+
+static void zvol_geom_run(zvol_state_t *zv);
+static void zvol_geom_destroy(zvol_state_t *zv);
+static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
+static void zvol_geom_start(struct bio *bp);
+static void zvol_geom_worker(void *arg);
+static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
+    uint64_t len, boolean_t sync);
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
+/* XXXNETBSD need devsw, etc */
+#endif
+
 extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
-    nvlist_t *, nvlist_t **);
+    nvlist_t *, nvlist_t *);
 static int zvol_remove_zv(zvol_state_t *);
 static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
 static int zvol_dumpify(zvol_state_t *zv);
@@ -153,16 +277,52 @@ static int zvol_dump_fini(zvol_state_t *
 static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
 
 static void
-zvol_size_changed(zvol_state_t *zv)
+zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
 {
+#ifdef illumos
+	dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);
+
+	zv->zv_volsize = volsize;
+	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+	    "Size", volsize) == DDI_SUCCESS);
+	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
+	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
+
+	/* Notify specfs to invalidate the cached size */
+	spec_size_invalidate(dev, VBLK);
+	spec_size_invalidate(dev, VCHR);
+#endif /* illumos */
+#ifdef __FreeBSD__
+	zv->zv_volsize = volsize;
+	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+		struct g_provider *pp;
+
+		pp = zv->zv_provider;
+		if (pp == NULL)
+			return;
+		g_topology_lock();
+
+		/*
+		 * Do not invoke resize event when initial size was zero.
+		 * ZVOL initializes the size on first open, this is not
+		 * real resizing.
+		 */
+		if (pp->mediasize == 0)
+			pp->mediasize = zv->zv_volsize;
+		else
+			g_resize_provider(pp, zv->zv_volsize);
+		g_topology_unlock();
+	}
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
 	prop_dictionary_t disk_info, odisk_info, geom;
 	struct disk *disk;
 
 	disk = &zv->zv_dk;
-	
+
 	disk_info = prop_dictionary_create();
 	geom = prop_dictionary_create();
-	
+
 	prop_dictionary_set_cstring_nocopy(disk_info, "type", "ESDI");
 	prop_dictionary_set_uint64(geom, "sectors-per-unit", zv->zv_volsize);
 	prop_dictionary_set_uint32(geom, "sector-size",
@@ -178,22 +338,22 @@ zvol_size_changed(zvol_state_t *zv)
 
 	if (odisk_info != NULL)
 		prop_object_release(odisk_info);
+#endif
 }
 
+
 int
 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
 {
 	if (volsize == 0)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	if (volsize % blocksize != 0)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
-#if 0
 #ifdef _ILP32
 	if (volsize - 1 > SPEC_MAXOFFSET_T)
-		return (EOVERFLOW);
-#endif
+		return (SET_ERROR(EOVERFLOW));
 #endif
 	return (0);
 }
@@ -202,9 +362,9 @@ int
 zvol_check_volblocksize(uint64_t volblocksize)
 {
 	if (volblocksize < SPA_MINBLOCKSIZE ||
-	    volblocksize > SPA_MAXBLOCKSIZE ||
+	    volblocksize > SPA_OLD_MAXBLOCKSIZE ||
 	    !ISP2(volblocksize))
-		return (EDOM);
+		return (SET_ERROR(EDOM));
 
 	return (0);
 }
@@ -232,40 +392,33 @@ zvol_get_stats(objset_t *os, nvlist_t *n
 	return (error);
 }
 
-/*
- * Find a free minor number.
- */
-static minor_t
-zvol_minor_alloc(void)
-{
-	minor_t minor;
-
-	ASSERT(MUTEX_HELD(&zvol_state_lock));
-
-	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++)
-		if (ddi_get_soft_state(zvol_state, minor) == NULL)
-			return (minor);
-
-	return (0);
-}
-
 static zvol_state_t *
 zvol_minor_lookup(const char *name)
 {
+#ifdef illumos
 	minor_t minor;
+#endif
 	zvol_state_t *zv;
 
-	ASSERT(MUTEX_HELD(&zvol_state_lock));
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 
-	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) {
-		zv = ddi_get_soft_state(zvol_state, minor);
+#ifdef illumos
+	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++)
+#else
+	LIST_FOREACH(zv, &all_zvols, zv_links)
+#endif
+	{
+#ifdef illumos
+		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 		if (zv == NULL)
 			continue;
+#endif
+
 		if (strcmp(zv->zv_name, name) == 0)
-			break;
+			return (zv);
 	}
 
-	return (zv);
+	return (NULL);
 }
 
 /* extent mapping arg */
@@ -277,21 +430,24 @@ struct maparg {
 /*ARGSUSED*/
 static int
 zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	struct maparg *ma = arg;
 	zvol_extent_t *ze;
 	int bs = ma->ma_zv->zv_volblocksize;
 
-	if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
+	if (bp == NULL || BP_IS_HOLE(bp) ||
+	    zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
 		return (0);
 
+	VERIFY(!BP_IS_EMBEDDED(bp));
+
 	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
 	ma->ma_blks++;
 
 	/* Abort immediately if we have encountered gang blocks */
 	if (BP_IS_GANG(bp))
-		return (EFRAGS);
+		return (SET_ERROR(EFRAGS));
 
 	/*
 	 * See if the block is at the end of the previous extent.
@@ -386,6 +542,24 @@ zvol_create_cb(objset_t *os, void *arg, 
 }
 
 /*
+ * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
+ * implement DKIOCFREE/free-long-range.
+ */
+static int
+zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
+{
+	uint64_t offset, length;
+
+	if (byteswap)
+		byteswap_uint64_array(lr, sizeof (*lr));
+
+	offset = lr->lr_offset;
+	length = lr->lr_length;
+
+	return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
+}
+
+/*
  * Replay a TX_WRITE ZIL transaction that didn't get committed
  * after a system failure
  */
@@ -430,12 +604,12 @@ zvol_replay_write(zvol_state_t *zv, lr_w
 static int
 zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
 {
-	return (ENOTSUP);
+	return (SET_ERROR(ENOTSUP));
 }
 
 /*
  * Callback vectors for replaying records.
- * Only TX_WRITE is needed for zvol.
+ * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
  */
 zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
 	zvol_replay_err,	/* 0 no such transaction type */
@@ -448,7 +622,7 @@ zil_replay_func_t *zvol_replay_vector[TX
 	zvol_replay_err,	/* TX_LINK */
 	zvol_replay_err,	/* TX_RENAME */
 	zvol_replay_write,	/* TX_WRITE */
-	zvol_replay_err,	/* TX_TRUNCATE */
+	zvol_replay_truncate,	/* TX_TRUNCATE */
 	zvol_replay_err,	/* TX_SETATTR */
 	zvol_replay_err,	/* TX_ACL */
 	zvol_replay_err,	/* TX_CREATE_ACL */
@@ -460,18 +634,20 @@ zil_replay_func_t *zvol_replay_vector[TX
 	zvol_replay_err,	/* TX_WRITE2 */
 };
 
+#ifdef illumos
 int
 zvol_name2minor(const char *name, minor_t *minor)
 {
 	zvol_state_t *zv;
 
-	mutex_enter(&zvol_state_lock);
+	mutex_enter(&zfsdev_state_lock);
 	zv = zvol_minor_lookup(name);
 	if (minor && zv)
 		*minor = zv->zv_minor;
-	mutex_exit(&zvol_state_lock);
+	mutex_exit(&zfsdev_state_lock);
 	return (zv ? 0 : -1);
 }
+#endif	/* illumos */
 
 /*
  * Create a minor node (plus a whole lot more) for the specified volume.
@@ -479,42 +655,147 @@ zvol_name2minor(const char *name, minor_
 int
 zvol_create_minor(const char *name)
 {
+	zfs_soft_state_t *zs;
 	zvol_state_t *zv;
 	objset_t *os;
+	int error;
+#ifdef illumos
+	dmu_object_info_t doi;
+	minor_t minor = 0;
+	char chrbuf[30], blkbuf[30];
+#endif
+#ifdef __FreeBSD__
+	struct g_provider *pp;
+	struct g_geom *gp;
+	uint64_t mode;
+
+	ZFS_LOG(1, "Creating ZVOL %s...", name);
+#endif
+#ifdef __NetBSD__
 	dmu_object_info_t doi;
 	minor_t minor = 0;
 	vnode_t *vp = NULL;
 	char *devpath;
 	size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(name) + 1;
+#endif
 
-	int error;
-	
-	mutex_enter(&zvol_state_lock);
+	mutex_enter(&zfsdev_state_lock);
 
 	if (zvol_minor_lookup(name) != NULL) {
-		mutex_exit(&zvol_state_lock);
-		return (EEXIST);
+		mutex_exit(&zfsdev_state_lock);
+		return (SET_ERROR(EEXIST));
 	}
 
 	/* lie and say we're read-only */
-	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os);
+	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
 
 	if (error) {
-		mutex_exit(&zvol_state_lock);
+		mutex_exit(&zfsdev_state_lock);
 		return (error);
 	}
 
+#ifdef illumos
+	if ((minor = zfsdev_minor_alloc()) == 0) {
+		dmu_objset_disown(os, FTAG);
+		mutex_exit(&zfsdev_state_lock);
+		return (SET_ERROR(ENXIO));
+	}
+
+	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
+		dmu_objset_disown(os, FTAG);
+		mutex_exit(&zfsdev_state_lock);
+		return (SET_ERROR(EAGAIN));
+	}
+	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
+	    (char *)name);
+
+	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
+
+	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
+	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+		ddi_soft_state_free(zfsdev_state, minor);
+		dmu_objset_disown(os, FTAG);
+		mutex_exit(&zfsdev_state_lock);
+		return (SET_ERROR(EAGAIN));
+	}
+
+	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
+
+	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
+	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
+		ddi_remove_minor_node(zfs_dip, chrbuf);
+		ddi_soft_state_free(zfsdev_state, minor);
+		dmu_objset_disown(os, FTAG);
+		mutex_exit(&zfsdev_state_lock);
+		return (SET_ERROR(EAGAIN));
+	}
+
+	zs = ddi_get_soft_state(zfsdev_state, minor);
+	zs->zss_type = ZSST_ZVOL;
+	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
+#endif /* illumos */
+
+#ifdef __FreeBSD__
+	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
+	zv->zv_state = 0;
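+	/* Use the per-dataset volmode property; fall back to the global volmode default. */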
+	error = dsl_prop_get_integer(name,
+	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL);
+	if (error != 0 || mode == ZFS_VOLMODE_DEFAULT)
+		mode = volmode;
+
+	DROP_GIANT();
+	zv->zv_volmode = mode;
+	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+		g_topology_lock();
+		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+		gp->start = zvol_geom_start;
+		gp->access = zvol_geom_access;
+		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+		pp->sectorsize = DEV_BSIZE;
+		pp->mediasize = 0;
+		pp->private = zv;
+
+		zv->zv_provider = pp;
+		bioq_init(&zv->zv_queue);
+		mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
+	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+		struct make_dev_args args;
+
+		make_dev_args_init(&args);
+		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+		args.mda_devsw = &zvol_cdevsw;
+		args.mda_cr = NULL;
+		args.mda_uid = UID_ROOT;
+		args.mda_gid = GID_OPERATOR;
+		args.mda_mode = 0640;
+		args.mda_si_drv2 = zv;
+		error = make_dev_s(&args, &zv->zv_dev,
+		    "%s/%s", ZVOL_DRIVER, name);
+		if (error != 0) {
+			kmem_free(zv, sizeof(*zv));
+			dmu_objset_disown(os, FTAG);
+			mutex_exit(&zfsdev_state_lock);
+			return (error);
+		}
+		zv->zv_dev->si_iosize_max = MAXPHYS;
+	}
+	LIST_INSERT_HEAD(&all_zvols, zv, zv_links);
+#endif /* __FreeBSD__ */
+
+#ifdef __NetBSD__
+
 	/*
 	 * If there's an existing /dev/zvol symlink, try to use the
 	 * same minor number we used last time.
 	 */
 	devpath = kmem_alloc(devpathlen, KM_SLEEP);
-	
+
 	/* Get full path to ZFS volume disk device */
 	(void) snprintf(devpath, devpathlen, "%s/%s", ZVOL_FULL_DEV_DIR, name);
-	
-	error = lookupname(devpath, UIO_SYSSPACE, NULL, &vp);
-	
+
+	error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);
+
 	if (error == 0 && vp->v_type != VBLK) {
 		error = EINVAL;
 	}
@@ -523,7 +804,7 @@ zvol_create_minor(const char *name)
 		struct stat sb;
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = vn_stat(vp, &sb);
-		VOP_UNLOCK(vp);
+		VOP_UNLOCK(vp, 0);
 		if (error == 0) {
 			minor = getminor(sb.st_rdev);
 		}
@@ -535,22 +816,23 @@ zvol_create_minor(const char *name)
 	/*
 	 * If we found a minor but it's already in use, we must pick a new one.
 	 */
-	if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL)
+	if (minor != 0 && zfsdev_get_soft_state(minor, ZSST_ZVOL) != NULL)
 		minor = 0;
 
 	if (minor == 0)
-		minor = zvol_minor_alloc();
+		minor = zfsdev_minor_alloc();
 
 	if (minor == 0) {
 		dmu_objset_disown(os, zvol_tag);
-		mutex_exit(&zvol_state_lock);
+		mutex_exit(&zfsdev_state_lock);
 		kmem_free(devpath, devpathlen);
 		return (ENXIO);
 	}
 
-	if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) {
+	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
 		dmu_objset_disown(os, zvol_tag);
-		mutex_exit(&zvol_state_lock);
+		mutex_exit(&zfsdev_state_lock);
 		kmem_free(devpath, devpathlen);
 		return (EAGAIN);
 	}
@@ -559,9 +841,9 @@ zvol_create_minor(const char *name)
 
 	if (ddi_create_minor_node(zfs_dip, (char *)name, S_IFCHR,
 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
-		ddi_soft_state_free(zvol_state, minor);
+		ddi_soft_state_free(zfsdev_state, minor);
 		dmu_objset_disown(os, zvol_tag);
-		mutex_exit(&zvol_state_lock);
+		mutex_exit(&zfsdev_state_lock);
 		kmem_free(devpath, devpathlen);
 		return (EAGAIN);
 	}
@@ -569,46 +851,64 @@ zvol_create_minor(const char *name)
 	if (ddi_create_minor_node(zfs_dip, (char *)name, S_IFBLK,
 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
 		ddi_remove_minor_node(zfs_dip, (char *)name);
-		ddi_soft_state_free(zvol_state, minor);
+		ddi_soft_state_free(zfsdev_state, minor);
 		dmu_objset_disown(os, zvol_tag);
-		mutex_exit(&zvol_state_lock);
+		mutex_exit(&zfsdev_state_lock);
 		kmem_free(devpath, devpathlen);
 		return (EAGAIN);
 	}
-	zv = ddi_get_soft_state(zvol_state, minor);
+	zs = ddi_get_soft_state(zfsdev_state, minor);
+	zs->zss_type = ZSST_ZVOL;
+	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
+
+	disk_init(&zv->zv_dk, name, &zvol_dkdriver);
+	disk_attach(&zv->zv_dk);
+	mutex_init(&zv->zv_dklock, NULL, MUTEX_DEFAULT, NULL);
+
+	LIST_INSERT_HEAD(&all_zvols, zv, zv_links);
+#endif /* __NetBSD__ */
 
 	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
 	zv->zv_min_bs = DEV_BSHIFT;
+#ifdef illumos
 	zv->zv_minor = minor;
+#endif
 	zv->zv_objset = os;
-	if (dmu_objset_is_snapshot(os))
+	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
 		zv->zv_flags |= ZVOL_RDONLY;
 	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
 	    sizeof (rl_t), offsetof(rl_t, r_node));
 	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
 	    offsetof(zvol_extent_t, ze_node));
+#if defined(illumos) || defined(__NetBSD__)
 	/* get and cache the blocksize */
 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
 	ASSERT(error == 0);
 	zv->zv_volblocksize = doi.doi_data_block_size;
+#endif
 
-	disk_init(&zv->zv_dk, name, &zvol_dkdriver);
-	disk_attach(&zv->zv_dk);
-	mutex_init(&zv->zv_dklock, NULL, MUTEX_DEFAULT, NULL);
-	
-	zil_replay(os, zv, zvol_replay_vector);
-	dmu_objset_disown(os, zvol_tag);
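+	/* Replay outstanding log records, or destroy them if replay is disabled; read-only pools are skipped. */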
+	if (spa_writeable(dmu_objset_spa(os))) {
+		if (zil_replay_disable)
+			zil_destroy(dmu_objset_zil(os), B_FALSE);
+		else
+			zil_replay(os, zv, zvol_replay_vector);
+	}
+	dmu_objset_disown(os, FTAG);
 	zv->zv_objset = NULL;
 
-	zvol_size_changed(zv);
-
 	zvol_minors++;
 
-	mutex_exit(&zvol_state_lock);
+	mutex_exit(&zfsdev_state_lock);
+#ifdef __FreeBSD__
+	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+		zvol_geom_run(zv);
+		g_topology_unlock();
+	}
+	PICKUP_GIANT();
 
-//	kmem_free(devpath, devpathlen);
-	
+	ZFS_LOG(1, "ZVOL %s created.", name);
+#endif
 	return (0);
 }
 
@@ -618,23 +918,56 @@ zvol_create_minor(const char *name)
 static int
 zvol_remove_zv(zvol_state_t *zv)
 {
-	char nmbuf[20];
 
-	ASSERT(MUTEX_HELD(&zvol_state_lock));
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 	if (zv->zv_total_opens != 0)
-		return (EBUSY);
+		return (SET_ERROR(EBUSY));
+
+#ifdef illumos
+	char nmbuf[20];
+	minor_t minor = zv->zv_minor;
+
+	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
+	ddi_remove_minor_node(zfs_dip, nmbuf);
+
+	(void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
+	ddi_remove_minor_node(zfs_dip, nmbuf);
+#endif
+#ifdef __FreeBSD__
+	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+	LIST_REMOVE(zv, zv_links);
+	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+		g_topology_lock();
+		zvol_geom_destroy(zv);
+		g_topology_unlock();
+	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+		if (zv->zv_dev != NULL)
+			destroy_dev(zv->zv_dev);
+	}
+#endif
+#ifdef __NetBSD__
+	char nmbuf[20];
+	minor_t minor = zv->zv_minor;
 
+	/* XXXNETBSD needs changes here */
 	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", zv->zv_minor);
 	ddi_remove_minor_node(zfs_dip, nmbuf);
 
 	(void) snprintf(nmbuf, sizeof (nmbuf), "%u", zv->zv_minor);
 	ddi_remove_minor_node(zfs_dip, nmbuf);
+#endif
 
 	avl_destroy(&zv->zv_znode.z_range_avl);
 	mutex_destroy(&zv->zv_znode.z_range_lock);
 
-	ddi_soft_state_free(zvol_state, zv->zv_minor);
-
+	kmem_free(zv, sizeof (zvol_state_t));
+#if defined(illumos) || defined(__NetBSD__)
+	ddi_soft_state_free(zfsdev_state, minor);
+#endif
 	zvol_minors--;
 	return (0);
 }
@@ -645,19 +978,20 @@ zvol_remove_minor(const char *name)
 	zvol_state_t *zv;
 	int rc;
 
-	mutex_enter(&zvol_state_lock);
+	mutex_enter(&zfsdev_state_lock);
 	if ((zv = zvol_minor_lookup(name)) == NULL) {
-		mutex_exit(&zvol_state_lock);
-		return (ENXIO);
+		mutex_exit(&zfsdev_state_lock);
+		return (SET_ERROR(ENXIO));
 	}
 	rc = zvol_remove_zv(zv);
-	mutex_exit(&zvol_state_lock);
+	mutex_exit(&zfsdev_state_lock);
 	return (rc);
 }
 
 int
 zvol_first_open(zvol_state_t *zv)
 {
+	dmu_object_info_t doi;
 	objset_t *os;
 	uint64_t volsize;
 	int error;
@@ -669,20 +1003,36 @@ zvol_first_open(zvol_state_t *zv)
 	if (error)
 		return (error);
 
+	zv->zv_objset = os;
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (error) {
 		ASSERT(error == 0);
 		dmu_objset_disown(os, zvol_tag);
 		return (error);
 	}
-	zv->zv_objset = os;
-	zv->zv_volsize = volsize;
+
+	/* get and cache the blocksize */
+	error = dmu_object_info(os, ZVOL_OBJ, &doi);
+	if (error) {
+		ASSERT(error == 0);
+		dmu_objset_disown(os, zvol_tag);
+		return (error);
+	}
+	zv->zv_volblocksize = doi.doi_data_block_size;
+
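+	/* Hold the bonus dbuf; reads and writes go through zv_dbuf via dmu_{read,write}_uio_dbuf(). */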
+	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
+	if (error) {
+		dmu_objset_disown(os, zvol_tag);
+		return (error);
+	}
+
+	zvol_size_changed(zv, volsize);
 	zv->zv_zilog = zil_open(os, zvol_get_data);
-	zvol_size_changed(zv);
 
 	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
-		NULL) == 0);
-	if (readonly || dmu_objset_is_snapshot(os))
+	    NULL) == 0);
+	if (readonly || dmu_objset_is_snapshot(os) ||
+	    !spa_writeable(dmu_objset_spa(os)))
 		zv->zv_flags |= ZVOL_RDONLY;
 	else
 		zv->zv_flags &= ~ZVOL_RDONLY;
@@ -694,16 +1044,29 @@ zvol_last_close(zvol_state_t *zv)
 {
 	zil_close(zv->zv_zilog);
 	zv->zv_zilog = NULL;
+
+	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
+	zv->zv_dbuf = NULL;
+
+	/*
+	 * Evict cached data
+	 */
+	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
+	    !(zv->zv_flags & ZVOL_RDONLY))
+		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+	dmu_objset_evict_dbufs(zv->zv_objset);
+
 	dmu_objset_disown(zv->zv_objset, zvol_tag);
 	zv->zv_objset = NULL;
-#ifdef __NetBSD__
+#ifdef __NetBSD__xxx
+	/* the old code has this here, but it's in the wrong place. */
 	disk_detach(&zv->zv_dk);
 	disk_destroy(&zv->zv_dk);
 	mutex_destroy(&zv->zv_dklock);
 #endif
-	return;
 }
 
+#ifdef illumos
 int
 zvol_prealloc(zvol_state_t *zv)
 {
@@ -716,14 +1079,14 @@ zvol_prealloc(zvol_state_t *zv)
 	/* Check the space usage before attempting to allocate the space */
 	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
 	if (avail < zv->zv_volsize)
-		return (ENOSPC);
+		return (SET_ERROR(ENOSPC));
 
 	/* Free old extents if they exist */
 	zvol_free_extents(zv);
 
 	while (resid != 0) {
 		int error;
-		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
+		uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
@@ -742,17 +1105,19 @@ zvol_prealloc(zvol_state_t *zv)
 
 	return (0);
 }
+#endif	/* illumos */
 
-int
+static int
 zvol_update_volsize(objset_t *os, uint64_t volsize)
 {
 	dmu_tx_t *tx;
 	int error;
 
-	ASSERT(MUTEX_HELD(&zvol_state_lock));
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
@@ -772,6 +1137,7 @@ zvol_update_volsize(objset_t *os, uint64
 void
 zvol_remove_minors(const char *name)
 {
+#ifdef illumos
 	zvol_state_t *zv;
 	char *namebuf;
 	minor_t minor;
@@ -779,10 +1145,10 @@ zvol_remove_minors(const char *name)
 	namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
 	(void) strncpy(namebuf, name, strlen(name));
 	(void) strcat(namebuf, "/");
-	mutex_enter(&zvol_state_lock);
-	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) {
+	mutex_enter(&zfsdev_state_lock);
+	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
 
-		zv = ddi_get_soft_state(zvol_state, minor);
+		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 		if (zv == NULL)
 			continue;
 		if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
@@ -790,68 +1156,67 @@ zvol_remove_minors(const char *name)
 	}
 	kmem_free(namebuf, strlen(name) + 2);
 
-	mutex_exit(&zvol_state_lock);
+	mutex_exit(&zfsdev_state_lock);
+#else	/* !illumos */
+	zvol_state_t *zv, *tzv;
+	size_t namelen;
+
+	namelen = strlen(name);
+
+	DROP_GIANT();
+	mutex_enter(&zfsdev_state_lock);
+
+	LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) {
+		if (strcmp(zv->zv_name, name) == 0 ||
+		    (strncmp(zv->zv_name, name, namelen) == 0 &&
+		    strlen(zv->zv_name) > namelen && (zv->zv_name[namelen] == '/' ||
+		    zv->zv_name[namelen] == '@'))) {
+			(void) zvol_remove_zv(zv);
+		}
+	}
+
+	mutex_exit(&zfsdev_state_lock);
+	PICKUP_GIANT();
+#endif	/* illumos */
 }
 
-int
-zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
+static int
+zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
 {
-	zvol_state_t *zv = NULL;
-	objset_t *os;
-	int error;
-	dmu_object_info_t doi;
 	uint64_t old_volsize = 0ULL;
-	uint64_t readonly;
-
-	mutex_enter(&zvol_state_lock);
-	zv = zvol_minor_lookup(name);
-	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
-		mutex_exit(&zvol_state_lock);
-		return (error);
-	}
-
-	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
-	    (error = zvol_check_volsize(volsize,
-	    doi.doi_data_block_size)) != 0)
-		goto out;
-
-	VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
-	    NULL) == 0);
-	if (readonly) {
-		error = EROFS;
-		goto out;
-	}
+	int error = 0;
 
-	error = zvol_update_volsize(os, volsize);
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
 
-#ifndef __NetBSD__
 	/*
 	 * Reinitialize the dump area to the new size. If we
 	 * failed to resize the dump area then restore it back to
-	 * its original size.
+	 * its original size.  We must set the new volsize prior
+	 * to calling dumpvp_resize() to ensure that the devices'
+	 * size(9P) is not visible to the dump subsystem.
 	 */
-	if (zv && error == 0) {
-		if (zv->zv_flags & ZVOL_DUMPIFIED) {
-			old_volsize = zv->zv_volsize;
-			zv->zv_volsize = volsize;
-			if ((error = zvol_dumpify(zv)) != 0 ||
-			    (error = dumpvp_resize()) != 0) {
-				(void) zvol_update_volsize(os, old_volsize);
-				zv->zv_volsize = old_volsize;
-				error = zvol_dumpify(zv);
-			}
-		}
-		if (error == 0) {
-			zv->zv_volsize = volsize;
-			zvol_size_changed(volsize, maj, zv->zv_minor);
+	old_volsize = zv->zv_volsize;
+	zvol_size_changed(zv, volsize);
+
+#ifdef ZVOL_DUMP
+	if (zv->zv_flags & ZVOL_DUMPIFIED) {
+		if ((error = zvol_dumpify(zv)) != 0 ||
+		    (error = dumpvp_resize()) != 0) {
+			int dumpify_error;
+
+			(void) zvol_update_volsize(zv->zv_objset, old_volsize);
+			zvol_size_changed(zv, old_volsize);
+			dumpify_error = zvol_dumpify(zv);
+			error = dumpify_error ? dumpify_error : error;
 		}
 	}
-#endif
+#endif	/* ZVOL_DUMP */
 
+#ifdef illumos
 	/*
 	 * Generate a LUN expansion event.
 	 */
-	if (zv && error == 0) {
+	if (error == 0) {
 		sysevent_id_t eid;
 		nvlist_t *attr;
 		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
@@ -868,88 +1233,251 @@ zvol_set_volsize(const char *name, major
 		nvlist_free(attr);
 		kmem_free(physpath, MAXPATHLEN);
 	}
-
-out:
-	dmu_objset_rele(os, FTAG);
-
-	mutex_exit(&zvol_state_lock);
-
+#endif	/* illumos */
 	return (error);
 }
 
-/*ARGSUSED*/
 int
-zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
+zvol_set_volsize(const char *name, uint64_t volsize)
 {
-	minor_t minor = getminor(*devp);
-	zvol_state_t *zv;
-	int err = 0;
+	zvol_state_t *zv = NULL;
+	objset_t *os;
+	int error;
+	dmu_object_info_t doi;
+	uint64_t readonly;
+	boolean_t owned = B_FALSE;
 
-	if (minor == 0)			/* This is the control device */
-		return (0);
+	error = dsl_prop_get_integer(name,
+	    zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
+	if (error != 0)
+		return (error);
+	if (readonly)
+		return (SET_ERROR(EROFS));
 
-	mutex_enter(&zvol_state_lock);
+	mutex_enter(&zfsdev_state_lock);
+	zv = zvol_minor_lookup(name);
 
-	zv = ddi_get_soft_state(zvol_state, minor);
-	if (zv == NULL) {
-		mutex_exit(&zvol_state_lock);
-		return (ENXIO);
+	if (zv == NULL || zv->zv_objset == NULL) {
+		if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
+		    FTAG, &os)) != 0) {
+			mutex_exit(&zfsdev_state_lock);
+			return (error);
+		}
+		owned = B_TRUE;
+		if (zv != NULL)
+			zv->zv_objset = os;
+	} else {
+		os = zv->zv_objset;
 	}
 
-	if (zv->zv_total_opens == 0)
-		err = zvol_first_open(zv);
-	if (err) {
-		mutex_exit(&zvol_state_lock);
+	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
+	    (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
+		goto out;
+
+	error = zvol_update_volsize(os, volsize);
+
+	if (error == 0 && zv != NULL)
+		error = zvol_update_live_volsize(zv, volsize);
+out:
+	if (owned) {
+		dmu_objset_disown(os, FTAG);
+		if (zv != NULL)
+			zv->zv_objset = NULL;
+	}
+	mutex_exit(&zfsdev_state_lock);
+	return (error);
+}
+
+/*ARGSUSED*/
+#ifdef illumos
+int
+zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
+#endif
+#ifdef __FreeBSD__
+static int
+zvol_open(struct g_provider *pp, int flag, int count)
+#endif
+#ifdef __NetBSD__
+int
+zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
+#endif
+{
+	zvol_state_t *zv;
+	int err = 0;
+
+#ifdef illumos
+	mutex_enter(&zfsdev_state_lock);
+
+	zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
+	if (zv == NULL) {
+		mutex_exit(&zfsdev_state_lock);
+		return (SET_ERROR(ENXIO));
+	}
+
+	if (zv->zv_total_opens == 0)
+		err = zvol_first_open(zv);
+	if (err) {
+		mutex_exit(&zfsdev_state_lock);
+		return (err);
+	}
+#endif /* illumos */
+#ifdef __FreeBSD__
+	boolean_t locked = B_FALSE;
+
+	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
+		/*
+		 * if zfs_geom_probe_vdev_key is set, that means that zfs is
+		 * attempting to probe geom providers while looking for a
+		 * replacement for a missing VDEV.  In this case, the
+		 * spa_namespace_lock will not be held, but it is still illegal
+		 * to use a zvol as a vdev.  Deadlocks can result if another
+		 * thread has spa_namespace_lock
+		 */
+		return (EOPNOTSUPP);
+	}
+	/*
+	 * Protect against recursively entering spa_namespace_lock
+	 * when spa_open() is used for a pool on a (local) ZVOL(s).
+	 * This is needed since we replaced upstream zfsdev_state_lock
+	 * with spa_namespace_lock in the ZVOL code.
+	 * We are using the same trick as spa_open().
+	 * Note that calls in zvol_first_open which need to resolve
+	 * pool name to a spa object will enter spa_open()
+	 * recursively, but that function already has all the
+	 * necessary protection.
+	 */
+	if (!MUTEX_HELD(&zfsdev_state_lock)) {
+		mutex_enter(&zfsdev_state_lock);
+		locked = B_TRUE;
+	}
+
+	zv = pp->private;
+	if (zv == NULL) {
+		if (locked)
+			mutex_exit(&zfsdev_state_lock);
+		return (SET_ERROR(ENXIO));
+	}
+
+	if (zv->zv_total_opens == 0) {
+		err = zvol_first_open(zv);
+		if (err) {
+			if (locked)
+				mutex_exit(&zfsdev_state_lock);
+			return (err);
+		}
+		pp->mediasize = zv->zv_volsize;
+		pp->stripeoffset = 0;
+		pp->stripesize = zv->zv_volblocksize;
+	}
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
+	mutex_enter(&zfsdev_state_lock);
+
+	zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
+	if (zv == NULL) {
+		mutex_exit(&zfsdev_state_lock);
+		return (SET_ERROR(ENXIO));
+	}
+
+	if (zv->zv_total_opens == 0)
+		err = zvol_first_open(zv);
+	if (err) {
+		mutex_exit(&zfsdev_state_lock);
 		return (err);
 	}
+#endif
+
 	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
-		err = EROFS;
+		err = SET_ERROR(EROFS);
 		goto out;
 	}
 	if (zv->zv_flags & ZVOL_EXCL) {
-		err = EBUSY;
+		err = SET_ERROR(EBUSY);
 		goto out;
 	}
+#ifdef FEXCL
 	if (flag & FEXCL) {
 		if (zv->zv_total_opens != 0) {
-			err = EBUSY;
+			err = SET_ERROR(EBUSY);
 			goto out;
 		}
 		zv->zv_flags |= ZVOL_EXCL;
 	}
+#endif
 
+#ifdef illumos
+	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
+		zv->zv_open_count[otyp]++;
+		zv->zv_total_opens++;
+	}
+	mutex_exit(&zfsdev_state_lock);
+#endif
+#ifdef __FreeBSD__
+	zv->zv_total_opens += count;
+	if (locked)
+		mutex_exit(&zfsdev_state_lock);
+#endif
+#ifdef __NetBSD__
 	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
 		zv->zv_open_count[otyp]++;
 		zv->zv_total_opens++;
 	}
-	mutex_exit(&zvol_state_lock);
+	mutex_exit(&zfsdev_state_lock);
+#endif
 
 	return (err);
 out:
 	if (zv->zv_total_opens == 0)
 		zvol_last_close(zv);
-	mutex_exit(&zvol_state_lock);
+#ifdef __FreeBSD__
+	if (locked)
+#endif
+		mutex_exit(&zfsdev_state_lock);
+
 	return (err);
 }
 
 /*ARGSUSED*/
+#if defined(illumos) || defined(__NetBSD__)
 int
 zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
+#endif
+#ifdef __FreeBSD__
+static int
+zvol_close(struct g_provider *pp, int flag, int count)
+#endif
 {
+#if defined(illumos) || defined(__NetBSD__)
 	minor_t minor = getminor(dev);
 	zvol_state_t *zv;
 	int error = 0;
 
-	if (minor == 0)		/* This is the control device */
-		return (0);
+	mutex_enter(&zfsdev_state_lock);
+
+	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+	if (zv == NULL) {
+		mutex_exit(&zfsdev_state_lock);
+		return (SET_ERROR(ENXIO));
+	}
+#endif /* illumos || __NetBSD__ */
+#ifdef __FreeBSD__
+	zvol_state_t *zv;
+	int error = 0;
+	boolean_t locked = B_FALSE;
 
-	mutex_enter(&zvol_state_lock);
+	/* See comment in zvol_open(). */
+	if (!MUTEX_HELD(&zfsdev_state_lock)) {
+		mutex_enter(&zfsdev_state_lock);
+		locked = B_TRUE;
+	}
 
-	zv = ddi_get_soft_state(zvol_state, minor);
+	zv = pp->private;
 	if (zv == NULL) {
-		mutex_exit(&zvol_state_lock);
-		return (ENXIO);
+		if (locked)
+			mutex_exit(&zfsdev_state_lock);
+		return (SET_ERROR(ENXIO));
 	}
+#endif /* __FreeBSD__ */
 
 	if (zv->zv_flags & ZVOL_EXCL) {
 		ASSERT(zv->zv_total_opens == 1);
@@ -960,19 +1488,30 @@ zvol_close(dev_t dev, int flag, int otyp
 	 * If the open count is zero, this is a spurious close.
 	 * That indicates a bug in the kernel / DDI framework.
 	 */
+#if defined(illumos) || defined(__NetBSD__)
 	ASSERT(zv->zv_open_count[otyp] != 0);
+#endif
 	ASSERT(zv->zv_total_opens != 0);
 
 	/*
 	 * You may get multiple opens, but only one close.
 	 */
+#if defined(illumos) || defined(__NetBSD__)
 	zv->zv_open_count[otyp]--;
 	zv->zv_total_opens--;
+#else
+	zv->zv_total_opens -= count;
+#endif
 
 	if (zv->zv_total_opens == 0)
 		zvol_last_close(zv);
 
-	mutex_exit(&zvol_state_lock);
+#if defined(illumos) || defined(__NetBSD__)
+	mutex_exit(&zfsdev_state_lock);
+#else
+	if (locked)
+		mutex_exit(&zfsdev_state_lock);
+#endif
 	return (error);
 }
 
@@ -1026,8 +1565,15 @@ zvol_get_data(void *arg, lr_write_t *lr,
 	} else {
 		size = zv->zv_volblocksize;
 		offset = P2ALIGN(offset, size);
-		error = dmu_buf_hold(os, object, offset, zgd, &db);
+		error = dmu_buf_hold(os, object, offset, zgd, &db,
+		    DMU_READ_NO_PREFETCH);
 		if (error == 0) {
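+			/*
+			 * Seed the log record with the buffer's existing
+			 * block pointer, if one has been assigned.
+			 */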
+			blkptr_t *obp = dmu_buf_get_blkptr(db);
+			if (obp) {
+				ASSERT(BP_IS_HOLE(bp));
+				*bp = *obp;
+			}
+
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 
@@ -1054,6 +1600,10 @@ zvol_get_data(void *arg, lr_write_t *lr,
  * Otherwise we will later flush the data out via dmu_sync().
  */
 ssize_t zvol_immediate_write_sz = 32768;
+#ifdef _KERNEL
+SYSCTL_LONG(_vfs_zfs_vol, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN,
+    &zvol_immediate_write_sz, 0, "Minimal size for indirect log write");
+#endif
 
 static void
 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
@@ -1061,57 +1611,44 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_
 {
 	uint32_t blocksize = zv->zv_volblocksize;
 	zilog_t *zilog = zv->zv_zilog;
-	boolean_t slogging;
-	ssize_t immediate_write_sz;
-
-	if (zil_disable)
-		return;
+	itx_wr_state_t write_state;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
-	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
-	    ? 0 : zvol_immediate_write_sz;
-
-	slogging = spa_has_slogs(zilog->zl_spa) &&
-	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
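+	/*
+	 * Pick how the data reaches the log: WR_INDIRECT syncs the block
+	 * via dmu_sync(), WR_COPIED embeds the data in the itx at creation
+	 * time, and WR_NEED_COPY defers the copy until log commit.
+	 */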
+	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+		write_state = WR_INDIRECT;
+	else if (!spa_has_slogs(zilog->zl_spa) &&
+	    resid >= blocksize && blocksize > zvol_immediate_write_sz)
+		write_state = WR_INDIRECT;
+	else if (sync)
+		write_state = WR_COPIED;
+	else
+		write_state = WR_NEED_COPY;
 
 	while (resid) {
 		itx_t *itx;
 		lr_write_t *lr;
-		ssize_t len;
-		itx_wr_state_t write_state;
+		itx_wr_state_t wr_state = write_state;
+		ssize_t len = resid;
 
-		/*
-		 * Unlike zfs_log_write() we can be called with
-		 * upto DMU_MAX_ACCESS/2 (5MB) writes.
-		 */
-		if (blocksize > immediate_write_sz && !slogging &&
-		    resid >= blocksize && off % blocksize == 0) {
-			write_state = WR_INDIRECT; /* uses dmu_sync */
-			len = blocksize;
-		} else if (sync) {
-			write_state = WR_COPIED;
-			len = MIN(ZIL_MAX_LOG_DATA, resid);
-		} else {
-			write_state = WR_NEED_COPY;
-			len = MIN(ZIL_MAX_LOG_DATA, resid);
-		}
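+		/*
+		 * Demote oversized copied records, and clamp indirect
+		 * writes so they never cross a volume block boundary.
+		 */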
+		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+			wr_state = WR_NEED_COPY;
+		else if (wr_state == WR_INDIRECT)
+			len = MIN(blocksize - P2PHASE(off, blocksize), resid);
 
 		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
-		    (write_state == WR_COPIED ? len : 0));
+		    (wr_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
-		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
+		if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
 		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
-			write_state = WR_NEED_COPY;
+			wr_state = WR_NEED_COPY;
 		}
 
-		itx->itx_wr_state = write_state;
-		if (write_state == WR_NEED_COPY)
-			itx->itx_sod += len;
+		itx->itx_wr_state = wr_state;
 		lr->lr_foid = ZVOL_OBJ;
 		lr->lr_offset = off;
 		lr->lr_length = len;
@@ -1119,58 +1656,68 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_
 		BP_ZERO(&lr->lr_blkptr);
 
 		itx->itx_private = zv;
-		itx->itx_sync = sync;
 
-		(void) zil_itx_assign(zilog, itx, tx);
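+		/* Leave the itx asynchronous unless sync was requested or the volume has synchronous openers (zv_sync_cnt). */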
+		if (!sync && (zv->zv_sync_cnt == 0))
+			itx->itx_sync = B_FALSE;
+
+		zil_itx_assign(zilog, itx, tx);
 
 		off += len;
 		resid -= len;
 	}
 }
 
-#ifndef __NetBSD__
+#ifdef illumos
 static int
-zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
-    boolean_t doread, boolean_t isdump)
+zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
+    uint64_t size, boolean_t doread, boolean_t isdump)
 {
 	vdev_disk_t *dvd;
 	int c;
 	int numerrors = 0;
 
-	for (c = 0; c < vd->vdev_children; c++) {
-		ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
-		    vd->vdev_ops == &vdev_replacing_ops ||
-		    vd->vdev_ops == &vdev_spare_ops);
-		int err = zvol_dumpio_vdev(vd->vdev_child[c],
-		    addr, offset, size, doread, isdump);
-		if (err != 0) {
-			numerrors++;
-		} else if (doread) {
-			break;
+	if (vd->vdev_ops == &vdev_mirror_ops ||
+	    vd->vdev_ops == &vdev_replacing_ops ||
+	    vd->vdev_ops == &vdev_spare_ops) {
+		for (c = 0; c < vd->vdev_children; c++) {
+			int err = zvol_dumpio_vdev(vd->vdev_child[c],
+			    addr, offset, origoffset, size, doread, isdump);
+			if (err != 0) {
+				numerrors++;
+			} else if (doread) {
+				break;
+			}
 		}
 	}
 
-	if (!vd->vdev_ops->vdev_op_leaf)
+	if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
 		return (numerrors < vd->vdev_children ? 0 : EIO);
 
 	if (doread && !vdev_readable(vd))
-		return (EIO);
+		return (SET_ERROR(EIO));
 	else if (!doread && !vdev_writeable(vd))
-		return (EIO);
+		return (SET_ERROR(EIO));
+
+	if (vd->vdev_ops == &vdev_raidz_ops) {
+		return (vdev_raidz_physio(vd,
+		    addr, size, offset, origoffset, doread, isdump));
+	}
 
-	dvd = vd->vdev_tsd;
-	ASSERT3P(dvd, !=, NULL);
 	offset += VDEV_LABEL_START_SIZE;
 
 	if (ddi_in_panic() || isdump) {
 		ASSERT(!doread);
 		if (doread)
-			return (EIO);
+			return (SET_ERROR(EIO));
+		dvd = vd->vdev_tsd;
+		ASSERT3P(dvd, !=, NULL);
 		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
 		    lbtodb(size)));
 	} else {
-		return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
-		    doread ? B_READ : B_WRITE));
+		dvd = vd->vdev_tsd;
+		ASSERT3P(dvd, !=, NULL);
+		return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
+		    offset, doread ? B_READ : B_WRITE));
 	}
 }
 
@@ -1186,7 +1733,7 @@ zvol_dumpio(zvol_state_t *zv, void *addr
 	/* Must be sector aligned, and not straddle a block boundary. */
 	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
 	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 	}
 	ASSERT(size <= zv->zv_volblocksize);
 
@@ -1196,53 +1743,185 @@ zvol_dumpio(zvol_state_t *zv, void *addr
 		offset -= ze->ze_nblks * zv->zv_volblocksize;
 		ze = list_next(&zv->zv_extents, ze);
 	}
-	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+	if (ze == NULL)
+		return (SET_ERROR(EINVAL));
+
+	if (!ddi_in_panic())
+		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
 	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
 	offset += DVA_GET_OFFSET(&ze->ze_dva);
-	error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
-	spa_config_exit(spa, SCL_STATE, FTAG);
+	error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
+	    size, doread, isdump);
+
+	if (!ddi_in_panic())
+		spa_config_exit(spa, SCL_STATE, FTAG);
+
 	return (error);
 }
-#endif	/* __NetBSD__ */
+#else /* !illumos */
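+/* Dumpified zvols exist only on illumos; this stub keeps the dump I/O path uniform. */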
+static inline int
+zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
+    boolean_t doread, boolean_t isdump)
+{
+	return (0);
+}
+#endif /* illumos */
 
+#ifdef illumos
+int
+zvol_strategy(buf_t *bp)
+#endif
+#ifdef __FreeBSD__
+void
+zvol_strategy(struct bio *bp)
+#endif
+#ifdef __NetBSD__
 void
 zvol_strategy(buf_t *bp)
+#endif
 {
-	zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev));
+	zvol_state_t *zv;
 	uint64_t off, volsize;
 	size_t resid;
 	char *addr;
 	objset_t *os;
 	rl_t *rl;
 	int error = 0;
+#ifdef illumos
 	boolean_t doread = bp->b_flags & B_READ;
-	boolean_t is_dump = zv->zv_flags & ZVOL_DUMPIFIED;
+#else
+	boolean_t doread = 0;
+#endif
+	boolean_t is_dumpified;
 	boolean_t sync;
 
-	if (zv == NULL) {
-		bioerror(bp, ENXIO);
+#ifdef illumos
+	zfs_soft_state_t *zs = NULL;
+
+	if (getminor(bp->b_edev) == 0) {
+		error = SET_ERROR(EINVAL);
+	} else {
+		zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
+		if (zs == NULL)
+			error = SET_ERROR(ENXIO);
+		else if (zs->zss_type != ZSST_ZVOL)
+			error = SET_ERROR(EINVAL);
+	}
+
+	if (error) {
+		bioerror(bp, error);
 		biodone(bp);
-		return;
+		return (0);
+	}
+
+	zv = zs->zss_data;
+
+	if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
+		bioerror(bp, EROFS);
+		biodone(bp);
+		return (0);
+	}
+
+	off = ldbtob(bp->b_blkno);
+#endif /* illumos */
+#ifdef __FreeBSD__
+	if (bp->bio_to)
+		zv = bp->bio_to->private;
+	else
+		zv = bp->bio_dev->si_drv2;
+
+	if (zv == NULL) {
+		error = SET_ERROR(ENXIO);
+		goto out;
 	}
 
+	if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
+		error = SET_ERROR(EROFS);
+		goto out;
+	}
+
+	switch (bp->bio_cmd) {
+	case BIO_FLUSH:
+		goto sync;
+	case BIO_READ:
+		doread = 1;
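+		/* FALLTHROUGH */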
+	case BIO_WRITE:
+	case BIO_DELETE:
+		break;
+	default:
+		error = EOPNOTSUPP;
+		goto out;
+	}
+
+	off = bp->bio_offset;
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
+	zfs_soft_state_t *zs = NULL;
+
 	if (getminor(bp->b_edev) == 0) {
-		bioerror(bp, EINVAL);
+		error = SET_ERROR(EINVAL);
+	} else {
+		zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
+		if (zs == NULL)
+			error = SET_ERROR(ENXIO);
+		else if (zs->zss_type != ZSST_ZVOL)
+			error = SET_ERROR(EINVAL);
+	}
+
+	if (error) {
+		bioerror(bp, error);
 		biodone(bp);
 		return;
 	}
 
+	zv = zs->zss_data;
+
 	if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
 		bioerror(bp, EROFS);
 		biodone(bp);
 		return;
 	}
-
 	off = (uint64_t)bp->b_blkno * DEV_BSIZE;
+#endif
+
 	volsize = zv->zv_volsize;
 
 	os = zv->zv_objset;
 	ASSERT(os != NULL);
 
+#ifdef illumos
+	bp_mapin(bp);
+	addr = bp->b_un.b_addr;
+	resid = bp->b_bcount;
+
+	if (resid > 0 && (off < 0 || off >= volsize)) {
+		bioerror(bp, EIO);
+		biodone(bp);
+		return (0);
+	}
+
+	is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
+	sync = ((!(bp->b_flags & B_ASYNC) &&
+	    !(zv->zv_flags & ZVOL_WCE)) ||
+	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
+	    !doread && !is_dumpified;
+#endif /* illumos */
+#ifdef __FreeBSD__
+	addr = bp->bio_data;
+	resid = bp->bio_length;
+
+	if (resid > 0 && (off < 0 || off >= volsize)) {
+		error = SET_ERROR(EIO);
+		goto out;
+	}
+
+	is_dumpified = B_FALSE;
+	sync = !doread && !is_dumpified &&
+	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
 	addr = bp->b_data;
 	resid = bp->b_bcount;
 
@@ -1252,30 +1931,46 @@ zvol_strategy(buf_t *bp)
 		return;
 	}
 
-	sync = !(bp->b_flags & B_ASYNC) && !doread && !is_dump &&
-	    !(zv->zv_flags & ZVOL_WCE) && !zil_disable;
+	is_dumpified = B_FALSE;
+	sync = ((!(bp->b_flags & B_ASYNC) &&
+	    !(zv->zv_flags & ZVOL_WCE)) ||
+	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
+	    !doread && !is_dumpified;
 
-	/*
-	 * There must be no buffer changes when doing a dmu_sync() because
-	 * we can't change the data whilst calculating the checksum.
-	 */
 	mutex_enter(&zv->zv_dklock);
 	disk_busy(&zv->zv_dk);
 	mutex_exit(&zv->zv_dklock);
+#endif
 
+	/*
+	 * There must be no buffer changes when doing a dmu_sync() because
+	 * we can't change the data whilst calculating the checksum.
+	 */
 	rl = zfs_range_lock(&zv->zv_znode, off, resid,
 	    doread ? RL_READER : RL_WRITER);
 
+#ifdef __FreeBSD__
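+	/* BIO_DELETE: log a TX_TRUNCATE record, then free the backing range. */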
+	if (bp->bio_cmd == BIO_DELETE) {
+		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error != 0) {
+			dmu_tx_abort(tx);
+		} else {
+			zvol_log_truncate(zv, tx, off, resid, sync);
+			dmu_tx_commit(tx);
+			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+			    off, resid);
+			resid = 0;
+		}
+		goto unlock;
+	}
+#endif
 	while (resid != 0 && off < volsize) {
 		size_t size = MIN(resid, zvol_maxphys);
-		if (is_dump) {
-#ifdef __NetBSD__
-			printf("XXXNETBSD zvol_strategy: how?");
-#else
+		if (is_dumpified) {
 			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
 			error = zvol_dumpio(zv, addr, off, size,
 			    doread, B_FALSE);
-#endif
 		} else if (doread) {
 			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
 			    DMU_READ_PREFETCH);
@@ -1294,28 +1989,57 @@ zvol_strategy(buf_t *bp)
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
-				error = EIO;
+				error = SET_ERROR(EIO);
 			break;
 		}
 		off += size;
 		addr += size;
 		resid -= size;
 	}
+#ifdef __FreeBSD__
+unlock:
+#endif
 	zfs_range_unlock(rl);
 
+#ifdef illumos
+	if ((bp->b_resid = resid) == bp->b_bcount)
+		bioerror(bp, off > volsize ? EINVAL : error);
+
+	if (sync)
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+	biodone(bp);
+
+	return (0);
+#endif /* illumos */
+#ifdef __FreeBSD__
+	bp->bio_completed = bp->bio_length - resid;
+	if (bp->bio_completed < bp->bio_length && off > volsize)
+		error = EINVAL;
+
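+	/* BIO_FLUSH requests branch directly to the sync: label. */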
+	if (sync) {
+sync:
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+	}
+out:
+	if (bp->bio_to)
+		g_io_deliver(bp, error);
+	else
+		biofinish(bp, NULL, error);
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
 	if ((bp->b_resid = resid) == bp->b_bcount)
 		bioerror(bp, off > volsize ? EINVAL : error);
 
 	if (sync)
-		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 	mutex_enter(&zv->zv_dklock);
 	disk_unbusy(&zv->zv_dk, bp->b_bcount - bp->b_resid, doread);
 	mutex_exit(&zv->zv_dklock);
 	biodone(bp);
-
-	return;
+#endif /* __NetBSD__ */
 }
 
+#if defined(illumos) || defined(__NetBSD__)
 /*
  * Set the buffer count to the zvol maximum transfer.
  * Using our own routine instead of the default minphys()
@@ -1330,8 +2054,9 @@ zvol_minphys(struct buf *bp)
 	if (bp->b_bcount > zvol_maxphys)
 		bp->b_bcount = zvol_maxphys;
 }
+#endif
 
-#ifndef __NetBSD__
+#ifdef illumos
 int
 zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
 {
@@ -1342,12 +2067,12 @@ zvol_dump(dev_t dev, caddr_t addr, daddr
 	uint64_t boff;
 	uint64_t resid;
 
-	if (minor == 0)			/* This is the control device */
-		return (ENXIO);
-
-	zv = ddi_get_soft_state(zvol_state, minor);
+	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 	if (zv == NULL)
-		return (ENXIO);
+		return (SET_ERROR(ENXIO));
+
+	if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
+		return (SET_ERROR(EINVAL));
 
 	boff = ldbtob(blkno);
 	resid = ldbtob(nblocks);
@@ -1366,35 +2091,46 @@ zvol_dump(dev_t dev, caddr_t addr, daddr
 
 	return (error);
 }
-#endif	/* !__NetBSD__ */
+#endif
 
 /*ARGSUSED*/
+#if defined(illumos) || defined(__NetBSD__)
 int
 zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
+#endif
+#ifdef __FreeBSD__
+int
+zvol_read(struct cdev *dev, struct uio *uio, int ioflag)
+#endif
 {
-	minor_t minor = getminor(dev);
 	zvol_state_t *zv;
 	uint64_t volsize;
 	rl_t *rl;
 	int error = 0;
 
-	if (minor == 0)			/* This is the control device */
-		return (ENXIO);
+#if defined(illumos) || defined(__NetBSD__)
+	minor_t minor = getminor(dev);
 
-	zv = ddi_get_soft_state(zvol_state, minor);
+	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 	if (zv == NULL)
-		return (ENXIO);
+		return (SET_ERROR(ENXIO));
+#else
+	zv = dev->si_drv2;
+#endif
 
 	volsize = zv->zv_volsize;
+	/* uio_loffset == volsize isn't an error, as it's required for EOF processing. */
 	if (uio->uio_resid > 0 &&
-	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
-		return (EIO);
+	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
+		return (SET_ERROR(EIO));
 
+#ifdef illumos
 	if (zv->zv_flags & ZVOL_DUMPIFIED) {
 		error = physio(zvol_strategy, NULL, dev, B_READ,
 		    zvol_minphys, uio);
 		return (error);
 	}
+#endif
 
 	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
 	    RL_READER);
@@ -1405,11 +2141,11 @@ zvol_read(dev_t dev, uio_t *uio, cred_t 
 		if (bytes > volsize - uio->uio_loffset)
 			bytes = volsize - uio->uio_loffset;
 
-		error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
+		error =  dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes);
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
-				error = EIO;
+				error = SET_ERROR(EIO);
 			break;
 		}
 	}
@@ -1418,35 +2154,52 @@ zvol_read(dev_t dev, uio_t *uio, cred_t 
 }
 
 /*ARGSUSED*/
+#if defined(illumos) || defined(__NetBSD__)
 int
 zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
+#else
+int
+zvol_write(struct cdev *dev, struct uio *uio, int ioflag)
+#endif
 {
-	minor_t minor = getminor(dev);
 	zvol_state_t *zv;
 	uint64_t volsize;
 	rl_t *rl;
 	int error = 0;
 	boolean_t sync;
 
-	if (minor == 0)			/* This is the control device */
-		return (ENXIO);
+#if defined(illumos) || defined(__NetBSD__)
+	minor_t minor = getminor(dev);
 
-	zv = ddi_get_soft_state(zvol_state, minor);
+	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
 	if (zv == NULL)
-		return (ENXIO);
+		return (SET_ERROR(ENXIO));
+#else
+	zv = dev->si_drv2;
+#endif
 
 	volsize = zv->zv_volsize;
+	/* uio_loffset == volsize isn't an error, as it's required for EOF processing. */
 	if (uio->uio_resid > 0 &&
-	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
-		return (EIO);
+	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
+		return (SET_ERROR(EIO));
 
+#ifdef illumos
 	if (zv->zv_flags & ZVOL_DUMPIFIED) {
 		error = physio(zvol_strategy, NULL, dev, B_WRITE,
 		    zvol_minphys, uio);
 		return (error);
 	}
 
-	sync = !(zv->zv_flags & ZVOL_WCE) && !zil_disable;
+	sync = !(zv->zv_flags & ZVOL_WCE) ||
+#endif
+#ifdef __FreeBSD__
+	sync = (ioflag & IO_SYNC) ||
+#endif
+#ifdef __NetBSD__
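+	/* XXX "1 ||" forces every write synchronous; evidently a conservative placeholder for this port. */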
+	sync = 1 ||
+#endif
+	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
 
 	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
 	    RL_WRITER);
@@ -1464,7 +2217,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t
 			dmu_tx_abort(tx);
 			break;
 		}
-		error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx);
+		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
 		if (error == 0)
 			zvol_log_write(zv, tx, off, bytes, sync);
 		dmu_tx_commit(tx);
@@ -1474,89 +2227,23 @@ zvol_write(dev_t dev, uio_t *uio, cred_t
 	}
 	zfs_range_unlock(rl);
 	if (sync)
-		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 	return (error);
 }
 
-#ifdef __NetBSD__
-
-/*
- * Dirtbag ioctls to support newfs(1) for UFS filesystems.
- */
-/*ARGSUSED*/
+#ifdef illumos
 int
-zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
+zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
 {
-	zvol_state_t *zv;
-	int error = 0;
-	
-	mutex_enter(&zvol_state_lock);
-
-	zv = ddi_get_soft_state(zvol_state, getminor(dev));
-
-	if (zv == NULL) {
-		mutex_exit(&zvol_state_lock);
-		return (ENXIO);
-	}
-
-	switch(cmd) {
-	case DIOCGWEDGEINFO:
-	{
-		struct dkwedge_info *dkw = (void *) arg;
-
-		strlcpy(dkw->dkw_devname, zv->zv_name, 16);
-		strlcpy(dkw->dkw_wname, zv->zv_name, MAXPATHLEN);
-		strlcpy(dkw->dkw_parent, zv->zv_name, 16);
-		
-		dkw->dkw_offset = 0;
-		/* XXX NetBSD supports only DEV_BSIZE device block
-		   size zv_volblocksize >> DEV_BSIZE*/
-		dkw->dkw_size = (zv->zv_volsize / DEV_BSIZE);
-		dprintf("dkw %"PRIu64" volsize %"PRIu64" volblock %"PRIu64" \n",
-		    dkw->dkw_size, zv->zv_volsize, zv->zv_volblocksize);
-		strcpy(dkw->dkw_ptype, DKW_PTYPE_FFS);
-
-		break;
-	}
-
-	case DIOCGDISKINFO:
-	{
-		struct plistref *pref = (struct plistref *) arg;
-
-		if (zv->zv_dk.dk_info == NULL) {
-			mutex_exit(&zvol_state_lock);
-			return ENOTSUP;
-		} else
-			prop_dictionary_copyout_ioctl(pref, cmd,
-			    zv->zv_dk.dk_info);
-		
-		break;
-	}
-	
-	default:
-		aprint_debug("unknown disk_ioctl called\n");
-		error = ENOTTY;
-		break; 
-	}
-	
-	mutex_exit(&zvol_state_lock);
-	return (error);
-}
-
-#else	/* __NetBSD__ */
-
-int
-zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
-{
-	struct uuid uuid = EFI_RESERVED;
-	efi_gpe_t gpe = { 0 };
-	uint32_t crc;
-	dk_efi_t efi;
-	int length;
-	char *ptr;
+	struct uuid uuid = EFI_RESERVED;
+	efi_gpe_t gpe = { 0 };
+	uint32_t crc;
+	dk_efi_t efi;
+	int length;
+	char *ptr;
 
 	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
-		return (EFAULT);
+		return (SET_ERROR(EFAULT));
 	ptr = (char *)(uintptr_t)efi.dki_data_64;
 	length = efi.dki_length;
 	/*
@@ -1567,7 +2254,7 @@ zvol_getefi(void *arg, int flag, uint64_
 	 * PMBR.
 	 */
 	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
-		return (EINVAL);
+		return (SET_ERROR(EINVAL));
 
 	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
 	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
@@ -1592,78 +2279,199 @@ zvol_getefi(void *arg, int flag, uint64_
 		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
 		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
 		    flag))
-			return (EFAULT);
+			return (SET_ERROR(EFAULT));
 		ptr += sizeof (gpt);
 		length -= sizeof (gpt);
 	}
 	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
 	    length), flag))
-		return (EFAULT);
+		return (SET_ERROR(EFAULT));
 	return (0);
 }
 
 /*
+ * BEGIN entry points to allow external callers access to the volume.
+ */
+/*
+ * Return the volume parameters needed for access from an external caller.
+ * These values are invariant as long as the volume is held open.
+ */
+int
+zvol_get_volume_params(minor_t minor, uint64_t *blksize,
+    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
+    void **rl_hdl, void **bonus_hdl)
+{
+	zvol_state_t *zv;
+
+	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+	if (zv == NULL)
+		return (SET_ERROR(ENXIO));
+	if (zv->zv_flags & ZVOL_DUMPIFIED)
+		return (SET_ERROR(ENXIO));
+
+	ASSERT(blksize && max_xfer_len && minor_hdl &&
+	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
+
+	*blksize = zv->zv_volblocksize;
+	*max_xfer_len = (uint64_t)zvol_maxphys;
+	*minor_hdl = zv;
+	*objset_hdl = zv->zv_objset;
+	*zil_hdl = zv->zv_zilog;
+	*rl_hdl = &zv->zv_znode;
+	*bonus_hdl = zv->zv_dbuf;
+	return (0);
+}
+
+/*
+ * Return the current volume size to an external caller.
+ * The size can change while the volume is open.
+ */
+uint64_t
+zvol_get_volume_size(void *minor_hdl)
+{
+	zvol_state_t *zv = minor_hdl;
+
+	return (zv->zv_volsize);
+}
+
+/*
+ * Return the current WCE setting to an external caller.
+ * The WCE setting can change while the volume is open.
+ */
+int
+zvol_get_volume_wce(void *minor_hdl)
+{
+	zvol_state_t *zv = minor_hdl;
+
+	return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
+}
+
+/*
+ * Entry point for external callers to zvol_log_write
+ */
+void
+zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
+    boolean_t sync)
+{
+	zvol_state_t *zv = minor_hdl;
+
+	zvol_log_write(zv, tx, off, resid, sync);
+}
+/*
+ * END entry points to allow external callers access to the volume.
+ */
+#endif	/* illumos */
+
+/*
+ * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
+ */
+static void
+zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
+    boolean_t sync)
+{
+	itx_t *itx;
+	lr_truncate_t *lr;
+	zilog_t *zilog = zv->zv_zilog;
+
+	if (zil_replaying(zilog, tx))
+		return;
+
+	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+	lr = (lr_truncate_t *)&itx->itx_lr;
+	lr->lr_foid = ZVOL_OBJ;
+	lr->lr_offset = off;
+	lr->lr_length = len;
+
+	itx->itx_sync = (sync || zv->zv_sync_cnt != 0);
+	zil_itx_assign(zilog, itx, tx);
+}
+
+#ifdef illumos
+/*
  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
+ * Also a dirtbag dkio ioctl for unmap/free-block functionality.
  */
 /*ARGSUSED*/
 int
 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
 {
 	zvol_state_t *zv;
-	struct dk_cinfo dki;
-	struct dk_minfo dkm;
 	struct dk_callback *dkc;
 	int error = 0;
 	rl_t *rl;
 
-	mutex_enter(&zvol_state_lock);
+	mutex_enter(&zfsdev_state_lock);
 
-	zv = ddi_get_soft_state(zvol_state, getminor(dev));
+	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
 
 	if (zv == NULL) {
-		mutex_exit(&zvol_state_lock);
-		return (ENXIO);
+		mutex_exit(&zfsdev_state_lock);
+		return (SET_ERROR(ENXIO));
 	}
 	ASSERT(zv->zv_total_opens > 0);
 
 	switch (cmd) {
 
 	case DKIOCINFO:
+	{
+		struct dk_cinfo dki;
+
 		bzero(&dki, sizeof (dki));
 		(void) strcpy(dki.dki_cname, "zvol");
 		(void) strcpy(dki.dki_dname, "zvol");
 		dki.dki_ctype = DKC_UNKNOWN;
 		dki.dki_unit = getminor(dev);
-		dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
-		mutex_exit(&zvol_state_lock);
+		dki.dki_maxtransfer =
+		    1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
+		mutex_exit(&zfsdev_state_lock);
 		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
-			error = EFAULT;
+			error = SET_ERROR(EFAULT);
 		return (error);
+	}
 
 	case DKIOCGMEDIAINFO:
+	{
+		struct dk_minfo dkm;
+
 		bzero(&dkm, sizeof (dkm));
 		dkm.dki_lbsize = 1U << zv->zv_min_bs;
 		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
 		dkm.dki_media_type = DK_UNKNOWN;
-		mutex_exit(&zvol_state_lock);
+		mutex_exit(&zfsdev_state_lock);
 		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
-			error = EFAULT;
+			error = SET_ERROR(EFAULT);
 		return (error);
+	}
+
+	case DKIOCGMEDIAINFOEXT:
+	{
+		struct dk_minfo_ext dkmext;
+
+		bzero(&dkmext, sizeof (dkmext));
+		dkmext.dki_lbsize = 1U << zv->zv_min_bs;
+		dkmext.dki_pbsize = zv->zv_volblocksize;
+		dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
+		dkmext.dki_media_type = DK_UNKNOWN;
+		mutex_exit(&zfsdev_state_lock);
+		if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
+			error = SET_ERROR(EFAULT);
+		return (error);
+	}
 
 	case DKIOCGETEFI:
-		{
-			uint64_t vs = zv->zv_volsize;
-			uint8_t bs = zv->zv_min_bs;
+	{
+		uint64_t vs = zv->zv_volsize;
+		uint8_t bs = zv->zv_min_bs;
 
-			mutex_exit(&zvol_state_lock);
-			error = zvol_getefi((void *)arg, flag, vs, bs);
-			return (error);
-		}
+		mutex_exit(&zfsdev_state_lock);
+		error = zvol_getefi((void *)arg, flag, vs, bs);
+		return (error);
+	}
 
 	case DKIOCFLUSHWRITECACHE:
 		dkc = (struct dk_callback *)arg;
-		mutex_exit(&zvol_state_lock);
-		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
+		mutex_exit(&zfsdev_state_lock);
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
 			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
 			error = 0;
@@ -1671,31 +2479,31 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t 
 		return (error);
 
 	case DKIOCGETWCE:
-		{
-			int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
-			if (ddi_copyout(&wce, (void *)arg, sizeof (int),
-			    flag))
-				error = EFAULT;
+	{
+		int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
+		if (ddi_copyout(&wce, (void *)arg, sizeof (int),
+		    flag))
+			error = SET_ERROR(EFAULT);
+		break;
+	}
+	case DKIOCSETWCE:
+	{
+		int wce;
+		if (ddi_copyin((void *)arg, &wce, sizeof (int),
+		    flag)) {
+			error = SET_ERROR(EFAULT);
 			break;
 		}
-	case DKIOCSETWCE:
-		{
-			int wce;
-			if (ddi_copyin((void *)arg, &wce, sizeof (int),
-			    flag)) {
-				error = EFAULT;
-				break;
-			}
-			if (wce) {
-				zv->zv_flags |= ZVOL_WCE;
-				mutex_exit(&zvol_state_lock);
-			} else {
-				zv->zv_flags &= ~ZVOL_WCE;
-				mutex_exit(&zvol_state_lock);
-				zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
-			}
-			return (0);
+		if (wce) {
+			zv->zv_flags |= ZVOL_WCE;
+			mutex_exit(&zfsdev_state_lock);
+		} else {
+			zv->zv_flags &= ~ZVOL_WCE;
+			mutex_exit(&zfsdev_state_lock);
+			zil_commit(zv->zv_zilog, ZVOL_OBJ);
 		}
+		return (0);
+	}
 
 	case DKIOCGGEOM:
 	case DKIOCGVTOC:
@@ -1703,7 +2511,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t 
 		 * commands using these (like prtvtoc) expect ENOTSUP
 		 * since we're emulating an EFI label
 		 */
-		error = ENOTSUP;
+		error = SET_ERROR(ENOTSUP);
 		break;
 
 	case DKIOCDUMPINIT:
@@ -1722,16 +2530,73 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t 
 		zfs_range_unlock(rl);
 		break;
 
+	case DKIOCFREE:
+	{
+		dkioc_free_t df;
+		dmu_tx_t *tx;
+
+		if (!zvol_unmap_enabled)
+			break;
+
+		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
+			error = SET_ERROR(EFAULT);
+			break;
+		}
+
+		/*
+		 * Apply Postel's Law to length-checking.  If they overshoot,
+		 * just blank out until the end, if there's a need to blank
+		 * out anything.
+		 */
+		if (df.df_start >= zv->zv_volsize)
+			break;	/* No need to do anything... */
+
+		mutex_exit(&zfsdev_state_lock);
+
+		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
+		    RL_WRITER);
+		tx = dmu_tx_create(zv->zv_objset);
+		dmu_tx_mark_netfree(tx);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error != 0) {
+			dmu_tx_abort(tx);
+		} else {
+			zvol_log_truncate(zv, tx, df.df_start,
+			    df.df_length, B_TRUE);
+			dmu_tx_commit(tx);
+			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+			    df.df_start, df.df_length);
+		}
+
+		zfs_range_unlock(rl);
+
+		/*
+		 * If the write-cache is disabled, 'sync' property
+		 * is set to 'always', or if the caller is asking for
+		 * a synchronous free, commit this operation to the zil.
+		 * This will sync any previous uncommitted writes to the
+		 * zvol object.
+		 * Can be overridden by the zvol_unmap_sync_enabled tunable.
+		 */
+		if ((error == 0) && zvol_unmap_sync_enabled &&
+		    (!(zv->zv_flags & ZVOL_WCE) ||
+		    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ||
+		    (df.df_flags & DF_WAIT_SYNC))) {
+			zil_commit(zv->zv_zilog, ZVOL_OBJ);
+		}
+
+		return (error);
+	}
+
 	default:
-		error = ENOTTY;
+		error = SET_ERROR(ENOTTY);
 		break;
 
 	}
-	mutex_exit(&zvol_state_lock);
+	mutex_exit(&zfsdev_state_lock);
 	return (error);
 }
-
-#endif	/* __NetBSD__ */
+#endif	/* illumos */
 
 int
 zvol_busy(void)
@@ -1742,38 +2607,118 @@ zvol_busy(void)
 void
 zvol_init(void)
 {
-	VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0);
-	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
+	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
+	    1) == 0);
+#ifndef __FreeBSD__
+	mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
+#endif
+	ZFS_LOG(1, "ZVOL Initialized.");
 }
 
 void
 zvol_fini(void)
 {
-	mutex_destroy(&zvol_state_lock);
-	ddi_soft_state_fini(&zvol_state);
+#ifndef __FreeBSD__
+	mutex_destroy(&zfsdev_state_lock);
+#endif
+	ddi_soft_state_fini(&zfsdev_state);
+	ZFS_LOG(1, "ZVOL Deinitialized.");
+}
+
+#ifdef illumos
+/*ARGSUSED*/
+static int
+zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+	if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
+		return (1);
+	return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+	spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
 }
 
-#ifndef __NetBSD__
 static int
 zvol_dump_init(zvol_state_t *zv, boolean_t resize)
 {
 	dmu_tx_t *tx;
-	int error = 0;
+	int error;
 	objset_t *os = zv->zv_objset;
+	spa_t *spa = dmu_objset_spa(os);
+	vdev_t *vd = spa->spa_root_vdev;
 	nvlist_t *nv = NULL;
-	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
+	uint64_t version = spa_version(spa);
+	uint64_t checksum, compress, refresrv, vbs, dedup;
+
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+	ASSERT(vd->vdev_ops == &vdev_root_ops);
 
-	ASSERT(MUTEX_HELD(&zvol_state_lock));
 	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
 	    DMU_OBJECT_END);
+	if (error != 0)
+		return (error);
 	/* wait for dmu_free_long_range to actually free the blocks */
 	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 
+	/*
+	 * If the pool on which the dump device is being initialized has more
+	 * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
+	 * enabled.  If so, bump that feature's counter to indicate that the
+	 * feature is active. We also check the vdev type to handle the
+	 * following case:
+	 *   # zpool create test raidz disk1 disk2 disk3
+	 *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
+	 *   the raidz vdev itself has 3 children.
+	 */
+	if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
+		if (!spa_feature_is_enabled(spa,
+		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
+			return (SET_ERROR(ENOTSUP));
+		(void) dsl_sync_task(spa_name(spa),
+		    zfs_mvdev_dump_feature_check,
+		    zfs_mvdev_dump_activate_feature_sync, NULL,
+		    2, ZFS_SPACE_CHECK_RESERVED);
+	}
+
+	if (!resize) {
+		error = dsl_prop_get_integer(zv->zv_name,
+		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
+		if (error == 0) {
+			error = dsl_prop_get_integer(zv->zv_name,
+			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
+			    NULL);
+		}
+		if (error == 0) {
+			error = dsl_prop_get_integer(zv->zv_name,
+			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+			    &refresrv, NULL);
+		}
+		if (error == 0) {
+			error = dsl_prop_get_integer(zv->zv_name,
+			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
+			    NULL);
+		}
+		if (version >= SPA_VERSION_DEDUP && error == 0) {
+			error = dsl_prop_get_integer(zv->zv_name,
+			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
+		}
+	}
+	if (error != 0)
+		return (error);
+
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
 	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
+	if (error != 0) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
@@ -1789,42 +2734,35 @@ zvol_dump_init(zvol_state_t *zv, boolean
 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
 		    &zv->zv_volsize, tx);
 	} else {
-		uint64_t checksum, compress, refresrv, vbs, dedup;
-
-		error = dsl_prop_get_integer(zv->zv_name,
-		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
-		error = error ? error : dsl_prop_get_integer(zv->zv_name,
-		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
-		error = error ? error : dsl_prop_get_integer(zv->zv_name,
-		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
-		error = error ? error : dsl_prop_get_integer(zv->zv_name,
-		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
-		if (version >= SPA_VERSION_DEDUP) {
-			error = error ? error :
-			    dsl_prop_get_integer(zv->zv_name,
-			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
-		}
-
-		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
+		error = zap_update(os, ZVOL_ZAP_OBJ,
 		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
 		    &compress, tx);
-		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
-		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
-		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
-		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
-		    &refresrv, tx);
-		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
-		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
-		    &vbs, tx);
-		error = error ? error : dmu_object_set_blocksize(
-		    os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
-		if (version >= SPA_VERSION_DEDUP) {
-			error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
+		if (error == 0) {
+			error = zap_update(os, ZVOL_ZAP_OBJ,
+			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
+			    &checksum, tx);
+		}
+		if (error == 0) {
+			error = zap_update(os, ZVOL_ZAP_OBJ,
+			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
+			    &refresrv, tx);
+		}
+		if (error == 0) {
+			error = zap_update(os, ZVOL_ZAP_OBJ,
+			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
+			    &vbs, tx);
+		}
+		if (error == 0) {
+			error = dmu_object_set_blocksize(
+			    os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
+		}
+		if (version >= SPA_VERSION_DEDUP && error == 0) {
+			error = zap_update(os, ZVOL_ZAP_OBJ,
 			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
 			    &dedup, tx);
 		}
 		if (error == 0)
-			zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
+			zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
 	}
 	dmu_tx_commit(tx);
 
@@ -1832,7 +2770,15 @@ zvol_dump_init(zvol_state_t *zv, boolean
 	 * We only need to update the zvol's property if we are initializing
 	 * the dump area for the first time.
 	 */
-	if (!resize) {
+	if (error == 0 && !resize) {
+		/*
+		 * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
+		 * function.  Otherwise, use the old default -- OFF.
+		 */
+		checksum = spa_feature_is_active(spa,
+		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
+		    ZIO_CHECKSUM_OFF;
+
 		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_uint64(nv,
 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
@@ -1841,7 +2787,7 @@ zvol_dump_init(zvol_state_t *zv, boolean
 		    ZIO_COMPRESS_OFF) == 0);
 		VERIFY(nvlist_add_uint64(nv,
 		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
-		    ZIO_CHECKSUM_OFF) == 0);
+		    checksum) == 0);
 		if (version >= SPA_VERSION_DEDUP) {
 			VERIFY(nvlist_add_uint64(nv,
 			    zfs_prop_to_name(ZFS_PROP_DEDUP),
@@ -1851,13 +2797,11 @@ zvol_dump_init(zvol_state_t *zv, boolean
 		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
 		    nv, NULL);
 		nvlist_free(nv);
-
-		if (error)
-			return (error);
 	}
 
 	/* Allocate the space for the dump */
-	error = zvol_prealloc(zv);
+	if (error == 0)
+		error = zvol_prealloc(zv);
 	return (error);
 }
 
@@ -1870,11 +2814,11 @@ zvol_dumpify(zvol_state_t *zv)
 	objset_t *os = zv->zv_objset;
 
 	if (zv->zv_flags & ZVOL_RDONLY)
-		return (EROFS);
+		return (SET_ERROR(EROFS));
 
 	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
 	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
-		boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE;
+		boolean_t resize = (dumpsize > 0);
 
 		if ((error = zvol_dump_init(zv, resize)) != 0) {
 			(void) zvol_dump_fini(zv);
@@ -1985,4 +2929,659 @@ zvol_dump_fini(zvol_state_t *zv)
 
 	return (0);
 }
-#endif	/* !__NetBSD__ */
+#endif /* illumos */
+
+#ifdef __FreeBSD__
+static void
+zvol_geom_run(zvol_state_t *zv)
+{
+	struct g_provider *pp;
+
+	pp = zv->zv_provider;
+	g_error_provider(pp, 0);
+
+	kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
+	    "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
+}
+
+static void
+zvol_geom_destroy(zvol_state_t *zv)
+{
+	struct g_provider *pp;
+
+	g_topology_assert();
+
+	mtx_lock(&zv->zv_queue_mtx);
+	zv->zv_state = 1;
+	wakeup_one(&zv->zv_queue);
+	while (zv->zv_state != 2)
+		msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
+	mtx_destroy(&zv->zv_queue_mtx);
+
+	pp = zv->zv_provider;
+	zv->zv_provider = NULL;
+	pp->private = NULL;
+	g_wither_geom(pp->geom, ENXIO);
+}
+
+static int
+zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
+{
+	int count, error, flags;
+
+	g_topology_assert();
+
+	/*
+	 * To make it easier, we expect either open or close, but not both
+	 * at the same time.
+	 */
+	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
+	    (acr <= 0 && acw <= 0 && ace <= 0),
+	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
+	    pp->name, acr, acw, ace));
+
+	if (pp->private == NULL) {
+		if (acr <= 0 && acw <= 0 && ace <= 0)
+			return (0);
+		return (pp->error);
+	}
+
+	/*
+	 * We don't pass FEXCL flag to zvol_open()/zvol_close() if ace != 0,
+	 * because GEOM already handles that and handles it a bit differently.
+	 * GEOM allows for multiple read/exclusive consumers and ZFS allows
+	 * only one exclusive consumer, no matter if it is reader or writer.
+	 * I prefer the way GEOM works, so I'll leave it to GEOM to
+	 * decide what to do.
+	 */
+
+	count = acr + acw + ace;
+	if (count == 0)
+		return (0);
+
+	flags = 0;
+	if (acr != 0 || ace != 0)
+		flags |= FREAD;
+	if (acw != 0)
+		flags |= FWRITE;
+
+	g_topology_unlock();
+	if (count > 0)
+		error = zvol_open(pp, flags, count);
+	else
+		error = zvol_close(pp, flags, -count);
+	g_topology_lock();
+	return (error);
+}
+
+static void
+zvol_geom_start(struct bio *bp)
+{
+	zvol_state_t *zv;
+	boolean_t first;
+
+	zv = bp->bio_to->private;
+	ASSERT(zv != NULL);
+	switch (bp->bio_cmd) {
+	case BIO_FLUSH:
+		if (!THREAD_CAN_SLEEP())
+			goto enqueue;
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+		g_io_deliver(bp, 0);
+		break;
+	case BIO_READ:
+	case BIO_WRITE:
+	case BIO_DELETE:
+		if (!THREAD_CAN_SLEEP())
+			goto enqueue;
+		zvol_strategy(bp);
+		break;
+	case BIO_GETATTR: {
+		spa_t *spa = dmu_objset_spa(zv->zv_objset);
+		uint64_t refd, avail, usedobjs, availobjs, val;
+
+		if (g_handleattr_int(bp, "GEOM::candelete", 1))
+			return;
+		if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
+			dmu_objset_space(zv->zv_objset, &refd, &avail,
+			    &usedobjs, &availobjs);
+			if (g_handleattr_off_t(bp, "blocksavail",
+			    avail / DEV_BSIZE))
+				return;
+		} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
+			dmu_objset_space(zv->zv_objset, &refd, &avail,
+			    &usedobjs, &availobjs);
+			if (g_handleattr_off_t(bp, "blocksused",
+			    refd / DEV_BSIZE))
+				return;
+		} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
+			avail = metaslab_class_get_space(spa_normal_class(spa));
+			avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+			if (g_handleattr_off_t(bp, "poolblocksavail",
+			    avail / DEV_BSIZE))
+				return;
+		} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
+			refd = metaslab_class_get_alloc(spa_normal_class(spa));
+			if (g_handleattr_off_t(bp, "poolblocksused",
+			    refd / DEV_BSIZE))
+				return;
+		}
+		/* FALLTHROUGH */
+	}
+	default:
+		g_io_deliver(bp, EOPNOTSUPP);
+		break;
+	}
+	return;
+
+enqueue:
+	mtx_lock(&zv->zv_queue_mtx);
+	first = (bioq_first(&zv->zv_queue) == NULL);
+	bioq_insert_tail(&zv->zv_queue, bp);
+	mtx_unlock(&zv->zv_queue_mtx);
+	if (first)
+		wakeup_one(&zv->zv_queue);
+}
+
+static void
+zvol_geom_worker(void *arg)
+{
+	zvol_state_t *zv;
+	struct bio *bp;
+
+	thread_lock(curthread);
+	sched_prio(curthread, PRIBIO);
+	thread_unlock(curthread);
+
+	zv = arg;
+	for (;;) {
+		mtx_lock(&zv->zv_queue_mtx);
+		bp = bioq_takefirst(&zv->zv_queue);
+		if (bp == NULL) {
+			if (zv->zv_state == 1) {
+				zv->zv_state = 2;
+				wakeup(&zv->zv_state);
+				mtx_unlock(&zv->zv_queue_mtx);
+				kthread_exit();
+			}
+			msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
+			    "zvol:io", 0);
+			continue;
+		}
+		mtx_unlock(&zv->zv_queue_mtx);
+		switch (bp->bio_cmd) {
+		case BIO_FLUSH:
+			zil_commit(zv->zv_zilog, ZVOL_OBJ);
+			g_io_deliver(bp, 0);
+			break;
+		case BIO_READ:
+		case BIO_WRITE:
+		case BIO_DELETE:
+			zvol_strategy(bp);
+			break;
+		default:
+			g_io_deliver(bp, EOPNOTSUPP);
+			break;
+		}
+	}
+}
+
+extern boolean_t dataset_name_hidden(const char *name);
+
+static int
+zvol_create_snapshots(objset_t *os, const char *name)
+{
+	uint64_t cookie, obj;
+	char *sname;
+	int error, len;
+
+	cookie = obj = 0;
+	sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+#if 0
+	(void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
+	    DS_FIND_SNAPSHOTS);
+#endif
+
+	for (;;) {
+		len = snprintf(sname, MAXPATHLEN, "%s@", name);
+		if (len >= MAXPATHLEN) {
+			dmu_objset_rele(os, FTAG);
+			error = ENAMETOOLONG;
+			break;
+		}
+
+		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+		error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
+		    sname + len, &obj, &cookie, NULL);
+		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+		if (error != 0) {
+			if (error == ENOENT)
+				error = 0;
+			break;
+		}
+
+		error = zvol_create_minor(sname);
+		if (error != 0 && error != EEXIST) {
+			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
+			    sname, error);
+			break;
+		}
+	}
+
+	kmem_free(sname, MAXPATHLEN);
+	return (error);
+}
+
+int
+zvol_create_minors(const char *name)
+{
+	uint64_t cookie;
+	objset_t *os;
+	char *osname, *p;
+	int error, len;
+
+	if (dataset_name_hidden(name))
+		return (0);
+
+	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
+		printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
+		    name, error);
+		return (error);
+	}
+	if (dmu_objset_type(os) == DMU_OST_ZVOL) {
+		dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
+		dsl_pool_rele(dmu_objset_pool(os), FTAG);
+		error = zvol_create_minor(name);
+		if (error == 0 || error == EEXIST) {
+			error = zvol_create_snapshots(os, name);
+		} else {
+			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
+			    name, error);
+		}
+		dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
+		dsl_dataset_rele(os->os_dsl_dataset, FTAG);
+		return (error);
+	}
+	if (dmu_objset_type(os) != DMU_OST_ZFS) {
+		dmu_objset_rele(os, FTAG);
+		return (0);
+	}
+
+	osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+	if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
+		dmu_objset_rele(os, FTAG);
+		kmem_free(osname, MAXPATHLEN);
+		return (ENOENT);
+	}
+	p = osname + strlen(osname);
+	len = MAXPATHLEN - (p - osname);
+
+#if 0
+	/* Prefetch the datasets. */
+	cookie = 0;
+	while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
+		if (!dataset_name_hidden(osname))
+			(void) dmu_objset_prefetch(osname, NULL);
+	}
+#endif
+
+	cookie = 0;
+	while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
+	    &cookie) == 0) {
+		dmu_objset_rele(os, FTAG);
+		(void) zvol_create_minors(osname);
+		if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
+			printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
+			    name, error);
+			return (error);
+		}
+	}
+
+	dmu_objset_rele(os, FTAG);
+	kmem_free(osname, MAXPATHLEN);
+	return (0);
+}
+
+static void
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
+{
+	struct g_geom *gp;
+	struct g_provider *pp;
+	struct cdev *dev;
+
+	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+		g_topology_lock();
+		pp = zv->zv_provider;
+		ASSERT(pp != NULL);
+		gp = pp->geom;
+		ASSERT(gp != NULL);
+
+		zv->zv_provider = NULL;
+		g_wither_provider(pp, ENXIO);
+
+		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
+		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+		pp->sectorsize = DEV_BSIZE;
+		pp->mediasize = zv->zv_volsize;
+		pp->private = zv;
+		zv->zv_provider = pp;
+		g_error_provider(pp, 0);
+		g_topology_unlock();
+	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+		struct make_dev_args args;
+
+		if ((dev = zv->zv_dev) != NULL) {
+			zv->zv_dev = NULL;
+			destroy_dev(dev);
+			if (zv->zv_total_opens > 0) {
+				zv->zv_flags &= ~ZVOL_EXCL;
+				zv->zv_total_opens = 0;
+				zvol_last_close(zv);
+			}
+		}
+
+		make_dev_args_init(&args);
+		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+		args.mda_devsw = &zvol_cdevsw;
+		args.mda_cr = NULL;
+		args.mda_uid = UID_ROOT;
+		args.mda_gid = GID_OPERATOR;
+		args.mda_mode = 0640;
+		args.mda_si_drv2 = zv;
+		if (make_dev_s(&args, &zv->zv_dev,
+		    "%s/%s", ZVOL_DRIVER, newname) == 0)
+			zv->zv_dev->si_iosize_max = MAXPHYS;
+	}
+	strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
+}
+
+void
+zvol_rename_minors(const char *oldname, const char *newname)
+{
+	char name[MAXPATHLEN];
+	struct g_provider *pp;
+	struct g_geom *gp;
+	size_t oldnamelen, newnamelen;
+	zvol_state_t *zv;
+	char *namebuf;
+	boolean_t locked = B_FALSE;
+
+	oldnamelen = strlen(oldname);
+	newnamelen = strlen(newname);
+
+	DROP_GIANT();
+	/* See comment in zvol_open(). */
+	if (!MUTEX_HELD(&zfsdev_state_lock)) {
+		mutex_enter(&zfsdev_state_lock);
+		locked = B_TRUE;
+	}
+
+	LIST_FOREACH(zv, &all_zvols, zv_links) {
+		if (strcmp(zv->zv_name, oldname) == 0) {
+			zvol_rename_minor(zv, newname);
+		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
+		    (zv->zv_name[oldnamelen] == '/' ||
+		     zv->zv_name[oldnamelen] == '@')) {
+			snprintf(name, sizeof(name), "%s%c%s", newname,
+			    zv->zv_name[oldnamelen],
+			    zv->zv_name + oldnamelen + 1);
+			zvol_rename_minor(zv, name);
+		}
+	}
+
+	if (locked)
+		mutex_exit(&zfsdev_state_lock);
+	PICKUP_GIANT();
+}
+
+static int
+zvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+	zvol_state_t *zv = dev->si_drv2;
+	int err = 0;
+
+	mutex_enter(&zfsdev_state_lock);
+	if (zv->zv_total_opens == 0)
+		err = zvol_first_open(zv);
+	if (err) {
+		mutex_exit(&zfsdev_state_lock);
+		return (err);
+	}
+	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+		err = SET_ERROR(EROFS);
+		goto out;
+	}
+	if (zv->zv_flags & ZVOL_EXCL) {
+		err = SET_ERROR(EBUSY);
+		goto out;
+	}
+#ifdef FEXCL
+	if (flags & FEXCL) {
+		if (zv->zv_total_opens != 0) {
+			err = SET_ERROR(EBUSY);
+			goto out;
+		}
+		zv->zv_flags |= ZVOL_EXCL;
+	}
+#endif
+
+	zv->zv_total_opens++;
+	if (flags & (FSYNC | FDSYNC)) {
+		zv->zv_sync_cnt++;
+		if (zv->zv_sync_cnt == 1)
+			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
+	}
+	mutex_exit(&zfsdev_state_lock);
+	return (err);
+out:
+	if (zv->zv_total_opens == 0)
+		zvol_last_close(zv);
+	mutex_exit(&zfsdev_state_lock);
+	return (err);
+}
+
+static int
+zvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+	zvol_state_t *zv = dev->si_drv2;
+
+	mutex_enter(&zfsdev_state_lock);
+	if (zv->zv_flags & ZVOL_EXCL) {
+		ASSERT(zv->zv_total_opens == 1);
+		zv->zv_flags &= ~ZVOL_EXCL;
+	}
+
+	/*
+	 * If the open count is zero, this is a spurious close.
+	 * That indicates a bug in the kernel / DDI framework.
+	 */
+	ASSERT(zv->zv_total_opens != 0);
+
+	/*
+	 * You may get multiple opens, but only one close.
+	 */
+	zv->zv_total_opens--;
+	if (flags & (FSYNC | FDSYNC))
+		zv->zv_sync_cnt--;
+
+	if (zv->zv_total_opens == 0)
+		zvol_last_close(zv);
+
+	mutex_exit(&zfsdev_state_lock);
+	return (0);
+}
+
+static int
+zvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
+{
+	zvol_state_t *zv;
+	rl_t *rl;
+	off_t offset, length;
+	int i, error;
+	boolean_t sync;
+
+	zv = dev->si_drv2;
+
+	error = 0;
+	KASSERT(zv->zv_total_opens > 0,
+	    ("Device with zero access count in zvol_d_ioctl"));
+
+	i = IOCPARM_LEN(cmd);
+	switch (cmd) {
+	case DIOCGSECTORSIZE:
+		*(u_int *)data = DEV_BSIZE;
+		break;
+	case DIOCGMEDIASIZE:
+		*(off_t *)data = zv->zv_volsize;
+		break;
+	case DIOCGFLUSH:
+		zil_commit(zv->zv_zilog, ZVOL_OBJ);
+		break;
+	case DIOCGDELETE:
+		if (!zvol_unmap_enabled)
+			break;
+
+		offset = ((off_t *)data)[0];
+		length = ((off_t *)data)[1];
+		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
+		    offset < 0 || offset >= zv->zv_volsize ||
+		    length <= 0) {
+			printf("%s: offset=%jd length=%jd\n", __func__, offset,
+			    length);
+			error = EINVAL;
+			break;
+		}
+
+		rl = zfs_range_lock(&zv->zv_znode, offset, length, RL_WRITER);
+		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error != 0) {
+			sync = FALSE;
+			dmu_tx_abort(tx);
+		} else {
+			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+			zvol_log_truncate(zv, tx, offset, length, sync);
+			dmu_tx_commit(tx);
+			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+			    offset, length);
+		}
+		zfs_range_unlock(rl);
+		if (sync)
+			zil_commit(zv->zv_zilog, ZVOL_OBJ);
+		break;
+	case DIOCGSTRIPESIZE:
+		*(off_t *)data = zv->zv_volblocksize;
+		break;
+	case DIOCGSTRIPEOFFSET:
+		*(off_t *)data = 0;
+		break;
+	case DIOCGATTR: {
+		spa_t *spa = dmu_objset_spa(zv->zv_objset);
+		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
+		uint64_t refd, avail, usedobjs, availobjs;
+
+		if (strcmp(arg->name, "GEOM::candelete") == 0)
+			arg->value.i = 1;
+		else if (strcmp(arg->name, "blocksavail") == 0) {
+			dmu_objset_space(zv->zv_objset, &refd, &avail,
+			    &usedobjs, &availobjs);
+			arg->value.off = avail / DEV_BSIZE;
+		} else if (strcmp(arg->name, "blocksused") == 0) {
+			dmu_objset_space(zv->zv_objset, &refd, &avail,
+			    &usedobjs, &availobjs);
+			arg->value.off = refd / DEV_BSIZE;
+		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
+			avail = metaslab_class_get_space(spa_normal_class(spa));
+			avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+			arg->value.off = avail / DEV_BSIZE;
+		} else if (strcmp(arg->name, "poolblocksused") == 0) {
+			refd = metaslab_class_get_alloc(spa_normal_class(spa));
+			arg->value.off = refd / DEV_BSIZE;
+		} else
+			error = ENOIOCTL;
+		break;
+	}
+	case FIOSEEKHOLE:
+	case FIOSEEKDATA: {
+		off_t *off = (off_t *)data;
+		uint64_t noff;
+		boolean_t hole;
+
+		hole = (cmd == FIOSEEKHOLE);
+		noff = *off;
+		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
+		*off = noff;
+		break;
+	}
+	default:
+		error = ENOIOCTL;
+	}
+
+	return (error);
+}
+#endif /* __FreeBSD__ */
+
+#ifdef __NetBSD__
+/*ARGSUSED*/
+int
+zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
+{
+	zvol_state_t *zv;
+	int error = 0;
+
+	mutex_enter(&zfsdev_state_lock);
+
+	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
+
+	if (zv == NULL) {
+		mutex_exit(&zfsdev_state_lock);
+		return (ENXIO);
+	}
+
+	switch (cmd) {
+	case DIOCGWEDGEINFO:
+	{
+		struct dkwedge_info *dkw = (void *) arg;
+
+		strlcpy(dkw->dkw_devname, zv->zv_name, 16);
+		strlcpy(dkw->dkw_wname, zv->zv_name, MAXPATHLEN);
+		strlcpy(dkw->dkw_parent, zv->zv_name, 16);
+
+		dkw->dkw_offset = 0;
+		/*
+		 * XXX NetBSD supports only a DEV_BSIZE device block size,
+		 * so express the wedge size in DEV_BSIZE units rather than
+		 * in zv_volblocksize units.
+		 */
+		dkw->dkw_size = (zv->zv_volsize / DEV_BSIZE);
+		dprintf("dkw %"PRIu64" volsize %"PRIu64" volblock %"PRIu64" \n",
+		    dkw->dkw_size, zv->zv_volsize, zv->zv_volblocksize);
+		strcpy(dkw->dkw_ptype, DKW_PTYPE_FFS);
+
+		break;
+	}
+
+	case DIOCGDISKINFO:
+	{
+		struct plistref *pref = (struct plistref *) arg;
+
+		if (zv->zv_dk.dk_info == NULL) {
+			mutex_exit(&zfsdev_state_lock);
+			return (ENOTSUP);
+		} else {
+			prop_dictionary_copyout_ioctl(pref, cmd,
+			    zv->zv_dk.dk_info);
+		}
+
+		break;
+	}
+
+	default:
+		aprint_debug("unknown disk_ioctl called\n");
+		error = ENOTTY;
+		break;
+	}
+
+	mutex_exit(&zfsdev_state_lock);
+	return (error);
+}
+#endif /* __NetBSD__ */
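For reference, zvol_geom_access() above folds GEOM's per-call access deltas into one count and a FREAD/FWRITE flag word before handing off to the illumos-style entry points; a worked example of the mapping, following directly from the code:

	/*
	 * g_access(pp, 1, 1, 0) arrives as acr=1, acw=1, ace=0:
	 *   count = 1 + 1 + 0 = 2
	 *   flags = FREAD | FWRITE
	 * so zvol_open(pp, FREAD | FWRITE, 2) runs.  The matching
	 * g_access(pp, -1, -1, 0) gives count = -2 and therefore
	 * zvol_close(pp, FREAD | FWRITE, 2).
	 */
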
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/arc.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/arc.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 arc.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/arc.h	27 Feb 2010 22:31:38 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/arc.h	10 Oct 2016 11:09:53 -0000
@@ -19,8 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 #ifndef	_SYS_ARC_H
@@ -36,43 +37,111 @@ extern "C" {
 #include <sys/dmu.h>
 #include <sys/spa.h>
 
+/*
+ * Used by arc_flush() to inform arc_evict_state() that it should evict
+ * all available buffers from the arc state being passed in.
+ */
+#define	ARC_EVICT_ALL	-1ULL
+
+#define	HDR_SET_LSIZE(hdr, x) do { \
+	ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \
+	(hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \
+_NOTE(CONSTCOND) } while (0)
+
+#define	HDR_SET_PSIZE(hdr, x) do { \
+	ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \
+	(hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \
+_NOTE(CONSTCOND) } while (0)
+
+#define	HDR_GET_LSIZE(hdr)	((hdr)->b_lsize << SPA_MINBLOCKSHIFT)
+#define	HDR_GET_PSIZE(hdr)	((hdr)->b_psize << SPA_MINBLOCKSHIFT)
+
 typedef struct arc_buf_hdr arc_buf_hdr_t;
 typedef struct arc_buf arc_buf_t;
-typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
-typedef int arc_evict_func_t(void *private);
+typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
 
 /* generic arc_done_func_t's which you can use */
 arc_done_func_t arc_bcopy_func;
 arc_done_func_t arc_getbuf_func;
 
+extern int zfs_arc_num_sublists_per_state;
+
+typedef enum arc_flags
+{
+	/*
+	 * Public flags that can be passed into the ARC by external consumers.
+	 */
+	ARC_FLAG_WAIT			= 1 << 0,	/* perform sync I/O */
+	ARC_FLAG_NOWAIT			= 1 << 1,	/* perform async I/O */
+	ARC_FLAG_PREFETCH		= 1 << 2,	/* I/O is a prefetch */
+	ARC_FLAG_CACHED			= 1 << 3,	/* I/O was in cache */
+	ARC_FLAG_L2CACHE		= 1 << 4,	/* cache in L2ARC */
+	ARC_FLAG_PREDICTIVE_PREFETCH	= 1 << 5,	/* I/O from zfetch */
+
+	/*
+	 * Private ARC flags.  These flags are private ARC only flags that
+	 * will show up in b_flags in the arc_buf_hdr_t. These flags should
+	 * only be set by ARC code.
+	 */
+	ARC_FLAG_IN_HASH_TABLE		= 1 << 6,	/* buffer is hashed */
+	ARC_FLAG_IO_IN_PROGRESS		= 1 << 7,	/* I/O in progress */
+	ARC_FLAG_IO_ERROR		= 1 << 8,	/* I/O failed for buf */
+	ARC_FLAG_INDIRECT		= 1 << 9,	/* indirect block */
+	/* Indicates that block was read with ASYNC priority. */
+	ARC_FLAG_PRIO_ASYNC_READ	= 1 << 10,
+	ARC_FLAG_L2_WRITING		= 1 << 11,	/* write in progress */
+	ARC_FLAG_L2_EVICTED		= 1 << 12,	/* evicted during I/O */
+	ARC_FLAG_L2_WRITE_HEAD		= 1 << 13,	/* head of write list */
+	/* indicates that the buffer contains metadata (otherwise, data) */
+	ARC_FLAG_BUFC_METADATA		= 1 << 14,
+
+	/* Flags specifying whether optional hdr struct fields are defined */
+	ARC_FLAG_HAS_L1HDR		= 1 << 15,
+	ARC_FLAG_HAS_L2HDR		= 1 << 16,
+
+	/*
+	 * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
+	 * This allows the l2arc to use the blkptr's checksum to verify
+	 * the data without having to store the checksum in the hdr.
+	 */
+	ARC_FLAG_COMPRESSED_ARC		= 1 << 17,
+	ARC_FLAG_SHARED_DATA		= 1 << 18,
+
+	/*
+	 * The arc buffer's compression mode is stored in the top 7 bits of the
+	 * flags field, so these dummy flags are included so that MDB can
+	 * interpret the enum properly.
+	 */
+	ARC_FLAG_COMPRESS_0		= 1 << 24,
+	ARC_FLAG_COMPRESS_1		= 1 << 25,
+	ARC_FLAG_COMPRESS_2		= 1 << 26,
+	ARC_FLAG_COMPRESS_3		= 1 << 27,
+	ARC_FLAG_COMPRESS_4		= 1 << 28,
+	ARC_FLAG_COMPRESS_5		= 1 << 29,
+	ARC_FLAG_COMPRESS_6		= 1 << 30
+
+} arc_flags_t;
+
 struct arc_buf {
 	arc_buf_hdr_t		*b_hdr;
 	arc_buf_t		*b_next;
-	krwlock_t		b_lock;
+	kmutex_t		b_evict_lock;
 	void			*b_data;
-	arc_evict_func_t	*b_efunc;
-	void			*b_private;
 };
 
 typedef enum arc_buf_contents {
+	ARC_BUFC_INVALID,			/* invalid type */
 	ARC_BUFC_DATA,				/* buffer contains data */
 	ARC_BUFC_METADATA,			/* buffer contains metadata */
 	ARC_BUFC_NUMTYPES
 } arc_buf_contents_t;
-/*
- * These are the flags we pass into calls to the arc
- */
-#define	ARC_WAIT	(1 << 1)	/* perform I/O synchronously */
-#define	ARC_NOWAIT	(1 << 2)	/* perform I/O asynchronously */
-#define	ARC_PREFETCH	(1 << 3)	/* I/O is a prefetch */
-#define	ARC_CACHED	(1 << 4)	/* I/O was already in cache */
-#define	ARC_L2CACHE	(1 << 5)	/* cache in L2ARC */
 
 /*
  * The following breakdowns of arc_size exist for kstat only.
  */
 typedef enum arc_space_type {
 	ARC_SPACE_DATA,
+	ARC_SPACE_META,
 	ARC_SPACE_HDRS,
 	ARC_SPACE_L2HDRS,
 	ARC_SPACE_OTHER,
@@ -81,44 +150,37 @@ typedef enum arc_space_type {
 
 void arc_space_consume(uint64_t space, arc_space_type_t type);
 void arc_space_return(uint64_t space, arc_space_type_t type);
-void *arc_data_buf_alloc(uint64_t space);
-void arc_data_buf_free(void *buf, uint64_t space);
-arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
+arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag,
     arc_buf_contents_t type);
 arc_buf_t *arc_loan_buf(spa_t *spa, int size);
 void arc_return_buf(arc_buf_t *buf, void *tag);
 void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
-void arc_buf_add_ref(arc_buf_t *buf, void *tag);
-int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
+void arc_buf_destroy(arc_buf_t *buf, void *tag);
 int arc_buf_size(arc_buf_t *buf);
 void arc_release(arc_buf_t *buf, void *tag);
 int arc_released(arc_buf_t *buf);
-int arc_has_callback(arc_buf_t *buf);
 void arc_buf_freeze(arc_buf_t *buf);
 void arc_buf_thaw(arc_buf_t *buf);
 #ifdef ZFS_DEBUG
 int arc_referenced(arc_buf_t *buf);
 #endif
 
-int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
-    arc_done_func_t *done, void *private, int priority, int zio_flags,
-    uint32_t *arc_flags, const zbookmark_t *zb);
-int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
-    arc_done_func_t *done, void *private, int priority, int flags,
-    uint32_t *arc_flags, const zbookmark_t *zb);
+int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+    arc_done_func_t *done, void *priv, zio_priority_t priority, int flags,
+    arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
 zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
-    arc_done_func_t *ready, arc_done_func_t *done, void *private,
-    int priority, int zio_flags, const zbookmark_t *zb);
-void arc_free(spa_t *spa, const blkptr_t *bp);
+    arc_done_func_t *ready, arc_done_func_t *child_ready,
+    arc_done_func_t *physdone, arc_done_func_t *done,
+    void *priv, zio_priority_t priority, int zio_flags,
+    const zbookmark_phys_t *zb);
+void arc_freed(spa_t *spa, const blkptr_t *bp);
 
-void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
-int arc_buf_evict(arc_buf_t *buf);
-
-void arc_flush(spa_t *spa);
+void arc_flush(spa_t *spa, boolean_t retry);
 void arc_tempreserve_clear(uint64_t reserve);
 int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
 
+uint64_t arc_max_bytes(void);
 void arc_init(void);
 void arc_fini(void);
 
@@ -134,6 +196,13 @@ void l2arc_fini(void);
 void l2arc_start(void);
 void l2arc_stop(void);
 
+#ifdef illumos
+#ifndef _KERNEL
+extern boolean_t arc_watch;
+extern int arc_procfd;
+#endif
+#endif /* illumos */
+
 #ifdef	__cplusplus
 }
 #endif
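The new HDR_SET_*/HDR_GET_* macros store sizes in SPA_MINBLOCKSHIFT units so the header fields stay compact; a worked example, assuming SPA_MINBLOCKSHIFT == 9 (512-byte units) as defined in spa.h:

	/* 128K is 512-byte aligned, so the ASSERT in the macro holds. */
	HDR_SET_LSIZE(hdr, 131072);	/* b_lsize = 131072 >> 9 = 256 */
	size = HDR_GET_LSIZE(hdr);	/* 256 << 9 = 131072 again */
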
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/blkptr.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/blkptr.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/blkptr.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/blkptr.h	17 Jul 2014 16:23:18 -0000
@@ -0,0 +1,38 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_BLKPTR_H
+#define	_SYS_BLKPTR_H
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+void encode_embedded_bp_compressed(blkptr_t *, void *,
+    enum zio_compress, int, int);
+void decode_embedded_bp_compressed(const blkptr_t *, void *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_BLKPTR_H */
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bplist.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bplist.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 bplist.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bplist.h	27 Feb 2010 22:31:38 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bplist.h	12 Jun 2012 05:57:27 -0000
@@ -19,75 +19,36 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_BPLIST_H
 #define	_SYS_BPLIST_H
 
-#include <sys/dmu.h>
-#include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
 #include <sys/zfs_context.h>
+#include <sys/spa.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
-typedef struct bplist_phys {
-	/*
-	 * This is the bonus buffer for the dead lists.  The object's
-	 * contents is an array of bpl_entries blkptr_t's, representing
-	 * a total of bpl_bytes physical space.
-	 */
-	uint64_t	bpl_entries;
-	uint64_t	bpl_bytes;
-	uint64_t	bpl_comp;
-	uint64_t	bpl_uncomp;
-} bplist_phys_t;
-
-#define	BPLIST_SIZE_V0	(2 * sizeof (uint64_t))
-
-typedef struct bplist_q {
-	blkptr_t	bpq_blk;
-	void		*bpq_next;
-} bplist_q_t;
+typedef struct bplist_entry {
+	blkptr_t	bpe_blk;
+	list_node_t	bpe_node;
+} bplist_entry_t;
 
 typedef struct bplist {
 	kmutex_t	bpl_lock;
-	objset_t	*bpl_mos;
-	uint64_t	bpl_object;
-	uint8_t		bpl_blockshift;
-	uint8_t		bpl_bpshift;
-	uint8_t		bpl_havecomp;
-	bplist_q_t	*bpl_queue;
-	bplist_phys_t	*bpl_phys;
-	dmu_buf_t	*bpl_dbuf;
-	dmu_buf_t	*bpl_cached_dbuf;
+	list_t		bpl_list;
 } bplist_t;
 
-typedef void bplist_sync_cb_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 
-extern void bplist_init(bplist_t *bpl);
-extern void bplist_fini(bplist_t *bpl);
-extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
-extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
-extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
-extern void bplist_close(bplist_t *bpl);
-extern boolean_t bplist_empty(bplist_t *bpl);
-extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
-extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp);
-extern void bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func,
+void bplist_create(bplist_t *bpl);
+void bplist_destroy(bplist_t *bpl);
+void bplist_append(bplist_t *bpl, const blkptr_t *bp);
+void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
     void *arg, dmu_tx_t *tx);
-extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
-extern int bplist_space(bplist_t *bpl,
-    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-extern int bplist_space_birthrange(bplist_t *bpl,
-    uint64_t mintxg, uint64_t maxtxg, uint64_t *dsizep);
 
 #ifdef	__cplusplus
 }
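The rewritten bplist is now just a mutex-protected in-core list. A minimal usage sketch under the new interface; the counting callback is hypothetical, and bplist_iterate() consumes the queued entries as it calls back:

	static int
	count_bps(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
	{
		(*(uint64_t *)arg)++;		/* a bplist_itor_t callback */
		return (0);
	}

	bplist_create(&bpl);
	bplist_append(&bpl, bp);		/* queue a block pointer */
	bplist_iterate(&bpl, count_bps, &count, tx);
	bplist_destroy(&bpl);
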
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bpobj.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bpobj.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bpobj.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bpobj.h	30 Aug 2015 02:17:54 -0000
@@ -0,0 +1,93 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef	_SYS_BPOBJ_H
+#define	_SYS_BPOBJ_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct bpobj_phys {
+	/*
+	 * This is the bonus buffer for the dead lists.  The object's
+	 * contents are an array of bpo_num_blkptrs blkptr_t's, representing
+	 * a total of bpo_bytes physical space.
+	 */
+	uint64_t	bpo_num_blkptrs;
+	uint64_t	bpo_bytes;
+	uint64_t	bpo_comp;
+	uint64_t	bpo_uncomp;
+	uint64_t	bpo_subobjs;
+	uint64_t	bpo_num_subobjs;
+} bpobj_phys_t;
+
+#define	BPOBJ_SIZE_V0	(2 * sizeof (uint64_t))
+#define	BPOBJ_SIZE_V1	(4 * sizeof (uint64_t))
+
+typedef struct bpobj {
+	kmutex_t	bpo_lock;
+	objset_t	*bpo_os;
+	uint64_t	bpo_object;
+	int		bpo_epb;
+	uint8_t		bpo_havecomp;
+	uint8_t		bpo_havesubobj;
+	bpobj_phys_t	*bpo_phys;
+	dmu_buf_t	*bpo_dbuf;
+	dmu_buf_t	*bpo_cached_dbuf;
+} bpobj_t;
+
+typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
+uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
+void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx);
+
+int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
+void bpobj_close(bpobj_t *bpo);
+
+int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
+int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
+
+void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
+void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
+
+int bpobj_space(bpobj_t *bpo,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_BPOBJ_H */
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bptree.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bptree.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bptree.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bptree.h	17 Jul 2014 16:23:18 -0000
@@ -0,0 +1,65 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef	_SYS_BPTREE_H
+#define	_SYS_BPTREE_H
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct bptree_phys {
+	uint64_t bt_begin;
+	uint64_t bt_end;
+	uint64_t bt_bytes;
+	uint64_t bt_comp;
+	uint64_t bt_uncomp;
+} bptree_phys_t;
+
+typedef struct bptree_entry_phys {
+	blkptr_t be_bp;
+	uint64_t be_birth_txg; /* only delete blocks born after this txg */
+	zbookmark_phys_t be_zb; /* holds traversal resume point if needed */
+} bptree_entry_phys_t;
+
+typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx);
+int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+boolean_t bptree_is_empty(objset_t *os, uint64_t obj);
+
+void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
+    uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx);
+
+int bptree_iterate(objset_t *os, uint64_t obj, boolean_t free,
+    bptree_itor_t func, void *arg, dmu_tx_t *tx);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_BPTREE_H */
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bqueue.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bqueue.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bqueue.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bqueue.h	3 Jun 2017 00:01:49 -0000
@@ -0,0 +1,54 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef	_BQUEUE_H
+#define	_BQUEUE_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include	<sys/zfs_context.h>
+
+typedef struct bqueue {
+	list_t bq_list;
+	kmutex_t bq_lock;
+	kcondvar_t bq_add_cv;
+	kcondvar_t bq_pop_cv;
+	uint64_t bq_size;
+	uint64_t bq_maxsize;
+	size_t bq_node_offset;
+} bqueue_t;
+
+typedef struct bqueue_node {
+	list_node_t bqn_node;
+	uint64_t bqn_size;
+} bqueue_node_t;
+
+int bqueue_init(bqueue_t *, uint64_t, size_t);
+void bqueue_destroy(bqueue_t *);
+void bqueue_enqueue(bqueue_t *, void *, uint64_t);
+void *bqueue_dequeue(bqueue_t *);
+boolean_t bqueue_empty(bqueue_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _BQUEUE_H */
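bqueue_t is a blocking bounded queue: bqueue_enqueue() sleeps on bq_add_cv once bq_size exceeds bq_maxsize, and bqueue_dequeue() sleeps on bq_pop_cv while the queue is empty. A minimal usage sketch; struct item is hypothetical and assumed to be allocated by the caller:

	struct item {
		bqueue_node_t	i_node;		/* embedded queue linkage */
		uint64_t	i_payload;
	};

	bqueue_t	q;
	struct item	*it;			/* points at an allocated item */

	VERIFY0(bqueue_init(&q, 1024, offsetof(struct item, i_node)));
	bqueue_enqueue(&q, it, sizeof (*it));	/* may block when over 1024 */
	it = bqueue_dequeue(&q);		/* blocks while empty */
	bqueue_destroy(&q);
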
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dbuf.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dbuf.h,v
retrieving revision 1.2
diff -u -p -r1.2 dbuf.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dbuf.h	28 Mar 2014 03:46:56 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dbuf.h	10 Oct 2016 11:09:53 -0000
@@ -19,8 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef	_SYS_DBUF_H
@@ -33,12 +35,13 @@
 #include <sys/arc.h>
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
+#include <sys/zrlock.h>
+#include <sys/multilist.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
-#define	DB_BONUS_BLKID (-1ULL)
 #define	IN_DMU_SYNC 2
 
 /*
@@ -65,8 +68,13 @@ extern "C" {
  *		|			 |
  *		|			 |
  *		+--------> NOFILL -------+
+ *
+ * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
+ * to find all dbufs in a range of a dnode and must be less than any other
+ * dbuf_states_t (see comment on dn_dbufs in dnode.h).
  */
 typedef enum dbuf_states {
+	DB_SEARCH = -1,
 	DB_UNCACHED,
 	DB_FILL,
 	DB_NOFILL,
@@ -84,9 +92,6 @@ struct dmu_tx;
  * etc.
  */
 
-#define	LIST_LINK_INACTIVE(link) \
-	((link)->list_next == NULL && (link)->list_prev == NULL)
-
 struct dmu_buf_impl;
 
 typedef enum override_states {
@@ -114,6 +119,12 @@ typedef struct dbuf_dirty_record {
 	/* pointer to parent dirty record */
 	struct dbuf_dirty_record *dr_parent;
 
+	/* How much space was changed to dsl_pool_dirty_space() for this? */
+	unsigned int dr_accounted;
+
+	/* A copy of the bp that points to us */
+	blkptr_t dr_bp_copy;
+
 	union dirty_types {
 		struct dirty_indirect {
 
@@ -134,6 +145,7 @@ typedef struct dbuf_dirty_record {
 			blkptr_t dr_overridden_by;
 			override_states_t dr_override_state;
 			uint8_t dr_copies;
+			boolean_t dr_nopwrite;
 		} dl;
 	} dt;
 } dbuf_dirty_record_t;
@@ -151,15 +163,17 @@ typedef struct dmu_buf_impl {
 	struct objset *db_objset;
 
 	/*
-	 * the dnode we belong to (NULL when evicted)
+	 * handle to safely access the dnode we belong to (NULL when evicted)
 	 */
-	struct dnode *db_dnode;
+	struct dnode_handle *db_dnode_handle;
 
 	/*
 	 * our parent buffer; if the dnode points to us directly,
-	 * db_parent == db_dnode->dn_dbuf
+	 * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
 	 * only accessed by sync thread ???
 	 * (NULL when evicted)
+	 * May change from NULL to non-NULL under the protection of db_mtx
+	 * (see dbuf_check_blkptr())
 	 */
 	struct dmu_buf_impl *db_parent;
 
@@ -213,18 +227,37 @@ typedef struct dmu_buf_impl {
 	 * Our link on the owner dnodes's dn_dbufs list.
 	 * Protected by its dn_dbufs_mtx.
 	 */
-	list_node_t db_link;
+	avl_node_t db_link;
+
+	/*
+	 * Link in dbuf_cache.
+	 */
+	multilist_node_t db_cache_link;
 
 	/* Data which is unique to data (leaf) blocks: */
 
-	/* stuff we store for the user (see dmu_buf_set_user) */
-	void *db_user_ptr;
-	void **db_user_data_ptr_ptr;
-	dmu_buf_evict_func_t *db_evict_func;
+	/* User callback information. */
+	dmu_buf_user_t *db_user;
+
+	/*
+	 * Evict user data as soon as the dirty and reference
+	 * counts are equal.
+	 */
+	uint8_t db_user_immediate_evict;
 
-	uint8_t db_immediate_evict;
+	/*
+	 * This block was freed while a read or write was
+	 * active.
+	 */
 	uint8_t db_freed_in_flight;
 
+	/*
+	 * dnode_evict_dbufs() or dnode_evict_bonus() tried to
+	 * evict this dbuf, but couldn't due to outstanding
+	 * references.  Evict once the refcount drops to 0.
+	 */
+	uint8_t db_pending_evict;
+
 	uint8_t db_dirtycnt;
 } dmu_buf_impl_t;
 
@@ -237,68 +270,82 @@ typedef struct dbuf_hash_table {
 	kmutex_t hash_mutexes[DBUF_MUTEXES];
 } dbuf_hash_table_t;
 
-
-uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
+uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
 
 dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
 void dbuf_create_bonus(struct dnode *dn);
+int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
+void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
+
+void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
 
 dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
 dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
     void *tag);
-int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
+    boolean_t fail_sparse, boolean_t fail_uncached,
     void *tag, dmu_buf_impl_t **dbp);
 
-void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
+void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
+    zio_priority_t prio, arc_flags_t aflags);
 
 void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
+boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
+    uint64_t blkid, void *tag);
 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
 
 void dbuf_rele(dmu_buf_impl_t *db, void *tag);
 void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
 
-dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
+dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
+    uint64_t blkid);
 
 int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
-void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
+void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+    bp_embedded_type_t etype, enum zio_compress comp,
+    int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
 
-void dbuf_clear(dmu_buf_impl_t *db);
-void dbuf_evict(dmu_buf_impl_t *db);
+void dbuf_destroy(dmu_buf_impl_t *db);
 
 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
-void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
+void dbuf_release_bp(dmu_buf_impl_t *db);
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
     struct dmu_tx *);
 
 void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
 
+#define	DB_DNODE(_db)		((_db)->db_dnode_handle->dnh_dnode)
+#define	DB_DNODE_LOCK(_db)	((_db)->db_dnode_handle->dnh_zrlock)
+#define	DB_DNODE_ENTER(_db)	(zrl_add(&DB_DNODE_LOCK(_db)))
+#define	DB_DNODE_EXIT(_db)	(zrl_remove(&DB_DNODE_LOCK(_db)))
+#define	DB_DNODE_HELD(_db)	(!zrl_is_zero(&DB_DNODE_LOCK(_db)))
+
 void dbuf_init(void);
 void dbuf_fini(void);
 
-#define	DBUF_IS_METADATA(db)	\
-	((db)->db_level > 0 || dmu_ot[(db)->db_dnode->dn_type].ot_metadata)
+boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
 
-#define	DBUF_GET_BUFC_TYPE(db)	\
-	(DBUF_IS_METADATA(db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+#define	DBUF_GET_BUFC_TYPE(_db)	\
+	(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
-#define	DBUF_IS_CACHEABLE(db)						\
-	((db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
-	(DBUF_IS_METADATA(db) &&					\
-	((db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
-
-#define	DBUF_IS_L2CACHEABLE(db)						\
-	((db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||	\
-	(DBUF_IS_METADATA(db) &&					\
-	((db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
+#define	DBUF_IS_CACHEABLE(_db)						\
+	((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
+	(dbuf_is_metadata(_db) &&					\
+	((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
+
+#define	DBUF_IS_L2CACHEABLE(_db)					\
+	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||	\
+	(dbuf_is_metadata(_db) &&					\
+	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
 
 #ifdef ZFS_DEBUG
 
@@ -326,10 +373,10 @@ _NOTE(CONSTCOND) } while (0)
 #define	dprintf_dbuf_bp(db, bp, fmt, ...) do {			\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
-	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp);				\
+	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp);		\
 	dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);	\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
-	} 							\
+	}							\
 _NOTE(CONSTCOND) } while (0)
 
 #define	DBUF_VERIFY(db)	dbuf_verify(db)
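With db_dnode replaced by a zrlock-protected handle, callers now bracket any dnode use with the DB_DNODE_ENTER/DB_DNODE_EXIT macros above, which pin the dnode against eviction for the duration; a minimal sketch:

	dnode_t *dn;

	DB_DNODE_ENTER(db);		/* zrl_add(): pins the dnode */
	dn = DB_DNODE(db);
	/* ... use dn only while the zrlock is held ... */
	DB_DNODE_EXIT(db);		/* zrl_remove() */
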
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/ddt.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/ddt.h,v
retrieving revision 1.2
diff -u -p -r1.2 ddt.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/ddt.h	27 Mar 2014 15:50:48 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/ddt.h	2 Sep 2013 11:38:20 -0000
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_DDT_H
@@ -64,16 +63,15 @@ enum ddt_class {
  */
 typedef struct ddt_key {
 	zio_cksum_t	ddk_cksum;	/* 256-bit block checksum */
-	uint64_t	ddk_prop;	/* LSIZE, PSIZE, compression */
+	/*
+	 * Encoded with logical & physical size, and compression, as follows:
+	 *   +-------+-------+-------+-------+-------+-------+-------+-------+
+	 *   |   0   |   0   |   0   | comp  |     PSIZE     |     LSIZE     |
+	 *   +-------+-------+-------+-------+-------+-------+-------+-------+
+	 */
+	uint64_t	ddk_prop;
 } ddt_key_t;
 
-/*
- * ddk_prop layout:
- *
- *	+-------+-------+-------+-------+-------+-------+-------+-------+
- *	|   0	|   0	|   0	| comp	|     PSIZE	|     LSIZE	|
- *	+-------+-------+-------+-------+-------+-------+-------+-------+
- */
 #define	DDK_GET_LSIZE(ddk)	\
 	BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
 #define	DDK_SET_LSIZE(ddk, x)	\
@@ -101,7 +99,7 @@ enum ddt_phys_type {
 	DDT_PHYS_DOUBLE = 2,
 	DDT_PHYS_TRIPLE = 3,
 	DDT_PHYS_TYPES
-} ddt_phys_type_t;
+};
 
 /*
  * In-core ddt entry
@@ -132,6 +130,8 @@ struct ddt {
 	uint64_t	ddt_stat_object;
 	uint64_t	ddt_object[DDT_TYPES][DDT_CLASSES];
 	ddt_histogram_t	ddt_histogram[DDT_TYPES][DDT_CLASSES];
+	ddt_histogram_t	ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
+	ddt_object_t	ddt_object_stats[DDT_TYPES][DDT_CLASSES];
 	avl_node_t	ddt_node;
 };
 
@@ -154,27 +154,29 @@ typedef struct ddt_ops {
 	    boolean_t prehash);
 	int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
 	int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde);
+	void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
+	    ddt_entry_t *dde);
 	int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde,
 	    dmu_tx_t *tx);
 	int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde,
 	    dmu_tx_t *tx);
 	int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde,
 	    uint64_t *walk);
-	uint64_t (*ddt_op_count)(objset_t *os, uint64_t object);
+	int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
 } ddt_ops_t;
 
 #define	DDT_NAMELEN	80
 
 extern void ddt_object_name(ddt_t *ddt, enum ddt_type type,
-    enum ddt_class class, char *name, size_t namelen);
+    enum ddt_class cls, char *name);
 extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
-    enum ddt_class class, uint64_t *walk, ddt_entry_t *dde);
-extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type,
-    enum ddt_class class);
+    enum ddt_class cls, uint64_t *walk, ddt_entry_t *dde);
+extern int ddt_object_count(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class cls, uint64_t *count);
 extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
-    enum ddt_class class, dmu_object_info_t *);
+    enum ddt_class cls, dmu_object_info_t *);
 extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type,
-    enum ddt_class class);
+    enum ddt_class cls);
 
 extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
     uint64_t txg);
@@ -215,6 +217,7 @@ extern ddt_t *ddt_select(spa_t *spa, con
 extern void ddt_enter(ddt_t *ddt);
 extern void ddt_exit(ddt_t *ddt);
 extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
+extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
 extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
 
 extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class,
@@ -230,6 +233,8 @@ extern int ddt_load(spa_t *spa);
 extern void ddt_unload(spa_t *spa);
 extern void ddt_sync(spa_t *spa, uint64_t txg);
 extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
+extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
+    enum ddt_class cls, ddt_entry_t *dde, dmu_tx_t *tx);
 
 extern const ddt_ops_t ddt_zap_ops;
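The ddk_prop layout above packs both sizes (in SPA_MINBLOCKSHIFT units, biased by one via the BF64_*_SB accessors) and the compression function into a single 64-bit word. A worked example, assuming SPA_MINBLOCKSHIFT == 9 and the DDK_SET_PSIZE/DDK_SET_COMPRESS setters defined alongside DDK_SET_LSIZE in this header:

	DDK_SET_LSIZE(ddk, 131072);	/* bits  0..15 = (131072 >> 9) - 1 = 255 */
	DDK_SET_PSIZE(ddk, 4096);	/* bits 16..31 = (4096 >> 9) - 1 = 7 */
	DDK_SET_COMPRESS(ddk, 1);	/* bits 32..39 = 1 (e.g. lzjb) */
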
 
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu.h,v
retrieving revision 1.2
diff -u -p -r1.2 dmu.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu.h	10 Jan 2017 19:20:35 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu.h	22 Apr 2017 07:57:28 -0000
@@ -18,11 +18,21 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright 2013 DEY Storage Systems, Inc.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #ifndef	_SYS_DMU_H
 #define	_SYS_DMU_H
 
@@ -34,11 +44,11 @@
  * dmu_spa.h.
  */
 
+#include <sys/zfs_context.h>
 #include <sys/inttypes.h>
-#include <sys/types.h>
-#include <sys/param.h>
 #include <sys/cred.h>
-#include <sys/time.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio_priority.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -58,15 +68,73 @@ struct dsl_pool;
 struct dnode;
 struct drr_begin;
 struct drr_end;
-struct zbookmark;
+struct zbookmark_phys;
 struct spa;
 struct nvlist;
 struct arc_buf;
 struct zio_prop;
+struct sa_handle;
+struct file;
 
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
 typedef struct dsl_dir dsl_dir_t;
+typedef struct dnode dnode_t;
+
+typedef enum dmu_object_byteswap {
+	DMU_BSWAP_UINT8,
+	DMU_BSWAP_UINT16,
+	DMU_BSWAP_UINT32,
+	DMU_BSWAP_UINT64,
+	DMU_BSWAP_ZAP,
+	DMU_BSWAP_DNODE,
+	DMU_BSWAP_OBJSET,
+	DMU_BSWAP_ZNODE,
+	DMU_BSWAP_OLDACL,
+	DMU_BSWAP_ACL,
+	/*
+	 * Allocating a new byteswap type number makes the on-disk format
+	 * incompatible with any other format that uses the same number.
+	 *
+	 * Data can usually be structured to work with one of the
+	 * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
+	 */
+	DMU_BSWAP_NUMFUNCS
+} dmu_object_byteswap_t;
+
+#define	DMU_OT_NEWTYPE 0x80
+#define	DMU_OT_METADATA 0x40
+#define	DMU_OT_BYTESWAP_MASK 0x3f
+
+/*
+ * Defines a uint8_t object type. Object types specify if the data
+ * in the object is metadata (boolean) and how to byteswap the data
+ * (dmu_object_byteswap_t).
+ */
+#define	DMU_OT(byteswap, metadata) \
+	(DMU_OT_NEWTYPE | \
+	((metadata) ? DMU_OT_METADATA : 0) | \
+	((byteswap) & DMU_OT_BYTESWAP_MASK))
+
+#define	DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+	((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
+	(ot) < DMU_OT_NUMTYPES)
+
+#define	DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+	((ot) & DMU_OT_METADATA) : \
+	dmu_ot[(ot)].ot_metadata)
+
+/*
+ * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
+ * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
+ * is repurposed for embedded BPs.
+ */
+#define	DMU_OT_HAS_FILL(ot) \
+	((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
+
+#define	DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+	((ot) & DMU_OT_BYTESWAP_MASK) : \
+	dmu_ot[(ot)].ot_byteswap)
 
 typedef enum dmu_object_type {
 	DMU_OT_NONE,
@@ -75,8 +143,8 @@ typedef enum dmu_object_type {
 	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
 	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
 	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
-	DMU_OT_BPLIST,			/* UINT64 */
-	DMU_OT_BPLIST_HDR,		/* UINT64 */
+	DMU_OT_BPOBJ,			/* UINT64 */
+	DMU_OT_BPOBJ_HDR,		/* UINT64 */
 	/* spa: */
 	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
 	DMU_OT_SPACE_MAP,		/* UINT64 */
@@ -116,24 +184,58 @@ typedef enum dmu_object_type {
 	DMU_OT_FUID,			/* FUID table (Packed NVLIST UINT8) */
 	DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
 	DMU_OT_NEXT_CLONES,		/* ZAP */
-	DMU_OT_SCRUB_QUEUE,		/* ZAP */
+	DMU_OT_SCAN_QUEUE,		/* ZAP */
 	DMU_OT_USERGROUP_USED,		/* ZAP */
 	DMU_OT_USERGROUP_QUOTA,		/* ZAP */
 	DMU_OT_USERREFS,		/* ZAP */
 	DMU_OT_DDT_ZAP,			/* ZAP */
 	DMU_OT_DDT_STATS,		/* ZAP */
-	DMU_OT_NUMTYPES
+	DMU_OT_SA,			/* System attr */
+	DMU_OT_SA_MASTER_NODE,		/* ZAP */
+	DMU_OT_SA_ATTR_REGISTRATION,	/* ZAP */
+	DMU_OT_SA_ATTR_LAYOUTS,		/* ZAP */
+	DMU_OT_SCAN_XLATE,		/* ZAP */
+	DMU_OT_DEDUP,			/* fake dedup BP from ddt_bp_create() */
+	DMU_OT_DEADLIST,		/* ZAP */
+	DMU_OT_DEADLIST_HDR,		/* UINT64 */
+	DMU_OT_DSL_CLONES,		/* ZAP */
+	DMU_OT_BPOBJ_SUBOBJ,		/* UINT64 */
+	/*
+	 * Do not allocate new object types here. Doing so makes the on-disk
+	 * format incompatible with any other format that uses the same object
+	 * type number.
+	 *
+	 * When creating an object that does not have one of the above types,
+	 * use the DMU_OTN_* type with the correct byteswap and metadata
+	 * values.
+	 *
+	 * The DMU_OTN_* types do not have entries in the dmu_ot table;
+	 * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
+	 * of indexing into dmu_ot directly (this works for both DMU_OT_* types
+	 * and DMU_OTN_* types).
+	 */
+	DMU_OT_NUMTYPES,
+
+	/*
+	 * Names for valid types declared with DMU_OT().
+	 */
+	DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
+	DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
+	DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
+	DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
+	DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
+	DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
+	DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
+	DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
+	DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
+	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
 } dmu_object_type_t;
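/*
 * Illustrative sketch (hypothetical helper, not a declaration from this
 * header): how the DMU_OT() encoding above decodes.  The assertions follow
 * directly from the macro definitions.
 */
static void
example_decode_type(void)
{
	dmu_object_type_t ot = DMU_OTN_ZAP_METADATA;

	ASSERT(DMU_OT_IS_VALID(ot));		/* byteswap index in range */
	ASSERT(DMU_OT_IS_METADATA(ot));		/* DMU_OT_METADATA bit set */
	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);

	/* Legacy DMU_OT_* values are decoded via the dmu_ot[] table. */
	ASSERT(!DMU_OT_IS_METADATA(DMU_OT_PLAIN_FILE_CONTENTS));
}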
 
-typedef enum dmu_objset_type {
-	DMU_OST_NONE,
-	DMU_OST_META,
-	DMU_OST_ZFS,
-	DMU_OST_ZVOL,
-	DMU_OST_OTHER,			/* For testing only! */
-	DMU_OST_ANY,			/* Be careful! */
-	DMU_OST_NUMTYPES
-} dmu_objset_type_t;
+typedef enum txg_how {
+	TXG_WAIT = 1,
+	TXG_NOWAIT,
+	TXG_WAITED,
+} txg_how_t;
 
 void byteswap_uint64_array(void *buf, size_t size);
 void byteswap_uint32_array(void *buf, size_t size);
@@ -146,19 +248,24 @@ void zfs_znode_byteswap(void *buf, size_
 
 #define	DS_FIND_SNAPSHOTS	(1<<0)
 #define	DS_FIND_CHILDREN	(1<<1)
+#define	DS_FIND_SERIALIZE	(1<<2)
 
 /*
  * The maximum number of bytes that can be accessed as part of one
  * operation, including metadata.
  */
-#define	DMU_MAX_ACCESS (10<<20) /* 10MB */
+#define	DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
 #define	DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
 
 #define	DMU_USERUSED_OBJECT	(-1ULL)
 #define	DMU_GROUPUSED_OBJECT	(-2ULL)
-#define	DMU_DEADLIST_OBJECT	(-3ULL)
 
 /*
+ * artificial blkids for bonus buffer and spill blocks
+ */
+#define	DMU_BONUS_BLKID		(-1ULL)
+#define	DMU_SPILL_BLKID		(-2ULL)
+/*
  * Public routines to create, destroy, open, and close objsets.
  */
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
@@ -168,20 +275,21 @@ void dmu_objset_rele(objset_t *os, void 
 void dmu_objset_disown(objset_t *os, void *tag);
 int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
 
-int dmu_objset_evict_dbufs(objset_t *os);
+void dmu_objset_evict_dbufs(objset_t *os);
 int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
-int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
-    uint64_t flags);
-int dmu_objset_destroy(const char *name, boolean_t defer);
-int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
-int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props,
-    boolean_t recursive);
-int dmu_objset_rename(const char *name, const char *newname,
-    boolean_t recursive);
+int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
+    struct nvlist *snaps);
+int dmu_objset_clone(const char *name, const char *origin);
+int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
+    struct nvlist *errlist);
+int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
+int dmu_objset_snapshot_tmp(const char *, const char *, int);
 int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
     int flags);
 void dmu_objset_byteswap(void *buf, size_t size);
+int dsl_dataset_rename_snapshot(const char *fsname,
+    const char *oldsnapname, const char *newsnapname, boolean_t recursive);
 
 typedef struct dmu_buf {
 	uint64_t db_object;		/* object that this buffer is part of */
@@ -190,15 +298,17 @@ typedef struct dmu_buf {
 	void *db_data;			/* data in buffer */
 } dmu_buf_t;
 
-typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
-
 /*
  * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
  */
 #define	DMU_POOL_DIRECTORY_OBJECT	1
 #define	DMU_POOL_CONFIG			"config"
+#define	DMU_POOL_FEATURES_FOR_WRITE	"features_for_write"
+#define	DMU_POOL_FEATURES_FOR_READ	"features_for_read"
+#define	DMU_POOL_FEATURE_DESCRIPTIONS	"feature_descriptions"
+#define	DMU_POOL_FEATURE_ENABLED_TXG	"feature_enabled_txg"
 #define	DMU_POOL_ROOT_DATASET		"root_dataset"
-#define	DMU_POOL_SYNC_BPLIST		"sync_bplist"
+#define	DMU_POOL_SYNC_BPOBJ		"sync_bplist"
 #define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
 #define	DMU_POOL_ERRLOG_LAST		"errlog_last"
 #define	DMU_POOL_SPARES			"spares"
@@ -209,23 +319,13 @@ typedef void dmu_buf_evict_func_t(struct
 #define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
 #define	DMU_POOL_DDT			"DDT-%s-%s-%s"
 #define	DMU_POOL_DDT_STATS		"DDT-statistics"
-
-/* 4x8 zbookmark_t */
-#define	DMU_POOL_SCRUB_BOOKMARK		"scrub_bookmark"
-/* 4x8 ddt_bookmark_t */
-#define	DMU_POOL_SCRUB_DDT_BOOKMARK	"scrub_ddt_bookmark"
-/* 1x8 max_class */
-#define	DMU_POOL_SCRUB_DDT_CLASS_MAX	"scrub_ddt_class_max"
-/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */
-#define	DMU_POOL_SCRUB_QUEUE		"scrub_queue"
-/* 1x8 txg */
-#define	DMU_POOL_SCRUB_MIN_TXG		"scrub_min_txg"
-/* 1x8 txg */
-#define	DMU_POOL_SCRUB_MAX_TXG		"scrub_max_txg"
-/* 1x4 enum scrub_func */
-#define	DMU_POOL_SCRUB_FUNC		"scrub_func"
-/* 1x8 count */
-#define	DMU_POOL_SCRUB_ERRORS		"scrub_errors"
+#define	DMU_POOL_CREATION_VERSION	"creation_version"
+#define	DMU_POOL_SCAN			"scan"
+#define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
+#define	DMU_POOL_BPTREE_OBJ		"bptree_obj"
+#define	DMU_POOL_EMPTY_BPOBJ		"empty_bpobj"
+#define	DMU_POOL_CHECKSUM_SALT		"org.illumos:checksum_salt"
+#define	DMU_POOL_VDEV_ZAP_MAP		"com.delphix:vdev_zap_map"
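/*
 * Illustrative sketch (hypothetical helper): the macros above name entries
 * in the MOS directory zap, read with the standard zap_lookup() accessor
 * from sys/zap.h.  Assumes a held meta-objset "mos".
 */
#include <sys/zap.h>

static int
example_lookup_config(objset_t *mos, uint64_t *config_objp)
{
	/* Fetch the object number of the pool config from the directory. */
	return (zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CONFIG, sizeof (uint64_t), 1, config_objp));
}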
 
 /*
  * Allocate an object from this objset.  The range of object numbers
@@ -247,7 +347,7 @@ uint64_t dmu_object_alloc(objset_t *os, 
 int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
-    int blocksize, dmu_object_type_t bonustype, int bonuslen);
+    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
 
 /*
  * Free an object from this objset.
@@ -309,13 +409,19 @@ void dmu_object_set_checksum(objset_t *o
 void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx);
 
+void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+    int compressed_size, int byteorder, dmu_tx_t *tx);
+
 /*
  * Decide how to write a block: checksum, compression, number of copies, etc.
  */
 #define	WP_NOFILL	0x1
 #define	WP_DMU_SYNC	0x2
+#define	WP_SPILL	0x4
 
-void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
+void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
     struct zio_prop *zp);
 /*
  * The bonus data is accessed more or less like a regular buffer.
@@ -326,10 +432,24 @@ void dmu_write_policy(objset_t *os, stru
  * object must be held in an assigned transaction before calling
  * dmu_buf_will_dirty.  You may use dmu_buf_set_user() on the bonus
  * buffer as well.  You must release your hold with dmu_buf_rele().
+ *
+ * Returns ENOENT, EIO, or 0.
  */
 int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
 int dmu_bonus_max(void);
 int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
+int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
+dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
+int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
+
+/*
+ * Special spill buffer support used by "SA" framework
+ */
+
+int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
+    void *tag, dmu_buf_t **dbp);
+int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
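/*
 * Illustrative sketch (hypothetical helper): the intended calling pattern
 * for the spill routines above, assuming the object's SA data has spilled.
 */
static int
example_hold_spill(objset_t *os, uint64_t object, void *tag)
{
	dmu_buf_t *bonus, *spill;
	int err;

	err = dmu_bonus_hold(os, object, tag, &bonus);
	if (err != 0)
		return (err);
	err = dmu_spill_hold_existing(bonus, tag, &spill);
	if (err == 0)
		dmu_buf_rele(spill, tag);	/* release after use */
	dmu_buf_rele(bonus, tag);
	return (err);
}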
 
 /*
  * Obtain the DMU buffer from the specified object which contains the
@@ -346,8 +466,26 @@ int dmu_set_bonus(dmu_buf_t *, int, dmu_
  * The object number must be a valid, allocated object number.
  */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
-    void *tag, dmu_buf_t **);
+    void *tag, dmu_buf_t **, int flags);
+int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
+    void *tag, dmu_buf_t **dbp, int flags);
+
+/*
+ * Add a reference to a dmu buffer that has already been held via
+ * dmu_buf_hold() in the current context.
+ */
 void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
+
+/*
+ * Attempt to add a reference to a dmu buffer that is in an unknown state,
+ * using a pointer that may have been invalidated by eviction processing.
+ * The request will succeed if the passed in dbuf still represents the
+ * same os/object/blkid, is ineligible for eviction, and has at least
+ * one hold by a user other than the syncer.
+ */
+boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
+    uint64_t blkid, void *tag);
+
 void dmu_buf_rele(dmu_buf_t *db, void *tag);
 uint64_t dmu_buf_refcount(dmu_buf_t *db);
 
@@ -362,46 +500,149 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db)
  * individually with dmu_buf_rele.
  */
 int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
-    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
+    uint64_t length, boolean_t read, void *tag,
+    int *numbufsp, dmu_buf_t ***dbpp);
 void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
 
+typedef void dmu_buf_evict_func_t(void *user_ptr);
+
 /*
- * Returns NULL on success, or the existing user ptr if it's already
- * been set.
+ * A DMU buffer user object may be associated with a dbuf for the
+ * duration of its lifetime.  This allows the user of a dbuf (client)
+ * to attach private data to a dbuf (e.g. in-core only data such as a
+ * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
+ * when that dbuf has been evicted.  Clients typically respond to the
+ * eviction notification by freeing their private data, thus ensuring
+ * the same lifetime for both dbuf and private data.
+ *
+ * The mapping from a dmu_buf_user_t to any client private data is the
+ * client's responsibility.  All current consumers of the API with private
+ * data embed a dmu_buf_user_t as the first member of the structure for
+ * their private data.  This allows conversions between the two types
+ * with a simple cast.  Since the DMU buf user API never needs access
+ * to the private data, other strategies can be employed if necessary
+ * or convenient for the client (e.g. using container_of() to do the
+ * conversion for private data that cannot have the dmu_buf_user_t as
+ * its first member).
+ *
+ * Eviction callbacks are executed without the dbuf mutex held or any
+ * other type of mechanism to guarantee that the dbuf is still available.
+ * For this reason, users must assume the dbuf has already been freed
+ * and not reference the dbuf from the callback context.
+ *
+ * Users requesting "immediate eviction" are notified as soon as the dbuf
+ * is only referenced by dirty records (dirties == holds).  Otherwise the
+ * notification occurs after eviction processing for the dbuf begins.
+ */
+typedef struct dmu_buf_user {
+	/*
+	 * Asynchronous user eviction callback state.
+	 */
+	taskq_ent_t	dbu_tqent;
+
+	/*
+	 * This instance's eviction function pointers.
+	 *
+	 * dbu_evict_func_sync is called synchronously and then
+	 * dbu_evict_func_async is executed asynchronously on a taskq.
+	 */
+	dmu_buf_evict_func_t *dbu_evict_func_sync;
+	dmu_buf_evict_func_t *dbu_evict_func_async;
+#ifdef ZFS_DEBUG
+	/*
+	 * Pointer to user's dbuf pointer.  NULL for clients that do
+	 * not associate a dbuf with their user data.
+	 *
+	 * The dbuf pointer is cleared upon eviction so as to catch
+	 * use-after-evict bugs in clients.
+	 */
+	dmu_buf_t **dbu_clear_on_evict_dbufp;
+#endif
+} dmu_buf_user_t;
+
+/*
+ * Initialize the given dmu_buf_user_t instance with the eviction functions
+ * evict_func_sync and evict_func_async, to be called when the user is
+ * evicted.
  *
- * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
+ * NOTE: This function should only be called once on a given dmu_buf_user_t.
+ *       To allow enforcement of this, dbu must already be zeroed on entry.
+ */
+#ifdef __lint
+/* Very ugly, but it beats issuing suppression directives in many Makefiles. */
+extern void
+dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
+    dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp);
+#else /* __lint */
+inline void
+dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync,
+    dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp)
+{
+	ASSERT(dbu->dbu_evict_func_sync == NULL);
+	ASSERT(dbu->dbu_evict_func_async == NULL);
+
+	/* must have at least one evict func */
+	IMPLY(evict_func_sync == NULL, evict_func_async != NULL);
+	dbu->dbu_evict_func_sync = evict_func_sync;
+	dbu->dbu_evict_func_async = evict_func_async;
+#ifdef ZFS_DEBUG
+	dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
+#endif
+}
+#endif /* __lint */
+
+/*
+ * Attach user data to a dbuf and mark it for normal eviction processing,
+ * which occurs when the dbuf's data is cleared or its reference count
+ * goes to zero.
  *
- * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
- * will be set to db->db_data when you are allowed to access it.  Note
- * that db->db_data (the pointer) can change when you do dmu_buf_read(),
- * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
- * *user_data_ptr_ptr will be set to the new value when it changes.
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
+ */
+void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Attach user data to a dbuf and mark it for immediate eviction processing,
+ * which occurs as soon as its dirty and reference counts are equal.
  *
- * If non-NULL, pageout func will be called when this buffer is being
- * excised from the cache, so that you can clean up the data structure
- * pointed to by user_ptr.
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
+ */
+void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Replace the current user of a dbuf.
  *
- * dmu_evict_user() will call the pageout func for all buffers in a
- * objset with a given pageout func.
+ * If given the current user of a dbuf, replaces the dbuf's user with
+ * "new_user" and returns the user data pointer that was replaced.
+ * Otherwise returns the current, and unmodified, dbuf user pointer.
  */
-void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
-    dmu_buf_evict_func_t *pageout_func);
+void *dmu_buf_replace_user(dmu_buf_t *db,
+    dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
+
 /*
- * set_user_ie is the same as set_user, but request immediate eviction
- * when hold count goes to zero.
+ * Remove the specified user data for a DMU buffer.
+ *
+ * Returns the user that was removed on success, or the current user if
+ * another user currently owns the buffer.
  */
-void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
-    void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
-void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
-    void *user_ptr, void *user_data_ptr_ptr,
-    dmu_buf_evict_func_t *pageout_func);
-void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
+void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
- * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
+ * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
  */
 void *dmu_buf_get_user(dmu_buf_t *db);
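/*
 * Illustrative sketch (hypothetical client type my_node_t and callbacks):
 * the embedding pattern described in the comment block above.  The
 * dmu_buf_user_t is the first member, so a cast converts between the types.
 */
typedef struct my_node {
	dmu_buf_user_t	mn_dbu;		/* must be the first member */
	uint64_t	mn_object;
} my_node_t;

static void
my_node_evict_sync(void *arg)
{
	/* The dbuf may already be freed; touch only the private data. */
	kmem_free(arg, sizeof (my_node_t));
}

static my_node_t *
example_attach(dmu_buf_t *db, uint64_t object)
{
	my_node_t *mn = kmem_zalloc(sizeof (my_node_t), KM_SLEEP);

	mn->mn_object = object;
	dmu_buf_init_user(&mn->mn_dbu, my_node_evict_sync, NULL, NULL);
	if (dmu_buf_set_user(db, &mn->mn_dbu) != NULL) {
		/* Lost the race: another user already owns this dbuf. */
		kmem_free(mn, sizeof (my_node_t));
		mn = dmu_buf_get_user(db);
	}
	return (mn);
}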
 
+objset_t *dmu_buf_get_objset(dmu_buf_t *db);
+dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
+void dmu_buf_dnode_exit(dmu_buf_t *db);
+
+/* Block until any in-progress dmu buf user evictions complete. */
+void dmu_buf_user_evict_wait(void);
+
+/*
+ * Returns the blkptr associated with this dbuf, or NULL if not set.
+ */
+struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
+
 /*
  * Indicate that you are going to modify the buffer's data (db_data).
  *
@@ -443,10 +684,14 @@ void dmu_tx_hold_free(dmu_tx_t *tx, uint
     uint64_t len);
 void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
 void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
+void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
 void dmu_tx_abort(dmu_tx_t *tx);
-int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
+void dmu_tx_mark_netfree(dmu_tx_t *tx);
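/*
 * Illustrative sketch (hypothetical wrapper): the canonical
 * create/hold/assign/commit sequence formed by the routines above.
 */
static int
example_tx_write(objset_t *os, uint64_t object, uint64_t off, int len,
    void *data)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	dmu_tx_hold_write(tx, object, off, len);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);	/* an unassigned tx must be aborted */
		return (err);
	}
	dmu_write(os, object, off, len, data, tx);
	dmu_tx_commit(tx);
	return (0);
}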
 
 /*
  * To register a commit callback, dmu_tx_callback_register() must be called.
@@ -470,13 +715,13 @@ void dmu_tx_callback_register(dmu_tx_t *
 
 /*
  * Free up the data blocks for a defined range of a file.  If size is
- * zero, the range from offset to end-of-file is freed.
+ * -1, the range from offset to end-of-file is freed.
  */
 int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
 	uint64_t size, dmu_tx_t *tx);
 int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
 	uint64_t size);
-int dmu_free_object(objset_t *os, uint64_t object);
+int dmu_free_long_object(objset_t *os, uint64_t object);
 
 /*
  * Convenience functions.
@@ -493,10 +738,20 @@ void dmu_write(objset_t *os, uint64_t ob
 void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	dmu_tx_t *tx);
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
+int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
 int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
+int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
+    dmu_tx_t *tx);
+#ifdef _KERNEL
+#ifdef illumos
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct page *pp, dmu_tx_t *tx);
+#else
+int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
+#endif
+#endif
 struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
 void dmu_return_arcbuf(struct arc_buf *buf);
 void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
@@ -508,16 +763,17 @@ int dmu_xuio_add(struct xuio *uio, struc
 int dmu_xuio_cnt(struct xuio *uio);
 struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
 void dmu_xuio_clear(struct xuio *uio, int i);
-void xuio_stat_wbuf_copied(void);
-void xuio_stat_wbuf_nocopy(void);
+void xuio_stat_wbuf_copied();
+void xuio_stat_wbuf_nocopy();
 
-extern int zfs_prefetch_disable;
+extern boolean_t zfs_prefetch_disable;
+extern int zfs_max_recordsize;
 
 /*
  * Asynchronously try to read in the data.
  */
-void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
-    uint64_t len);
+void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+    uint64_t len, enum zio_priority pri);
 
 typedef struct dmu_object_info {
 	/* All sizes are in bytes unless otherwise indicated. */
@@ -529,7 +785,8 @@ typedef struct dmu_object_info {
 	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
 	uint8_t doi_checksum;
 	uint8_t doi_compress;
-	uint8_t doi_pad[5];
+	uint8_t doi_nblkptr;
+	uint8_t doi_pad[4];
 	uint64_t doi_physical_blocks_512;	/* data + metadata, 512b blks */
 	uint64_t doi_max_offset;
 	uint64_t doi_fill_count;		/* number of non-empty blocks */
@@ -538,12 +795,18 @@ typedef struct dmu_object_info {
 typedef void arc_byteswap_func_t(void *buf, size_t size);
 
 typedef struct dmu_object_type_info {
-	arc_byteswap_func_t	*ot_byteswap;
+	dmu_object_byteswap_t	ot_byteswap;
 	boolean_t		ot_metadata;
 	char			*ot_name;
 } dmu_object_type_info_t;
 
+typedef struct dmu_object_byteswap_info {
+	arc_byteswap_func_t	*ob_func;
+	char			*ob_name;
+} dmu_object_byteswap_info_t;
+
 extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
+extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
 
 /*
  * Get information on a DMU object.
@@ -553,8 +816,14 @@ extern const dmu_object_type_info_t dmu_
  * If doi is NULL, just indicates whether the object exists.
  */
 int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
-void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dnode in hand. */
+void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
 void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+/*
+ * Like dmu_object_info_from_db, but faster still when you only care about
+ * the size.  This is specifically optimized for zfs_getattr().
+ */
 void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
     u_longlong_t *nblk512);
 
@@ -565,7 +834,7 @@ typedef struct dmu_objset_stats {
 	dmu_objset_type_t dds_type;
 	uint8_t dds_is_snapshot;
 	uint8_t dds_inconsistent;
-	char dds_origin[MAXNAMELEN];
+	char dds_origin[ZFS_MAX_DATASET_NAME_LEN];
 } dmu_objset_stats_t;
 
 /*
@@ -615,7 +884,8 @@ extern struct dsl_dataset *dmu_objset_ds
 extern void dmu_objset_name(objset_t *os, char *buf);
 extern dmu_objset_type_t dmu_objset_type(objset_t *os);
 extern uint64_t dmu_objset_id(objset_t *os);
-extern uint64_t dmu_objset_logbias(objset_t *os);
+extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
+extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
 extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
@@ -668,6 +938,15 @@ int dmu_offset_next(objset_t *os, uint64
     uint64_t *off);
 
 /*
+ * Check if a DMU object has any dirty blocks. If so, sync out
+ * all pending transaction groups. Otherwise, this function
+ * does not alter DMU state. This could be improved to only sync
+ * out the necessary transaction groups for this particular
+ * object.
+ */
+int dmu_object_wait_synced(objset_t *os, uint64_t object);
+
+/*
  * Initial setup and final teardown.
  */
 extern void dmu_init(void);
@@ -678,35 +957,20 @@ typedef void (*dmu_traverse_cb_t)(objset
 void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
     dmu_traverse_cb_t cb, void *arg);
 
-int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
-    struct vnode *vp, offset_t *off);
-
-typedef struct dmu_recv_cookie {
-	/*
-	 * This structure is opaque!
-	 *
-	 * If logical and real are different, we are recving the stream
-	 * into the "real" temporary clone, and then switching it with
-	 * the "logical" target.
-	 */
-	struct dsl_dataset *drc_logical_ds;
-	struct dsl_dataset *drc_real_ds;
-	struct drr_begin *drc_drrb;
-	char *drc_tosnap;
-	char *drc_top_ds;
-	boolean_t drc_newfs;
-	boolean_t drc_force;
-} dmu_recv_cookie_t;
-
-int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *,
-    boolean_t force, objset_t *origin, dmu_recv_cookie_t *);
-int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp);
-int dmu_recv_end(dmu_recv_cookie_t *drc);
+#ifdef __FreeBSD__
+int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
+    struct file *fp, offset_t *offp);
+#else
+int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
+    struct vnode *vp, offset_t *offp);
+#endif
 
 /* CRC64 table */
 #define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
 extern uint64_t zfs_crc64_table[256];
 
+extern int zfs_mdcomp_disable;
+
 #ifdef	__cplusplus
 }
 #endif
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_impl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dmu_impl.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_impl.h	27 Feb 2010 22:31:38 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_impl.h	10 Oct 2016 11:09:53 -0000
@@ -22,6 +22,11 @@
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
 
 #ifndef _SYS_DMU_IMPL_H
 #define	_SYS_DMU_IMPL_H
@@ -29,7 +34,9 @@
 #include <sys/txg_impl.h>
 #include <sys/zio.h>
 #include <sys/dnode.h>
+#include <sys/kstat.h>
 #include <sys/zfs_context.h>
+#include <sys/zfs_ioctl.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -264,6 +271,44 @@ static xuio_stats_t xuio_stats = {
 	atomic_add_64(&xuio_stats.stat.value.ui64, (val))
 #define	XUIOSTAT_BUMP(stat)	XUIOSTAT_INCR(stat, 1)
 
+/*
+ * The list of data whose inclusion in a send stream can be pending from
+ * one call to backup_cb to another.  Multiple calls to dump_free() and
+ * dump_freeobjects() can be aggregated into a single DRR_FREE or
+ * DRR_FREEOBJECTS replay record.
+ */
+typedef enum {
+	PENDING_NONE,
+	PENDING_FREE,
+	PENDING_FREEOBJECTS
+} dmu_pendop_t;
+
+typedef struct dmu_sendarg {
+	list_node_t dsa_link;
+	dmu_replay_record_t *dsa_drr;
+	kthread_t *dsa_td;
+	struct file *dsa_fp;
+	int dsa_outfd;
+	struct proc *dsa_proc;
+	offset_t *dsa_off;
+	objset_t *dsa_os;
+	zio_cksum_t dsa_zc;
+	uint64_t dsa_toguid;
+	int dsa_err;
+	dmu_pendop_t dsa_pending_op;
+	uint64_t dsa_featureflags;
+	uint64_t dsa_last_data_object;
+	uint64_t dsa_last_data_offset;
+	uint64_t dsa_resume_object;
+	uint64_t dsa_resume_offset;
+	boolean_t dsa_sent_begin;
+	boolean_t dsa_sent_end;
+} dmu_sendarg_t;
+
+void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
+void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
+int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t,
+    void *, dmu_buf_t **);
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_objset.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_objset.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dmu_objset.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_objset.h	27 Feb 2010 22:31:38 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_objset.h	10 Oct 2016 11:09:53 -0000
@@ -19,10 +19,15 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #ifndef	_SYS_DMU_OBJSET_H
 #define	_SYS_DMU_OBJSET_H
 
@@ -33,17 +38,24 @@
 #include <sys/dnode.h>
 #include <sys/zio.h>
 #include <sys/zil.h>
+#include <sys/sa.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
+extern krwlock_t os_lock;
+
+struct dsl_pool;
 struct dsl_dataset;
 struct dmu_tx;
 
 #define	OBJSET_PHYS_SIZE 2048
 #define	OBJSET_OLD_PHYS_SIZE 1024
 
+#define	OBJSET_BUF_HAS_USERUSED(buf) \
+	(arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE)
+
 #define	OBJSET_FLAG_USERACCOUNTING_COMPLETE	(1ULL<<0)
 
 typedef struct objset_phys {
@@ -63,24 +75,40 @@ struct objset {
 	spa_t *os_spa;
 	arc_buf_t *os_phys_buf;
 	objset_phys_t *os_phys;
-	dnode_t *os_meta_dnode;
-	dnode_t *os_userused_dnode;
-	dnode_t *os_groupused_dnode;
+	/*
+	 * The following "special" dnodes have no parent, are exempt
+	 * from dnode_move(), and are not recorded in os_dnodes, but they
+	 * root their descendants in this objset using handles anyway, so
+	 * that all access to dnodes from dbufs consistently uses handles.
+	 */
+	dnode_handle_t os_meta_dnode;
+	dnode_handle_t os_userused_dnode;
+	dnode_handle_t os_groupused_dnode;
 	zilog_t *os_zil;
 
+	list_node_t os_evicting_node;
+
 	/* can change, under dsl_dir's locks: */
-	uint8_t os_checksum;
-	uint8_t os_compress;
+	enum zio_checksum os_checksum;
+	enum zio_compress os_compress;
 	uint8_t os_copies;
-	uint8_t os_dedup_checksum;
-	uint8_t os_dedup_verify;
-	uint8_t os_logbias;
-	uint8_t os_primary_cache;
-	uint8_t os_secondary_cache;
+	enum zio_checksum os_dedup_checksum;
+	boolean_t os_dedup_verify;
+	zfs_logbias_op_t os_logbias;
+	zfs_cache_type_t os_primary_cache;
+	zfs_cache_type_t os_secondary_cache;
+	zfs_sync_type_t os_sync;
+	zfs_redundant_metadata_type_t os_redundant_metadata;
+	int os_recordsize;
+
+	/*
+	 * Pointer is constant; the blkptr it points to is protected by
+	 * os_dsl_dataset->ds_bp_rwlock
+	 */
+	blkptr_t *os_rootbp;
 
 	/* no lock needed: */
 	struct dmu_tx *os_synctx; /* XXX sketchy */
-	blkptr_t *os_rootbp;
 	zil_header_t os_zil_header;
 	list_t os_synced_dnodes;
 	uint64_t os_flags;
@@ -99,43 +127,43 @@ struct objset {
 	/* stuff we store for the user */
 	kmutex_t os_user_ptr_lock;
 	void *os_user_ptr;
+	sa_os_t *os_sa;
 };
 
 #define	DMU_META_OBJSET		0
 #define	DMU_META_DNODE_OBJECT	0
 #define	DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
+#define	DMU_META_DNODE(os)	((os)->os_meta_dnode.dnh_dnode)
+#define	DMU_USERUSED_DNODE(os)	((os)->os_userused_dnode.dnh_dnode)
+#define	DMU_GROUPUSED_DNODE(os)	((os)->os_groupused_dnode.dnh_dnode)
 
 #define	DMU_OS_IS_L2CACHEABLE(os)				\
 	((os)->os_secondary_cache == ZFS_CACHE_ALL ||		\
 	(os)->os_secondary_cache == ZFS_CACHE_METADATA)
 
+#define	DMU_OS_IS_L2COMPRESSIBLE(os)	(zfs_mdcomp_disable == B_FALSE)
+
 /* called from zpl */
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp);
+int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
+    dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_refresh_ownership(objset_t *os, void *tag);
 void dmu_objset_rele(objset_t *os, void *tag);
 void dmu_objset_disown(objset_t *os, void *tag);
 int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);
 
-int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
-    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
-int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin,
-    uint64_t flags);
-int dmu_objset_destroy(const char *name, boolean_t defer);
-int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props,
-    boolean_t recursive);
 void dmu_objset_stats(objset_t *os, nvlist_t *nv);
 void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 uint64_t dmu_objset_fsid_guid(objset_t *os);
-int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
-    int flags);
-int dmu_objset_find_spa(spa_t *spa, const char *name,
-    int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags);
+int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj,
+    int func(struct dsl_pool *, struct dsl_dataset *, void *),
+    void *arg, int flags);
 int dmu_objset_prefetch(const char *name, void *arg);
-void dmu_objset_byteswap(void *buf, size_t size);
-int dmu_objset_evict_dbufs(objset_t *os);
+void dmu_objset_evict_dbufs(objset_t *os);
 timestruc_t dmu_objset_snap_cmtime(objset_t *os);
 
 /* called from dsl */
@@ -146,10 +174,17 @@ objset_t *dmu_objset_create_impl(spa_t *
 int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
     objset_t **osp);
 void dmu_objset_evict(objset_t *os);
-void dmu_objset_do_userquota_callbacks(objset_t *os, dmu_tx_t *tx);
+void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
+void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
 boolean_t dmu_objset_userused_enabled(objset_t *os);
 int dmu_objset_userspace_upgrade(objset_t *os);
 boolean_t dmu_objset_userspace_present(objset_t *os);
+int dmu_fsname(const char *snapname, char *buf);
+
+void dmu_objset_evict_done(objset_t *os);
+
+void dmu_objset_init(void);
+void dmu_objset_fini(void);
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_send.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_send.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_send.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_send.h	27 Mar 2016 02:52:19 -0000
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+#ifndef _DMU_SEND_H
+#define	_DMU_SEND_H
+
+#include <sys/spa.h>
+
+struct vnode;
+struct dsl_dataset;
+struct drr_begin;
+struct avl_tree;
+struct dmu_replay_record;
+
+extern const char *recv_clone_name;
+
+int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+    boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
+#ifdef illumos
+    struct vnode *vp, offset_t *off);
+#else
+    struct file *fp, offset_t *off);
+#endif
+int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
+    uint64_t *sizep);
+int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg,
+    uint64_t *sizep);
+int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+    boolean_t embedok, boolean_t large_block_ok,
+#ifdef illumos
+    int outfd, struct vnode *vp, offset_t *off);
+#else
+    int outfd, struct file *fp, offset_t *off);
+#endif
+
+typedef struct dmu_recv_cookie {
+	struct dsl_dataset *drc_ds;
+	struct dmu_replay_record *drc_drr_begin;
+	struct drr_begin *drc_drrb;
+	const char *drc_tofs;
+	const char *drc_tosnap;
+	boolean_t drc_newfs;
+	boolean_t drc_byteswap;
+	boolean_t drc_force;
+	boolean_t drc_resumable;
+	struct avl_tree *drc_guid_to_ds_map;
+	zio_cksum_t drc_cksum;
+	uint64_t drc_newsnapobj;
+	void *drc_owner;
+	cred_t *drc_cred;
+} dmu_recv_cookie_t;
+
+int dmu_recv_begin(char *tofs, char *tosnap,
+    struct dmu_replay_record *drr_begin,
+    boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc);
+#ifdef illumos
+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
+#else
+int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
+#endif
+    int cleanup_fd, uint64_t *action_handlep);
+int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);
+boolean_t dmu_objset_is_receiving(objset_t *os);
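/*
 * Illustrative sketch (hypothetical wrapper): the begin/stream/end sequence
 * defined above, using the non-illumos file-based variant.  Error handling
 * is abbreviated.
 */
static int
example_receive(char *tofs, char *tosnap, struct dmu_replay_record *drrb,
    struct file *fp, offset_t *offp, int cleanup_fd)
{
	dmu_recv_cookie_t drc;
	uint64_t action_handle = 0;
	int err;

	err = dmu_recv_begin(tofs, tosnap, drrb, B_FALSE /* force */,
	    B_FALSE /* resumable */, NULL /* origin */, &drc);
	if (err != 0)
		return (err);
	err = dmu_recv_stream(&drc, fp, offp, cleanup_fd, &action_handle);
	if (err == 0)
		err = dmu_recv_end(&drc, NULL /* owner */);
	return (err);
}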
+
+#endif /* _DMU_SEND_H */
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_traverse.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_traverse.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dmu_traverse.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_traverse.h	27 Feb 2010 22:31:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_traverse.h	22 Nov 2015 17:22:31 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DMU_TRAVERSE_H
@@ -37,9 +37,10 @@ extern "C" {
 struct dnode_phys;
 struct dsl_dataset;
 struct zilog;
+struct arc_buf;
 
 typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg);
+    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg);
 
 #define	TRAVERSE_PRE			(1<<0)
 #define	TRAVERSE_POST			(1<<1)
@@ -48,8 +49,16 @@ typedef int (blkptr_cb_t)(spa_t *spa, zi
 #define	TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
 #define	TRAVERSE_HARD			(1<<4)
 
+/* Special traverse error return value to indicate skipping of children */
+#define	TRAVERSE_VISIT_NO_CHILDREN	-1
+
 int traverse_dataset(struct dsl_dataset *ds,
     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start,
+    zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg);
+int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
+    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
+    blkptr_cb_t func, void *arg);
 int traverse_pool(spa_t *spa,
     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
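/*
 * Illustrative sketch (hypothetical callback): count L1 indirect blocks and
 * prune below them with TRAVERSE_VISIT_NO_CHILDREN so the L0 data is never
 * visited.  The skip value is honored for pre-order (TRAVERSE_PRE) visits.
 */
static int
example_count_l1_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
{
	uint64_t *countp = arg;

	if (bp == NULL || zb->zb_level != 1)
		return (0);
	(*countp)++;
	return (TRAVERSE_VISIT_NO_CHILDREN);
}
/* Usage: traverse_dataset(ds, 0, TRAVERSE_PRE, example_count_l1_cb, &cnt) */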
 
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_tx.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_tx.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dmu_tx.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_tx.h	27 Feb 2010 22:31:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_tx.h	10 Oct 2016 11:09:53 -0000
@@ -19,14 +19,16 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
 
 #ifndef	_SYS_DMU_TX_H
 #define	_SYS_DMU_TX_H
 
-#include <sys/inttypes.h>
 #include <sys/dmu.h>
 #include <sys/txg.h>
 #include <sys/refcount.h>
@@ -57,8 +59,22 @@ struct dmu_tx {
 	txg_handle_t tx_txgh;
 	void *tx_tempreserve_cookie;
 	struct dmu_tx_hold *tx_needassign_txh;
-	list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */
-	uint8_t tx_anyobj;
+
+	/* list of dmu_tx_callback_t on this dmu_tx */
+	list_t tx_callbacks;
+
+	/* placeholder for syncing context, doesn't need specific holds */
+	boolean_t tx_anyobj;
+
+	/* has this transaction already been delayed? */
+	boolean_t tx_waited;
+
+	/* time this transaction was created */
+	hrtime_t tx_start;
+
+	/* need to wait for sufficient dirty space */
+	boolean_t tx_wait_dirty;
+
 	int tx_err;
 #ifdef ZFS_DEBUG
 	uint64_t tx_space_towrite;
@@ -77,6 +93,7 @@ enum dmu_tx_hold_type {
 	THT_FREE,
 	THT_ZAP,
 	THT_SPACE,
+	THT_SPILL,
 	THT_NUMTYPES
 };
 
@@ -84,12 +101,12 @@ typedef struct dmu_tx_hold {
 	dmu_tx_t *txh_tx;
 	list_node_t txh_node;
 	struct dnode *txh_dnode;
-	uint64_t txh_space_towrite;
-	uint64_t txh_space_tofree;
-	uint64_t txh_space_tooverwrite;
-	uint64_t txh_space_tounref;
-	uint64_t txh_memory_tohold;
-	uint64_t txh_fudge;
+	refcount_t txh_space_towrite;
+	refcount_t txh_space_tofree;
+	refcount_t txh_space_tooverwrite;
+	refcount_t txh_space_tounref;
+	refcount_t txh_memory_tohold;
+	refcount_t txh_fudge;
 #ifdef ZFS_DEBUG
 	enum dmu_tx_hold_type txh_type;
 	uint64_t txh_arg1;
@@ -107,10 +124,11 @@ typedef struct dmu_tx_callback {
  * These routines are defined in dmu.h, and are called by the user.
  */
 dmu_tx_t *dmu_tx_create(objset_t *dd);
-int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how);
 void dmu_tx_commit(dmu_tx_t *tx);
 void dmu_tx_abort(dmu_tx_t *tx);
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
+struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx);
 void dmu_tx_wait(dmu_tx_t *tx);
 
 void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_zfetch.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_zfetch.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dmu_zfetch.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_zfetch.h	27 Feb 2010 22:31:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_zfetch.h	16 Apr 2016 16:39:04 -0000
@@ -23,8 +23,12 @@
  * Use is subject to license terms.
  */
 
-#ifndef	_DFETCH_H
-#define	_DFETCH_H
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef	_DMU_ZFETCH_H
+#define	_DMU_ZFETCH_H
 
 #include <sys/zfs_context.h>
 
@@ -36,41 +40,37 @@ extern uint64_t	zfetch_array_rd_sz;
 
 struct dnode;				/* so we can reference dnode */
 
-typedef enum zfetch_dirn {
-	ZFETCH_FORWARD = 1,		/* prefetch increasing block numbers */
-	ZFETCH_BACKWARD	= -1		/* prefetch decreasing block numbers */
-} zfetch_dirn_t;
-
 typedef struct zstream {
-	uint64_t	zst_offset;	/* offset of starting block in range */
-	uint64_t	zst_len;	/* length of range, in blocks */
-	zfetch_dirn_t	zst_direction;	/* direction of prefetch */
-	uint64_t	zst_stride;	/* length of stride, in blocks */
-	uint64_t	zst_ph_offset;	/* prefetch offset, in blocks */
-	uint64_t	zst_cap;	/* prefetch limit (cap), in blocks */
-	kmutex_t	zst_lock;	/* protects stream */
-	clock_t		zst_last;	/* lbolt of last prefetch */
-	avl_node_t	zst_node;	/* embed avl node here */
+	uint64_t	zs_blkid;	/* expect next access at this blkid */
+	uint64_t	zs_pf_blkid;	/* next block to prefetch */
+
+	/*
+	 * We will next prefetch the L1 indirect block of this level-0
+	 * block id.
+	 */
+	uint64_t	zs_ipf_blkid;
+
+	kmutex_t	zs_lock;	/* protects stream */
+	hrtime_t	zs_atime;	/* time last prefetch issued */
+	list_node_t	zs_node;	/* link for zf_stream */
 } zstream_t;
 
 typedef struct zfetch {
 	krwlock_t	zf_rwlock;	/* protects zfetch structure */
-	list_t		zf_stream;	/* AVL tree of zstream_t's */
+	list_t		zf_stream;	/* list of zstream_t's */
 	struct dnode	*zf_dnode;	/* dnode that owns this zfetch */
-	uint32_t	zf_stream_cnt;	/* # of active streams */
-	uint64_t	zf_alloc_fail;	/* # of failed attempts to alloc strm */
 } zfetch_t;
 
 void		zfetch_init(void);
 void		zfetch_fini(void);
 
 void		dmu_zfetch_init(zfetch_t *, struct dnode *);
-void		dmu_zfetch_rele(zfetch_t *);
-void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);
+void		dmu_zfetch_fini(zfetch_t *);
+void		dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t);
 
 
 #ifdef	__cplusplus
 }
 #endif
 
-#endif	/* _DFETCH_H */
+#endif	/* _DMU_ZFETCH_H */
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dnode.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dnode.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dnode.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dnode.h	27 Feb 2010 22:31:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dnode.h	2 Mar 2017 10:54:19 -0000
@@ -19,8 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef	_SYS_DNODE_H
@@ -33,6 +34,7 @@
 #include <sys/zio.h>
 #include <sys/refcount.h>
 #include <sys/dmu_zfetch.h>
+#include <sys/zrlock.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -55,14 +57,26 @@ extern "C" {
  * Fixed constants.
  */
 #define	DNODE_SHIFT		9	/* 512 bytes */
-#define	DN_MIN_INDBLKSHIFT	10	/* 1k */
-#define	DN_MAX_INDBLKSHIFT	14	/* 16k */
+#define	DN_MIN_INDBLKSHIFT	12	/* 4k */
+#define	DN_MAX_INDBLKSHIFT	17	/* 128k */
 #define	DNODE_BLOCK_SHIFT	14	/* 16k */
 #define	DNODE_CORE_SIZE		64	/* 64 bytes for dnode sans blkptrs */
 #define	DN_MAX_OBJECT_SHIFT	48	/* 256 trillion (zfs_fid_t limit) */
 #define	DN_MAX_OFFSET_SHIFT	64	/* 2^64 bytes in a dnode */
 
 /*
+ * dnode id flags
+ *
+ * Note: a file's ids will never be moved from bonus->spill;
+ * only in a crypto environment would they reside on the spill block.
+ */
+#define	DN_ID_CHKED_BONUS	0x1
+#define	DN_ID_CHKED_SPILL	0x2
+#define	DN_ID_OLD_EXIST		0x4
+#define	DN_ID_NEW_EXIST		0x8
+
+/*
  * Derived constants.
  */
 #define	DNODE_SIZE	(1 << DNODE_SHIFT)
@@ -70,9 +84,15 @@ extern "C" {
 #define	DN_MAX_BONUSLEN	(DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
 #define	DN_MAX_OBJECT	(1ULL << DN_MAX_OBJECT_SHIFT)
 #define	DN_ZERO_BONUSLEN	(DN_MAX_BONUSLEN + 1)
+#define	DN_KILL_SPILLBLK (1)
 
 #define	DNODES_PER_BLOCK_SHIFT	(DNODE_BLOCK_SHIFT - DNODE_SHIFT)
 #define	DNODES_PER_BLOCK	(1ULL << DNODES_PER_BLOCK_SHIFT)
+
+/*
+ * This is inaccurate if the indblkshift of the particular object is not the
+ * max.  But it's only used by userland to calculate the zvol reservation.
+ */
 #define	DNODES_PER_LEVEL_SHIFT	(DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
 #define	DNODES_PER_LEVEL	(1ULL << DNODES_PER_LEVEL_SHIFT)
 
@@ -102,6 +122,9 @@ enum dnode_dirtycontext {
 #define	DNODE_FLAG_USED_BYTES		(1<<0)
 #define	DNODE_FLAG_USERUSED_ACCOUNTED	(1<<1)
 
+/* Does dnode have a SA spill blkptr in bonus? */
+#define	DNODE_FLAG_SPILL_BLKPTR	(1<<2)
+
 typedef struct dnode_phys {
 	uint8_t dn_type;		/* dmu_object_type_t */
 	uint8_t dn_indblkshift;		/* ln2(indirect block size) */
@@ -122,14 +145,14 @@ typedef struct dnode_phys {
 	uint64_t dn_pad3[4];
 
 	blkptr_t dn_blkptr[1];
-	uint8_t dn_bonus[DN_MAX_BONUSLEN];
+	uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)];
+	blkptr_t dn_spill;
 } dnode_phys_t;
 
-typedef struct dnode {
+struct dnode {
 	/*
-	 * dn_struct_rwlock protects the structure of the dnode,
-	 * including the number of levels of indirection (dn_nlevels),
-	 * dn_maxblkid, and dn_next_*
+	 * Protects the structure of the dnode, including the number of levels
+	 * of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
 	 */
 	krwlock_t dn_struct_rwlock;
 
@@ -140,6 +163,7 @@ typedef struct dnode {
 	struct objset *dn_objset;
 	uint64_t dn_object;
 	struct dmu_buf_impl *dn_dbuf;
+	struct dnode_handle *dn_handle;
 	dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
 
 	/*
@@ -156,22 +180,29 @@ typedef struct dnode {
 	uint8_t dn_nlevels;
 	uint8_t dn_indblkshift;
 	uint8_t dn_datablkshift;	/* zero if blksz not power of 2! */
+	uint8_t dn_moved;		/* Has this dnode been moved? */
 	uint16_t dn_datablkszsec;	/* in 512b sectors */
 	uint32_t dn_datablksz;		/* in bytes */
 	uint64_t dn_maxblkid;
+	uint8_t dn_next_type[TXG_SIZE];
 	uint8_t dn_next_nblkptr[TXG_SIZE];
 	uint8_t dn_next_nlevels[TXG_SIZE];
 	uint8_t dn_next_indblkshift[TXG_SIZE];
+	uint8_t dn_next_bonustype[TXG_SIZE];
+	uint8_t dn_rm_spillblk[TXG_SIZE];	/* for removing spill blk */
 	uint16_t dn_next_bonuslen[TXG_SIZE];
 	uint32_t dn_next_blksz[TXG_SIZE];	/* next block size in bytes */
 
+	/* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
+	uint32_t dn_dbufs_count;	/* count of dn_dbufs */
+
 	/* protected by os_lock: */
 	list_node_t dn_dirty_link[TXG_SIZE];	/* next on dataset's dirty */
 
 	/* protected by dn_mtx: */
 	kmutex_t dn_mtx;
 	list_t dn_dirty_records[TXG_SIZE];
-	avl_tree_t dn_ranges[TXG_SIZE];
+	struct range_tree *dn_free_ranges[TXG_SIZE];
 	uint64_t dn_allocated_txg;
 	uint64_t dn_free_txg;
 	uint64_t dn_assigned_txg;
@@ -184,18 +215,54 @@ typedef struct dnode {
 	refcount_t dn_holds;
 
 	kmutex_t dn_dbufs_mtx;
-	list_t dn_dbufs;		/* linked list of descendent dbuf_t's */
+	/*
+	 * Descendant dbufs, ordered by dbuf_compare. Note that dn_dbufs
+	 * can contain multiple dbufs of the same (level, blkid) when a
+	 * dbuf is marked DB_EVICTING without being removed from
+	 * dn_dbufs. To maintain the avl invariant that there cannot be
+	 * duplicate entries, we order the dbufs by an arbitrary value -
+	 * their address in memory. This means that dn_dbufs cannot be used to
+	 * directly look up a dbuf. Instead, callers must use avl_walk, have
+	 * a reference to the dbuf, or look up a non-existent node with
+	 * db_state = DB_SEARCH (see dbuf_free_range for an example).
+	 */
+	avl_tree_t dn_dbufs;
+
+	/* protected by dn_struct_rwlock */
 	struct dmu_buf_impl *dn_bonus;	/* bonus buffer dbuf */
 
+	boolean_t dn_have_spill;	/* have spill or are spilling */
+
 	/* parent IO for current sync write */
 	zio_t *dn_zio;
 
 	/* used in syncing context */
-	dnode_phys_t *dn_oldphys;
+	uint64_t dn_oldused;	/* old phys used bytes */
+	uint64_t dn_oldflags;	/* old phys dn_flags */
+	uint64_t dn_olduid, dn_oldgid;
+	uint64_t dn_newuid, dn_newgid;
+	int dn_id_flags;
 
 	/* holds prefetch structure */
 	struct zfetch	dn_zfetch;
-} dnode_t;
+};
+
+/*
+ * Adds a level of indirection between the dbuf and the dnode to avoid
+ * iterating descendant dbufs in dnode_move(). Handles are not allocated
+ * individually, but as an array of child dnodes in dnode_hold_impl().
+ */
+typedef struct dnode_handle {
+	/* Protects dnh_dnode from modification by dnode_move(). */
+	zrlock_t dnh_zrlock;
+	dnode_t *dnh_dnode;
+} dnode_handle_t;
+
+typedef struct dnode_children {
+	dmu_buf_user_t dnc_dbu;		/* User evict data */
+	size_t dnc_count;		/* number of children */
+	dnode_handle_t dnc_children[];	/* sized dynamically */
+} dnode_children_t;
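/*
 * Illustrative sketch (hypothetical accessor): dereference a dnode through
 * its handle, pinning it against dnode_move() with the zrlock primitives
 * from sys/zrlock.h.
 */
static uint64_t
example_handle_object(dnode_handle_t *dnh)
{
	uint64_t obj;

	zrl_add(&dnh->dnh_zrlock);
	obj = dnh->dnh_dnode->dn_object;
	zrl_remove(&dnh->dnh_zrlock);
	return (obj);
}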
 
 typedef struct free_range {
 	avl_node_t fr_node;
@@ -203,17 +270,21 @@ typedef struct free_range {
 	uint64_t fr_nblks;
 } free_range_t;
 
-dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
-    uint64_t object);
-void dnode_special_close(dnode_t *dn);
+void dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
+    uint64_t object, dnode_handle_t *dnh);
+void dnode_special_close(dnode_handle_t *dnh);
 
 void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
+void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx);
+void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx);
+
 int dnode_hold(struct objset *dd, uint64_t object,
     void *ref, dnode_t **dnp);
 int dnode_hold_impl(struct objset *dd, uint64_t object, int flag,
     void *ref, dnode_t **dnp);
 boolean_t dnode_add_ref(dnode_t *dn, void *ref);
 void dnode_rele(dnode_t *dn, void *ref);
+void dnode_rele_and_unlock(dnode_t *dn, void *tag);
 void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
 void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
 void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
@@ -225,10 +296,7 @@ void dnode_byteswap(dnode_phys_t *dnp);
 void dnode_buf_byteswap(void *buf, size_t size);
 void dnode_verify(dnode_t *dn);
 int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
-uint64_t dnode_current_max_length(dnode_t *dn);
 void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
-void dnode_clear_range(dnode_t *dn, uint64_t blkid,
-    uint64_t nblks, dmu_tx_t *tx);
 void dnode_diduse_space(dnode_t *dn, int64_t space);
 void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
 void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
@@ -238,6 +306,16 @@ void dnode_fini(void);
 int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
     int minlvl, uint64_t blkfill, uint64_t txg);
 void dnode_evict_dbufs(dnode_t *dn);
+void dnode_evict_bonus(dnode_t *dn);
+
+#define	DNODE_IS_CACHEABLE(_dn)						\
+	((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
+	(DMU_OT_IS_METADATA((_dn)->dn_type) &&				\
+	(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))
+
+#define	DNODE_META_IS_CACHEABLE(_dn)					\
+	((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
+	(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
 
 #ifdef ZFS_DEBUG
 
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_bookmark.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_bookmark.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_bookmark.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_bookmark.h	13 Jan 2014 02:59:34 -0000
@@ -0,0 +1,51 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef	_SYS_DSL_BOOKMARK_H
+#define	_SYS_DSL_BOOKMARK_H
+
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct dsl_pool;
+struct dsl_dataset;
+
+/*
+ * On disk zap object.
+ */
+typedef struct zfs_bookmark_phys {
+	uint64_t zbm_guid;		/* guid of bookmarked dataset */
+	uint64_t zbm_creation_txg;	/* birth transaction group */
+	uint64_t zbm_creation_time;	/* bookmark creation time */
+} zfs_bookmark_phys_t;
+
+int dsl_bookmark_create(nvlist_t *, nvlist_t *);
+int dsl_get_bookmarks(const char *, nvlist_t *, nvlist_t *);
+int dsl_get_bookmarks_impl(dsl_dataset_t *, nvlist_t *, nvlist_t *);
+int dsl_bookmark_destroy(nvlist_t *, nvlist_t *);
+int dsl_bookmark_lookup(struct dsl_pool *, const char *,
+    struct dsl_dataset *, zfs_bookmark_phys_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_BOOKMARK_H */
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dataset.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dataset.h,v
retrieving revision 1.2
diff -u -p -r1.2 dsl_dataset.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dataset.h	19 Feb 2016 19:25:22 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dataset.h	3 Dec 2016 17:03:48 -0000
@@ -19,8 +19,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef	_SYS_DSL_DATASET_H
@@ -33,6 +37,10 @@
 #include <sys/bplist.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_context.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/refcount.h>
+#include <sys/rrwlock.h>
+#include <zfeature_common.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -44,12 +52,10 @@ struct dsl_pool;
 
 #define	DS_FLAG_INCONSISTENT	(1ULL<<0)
 #define	DS_IS_INCONSISTENT(ds)	\
-	((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT)
+	(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT)
+
 /*
- * NB: nopromote can not yet be set, but we want support for it in this
- * on-disk version, so that we don't need to upgrade for it later.  It
- * will be needed when we implement 'zfs split' (where the split off
- * clone should not be promoted).
+ * Do not allow this dataset to be promoted.
  */
 #define	DS_FLAG_NOPROMOTE	(1ULL<<1)
 
@@ -66,7 +72,31 @@ struct dsl_pool;
  */
 #define	DS_FLAG_DEFER_DESTROY	(1ULL<<3)
 #define	DS_IS_DEFER_DESTROY(ds)	\
-	((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY)
+	(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY)
+
+/*
+ * DS_FIELD_* are strings that are used in the "extensified" dataset zap object.
+ * They should be of the format <reverse-dns>:<field>.
+ */
+
+/*
+ * This field's value is the object ID of a zap object which contains the
+ * bookmarks of this dataset.  If it is present, then this dataset is counted
+ * in the refcount of the SPA_FEATURE_BOOKMARKS feature.
+ */
+#define	DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
+
+/*
+ * These fields are set on datasets that are in the middle of a resumable
+ * receive, and allow the sender to resume the send if it is interrupted.
+ */
+#define	DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid"
+#define	DS_FIELD_RESUME_TONAME "com.delphix:resume_toname"
+#define	DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid"
+#define	DS_FIELD_RESUME_OBJECT "com.delphix:resume_object"
+#define	DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset"
+#define	DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes"
+#define	DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok"
 
 /*
  * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
@@ -74,6 +104,8 @@ struct dsl_pool;
  */
 #define	DS_FLAG_CI_DATASET	(1ULL<<16)
 
+#define	DS_CREATE_FLAG_NODIRTY	(1ULL<<24)
+
 typedef struct dsl_dataset_phys {
 	uint64_t ds_dir_obj;		/* DMU_OT_DSL_DIR */
 	uint64_t ds_prev_snap_obj;	/* DMU_OT_DSL_DATASET */
@@ -83,8 +115,13 @@ typedef struct dsl_dataset_phys {
 	uint64_t ds_num_children;	/* clone/snap children; ==0 for head */
 	uint64_t ds_creation_time;	/* seconds since 1970 */
 	uint64_t ds_creation_txg;
-	uint64_t ds_deadlist_obj;	/* DMU_OT_BPLIST */
-	uint64_t ds_used_bytes;
+	uint64_t ds_deadlist_obj;	/* DMU_OT_DEADLIST */
+	/*
+	 * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes
+	 * include all blocks referenced by this dataset, including those
+	 * shared with any other datasets.
+	 */
+	uint64_t ds_referenced_bytes;
 	uint64_t ds_compressed_bytes;
 	uint64_t ds_uncompressed_bytes;
 	uint64_t ds_unique_bytes;	/* only relevant to snapshots */
@@ -104,22 +141,23 @@ typedef struct dsl_dataset_phys {
 } dsl_dataset_phys_t;
 
 typedef struct dsl_dataset {
+	dmu_buf_user_t ds_dbu;
+	rrwlock_t ds_bp_rwlock; /* Protects ds_phys->ds_bp */
+
 	/* Immutable: */
 	struct dsl_dir *ds_dir;
-	dsl_dataset_phys_t *ds_phys;
 	dmu_buf_t *ds_dbuf;
 	uint64_t ds_object;
 	uint64_t ds_fsid_guid;
+	boolean_t ds_is_snapshot;
 
 	/* only used in syncing context, only valid for non-snapshots: */
 	struct dsl_dataset *ds_prev;
-	uint64_t ds_origin_txg;
+	uint64_t ds_bookmarks;  /* DMU_OTN_ZAP_METADATA */
 
 	/* has internal locking: */
-	bplist_t ds_deadlist;
-
-	/* to protect against multiple concurrent incremental recv */
-	kmutex_t ds_recvlock;
+	dsl_deadlist_t ds_deadlist;
+	bplist_t ds_pending_deadlist;
 
 	/* protected by lock on pool's dp_dirty_datasets list */
 	txg_node_t ds_dirty_link;
@@ -132,13 +170,15 @@ typedef struct dsl_dataset {
 	kmutex_t ds_lock;
 	objset_t *ds_objset;
 	uint64_t ds_userrefs;
+	void *ds_owner;
 
 	/*
-	 * ds_owner is protected by the ds_rwlock and the ds_lock
+	 * Long holds prevent the ds from being destroyed; they allow the
+	 * ds to remain held even after dropping the dp_config_rwlock.
+	 * Owning counts as a long hold.  See the comments above
+	 * dsl_pool_hold() for details.
 	 */
-	krwlock_t ds_rwlock;
-	kcondvar_t ds_exclusive_cv;
-	void *ds_owner;
+	refcount_t ds_longholds;
 
 	/* no locking; only for making guesses */
 	uint64_t ds_trysnap_txg;
@@ -149,75 +189,99 @@ typedef struct dsl_dataset {
 	uint64_t ds_reserved;	/* cached refreservation */
 	uint64_t ds_quota;	/* cached refquota */
 
+	kmutex_t ds_sendstream_lock;
+	list_t ds_sendstreams;
+
+	/*
+	 * When in the middle of a resumable receive, tracks how much
+	 * progress we have made.
+	 */
+	uint64_t ds_resume_object[TXG_SIZE];
+	uint64_t ds_resume_offset[TXG_SIZE];
+	uint64_t ds_resume_bytes[TXG_SIZE];
+
+	/* Protected by our dsl_dir's dd_lock */
+	list_t ds_prop_cbs;
+
+	/*
+	 * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset
+	 * uses this feature.
+	 */
+	uint8_t ds_feature_inuse[SPA_FEATURES];
+
+	/*
+	 * Set if we need to activate the feature on this dataset this txg
+	 * (used only in syncing context).
+	 */
+	uint8_t ds_feature_activation_needed[SPA_FEATURES];
+
 	/* Protected by ds_lock; keep at end of struct for better locality */
-	char ds_snapname[MAXNAMELEN];
+	char ds_snapname[ZFS_MAX_DATASET_NAME_LEN];
 } dsl_dataset_t;
 
-struct dsl_ds_destroyarg {
-	dsl_dataset_t *ds;		/* ds to destroy */
-	dsl_dataset_t *rm_origin;	/* also remove our origin? */
-	boolean_t is_origin_rm;		/* set if removing origin snap */
-	boolean_t defer;		/* destroy -d requested? */
-	boolean_t releasing;		/* destroying due to release? */
-	boolean_t need_prep;		/* do we need to retry due to EBUSY? */
-};
+inline dsl_dataset_phys_t *
+dsl_dataset_phys(dsl_dataset_t *ds)
+{
+	return (ds->ds_dbuf->db_data);
+}
 
-#define	dsl_dataset_is_snapshot(ds)	\
-	((ds)->ds_phys->ds_num_children != 0)
+/*
+ * The max length of a temporary tag prefix is the number of hex digits
+ * required to express UINT64_MAX plus one for the hyphen.
+ */
+#define	MAX_TAG_PREFIX_LEN	17
+
+#define	dsl_dataset_is_snapshot(ds) \
+	(dsl_dataset_phys(ds)->ds_num_children != 0)
 
 #define	DS_UNIQUE_IS_ACCURATE(ds)	\
-	(((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
+	((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
 
-int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp);
-int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj,
-    void *tag, dsl_dataset_t **);
-int dsl_dataset_own(const char *name, boolean_t inconsistentok,
+int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag,
+    dsl_dataset_t **dsp);
+boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds,
+    void *tag);
+int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag,
+    dsl_dataset_t **);
+void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
+int dsl_dataset_own(struct dsl_pool *dp, const char *name,
     void *tag, dsl_dataset_t **dsp);
 int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
-    boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp);
-void dsl_dataset_name(dsl_dataset_t *ds, char *name);
-void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
+    void *tag, dsl_dataset_t **dsp);
 void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
-void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag);
-boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok,
-    void *tag);
-void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag);
+void dsl_dataset_name(dsl_dataset_t *ds, char *name);
+boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag);
+int dsl_dataset_namelen(dsl_dataset_t *ds);
+boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds);
 uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
 uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
     uint64_t flags, dmu_tx_t *tx);
-int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer);
-int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer);
-dsl_checkfunc_t dsl_dataset_destroy_check;
-dsl_syncfunc_t dsl_dataset_destroy_sync;
-dsl_checkfunc_t dsl_dataset_snapshot_check;
-dsl_syncfunc_t dsl_dataset_snapshot_sync;
-int dsl_dataset_rename(const char *name, const char *newname, boolean_t recursive);
+int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors);
 int dsl_dataset_promote(const char *name, char *conflsnap);
 int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
     boolean_t force);
-int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
-    boolean_t recursive, boolean_t temphold);
-int dsl_dataset_user_release(char *dsname, char *snapname, char *htag,
-    boolean_t recursive);
-int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj,
-    char *htag);
-int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp);
+int dsl_dataset_rename_snapshot(const char *fsname,
+    const char *oldsnapname, const char *newsnapname, boolean_t recursive);
+int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
+    minor_t cleanup_minor, const char *htag);
 
 blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
-void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
 
 spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
 
-boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds);
+boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds,
+    dsl_dataset_t *snap);
 
 void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
+void dsl_dataset_sync_done(dsl_dataset_t *os, dmu_tx_t *tx);
 
 void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
     dmu_tx_t *tx);
 int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp,
     dmu_tx_t *tx, boolean_t async);
-boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
+boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
+    uint64_t blk_birth);
 uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
 
 void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
@@ -227,28 +291,62 @@ void dsl_dataset_space(dsl_dataset_t *ds
     uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
+int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds);
 
 int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
 
 int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
     uint64_t asize, uint64_t inflight, uint64_t *used,
     uint64_t *ref_rsrv);
-int dsl_dataset_set_quota(const char *dsname, zprop_source_t source,
+int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
     uint64_t quota);
-void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr,
-    dmu_tx_t *tx);
-int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source,
+int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
     uint64_t reservation);
 
-int dsl_destroy_inconsistent(const char *dsname, void *arg);
+boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
+    uint64_t earlier_txg);
+void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag);
+void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag);
+boolean_t dsl_dataset_long_held(dsl_dataset_t *ds);
+
+int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
+    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx);
+void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
+    dsl_dataset_t *origin_head, dmu_tx_t *tx);
+int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
+    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr);
+void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
+    dmu_tx_t *tx);
+
+void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
+    dmu_tx_t *tx);
+void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds);
+int dsl_dataset_get_snapname(dsl_dataset_t *ds);
+int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
+    uint64_t *value);
+int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+    boolean_t adj_cnt);
+void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
+    zprop_source_t source, uint64_t value, dmu_tx_t *tx);
+void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
+boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds);
+boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds);
+int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result);
+
+void dsl_dataset_deactivate_feature(uint64_t dsobj,
+    spa_feature_t f, dmu_tx_t *tx);
 
 #ifdef ZFS_DEBUG
 #define	dprintf_ds(ds, fmt, ...) do { \
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
-	char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
+	char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
 	dsl_dataset_name(ds, __ds_name); \
 	dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
-	kmem_free(__ds_name, MAXNAMELEN); \
+	kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
 	} \
 _NOTE(CONSTCOND) } while (0)
 #else
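
Two idioms change together in this header: the cached ds_phys pointer is
gone (callers now reach the phys data through the dsl_dataset_phys()
accessor over ds_dbuf), and the old ds_rwlock/ds_exclusive_cv exclusion is
replaced by counted long holds.  A hedged sketch of the resulting pattern,
modeled on how the send path uses it (error handling omitted; FTAG is the
usual hold-tag macro):

	dsl_pool_t *dp;
	dsl_dataset_t *ds;

	VERIFY0(dsl_pool_hold("tank/fs", FTAG, &dp));
	VERIFY0(dsl_dataset_hold(dp, "tank/fs", FTAG, &ds));
	dsl_dataset_long_hold(ds, FTAG);
	dsl_pool_rele(dp, FTAG);	/* the long hold keeps ds valid */

	uint64_t txg = dsl_dataset_phys(ds)->ds_creation_txg;

	dsl_dataset_long_rele(ds, FTAG);
	dsl_dataset_rele(ds, FTAG);
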
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deadlist.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deadlist.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deadlist.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deadlist.h	12 Jun 2012 05:57:27 -0000
@@ -0,0 +1,87 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_SYS_DSL_DEADLIST_H
+#define	_SYS_DSL_DEADLIST_H
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf;
+struct dsl_dataset;
+
+typedef struct dsl_deadlist_phys {
+	uint64_t dl_used;
+	uint64_t dl_comp;
+	uint64_t dl_uncomp;
+	uint64_t dl_pad[37]; /* pad out to 320 bytes for future expansion */
+} dsl_deadlist_phys_t;
+
+typedef struct dsl_deadlist {
+	objset_t *dl_os;
+	uint64_t dl_object;
+	avl_tree_t dl_tree;
+	boolean_t dl_havetree;
+	struct dmu_buf *dl_dbuf;
+	dsl_deadlist_phys_t *dl_phys;
+	kmutex_t dl_lock;
+
+	/* if it's the old on-disk format: */
+	bpobj_t dl_bpobj;
+	boolean_t dl_oldfmt;
+} dsl_deadlist_t;
+
+typedef struct dsl_deadlist_entry {
+	avl_node_t dle_node;
+	uint64_t dle_mintxg;
+	bpobj_t dle_bpobj;
+} dsl_deadlist_entry_t;
+
+void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
+void dsl_deadlist_close(dsl_deadlist_t *dl);
+uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
+void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
+void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
+void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+    uint64_t mrs_obj, dmu_tx_t *tx);
+void dsl_deadlist_space(dsl_deadlist_t *dl,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_space_range(dsl_deadlist_t *dl,
+    uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
+void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+    dmu_tx_t *tx);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DEADLIST_H */
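
The dl_oldfmt/dl_bpobj pair is what lets a single interface span both
on-disk formats: when dl_oldfmt is set the "deadlist" is really a plain
bpobj, otherwise it is the AVL tree of per-mintxg bpobjs declared above.
A sketch of the common query pattern (os and dlobj are caller-provided):

	dsl_deadlist_t dl;
	uint64_t used, comp, uncomp;

	dsl_deadlist_open(&dl, os, dlobj);
	dsl_deadlist_space(&dl, &used, &comp, &uncomp);
	dsl_deadlist_close(&dl);
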
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deleg.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deleg.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dsl_deleg.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deleg.h	27 Feb 2010 22:31:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deleg.h	13 Jan 2014 02:59:34 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_DELEG_H
@@ -55,6 +55,8 @@ extern "C" {
 #define	ZFS_DELEG_PERM_GROUPUSED	"groupused"
 #define	ZFS_DELEG_PERM_HOLD		"hold"
 #define	ZFS_DELEG_PERM_RELEASE		"release"
+#define	ZFS_DELEG_PERM_DIFF		"diff"
+#define	ZFS_DELEG_PERM_BOOKMARK		"bookmark"
 
 /*
  * Note: the names of properties that are marked delegatable are also
@@ -64,6 +66,7 @@ extern "C" {
 int dsl_deleg_get(const char *ddname, nvlist_t **nvp);
 int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset);
 int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr);
+int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr);
 void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr);
 int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr);
 int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr);
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_destroy.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_destroy.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_destroy.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_destroy.h	2 Sep 2013 11:38:20 -0000
@@ -0,0 +1,53 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef	_SYS_DSL_DESTROY_H
+#define	_SYS_DSL_DESTROY_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct nvlist;
+struct dsl_dataset;
+struct dmu_tx;
+
+int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t,
+    struct nvlist *);
+int dsl_destroy_snapshot(const char *, boolean_t);
+int dsl_destroy_head(const char *);
+int dsl_destroy_head_check_impl(struct dsl_dataset *, int);
+void dsl_destroy_head_sync_impl(struct dsl_dataset *, struct dmu_tx *);
+int dsl_destroy_inconsistent(const char *, void *);
+int dsl_destroy_snapshot_check_impl(struct dsl_dataset *, boolean_t);
+void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *,
+    boolean_t, struct dmu_tx *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DESTROY_H */
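
These interfaces are nvlist-driven: the nvpair names in the first list are
the snapshots to destroy, and per-snapshot errors come back in the errlist.
A hedged sketch using the fnvlist wrappers (snapshot names illustrative):

	nvlist_t *snaps = fnvlist_alloc();
	nvlist_t *errlist = fnvlist_alloc();

	fnvlist_add_boolean(snaps, "tank/fs@snap1");
	fnvlist_add_boolean(snaps, "tank/fs@snap2");
	(void) dsl_destroy_snapshots_nvl(snaps, B_FALSE /* defer */, errlist);

	fnvlist_free(snaps);
	fnvlist_free(errlist);
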
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dir.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dir.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dsl_dir.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dir.h	27 Feb 2010 22:31:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dir.h	10 Oct 2016 11:09:53 -0000
@@ -19,8 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef	_SYS_DSL_DIR_H
@@ -38,6 +40,14 @@ extern "C" {
 
 struct dsl_dataset;
 
+/*
+ * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
+ * They should be of the format <reverse-dns>:<field>.
+ */
+
+#define	DD_FIELD_FILESYSTEM_COUNT	"com.joyent:filesystem_count"
+#define	DD_FIELD_SNAPSHOT_COUNT		"com.joyent:snapshot_count"
+
 typedef enum dd_used {
 	DD_USED_HEAD,
 	DD_USED_SNAP,
@@ -70,16 +80,20 @@ typedef struct dsl_dir_phys {
 	uint64_t dd_deleg_zapobj; /* dataset delegation permissions */
 	uint64_t dd_flags;
 	uint64_t dd_used_breakdown[DD_USED_NUM];
-	uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */
+	uint64_t dd_clones; /* dsl_dir objects */
+	uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */
 } dsl_dir_phys_t;
 
 struct dsl_dir {
+	dmu_buf_user_t dd_dbu;
+
 	/* These are immutable; no lock needed: */
 	uint64_t dd_object;
-	dsl_dir_phys_t *dd_phys;
-	dmu_buf_t *dd_dbuf;
 	dsl_pool_t *dd_pool;
 
+	/* Stable until user eviction; no lock needed: */
+	dmu_buf_t *dd_dbuf;
+
 	/* protected by lock on pool's dp_dirty_dirs list */
 	txg_node_t dd_dirty_link;
 
@@ -88,8 +102,9 @@ struct dsl_dir {
 
 	/* Protected by dd_lock */
 	kmutex_t dd_lock;
-	list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
+	list_t dd_props; /* list of dsl_prop_record_t's */
 	timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
+	uint64_t dd_origin_txg;
 
 	/* gross estimate of space used by in-flight tx's */
 	uint64_t dd_tempreserved[TXG_SIZE];
@@ -97,21 +112,25 @@ struct dsl_dir {
 	int64_t dd_space_towrite[TXG_SIZE];
 
 	/* protected by dd_lock; keep at end of struct for better locality */
-	char dd_myname[MAXNAMELEN];
+	char dd_myname[ZFS_MAX_DATASET_NAME_LEN];
 };
 
-void dsl_dir_close(dsl_dir_t *dd, void *tag);
-int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail);
-int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **,
-    const char **tailp);
-int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+inline dsl_dir_phys_t *
+dsl_dir_phys(dsl_dir_t *dd)
+{
+	return (dd->dd_dbuf->db_data);
+}
+
+void dsl_dir_rele(dsl_dir_t *dd, void *tag);
+void dsl_dir_async_rele(dsl_dir_t *dd, void *tag);
+int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
+    dsl_dir_t **, const char **tail);
+int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
     const char *tail, void *tag, dsl_dir_t **);
 void dsl_dir_name(dsl_dir_t *dd, char *buf);
 int dsl_dir_namelen(dsl_dir_t *dd);
 uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
     const char *name, dmu_tx_t *tx);
-dsl_checkfunc_t dsl_dir_destroy_check;
-dsl_syncfunc_t dsl_dir_destroy_sync;
 void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
 uint64_t dsl_dir_space_available(dsl_dir_t *dd,
     dsl_dir_t *ancestor, int64_t delta, int ondiskonly);
@@ -130,27 +149,37 @@ int dsl_dir_set_quota(const char *ddname
     uint64_t quota);
 int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
     uint64_t reservation);
-int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
-int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
-int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx);
+int dsl_dir_activate_fs_ss_limit(const char *);
+int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *,
+    cred_t *);
+void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *);
+int dsl_dir_rename(const char *oldname, const char *newname);
+int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+    uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *);
 boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
 void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
     uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
 void dsl_dir_snap_cmtime_update(dsl_dir_t *dd);
 timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
+void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
+    dmu_tx_t *tx);
+void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
+boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
 
 /* internal reserved dir name */
 #define	MOS_DIR_NAME "$MOS"
 #define	ORIGIN_DIR_NAME "$ORIGIN"
+#define	XLATION_DIR_NAME "$XLATION"
+#define	FREE_DIR_NAME "$FREE"
+#define	LEAK_DIR_NAME "$LEAK"
 
 #ifdef ZFS_DEBUG
 #define	dprintf_dd(dd, fmt, ...) do { \
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
-	char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \
-	    KM_SLEEP); \
+	char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
 	dsl_dir_name(dd, __ds_name); \
 	dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
-	kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \
+	kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
 	} \
 _NOTE(CONSTCOND) } while (0)
 #else
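
As with datasets, dd_phys becomes an accessor over the dbuf data, and the
DD_FIELD_* keys live beside the packed phys block once the dir object has
been converted to a zap.  A sketch of storing one such field in syncing
context (mos and count are illustrative names):

	uint64_t count = 1;

	if (!dsl_dir_is_zapified(dd))
		dsl_dir_zapify(dd, tx);
	VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
	    sizeof (uint64_t), 1, &count, tx));
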
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_pool.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_pool.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dsl_pool.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_pool.h	27 Feb 2010 22:31:41 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_pool.h	24 Jan 2017 17:21:52 -0000
@@ -19,8 +19,9 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #ifndef	_SYS_DSL_POOL_H
@@ -33,6 +34,10 @@
 #include <sys/zio.h>
 #include <sys/dnode.h>
 #include <sys/ddt.h>
+#include <sys/arc.h>
+#include <sys/bpobj.h>
+#include <sys/bptree.h>
+#include <sys/rrwlock.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -43,16 +48,19 @@ struct dsl_dir;
 struct dsl_dataset;
 struct dsl_pool;
 struct dmu_tx;
+struct dsl_scan;
 
-enum scrub_func {
-	SCRUB_FUNC_NONE,
-	SCRUB_FUNC_CLEAN,
-	SCRUB_FUNC_NUMFUNCS
-};
+extern uint64_t zfs_dirty_data_max;
+extern uint64_t zfs_dirty_data_max_max;
+extern uint64_t zfs_dirty_data_sync;
+extern int zfs_dirty_data_max_percent;
+extern int zfs_delay_min_dirty_percent;
+extern uint64_t zfs_delay_scale;
 
 /* These macros are for indexing into the zfs_all_blkstats_t. */
 #define	DMU_OT_DEFERRED	DMU_OT_NONE
-#define	DMU_OT_TOTAL	DMU_OT_NUMTYPES
+#define	DMU_OT_OTHER	DMU_OT_NUMTYPES /* placeholder for DMU_OT() types */
+#define	DMU_OT_TOTAL	(DMU_OT_NUMTYPES + 1)
 
 typedef struct zfs_blkstat {
 	uint64_t	zb_count;
@@ -76,55 +84,58 @@ typedef struct dsl_pool {
 	struct objset *dp_meta_objset;
 	struct dsl_dir *dp_root_dir;
 	struct dsl_dir *dp_mos_dir;
+	struct dsl_dir *dp_free_dir;
+	struct dsl_dir *dp_leak_dir;
 	struct dsl_dataset *dp_origin_snap;
 	uint64_t dp_root_dir_obj;
 	struct taskq *dp_vnrele_taskq;
 
 	/* No lock needed - sync context only */
 	blkptr_t dp_meta_rootbp;
-	list_t dp_synced_datasets;
-	hrtime_t dp_read_overhead;
-	uint64_t dp_throughput; /* bytes per millisec */
-	uint64_t dp_write_limit;
 	uint64_t dp_tmp_userrefs_obj;
+	bpobj_t dp_free_bpobj;
+	uint64_t dp_bptree_obj;
+	uint64_t dp_empty_bpobj;
+
+	struct dsl_scan *dp_scan;
 
 	/* Uses dp_lock */
 	kmutex_t dp_lock;
-	uint64_t dp_space_towrite[TXG_SIZE];
-	uint64_t dp_tempreserved[TXG_SIZE];
+	kcondvar_t dp_spaceavail_cv;
+	uint64_t dp_dirty_pertxg[TXG_SIZE];
+	uint64_t dp_dirty_total;
+	uint64_t dp_long_free_dirty_pertxg[TXG_SIZE];
+	uint64_t dp_mos_used_delta;
+	uint64_t dp_mos_compressed_delta;
+	uint64_t dp_mos_uncompressed_delta;
 
-	enum scrub_func dp_scrub_func;
-	uint64_t dp_scrub_queue_obj;
-	uint64_t dp_scrub_min_txg;
-	uint64_t dp_scrub_max_txg;
-	uint64_t dp_scrub_start_time;
-	uint64_t dp_scrub_ddt_class_max;
-	zbookmark_t dp_scrub_bookmark;
-	ddt_bookmark_t dp_scrub_ddt_bookmark;
-	boolean_t dp_scrub_pausing;
-	boolean_t dp_scrub_isresilver;
-	boolean_t dp_scrub_restart;
-	kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */
-	zio_t *dp_scrub_prefetch_zio_root;
+	/*
+	 * Time of most recently scheduled (furthest in the future)
+	 * wakeup for delayed transactions.
+	 */
+	hrtime_t dp_last_wakeup;
 
 	/* Has its own locking */
 	tx_state_t dp_tx;
 	txg_list_t dp_dirty_datasets;
+	txg_list_t dp_dirty_zilogs;
 	txg_list_t dp_dirty_dirs;
 	txg_list_t dp_sync_tasks;
 
 	/*
 	 * Protects administrative changes (properties, namespace)
+	 *
 	 * It is only held for write in syncing context.  Therefore
 	 * syncing context does not need to ever have it for read, since
 	 * nobody else could possibly have it for write.
 	 */
-	krwlock_t dp_config_rwlock;
+	rrwlock_t dp_config_rwlock;
 
 	zfs_all_blkstats_t *dp_blkstats;
 } dsl_pool_t;
 
-int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
+int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
+int dsl_pool_open(dsl_pool_t *dp);
 void dsl_pool_close(dsl_pool_t *dp);
 dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg);
 void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
@@ -132,32 +143,33 @@ void dsl_pool_sync_done(dsl_pool_t *dp, 
 int dsl_pool_sync_context(dsl_pool_t *dp);
 uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
 uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
-int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx);
-void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
-void dsl_pool_memory_pressure(dsl_pool_t *dp);
-void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
+void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
+void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
 void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
-void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
-void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
-    struct dmu_tx *tx);
+void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
+    const blkptr_t *bpp);
 void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
-
-int dsl_pool_scrub_cancel(dsl_pool_t *dp);
-int dsl_pool_scrub_clean(dsl_pool_t *dp);
-void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx);
-void dsl_pool_scrub_restart(dsl_pool_t *dp);
-void dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
-    const ddt_entry_t *dde);
+void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
+    int64_t used, int64_t comp, int64_t uncomp);
+void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
+void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag);
+void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
+boolean_t dsl_pool_config_held(dsl_pool_t *dp);
+boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
+boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
 
 taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
 
-extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
-    const char *tag, uint64_t *now, dmu_tx_t *tx);
-extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
+int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
+    const char *tag, uint64_t now, dmu_tx_t *tx);
+int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj,
     const char *tag, dmu_tx_t *tx);
-extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
+void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp);
+int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **);
+int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp);
+void dsl_pool_rele(dsl_pool_t *dp, void *tag);
 
 #ifdef	__cplusplus
 }
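
The zfs_dirty_data_* / zfs_delay_* tunables replace the old per-pool write
limit (dp_write_limit) with a smooth write throttle.  A sketch of the delay
curve they feed, modeled on the dmu_tx delay logic rather than taken from
this header: the delay is zero below zfs_delay_min_dirty_percent of
zfs_dirty_data_max and grows without bound as dirty data approaches the max.

	static hrtime_t
	dirty_delay(uint64_t dirty)
	{
		uint64_t min_dirty =
		    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

		if (dirty <= min_dirty)
			return (0);
		/* denominator -> 0 as dirty -> zfs_dirty_data_max */
		return (zfs_delay_scale * (dirty - min_dirty) /
		    (zfs_dirty_data_max - dirty));
	}
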
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_prop.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_prop.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 dsl_prop.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_prop.h	27 Feb 2010 22:31:41 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_prop.h	22 Nov 2015 17:22:31 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_PROP_H
@@ -41,10 +41,17 @@ struct dsl_dir;
 /* The callback func may not call into the DMU or DSL! */
 typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
 
+typedef struct dsl_prop_record {
+	list_node_t pr_node; /* link on dd_props */
+	const char *pr_propname;
+	list_t pr_cbs;
+} dsl_prop_record_t;
+
 typedef struct dsl_prop_cb_record {
-	list_node_t cbr_node; /* link on dd_prop_cbs */
+	list_node_t cbr_pr_node; /* link on pr_cbs */
+	list_node_t cbr_ds_node; /* link on ds_prop_cbs */
+	dsl_prop_record_t *cbr_pr;
 	struct dsl_dataset *cbr_ds;
-	const char *cbr_propname;
 	dsl_prop_changed_cb_t *cbr_func;
 	void *cbr_arg;
 } dsl_prop_cb_record_t;
@@ -54,60 +61,48 @@ typedef struct dsl_props_arg {
 	zprop_source_t pa_source;
 } dsl_props_arg_t;
 
-typedef struct dsl_prop_set_arg {
-	const char *psa_name;
-	zprop_source_t psa_source;
-	int psa_intsz;
-	int psa_numints;
-	const void *psa_value;
-
-	/*
-	 * Used to handle the special requirements of the quota and reservation
-	 * properties.
-	 */
-	uint64_t psa_effective_value;
-} dsl_prop_setarg_t;
-
+void dsl_prop_init(dsl_dir_t *dd);
+void dsl_prop_fini(dsl_dir_t *dd);
 int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
     dsl_prop_changed_cb_t *callback, void *cbarg);
-int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
-    dsl_prop_changed_cb_t *callback, void *cbarg);
-int dsl_prop_numcb(struct dsl_dataset *ds);
+void dsl_prop_unregister_all(struct dsl_dataset *ds, void *cbarg);
+void dsl_prop_notify_all(struct dsl_dir *dd);
+boolean_t dsl_prop_hascb(struct dsl_dataset *ds);
 
 int dsl_prop_get(const char *ddname, const char *propname,
     int intsz, int numints, void *buf, char *setpoint);
 int dsl_prop_get_integer(const char *ddname, const char *propname,
     uint64_t *valuep, char *setpoint);
 int dsl_prop_get_all(objset_t *os, nvlist_t **nvp);
-int dsl_prop_get_received(objset_t *os, nvlist_t **nvp);
+int dsl_prop_get_received(const char *dsname, nvlist_t **nvp);
 int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname,
     int intsz, int numints, void *buf, char *setpoint);
+int dsl_prop_get_int_ds(struct dsl_dataset *ds, const char *propname,
+    uint64_t *valuep);
 int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname,
     int intsz, int numints, void *buf, char *setpoint,
     boolean_t snapshot);
 
-dsl_syncfunc_t dsl_props_set_sync;
-int dsl_prop_set(const char *ddname, const char *propname,
-    zprop_source_t source, int intsz, int numints, const void *buf);
+void dsl_props_set_sync_impl(struct dsl_dataset *ds, zprop_source_t source,
+    nvlist_t *props, dmu_tx_t *tx);
+void dsl_prop_set_sync_impl(struct dsl_dataset *ds, const char *propname,
+    zprop_source_t source, int intsz, int numints, const void *value,
+    dmu_tx_t *tx);
 int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl);
-void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val,
-    cred_t *cr, dmu_tx_t *tx);
+int dsl_prop_set_int(const char *dsname, const char *propname,
+    zprop_source_t source, uint64_t value);
+int dsl_prop_set_string(const char *dsname, const char *propname,
+    zprop_source_t source, const char *value);
+int dsl_prop_inherit(const char *dsname, const char *propname,
+    zprop_source_t source);
 
-void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname,
-    zprop_source_t source, uint64_t *value);
-int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
-#ifdef	ZFS_DEBUG
-void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa);
-#define	DSL_PROP_CHECK_PREDICTION(dd, psa)	\
-	dsl_prop_check_prediction((dd), (psa))
-#else
-#define	DSL_PROP_CHECK_PREDICTION(dd, psa)	/* nothing */
-#endif
+int dsl_prop_predict(dsl_dir_t *dd, const char *propname,
+    zprop_source_t source, uint64_t value, uint64_t *newvalp);
 
 /* flag first receive on or after SPA_VERSION_RECVD_PROPS */
-boolean_t dsl_prop_get_hasrecvd(objset_t *os);
-void dsl_prop_set_hasrecvd(objset_t *os);
-void dsl_prop_unset_hasrecvd(objset_t *os);
+boolean_t dsl_prop_get_hasrecvd(const char *dsname);
+int dsl_prop_set_hasrecvd(const char *dsname);
+void dsl_prop_unset_hasrecvd(const char *dsname);
 
 void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value);
 void dsl_prop_nvlist_add_string(nvlist_t *nv,
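
The split into dsl_prop_record_t (one record per property) plus per-dataset
callback lists is what allows dsl_prop_unregister_all(ds, cbarg) to replace
unregistration by (propname, callback, arg) triple.  A sketch of the
register/release pair; per the warning above, the callback must not call
back into the DMU or DSL:

	static void
	atime_changed_cb(void *arg, uint64_t newval)
	{
		/* record the value only; no DMU/DSL calls allowed here */
		*(uint64_t *)arg = newval;
	}

	/* ... in the consumer, with a uint64_t cached_atime ... */
	error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME),
	    atime_changed_cb, &cached_atime);
	...
	dsl_prop_unregister_all(ds, &cached_atime);
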
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_scan.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_scan.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_scan.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_scan.h	17 Jul 2014 16:23:18 -0000
@@ -0,0 +1,145 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef	_SYS_DSL_SCAN_H
+#define	_SYS_DSL_SCAN_H
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/bplist.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct objset;
+struct dsl_dir;
+struct dsl_dataset;
+struct dsl_pool;
+struct dmu_tx;
+
+/*
+ * All members of this structure must be uint64_t, for byteswap
+ * purposes.
+ */
+typedef struct dsl_scan_phys {
+	uint64_t scn_func; /* pool_scan_func_t */
+	uint64_t scn_state; /* dsl_scan_state_t */
+	uint64_t scn_queue_obj;
+	uint64_t scn_min_txg;
+	uint64_t scn_max_txg;
+	uint64_t scn_cur_min_txg;
+	uint64_t scn_cur_max_txg;
+	uint64_t scn_start_time;
+	uint64_t scn_end_time;
+	uint64_t scn_to_examine; /* total bytes to be scanned */
+	uint64_t scn_examined; /* bytes scanned so far */
+	uint64_t scn_to_process;
+	uint64_t scn_processed;
+	uint64_t scn_errors;	/* scan I/O error count */
+	uint64_t scn_ddt_class_max;
+	ddt_bookmark_t scn_ddt_bookmark;
+	zbookmark_phys_t scn_bookmark;
+	uint64_t scn_flags; /* dsl_scan_flags_t */
+} dsl_scan_phys_t;
+
+#define	SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t))
+
+typedef enum dsl_scan_flags {
+	DSF_VISIT_DS_AGAIN = 1<<0,
+} dsl_scan_flags_t;
+
+/*
+ * Every pool will have one dsl_scan_t and this structure will contain
+ * in-memory information about the scan and a pointer to the on-disk
+ * representation (i.e. dsl_scan_phys_t). Most of the state of the scan
+ * is contained on-disk to allow the scan to resume in the event of a reboot
+ * or panic. This structure maintains information about the behavior of a
+ * running scan, some caching information, and how it should traverse the pool.
+ *
+ * The following members of this structure direct the behavior of the scan:
+ *
+ * scn_pausing -	a scan that cannot be completed in a single txg or
+ *			has exceeded its allotted time will need to pause.
+ *			When this flag is set the scanner will stop traversing
+ *			the pool and write out the current state to disk.
+ *
+ * scn_restart_txg -	directs the scanner to either restart or start
+ *			a scan at the specified txg value.
+ *
+ * scn_done_txg -	when a scan completes its traversal it will set
+ *			the completion txg to the next txg. This is necessary
+ *			to ensure that any blocks that were freed during
+ *			the scan but have not yet been processed (i.e. deferred
+ *			frees) are accounted for.
+ *
+ * This structure also maintains information about deferred frees which are
+ * a special kind of traversal. Deferred frees can exist in either a bptree or
+ * a bpobj structure. The scn_is_bptree flag will indicate the type of
+ * deferred free that is in progress. If the deferred free is part of an
+ * asynchronous destroy then the scn_async_destroying flag will be set.
+ */
+typedef struct dsl_scan {
+	struct dsl_pool *scn_dp;
+
+	boolean_t scn_pausing;
+	uint64_t scn_restart_txg;
+	uint64_t scn_done_txg;
+	uint64_t scn_sync_start_time;
+	zio_t *scn_zio_root;
+
+	/* for freeing blocks */
+	boolean_t scn_is_bptree;
+	boolean_t scn_async_destroying;
+	boolean_t scn_async_stalled;
+
+	/* for debugging / information */
+	uint64_t scn_visited_this_txg;
+
+	dsl_scan_phys_t scn_phys;
+} dsl_scan_t;
+
+int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
+void dsl_scan_fini(struct dsl_pool *dp);
+void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
+int dsl_scan_cancel(struct dsl_pool *);
+int dsl_scan(struct dsl_pool *, pool_scan_func_t);
+void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
+boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
+boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
+void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+    ddt_entry_t *dde, dmu_tx_t *tx);
+void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
+    struct dmu_tx *tx);
+boolean_t dsl_scan_active(dsl_scan_t *scn);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_SCAN_H */
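
The "all members must be uint64_t" rule means the record can be handled as
a flat array of SCAN_PHYS_NUMINTS words, both when storing it in a zap and
when byteswapping it.  An illustrative sketch of the latter (BSWAP_64 is
the usual byte-order macro; the real code persists the struct via the MOS
directory zap as SCAN_PHYS_NUMINTS 8-byte integers):

	static void
	scan_phys_byteswap(dsl_scan_phys_t *sp)
	{
		uint64_t *words = (uint64_t *)sp;

		for (int i = 0; i < SCAN_PHYS_NUMINTS; i++)
			words[i] = BSWAP_64(words[i]);
	}
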
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_synctask.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_synctask.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 dsl_synctask.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_synctask.h	7 Aug 2009 18:33:39 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_synctask.h	17 Jul 2014 16:23:18 -0000
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DSL_SYNCTASK_H
 #define	_SYS_DSL_SYNCTASK_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/txg.h>
 #include <sys/zfs_context.h>
 
@@ -37,44 +35,56 @@ extern "C" {
 
 struct dsl_pool;
 
-typedef int (dsl_checkfunc_t)(void *, void *, dmu_tx_t *);
-typedef void (dsl_syncfunc_t)(void *, void *, cred_t *, dmu_tx_t *);
+typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *);
+typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *);
+
+typedef enum zfs_space_check {
+	/*
+	 * Normal space check: if there is less than 3.2% free space,
+	 * the operation will fail.  Operations which are logically
+	 * creating things should use this (e.g. "zfs create", "zfs snapshot").
+	 * User writes (via the ZPL / ZVOL) also fail at this point.
+	 */
+	ZFS_SPACE_CHECK_NORMAL,
+
+	/*
+	 * Space check allows use of half the slop space.  If there
+	 * is less than 1.6% free space, the operation will fail.  Most
+	 * operations should use this (e.g. "zfs set", "zfs rename"),
+	 * because we want them to succeed even after user writes are failing,
+	 * so that they can be used as part of the space recovery process.
+	 */
+	ZFS_SPACE_CHECK_RESERVED,
+
+	/*
+	 * No space check is performed.  Only operations which we expect to
+	 * result in a net reduction in space should use this
+	 * (e.g. "zfs destroy").  Setting quotas and reservations also uses
+	 * this because it needs to circumvent the quota/reservation checks.
+	 *
+	 * See also the comments above spa_slop_shift.
+	 */
+	ZFS_SPACE_CHECK_NONE,
+} zfs_space_check_t;
 
 typedef struct dsl_sync_task {
-	list_node_t dst_node;
+	txg_node_t dst_node;
+	struct dsl_pool *dst_pool;
+	uint64_t dst_txg;
+	int dst_space;
+	zfs_space_check_t dst_space_check;
 	dsl_checkfunc_t *dst_checkfunc;
 	dsl_syncfunc_t *dst_syncfunc;
-	void *dst_arg1;
-	void *dst_arg2;
-	int dst_err;
+	void *dst_arg;
+	int dst_error;
+	boolean_t dst_nowaiter;
 } dsl_sync_task_t;
 
-typedef struct dsl_sync_task_group {
-	txg_node_t dstg_node;
-	list_t dstg_tasks;
-	struct dsl_pool *dstg_pool;
-	cred_t *dstg_cr;
-	uint64_t dstg_txg;
-	int dstg_err;
-	int dstg_space;
-	boolean_t dstg_nowaiter;
-} dsl_sync_task_group_t;
-
-dsl_sync_task_group_t *dsl_sync_task_group_create(struct dsl_pool *dp);
-void dsl_sync_task_create(dsl_sync_task_group_t *dstg,
-    dsl_checkfunc_t *, dsl_syncfunc_t *,
-    void *arg1, void *arg2, int blocks_modified);
-int dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg);
-void dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx);
-void dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg);
-void dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx);
-
-int dsl_sync_task_do(struct dsl_pool *dp,
-    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
-    void *arg1, void *arg2, int blocks_modified);
-void dsl_sync_task_do_nowait(struct dsl_pool *dp,
-    dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc,
-    void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx);
+void dsl_sync_task_sync(dsl_sync_task_t *, dmu_tx_t *);
+int dsl_sync_task(const char *, dsl_checkfunc_t *,
+    dsl_syncfunc_t *, void *, int, zfs_space_check_t);
+void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
+    void *, int, zfs_space_check_t, dmu_tx_t *);
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_userhold.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_userhold.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_userhold.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_userhold.h	19 Jun 2013 13:06:07 -0000
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ */
+
+#ifndef	_SYS_DSL_USERHOLD_H
+#define	_SYS_DSL_USERHOLD_H
+
+#include <sys/nvpair.h>
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct dsl_pool;
+struct dsl_dataset;
+struct dmu_tx;
+
+int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor,
+    nvlist_t *errlist);
+int dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist);
+int dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl);
+void dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds);
+int dsl_dataset_user_hold_check_one(struct dsl_dataset *ds, const char *htag,
+    boolean_t temphold, struct dmu_tx *tx);
+void dsl_dataset_user_hold_sync_one(struct dsl_dataset *ds, const char *htag,
+    minor_t minor, uint64_t now, struct dmu_tx *tx);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_USERHOLD_H */
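
Holds are nvlist-driven as well: each pair maps a full snapshot name to a
hold tag, and a nonzero cleanup_minor ties the hold to a /dev/zfs minor so
it is released automatically when that handle closes.  A sketch with
cleanup disabled (names illustrative):

	nvlist_t *holds = fnvlist_alloc();
	nvlist_t *errlist = fnvlist_alloc();

	fnvlist_add_string(holds, "tank/fs@snap", "my-hold-tag");
	(void) dsl_dataset_user_hold(holds, 0 /* no cleanup minor */, errlist);

	fnvlist_free(holds);
	fnvlist_free(errlist);
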
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 metaslab.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab.h	27 Feb 2010 22:31:41 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab.h	27 Mar 2017 06:19:45 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_H
@@ -36,43 +36,74 @@
 extern "C" {
 #endif
 
-extern space_map_ops_t *zfs_metaslab_ops;
 
-extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
-    uint64_t start, uint64_t size, uint64_t txg);
-extern void metaslab_fini(metaslab_t *msp);
-extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
-extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
-extern void metaslab_sync_reassess(metaslab_group_t *mg);
-
-#define	METASLAB_HINTBP_FAVOR	0x0
-#define	METASLAB_HINTBP_AVOID	0x1
-#define	METASLAB_GANG_HEADER	0x2
-
-extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
-    blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
-extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
-    boolean_t now);
-extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
-
-extern metaslab_class_t *metaslab_class_create(spa_t *spa,
-    space_map_ops_t *ops);
-extern void metaslab_class_destroy(metaslab_class_t *mc);
-extern int metaslab_class_validate(metaslab_class_t *mc);
-
-extern void metaslab_class_space_update(metaslab_class_t *mc,
-    int64_t alloc_delta, int64_t defer_delta,
-    int64_t space_delta, int64_t dspace_delta);
-extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
-extern uint64_t metaslab_class_get_space(metaslab_class_t *mc);
-extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
-extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
-
-extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
-    vdev_t *vd);
-extern void metaslab_group_destroy(metaslab_group_t *mg);
-extern void metaslab_group_activate(metaslab_group_t *mg);
-extern void metaslab_group_passivate(metaslab_group_t *mg);
+typedef struct metaslab_ops {
+	uint64_t (*msop_alloc)(metaslab_t *, uint64_t);
+} metaslab_ops_t;
+
+
+extern metaslab_ops_t *zfs_metaslab_ops;
+
+int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
+    metaslab_t **);
+void metaslab_fini(metaslab_t *);
+
+void metaslab_load_wait(metaslab_t *);
+int metaslab_load(metaslab_t *);
+void metaslab_unload(metaslab_t *);
+
+void metaslab_sync(metaslab_t *, uint64_t);
+void metaslab_sync_done(metaslab_t *, uint64_t);
+void metaslab_sync_reassess(metaslab_group_t *);
+uint64_t metaslab_block_maxsize(metaslab_t *);
+
+#define	METASLAB_HINTBP_FAVOR		0x0
+#define	METASLAB_HINTBP_AVOID		0x1
+#define	METASLAB_GANG_HEADER		0x2
+#define	METASLAB_GANG_CHILD		0x4
+#define	METASLAB_ASYNC_ALLOC		0x8
+#define	METASLAB_DONT_THROTTLE		0x10
+
+int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
+    blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *);
+void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
+int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
+void metaslab_check_free(spa_t *, const blkptr_t *);
+
+void metaslab_alloc_trace_init(void);
+void metaslab_alloc_trace_fini(void);
+void metaslab_trace_init(zio_alloc_list_t *);
+void metaslab_trace_fini(zio_alloc_list_t *);
+
+metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *);
+void metaslab_class_destroy(metaslab_class_t *);
+int metaslab_class_validate(metaslab_class_t *);
+void metaslab_class_histogram_verify(metaslab_class_t *);
+uint64_t metaslab_class_fragmentation(metaslab_class_t *);
+uint64_t metaslab_class_expandable_space(metaslab_class_t *);
+boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
+    zio_t *, int);
+void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
+
+void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
+    int64_t, int64_t);
+uint64_t metaslab_class_get_alloc(metaslab_class_t *);
+uint64_t metaslab_class_get_space(metaslab_class_t *);
+uint64_t metaslab_class_get_dspace(metaslab_class_t *);
+uint64_t metaslab_class_get_deferred(metaslab_class_t *);
+uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc);
+
+metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
+void metaslab_group_destroy(metaslab_group_t *);
+void metaslab_group_activate(metaslab_group_t *);
+void metaslab_group_passivate(metaslab_group_t *);
+boolean_t metaslab_group_initialized(metaslab_group_t *);
+uint64_t metaslab_group_get_space(metaslab_group_t *);
+void metaslab_group_histogram_verify(metaslab_group_t *);
+uint64_t metaslab_group_fragmentation(metaslab_group_t *);
+void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
+void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
+void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab_impl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 metaslab_impl.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab_impl.h	27 Feb 2010 22:31:41 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab_impl.h	27 Mar 2017 06:19:45 -0000
@@ -23,11 +23,16 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ */
+
 #ifndef _SYS_METASLAB_IMPL_H
 #define	_SYS_METASLAB_IMPL_H
 
 #include <sys/metaslab.h>
 #include <sys/space_map.h>
+#include <sys/range_tree.h>
 #include <sys/vdev.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
@@ -36,47 +41,318 @@
 extern "C" {
 #endif
 
+/*
+ * Metaslab allocation tracing record.
+ */
+typedef struct metaslab_alloc_trace {
+	list_node_t			mat_list_node;
+	metaslab_group_t		*mat_mg;
+	metaslab_t			*mat_msp;
+	uint64_t			mat_size;
+	uint64_t			mat_weight;
+	uint32_t			mat_dva_id;
+	uint64_t			mat_offset;
+} metaslab_alloc_trace_t;
+
+/*
+ * Used by the metaslab allocation tracing facility to indicate
+ * error conditions. These errors are stored in the offset member
+ * of the metaslab_alloc_trace_t record and displayed by mdb.
+ */
+typedef enum trace_alloc_type {
+	TRACE_ALLOC_FAILURE	= -1ULL,
+	TRACE_TOO_SMALL		= -2ULL,
+	TRACE_FORCE_GANG	= -3ULL,
+	TRACE_NOT_ALLOCATABLE	= -4ULL,
+	TRACE_GROUP_FAILURE	= -5ULL,
+	TRACE_ENOSPC		= -6ULL,
+	TRACE_CONDENSING	= -7ULL,
+	TRACE_VDEV_ERROR	= -8ULL
+} trace_alloc_type_t;
+
+#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
+#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
+#define	METASLAB_WEIGHT_TYPE		(1ULL << 61)
+#define	METASLAB_ACTIVE_MASK		\
+	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+
+/*
+ * The metaslab weight is used to encode the amount of free space in a
+ * metaslab, such that the "best" metaslab appears first when sorting the
+ * metaslabs by weight. The weight (and therefore the "best" metaslab) can
+ * be determined in two different ways: by computing a weighted sum of all
+ * the free space in the metaslab (a space based weight) or by counting only
+ * the free segments of the largest size (a segment based weight). We prefer
+ * the segment based weight because it reflects how the free space is
+ * comprised, but we cannot always use it -- legacy pools do not have the
+ * space map histogram information necessary to determine the largest
+ * contiguous regions. Pools that have the space map histogram determine
+ * the segment weight by looking at each bucket in the histogram and
+ * determining the free space whose size in bytes is in the range:
+ *	[2^i, 2^(i+1))
+ * We then encode the largest index, i, that contains regions into the
+ * segment-weighted value.
+ *
+ * Space-based weight:
+ *
+ *      64      56      48      40      32      24      16      8       0
+ *      +-------+-------+-------+-------+-------+-------+-------+-------+
+ *      |PS1|                   weighted-free space                     |
+ *      +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ *	PS - indicates primary and secondary activation
+ *	space - the fragmentation-weighted space
+ *
+ * Segment-based weight:
+ *
+ *      64      56      48      40      32      24      16      8       0
+ *      +-------+-------+-------+-------+-------+-------+-------+-------+
+ *      |PS0| idx|             count of segments in region              |
+ *      +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ *	PS - indicates primary and secondary activation
+ *	idx - index for the highest bucket in the histogram
+ *	count - number of segments in the specified bucket
+ */
+#define	WEIGHT_GET_ACTIVE(weight)		BF64_GET((weight), 62, 2)
+#define	WEIGHT_SET_ACTIVE(weight, x)		BF64_SET((weight), 62, 2, x)
+
+#define	WEIGHT_IS_SPACEBASED(weight)		\
+	((weight) == 0 || BF64_GET((weight), 61, 1))
+#define	WEIGHT_SET_SPACEBASED(weight)		BF64_SET((weight), 61, 1, 1)
+
+/*
+ * These macros are only applicable to segment-based weighting.
+ */
+#define	WEIGHT_GET_INDEX(weight)		BF64_GET((weight), 55, 6)
+#define	WEIGHT_SET_INDEX(weight, x)		BF64_SET((weight), 55, 6, x)
+#define	WEIGHT_GET_COUNT(weight)		BF64_GET((weight), 0, 55)
+#define	WEIGHT_SET_COUNT(weight, x)		BF64_SET((weight), 0, 55, x)
+
+/*
+ * A metaslab class encompasses a category of allocatable top-level vdevs.
+ * Each top-level vdev is associated with a metaslab group which defines
+ * the allocatable region for that vdev. Examples of these categories include
+ * "normal" for data block allocations (i.e. main pool allocations) or "log"
+ * for allocations designated for intent log devices (i.e. slog devices).
+ * When a block allocation is requested from the SPA it is associated with a
+ * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
+ * to the class can be used to satisfy that request. Allocations are done
+ * by traversing the metaslab groups that are linked off of the mc_rotor field.
+ * This rotor points to the next metaslab group where allocations will be
+ * attempted. Allocating a block is a three-step process -- select the metaslab
+ * group, select the metaslab, and then allocate the block. The metaslab
+ * class defines the low-level block allocator that will be used as the
+ * final step in allocation. These allocators are pluggable allowing each class
+ * to use a block allocator that best suits that class.
+ */
 struct metaslab_class {
+	kmutex_t		mc_lock;
 	spa_t			*mc_spa;
 	metaslab_group_t	*mc_rotor;
-	space_map_ops_t		*mc_ops;
+	metaslab_ops_t		*mc_ops;
 	uint64_t		mc_aliquot;
+
+	/*
+	 * Track the number of metaslab groups that have been initialized
+	 * and can accept allocations. An initialized metaslab group is
+	 * one that has been completely added to the config (i.e. we have
+	 * updated the MOS config and the space has been added to the pool).
+	 */
+	uint64_t		mc_groups;
+
+	/*
+	 * Toggle to enable/disable the allocation throttle.
+	 */
+	boolean_t		mc_alloc_throttle_enabled;
+
+	/*
+	 * The allocation throttle works on a reservation system. Whenever
+	 * an asynchronous zio wants to perform an allocation it must
+	 * first reserve the number of blocks that it wants to allocate.
+	 * If there aren't sufficient slots available for the pending zio
+	 * then that I/O is throttled until more slots free up. The current
+	 * number of reserved allocations is maintained by the mc_alloc_slots
+	 * refcount. The mc_alloc_max_slots value determines the maximum
+	 * number of allocations that the system allows. Gang blocks are
+	 * allowed to reserve slots even if we've reached the maximum
+	 * number of allocations allowed.
+	 */
+	uint64_t		mc_alloc_max_slots;
+	refcount_t		mc_alloc_slots;
+
+	uint64_t		mc_alloc_groups; /* # of allocatable groups */
+
 	uint64_t		mc_alloc;	/* total allocated space */
 	uint64_t		mc_deferred;	/* total deferred frees */
 	uint64_t		mc_space;	/* total space (alloc + free) */
 	uint64_t		mc_dspace;	/* total deflated space */
+	uint64_t		mc_minblocksize;
+	uint64_t		mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
 };
 
+/*
+ * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
+ * of a top-level vdev. They are linked togther to form a circular linked
+ * list and can belong to only one metaslab class. Metaslab groups may become
+ * ineligible for allocations for a number of reasons such as limited free
+ * space, fragmentation, or going offline. When this happens the allocator will
+ * simply find the next metaslab group in the linked list and attempt
+ * to allocate from that group instead.
+ */
 struct metaslab_group {
 	kmutex_t		mg_lock;
 	avl_tree_t		mg_metaslab_tree;
 	uint64_t		mg_aliquot;
-	uint64_t		mg_bonus_area;
+	boolean_t		mg_allocatable;		/* can we allocate? */
+
+	/*
+	 * A metaslab group is considered to be initialized only after
+	 * we have updated the MOS config and added the space to the pool.
+	 * We only allow allocation attempts to a metaslab group if it
+	 * has been initialized.
+	 */
+	boolean_t		mg_initialized;
+
+	uint64_t		mg_free_capacity;	/* percentage free */
 	int64_t			mg_bias;
 	int64_t			mg_activation_count;
 	metaslab_class_t	*mg_class;
 	vdev_t			*mg_vd;
+	taskq_t			*mg_taskq;
 	metaslab_group_t	*mg_prev;
 	metaslab_group_t	*mg_next;
+
+	/*
+	 * Each metaslab group can handle mg_max_alloc_queue_depth allocations
+	 * which are tracked by mg_alloc_queue_depth. It's possible for a
+	 * metaslab group to handle more allocations than its max. This
+	 * can occur when gang blocks are required or when other groups
+	 * are unable to handle their share of allocations.
+	 */
+	uint64_t		mg_max_alloc_queue_depth;
+	refcount_t		mg_alloc_queue_depth;
+
+	/*
+	 * A metaslab group that can no longer allocate the minimum block
+	 * size will set mg_no_free_space. Once a metaslab group is out
+	 * of space, its share of work must be distributed to other
+	 * groups.
+	 */
+	boolean_t		mg_no_free_space;
+
+	uint64_t		mg_allocations;
+	uint64_t		mg_failed_allocations;
+	uint64_t		mg_fragmentation;
+	uint64_t		mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
 };
 
 /*
- * Each metaslab's free space is tracked in space map object in the MOS,
- * which is only updated in syncing context.  Each time we sync a txg,
- * we append the allocs and frees from that txg to the space map object.
- * When the txg is done syncing, metaslab_sync_done() updates ms_smo
- * to ms_smo_syncing.  Everything in ms_smo is always safe to allocate.
+ * This value defines the number of elements in the ms_lbas array. The value
+ * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
+ * This is the equivalent of highbit(UINT64_MAX).
+ */
+#define	MAX_LBAS	64
+
+/*
+ * Each metaslab maintains a set of in-core trees to track metaslab operations.
+ * The in-core free tree (ms_tree) contains the current list of free segments.
+ * As blocks are allocated, the allocated segments are removed from the ms_tree
+ * and added to a per txg allocation tree (ms_alloctree). As blocks are freed,
+ * they are added to the per txg free tree (ms_freetree). These per txg
+ * trees allow us to process all allocations and frees in syncing context
+ * where it is safe to update the on-disk space maps. One additional in-core
+ * tree is maintained to track deferred frees (ms_defertree). Once a block
+ * is freed it will move from the ms_freetree to the ms_defertree. A deferred
+ * free means that a block has been freed but cannot be used by the pool
+ * until TXG_DEFER_SIZE transaction groups later. For example, a block
+ * that is freed in txg 50 will not be available for reallocation until
+ * txg 52 (50 + TXG_DEFER_SIZE).  This provides a safety net for uberblock
+ * rollback. A pool can safely be rolled back TXG_DEFER_SIZE
+ * transaction groups with the guarantee that no block has been reallocated.
+ *
+ * The simplified transition diagram looks like this:
+ *
+ *
+ *      ALLOCATE
+ *         |
+ *         V
+ *    free segment (ms_tree) --------> ms_alloctree ----> (write to space map)
+ *         ^
+ *         |
+ *         |                           ms_freetree <--- FREE
+ *         |                                 |
+ *         |                                 |
+ *         |                                 |
+ *         +----------- ms_defertree <-------+---------> (write to space map)
+ *
+ *
+ * Each metaslab's space is tracked in a single space map in the MOS,
+ * which is only updated in syncing context. Each time we sync a txg,
+ * we append the allocs and frees from that txg to the space map.
+ * The pool space is only updated once all metaslabs have finished syncing.
+ *
+ * To load the in-core free tree we read the space map from disk.
+ * This object contains a series of alloc and free records that are
+ * combined to make up the list of all free segments in this metaslab. These
+ * segments are represented in-core by the ms_tree and are stored in an
+ * AVL tree.
+ *
+ * As the space map grows (as a result of the appends) it will
+ * eventually become space-inefficient. When the metaslab's in-core free tree
+ * is zfs_condense_pct/100 times the size of the minimal on-disk
+ * representation, we rewrite it in its minimized form. If a metaslab
+ * needs to condense then we must set the ms_condensing flag to ensure
+ * that allocations are not performed on the metaslab that is being written.
  */
 struct metaslab {
-	kmutex_t	ms_lock;	/* metaslab lock		*/
-	space_map_obj_t	ms_smo;		/* synced space map object	*/
-	space_map_obj_t	ms_smo_syncing;	/* syncing space map object	*/
-	space_map_t	ms_allocmap[TXG_SIZE];  /* allocated this txg	*/
-	space_map_t	ms_freemap[TXG_SIZE];	/* freed this txg	*/
-	space_map_t	ms_defermap[TXG_DEFER_SIZE]; /* deferred frees	*/
-	space_map_t	ms_map;		/* in-core free space map	*/
+	kmutex_t	ms_lock;
+	kcondvar_t	ms_load_cv;
+	space_map_t	*ms_sm;
+	uint64_t	ms_id;
+	uint64_t	ms_start;
+	uint64_t	ms_size;
+	uint64_t	ms_fragmentation;
+
+	range_tree_t	*ms_alloctree[TXG_SIZE];
+	range_tree_t	*ms_freetree[TXG_SIZE];
+	range_tree_t	*ms_defertree[TXG_DEFER_SIZE];
+	range_tree_t	*ms_tree;
+
+	boolean_t	ms_condensing;	/* condensing? */
+	boolean_t	ms_condense_wanted;
+
+	/*
+	 * We must hold both ms_lock and ms_group->mg_lock in order to
+	 * modify ms_loaded.
+	 */
+	boolean_t	ms_loaded;
+	boolean_t	ms_loading;
+
 	int64_t		ms_deferspace;	/* sum of ms_defermap[] space	*/
 	uint64_t	ms_weight;	/* weight vs. others in group	*/
+	uint64_t	ms_activation_weight;	/* activation weight	*/
+
+	/*
+	 * Tracks whenever a metaslab is selected for loading or allocation.
+	 * We use this value to determine how long the metaslab should
+	 * stay cached.
+	 */
+	uint64_t	ms_selected_txg;
+
+	uint64_t	ms_alloc_txg;	/* last successful alloc (debug only) */
+	uint64_t	ms_max_size;	/* maximum allocatable size	*/
+
+	/*
+	 * The metaslab block allocators can optionally use a size-ordered
+	 * range tree and/or an array of LBAs. Not all allocators use
+	 * this functionality. The ms_size_tree should always contain the
+	 * same number of segments as the ms_tree. The only difference
+	 * is that the ms_size_tree is ordered by segment sizes.
+	 */
+	avl_tree_t	ms_size_tree;
+	uint64_t	ms_lbas[MAX_LBAS];
+
 	metaslab_group_t *ms_group;	/* metaslab group		*/
 	avl_node_t	ms_group_node;	/* node in metaslab group tree	*/
 	txg_node_t	ms_txg_node;	/* per-txg dirty metaslab links	*/
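
To make the weight layout documented above concrete, a hedged sketch of
encoding and decoding a segment-based weight with the WEIGHT_* macros
(the values are illustrative only):

	uint64_t weight = 0;

	WEIGHT_SET_COUNT(weight, 1000);	/* 1000 free segments ... */
	WEIGHT_SET_INDEX(weight, 17);	/* ... in the [128K, 256K) bucket */
	ASSERT(!WEIGHT_IS_SPACEBASED(weight));	/* bit 61 stayed clear */
	ASSERT3U(WEIGHT_GET_INDEX(weight), ==, 17);
	ASSERT3U(WEIGHT_GET_COUNT(weight), ==, 1000);
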
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/multilist.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/multilist.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/multilist.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/multilist.h	30 Aug 2015 02:17:54 -0000
@@ -0,0 +1,106 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef	_SYS_MULTILIST_H
+#define	_SYS_MULTILIST_H
+
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef list_node_t multilist_node_t;
+typedef struct multilist multilist_t;
+typedef struct multilist_sublist multilist_sublist_t;
+typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *);
+
+struct multilist_sublist {
+	/*
+	 * The mutex used internally to implement thread-safe insertions
+	 * into and removals from this individual sublist. It can also be
+	 * locked by a consumer using multilist_sublist_{lock,unlock}, which
+	 * is useful if a consumer needs to traverse the list in a
+	 * thread-safe manner.
+	 */
+	kmutex_t	mls_lock;
+	/*
+	 * The actual list object containing all objects in this sublist.
+	 */
+	list_t		mls_list;
+	/*
+	 * Pad to a cache line (64 bytes), in an effort to prevent
+	 * cache line contention.
+	 */
+	uint8_t		mls_pad[24];
+};
+
+struct multilist {
+	/*
+	 * This is used to get to the multilist_node_t structure given
+	 * the void *object contained on the list.
+	 */
+	size_t				ml_offset;
+	/*
+	 * The number of sublists used internally by this multilist.
+	 */
+	uint64_t			ml_num_sublists;
+	/*
+	 * The array of pointers to the actual sublists.
+	 */
+	multilist_sublist_t		*ml_sublists;
+	/*
+	 * Pointer to function which determines the sublist to use
+	 * when inserting and removing objects from this multilist.
+	 * Please see the comment above multilist_create for details.
+	 */
+	multilist_sublist_index_func_t	*ml_index_func;
+};
+
+void multilist_destroy(multilist_t *);
+void multilist_create(multilist_t *, size_t, size_t, unsigned int,
+    multilist_sublist_index_func_t *);
+
+void multilist_insert(multilist_t *, void *);
+void multilist_remove(multilist_t *, void *);
+int  multilist_is_empty(multilist_t *);
+
+unsigned int multilist_get_num_sublists(multilist_t *);
+unsigned int multilist_get_random_index(multilist_t *);
+
+multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
+void multilist_sublist_unlock(multilist_sublist_t *);
+
+void multilist_sublist_insert_head(multilist_sublist_t *, void *);
+void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
+void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
+void multilist_sublist_remove(multilist_sublist_t *, void *);
+
+void *multilist_sublist_head(multilist_sublist_t *);
+void *multilist_sublist_tail(multilist_sublist_t *);
+void *multilist_sublist_next(multilist_sublist_t *, void *);
+void *multilist_sublist_prev(multilist_sublist_t *, void *);
+
+void multilist_link_init(multilist_node_t *);
+int  multilist_link_active(multilist_node_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_MULTILIST_H */
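
A hedged usage sketch for the multilist API above; the my_obj_t type, its
index function and the obj variable are hypothetical, but the pattern --
embed a multilist_node_t, hash objects onto sublists, traverse one sublist
under its lock -- is what the interface is built for:

	typedef struct my_obj {
		multilist_node_t	mo_node;
		uint64_t		mo_key;
	} my_obj_t;

	static unsigned int
	my_obj_index_func(multilist_t *ml, void *obj)
	{
		/* spread objects across sublists by key */
		return (((my_obj_t *)obj)->mo_key %
		    multilist_get_num_sublists(ml));
	}

	multilist_t ml;
	multilist_create(&ml, sizeof (my_obj_t),
	    offsetof(my_obj_t, mo_node), 8 /* sublists */, my_obj_index_func);
	multilist_insert(&ml, obj);	/* obj: a caller-allocated my_obj_t */

	multilist_sublist_t *mls = multilist_sublist_lock(&ml, 0);
	for (my_obj_t *o = multilist_sublist_head(mls); o != NULL;
	    o = multilist_sublist_next(mls, o)) {
		/* visited while holding only this sublist's lock */
	}
	multilist_sublist_unlock(mls);
	multilist_destroy(&ml);
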
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/range_tree.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/range_tree.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/range_tree.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/range_tree.h	24 Apr 2014 03:51:37 -0000
@@ -0,0 +1,97 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_RANGE_TREE_H
+#define	_SYS_RANGE_TREE_H
+
+#include <sys/avl.h>
+#include <sys/dmu.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	RANGE_TREE_HISTOGRAM_SIZE	64
+
+typedef struct range_tree_ops range_tree_ops_t;
+
+typedef struct range_tree {
+	avl_tree_t	rt_root;	/* offset-ordered segment AVL tree */
+	uint64_t	rt_space;	/* sum of all segments in the map */
+	range_tree_ops_t *rt_ops;
+	void		*rt_arg;
+
+	/*
+	 * The rt_histogram maintains a histogram of ranges. Each bucket,
+	 * rt_histogram[i], contains the number of ranges whose size is:
+	 * 2^i <= size of range in bytes < 2^(i+1)
+	 */
+	uint64_t	rt_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+	kmutex_t	*rt_lock;	/* pointer to lock that protects map */
+} range_tree_t;
+
+typedef struct range_seg {
+	avl_node_t	rs_node;	/* AVL node */
+	avl_node_t	rs_pp_node;	/* AVL picker-private node */
+	uint64_t	rs_start;	/* starting offset of this segment */
+	uint64_t	rs_end;		/* ending offset (non-inclusive) */
+} range_seg_t;
+
+struct range_tree_ops {
+	void    (*rtop_create)(range_tree_t *rt, void *arg);
+	void    (*rtop_destroy)(range_tree_t *rt, void *arg);
+	void	(*rtop_add)(range_tree_t *rt, range_seg_t *rs, void *arg);
+	void    (*rtop_remove)(range_tree_t *rt, range_seg_t *rs, void *arg);
+	void	(*rtop_vacate)(range_tree_t *rt, void *arg);
+};
+
+typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
+
+void range_tree_init(void);
+void range_tree_fini(void);
+range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp);
+void range_tree_destroy(range_tree_t *rt);
+boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
+uint64_t range_tree_space(range_tree_t *rt);
+void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
+void range_tree_stat_verify(range_tree_t *rt);
+
+void range_tree_add(void *arg, uint64_t start, uint64_t size);
+void range_tree_remove(void *arg, uint64_t start, uint64_t size);
+void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size);
+
+void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
+void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_RANGE_TREE_H */
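
A hedged sketch of a range tree serving as an in-core free-space map, per
the declarations above (the mutex is assumed to be set up with mutex_init()
elsewhere; ops may be NULL when no size-ordered shadow tree is needed):

	kmutex_t lock;		/* assumed initialized elsewhere */
	range_tree_t *rt = range_tree_create(NULL, NULL, &lock);

	mutex_enter(&lock);
	range_tree_add(rt, 0, 1 << 20);		/* one 1M free segment */
	range_tree_remove(rt, 0, 1 << 17);	/* "allocate" 128K of it */
	ASSERT(range_tree_contains(rt, 1 << 17, 1 << 17));
	ASSERT3U(range_tree_space(rt), ==, (1 << 20) - (1 << 17));
	range_tree_vacate(rt, NULL, NULL);	/* must be empty to destroy */
	mutex_exit(&lock);

	range_tree_destroy(rt);
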
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/refcount.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/refcount.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 refcount.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/refcount.h	7 Aug 2009 18:33:40 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/refcount.h	16 Apr 2017 22:01:06 -0000
@@ -19,16 +19,16 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_REFCOUNT_H
 #define	_SYS_REFCOUNT_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
-#include <sys/inttypes.h>
+#include <sys/cdefs.h>
+#include <sys/types.h>
+//#include_next <sys/refcount.h>
 #include <sys/list.h>
 #include <sys/zfs_context.h>
 
@@ -43,7 +43,7 @@ extern "C" {
  */
 #define	FTAG ((char *)__func__)
 
-#if defined(DEBUG) || !defined(_KERNEL)
+#ifdef	ZFS_DEBUG
 typedef struct reference {
 	list_node_t ref_link;
 	void *ref_holder;
@@ -53,15 +53,18 @@ typedef struct reference {
 
 typedef struct refcount {
 	kmutex_t rc_mtx;
+	boolean_t rc_tracked;
 	list_t rc_list;
 	list_t rc_removed;
-	int64_t rc_count;
-	int64_t rc_removed_count;
+	uint64_t rc_count;
+	uint64_t rc_removed_count;
 } refcount_t;
 
-/* Note: refcount_t must be initialized with refcount_create() */
+/* Note: refcount_t must be initialized with refcount_create[_untracked]() */
 
 void refcount_create(refcount_t *rc);
+void refcount_create_untracked(refcount_t *rc);
+void refcount_create_tracked(refcount_t *rc);
 void refcount_destroy(refcount_t *rc);
 void refcount_destroy_many(refcount_t *rc, uint64_t number);
 int refcount_is_zero(refcount_t *rc);
@@ -70,32 +73,46 @@ int64_t refcount_add(refcount_t *rc, voi
 int64_t refcount_remove(refcount_t *rc, void *holder_tag);
 int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
 int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
+void refcount_transfer(refcount_t *dst, refcount_t *src);
+void refcount_transfer_ownership(refcount_t *, void *, void *);
+boolean_t refcount_held(refcount_t *, void *);
+boolean_t refcount_not_held(refcount_t *, void *);
 
-void refcount_init(void);
+void refcount_sysinit(void);
 void refcount_fini(void);
 
-#else /* DEBUG */
+#else	/* ZFS_DEBUG */
 
 typedef struct refcount {
 	uint64_t rc_count;
 } refcount_t;
 
 #define	refcount_create(rc) ((rc)->rc_count = 0)
+#define	refcount_create_untracked(rc) ((rc)->rc_count = 0)
+#define	refcount_create_tracked(rc) ((rc)->rc_count = 0)
 #define	refcount_destroy(rc) ((rc)->rc_count = 0)
 #define	refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
 #define	refcount_is_zero(rc) ((rc)->rc_count == 0)
 #define	refcount_count(rc) ((rc)->rc_count)
-#define	refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1)
-#define	refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1)
+#define	refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count)
+#define	refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count)
 #define	refcount_add_many(rc, number, holder) \
 	atomic_add_64_nv(&(rc)->rc_count, number)
 #define	refcount_remove_many(rc, number, holder) \
 	atomic_add_64_nv(&(rc)->rc_count, -number)
+#define	refcount_transfer(dst, src) { \
+	uint64_t __tmp = (src)->rc_count; \
+	atomic_add_64(&(src)->rc_count, -__tmp); \
+	atomic_add_64(&(dst)->rc_count, __tmp); \
+}
+#define	refcount_transfer_ownership(rc, current_holder, new_holder)
+#define	refcount_held(rc, holder)		((rc)->rc_count > 0)
+#define	refcount_not_held(rc, holder)		(B_TRUE)
 
-#define	refcount_init()
+#define	refcount_sysinit()
 #define	refcount_fini()
 
-#endif /* DEBUG */
+#endif	/* ZFS_DEBUG */
 
 #ifdef	__cplusplus
 }
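
A hedged sketch of the tracked interface above: under ZFS_DEBUG every
add/remove is matched against its holder tag (FTAG, the enclosing function
name, is the usual choice), while non-debug builds reduce the same calls
to the atomic macros:

	refcount_t rc;

	refcount_create(&rc);
	(void) refcount_add(&rc, FTAG);
	ASSERT(!refcount_is_zero(&rc));
	ASSERT(refcount_held(&rc, FTAG));
	(void) refcount_remove(&rc, FTAG);
	refcount_destroy(&rc);
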
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/rrwlock.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/rrwlock.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 rrwlock.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/rrwlock.h	7 Aug 2009 18:33:40 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/rrwlock.h	30 Aug 2015 02:17:54 -0000
@@ -22,17 +22,17 @@
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
 
 #ifndef	_SYS_RR_RW_LOCK_H
 #define	_SYS_RR_RW_LOCK_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
-#include <sys/inttypes.h>
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 
@@ -57,6 +57,7 @@ typedef struct rrwlock {
 	refcount_t	rr_anon_rcount;
 	refcount_t	rr_linked_rcount;
 	boolean_t	rr_writer_wanted;
+	boolean_t	rr_track_all;
 } rrwlock_t;
 
 /*
@@ -64,14 +65,45 @@ typedef struct rrwlock {
  * 'tag' must be the same in a rrw_enter() as in its
  * corresponding rrw_exit().
  */
-void rrw_init(rrwlock_t *rrl);
+void rrw_init(rrwlock_t *rrl, boolean_t track_all);
 void rrw_destroy(rrwlock_t *rrl);
 void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag);
+void rrw_enter_read(rrwlock_t *rrl, void *tag);
+void rrw_enter_read_prio(rrwlock_t *rrl, void *tag);
+void rrw_enter_write(rrwlock_t *rrl);
 void rrw_exit(rrwlock_t *rrl, void *tag);
 boolean_t rrw_held(rrwlock_t *rrl, krw_t rw);
+void rrw_tsd_destroy(void *arg);
 
 #define	RRW_READ_HELD(x)	rrw_held(x, RW_READER)
 #define	RRW_WRITE_HELD(x)	rrw_held(x, RW_WRITER)
+#define	RRW_LOCK_HELD(x) \
+	(rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER))
+
+/*
+ * A reader-mostly lock implementation, tuned above reader-writer locks
+ * for highly parallel read acquisitions, pessimizing write acquisitions.
+ *
+ * This should be a prime number.  See comment in rrwlock.c near
+ * RRM_TD_LOCK() for details.
+ */
+#define	RRM_NUM_LOCKS		17
+typedef struct rrmlock {
+	rrwlock_t	locks[RRM_NUM_LOCKS];
+} rrmlock_t;
+
+void rrm_init(rrmlock_t *rrl, boolean_t track_all);
+void rrm_destroy(rrmlock_t *rrl);
+void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag);
+void rrm_enter_read(rrmlock_t *rrl, void *tag);
+void rrm_enter_write(rrmlock_t *rrl);
+void rrm_exit(rrmlock_t *rrl, void *tag);
+boolean_t rrm_held(rrmlock_t *rrl, krw_t rw);
+
+#define	RRM_READ_HELD(x)	rrm_held(x, RW_READER)
+#define	RRM_WRITE_HELD(x)	rrm_held(x, RW_WRITER)
+#define	RRM_LOCK_HELD(x) \
+	(rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER))
 
 #ifdef	__cplusplus
 }
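
A hedged sketch of the re-entrant read lock above: a thread that already
holds the read lock may re-enter it even while a writer is waiting, which
avoids self-deadlock on recursive code paths:

	rrwlock_t rrl;

	rrw_init(&rrl, B_FALSE);
	rrw_enter_read(&rrl, FTAG);
	rrw_enter_read(&rrl, FTAG);	/* re-entry does not block */
	ASSERT(RRW_READ_HELD(&rrl));
	rrw_exit(&rrl, FTAG);
	rrw_exit(&rrl, FTAG);
	rrw_destroy(&rrl);
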
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/sa.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/sa.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/sa.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/sa.h	30 Aug 2015 02:17:54 -0000
@@ -0,0 +1,170 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_SYS_SA_H
+#define	_SYS_SA_H
+
+#include <sys/dmu.h>
+#include <sys/uio.h>
+
+/*
+ * Currently available byteswap functions.
+ * If at all possible, new attributes should use
+ * one of the already defined byteswap functions.
+ * If a new byteswap function is added then the
+ * ZPL/Pool version will need to be bumped.
+ */
+
+typedef enum sa_bswap_type {
+	SA_UINT64_ARRAY,
+	SA_UINT32_ARRAY,
+	SA_UINT16_ARRAY,
+	SA_UINT8_ARRAY,
+	SA_ACL,
+} sa_bswap_type_t;
+
+typedef uint16_t	sa_attr_type_t;
+
+/*
+ * Attribute to register support for.
+ */
+typedef struct sa_attr_reg {
+	char 			*sa_name;	/* attribute name */
+	uint16_t 		sa_length;
+	sa_bswap_type_t		sa_byteswap;	/* bswap function enum */
+	sa_attr_type_t 		sa_attr; /* filled in during registration */
+} sa_attr_reg_t;
+
+
+typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t,
+    boolean_t, void *userptr);
+
+/*
+ * Array of attributes to store.
+ *
+ * This array should be treated as opaque/private data.
+ * The SA_BULK_ADD_ATTR() macro should be used for manipulating
+ * the array.
+ *
+ * When sa_replace_all_by_template() is used, the attributes
+ * will be stored in the order defined in the array, except that
+ * the attributes may be split between the bonus and the spill buffer.
+ *
+ */
+typedef struct sa_bulk_attr {
+	void			*sa_data;
+	sa_data_locator_t	*sa_data_func;
+	uint16_t		sa_length;
+	sa_attr_type_t		sa_attr;
+	/* the following are private to the sa framework */
+	void 			*sa_addr;
+	uint16_t		sa_buftype;
+	uint16_t		sa_size;
+} sa_bulk_attr_t;
+
+
+/*
+ * Special macro for adding entries for bulk attr support.
+ * b - sa_bulk_attr_t array
+ * idx - index into the array, incremented during each add
+ * attr - attribute to manipulate
+ * func - function for accessing data
+ * data - pointer to data
+ * len - length of data
+ */
+
+#define	SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \
+{ \
+	b[idx].sa_attr = attr;\
+	b[idx].sa_data_func = func; \
+	b[idx].sa_data = data; \
+	b[idx++].sa_length = len; \
+}
+
+typedef struct sa_os sa_os_t;
+
+typedef enum sa_handle_type {
+	SA_HDL_SHARED,
+	SA_HDL_PRIVATE
+} sa_handle_type_t;
+
+struct sa_handle;
+typedef void *sa_lookup_tab_t;
+typedef struct sa_handle sa_handle_t;
+
+typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx);
+
+int sa_handle_get(objset_t *, uint64_t, void *userp,
+    sa_handle_type_t, sa_handle_t **);
+int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp,
+    sa_handle_type_t, sa_handle_t **);
+void sa_handle_destroy(sa_handle_t *);
+int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **);
+void sa_buf_rele(dmu_buf_t *, void *);
+int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen);
+int sa_update(sa_handle_t *, sa_attr_type_t, void *buf,
+    uint32_t buflen, dmu_tx_t *);
+int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *);
+int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count);
+int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *);
+int sa_size(sa_handle_t *, sa_attr_type_t, int *);
+int sa_update_from_cb(sa_handle_t *, sa_attr_type_t,
+    uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
+void sa_object_info(sa_handle_t *, dmu_object_info_t *);
+void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
+void *sa_get_userdata(sa_handle_t *);
+void sa_set_userp(sa_handle_t *, void *);
+dmu_buf_t *sa_get_db(sa_handle_t *);
+uint64_t sa_handle_object(sa_handle_t *);
+boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size);
+void sa_register_update_callback(objset_t *, sa_update_cb_t *);
+int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **);
+void sa_tear_down(objset_t *);
+int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *,
+    int, dmu_tx_t *);
+int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *,
+    int, dmu_tx_t *);
+boolean_t sa_enabled(objset_t *);
+void sa_cache_init(void);
+void sa_cache_fini(void);
+int sa_set_sa_object(objset_t *, uint64_t);
+int sa_hdrsize(void *);
+void sa_handle_lock(sa_handle_t *);
+void sa_handle_unlock(sa_handle_t *);
+
+#ifdef _KERNEL
+int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *);
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_SA_H */
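
A hedged sketch of SA_ADD_BULK_ATTR() feeding sa_bulk_lookup(), per the
declarations above; the attribute ids attr_size and attr_mtime are
hypothetical (real ids come from the table sa_setup() returns), and hdl is
assumed to be an sa_handle_t obtained through sa_handle_get():

	sa_bulk_attr_t bulk[2];
	int count = 0;
	uint64_t size, mtime[2];

	SA_ADD_BULK_ATTR(bulk, count, attr_size, NULL, &size, 8);
	SA_ADD_BULK_ATTR(bulk, count, attr_mtime, NULL, &mtime, 16);
	if (sa_bulk_lookup(hdl, bulk, count) == 0) {
		/* size and mtime now hold the attribute values */
	}
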
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/sa_impl.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/sa_impl.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/sa_impl.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/sa_impl.h	30 Aug 2015 02:17:54 -0000
@@ -0,0 +1,291 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#ifndef	_SYS_SA_IMPL_H
+#define	_SYS_SA_IMPL_H
+
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/list.h>
+
+/*
+ * Array of known attributes and their
+ * various characteristics.
+ */
+typedef struct sa_attr_table {
+	sa_attr_type_t	sa_attr;
+	uint8_t sa_registered;
+	uint16_t sa_length;
+	sa_bswap_type_t sa_byteswap;
+	char *sa_name;
+} sa_attr_table_t;
+
+/*
+ * Zap attribute format for attribute registration
+ *
+ * 64      56      48      40      32      24      16      8       0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * |        unused         |      len      | bswap |   attr num    |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Zap attribute format for layout information.
+ *
+ * Layout information is stored as an array of attribute numbers.
+ * The name of the attribute is the layout number (0, 1, 2, ...).
+ *
+ * 16       0
+ * +--------+
+ * | attr # |
+ * +--------+
+ * | attr # |
+ * +--------+
+ *  ......
+ *
+ */
+
+#define	ATTR_BSWAP(x)	BF32_GET(x, 16, 8)
+#define	ATTR_LENGTH(x)	BF32_GET(x, 24, 16)
+#define	ATTR_NUM(x)	BF32_GET(x, 0, 16)
+#define	ATTR_ENCODE(x, attr, length, bswap) \
+{ \
+	BF64_SET(x, 24, 16, length); \
+	BF64_SET(x, 16, 8, bswap); \
+	BF64_SET(x, 0, 16, attr); \
+}
+
+#define	TOC_OFF(x)		BF32_GET(x, 0, 23)
+#define	TOC_ATTR_PRESENT(x)	BF32_GET(x, 31, 1)
+#define	TOC_LEN_IDX(x)		BF32_GET(x, 24, 4)
+#define	TOC_ATTR_ENCODE(x, len_idx, offset) \
+{ \
+	BF32_SET(x, 31, 1, 1); \
+	BF32_SET(x, 24, 7, len_idx); \
+	BF32_SET(x, 0, 24, offset); \
+}
+
+#define	SA_LAYOUTS	"LAYOUTS"
+#define	SA_REGISTRY	"REGISTRY"
+
+/*
+ * Each unique layout has its own table,
+ * sa_lot (layout_table).
+ */
+typedef struct sa_lot {
+	avl_node_t lot_num_node;
+	avl_node_t lot_hash_node;
+	uint64_t lot_num;
+	uint64_t lot_hash;
+	sa_attr_type_t *lot_attrs;	/* array of attr #'s */
+	uint32_t lot_var_sizes;	/* how many aren't fixed size */
+	uint32_t lot_attr_count;	/* total attr count */
+	list_t 	lot_idx_tab;	/* should be only a couple of entries */
+	int	lot_instance;	/* used with lot_hash to identify entry */
+} sa_lot_t;
+
+/* index table of offsets */
+typedef struct sa_idx_tab {
+	list_node_t	sa_next;
+	sa_lot_t	*sa_layout;
+	uint16_t	*sa_variable_lengths;
+	refcount_t	sa_refcount;
+	uint32_t	*sa_idx_tab;	/* array of offsets */
+} sa_idx_tab_t;
+
+/*
+ * Since the offset/index information into the actual data
+ * will usually be identical we can share that information with
+ * all handles that have the exact same offsets.
+ *
+ * You would typically only have a large number of different tables of
+ * contents if you had several variable-sized attributes.
+ *
+ * Two AVL trees are used to track the attribute layout numbers.
+ * One is keyed by number and will be consulted when a DMU_OT_SA
+ * object is first read.  The second tree is keyed by the hash signature
+ * of the attributes and will be consulted when an attribute is added
+ * to determine if we already have an instance of that layout.  Both
+ * of these trees are interconnected.  The only difference is that
+ * when an entry is found in the "hash" tree the list of attributes will
+ * need to be compared against the list of attributes you have in hand.
+ * The assumption is that typically attributes will just be updated and
+ * adding a completely new attribute is a very rare operation.
+ */
+struct sa_os {
+	kmutex_t 	sa_lock;
+	boolean_t	sa_need_attr_registration;
+	boolean_t	sa_force_spill;
+	uint64_t	sa_master_obj;
+	uint64_t	sa_reg_attr_obj;
+	uint64_t	sa_layout_attr_obj;
+	int		sa_num_attrs;
+	sa_attr_table_t *sa_attr_table;	 /* private attr table */
+	sa_update_cb_t	*sa_update_cb;
+	avl_tree_t	sa_layout_num_tree;  /* keyed by layout number */
+	avl_tree_t	sa_layout_hash_tree; /* keyed by layout hash value */
+	int		sa_user_table_sz;
+	sa_attr_type_t	*sa_user_table; /* user name->attr mapping table */
+};
+
+/*
+ * Header for all bonus and spill buffers.
+ *
+ * The header has a fixed portion with a variable number
+ * of "lengths" depending on the number of variable-sized
+ * attributes, which is determined by the "layout number".
+ */
+
+#define	SA_MAGIC	0x2F505A  /* ZFS SA */
+typedef struct sa_hdr_phys {
+	uint32_t sa_magic;
+	/* BEGIN CSTYLED */
+	/*
+	 * Encoded with hdrsize and layout number as follows:
+	 * 16      10       0
+	 * +--------+-------+
+	 * | hdrsz  |layout |
+	 * +--------+-------+
+	 *
+	 * Bits 0-9 are the layout number.
+	 * Bits 10-15 are the size of the header.
+	 * The hdrsize is the encoded number * 8.
+	 *
+	 * For example:
+	 * hdrsz of 1 ==> 8 byte header
+	 *          2 ==> 16 byte header
+	 *
+	 */
+	/* END CSTYLED */
+	uint16_t sa_layout_info;
+	uint16_t sa_lengths[1];	/* optional sizes for variable length attrs */
+	/* ... Data follows the lengths.  */
+} sa_hdr_phys_t;
+
+#define	SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
+#define	SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0)
+#define	SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
+{ \
+	BF32_SET_SB(x, 10, 6, 3, 0, size); \
+	BF32_SET(x, 0, 10, num); \
+}
+
+typedef enum sa_buf_type {
+	SA_BONUS = 1,
+	SA_SPILL = 2
+} sa_buf_type_t;
+
+typedef enum sa_data_op {
+	SA_LOOKUP,
+	SA_UPDATE,
+	SA_ADD,
+	SA_REPLACE,
+	SA_REMOVE
+} sa_data_op_t;
+
+/*
+ * Opaque handle used for most sa functions
+ *
+ * This needs to be kept as small as possible.
+ */
+
+struct sa_handle {
+	dmu_buf_user_t	sa_dbu;
+	kmutex_t	sa_lock;
+	dmu_buf_t	*sa_bonus;
+	dmu_buf_t	*sa_spill;
+	objset_t	*sa_os;
+	void		*sa_userp;
+	sa_idx_tab_t	*sa_bonus_tab;	 /* idx of bonus */
+	sa_idx_tab_t	*sa_spill_tab; /* only present if spill activated */
+};
+
+#define	SA_GET_DB(hdl, type)	\
+	(dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill)
+
+#define	SA_GET_HDR(hdl, type) \
+	((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \
+	type))->db.db_data))
+
+#define	SA_IDX_TAB_GET(hdl, type) \
+	(type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab)
+
+#define	IS_SA_BONUSTYPE(a)	\
+	((a == DMU_OT_SA) ? B_TRUE : B_FALSE)
+
+#define	SA_BONUSTYPE_FROM_DB(db) \
+	(dmu_get_bonustype((dmu_buf_t *)db))
+
+#define	SA_BLKPTR_SPACE	(DN_MAX_BONUSLEN - sizeof (blkptr_t))
+
+#define	SA_LAYOUT_NUM(x, type) \
+	((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \
+	((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x))))
+
+
+#define	SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length
+
+#define	SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\
+	hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \
+	SA_REGISTERED_LEN(sa, attr))
+
+#define	SA_SET_HDR(hdr, num, size) \
+	{ \
+		hdr->sa_magic = SA_MAGIC; \
+		SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \
+	}
+
+#define	SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \
+	{ \
+		bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \
+		bulk.sa_buftype = type; \
+		bulk.sa_addr = \
+		    (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \
+		    (uintptr_t)hdr); \
+}
+
+#define	SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \
+	(SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \
+	(tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \
+	sizeof (uint16_t), 8) : 0)))
+
+int sa_add_impl(sa_handle_t *, sa_attr_type_t,
+    uint32_t, sa_data_locator_t, void *, dmu_tx_t *);
+
+void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *);
+int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *);
+
+void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t,
+    uint16_t *, sa_hdr_phys_t *);
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_SA_IMPL_H */
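
To ground the sa_layout_info encoding documented above, a hedged sketch
using the SA_SET_HDR and SA_HDR_* macros (a 16-byte header stores hdrsz
as 16 >> 3 == 2 in the 6-bit field):

	sa_hdr_phys_t h = { 0 };
	sa_hdr_phys_t *hdr = &h;

	SA_SET_HDR(hdr, 1, 16);		/* layout #1, 16-byte header */
	ASSERT3U(hdr->sa_magic, ==, SA_MAGIC);
	ASSERT3U(SA_HDR_LAYOUT_NUM(hdr), ==, 1);
	ASSERT3U(SA_HDR_SIZE(hdr), ==, 16);
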
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa.h,v
retrieving revision 1.4
diff -u -p -r1.4 spa.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa.h	10 Jan 2017 19:20:35 -0000	1.4
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa.h	10 Oct 2016 11:09:53 -0000
@@ -19,8 +19,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef _SYS_SPA_H
@@ -51,6 +55,7 @@ typedef struct spa_aux_vdev spa_aux_vdev
 typedef struct ddt ddt_t;
 typedef struct ddt_entry ddt_entry_t;
 struct dsl_pool;
+struct dsl_dataset;
 
 /*
  * General-purpose 32-bit and 64-bit bitfield encodings.
@@ -63,37 +68,75 @@ struct dsl_pool;
 #define	BF32_GET(x, low, len)		BF32_DECODE(x, low, len)
 #define	BF64_GET(x, low, len)		BF64_DECODE(x, low, len)
 
-#define	BF32_SET(x, low, len, val)	\
-	((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len))
-#define	BF64_SET(x, low, len, val)	\
-	((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len))
+#define	BF32_SET(x, low, len, val) do { \
+	ASSERT3U(val, <, 1U << (len)); \
+	ASSERT3U(low + len, <=, 32); \
+	(x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
+_NOTE(CONSTCOND) } while (0)
+
+#define	BF64_SET(x, low, len, val) do { \
+	ASSERT3U(val, <, 1ULL << (len)); \
+	ASSERT3U(low + len, <=, 64); \
+	((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
+_NOTE(CONSTCOND) } while (0)
 
 #define	BF32_GET_SB(x, low, len, shift, bias)	\
 	((BF32_GET(x, low, len) + (bias)) << (shift))
 #define	BF64_GET_SB(x, low, len, shift, bias)	\
 	((BF64_GET(x, low, len) + (bias)) << (shift))
 
-#define	BF32_SET_SB(x, low, len, shift, bias, val)	\
-	BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
-#define	BF64_SET_SB(x, low, len, shift, bias, val)	\
-	BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define	BF32_SET_SB(x, low, len, shift, bias, val) do { \
+	ASSERT(IS_P2ALIGNED(val, 1U << shift)); \
+	ASSERT3S((val) >> (shift), >=, bias); \
+	BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
+#define	BF64_SET_SB(x, low, len, shift, bias, val) do { \
+	ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \
+	ASSERT3S((val) >> (shift), >=, bias); \
+	BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
 
 /*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * We currently support block sizes from 512 bytes to 16MB.
+ * The benefits of larger blocks, and thus larger IO, need to be weighed
+ * against the cost of COWing a giant block to modify one byte, and the
+ * large latency of reading or writing a large block.
+ *
+ * Note that although blocks up to 16MB are supported, the recordsize
+ * property cannot be set larger than zfs_max_recordsize (default 1MB).
+ * See the comment near zfs_max_recordsize in dsl_dataset.c for details.
+ *
+ * Note that although the LSIZE field of the blkptr_t can store sizes up
+ * to 32MB, the dnode's dn_datablkszsec can only store sizes up to
+ * 32MB - 512 bytes.  Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
  */
 #define	SPA_MINBLOCKSHIFT	9
-#define	SPA_MAXBLOCKSHIFT	17
+#define	SPA_OLD_MAXBLOCKSHIFT	17
+#define	SPA_MAXBLOCKSHIFT	24
 #define	SPA_MINBLOCKSIZE	(1ULL << SPA_MINBLOCKSHIFT)
+#define	SPA_OLD_MAXBLOCKSIZE	(1ULL << SPA_OLD_MAXBLOCKSHIFT)
 #define	SPA_MAXBLOCKSIZE	(1ULL << SPA_MAXBLOCKSHIFT)
 
-#define	SPA_BLOCKSIZES		(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
+/*
+ * Default maximum supported logical ashift.
+ *
+ * The current 8k allocation block size limit is due to the 8k
+ * aligned/sized operations performed by vdev_probe() on
+ * vdev_label->vl_pad2.  Using another "safe region" for these tests
+ * would allow the limit to be raised to 16k, at the expense of
+ * only having 8 available uberblocks in the label area.
+ */
+#define	SPA_MAXASHIFT		13
+
+/*
+ * Default minimum supported logical ashift.
+ */
+#define	SPA_MINASHIFT		SPA_MINBLOCKSHIFT
 
 /*
  * Size of block to hold the configuration data (a packed nvlist)
  */
-#define	SPA_CONFIG_BLOCKSIZE	(1 << 14)
+#define	SPA_CONFIG_BLOCKSIZE	(1ULL << 14)
 
 /*
  * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
@@ -106,6 +149,8 @@ struct dsl_pool;
 #define	SPA_PSIZEBITS		16	/* PSIZE up to 32M (2^16 * 512)	*/
 #define	SPA_ASIZEBITS		24	/* ASIZE up to 64 times larger	*/
 
+#define	SPA_COMPRESSBITS	7
+
 /*
  * All SPA data is represented by 128-bit data virtual addresses (DVAs).
  * The members of the dva_t should be considered opaque outside the SPA.
@@ -122,6 +167,14 @@ typedef struct zio_cksum {
 } zio_cksum_t;
 
 /*
+ * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
+ * secret and is suitable for use in MAC algorithms as the key.
+ */
+typedef struct zio_cksum_salt {
+	uint8_t		zcs_bytes[32];
+} zio_cksum_salt_t;
+
+/*
  * Each block is described by its DVAs, time of birth, checksum, etc.
  * The word-by-word, bit-by-bit layout of the blkptr is as follows:
  *
@@ -139,7 +192,7 @@ typedef struct zio_cksum {
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|G|			 offset3				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 6	|BDX|lvl| type	| cksum | comp	|     PSIZE	|     LSIZE	|
+ * 6	|BDX|lvl| type	| cksum |E| comp|    PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
@@ -173,7 +226,8 @@ typedef struct zio_cksum {
  * G		gang block indicator
  * B		byteorder (endianness)
  * D		dedup
- * X		unused
+ * X		encryption (on version 30, which is not supported)
+ * E		blkptr_t contains embedded data (see below)
  * lvl		level of indirection
  * type		DMU object type
  * phys birth	txg of block allocation; zero if same as logical birth txg
@@ -181,9 +235,112 @@ typedef struct zio_cksum {
  * fill count	number of non-zero blocks under this bp
  * checksum[4]	256-bit checksum of the data this bp describes
  */
+
+/*
+ * "Embedded" blkptr_t's don't actually point to a block, instead they
+ * have a data payload embedded in the blkptr_t itself.  See the comment
+ * in blkptr.c for more details.
+ *
+ * The blkptr_t is laid out as follows:
+ *
+ *	64	56	48	40	32	24	16	8	0
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0	|      payload                                                  |
+ * 1	|      payload                                                  |
+ * 2	|      payload                                                  |
+ * 3	|      payload                                                  |
+ * 4	|      payload                                                  |
+ * 5	|      payload                                                  |
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6	|BDX|lvl| type	| etype |E| comp| PSIZE|              LSIZE	|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7	|      payload                                                  |
+ * 8	|      payload                                                  |
+ * 9	|      payload                                                  |
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * a	|			logical birth txg			|
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ * b	|      payload                                                  |
+ * c	|      payload                                                  |
+ * d	|      payload                                                  |
+ * e	|      payload                                                  |
+ * f	|      payload                                                  |
+ *	+-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * payload		contains the embedded data
+ * B (byteorder)	byteorder (endianness)
+ * D (dedup)		padding (set to zero)
+ * X			encryption (set to zero; see above)
+ * E (embedded)		set to one
+ * lvl			indirection level
+ * type			DMU object type
+ * etype		how to interpret embedded data (BP_EMBEDDED_TYPE_*)
+ * comp			compression function of payload
+ * PSIZE		size of payload after compression, in bytes
+ * LSIZE		logical size of payload, in bytes
+ *			note that 25 bits is enough to store the largest
+ *			"normal" BP's LSIZE (2^16 * 2^9) in bytes
+ * log. birth		transaction group in which the block was logically born
+ *
+ * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
+ * bp's they are stored in units of SPA_MINBLOCKSHIFT.
+ * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
+ * The B, D, X, lvl, type, and comp fields are stored the same as with normal
+ * BP's so the BP_SET_* macros can be used with them.  etype, PSIZE, LSIZE must
+ * be set with the BPE_SET_* macros.  BP_SET_EMBEDDED() should be called before
+ * other macros, as they assert that they are only used on BP's of the correct
+ * "embedded-ness".
+ */
+
+#define	BPE_GET_ETYPE(bp)	\
+	(ASSERT(BP_IS_EMBEDDED(bp)), \
+	BF64_GET((bp)->blk_prop, 40, 8))
+#define	BPE_SET_ETYPE(bp, t)	do { \
+	ASSERT(BP_IS_EMBEDDED(bp)); \
+	BF64_SET((bp)->blk_prop, 40, 8, t); \
+_NOTE(CONSTCOND) } while (0)
+
+#define	BPE_GET_LSIZE(bp)	\
+	(ASSERT(BP_IS_EMBEDDED(bp)), \
+	BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
+#define	BPE_SET_LSIZE(bp, x)	do { \
+	ASSERT(BP_IS_EMBEDDED(bp)); \
+	BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define	BPE_GET_PSIZE(bp)	\
+	(ASSERT(BP_IS_EMBEDDED(bp)), \
+	BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
+#define	BPE_SET_PSIZE(bp, x)	do { \
+	ASSERT(BP_IS_EMBEDDED(bp)); \
+	BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+typedef enum bp_embedded_type {
+	BP_EMBEDDED_TYPE_DATA,
+	BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
+	NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
+} bp_embedded_type_t;
+
+#define	BPE_NUM_WORDS 14
+#define	BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
+#define	BPE_IS_PAYLOADWORD(bp, wp) \
+	((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
+
 #define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
 #define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
 
+/*
+ * A block is a hole when it has either 1) never been written to, or
+ * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
+ * without physically allocating disk space. Holes are represented in the
+ * blkptr_t structure by zeroed blk_dva. Correct checking for holes is
+ * done through the BP_IS_HOLE macro. For holes that were written to at
+ * some point (i.e. were punched after having been filled), the logical
+ * size, level, DMU object type, and birth times are also stored.
+ */
 typedef struct blkptr {
 	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
 	uint64_t	blk_prop;	/* size, compression, type, etc	    */
@@ -198,9 +355,10 @@ typedef struct blkptr {
  * Macros to get and set fields in a bp or DVA.
  */
 #define	DVA_GET_ASIZE(dva)	\
-	BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
+	BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
 #define	DVA_SET_ASIZE(dva, x)	\
-	BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)
+	BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
+	SPA_MINBLOCKSHIFT, 0, x)
 
 #define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
 #define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, x)
@@ -217,20 +375,39 @@ typedef struct blkptr {
 #define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)
 
 #define	BP_GET_LSIZE(bp)	\
-	BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
-#define	BP_SET_LSIZE(bp, x)	\
-	BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+	(BP_IS_EMBEDDED(bp) ?	\
+	(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
+	BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define	BP_SET_LSIZE(bp, x)	do { \
+	ASSERT(!BP_IS_EMBEDDED(bp)); \
+	BF64_SET_SB((bp)->blk_prop, \
+	    0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
 
 #define	BP_GET_PSIZE(bp)	\
-	BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
-#define	BP_SET_PSIZE(bp, x)	\
-	BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
-
-#define	BP_GET_COMPRESS(bp)		BF64_GET((bp)->blk_prop, 32, 8)
-#define	BP_SET_COMPRESS(bp, x)		BF64_SET((bp)->blk_prop, 32, 8, x)
+	(BP_IS_EMBEDDED(bp) ? 0 : \
+	BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define	BP_SET_PSIZE(bp, x)	do { \
+	ASSERT(!BP_IS_EMBEDDED(bp)); \
+	BF64_SET_SB((bp)->blk_prop, \
+	    16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
 
-#define	BP_GET_CHECKSUM(bp)		BF64_GET((bp)->blk_prop, 40, 8)
-#define	BP_SET_CHECKSUM(bp, x)		BF64_SET((bp)->blk_prop, 40, 8, x)
+#define	BP_GET_COMPRESS(bp)		\
+	BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
+#define	BP_SET_COMPRESS(bp, x)		\
+	BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
+
+#define	BP_IS_EMBEDDED(bp)		BF64_GET((bp)->blk_prop, 39, 1)
+#define	BP_SET_EMBEDDED(bp, x)		BF64_SET((bp)->blk_prop, 39, 1, x)
+
+#define	BP_GET_CHECKSUM(bp)		\
+	(BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
+	BF64_GET((bp)->blk_prop, 40, 8))
+#define	BP_SET_CHECKSUM(bp, x)		do { \
+	ASSERT(!BP_IS_EMBEDDED(bp)); \
+	BF64_SET((bp)->blk_prop, 40, 8, x); \
+_NOTE(CONSTCOND) } while (0)
 
 #define	BP_GET_TYPE(bp)			BF64_GET((bp)->blk_prop, 48, 8)
 #define	BP_SET_TYPE(bp, x)		BF64_SET((bp)->blk_prop, 48, 8, x)
@@ -238,41 +415,46 @@ typedef struct blkptr {
 #define	BP_GET_LEVEL(bp)		BF64_GET((bp)->blk_prop, 56, 5)
 #define	BP_SET_LEVEL(bp, x)		BF64_SET((bp)->blk_prop, 56, 5, x)
 
-#define	BP_GET_PROP_BIT_61(bp)		BF64_GET((bp)->blk_prop, 61, 1)
-#define	BP_SET_PROP_BIT_61(bp, x)	BF64_SET((bp)->blk_prop, 61, 1, x)
-
 #define	BP_GET_DEDUP(bp)		BF64_GET((bp)->blk_prop, 62, 1)
 #define	BP_SET_DEDUP(bp, x)		BF64_SET((bp)->blk_prop, 62, 1, x)
 
-#define	BP_GET_BYTEORDER(bp)		(0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define	BP_GET_BYTEORDER(bp)		BF64_GET((bp)->blk_prop, 63, 1)
 #define	BP_SET_BYTEORDER(bp, x)		BF64_SET((bp)->blk_prop, 63, 1, x)
 
 #define	BP_PHYSICAL_BIRTH(bp)		\
-	((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+	(BP_IS_EMBEDDED(bp) ? 0 : \
+	(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
 
 #define	BP_SET_BIRTH(bp, logical, physical)	\
 {						\
+	ASSERT(!BP_IS_EMBEDDED(bp));		\
 	(bp)->blk_birth = (logical);		\
 	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
 }
 
+#define	BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
+
 #define	BP_GET_ASIZE(bp)	\
-	(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
-		DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+	(BP_IS_EMBEDDED(bp) ? 0 : \
+	DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+	DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+	DVA_GET_ASIZE(&(bp)->blk_dva[2]))
 
 #define	BP_GET_UCSIZE(bp) \
-	((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
-	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp));
+	((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
+	BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
 
 #define	BP_GET_NDVAS(bp)	\
-	(!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+	(BP_IS_EMBEDDED(bp) ? 0 : \
+	!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
 
 #define	BP_COUNT_GANG(bp)	\
+	(BP_IS_EMBEDDED(bp) ? 0 : \
 	(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
 	DVA_GET_GANG(&(bp)->blk_dva[1]) + \
-	DVA_GET_GANG(&(bp)->blk_dva[2]))
+	DVA_GET_GANG(&(bp)->blk_dva[2])))
 
 #define	DVA_EQUAL(dva1, dva2)	\
 	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
@@ -280,6 +462,7 @@ typedef struct blkptr {
 
 #define	BP_EQUAL(bp1, bp2)	\
 	(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&	\
+	(bp1)->blk_birth == (bp2)->blk_birth &&			\
 	DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&	\
 	DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
@@ -290,6 +473,19 @@ typedef struct blkptr {
 	((zc1).zc_word[2] - (zc2).zc_word[2]) | \
 	((zc1).zc_word[3] - (zc2).zc_word[3])))
 
+#define	ZIO_CHECKSUM_IS_ZERO(zc) \
+	(0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \
+	(zc)->zc_word[2] | (zc)->zc_word[3]))
+
+#define	ZIO_CHECKSUM_BSWAP(zcp)					\
+{								\
+	(zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]);	\
+	(zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]);	\
+	(zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]);	\
+	(zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]);	\
+}
+
+
 #define	DVA_IS_VALID(dva)	(DVA_GET_ASIZE(dva) != 0)
 
 #define	ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3)	\
@@ -300,9 +496,13 @@ typedef struct blkptr {
 	(zcp)->zc_word[3] = w3;			\
 }
 
-#define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
-#define	BP_IS_GANG(bp)		DVA_GET_GANG(BP_IDENTITY(bp))
-#define	BP_IS_HOLE(bp)		((bp)->blk_birth == 0)
+#define	BP_IDENTITY(bp)		(ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
+#define	BP_IS_GANG(bp)		\
+	(BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
+#define	DVA_IS_EMPTY(dva)	((dva)->dva_word[0] == 0ULL &&	\
+				(dva)->dva_word[1] == 0ULL)
+#define	BP_IS_HOLE(bp) \
+	(!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
 
 /* BP_IS_RAIDZ(bp) assumes no block compression */
 #define	BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
@@ -325,14 +525,10 @@ typedef struct blkptr {
 	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
 }
 
-/*
- * Note: the byteorder is either 0 or -1, both of which are palindromes.
- * This simplifies the endianness handling a bit.
- */
-#ifdef _BIG_ENDIAN
+#if BYTE_ORDER == _BIG_ENDIAN
 #define	ZFS_HOST_BYTEORDER	(0ULL)
 #else
-#define	ZFS_HOST_BYTEORDER	(-1ULL)
+#define	ZFS_HOST_BYTEORDER	(1ULL)
 #endif
 
 #define	BP_SHOULD_BYTESWAP(bp)	(BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
@@ -344,18 +540,34 @@ typedef struct blkptr {
  * 'func' is either snprintf() or mdb_snprintf().
  * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
  */
-#define	SNPRINTF_BLKPTR(func, ws, buf, buflen, bp, type, checksum, compress)	\
+#define	SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
 {									\
 	static const char *copyname[] =					\
 	    { "zero", "single", "double", "triple" };			\
-	int size = buflen;					\
 	int len = 0;							\
 	int copies = 0;							\
 									\
 	if (bp == NULL) {						\
-		len = func(buf + len, size - len, "<NULL>");		\
+		len += func(buf + len, size - len, "<NULL>");		\
 	} else if (BP_IS_HOLE(bp)) {					\
-		len = func(buf + len, size - len, "<hole>");		\
+		len += func(buf + len, size - len,			\
+		    "HOLE [L%llu %s] "					\
+		    "size=%llxL birth=%lluL",				\
+		    (u_longlong_t)BP_GET_LEVEL(bp),			\
+		    type,						\
+		    (u_longlong_t)BP_GET_LSIZE(bp),			\
+		    (u_longlong_t)bp->blk_birth);			\
+	} else if (BP_IS_EMBEDDED(bp)) {				\
+		len = func(buf + len, size - len,			\
+		    "EMBEDDED [L%llu %s] et=%u %s "			\
+		    "size=%llxL/%llxP birth=%lluL",			\
+		    (u_longlong_t)BP_GET_LEVEL(bp),			\
+		    type,						\
+		    (int)BPE_GET_ETYPE(bp),				\
+		    compress,						\
+		    (u_longlong_t)BPE_GET_LSIZE(bp),			\
+		    (u_longlong_t)BPE_GET_PSIZE(bp),			\
+		    (u_longlong_t)bp->blk_birth);			\
 	} else {							\
 		for (int d = 0; d < BP_GET_NDVAS(bp); d++) {		\
 			const dva_t *dva = &bp->blk_dva[d];		\
@@ -389,7 +601,7 @@ typedef struct blkptr {
 		    (u_longlong_t)BP_GET_PSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth,			\
 		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp),		\
-		    (u_longlong_t)bp->blk_fill,				\
+		    (u_longlong_t)BP_GET_FILL(bp),			\
 		    ws,							\
 		    (u_longlong_t)bp->blk_cksum.zc_word[0],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[1],		\
@@ -402,8 +614,8 @@ typedef struct blkptr {
 #include <sys/dmu.h>
 
 #define	BP_GET_BUFC_TYPE(bp)						\
-	(((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? \
-	ARC_BUFC_METADATA : ARC_BUFC_DATA);
+	(((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \
+	ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
 typedef enum spa_import_type {
 	SPA_IMPORT_EXISTING,
@@ -414,13 +626,17 @@ typedef enum spa_import_type {
 extern int spa_open(const char *pool, spa_t **, void *tag);
 extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
     nvlist_t *policy, nvlist_t **config);
-extern int spa_get_stats(const char *pool, nvlist_t **config,
-    char *altroot, size_t buflen);
+extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
+    size_t buflen);
 extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
-    const char *history_str, nvlist_t *zplprops);
+    nvlist_t *zplprops);
+#ifdef illumos
 extern int spa_import_rootpool(char *devpath, char *devid);
-extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props);
-extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *);
+#else
+extern int spa_import_rootpool(const char *name);
+#endif
+extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
+    uint64_t flags);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(char *pool);
 extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
@@ -432,6 +648,8 @@ extern void spa_async_suspend(spa_t *spa
 extern void spa_async_resume(spa_t *spa);
 extern spa_t *spa_inject_addref(char *pool);
 extern void spa_inject_delref(spa_t *spa);
+extern void spa_scan_stat_init(spa_t *spa);
+extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
 
 #define	SPA_ASYNC_CONFIG_UPDATE	0x01
 #define	SPA_ASYNC_REMOVE	0x02
@@ -439,6 +657,14 @@ extern void spa_inject_delref(spa_t *spa
 #define	SPA_ASYNC_RESILVER_DONE	0x08
 #define	SPA_ASYNC_RESILVER	0x10
 #define	SPA_ASYNC_AUTOEXPAND	0x20
+#define	SPA_ASYNC_REMOVE_DONE	0x40
+#define	SPA_ASYNC_REMOVE_STOP	0x80
+
+/*
+ * Controls the behavior of spa_vdev_remove().
+ */
+#define	SPA_REMOVE_UNSPARE	0x01
+#define	SPA_REMOVE_DONE		0x02
 
 /* device manipulation */
 extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
@@ -447,6 +673,7 @@ extern int spa_vdev_attach(spa_t *spa, u
 extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
     int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
+extern boolean_t spa_vdev_remove_active(spa_t *spa);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
 extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
@@ -465,17 +692,14 @@ extern boolean_t spa_l2cache_exists(uint
 extern void spa_l2cache_activate(vdev_t *vd);
 extern void spa_l2cache_drop(spa_t *spa);
 
-/* scrubbing */
-extern int spa_scrub(spa_t *spa, pool_scrub_type_t type);
+/* scanning */
+extern int spa_scan(spa_t *spa, pool_scan_func_t func);
+extern int spa_scan_stop(spa_t *spa);
 
 /* spa syncing */
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
 extern void spa_sync_allpools(void);
 
-#define	SYNC_PASS_DEFERRED_FREE	1	/* defer frees after this pass */
-#define	SYNC_PASS_DONT_COMPRESS	4	/* don't compress after this pass */
-#define	SYNC_PASS_REWRITE	1	/* rewrite new bps after this pass */
-
 /* spa namespace global mutex */
 extern kmutex_t spa_namespace_lock;
 
@@ -507,6 +731,7 @@ extern spa_t *spa_next(spa_t *prev);
 /* Refcount functions */
 extern void spa_open_ref(spa_t *spa, void *tag);
 extern void spa_close(spa_t *spa, void *tag);
+extern void spa_async_close(spa_t *spa, void *tag);
 extern boolean_t spa_refcount_zero(spa_t *spa);
 
 #define	SCL_NONE	0x00
@@ -556,12 +781,14 @@ extern void spa_claim_notify(zio_t *zio)
 /* Accessor functions */
 extern boolean_t spa_shutting_down(spa_t *spa);
 extern struct dsl_pool *spa_get_dsl(spa_t *spa);
+extern boolean_t spa_is_initializing(spa_t *spa);
 extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
 extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
 extern void spa_altroot(spa_t *, char *, size_t);
 extern int spa_sync_pass(spa_t *spa);
 extern char *spa_name(spa_t *spa);
 extern uint64_t spa_guid(spa_t *spa);
+extern uint64_t spa_load_guid(spa_t *spa);
 extern uint64_t spa_last_synced_txg(spa_t *spa);
 extern uint64_t spa_first_txg(spa_t *spa);
 extern uint64_t spa_syncing_txg(spa_t *spa);
@@ -571,22 +798,31 @@ extern spa_load_state_t spa_load_state(s
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
 extern uint64_t spa_get_dspace(spa_t *spa);
+extern uint64_t spa_get_slop_space(spa_t *spa);
 extern void spa_update_dspace(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern boolean_t spa_deflate(spa_t *spa);
 extern metaslab_class_t *spa_normal_class(spa_t *spa);
 extern metaslab_class_t *spa_log_class(spa_t *spa);
+extern void spa_evicting_os_register(spa_t *, objset_t *os);
+extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
+extern void spa_evicting_os_wait(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
+extern int spa_prev_software_version(spa_t *spa);
 extern int spa_busy(void);
 extern uint8_t spa_get_failmode(spa_t *spa);
 extern boolean_t spa_suspended(spa_t *spa);
 extern uint64_t spa_bootfs(spa_t *spa);
 extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
-extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
+extern uint64_t spa_deadman_synctime(spa_t *spa);
 
 /* Miscellaneous support routines */
+extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
+    dmu_tx_t *tx);
+extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
 extern int spa_rename(const char *oldname, const char *newname);
+extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
 extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
@@ -594,6 +830,7 @@ extern uint64_t spa_get_random(uint64_t 
 extern uint64_t spa_generate_guid(spa_t *spa);
 extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
+extern int spa_change_guid(spa_t *spa);
 extern void spa_upgrade(spa_t *spa, uint64_t version);
 extern void spa_evict_all(void);
 extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
@@ -605,40 +842,33 @@ extern uint64_t bp_get_dsize(spa_t *spa,
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
-extern void spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *to);
+extern boolean_t spa_has_pending_synctask(spa_t *spa);
+extern int spa_maxblocksize(spa_t *spa);
+extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
 
 extern int spa_mode(spa_t *spa);
-extern uint64_t strtonum(const char *str, char **nptr);
-
-/* history logging */
-typedef enum history_log_type {
-	LOG_CMD_POOL_CREATE,
-	LOG_CMD_NORMAL,
-	LOG_INTERNAL
-} history_log_type_t;
-
-typedef struct history_arg {
-	const char *ha_history_str;
-	history_log_type_t ha_log_type;
-	history_internal_events_t ha_event;
-	char ha_zone[MAXPATHLEN];
-} history_arg_t;
+extern uint64_t zfs_strtonum(const char *str, char **nptr);
+#define	strtonum(str, nptr)	zfs_strtonum((str), (nptr))
 
 extern char *spa_his_ievent_table[];
 
 extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
 extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
     char *his_buf);
-extern int spa_history_log(spa_t *spa, const char *his_buf,
-    history_log_type_t what);
-extern void spa_history_internal_log(history_internal_events_t event,
-    spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...);
-extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt);
+extern int spa_history_log(spa_t *spa, const char *his_buf);
+extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl);
+extern void spa_history_log_version(spa_t *spa, const char *operation);
+extern void spa_history_log_internal(spa_t *spa, const char *operation,
+    dmu_tx_t *tx, const char *fmt, ...);
+extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op,
+    dmu_tx_t *tx, const char *fmt, ...);
+extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
+    dmu_tx_t *tx, const char *fmt, ...);
 
 /* error handling */
-struct zbookmark;
+struct zbookmark_phys;
 extern void spa_log_error(spa_t *spa, zio_t *zio);
-extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
+extern void zfs_ereport_post(const char *cls, spa_t *spa, vdev_t *vd,
     zio_t *zio, uint64_t stateoroffset, uint64_t length);
 extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
 extern void zfs_post_state_change(spa_t *spa, vdev_t *vd);
@@ -657,7 +887,7 @@ extern void vdev_cache_stat_fini(void);
 /* Initialization and termination */
 extern void spa_init(int flags);
 extern void spa_fini(void);
-extern void spa_boot_init(void);
+extern void spa_boot_init();
 
 /* properties */
 extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
@@ -670,9 +900,9 @@ extern void spa_event_notify(spa_t *spa,
 
 #ifdef ZFS_DEBUG
 #define	dprintf_bp(bp, fmt, ...) do {				\
-	if (zfs_flags & ZFS_DEBUG_DPRINTF) { 			\
+	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
-	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp));				\
+	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp));	\
 	dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);		\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	} \
@@ -681,6 +911,13 @@ _NOTE(CONSTCOND) } while (0)
 #define	dprintf_bp(bp, fmt, ...)
 #endif
 
+extern boolean_t spa_debug_enabled(spa_t *spa);
+#define	spa_dbgmsg(spa, ...)			\
+{						\
+	if (spa_debug_enabled(spa))		\
+		zfs_dbgmsg(__VA_ARGS__);	\
+}
+
 extern int spa_mode_global;			/* mode, e.g. FREAD | FWRITE */
 
 #ifdef	__cplusplus
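
For readers tracking the blkptr_t macro changes above: blk_prop is a single
64-bit word carrying size, compression, type, level, byteorder and, new with
embedded block pointers, the embedded flag at bit 39, all accessed through
the BF64_* helpers defined earlier in spa.h. The following standalone sketch
uses stand-in encode/decode macros to show the packing scheme; the field
widths follow the patch (SPA_COMPRESSBITS shrinks the compression field to
7 bits to make room for the embedded bit), but the values are illustrative
only.

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for BF64_GET/BF64_SET: `len' bits starting at bit `low'. */
#define	BF64_DECODE(x, low, len) \
	(((x) >> (low)) & ((1ULL << (len)) - 1))
#define	BF64_ENCODE(x, low, len) \
	((((uint64_t)(x)) & ((1ULL << (len)) - 1)) << (low))

int
main(void)
{
	uint64_t blk_prop = 0;

	/* Pack a 7-bit compression value at bit 32 (cf. BP_SET_COMPRESS). */
	blk_prop |= BF64_ENCODE(2, 32, 7);
	/* Set the 1-bit embedded flag at bit 39 (cf. BP_SET_EMBEDDED). */
	blk_prop |= BF64_ENCODE(1, 39, 1);

	printf("compress=%llu embedded=%llu\n",
	    (unsigned long long)BF64_DECODE(blk_prop, 32, 7),
	    (unsigned long long)BF64_DECODE(blk_prop, 39, 1));
	return 0;
}
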
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_boot.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_boot.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 spa_boot.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_boot.h	27 Feb 2010 22:31:42 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_boot.h	23 Mar 2013 15:29:18 -0000
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #ifndef _SYS_SPA_BOOT_H
 #define	_SYS_SPA_BOOT_H
 
@@ -35,6 +39,8 @@ extern "C" {
 extern char *spa_get_bootprop(char *prop);
 extern void spa_free_bootprop(char *prop);
 
+extern void spa_arch_init(void);
+
 #ifdef	__cplusplus
 }
 #endif
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_impl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 spa_impl.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_impl.h	27 Feb 2010 22:31:42 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_impl.h	23 Apr 2017 20:41:56 -0000
@@ -19,8 +19,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
  */
 
 #ifndef _SYS_SPA_IMPL_H
@@ -36,15 +40,22 @@
 #include <sys/avl.h>
 #include <sys/refcount.h>
 #include <sys/bplist.h>
+#include <sys/bpobj.h>
+#include <sys/zfeature.h>
+#include <zfeature_common.h>
+
+#if defined(__NetBSD__) && defined(_HARDKERNEL)
+#include <sys/workqueue.h>
+#endif
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 typedef struct spa_error_entry {
-	zbookmark_t	se_bookmark;
-	char		*se_name;
-	avl_node_t	se_avl;
+	zbookmark_phys_t	se_bookmark;
+	char			*se_name;
+	avl_node_t		se_avl;
 } spa_error_entry_t;
 
 typedef struct spa_history_phys {
@@ -78,16 +89,16 @@ typedef struct spa_config_dirent {
 	char		*scd_path;
 } spa_config_dirent_t;
 
-enum zio_taskq_type {
+typedef enum zio_taskq_type {
 	ZIO_TASKQ_ISSUE = 0,
 	ZIO_TASKQ_ISSUE_HIGH,
 	ZIO_TASKQ_INTERRUPT,
 	ZIO_TASKQ_INTERRUPT_HIGH,
 	ZIO_TASKQ_TYPES
-};
+} zio_taskq_type_t;
 
 /*
- * State machine for the zpool-pooname process.  The states transitions
+ * State machine for the zpool-poolname process.  The state transitions
  * are done as follows:
  *
  *	From		   To			Routine
@@ -105,24 +116,38 @@ typedef enum spa_proc_state {
 	SPA_PROC_GONE		/* spa_thread() is exiting, spa_proc = &p0 */
 } spa_proc_state_t;
 
+typedef struct spa_taskqs {
+	uint_t stqs_count;
+	taskq_t **stqs_taskq;
+} spa_taskqs_t;
+
+typedef enum spa_all_vdev_zap_action {
+	AVZ_ACTION_NONE = 0,
+	AVZ_ACTION_DESTROY,	/* Destroy all per-vdev ZAPs and the AVZ. */
+	AVZ_ACTION_REBUILD	/* Populate the new AVZ, see spa_avz_rebuild */
+} spa_avz_action_t;
+
 struct spa {
 	/*
 	 * Fields protected by spa_namespace_lock.
 	 */
-	char		spa_name[MAXNAMELEN];	/* pool name */
+	char		spa_name[ZFS_MAX_DATASET_NAME_LEN];	/* pool name */
+	char		*spa_comment;		/* comment */
 	avl_node_t	spa_avl;		/* node in spa_namespace_avl */
 	nvlist_t	*spa_config;		/* last synced config */
 	nvlist_t	*spa_config_syncing;	/* currently syncing config */
 	nvlist_t	*spa_config_splitting;	/* config for splitting */
+	nvlist_t	*spa_load_info;		/* info and errors from load */
 	uint64_t	spa_config_txg;		/* txg of last config change */
 	int		spa_sync_pass;		/* iterate-to-convergence */
 	pool_state_t	spa_state;		/* pool state */
 	int		spa_inject_ref;		/* injection references */
 	uint8_t		spa_sync_on;		/* sync threads are running */
 	spa_load_state_t spa_load_state;	/* current load operation */
-	boolean_t	spa_load_verbatim;	/* load the given config? */
-	taskq_t		*spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
+	uint64_t	spa_import_flags;	/* import specific flags */
+	spa_taskqs_t	spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
 	dsl_pool_t	*spa_dsl_pool;
+	boolean_t	spa_is_initializing;	/* true while opening pool */
 	metaslab_class_t *spa_normal_class;	/* normal data class */
 	metaslab_class_t *spa_log_class;	/* intent log data class */
 	uint64_t	spa_first_txg;		/* first txg after spa_open() */
@@ -130,35 +155,51 @@ struct spa {
 	uint64_t	spa_freeze_txg;		/* freeze pool at this txg */
 	uint64_t	spa_load_max_txg;	/* best initial ub_txg */
 	uint64_t	spa_claim_max_txg;	/* highest claimed birth txg */
+	timespec_t	spa_loaded_ts;		/* 1st successful open time */
 	objset_t	*spa_meta_objset;	/* copy of dp->dp_meta_objset */
+	kmutex_t	spa_evicting_os_lock;	/* Evicting objset list lock */
+	list_t		spa_evicting_os_list;	/* Objsets being evicted. */
+	kcondvar_t	spa_evicting_os_cv;	/* Objset Eviction Completion */
 	txg_list_t	spa_vdev_txg_list;	/* per-txg dirty vdev list */
 	vdev_t		*spa_root_vdev;		/* top-level vdev container */
-	uint64_t	spa_load_guid;		/* initial guid for spa_load */
+	int		spa_min_ashift;		/* of vdevs in normal class */
+	int		spa_max_ashift;		/* of vdevs in normal class */
+	uint64_t	spa_config_guid;	/* config pool guid */
+	uint64_t	spa_load_guid;		/* spa_load initialized guid */
+	uint64_t	spa_last_synced_guid;	/* last synced guid */
 	list_t		spa_config_dirty_list;	/* vdevs with dirty config */
 	list_t		spa_state_dirty_list;	/* vdevs with dirty state */
+	kmutex_t	spa_alloc_lock;
+	avl_tree_t	spa_alloc_tree;
 	spa_aux_vdev_t	spa_spares;		/* hot spares */
 	spa_aux_vdev_t	spa_l2cache;		/* L2ARC cache devices */
+	nvlist_t	*spa_label_features;	/* Features for reading MOS */
 	uint64_t	spa_config_object;	/* MOS object for pool config */
 	uint64_t	spa_config_generation;	/* config generation number */
 	uint64_t	spa_syncing_txg;	/* txg currently syncing */
-	uint64_t	spa_deferred_bplist_obj; /* object for deferred frees */
-	bplist_t	spa_deferred_bplist;	/* deferred-free bplist */
+	bpobj_t		spa_deferred_bpobj;	/* deferred-free bplist */
 	bplist_t	spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
+	zio_cksum_salt_t spa_cksum_salt;	/* secret salt for cksum */
+	/* checksum context templates */
+	kmutex_t	spa_cksum_tmpls_lock;
+	void		*spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
 	uberblock_t	spa_ubsync;		/* last synced uberblock */
 	uberblock_t	spa_uberblock;		/* current uberblock */
 	boolean_t	spa_extreme_rewind;	/* rewind past deferred frees */
+	uint64_t	spa_last_io;		/* lbolt of last non-scan I/O */
 	kmutex_t	spa_scrub_lock;		/* resilver/scrub lock */
 	uint64_t	spa_scrub_inflight;	/* in-flight scrub I/Os */
-	uint64_t	spa_scrub_maxinflight;	/* max in-flight scrub I/Os */
-	uint64_t	spa_scrub_errors;	/* scrub I/O error count */
 	kcondvar_t	spa_scrub_io_cv;	/* scrub I/O completion */
 	uint8_t		spa_scrub_active;	/* active or suspended? */
 	uint8_t		spa_scrub_type;		/* type of scrub we're doing */
 	uint8_t		spa_scrub_finished;	/* indicator to rotate logs */
 	uint8_t		spa_scrub_started;	/* started since last boot */
 	uint8_t		spa_scrub_reopen;	/* scrub doing vdev_reopen */
+	uint64_t	spa_scan_pass_start;	/* start time per pass/reboot */
+	uint64_t	spa_scan_pass_exam;	/* examined bytes per pass */
 	kmutex_t	spa_async_lock;		/* protect async state */
 	kthread_t	*spa_async_thread;	/* thread doing async task */
+	kthread_t	*spa_async_thread_vd;	/* thread doing vd async task */
 	int		spa_async_suspended;	/* async tasks suspended */
 	kcondvar_t	spa_async_cv;		/* wait for thread_exit() */
 	uint16_t	spa_async_tasks;	/* async task mask */
@@ -188,12 +229,14 @@ struct spa {
 	uint64_t	spa_failmode;		/* failure mode for the pool */
 	uint64_t	spa_delegation;		/* delegation on/off */
 	list_t		spa_config_list;	/* previous cache file(s) */
-	zio_t		*spa_async_zio_root;	/* root of all async I/O */
+	/* per-CPU array of async I/O roots: */
+	zio_t		**spa_async_zio_root;
 	zio_t		*spa_suspend_zio_root;	/* root of all suspended I/O */
 	kmutex_t	spa_suspend_lock;	/* protects suspend_zio_root */
 	kcondvar_t	spa_suspend_cv;		/* notification of resume */
 	uint8_t		spa_suspended;		/* pool is suspended */
 	uint8_t		spa_claiming;		/* pool is doing zil_claim() */
+	boolean_t	spa_debug;		/* debug enabled? */
 	boolean_t	spa_is_root;		/* pool is root */
 	int		spa_minref;		/* num refs when first opened */
 	int		spa_mode;		/* FREAD | FWRITE */
@@ -210,21 +253,72 @@ struct spa {
 	spa_proc_state_t spa_proc_state;	/* see definition */
 	struct proc	*spa_proc;		/* "zpool-poolname" process */
 	uint64_t	spa_did;		/* if procp != p0, did of t1 */
+	kthread_t	*spa_trim_thread;	/* thread sending TRIM I/Os */
+	kmutex_t	spa_trim_lock;		/* protects spa_trim_cv */
+	kcondvar_t	spa_trim_cv;		/* used to notify TRIM thread */
 	boolean_t	spa_autoreplace;	/* autoreplace set in open */
 	int		spa_vdev_locks;		/* locks grabbed */
+	uint64_t	spa_creation_version;	/* version at pool creation */
+	uint64_t	spa_prev_software_version; /* See ub_software_version */
+	uint64_t	spa_feat_for_write_obj;	/* required to write to pool */
+	uint64_t	spa_feat_for_read_obj;	/* required to read from pool */
+	uint64_t	spa_feat_desc_obj;	/* Feature descriptions */
+	uint64_t	spa_feat_enabled_txg_obj; /* Feature enabled txg */
+	/* cache feature refcounts */
+	uint64_t	spa_feat_refcount_cache[SPA_FEATURES];
+#ifdef illumos
+	cyclic_id_t	spa_deadman_cycid;	/* cyclic id */
+#endif	/* illumos */
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+	struct callout	spa_deadman_cycid;	/* callout id */
+	struct task	spa_deadman_task;
+#endif
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
+#ifdef _HARDKERNEL
+	struct callout	spa_deadman_cycid;	/* callout id */
+	struct work	spa_deadman_work;
+#endif
+#endif /* __NetBSD__ */
+	uint64_t	spa_deadman_calls;	/* number of deadman calls */
+	hrtime_t	spa_sync_starttime;	/* starting time of spa_sync */
+	uint64_t	spa_deadman_synctime;	/* deadman expiration timer */
+	uint64_t	spa_all_vdev_zaps;	/* ZAP of per-vd ZAP obj #s */
+	spa_avz_action_t	spa_avz_action;	/* destroy/rebuild AVZ? */
 
+#ifdef illumos
 	/*
-	 * spa_refcnt & spa_config_lock must be the last elements
+	 * spa_iokstat_lock protects spa_iokstat and
+	 * spa_queue_stats[].
+	 */
+	kmutex_t	spa_iokstat_lock;
+	struct kstat	*spa_iokstat;		/* kstat of io to this pool */
+	struct {
+		int spa_active;
+		int spa_queued;
+	} spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE];
+#endif
+	hrtime_t	spa_ccw_fail_time;	/* Conf cache write fail time */
+
+	/*
+	 * spa_refcount & spa_config_lock must be the last elements
 	 * because refcount_t changes size based on compilation options.
 	 * In order for the MDB module to function correctly, the other
 	 * fields must remain in the same location.
 	 */
 	spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
 	refcount_t	spa_refcount;		/* number of opens */
+#ifndef illumos
+	boolean_t	spa_splitting_newspa;	/* creating new spa in split */
+#endif
 };
 
 extern const char *spa_config_path;
 
+extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
+
 #ifdef	__cplusplus
 }
 #endif
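
The spa_taskqs_t change above replaces the single taskq_t per (zio type,
taskq type) pair with an array of stqs_count task queues, so that hot
pipeline stages (e.g. write issue) can be spread over several queues. A
minimal userland sketch of the dispatch-side selection follows; the
pick-by-time rule is an assumption for illustration, not necessarily what
spa_taskq_dispatch_ent() actually does:

#include <stdio.h>
#include <time.h>

#define	STQS_COUNT	4	/* e.g. several write-issue taskqs */

static unsigned
stqs_select(unsigned count)
{
	struct timespec ts;

	/* count == 1 is the common case; skip the clock read entirely. */
	if (count == 1)
		return 0;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned)(ts.tv_nsec % count);
}

int
main(void)
{
	for (int i = 0; i < 8; i++)
		printf("dispatch -> taskq %u of %u\n",
		    stqs_select(STQS_COUNT), STQS_COUNT);
	return 0;
}
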
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_map.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_map.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 space_map.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_map.h	27 Feb 2010 22:31:42 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_map.h	1 Nov 2014 10:04:53 -0000
@@ -23,63 +23,71 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
 #ifndef _SYS_SPACE_MAP_H
 #define	_SYS_SPACE_MAP_H
 
 #include <sys/avl.h>
+#include <sys/range_tree.h>
 #include <sys/dmu.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
-typedef struct space_map_ops space_map_ops_t;
+/*
+ * The size of the space map object has increased to include a histogram.
+ * The SPACE_MAP_SIZE_V0 designates the original size and is used to
+ * maintain backward compatibility.
+ */
+#define	SPACE_MAP_SIZE_V0	(3 * sizeof (uint64_t))
+#define	SPACE_MAP_HISTOGRAM_SIZE	32
+
+/*
+ * The space_map_phys is the on-disk representation of the space map.
+ * Consumers of space maps should never reference any of the members of this
+ * structure directly. These members may only be updated in syncing context.
+ *
+ * Note the smp_object is no longer used but remains in the structure
+ * for backward compatibility.
+ */
+typedef struct space_map_phys {
+	uint64_t	smp_object;	/* on-disk space map object */
+	uint64_t	smp_objsize;	/* size of the object */
+	uint64_t	smp_alloc;	/* space allocated from the map */
+	uint64_t	smp_pad[5];	/* reserved */
+
+	/*
+	 * The smp_histogram maintains a histogram of free regions. Each
+	 * bucket, smp_histogram[i], contains the number of free regions
+	 * whose size is:
+	 * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
+	 */
+	uint64_t	smp_histogram[SPACE_MAP_HISTOGRAM_SIZE];
+} space_map_phys_t;
 
+/*
+ * The space map object defines a region of space, its size, how much is
+ * allocated, and the on-disk object that stores this information.
+ * Consumers of space maps may only access the members of this structure.
+ */
 typedef struct space_map {
-	avl_tree_t	sm_root;	/* AVL tree of map segments */
-	uint64_t	sm_space;	/* sum of all segments in the map */
 	uint64_t	sm_start;	/* start of map */
 	uint64_t	sm_size;	/* size of map */
 	uint8_t		sm_shift;	/* unit shift */
-	uint8_t		sm_pad[3];	/* unused */
-	uint8_t		sm_loaded;	/* map loaded? */
-	uint8_t		sm_loading;	/* map loading? */
-	kcondvar_t	sm_load_cv;	/* map load completion */
-	space_map_ops_t	*sm_ops;	/* space map block picker ops vector */
-	avl_tree_t	*sm_pp_root;	/* picker-private AVL tree */
-	void		*sm_ppd;	/* picker-private data */
+	uint64_t	sm_length;	/* synced length */
+	uint64_t	sm_alloc;	/* synced space allocated */
+	objset_t	*sm_os;		/* objset for this map */
+	uint64_t	sm_object;	/* object id for this map */
+	uint32_t	sm_blksz;	/* block size for space map */
+	dmu_buf_t	*sm_dbuf;	/* space_map_phys_t dbuf */
+	space_map_phys_t *sm_phys;	/* on-disk space map */
 	kmutex_t	*sm_lock;	/* pointer to lock that protects map */
 } space_map_t;
 
-typedef struct space_seg {
-	avl_node_t	ss_node;	/* AVL node */
-	avl_node_t	ss_pp_node;	/* AVL picker-private node */
-	uint64_t	ss_start;	/* starting offset of this segment */
-	uint64_t	ss_end;		/* ending offset (non-inclusive) */
-} space_seg_t;
-
-typedef struct space_ref {
-	avl_node_t	sr_node;	/* AVL node */
-	uint64_t	sr_offset;	/* offset (start or end) */
-	int64_t		sr_refcnt;	/* associated reference count */
-} space_ref_t;
-
-typedef struct space_map_obj {
-	uint64_t	smo_object;	/* on-disk space map object */
-	uint64_t	smo_objsize;	/* size of the object */
-	uint64_t	smo_alloc;	/* space allocated from the map */
-} space_map_obj_t;
-
-struct space_map_ops {
-	void	(*smop_load)(space_map_t *sm);
-	void	(*smop_unload)(space_map_t *sm);
-	uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size);
-	void	(*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
-	void	(*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
-	uint64_t (*smop_max)(space_map_t *sm);
-	boolean_t (*smop_fragmented)(space_map_t *sm);
-};
-
 /*
  * debug entry
  *
@@ -90,7 +98,6 @@ struct space_map_ops {
  *   63  62    60 59        50 49                               0
  *
  *
- *
  * non-debug entry
  *
  *    1               47                   1           15
@@ -121,56 +128,34 @@ struct space_map_ops {
 
 #define	SM_RUN_MAX			SM_RUN_DECODE(~0ULL)
 
-#define	SM_ALLOC	0x0
-#define	SM_FREE		0x1
-
-/*
- * The data for a given space map can be kept on blocks of any size.
- * Larger blocks entail fewer i/o operations, but they also cause the
- * DMU to keep more data in-core, and also to waste more i/o bandwidth
- * when only a few blocks have changed since the last transaction group.
- * This could use a lot more research, but for now, set the freelist
- * block size to 4k (2^12).
- */
-#define	SPACE_MAP_BLOCKSHIFT	12
-
-typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
+typedef enum {
+	SM_ALLOC,
+	SM_FREE
+} maptype_t;
+
+int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
+
+void space_map_histogram_clear(space_map_t *sm);
+void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
+    dmu_tx_t *tx);
+
+void space_map_update(space_map_t *sm);
+
+uint64_t space_map_object(space_map_t *sm);
+uint64_t space_map_allocated(space_map_t *sm);
+uint64_t space_map_length(space_map_t *sm);
+
+void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+    dmu_tx_t *tx);
+void space_map_truncate(space_map_t *sm, dmu_tx_t *tx);
+uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx);
+void space_map_free(space_map_t *sm, dmu_tx_t *tx);
+
+int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
+    uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp);
+void space_map_close(space_map_t *sm);
 
-extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
-    uint8_t shift, kmutex_t *lp);
-extern void space_map_destroy(space_map_t *sm);
-extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
-extern boolean_t space_map_contains(space_map_t *sm,
-    uint64_t start, uint64_t size);
-extern void space_map_vacate(space_map_t *sm,
-    space_map_func_t *func, space_map_t *mdest);
-extern void space_map_walk(space_map_t *sm,
-    space_map_func_t *func, space_map_t *mdest);
-
-extern void space_map_load_wait(space_map_t *sm);
-extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
-    uint8_t maptype, space_map_obj_t *smo, objset_t *os);
-extern void space_map_unload(space_map_t *sm);
-
-extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
-extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
-extern uint64_t space_map_maxsize(space_map_t *sm);
-
-extern void space_map_sync(space_map_t *sm, uint8_t maptype,
-    space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);
-extern void space_map_truncate(space_map_obj_t *smo,
-    objset_t *os, dmu_tx_t *tx);
-
-extern void space_map_ref_create(avl_tree_t *t);
-extern void space_map_ref_destroy(avl_tree_t *t);
-extern void space_map_ref_add_seg(avl_tree_t *t,
-    uint64_t start, uint64_t end, int64_t refcnt);
-extern void space_map_ref_add_map(avl_tree_t *t,
-    space_map_t *sm, int64_t refcnt);
-extern void space_map_ref_generate_map(avl_tree_t *t,
-    space_map_t *sm, int64_t minref);
+int64_t space_map_alloc_delta(space_map_t *sm);
 
 #ifdef	__cplusplus
 }
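
The smp_histogram comment above pins down the bucketing rule: a free region
of `size' bytes lands in bucket i when 2^(i+sm_shift) <= size <
2^(i+sm_shift+1), i.e. i = highbit64(size) - 1 - sm_shift. A runnable
sketch, with highbit64() re-implemented as a stand-in for the kernel helper
of the same name (real callers presumably also clamp i to the 32-entry
array):

#include <stdint.h>
#include <stdio.h>

static int
highbit64(uint64_t x)	/* 1-based index of the highest set bit; 0 if x == 0 */
{
	int h = 0;

	while (x != 0) {
		h++;
		x >>= 1;
	}
	return h;
}

int
main(void)
{
	uint8_t sm_shift = 9;	/* e.g. a 512-byte allocation quantum */
	uint64_t sizes[] = { 512, 4096, 5000, 1ULL << 20 };

	for (int i = 0; i < 4; i++) {
		int bucket = highbit64(sizes[i]) - 1 - sm_shift;
		printf("size %llu -> smp_histogram[%d]\n",
		    (unsigned long long)sizes[i], bucket);
	}
	return 0;
}
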
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_reftree.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_reftree.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_reftree.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_reftree.h	11 Dec 2013 04:45:55 -0000
@@ -0,0 +1,57 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_SPACE_REFTREE_H
+#define	_SYS_SPACE_REFTREE_H
+
+#include <sys/range_tree.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct space_ref {
+	avl_node_t	sr_node;	/* AVL node */
+	uint64_t	sr_offset;	/* range offset (start or end) */
+	int64_t		sr_refcnt;	/* associated reference count */
+} space_ref_t;
+
+void space_reftree_create(avl_tree_t *t);
+void space_reftree_destroy(avl_tree_t *t);
+void space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
+    int64_t refcnt);
+void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt);
+void space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt,
+    int64_t minref);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_SPACE_REFTREE_H */
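
The reference-tree idea behind the space_reftree_*() API: each segment
contributes +refcnt at its start offset and -refcnt at its end offset, and
walking the offsets in order with a running count yields exactly the ranges
covered by at least `minref' segments (minref = 1 gives the union, minref =
number of maps gives the intersection). A userland sketch of that sweep,
with the AVL tree replaced by a small sorted array (tie-breaking of equal
offsets is glossed over here):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct space_ref {
	uint64_t	sr_offset;
	int64_t		sr_refcnt;	/* +n at a start, -n at an end */
} space_ref_t;

static int
ref_cmp(const void *a, const void *b)
{
	const space_ref_t *ra = a, *rb = b;

	return (ra->sr_offset > rb->sr_offset) -
	    (ra->sr_offset < rb->sr_offset);
}

int
main(void)
{
	/* Two overlapping segments: [0, 100) and [50, 150). */
	space_ref_t refs[] = {
		{ 0, +1 }, { 100, -1 },
		{ 50, +1 }, { 150, -1 },
	};
	int64_t refcnt = 0, minref = 2;
	uint64_t start = 0;

	qsort(refs, 4, sizeof (refs[0]), ref_cmp);
	for (int i = 0; i < 4; i++) {
		if (refcnt < minref && refcnt + refs[i].sr_refcnt >= minref)
			start = refs[i].sr_offset;	/* range opens */
		else if (refcnt >= minref && refcnt + refs[i].sr_refcnt < minref)
			printf("[%llu, %llu) has >= %lld refs\n",
			    (unsigned long long)start,
			    (unsigned long long)refs[i].sr_offset,
			    (long long)minref);
		refcnt += refs[i].sr_refcnt;
	}
	return 0;
}
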
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/trim_map.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/trim_map.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/trim_map.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/trim_map.h	23 Mar 2013 15:29:18 -0000
@@ -0,0 +1,51 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ */
+
+#ifndef _SYS_TRIM_MAP_H
+#define	_SYS_TRIM_MAP_H
+
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/spa.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+extern void trim_map_create(vdev_t *vd);
+extern void trim_map_destroy(vdev_t *vd);
+extern void trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg);
+extern boolean_t trim_map_write_start(zio_t *zio);
+extern void trim_map_write_done(zio_t *zio);
+
+extern void trim_thread_create(spa_t *spa);
+extern void trim_thread_destroy(spa_t *spa);
+extern void trim_thread_wakeup(spa_t *spa);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_TRIM_MAP_H */
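
A hedged sketch of the call pattern these declarations imply (the actual
consumers are the leaf vdev drivers, which are not part of this header):
writes are gated through the trim map so they stay ordered against pending
TRIMs, and completed writes are reported back. This fragment is
illustrative only; the deferral semantics of trim_map_write_start() are an
assumption here.

static void
example_vdev_io_start(zio_t *zio)
{
	/*
	 * If the write overlaps a TRIM still in flight, assume the zio
	 * is queued internally and reissued later.
	 */
	if (zio->io_type == ZIO_TYPE_WRITE && !trim_map_write_start(zio))
		return;
	/* ... issue the I/O to the device ... */
}

static void
example_vdev_io_done(zio_t *zio)
{
	if (zio->io_type == ZIO_TYPE_WRITE)
		trim_map_write_done(zio);
}
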
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 txg.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg.h	27 Feb 2010 22:31:42 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg.h	31 Jul 2014 04:37:13 -0000
@@ -19,9 +19,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
 
 #ifndef _SYS_TXG_H
 #define	_SYS_TXG_H
@@ -42,9 +45,6 @@ extern "C" {
 /* Number of txgs worth of frees we defer adding to in-core spacemaps */
 #define	TXG_DEFER_SIZE		2
 
-#define	TXG_WAIT		1ULL
-#define	TXG_NOWAIT		2ULL
-
 typedef struct tx_cpu tx_cpu_t;
 
 typedef struct txg_handle {
@@ -74,13 +74,9 @@ extern void txg_rele_to_quiesce(txg_hand
 extern void txg_rele_to_sync(txg_handle_t *txghp);
 extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
 
-/*
- * Delay the caller by the specified number of ticks or until
- * the txg closes (whichever comes first).  This is intended
- * to be used to throttle writers when the system nears its
- * capacity.
- */
-extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks);
+extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta,
+    hrtime_t resolution);
+extern void txg_kick(struct dsl_pool *dp);
 
 /*
  * Wait until the given transaction group has finished syncing.
@@ -115,11 +111,13 @@ extern boolean_t txg_sync_waiting(struct
 
 extern void txg_list_create(txg_list_t *tl, size_t offset);
 extern void txg_list_destroy(txg_list_t *tl);
-extern int txg_list_empty(txg_list_t *tl, uint64_t txg);
-extern int txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
+extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg);
+extern boolean_t txg_all_lists_empty(txg_list_t *tl);
+extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
+extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg);
 extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
 extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg);
-extern int txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
+extern boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg);
 extern void *txg_list_head(txg_list_t *tl, uint64_t txg);
 extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg);
 
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg_impl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 txg_impl.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg_impl.h	27 Feb 2010 22:31:42 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg_impl.h	28 Nov 2013 16:38:42 -0000
@@ -18,11 +18,16 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
 #ifndef _SYS_TXG_IMPL_H
 #define	_SYS_TXG_IMPL_H
 
@@ -33,22 +38,66 @@
 extern "C" {
 #endif
 
+/*
+ * The tx_cpu structure is a per-cpu structure that is used to track
+ * the number of active transaction holds (tc_count). As transactions
+ * are assigned into a transaction group, the appropriate tc_count is
+ * incremented to indicate that there are pending changes that have yet
+ * to quiesce. Consumers eventually call txg_rele_to_sync() to decrement
+ * the tc_count. A transaction group is not considered quiesced until all
+ * tx_cpu structures have reached a tc_count of zero.
+ *
+ * This structure is a per-cpu structure by design. Updates to this structure
+ * are frequent and concurrent. Having a single structure would result in
+ * heavy lock contention, so a per-cpu design was implemented. With the
+ * fanned-out mutex design, consumers only need to lock the mutex associated
+ * with the thread's cpu.
+ *
+ * The tx_cpu contains two locks, the tc_lock and tc_open_lock.
+ * The tc_lock is used to protect all members of the tx_cpu structure with
+ * the exception of the tc_open_lock. This lock should only be held for a
+ * short period of time, typically when updating the value of tc_count.
+ *
+ * The tc_open_lock protects the tx_open_txg member of the tx_state structure.
+ * This lock is used to ensure that transactions are only assigned into
+ * the current open transaction group. In order to move the current open
+ * transaction group to the quiesce phase, the txg_quiesce thread must
+ * grab all tc_open_locks, increment the tx_open_txg, and drop the locks.
+ * The tc_open_lock is held until the transaction is assigned into the
+ * transaction group. Typically, this is a short operation but if throttling
+ * is occurring, it may be held for longer periods of time.
+ */
 struct tx_cpu {
-	kmutex_t	tc_lock;
+	kmutex_t	tc_open_lock;	/* protects tx_open_txg */
+	kmutex_t	tc_lock;	/* protects the rest of this struct */
 	kcondvar_t	tc_cv[TXG_SIZE];
-	uint64_t	tc_count[TXG_SIZE];
+	uint64_t	tc_count[TXG_SIZE];	/* tx hold count on each txg */
 	list_t		tc_callbacks[TXG_SIZE]; /* commit cb list */
-	char		tc_pad[16];
+	char		tc_pad[8];		/* pad to fill 3 cache lines */
 };
 
+/*
+ * The tx_state structure maintains the state information about the different
+ * stages of the pool's transaction groups. A per-pool tx_state structure
+ * is used to track this information. The tx_state structure also points to
+ * an array of tx_cpu structures (described above). Although the tx_sync_lock
+ * is used to protect the members of this structure, it is not used to
+ * protect the tx_open_txg. Instead a special lock in the tx_cpu structure
+ * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock.
+ * Any thread wishing to update tx_open_txg must grab the tc_open_lock on
+ * every cpu (see txg_quiesce()).
+ */
 typedef struct tx_state {
-	tx_cpu_t	*tx_cpu;	/* protects right to enter txg	*/
-	kmutex_t	tx_sync_lock;	/* protects tx_state_t */
+	tx_cpu_t	*tx_cpu;	/* protects access to tx_open_txg */
+	kmutex_t	tx_sync_lock;	/* protects the rest of this struct */
+
 	uint64_t	tx_open_txg;	/* currently open txg id */
 	uint64_t	tx_quiesced_txg; /* quiesced txg waiting for sync */
 	uint64_t	tx_syncing_txg;	/* currently syncing txg id */
 	uint64_t	tx_synced_txg;	/* last synced txg id */
 
+	hrtime_t	tx_open_time;	/* start time of tx_open_txg */
+
 	uint64_t	tx_sync_txg_waiting; /* txg we're waiting to sync */
 	uint64_t	tx_quiesce_txg_waiting; /* txg we're waiting to open */
 
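
The tc_open_lock protocol documented above fits in a few lines of pthreads:
transactions take only their own CPU's open lock to join the open txg,
while the quiesce step takes every open lock, advances tx_open_txg, and
drops them all. NCPU and the bare globals below are illustrative stand-ins
for the kernel structures, not the real types:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define	NCPU	4

static pthread_mutex_t tc_open_lock[NCPU];	/* one per "cpu" */
static uint64_t tx_open_txg = 4;

static uint64_t
txg_hold_open(int cpu)
{
	uint64_t txg;

	pthread_mutex_lock(&tc_open_lock[cpu]);
	txg = tx_open_txg;	/* assigned into the open txg */
	/* ... a real tx_cpu would bump tc_count[txg & TXG_MASK] here ... */
	pthread_mutex_unlock(&tc_open_lock[cpu]);
	return txg;
}

static void
txg_quiesce_advance(void)
{
	/* Block new assignments on every cpu, then open the next txg. */
	for (int c = 0; c < NCPU; c++)
		pthread_mutex_lock(&tc_open_lock[c]);
	tx_open_txg++;
	for (int c = 0; c < NCPU; c++)
		pthread_mutex_unlock(&tc_open_lock[c]);
}

int
main(void)
{
	for (int c = 0; c < NCPU; c++)
		pthread_mutex_init(&tc_open_lock[c], NULL);
	printf("assigned to txg %llu\n",
	    (unsigned long long)txg_hold_open(1));
	txg_quiesce_advance();
	printf("assigned to txg %llu\n",
	    (unsigned long long)txg_hold_open(2));
	return 0;
}
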
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 uberblock.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock.h	27 Feb 2010 22:31:42 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock.h	4 Feb 2015 07:24:13 -0000
@@ -22,6 +22,9 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
 
 #ifndef _SYS_UBERBLOCK_H
 #define	_SYS_UBERBLOCK_H
@@ -36,8 +39,8 @@ extern "C" {
 
 typedef struct uberblock uberblock_t;
 
-extern int uberblock_verify(uberblock_t *ub);
-extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg);
+extern int uberblock_verify(uberblock_t *);
+extern boolean_t uberblock_update(uberblock_t *, vdev_t *, uint64_t);
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock_impl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 uberblock_impl.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock_impl.h	27 Feb 2010 22:31:42 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock_impl.h	12 Jun 2012 05:57:27 -0000
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_UBERBLOCK_IMPL_H
@@ -52,6 +51,9 @@ struct uberblock {
 	uint64_t	ub_guid_sum;	/* sum of all vdev guids	*/
 	uint64_t	ub_timestamp;	/* UTC time of last sync	*/
 	blkptr_t	ub_rootbp;	/* MOS objset_phys_t		*/
+
+	/* highest SPA_VERSION supported by software that wrote this txg */
+	uint64_t	ub_software_version;
 };
 
 #ifdef	__cplusplus
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/unique.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/unique.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 unique.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/unique.h	7 Aug 2009 18:33:41 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/unique.h	19 Jun 2013 13:06:07 -0000
@@ -26,8 +26,6 @@
 #ifndef	_SYS_UNIQUE_H
 #define	_SYS_UNIQUE_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/zfs_context.h>
 
 #ifdef	__cplusplus
@@ -42,7 +40,7 @@ void unique_fini(void);
 
 /*
  * Return a new unique value (which will not be uniquified against until
- * it is unique_insert()-ed.
+ * it is unique_insert()-ed).
  */
 uint64_t unique_create(void);
 
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 vdev.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev.h	27 Feb 2010 22:31:42 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev.h	3 Dec 2016 17:03:48 -0000
@@ -18,9 +18,10 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_H
@@ -45,11 +46,12 @@ typedef enum vdev_dtl_type {
 } vdev_dtl_type_t;
 
 extern boolean_t zfs_nocacheflush;
+extern boolean_t zfs_trim_enabled;
 
 extern int vdev_open(vdev_t *);
 extern void vdev_open_children(vdev_t *);
 extern boolean_t vdev_uses_zvols(vdev_t *);
-extern int vdev_validate(vdev_t *);
+extern int vdev_validate(vdev_t *, boolean_t);
 extern void vdev_close(vdev_t *);
 extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
 extern void vdev_reopen(vdev_t *);
@@ -59,6 +61,7 @@ extern zio_t *vdev_probe(vdev_t *vd, zio
 extern boolean_t vdev_is_bootable(vdev_t *vd);
 extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
 extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
+extern int vdev_count_leaves(spa_t *spa);
 extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
     uint64_t txg, uint64_t size);
 extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
@@ -69,19 +72,27 @@ extern void vdev_dtl_reassess(vdev_t *vd
 extern boolean_t vdev_dtl_required(vdev_t *vd);
 extern boolean_t vdev_resilver_needed(vdev_t *vd,
     uint64_t *minp, uint64_t *maxp);
+extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj,
+    dmu_tx_t *tx);
+extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
+extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
+
+extern void vdev_hold(vdev_t *);
+extern void vdev_rele(vdev_t *);
 
 extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
 extern void vdev_metaslab_fini(vdev_t *vd);
 extern void vdev_metaslab_set_size(vdev_t *);
+extern void vdev_ashift_optimize(vdev_t *);
 extern void vdev_expand(vdev_t *vd, uint64_t txg);
 extern void vdev_split(vdev_t *vd);
+extern void vdev_deadman(vdev_t *vd);
 
 
 extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
 extern void vdev_clear_stats(vdev_t *vd);
 extern void vdev_stat_update(zio_t *zio, uint64_t psize);
-extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
-    boolean_t complete);
+extern void vdev_scan_stat_init(vdev_t *vd);
 extern void vdev_propagate_state(vdev_t *vd);
 extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
     vdev_aux_t aux);
@@ -106,7 +117,7 @@ extern boolean_t vdev_accessible(vdev_t 
 
 extern void vdev_cache_init(vdev_t *vd);
 extern void vdev_cache_fini(vdev_t *vd);
-extern int vdev_cache_read(zio_t *zio);
+extern boolean_t vdev_cache_read(zio_t *zio);
 extern void vdev_cache_write(zio_t *zio);
 extern void vdev_cache_purge(vdev_t *vd);
 
@@ -114,18 +125,27 @@ extern void vdev_queue_init(vdev_t *vd);
 extern void vdev_queue_fini(vdev_t *vd);
 extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
+extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);
 
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);
-extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
-    boolean_t);
+extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
 
 extern void vdev_state_dirty(vdev_t *vd);
 extern void vdev_state_clean(vdev_t *vd);
 
+typedef enum vdev_config_flag {
+	VDEV_CONFIG_SPARE = 1 << 0,
+	VDEV_CONFIG_L2CACHE = 1 << 1,
+	VDEV_CONFIG_REMOVING = 1 << 2,
+	VDEV_CONFIG_MOS = 1 << 3
+} vdev_config_flag_t;
+
 extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
 extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
-    boolean_t getstats, boolean_t isspare, boolean_t isl2cache);
+    boolean_t getstats, vdev_config_flag_t flags);
 
 /*
  * Label routines
@@ -133,8 +153,8 @@ extern nvlist_t *vdev_config_generate(sp
 struct uberblock;
 extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
 extern int vdev_label_number(uint64_t psise, uint64_t offset);
-extern nvlist_t *vdev_label_read_config(vdev_t *vd);
-extern void vdev_uberblock_load(zio_t *zio, vdev_t *vd, struct uberblock *ub);
+extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
+extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
 
 typedef enum {
 	VDEV_LABEL_CREATE,	/* create/add a new device */
@@ -147,6 +167,8 @@ typedef enum {
 
 extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
 
+extern int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size);
+
 #ifdef	__cplusplus
 }
 #endif
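
The vdev_config_generate() change collapses the separate isspare/isl2cache
booleans into the vdev_config_flag_t bitmask above, which also gains
REMOVING and MOS cases without further signature churn. Hypothetical
before/after call sites (not taken from the patch):

	/* Before: one boolean_t parameter per special case. */
	config = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);

	/* After: orthogonal flags that can be combined freely. */
	config = vdev_config_generate(spa, vd, B_TRUE,
	    VDEV_CONFIG_L2CACHE | VDEV_CONFIG_REMOVING);
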
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_disk.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_disk.h,v
retrieving revision 1.2
diff -u -p -r1.2 vdev_disk.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_disk.h	10 Aug 2009 22:38:02 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_disk.h	10 Jun 2017 05:31:45 -0000
@@ -21,17 +21,16 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright (c) 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2012 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #ifndef _SYS_VDEV_DISK_H
 #define	_SYS_VDEV_DISK_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/vdev.h>
 #ifdef _KERNEL
 #include <sys/buf.h>
-#include <sys/ddi.h>
 #include <sys/sunldi.h>
 #include <sys/sunddi.h>
 #endif
@@ -40,14 +39,32 @@
 extern "C" {
 #endif
 
+#ifdef _KERNEL
 typedef struct vdev_disk {
+#ifdef illumos
+	ddi_devid_t	vd_devid;
+	char		*vd_minor;
+	ldi_handle_t	vd_lh;
+	list_t		vd_ldi_cbs;
+	boolean_t	vd_ldi_offline;
+#endif
+#ifdef __NetBSD__
 	char            *vd_minor;
-	vnode_t         *vd_vn;
+	vnode_t         *vd_vp;
 	struct workqueue *vd_wq;
+#endif
 } vdev_disk_t;
+#endif
+
+extern int vdev_disk_physio(vdev_t *,
+    caddr_t, size_t, uint64_t, int, boolean_t);
 
+/*
+ * Since vdev_disk.c is not compiled into libzpool, this function should only be
+ * defined in the zfs kernel module.
+ */
 #ifdef _KERNEL
-extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
+extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
 #endif
 #ifdef	__cplusplus
 }
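
The reworked vdev_disk.h declares vdev_disk_physio() unconditionally while
keeping vdev_disk_ldi_physio() kernel-only, per the comment above, because
vdev_disk.c is absent from libzpool. A minimal sketch of how a consumer
might honor that split; the helper name and the B_READ flag value are
assumptions, not part of this header:

	#include <sys/vdev_disk.h>

	static int
	read_from_leaf(vdev_t *vd, caddr_t buf, size_t len, uint64_t off)
	{
	#ifdef _KERNEL
		/* Kernel build: the vdev-level wrapper declared above. */
		return (vdev_disk_physio(vd, buf, len, off, B_READ, B_FALSE));
	#else
		/* libzpool build: no vdev_disk.c to back the call. */
		return (ENOTSUP);
	#endif
	}
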
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_impl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 vdev_impl.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_impl.h	27 Feb 2010 22:31:42 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_impl.h	18 Apr 2017 00:03:28 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_IMPL_H
@@ -53,15 +53,21 @@ typedef struct vdev_queue vdev_queue_t;
 typedef struct vdev_cache vdev_cache_t;
 typedef struct vdev_cache_entry vdev_cache_entry_t;
 
+extern int zfs_vdev_queue_depth_pct;
+extern uint32_t zfs_vdev_async_write_max_active;
+
 /*
  * Virtual device operations
  */
-typedef int	vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *ashift);
+typedef int	vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
+    uint64_t *logical_ashift, uint64_t *physical_ashift);
 typedef void	vdev_close_func_t(vdev_t *vd);
 typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
-typedef int	vdev_io_start_func_t(zio_t *zio);
+typedef void	vdev_io_start_func_t(zio_t *zio);
 typedef void	vdev_io_done_func_t(zio_t *zio);
 typedef void	vdev_state_change_func_t(vdev_t *vd, int, int);
+typedef void	vdev_hold_func_t(vdev_t *vd);
+typedef void	vdev_rele_func_t(vdev_t *vd);
 
 typedef struct vdev_ops {
 	vdev_open_func_t		*vdev_op_open;
@@ -70,6 +76,8 @@ typedef struct vdev_ops {
 	vdev_io_start_func_t		*vdev_op_io_start;
 	vdev_io_done_func_t		*vdev_op_io_done;
 	vdev_state_change_func_t	*vdev_op_state_change;
+	vdev_hold_func_t		*vdev_op_hold;
+	vdev_rele_func_t		*vdev_op_rele;
 	char				vdev_op_type[16];
 	boolean_t			vdev_op_leaf;
 } vdev_ops_t;
@@ -94,12 +102,26 @@ struct vdev_cache {
 	kmutex_t	vc_lock;
 };
 
+typedef struct vdev_queue_class {
+	uint32_t	vqc_active;
+
+	/*
+	 * Sorted by offset or timestamp, depending on whether the queue
+	 * is LBA-ordered or FIFO.
+	 */
+	avl_tree_t	vqc_queued_tree;
+} vdev_queue_class_t;
+
 struct vdev_queue {
-	avl_tree_t	vq_deadline_tree;
-	avl_tree_t	vq_read_tree;
-	avl_tree_t	vq_write_tree;
-	avl_tree_t	vq_pending_tree;
+	vdev_t		*vq_vdev;
+	vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
+	avl_tree_t	vq_active_tree;
+	avl_tree_t	vq_read_offset_tree;
+	avl_tree_t	vq_write_offset_tree;
+	uint64_t	vq_last_offset;
+	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	kmutex_t	vq_lock;
+	uint64_t	vq_lastoffset;
 };
 
 /*
@@ -115,17 +137,37 @@ struct vdev {
 	uint64_t	vdev_orig_guid;	/* orig. guid prior to remove	*/
 	uint64_t	vdev_asize;	/* allocatable device capacity	*/
 	uint64_t	vdev_min_asize;	/* min acceptable asize		*/
+	uint64_t	vdev_max_asize;	/* max acceptable asize		*/
 	uint64_t	vdev_ashift;	/* block alignment shift	*/
+	/*
+	 * Logical block alignment shift
+	 *
+	 * The smallest sized/aligned I/O supported by the device.
+	 */
+	uint64_t        vdev_logical_ashift;
+	/*
+	 * Physical block alignment shift
+	 *
+	 * The device supports logical I/Os with vdev_logical_ashift
+	 * size/alignment, but optimum performance will be achieved by
+	 * aligning/sizing requests to vdev_physical_ashift.  Smaller
+	 * requests may be inflated or incur device-level read-modify-write
+	 * operations.
+	 *
+	 * May be 0 to indicate no preference (i.e. use vdev_logical_ashift).
+	 */
+	uint64_t        vdev_physical_ashift;
 	uint64_t	vdev_state;	/* see VDEV_STATE_* #defines	*/
 	uint64_t	vdev_prevstate;	/* used when reopening a vdev	*/
 	vdev_ops_t	*vdev_ops;	/* vdev operations		*/
 	spa_t		*vdev_spa;	/* spa for this vdev		*/
 	void		*vdev_tsd;	/* type-specific data		*/
+	vnode_t		*vdev_name_vp;	/* vnode for pathname		*/
+	vnode_t		*vdev_devid_vp;	/* vnode for devid		*/
 	vdev_t		*vdev_top;	/* top-level vdev		*/
 	vdev_t		*vdev_parent;	/* parent vdev			*/
 	vdev_t		**vdev_child;	/* array of children		*/
 	uint64_t	vdev_children;	/* number of children		*/
-	space_map_t	vdev_dtl[DTL_TYPES]; /* in-core dirty time logs	*/
 	vdev_stat_t	vdev_stat;	/* virtual device statistics	*/
 	boolean_t	vdev_expanding;	/* expand the vdev?		*/
 	boolean_t	vdev_reopening;	/* reopen in progress?		*/
@@ -146,24 +188,39 @@ struct vdev {
 	txg_node_t	vdev_txg_node;	/* per-txg dirty vdev linkage	*/
 	boolean_t	vdev_remove_wanted; /* async remove wanted?	*/
 	boolean_t	vdev_probe_wanted; /* async probe wanted?	*/
-	boolean_t	vdev_removing;	/* device is being removed?	*/
 	list_node_t	vdev_config_dirty_node; /* config dirty list	*/
 	list_node_t	vdev_state_dirty_node; /* state dirty list	*/
 	uint64_t	vdev_deflate_ratio; /* deflation ratio (x512)	*/
 	uint64_t	vdev_islog;	/* is an intent log device	*/
-	uint64_t	vdev_ishole;	/* is a hole in the namespace 	*/
+	uint64_t	vdev_removing;	/* device is being removed?	*/
+	boolean_t	vdev_ishole;	/* is a hole in the namespace	*/
+	kmutex_t	vdev_queue_lock; /* protects vdev_queue_depth	*/
+	uint64_t	vdev_top_zap;
+
+	/*
+	 * The queue depth parameters track how many async writes are
+	 * still pending (i.e. allocated but not yet issued to disk) per
+	 * top-level vdev (vdev_async_write_queue_depth) and the maximum
+	 * allowed (vdev_max_async_write_queue_depth).  These values only
+	 * apply to top-level vdevs.
+	 */
+	uint64_t	vdev_async_write_queue_depth;
+	uint64_t	vdev_max_async_write_queue_depth;
 
 	/*
 	 * Leaf vdev state.
 	 */
-	uint64_t	vdev_psize;	/* physical device capacity	*/
-	space_map_obj_t	vdev_dtl_smo;	/* dirty time log space map obj	*/
+	range_tree_t	*vdev_dtl[DTL_TYPES]; /* dirty time logs	*/
+	space_map_t	*vdev_dtl_sm;	/* dirty time log space map	*/
 	txg_node_t	vdev_dtl_node;	/* per-txg dirty DTL linkage	*/
+	uint64_t	vdev_dtl_object; /* DTL object			*/
+	uint64_t	vdev_psize;	/* physical device capacity	*/
 	uint64_t	vdev_wholedisk;	/* true if this is a whole disk */
 	uint64_t	vdev_offline;	/* persistent offline state	*/
 	uint64_t	vdev_faulted;	/* persistent faulted state	*/
 	uint64_t	vdev_degraded;	/* persistent degraded state	*/
 	uint64_t	vdev_removed;	/* persistent removed state	*/
+	uint64_t	vdev_resilver_txg; /* persistent resilvering state */
 	uint64_t	vdev_nparity;	/* number of parity devices for raidz */
 	char		*vdev_path;	/* vdev path (if any)		*/
 	char		*vdev_devid;	/* vdev devid (if any)		*/
@@ -171,28 +228,34 @@ struct vdev {
 	char		*vdev_fru;	/* physical FRU location	*/
 	uint64_t	vdev_not_present; /* not present during import	*/
 	uint64_t	vdev_unspare;	/* unspare when resilvering done */
-	hrtime_t	vdev_last_try;	/* last reopen time		*/
 	boolean_t	vdev_nowritecache; /* true if flushwritecache failed */
+	boolean_t	vdev_notrim;	/* true if trim failed */
 	boolean_t	vdev_checkremove; /* temporary online test	*/
 	boolean_t	vdev_forcefault; /* force online fault		*/
 	boolean_t	vdev_splitting;	/* split or repair in progress  */
-	uint8_t		vdev_tmpoffline; /* device taken offline temporarily? */
-	uint8_t		vdev_detached;	/* device detached?		*/
-	uint8_t		vdev_cant_read;	/* vdev is failing all reads	*/
-	uint8_t		vdev_cant_write; /* vdev is failing all writes	*/
-	uint64_t	vdev_isspare;	/* was a hot spare		*/
-	uint64_t	vdev_isl2cache;	/* was a l2cache device		*/
+	boolean_t	vdev_delayed_close; /* delayed device close?	*/
+	boolean_t	vdev_tmpoffline; /* device taken offline temporarily? */
+	boolean_t	vdev_detached;	/* device detached?		*/
+	boolean_t	vdev_cant_read;	/* vdev is failing all reads	*/
+	boolean_t	vdev_cant_write; /* vdev is failing all writes	*/
+	boolean_t	vdev_isspare;	/* was a hot spare		*/
+	boolean_t	vdev_isl2cache;	/* was a l2cache device		*/
 	vdev_queue_t	vdev_queue;	/* I/O deadline schedule queue	*/
 	vdev_cache_t	vdev_cache;	/* physical block cache		*/
-	spa_aux_vdev_t	*vdev_aux;	/* for l2cache vdevs		*/
+	spa_aux_vdev_t	*vdev_aux;	/* for l2cache and spares vdevs	*/
 	zio_t		*vdev_probe_zio; /* root of current probe	*/
 	vdev_aux_t	vdev_label_aux;	/* on-disk aux state		*/
+	struct trim_map	*vdev_trimmap;	/* map of outstanding trims	*/
+	uint16_t	vdev_rotation_rate; /* rotational rate of the media */
+#define	VDEV_RATE_UNKNOWN	0
+#define	VDEV_RATE_NON_ROTATING	1
+	uint64_t	vdev_leaf_zap;
 
 	/*
 	 * For DTrace to work in userland (libzpool) context, these fields must
 	 * remain at the end of the structure.  DTrace will use the kernel's
 	 * CTF definition for 'struct vdev', and since the size of a kmutex_t is
-	 * larger in userland, the offsets for the rest fields would be
+	 * larger in userland, the offsets for the rest of the fields would be
 	 * incorrect.
 	 */
 	kmutex_t	vdev_dtl_lock;	/* vdev_dtl_{map,resilver}	*/
@@ -208,8 +271,11 @@ struct vdev {
 #define	VDEV_PHYS_SIZE		(112 << 10)
 #define	VDEV_UBERBLOCK_RING	(128 << 10)
 
+/* The largest uberblock we support is 8k. */
+#define	MAX_UBERBLOCK_SHIFT (13)
 #define	VDEV_UBERBLOCK_SHIFT(vd)	\
-	MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
+	MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
+	    MAX_UBERBLOCK_SHIFT)
 #define	VDEV_UBERBLOCK_COUNT(vd)	\
 	(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
 #define	VDEV_UBERBLOCK_OFFSET(vd, n)	\
@@ -234,12 +300,13 @@ typedef struct vdev_label {
 #define	VDD_METASLAB	0x01
 #define	VDD_DTL		0x02
 
+/* Offset of embedded boot loader region on each label */
+#define	VDEV_BOOT_OFFSET	(2 * sizeof (vdev_label_t))
 /*
- * Size and offset of embedded boot loader region on each label.
+ * Size of embedded boot loader region on each label.
  * The total size of the first two labels plus the boot area is 4MB.
  */
-#define	VDEV_BOOT_OFFSET	(2 * sizeof (vdev_label_t))
-#define	VDEV_BOOT_SIZE		(7ULL << 19)			/* 3.5M	*/
+#define	VDEV_BOOT_SIZE		(7ULL << 19)			/* 3.5M */
 
 /*
  * Size of label regions at the start and end of each leaf device.
@@ -247,6 +314,7 @@ typedef struct vdev_label {
 #define	VDEV_LABEL_START_SIZE	(2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
 #define	VDEV_LABEL_END_SIZE	(2 * sizeof (vdev_label_t))
 #define	VDEV_LABELS		4
+#define	VDEV_BEST_LABEL		VDEV_LABELS
 
 #define	VDEV_ALLOC_LOAD		0
 #define	VDEV_ALLOC_ADD		1
@@ -254,6 +322,7 @@ typedef struct vdev_label {
 #define	VDEV_ALLOC_L2CACHE	3
 #define	VDEV_ALLOC_ROOTPOOL	4
 #define	VDEV_ALLOC_SPLIT	5
+#define	VDEV_ALLOC_ATTACH	6
 
 /*
  * Allocate or free a vdev
@@ -277,10 +346,13 @@ extern void vdev_remove_parent(vdev_t *c
  * vdev sync load and sync
  */
 extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
+extern boolean_t vdev_log_state_valid(vdev_t *vd);
 extern void vdev_load(vdev_t *vd);
+extern int vdev_dtl_load(vdev_t *vd);
 extern void vdev_sync(vdev_t *vd, uint64_t txg);
 extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
 extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
+extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
 
 /*
  * Available vdev types.
@@ -289,7 +361,11 @@ extern vdev_ops_t vdev_root_ops;
 extern vdev_ops_t vdev_mirror_ops;
 extern vdev_ops_t vdev_replacing_ops;
 extern vdev_ops_t vdev_raidz_ops;
+#if defined(__FreeBSD__) && defined(_KERNEL)
+extern vdev_ops_t vdev_geom_ops;
+#else
 extern vdev_ops_t vdev_disk_ops;
+#endif
 extern vdev_ops_t vdev_file_ops;
 extern vdev_ops_t vdev_missing_ops;
 extern vdev_ops_t vdev_hole_ops;
@@ -303,9 +379,21 @@ extern uint64_t vdev_get_min_asize(vdev_
 extern void vdev_set_min_asize(vdev_t *vd);
 
 /*
- * zdb uses this tunable, so it must be declared here to make lint happy.
+ * Global variables
  */
+/* zdb uses this tunable, so it must be declared here to make lint happy. */
 extern int zfs_vdev_cache_size;
+extern uint_t zfs_geom_probe_vdev_key;
+
+#ifdef illumos
+/*
+ * The vdev_buf_t is used to translate between zio_t and buf_t, and back again.
+ */
+typedef struct vdev_buf {
+	buf_t	vb_buf;		/* buffer that describes the io */
+	zio_t	*vb_io;		/* pointer back to the original zio_t */
+} vdev_buf_t;
+#endif
 
 #ifdef	__cplusplus
 }
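
The MAX_UBERBLOCK_SHIFT change above clamps the per-label uberblock size at
8 KiB: the shift still honors UBERBLOCK_SHIFT as a floor and the top-level
ashift as a preference, but a large-ashift vdev can no longer blow the ring
up to a single slot. A standalone sketch of the arithmetic; UBERBLOCK_SHIFT
is assumed to be 10 (a 1 KiB uberblock), which is not shown in this hunk:

	#include <stdio.h>

	#define	UBERBLOCK_SHIFT		10	/* assumed: 1 KiB uberblock */
	#define	MAX_UBERBLOCK_SHIFT	13	/* 8 KiB, from the hunk above */
	#define	VDEV_UBERBLOCK_RING	(128 << 10)

	#define	MAX(a, b)	((a) > (b) ? (a) : (b))
	#define	MIN(a, b)	((a) < (b) ? (a) : (b))

	int
	main(void)
	{
		/* Stand-ins for (vd)->vdev_top->vdev_ashift. */
		int ashifts[] = { 9, 12, 14, 17 };
		int n = (int)(sizeof (ashifts) / sizeof (ashifts[0]));

		for (int i = 0; i < n; i++) {
			int shift = MIN(MAX(ashifts[i], UBERBLOCK_SHIFT),
			    MAX_UBERBLOCK_SHIFT);
			/* Without the clamp, ashift 17 would give 128 KiB
			 * uberblocks and a ring of exactly one slot. */
			printf("ashift %2d -> ub shift %2d, %3d ring slots\n",
			    ashifts[i], shift, VDEV_UBERBLOCK_RING >> shift);
		}
		return (0);
	}
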
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_raidz.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_raidz.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_raidz.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_raidz.h	5 Oct 2013 12:53:07 -0000
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_RAIDZ_H
+#define	_SYS_VDEV_RAIDZ_H
+
+#include <sys/vdev.h>
+#ifdef illumos
+#include <sys/semaphore.h>
+#ifdef _KERNEL
+#include <sys/ddi.h>
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+extern int vdev_raidz_physio(vdev_t *,
+    caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t);
+#endif
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_VDEV_RAIDZ_H */
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zap.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap.h	27 Feb 2010 22:31:43 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap.h	10 Oct 2016 11:09:53 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_ZAP_H
@@ -80,24 +80,29 @@
  */
 
 #include <sys/dmu.h>
+#include <sys/refcount.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
- * The matchtype specifies which entry will be accessed.
- * MT_EXACT: only find an exact match (non-normalized)
- * MT_FIRST: find the "first" normalized (case and Unicode
- *     form) match; the designated "first" match will not change as long
- *     as the set of entries with this normalization doesn't change
- * MT_BEST: if there is an exact match, find that, otherwise find the
- *     first normalized match
+ * Specifies matching criteria for ZAP lookups.
  */
 typedef enum matchtype
 {
+	/* Only find an exact match (non-normalized) */
 	MT_EXACT,
+	/*
+	 * If there is an exact match, find that, otherwise find the
+	 * first normalized match.
+	 */
 	MT_BEST,
+	/*
+	 * Find the "first" normalized (case and Unicode form) match;
+	 * the designated "first" match will not change as long as the
+	 * set of entries with this normalization doesn't change.
+	 */
 	MT_FIRST
 } matchtype_t;
 
@@ -133,6 +138,14 @@ uint64_t zap_create_norm(objset_t *ds, i
 uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
     dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot,
+    uint64_t parent_obj, const char *name, dmu_tx_t *tx);
+
+/*
+ * Initialize an already-allocated object.
+ */
+void mzap_create_impl(objset_t *os, uint64_t obj, int normflags,
+    zap_flags_t flags, dmu_tx_t *tx);
 
 /*
  * Create a new zapobj with no attributes from the given (unallocated)
@@ -172,16 +185,21 @@ int zap_destroy(objset_t *ds, uint64_t z
  * call will fail and return EINVAL.
  *
  * If 'integer_size' is equal to or larger than the attribute's integer
- * size, the call will succeed and return 0.  * When converting to a
- * larger integer size, the integers will be treated as unsigned (ie. no
- * sign-extension will be performed).
+ * size, the call will succeed and return 0.
+ *
+ * When converting to a larger integer size, the integers will be treated as
+ * unsigned (ie. no sign-extension will be performed).
  *
  * 'num_integers' is the length (in integers) of 'buf'.
  *
  * If the attribute is longer than the buffer, as many integers as will
  * fit will be transferred to 'buf'.  If the entire attribute was not
  * transferred, the call will return EOVERFLOW.
- *
+ */
+int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf);
+
+/*
  * If rn_len is nonzero, realname will be set to the name of the found
  * entry (which may be different from the requested name if matchtype is
  * not MT_EXACT).
@@ -189,8 +207,6 @@ int zap_destroy(objset_t *ds, uint64_t z
  * If normalization_conflictp is not NULL, it will be set if there is
  * another name with the same case/unicode normalized form.
  */
-int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
-    uint64_t integer_size, uint64_t num_integers, void *buf);
 int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     matchtype_t mt, char *realname, int rn_len,
@@ -198,9 +214,17 @@ int zap_lookup_norm(objset_t *ds, uint64
 int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
     int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf);
 int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
+int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+    int key_numints);
+int zap_lookup_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf);
+int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+    uint64_t integer_size, uint64_t num_integers, void *buf,
+    matchtype_t mt, char *realname, int rn_len,
+    boolean_t *ncp);
 
-int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
-    int add, uint64_t *towrite, uint64_t *tooverwrite);
+int zap_count_write_by_dnode(dnode_t *dn, const char *name,
+    int add, refcount_t *towrite, refcount_t *tooverwrite);
 
 /*
  * Create an attribute with the given name and value.
@@ -259,7 +283,6 @@ int zap_remove_uint64(objset_t *os, uint
  */
 int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count);
 
-
 /*
  * Returns (in name) the name of the entry whose (value & mask)
  * (za_first_integer) is value, or ENOENT if not found.  The string
@@ -276,6 +299,14 @@ int zap_value_search(objset_t *os, uint6
  */
 int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
 
+/* Same as zap_join, but set the values to 'value'. */
+int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    uint64_t value, dmu_tx_t *tx);
+
+/* Same as zap_join, but add together any duplicated entries. */
+int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+    dmu_tx_t *tx);
+
 /*
  * Manipulate entries where the name + value are the "same" (the name is
  * a stringified version of the value).
@@ -286,6 +317,17 @@ int zap_lookup_int(objset_t *os, uint64_
 int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
     dmu_tx_t *tx);
 
+/* Here the key is an int and the value is a different int. */
+int zap_add_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_update_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int_key(objset_t *os, uint64_t obj,
+    uint64_t key, uint64_t *valuep);
+
+int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+    dmu_tx_t *tx);
+
 struct zap;
 struct zap_leaf;
 typedef struct zap_cursor {
@@ -308,7 +350,7 @@ typedef struct {
 	boolean_t za_normalization_conflict;
 	uint64_t za_num_integers;
 	uint64_t za_first_integer;	/* no sign extension for <8byte ints */
-	char za_name[MAXNAMELEN];
+	char za_name[ZAP_MAXNAMELEN];
 } zap_attribute_t;
 
 /*
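
The zap.h reshuffle above moves the integer-size rules directly over the
plain zap_lookup() prototype: a too-small integer_size yields EINVAL, and a
too-small buffer copies what fits and yields EOVERFLOW. A hedged fragment of
the documented usage; the attribute names are placeholders:

	static int
	read_example_attrs(objset_t *os, uint64_t zapobj)
	{
		uint64_t one, four[4];
		int err;

		/* integer_size must cover the attribute's integer size,
		 * or the call fails with EINVAL. */
		err = zap_lookup(os, zapobj, "example-attr",
		    sizeof (uint64_t), 1, &one);
		if (err != 0)
			return (err);

		/* A short buffer gets as many integers as fit, and the
		 * call reports EOVERFLOW rather than failing outright. */
		err = zap_lookup(os, zapobj, "example-array",
		    sizeof (uint64_t), 4, four);
		return (err == EOVERFLOW ? 0 : err);
	}
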
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_impl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zap_impl.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_impl.h	27 Feb 2010 22:31:43 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_impl.h	2 Mar 2017 10:54:19 -0000
@@ -19,8 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef	_SYS_ZAP_IMPL_H
@@ -42,8 +44,7 @@ extern int fzap_default_block_shift;
 
 #define	MZAP_ENT_LEN		64
 #define	MZAP_NAME_LEN		(MZAP_ENT_LEN - 8 - 4 - 2)
-#define	MZAP_MAX_BLKSHIFT	SPA_MAXBLOCKSHIFT
-#define	MZAP_MAX_BLKSZ		(1 << MZAP_MAX_BLKSHIFT)
+#define	MZAP_MAX_BLKSZ		SPA_OLD_MAXBLOCKSIZE
 
 #define	ZAP_NEED_CD		(-1U)
 
@@ -67,9 +68,12 @@ typedef struct mzap_ent {
 	avl_node_t mze_node;
 	int mze_chunkid;
 	uint64_t mze_hash;
-	mzap_ent_phys_t mze_phys;
+	uint32_t mze_cd; /* copy from mze_phys->mze_cd */
 } mzap_ent_t;
 
+#define	MZE_PHYS(zap, mze) \
+	(&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid])
+
 /*
  * The (fat) zap is stored in one object. It is an array of
  * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
@@ -102,7 +106,7 @@ struct zap_leaf;
  * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
  */
 #define	ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
-	((uint64_t *)(zap)->zap_f.zap_phys) \
+	((uint64_t *)zap_f_phys(zap)) \
 	[(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
 
 /*
@@ -138,6 +142,7 @@ typedef struct zap_phys {
 typedef struct zap_table_phys zap_table_phys_t;
 
 typedef struct zap {
+	dmu_buf_user_t zap_dbu;
 	objset_t *zap_objset;
 	uint64_t zap_object;
 	struct dmu_buf *zap_dbuf;
@@ -147,8 +152,6 @@ typedef struct zap {
 	uint64_t zap_salt;
 	union {
 		struct {
-			zap_phys_t *zap_phys;
-
 			/*
 			 * zap_num_entries_mtx protects
 			 * zap_num_entries
@@ -157,7 +160,6 @@ typedef struct zap {
 			int zap_block_shift;
 		} zap_fat;
 		struct {
-			mzap_phys_t *zap_phys;
 			int16_t zap_num_entries;
 			int16_t zap_num_chunks;
 			int16_t zap_alloc_next;
@@ -166,6 +168,18 @@ typedef struct zap {
 	} zap_u;
 } zap_t;
 
+inline zap_phys_t *
+zap_f_phys(zap_t *zap)
+{
+	return (zap->zap_dbuf->db_data);
+}
+
+inline mzap_phys_t *
+zap_m_phys(zap_t *zap)
+{
+	return (zap->zap_dbuf->db_data);
+}
+
 typedef struct zap_name {
 	zap_t *zn_zap;
 	int zn_key_intlen;
@@ -183,9 +197,9 @@ typedef struct zap_name {
 
 boolean_t zap_match(zap_name_t *zn, const char *matchname);
 int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
-    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
-void zap_unlockdir(zap_t *zap);
-void zap_evict(dmu_buf_t *db, void *vmzap);
+    krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp);
+void zap_unlockdir(zap_t *zap, void *tag);
+void zap_evict_sync(void *dbu);
 zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
 void zap_name_free(zap_name_t *zn);
 int zap_hashbits(zap_t *zap);
@@ -199,12 +213,14 @@ int fzap_count(zap_t *zap, uint64_t *cou
 int fzap_lookup(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers, void *buf,
     char *realname, int rn_len, boolean_t *normalization_conflictp);
-int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
-    uint64_t *tooverwrite);
+void fzap_prefetch(zap_name_t *zn);
+int fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite,
+    refcount_t *tooverwrite);
 int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx);
+    const void *val, void *tag, dmu_tx_t *tx);
 int fzap_update(zap_name_t *zn,
-    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+    int integer_size, uint64_t num_integers, const void *val,
+    void *tag, dmu_tx_t *tx);
 int fzap_length(zap_name_t *zn,
     uint64_t *integer_size, uint64_t *num_integers);
 int fzap_remove(zap_name_t *zn, dmu_tx_t *tx);
@@ -214,7 +230,7 @@ void zap_put_leaf(struct zap_leaf *l);
 
 int fzap_add_cd(zap_name_t *zn,
     uint64_t integer_size, uint64_t num_integers,
-    const void *val, uint32_t cd, dmu_tx_t *tx);
+    const void *val, uint32_t cd, void *tag, dmu_tx_t *tx);
 void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
 int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
 
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_leaf.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_leaf.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zap_leaf.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_leaf.h	27 Feb 2010 22:31:43 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_leaf.h	30 Aug 2015 02:17:54 -0000
@@ -19,13 +19,15 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef	_SYS_ZAP_LEAF_H
 #define	_SYS_ZAP_LEAF_H
 
+#include <sys/zap.h>
+
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -82,7 +84,7 @@ struct zap_stats;
  */
 #define	ZAP_LEAF_CHUNK(l, idx) \
 	((zap_leaf_chunk_t *) \
-	((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
+	(zap_leaf_phys(l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
 #define	ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
 
 typedef enum zap_chunk_type {
@@ -100,6 +102,7 @@ typedef enum zap_chunk_type {
  */
 typedef struct zap_leaf_phys {
 	struct zap_leaf_header {
+		/* Public to ZAP */
 		uint64_t lh_block_type;		/* ZBT_LEAF */
 		uint64_t lh_pad1;
 		uint64_t lh_prefix;		/* hash prefix of this leaf */
@@ -108,8 +111,7 @@ typedef struct zap_leaf_phys {
 		uint16_t lh_nentries;		/* number of entries */
 		uint16_t lh_prefix_len;		/* num bits used to id this */
 
-/* above is accessable to zap, below is zap_leaf private */
-
+		/* Private to zap_leaf */
 		uint16_t lh_freelist;		/* chunk head of free list */
 		uint8_t lh_flags;		/* ZLF_* flags */
 		uint8_t lh_pad2[11];
@@ -151,22 +153,27 @@ typedef union zap_leaf_chunk {
 } zap_leaf_chunk_t;
 
 typedef struct zap_leaf {
+	dmu_buf_user_t l_dbu;
 	krwlock_t l_rwlock;
 	uint64_t l_blkid;		/* 1<<ZAP_BLOCK_SHIFT byte block off */
 	int l_bs;			/* block size shift */
 	dmu_buf_t *l_dbuf;
-	zap_leaf_phys_t *l_phys;
 } zap_leaf_t;
 
+inline zap_leaf_phys_t *
+zap_leaf_phys(zap_leaf_t *l)
+{
+	return (l->l_dbuf->db_data);
+}
 
 typedef struct zap_entry_handle {
-	/* below is set by zap_leaf.c and is public to zap.c */
+	/* Set by zap_leaf and public to ZAP */
 	uint64_t zeh_num_integers;
 	uint64_t zeh_hash;
 	uint32_t zeh_cd;
 	uint8_t zeh_integer_size;
 
-	/* below is private to zap_leaf.c */
+	/* Private to zap_leaf */
 	uint16_t zeh_fakechunk;
 	uint16_t *zeh_chunkp;
 	zap_leaf_t *zeh_leaf;
@@ -201,7 +208,7 @@ extern int zap_entry_read_name(struct za
 /*
  * Replace the value of an existing entry.
  *
- * zap_entry_update may fail if it runs out of space (ENOSPC).
+ * May fail if it runs out of space (ENOSPC).
  */
 extern int zap_entry_update(zap_entry_handle_t *zeh,
     uint8_t integer_size, uint64_t num_integers, const void *buf);
@@ -220,10 +227,7 @@ extern int zap_entry_create(zap_leaf_t *
     uint8_t integer_size, uint64_t num_integers, const void *buf,
     zap_entry_handle_t *zeh);
 
-/*
- * Return true if there are additional entries with the same normalized
- * form.
- */
+/* Determine whether there is another entry with the same normalized form. */
 extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
     struct zap_name *zn, const char *name, struct zap *zap);
 
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfeature.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfeature.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfeature.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfeature.h	13 Jan 2014 02:59:34 -0000
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_ZFEATURE_H
+#define	_SYS_ZFEATURE_H
+
+#include <sys/nvpair.h>
+#include <sys/txg.h>
+#include "zfeature_common.h"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	VALID_FEATURE_FID(fid)	((fid) >= 0 && (fid) < SPA_FEATURES)
+#define	VALID_FEATURE_OR_NONE(fid)	((fid) == SPA_FEATURE_NONE ||	\
+    VALID_FEATURE_FID(fid))
+
+struct spa;
+struct dmu_tx;
+struct objset;
+
+extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *);
+extern void spa_feature_enable(struct spa *, spa_feature_t,
+    struct dmu_tx *);
+extern void spa_feature_incr(struct spa *, spa_feature_t, struct dmu_tx *);
+extern void spa_feature_decr(struct spa *, spa_feature_t, struct dmu_tx *);
+extern boolean_t spa_feature_is_enabled(struct spa *, spa_feature_t);
+extern boolean_t spa_feature_is_active(struct spa *, spa_feature_t);
+extern boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid,
+    uint64_t *txg);
+extern uint64_t spa_feature_refcount(spa_t *, spa_feature_t, uint64_t);
+extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *);
+
+/*
+ * These functions are only exported for zhack and zdb; normal callers should
+ * use the above interfaces.
+ */
+extern int feature_get_refcount(struct spa *, zfeature_info_t *, uint64_t *);
+extern int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
+    uint64_t *res);
+extern void feature_enable_sync(struct spa *, zfeature_info_t *,
+    struct dmu_tx *);
+extern void feature_sync(struct spa *, zfeature_info_t *, uint64_t,
+    struct dmu_tx *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFEATURE_H */
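
The new zfeature.h separates read-side queries from refcount updates, which
take a dmu_tx_t and therefore belong in syncing context. A hedged sketch of
the intended pattern; SPA_FEATURE_EXAMPLE is a placeholder id, not a real
feature:

	static void
	mark_feature_in_use(spa_t *spa, dmu_tx_t *tx)
	{
		if (!spa_feature_is_enabled(spa, SPA_FEATURE_EXAMPLE))
			return;	/* pool predates the feature */

		/* A nonzero refcount makes the feature active; a later
		 * spa_feature_decr() can return it to merely enabled. */
		spa_feature_incr(spa, SPA_FEATURE_EXAMPLE, tx);
	}
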
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_acl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_acl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_acl.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_acl.h	27 Feb 2010 22:31:44 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_acl.h	14 Apr 2017 20:58:31 -0000
@@ -19,20 +19,19 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_FS_ZFS_ACL_H
 #define	_SYS_FS_ZFS_ACL_H
 
 #ifdef _KERNEL
-#include <sys/isa_defs.h>
-#include <sys/types32.h>
+#include <sys/cred.h>
 #endif
 #include <sys/acl.h>
 #include <sys/dmu.h>
 #include <sys/zfs_fuid.h>
+#include <sys/sa.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -46,7 +45,8 @@ struct znode_phys;
 #define	ZFS_ACL_VERSION		ZFS_ACL_VERSION_FUID
 
 /*
- * ZFS ACLs are store in various forms.
+ * ZFS ACLs (Access Control Lists) are stored in various forms.
+ *
  * Files created with ACL version ZFS_ACL_VERSION_INITIAL
  * will all be created with fixed length ACEs of type
  * zfs_oldace_t.
@@ -106,12 +106,18 @@ typedef struct zfs_acl_phys_v0 {
 
 #define	ZFS_ACE_SPACE	(sizeof (zfs_oldace_t) * ACE_SLOT_CNT)
 
+/*
+ * The size of the ACL count is always 2 bytes.
+ * Necessary for dealing with both the V0 and V1 ACL layouts.
+ */
+#define	ZFS_ACL_COUNT_SIZE	(sizeof (uint16_t))
+
 typedef struct zfs_acl_phys {
 	uint64_t	z_acl_extern_obj;	  /* ext acl pieces */
 	uint32_t	z_acl_size;		  /* Number of bytes in ACL */
 	uint16_t	z_acl_version;		  /* acl version */
 	uint16_t	z_acl_count;		  /* ace count */
-	uint8_t		z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
+	uint8_t	z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
 } zfs_acl_phys_t;
 
 typedef struct acl_ops {
@@ -130,8 +136,8 @@ typedef struct acl_ops {
 	size_t		(*ace_size)(void *acep); /* how big is this ace */
 	size_t		(*ace_abstract_size)(void); /* sizeof abstract entry */
 	int		(*ace_mask_off)(void); /* off of access mask in ace */
+	/* ptr to data if any */
 	int		(*ace_data)(void *acep, void **datap);
-			    /* ptr to data if any */
 } acl_ops_t;
 
 /*
@@ -146,21 +152,26 @@ typedef struct zfs_acl_node {
 	void		*z_allocdata;	/* pointer to kmem allocated memory */
 	size_t		z_allocsize;	/* Size of blob in bytes */
 	size_t		z_size;		/* length of ACL data */
-	int		z_ace_count;	/* number of ACEs in this acl node */
+	uint64_t	z_ace_count;	/* number of ACEs in this acl node */
 	int		z_ace_idx;	/* ace iterator positioned on */
 } zfs_acl_node_t;
 
 typedef struct zfs_acl {
-	int		z_acl_count;	/* Number of ACEs */
+	uint64_t	z_acl_count;	/* Number of ACEs */
 	size_t		z_acl_bytes;	/* Number of bytes in ACL */
 	uint_t		z_version;	/* version of ACL */
 	void		*z_next_ace;	/* pointer to next ACE */
-	int		z_hints;	/* ACL hints (ZFS_INHERIT_ACE ...) */
+	uint64_t	z_hints;	/* ACL hints (ZFS_INHERIT_ACE ...) */
 	zfs_acl_node_t	*z_curr_node;	/* current node iterator is handling */
 	list_t		z_acl;		/* chunks of ACE data */
 	acl_ops_t	z_ops;		/* ACL operations */
 } zfs_acl_t;
 
+typedef struct acl_locator_cb {
+	zfs_acl_t *cb_aclp;
+	zfs_acl_node_t *cb_acl_node;
+} zfs_acl_locator_cb_t;
+
 #define	ACL_DATA_ALLOCED	0x1
 #define	ZFS_ACL_SIZE(aclcnt)	(sizeof (ace_t) * (aclcnt))
 
@@ -215,6 +226,16 @@ void zfs_acl_free(zfs_acl_t *);
 int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *,
     struct zfs_fuid_info **, zfs_acl_t **);
 int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *);
+uint64_t zfs_external_acl(struct znode *);
+int zfs_znode_acl_version(struct znode *);
+int zfs_acl_size(struct znode *, int *);
+zfs_acl_t *zfs_acl_alloc(int);
+zfs_acl_node_t *zfs_acl_node_alloc(size_t);
+void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *);
+void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *);
+uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *,
+    uint64_t *, uint64_t, uint64_t);
+int zfs_acl_chown_setattr(struct znode *);
 
 #endif
 
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_context.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_context.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_context.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_context.h	27 Feb 2010 22:31:44 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_context.h	17 Apr 2017 01:50:07 -0000
@@ -22,6 +22,10 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
 
 #ifndef _SYS_ZFS_CONTEXT_H
 #define	_SYS_ZFS_CONTEXT_H
@@ -30,44 +34,114 @@
 extern "C" {
 #endif
 
+#include <sys/param.h>
+#include <sys/stdint.h>
 #include <sys/note.h>
-#include <sys/types.h>
-#include <sys/t_lock.h>
-#include <sys/atomic.h>
+#include <sys/kernel.h>
+#include <sys/debug.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
 #include <sys/sysmacros.h>
 #include <sys/bitmap.h>
 #include <sys/cmn_err.h>
 #include <sys/kmem.h>
 #include <sys/taskq.h>
-#include <sys/buf.h>
-#include <sys/param.h>
+#include <sys/taskqueue.h>
 #include <sys/systm.h>
-#include <sys/cpuvar.h>
-#include <sys/kobj.h>
 #include <sys/conf.h>
-#include <sys/disp.h>
-#include <sys/debug.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/kcondvar.h>
 #include <sys/random.h>
 #include <sys/byteorder.h>
 #include <sys/systm.h>
 #include <sys/list.h>
+#include <sys/zfs_debug.h>
+#include <sys/sysevent.h>
 #include <sys/uio.h>
 #include <sys/dirent.h>
 #include <sys/time.h>
-#include <vm/seg_kmem.h>
-#include <sys/zone.h>
 #include <sys/uio.h>
-#include <sys/zfs_debug.h>
-#include <sys/sysevent.h>
-#include <sys/sysevent/eventdefs.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/string.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/cred.h>
+#include <sys/sdt.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/sysctl.h>
+#include <sys/sbuf.h>
+#include <sys/priv.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/stack.h>
+#include <sys/lockf.h>
+#include <sys/pathname.h>
+#include <sys/policy.h>
+#include <sys/refstr.h>
+#include <sys/zone.h>
+#include <sys/eventhandler.h>
+#include <sys/extattr.h>
+#include <sys/misc.h>
+#include <sys/sig.h>
+#include <sys/osd.h>
 #include <sys/sysevent/dev.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/u8_textprep.h>
 #include <sys/fm/util.h>
 #include <sys/sunddi.h>
+#ifdef illumos
+#include <sys/cyclic.h>
+#endif
+#include <sys/callo.h>
+#include <sys/disp.h>
+#include <machine/stdarg.h>
 
-#define	CPU_SEQID	(CPU->cpu_seqid)
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+/* There is a clash: vm_map.h defines the two below and vdev_cache.c uses them. */
+#ifdef min_offset
+#undef min_offset
+#endif
+#ifdef max_offset
+#undef max_offset
+#endif
+#include <vm/vm_extern.h>
+#include <vm/vnode_pager.h>
+
+#define	CPU_SEQID	(curcpu)
+
+#define	tsd_create(keyp, destructor)	do {				\
+	*(keyp) = osd_thread_register((destructor));			\
+	KASSERT(*(keyp) > 0, ("cannot register OSD"));			\
+} while (0)
+#define	tsd_destroy(keyp)		osd_thread_deregister(*(keyp))
+#define	tsd_get(key)			osd_thread_get(curthread, (key))
+#define	tsd_set(key, value)		osd_thread_set(curthread, (key), (value))
 
 #ifdef	__cplusplus
 }
 #endif
 
+extern int zfs_debug_level;
+extern struct mtx zfs_debug_mtx;
+#define	ZFS_LOG(lvl, ...)	do {					\
+	if (((lvl) & 0xff) <= zfs_debug_level) {			\
+		mtx_lock(&zfs_debug_mtx);				\
+		printf("%s:%u[%d]: ", __func__, __LINE__, (lvl));	\
+		printf(__VA_ARGS__);					\
+		printf("\n");						\
+		if ((lvl) & 0x100)					\
+			kdb_backtrace();				\
+		mtx_unlock(&zfs_debug_mtx);				\
+	}								\
+} while (0)
+
+#define	sys_shutdown	rebooting
+
 #endif	/* _SYS_ZFS_CONTEXT_H */
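
ZFS_LOG() above compares the low byte of the level against zfs_debug_level
and treats bit 0x100 as a request for a kernel backtrace. A usage fragment;
the message text and the txg/vd arguments are illustrative only:

	static void
	log_sync_example(uint64_t txg, vdev_t *vd)
	{
		/* Emitted only when zfs_debug_level >= 2. */
		ZFS_LOG(2, "synced txg %ju", (uintmax_t)txg);

		/* Same threshold test on the low byte, plus a backtrace
		 * via kdb_backtrace() because bit 0x100 is set. */
		ZFS_LOG(0x100 | 2, "unexpected state on %s", vd->vdev_path);
	}
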
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ctldir.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ctldir.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_ctldir.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ctldir.h	27 Feb 2010 22:31:44 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ctldir.h	2 Mar 2017 10:54:19 -0000
@@ -19,14 +19,12 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_ZFS_CTLDIR_H
 #define	_ZFS_CTLDIR_H
 
-#include <sys/pathname.h>
 #include <sys/vnode.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_znode.h>
@@ -46,25 +44,19 @@ extern "C" {
 
 void zfsctl_create(zfsvfs_t *);
 void zfsctl_destroy(zfsvfs_t *);
-vnode_t *zfsctl_root(znode_t *);
+int zfsctl_root(zfsvfs_t *, int, vnode_t **);
 void zfsctl_init(void);
 void zfsctl_fini(void);
+boolean_t zfsctl_is_node(vnode_t *);
 
 int zfsctl_rename_snapshot(const char *from, const char *to);
 int zfsctl_destroy_snapshot(const char *snapname, int force);
 int zfsctl_umount_snapshots(vfs_t *, int, cred_t *);
 
-int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
-    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
-    int *direntflags, pathname_t *realpnp);
-
-int zfsctl_make_fid(zfsvfs_t *zfsvfsp, uint64_t object, uint32_t gen,
-    fid_t *fidp);
 int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
 
 #define	ZFSCTL_INO_ROOT		0x1
 #define	ZFSCTL_INO_SNAPDIR	0x2
-#define	ZFSCTL_INO_SHARES	0x3
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_debug.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_debug.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 zfs_debug.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_debug.h	7 Aug 2009 18:33:43 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_debug.h	27 Mar 2017 06:19:45 -0000
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_ZFS_DEBUG_H
 #define	_SYS_ZFS_DEBUG_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -45,16 +43,24 @@ extern "C" {
  */
 
 #if defined(DEBUG) || !defined(_KERNEL)
+#if !defined(ZFS_DEBUG)
 #define	ZFS_DEBUG
 #endif
+#endif
 
 extern int zfs_flags;
+extern boolean_t zfs_recover;
+extern boolean_t zfs_free_leak_on_eio;
 
-#define	ZFS_DEBUG_DPRINTF	0x0001
-#define	ZFS_DEBUG_DBUF_VERIFY	0x0002
-#define	ZFS_DEBUG_DNODE_VERIFY	0x0004
-#define	ZFS_DEBUG_SNAPNAMES	0x0008
-#define	ZFS_DEBUG_MODIFY	0x0010
+#define	ZFS_DEBUG_DPRINTF		(1 << 0)
+#define	ZFS_DEBUG_DBUF_VERIFY		(1 << 1)
+#define	ZFS_DEBUG_DNODE_VERIFY		(1 << 2)
+#define	ZFS_DEBUG_SNAPNAMES		(1 << 3)
+#define	ZFS_DEBUG_MODIFY		(1 << 4)
+#define	ZFS_DEBUG_SPA			(1 << 5)
+#define	ZFS_DEBUG_ZIO_FREE		(1 << 6)
+#define	ZFS_DEBUG_HISTOGRAM_VERIFY	(1 << 7)
+#define	ZFS_DEBUG_METASLAB_VERIFY	(1 << 8)
 
 #ifdef ZFS_DEBUG
 extern void __dprintf(const char *file, const char *func,
@@ -68,6 +74,23 @@ extern void __dprintf(const char *file, 
 
 extern void zfs_panic_recover(const char *fmt, ...);
 
+typedef struct zfs_dbgmsg {
+	list_node_t zdm_node;
+	time_t zdm_timestamp;
+	char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+extern void zfs_dbgmsg_init(void);
+extern void zfs_dbgmsg_fini(void);
+extern void zfs_dbgmsg(const char *fmt, ...);
+extern void zfs_dbgmsg_print(const char *tag);
+
+#ifdef illumos
+#ifndef _KERNEL
+extern int dprintf_find_string(const char *string);
+#endif
+#endif /* illumos */
+
 #ifdef	__cplusplus
 }
 #endif
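
With the debug constants rewritten as shifted bits, testing zfs_flags reads
as a plain mask check. A minimal fragment; the wrapper function is
hypothetical:

	static boolean_t
	dbuf_verify_enabled(void)
	{
		/* zfs_flags is the global tunable declared above. */
		return ((zfs_flags & ZFS_DEBUG_DBUF_VERIFY) != 0);
	}
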
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_dir.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_dir.h,v
retrieving revision 1.3
diff -u -p -r1.3 zfs_dir.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_dir.h	27 Feb 2010 23:43:53 -0000	1.3
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_dir.h	10 Oct 2016 11:09:53 -0000
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -48,18 +48,18 @@ extern "C" {
 #define	IS_ROOT_NODE	0x01		/* create a root node */
 #define	IS_XATTR	0x02		/* create an extended attribute node */
 
-extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
-    int, int *, pathname_t *);
-extern void zfs_dirent_unlock(zfs_dirlock_t *);
-extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
-extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
+extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int);
+extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int);
+extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int,
     boolean_t *);
-extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
-    pathname_t *);
+#if 0
+extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int);
+#else
+extern int zfs_dirlook(znode_t *, const char *name, znode_t **);
+#endif
 extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
-    uint_t, znode_t **, int, zfs_acl_ids_t *);
+    uint_t, znode_t **, zfs_acl_ids_t *);
 extern void zfs_rmnode(znode_t *);
-extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
 extern boolean_t zfs_dirempty(znode_t *);
 extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
 extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
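
The zfs_dir.h hunk retires the zfs_dirlock_t-based interface: lookup, link,
and destroy now take the component name directly. A hedged sketch of a
lookup under the new prototype; the meaning of the final flags argument
(0 here) is an assumption:

	static int
	lookup_child(znode_t *dzp, const char *name, znode_t **zpp)
	{
		/* No dirent lock to take and drop around the call. */
		return (zfs_dirent_lookup(dzp, name, zpp, 0));
	}
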
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_fuid.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_fuid.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_fuid.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_fuid.h	27 Feb 2010 22:31:44 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_fuid.h	12 Jun 2012 05:57:27 -0000
@@ -26,13 +26,14 @@
 #ifndef	_SYS_FS_ZFS_FUID_H
 #define	_SYS_FS_ZFS_FUID_H
 
+#include <sys/types.h>
 #ifdef _KERNEL
 #include <sys/kidmap.h>
-#include <sys/sid.h>
 #include <sys/dmu.h>
 #include <sys/zfs_vfsops.h>
 #endif
 #include <sys/avl.h>
+#include <sys/list.h>
 
 #ifdef	__cplusplus
 extern "C" {
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ioctl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ioctl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_ioctl.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ioctl.h	27 Feb 2010 22:31:44 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ioctl.h	19 Apr 2017 16:43:51 -0000
@@ -19,8 +19,11 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 RackTop Systems.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef	_SYS_ZFS_IOCTL_H
@@ -31,6 +34,7 @@
 #include <sys/zio.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
+#include <sys/zfs_stat.h>
 
 #ifdef _KERNEL
 #include <sys/nvpair.h>
@@ -41,6 +45,15 @@ extern "C" {
 #endif
 
 /*
+ * The structures in this file are passed between userland and the
+ * kernel.  Userland may be running a 32-bit process, while the kernel
+ * is 64-bit.  Therefore, these structures need to compile the same in
+ * 32-bit and 64-bit.  This means not using type "long", and adding
+ * explicit padding so that the 32-bit structure will not be packed more
+ * tightly than the 64-bit structure (which requires 64-bit alignment).
+ */
+
+/*
  * Property values for snapdir
  */
 #define	ZFS_SNAPDIR_HIDDEN		0
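
The layout rule stated above governs every structure in this header: no
"long", pointers carried as uint64_t, and explicit pad fields wherever
64-bit alignment would otherwise insert an invisible hole. A hypothetical
struct (not from this header) showing the rule applied:

	typedef struct example_cmd {
		uint64_t	ec_nvlist;	/* really (char *) */
		uint32_t	ec_flags;
		uint32_t	ec_pad;		/* hole 64-bit ABIs would add */
		uint64_t	ec_size;
	} example_cmd_t;
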
@@ -69,18 +82,32 @@ typedef enum drr_headertype {
  * Feature flags for zfs send streams (flags in drr_versioninfo)
  */
 
-#define	DMU_BACKUP_FEATURE_DEDUP	(0x1)
-#define	DMU_BACKUP_FEATURE_DEDUPPROPS	(0x2)
+#define	DMU_BACKUP_FEATURE_DEDUP		(1 << 0)
+#define	DMU_BACKUP_FEATURE_DEDUPPROPS		(1 << 1)
+#define	DMU_BACKUP_FEATURE_SA_SPILL		(1 << 2)
+/* flags #3 - #15 are reserved for incompatible closed-source implementations */
+#define	DMU_BACKUP_FEATURE_EMBED_DATA		(1 << 16)
+#define	DMU_BACKUP_FEATURE_EMBED_DATA_LZ4	(1 << 17)
+/* flag #18 is reserved for a Delphix feature */
+#define	DMU_BACKUP_FEATURE_LARGE_BLOCKS		(1 << 19)
+#define	DMU_BACKUP_FEATURE_RESUMING		(1 << 20)
 
 /*
  * Mask of all supported backup features
  */
 #define	DMU_BACKUP_FEATURE_MASK	(DMU_BACKUP_FEATURE_DEDUP | \
-		DMU_BACKUP_FEATURE_DEDUPPROPS)
+    DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
+    DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
+    DMU_BACKUP_FEATURE_RESUMING | \
+    DMU_BACKUP_FEATURE_LARGE_BLOCKS)
 
 /* Are all features in the given flag word currently supported? */
 #define	DMU_STREAM_SUPPORTED(x)	(!((x) & ~DMU_BACKUP_FEATURE_MASK))
 
+typedef enum dmu_send_resume_token_version {
+	ZFS_SEND_RESUME_TOKEN_VERSION = 1
+} dmu_send_resume_token_version_t;
+
 /*
  * The drr_versioninfo field of the dmu_replay_record has the
  * following layout:
@@ -100,8 +127,22 @@ typedef enum drr_headertype {
 
 #define	DMU_BACKUP_MAGIC 0x2F5bacbacULL
 
+/*
+ * Send stream flags.  Bits 24-31 are reserved for vendor-specific
+ * implementations and should not be used.
+ */
 #define	DRR_FLAG_CLONE		(1<<0)
 #define	DRR_FLAG_CI_DATA	(1<<1)
+/*
+ * This send stream, if it is a full send, includes the FREE and FREEOBJECT
+ * records that are created by the sending process.  This means that the send
+ * stream can be received as a clone, even though it is not an incremental.
+ * This is not implemented as a feature flag, because the receiving side does
+ * not need to have implemented it to receive this stream; it is fully backwards
+ * compatible.  We need a flag, though, because full send streams without it
+ * cannot necessarily be received as a clone correctly.
+ */
+#define	DRR_FLAG_FREERECORDS	(1<<2)
 
 /*
  * flags in the drr_checksumflags field in the DRR_WRITE and
@@ -114,83 +155,151 @@ typedef enum drr_headertype {
 /*
  * zfs ioctl command structure
  */
+struct drr_begin {
+	uint64_t drr_magic;
+	uint64_t drr_versioninfo; /* was drr_version */
+	uint64_t drr_creation_time;
+	dmu_objset_type_t drr_type;
+	uint32_t drr_flags;
+	uint64_t drr_toguid;
+	uint64_t drr_fromguid;
+	char drr_toname[MAXNAMELEN];
+};
+
+struct drr_end {
+	zio_cksum_t drr_checksum;
+	uint64_t drr_toguid;
+};
+
+struct drr_object {
+	uint64_t drr_object;
+	dmu_object_type_t drr_type;
+	dmu_object_type_t drr_bonustype;
+	uint32_t drr_blksz;
+	uint32_t drr_bonuslen;
+	uint8_t drr_checksumtype;
+	uint8_t drr_compress;
+	uint8_t drr_pad[6];
+	uint64_t drr_toguid;
+	/* bonus content follows */
+};
+
+struct drr_freeobjects {
+	uint64_t drr_firstobj;
+	uint64_t drr_numobjs;
+	uint64_t drr_toguid;
+};
+
+struct drr_write {
+	uint64_t drr_object;
+	dmu_object_type_t drr_type;
+	uint32_t drr_pad;
+	uint64_t drr_offset;
+	uint64_t drr_length;
+	uint64_t drr_toguid;
+	uint8_t drr_checksumtype;
+	uint8_t drr_checksumflags;
+	uint8_t drr_pad2[6];
+	ddt_key_t drr_key; /* deduplication key */
+	/* content follows */
+};
+
+struct drr_free {
+	uint64_t drr_object;
+	uint64_t drr_offset;
+	uint64_t drr_length;
+	uint64_t drr_toguid;
+};
+
+struct drr_write_byref {
+	/* where to put the data */
+	uint64_t drr_object;
+	uint64_t drr_offset;
+	uint64_t drr_length;
+	uint64_t drr_toguid;
+	/* where to find the prior copy of the data */
+	uint64_t drr_refguid;
+	uint64_t drr_refobject;
+	uint64_t drr_refoffset;
+	/* properties of the data */
+	uint8_t drr_checksumtype;
+	uint8_t drr_checksumflags;
+	uint8_t drr_pad2[6];
+	ddt_key_t drr_key; /* deduplication key */
+};
+
+struct drr_spill {
+	uint64_t drr_object;
+	uint64_t drr_length;
+	uint64_t drr_toguid;
+	uint64_t drr_pad[4]; /* needed for crypto */
+	/* spill data follows */
+};
+
 typedef struct dmu_replay_record {
 	enum {
 		DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
 		DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
-		DRR_NUMTYPES
+		DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES
 	} drr_type;
 	uint32_t drr_payloadlen;
 	union {
-		struct drr_begin {
-			uint64_t drr_magic;
-			uint64_t drr_versioninfo; /* was drr_version */
-			uint64_t drr_creation_time;
-			dmu_objset_type_t drr_type;
-			uint32_t drr_flags;
-			uint64_t drr_toguid;
-			uint64_t drr_fromguid;
-			char drr_toname[MAXNAMELEN];
-		} drr_begin;
-		struct drr_end {
-			zio_cksum_t drr_checksum;
-			uint64_t drr_toguid;
-		} drr_end;
-		struct drr_object {
-			uint64_t drr_object;
-			dmu_object_type_t drr_type;
-			dmu_object_type_t drr_bonustype;
-			uint32_t drr_blksz;
-			uint32_t drr_bonuslen;
-			uint8_t drr_checksumtype;
-			uint8_t drr_compress;
-			uint8_t drr_pad[6];
-			uint64_t drr_toguid;
-			/* bonus content follows */
-		} drr_object;
-		struct drr_freeobjects {
-			uint64_t drr_firstobj;
-			uint64_t drr_numobjs;
-			uint64_t drr_toguid;
-		} drr_freeobjects;
-		struct drr_write {
-			uint64_t drr_object;
-			dmu_object_type_t drr_type;
-			uint32_t drr_pad;
-			uint64_t drr_offset;
-			uint64_t drr_length;
-			uint64_t drr_toguid;
-			uint8_t drr_checksumtype;
-			uint8_t drr_checksumflags;
-			uint8_t drr_pad2[6];
-			ddt_key_t drr_key; /* deduplication key */
-			/* content follows */
-		} drr_write;
-		struct drr_free {
-			uint64_t drr_object;
-			uint64_t drr_offset;
-			uint64_t drr_length;
-			uint64_t drr_toguid;
-		} drr_free;
-		struct drr_write_byref {
-			/* where to put the data */
+		struct drr_begin drr_begin;
+		struct drr_end drr_end;
+		struct drr_object drr_object;
+		struct drr_freeobjects drr_freeobjects;
+		struct drr_write drr_write;
+		struct drr_free drr_free;
+		struct drr_write_byref drr_write_byref;
+		struct drr_spill drr_spill;
+		struct drr_write_embedded {
 			uint64_t drr_object;
 			uint64_t drr_offset;
+			/* logical length, should equal blocksize */
 			uint64_t drr_length;
 			uint64_t drr_toguid;
-			/* where to find the prior copy of the data */
-			uint64_t drr_refguid;
-			uint64_t drr_refobject;
-			uint64_t drr_refoffset;
-			/* properties of the data */
-			uint8_t drr_checksumtype;
-			uint8_t drr_checksumflags;
-			uint8_t drr_pad2[6];
-			ddt_key_t drr_key; /* deduplication key */
-		} drr_write_byref;
+			uint8_t drr_compression;
+			uint8_t drr_etype;
+			uint8_t drr_pad[6];
+			uint32_t drr_lsize; /* uncompressed size of payload */
+			uint32_t drr_psize; /* compr. (real) size of payload */
+			/* (possibly compressed) content follows */
+		} drr_write_embedded;
+
+		/*
+		 * Note: drr_checksum is overlaid with all record types
+		 * except DRR_BEGIN.  Therefore its (non-pad) members
+		 * must not overlap with members from the other structs.
+		 * We accomplish this by putting its members at the very
+		 * end of the struct.
+		 */
+		struct drr_checksum {
+			uint64_t drr_pad[34];
+			/*
+			 * fletcher-4 checksum of everything preceding the
+			 * checksum.
+			 */
+			zio_cksum_t drr_checksum;
+		} drr_checksum;
 	} drr_u;
 } dmu_replay_record_t;
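
A minimal sketch (not part of the patch; C11 _Static_assert assumed) of how
the overlay rule above can be checked at compile time, using struct drr_write
as a representative non-DRR_BEGIN record type:

#include <stddef.h>

/*
 * The trailing checksum must start at or beyond the end of the overlaid
 * payload; DRR_BEGIN is exempt, as the comment above notes.
 */
_Static_assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum) >=
    offsetof(dmu_replay_record_t, drr_u) + sizeof (struct drr_write),
    "drr_checksum must not overlap drr_write payload");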
 
+/* diff record range types */
+typedef enum diff_type {
+	DDR_NONE = 0x1,
+	DDR_INUSE = 0x2,
+	DDR_FREE = 0x4
+} diff_type_t;
+
+/*
+ * The diff reports back ranges of free or in-use objects.
+ */
+typedef struct dmu_diff_record {
+	uint64_t ddr_type;
+	uint64_t ddr_first;
+	uint64_t ddr_last;
+} dmu_diff_record_t;
+
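A consumer sketch for the records above (assumptions: the records arrive
packed back to back on a stream descriptor, as zfs diff produces them;
print_diff_records and the formatting are purely illustrative):

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static void
print_diff_records(int fd)
{
	dmu_diff_record_t ddr;

	/* Each record describes a contiguous range of object numbers. */
	while (read(fd, &ddr, sizeof (ddr)) == (ssize_t)sizeof (ddr)) {
		const char *what =
		    (ddr.ddr_type == DDR_INUSE) ? "in use" :
		    (ddr.ddr_type == DDR_FREE) ? "free" : "none";
		printf("objects %ju-%ju %s\n",
		    (uintmax_t)ddr.ddr_first, (uintmax_t)ddr.ddr_last, what);
	}
}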
 typedef struct zinject_record {
 	uint64_t	zi_objset;
 	uint64_t	zi_object;
@@ -206,12 +315,25 @@ typedef struct zinject_record {
 	uint32_t	zi_iotype;
 	int32_t		zi_duration;
 	uint64_t	zi_timer;
+	uint64_t	zi_nlanes;
+	uint32_t	zi_cmd;
+	uint32_t	zi_pad;
 } zinject_record_t;
 
 #define	ZINJECT_NULL		0x1
 #define	ZINJECT_FLUSH_ARC	0x2
 #define	ZINJECT_UNLOAD_SPA	0x4
 
+typedef enum zinject_type {
+	ZINJECT_UNINITIALIZED,
+	ZINJECT_DATA_FAULT,
+	ZINJECT_DEVICE_FAULT,
+	ZINJECT_LABEL_FAULT,
+	ZINJECT_IGNORED_WRITES,
+	ZINJECT_PANIC,
+	ZINJECT_DELAY_IO,
+} zinject_type_t;
+
 typedef struct zfs_share {
 	uint64_t	z_exportdata;
 	uint64_t	z_sharedata;
@@ -231,32 +353,55 @@ typedef enum zfs_case {
 	ZFS_CASE_MIXED
 } zfs_case_t;
 
+/*
+ * Note: this struct must have the same layout in 32-bit and 64-bit, so
+ * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit
+ * kernel.  Therefore, we add padding to it so that no "hidden" padding
+ * is automatically added on 64-bit (but not on 32-bit).
+ */
 typedef struct zfs_cmd {
-	char		zc_name[MAXPATHLEN];
+	char		zc_name[MAXPATHLEN];	/* name of pool or dataset */
+	uint64_t	zc_nvlist_src;		/* really (char *) */
+	uint64_t	zc_nvlist_src_size;
+	uint64_t	zc_nvlist_dst;		/* really (char *) */
+	uint64_t	zc_nvlist_dst_size;
+	boolean_t	zc_nvlist_dst_filled;	/* put an nvlist in dst? */
+	int		zc_pad2;
+
+	/*
+	 * The following members are for legacy ioctls which haven't been
+	 * converted to the new method.
+	 */
+	uint64_t	zc_history;		/* really (char *) */
 	char		zc_value[MAXPATHLEN * 2];
 	char		zc_string[MAXNAMELEN];
-	char		zc_top_ds[MAXPATHLEN];
 	uint64_t	zc_guid;
 	uint64_t	zc_nvlist_conf;		/* really (char *) */
 	uint64_t	zc_nvlist_conf_size;
-	uint64_t	zc_nvlist_src;		/* really (char *) */
-	uint64_t	zc_nvlist_src_size;
-	uint64_t	zc_nvlist_dst;		/* really (char *) */
-	uint64_t	zc_nvlist_dst_size;
 	uint64_t	zc_cookie;
 	uint64_t	zc_objset_type;
 	uint64_t	zc_perm_action;
-	uint64_t 	zc_history;		/* really (char *) */
-	uint64_t 	zc_history_len;
+	uint64_t	zc_history_len;
 	uint64_t	zc_history_offset;
 	uint64_t	zc_obj;
 	uint64_t	zc_iflags;		/* internal to zfs(7fs) */
 	zfs_share_t	zc_share;
+	uint64_t	zc_jailid;
 	dmu_objset_stats_t zc_objset_stats;
-	struct drr_begin zc_begin_record;
+	dmu_replay_record_t zc_begin_record;
 	zinject_record_t zc_inject_record;
-	boolean_t	zc_defer_destroy;
-	boolean_t	zc_temphold;
+	uint32_t	zc_defer_destroy;
+	uint32_t	zc_flags;
+	uint64_t	zc_action_handle;
+	int		zc_cleanup_fd;
+	uint8_t		zc_simple;
+	uint8_t		zc_pad3[3];
+	boolean_t	zc_resumable;
+	uint32_t	zc_pad4;
+	uint64_t	zc_sendobj;
+	uint64_t	zc_fromobj;
+	uint64_t	zc_createtxg;
+	zfs_stat_t	zc_stat;
 } zfs_cmd_t;
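
A reduced sketch (hypothetical struct names) of the padding rule described
in the comment above zfs_cmd: without an explicit pad, a 64-bit compiler
inserts hidden bytes to align the following 8-byte member, while a 32-bit
compiler may not, and the two layouts disagree.

#include <stdint.h>

struct bad  { uint32_t flag;               uint64_t ptr; }; /* 12 or 16 bytes */
struct good { uint32_t flag; uint32_t pad; uint64_t ptr; }; /* 16 everywhere */

_Static_assert(sizeof (struct good) == 16, "layout must match on both ABIs");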
 
 typedef struct zfs_useracct {
@@ -266,8 +411,8 @@ typedef struct zfs_useracct {
 	uint64_t zu_space;
 } zfs_useracct_t;
 
-#define	ZVOL_MAX_MINOR	(1 << 16)
-#define	ZFS_MIN_MINOR	(ZVOL_MAX_MINOR + 1)
+#define	ZFSDEV_MAX_MINOR	(1 << 16)
+#define	ZFS_MIN_MINOR	(ZFSDEV_MAX_MINOR + 1)
 
 #define	ZPOOL_EXPORT_AFTER_SPLIT 0x1
 
@@ -285,7 +430,29 @@ extern int zfs_secpolicy_rename_perms(co
     const char *to, cred_t *cr);
 extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
 extern int zfs_busy(void);
-extern int zfs_unmount_snap(const char *, void *);
+extern int zfs_unmount_snap(const char *);
+extern void zfs_destroy_unmount_origin(const char *);
+
+/*
+ * ZFS minor numbers can refer to either a control device instance or
+ * a zvol. Depending on the value of zss_type, zss_data points to either
+ * a zvol_state_t or a zfs_onexit_t.
+ */
+enum zfs_soft_state_type {
+	ZSST_ZVOL,
+	ZSST_CTLDEV
+};
+
+typedef struct zfs_soft_state {
+	enum zfs_soft_state_type zss_type;
+	void *zss_data;
+} zfs_soft_state_t;
+
+extern void *zfsdev_get_soft_state(minor_t minor,
+    enum zfs_soft_state_type which);
+extern minor_t zfsdev_minor_alloc(void);
+
+extern void *zfsdev_state;
 
 #endif	/* _KERNEL */
 
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_onexit.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_onexit.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_onexit.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_onexit.h	12 Jun 2012 05:57:27 -0000
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_SYS_ZFS_ONEXIT_H
+#define	_SYS_ZFS_ONEXIT_H
+
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+typedef struct zfs_onexit {
+	kmutex_t	zo_lock;
+	list_t		zo_actions;
+} zfs_onexit_t;
+
+typedef struct zfs_onexit_action_node {
+	list_node_t	za_link;
+	void		(*za_func)(void *);
+	void		*za_data;
+} zfs_onexit_action_node_t;
+
+extern void zfs_onexit_init(zfs_onexit_t **zo);
+extern void zfs_onexit_destroy(zfs_onexit_t *zo);
+
+#endif
+
+extern int zfs_onexit_fd_hold(int fd, minor_t *minorp);
+extern void zfs_onexit_fd_rele(int fd);
+extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+    uint64_t *action_handle);
+extern int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle,
+    boolean_t fire);
+extern int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle,
+    void **data);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_ZFS_ONEXIT_H */
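
A usage sketch for the onexit interface (register_cleanup and my_cleanup are
hypothetical; only the zfs_onexit_* calls come from this header): tie a
cleanup action to the lifetime of an open control-device file descriptor.

static void
my_cleanup(void *arg)
{
	/* Release whatever 'arg' refers to. */
}

static int
register_cleanup(int fd, void *arg, uint64_t *handlep)
{
	minor_t minor;
	int error;

	/* Translate the fd into a control-device minor and hold it. */
	if ((error = zfs_onexit_fd_hold(fd, &minor)) != 0)
		return (error);
	/* The callback fires when the fd's device instance goes away. */
	error = zfs_onexit_add_cb(minor, my_cleanup, arg, handlep);
	zfs_onexit_fd_rele(fd);
	return (error);
}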
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_rlock.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_rlock.h,v
retrieving revision 1.2
diff -u -p -r1.2 zfs_rlock.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_rlock.h	15 Oct 2012 14:15:59 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_rlock.h	4 May 2017 22:26:33 -0000
@@ -26,8 +26,6 @@
 #ifndef	_SYS_FS_ZFS_RLOCK_H
 #define	_SYS_FS_ZFS_RLOCK_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -54,20 +52,17 @@ typedef struct rl {
 	uint8_t r_proxy;	/* acting for original range */
 	uint8_t r_write_wanted;	/* writer wants to lock this range */
 	uint8_t r_read_wanted;	/* reader wants to lock this range */
-	unsigned long r_refcnt; /* reference count for cv waits */
 } rl_t;
 
 /*
- * Lock a range (offset, length) as either shared (READER)
- * or exclusive (WRITER or APPEND). APPEND is a special type that
- * is converted to WRITER that specified to lock from the start of the
- * end of file.  zfs_range_lock() returns the range lock structure.
+ * Lock a range (offset, length) as either shared (RL_READER)
+ * or exclusive (RL_WRITER or RL_APPEND).  RL_APPEND is a special type that
+ * is converted to RL_WRITER and locks the range starting at the current
+ * end of file.  Returns the range lock structure.
  */
 rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);
 
-/*
- * Unlock range and destroy range lock structure.
- */
+/* Unlock range and destroy range lock structure. */
 void zfs_range_unlock(rl_t *rl);
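
A usage sketch (read_range_locked is hypothetical): bracket access to a byte
range with a shared lock.

static int
read_range_locked(znode_t *zp, uint64_t off, uint64_t len)
{
	rl_t *rl = zfs_range_lock(zp, off, len, RL_READER);
	int error = 0;

	/* ... safely read file data in [off, off + len) ... */

	zfs_range_unlock(rl);
	return (error);
}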
 
 /*
@@ -77,7 +72,8 @@ void zfs_range_unlock(rl_t *rl);
 void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
 
 /*
- * AVL comparison function used to compare range locks
+ * AVL comparison function used to order range locks
+ * Locks are ordered on the start offset of the range.
  */
 int zfs_range_compare(const void *arg1, const void *arg2);
 
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_sa.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_sa.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_sa.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_sa.h	12 Jun 2012 05:57:27 -0000
@@ -0,0 +1,142 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_ZFS_SA_H
+#define	_SYS_ZFS_SA_H
+
+#ifdef _KERNEL
+#include <sys/list.h>
+#include <sys/dmu.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zil.h>
+
+
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * This is the list of attributes known to the ZPL.  The numeric values
+ * of the actual attributes are not defined by the order of the enums;
+ * they are controlled by the attribute registration mechanism, so two
+ * different file systems could have different numeric values for the
+ * same attribute.  This list is only used for indexing into the table
+ * that holds the actual numeric values.
+ */
+typedef enum zpl_attr {
+	ZPL_ATIME,
+	ZPL_MTIME,
+	ZPL_CTIME,
+	ZPL_CRTIME,
+	ZPL_GEN,
+	ZPL_MODE,
+	ZPL_SIZE,
+	ZPL_PARENT,
+	ZPL_LINKS,
+	ZPL_XATTR,
+	ZPL_RDEV,
+	ZPL_FLAGS,
+	ZPL_UID,
+	ZPL_GID,
+	ZPL_PAD,
+	ZPL_ZNODE_ACL,
+	ZPL_DACL_COUNT,
+	ZPL_SYMLINK,
+	ZPL_SCANSTAMP,
+	ZPL_DACL_ACES,
+	ZPL_END
+} zpl_attr_t;
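
A lookup sketch (mode_attr_for is hypothetical): the enum above is only an
index; the on-disk attribute number comes from the per-filesystem mapping
table (z_attr_table, added to struct zfsvfs elsewhere in this patch), so two
pools may assign different numbers to the same attribute.

static sa_attr_type_t
mode_attr_for(zfsvfs_t *zfsvfs)
{
	/* Resolve the ZPL_MODE slot to this filesystem's registered id. */
	return (zfsvfs->z_attr_table[ZPL_MODE]);
}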
+
+#define	ZFS_OLD_ZNODE_PHYS_SIZE	0x108
+#define	ZFS_SA_BASE_ATTR_SIZE	(ZFS_OLD_ZNODE_PHYS_SIZE - \
+    sizeof (zfs_acl_phys_t))
+
+#define	SA_MODE_OFFSET		0
+#define	SA_SIZE_OFFSET		8
+#define	SA_GEN_OFFSET		16
+#define	SA_UID_OFFSET		24
+#define	SA_GID_OFFSET		32
+#define	SA_PARENT_OFFSET	40
+
+extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1];
+extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1];
+
+/*
+ * This is a deprecated data structure that only exists for
+ * dealing with file systems created prior to ZPL version 5.
+ */
+typedef struct znode_phys {
+	uint64_t zp_atime[2];		/*  0 - last file access time */
+	uint64_t zp_mtime[2];		/* 16 - last file modification time */
+	uint64_t zp_ctime[2];		/* 32 - last file change time */
+	uint64_t zp_crtime[2];		/* 48 - creation time */
+	uint64_t zp_gen;		/* 64 - generation (txg of creation) */
+	uint64_t zp_mode;		/* 72 - file mode bits */
+	uint64_t zp_size;		/* 80 - size of file */
+	uint64_t zp_parent;		/* 88 - directory parent (`..') */
+	uint64_t zp_links;		/* 96 - number of links to file */
+	uint64_t zp_xattr;		/* 104 - DMU object for xattrs */
+	uint64_t zp_rdev;		/* 112 - dev_t for VBLK & VCHR files */
+	uint64_t zp_flags;		/* 120 - persistent flags */
+	uint64_t zp_uid;		/* 128 - file owner */
+	uint64_t zp_gid;		/* 136 - owning group */
+	uint64_t zp_zap;		/* 144 - extra attributes */
+	uint64_t zp_pad[3];		/* 152 - future */
+	zfs_acl_phys_t zp_acl;		/* 176 - 263 ACL */
+	/*
+	 * Data may pad out any remaining bytes in the znode buffer, eg:
+	 *
+	 * |<---------------------- dnode_phys (512) ------------------------>|
+	 * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+	 *			|<---- znode (264) ---->|<---- data (56) ---->|
+	 *
+	 * At present, we use this space for the following:
+	 *  - symbolic links
+	 *  - 32-byte anti-virus scanstamp (regular files only)
+	 */
+} znode_phys_t;
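
A compile-time sketch (C11 _Static_assert assumed): the layout diagram above
pins the legacy znode at 264 bytes, which is exactly ZFS_OLD_ZNODE_PHYS_SIZE
(0x108).

_Static_assert(sizeof (znode_phys_t) == ZFS_OLD_ZNODE_PHYS_SIZE,
    "znode_phys_t layout must not change for pre-v5 file systems");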
+
+#ifdef _KERNEL
+int zfs_sa_readlink(struct znode *, uio_t *);
+void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *);
+void zfs_sa_upgrade(struct sa_handle  *, dmu_tx_t *);
+void zfs_sa_get_scanstamp(struct znode *, xvattr_t *);
+void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *);
+void zfs_sa_upgrade_pre(struct sa_handle *, void *, dmu_tx_t *);
+void zfs_sa_upgrade_post(struct sa_handle *, void *, dmu_tx_t *);
+void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *);
+#endif
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_ZFS_SA_H */
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_stat.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_stat.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_stat.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_stat.h	12 Jun 2012 05:57:27 -0000
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_SYS_FS_ZFS_STAT_H
+#define	_SYS_FS_ZFS_STAT_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/dmu.h>
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * A limited number of zpl level stats are retrievable
+ * with an ioctl.  zfs diff is the current consumer.
+ */
+typedef struct zfs_stat {
+	uint64_t	zs_gen;
+	uint64_t	zs_mode;
+	uint64_t	zs_links;
+	uint64_t	zs_ctime[2];
+} zfs_stat_t;
+
+extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+    char *buf, int len);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_FS_ZFS_STAT_H */
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_vfsops.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_vfsops.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs_vfsops.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_vfsops.h	27 Feb 2010 22:31:45 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_vfsops.h	2 Mar 2017 10:54:19 -0000
@@ -19,18 +19,18 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
  */
 
 #ifndef	_SYS_FS_ZFS_VFSOPS_H
 #define	_SYS_FS_ZFS_VFSOPS_H
 
-#include <sys/isa_defs.h>
-#include <sys/types32.h>
 #include <sys/list.h>
 #include <sys/vfs.h>
 #include <sys/zil.h>
+#include <sys/sa.h>
 #include <sys/rrwlock.h>
 #include <sys/zfs_ioctl.h>
 
@@ -39,6 +39,7 @@ extern "C" {
 #endif
 
 typedef struct zfsvfs zfsvfs_t;
+struct znode;
 
 struct zfsvfs {
 	vfs_t		*z_vfs;		/* generic fs struct */
@@ -63,21 +64,25 @@ struct zfsvfs {
 	int		z_norm;		/* normalization flags */
 	boolean_t	z_atime;	/* enable atimes mount option */
 	boolean_t	z_unmounted;	/* unmounted */
-	rrwlock_t	z_teardown_lock;
+	rrmlock_t	z_teardown_lock;
 	krwlock_t	z_teardown_inactive_lock;
 	list_t		z_all_znodes;	/* all vnodes in the fs */
 	kmutex_t	z_znodes_lock;	/* lock for z_all_znodes */
-	vnode_t		*z_ctldir;	/* .zfs directory pointer */
+	struct zfsctl_root	*z_ctldir;	/* .zfs directory pointer */
 	boolean_t	z_show_ctldir;	/* expose .zfs in the root dir */
 	boolean_t	z_issnap;	/* true if this is a snapshot */
 	boolean_t	z_vscan;	/* virus scan on/off */
 	boolean_t	z_use_fuids;	/* version allows fuids */
 	boolean_t	z_replay;	/* set during ZIL replay */
+	boolean_t	z_use_sa;	/* version allows system attributes */
+	boolean_t	z_use_namecache; /* make use of FreeBSD name cache */
 	uint64_t	z_version;	/* ZPL version */
 	uint64_t	z_shares_dir;	/* hidden shares dir */
 	kmutex_t	z_lock;
 	uint64_t	z_userquota_obj;
 	uint64_t	z_groupquota_obj;
+	uint64_t	z_replay_eof;	/* New end of file - replay only */
+	sa_attr_type_t	*z_attr_table;	/* SA attr mapping->id */
 #define	ZFS_OBJ_MTX_SZ	64
 	kmutex_t	z_hold_mtx[ZFS_OBJ_MTX_SZ];	/* znode hold locks */
 };
@@ -106,7 +111,7 @@ typedef struct zfid_short {
 } zfid_short_t;
 
 /*
- * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes
+ * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes
  * (including the length field).  This makes files under .zfs/snapshot
  * accessible by NFSv3 and NFSv4, but not NFSv2.
  *
@@ -116,37 +121,47 @@ typedef struct zfid_short {
  *	6 bytes		object number (48 bits)
  *	4 bytes		generation number (32 bits)
  *	6 bytes		objset id (48 bits)
- *	4 bytes		currently just zero (32 bits)
+ *	4 bytes[**]	currently just zero (32 bits)
  *
  * We reserve only 48 bits for the object number and objset id, as these are
  * the limits currently defined and imposed by the DMU.
+ *
+ * [*] 20 bytes on FreeBSD to fit into the size of struct fid.
+ * [**] 2 bytes on FreeBSD for the above reason.
  */
 typedef struct zfid_long {
 	zfid_short_t	z_fid;
 	uint8_t		zf_setid[6];		/* obj[i] = obj >> (8 * i) */
-	uint8_t		zf_setgen[4];		/* gen[i] = gen >> (8 * i) */
+	uint8_t		zf_setgen[2];		/* gen[i] = gen >> (8 * i) */
 } zfid_long_t;
 
 #define	SHORT_FID_LEN	(sizeof (zfid_short_t) - sizeof (uint16_t))
 #define	LONG_FID_LEN	(sizeof (zfid_long_t) - sizeof (uint16_t))
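
An encoding sketch for the byte-at-a-time scheme noted in the comments above
(zf_setid[i] holds bits 8i..8i+7 of the objset id); encode_setid is
hypothetical:

static void
encode_setid(zfid_long_t *zlfid, uint64_t objsetid)
{
	size_t i;

	/* Little-endian-style byte packing, independent of host order. */
	for (i = 0; i < sizeof (zlfid->zf_setid); i++)
		zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
}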
 
 extern uint_t zfs_fsyncer_key;
+extern int zfs_super_owner;
 
 extern int zfs_suspend_fs(zfsvfs_t *zfsvfs);
-extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname);
+extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds);
 extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t *valuep);
 extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep);
 extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t quota);
-extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs,
-    boolean_t isgroup, uint64_t fuid);
+extern boolean_t zfs_owner_overquota(zfsvfs_t *zfsvfs, struct znode *,
+    boolean_t isgroup);
+extern boolean_t zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup,
+    uint64_t fuid);
 extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers);
 extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp);
 extern void zfsvfs_free(zfsvfs_t *zfsvfs);
 extern int zfs_check_global_label(const char *dsname, const char *hexsl);
 
+#ifdef _KERNEL
+extern void zfsvfs_update_fromname(const char *oldname, const char *newname);
+#endif
+
 #ifdef	__cplusplus
 }
 #endif
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_znode.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_znode.h,v
retrieving revision 1.7
diff -u -p -r1.7 zfs_znode.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_znode.h	9 Apr 2015 19:47:05 -0000	1.7
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_znode.h	9 Jul 2017 07:35:23 -0000
@@ -19,31 +19,30 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
 #ifndef	_SYS_FS_ZFS_ZNODE_H
 #define	_SYS_FS_ZFS_ZNODE_H
 
 #ifdef _KERNEL
-#include <sys/isa_defs.h>
-#include <sys/types32.h>
-#include <sys/attr.h>
 #include <sys/list.h>
 #include <sys/dmu.h>
+#include <sys/sa.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/rrwlock.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
+#endif
+#include <sys/zfs_acl.h>
+#include <sys/zil.h>
 
 #ifdef __NetBSD__
-#include <miscfs/genfs/genfs.h>
 #include <miscfs/genfs/genfs_node.h>
 #endif	
 
-#endif
-#include <sys/zfs_acl.h>
-#include <sys/zil.h>
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -64,13 +63,17 @@ extern "C" {
 #define	ZFS_AV_QUARANTINED 	0x0000020000000000
 #define	ZFS_AV_MODIFIED 	0x0000040000000000
 #define	ZFS_REPARSE		0x0000080000000000
+#define	ZFS_OFFLINE		0x0000100000000000
+#define	ZFS_SPARSE		0x0000200000000000
 
-#define	ZFS_ATTR_SET(zp, attr, value)	\
+#define	ZFS_ATTR_SET(zp, attr, value, pflags, tx) \
 { \
 	if (value) \
-		zp->z_phys->zp_flags |= attr; \
+		pflags |= attr; \
 	else \
-		zp->z_phys->zp_flags &= ~attr; \
+		pflags &= ~attr; \
+	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zp->z_zfsvfs), \
+	    &pflags, sizeof (pflags), tx)); \
 }
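
A usage sketch (mark_sparse is hypothetical): the macro both edits the
caller's flag word in place and persists it through the SA layer in the
given transaction.

static void
mark_sparse(znode_t *zp, dmu_tx_t *tx)
{
	uint64_t pflags = zp->z_pflags;

	ZFS_ATTR_SET(zp, ZFS_SPARSE, B_TRUE, pflags, tx);
	zp->z_pflags = pflags;	/* keep the cached copy in sync */
}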
 
 /*
@@ -86,6 +89,27 @@ extern "C" {
 #define	ZFS_BONUS_SCANSTAMP	0x80		/* Scanstamp in bonus area */
 #define	ZFS_NO_EXECS_DENIED	0x100		/* exec was given to everyone */
 
+#define	SA_ZPL_ATIME(z)		z->z_attr_table[ZPL_ATIME]
+#define	SA_ZPL_MTIME(z)		z->z_attr_table[ZPL_MTIME]
+#define	SA_ZPL_CTIME(z)		z->z_attr_table[ZPL_CTIME]
+#define	SA_ZPL_CRTIME(z)	z->z_attr_table[ZPL_CRTIME]
+#define	SA_ZPL_GEN(z)		z->z_attr_table[ZPL_GEN]
+#define	SA_ZPL_DACL_ACES(z)	z->z_attr_table[ZPL_DACL_ACES]
+#define	SA_ZPL_XATTR(z)		z->z_attr_table[ZPL_XATTR]
+#define	SA_ZPL_SYMLINK(z)	z->z_attr_table[ZPL_SYMLINK]
+#define	SA_ZPL_RDEV(z)		z->z_attr_table[ZPL_RDEV]
+#define	SA_ZPL_SCANSTAMP(z)	z->z_attr_table[ZPL_SCANSTAMP]
+#define	SA_ZPL_UID(z)		z->z_attr_table[ZPL_UID]
+#define	SA_ZPL_GID(z)		z->z_attr_table[ZPL_GID]
+#define	SA_ZPL_PARENT(z)	z->z_attr_table[ZPL_PARENT]
+#define	SA_ZPL_LINKS(z)		z->z_attr_table[ZPL_LINKS]
+#define	SA_ZPL_MODE(z)		z->z_attr_table[ZPL_MODE]
+#define	SA_ZPL_DACL_COUNT(z)	z->z_attr_table[ZPL_DACL_COUNT]
+#define	SA_ZPL_FLAGS(z)		z->z_attr_table[ZPL_FLAGS]
+#define	SA_ZPL_SIZE(z)		z->z_attr_table[ZPL_SIZE]
+#define	SA_ZPL_ZNODE_ACL(z)	z->z_attr_table[ZPL_ZNODE_ACL]
+#define	SA_ZPL_PAD(z)		z->z_attr_table[ZPL_PAD]
+
 /*
  * Is ID ephemeral?
  */
@@ -94,8 +118,10 @@ extern "C" {
 /*
  * Should we use FUIDs?
  */
-#define	USE_FUIDS(version, os)	(version >= ZPL_VERSION_FUID &&\
+#define	USE_FUIDS(version, os)	(version >= ZPL_VERSION_FUID && \
     spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+#define	USE_SA(version, os) (version >= ZPL_VERSION_SA && \
+    spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA)
 
 #define	MASTER_NODE_OBJ	1
 
@@ -110,8 +136,7 @@ extern "C" {
 #define	ZPL_VERSION_STR		"VERSION"
 #define	ZFS_FUID_TABLES		"FUID"
 #define	ZFS_SHARES_DIR		"SHARES"
-
-#define	ZFS_MAX_BLOCKSIZE	(SPA_MAXBLOCKSIZE)
+#define	ZFS_SA_ATTRS		"SA_ATTRS"
 
 /* Path component length */
 /*
@@ -127,7 +152,7 @@ extern "C" {
  * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in
  * the directory entries.
  */
-#ifndef __NetBSD__
+#ifndef IFTODT
 #define	IFTODT(mode) (((mode) & S_IFMT) >> 12)
 #endif
 
@@ -140,42 +165,6 @@ extern "C" {
 #define	ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
 
 /*
- * This is the persistent portion of the znode.  It is stored
- * in the "bonus buffer" of the file.  Short symbolic links
- * are also stored in the bonus buffer.
- */
-typedef struct znode_phys {
-	uint64_t zp_atime[2];		/*  0 - last file access time */
-	uint64_t zp_mtime[2];		/* 16 - last file modification time */
-	uint64_t zp_ctime[2];		/* 32 - last file change time */
-	uint64_t zp_crtime[2];		/* 48 - creation time */
-	uint64_t zp_gen;		/* 64 - generation (txg of creation) */
-	uint64_t zp_mode;		/* 72 - file mode bits */
-	uint64_t zp_size;		/* 80 - size of file */
-	uint64_t zp_parent;		/* 88 - directory parent (`..') */
-	uint64_t zp_links;		/* 96 - number of links to file */
-	uint64_t zp_xattr;		/* 104 - DMU object for xattrs */
-	uint64_t zp_rdev;		/* 112 - dev_t for VBLK & VCHR files */
-	uint64_t zp_flags;		/* 120 - persistent flags */
-	uint64_t zp_uid;		/* 128 - file owner */
-	uint64_t zp_gid;		/* 136 - owning group */
-	uint64_t zp_zap;		/* 144 - extra attributes */
-	uint64_t zp_pad[3];		/* 152 - future */
-	zfs_acl_phys_t zp_acl;		/* 176 - 263 ACL */
-	/*
-	 * Data may pad out any remaining bytes in the znode buffer, eg:
-	 *
-	 * |<---------------------- dnode_phys (512) ------------------------>|
-	 * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
-	 *			|<---- znode (264) ---->|<---- data (56) ---->|
-	 *
-	 * At present, we use this space for the following:
-	 *  - symbolic links
-	 *  - 32-byte anti-virus scanstamp (regular files only)
-	 */
-} znode_phys_t;
-
-/*
  * Directory entry locks control access to directory entries.
  * They are used to protect creates, deletes, and renames.
  * Each directory znode has a mutex and a list of locked names.
@@ -186,7 +175,6 @@ typedef struct zfs_dirlock {
 	uint32_t	dl_sharecnt;	/* 0 if exclusive, > 0 if shared */
 	uint8_t		dl_namelock;	/* 1 if z_name_lock is NOT held */
 	uint16_t	dl_namesize;	/* set if dl_name was allocated */
-	unsigned long	dl_refcnt;	/* reference count */
 	kcondvar_t	dl_cv;		/* wait for entry to be unlocked */
 	struct znode	*dl_dzp;	/* directory znode */
 	struct zfs_dirlock *dl_next;	/* next in z_dirlocks list */
@@ -194,38 +182,43 @@ typedef struct zfs_dirlock {
 
 typedef struct znode {
 #ifdef __NetBSD__
-	struct genfs_node  z_genfs_node;
-#endif	
+	struct genfs_node z_gnode;
+#endif
 	struct zfsvfs	*z_zfsvfs;
 	vnode_t		*z_vnode;
 	uint64_t	z_id;		/* object ID for this znode */
+#ifdef illumos
 	kmutex_t	z_lock;		/* znode modification lock */
 	krwlock_t	z_parent_lock;	/* parent lock for directories */
 	krwlock_t	z_name_lock;	/* "master" lock for dirent locks */
 	zfs_dirlock_t	*z_dirlocks;	/* directory entry lock list */
+#endif
 	kmutex_t	z_range_lock;	/* protects changes to z_range_avl */
 	avl_tree_t	z_range_avl;	/* avl tree of file range locks */
 	uint8_t		z_unlinked;	/* file has been unlinked */
 	uint8_t		z_atime_dirty;	/* atime needs to be synced */
 	uint8_t		z_zn_prefetch;	/* Prefetch znodes? */
+	uint8_t		z_moved;	/* Has this znode been moved? */
 	uint_t		z_blksz;	/* block size in bytes */
 	uint_t		z_seq;		/* modification sequence number */
 	uint64_t	z_mapcnt;	/* number of pages mapped to file */
-	uint64_t	z_last_itx;	/* last ZIL itx on this znode */
-	uint64_t	z_gen;		/* generation (same as zp_gen) */
+	uint64_t	z_gen;		/* generation (cached) */
+	uint64_t	z_size;		/* file size (cached) */
+	uint64_t	z_atime[2];	/* atime (cached) */
+	uint64_t	z_links;	/* file links (cached) */
+	uint64_t	z_pflags;	/* pflags (cached) */
+	uint64_t	z_uid;		/* uid fuid (cached) */
+	uint64_t	z_gid;		/* gid fuid (cached) */
+	mode_t		z_mode;		/* mode (cached) */
 	uint32_t	z_sync_cnt;	/* synchronous open count */
 	kmutex_t	z_acl_lock;	/* acl data lock */
 	zfs_acl_t	*z_acl_cached;	/* cached acl */
 	list_node_t	z_link_node;	/* all znodes in fs link */
-	/*
-	 * These are dmu managed fields.
-	 */
-	znode_phys_t	*z_phys;	/* pointer to persistent znode */
-	dmu_buf_t	*z_dbuf;	/* buffer containing the z_phys */
-	/*
-	 * NetBSD-specific fields.
-	 */
-	struct lockf	*z_lockf;	/* Head of byte-level lock list. */
+	sa_handle_t	*z_sa_hdl;	/* handle to sa data */
+	boolean_t	z_is_sa;	/* are we native sa? */
+#ifdef __NetBSD__
+	struct lockf	*z_lockf;	/* head of byte-level lock list */
+#endif
 } znode_t;
 
 
@@ -248,27 +241,44 @@ typedef struct znode {
 /*
  * Convert between znode pointers and vnode pointers
  */
+#ifdef DEBUG
+static __inline vnode_t *
+ZTOV(znode_t *zp)
+{
+	vnode_t *vp = zp->z_vnode;
+
+	ASSERT(vp == NULL || vp->v_data == NULL || vp->v_data == zp);
+	return (vp);
+}
+static __inline znode_t *
+VTOZ(vnode_t *vp)
+{
+	znode_t *zp = (znode_t *)vp->v_data;
+
+	ASSERT(zp == NULL || zp->z_vnode == NULL || zp->z_vnode == vp);
+	return (zp);
+}
+#else
 #define	ZTOV(ZP)	((ZP)->z_vnode)
 #define	VTOZ(VP)	((znode_t *)(VP)->v_data)
+#endif
 
-/*
- * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation.
- * ZFS_EXIT() must be called before exitting the vop.
- * ZFS_VERIFY_ZP() verifies the znode is valid.
- */
+/* Called on entry to each ZFS vnode and vfs operation  */
 #define	ZFS_ENTER(zfsvfs) \
 	{ \
-		rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG); \
+		rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
 		if ((zfsvfs)->z_unmounted) { \
 			ZFS_EXIT(zfsvfs); \
 			return (EIO); \
 		} \
 	}
 
-#define	ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
+/* Must be called before exiting the vop */
+#define	ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG)
 
+/* Verifies the znode is valid */
 #define	ZFS_VERIFY_ZP(zp) \
-	if ((zp)->z_dbuf == NULL) { \
+	if ((zp)->z_sa_hdl == NULL) { \
 		ZFS_EXIT((zp)->z_zfsvfs); \
 		return (EIO); \
 	} \
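
A pattern sketch (zfs_example_vop is hypothetical): every vnode operation
brackets its body this way, so a teardown in progress fails the call with
EIO instead of touching freed state.

static int
zfs_example_vop(znode_t *zp)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);	/* returns EIO if already unmounted */
	ZFS_VERIFY_ZP(zp);	/* returns EIO if the znode lost its SA handle */

	/* ... the actual operation ... */

	ZFS_EXIT(zfsvfs);
	return (0);
}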
@@ -286,15 +296,14 @@ typedef struct znode {
 #define	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \
 	mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
 
-/*
- * Macros to encode/decode ZFS stored time values from/to struct timespec
- */
+/* Encode ZFS stored time values from a struct timespec */
 #define	ZFS_TIME_ENCODE(tp, stmp)		\
 {						\
 	(stmp)[0] = (uint64_t)(tp)->tv_sec;	\
 	(stmp)[1] = (uint64_t)(tp)->tv_nsec;	\
 }
 
+/* Decode ZFS stored time values to a struct timespec */
 #define	ZFS_TIME_DECODE(tp, stmp)		\
 {						\
 	(tp)->tv_sec = (time_t)(stmp)[0];		\
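
A round-trip sketch (time_roundtrip is hypothetical): ZFS stores a timespec
as two uint64_t words, seconds then nanoseconds.

static void
time_roundtrip(const struct timespec *in, struct timespec *out)
{
	uint64_t stmp[2];

	ZFS_TIME_ENCODE(in, stmp);
	ZFS_TIME_DECODE(out, stmp);	/* out now equals in */
}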
@@ -310,20 +319,23 @@ typedef struct znode {
 
 #define	ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
 	if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
-		zfs_time_stamper(zp, ACCESSED, NULL)
+		zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE);
 
 extern int	zfs_init_fs(zfsvfs_t *, znode_t **);
 extern void	zfs_set_dataprop(objset_t *);
 extern void	zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *,
     dmu_tx_t *tx);
-extern void	zfs_time_stamper(znode_t *, uint_t, dmu_tx_t *);
-extern void	zfs_time_stamper_locked(znode_t *, uint_t, dmu_tx_t *);
+extern void	zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2],
+    uint64_t [2], boolean_t);
 extern void	zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
 extern int	zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
 extern void	zfs_znode_init(void);
 extern void	zfs_znode_fini(void);
+#ifdef __NetBSD__
 extern int	zfs_loadvnode(struct mount *, struct vnode *,
     const void *, size_t, const void **);
+extern int	zfs_zget_cleaner(zfsvfs_t *, uint64_t, znode_t **);
+#endif
 extern int	zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
 extern int	zfs_rezget(znode_t *);
 extern void	zfs_zinactive(znode_t *);
@@ -331,7 +343,6 @@ extern void	zfs_znode_delete(znode_t *, 
 extern void	zfs_znode_free(znode_t *);
 extern void	zfs_remove_op_tables();
 extern int	zfs_create_op_tables();
-extern int	zfs_sync(vfs_t *vfsp, int flag, cred_t *cr);
 extern dev_t	zfs_cmpldev(uint64_t);
 extern int	zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value);
 extern int	zfs_get_stats(objset_t *os, nvlist_t *nv);
@@ -343,7 +354,8 @@ extern void zfs_log_create(zilog_t *zilo
 extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp,
     vattr_t *vap);
 extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
-    znode_t *dzp, char *name);
+    znode_t *dzp, char *name, uint64_t foid);
+#define	ZFS_NO_OBJECT	0	/* no object id */
 extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *dzp, znode_t *zp, char *name);
 extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
@@ -356,9 +368,11 @@ extern void zfs_log_truncate(zilog_t *zi
     znode_t *zp, uint64_t off, uint64_t len);
 extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp);
+#ifndef ZFS_NO_ACL
 extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
     vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
-extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap);
+#endif
+extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx);
 extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
 extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
 
@@ -369,6 +383,8 @@ extern zil_get_data_t zfs_get_data;
 extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
 extern int zfsfstype;
 
+extern int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf);
+
 #endif /* _KERNEL */
 
 extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len);
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zil.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil.h	27 Feb 2010 22:31:45 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil.h	3 Dec 2016 17:03:48 -0000
@@ -19,10 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #ifndef	_SYS_ZIL_H
 #define	_SYS_ZIL_H
 
@@ -35,6 +38,9 @@
 extern "C" {
 #endif
 
+struct dsl_pool;
+struct dsl_dataset;
+
 /*
  * Intent log format:
  *
@@ -88,7 +94,6 @@ typedef struct zil_chain {
 } zil_chain_t;
 
 #define	ZIL_MIN_BLKSZ	4096ULL
-#define	ZIL_MAX_BLKSZ	SPA_MAXBLOCKSIZE
 
 /*
  * The words of a log block checksum.
@@ -168,18 +173,14 @@ typedef enum zil_create {
 	(txtype) == TX_ACL ||		\
 	(txtype) == TX_WRITE2)
 
-
 /*
  * Format of log records.
  * The fields are carefully defined to allow them to be aligned
  * and sized the same on sparc & intel architectures.
  * Each log record has a common structure at the beginning.
  *
- * Note, lrc_seq holds two different sequence numbers. Whilst in memory
- * it contains the transaction sequence number.  The log record on
- * disk holds the sequence number of all log records which is used to
- * ensure we don't replay the same record.  The two sequence numbers are
- * different because the transactions can now be pushed out of order.
+ * The log record on disk (lrc_seq) holds the sequence number of all log
+ * records which is used to ensure we don't replay the same record.
  */
 typedef struct {			/* common log record header */
 	uint64_t	lrc_txtype;	/* intent log transaction type */
@@ -244,6 +245,12 @@ typedef struct {
  * information needed for replaying the create.  If the
  * file doesn't have any actual ACEs then the lr_aclcnt
  * would be zero.
+ *
+ * After lr_acl_flags come lr_acl_bytes bytes of variable-sized ACEs.
+ * If create is also setting xvattrs, then the ACL data follows the xvattr.
+ * If ACE FUIDs are needed then they will follow the xvattr_t.  Following
+ * the FUIDs will be the domain table information.  The FUIDs for the owner
+ * and group will be in lr_create.  Name follows ACL data.
  */
 typedef struct {
 	lr_create_t	lr_create;	/* common create portion */
@@ -252,13 +259,6 @@ typedef struct {
 	uint64_t	lr_fuidcnt;	/* number of real fuids */
 	uint64_t	lr_acl_bytes;	/* number of bytes in ACL */
 	uint64_t	lr_acl_flags;	/* ACL flags */
-	/* lr_acl_bytes number of variable sized ace's follows */
-	/* if create is also setting xvattr's, then acl data follows xvattr */
-	/* if ACE FUIDs are needed then they will follow the xvattr_t */
-	/* Following the FUIDs will be the domain table information. */
-	/* The FUIDs for the owner and group will be in the lr_create */
-	/* portion of the record. */
-	/* name follows ACL data */
 } lr_acl_create_t;
 
 typedef struct {
@@ -369,7 +369,7 @@ typedef struct itx {
 	void		*itx_private;	/* type-specific opaque data */
 	itx_wr_state_t	itx_wr_state;	/* write state */
 	uint8_t		itx_sync;	/* synchronous transaction */
-	uint64_t	itx_sod;	/* record size on disk */
+	uint64_t	itx_oid;	/* object id */
 	lr_t		itx_lr;		/* common part of log record */
 	/* followed by type-specific part of lr_xx_t and its immediate data */
 } itx_t;
@@ -397,29 +397,35 @@ extern void	zil_replay(objset_t *os, voi
     zil_replay_func_t *replay_func[TX_MAX_TYPE]);
 extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx);
 extern void	zil_destroy(zilog_t *zilog, boolean_t keep_first);
+extern void	zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx);
 extern void	zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx);
 
 extern itx_t	*zil_itx_create(uint64_t txtype, size_t lrsize);
 extern void	zil_itx_destroy(itx_t *itx);
-extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+extern void	zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
 
-extern void	zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid);
+extern void	zil_async_to_sync(zilog_t *zilog, uint64_t oid);
+extern void	zil_commit(zilog_t *zilog, uint64_t oid);
 
 extern int	zil_vdev_offline(const char *osname, void *txarg);
-extern int	zil_claim(const char *osname, void *txarg);
-extern int	zil_check_log_chain(const char *osname, void *txarg);
+extern int	zil_claim(struct dsl_pool *dp,
+    struct dsl_dataset *ds, void *txarg);
+extern int 	zil_check_log_chain(struct dsl_pool *dp,
+    struct dsl_dataset *ds, void *tx);
 extern void	zil_sync(zilog_t *zilog, dmu_tx_t *tx);
-extern void	zil_clean(zilog_t *zilog);
+extern void	zil_clean(zilog_t *zilog, uint64_t synced_txg);
 
-extern int	zil_suspend(zilog_t *zilog);
-extern void	zil_resume(zilog_t *zilog);
+extern int	zil_suspend(const char *osname, void **cookiep);
+extern void	zil_resume(void *cookie);
 
 extern void	zil_add_block(zilog_t *zilog, const blkptr_t *bp);
 extern int	zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp);
 
+extern void	zil_set_sync(zilog_t *zilog, uint64_t syncval);
+
 extern void	zil_set_logbias(zilog_t *zilog, uint64_t slogval);
 
-extern int zil_disable;
+extern int zil_replay_disable;
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil_impl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zil_impl.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil_impl.h	27 Feb 2010 22:31:45 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil_impl.h	3 Dec 2016 17:03:48 -0000
@@ -19,10 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #ifndef	_SYS_ZIL_IMPL_H
 #define	_SYS_ZIL_IMPL_H
 
@@ -39,6 +42,7 @@ extern "C" {
 typedef struct lwb {
 	zilog_t		*lwb_zilog;	/* back pointer to log struct */
 	blkptr_t	lwb_blk;	/* on disk address of this log blk */
+	boolean_t	lwb_slog;	/* lwb_blk is on SLOG device */
 	int		lwb_nused;	/* # used bytes in buffer */
 	int		lwb_sz;		/* size of block and buffer */
 	char		*lwb_buf;	/* log write buffer */
@@ -49,6 +53,27 @@ typedef struct lwb {
 } lwb_t;
 
 /*
+ * Intent log transaction lists
+ */
+typedef struct itxs {
+	list_t		i_sync_list;	/* list of synchronous itxs */
+	avl_tree_t	i_async_tree;	/* tree of foids for async itxs */
+} itxs_t;
+
+typedef struct itxg {
+	kmutex_t	itxg_lock;	/* lock for this structure */
+	uint64_t	itxg_txg;	/* txg for this chain */
+	itxs_t		*itxg_itxs;	/* sync and async itxs */
+} itxg_t;
+
+/* for async nodes we build up an AVL tree of lists of async itxs per file */
+typedef struct itx_async_node {
+	uint64_t	ia_foid;	/* file object id */
+	list_t		ia_list;	/* list of async itxs for this foid */
+	avl_node_t	ia_node;	/* AVL tree linkage */
+} itx_async_node_t;
+
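A lookup sketch (itx_async_lookup is hypothetical; it assumes the tree is
keyed on ia_foid and uses the illumos AVL interface): find the per-file list
of async itxs for an object id.

static itx_async_node_t *
itx_async_lookup(itxs_t *itxs, uint64_t foid)
{
	itx_async_node_t search;

	search.ia_foid = foid;
	/* Returns NULL if no async itxs exist for this file. */
	return (avl_find(&itxs->i_async_tree, &search, NULL));
}
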
+/*
  * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
  * we've touched so we know which ones need a write cache flush at the end.
  */
@@ -70,9 +95,7 @@ struct zilog {
 	objset_t	*zl_os;		/* object set we're logging */
 	zil_get_data_t	*zl_get_data;	/* callback to get object content */
 	zio_t		*zl_root_zio;	/* log writer root zio */
-	uint64_t	zl_itx_seq;	/* next in-core itx sequence number */
 	uint64_t	zl_lr_seq;	/* on-disk log record sequence number */
-	uint64_t	zl_commit_seq;	/* committed upto this number */
 	uint64_t	zl_commit_lr_seq; /* last committed on-disk lr seq */
 	uint64_t	zl_destroy_txg;	/* txg of last zil_destroy() */
 	uint64_t	zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
@@ -86,15 +109,18 @@ struct zilog {
 	uint8_t		zl_stop_sync;	/* for debugging */
 	uint8_t		zl_writer;	/* boolean: write setup in progress */
 	uint8_t		zl_logbias;	/* latency or throughput */
+	uint8_t		zl_sync;	/* synchronous or asynchronous */
 	int		zl_parse_error;	/* last zil_parse() error */
 	uint64_t	zl_parse_blk_seq; /* highest blk seq on last parse */
 	uint64_t	zl_parse_lr_seq; /* highest lr seq on last parse */
 	uint64_t	zl_parse_blk_count; /* number of blocks parsed */
 	uint64_t	zl_parse_lr_count; /* number of log records parsed */
-	list_t		zl_itx_list;	/* in-memory itx list */
-	uint64_t	zl_itx_list_sz;	/* total size of records on list */
+	uint64_t	zl_next_batch;	/* next batch number */
+	uint64_t	zl_com_batch;	/* committed batch number */
+	kcondvar_t	zl_cv_batch[2];	/* batch condition variables */
+	itxg_t		zl_itxg[TXG_SIZE]; /* intent log txg chains */
+	list_t		zl_itx_commit_list; /* itx list to be committed */
 	uint64_t	zl_cur_used;	/* current commit log size used */
-	uint64_t	zl_prev_used;	/* previous commit log size used */
 	list_t		zl_lwb_list;	/* in-flight log write list */
 	kmutex_t	zl_vdev_lock;	/* protects zl_vdev_tree */
 	avl_tree_t	zl_vdev_tree;	/* vdevs to flush in zil_commit() */
@@ -105,6 +131,7 @@ struct zilog {
 	zil_header_t	zl_old_header;	/* debugging aid */
 	uint_t		zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */
 	uint_t		zl_prev_rotor;	/* rotor for zl_prev[] */
+	txg_node_t	zl_dirty_link;	/* protected by dp_dirty_zilogs list */
 };
 
 typedef struct zil_bp_node {
@@ -112,8 +139,10 @@ typedef struct zil_bp_node {
 	avl_node_t	zn_node;
 } zil_bp_node_t;
 
-#define	ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
+#define	ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
     sizeof (lr_write_t))
+#define	ZIL_MAX_COPIED_DATA \
+    ((SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zio.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio.h	27 Feb 2010 22:31:45 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio.h	28 Jun 2017 00:02:45 -0000
@@ -20,17 +20,22 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
  */
 
 #ifndef _ZIO_H
 #define	_ZIO_H
 
+#include <sys/zio_priority.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
+#include <sys/kstat.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio_impl.h>
 
@@ -77,9 +82,23 @@ enum zio_checksum {
 	ZIO_CHECKSUM_FLETCHER_4,
 	ZIO_CHECKSUM_SHA256,
 	ZIO_CHECKSUM_ZILOG2,
+	ZIO_CHECKSUM_NOPARITY,
+#ifndef __NetBSD__
+	ZIO_CHECKSUM_SHA512,
+	ZIO_CHECKSUM_SKEIN,
+#endif
+#ifdef illumos
+	ZIO_CHECKSUM_EDONR,
+#endif
 	ZIO_CHECKSUM_FUNCTIONS
 };
 
+/*
+ * The number of "legacy" checksum functions which can be set on individual
+ * objects.
+ */
+#define	ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
+
 #define	ZIO_CHECKSUM_ON_VALUE	ZIO_CHECKSUM_FLETCHER_4
 #define	ZIO_CHECKSUM_DEFAULT	ZIO_CHECKSUM_ON
 
@@ -105,38 +124,35 @@ enum zio_compress {
 	ZIO_COMPRESS_GZIP_8,
 	ZIO_COMPRESS_GZIP_9,
 	ZIO_COMPRESS_ZLE,
+	ZIO_COMPRESS_LZ4,
 	ZIO_COMPRESS_FUNCTIONS
 };
 
-#define	ZIO_COMPRESS_ON_VALUE	ZIO_COMPRESS_LZJB
-#define	ZIO_COMPRESS_DEFAULT	ZIO_COMPRESS_OFF
+/*
+ * The number of "legacy" compression functions which can be set on individual
+ * objects.
+ */
+#define	ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
+
+/*
+ * The meaning of "compress = on" is selected by the compression features
+ * enabled on a given pool.
+ */
+#define	ZIO_COMPRESS_LEGACY_ON_VALUE	ZIO_COMPRESS_LZJB
+#define	ZIO_COMPRESS_LZ4_ON_VALUE	ZIO_COMPRESS_LZ4
+
+#define	ZIO_COMPRESS_DEFAULT		ZIO_COMPRESS_OFF
 
 #define	BOOTFS_COMPRESS_VALID(compress)			\
 	((compress) == ZIO_COMPRESS_LZJB ||		\
-	((compress) == ZIO_COMPRESS_ON &&		\
-	ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) ||	\
+	(compress) == ZIO_COMPRESS_LZ4 ||		\
+	(compress) == ZIO_COMPRESS_ON ||		\
 	(compress) == ZIO_COMPRESS_OFF)
 
 #define	ZIO_FAILURE_MODE_WAIT		0
 #define	ZIO_FAILURE_MODE_CONTINUE	1
 #define	ZIO_FAILURE_MODE_PANIC		2
 
-#define	ZIO_PRIORITY_NOW		(zio_priority_table[0])
-#define	ZIO_PRIORITY_SYNC_READ		(zio_priority_table[1])
-#define	ZIO_PRIORITY_SYNC_WRITE		(zio_priority_table[2])
-#define	ZIO_PRIORITY_LOG_WRITE		(zio_priority_table[3])
-#define	ZIO_PRIORITY_CACHE_FILL		(zio_priority_table[4])
-#define	ZIO_PRIORITY_AGG		(zio_priority_table[5])
-#define	ZIO_PRIORITY_FREE		(zio_priority_table[6])
-#define	ZIO_PRIORITY_ASYNC_WRITE	(zio_priority_table[7])
-#define	ZIO_PRIORITY_ASYNC_READ		(zio_priority_table[8])
-#define	ZIO_PRIORITY_RESILVER		(zio_priority_table[9])
-#define	ZIO_PRIORITY_SCRUB		(zio_priority_table[10])
-#define	ZIO_PRIORITY_TABLE_SIZE		11
-
-#define	ZIO_PIPELINE_CONTINUE		0x100
-#define	ZIO_PIPELINE_STOP		0x101
-
 enum zio_flag {
 	/*
 	 * Flags inherited by gang, ddt, and vdev children,
@@ -147,20 +163,22 @@ enum zio_flag {
 	ZIO_FLAG_SELF_HEAL	= 1 << 2,
 	ZIO_FLAG_RESILVER	= 1 << 3,
 	ZIO_FLAG_SCRUB		= 1 << 4,
-	ZIO_FLAG_SCRUB_THREAD	= 1 << 5,
+	ZIO_FLAG_SCAN_THREAD	= 1 << 5,
+	ZIO_FLAG_PHYSICAL	= 1 << 6,
 
 #define	ZIO_FLAG_AGG_INHERIT	(ZIO_FLAG_CANFAIL - 1)
 
 	/*
 	 * Flags inherited by ddt, gang, and vdev children.
 	 */
-	ZIO_FLAG_CANFAIL	= 1 << 6,	/* must be first for INHERIT */
-	ZIO_FLAG_SPECULATIVE	= 1 << 7,
-	ZIO_FLAG_CONFIG_WRITER	= 1 << 8,
-	ZIO_FLAG_DONT_RETRY	= 1 << 9,
-	ZIO_FLAG_DONT_CACHE	= 1 << 10,
-	ZIO_FLAG_NODATA		= 1 << 11,
-	ZIO_FLAG_INDUCE_DAMAGE	= 1 << 12,
+	ZIO_FLAG_CANFAIL	= 1 << 7,	/* must be first for INHERIT */
+	ZIO_FLAG_SPECULATIVE	= 1 << 8,
+	ZIO_FLAG_CONFIG_WRITER	= 1 << 9,
+	ZIO_FLAG_DONT_RETRY	= 1 << 10,
+	ZIO_FLAG_DONT_CACHE	= 1 << 11,
+	ZIO_FLAG_NODATA		= 1 << 12,
+	ZIO_FLAG_INDUCE_DAMAGE	= 1 << 13,
+	ZIO_FLAG_IO_ALLOCATING	= 1 << 14,
 
 #define	ZIO_FLAG_DDT_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
 #define	ZIO_FLAG_GANG_INHERIT	(ZIO_FLAG_IO_RETRY - 1)
@@ -168,24 +186,27 @@ enum zio_flag {
 	/*
 	 * Flags inherited by vdev children.
 	 */
-	ZIO_FLAG_IO_RETRY	= 1 << 13,	/* must be first for INHERIT */
-	ZIO_FLAG_PROBE		= 1 << 14,
-	ZIO_FLAG_TRYHARD	= 1 << 15,
-	ZIO_FLAG_OPTIONAL	= 1 << 16,
+	ZIO_FLAG_IO_RETRY	= 1 << 15,	/* must be first for INHERIT */
+	ZIO_FLAG_PROBE		= 1 << 16,
+	ZIO_FLAG_TRYHARD	= 1 << 17,
+	ZIO_FLAG_OPTIONAL	= 1 << 18,
 
 #define	ZIO_FLAG_VDEV_INHERIT	(ZIO_FLAG_DONT_QUEUE - 1)
 
 	/*
 	 * Flags not inherited by any children.
 	 */
-	ZIO_FLAG_DONT_QUEUE	= 1 << 17,	/* must be first for INHERIT */
-	ZIO_FLAG_DONT_PROPAGATE	= 1 << 18,
-	ZIO_FLAG_IO_BYPASS	= 1 << 19,
-	ZIO_FLAG_IO_REWRITE	= 1 << 20,
-	ZIO_FLAG_RAW		= 1 << 21,
-	ZIO_FLAG_GANG_CHILD	= 1 << 22,
-	ZIO_FLAG_DDT_CHILD	= 1 << 23,
-	ZIO_FLAG_GODFATHER	= 1 << 24
+	ZIO_FLAG_DONT_QUEUE	= 1 << 19,	/* must be first for INHERIT */
+	ZIO_FLAG_DONT_PROPAGATE	= 1 << 20,
+	ZIO_FLAG_IO_BYPASS	= 1 << 21,
+	ZIO_FLAG_IO_REWRITE	= 1 << 22,
+	ZIO_FLAG_RAW		= 1 << 23,
+	ZIO_FLAG_GANG_CHILD	= 1 << 24,
+	ZIO_FLAG_DDT_CHILD	= 1 << 25,
+	ZIO_FLAG_GODFATHER	= 1 << 26,
+	ZIO_FLAG_NOPWRITE	= 1 << 27,
+	ZIO_FLAG_REEXECUTED	= 1 << 28,
+	ZIO_FLAG_DELEGATED	= 1 << 29,
 };
 
 #define	ZIO_FLAG_MUSTSUCCEED		0
@@ -217,16 +238,17 @@ enum zio_wait_type {
 };
 
 /*
- * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent
- * graveyard) to indicate checksum errors and fragmentation.
+ * We'll take the numbers 122 and 123 to indicate checksum errors and
+ * fragmentation.  These don't collide with any errno values, as they
+ * are greater than ELAST.
  */
-#define	ECKSUM	EBADE
-#define	EFRAGS	EBADR
+#define	ECKSUM	122
+#define	EFRAGS	123
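
A compile-time sketch (C11; ELAST comes from the BSD <errno.h>) of the
non-collision claim above:

#include <errno.h>

_Static_assert(ECKSUM > ELAST && EFRAGS > ELAST,
    "ECKSUM/EFRAGS must stay clear of real errno values");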
 
 typedef void zio_done_func_t(zio_t *zio);
 
-extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
-extern char *zio_type_name[ZIO_TYPES];
+extern boolean_t zio_dva_throttle_enabled;
+extern const char *zio_type_name[ZIO_TYPES];
 
 /*
  * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
@@ -237,20 +259,21 @@ extern char *zio_type_name[ZIO_TYPES];
  * Root blocks (objset_phys_t) are object 0, level -1:  <objset, 0, -1, 0>.
  * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
  * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
+ * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
  *
  * Note: this structure is called a bookmark because its original purpose
  * was to remember where to resume a pool-wide traverse.
  *
- * Note: this structure is passed between userland and the kernel.
- * Therefore it must not change size or alignment between 32/64 bit
- * compilation options.
+ * Note: this structure is passed between userland and the kernel, and is
+ * stored on disk (by virtue of being incorporated into other on-disk
+ * structures, e.g. dsl_scan_phys_t).
  */
-typedef struct zbookmark {
+typedef struct zbookmark_phys {
 	uint64_t	zb_objset;
 	uint64_t	zb_object;
 	int64_t		zb_level;
 	uint64_t	zb_blkid;
-} zbookmark_t;
+} zbookmark_phys_t;
 
 #define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
 {                                                       \
@@ -269,14 +292,26 @@ typedef struct zbookmark {
 #define	ZB_ZIL_OBJECT		(0ULL)
 #define	ZB_ZIL_LEVEL		(-2LL)
 
+#define	ZB_DNODE_LEVEL		(-3LL)
+#define	ZB_DNODE_BLKID		(0ULL)
+
+#define	ZB_IS_ZERO(zb)						\
+	((zb)->zb_objset == 0 && (zb)->zb_object == 0 &&	\
+	(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
+#define	ZB_IS_ROOT(zb)				\
+	((zb)->zb_object == ZB_ROOT_OBJECT &&	\
+	(zb)->zb_level == ZB_ROOT_LEVEL &&	\
+	(zb)->zb_blkid == ZB_ROOT_BLKID)
+
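A construction sketch (make_zil_bookmark is hypothetical): a ZIL block
bookmark per the encoding above, <objset, 0, -2, ZIL sequence number>.

static void
make_zil_bookmark(zbookmark_phys_t *zb, uint64_t objset, uint64_t seq)
{
	zb->zb_objset = objset;
	zb->zb_object = ZB_ZIL_OBJECT;
	zb->zb_level = ZB_ZIL_LEVEL;
	zb->zb_blkid = seq;
}
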
 typedef struct zio_prop {
 	enum zio_checksum	zp_checksum;
 	enum zio_compress	zp_compress;
 	dmu_object_type_t	zp_type;
 	uint8_t			zp_level;
 	uint8_t			zp_copies;
-	uint8_t			zp_dedup;
-	uint8_t			zp_dedup_verify;
+	boolean_t		zp_dedup;
+	boolean_t		zp_dedup_verify;
+	boolean_t		zp_nopwrite;
 } zio_prop_t;
 
 typedef struct zio_cksum_report zio_cksum_report_t;
@@ -286,6 +321,7 @@ typedef void zio_cksum_finish_f(zio_cksu
 typedef void zio_cksum_free_f(void *cbdata, size_t size);
 
 struct zio_bad_cksum;				/* defined in zio_checksum.h */
+struct dnode_phys;
 
 struct zio_cksum_report {
 	struct zio_cksum_report *zcr_next;
@@ -342,6 +378,11 @@ typedef int zio_pipe_stage_t(zio_t *zio)
 #define	ZIO_REEXECUTE_NOW	0x01
 #define	ZIO_REEXECUTE_SUSPEND	0x02
 
+typedef struct zio_alloc_list {
+	list_t  zal_list;
+	uint64_t zal_size;
+} zio_alloc_list_t;
+
 typedef struct zio_link {
 	zio_t		*zl_parent;
 	zio_t		*zl_child;
@@ -349,14 +390,47 @@ typedef struct zio_link {
 	list_node_t	zl_child_node;
 } zio_link_t;
 
+/*
+ * Used for TRIM kstat.
+ */
+typedef struct zio_trim_stats {
+	/*
+	 * Number of bytes successfully TRIMmed.
+	 */
+	kstat_named_t bytes;
+
+	/*
+	 * Number of successful TRIM requests.
+	 */
+	kstat_named_t success;
+
+	/*
+	 * Number of TRIM requests that failed because TRIM is not
+	 * supported.
+	 */
+	kstat_named_t unsupported;
+
+	/*
+	 * Number of TRIM requests that failed for other reasons.
+	 */
+	kstat_named_t failed;
+} zio_trim_stats_t;
+
+extern zio_trim_stats_t zio_trim_stats;
+
+#define ZIO_TRIM_STAT_INCR(stat, val) \
+	atomic_add_64(&zio_trim_stats.stat.value.ui64, (val));
+#define ZIO_TRIM_STAT_BUMP(stat) \
+	ZIO_TRIM_STAT_INCR(stat, 1);
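
A usage sketch (trim_done is hypothetical): account one successful TRIM of
'size' bytes.

static void
trim_done(uint64_t size)
{
	ZIO_TRIM_STAT_BUMP(success);
	ZIO_TRIM_STAT_INCR(bytes, size);
}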
+
 struct zio {
 	/* Core information about this I/O */
-	zbookmark_t	io_bookmark;
+	zbookmark_phys_t	io_bookmark;
 	zio_prop_t	io_prop;
 	zio_type_t	io_type;
 	enum zio_child	io_child_type;
 	int		io_cmd;
-	uint8_t		io_priority;
+	zio_priority_t	io_priority;
 	uint8_t		io_reexecute;
 	uint8_t		io_state[ZIO_WAIT_TYPES];
 	uint64_t	io_txg;
@@ -366,12 +440,13 @@ struct zio {
 	blkptr_t	io_bp_copy;
 	list_t		io_parent_list;
 	list_t		io_child_list;
-	zio_link_t	*io_walk_link;
 	zio_t		*io_logical;
 	zio_transform_t *io_transform_stack;
 
 	/* Callback info */
 	zio_done_func_t	*io_ready;
+	zio_done_func_t	*io_children_ready;
+	zio_done_func_t	*io_physdone;
 	zio_done_func_t	*io_done;
 	void		*io_private;
 	int64_t		io_prev_space_delta;	/* DMU private */
@@ -389,10 +464,13 @@ struct zio {
 	const zio_vsd_ops_t *io_vsd_ops;
 
 	uint64_t	io_offset;
-	uint64_t	io_deadline;
+	hrtime_t	io_timestamp;
+	hrtime_t	io_queued_timestamp;
+	hrtime_t	io_target_timestamp;
+	avl_node_t	io_queue_node;
 	avl_node_t	io_offset_node;
-	avl_node_t	io_deadline_node;
-	avl_tree_t	*io_vdev_tree;
+	avl_node_t	io_alloc_node;
+	zio_alloc_list_t 	io_alloc_list;
 
 	/* Internal pipeline state */
 	enum zio_flag	io_flags;
@@ -401,10 +479,12 @@ struct zio {
 	enum zio_flag	io_orig_flags;
 	enum zio_stage	io_orig_stage;
 	enum zio_stage	io_orig_pipeline;
+	enum zio_stage	io_pipeline_trace;
 	int		io_error;
 	int		io_child_error[ZIO_CHILD_TYPES];
 	uint64_t	io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
 	uint64_t	io_child_count;
+	uint64_t	io_phys_children;
 	uint64_t	io_parent_count;
 	uint64_t	*io_stall;
 	zio_t		*io_gang_leader;
@@ -417,81 +497,103 @@ struct zio {
 	/* FMA state */
 	zio_cksum_report_t *io_cksum_report;
 	uint64_t	io_ena;
+
+	/* Taskq dispatching state */
+	taskq_ent_t	io_tqent;
+
+	avl_node_t	io_trim_node;
+	list_node_t	io_trim_link;
 };
 
+extern int zio_timestamp_compare(const void *, const void *);
+
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
-    zio_done_func_t *done, void *private, enum zio_flag flags);
+    zio_done_func_t *done, void *priv, enum zio_flag flags);
 
 extern zio_t *zio_root(spa_t *spa,
-    zio_done_func_t *done, void *private, enum zio_flag flags);
+    zio_done_func_t *done, void *priv, enum zio_flag flags);
 
 extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
-    uint64_t size, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, const zbookmark_t *zb);
+    uint64_t size, zio_done_func_t *done, void *priv,
+    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     void *data, uint64_t size, const zio_prop_t *zp,
-    zio_done_func_t *ready, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, const zbookmark_t *zb);
+    zio_done_func_t *ready, zio_done_func_t *children_ready,
+    zio_done_func_t *physdone, zio_done_func_t *done,
+    void *priv, zio_priority_t priority, enum zio_flag flags,
+    const zbookmark_phys_t *zb);
 
 extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
-    void *data, uint64_t size, zio_done_func_t *done, void *private,
-    int priority, enum zio_flag flags, zbookmark_t *zb);
+    void *data, uint64_t size, zio_done_func_t *done, void *priv,
+    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
 
-extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies);
+extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
+    boolean_t nopwrite);
 
 extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
 
 extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp,
-    zio_done_func_t *done, void *private, enum zio_flag flags);
+    zio_done_func_t *done, void *priv, enum zio_flag flags);
 
 extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
-    zio_done_func_t *done, void *private, int priority, enum zio_flag flags);
+    uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv,
+    zio_priority_t priority, enum zio_flag flags);
 
 extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, void *data, int checksum,
-    zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
-    boolean_t labels);
+    zio_done_func_t *done, void *priv, zio_priority_t priority,
+    enum zio_flag flags, boolean_t labels);
 
 extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
     uint64_t size, void *data, int checksum,
-    zio_done_func_t *done, void *private, int priority, enum zio_flag flags,
-    boolean_t labels);
+    zio_done_func_t *done, void *priv, zio_priority_t priority,
+    enum zio_flag flags, boolean_t labels);
 
 extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
-    const blkptr_t *bp, enum zio_flag flags);
+    const blkptr_t *bp, uint64_t size, enum zio_flag flags);
 
 extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
-    blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+    blkptr_t *old_bp, uint64_t size, boolean_t *slog);
 extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,
+    uint64_t size);
 extern void zio_shrink(zio_t *zio, uint64_t size);
 
 extern int zio_wait(zio_t *zio);
 extern void zio_nowait(zio_t *zio);
 extern void zio_execute(zio_t *zio);
 extern void zio_interrupt(zio_t *zio);
+extern void zio_delay_init(zio_t *zio);
+extern void zio_delay_interrupt(zio_t *zio);
 
-extern zio_t *zio_walk_parents(zio_t *cio);
-extern zio_t *zio_walk_children(zio_t *pio);
+extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
+extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
 extern zio_t *zio_unique_parent(zio_t *cio);
 extern void zio_add_child(zio_t *pio, zio_t *cio);
 
 extern void *zio_buf_alloc(size_t size);
+extern void *zio_buf_alloc_nowait(size_t size);
 extern void zio_buf_free(void *buf, size_t size);
 extern void *zio_data_buf_alloc(size_t size);
 extern void zio_data_buf_free(void *buf, size_t size);
 
+extern void zio_push_transform(zio_t *zio, void *data, uint64_t size,
+    uint64_t bufsize, zio_transform_func_t *transform);
+extern void zio_pop_transforms(zio_t *zio);
+
 extern void zio_resubmit_stage_async(void *);
 
 extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
-    uint64_t offset, void *data, uint64_t size, int type, int priority,
-    enum zio_flag flags, zio_done_func_t *done, void *private);
+    uint64_t offset, void *data, uint64_t size, int type,
+    zio_priority_t priority, enum zio_flag flags,
+    zio_done_func_t *done, void *priv);
 
 extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
-    void *data, uint64_t size, int type, int priority,
-    enum zio_flag flags, zio_done_func_t *done, void *private);
+    void *data, uint64_t size, int type, zio_priority_t priority,
+    enum zio_flag flags, zio_done_func_t *done, void *priv);
 
 extern void zio_vdev_io_bypass(zio_t *zio);
 extern void zio_vdev_io_reissue(zio_t *zio);
@@ -504,8 +606,8 @@ extern enum zio_checksum zio_checksum_se
     enum zio_checksum parent);
 extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
     enum zio_checksum child, enum zio_checksum parent);
-extern enum zio_compress zio_compress_select(enum zio_compress child,
-    enum zio_compress parent);
+extern enum zio_compress zio_compress_select(spa_t *spa,
+    enum zio_compress child, enum zio_compress parent);
 
 extern void zio_suspend(spa_t *spa, zio_t *zio);
 extern int zio_resume(spa_t *spa);
@@ -532,6 +634,7 @@ extern int zio_handle_fault_injection(zi
 extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
 extern int zio_handle_label_injection(zio_t *zio, int error);
 extern void zio_handle_ignored_writes(zio_t *zio);
+extern hrtime_t zio_handle_io_delay(zio_t *zio);
 
 /*
  * Checksum ereport functions
@@ -552,6 +655,12 @@ extern void zfs_ereport_post_checksum(sp
 /* Called from spa_sync(), but primarily an injection handler */
 extern void spa_handle_ignored_writes(spa_t *spa);
 
+/* zbookmark_phys functions */
+boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
+    const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
+int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
+    uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
+
 #ifdef	__cplusplus
 }
 #endif
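
The removal of io_walk_link from struct zio changes the parent/child walker API above: the iteration cursor now lives with the caller, so independent walks over the same zio cannot interfere. A minimal sketch of the new pattern, assuming only the prototypes declared above:

	static void
	example_walk_parents(zio_t *cio)
	{
		zio_link_t *zl = NULL;	/* cursor is owned by the caller */
		zio_t *pio;

		/* each call advances zl; a NULL return ends the walk */
		while ((pio = zio_walk_parents(cio, &zl)) != NULL) {
			/* ... examine or notify the parent zio ... */
		}
	}
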
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_checksum.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_checksum.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zio_checksum.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_checksum.h	27 Feb 2010 22:31:46 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_checksum.h	28 Jun 2017 00:01:51 -0000
@@ -19,15 +19,16 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright Saso Kiselkov 2013, All rights reserved.
  */
 
 #ifndef _SYS_ZIO_CHECKSUM_H
 #define	_SYS_ZIO_CHECKSUM_H
 
 #include <sys/zio.h>
-#include <zfs_fletcher.h>
+#include <zfeature_common.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -36,17 +37,34 @@ extern "C" {
 /*
  * Signature for checksum functions.
  */
-typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
+typedef void zio_checksum_t(const void *data, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp);
+typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
+typedef void zio_checksum_tmpl_free_t(void *ctx_template);
+
+typedef enum zio_checksum_flags {
+	/* Strong enough for metadata? */
+	ZCHECKSUM_FLAG_METADATA = (1 << 1),
+	/* ZIO embedded checksum */
+	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
+	/* Strong enough for dedup (without verification)? */
+	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
+	/* Uses salt value */
+	ZCHECKSUM_FLAG_SALTED = (1 << 4),
+	/* Strong enough for nopwrite? */
+	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
+} zio_checksum_flags_t;
 
 /*
  * Information about each checksum function.
  */
 typedef struct zio_checksum_info {
-	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
-	int		ci_correctable;	/* number of correctable bits	*/
-	int		ci_eck;		/* uses zio embedded checksum? */
-	int		ci_dedup;	/* strong enough for dedup? */
-	char		*ci_name;	/* descriptive name */
+	/* checksum function for each byteorder */
+	zio_checksum_t			*ci_func[2];
+	zio_checksum_tmpl_init_t	*ci_tmpl_init;
+	zio_checksum_tmpl_free_t	*ci_tmpl_free;
+	zio_checksum_flags_t		ci_flags;
+	char				*ci_name;	/* descriptive name */
 } zio_checksum_info_t;
 
 typedef struct zio_bad_cksum {
@@ -64,10 +82,35 @@ extern zio_checksum_info_t zio_checksum_
  * Checksum routines.
  */
 extern zio_checksum_t zio_checksum_SHA256;
+#ifndef __NetBSD__
+extern zio_checksum_t zio_checksum_SHA512_native;
+extern zio_checksum_t zio_checksum_SHA512_byteswap;
+
+/* Skein */
+extern zio_checksum_t zio_checksum_skein_native;
+extern zio_checksum_t zio_checksum_skein_byteswap;
+extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init;
+extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free;
+#endif
+
+#ifdef illumos
+/* Edon-R */
+extern zio_checksum_t zio_checksum_edonr_native;
+extern zio_checksum_t zio_checksum_edonr_byteswap;
+extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init;
+extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free;
+#endif
 
+extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
+    void *, uint64_t, uint64_t, zio_bad_cksum_t *);
 extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
     void *data, uint64_t size);
+extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
+    void *, uint64_t, uint64_t, zio_bad_cksum_t *);
 extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
+extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
+extern void zio_checksum_templates_free(spa_t *spa);
+extern spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum);
 
 #ifdef	__cplusplus
 }
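
Under the reworked zio_checksum_info_t above, the old ci_correctable/ci_eck/ci_dedup integers collapse into ci_flags, and salted or context-heavy checksums gain template init/free hooks. A hedged sketch of what one table entry might look like in this layout (illustrative values only; the authoritative table lives in zio_checksum.c):

	/* SHA-256 needs no context template, so both hooks are NULL */
	static zio_checksum_info_t sha256_entry = {
		{ zio_checksum_SHA256, zio_checksum_SHA256 },	/* native, byteswap */
		NULL,					/* ci_tmpl_init */
		NULL,					/* ci_tmpl_free */
		ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
		    ZCHECKSUM_FLAG_NOPWRITE,		/* ci_flags */
		"sha256"				/* ci_name */
	};
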
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_compress.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_compress.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zio_compress.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_compress.h	27 Feb 2010 22:31:46 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_compress.h	25 Apr 2017 16:39:46 -0000
@@ -23,6 +23,9 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ */
 
 #ifndef _SYS_ZIO_COMPRESS_H
 #define	_SYS_ZIO_COMPRESS_H
@@ -33,11 +36,10 @@
 extern "C" {
 #endif
 
-/*
- * Common signature for all zio compress/decompress functions.
- */
+/* Common signature for all zio compress functions. */
 typedef size_t zio_compress_func_t(void *src, void *dst,
     size_t s_len, size_t d_len, int);
+/* Common signature for all zio decompress functions. */
 typedef int zio_decompress_func_t(void *src, void *dst,
     size_t s_len, size_t d_len, int);
 
@@ -68,6 +70,17 @@ extern size_t zle_compress(void *src, vo
     int level);
 extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
     int level);
+#ifdef __FreeBSD__
+extern void lz4_init(void);
+extern void lz4_fini(void);
+#else
+#define lz4_init() /* nothing */
+#define lz4_fini() /* nothing */
+#endif
+extern size_t lz4_compress(void *src, void *dst, size_t s_len, size_t d_len,
+    int level);
+extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len,
+    int level);
 
 /*
  * Compress and decompress data if necessary.
@@ -77,6 +90,12 @@ extern size_t zio_compress_data(enum zio
 extern int zio_decompress_data(enum zio_compress c, void *src, void *dst,
     size_t s_len, size_t d_len);
 
+/*
+ * Module lifetime management.
+ */
+extern void zio_compress_init(void);
+extern void zio_compress_fini(void);
+
 #ifdef	__cplusplus
 }
 #endif
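
zio_compress_init()/zio_compress_fini() give the compression code an explicit lifetime, wrapping the lz4 table setup that is a no-op outside FreeBSD per the stubs above. A plausible call pattern from platform module-event code, as a sketch (the handler name and reduced signature are hypothetical):

	static int
	zfs_modevent_sketch(int what)
	{
		switch (what) {
		case MOD_LOAD:
			zio_compress_init();	/* lz4_init() where required */
			break;
		case MOD_UNLOAD:
			zio_compress_fini();	/* lz4_fini() where required */
			break;
		}
		return (0);
	}
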
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_impl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zio_impl.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_impl.h	27 Feb 2010 22:31:46 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_impl.h	10 Oct 2016 11:09:53 -0000
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
 #ifndef _ZIO_IMPL_H
 #define	_ZIO_IMPL_H
 
@@ -34,39 +38,107 @@ extern "C" {
 #endif
 
 /*
+ * XXX -- Describe ZFS I/O pipeline here. Fill in as needed.
+ *
+ * The ZFS I/O pipeline is comprised of various stages which are defined
+ * in the zio_stage enum below. The individual stages are used to construct
+ * these basic I/O operations: Read, Write, Free, Claim, and Ioctl.
+ *
+ * I/O operations: (XXX - provide detail for each of the operations)
+ *
+ * Read:
+ * Write:
+ * Free:
+ * Claim:
+ * Ioctl:
+ *
+ * Although the most common pipelines are used by the basic I/O operations
+ * above, there are some helper pipelines (one could consider them
+ * sub-pipelines) which are used internally by the ZIO module and are
+ * explained below:
+ *
+ * Interlock Pipeline:
+ * The interlock pipeline is the most basic pipeline and is used by all
+ * of the I/O operations. The interlock pipeline does not perform any I/O
+ * and is used to coordinate the dependencies between I/Os that are being
+ * issued (i.e. the parent/child relationship).
+ *
+ * Vdev child Pipeline:
+ * The vdev child pipeline is responsible for performing the physical I/O.
+ * It is in this pipeline that I/Os are queued and possibly cached.
+ *
+ * In addition to performing I/O, the pipeline is also responsible for
+ * data transformations. The transformations performed are based on the
+ * specific properties that the user may have selected; they modify the
+ * behavior of the pipeline. Examples of supported transformations are
+ * compression, dedup, and nop writes. Transformations will either modify
+ * the data or the pipeline. This list below further describes each of
+ * the supported transformations:
+ *
+ * Compression:
+ * ZFS supports four different flavors of compression -- gzip, lzjb, lz4,
+ * and zle. Compression occurs as part of the write pipeline and is
+ * performed in the ZIO_STAGE_WRITE_COMPRESS stage.
+ *
+ * Dedup:
+ * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and
+ * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing
+ * read pipeline if the dedup bit is set on the block pointer.
+ * Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage,
+ * which is added to a write pipeline if a user has enabled dedup on that
+ * particular dataset.
+ *
+ * NOP Write:
+ * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage
+ * and is added to an existing write pipeline if a cryptographically
+ * secure checksum (i.e. SHA256) is enabled and compression is turned on.
+ * The NOP write stage will compare the checksums of the current data
+ * on-disk (level-0 blocks only) and the data that is currently being written.
+ * If the checksum values are identical, the pipeline is converted to
+ * an interlock pipeline, skipping block allocation and bypassing the
+ * physical I/O.  The nop write feature can handle writes in either
+ * syncing or open context (i.e. zil writes) and as a result is mutually
+ * exclusive with dedup.
+ */
+
+/*
  * zio pipeline stage definitions
  */
 enum zio_stage {
 	ZIO_STAGE_OPEN			= 1 << 0,	/* RWFCI */
 
 	ZIO_STAGE_READ_BP_INIT		= 1 << 1,	/* R---- */
-	ZIO_STAGE_FREE_BP_INIT		= 1 << 2,	/* --F-- */
-	ZIO_STAGE_ISSUE_ASYNC		= 1 << 3,	/* RWF-- */
-	ZIO_STAGE_WRITE_BP_INIT		= 1 << 4,	/* -W--- */
+	ZIO_STAGE_WRITE_BP_INIT		= 1 << 2,	/* -W--- */
+	ZIO_STAGE_FREE_BP_INIT		= 1 << 3,	/* --F-- */
+	ZIO_STAGE_ISSUE_ASYNC		= 1 << 4,	/* RWF-- */
+	ZIO_STAGE_WRITE_COMPRESS	= 1 << 5,	/* -W--- */
 
-	ZIO_STAGE_CHECKSUM_GENERATE	= 1 << 5,	/* -W--- */
+	ZIO_STAGE_CHECKSUM_GENERATE	= 1 << 6,	/* -W--- */
 
-	ZIO_STAGE_DDT_READ_START	= 1 << 6,	/* R---- */
-	ZIO_STAGE_DDT_READ_DONE		= 1 << 7,	/* R---- */
-	ZIO_STAGE_DDT_WRITE		= 1 << 8,	/* -W--- */
-	ZIO_STAGE_DDT_FREE		= 1 << 9,	/* --F-- */
+	ZIO_STAGE_NOP_WRITE		= 1 << 7,	/* -W--- */
 
-	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 10,	/* RWFC- */
-	ZIO_STAGE_GANG_ISSUE		= 1 << 11,	/* RWFC- */
+	ZIO_STAGE_DDT_READ_START	= 1 << 8,	/* R---- */
+	ZIO_STAGE_DDT_READ_DONE		= 1 << 9,	/* R---- */
+	ZIO_STAGE_DDT_WRITE		= 1 << 10,	/* -W--- */
+	ZIO_STAGE_DDT_FREE		= 1 << 11,	/* --F-- */
 
-	ZIO_STAGE_DVA_ALLOCATE		= 1 << 12,	/* -W--- */
-	ZIO_STAGE_DVA_FREE		= 1 << 13,	/* --F-- */
-	ZIO_STAGE_DVA_CLAIM		= 1 << 14,	/* ---C- */
+	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 12,	/* RWFC- */
+	ZIO_STAGE_GANG_ISSUE		= 1 << 13,	/* RWFC- */
 
-	ZIO_STAGE_READY			= 1 << 15,	/* RWFCI */
+	ZIO_STAGE_DVA_THROTTLE		= 1 << 14,	/* -W--- */
+	ZIO_STAGE_DVA_ALLOCATE		= 1 << 15,	/* -W--- */
+	ZIO_STAGE_DVA_FREE		= 1 << 16,	/* --F-- */
+	ZIO_STAGE_DVA_CLAIM		= 1 << 17,	/* ---C- */
 
-	ZIO_STAGE_VDEV_IO_START		= 1 << 16,	/* RW--I */
-	ZIO_STAGE_VDEV_IO_DONE		= 1 << 17,	/* RW--I */
-	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 18,	/* RW--I */
+	ZIO_STAGE_READY			= 1 << 18,	/* RWFCI */
 
-	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 19,	/* R---- */
+	ZIO_STAGE_VDEV_IO_START		= 1 << 19,	/* RWF-I */
+	ZIO_STAGE_VDEV_IO_DONE		= 1 << 20,	/* RWF-I */
+	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 21,	/* RWF-I */
 
-	ZIO_STAGE_DONE			= 1 << 20	/* RWFCI */
+	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 22,	/* R---- */
+
+	ZIO_STAGE_DONE			= 1 << 23	/* RWFCI */
 };
 
 #define	ZIO_INTERLOCK_STAGES			\
@@ -117,22 +189,27 @@ enum zio_stage {
 
 #define	ZIO_REWRITE_PIPELINE			\
 	(ZIO_WRITE_COMMON_STAGES |		\
+	ZIO_STAGE_WRITE_COMPRESS |		\
 	ZIO_STAGE_WRITE_BP_INIT)
 
 #define	ZIO_WRITE_PIPELINE			\
 	(ZIO_WRITE_COMMON_STAGES |		\
 	ZIO_STAGE_WRITE_BP_INIT |		\
+	ZIO_STAGE_WRITE_COMPRESS |		\
+	ZIO_STAGE_DVA_THROTTLE |		\
 	ZIO_STAGE_DVA_ALLOCATE)
 
 #define	ZIO_DDT_CHILD_WRITE_PIPELINE		\
 	(ZIO_INTERLOCK_STAGES |			\
 	ZIO_VDEV_IO_STAGES |			\
+	ZIO_STAGE_DVA_THROTTLE |		\
 	ZIO_STAGE_DVA_ALLOCATE)
 
 #define	ZIO_DDT_WRITE_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
-	ZIO_STAGE_ISSUE_ASYNC |			\
 	ZIO_STAGE_WRITE_BP_INIT |		\
+	ZIO_STAGE_ISSUE_ASYNC |			\
+	ZIO_STAGE_WRITE_COMPRESS |		\
 	ZIO_STAGE_CHECKSUM_GENERATE |		\
 	ZIO_STAGE_DDT_WRITE)
 
@@ -145,6 +222,10 @@ enum zio_stage {
 	ZIO_STAGE_FREE_BP_INIT |		\
 	ZIO_STAGE_DVA_FREE)
 
+#define	ZIO_FREE_PHYS_PIPELINE			\
+	(ZIO_INTERLOCK_STAGES |			\
+	ZIO_VDEV_IO_STAGES)
+
 #define	ZIO_DDT_FREE_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
 	ZIO_STAGE_FREE_BP_INIT |		\
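
Because every pipeline above is just a bitwise OR of zio_stage values, advancing an I/O means shifting a single-bit cursor until it lands on a stage the pipeline contains. A simplified sketch of that dispatch step (the real loop in zio_execute() also handles stalls and asynchronous hand-off):

	enum zio_stage stage = zio->io_stage;

	/*
	 * Find the next stage present in this zio's pipeline.
	 * ZIO_STAGE_DONE is part of ZIO_INTERLOCK_STAGES and therefore
	 * of every pipeline, so the loop always terminates.
	 */
	do {
		stage <<= 1;
	} while ((stage & zio->io_pipeline) == 0);
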
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_priority.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_priority.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_priority.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_priority.h	30 Aug 2015 02:17:54 -0000
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+#ifndef	_ZIO_PRIORITY_H
+#define	_ZIO_PRIORITY_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef enum zio_priority {
+	ZIO_PRIORITY_SYNC_READ,
+	ZIO_PRIORITY_SYNC_WRITE,	/* ZIL */
+	ZIO_PRIORITY_ASYNC_READ,	/* prefetch */
+	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
+	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */
+	ZIO_PRIORITY_TRIM,		/* free requests used for TRIM */
+	ZIO_PRIORITY_NUM_QUEUEABLE,
+
+	ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */
+} zio_priority_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _ZIO_PRIORITY_H */
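
Each queueable priority above selects a separate per-vdev I/O queue; ZIO_PRIORITY_NOW marks I/Os that bypass queueing altogether. A hedged sketch of issue-time priority selection (the helper is hypothetical; the real policy sits with the ARC/DMU callers):

	static zio_priority_t
	read_priority_sketch(boolean_t prefetch)
	{
		/* prefetch reads may wait; demand reads are latency-sensitive */
		return (prefetch ? ZIO_PRIORITY_ASYNC_READ :
		    ZIO_PRIORITY_SYNC_READ);
	}
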
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zrlock.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zrlock.h
diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zrlock.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zrlock.h	22 Nov 2015 17:22:31 -0000
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef	_SYS_ZRLOCK_H
+#define	_SYS_ZRLOCK_H
+
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct zrlock {
+	kmutex_t zr_mtx;
+	volatile int32_t zr_refcount;
+	kcondvar_t zr_cv;
+	uint16_t zr_pad;
+#ifdef	ZFS_DEBUG
+	kthread_t *zr_owner;
+	const char *zr_caller;
+#endif
+} zrlock_t;
+
+extern void zrl_init(zrlock_t *);
+extern void zrl_destroy(zrlock_t *);
+#define	zrl_add(_z)	zrl_add_impl((_z), __func__)
+extern void zrl_add_impl(zrlock_t *, const char *);
+extern void zrl_remove(zrlock_t *);
+extern int zrl_tryenter(zrlock_t *);
+extern void zrl_exit(zrlock_t *);
+extern int zrl_is_zero(zrlock_t *);
+extern int zrl_is_locked(zrlock_t *);
+#ifdef	ZFS_DEBUG
+extern kthread_t *zrl_owner(zrlock_t *);
+#endif
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_ZRLOCK_H */
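
zrlock is a reference-counted lock tuned for many cheap readers and a rare exclusive writer (it protects dnode handles): zrl_add()/zrl_remove() bump and drop the refcount, and zrl_tryenter() succeeds only while the refcount is zero. A minimal usage sketch under those assumptions:

	zrlock_t zrl;

	zrl_init(&zrl);

	/* reader side: cheap acquire/release around the protected object */
	zrl_add(&zrl);
	/* ... use the object ... */
	zrl_remove(&zrl);

	/* writer side: exclusive access only when no reader holds the lock */
	if (zrl_tryenter(&zrl)) {
		/* ... move or retire the object ... */
		zrl_exit(&zrl);
	}

	zrl_destroy(&zrl);
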
Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zvol.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zvol.h,v
retrieving revision 1.3
diff -u -p -r1.3 zvol.h
--- src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zvol.h	27 Feb 2010 23:43:53 -0000	1.3
+++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zvol.h	19 Apr 2017 15:49:30 -0000
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_ZVOL_H
@@ -44,23 +43,45 @@ extern void zvol_create_cb(objset_t *os,
 extern int zvol_create_minor(const char *);
 extern int zvol_remove_minor(const char *);
 extern void zvol_remove_minors(const char *);
-extern int zvol_set_volsize(const char *, major_t, uint64_t);
+extern int zvol_set_volsize(const char *, uint64_t);
 
+#ifndef __FreeBSD__
 extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
 extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks);
 extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
+#ifdef __NetBSD__
 extern void zvol_strategy(buf_t *bp);
+#else
+extern int zvol_strategy(buf_t *bp);
+#endif
 extern int zvol_read(dev_t dev, uio_t *uiop, cred_t *cr);
 extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
 extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
 extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
+#endif	/* !__FreeBSD__ */
 extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
     int *rvalp);
 extern int zvol_busy(void);
 extern void zvol_init(void);
 extern void zvol_fini(void);
+
+#ifdef illumos
+extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize,
+    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
+    void **rl_hdl, void **bonus_hdl);
+extern uint64_t zvol_get_volume_size(void *minor_hdl);
+extern int zvol_get_volume_wce(void *minor_hdl);
+extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off,
+    ssize_t resid, boolean_t sync);
+#endif	/* illumos */
+
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+extern int zvol_create_minors(const char *name);
+extern void zvol_rename_minors(const char *oldname, const char *newname);
 #endif
 
+#endif	/* _KERNEL */
+
 #ifdef	__cplusplus
 }
 #endif
Index: src/external/cddl/osnet/dist/uts/common/os/nvpair_alloc_system.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/os/nvpair_alloc_system.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 nvpair_alloc_system.c
--- src/external/cddl/osnet/dist/uts/common/os/nvpair_alloc_system.c	7 Aug 2009 18:33:45 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/os/nvpair_alloc_system.c	12 Jun 2012 05:57:26 -0000
@@ -26,7 +26,6 @@
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
-#include <rpc/types.h>
 #include <sys/nvpair.h>
 
 static void *
Index: src/external/cddl/osnet/dist/uts/common/sys/acl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/acl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 acl.h
--- src/external/cddl/osnet/dist/uts/common/sys/acl.h	27 Feb 2010 22:31:48 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/acl.h	25 Apr 2017 13:46:24 -0000
@@ -19,6 +19,8 @@
  * CDDL HEADER END
  */
 /*
+ * Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ *
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
@@ -34,7 +36,7 @@ extern "C" {
 #endif
 
 #define	MAX_ACL_ENTRIES		(1024)	/* max entries of each type */
-typedef struct acl {
+typedef struct {
 	int		a_type;		/* the type of ACL entry */
 	uid_t		a_id;		/* the entry in -uid or gid */
 	o_mode_t	a_perm;		/* the permission field */
@@ -47,7 +49,9 @@ typedef struct ace {
 	uint16_t	a_type;		/* allow or deny */
 } ace_t;
 
+#ifndef _KERNEL
 typedef struct acl_info acl_t;
+#endif
 
 /*
  * The following are Defined types for an aclent_t.
@@ -175,11 +179,12 @@ typedef struct ace_object {
     ACE_DIRECTORY_INHERIT_ACE | \
     ACE_NO_PROPAGATE_INHERIT_ACE | \
     ACE_INHERIT_ONLY_ACE | \
+    ACE_INHERITED_ACE | \
     ACE_IDENTIFIER_GROUP)
 
 #define	ACE_TYPE_FLAGS		(ACE_OWNER|ACE_GROUP|ACE_EVERYONE| \
     ACE_IDENTIFIER_GROUP)
-#define	ACE_INHERIT_FLAGS	(ACE_FILE_INHERIT_ACE| \
+#define	ACE_INHERIT_FLAGS	(ACE_FILE_INHERIT_ACE| ACL_INHERITED_ACE| \
     ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
 
 /* cmd args to acl(2) for aclent_t  */
@@ -287,13 +292,8 @@ extern int cmp2acls(void *, void *);
 
 #endif	/* !defined(_KERNEL) */
 
-#if defined(__STDC__)
 extern int acl(const char *path, int cmd, int cnt, void *buf);
 extern int facl(int fd, int cmd, int cnt, void *buf);
-#else	/* !__STDC__ */
-extern int acl();
-extern int facl();
-#endif	/* defined(__STDC__) */
 
 #ifdef	__cplusplus
 }
Index: src/external/cddl/osnet/dist/uts/common/sys/acl_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/acl_impl.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 acl_impl.h
--- src/external/cddl/osnet/dist/uts/common/sys/acl_impl.h	7 Aug 2009 18:33:46 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/sys/acl_impl.h	12 Jun 2012 05:57:29 -0000
@@ -44,10 +44,10 @@ extern "C" {
 typedef enum acl_type {
 	ACLENT_T = 0,
 	ACE_T = 1
-} acl_type_t;
+} zfs_acl_type_t;
 
 struct acl_info {
-	acl_type_t acl_type;		/* style of acl */
+	zfs_acl_type_t acl_type;	/* style of acl */
 	int acl_cnt;			/* number of acl entries */
 	int acl_entry_size;		/* sizeof acl entry */
 	int acl_flags;			/* special flags about acl */
Index: src/external/cddl/osnet/dist/uts/common/sys/avl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/avl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 avl.h
--- src/external/cddl/osnet/dist/uts/common/sys/avl.h	27 Feb 2010 22:31:48 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/avl.h	31 Jul 2014 04:37:18 -0000
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
 #ifndef	_AVL_H
 #define	_AVL_H
 
@@ -39,7 +43,7 @@ extern "C" {
 #include <sys/avl_impl.h>
 
 /*
- * This is a generic implemenatation of AVL trees for use in the Solaris kernel.
+ * This is a generic implementation of AVL trees for use in the Solaris kernel.
  * The interfaces provide an efficient way of implementing an ordered set of
  * data structures.
  *
@@ -175,7 +179,7 @@ extern void avl_insert(avl_tree_t *tree,
  * Insert "new_data" in "tree" in the given "direction" either after
  * or before the data "here".
  *
- * This might be usefull for avl clients caching recently accessed
+ * This might be useful for avl clients caching recently accessed
  * data to avoid doing avl_find() again for insertion.
  *
  * new_data	- new data to insert
@@ -260,6 +264,11 @@ extern boolean_t avl_update_lt(avl_tree_
 extern boolean_t avl_update_gt(avl_tree_t *, void *);
 
 /*
+ * Swaps the contents of the two trees.
+ */
+extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2);
+
+/*
  * Return the number of nodes in the tree
  */
 extern ulong_t avl_numnodes(avl_tree_t *tree);
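
avl_swap(), added above, exchanges the entire contents of two trees in constant time by swapping their internal state; both trees must have been created with the same comparator and node layout. A sketch, where my_node_t and my_compar are hypothetical:

	typedef struct my_node {
		int		mn_key;
		avl_node_t	mn_avl;		/* embedded linkage */
	} my_node_t;

	avl_tree_t a, b;

	avl_create(&a, my_compar, sizeof (my_node_t),
	    offsetof(my_node_t, mn_avl));
	avl_create(&b, my_compar, sizeof (my_node_t),
	    offsetof(my_node_t, mn_avl));
	/* ... insert nodes into both trees ... */
	avl_swap(&a, &b);	/* a now holds b's former contents, and vice versa */
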
Index: src/external/cddl/osnet/dist/uts/common/sys/bitmap.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/bitmap.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 bitmap.h
--- src/external/cddl/osnet/dist/uts/common/sys/bitmap.h	7 Aug 2009 18:33:47 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/sys/bitmap.h	1 Nov 2014 10:04:54 -0000
@@ -24,6 +24,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved  	*/
 
@@ -31,8 +35,6 @@
 #ifndef _SYS_BITMAP_H
 #define	_SYS_BITMAP_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -152,6 +154,7 @@ extern int	bt_range(ulong_t *bitmap, siz
  * Low order bit is 0, high order bit is 31.
  */
 extern int	highbit(ulong_t);
+extern int	highbit64(uint64_t);
 extern int	lowbit(ulong_t);
 extern int	bt_getlowbit(ulong_t *bitmap, size_t start, size_t stop);
 extern void	bt_copy(ulong_t *, ulong_t *, ulong_t);
@@ -168,9 +171,9 @@ extern int	odd_parity(ulong_t);
  * to 0 otherwise.
  */
 #define	BT_ATOMIC_SET(bitmap, bitindex) \
-	{ atomic_or_long(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); }
+	{ atomic_or_ulong(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); }
 #define	BT_ATOMIC_CLEAR(bitmap, bitindex) \
-	{ atomic_and_long(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); }
+	{ atomic_and_ulong(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); }
 
 #define	BT_ATOMIC_SET_EXCL(bitmap, bitindex, result) \
 	{ result = atomic_set_long_excl(&(BT_WIM(bitmap, bitindex)),	\
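
highbit64() extends highbit() to 64-bit inputs with the same convention: it returns the 1-based position of the highest set bit, or 0 when the input is 0. A reference sketch of those semantics (not the optimized kernel implementation):

	static int
	highbit64_ref(uint64_t v)
	{
		int h = 0;

		while (v != 0) {
			h++;
			v >>= 1;
		}
		/* highbit64_ref(0x10) == 5, highbit64_ref(0) == 0 */
		return (h);
	}
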
Index: src/external/cddl/osnet/dist/uts/common/sys/callb.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/callb.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 callb.h
--- src/external/cddl/osnet/dist/uts/common/sys/callb.h	27 Feb 2010 22:31:49 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/callb.h	12 Jun 2012 05:57:29 -0000
@@ -26,8 +26,7 @@
 #ifndef	_SYS_CALLB_H
 #define	_SYS_CALLB_H
 
-#include <sys/t_lock.h>
-#include <sys/thread.h>
+#include <sys/kcondvar.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -132,10 +131,14 @@ typedef struct callb_cpr {
  * later on.  No lock held is needed for this initialization.
  */
 #define	CALLB_CPR_INIT(cp, lockp, func, name)	{			\
+		strlcpy(curthread->td_name, (name),			\
+		    sizeof(curthread->td_name));			\
 		bzero((caddr_t)(cp), sizeof (callb_cpr_t));		\
 		(cp)->cc_lockp = lockp;					\
 		(cp)->cc_id = callb_add(func, (void *)(cp),		\
 			CB_CL_CPR_DAEMON, name);			\
+		cv_init(&(cp)->cc_callb_cv, NULL, CV_DEFAULT, NULL);	\
+		cv_init(&(cp)->cc_stop_cv, NULL, CV_DEFAULT, NULL);	\
 	}
 
 #ifndef __lock_lint
@@ -192,7 +195,6 @@ typedef struct callb_cpr {
 	}
 
 extern callb_cpr_t callb_cprinfo_safe;
-extern void	callb_init(void);
 extern callb_id_t callb_add(boolean_t  (*)(void *, int), void *, int, char *);
 extern callb_id_t callb_add_thread(boolean_t (*)(void *, int),
     void *, int, char *, kthread_id_t);
Index: src/external/cddl/osnet/dist/uts/common/sys/ccompile.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/ccompile.h,v
retrieving revision 1.2
diff -u -p -r1.2 ccompile.h
--- src/external/cddl/osnet/dist/uts/common/sys/ccompile.h	13 Mar 2010 22:31:15 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/ccompile.h	20 Apr 2017 14:16:09 -0000
@@ -27,8 +27,6 @@
 #ifndef	_SYS_CCOMPILE_H
 #define	_SYS_CCOMPILE_H
 
-/* #pragma ident	"%Z%%M%	%I%	%E% SMI" */
-
 /*
  * This file contains definitions designed to enable different compilers
  * to be used harmoniously on Solaris systems.
Index: src/external/cddl/osnet/dist/uts/common/sys/cmn_err.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/cmn_err.h,v
retrieving revision 1.2
diff -u -p -r1.2 cmn_err.h
--- src/external/cddl/osnet/dist/uts/common/sys/cmn_err.h	27 Mar 2014 15:50:48 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/cmn_err.h	12 Jun 2012 05:57:29 -0000
@@ -103,6 +103,11 @@ extern size_t snprintf(char *, size_t, c
     __KPRINTFLIKE(3);
 extern size_t vsnprintf(char *, size_t, const char *, __va_list)
     __KVPRINTFLIKE(3);
+/*PRINTFLIKE2*/
+extern char *sprintf(char *, const char *, ...)
+    __KPRINTFLIKE(2);
+extern char *vsprintf(char *, const char *, __va_list)
+    __KVPRINTFLIKE(2);
 
 /*PRINTFLIKE1*/
 extern void panic(const char *, ...)
Index: src/external/cddl/osnet/dist/uts/common/sys/cpupart.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/cpupart.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 cpupart.h
--- src/external/cddl/osnet/dist/uts/common/sys/cpupart.h	27 Feb 2010 22:31:52 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/cpupart.h	12 Jun 2012 05:57:29 -0000
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_CPUPART_H
@@ -125,6 +124,15 @@ extern cpupart_t	*cp_list_head;
 extern uint_t		cp_numparts;
 extern uint_t		cp_numparts_nonempty;
 
+/*
+ * Each partition contains a bitset that indicates which CPUs are halted and
+ * which ones are running. Given the growing number of CPUs in current and
+ * future platforms, it's important to fan out each CPU within its partition's
+ * haltset to prevent contention due to false sharing. The fanout factor
+ * is platform specific, and declared accordingly.
+ */
+extern uint_t cp_haltset_fanout;
+
 extern void	cpupart_initialize_default();
 extern cpupart_t *cpupart_find(psetid_t);
 extern int	cpupart_create(psetid_t *);
Index: src/external/cddl/osnet/dist/uts/common/sys/cpuvar.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/cpuvar.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 cpuvar.h
--- src/external/cddl/osnet/dist/uts/common/sys/cpuvar.h	27 Feb 2010 22:31:53 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/cpuvar.h	11 Dec 2014 11:55:11 -0000
@@ -20,8 +20,8 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
  */
 
 #ifndef _SYS_CPUVAR_H
@@ -32,6 +32,7 @@
 #include <sys/disp.h>
 #include <sys/processor.h>
 
+#include <sys/loadavg.h>
 #if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP)
 #include <sys/machcpuvar.h>
 #endif
@@ -53,15 +54,6 @@ extern "C" {
 struct squeue_set_s;
 
 #define	CPU_CACHE_COHERENCE_SIZE	64
-#define	S_LOADAVG_SZ	11
-#define	S_MOVAVG_SZ	10
-
-struct loadavg_s {
-	int lg_cur;		/* current loadavg entry */
-	unsigned int lg_len;	/* number entries recorded */
-	hrtime_t lg_total;	/* used to temporarily hold load totals */
-	hrtime_t lg_loads[S_LOADAVG_SZ];	/* table of recorded entries */
-};
 
 /*
  * For fast event tracing.
@@ -525,8 +517,8 @@ typedef	ulong_t	cpuset_t;	/* a set of CP
 	largest = (uint_t)(highbit(set) - 1);		\
 }
 
-#define	CPUSET_ATOMIC_DEL(set, cpu)	atomic_and_long(&(set), ~CPUSET(cpu))
-#define	CPUSET_ATOMIC_ADD(set, cpu)	atomic_or_long(&(set), CPUSET(cpu))
+#define	CPUSET_ATOMIC_DEL(set, cpu)	atomic_and_ulong(&(set), ~CPUSET(cpu))
+#define	CPUSET_ATOMIC_ADD(set, cpu)	atomic_or_ulong(&(set), CPUSET(cpu))
 
 #define	CPUSET_ATOMIC_XADD(set, cpu, result) \
 	{ result = atomic_set_long_excl(&(set), (cpu)); }
@@ -564,6 +556,7 @@ extern int		boot_ncpus;	/* # cpus presen
 extern processorid_t	max_cpuid;	/* maximum CPU number */
 extern struct cpu	*cpu_inmotion;	/* offline or partition move target */
 extern cpu_t		*clock_cpu_list;
+extern processorid_t	max_cpu_seqid_ever;	/* maximum seqid ever given */
 
 #if defined(__i386) || defined(__amd64)
 extern struct cpu *curcpup(void);
@@ -655,7 +648,7 @@ void	poke_cpu(int cpun);	 /* interrupt a
 
 void	mach_cpu_pause(volatile char *);
 
-void	pause_cpus(cpu_t *off_cp);
+void	pause_cpus(cpu_t *off_cp, void *(*func)(void *));
 void	start_cpus(void);
 int	cpus_paused(void);
 
Index: src/external/cddl/osnet/dist/uts/common/sys/ctf.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/ctf.h,v
retrieving revision 1.2
diff -u -p -r1.2 ctf.h
--- src/external/cddl/osnet/dist/uts/common/sys/ctf.h	21 Feb 2010 00:49:56 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/ctf.h	4 Feb 2015 07:24:17 -0000
@@ -27,7 +27,7 @@
 #ifndef	_CTF_H
 #define	_CTF_H
 
-#if defined(sun)
+#ifdef illumos
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 #endif
 
Index: src/external/cddl/osnet/dist/uts/common/sys/debug.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/debug.h,v
retrieving revision 1.3
diff -u -p -r1.3 debug.h
--- src/external/cddl/osnet/dist/uts/common/sys/debug.h	23 Jan 2016 15:45:42 -0000	1.3
+++ src/external/cddl/osnet/dist/uts/common/sys/debug.h	20 Apr 2017 17:00:43 -0000
@@ -19,10 +19,17 @@
  * CDDL HEADER END
  */
 /*
+ * Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ *
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved	*/
 
@@ -43,25 +50,13 @@ extern "C" {
  * ASSERT and is evaluated on both debug and non-debug kernels.
  */
 
-#if defined(__STDC__)
 extern int assfail(const char *, const char *, int);
 #define	VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
-#ifndef ASSERT
-#if DEBUG
+#ifdef DEBUG
 #define	ASSERT(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
 #else
 #define	ASSERT(x)  ((void)0)
 #endif
-#endif
-#else	/* defined(__STDC__) */
-extern int assfail();
-#define	VERIFY(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__)))
-#if DEBUG
-#define	ASSERT(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__)))
-#else
-#define	ASSERT(x)  ((void)0)
-#endif
-#endif	/* defined(__STDC__) */
 
 /*
  * Assertion variants sensitive to the compilation data model
@@ -81,7 +76,7 @@ extern int assfail();
  * and
  *	if (a) then (b) *AND* if (b) then (a)
  */
-#if DEBUG
+#ifdef DEBUG
 #define	IMPLY(A, B) \
 	((void)(((!(A)) || (B)) || \
 	    assfail("(" #A ") implies (" #B ")", __FILE__, __LINE__)))
@@ -115,14 +110,32 @@ _NOTE(CONSTCOND) } while (0)
 #define	VERIFY3S(x, y, z)	VERIFY3_IMPL(x, y, z, int64_t)
 #define	VERIFY3U(x, y, z)	VERIFY3_IMPL(x, y, z, uint64_t)
 #define	VERIFY3P(x, y, z)	VERIFY3_IMPL(x, y, z, uintptr_t)
-#if DEBUG
+#define	VERIFY0(x)		VERIFY3_IMPL(x, ==, 0, uintmax_t)
+
+#ifdef DEBUG
 #define	ASSERT3S(x, y, z)	VERIFY3_IMPL(x, y, z, int64_t)
 #define	ASSERT3U(x, y, z)	VERIFY3_IMPL(x, y, z, uint64_t)
 #define	ASSERT3P(x, y, z)	VERIFY3_IMPL(x, y, z, uintptr_t)
+#define	ASSERT0(x)		VERIFY3_IMPL(x, ==, 0, uintmax_t)
 #else
 #define	ASSERT3S(x, y, z)	((void)0)
 #define	ASSERT3U(x, y, z)	((void)0)
 #define	ASSERT3P(x, y, z)	((void)0)
+#define	ASSERT0(x)		((void)0)
+#endif
+
+/*
+ * Compile-time assertion. The condition 'x' must be constant.
+ */
+#ifndef CTASSERT
+#define	CTASSERT(x)		_CTASSERT(x, __LINE__)
+#endif
+#ifndef _CTASSERT
+#define	_CTASSERT(x, y)		__CTASSERT(x, y)
+#endif
+#ifndef __CTASSERT
+#define	__CTASSERT(x, y) \
+	typedef char __compile_time_assertion__ ## y [(x) ? 1 : -1]
 #endif
 
 #ifdef	_KERNEL
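
CTASSERT() turns a violated constant condition into a compile-time error: it expands to a typedef of a char array whose size becomes -1 when the condition is false. A small usage sketch (the 128-byte size of blkptr_t is the classic on-disk invariant checked this way):

	CTASSERT(sizeof (blkptr_t) == 128);

	/*
	 * For a use on, say, line 42 this expands to:
	 *
	 *	typedef char __compile_time_assertion__42[
	 *	    (sizeof (blkptr_t) == 128) ? 1 : -1];
	 *
	 * which compiles only when the condition holds.
	 */
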
Index: src/external/cddl/osnet/dist/uts/common/sys/dtrace.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/dtrace.h,v
retrieving revision 1.15
diff -u -p -r1.15 dtrace.h
--- src/external/cddl/osnet/dist/uts/common/sys/dtrace.h	27 Feb 2017 06:47:00 -0000	1.15
+++ src/external/cddl/osnet/dist/uts/common/sys/dtrace.h	9 May 2017 15:20:41 -0000
@@ -32,8 +32,6 @@
 #ifndef _SYS_DTRACE_H
 #define	_SYS_DTRACE_H
 
-/* #pragma ident	"%Z%%M%	%I%	%E% SMI" */
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -57,12 +55,11 @@ extern "C" {
 #ifdef illumos
 #include <sys/systm.h>
 #else
-#include <sys/proc.h>
+#include <sys/cpuvar.h>
 #include <sys/param.h>
 #include <sys/linker.h>
 #include <sys/ioccom.h>
 #include <sys/ucred.h>
-#include <sys/pset.h>
 typedef int model_t;
 #endif
 #include <sys/ctf_api.h>
@@ -88,7 +85,7 @@ typedef int model_t;
 
 #define	DTRACE_PROVNAMELEN	64
 #define	DTRACE_MODNAMELEN	64
-#define	DTRACE_FUNCNAMELEN	128
+#define	DTRACE_FUNCNAMELEN	192
 #define	DTRACE_NAMELEN		64
 #define	DTRACE_FULLNAMELEN	(DTRACE_PROVNAMELEN + DTRACE_MODNAMELEN + \
 				DTRACE_FUNCNAMELEN + DTRACE_NAMELEN + 4)
@@ -309,15 +306,14 @@ typedef enum dtrace_probespec {
 #define	DIF_SUBR_TOUPPER		44
 #define	DIF_SUBR_TOLOWER		45
 #define	DIF_SUBR_MEMREF			46
-#define	DIF_SUBR_TYPEREF		47
-#define	DIF_SUBR_SX_SHARED_HELD		48
-#define	DIF_SUBR_SX_EXCLUSIVE_HELD	49
-#define	DIF_SUBR_SX_ISEXCLUSIVE		50
-#define	DIF_SUBR_MEMSTR			51
-#define	DIF_SUBR_GETF			52
-#define	DIF_SUBR_JSON			53
-#define	DIF_SUBR_STRTOLL		54
-#define	DIF_SUBR_MAX			54	/* max subroutine value */
+#define	DIF_SUBR_SX_SHARED_HELD		47
+#define	DIF_SUBR_SX_EXCLUSIVE_HELD	48
+#define	DIF_SUBR_SX_ISEXCLUSIVE		49
+#define	DIF_SUBR_MEMSTR			50
+#define	DIF_SUBR_GETF			51
+#define	DIF_SUBR_JSON			52
+#define	DIF_SUBR_STRTOLL		53
+#define	DIF_SUBR_MAX			53	/* max subroutine value */
 
 typedef uint32_t dif_instr_t;
 
@@ -430,7 +426,6 @@ typedef struct dtrace_difv {
 #define	DTRACEACT_TRACEMEM		6	/* tracemem() action */
 #define	DTRACEACT_TRACEMEM_DYNSIZE	7	/* dynamic tracemem() size */
 #define	DTRACEACT_PRINTM		8	/* printm() action (BSD) */
-#define	DTRACEACT_PRINTT		9	/* printt() action (BSD) */
 
 #define	DTRACEACT_PROC			0x0100
 #define	DTRACEACT_USTACK		(DTRACEACT_PROC + 1)
@@ -740,8 +735,8 @@ typedef struct dof_sec {
 	((x) == DOF_SECT_PRARGS) || ((x) == DOF_SECT_PROFFS) ||		\
 	((x) == DOF_SECT_INTTAB) || ((x) == DOF_SECT_XLTAB) ||		\
 	((x) == DOF_SECT_XLMEMBERS) || ((x) == DOF_SECT_XLIMPORT) ||	\
-	((x) == DOF_SECT_XLIMPORT) || ((x) == DOF_SECT_XLEXPORT) ||	\
-	((x) == DOF_SECT_PREXPORT) || ((x) == DOF_SECT_PRENOFFS))
+	((x) == DOF_SECT_XLEXPORT) ||  ((x) == DOF_SECT_PREXPORT) || 	\
+	((x) == DOF_SECT_PRENOFFS))
 
 typedef struct dof_ecbdesc {
 	dof_secidx_t dofe_probes;	/* link to DOF_SECT_PROBEDESC */
@@ -789,6 +784,7 @@ typedef struct dof_relodesc {
 
 #define	DOF_RELO_NONE	0		/* empty relocation entry */
 #define	DOF_RELO_SETX	1		/* relocate setx value */
+#define	DOF_RELO_DOFREL	2		/* relocate DOF-relative value */
 
 typedef struct dof_optdesc {
 	uint32_t dofo_option;		/* option identifier */
@@ -1415,7 +1411,6 @@ typedef struct {
 #define	DTRACEHIOC_REMOVE	(DTRACEHIOC | 2)	/* remove helper */
 #define	DTRACEHIOC_ADDDOF	(DTRACEHIOC | 3)	/* add helper DOF */
 #else
-#define	DTRACEHIOC_ADD		_IOWR('z', 1, dof_hdr_t)/* add helper */
 #define	DTRACEHIOC_REMOVE	_IOW('z', 2, int)	/* remove helper */
 #define	DTRACEHIOC_ADDDOF	_IOWR('z', 3, dof_helper_t)/* add helper DOF */
 #endif
@@ -1506,7 +1501,7 @@ typedef struct dof_helper {
  *   DTrace routines, including dtrace_probe_create(), dtrace_probe_lookup(),
  *   and dtrace_probe_arg().
  *
- * 1.3  void dtps_provide_module(void *arg, dtrace_modctl_t *mp)
+ * 1.3  void dtps_provide_module(void *arg, modctl_t *mp)
  *
  * 1.3.1  Overview
  *
@@ -2125,12 +2120,9 @@ typedef struct dof_helper {
  *   instrument the kernel arbitrarily should be sure to not instrument these
  *   routines.
  */
-
-typedef dtrace_modctl_t *mymodctl_p;
-
 typedef struct dtrace_pops {
-	void (*dtps_provide)(void *arg, const dtrace_probedesc_t *spec);
-	void (*dtps_provide_module)(void *arg, dtrace_modctl_t *mp);
+	void (*dtps_provide)(void *arg, dtrace_probedesc_t *spec);
+	void (*dtps_provide_module)(void *arg, modctl_t *mp);
 	int (*dtps_enable)(void *arg, dtrace_id_t id, void *parg);
 	void (*dtps_disable)(void *arg, dtrace_id_t id, void *parg);
 	void (*dtps_suspend)(void *arg, dtrace_id_t id, void *parg);
@@ -2156,8 +2148,8 @@ extern int dtrace_register(const char *,
 extern int dtrace_unregister(dtrace_provider_id_t);
 extern int dtrace_condense(dtrace_provider_id_t);
 extern void dtrace_invalidate(dtrace_provider_id_t);
-extern dtrace_id_t dtrace_probe_lookup(dtrace_provider_id_t, const char *,
-    const char *, const char *);
+extern dtrace_id_t dtrace_probe_lookup(dtrace_provider_id_t, char *,
+    char *, char *);
 extern dtrace_id_t dtrace_probe_create(dtrace_provider_id_t, const char *,
     const char *, const char *, int, void *);
 extern void *dtrace_probe_arg(dtrace_provider_id_t, dtrace_id_t);
@@ -2209,12 +2201,18 @@ extern void dtrace_probe(dtrace_id_t, ui
  *
  * 1.2.4  Caller's context
  *
- *   dtms_create_probe() is called from either ioctl() or module load context.
- *   The DTrace framework is locked in such a way that meta providers may not
- *   register or unregister. This means that the meta provider cannot call
- *   dtrace_meta_register() or dtrace_meta_unregister(). However, the context is
- *   such that the provider may (and is expected to) call provider-related
- *   DTrace provider APIs including dtrace_probe_create().
+ *   dtms_create_probe() is called from either ioctl() or module load context,
+ *   on behalf of a newly-created provider (that is, a provider that is
+ *   the result of a call to dtms_provide_pid()). The DTrace framework is
+ *   locked in such a way that meta providers may not register or unregister,
+ *   so that no other thread can call into a meta provider operation and
+ *   atomicity is assured with respect to meta provider operations across
+ *   dtms_provide_pid() and subsequent calls to dtms_create_probe().
+ *   The context is thus effectively single-threaded with respect to the meta
+ *   provider, and the meta provider cannot call dtrace_meta_register()
+ *   or dtrace_meta_unregister(). However, the context is such that the
+ *   provider may (and is expected to) call provider-related DTrace provider
+ *   APIs including dtrace_probe_create().
  *
  * 1.3  void *dtms_provide_pid(void *arg, dtrace_meta_provider_t *mprov,
  *	      pid_t pid)
@@ -2362,8 +2360,8 @@ extern void dtrace_membar_consumer(void)
 
 extern void (*dtrace_cpu_init)(processorid_t);
 #ifdef illumos
-extern void (*dtrace_modload)(dtrace_modctl_t *);
-extern void (*dtrace_modunload)(dtrace_modctl_t *);
+extern void (*dtrace_modload)(modctl_t *);
+extern void (*dtrace_modunload)(modctl_t *);
 #endif
 extern void (*dtrace_helpers_cleanup)(void);
 extern void (*dtrace_helpers_fork)(proc_t *parent, proc_t *child);
@@ -2452,17 +2450,14 @@ extern void dtrace_helpers_destroy(proc_
 #define DTRACE_INVOP_POPM	2
 #define DTRACE_INVOP_B		3
 
-#define	DTRACE_INVOP_MOV_IP_SP		1
-#define	DTRACE_INVOP_BX_LR		2
-#define	DTRACE_INVOP_MOV_PC_LR		3
-#define	DTRACE_INVOP_LDM		4
-#define	DTRACE_INVOP_LDMIB		5
-#define	DTRACE_INVOP_LDR_IMM		6
-#define	DTRACE_INVOP_MOVW		7
-#define	DTRACE_INVOP_MOV_IMM		8
-#define	DTRACE_INVOP_CMP_IMM		9
-#define	DTRACE_INVOP_B_LABEL		10
-#define	DTRACE_INVOP_PUSH		11
+#define	DTRACE_INVOP_MOV_IP_SP	4
+#define	DTRACE_INVOP_BX_LR	5
+#define	DTRACE_INVOP_MOV_PC_LR	6
+#define	DTRACE_INVOP_LDM	7
+#define	DTRACE_INVOP_LDR_IMM	8
+#define	DTRACE_INVOP_MOVW	9
+#define	DTRACE_INVOP_MOV_IMM	10
+#define	DTRACE_INVOP_CMP_IMM	11
 
 #elif defined(__aarch64__)
 
@@ -2493,6 +2488,28 @@ extern void dtrace_helpers_destroy(proc_
 #define	DTRACE_INVOP_RET	2
 #define	DTRACE_INVOP_B		3
 
+#elif defined(__mips__)
+
+#define	INSN_SIZE		4
+
+/* Load/Store double RA to/from SP */
+#define	LDSD_RA_SP_MASK		0xffff0000
+#define	LDSD_DATA_MASK		0x0000ffff
+#define	SD_RA_SP		0xffbf0000
+#define	LD_RA_SP		0xdfbf0000
+
+#define	DTRACE_INVOP_SD		1
+#define	DTRACE_INVOP_LD		2
+
+#elif defined(__riscv__)
+
+#define	SD_RA_SP_MASK		0x01fff07f
+#define	SD_RA_SP		0x00113023
+
+#define	DTRACE_INVOP_SD		1
+#define	DTRACE_INVOP_RET	2
+#define	DTRACE_INVOP_NOP	3
+
 #endif
 
 #ifdef	__cplusplus
Index: src/external/cddl/osnet/dist/uts/common/sys/dtrace_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/dtrace_impl.h,v
retrieving revision 1.6
diff -u -p -r1.6 dtrace_impl.h
--- src/external/cddl/osnet/dist/uts/common/sys/dtrace_impl.h	24 Sep 2015 14:21:35 -0000	1.6
+++ src/external/cddl/osnet/dist/uts/common/sys/dtrace_impl.h	16 May 2017 21:08:58 -0000
@@ -18,17 +18,17 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: head/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h 277300 2015-01-17 14:44:59Z smh $
+ * $FreeBSD: head/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h 313176 2017-02-03 22:26:19Z gnn $
  */
 
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_DTRACE_IMPL_H
@@ -50,6 +50,7 @@ extern "C" {
  */
 
 #include <sys/dtrace.h>
+
 #ifndef illumos
 #ifdef __sparcv9
 typedef uint32_t		pc_t;
@@ -65,6 +66,13 @@ typedef	u_long			greg_t;
 #define	DTRACE_MAXPROPLEN		128
 #define	DTRACE_DYNVAR_CHUNKSIZE		256
 
+#ifdef __FreeBSD__
+#define	NCPU		MAXCPU
+#endif /* __FreeBSD__ */
+#ifdef __NetBSD__
+#define	NCPU		MAXCPUS
+#endif /* __NetBSD__ */
+
 struct dtrace_probe;
 struct dtrace_ecb;
 struct dtrace_predicate;
@@ -932,6 +940,7 @@ typedef struct dtrace_mstate {
 	int dtms_ipl;				/* cached interrupt pri lev */
 	int dtms_fltoffs;			/* faulting DIFO offset */
 	uintptr_t dtms_strtok;			/* saved strtok() pointer */
+	uintptr_t dtms_strtok_limit;		/* upper bound of strtok ptr */
 	uint32_t dtms_access;			/* memory access rights */
 	dtrace_difo_t *dtms_difo;		/* current dif object */
 	file_t *dtms_getf;			/* cached rval of getf() */
@@ -1122,10 +1131,10 @@ typedef struct dtrace_cred {
  * dtrace_state structure.
  */
 struct dtrace_state {
-#if defined(illumos) || defined(__NetBSD__)
-	dev_t dts_dev;				/* device */
-#else
+#ifdef __FreeBSD__
 	struct cdev *dts_dev;			/* device */
+#else
+	dev_t dts_dev;				/* device */
 #endif
 	int dts_necbs;				/* total number of ECBs */
 	dtrace_ecb_t **dts_ecbs;		/* array of ECBs */
@@ -1140,10 +1149,10 @@ struct dtrace_state {
 	int dts_nspeculations;			/* number of speculations */
 	int dts_naggregations;			/* number of aggregations */
 	dtrace_aggregation_t **dts_aggregations; /* aggregation array */
-#if defined(illumos) || defined(__NetBSD__)
-	vmem_t *dts_aggid_arena;		/* arena for aggregation IDs */
-#else
+#ifdef __FreeBSD__
 	struct unrhdr *dts_aggid_arena;		/* arena for aggregation IDs */
+#else
+	vmem_t *dts_aggid_arena;		/* arena for aggregation IDs */
 #endif
 	uint64_t dts_errors;			/* total number of errors */
 	uint32_t dts_speculations_busy;		/* number of spec. busy */
@@ -1152,16 +1161,18 @@ struct dtrace_state {
 	uint32_t dts_dblerrors;			/* errors in ERROR probes */
 	uint32_t dts_reserve;			/* space reserved for END */
 	hrtime_t dts_laststatus;		/* time of last status */
-#if defined(illumos)
+#ifdef illumos
 	cyclic_id_t dts_cleaner;		/* cleaning cyclic */
 	cyclic_id_t dts_deadman;		/* deadman cyclic */
-#elif defined(__NetBSD__)
-	struct dtrace_state_worker *dts_cleaner;/* cleaning cyclic */
-	struct dtrace_state_worker *dts_deadman;/* deadman cyclic */
-#else
+#endif
+#ifdef __FreeBSD__
 	struct callout dts_cleaner;		/* Cleaning callout. */
 	struct callout dts_deadman;		/* Deadman callout. */
 #endif
+#ifdef __NetBSD__
+	struct dtrace_state_worker *dts_cleaner;/* cleaning cyclic */
+	struct dtrace_state_worker *dts_deadman;/* deadman cyclic */
+#endif
 	hrtime_t dts_alive;			/* time last alive */
 	char dts_speculates;			/* boolean: has speculations */
 	char dts_destructive;			/* boolean: has dest. actions */
@@ -1171,6 +1182,7 @@ struct dtrace_state {
 	dtrace_cred_t dts_cred;			/* credentials */
 	size_t dts_nretained;			/* number of retained enabs */
 	int dts_getf;				/* number of getf() calls */
+	uint64_t dts_rstate[NCPU][2];		/* per-CPU random state */
 };
 
 struct dtrace_provider {
@@ -1320,16 +1332,19 @@ extern void dtrace_copystr(uintptr_t, ui
 /*
  * DTrace Assertions
  *
- * DTrace calls ASSERT from probe context.  To assure that a failed ASSERT
- * does not induce a markedly more catastrophic failure (e.g., one from which
- * a dump cannot be gleaned), DTrace must define its own ASSERT to be one that
- * may safely be called from probe context.  This header file must thus be
- * included by any DTrace component that calls ASSERT from probe context, and
- * _only_ by those components.  (The only exception to this is kernel
- * debugging infrastructure at user-level that doesn't depend on calling
- * ASSERT.)
+ * DTrace calls ASSERT and VERIFY from probe context.  To assure that a failed
+ * ASSERT or VERIFY does not induce a markedly more catastrophic failure (e.g.,
+ * one from which a dump cannot be gleaned), DTrace must define its own ASSERT
+ * and VERIFY macros to be ones that may safely be called from probe context.
+ * This header file must thus be included by any DTrace component that calls
+ * ASSERT and/or VERIFY from probe context, and _only_ by those components.
+ * (The only exception to this is kernel debugging infrastructure at user-level
+ * that doesn't depend on calling ASSERT.)
  */
 #undef ASSERT
+#undef VERIFY
+#define	VERIFY(EX)	((void)((EX) || \
+			dtrace_assfail(#EX, __FILE__, __LINE__)))
 #ifdef DEBUG
 #define	ASSERT(EX)	((void)((EX) || \
 			dtrace_assfail(#EX, __FILE__, __LINE__)))
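
With the redefinitions above, VERIFY, like ASSERT, reports through dtrace_assfail(), which is safe to call from probe context; the difference is that VERIFY fires on all kernels while ASSERT compiles away outside DEBUG. An illustration with hypothetical invariants:

	VERIFY(buf != NULL);		/* enforced on production kernels too */
	ASSERT(size <= bufsize);	/* checked only when DEBUG is defined */
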
Index: src/external/cddl/osnet/dist/uts/common/sys/errorq.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/errorq.h,v
retrieving revision 1.2
diff -u -p -r1.2 errorq.h
--- src/external/cddl/osnet/dist/uts/common/sys/errorq.h	7 Aug 2009 20:16:46 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/errorq.h	12 Jun 2012 05:57:29 -0000
@@ -54,8 +54,8 @@ typedef void (*errorq_func_t)(void *, co
 
 #ifdef	_KERNEL
 
-/*extern errorq_t *errorq_create(const char *, errorq_func_t, void *,
-    ulong_t, size_t, uint_t, uint_t);*/
+extern errorq_t *errorq_create(const char *, errorq_func_t, void *,
+    ulong_t, size_t, uint_t, uint_t);
 
 extern errorq_t *errorq_nvcreate(const char *, errorq_func_t, void *,
     ulong_t, size_t, uint_t, uint_t);
@@ -68,7 +68,7 @@ extern void errorq_panic(void);
 extern errorq_elem_t *errorq_reserve(errorq_t *);
 extern void errorq_commit(errorq_t *, errorq_elem_t *, uint_t);
 extern void errorq_cancel(errorq_t *, errorq_elem_t *);
-/*extern nvlist_t *errorq_elem_nvl(errorq_t *, const errorq_elem_t *);*/
+extern nvlist_t *errorq_elem_nvl(errorq_t *, const errorq_elem_t *);
 extern nv_alloc_t *errorq_elem_nva(errorq_t *, const errorq_elem_t *);
 extern void *errorq_elem_dup(errorq_t *, const errorq_elem_t *,
     errorq_elem_t **);
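
Restoring errorq_create() and errorq_elem_nvl() re-enables the reserve/commit flow for nvlist-capable error queues. A hedged sketch using only the prototypes declared here; the callback signature follows errorq_func_t (truncated in the hunk header above, so its tail is assumed from illumos), and the ERRORQ_ASYNC commit flag and "class" pair name are illustrative assumptions:

	#include <sys/errorq.h>
	#include <sys/nvpair.h>

	/* Hypothetical drain callback, invoked for each queued record. */
	static void
	example_drain(void *private, const void *data, const errorq_elem_t *eqe)
	{
	}

	static void
	example_log(errorq_t *eqp)
	{
		errorq_elem_t *eep;
		nvlist_t *nvl;

		if ((eep = errorq_reserve(eqp)) == NULL)
			return;				/* queue full: drop */
		nvl = errorq_elem_nvl(eqp, eep);	/* nvcreate'd queues only */
		fnvlist_add_string(nvl, "class", "example.fault");
		errorq_commit(eqp, eep, ERRORQ_ASYNC);	/* flag value assumed */
	}
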
Index: src/external/cddl/osnet/dist/uts/common/sys/fasttrap.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/fasttrap.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 fasttrap.h
--- src/external/cddl/osnet/dist/uts/common/sys/fasttrap.h	20 Feb 2010 04:34:31 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/sys/fasttrap.h	4 Feb 2015 07:24:17 -0000
@@ -37,9 +37,14 @@
 extern "C" {
 #endif
 
+#ifdef illumos
 #define	FASTTRAPIOC		(('m' << 24) | ('r' << 16) | ('f' << 8))
 #define	FASTTRAPIOC_MAKEPROBE	(FASTTRAPIOC | 1)
 #define	FASTTRAPIOC_GETINSTR	(FASTTRAPIOC | 2)
+#else
+#define	FASTTRAPIOC_GETINSTR	_IOWR('f', 2, uint8_t)
+#define	FASTTRAPIOC_MAKEPROBE	_IO('f', 3)
+#endif
 
 typedef enum fasttrap_probe_type {
 	DTFTP_NONE = 0,
Index: src/external/cddl/osnet/dist/uts/common/sys/feature_tests.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/feature_tests.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 feature_tests.h
--- src/external/cddl/osnet/dist/uts/common/sys/feature_tests.h	7 Aug 2009 18:33:48 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/sys/feature_tests.h	1 Nov 2014 10:04:54 -0000
@@ -20,6 +20,8 @@
  */
 
 /*
+ * Copyright 2013 Garrett D'Amore <garrett@damore.org>
+ *
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
@@ -27,10 +29,7 @@
 #ifndef _SYS_FEATURE_TESTS_H
 #define	_SYS_FEATURE_TESTS_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/ccompile.h>
-#include <sys/isa_defs.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -45,15 +44,16 @@ extern "C" {
  *		199309L	    POSIX.1b-1993 compilation (Real Time)
  *		199506L	    POSIX.1c-1995 compilation (POSIX Threads)
  *		200112L	    POSIX.1-2001 compilation (Austin Group Revision)
+ *		200809L     POSIX.1-2008 compilation
  */
 #if defined(_POSIX_SOURCE) && !defined(_POSIX_C_SOURCE)
 #define	_POSIX_C_SOURCE 1
 #endif
 
 /*
- * The feature test macros __XOPEN_OR_POSIX, _STRICT_STDC, and _STDC_C99
- * are Sun implementation specific macros created in order to compress
- * common standards specified feature test macros for easier reading.
+ * The feature test macros __XOPEN_OR_POSIX, _STRICT_STDC, _STRICT_SYMBOLS,
+ * and _STDC_C99 are Sun implementation specific macros created in order to
+ * compress common standards specified feature test macros for easier reading.
  * These macros should not be used by the application developer as
  * unexpected results may occur. Instead, the user should reference
  * standards(5) for correct usage of the standards feature test macros.
@@ -79,6 +79,10 @@ extern "C" {
  *                      the C standard. A value of 199901L indicates a
  *                      compiler that complies with ISO/IEC 9899:1999, other-
  *                      wise known as the C99 standard.
+ *
+ * _STRICT_SYMBOLS	Used in cases where symbol visibility is restricted
+ *                      by the standards, and the user has not explicitly
+ *                      relaxed the strictness via __EXTENSIONS__.
  */
 
 #if defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE)
@@ -142,8 +146,18 @@ extern "C" {
  */
 
 #if __STDC_VERSION__ - 0 >= 199901L
+#ifndef _STDC_C99
 #define	_STDC_C99
 #endif
+#endif
+
+/*
+ * Use strict symbol visibility.
+ */
+#if (defined(_STRICT_STDC) || defined(__XOPEN_OR_POSIX)) && \
+	!defined(__EXTENSIONS__)
+#define	_STRICT_SYMBOLS
+#endif
 
 /*
  * Large file interfaces:
@@ -224,6 +238,8 @@ extern "C" {
  * X/Open CAE Specification, Issue 5 (XPG5)
  * Open Group Technical Standard, Issue 6 (XPG6), also referred to as
  *    IEEE Std. 1003.1-2001 and ISO/IEC 9945:2002.
+ * Open Group Technical Standard, Issue 7 (XPG7), also referred to as
+ *    IEEE Std. 1003.1-2008 and ISO/IEC 9945:2009.
  *
  * XPG4v2 is also referred to as UNIX 95 (SUS or SUSv1).
  * XPG5 is also referred to as UNIX 98 or the Single Unix Specification,
@@ -231,6 +247,7 @@ extern "C" {
  * XPG6 is the result of a merge of the X/Open and POSIX specifications
  *     and as such is also referred to as IEEE Std. 1003.1-2001 in
  *     addition to UNIX 03 and SUSv3.
+ * XPG7 is also referred to as UNIX 08 and SUSv4.
  *
  * When writing a conforming X/Open application, as per the specification
  * requirements, the appropriate feature test macros must be defined at
@@ -243,6 +260,7 @@ extern "C" {
  * _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED = 1           XPG4v2
  * _XOPEN_SOURCE = 500                                   XPG5
  * _XOPEN_SOURCE = 600  (or POSIX_C_SOURCE=200112L)      XPG6
+ * _XOPEN_SOURCE = 700  (or POSIX_C_SOURCE=200809L)      XPG7
  *
  * In order to simplify the guards within the headers, the following
  * implementation private test macros have been created. Applications
@@ -262,6 +280,7 @@ extern "C" {
  * _XPG4_2  X/Open CAE Specification, Issue 4, Version 2 (XPG4v2/UNIX 95/SUS)
  * _XPG5    X/Open CAE Specification, Issue 5 (XPG5/UNIX 98/SUSv2)
  * _XPG6    Open Group Technical Standard, Issue 6 (XPG6/UNIX 03/SUSv3)
+ * _XPG7    Open Group Technical Standard, Issue 7 (XPG7/UNIX 08/SUSv4)
  */
 
 /* X/Open Portability Guide, Issue 3 */
@@ -296,6 +315,19 @@ extern "C" {
 #define	_POSIX_C_SOURCE			200112L
 #undef	_XOPEN_SOURCE
 #define	_XOPEN_SOURCE			600
+
+/* Open Group Technical Standard, Issue 7 */
+#elif	(_XOPEN_SOURCE - 0 == 700) || (_POSIX_C_SOURCE - 0 == 200809L)
+#define	_XPG7
+#define	_XPG6
+#define	_XPG5
+#define	_XPG4_2
+#define	_XPG4
+#define	_XPG3
+#undef	_POSIX_C_SOURCE
+#define	_POSIX_C_SOURCE			200809L
+#undef	_XOPEN_SOURCE
+#define	_XOPEN_SOURCE			700
 #endif
 
 /*
@@ -306,12 +338,15 @@ extern "C" {
  * with the value of 4 indicates an XPG4 or XPG4v2 (UNIX 95) application.
  * _XOPEN_VERSION  defined with a value of 500 indicates an XPG5 (UNIX 98)
  * application and with a value of 600 indicates an XPG6 (UNIX 03)
- * application.  The appropriate version is determined by the use of the
+ * application and with a value of 700 indicates an XPG7 (UNIX 08)
+ * application.  The appropriate version is determined by the use of the
  * feature test macros described earlier.  The value of _XOPEN_VERSION
  * defaults to 3 otherwise indicating support for XPG3 applications.
  */
 #ifndef _XOPEN_VERSION
-#ifdef	_XPG6
+#if	defined(_XPG7)
+#define	_XOPEN_VERSION 700
+#elif	defined(_XPG6)
 #define	_XOPEN_VERSION 600
 #elif defined(_XPG5)
 #define	_XOPEN_VERSION 500
@@ -365,7 +400,7 @@ extern "C" {
  * compiler is used. This allows for the use of single prototype
  * declarations regardless of compiler version.
  */
-#if (defined(__STDC__) && defined(_STDC_C99))
+#if (defined(__STDC__) && defined(_STDC_C99)) && !defined(__cplusplus)
 #define	_RESTRICT_KYWD	restrict
 #else
 #define	_RESTRICT_KYWD
Index: src/external/cddl/osnet/dist/uts/common/sys/gfs.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/sys/gfs.h
diff -N src/external/cddl/osnet/dist/uts/common/sys/gfs.h
--- src/external/cddl/osnet/dist/uts/common/sys/gfs.h	7 Aug 2009 20:16:46 -0000	1.2
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,173 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_GFS_H
-#define	_SYS_GFS_H
-
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
-#include <sys/types.h>
-#include <sys/vnode.h>
-#include <sys/mutex.h>
-#include <sys/dirent.h>
-#include <sys/extdirent.h>
-#include <sys/uio.h>
-#include <sys/list.h>
-#include <sys/pathname.h>
-
-#ifdef	__cplusplus
-extern "C" {
-#endif
-
-#ifdef PORT_SOLARIS	
-typedef struct gfs_opsvec {
-	const char		*gfsv_name;	/* vnode description */
-	const fs_operation_def_t *gfsv_template; /* ops template */
-	vnodeops_t		**gfsv_ops;	/* ptr to result */
-} gfs_opsvec_t;
-
-int gfs_make_opsvec(gfs_opsvec_t *);
-#endif
-	
-#define	GFS_CACHE_VNODE		0x1
-
-typedef struct gfs_dirent {
-	char			*gfse_name;	/* entry name */
-	vnode_t *(*gfse_ctor)(vnode_t *);	/* constructor */
-	int			gfse_flags;	/* flags */
-	list_node_t		gfse_link;	/* dynamic list */
-	vnode_t			*gfse_vnode;	/* cached vnode */
-} gfs_dirent_t;
-
-typedef enum gfs_type {
-	GFS_DIR,
-	GFS_FILE
-} gfs_type_t;
-
-typedef struct gfs_file {
-	vnode_t		*gfs_vnode;	/* current vnode */
-	vnode_t		*gfs_parent;	/* parent vnode */
-	size_t		gfs_size;	/* size of private data structure */
-	gfs_type_t	gfs_type;	/* type of vnode */
-	int		gfs_index;	/* index in parent dir */
-	ino64_t		gfs_ino;	/* inode for this vnode */
-} gfs_file_t;
-
-typedef int (*gfs_readdir_cb)(vnode_t *, void *, int *, offset_t *,
-    offset_t *, void *, int);
-typedef int (*gfs_lookup_cb)(vnode_t *, const char *, vnode_t **, ino64_t *,
-    cred_t *, int, int *, pathname_t *);
-typedef ino64_t (*gfs_inode_cb)(vnode_t *, int);
-
-typedef struct gfs_dir {
-	gfs_file_t	gfsd_file;	/* generic file attributes */
-	gfs_dirent_t	*gfsd_static;	/* statically defined entries */
-	int		gfsd_nstatic;	/* # static entries */
-	kmutex_t	gfsd_lock;	/* protects entries */
-	int		gfsd_maxlen;	/* maximum name length */
-	gfs_readdir_cb	gfsd_readdir;	/* readdir() callback */
-	gfs_lookup_cb	gfsd_lookup;	/* lookup() callback */
-	gfs_inode_cb	gfsd_inode;	/* get an inode number */
-} gfs_dir_t;
-
-struct vfs;
-
-extern vnode_t *gfs_file_create(size_t, vnode_t *, vnodeops_t *);
-extern vnode_t *gfs_dir_create(size_t, vnode_t *, vnodeops_t *,
-    gfs_dirent_t *, gfs_inode_cb, int, gfs_readdir_cb, gfs_lookup_cb);
-extern vnode_t *gfs_root_create(size_t, struct vfs *, vnodeops_t *, ino64_t,
-    gfs_dirent_t *, gfs_inode_cb, int, gfs_readdir_cb, gfs_lookup_cb);
-extern vnode_t *gfs_root_create_file(size_t, struct vfs *, vnodeops_t *,
-    ino64_t);
-
-extern void *gfs_file_inactive(vnode_t *);
-extern void *gfs_dir_inactive(vnode_t *);
-
-extern int gfs_dir_case_lookup(vnode_t *, const char *, vnode_t **, cred_t *,
-    int, int *, pathname_t *);
-extern int gfs_dir_lookup(vnode_t *, const char *, vnode_t **, cred_t *,
-    int, int *, pathname_t *);
-extern int gfs_dir_readdir(vnode_t *, uio_t *, int *, void *, cred_t *,
-    caller_context_t *, int flags);
-
-#define	gfs_dir_lock(gd)	mutex_enter(&(gd)->gfsd_lock)
-#define	gfs_dir_unlock(gd)	mutex_exit(&(gd)->gfsd_lock)
-#define	GFS_DIR_LOCKED(gd)	MUTEX_HELD(&(gd)->gfsd_lock)
-
-#define	gfs_file_parent(vp)	(((gfs_file_t *)(vp)->v_data)->gfs_parent)
-
-#define	gfs_file_index(vp)	(((gfs_file_t *)(vp)->v_data)->gfs_index)
-#define	gfs_file_set_index(vp, idx)	\
-	(((gfs_file_t *)(vp)->v_data)->gfs_index = (idx))
-
-#define	gfs_file_inode(vp)	(((gfs_file_t *)(vp)->v_data)->gfs_ino)
-#define	gfs_file_set_inode(vp, ino)	\
-	(((gfs_file_t *)(vp)->v_data)->gfs_ino = (ino))
-
-typedef struct gfs_readdir_state {
-	void		*grd_dirent;	/* directory entry buffer */
-	size_t		grd_namlen;	/* max file name length */
-	size_t		grd_ureclen;	/* exported record size */
-	ssize_t		grd_oresid;	/* original uio_resid */
-	ino64_t		grd_parent;	/* inode of parent */
-	ino64_t		grd_self;	/* inode of self */
-	int		grd_flags;	/* flags from VOP_READDIR */
-} gfs_readdir_state_t;
-
-extern int gfs_readdir_init(gfs_readdir_state_t *, int, int, uio_t *, ino64_t,
-    ino64_t, int);
-extern int gfs_readdir_emit(gfs_readdir_state_t *, uio_t *, offset_t, ino64_t,
-    const char *, int);
-extern int gfs_readdir_emitn(gfs_readdir_state_t *, uio_t *, offset_t, ino64_t,
-    unsigned long);
-extern int gfs_readdir_pred(gfs_readdir_state_t *, uio_t *, offset_t *);
-extern int gfs_readdir_fini(gfs_readdir_state_t *, int, int *, int);
-extern int gfs_get_parent_ino(vnode_t *, cred_t *, caller_context_t *,
-    ino64_t *, ino64_t *);
-
-/*
- * Objects with real extended attributes will get their . and ..
- * readdir entries from the real xattr directory. GFS_STATIC_ENTRY_OFFSET
- * lets us skip right to the static entries in the GFS directory.
- */
-#define	GFS_STATIC_ENTRY_OFFSET	((offset_t)2)
-
-extern int gfs_lookup_dot(vnode_t **, vnode_t *, vnode_t *, const char *);
-
-extern int gfs_vop_lookup(vnode_t *, char *, vnode_t **, pathname_t *,
-    int, vnode_t *, cred_t *, caller_context_t *, int *, pathname_t *);
-extern int gfs_vop_readdir(vnode_t *, uio_t *, cred_t *, int *,
-    caller_context_t *, int);
-//extern int gfs_vop_map(vnode_t *, offset_t, struct as *, caddr_t *,
-//	size_t, uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
-extern void gfs_vop_inactive(vnode_t *, cred_t *, caller_context_t *);
-
-
-#ifdef	__cplusplus
-}
-#endif
-
-#endif	/* _SYS_GFS_H */
Index: src/external/cddl/osnet/dist/uts/common/sys/nvpair.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/nvpair.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 nvpair.h
--- src/external/cddl/osnet/dist/uts/common/sys/nvpair.h	27 Feb 2010 22:32:37 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/nvpair.h	23 Mar 2013 15:29:24 -0000
@@ -19,16 +19,16 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_NVPAIR_H
 #define	_SYS_NVPAIR_H
 
 #include <sys/types.h>
+#include <sys/time.h>
 #include <sys/errno.h>
-#include <sys/va_list.h>
 
 #if defined(_KERNEL) && !defined(_BOOT)
 #include <sys/kmem.h>
@@ -158,6 +158,8 @@ int nvlist_unpack(char *, size_t, nvlist
 int nvlist_dup(nvlist_t *, nvlist_t **, int);
 int nvlist_merge(nvlist_t *, nvlist_t *, int);
 
+uint_t nvlist_nvflag(nvlist_t *);
+
 int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *);
 int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *);
 int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *);
@@ -273,6 +275,74 @@ int nvpair_value_hrtime(nvpair_t *, hrti
 int nvpair_value_double(nvpair_t *, double *);
 #endif
 
+nvlist_t *fnvlist_alloc(void);
+void fnvlist_free(nvlist_t *);
+size_t fnvlist_size(nvlist_t *);
+char *fnvlist_pack(nvlist_t *, size_t *);
+void fnvlist_pack_free(char *, size_t);
+nvlist_t *fnvlist_unpack(char *, size_t);
+nvlist_t *fnvlist_dup(nvlist_t *);
+void fnvlist_merge(nvlist_t *, nvlist_t *);
+size_t fnvlist_num_pairs(nvlist_t *);
+
+void fnvlist_add_boolean(nvlist_t *, const char *);
+void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t);
+void fnvlist_add_byte(nvlist_t *, const char *, uchar_t);
+void fnvlist_add_int8(nvlist_t *, const char *, int8_t);
+void fnvlist_add_uint8(nvlist_t *, const char *, uint8_t);
+void fnvlist_add_int16(nvlist_t *, const char *, int16_t);
+void fnvlist_add_uint16(nvlist_t *, const char *, uint16_t);
+void fnvlist_add_int32(nvlist_t *, const char *, int32_t);
+void fnvlist_add_uint32(nvlist_t *, const char *, uint32_t);
+void fnvlist_add_int64(nvlist_t *, const char *, int64_t);
+void fnvlist_add_uint64(nvlist_t *, const char *, uint64_t);
+void fnvlist_add_string(nvlist_t *, const char *, const char *);
+void fnvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *);
+void fnvlist_add_nvpair(nvlist_t *, nvpair_t *);
+void fnvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t);
+void fnvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t);
+void fnvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t);
+void fnvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t);
+void fnvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t);
+void fnvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t);
+void fnvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t);
+void fnvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t);
+void fnvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t);
+void fnvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t);
+void fnvlist_add_string_array(nvlist_t *, const char *, char * const *, uint_t);
+void fnvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t);
+
+void fnvlist_remove(nvlist_t *, const char *);
+void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *);
+
+nvpair_t *fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name);
+boolean_t fnvlist_lookup_boolean(nvlist_t *nvl, const char *name);
+boolean_t fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name);
+uchar_t fnvlist_lookup_byte(nvlist_t *nvl, const char *name);
+int8_t fnvlist_lookup_int8(nvlist_t *nvl, const char *name);
+int16_t fnvlist_lookup_int16(nvlist_t *nvl, const char *name);
+int32_t fnvlist_lookup_int32(nvlist_t *nvl, const char *name);
+int64_t fnvlist_lookup_int64(nvlist_t *nvl, const char *name);
+uint8_t fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name);
+uint16_t fnvlist_lookup_uint16(nvlist_t *nvl, const char *name);
+uint32_t fnvlist_lookup_uint32(nvlist_t *nvl, const char *name);
+uint64_t fnvlist_lookup_uint64(nvlist_t *nvl, const char *name);
+char *fnvlist_lookup_string(nvlist_t *nvl, const char *name);
+nvlist_t *fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name);
+
+boolean_t fnvpair_value_boolean_value(nvpair_t *nvp);
+uchar_t fnvpair_value_byte(nvpair_t *nvp);
+int8_t fnvpair_value_int8(nvpair_t *nvp);
+int16_t fnvpair_value_int16(nvpair_t *nvp);
+int32_t fnvpair_value_int32(nvpair_t *nvp);
+int64_t fnvpair_value_int64(nvpair_t *nvp);
+uint8_t fnvpair_value_uint8_t(nvpair_t *nvp);
+uint16_t fnvpair_value_uint16(nvpair_t *nvp);
+uint32_t fnvpair_value_uint32(nvpair_t *nvp);
+uint64_t fnvpair_value_uint64(nvpair_t *nvp);
+char *fnvpair_value_string(nvpair_t *nvp);
+nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp);
+
 #ifdef	__cplusplus
 }
 #endif
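
The fnvlist_* additions are the "fatal" nvlist variants: rather than returning an errno for the caller to check, they assert success (allocation cannot fail, lookups abort if the pair is absent), which removes the error-handling boilerplate around nvlist_alloc() and friends. A minimal sketch using only functions declared above:

	#include <sys/nvpair.h>

	static uint64_t
	example_roundtrip(void)
	{
		nvlist_t *nvl = fnvlist_alloc();	/* no failure path */
		uint64_t v;

		fnvlist_add_string(nvl, "pool", "tank");
		fnvlist_add_uint64(nvl, "refquota", 1024);

		/* would abort, not return ENOENT, if the pair were missing */
		v = fnvlist_lookup_uint64(nvl, "refquota");

		fnvlist_free(nvl);
		return (v);
	}
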
Index: src/external/cddl/osnet/dist/uts/common/sys/processor.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/processor.h,v
retrieving revision 1.5
diff -u -p -r1.5 processor.h
--- src/external/cddl/osnet/dist/uts/common/sys/processor.h	13 Mar 2010 22:31:15 -0000	1.5
+++ src/external/cddl/osnet/dist/uts/common/sys/processor.h	1 Nov 2014 10:04:54 -0000
@@ -1,4 +1,4 @@
-	/*
+/*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
@@ -25,6 +25,8 @@
  */
 
 /*
+ * Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ *
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
@@ -32,11 +34,8 @@
 #ifndef _SYS_PROCESSOR_H
 #define	_SYS_PROCESSOR_H
 
-/* #pragma ident	"%Z%%M%	%I%	%E% SMI" */
-
 #include <sys/types.h>
 #include <sys/procset.h>
-#include <sys/pset.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -54,9 +53,7 @@ typedef uint16_t lgrpid_t;
 /*
  * Type for processor name (CPU number).
  */
-#if defined(sun)
 typedef	int	processorid_t;
-#endif
 typedef int	chipid_t;
 
 /*
@@ -117,7 +114,6 @@ typedef struct {
  * User-level system call interface prototypes
  */
 #ifndef _KERNEL
-#ifdef __STDC__
 
 extern int	p_online(processorid_t processorid, int flag);
 extern int	processor_info(processorid_t processorid,
@@ -127,22 +123,13 @@ extern int	processor_bind(idtype_t idtyp
 extern processorid_t getcpuid(void);
 extern lgrpid_t gethomelgroup(void);
 
-#else
-
-extern int	p_online();
-extern int	processor_info();
-extern int	processor_bind();
-extern processorid_t getcpuid();
-extern lgrpid_t gethomelgroup();
-
-#endif /* __STDC__ */
-
 #else   /* _KERNEL */
 
 /*
  * Internal interface prototypes
  */
 extern int	p_online_internal(processorid_t, int, int *);
+extern int	p_online_internal_locked(processorid_t, int, int *);
 
 #endif /* !_KERNEL */
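
With the pre-ANSI declarations gone, the user-level interface is the single prototyped set above. A hedged sketch of a status query; P_STATUS (the query-only flag) is assumed to be defined earlier in this header, as on illumos:

	#include <sys/processor.h>
	#include <stdio.h>

	static int
	example_cpu_status(processorid_t cpu)
	{
		int status = p_online(cpu, P_STATUS);	/* query, no change */

		if (status == -1) {
			perror("p_online");
			return (-1);
		}
		printf("cpu%d: state %d\n", (int)cpu, status);
		return (0);
	}
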
 
Index: src/external/cddl/osnet/dist/uts/common/sys/sysevent.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/sysevent.h,v
retrieving revision 1.3
diff -u -p -r1.3 sysevent.h
--- src/external/cddl/osnet/dist/uts/common/sys/sysevent.h	27 Feb 2010 23:43:53 -0000	1.3
+++ src/external/cddl/osnet/dist/uts/common/sys/sysevent.h	31 May 2016 11:20:08 -0000
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_SYSEVENT_H
@@ -77,8 +76,8 @@ extern "C" {
 #define	SUNW_VENDOR	"SUNW"
 #define	SE_USR_PUB	"usr:"
 #define	SE_KERN_PUB	"kern:"
-#define	SUNW_KERN_PUB	SUNW_VENDOR":"SE_KERN_PUB
-#define	SUNW_USR_PUB	SUNW_VENDOR":"SE_USR_PUB
+#define	SUNW_KERN_PUB	SUNW_VENDOR ":" SE_KERN_PUB
+#define	SUNW_USR_PUB	SUNW_VENDOR ":" SE_USR_PUB
 
 /*
  * Event header and attribute value limits
@@ -215,20 +214,25 @@ typedef struct sysevent_value {
 #define	EVCH_SET_CHAN_LEN	 3	/* Set event queue length */
 #define	EVCH_CMD_LAST		 EVCH_SET_CHAN_LEN	/* Last command */
 
+#ifdef illumos
 /*
  * Shared user/kernel event channel interface definitions
  */
-/*int sysevent_evc_bind(const char *, evchan_t **, uint32_t);*/
-/*void sysevent_evc_unbind(evchan_t *);*/
-int sysevent_evc_subscribe(evchan_t *, const char *, const char *,
+extern int sysevent_evc_bind(const char *, evchan_t **, uint32_t);
+extern int sysevent_evc_unbind(evchan_t *);
+extern int sysevent_evc_subscribe(evchan_t *, const char *, const char *,
     int (*)(sysevent_t *, void *), void *, uint32_t);
 extern int sysevent_evc_unsubscribe(evchan_t *, const char *);
 extern int sysevent_evc_publish(evchan_t *, const char *, const char *,
     const char *, const char *, nvlist_t *, uint32_t);
-/*int sysevent_evc_control(evchan_t *, int, ...);*/
+extern int sysevent_evc_control(evchan_t *, int, ...);
+extern int sysevent_evc_setpropnvl(evchan_t *, nvlist_t *);
+extern int sysevent_evc_getpropnvl(evchan_t *, nvlist_t **);
+#endif	/* illumos */
 
 #ifndef	_KERNEL
 
+#ifdef illumos
 /*
  * Userland-only event channel interfaces
  */
@@ -250,6 +254,7 @@ extern void sysevent_subattr_thrsetup(sy
 
 extern int sysevent_evc_xsubscribe(evchan_t *, const char *, const char *,
     int (*)(sysevent_t *, void *), void *, uint32_t, sysevent_subattr_t *);
+#endif	/* illumos */
 
 #else
 
@@ -265,6 +270,7 @@ extern int sysevent_add_attr(sysevent_at
 extern void sysevent_free_attr(sysevent_attr_list_t *);
 extern int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *);
 extern void sysevent_detach_attributes(sysevent_t *);
+#ifdef illumos
 extern char *sysevent_get_class_name(sysevent_t *);
 extern char *sysevent_get_subclass_name(sysevent_t *);
 extern uint64_t sysevent_get_seq(sysevent_t *);
@@ -272,6 +278,7 @@ extern void sysevent_get_time(sysevent_t
 extern size_t sysevent_get_size(sysevent_t *);
 extern char *sysevent_get_pub(sysevent_t *);
 extern int sysevent_get_attr_list(sysevent_t *, nvlist_t **);
+#endif	/* illumos */
 
 #endif	/* _KERNEL */
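
The channel interfaces are now visible only under illumos, matching where their implementation lives. For reference, a hedged sketch of the bind/publish flow on that side; the EVCH_CREAT and EVCH_SLEEP flag values, channel name, and event classes are illustrative assumptions:

	#ifdef illumos
	#include <sys/sysevent.h>

	static int
	example_publish(nvlist_t *attrs)
	{
		evchan_t *ch;

		if (sysevent_evc_bind("example_chan", &ch, EVCH_CREAT) != 0)
			return (-1);
		(void) sysevent_evc_publish(ch, "EC_example", "ESC_example",
		    SUNW_VENDOR, "example", attrs, EVCH_SLEEP);
		(void) sysevent_evc_unbind(ch);
		return (0);
	}
	#endif
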
 
Index: src/external/cddl/osnet/dist/uts/common/sys/sysmacros.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/sys/sysmacros.h
diff -N src/external/cddl/osnet/dist/uts/common/sys/sysmacros.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/sys/sysmacros.h	6 Jul 2015 15:44:48 -0000
@@ -0,0 +1,459 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
+/*	  All Rights Reserved  	*/
+
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SYSMACROS_H
+#define	_SYS_SYSMACROS_H
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#if defined(__FreeBSD__) && defined(_KERNEL)
+#include <sys/libkern.h>
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Some macros for units conversion
+ */
+/*
+ * Disk blocks (sectors) and bytes.
+ */
+#define	dtob(DD)	((DD) << DEV_BSHIFT)
+#define	btod(BB)	(((BB) + DEV_BSIZE - 1) >> DEV_BSHIFT)
+#define	btodt(BB)	((BB) >> DEV_BSHIFT)
+#define	lbtod(BB)	(((offset_t)(BB) + DEV_BSIZE - 1) >> DEV_BSHIFT)
+
+/* common macros */
+#ifndef MIN
+#define	MIN(a, b)	((a) < (b) ? (a) : (b))
+#endif
+#ifndef MAX
+#define	MAX(a, b)	((a) < (b) ? (b) : (a))
+#endif
+#ifndef ABS
+#define	ABS(a)		((a) < 0 ? -(a) : (a))
+#endif
+#ifndef	SIGNOF
+#define	SIGNOF(a)	((a) < 0 ? -1 : (a) > 0)
+#endif
+
+#ifdef _KERNEL
+
+/*
+ * Convert a single byte to/from binary-coded decimal (BCD).
+ */
+extern unsigned char byte_to_bcd[256];
+extern unsigned char bcd_to_byte[256];
+
+#define	BYTE_TO_BCD(x)	byte_to_bcd[(x) & 0xff]
+#define	BCD_TO_BYTE(x)	bcd_to_byte[(x) & 0xff]
+
+#endif	/* _KERNEL */
+
+/*
+ * WARNING: The device number macros defined here should not be used by device
+ * drivers or user software. Device drivers should use the device functions
+ * defined in the DDI/DKI interface (see also ddi.h). Application software
+ * should make use of the library routines available in makedev(3). A set of
+ * new device macros are provided to operate on the expanded device number
+ * format supported in SVR4. Macro versions of the DDI device functions are
+ * provided for use by kernel proper routines only. Macro routines bmajor(),
+ * major(), minor(), emajor(), eminor(), and makedev() will be removed or
+ * their definitions changed at the next major release following SVR4.
+ */
+
+#define	O_BITSMAJOR	7	/* # of SVR3 major device bits */
+#define	O_BITSMINOR	8	/* # of SVR3 minor device bits */
+#define	O_MAXMAJ	0x7f	/* SVR3 max major value */
+#define	O_MAXMIN	0xff	/* SVR3 max minor value */
+
+
+#define	L_BITSMAJOR32	14	/* # of SVR4 major device bits */
+#define	L_BITSMINOR32	18	/* # of SVR4 minor device bits */
+#define	L_MAXMAJ32	0x3fff	/* SVR4 max major value */
+#define	L_MAXMIN32	0x3ffff	/* MAX minor for 3b2 software drivers. */
+				/* For 3b2 hardware devices the minor is */
+				/* restricted to 256 (0-255) */
+
+#ifdef _LP64
+#define	L_BITSMAJOR	32	/* # of major device bits in 64-bit Solaris */
+#define	L_BITSMINOR	32	/* # of minor device bits in 64-bit Solaris */
+#define	L_MAXMAJ	0xfffffffful	/* max major value */
+#define	L_MAXMIN	0xfffffffful	/* max minor value */
+#else
+#define	L_BITSMAJOR	L_BITSMAJOR32
+#define	L_BITSMINOR	L_BITSMINOR32
+#define	L_MAXMAJ	L_MAXMAJ32
+#define	L_MAXMIN	L_MAXMIN32
+#endif
+
+#ifdef illumos
+#ifdef _KERNEL
+
+/* major part of a device internal to the kernel */
+
+#define	major(x)	(major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ)
+#define	bmajor(x)	(major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ)
+
+/* get internal major part of expanded device number */
+
+#define	getmajor(x)	(major_t)((((dev_t)(x)) >> L_BITSMINOR) & L_MAXMAJ)
+
+/* minor part of a device internal to the kernel */
+
+#define	minor(x)	(minor_t)((x) & O_MAXMIN)
+
+/* get internal minor part of expanded device number */
+
+#define	getminor(x)	(minor_t)((x) & L_MAXMIN)
+
+#else
+
+/* major part of a device external from the kernel (same as emajor below) */
+
+#define	major(x)	(major_t)((((unsigned)(x)) >> O_BITSMINOR) & O_MAXMAJ)
+
+/* minor part of a device external from the kernel  (same as eminor below) */
+
+#define	minor(x)	(minor_t)((x) & O_MAXMIN)
+
+#endif	/* _KERNEL */
+
+/* create old device number */
+
+#define	makedev(x, y) (unsigned short)(((x) << O_BITSMINOR) | ((y) & O_MAXMIN))
+
+/* make an new device number */
+
+#define	makedevice(x, y) (dev_t)(((dev_t)(x) << L_BITSMINOR) | ((y) & L_MAXMIN))
+
+
+/*
+ * emajor() allows kernel/driver code to print external major numbers
+ * eminor() allows kernel/driver code to print external minor numbers
+ */
+
+#define	emajor(x) \
+	(major_t)(((unsigned int)(x) >> O_BITSMINOR) > O_MAXMAJ) ? \
+	    NODEV : (((unsigned int)(x) >> O_BITSMINOR) & O_MAXMAJ)
+
+#define	eminor(x) \
+	(minor_t)((x) & O_MAXMIN)
+
+/*
+ * get external major and minor device
+ * components from expanded device number
+ */
+#define	getemajor(x)	(major_t)((((dev_t)(x) >> L_BITSMINOR) > L_MAXMAJ) ? \
+			    NODEV : (((dev_t)(x) >> L_BITSMINOR) & L_MAXMAJ))
+#define	geteminor(x)	(minor_t)((x) & L_MAXMIN)
+#endif /* illumos */
+
+/*
+ * These are versions of the kernel routines for compressing and
+ * expanding long device numbers that don't return errors.
+ */
+#if (L_BITSMAJOR32 == L_BITSMAJOR) && (L_BITSMINOR32 == L_BITSMINOR)
+
+#define	DEVCMPL(x)	(x)
+#define	DEVEXPL(x)	(x)
+
+#else
+
+#define	DEVCMPL(x)	\
+	(dev32_t)((((x) >> L_BITSMINOR) > L_MAXMAJ32 || \
+	    ((x) & L_MAXMIN) > L_MAXMIN32) ? NODEV32 : \
+	    ((((x) >> L_BITSMINOR) << L_BITSMINOR32) | ((x) & L_MAXMIN32)))
+
+#define	DEVEXPL(x)	\
+	(((x) == NODEV32) ? NODEV : \
+	makedevice(((x) >> L_BITSMINOR32) & L_MAXMAJ32, (x) & L_MAXMIN32))
+
+#endif /* L_BITSMAJOR32 ... */
+
+/* convert to old (SVR3.2) dev format */
+
+#define	cmpdev(x) \
+	(o_dev_t)((((x) >> L_BITSMINOR) > O_MAXMAJ || \
+	    ((x) & L_MAXMIN) > O_MAXMIN) ? NODEV : \
+	    ((((x) >> L_BITSMINOR) << O_BITSMINOR) | ((x) & O_MAXMIN)))
+
+/* convert to new (SVR4) dev format */
+
+#define	expdev(x) \
+	(dev_t)(((dev_t)(((x) >> O_BITSMINOR) & O_MAXMAJ) << L_BITSMINOR) | \
+	    ((x) & O_MAXMIN))
+
+/*
+ * Macro for checking power of 2 address alignment.
+ */
+#define	IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
+
+/*
+ * Macros for counting and rounding.
+ */
+#define	howmany(x, y)	(((x)+((y)-1))/(y))
+#define	roundup(x, y)	((((x)+((y)-1))/(y))*(y))
+
+/*
+ * Macro to determine if value is a power of 2
+ */
+#define	ISP2(x)		(((x) & ((x) - 1)) == 0)
+
+/*
+ * Macros for various sorts of alignment and rounding.  The "align" must
+ * be a power of 2.  Often times it is a block, sector, or page.
+ */
+
+/*
+ * return x rounded down to an align boundary
+ * eg, P2ALIGN(1200, 1024) == 1024 (1*align)
+ * eg, P2ALIGN(1024, 1024) == 1024 (1*align)
+ * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align)
+ * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+#define	P2ALIGN(x, align)		((x) & -(align))
+
+/*
+ * return x % (mod) align
+ * eg, P2PHASE(0x1234, 0x100) == 0x34 (x-0x12*align)
+ * eg, P2PHASE(0x5600, 0x100) == 0x00 (x-0x56*align)
+ */
+#define	P2PHASE(x, align)		((x) & ((align) - 1))
+
+/*
+ * return how much space is left in this block (but if it's perfectly
+ * aligned, return 0).
+ * eg, P2NPHASE(0x1234, 0x100) == 0xcc (0x13*align-x)
+ * eg, P2NPHASE(0x5600, 0x100) == 0x00 (0x56*align-x)
+ */
+#define	P2NPHASE(x, align)		(-(x) & ((align) - 1))
+
+/*
+ * return x rounded up to an align boundary
+ * eg, P2ROUNDUP(0x1234, 0x100) == 0x1300 (0x13*align)
+ * eg, P2ROUNDUP(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+#define	P2ROUNDUP(x, align)		(-(-(x) & -(align)))
+
+/*
+ * return the ending address of the block that x is in
+ * eg, P2END(0x1234, 0x100) == 0x12ff (0x13*align - 1)
+ * eg, P2END(0x5600, 0x100) == 0x56ff (0x57*align - 1)
+ */
+#define	P2END(x, align)			(-(~(x) & -(align)))
+
+/*
+ * return x rounded up to the next phase (offset) within align.
+ * phase should be < align.
+ * eg, P2PHASEUP(0x1234, 0x100, 0x10) == 0x1310 (0x13*align + phase)
+ * eg, P2PHASEUP(0x5600, 0x100, 0x10) == 0x5610 (0x56*align + phase)
+ */
+#define	P2PHASEUP(x, align, phase)	((phase) - (((phase) - (x)) & -(align)))
+
+/*
+ * return TRUE if adding len to off would cause it to cross an align
+ * boundary.
+ * eg, P2BOUNDARY(0x1234, 0xe0, 0x100) == TRUE (0x1234 + 0xe0 == 0x1314)
+ * eg, P2BOUNDARY(0x1234, 0x50, 0x100) == FALSE (0x1234 + 0x50 == 0x1284)
+ */
+#define	P2BOUNDARY(off, len, align) \
+	(((off) ^ ((off) + (len) - 1)) > (align) - 1)
+
+/*
+ * Return TRUE if they have the same highest bit set.
+ * eg, P2SAMEHIGHBIT(0x1234, 0x1001) == TRUE (the high bit is 0x1000)
+ * eg, P2SAMEHIGHBIT(0x1234, 0x3010) == FALSE (high bit of 0x3010 is 0x2000)
+ */
+#define	P2SAMEHIGHBIT(x, y)		(((x) ^ (y)) < ((x) & (y)))
+
+/*
+ * Typed version of the P2* macros.  These macros should be used to ensure
+ * that the result is correctly calculated based on the data type of (x),
+ * which is passed in as the last argument, regardless of the data
+ * type of the alignment.  For example, if (x) is of type uint64_t,
+ * and we want to round it up to a page boundary using "PAGESIZE" as
+ * the alignment, we can do either
+ *	P2ROUNDUP(x, (uint64_t)PAGESIZE)
+ * or
+ *	P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t)
+ */
+#define	P2ALIGN_TYPED(x, align, type)	\
+	((type)(x) & -(type)(align))
+#define	P2PHASE_TYPED(x, align, type)	\
+	((type)(x) & ((type)(align) - 1))
+#define	P2NPHASE_TYPED(x, align, type)	\
+	(-(type)(x) & ((type)(align) - 1))
+#define	P2ROUNDUP_TYPED(x, align, type)	\
+	(-(-(type)(x) & -(type)(align)))
+#define	P2END_TYPED(x, align, type)	\
+	(-(~(type)(x) & -(type)(align)))
+#define	P2PHASEUP_TYPED(x, align, phase, type)	\
+	((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align)))
+#define	P2CROSS_TYPED(x, y, align, type)	\
+	(((type)(x) ^ (type)(y)) > (type)(align) - 1)
+#define	P2SAMEHIGHBIT_TYPED(x, y, type) \
+	(((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y)))
+
+/*
+ * Macros to atomically increment/decrement a variable.  mutex and var
+ * must be pointers.
+ */
+#define	INCR_COUNT(var, mutex) mutex_enter(mutex), (*(var))++, mutex_exit(mutex)
+#define	DECR_COUNT(var, mutex) mutex_enter(mutex), (*(var))--, mutex_exit(mutex)
+
+/*
+ * Macros to declare bitfields - the order in the parameter list is
+ * Low to High - that is, declare bit 0 first.  We only support 8-bit bitfields
+ * because if a field crosses a byte boundary it's not likely to be meaningful
+ * without reassembly in its nonnative endianness.
+ */
+#if defined(_BIT_FIELDS_LTOH)
+#define	DECL_BITFIELD2(_a, _b)				\
+	uint8_t _a, _b
+#define	DECL_BITFIELD3(_a, _b, _c)			\
+	uint8_t _a, _b, _c
+#define	DECL_BITFIELD4(_a, _b, _c, _d)			\
+	uint8_t _a, _b, _c, _d
+#define	DECL_BITFIELD5(_a, _b, _c, _d, _e)		\
+	uint8_t _a, _b, _c, _d, _e
+#define	DECL_BITFIELD6(_a, _b, _c, _d, _e, _f)		\
+	uint8_t _a, _b, _c, _d, _e, _f
+#define	DECL_BITFIELD7(_a, _b, _c, _d, _e, _f, _g)	\
+	uint8_t _a, _b, _c, _d, _e, _f, _g
+#define	DECL_BITFIELD8(_a, _b, _c, _d, _e, _f, _g, _h)	\
+	uint8_t _a, _b, _c, _d, _e, _f, _g, _h
+#elif defined(_BIT_FIELDS_HTOL)
+#define	DECL_BITFIELD2(_a, _b)				\
+	uint8_t _b, _a
+#define	DECL_BITFIELD3(_a, _b, _c)			\
+	uint8_t _c, _b, _a
+#define	DECL_BITFIELD4(_a, _b, _c, _d)			\
+	uint8_t _d, _c, _b, _a
+#define	DECL_BITFIELD5(_a, _b, _c, _d, _e)		\
+	uint8_t _e, _d, _c, _b, _a
+#define	DECL_BITFIELD6(_a, _b, _c, _d, _e, _f)		\
+	uint8_t _f, _e, _d, _c, _b, _a
+#define	DECL_BITFIELD7(_a, _b, _c, _d, _e, _f, _g)	\
+	uint8_t _g, _f, _e, _d, _c, _b, _a
+#define	DECL_BITFIELD8(_a, _b, _c, _d, _e, _f, _g, _h)	\
+	uint8_t _h, _g, _f, _e, _d, _c, _b, _a
+#else
+#error	One of _BIT_FIELDS_LTOH or _BIT_FIELDS_HTOL must be defined
+#endif  /* _BIT_FIELDS_LTOH */
+
+#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof)
+
+/* avoid any possibility of clashing with <stddef.h> version */
+
+#define	offsetof(s, m)	((size_t)(&(((s *)0)->m)))
+#endif
+
+/*
+ * Find highest one bit set.
+ *      Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ * High order bit is 31 (or 63 in _LP64 kernel).
+ */
+static __inline int
+highbit(ulong_t i)
+{
+#if defined(__FreeBSD__) && defined(_KERNEL) && defined(HAVE_INLINE_FLSL)
+	return (flsl(i));
+#else
+	register int h = 1;
+
+	if (i == 0)
+		return (0);
+#ifdef _LP64
+	if (i & 0xffffffff00000000ul) {
+		h += 32; i >>= 32;
+	}
+#endif
+	if (i & 0xffff0000) {
+		h += 16; i >>= 16;
+	}
+	if (i & 0xff00) {
+		h += 8; i >>= 8;
+	}
+	if (i & 0xf0) {
+		h += 4; i >>= 4;
+	}
+	if (i & 0xc) {
+		h += 2; i >>= 2;
+	}
+	if (i & 0x2) {
+		h += 1;
+	}
+	return (h);
+#endif
+}
+
+/*
+ * Find highest one bit set.
+ *	Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ */
+static __inline int
+highbit64(uint64_t i)
+{
+#if defined(__FreeBSD__) && defined(_KERNEL) && defined(HAVE_INLINE_FLSLL)
+	return (flsll(i));
+#else
+	int h = 1;
+
+	if (i == 0)
+		return (0);
+	if (i & 0xffffffff00000000ULL) {
+		h += 32; i >>= 32;
+	}
+	if (i & 0xffff0000) {
+		h += 16; i >>= 16;
+	}
+	if (i & 0xff00) {
+		h += 8; i >>= 8;
+	}
+	if (i & 0xf0) {
+		h += 4; i >>= 4;
+	}
+	if (i & 0xc) {
+		h += 2; i >>= 2;
+	}
+	if (i & 0x2) {
+		h += 1;
+	}
+	return (h);
+#endif
+}
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_SYSMACROS_H */
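
The _TYPED variants matter because the plain macros compute in whatever type the usual arithmetic conversions pick from the two operands, so a 32-bit alignment operand can silently truncate a 64-bit value. A userland-style sketch checking the documented examples (assert() used purely for illustration):

	#include <sys/types.h>
	#include <sys/sysmacros.h>
	#include <assert.h>

	static void
	example_p2(void)
	{
		uint64_t big = 0x100000234ULL;

		assert(P2ROUNDUP(0x1234, 0x100) == 0x1300);
		assert(P2PHASE(0x1234, 0x100) == 0x34);
		assert(P2NPHASE(0x1234, 0x100) == 0xcc);

		/*
		 * A 32-bit alignment operand could truncate 'big' in the
		 * plain macro; the typed form forces uint64_t arithmetic.
		 */
		assert(P2ROUNDUP_TYPED(big, 0x1000, uint64_t) == 0x100001000ULL);

		assert(highbit(0) == 0);
		assert(highbit(0x10) == 5);		/* bit number + 1 */
		assert(highbit64(1ULL << 40) == 41);
	}
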
Index: src/external/cddl/osnet/dist/uts/common/sys/taskq.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/taskq.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 taskq.h
--- src/external/cddl/osnet/dist/uts/common/sys/taskq.h	27 Feb 2010 22:32:39 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/taskq.h	17 Apr 2017 23:07:59 -0000
@@ -27,7 +27,10 @@
 #define	_SYS_TASKQ_H
 
 #include <sys/types.h>
-#include <sys/thread.h>
+#include <sys/proc.h>
+#ifdef __FreeBSD__
+#include <sys/taskqueue.h>
+#endif
 
 #ifdef	__cplusplus
 extern "C" {
@@ -35,6 +38,18 @@ extern "C" {
 
 #define	TASKQ_NAMELEN	31
 
+#ifdef __FreeBSD__
+struct taskqueue;
+struct taskq {
+	struct taskqueue	*tq_queue;
+};
+typedef struct taskq_ent {
+	struct task	 tqent_task;
+	task_func_t	*tqent_func;
+	void		*tqent_arg;
+} taskq_ent_t;
+#endif
+
 typedef struct taskq taskq_t;
 typedef uintptr_t taskqid_t;
 typedef void (task_func_t)(void *);
@@ -64,24 +79,27 @@ struct proc;
 
 extern taskq_t *system_taskq;
 
-extern void	taskq_init(void);
-extern void	taskq_mp_init(void);
+void	taskq_init(void);
+void	taskq_mp_init(void);
 
-extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
-extern taskq_t	*taskq_create_instance(const char *, int, int, pri_t, int,
-    int, uint_t);
-extern taskq_t	*taskq_create_proc(const char *, int, pri_t, int, int,
+taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
+taskq_t	*taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t);
+taskq_t	*taskq_create_proc(const char *, int, pri_t, int, int,
     struct proc *, uint_t);
-extern taskq_t	*taskq_create_sysdc(const char *, int, int, int,
+taskq_t	*taskq_create_sysdc(const char *, int, int, int,
     struct proc *, uint_t, uint_t);
-extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
-extern void	nulltask(void *);
-extern void	taskq_destroy(taskq_t *);
-extern void	taskq_wait(taskq_t *);
-extern void	taskq_suspend(taskq_t *);
-extern int	taskq_suspended(taskq_t *);
-extern void	taskq_resume(taskq_t *);
-extern int	taskq_member(taskq_t *, kthread_t *);
+taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
+#ifdef __FreeBSD__
+void	taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
+    taskq_ent_t *);
+#endif
+void	nulltask(void *);
+void	taskq_destroy(taskq_t *);
+void	taskq_wait(taskq_t *);
+void	taskq_suspend(taskq_t *);
+int	taskq_suspended(taskq_t *);
+void	taskq_resume(taskq_t *);
+int	taskq_member(taskq_t *, kthread_t *);
 
 #endif	/* _KERNEL */
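
taskq_dispatch_ent() is the no-allocation dispatch path: the caller embeds a taskq_ent_t in its own structure, so dispatch cannot fail for want of memory, which is why the ZFS I/O path wants it. A hedged sketch with hypothetical names:

	#include <sys/taskq.h>

	typedef struct my_job {
		taskq_ent_t	mj_ent;		/* preallocated dispatch entry */
		int		mj_arg;
	} my_job_t;

	static void
	my_job_func(void *arg)
	{
		my_job_t *mj = arg;
		/* ... perform the work described by mj ... */
	}

	static void
	my_job_submit(taskq_t *tq, my_job_t *mj)
	{
		/* never sleeps for an entry; the taskq_ent_t is caller-supplied */
		taskq_dispatch_ent(tq, my_job_func, mj, 0, &mj->mj_ent);
	}
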
 
Index: src/external/cddl/osnet/dist/uts/common/sys/taskq_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/taskq_impl.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 taskq_impl.h
--- src/external/cddl/osnet/dist/uts/common/sys/taskq_impl.h	27 Feb 2010 22:32:39 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/taskq_impl.h	26 Apr 2017 15:48:45 -0000
@@ -28,7 +28,9 @@
 
 #include <sys/taskq.h>
 #include <sys/inttypes.h>
+#ifdef _KERNEL
 #include <sys/vmem.h>
+#endif
 #include <sys/list.h>
 #include <sys/kstat.h>
 
@@ -46,8 +48,11 @@ typedef struct taskq_ent {
 	taskq_bucket_t		*tqent_bucket;
 	kthread_t		*tqent_thread;
 	kcondvar_t		tqent_cv;
+	uintptr_t		tqent_flags;
 } taskq_ent_t;
 
+#define	TQENT_FLAG_PREALLOC	0x1
+
 /*
  * Taskq Statistics fields are not protected by any locks.
  */
@@ -139,6 +144,10 @@ struct taskq {
 	int		tq_tdeaths;
 };
 
+/* Special form of taskq dispatch that uses preallocated entries. */
+void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, taskq_ent_t *);
+
+
 #define	tq_thread tq_thr._tq_thread
 #define	tq_threadlist tq_thr._tq_threadlist
 
Index: src/external/cddl/osnet/dist/uts/common/sys/u8_textprep.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/u8_textprep.h,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 u8_textprep.h
--- src/external/cddl/osnet/dist/uts/common/sys/u8_textprep.h	7 Aug 2009 18:34:36 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/sys/u8_textprep.h	18 Apr 2017 15:04:51 -0000
@@ -36,6 +36,7 @@
 extern "C" {
 #endif
 
+#ifdef illumos
 /*
  * Unicode encoding conversion functions and their macros.
  */
@@ -57,6 +58,7 @@ extern int uconv_u32tou16(const uint32_t
 extern int uconv_u32tou8(const uint32_t *, size_t *, uchar_t *, size_t *, int);
 extern int uconv_u8tou16(const uchar_t *, size_t *, uint16_t *, size_t *, int);
 extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int);
+#endif	/* illumos */
 
 /*
  * UTF-8 text preparation functions and their macros.
Index: src/external/cddl/osnet/dist/uts/common/sys/vnode.h
===================================================================
RCS file: src/external/cddl/osnet/dist/uts/common/sys/vnode.h
diff -N src/external/cddl/osnet/dist/uts/common/sys/vnode.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/dist/uts/common/sys/vnode.h	18 Apr 2017 12:04:56 -0000
@@ -0,0 +1,426 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
+/*	  All Rights Reserved  	*/
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+#ifndef _UTS_SYS_VNODE_H
+#define	_UTS_SYS_VNODE_H
+
+#include_next <sys/vnode.h>
+
+#define	IS_DEVVP(vp)	\
+	((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO)
+
+#define	V_XATTRDIR	0x0000	/* attribute unnamed directory */
+
+#define	AV_SCANSTAMP_SZ	32		/* length of anti-virus scanstamp */
+
+/*
+ * Structure of all optional attributes.
+ */
+typedef struct xoptattr {
+	timestruc_t	xoa_createtime;	/* Create time of file */
+	uint8_t		xoa_archive;
+	uint8_t		xoa_system;
+	uint8_t		xoa_readonly;
+	uint8_t		xoa_hidden;
+	uint8_t		xoa_nounlink;
+	uint8_t		xoa_immutable;
+	uint8_t		xoa_appendonly;
+	uint8_t		xoa_nodump;
+	uint8_t		xoa_opaque;
+	uint8_t		xoa_av_quarantined;
+	uint8_t		xoa_av_modified;
+	uint8_t		xoa_av_scanstamp[AV_SCANSTAMP_SZ];
+	uint8_t		xoa_reparse;
+	uint64_t	xoa_generation;
+	uint8_t		xoa_offline;
+	uint8_t		xoa_sparse;
+} xoptattr_t;
+
+/*
+ * The xvattr structure is really a variable length structure that
+ * is made up of:
+ * - The classic vattr_t (xva_vattr)
+ * - a 32 bit quantity (xva_mapsize) that specifies the size of the
+ *   attribute bitmaps in 32 bit words.
+ * - A pointer to the returned attribute bitmap (needed because the
+ *   previous element, the requested attribute bitmap, is variable length).
+ * - The requested attribute bitmap, which is an array of 32 bit words.
+ *   Callers use the XVA_SET_REQ() macro to set the bits corresponding to
+ *   the attributes that are being requested.
+ * - The returned attribute bitmap, which is an array of 32 bit words.
+ *   File systems that support optional attributes use the XVA_SET_RTN()
+ *   macro to set the bits corresponding to the attributes that are being
+ *   returned.
+ * - The xoptattr_t structure which contains the attribute values
+ *
+ * xva_mapsize determines how many words are in the attribute bitmaps.
+ * Immediately following the attribute bitmaps is the xoptattr_t.
+ * xva_getxoptattr() is used to get the pointer to the xoptattr_t
+ * section.
+ */
+
+#define	XVA_MAPSIZE	3		/* Size of attr bitmaps */
+#define	XVA_MAGIC	0x78766174	/* Magic # for verification */
+
+/*
+ * The xvattr structure is an extensible structure which permits optional
+ * attributes to be requested/returned.  File systems may or may not support
+ * optional attributes.  They do so at their own discretion but if they do
+ * support optional attributes, they must register the VFSFT_XVATTR feature
+ * so that the optional attributes can be set/retrieved.
+ *
+ * The fields of the xvattr structure are:
+ *
+ * xva_vattr - The first element of an xvattr is a legacy vattr structure
+ * which includes the common attributes.  If AT_XVATTR is set in the va_mask
+ * then the entire structure is treated as an xvattr.  If AT_XVATTR is not
+ * set, then only the xva_vattr structure can be used.
+ *
+ * xva_magic - 0x78766174 (hex for "xvat"). Magic number for verification.
+ *
+ * xva_mapsize - Size of requested and returned attribute bitmaps.
+ *
+ * xva_rtnattrmapp - Pointer to xva_rtnattrmap[].  We need this since the
+ * size of the array before it, xva_reqattrmap[], could change which means
+ * the location of xva_rtnattrmap[] could change.  This will allow unbundled
+ * file systems to find the location of xva_rtnattrmap[] when the sizes change.
+ *
+ * xva_reqattrmap[] - Array of requested attributes.  Attributes are
+ * represented by a specific bit in a specific element of the attribute
+ * map array.  Callers set the bits corresponding to the attributes
+ * that the caller wants to get/set.
+ *
+ * xva_rtnattrmap[] - Array of attributes that the file system was able to
+ * process.  Not all file systems support all optional attributes.  This map
+ * informs the caller which attributes the underlying file system was able
+ * to set/get.  (Same structure as the requested attributes array in terms
+ * of each attribute corresponding to specific bits and array elements.)
+ *
+ * xva_xoptattrs - Structure containing values of optional attributes.
+ * These values are only valid if the corresponding bits in xva_reqattrmap
+ * are set and the underlying file system supports those attributes.
+ */
+typedef struct xvattr {
+	vattr_t		xva_vattr;	/* Embedded vattr structure */
+	uint32_t	xva_magic;	/* Magic Number */
+	uint32_t	xva_mapsize;	/* Size of attr bitmap (32-bit words) */
+	uint32_t	*xva_rtnattrmapp;	/* Ptr to xva_rtnattrmap[] */
+	uint32_t	xva_reqattrmap[XVA_MAPSIZE];	/* Requested attrs */
+	uint32_t	xva_rtnattrmap[XVA_MAPSIZE];	/* Returned attrs */
+	xoptattr_t	xva_xoptattrs;	/* Optional attributes */
+} xvattr_t;
+
+/*
+ * Attributes of interest to the caller of setattr or getattr.
+ */
+#define	AT_TYPE		0x00001
+#define	AT_MODE		0x00002
+#define	AT_UID		0x00004
+#define	AT_GID		0x00008
+#define	AT_FSID		0x00010
+#define	AT_NODEID	0x00020
+#define	AT_NLINK	0x00040
+#define	AT_SIZE		0x00080
+#define	AT_ATIME	0x00100
+#define	AT_MTIME	0x00200
+#define	AT_CTIME	0x00400
+#define	AT_RDEV		0x00800
+#define	AT_BLKSIZE	0x01000
+#define	AT_NBLOCKS	0x02000
+/*			0x04000 */	/* unused */
+#define	AT_SEQ		0x08000
+/*
+ * If AT_XVATTR is set then there are additional bits to process in
+ * the xvattr_t's attribute bitmap.  If this is not set then the bitmap
+ * MUST be ignored.  Note that this bit must be set/cleared explicitly.
+ * That is, setting AT_ALL will NOT set AT_XVATTR.
+ */
+#define	AT_XVATTR	0x10000
+
+#define	AT_ALL		(AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\
+			AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|\
+			AT_RDEV|AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
+
+#define	AT_STAT		(AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\
+			AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|AT_TYPE)
+
+#define	AT_TIMES	(AT_ATIME|AT_MTIME|AT_CTIME)
+
+#define	AT_NOSET	(AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\
+			AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
+
+/*
+ * Attribute bits used in the extensible attribute's (xva's) attribute
+ * bitmaps.  Note that the bitmaps are made up of a variable length number
+ * of 32-bit words.  The convention is to use XAT{n}_{attrname} where "n"
+ * is the element in the bitmap (starting at 1).  This convention is for
+ * the convenience of the maintainer to keep track of which element each
+ * attribute belongs to.
+ *
+ * NOTE THAT CONSUMERS MUST *NOT* USE THE XATn_* DEFINES DIRECTLY.  CONSUMERS
+ * MUST USE THE XAT_* DEFINES.
+ */
+#define	XAT0_INDEX	0LL		/* Index into bitmap for XAT0 attrs */
+#define	XAT0_CREATETIME	0x00000001	/* Create time of file */
+#define	XAT0_ARCHIVE	0x00000002	/* Archive */
+#define	XAT0_SYSTEM	0x00000004	/* System */
+#define	XAT0_READONLY	0x00000008	/* Readonly */
+#define	XAT0_HIDDEN	0x00000010	/* Hidden */
+#define	XAT0_NOUNLINK	0x00000020	/* Nounlink */
+#define	XAT0_IMMUTABLE	0x00000040	/* immutable */
+#define	XAT0_APPENDONLY	0x00000080	/* appendonly */
+#define	XAT0_NODUMP	0x00000100	/* nodump */
+#define	XAT0_OPAQUE	0x00000200	/* opaque */
+#define	XAT0_AV_QUARANTINED	0x00000400	/* anti-virus quarantine */
+#define	XAT0_AV_MODIFIED	0x00000800	/* anti-virus modified */
+#define	XAT0_AV_SCANSTAMP	0x00001000	/* anti-virus scanstamp */
+#define	XAT0_REPARSE	0x00002000	/* FS reparse point */
+#define	XAT0_GEN	0x00004000	/* object generation number */
+#define	XAT0_OFFLINE	0x00008000	/* offline */
+#define	XAT0_SPARSE	0x00010000	/* sparse */
+
+#define	XAT0_ALL_ATTRS	(XAT0_CREATETIME|XAT0_ARCHIVE|XAT0_SYSTEM| \
+    XAT0_READONLY|XAT0_HIDDEN|XAT0_NOUNLINK|XAT0_IMMUTABLE|XAT0_APPENDONLY| \
+    XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED|XAT0_AV_MODIFIED| \
+    XAT0_AV_SCANSTAMP|XAT0_REPARSE|XAT0_GEN|XAT0_OFFLINE|XAT0_SPARSE)
+
+/* Support for XAT_* optional attributes */
+#define	XVA_MASK		0xffffffff	/* Used to mask off 32 bits */
+#define	XVA_SHFT		32		/* Used to shift index */
+
+/*
+ * Used to pry out the index and attribute bits from the XAT_* attributes
+ * defined below.  Note that we're masking things down to 32 bits then
+ * casting to uint32_t.
+ */
+#define	XVA_INDEX(attr)		((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK))
+#define	XVA_ATTRBIT(attr)	((uint32_t)((attr) & XVA_MASK))
+
+/*
+ * The following defines present a "flat namespace" so that consumers don't
+ * need to keep track of which element belongs to which bitmap entry.
+ *
+ * NOTE THAT THESE MUST NEVER BE OR-ed TOGETHER
+ */
+#define	XAT_CREATETIME		((XAT0_INDEX << XVA_SHFT) | XAT0_CREATETIME)
+#define	XAT_ARCHIVE		((XAT0_INDEX << XVA_SHFT) | XAT0_ARCHIVE)
+#define	XAT_SYSTEM		((XAT0_INDEX << XVA_SHFT) | XAT0_SYSTEM)
+#define	XAT_READONLY		((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY)
+#define	XAT_HIDDEN		((XAT0_INDEX << XVA_SHFT) | XAT0_HIDDEN)
+#define	XAT_NOUNLINK		((XAT0_INDEX << XVA_SHFT) | XAT0_NOUNLINK)
+#define	XAT_IMMUTABLE		((XAT0_INDEX << XVA_SHFT) | XAT0_IMMUTABLE)
+#define	XAT_APPENDONLY		((XAT0_INDEX << XVA_SHFT) | XAT0_APPENDONLY)
+#define	XAT_NODUMP		((XAT0_INDEX << XVA_SHFT) | XAT0_NODUMP)
+#define	XAT_OPAQUE		((XAT0_INDEX << XVA_SHFT) | XAT0_OPAQUE)
+#define	XAT_AV_QUARANTINED	((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED)
+#define	XAT_AV_MODIFIED		((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED)
+#define	XAT_AV_SCANSTAMP	((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP)
+#define	XAT_REPARSE		((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE)
+#define	XAT_GEN			((XAT0_INDEX << XVA_SHFT) | XAT0_GEN)
+#define	XAT_OFFLINE		((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE)
+#define	XAT_SPARSE		((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE)
+
+/*
+ * The returned attribute map array (xva_rtnattrmap[]) is located past the
+ * requested attribute map array (xva_reqattrmap[]).  Its location changes
+ * when the array sizes change.  We use a separate pointer in a known location
+ * (xva_rtnattrmapp) to hold the location of xva_rtnattrmap[].  This is
+ * set in xva_init()
+ */
+#define	XVA_RTNATTRMAP(xvap)	((xvap)->xva_rtnattrmapp)
+
+/*
+ * XVA_SET_REQ() sets an attribute bit in the proper element in the bitmap
+ * of requested attributes (xva_reqattrmap[]).
+ */
+#define	XVA_SET_REQ(xvap, attr)	{				\
+	ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR);		\
+	ASSERT((xvap)->xva_magic == XVA_MAGIC);			\
+	(xvap)->xva_reqattrmap[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \
+}
+/*
+ * XVA_CLR_REQ() clears an attribute bit in the proper element in the bitmap
+ * of requested attributes (xva_reqattrmap[]).
+ */
+#define	XVA_CLR_REQ(xvap, attr)	{				\
+	ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR);		\
+	ASSERT((xvap)->xva_magic == XVA_MAGIC);			\
+	(xvap)->xva_reqattrmap[XVA_INDEX(attr)] &= ~XVA_ATTRBIT(attr); \
+}
+
+/*
+ * XVA_SET_RTN() sets an attribute bit in the proper element in the bitmap
+ * of returned attributes (xva_rtnattrmap[]).
+ */
+#define	XVA_SET_RTN(xvap, attr)	{				\
+	ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR);		\
+	ASSERT((xvap)->xva_magic == XVA_MAGIC);			\
+	(XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr); \
+}
+
+/*
+ * XVA_ISSET_REQ() checks the requested attribute bitmap (xva_reqattrmap[])
+ * to see if the corresponding attribute bit is set.  If so, returns non-zero.
+ */
+#define	XVA_ISSET_REQ(xvap, attr)					\
+	((((xvap)->xva_vattr.va_mask | AT_XVATTR) &&			\
+		((xvap)->xva_magic == XVA_MAGIC) &&			\
+		((xvap)->xva_mapsize > XVA_INDEX(attr))) ?		\
+	((xvap)->xva_reqattrmap[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) :	0)
+
+/*
+ * XVA_ISSET_RTN() checks the returned attribute bitmap (xva_rtnattrmap[])
+ * to see if the corresponding attribute bit is set.  If so, returns non-zero.
+ */
+#define	XVA_ISSET_RTN(xvap, attr)					\
+	((((xvap)->xva_vattr.va_mask & AT_XVATTR) &&			\
+		((xvap)->xva_magic == XVA_MAGIC) &&			\
+		((xvap)->xva_mapsize > XVA_INDEX(attr))) ?		\
+	((XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0)
+
+#define	MODEMASK	07777		/* mode bits plus permission bits */
+#define	PERMMASK	00777		/* permission bits */
+
+/*
+ * VOP_ACCESS flags
+ */
+#define	V_ACE_MASK	0x1	/* mask represents NFSv4 ACE permissions */
+
+/*
+ * Flags for vnode operations.
+ */
+enum rm		{ RMFILE, RMDIRECTORY };	/* rm or rmdir (remove) */
+enum create	{ CRCREAT, CRMKNOD, CRMKDIR };	/* reason for create */
+
+/*
+ * Structure used on VOP_GETSECATTR and VOP_SETSECATTR operations
+ */
+
+typedef struct vsecattr {
+	uint_t		vsa_mask;	/* See below */
+	int		vsa_aclcnt;	/* ACL entry count */
+	void		*vsa_aclentp;	/* pointer to ACL entries */
+	int		vsa_dfaclcnt;	/* default ACL entry count */
+	void		*vsa_dfaclentp;	/* pointer to default ACL entries */
+	size_t		vsa_aclentsz;	/* ACE size in bytes of vsa_aclentp */
+	uint_t		vsa_aclflags;	/* ACE ACL flags */
+} vsecattr_t;
+
+/* vsa_mask values */
+#define	VSA_ACL			0x0001
+#define	VSA_ACLCNT		0x0002
+#define	VSA_DFACL		0x0004
+#define	VSA_DFACLCNT		0x0008
+#define	VSA_ACE			0x0010
+#define	VSA_ACECNT		0x0020
+#define	VSA_ACE_ALLTYPES	0x0040
+#define	VSA_ACE_ACLFLAGS	0x0080	/* get/set ACE ACL flags */
+
+/*
+ * Structure used by various vnode operations to determine
+ * the context (pid, host, identity) of a caller.
+ *
+ * The cc_caller_id is used to identify one or more callers who invoke
+ * operations, possibly on behalf of others.  For example, the NFS
+ * server could have its own cc_caller_id which can be detected by
+ * vnode/vfs operations or (FEM) monitors on those operations.  New
+ * caller IDs are generated by fs_new_caller_id().
+ */
+typedef struct caller_context {
+	pid_t		cc_pid;		/* Process ID of the caller */
+	int		cc_sysid;	/* System ID, used for remote calls */
+	u_longlong_t	cc_caller_id;	/* Identifier for (set of) caller(s) */
+	ulong_t		cc_flags;
+} caller_context_t;
+
+struct taskq;
+
+/*
+ * Flags for VOP_LOOKUP
+ *
+ * In addition to the flags below, FIGNORECASE and FSEARCH (defined in
+ * file.h) are also possible.
+ *
+ */
+#define	LOOKUP_DIR		0x01	/* want parent dir vp */
+#define	LOOKUP_XATTR		0x02	/* look up extended attr dir */
+#define	CREATE_XATTR_DIR	0x04	/* Create extended attr dir */
+#define	LOOKUP_HAVE_SYSATTR_DIR	0x08	/* Already created virtual GFS dir */
+
+/*
+ * Flags for VOP_READDIR
+ */
+#define	V_RDDIR_ENTFLAGS	0x01	/* request dirent flags */
+#define	V_RDDIR_ACCFILTER	0x02	/* filter out inaccessible dirents */
+
+/*
+ * Public vnode manipulation functions.
+ */
+#ifdef	_KERNEL
+
+void	vn_rele_async(struct vnode *vp, struct taskq *taskq);
+
+/*
+ * Extensible vnode attribute (xva) routines:
+ * xva_init() initializes an xvattr_t (zero struct, init mapsize, set AT_XVATTR)
+ * xva_getxoptattr() returns a pointer to the xoptattr_t section of xvattr_t
+ */
+void		xva_init(xvattr_t *);
+xoptattr_t	*xva_getxoptattr(xvattr_t *);	/* Get ptr to xoptattr_t */
+
+#define	VN_RELE_ASYNC(vp, taskq)	{ \
+	vn_rele_async(vp, taskq); \
+}
+
+#endif	/* _KERNEL */
+
+/*
+ * Flags to VOP_SETATTR/VOP_GETATTR.
+ */
+#define	ATTR_UTIME	0x01	/* non-default utime(2) request */
+#define	ATTR_EXEC	0x02	/* invocation from exec(2) */
+#define	ATTR_COMM	0x04	/* yield common vp attributes */
+#define	ATTR_HINT	0x08	/* information returned will be `hint' */
+#define	ATTR_REAL	0x10	/* yield attributes of the real vp */
+#define	ATTR_NOACLCHECK	0x20	/* Don't check ACL when checking permissions */
+#define	ATTR_TRIGGER	0x40	/* Mount first if vnode is a trigger mount */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _UTS_SYS_VNODE_H */
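
A minimal standalone sketch of how the XAT_*/XVA_* machinery above fits
together: the upper 32 bits of each XAT_* constant carry the bitmap-array
index, the lower 32 bits the bit within that entry, and XVA_SET_REQ()
reduces to an index/bit update.  The XAT0_INDEX and XAT0_READONLY values
below are assumptions for illustration only; the real definitions live
earlier in vnode.h, outside this hunk.

    #include <stdint.h>
    #include <stdio.h>

    #define XVA_MASK          0xffffffffULL
    #define XVA_SHFT          32
    #define XVA_INDEX(attr)   ((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK))
    #define XVA_ATTRBIT(attr) ((uint32_t)((attr) & XVA_MASK))

    #define XAT0_INDEX        0ULL            /* assumed value */
    #define XAT0_READONLY     0x00000008ULL   /* assumed value */
    #define XAT_READONLY      ((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY)

    int
    main(void)
    {
            uint32_t reqattrmap[4] = { 0 };

            /* The core of XVA_SET_REQ(): pick the entry, set the bit. */
            reqattrmap[XVA_INDEX(XAT_READONLY)] |= XVA_ATTRBIT(XAT_READONLY);

            printf("index=%u bit=0x%x map[0]=0x%x\n",
                (unsigned)XVA_INDEX(XAT_READONLY),
                (unsigned)XVA_ATTRBIT(XAT_READONLY),
                (unsigned)reqattrmap[0]);
            return 0;
    }
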
Index: src/external/cddl/osnet/dist/uts/common/sys/fm/protocol.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/fm/protocol.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 protocol.h
--- src/external/cddl/osnet/dist/uts/common/sys/fm/protocol.h	27 Feb 2010 22:32:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/fm/protocol.h	12 Jun 2012 05:57:28 -0000
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_FM_PROTOCOL_H
@@ -38,18 +37,18 @@ extern "C" {
 #include <libnvpair.h>
 #include <stdarg.h>
 #endif
-#include <sys/processor.h>
 
 /* FM common member names */
 #define	FM_CLASS			"class"
 #define	FM_VERSION			"version"
 
-/* FM event class values */
+/* FM protocol category 1 class names */
 #define	FM_EREPORT_CLASS		"ereport"
 #define	FM_FAULT_CLASS			"fault"
 #define	FM_DEFECT_CLASS			"defect"
 #define	FM_RSRC_CLASS			"resource"
 #define	FM_LIST_EVENT			"list"
+#define	FM_IREPORT_CLASS		"ireport"
 
 /* FM list.* event class values */
 #define	FM_LIST_SUSPECT_CLASS		FM_LIST_EVENT ".suspect"
@@ -73,6 +72,12 @@ extern "C" {
 /* list.* event payload member names */
 #define	FM_LIST_EVENT_SIZE		"list-sz"
 
+/* ireport.* event payload member names */
+#define	FM_IREPORT_DETECTOR		"detector"
+#define	FM_IREPORT_UUID			"uuid"
+#define	FM_IREPORT_PRIORITY		"pri"
+#define	FM_IREPORT_ATTRIBUTES		"attr"
+
 /*
  * list.suspect, isolated, updated, repaired and resolved
  * versions/payload member names.
@@ -193,6 +198,7 @@ extern "C" {
 #define	FM_FMRI_SCHEME_PKG		"pkg"
 #define	FM_FMRI_SCHEME_LEGACY		"legacy-hc"
 #define	FM_FMRI_SCHEME_ZFS		"zfs"
+#define	FM_FMRI_SCHEME_SW		"sw"
 
 /* Scheme versions */
 #define	FMD_SCHEME_VERSION0		0
@@ -216,6 +222,8 @@ extern "C" {
 #define	FM_SVC_SCHEME_VERSION		SVC_SCHEME_VERSION0
 #define	ZFS_SCHEME_VERSION0		0
 #define	FM_ZFS_SCHEME_VERSION		ZFS_SCHEME_VERSION0
+#define	SW_SCHEME_VERSION0		0
+#define	FM_SW_SCHEME_VERSION		SW_SCHEME_VERSION0
 
 /* hc scheme member names */
 #define	FM_FMRI_HC_SERIAL_ID		"serial"
@@ -247,6 +255,7 @@ extern "C" {
 
 /* dev scheme member names */
 #define	FM_FMRI_DEV_ID			"devid"
+#define	FM_FMRI_DEV_TGTPTLUN0		"target-port-l0id"
 #define	FM_FMRI_DEV_PATH		"device-path"
 
 /* pkg scheme member names */
@@ -299,6 +308,25 @@ extern "C" {
 #define	FM_FMRI_ZFS_POOL		"pool"
 #define	FM_FMRI_ZFS_VDEV		"vdev"
 
+/* sw scheme member names - extra indentation for members of an nvlist */
+#define	FM_FMRI_SW_OBJ			"object"
+#define	FM_FMRI_SW_OBJ_PATH			"path"
+#define	FM_FMRI_SW_OBJ_ROOT			"root"
+#define	FM_FMRI_SW_OBJ_PKG			"pkg"
+#define	FM_FMRI_SW_SITE			"site"
+#define	FM_FMRI_SW_SITE_TOKEN			"token"
+#define	FM_FMRI_SW_SITE_MODULE			"module"
+#define	FM_FMRI_SW_SITE_FILE			"file"
+#define	FM_FMRI_SW_SITE_LINE			"line"
+#define	FM_FMRI_SW_SITE_FUNC			"func"
+#define	FM_FMRI_SW_CTXT			"context"
+#define	FM_FMRI_SW_CTXT_ORIGIN			"origin"
+#define	FM_FMRI_SW_CTXT_EXECNAME		"execname"
+#define	FM_FMRI_SW_CTXT_PID			"pid"
+#define	FM_FMRI_SW_CTXT_ZONE			"zone"
+#define	FM_FMRI_SW_CTXT_CTID			"ctid"
+#define	FM_FMRI_SW_CTXT_STACK			"stack"
+
 extern nv_alloc_t *fm_nva_xcreate(char *, size_t);
 extern void fm_nva_xdestroy(nv_alloc_t *);
 
@@ -315,7 +343,7 @@ extern int i_fm_payload_set(nvlist_t *, 
 extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *,
     int, ...);
 extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *,
-    const char *);
+    const char *, const char *);
 extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *);
 extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t,
     uint8_t *, const char *);
@@ -329,7 +357,6 @@ extern void fm_fmri_hc_create(nvlist_t *
 
 extern uint64_t fm_ena_increment(uint64_t);
 extern uint64_t fm_ena_generate(uint64_t, uchar_t);
-extern uint64_t fm_ena_generate_cpu(uint64_t, processorid_t, uchar_t);
 extern uint64_t fm_ena_generation_get(uint64_t);
 extern uchar_t fm_ena_format_get(uint64_t);
 extern uint64_t fm_ena_id_get(uint64_t);
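
The sw scheme members above nest: "object", "site" and "context" are each
nvlists inside the FMRI nvlist, which is what the extra indentation in the
defines signals.  A hedged sketch of assembling one with libnvpair (error
checking elided; FM_FMRI_SCHEME is the standard protocol.h member name for
the scheme, and the path value is made up):

    #include <libnvpair.h>

    static nvlist_t *
    sw_fmri_sketch(void)
    {
            nvlist_t *fmri, *obj;

            (void) nvlist_alloc(&fmri, NV_UNIQUE_NAME, 0);
            (void) nvlist_alloc(&obj, NV_UNIQUE_NAME, 0);

            (void) nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_SW);
            (void) nvlist_add_uint8(fmri, FM_VERSION, FM_SW_SCHEME_VERSION);

            /* "object" is itself an nvlist, per the nesting above. */
            (void) nvlist_add_string(obj, FM_FMRI_SW_OBJ_PATH,
                "/usr/bin/example");
            (void) nvlist_add_nvlist(fmri, FM_FMRI_SW_OBJ, obj);

            nvlist_free(obj);
            return (fmri);
    }
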
Index: src/external/cddl/osnet/dist/uts/common/sys/fm/util.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/fm/util.h,v
retrieving revision 1.2
diff -u -p -r1.2 util.h
--- src/external/cddl/osnet/dist/uts/common/sys/fm/util.h	7 Aug 2009 20:16:46 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/fm/util.h	25 Apr 2017 16:09:40 -0000
@@ -20,15 +20,12 @@
  */
 
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_FM_UTIL_H
 #define	_SYS_FM_UTIL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -89,13 +86,12 @@ extern size_t ereport_dumplen;
 extern void fm_init(void);
 extern void fm_nvprint(nvlist_t *);
 extern void fm_panic(const char *, ...);
-/*extern void fm_banner(void);*/
+extern void fm_banner(void);
 
-/*extern void fm_ereport_dump(void);*/
+extern void fm_ereport_dump(void);
 extern void fm_ereport_post(nvlist_t *, int);
 
-/*extern void fm_payload_stack_add(nvlist_t *, const pc_t *, int);*/
-
+extern int is_fm_panic(void);
 #endif  /* _KERNEL */
 
 #ifdef	__cplusplus
Index: src/external/cddl/osnet/dist/uts/common/sys/fm/fs/zfs.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/fm/fs/zfs.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 zfs.h
--- src/external/cddl/osnet/dist/uts/common/sys/fm/fs/zfs.h	27 Feb 2010 22:32:40 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/fm/fs/zfs.h	19 Jun 2013 13:06:01 -0000
@@ -46,6 +46,7 @@ extern "C" {
 #define	FM_EREPORT_ZFS_IO_FAILURE		"io_failure"
 #define	FM_EREPORT_ZFS_PROBE_FAILURE		"probe_failure"
 #define	FM_EREPORT_ZFS_LOG_REPLAY		"log_replay"
+#define	FM_EREPORT_ZFS_CONFIG_CACHE_WRITE	"config_cache_write"
 
 #define	FM_EREPORT_PAYLOAD_ZFS_POOL		"pool"
 #define	FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE	"pool_failmode"
Index: src/external/cddl/osnet/dist/uts/common/sys/fs/zfs.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/fs/zfs.h,v
retrieving revision 1.4
diff -u -p -r1.4 zfs.h
--- src/external/cddl/osnet/dist/uts/common/sys/fs/zfs.h	3 Jan 2013 16:41:52 -0000	1.4
+++ src/external/cddl/osnet/dist/uts/common/sys/fs/zfs.h	30 Apr 2017 03:34:54 -0000
@@ -20,15 +20,22 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
  */
 
+/* Portions Copyright 2010 Robert Milkowski */
+
 #ifndef	_SYS_FS_ZFS_H
 #define	_SYS_FS_ZFS_H
 
+#include <sys/types.h>
+#include <sys/ioccom.h>
 #include <sys/time.h>
-#include <sys/ioctl.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -43,18 +50,37 @@ extern "C" {
  * combined into masks that can be passed to various functions.
  */
 typedef enum {
-	ZFS_TYPE_FILESYSTEM	= 0x1,
-	ZFS_TYPE_SNAPSHOT	= 0x2,
-	ZFS_TYPE_VOLUME		= 0x4,
-	ZFS_TYPE_POOL		= 0x8
+	ZFS_TYPE_FILESYSTEM	= (1 << 0),
+	ZFS_TYPE_SNAPSHOT	= (1 << 1),
+	ZFS_TYPE_VOLUME		= (1 << 2),
+	ZFS_TYPE_POOL		= (1 << 3),
+	ZFS_TYPE_BOOKMARK	= (1 << 4)
 } zfs_type_t;
 
+/*
+ * NB: lzc_dataset_type should be updated whenever a new objset type is added,
+ * if it represents a real type of a dataset that can be created from userland.
+ */
+typedef enum dmu_objset_type {
+	DMU_OST_NONE,
+	DMU_OST_META,
+	DMU_OST_ZFS,
+	DMU_OST_ZVOL,
+	DMU_OST_OTHER,			/* For testing only! */
+	DMU_OST_ANY,			/* Be careful! */
+	DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
 #define	ZFS_TYPE_DATASET	\
 	(ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
 
+/*
+ * All of these include the terminating NUL byte.
+ */
 #define	ZAP_MAXNAMELEN 256
 #define	ZAP_MAXVALUELEN (1024 * 8)
 #define	ZAP_OLDMAXVALUELEN 1024
+#define	ZFS_MAX_DATASET_NAME_LEN 256
 
 /*
  * Dataset properties are identified by these constants and must be added to
@@ -92,7 +118,6 @@ typedef enum {
 	ZFS_PROP_CREATETXG,		/* not exposed to the user */
 	ZFS_PROP_NAME,			/* not exposed to the user */
 	ZFS_PROP_CANMOUNT,
-	ZFS_PROP_SHAREISCSI,
 	ZFS_PROP_ISCSIOPTIONS,		/* not exposed to the user */
 	ZFS_PROP_XATTR,
 	ZFS_PROP_NUMCLONES,		/* not exposed to the user */
@@ -122,6 +147,21 @@ typedef enum {
 	ZFS_PROP_OBJSETID,		/* not exposed to the user */
 	ZFS_PROP_DEDUP,
 	ZFS_PROP_MLSLABEL,
+	ZFS_PROP_SYNC,
+	ZFS_PROP_REFRATIO,
+	ZFS_PROP_WRITTEN,
+	ZFS_PROP_CLONES,
+	ZFS_PROP_LOGICALUSED,
+	ZFS_PROP_LOGICALREFERENCED,
+	ZFS_PROP_INCONSISTENT,		/* not exposed to the user */
+	ZFS_PROP_VOLMODE,
+	ZFS_PROP_FILESYSTEM_LIMIT,
+	ZFS_PROP_SNAPSHOT_LIMIT,
+	ZFS_PROP_FILESYSTEM_COUNT,
+	ZFS_PROP_SNAPSHOT_COUNT,
+	ZFS_PROP_REDUNDANT_METADATA,
+	ZFS_PROP_PREV_SNAP,
+	ZFS_PROP_RECEIVE_RESUME_TOKEN,
 	ZFS_NUM_PROPS
 } zfs_prop_t;
 
@@ -160,9 +200,19 @@ typedef enum {
 	ZPOOL_PROP_DEDUPRATIO,
 	ZPOOL_PROP_FREE,
 	ZPOOL_PROP_ALLOCATED,
+	ZPOOL_PROP_READONLY,
+	ZPOOL_PROP_COMMENT,
+	ZPOOL_PROP_EXPANDSZ,
+	ZPOOL_PROP_FREEING,
+	ZPOOL_PROP_FRAGMENTATION,
+	ZPOOL_PROP_LEAKED,
+	ZPOOL_PROP_MAXBLOCKSIZE,
 	ZPOOL_NUM_PROPS
 } zpool_prop_t;
 
+/* Small enough to not hog a whole line of printout in zpool(1M). */
+#define	ZPROP_MAX_COMMENT	32
+
 #define	ZPROP_CONT		-2
 #define	ZPROP_INVAL		-1
 
@@ -230,6 +280,8 @@ const char *zpool_prop_to_name(zpool_pro
 const char *zpool_prop_default_string(zpool_prop_t);
 uint64_t zpool_prop_default_numeric(zpool_prop_t);
 boolean_t zpool_prop_readonly(zpool_prop_t);
+boolean_t zpool_prop_feature(const char *);
+boolean_t zpool_prop_unsupported(const char *name);
 int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **);
 int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *);
 uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed);
@@ -299,6 +351,23 @@ typedef enum zfs_cache_type {
 	ZFS_CACHE_ALL = 2
 } zfs_cache_type_t;
 
+typedef enum {
+	ZFS_SYNC_STANDARD = 0,
+	ZFS_SYNC_ALWAYS = 1,
+	ZFS_SYNC_DISABLED = 2
+} zfs_sync_type_t;
+
+typedef enum {
+	ZFS_VOLMODE_DEFAULT = 0,
+	ZFS_VOLMODE_GEOM = 1,
+	ZFS_VOLMODE_DEV = 2,
+	ZFS_VOLMODE_NONE = 3
+} zfs_volmode_t;
+
+typedef enum {
+	ZFS_REDUNDANT_METADATA_ALL,
+	ZFS_REDUNDANT_METADATA_MOST
+} zfs_redundant_metadata_type_t;
 
 /*
  * On-disk version number.
@@ -326,14 +395,21 @@ typedef enum zfs_cache_type {
 #define	SPA_VERSION_21			21ULL
 #define	SPA_VERSION_22			22ULL
 #define	SPA_VERSION_23			23ULL
+#define	SPA_VERSION_24			24ULL
+#define	SPA_VERSION_25			25ULL
+#define	SPA_VERSION_26			26ULL
+#define	SPA_VERSION_27			27ULL
+#define	SPA_VERSION_28			28ULL
+#define	SPA_VERSION_5000		5000ULL
+
 /*
  * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
  * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
  * and do the appropriate changes.  Also bump the version number in
  * usr/src/grub/capability.
  */
-#define	SPA_VERSION			SPA_VERSION_23
-#define	SPA_VERSION_STRING		"23"
+#define	SPA_VERSION			SPA_VERSION_5000
+#define	SPA_VERSION_STRING		"5000"
 
 /*
  * Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -350,7 +426,7 @@ typedef enum zfs_cache_type {
 #define	SPA_VERSION_DITTO_BLOCKS	SPA_VERSION_2
 #define	SPA_VERSION_SPARES		SPA_VERSION_3
 #define	SPA_VERSION_RAIDZ2		SPA_VERSION_3
-#define	SPA_VERSION_BPLIST_ACCOUNT	SPA_VERSION_3
+#define	SPA_VERSION_BPOBJ_ACCOUNT	SPA_VERSION_3
 #define	SPA_VERSION_RAIDZ_DEFLATE	SPA_VERSION_3
 #define	SPA_VERSION_DNODE_BYTES		SPA_VERSION_3
 #define	SPA_VERSION_ZPOOL_HISTORY	SPA_VERSION_4
@@ -378,6 +454,18 @@ typedef enum zfs_cache_type {
 #define	SPA_VERSION_DEDUP		SPA_VERSION_21
 #define	SPA_VERSION_RECVD_PROPS		SPA_VERSION_22
 #define	SPA_VERSION_SLIM_ZIL		SPA_VERSION_23
+#define	SPA_VERSION_SA			SPA_VERSION_24
+#define	SPA_VERSION_SCAN		SPA_VERSION_25
+#define	SPA_VERSION_DIR_CLONES		SPA_VERSION_26
+#define	SPA_VERSION_DEADLISTS		SPA_VERSION_26
+#define	SPA_VERSION_FAST_SNAP		SPA_VERSION_27
+#define	SPA_VERSION_MULTI_REPLACE	SPA_VERSION_28
+#define	SPA_VERSION_BEFORE_FEATURES	SPA_VERSION_28
+#define	SPA_VERSION_FEATURES		SPA_VERSION_5000
+
+#define	SPA_VERSION_IS_SUPPORTED(v) \
+	(((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \
+	((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change
@@ -391,8 +479,9 @@ typedef enum zfs_cache_type {
 #define	ZPL_VERSION_2			2ULL
 #define	ZPL_VERSION_3			3ULL
 #define	ZPL_VERSION_4			4ULL
-#define	ZPL_VERSION			ZPL_VERSION_4
-#define	ZPL_VERSION_STRING		"4"
+#define	ZPL_VERSION_5			5ULL
+#define	ZPL_VERSION			ZPL_VERSION_5
+#define	ZPL_VERSION_STRING		"5"
 
 #define	ZPL_VERSION_INITIAL		ZPL_VERSION_1
 #define	ZPL_VERSION_DIRENT_TYPE		ZPL_VERSION_2
@@ -400,6 +489,7 @@ typedef enum zfs_cache_type {
 #define	ZPL_VERSION_NORMALIZATION	ZPL_VERSION_3
 #define	ZPL_VERSION_SYSATTR		ZPL_VERSION_3
 #define	ZPL_VERSION_USERSPACE		ZPL_VERSION_4
+#define	ZPL_VERSION_SA			ZPL_VERSION_5
 
 /* Rewind request information */
 #define	ZPOOL_NO_REWIND		1  /* No policy - default behavior */
@@ -440,7 +530,8 @@ typedef struct zpool_rewind_policy {
 #define	ZPOOL_CONFIG_ASHIFT		"ashift"
 #define	ZPOOL_CONFIG_ASIZE		"asize"
 #define	ZPOOL_CONFIG_DTL		"DTL"
-#define	ZPOOL_CONFIG_STATS		"stats"
+#define	ZPOOL_CONFIG_SCAN_STATS		"scan_stats"	/* not stored on disk */
+#define	ZPOOL_CONFIG_VDEV_STATS		"vdev_stats"	/* not stored on disk */
 #define	ZPOOL_CONFIG_WHOLE_DISK		"whole_disk"
 #define	ZPOOL_CONFIG_ERRCOUNT		"error_count"
 #define	ZPOOL_CONFIG_NOT_PRESENT	"not_present"
@@ -449,6 +540,7 @@ typedef struct zpool_rewind_policy {
 #define	ZPOOL_CONFIG_NPARITY		"nparity"
 #define	ZPOOL_CONFIG_HOSTID		"hostid"
 #define	ZPOOL_CONFIG_HOSTNAME		"hostname"
+#define	ZPOOL_CONFIG_LOADED_TIME	"initial_load_time"
 #define	ZPOOL_CONFIG_UNSPARE		"unspare"
 #define	ZPOOL_CONFIG_PHYS_PATH		"phys_path"
 #define	ZPOOL_CONFIG_IS_LOG		"is_log"
@@ -463,9 +555,23 @@ typedef struct zpool_rewind_policy {
 #define	ZPOOL_CONFIG_ORIG_GUID		"orig_guid"
 #define	ZPOOL_CONFIG_SPLIT_GUID		"split_guid"
 #define	ZPOOL_CONFIG_SPLIT_LIST		"guid_list"
+#define	ZPOOL_CONFIG_REMOVING		"removing"
+#define	ZPOOL_CONFIG_RESILVER_TXG	"resilver_txg"
+#define	ZPOOL_CONFIG_COMMENT		"comment"
 #define	ZPOOL_CONFIG_SUSPENDED		"suspended"	/* not stored on disk */
 #define	ZPOOL_CONFIG_TIMESTAMP		"timestamp"	/* not stored on disk */
 #define	ZPOOL_CONFIG_BOOTFS		"bootfs"	/* not stored on disk */
+#define	ZPOOL_CONFIG_MISSING_DEVICES	"missing_vdevs"	/* not stored on disk */
+#define	ZPOOL_CONFIG_LOAD_INFO		"load_info"	/* not stored on disk */
+#define	ZPOOL_CONFIG_REWIND_INFO	"rewind_info"	/* not stored on disk */
+#define	ZPOOL_CONFIG_UNSUP_FEAT		"unsup_feat"	/* not stored on disk */
+#define	ZPOOL_CONFIG_ENABLED_FEAT	"enabled_feat"	/* not stored on disk */
+#define	ZPOOL_CONFIG_CAN_RDONLY		"can_rdonly"	/* not stored on disk */
+#define	ZPOOL_CONFIG_FEATURES_FOR_READ	"features_for_read"
+#define	ZPOOL_CONFIG_FEATURE_STATS	"feature_stats"	/* not stored on disk */
+#define	ZPOOL_CONFIG_VDEV_TOP_ZAP	"com.delphix:vdev_zap_top"
+#define	ZPOOL_CONFIG_VDEV_LEAF_ZAP	"com.delphix:vdev_zap_leaf"
+#define	ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS	"com.delphix:has_per_vdev_zaps"
 /*
  * The persistent vdev state is stored as separate values rather than a single
  * 'vdev_state' entry.  This is because a device can be in multiple states, such
@@ -504,14 +610,28 @@ typedef struct zpool_rewind_policy {
 
 /*
  * This is needed in userland to report the minimum necessary device size.
+ *
+ * Note that the zfs test suite uses 64MB vdevs.
  */
 #define	SPA_MINDEVSIZE		(64ULL << 20)
 
 /*
+ * Set if the fragmentation has not yet been calculated. This can happen
+ * because the space maps have not been upgraded or the histogram feature
+ * is not enabled.
+ */
+#define	ZFS_FRAG_INVALID	UINT64_MAX
+
+/*
  * The location of the pool configuration repository, shared between kernel and
  * userland.
  */
-#define	ZPOOL_CACHE		"/etc/zfs/zpool.cache"
+#ifdef __FreeBSD__
+#define	ZPOOL_CACHE		"/boot/zfs/zpool.cache"
+#endif
+#ifdef __NetBSD__
+#define	ZPOOL_CACHE		"/var/db/zfs/zpool.cache"
+#endif
 
 /*
  * vdev states are ordered from least to most healthy.
@@ -544,12 +664,14 @@ typedef enum vdev_aux {
 	VDEV_AUX_BAD_LABEL,	/* the label is OK but invalid		*/
 	VDEV_AUX_VERSION_NEWER,	/* on-disk version is too new		*/
 	VDEV_AUX_VERSION_OLDER,	/* on-disk version is too old		*/
+	VDEV_AUX_UNSUP_FEAT,	/* unsupported features			*/
 	VDEV_AUX_SPARED,	/* hot spare used in another pool	*/
 	VDEV_AUX_ERR_EXCEEDED,	/* too many errors			*/
 	VDEV_AUX_IO_FAILURE,	/* experienced I/O failure		*/
 	VDEV_AUX_BAD_LOG,	/* cannot read log chain(s)		*/
 	VDEV_AUX_EXTERNAL,	/* external diagnosis			*/
-	VDEV_AUX_SPLIT_POOL	/* vdev was split off into another pool	*/
+	VDEV_AUX_SPLIT_POOL,	/* vdev was split off into another pool	*/
+	VDEV_AUX_ASHIFT_TOO_BIG /* vdev's min block size is too large   */
 } vdev_aux_t;
 
 /*
@@ -570,14 +692,14 @@ typedef enum pool_state {
 } pool_state_t;
 
 /*
- * Scrub types.
+ * Scan Functions.
  */
-typedef enum pool_scrub_type {
-	POOL_SCRUB_NONE,
-	POOL_SCRUB_RESILVER,
-	POOL_SCRUB_EVERYTHING,
-	POOL_SCRUB_TYPES
-} pool_scrub_type_t;
+typedef enum pool_scan_func {
+	POOL_SCAN_NONE,
+	POOL_SCAN_SCRUB,
+	POOL_SCAN_RESILVER,
+	POOL_SCAN_FUNCS
+} pool_scan_func_t;
 
 /*
  * ZIO types.  Needed to interpret vdev statistics below.
@@ -593,6 +715,36 @@ typedef enum zio_type {
 } zio_type_t;
 
 /*
+ * Pool statistics.  Note: all fields should be 64-bit because this
+ * is passed between kernel and userland as an nvlist uint64 array.
+ */
+typedef struct pool_scan_stat {
+	/* values stored on disk */
+	uint64_t	pss_func;	/* pool_scan_func_t */
+	uint64_t	pss_state;	/* dsl_scan_state_t */
+	uint64_t	pss_start_time;	/* scan start time */
+	uint64_t	pss_end_time;	/* scan end time */
+	uint64_t	pss_to_examine;	/* total bytes to scan */
+	uint64_t	pss_examined;	/* total examined bytes	*/
+	uint64_t	pss_to_process; /* total bytes to process */
+	uint64_t	pss_processed;	/* total processed bytes */
+	uint64_t	pss_errors;	/* scan errors	*/
+
+	/* values not stored on disk */
+	uint64_t	pss_pass_exam;	/* examined bytes per scan pass */
+	uint64_t	pss_pass_start;	/* start time of a scan pass */
+} pool_scan_stat_t;
+
+typedef enum dsl_scan_state {
+	DSS_NONE,
+	DSS_SCANNING,
+	DSS_FINISHED,
+	DSS_CANCELED,
+	DSS_NUM_STATES
+} dsl_scan_state_t;
+
+
+/*
  * Vdev statistics.  Note: all fields should be 64-bit because this
  * is passed between kernel and userland as an nvlist uint64 array.
  */
@@ -604,20 +756,23 @@ typedef struct vdev_stat {
 	uint64_t	vs_space;		/* total capacity	*/
 	uint64_t	vs_dspace;		/* deflated capacity	*/
 	uint64_t	vs_rsize;		/* replaceable dev size */
+	uint64_t	vs_esize;		/* expandable dev size */
 	uint64_t	vs_ops[ZIO_TYPES];	/* operation count	*/
 	uint64_t	vs_bytes[ZIO_TYPES];	/* bytes read/written	*/
 	uint64_t	vs_read_errors;		/* read errors		*/
 	uint64_t	vs_write_errors;	/* write errors		*/
 	uint64_t	vs_checksum_errors;	/* checksum errors	*/
 	uint64_t	vs_self_healed;		/* self-healed bytes	*/
-	uint64_t	vs_scrub_type;		/* pool_scrub_type_t	*/
-	uint64_t	vs_scrub_complete;	/* completed?		*/
-	uint64_t	vs_scrub_examined;	/* bytes examined; top	*/
-	uint64_t	vs_scrub_repaired;	/* bytes repaired; leaf	*/
-	uint64_t	vs_scrub_errors;	/* errors during scrub	*/
-	uint64_t	vs_scrub_start;		/* UTC scrub start time	*/
-	uint64_t	vs_scrub_end;		/* UTC scrub end time	*/
+	uint64_t	vs_scan_removing;	/* removing?	*/
+	uint64_t	vs_scan_processed;	/* scan processed bytes	*/
+	uint64_t	vs_configured_ashift;	/* TLV vdev_ashift      */
+	uint64_t	vs_logical_ashift;	/* vdev_logical_ashift  */
+	uint64_t	vs_physical_ashift;	/* vdev_physical_ashift */
+	uint64_t	vs_fragmentation;	/* device fragmentation */
 } vdev_stat_t;
+#define VDEV_STAT_VALID(field, uint64_t_field_count) \
+    ((uint64_t_field_count * sizeof(uint64_t)) >= \
+     (offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field)))
 
 /*
  * DDT statistics.  Note: all fields should be 64-bit because this
@@ -646,7 +801,12 @@ typedef struct ddt_histogram {
 
 #define	ZVOL_DRIVER	"zvol"
 #define	ZFS_DRIVER	"zfs"
-#define	ZFS_DEV		"/dev/zfs"
+#define	ZFS_DEV_NAME	"zfs"
+#define	ZFS_DEV		"/dev/" ZFS_DEV_NAME
+#define	ZFS_DISK_ROOT	"/dev/dsk"
+#define	ZFS_DISK_ROOTD	ZFS_DISK_ROOT "/"
+#define	ZFS_RDISK_ROOT	"/dev/rdsk"
+#define	ZFS_RDISK_ROOTD	ZFS_RDISK_ROOT "/"
 
 /* general zvol path */
 #define	ZVOL_DIR		"/dev/zvol"
@@ -662,17 +822,16 @@ typedef struct ddt_histogram {
 /*
  * /dev/zfs ioctl numbers.
  */
-#define	ZFS_IOC		('Z' << 8)
-
 typedef enum zfs_ioc {
-	ZFS_IOC_POOL_CREATE = ZFS_IOC,
+	ZFS_IOC_FIRST =	0,
+	ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST,
 	ZFS_IOC_POOL_DESTROY,
 	ZFS_IOC_POOL_IMPORT,
 	ZFS_IOC_POOL_EXPORT,
 	ZFS_IOC_POOL_CONFIGS,
 	ZFS_IOC_POOL_STATS,
 	ZFS_IOC_POOL_TRYIMPORT,
-	ZFS_IOC_POOL_SCRUB,
+	ZFS_IOC_POOL_SCAN,
 	ZFS_IOC_POOL_FREEZE,
 	ZFS_IOC_POOL_UPGRADE,
 	ZFS_IOC_POOL_GET_HISTORY,
@@ -708,7 +867,6 @@ typedef enum zfs_ioc {
 	ZFS_IOC_POOL_GET_PROPS,
 	ZFS_IOC_SET_FSACL,
 	ZFS_IOC_GET_FSACL,
-	ZFS_IOC_ISCSI_PERM_CHECK,
 	ZFS_IOC_SHARE,
 	ZFS_IOC_INHERIT_PROP,
 	ZFS_IOC_SMB_ACL,
@@ -719,7 +877,27 @@ typedef enum zfs_ioc {
 	ZFS_IOC_RELEASE,
 	ZFS_IOC_GET_HOLDS,
 	ZFS_IOC_OBJSET_RECVD_PROPS,
-	ZFS_IOC_VDEV_SPLIT
+	ZFS_IOC_VDEV_SPLIT,
+	ZFS_IOC_NEXT_OBJ,
+	ZFS_IOC_DIFF,
+	ZFS_IOC_TMP_SNAPSHOT,
+	ZFS_IOC_OBJ_TO_STATS,
+	ZFS_IOC_JAIL,
+	ZFS_IOC_UNJAIL,
+	ZFS_IOC_POOL_REGUID,
+	ZFS_IOC_SPACE_WRITTEN,
+	ZFS_IOC_SPACE_SNAPS,
+	ZFS_IOC_SEND_PROGRESS,
+	ZFS_IOC_POOL_REOPEN,
+	ZFS_IOC_LOG_HISTORY,
+	ZFS_IOC_SEND_NEW,
+	ZFS_IOC_SEND_SPACE,
+	ZFS_IOC_CLONE,
+	ZFS_IOC_BOOKMARK,
+	ZFS_IOC_GET_BOOKMARKS,
+	ZFS_IOC_DESTROY_BOOKMARKS,
+	ZFS_IOC_NEXTBOOT,
+	ZFS_IOC_LAST
 } zfs_ioc_t;
 
 /*
@@ -731,7 +909,8 @@ typedef enum {
 	SPA_LOAD_IMPORT,	/* import in progress	*/
 	SPA_LOAD_TRYIMPORT,	/* tryimport in progress */
 	SPA_LOAD_RECOVER,	/* recovery requested	*/
-	SPA_LOAD_ERROR		/* load failed		*/
+	SPA_LOAD_ERROR,		/* load failed		*/
+	SPA_LOAD_CREATE		/* creation in progress */
 } spa_load_state_t;
 
 /*
@@ -756,6 +935,12 @@ typedef enum {
 #define	ZPOOL_HIST_TXG		"history txg"
 #define	ZPOOL_HIST_INT_EVENT	"history internal event"
 #define	ZPOOL_HIST_INT_STR	"history internal str"
+#define	ZPOOL_HIST_INT_NAME	"internal_name"
+#define	ZPOOL_HIST_IOCTL	"ioctl"
+#define	ZPOOL_HIST_INPUT_NVL	"in_nvl"
+#define	ZPOOL_HIST_OUTPUT_NVL	"out_nvl"
+#define	ZPOOL_HIST_DSNAME	"dsname"
+#define	ZPOOL_HIST_DSID		"dsid"
 
 /*
  * Flags for ZFS_IOC_VDEV_SET_STATE
@@ -767,12 +952,22 @@ typedef enum {
 #define	ZFS_OFFLINE_TEMPORARY	0x1
 
 /*
+ * Flags for ZFS_IOC_POOL_IMPORT
+ */
+#define	ZFS_IMPORT_NORMAL	0x0
+#define	ZFS_IMPORT_VERBATIM	0x1
+#define	ZFS_IMPORT_ANY_HOST	0x2
+#define	ZFS_IMPORT_MISSING_LOG	0x4
+#define	ZFS_IMPORT_ONLY		0x8
+
+/*
  * Sysevent payload members.  ZFS will generate the following sysevents with the
  * given payloads:
  *
  *	ESC_ZFS_RESILVER_START
  *	ESC_ZFS_RESILVER_END
  *	ESC_ZFS_POOL_DESTROY
+ *	ESC_ZFS_POOL_REGUID
  *
  *		ZFS_EV_POOL_NAME	DATA_TYPE_STRING
  *		ZFS_EV_POOL_GUID	DATA_TYPE_UINT64
@@ -791,56 +986,6 @@ typedef enum {
 #define	ZFS_EV_VDEV_PATH	"vdev_path"
 #define	ZFS_EV_VDEV_GUID	"vdev_guid"
 
-/*
- * Note: This is encoded on-disk, so new events must be added to the
- * end, and unused events can not be removed.  Be sure to edit
- * libzfs_pool.c: hist_event_table[].
- */
-typedef enum history_internal_events {
-	LOG_NO_EVENT = 0,
-	LOG_POOL_CREATE,
-	LOG_POOL_VDEV_ADD,
-	LOG_POOL_REMOVE,
-	LOG_POOL_DESTROY,
-	LOG_POOL_EXPORT,
-	LOG_POOL_IMPORT,
-	LOG_POOL_VDEV_ATTACH,
-	LOG_POOL_VDEV_REPLACE,
-	LOG_POOL_VDEV_DETACH,
-	LOG_POOL_VDEV_ONLINE,
-	LOG_POOL_VDEV_OFFLINE,
-	LOG_POOL_UPGRADE,
-	LOG_POOL_CLEAR,
-	LOG_POOL_SCRUB,
-	LOG_POOL_PROPSET,
-	LOG_DS_CREATE,
-	LOG_DS_CLONE,
-	LOG_DS_DESTROY,
-	LOG_DS_DESTROY_BEGIN,
-	LOG_DS_INHERIT,
-	LOG_DS_PROPSET,
-	LOG_DS_QUOTA,
-	LOG_DS_PERM_UPDATE,
-	LOG_DS_PERM_REMOVE,
-	LOG_DS_PERM_WHO_REMOVE,
-	LOG_DS_PROMOTE,
-	LOG_DS_RECEIVE,
-	LOG_DS_RENAME,
-	LOG_DS_RESERVATION,
-	LOG_DS_REPLAY_INC_SYNC,
-	LOG_DS_REPLAY_FULL_SYNC,
-	LOG_DS_ROLLBACK,
-	LOG_DS_SNAPSHOT,
-	LOG_DS_UPGRADE,
-	LOG_DS_REFQUOTA,
-	LOG_DS_REFRESERV,
-	LOG_POOL_SCRUB_DONE,
-	LOG_DS_USER_HOLD,
-	LOG_DS_USER_RELEASE,
-	LOG_POOL_SPLIT,
-	LOG_END
-} history_internal_events_t;
-
 #ifdef	__cplusplus
 }
 #endif
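
VDEV_STAT_VALID() above exists because vdev statistics cross the
kernel/userland boundary as a bare uint64 array: a consumer built against
this header may receive an array produced against an older vdev_stat_t,
so a field is trusted only when the array is long enough to contain it.
A standalone sketch with a trimmed-down struct (field selection
illustrative):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct vdev_stat {
            uint64_t vs_timestamp;
            uint64_t vs_state;
            uint64_t vs_fragmentation;      /* newer field */
    } vdev_stat_t;

    #define VDEV_STAT_VALID(field, uint64_t_field_count) \
        ((uint64_t_field_count * sizeof(uint64_t)) >= \
         (offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field)))

    int
    main(void)
    {
            /* An older sender might ship only the first two words. */
            printf("frag valid, 2 words: %d\n",
                (int)VDEV_STAT_VALID(vs_fragmentation, 2));     /* 0 */
            printf("frag valid, 3 words: %d\n",
                (int)VDEV_STAT_VALID(vs_fragmentation, 3));     /* 1 */
            return 0;
    }
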
Index: src/external/cddl/osnet/dist/uts/common/sys/sysevent/eventdefs.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/sys/sysevent/eventdefs.h,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 eventdefs.h
--- src/external/cddl/osnet/dist/uts/common/sys/sysevent/eventdefs.h	27 Feb 2010 22:32:41 -0000	1.1.1.2
+++ src/external/cddl/osnet/dist/uts/common/sys/sysevent/eventdefs.h	10 Oct 2016 11:10:00 -0000
@@ -19,8 +19,8 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  */
 
 #ifndef	_SYS_SYSEVENT_EVENTDEFS_H
@@ -246,16 +246,26 @@ extern "C" {
 * ZFS subclass definitions.  Supporting attributes (name/value pairs) are found
  * in sys/fs/zfs.h
  */
-#define	ESC_ZFS_RESILVER_START	"ESC_ZFS_resilver_start"
-#define	ESC_ZFS_RESILVER_FINISH	"ESC_ZFS_resilver_finish"
-#define	ESC_ZFS_VDEV_REMOVE	"ESC_ZFS_vdev_remove"
-#define	ESC_ZFS_POOL_DESTROY	"ESC_ZFS_pool_destroy"
-#define	ESC_ZFS_VDEV_CLEAR	"ESC_ZFS_vdev_clear"
-#define	ESC_ZFS_VDEV_CHECK	"ESC_ZFS_vdev_check"
-#define	ESC_ZFS_CONFIG_SYNC	"ESC_ZFS_config_sync"
-#define	ESC_ZFS_SCRUB_START	"ESC_ZFS_scrub_start"
-#define	ESC_ZFS_SCRUB_FINISH	"ESC_ZFS_scrub_finish"
-#define	ESC_ZFS_VDEV_SPARE	"ESC_ZFS_vdev_spare"
+#define	ESC_ZFS_RESILVER_START		"ESC_ZFS_resilver_start"
+#define	ESC_ZFS_RESILVER_FINISH		"ESC_ZFS_resilver_finish"
+#define	ESC_ZFS_VDEV_REMOVE		"ESC_ZFS_vdev_remove"
+#define	ESC_ZFS_VDEV_REMOVE_AUX		"ESC_ZFS_vdev_remove_aux"
+#define	ESC_ZFS_VDEV_REMOVE_DEV		"ESC_ZFS_vdev_remove_dev"
+#define	ESC_ZFS_POOL_CREATE		"ESC_ZFS_pool_create"
+#define	ESC_ZFS_POOL_DESTROY		"ESC_ZFS_pool_destroy"
+#define	ESC_ZFS_POOL_IMPORT		"ESC_ZFS_pool_import"
+#define	ESC_ZFS_VDEV_ADD		"ESC_ZFS_vdev_add"
+#define	ESC_ZFS_VDEV_ATTACH		"ESC_ZFS_vdev_attach"
+#define	ESC_ZFS_VDEV_CLEAR		"ESC_ZFS_vdev_clear"
+#define	ESC_ZFS_VDEV_CHECK		"ESC_ZFS_vdev_check"
+#define	ESC_ZFS_VDEV_ONLINE		"ESC_ZFS_vdev_online"
+#define	ESC_ZFS_CONFIG_SYNC		"ESC_ZFS_config_sync"
+#define	ESC_ZFS_SCRUB_START		"ESC_ZFS_scrub_start"
+#define	ESC_ZFS_SCRUB_FINISH		"ESC_ZFS_scrub_finish"
+#define	ESC_ZFS_VDEV_SPARE		"ESC_ZFS_vdev_spare"
+#define	ESC_ZFS_BOOTFS_VDEV_ATTACH	"ESC_ZFS_bootfs_vdev_attach"
+#define	ESC_ZFS_POOL_REGUID		"ESC_ZFS_pool_reguid"
+#define	ESC_ZFS_VDEV_AUTOEXPAND		"ESC_ZFS_vdev_autoexpand"
 
 /*
  * datalink subclass definitions.
Index: src/external/cddl/osnet/dist/uts/common/zmod/deflate.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/zmod/deflate.c,v
retrieving revision 1.2
diff -u -p -r1.2 deflate.c
--- src/external/cddl/osnet/dist/uts/common/zmod/deflate.c	18 Jun 2016 21:41:08 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/zmod/deflate.c	12 Jun 2012 05:57:28 -0000
@@ -303,7 +303,7 @@ int ZEXPORT deflateInit2_(strm, level, m
     if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL ||
         s->pending_buf == Z_NULL) {
         s->status = FINISH_STATE;
-        strm->msg = zError(Z_MEM_ERROR);
+        strm->msg = (char*)ERR_MSG(Z_MEM_ERROR);
         (void) deflateEnd (strm);
         return Z_MEM_ERROR;
     }
Index: src/external/cddl/osnet/dist/uts/common/zmod/zlib.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/zmod/zlib.h,v
retrieving revision 1.2
diff -u -p -r1.2 zlib.h
--- src/external/cddl/osnet/dist/uts/common/zmod/zlib.h	18 Jun 2016 21:41:08 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/zmod/zlib.h	12 Jun 2012 05:57:28 -0000
@@ -90,7 +90,7 @@ typedef struct z_stream_s {
     uInt     avail_out; /* remaining free space at next_out */
     uLong    total_out; /* total nb of bytes output so far */
 
-    const char *msg;      /* last error message, NULL if no error */
+    char     *msg;      /* last error message, NULL if no error */
     struct internal_state FAR *state; /* not visible by applications */
 
     alloc_func zalloc;  /* used to allocate the internal state */
Index: src/external/cddl/osnet/dist/uts/common/zmod/zmod.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/zmod/zmod.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 zmod.c
--- src/external/cddl/osnet/dist/uts/common/zmod/zmod.c	7 Aug 2009 18:34:46 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/zmod/zmod.c	12 Jun 2012 05:57:28 -0000
@@ -24,7 +24,7 @@
  * Use is subject to license terms.
  */
 
-#include <sys/modctl.h>
+#include <sys/types.h>
 #include <sys/zmod.h>
 
 #include "zlib.h"
Index: src/external/cddl/osnet/dist/uts/common/zmod/zmod_subr.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/zmod/zmod_subr.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 zmod_subr.c
--- src/external/cddl/osnet/dist/uts/common/zmod/zmod_subr.c	7 Aug 2009 18:34:46 -0000	1.1.1.1
+++ src/external/cddl/osnet/dist/uts/common/zmod/zmod_subr.c	12 Jun 2012 05:57:28 -0000
@@ -29,7 +29,6 @@
 #include <sys/systm.h>
 #include <sys/cmn_err.h>
 #include <sys/kobj.h>
-#include <sys/kobj_impl.h>
 
 struct zchdr {
 	uint_t zch_magic;
Index: src/external/cddl/osnet/dist/uts/common/zmod/zutil.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/zmod/zutil.c,v
retrieving revision 1.2
diff -u -p -r1.2 zutil.c
--- src/external/cddl/osnet/dist/uts/common/zmod/zutil.c	18 Jun 2016 21:41:08 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/zmod/zutil.c	12 Jun 2012 05:57:28 -0000
@@ -16,7 +16,7 @@
 struct internal_state      {int dummy;}; /* for buggy compilers */
 #endif
 
-static const char * const z_errmsg[10] = {
+const char * const z_errmsg[10] = {
 "need dictionary",     /* Z_NEED_DICT       2  */
 "stream end",          /* Z_STREAM_END      1  */
 "",                    /* Z_OK              0  */
Index: src/external/cddl/osnet/dist/uts/common/zmod/zutil.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/zmod/zutil.h,v
retrieving revision 1.2
diff -u -p -r1.2 zutil.h
--- src/external/cddl/osnet/dist/uts/common/zmod/zutil.h	18 Jun 2016 21:41:08 -0000	1.2
+++ src/external/cddl/osnet/dist/uts/common/zmod/zutil.h	25 Apr 2017 16:04:21 -0000
@@ -55,15 +55,15 @@ typedef unsigned short ush;
 typedef ush FAR ushf;
 typedef unsigned long  ulg;
 
-#if 0
+#define z_errmsg opensolaris_z_errmsg
+
 extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
 /* (size given to avoid silly warnings with Visual C++) */
-#endif
 
 #define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)]
 
 #define ERR_RETURN(strm,err) \
-  return (strm->msg = zError(err), (err))
+  return (strm->msg = (char*)ERR_MSG(err), (err))
 /* To be used only when the state is known to be valid */
 
         /* common constants */
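
The ERR_MSG()/ERR_RETURN() change above indexes the shared z_errmsg table
directly instead of going through zError().  The indexing works because
zlib error codes run from Z_NEED_DICT (2) down to Z_VERSION_ERROR (-6),
so Z_NEED_DICT-(err) maps them onto indices 0..8.  A standalone check
(table contents follow stock zlib):

    #include <stdio.h>

    #define Z_NEED_DICT     2
    #define Z_MEM_ERROR     (-4)

    static const char *const z_errmsg[10] = {
            "need dictionary",      /* Z_NEED_DICT       2  */
            "stream end",           /* Z_STREAM_END      1  */
            "",                     /* Z_OK              0  */
            "file error",           /* Z_ERRNO         (-1) */
            "stream error",         /* Z_STREAM_ERROR  (-2) */
            "data error",           /* Z_DATA_ERROR    (-3) */
            "insufficient memory",  /* Z_MEM_ERROR     (-4) */
            "buffer error",         /* Z_BUF_ERROR     (-5) */
            "incompatible version", /* Z_VERSION_ERROR (-6) */
            ""
    };

    #define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)]

    int
    main(void)
    {
            printf("%s\n", ERR_MSG(Z_MEM_ERROR)); /* insufficient memory */
            return 0;
    }
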
Index: src/external/cddl/osnet/include/alloca.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/include/alloca.h,v
retrieving revision 1.3
diff -u -p -r1.3 alloca.h
--- src/external/cddl/osnet/include/alloca.h	21 Feb 2010 01:46:34 -0000	1.3
+++ src/external/cddl/osnet/include/alloca.h	9 Jun 2017 19:08:57 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/compat/opensolaris/include/alloca.h,v 1.1 2007/11/28 21:40:07 jb Exp $
+ * $FreeBSD: head/cddl/compat/opensolaris/include/alloca.h 177698 2008-03-28 22:16:18Z jb $
  *
  */
 
Index: src/external/cddl/osnet/include/dtrace.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/include/dtrace.h,v
retrieving revision 1.2
diff -u -p -r1.2 dtrace.h
--- src/external/cddl/osnet/include/dtrace.h	21 Feb 2010 01:46:34 -0000	1.2
+++ src/external/cddl/osnet/include/dtrace.h	9 Jun 2017 19:09:20 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/cddl/compat/opensolaris/include/dtrace.h,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/cddl/compat/opensolaris/include/dtrace.h 179163 2008-05-21 00:26:38Z jb $
  *
  */
 
Index: src/external/cddl/osnet/include/fcntl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/include/fcntl.h,v
retrieving revision 1.3
diff -u -p -r1.3 fcntl.h
--- src/external/cddl/osnet/include/fcntl.h	21 Feb 2010 01:46:34 -0000	1.3
+++ src/external/cddl/osnet/include/fcntl.h	9 Jun 2017 21:29:17 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/compat/opensolaris/include/fcntl.h,v 1.1 2007/11/28 21:40:07 jb Exp $
+ * $FreeBSD: head/cddl/compat/opensolaris/include/fcntl.h 219089 2011-02-27 19:41:40Z pjd $
  *
  */
 
@@ -34,6 +34,7 @@
 
 #include_next <fcntl.h>
 
-#define open64	open
+#define open64(...)	open(__VA_ARGS__)
+#define openat64(...)	openat(__VA_ARGS__)
 
 #endif
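
The switch from an object-like to a variadic function-like macro matters
because open(2) takes an optional third (mode) argument, and a
function-like macro expands only at actual call sites, leaving any other
use of the open64 token alone.  A quick sketch of both call forms (paths
illustrative):

    #include <fcntl.h>
    #include <unistd.h>

    #define open64(...)     open(__VA_ARGS__)

    int
    main(void)
    {
            int rofd = open64("/etc/passwd", O_RDONLY);     /* two args */
            int wfd = open64("/tmp/o64demo",
                O_CREAT | O_WRONLY, 0644);                  /* three args */

            if (rofd != -1)
                    close(rofd);
            if (wfd != -1)
                    close(wfd);
            return 0;
    }
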
Index: src/external/cddl/osnet/include/fsshare.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/include/fsshare.h,v
retrieving revision 1.3
diff -u -p -r1.3 fsshare.h
--- src/external/cddl/osnet/include/fsshare.h	21 Feb 2010 01:46:34 -0000	1.3
+++ src/external/cddl/osnet/include/fsshare.h	9 Jun 2017 21:32:13 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/compat/opensolaris/include/fsshare.h,v 1.1 2007/04/06 01:08:58 pjd Exp $
+ * $FreeBSD: head/cddl/compat/opensolaris/include/fsshare.h 177698 2008-03-28 22:16:18Z jb $
  */
 
 #ifndef _OPENSOLARIS_FSSHARE_H_
Index: src/external/cddl/osnet/include/libproc.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/include/libproc.h,v
retrieving revision 1.3
diff -u -p -r1.3 libproc.h
--- src/external/cddl/osnet/include/libproc.h	24 Sep 2015 14:17:20 -0000	1.3
+++ src/external/cddl/osnet/include/libproc.h	27 Jun 2017 15:43:35 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/cddl/compat/opensolaris/include/libproc.h,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/cddl/compat/opensolaris/include/libproc.h 309597 2016-12-06 04:22:38Z markj $
  *
  */
 
@@ -40,20 +40,6 @@
 #define PR_RLC		0x0001
 #define PR_KLC		0x0002
 
-#define	PGRAB_RDONLY	O_RDONLY
-#define	PGRAB_FORCE	0
-
-struct proc_handle;
-typedef void (*proc_child_func)(void *);
-
-/* Values returned by proc_state(). */
-#define PS_IDLE         1
-#define PS_STOP         2
-#define PS_RUN          3
-#define PS_UNDEAD       4
-#define PS_DEAD         5
-#define PS_LOST         6
-
 #include_next <libproc.h>
 
 #endif
Index: src/external/cddl/osnet/include/mnttab.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/include/mnttab.h,v
retrieving revision 1.3
diff -u -p -r1.3 mnttab.h
--- src/external/cddl/osnet/include/mnttab.h	21 Feb 2010 01:46:34 -0000	1.3
+++ src/external/cddl/osnet/include/mnttab.h	22 Apr 2017 16:01:47 -0000
@@ -32,19 +32,36 @@
 #ifndef	_OPENSOLARIS_MNTTAB_H_
 #define	_OPENSOLARIS_MNTTAB_H_
 
+#include <sys/param.h>
+#include <sys/mount.h>
+
 #include <stdio.h>
 #include <paths.h>
 
 #define	MNTTAB		_PATH_DEVNULL
 #define	MNT_LINE_MAX	1024
 
+#if 0
+#define	MS_OVERLAY	0x0
+#define	MS_NOMNTTAB	0x0
+#define	MS_RDONLY	0x1
+
+#define	umount2(p, f)	unmount(p, f)
+#endif
+
 struct mnttab {
 	char	*mnt_special;
 	char	*mnt_mountp;
 	char	*mnt_fstype;
 	char	*mnt_mntopts;
 };
+#define	extmnttab	mnttab
 
 int getmntany(FILE *fd, struct mnttab *mgetp, struct mnttab *mrefp);
+int getmntent(FILE *fp, struct mnttab *mp);
+char *hasmntopt(struct mnttab *mnt, char *opt);
+
+struct statvfs;
+void statvfs2mnttab(struct statvfs *sfs, struct mnttab *mp);
 
 #endif	/* !_OPENSOLARIS_MNTTAB_H_ */
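
A usage sketch for the Solaris-style shims declared above, assuming the
Solaris convention that getmntent() fills in *mp and returns 0 until the
table is exhausted; on this compat layer the entries are synthesized from
the running system, since MNTTAB is just /dev/null here:

    #include <stdio.h>
    #include <string.h>
    #include <mnttab.h>

    static void
    list_zfs_mounts(void)
    {
            struct mnttab mnt;
            FILE *fp = fopen(MNTTAB, "r");

            if (fp == NULL)
                    return;
            while (getmntent(fp, &mnt) == 0) {
                    if (strcmp(mnt.mnt_fstype, "zfs") == 0)
                            printf("%s on %s\n",
                                mnt.mnt_special, mnt.mnt_mountp);
            }
            (void) fclose(fp);
    }
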
Index: src/external/cddl/osnet/include/solaris.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/include/solaris.h,v
retrieving revision 1.3
diff -u -p -r1.3 solaris.h
--- src/external/cddl/osnet/include/solaris.h	21 Feb 2010 01:46:34 -0000	1.3
+++ src/external/cddl/osnet/include/solaris.h	10 Jun 2017 01:02:14 -0000
@@ -29,11 +29,17 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+/* $FreeBSD: head/cddl/compat/opensolaris/include/solaris.h 219089 2011-02-27 19:41:40Z pjd $ */
+
 #ifndef	_SOLARIS_H_
 #define	_SOLARIS_H_
 
 #include <sys/ccompile.h>
 
-#define	dirent64	dirent
+#include <fcntl.h>
+
+#define	NOTE(s)
+
+int mkdirp(const char *, mode_t);
 
 #endif	/* !_SOLARIS_H_ */
Index: src/external/cddl/osnet/include/stdio.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/include/stdio.h,v
retrieving revision 1.3
diff -u -p -r1.3 stdio.h
--- src/external/cddl/osnet/include/stdio.h	21 Feb 2010 01:46:34 -0000	1.3
+++ src/external/cddl/osnet/include/stdio.h	9 Jun 2017 21:45:31 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/compat/opensolaris/include/stdio.h,v 1.1 2007/11/28 21:40:07 jb Exp $
+ * $FreeBSD: head/cddl/compat/opensolaris/include/stdio.h 177698 2008-03-28 22:16:18Z jb $
  *
  */
 
Index: src/external/cddl/osnet/include/stdlib.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/include/stdlib.h,v
retrieving revision 1.3
diff -u -p -r1.3 stdlib.h
--- src/external/cddl/osnet/include/stdlib.h	21 Feb 2010 01:46:34 -0000	1.3
+++ src/external/cddl/osnet/include/stdlib.h	9 Jun 2017 21:45:57 -0000
@@ -25,15 +25,15 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/compat/opensolaris/include/stdlib.h,v 1.1 2007/11/28 21:40:07 jb Exp $
+ * $FreeBSD: head/cddl/compat/opensolaris/include/stdlib.h 234856 2012-04-30 23:12:16Z rmh $
  *
  */
 
+#include_next <stdlib.h>
+
 #ifndef _COMPAT_OPENSOLARIS_STDLIB_H_
 #define _COMPAT_OPENSOLARIS_STDLIB_H_
 
-#include_next <stdlib.h>
-
 #define getexecname	getprogname
 
 #endif
Index: src/external/cddl/osnet/include/strings.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/include/strings.h,v
retrieving revision 1.3
diff -u -p -r1.3 strings.h
--- src/external/cddl/osnet/include/strings.h	21 Feb 2010 01:46:34 -0000	1.3
+++ src/external/cddl/osnet/include/strings.h	9 Jun 2017 22:04:28 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/compat/opensolaris/include/strings.h,v 1.1 2007/11/28 21:40:07 jb Exp $
+ * $FreeBSD: head/cddl/compat/opensolaris/include/strings.h 177698 2008-03-28 22:16:18Z jb $
  *
  */
 
Index: src/external/cddl/osnet/include/thread.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/include/thread.h,v
retrieving revision 1.3
diff -u -p -r1.3 thread.h
--- src/external/cddl/osnet/include/thread.h	10 May 2010 06:26:11 -0000	1.3
+++ src/external/cddl/osnet/include/thread.h	22 Apr 2017 15:53:29 -0000
@@ -52,7 +52,9 @@ typedef pthread_rwlock_t rwlock_t;
 #define	thr_main()		(1)
 #define	_mutex_init(l,f,a)	pthread_mutex_init(l,NULL)
 #define	_mutex_destroy(l)	pthread_mutex_destroy(l)
+#if 0
 #define _mutex_held(l)		pthread_mutex_held_np(l)
+#endif
 #define	mutex_lock(l)		pthread_mutex_lock(l)
 #define	mutex_trylock(l)	pthread_mutex_trylock(l)
 #define	mutex_unlock(l)		pthread_mutex_unlock(l)
Index: src/external/cddl/osnet/include/thread_pool.h
===================================================================
RCS file: src/external/cddl/osnet/include/thread_pool.h
diff -N src/external/cddl/osnet/include/thread_pool.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/include/thread_pool.h	17 Jul 2014 16:19:58 -0000
@@ -0,0 +1,78 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * $FreeBSD: head/cddl/compat/opensolaris/include/thread_pool.h 265689 2014-05-08 16:59:36Z mav $
+ */
+
+#ifndef	_THREAD_POOL_H_
+#define	_THREAD_POOL_H_
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <thread.h>
+#include <pthread.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef	struct tpool tpool_t;	/* opaque thread pool descriptor */
+
+#if defined(__STDC__)
+
+extern	tpool_t	*tpool_create(uint_t min_threads, uint_t max_threads,
+			uint_t linger, pthread_attr_t *attr);
+extern	int	tpool_dispatch(tpool_t *tpool,
+			void (*func)(void *), void *arg);
+extern	void	tpool_destroy(tpool_t *tpool);
+extern	void	tpool_abandon(tpool_t *tpool);
+extern	void	tpool_wait(tpool_t *tpool);
+extern	void	tpool_suspend(tpool_t *tpool);
+extern	int	tpool_suspended(tpool_t *tpool);
+extern	void	tpool_resume(tpool_t *tpool);
+extern	int	tpool_member(tpool_t *tpool);
+
+#else	/* Non ANSI */
+
+extern	tpool_t	*tpool_create();
+extern	int	tpool_dispatch();
+extern	void	tpool_destroy();
+extern	void	tpool_abandon();
+extern	void	tpool_wait();
+extern	void	tpool_suspend();
+extern	int	tpool_suspended();
+extern	void	tpool_resume();
+extern	int	tpool_member();
+
+#endif	/* __STDC__ */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _THREAD_POOL_H_ */
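
A usage sketch for the tpool API declared above: create a small pool,
dispatch a few tasks, wait for them to drain, then tear the pool down
(the task body is illustrative):

    #include <stdio.h>
    #include <thread_pool.h>

    static void
    task(void *arg)
    {
            printf("task %ld\n", (long)arg);
    }

    int
    main(void)
    {
            /* 1-4 threads, 60s linger, default thread attributes. */
            tpool_t *tp = tpool_create(1, 4, 60, NULL);
            long i;

            if (tp == NULL)
                    return 1;
            for (i = 0; i < 8; i++)
                    (void) tpool_dispatch(tp, task, (void *)i);
            tpool_wait(tp);         /* block until all queued tasks finish */
            tpool_destroy(tp);
            return 0;
    }
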
Index: src/external/cddl/osnet/include/unistd.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/include/unistd.h,v
retrieving revision 1.5
diff -u -p -r1.5 unistd.h
--- src/external/cddl/osnet/include/unistd.h	4 Nov 2012 19:12:41 -0000	1.5
+++ src/external/cddl/osnet/include/unistd.h	9 Jun 2017 21:46:28 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/compat/opensolaris/include/unistd.h,v 1.1 2007/11/28 21:40:07 jb Exp $
+ * $FreeBSD: head/cddl/compat/opensolaris/include/unistd.h 177698 2008-03-28 22:16:18Z jb $
  *
  */
 
Index: src/external/cddl/osnet/lib/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/Makefile,v
retrieving revision 1.8
diff -u -p -r1.8 Makefile
--- src/external/cddl/osnet/lib/Makefile	29 Feb 2016 17:46:54 -0000	1.8
+++ src/external/cddl/osnet/lib/Makefile	22 Apr 2017 21:09:12 -0000
@@ -18,6 +18,8 @@ SUBDIR+=	libnvpair
 SUBDIR+=	libumem
 SUBDIR+=	libuutil
 SUBDIR+=	.WAIT
+SUBDIR+=	libzfs_core
+SUBDIR+=	.WAIT
 SUBDIR+=	libzfs
 SUBDIR+=	libzpool
 .endif
Index: src/external/cddl/osnet/lib/libctf/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libctf/Makefile,v
retrieving revision 1.9
diff -u -p -r1.9 Makefile
--- src/external/cddl/osnet/lib/libctf/Makefile	4 Aug 2016 17:07:24 -0000	1.9
+++ src/external/cddl/osnet/lib/libctf/Makefile	9 Jun 2017 21:47:57 -0000
@@ -1,12 +1,10 @@
 #	$NetBSD: Makefile,v 1.9 2016/08/04 17:07:24 christos Exp $
 
-# $FreeBSD: src/cddl/lib/libctf/Makefile,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $
+# $FreeBSD: head/cddl/lib/libctf/Makefile 314654 2017-03-04 11:30:04Z ngie $
 
 .include <bsd.init.mk>
 
 LIB=		ctf
-MAN=		ctf.5
-
 SRCS=		ctf_create.c \
 		ctf_decl.c \
 		ctf_error.c \
@@ -18,6 +16,7 @@ SRCS=		ctf_create.c \
 		ctf_subr.c \
 		ctf_types.c \
 		ctf_util.c
+MAN=		ctf.5
 
 .PATH:		${.CURDIR}/../../dist/common/ctf
 .PATH:		${.CURDIR}/../../dist/lib/libctf/common
Index: src/external/cddl/osnet/lib/libdtrace/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libdtrace/Makefile,v
retrieving revision 1.16
diff -u -p -r1.16 Makefile
--- src/external/cddl/osnet/lib/libdtrace/Makefile	19 Dec 2016 01:26:31 -0000	1.16
+++ src/external/cddl/osnet/lib/libdtrace/Makefile	9 Jun 2017 21:51:00 -0000
@@ -1,14 +1,12 @@
 #	$NetBSD: Makefile,v 1.16 2016/12/19 01:26:31 christos Exp $
 
-# $FreeBSD: src/cddl/lib/libdtrace/Makefile,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $
+# $FreeBSD: head/cddl/lib/libdtrace/Makefile 314654 2017-03-04 11:30:04Z ngie $
 
 WARNS=1
 .include <bsd.init.mk>
 
 LIB=		dtrace
 
-#WARNS=		1
-
 SRCS=		dt_aggregate.c \
 		dt_as.c \
 		dt_buf.c \
@@ -24,6 +22,7 @@ SRCS=		dt_aggregate.c \
 		dt_handle.c \
 		dt_ident.c \
 		dt_inttab.c \
+		dt_isadep.c \
 		dt_lex.l \
 		dt_link.c \
 		dt_list.c \
@@ -46,6 +45,7 @@ SRCS=		dt_aggregate.c \
 		dt_string.c \
 		dt_strtab.c \
 		dt_subr.c \
+		dt_sugar.c \
 		dt_work.c \
 		dt_xlator.c \
 		gmatch.c
@@ -55,6 +55,9 @@ DSRCS=		errno.d			\
 		signal.d		\
 		unistd.d
 
+FILES=		${DSRCS}
+FILESDIR=	/usr/lib/dtrace
+
 .PATH:		${.CURDIR}/../../dist/lib/libgen/common
 .PATH:		${.CURDIR}/../../dist/lib/libdtrace/common
 
@@ -67,10 +70,6 @@ CPPFLAGS+=	-I${.OBJDIR} \
 		-I${OPENSOLARIS_USR_DISTDIR}/lib/libdtrace/common \
 		-I${OPENSOLARIS_SYS_DISTDIR}/uts/common
 
-# XXX need new libelf (not GPL)
-
-#CFLAGS+=	-DYYDEBUG
-
 COPTS.dt_aggregate.c	+= -Wno-stack-protector
 COPTS.dt_consume.c	+= -Wno-stack-protector
 COPTS.dt_decl.c		+= -Wno-stack-protector
@@ -87,17 +86,22 @@ COPTS.dt_program.c	+= -Wno-stack-protect
 COPTS.dt_provider.c	+= -Wno-stack-protector
 COPTS.dt_subr.c		+= -Wno-stack-protector
 
+COPTS.dt_consume.c	+= -Wno-parentheses -Wno-maybe-uninitialized
+COPTS.dt_options.c	+= -Wno-parentheses
+COPTS.dt_pid.c		+= -Wno-unused-but-set-variable
+COPTS.dt_isadep.c	+= -Wno-unused-variable
+
 .if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "x86_64"
 CPPFLAGS+=	-I${OPENSOLARIS_SYS_DISTDIR}/uts/intel
-CPPFLAGS+=	-I${.CURDIR}/../../dev/dtrace/i386
-.PATH:		${.CURDIR}/../../dist/lib/libdtrace/i386 ${.CURDIR}/../../dev/dtrace/i386
-SRCS+=	dt_isadep.c # dis_tables.c
+CPPFLAGS+=	-I${.CURDIR}/../../dev/dtrace/x86
+.PATH:		${.CURDIR}/../../dist/lib/libdtrace/i386 ${.CURDIR}/../../dev/dtrace/x86
+SRCS+=		dis_tables.c
+COPTS.dis_tables.c	+= -Wno-parentheses -Wno-maybe-uninitialized
 .elif ${MACHINE_ARCH} == "sparc64"
 CPPFLAGS+=	-I${OPENSOLARIS_SYS_DISTDIR}/uts/sparc
 .elif ${MACHINE_CPU} == "arm"
 CPPFLAGS+=	-I${OPENSOLARIS_SYS_DISTDIR}/uts/arm
 .PATH:		${.CURDIR}/../../dist/lib/libdtrace/arm
-SRCS+=	dt_isadep.c
 .endif
 
 LFLAGS+=-l
@@ -116,18 +120,12 @@ dt_names.c: ${MKNAMES} ${OPENSOLARIS_SYS
 
 beforedepend:	dt_errtags.c dt_names.c
 
-foo:
-	echo ${OPENSOLARIS_USR_DISTDIR}
-
 .SUFFIXES: .in
 .in:
 	${CPP} -D_KERNEL ${CPPFLAGS} $< | tr -d ' ' | tr '"' '@' | \
 	    ${TOOL_SED} -e 's/\&/\\\&/g' | grep '^s/' > ${.TARGET}
 
 
-FILES=		${DSRCS}
-FILESDIR=	/usr/lib/dtrace
-
 LIBDPLIBS=	proc	${NETBSDSRCDIR}/external/bsd/libproc/lib
 
 .include <bsd.lib.mk>
Index: src/external/cddl/osnet/lib/libdtrace/errno.d
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libdtrace/errno.d,v
retrieving revision 1.2
diff -u -p -r1.2 errno.d
--- src/external/cddl/osnet/lib/libdtrace/errno.d	21 Feb 2010 01:46:35 -0000	1.2
+++ src/external/cddl/osnet/lib/libdtrace/errno.d	9 Jun 2017 21:51:21 -0000
@@ -23,7 +23,7 @@
  *
  * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
  *
- * $FreeBSD: src/cddl/lib/libdtrace/errno.d,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/cddl/lib/libdtrace/errno.d 179189 2008-05-22 04:26:42Z jb $
  */
 /*
  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
Index: src/external/cddl/osnet/lib/libdtrace/libproc_compat.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libdtrace/libproc_compat.h,v
retrieving revision 1.1
diff -u -p -r1.1 libproc_compat.h
--- src/external/cddl/osnet/lib/libdtrace/libproc_compat.h	24 Sep 2015 14:36:54 -0000	1.1
+++ src/external/cddl/osnet/lib/libdtrace/libproc_compat.h	9 Jun 2017 21:35:35 -0000
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
  * SUCH DAMAGE. 
  *
- * $FreeBSD: head/cddl/lib/libdtrace/libproc_compat.h 281257 2015-04-08 02:36:37Z markj $
+ * $FreeBSD: head/cddl/lib/libdtrace/libproc_compat.h 309597 2016-12-06 04:22:38Z markj $
  */
 
 /*
@@ -34,6 +34,9 @@
  * Functions sorted alphabetically.
  */
 #define	PR_LMID_EVERY 0
+#define	PGRAB_RDONLY	PATTACH_RDONLY
+#define	PGRAB_FORCE	PATTACH_FORCE
+
 #define	Psetrun(p, a1, a2) proc_continue((p))
 #define	Pxlookup_by_addr(p, a, n, s, sym, i) \
     proc_addr2sym(p, a, n, s, sym)
Index: src/external/cddl/osnet/lib/libdtrace/net.d
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libdtrace/net.d,v
retrieving revision 1.2
diff -u -p -r1.2 net.d
--- src/external/cddl/osnet/lib/libdtrace/net.d	21 Feb 2010 01:46:35 -0000	1.2
+++ src/external/cddl/osnet/lib/libdtrace/net.d	9 Jun 2017 21:56:22 -0000
@@ -22,7 +22,7 @@
  *
  * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
  *
- * $FreeBSD: src/cddl/lib/libdtrace/net.d,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/cddl/lib/libdtrace/net.d 179189 2008-05-22 04:26:42Z jb $
  */
 
 /*
Index: src/external/cddl/osnet/lib/libdtrace/nfs.d
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libdtrace/nfs.d,v
retrieving revision 1.2
diff -u -p -r1.2 nfs.d
--- src/external/cddl/osnet/lib/libdtrace/nfs.d	21 Feb 2010 01:46:35 -0000	1.2
+++ src/external/cddl/osnet/lib/libdtrace/nfs.d	9 Jun 2017 21:55:58 -0000
@@ -22,7 +22,7 @@
  *
  * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
  *
- * $FreeBSD: src/cddl/lib/libdtrace/nfs.d,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/cddl/lib/libdtrace/nfs.d 286420 2015-08-07 19:56:22Z markj $
  */
 
 /*
@@ -30,8 +30,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #pragma	D depends_on library ip.d
 #pragma	D depends_on library net.d
 #pragma	D depends_on module genunix
@@ -96,3 +94,19 @@ translator nfsv4cbinfo_t < rfs4_deleg_st
 	nci_curpath = (P->finfo->vp == NULL) ? "<unknown>" :
 	    P->finfo->vp->v_path;
 };
+
+typedef struct nfsv3opinfo {
+	uint64_t noi_xid;	/* unique transaction ID */
+	cred_t *noi_cred;	/* credentials for operation */
+	string noi_curpath;	/* current file handle path (if any) */
+} nfsv3opinfo_t;
+
+typedef struct nfsv3oparg nfsv3oparg_t;
+
+#pragma D binding "1.5" translator
+translator nfsv3opinfo_t < nfsv3oparg_t *P > {
+	noi_xid = ((struct svc_req *)arg0)->rq_xprt->xp_xid;
+	noi_cred = (cred_t *)arg1;
+	noi_curpath = (arg2 == 0 || ((vnode_t *)arg2)->v_path == NULL) ?
+	    "<unknown>" : ((vnode_t *)arg2)->v_path;
+};
Index: src/external/cddl/osnet/lib/libdtrace/psinfo.d
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libdtrace/psinfo.d,v
retrieving revision 1.5
diff -u -p -r1.5 psinfo.d
--- src/external/cddl/osnet/lib/libdtrace/psinfo.d	7 Oct 2015 00:35:23 -0000	1.5
+++ src/external/cddl/osnet/lib/libdtrace/psinfo.d	9 Jun 2017 21:51:46 -0000
@@ -23,7 +23,7 @@
  *
  * Portions Copyright 2006 John Birrell jb@freebsd.org
  *
- * $FreeBSD: src/cddl/lib/libdtrace/psinfo.d,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/cddl/lib/libdtrace/psinfo.d 304825 2016-08-25 23:24:57Z gnn $
  */
 /*
  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
Index: src/external/cddl/osnet/lib/libdtrace/sched.d
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libdtrace/sched.d,v
retrieving revision 1.2
diff -u -p -r1.2 sched.d
--- src/external/cddl/osnet/lib/libdtrace/sched.d	21 Feb 2010 01:46:35 -0000	1.2
+++ src/external/cddl/osnet/lib/libdtrace/sched.d	9 Jun 2017 21:55:49 -0000
@@ -22,15 +22,13 @@
  *
  * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
  *
- * $FreeBSD: src/cddl/lib/libdtrace/sched.d,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/cddl/lib/libdtrace/sched.d 286420 2015-08-07 19:56:22Z markj $
  */
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #pragma D depends_on module unix
 #pragma D depends_on provider sched
 
Index: src/external/cddl/osnet/lib/libdtrace/signal.d
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libdtrace/signal.d,v
retrieving revision 1.2
diff -u -p -r1.2 signal.d
--- src/external/cddl/osnet/lib/libdtrace/signal.d	21 Feb 2010 01:46:35 -0000	1.2
+++ src/external/cddl/osnet/lib/libdtrace/signal.d	9 Jun 2017 21:57:22 -0000
@@ -23,7 +23,7 @@
  *
  * Portions Copyright 2008 John Birrell jb@freebsd.org
  *
- * $FreeBSD: src/cddl/lib/libdtrace/signal.d,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/cddl/lib/libdtrace/signal.d 179189 2008-05-22 04:26:42Z jb $
  */
 
 inline int SIGHUP = 1;
Index: src/external/cddl/osnet/lib/libdtrace/unistd.d
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libdtrace/unistd.d,v
retrieving revision 1.2
diff -u -p -r1.2 unistd.d
--- src/external/cddl/osnet/lib/libdtrace/unistd.d	21 Feb 2010 01:46:35 -0000	1.2
+++ src/external/cddl/osnet/lib/libdtrace/unistd.d	9 Jun 2017 21:57:47 -0000
@@ -20,7 +20,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/cddl/lib/libdtrace/unistd.d,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/cddl/lib/libdtrace/unistd.d 179189 2008-05-22 04:26:42Z jb $
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
Index: src/external/cddl/osnet/lib/libnvpair/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libnvpair/Makefile,v
retrieving revision 1.8
diff -u -p -r1.8 Makefile
--- src/external/cddl/osnet/lib/libnvpair/Makefile	4 Aug 2016 17:07:24 -0000	1.8
+++ src/external/cddl/osnet/lib/libnvpair/Makefile	22 Apr 2017 08:41:52 -0000
@@ -6,11 +6,21 @@
 .PATH:  ${.CURDIR}/../../dist/common/nvpair
 .PATH:  ${.CURDIR}/../../dist/uts/common/rpc
 
-CPPFLAGS+=-Wall -Wno-unknown-pragmas
+CPPFLAGS+=	-I${OSNETDIR}/sys \
+		-I${OPENSOLARIS_USR_DISTDIR}/head \
+		-I${OPENSOLARIS_SYS_DISTDIR}/common \
+		-I${OPENSOLARIS_SYS_DISTDIR}/uts/common
+
+CPPFLAGS+=	-Wall \
+		-Wno-unknown-pragmas \
+		-Wno-shadow \
+		-Wno-format-security \
+		-Wno-format-nonliteral \
+		-Wno-cast-qual
 
 LIB=	nvpair
 
-SRCS=	libnvpair.c nvpair_alloc_system.c nvpair_alloc_fixed.c nvpair.c
+SRCS=	libnvpair.c nvpair_alloc_system.c nvpair_alloc_fixed.c nvpair.c fnvpair.c nvpair_json.c
 SRCS+=	xdr.c
 SRCS+=	xdr_array.c
 SRCS+=	xdr_mem.c
Index: src/external/cddl/osnet/lib/libzfs/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libzfs/Makefile,v
retrieving revision 1.8
diff -u -p -r1.8 Makefile
--- src/external/cddl/osnet/lib/libzfs/Makefile	4 Aug 2016 17:07:24 -0000	1.8
+++ src/external/cddl/osnet/lib/libzfs/Makefile	11 Jun 2017 12:55:39 -0000
@@ -15,20 +15,45 @@ LIBDPLIBS+= \
 	pthread	${NETBSDSRCDIR}/lib/libpthread \
 	umem	${.CURDIR}/../libumem \
 	util	${NETBSDSRCDIR}/lib/libutil \
-	uutil	${.CURDIR}/../libuutil
+	uutil	${.CURDIR}/../libuutil \
+	z	${NETBSDSRCDIR}/lib/libz \
+	zfs_core ${.CURDIR}/../libzfs_core
 
 # Local
-SRCS=	deviceid.c mnttab.c mkdirp.c zmount.c fsshare.c zone.c efi.c ucred.c stubs.c
+SRCS+=	deviceid.c
+SRCS+=	efi.c
+SRCS+=	fsshare.c
+SRCS+=	mkdirp.c
+SRCS+=	mnttab.c
+SRCS+=	stubs.c
+SRCS+=	thread_pool.c
+SRCS+=	ucred.c
+SRCS+=	zmount.c
+SRCS+=	zone.c
 
 # Sun
-SRCS+=	zfs_namecheck.c zprop_common.c zfs_prop.c zpool_prop.c
-SRCS+=	zfs_deleg.c zfs_fletcher.c zfs_comutil.c
-SRCS+=	libzfs_dataset.c libzfs_util.c libzfs_graph.c libzfs_mount.c
-SRCS+=	libzfs_pool.c libzfs_changelist.c libzfs_config.c libzfs_import.c
-SRCS+=	libzfs_status.c libzfs_sendrecv.c
+SRCS+=	libzfs_changelist.c
+SRCS+=	libzfs_config.c
+SRCS+=	libzfs_dataset.c
+SRCS+=	libzfs_diff.c
+#SRCS+=	libzfs_graph.c
+SRCS+=	libzfs_import.c
+SRCS+=	libzfs_iter.c
+SRCS+=	libzfs_mount.c
+SRCS+=	libzfs_pool.c
+SRCS+=	libzfs_sendrecv.c
+SRCS+=	libzfs_status.c
+SRCS+=	libzfs_util.c
+SRCS+=	zfeature_common.c
+SRCS+=	zfs_comutil.c
+SRCS+=	zfs_deleg.c
+SRCS+=	zfs_fletcher.c
+SRCS+=	zfs_namecheck.c
+SRCS+=	zfs_prop.c
+SRCS+=	zpool_prop.c
+SRCS+=	zprop_common.c
 
 CPPFLAGS+=	-I${NETBSDSRCDIR}/sbin/mount
-# CPPFLAGS+=	-Wall -Wno-unknown-pragmas
 
 # Avoid conflict with libprop
 CPPFLAGS+=	-D_PROPLIB_ZFS_CONFLICT
@@ -36,5 +61,7 @@ CPPFLAGS+=	-D_PROPLIB_ZFS_CONFLICT
 # Avoid internal header conflicts
 CPPFLAGS+=	-D_ZFS_CONTEXT_NO_VERIFY -D_ZFS_CONTEXT_NO_DEBUG
 
+CWARNFLAGS+=	-Wno-parentheses
+
 .include "../../Makefile.zfs"
 .include <bsd.lib.mk>
Index: src/external/cddl/osnet/lib/libzfs/deviceid.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libzfs/deviceid.c,v
retrieving revision 1.3
diff -u -p -r1.3 deviceid.c
--- src/external/cddl/osnet/lib/libzfs/deviceid.c	10 Jan 2017 19:20:35 -0000	1.3
+++ src/external/cddl/osnet/lib/libzfs/deviceid.c	9 Jun 2017 22:01:14 -0000
@@ -56,12 +56,11 @@
  */
 
 #include <sys/cdefs.h>
-/* __FBSDID("$FreeBSD: src/compat/opensolaris/misc/deviceid.c,v 1.1 2007/05/06 01:39:38 pjd Exp $"); */
+/* __FBSDID("$FreeBSD: head/cddl/compat/opensolaris/misc/deviceid.c 238112 2012-07-04 17:36:26Z pjd $"); */
 __RCSID("$NetBSD: deviceid.c,v 1.3 2017/01/10 19:20:35 christos Exp $");
 
 #include <sys/param.h>
 #include <sys/ioctl.h>
-
 #include <stdio.h>
 #include <unistd.h>
 #include <string.h>
@@ -114,18 +113,7 @@ devid_free_nmlist(devid_nmlist_t *list)
 int
 devid_get(int fd, ddi_devid_t *retdevid)
 {
-#ifdef XXXNETBSD
-	errx(1, "XXXNETBSD devid_get");
-
-	if (ioctl(fd, DIOCGIDENT, retdevid->devid) == -1)
-		return (errno);
-	if (retdevid->devid[0] == '\0')
-		return (ENOENT);
-#else
-	memset(retdevid->devid, 0, sizeof(retdevid->devid));
-	strlcpy(retdevid->devid, "device", sizeof(retdevid->devid));
-#endif
-	return (0);
+	return (ENOENT);
 }
 
 int
Index: src/external/cddl/osnet/lib/libzfs/fsshare.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libzfs/fsshare.c,v
retrieving revision 1.2
diff -u -p -r1.2 fsshare.c
--- src/external/cddl/osnet/lib/libzfs/fsshare.c	10 Aug 2009 22:44:41 -0000	1.2
+++ src/external/cddl/osnet/lib/libzfs/fsshare.c	9 Jun 2017 22:02:37 -0000
@@ -27,20 +27,22 @@
  */
 
 #include <sys/cdefs.h>
-/* __FBSDID("$FreeBSD: src/compat/opensolaris/misc/fsshare.c,v 1.2 2007/04/21 13:17:23 pjd Exp $"); */
+/* __FBSDID("$FreeBSD: head/cddl/compat/opensolaris/misc/fsshare.c 299342 2016-05-10 07:50:57Z bapt $"); */
+
 __RCSID("$NetBSD: fsshare.c,v 1.2 2009/08/10 22:44:41 haad Exp $");
 
 #include <sys/param.h>
-#include <stdio.h>
-#include <unistd.h>
+
+#include <assert.h>
+#include <errno.h>
 #include <fcntl.h>
+#include <fsshare.h>
+#include <pathnames.h>	/* _PATH_MOUNTDPID */
 #include <signal.h>
+#include <stdio.h>
 #include <string.h>
+#include <unistd.h>
 #include <stdlib.h>
-#include <errno.h>
-#include <assert.h>
-#include <pathnames.h>	/* _PATH_MOUNTDPID */
-#include <fsshare.h>
 
 #define	FILE_HEADER	"# !!! DO NOT EDIT THIS FILE MANUALLY !!!\n\n"
 #define	OPTSSIZE	1024
Index: src/external/cddl/osnet/lib/libzfs/mnttab.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libzfs/mnttab.c,v
retrieving revision 1.2
diff -u -p -r1.2 mnttab.c
--- src/external/cddl/osnet/lib/libzfs/mnttab.c	28 Feb 2010 17:36:51 -0000	1.2
+++ src/external/cddl/osnet/lib/libzfs/mnttab.c	29 Apr 2017 21:29:33 -0000
@@ -32,20 +32,19 @@
  */
 
 #include <sys/cdefs.h>
-/* __FBSDID("$FreeBSD: src/compat/opensolaris/misc/mnttab.c,v 1.1 2007/04/06 01:08:59 pjd Exp $"); */
+__FBSDID("$FreeBSD: head/cddl/compat/opensolaris/misc/mnttab.c 209962 2010-07-12 23:49:04Z mm $");
 __RCSID("$NetBSD: mnttab.c,v 1.2 2010/02/28 17:36:51 haad Exp $");
 
 #include <sys/param.h>
 #include <sys/statvfs.h>
 #include <sys/mntent.h>
+#include <sys/mnttab.h>
 
-#include <stdio.h>
-#include <string.h>
+#include <errno.h>
+#include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include <ctype.h>
-#include <err.h>
-
-#include <sys/mnttab.h>
 
 char *
 mntopt(char **p)
@@ -95,110 +95,124 @@ optadd(char *mntopts, size_t size, const
 	strlcat(mntopts, opt, size);
 }
 
-int
-getmntany(FILE *fd __unused, struct mnttab *mgetp, struct mnttab *mrefp)
+void
+statvfs2mnttab(struct statvfs *sfs, struct mnttab *mp)
 {
-	static struct statvfs *sfs;
 	static char mntopts[MNTMAXSTR];
-	struct opt *o;
-	static int i, n;
-	int flags;
-
-	if (sfs == NULL) {
-		n = getmntinfo(&sfs, ST_WAIT);
-		if (n == -1)
-			return -1;
-	}
-	for (i = 0; i < n; i++) {
+	long flags;
+
+	mntopts[0] = '\0';
+
+	flags = sfs->f_flag;
+#define	OPTADD(opt)	optadd(mntopts, sizeof(mntopts), (opt))
+	if (flags & MNT_RDONLY)
+		OPTADD(MNTOPT_RO);
+	else
+		OPTADD(MNTOPT_RW);
+	if (flags & MNT_NOSUID)
+		OPTADD(MNTOPT_NOSUID);
+	else
+		OPTADD(MNTOPT_SETUID);
+	if (flags & MNT_UPDATE)
+		OPTADD(MNTOPT_REMOUNT);
+	if (flags & MNT_NOATIME)
+		OPTADD(MNTOPT_NOATIME);
+	else
+		OPTADD(MNTOPT_ATIME);
+	OPTADD(MNTOPT_NOXATTR);
+	if (flags & MNT_NOEXEC)
+		OPTADD(MNTOPT_NOEXEC);
+	else
+		OPTADD(MNTOPT_EXEC);
+#undef	OPTADD
+	mp->mnt_special = sfs->f_mntfromname;
+	mp->mnt_mountp = sfs->f_mntonname;
+	mp->mnt_fstype = sfs->f_fstypename;
+	mp->mnt_mntopts = mntopts;
+}
+
+static struct statvfs *gsfs;
+static int allfs;
+
+static int
+statvfs_init(void)
+{
+	struct statvfs *sfs;
+	int error;
+
+	if (gsfs != NULL) {
+		free(gsfs);
+		gsfs = NULL;
+	}
+	allfs = getvfsstat(NULL, 0, ST_WAIT);
+	if (allfs == -1)
+		goto fail;
+	gsfs = malloc(sizeof(gsfs[0]) * allfs * 2);
+	if (gsfs == NULL)
+		goto fail;
+	allfs = getvfsstat(gsfs, sizeof(gsfs[0]) * allfs * 2, ST_WAIT);
+	if (allfs == -1)
+		goto fail;
+	sfs = realloc(gsfs, allfs * sizeof(gsfs[0]));
+	if (sfs != NULL)
+		gsfs = sfs;
+	return (0);
+fail:
+	error = errno;
+	if (gsfs != NULL)
+		free(gsfs);
+	gsfs = NULL;
+	allfs = 0;
+	return (error);
+}
+
+int
+getmntany(FILE *fd, struct mnttab *mgetp, struct mnttab *mrefp)
+{
+	int i, error;
+
+	error = statvfs_init();
+	if (error != 0)
+		return (error);
+
+	for (i = 0; i < allfs; i++) {
 		if (mrefp->mnt_special != NULL &&
-		    strcmp(mrefp->mnt_special, sfs[i].f_mntfromname) != 0) {
+		    strcmp(mrefp->mnt_special, gsfs[i].f_mntfromname) != 0) {
 			continue;
 		}
 		if (mrefp->mnt_mountp != NULL &&
-		    strcmp(mrefp->mnt_mountp, sfs[i].f_mntonname) != 0) {
+		    strcmp(mrefp->mnt_mountp, gsfs[i].f_mntonname) != 0) {
 			continue;
 		}
 		if (mrefp->mnt_fstype != NULL &&
-		    strcmp(mrefp->mnt_fstype, sfs[i].f_fstypename) != 0) {
+		    strcmp(mrefp->mnt_fstype, gsfs[i].f_fstypename) != 0) {
 			continue;
 		}
-		flags = sfs[i].f_flag;
-#define	OPTADD(opt)	optadd(mntopts, sizeof(mntopts), (opt))
-		if (flags & MNT_RDONLY)
-			OPTADD(MNTOPT_RO);
-		else
-			OPTADD(MNTOPT_RW);
-		if (flags & MNT_NOSUID)
-			OPTADD(MNTOPT_NOSUID);
-		else
-			OPTADD(MNTOPT_SETUID);
-		if (flags & MNT_UPDATE)
-			OPTADD(MNTOPT_REMOUNT);
-		if (flags & MNT_NOATIME)
-			OPTADD(MNTOPT_NOATIME);
-		else
-			OPTADD(MNTOPT_ATIME);
-		OPTADD(MNTOPT_NOXATTR);
-		if (flags & MNT_NOEXEC)
-			OPTADD(MNTOPT_NOEXEC);
-		else
-			OPTADD(MNTOPT_EXEC);
-#undef	OPTADD
-		mgetp->mnt_special = sfs[i].f_mntfromname;
-		mgetp->mnt_mountp = sfs[i].f_mntonname;
-		mgetp->mnt_fstype = sfs[i].f_fstypename;
-		mgetp->mnt_mntopts = mntopts;
+		statvfs2mnttab(&gsfs[i], mgetp);
 		return (0);
 	}
-	sfs = NULL;
 	return (-1);
 }
 
 int
-getstatfs(struct statvfs *stat, const char *path)
+getmntent(FILE *fp, struct mnttab *mp)
 {
-	int fs_num, i;
-	struct statvfs *statvfs;
+	struct statvfs *sfs;
+	int error, nfs;
 
-	fs_num = 0;
-	
-	if (path == NULL)
-		return (-1);
-
-	fs_num = getvfsstat(NULL, 0, ST_WAIT);
-
-	if ((statvfs = malloc(fs_num * sizeof(struct statvfs))) == NULL)
-	    return (-1);
-	    
-	memset(statvfs, 0, fs_num * sizeof(struct statvfs));
-	    
-	if (getvfsstat(statvfs, fs_num * sizeof(struct statvfs), ST_WAIT) != 0) {
-		free(statvfs);
-		return (-1);
+	nfs = (int)lseek(fileno(fp), 0, SEEK_CUR);
+	if (nfs == -1)
+		return (errno);
+	/* If nfs is 0, we want to refresh our cache. */
+	if (nfs == 0 || gsfs == NULL) {
+		error = statvfs_init();
+		if (error != 0)
+			return (error);
 	}
-	
-	for( i = 0; i < fs_num; i++) {
-		
-		if (statvfs[i].f_fstypename != NULL &&
-		    strcmp(statvfs[i].f_fstypename, MNTTYPE_ZFS) != 0) {
-			continue;
-		}
-		
-		if (statvfs[i].f_mntonname != NULL &&
-		    strcmp(statvfs[i].f_mntonname, path) != 0) {
-			continue;
-		}
-		
-		memcpy(stat, &statvfs[i], sizeof(struct statvfs));
-	}
-
-	free(statvfs);
+	if (nfs >= allfs)
+		return (-1);
+	statvfs2mnttab(&gsfs[nfs], mp);
+	if (lseek(fileno(fp), 1, SEEK_CUR) == -1)
+		return (errno);
 	return (0);
 }
-
-int
-getmntent(FILE *file, struct mnttab *mnttab) {
-
-
-	return 1;
-}
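
Note: the mnttab.c rewrite above brings the emulation in line with FreeBSD's. statvfs2mnttab() converts one statvfs entry into a Solaris struct mnttab, statvfs_init() caches a getvfsstat() snapshot (allocating twice the reported count to leave headroom for filesystems mounted between the two calls), and getmntent(), previously a stub that always returned 1, now walks that snapshot using the FILE's seek offset as its cursor. A minimal caller sketch follows; it assumes only that the opened path exists and is seekable, since nothing but the offset is used (libzfs itself opens MNTTAB here):

	#include <stdio.h>
	#include <sys/mnttab.h>

	static void
	print_mounts(void)
	{
		struct mnttab mnt;
		FILE *fp = fopen("/etc/fstab", "r");	/* any seekable file */

		if (fp == NULL)
			return;
		/* An offset of 0 makes getmntent() refresh its snapshot. */
		rewind(fp);
		while (getmntent(fp, &mnt) == 0)
			(void) printf("%s on %s type %s (%s)\n",
			    mnt.mnt_special, mnt.mnt_mountp,
			    mnt.mnt_fstype, mnt.mnt_mntopts);
		(void) fclose(fp);
	}
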
Index: src/external/cddl/osnet/lib/libzfs/thread_pool.c
===================================================================
RCS file: src/external/cddl/osnet/lib/libzfs/thread_pool.c
diff -N src/external/cddl/osnet/lib/libzfs/thread_pool.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/lib/libzfs/thread_pool.c	20 Apr 2017 22:48:27 -0000
@@ -0,0 +1,426 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/cdefs.h>
+/* __FBSDID("$FreeBSD: head/cddl/compat/opensolaris/misc/thread_pool.c 275595 2014-12-08 06:10:47Z delphij $"); */
+
+#include <stdlib.h>
+#include <signal.h>
+#include <errno.h>
+#include "thread_pool_impl.h"
+
+typedef void (*_Voidfp)(void*); /* pointer to extern "C" function */
+
+static void
+delete_pool(tpool_t *tpool)
+{
+	tpool_job_t *job;
+
+	/*
+	 * There should be no pending jobs, but just in case...
+	 */
+	for (job = tpool->tp_head; job != NULL; job = tpool->tp_head) {
+		tpool->tp_head = job->tpj_next;
+		free(job);
+	}
+	(void) pthread_attr_destroy(&tpool->tp_attr);
+	free(tpool);
+}
+
+/*
+ * Worker thread is terminating.
+ */
+static void
+worker_cleanup(void *arg)
+{
+	tpool_t *tpool = arg;
+
+	if (--tpool->tp_current == 0 &&
+	    (tpool->tp_flags & (TP_DESTROY | TP_ABANDON))) {
+		if (tpool->tp_flags & TP_ABANDON) {
+			pthread_mutex_unlock(&tpool->tp_mutex);
+			delete_pool(tpool);
+			return;
+		}
+		if (tpool->tp_flags & TP_DESTROY)
+			(void) pthread_cond_broadcast(&tpool->tp_busycv);
+	}
+	pthread_mutex_unlock(&tpool->tp_mutex);
+}
+
+static void
+notify_waiters(tpool_t *tpool)
+{
+	if (tpool->tp_head == NULL && tpool->tp_active == NULL) {
+		tpool->tp_flags &= ~TP_WAIT;
+		(void) pthread_cond_broadcast(&tpool->tp_waitcv);
+	}
+}
+
+/*
+ * Called by a worker thread on return from a tpool_dispatch()d job.
+ */
+static void
+job_cleanup(void *arg)
+{
+	tpool_t *tpool = arg;
+	pthread_t my_tid = pthread_self();
+	tpool_active_t *activep;
+	tpool_active_t **activepp;
+
+	pthread_mutex_lock(&tpool->tp_mutex);
+	/* CSTYLED */
+	for (activepp = &tpool->tp_active;; activepp = &activep->tpa_next) {
+		activep = *activepp;
+		if (activep->tpa_tid == my_tid) {
+			*activepp = activep->tpa_next;
+			break;
+		}
+	}
+	if (tpool->tp_flags & TP_WAIT)
+		notify_waiters(tpool);
+}
+
+static void *
+tpool_worker(void *arg)
+{
+	tpool_t *tpool = (tpool_t *)arg;
+	int elapsed;
+	tpool_job_t *job;
+	void (*func)(void *);
+	tpool_active_t active;
+	sigset_t maskset;
+
+	pthread_mutex_lock(&tpool->tp_mutex);
+	pthread_cleanup_push(worker_cleanup, tpool);
+
+	/*
+	 * This is the worker's main loop.
+	 * It is left only if a timeout or an error has occurred.
+	 */
+	active.tpa_tid = pthread_self();
+	for (;;) {
+		elapsed = 0;
+		tpool->tp_idle++;
+		if (tpool->tp_flags & TP_WAIT)
+			notify_waiters(tpool);
+		while ((tpool->tp_head == NULL ||
+		    (tpool->tp_flags & TP_SUSPEND)) &&
+		    !(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))) {
+			if (tpool->tp_current <= tpool->tp_minimum ||
+			    tpool->tp_linger == 0) {
+				(void) pthread_cond_wait(&tpool->tp_workcv,
+				    &tpool->tp_mutex);
+			} else {
+				struct timespec timeout;
+
+				clock_gettime(CLOCK_MONOTONIC, &timeout);
+				timeout.tv_sec += tpool->tp_linger;
+				if (pthread_cond_timedwait(&tpool->tp_workcv,
+				    &tpool->tp_mutex, &timeout) != 0) {
+					elapsed = 1;
+					break;
+				}
+			}
+		}
+		tpool->tp_idle--;
+		if (tpool->tp_flags & TP_DESTROY)
+			break;
+		if (tpool->tp_flags & TP_ABANDON) {
+			/* can't abandon a suspended pool */
+			if (tpool->tp_flags & TP_SUSPEND) {
+				tpool->tp_flags &= ~TP_SUSPEND;
+				(void) pthread_cond_broadcast(&tpool->tp_workcv);
+			}
+			if (tpool->tp_head == NULL)
+				break;
+		}
+		if ((job = tpool->tp_head) != NULL &&
+		    !(tpool->tp_flags & TP_SUSPEND)) {
+			elapsed = 0;
+			func = job->tpj_func;
+			arg = job->tpj_arg;
+			tpool->tp_head = job->tpj_next;
+			if (job == tpool->tp_tail)
+				tpool->tp_tail = NULL;
+			tpool->tp_njobs--;
+			active.tpa_next = tpool->tp_active;
+			tpool->tp_active = &active;
+			pthread_mutex_unlock(&tpool->tp_mutex);
+			pthread_cleanup_push(job_cleanup, tpool);
+			free(job);
+			/*
+			 * Call the specified function.
+			 */
+			func(arg);
+			/*
+			 * We don't know what this thread has been doing,
+			 * so we reset its signal mask and cancellation
+			 * state back to the initial values.
+			 */
+			sigfillset(&maskset);
+			(void) pthread_sigmask(SIG_SETMASK, &maskset, NULL);
+			(void) pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED,
+			    NULL);
+			(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE,
+			    NULL);
+			pthread_cleanup_pop(1);
+		}
+		if (elapsed && tpool->tp_current > tpool->tp_minimum) {
+			/*
+			 * We timed out and there is no work to be done
+			 * and the number of workers exceeds the minimum.
+			 * Exit now to reduce the size of the pool.
+			 */
+			break;
+		}
+	}
+	pthread_cleanup_pop(1);
+	return (arg);
+}
+
+/*
+ * Create a worker thread, with all signals blocked.
+ */
+static int
+create_worker(tpool_t *tpool)
+{
+	sigset_t maskset, oset;
+	pthread_t thread;
+	int error;
+
+	sigfillset(&maskset);
+	(void) pthread_sigmask(SIG_SETMASK, &maskset, &oset);
+	error = pthread_create(&thread, &tpool->tp_attr, tpool_worker, tpool);
+	(void) pthread_sigmask(SIG_SETMASK, &oset, NULL);
+	return (error);
+}
+
+tpool_t	*
+tpool_create(uint_t min_threads, uint_t max_threads, uint_t linger,
+	pthread_attr_t *attr)
+{
+	tpool_t	*tpool;
+	int error;
+
+	if (min_threads > max_threads || max_threads < 1) {
+		errno = EINVAL;
+		return (NULL);
+	}
+
+	tpool = calloc(1, sizeof (*tpool));
+	if (tpool == NULL) {
+		errno = ENOMEM;
+		return (NULL);
+	}
+	(void) pthread_mutex_init(&tpool->tp_mutex, NULL);
+	(void) pthread_cond_init(&tpool->tp_busycv, NULL);
+	(void) pthread_cond_init(&tpool->tp_workcv, NULL);
+	(void) pthread_cond_init(&tpool->tp_waitcv, NULL);
+	tpool->tp_minimum = min_threads;
+	tpool->tp_maximum = max_threads;
+	tpool->tp_linger = linger;
+
+	/* make all pool threads be detached daemon threads */
+	(void) pthread_attr_init(&tpool->tp_attr);
+	(void) pthread_attr_setdetachstate(&tpool->tp_attr,
+	    PTHREAD_CREATE_DETACHED);
+
+	return (tpool);
+}
+
+/*
+ * Dispatch a work request to the thread pool.
+ * If there are idle workers, awaken one.
+ * Else, if the maximum number of workers has
+ * not been reached, spawn a new worker thread.
+ * Else just return with the job added to the queue.
+ */
+int
+tpool_dispatch(tpool_t *tpool, void (*func)(void *), void *arg)
+{
+	tpool_job_t *job;
+
+	if ((job = calloc(1, sizeof (*job))) == NULL)
+		return (-1);
+	job->tpj_next = NULL;
+	job->tpj_func = func;
+	job->tpj_arg = arg;
+
+	pthread_mutex_lock(&tpool->tp_mutex);
+
+	if (tpool->tp_head == NULL)
+		tpool->tp_head = job;
+	else
+		tpool->tp_tail->tpj_next = job;
+	tpool->tp_tail = job;
+	tpool->tp_njobs++;
+
+	if (!(tpool->tp_flags & TP_SUSPEND)) {
+		if (tpool->tp_idle > 0)
+			(void) pthread_cond_signal(&tpool->tp_workcv);
+		else if (tpool->tp_current < tpool->tp_maximum &&
+		    create_worker(tpool) == 0)
+			tpool->tp_current++;
+	}
+
+	pthread_mutex_unlock(&tpool->tp_mutex);
+	return (0);
+}
+
+/*
+ * Assumes: by the time tpool_destroy() is called no one will use this
+ * thread pool in any way and no one will try to dispatch entries to it.
+ * Calling tpool_destroy() from a job in the pool will cause deadlock.
+ */
+void
+tpool_destroy(tpool_t *tpool)
+{
+	tpool_active_t *activep;
+
+	pthread_mutex_lock(&tpool->tp_mutex);
+	pthread_cleanup_push((_Voidfp)pthread_mutex_unlock, &tpool->tp_mutex);
+
+	/* mark the pool as being destroyed; wakeup idle workers */
+	tpool->tp_flags |= TP_DESTROY;
+	tpool->tp_flags &= ~TP_SUSPEND;
+	(void) pthread_cond_broadcast(&tpool->tp_workcv);
+
+	/* cancel all active workers */
+	for (activep = tpool->tp_active; activep; activep = activep->tpa_next)
+		(void) pthread_cancel(activep->tpa_tid);
+
+	/* wait for all active workers to finish */
+	while (tpool->tp_active != NULL) {
+		tpool->tp_flags |= TP_WAIT;
+		(void) pthread_cond_wait(&tpool->tp_waitcv, &tpool->tp_mutex);
+	}
+
+	/* the last worker to terminate will wake us up */
+	while (tpool->tp_current != 0)
+		(void) pthread_cond_wait(&tpool->tp_busycv, &tpool->tp_mutex);
+
+	pthread_cleanup_pop(1);	/* pthread_mutex_unlock(&tpool->tp_mutex); */
+	delete_pool(tpool);
+}
+
+/*
+ * Like tpool_destroy(), but don't cancel workers or wait for them to finish.
+ * The last worker to terminate will delete the pool.
+ */
+void
+tpool_abandon(tpool_t *tpool)
+{
+
+	pthread_mutex_lock(&tpool->tp_mutex);
+	if (tpool->tp_current == 0) {
+		/* no workers, just delete the pool */
+		pthread_mutex_unlock(&tpool->tp_mutex);
+		delete_pool(tpool);
+	} else {
+		/* wake up all workers, last one will delete the pool */
+		tpool->tp_flags |= TP_ABANDON;
+		tpool->tp_flags &= ~TP_SUSPEND;
+		(void) pthread_cond_broadcast(&tpool->tp_workcv);
+		pthread_mutex_unlock(&tpool->tp_mutex);
+	}
+}
+
+/*
+ * Wait for all jobs to complete.
+ * Calling tpool_wait() from a job in the pool will cause deadlock.
+ */
+void
+tpool_wait(tpool_t *tpool)
+{
+
+	pthread_mutex_lock(&tpool->tp_mutex);
+	pthread_cleanup_push((_Voidfp)pthread_mutex_unlock, &tpool->tp_mutex);
+	while (tpool->tp_head != NULL || tpool->tp_active != NULL) {
+		tpool->tp_flags |= TP_WAIT;
+		(void) pthread_cond_wait(&tpool->tp_waitcv, &tpool->tp_mutex);
+	}
+	pthread_cleanup_pop(1);	/* pthread_mutex_unlock(&tpool->tp_mutex); */
+}
+
+void
+tpool_suspend(tpool_t *tpool)
+{
+
+	pthread_mutex_lock(&tpool->tp_mutex);
+	tpool->tp_flags |= TP_SUSPEND;
+	pthread_mutex_unlock(&tpool->tp_mutex);
+}
+
+int
+tpool_suspended(tpool_t *tpool)
+{
+	int suspended;
+
+	pthread_mutex_lock(&tpool->tp_mutex);
+	suspended = (tpool->tp_flags & TP_SUSPEND) != 0;
+	pthread_mutex_unlock(&tpool->tp_mutex);
+
+	return (suspended);
+}
+
+void
+tpool_resume(tpool_t *tpool)
+{
+	int excess;
+
+	pthread_mutex_lock(&tpool->tp_mutex);
+	if (!(tpool->tp_flags & TP_SUSPEND)) {
+		pthread_mutex_unlock(&tpool->tp_mutex);
+		return;
+	}
+	tpool->tp_flags &= ~TP_SUSPEND;
+	(void) pthread_cond_broadcast(&tpool->tp_workcv);
+	excess = tpool->tp_njobs - tpool->tp_idle;
+	while (excess-- > 0 && tpool->tp_current < tpool->tp_maximum) {
+		if (create_worker(tpool) != 0)
+			break;		/* pthread_create() failed */
+		tpool->tp_current++;
+	}
+	pthread_mutex_unlock(&tpool->tp_mutex);
+}
+
+int
+tpool_member(tpool_t *tpool)
+{
+	pthread_t my_tid = pthread_self();
+	tpool_active_t *activep;
+
+	pthread_mutex_lock(&tpool->tp_mutex);
+	for (activep = tpool->tp_active; activep; activep = activep->tpa_next) {
+		if (activep->tpa_tid == my_tid) {
+			pthread_mutex_unlock(&tpool->tp_mutex);
+			return (1);
+		}
+	}
+	pthread_mutex_unlock(&tpool->tp_mutex);
+	return (0);
+}
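
Note: thread_pool.c imports the Solaris thread-pool compat code (as in FreeBSD's cddl/compat tree) so that the libzfs Makefile above can add it to SRCS. A usage sketch, with invented worker bounds, linger time, and job payload:

	#include <stdint.h>
	#include <stdio.h>
	#include <thread_pool.h>

	static void
	job(void *arg)
	{
		(void) printf("job %ld\n", (long)(intptr_t)arg);
	}

	int
	main(void)
	{
		tpool_t *tp;
		intptr_t i;

		/* One to four workers; idle workers linger ten seconds. */
		tp = tpool_create(1, 4, 10, NULL);
		if (tp == NULL)
			return (1);
		for (i = 0; i < 8; i++)
			(void) tpool_dispatch(tp, job, (void *)i);
		tpool_wait(tp);		/* block until the queue drains */
		tpool_destroy(tp);
		return (0);
	}
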
Index: src/external/cddl/osnet/lib/libzfs/thread_pool_impl.h
===================================================================
RCS file: src/external/cddl/osnet/lib/libzfs/thread_pool_impl.h
diff -N src/external/cddl/osnet/lib/libzfs/thread_pool_impl.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/lib/libzfs/thread_pool_impl.h	17 Jul 2014 16:19:58 -0000
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * $FreeBSD: head/cddl/compat/opensolaris/misc/thread_pool_impl.h 265689 2014-05-08 16:59:36Z mav $
+ */
+
+#ifndef _THREAD_POOL_IMPL_H
+#define	_THREAD_POOL_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <thread_pool.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Thread pool implementation definitions.
+ * See <thread_pool.h> for interface declarations.
+ */
+
+/*
+ * FIFO queued job
+ */
+typedef struct tpool_job tpool_job_t;
+struct tpool_job {
+	tpool_job_t	*tpj_next;		/* list of jobs */
+	void		(*tpj_func)(void *);	/* function to call */
+	void		*tpj_arg;		/* its argument */
+};
+
+/*
+ * List of active threads, linked through their stacks.
+ */
+typedef struct tpool_active tpool_active_t;
+struct tpool_active {
+	tpool_active_t	*tpa_next;	/* list of active threads */
+	pthread_t	tpa_tid;	/* active thread id */
+};
+
+/*
+ * The thread pool.
+ */
+struct tpool {
+	tpool_t		*tp_forw;	/* circular list of all thread pools */
+	tpool_t		*tp_back;
+	mutex_t		tp_mutex;	/* protects the pool data */
+	cond_t		tp_busycv;	/* synchronization in tpool_dispatch */
+	cond_t		tp_workcv;	/* synchronization with workers */
+	cond_t		tp_waitcv;	/* synchronization in tpool_wait() */
+	tpool_active_t	*tp_active;	/* threads performing work */
+	tpool_job_t	*tp_head;	/* FIFO job queue */
+	tpool_job_t	*tp_tail;
+	pthread_attr_t	tp_attr;	/* attributes of the workers */
+	int		tp_flags;	/* see below */
+	uint_t		tp_linger;	/* seconds before idle workers exit */
+	int		tp_njobs;	/* number of jobs in job queue */
+	int		tp_minimum;	/* minimum number of worker threads */
+	int		tp_maximum;	/* maximum number of worker threads */
+	int		tp_current;	/* current number of worker threads */
+	int		tp_idle;	/* number of idle workers */
+};
+
+/* tp_flags */
+#define	TP_WAIT		0x01		/* waiting in tpool_wait() */
+#define	TP_SUSPEND	0x02		/* pool is being suspended */
+#define	TP_DESTROY	0x04		/* pool is being destroyed */
+#define	TP_ABANDON	0x08		/* pool is abandoned (auto-destroy) */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _THREAD_POOL_IMPL_H */
Index: src/external/cddl/osnet/lib/libzfs/zone.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libzfs/zone.c,v
retrieving revision 1.1
diff -u -p -r1.1 zone.c
--- src/external/cddl/osnet/lib/libzfs/zone.c	7 Aug 2009 20:57:56 -0000	1.1
+++ src/external/cddl/osnet/lib/libzfs/zone.c	9 Jun 2017 22:04:10 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/compat/opensolaris/misc/zone.c,v 1.1 2007/04/06 01:08:59 pjd Exp $
+ * $FreeBSD: head/cddl/compat/opensolaris/misc/zone.c 185029 2008-11-17 20:49:29Z pjd $
  */
 
 #include <stdlib.h>
Index: src/external/cddl/osnet/lib/libzfs_core/Makefile
===================================================================
RCS file: src/external/cddl/osnet/lib/libzfs_core/Makefile
diff -N src/external/cddl/osnet/lib/libzfs_core/Makefile
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/lib/libzfs_core/Makefile	25 Apr 2017 02:58:34 -0000
@@ -0,0 +1,22 @@
+#	$NetBSD$
+
+.include <bsd.init.mk>
+
+.PATH: ${.CURDIR}/../../dist/common/zfs
+.PATH: ${.CURDIR}/../../dist/common/fs/zfs
+.PATH: ${.CURDIR}/../../dist/lib/libzfs/common
+.PATH: ${.CURDIR}/../../dist/lib/libzfs_core/common
+
+LIB=	zfs_core
+
+LIBDPLIBS+= \
+	nvpair	${.CURDIR}/../libnvpair
+
+SRCS=	libzfs_core.c \
+	libzfs_core_compat.c \
+	zfs_ioctl_compat.c
+
+SRCS+=	libzfs_compat.c
+
+.include "../../Makefile.zfs"
+.include <bsd.lib.mk>
Index: src/external/cddl/osnet/lib/libzfs_core/shlib_version
===================================================================
RCS file: src/external/cddl/osnet/lib/libzfs_core/shlib_version
diff -N src/external/cddl/osnet/lib/libzfs_core/shlib_version
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/lib/libzfs_core/shlib_version	22 Apr 2017 08:57:43 -0000
@@ -0,0 +1,5 @@
+#	$NetBSD$
+#	Remember to update distrib/sets/lists/base/shl.* when changing
+
+major=0
+minor=0
Index: src/external/cddl/osnet/lib/libzpool/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/lib/libzpool/Makefile,v
retrieving revision 1.11
diff -u -p -r1.11 Makefile
--- src/external/cddl/osnet/lib/libzpool/Makefile	4 Aug 2016 17:07:24 -0000	1.11
+++ src/external/cddl/osnet/lib/libzpool/Makefile	11 Jun 2017 13:03:13 -0000
@@ -20,13 +20,15 @@ LIBDPLIBS+= \
 
 LIB=		zpool
 CPPFLAGS+=	-std=c99
-#CPPFLAGS+=-Wall -Wno-unknown-pragmas
+
+CWARNFLAGS+=	-Wno-missing-field-initializers
+CWARNFLAGS+=	-Wno-parentheses
 
 # Local stuff
-SRCS=		kernel.c kernel2.c atomic.c
+SRCS+=		kernel.c atomic.c
 
 # Sun stuff
-SRCS+=		${ZFS_COMMON_OBJS:C/.o$/.c/}
+SRCS+=		${ZFS_COMMON_OBJS:C/.o$/.c/} vdev_file.c trim_map.c
 SRCS+=		${ZFS_SHARED_OBJS:C/.o$/.c/}
 SRCS+=		taskq.c util.c list.c u8_textprep.c
 
Index: src/external/cddl/osnet/lib/libzpool/kernel.c
===================================================================
RCS file: src/external/cddl/osnet/lib/libzpool/kernel.c
diff -N src/external/cddl/osnet/lib/libzpool/kernel.c
--- src/external/cddl/osnet/lib/libzpool/kernel.c	21 Jun 2013 16:22:46 -0000	1.8
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,749 +0,0 @@
-/*     $NetBSD: kernel.c,v 1.8 2013/06/21 16:22:46 christos Exp $  */
-
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
-#include <sys/cdefs.h>
-__RCSID("$NetBSD: kernel.c,v 1.8 2013/06/21 16:22:46 christos Exp $");
-
-#include <sys/zfs_context.h>
-#include <sys/sysctl.h>
-#include <assert.h>
-#include <fcntl.h>
-#include <poll.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <string.h>
-#include <zlib.h>
-#include <sys/spa.h>
-#include <sys/stat.h>
-#include <sys/processor.h>
-#include <sys/zmod.h>
-#include <sys/utsname.h>
-
-/*
- * Emulation of kernel services in userland.
- */
-
-#ifdef XXXNETBSD
-int hz = 119;	/* frequency when using gethrtime() >> 23 for lbolt */
-#endif
-int aok;
-uint64_t physmem;
-vnode_t *rootdir = (vnode_t *)0xabcd1234;
-char hw_serial[11];
-size_t pgsize;
-
-struct utsname utsname = {
-	"userland"
-};
-
-/* this only exists to have its address taken */
-struct proc p0;
-
-/*
- * =========================================================================
- * threads
- * =========================================================================
- */
-/*ARGSUSED*/
-kthread_t *
-zk_thread_create(void (*func)(), void *arg)
-{
-	thread_t tid;
-
-	VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED,
-	    &tid) == 0);
-
-	return ((void *)(uintptr_t)tid);
-}
-
-/*
- * =========================================================================
- * kstats
- * =========================================================================
- */
-/*ARGSUSED*/
-kstat_t *
-kstat_create(char *module, int instance, char *name, char *class,
-    uchar_t type, ulong_t ndata, uchar_t ks_flag)
-{
-	return (NULL);
-}
-
-/*ARGSUSED*/
-void
-kstat_install(kstat_t *ksp)
-{}
-
-/*ARGSUSED*/
-void
-kstat_delete(kstat_t *ksp)
-{}
-
-/*
- * =========================================================================
- * vnode operations
- * =========================================================================
- */
-/*
- * Note: for the xxxat() versions of these functions, we assume that the
- * starting vp is always rootdir (which is true for spa_directory.c, the only
- * ZFS consumer of these interfaces).  We assert this is true, and then emulate
- * them by adding '/' in front of the path.
- */
-
-/*ARGSUSED*/
-int
-vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
-{
-	int fd;
-	vnode_t *vp;
-	int old_umask;
-	char realpath[MAXPATHLEN];
-	struct stat64 st;
-
-	/*
-	 * If we're accessing a real disk from userland, we need to use
-	 * the character interface to avoid caching.  This is particularly
-	 * important if we're trying to look at a real in-kernel storage
-	 * pool from userland, e.g. via zdb, because otherwise we won't
-	 * see the changes occurring under the segmap cache.
-	 * On the other hand, the stupid character device returns zero
-	 * for its size.  So -- gag -- we open the block device to get
-	 * its size, and remember it for subsequent VOP_GETATTR().
-	 */
-	if (strncmp(path, "/dev/", 5) == 0) {
-		char *dsk;
-		fd = open64(path, O_RDONLY);
-		if (fd == -1)
-			return (errno);
-		if (fstat64(fd, &st) == -1) {
-			close(fd);
-			return (errno);
-		}
-		close(fd);
-		(void) sprintf(realpath, "%s", path);
-		dsk = strstr(path, "/dsk/");
-		if (dsk != NULL)
-			(void) sprintf(realpath + (dsk - path) + 1, "r%s",
-			    dsk + 1);
-	} else {
-		(void) sprintf(realpath, "%s", path);
-		if (!(flags & FCREAT) && stat64(realpath, &st) == -1)
-			return (errno);
-	}
-
-	if (flags & FCREAT)
-		old_umask = umask(0);
-
-	/*
-	 * The construct 'flags - FREAD' conveniently maps combinations of
-	 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
-	 */
-	fd = open64(realpath, flags - FREAD, mode);
-
-	if (flags & FCREAT)
-		(void) umask(old_umask);
-
-	if (fd == -1)
-		return (errno);
-
-	if (fstat64(fd, &st) == -1) {
-		close(fd);
-		return (errno);
-	}
-
-	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
-
-	*vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
-
-	vp->v_fd = fd;
-	if (S_ISCHR(st.st_mode)) {
-#ifdef XXXAD
-		ioctl(fd, DIOCGMEDIASIZE, &vp->v_size);
-#endif
-	} else
-		vp->v_size = st.st_size;
-	vp->v_path = spa_strdup(path);
-
-	return (0);
-}
-
-int
-vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
-    int x3, vnode_t *startvp, int fd)
-{
-	char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL);
-	int ret;
-
-	ASSERT(startvp == rootdir);
-	(void) sprintf(realpath, "/%s", path);
-
-	ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
-
-	umem_free(realpath, strlen(path) + 2);
-
-	return (ret);
-}
-
-int
-vn_getattr(vnode_t *vp, vattr_t *va)
-{
-	int fd;
-	struct stat64 st;
-
-	fd = vp->v_fd;
-
-	if (fstat64(fd, &st) == -1)
-		return (errno);
-
-	vp->v_size = st.st_size;
-	va->va_size = st.st_size;
-
-	return 0;
-}
-
-
-/*ARGSUSED*/
-int
-vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
-	int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
-{
-	ssize_t iolen, split;
-
-	if (uio == UIO_READ) {
-		iolen = pread64(vp->v_fd, addr, len, offset);
-	} else {
-		/*
-		 * To simulate partial disk writes, we split writes into two
-		 * system calls so that the process can be killed in between.
-		 */
-		split = (len > 0 ? rand() % len : 0);
-		iolen = pwrite64(vp->v_fd, addr, split, offset);
-		iolen += pwrite64(vp->v_fd, (char *)addr + split,
-		    len - split, offset + split);
-	}
-
-	if (iolen == -1)
-		return (errno);
-	if (residp)
-		*residp = len - iolen;
-	else if (iolen != len)
-		return (EIO);
-	return (0);
-}
-
-void
-vn_close(vnode_t *vp)
-{
-	close(vp->v_fd);
-	spa_strfree(vp->v_path);
-	umem_free(vp, sizeof (vnode_t));
-}
-
-#ifdef ZFS_DEBUG
-
-/*
- * =========================================================================
- * Figure out which debugging statements to print
- * =========================================================================
- */
-
-static char *dprintf_string;
-static int dprintf_print_all;
-
-int
-dprintf_find_string(const char *string)
-{
-	char *tmp_str = dprintf_string;
-	int len = strlen(string);
-
-	/*
-	 * Find out if this is a string we want to print.
-	 * String format: file1.c,function_name1,file2.c,file3.c
-	 */
-
-	while (tmp_str != NULL) {
-		if (strncmp(tmp_str, string, len) == 0 &&
-		    (tmp_str[len] == ',' || tmp_str[len] == '\0'))
-			return (1);
-		tmp_str = strchr(tmp_str, ',');
-		if (tmp_str != NULL)
-			tmp_str++; /* Get rid of , */
-	}
-	return (0);
-}
-
-void
-dprintf_setup(int *argc, char **argv)
-{
-	int i, j;
-
-	/*
-	 * Debugging can be specified two ways: by setting the
-	 * environment variable ZFS_DEBUG, or by including a
-	 * "debug=..."  argument on the command line.  The command
-	 * line setting overrides the environment variable.
-	 */
-
-	for (i = 1; i < *argc; i++) {
-		int len = strlen("debug=");
-		/* First look for a command line argument */
-		if (strncmp("debug=", argv[i], len) == 0) {
-			dprintf_string = argv[i] + len;
-			/* Remove from args */
-			for (j = i; j < *argc; j++)
-				argv[j] = argv[j+1];
-			argv[j] = NULL;
-			(*argc)--;
-		}
-	}
-
-	if (dprintf_string == NULL) {
-		/* Look for ZFS_DEBUG environment variable */
-		dprintf_string = getenv("ZFS_DEBUG");
-	}
-
-	/*
-	 * Are we just turning on all debugging?
-	 */
-	if (dprintf_find_string("on"))
-		dprintf_print_all = 1;
-}
-
-/*
- * =========================================================================
- * debug printfs
- * =========================================================================
- */
-void
-__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
-{
-	const char *newfile;
-	va_list adx;
-
-	/*
-	 * Get rid of annoying "../common/" prefix to filename.
-	 */
-	newfile = strrchr(file, '/');
-	if (newfile != NULL) {
-		newfile = newfile + 1; /* Get rid of leading / */
-	} else {
-		newfile = file;
-	}
-
-	if (dprintf_print_all ||
-	    dprintf_find_string(newfile) ||
-	    dprintf_find_string(func)) {
-		/* Print out just the function name if requested */
-		flockfile(stdout);
-		if (dprintf_find_string("pid"))
-			(void) printf("%d ", getpid());
-		if (dprintf_find_string("tid"))
-			(void) printf("%u ", thr_self());
-#if 0
-		if (dprintf_find_string("cpu"))
-			(void) printf("%u ", getcpuid());
-#endif
-		if (dprintf_find_string("time"))
-			(void) printf("%llu ", gethrtime());
-		if (dprintf_find_string("long"))
-			(void) printf("%s, line %d: ", newfile, line);
-		(void) printf("%s: ", func);
-		va_start(adx, fmt);
-		(void) vprintf(fmt, adx);
-		va_end(adx);
-		funlockfile(stdout);
-	}
-}
-
-#endif /* ZFS_DEBUG */
-
-/*
- * =========================================================================
- * cmn_err() and panic()
- * =========================================================================
- */
-static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
-static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
-
-void
-vpanic(const char *fmt, va_list adx)
-{
-	(void) fprintf(stderr, "error: ");
-	(void) vfprintf(stderr, fmt, adx);
-	(void) fprintf(stderr, "\n");
-
-	abort();	/* think of it as a "user-level crash dump" */
-}
-
-void
-panic(const char *fmt, ...)
-{
-	va_list adx;
-
-	va_start(adx, fmt);
-	vpanic(fmt, adx);
-	va_end(adx);
-}
-
-void
-vcmn_err(int ce, const char *fmt, va_list adx)
-{
-	if (ce == CE_PANIC)
-		vpanic(fmt, adx);
-	if (ce != CE_NOTE) {	/* suppress noise in userland stress testing */
-		(void) fprintf(stderr, "%s", ce_prefix[ce]);
-		(void) vfprintf(stderr, fmt, adx);
-		(void) fprintf(stderr, "%s", ce_suffix[ce]);
-	}
-}
-
-/*PRINTFLIKE2*/
-void
-cmn_err(int ce, const char *fmt, ...)
-{
-	va_list adx;
-
-	va_start(adx, fmt);
-	vcmn_err(ce, fmt, adx);
-	va_end(adx);
-}
-
-/*
- * =========================================================================
- * kobj interfaces
- * =========================================================================
- */
-struct _buf *
-kobj_open_file(char *name)
-{
-	struct _buf *file;
-	vnode_t *vp;
-
-	/* set vp as the _fd field of the file */
-	if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, 0) != 0)
-		return ((void *)-1UL);
-
-	file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
-	file->_fd = (intptr_t)vp;
-	return (file);
-}
-
-int
-kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
-{
-	ssize_t resid;
-
-	vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
-	    UIO_SYSSPACE, 0, 0, 0, &resid);
-
-	return (size - resid);
-}
-
-void
-kobj_close_file(struct _buf *file)
-{
-	vn_close((vnode_t *)file->_fd);
-	umem_free(file, sizeof (struct _buf));
-}
-
-int
-kobj_get_filesize(struct _buf *file, uint64_t *size)
-{
-	struct stat64 st;
-	vnode_t *vp = (vnode_t *)file->_fd;
-
-	if (fstat64(vp->v_fd, &st) == -1) {
-		vn_close(vp);
-		return (errno);
-	}
-	*size = st.st_size;
-	return (0);
-}
-
-/*
- * =========================================================================
- * misc routines
- * =========================================================================
- */
-
-void
-xdelay(clock_t ticks)
-{
-	poll(0, 0, ticks * (1000 / hz));
-}
-
-#if 0
-/*
- * Find highest one bit set.
- *	Returns bit number + 1 of highest bit that is set, otherwise returns 0.
- * High order bit is 31 (or 63 in _LP64 kernel).
- */
-int
-highbit(ulong_t i)
-{
-	register int h = 1;
-
-	if (i == 0)
-		return (0);
-#ifdef _LP64
-	if (i & 0xffffffff00000000ul) {
-		h += 32; i >>= 32;
-	}
-#endif
-	if (i & 0xffff0000) {
-		h += 16; i >>= 16;
-	}
-	if (i & 0xff00) {
-		h += 8; i >>= 8;
-	}
-	if (i & 0xf0) {
-		h += 4; i >>= 4;
-	}
-	if (i & 0xc) {
-		h += 2; i >>= 2;
-	}
-	if (i & 0x2) {
-		h += 1;
-	}
-	return (h);
-}
-#endif
-
-static int
-random_get_bytes_common(uint8_t *ptr, size_t len, char *devname)
-{
-	int fd = open(devname, O_RDONLY);
-	size_t resid = len;
-	ssize_t bytes;
-
-	ASSERT(fd != -1);
-
-	while (resid != 0) {
-		bytes = read(fd, ptr, resid);
-		ASSERT(bytes >= 0);
-		ptr += bytes;
-		resid -= bytes;
-	}
-
-	close(fd);
-
-	return (0);
-}
-
-int
-random_get_bytes(uint8_t *ptr, size_t len)
-{
-	return (random_get_bytes_common(ptr, len, "/dev/random"));
-}
-
-int
-random_get_pseudo_bytes(uint8_t *ptr, size_t len)
-{
-	return (random_get_bytes_common(ptr, len, "/dev/urandom"));
-}
-
-int
-ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
-{
-	char *end;
-
-	*result = strtoul(hw_serial, &end, base);
-	if (*result == 0)
-		return (errno);
-	return (0);
-}
-
-/*
- * =========================================================================
- * kernel emulation setup & teardown
- * =========================================================================
- */
-static int
-umem_out_of_memory(void)
-{
-	char errmsg[] = "out of memory -- generating core dump\n";
-
-	write(fileno(stderr), errmsg, sizeof (errmsg));
-	abort();
-	return (0);
-}
-
-void
-kernel_init(int mode)
-{
-	umem_nofail_callback(umem_out_of_memory);
-	uint64_t physmem;
-	size_t len = sizeof(physmem);
-	static int mib[2] = { CTL_HW, HW_USERMEM64 };
-
-	if (sysctl(mib, sizeof(mib), &physmem, &len, NULL, 0) != 0) {
-		len = 1048576 * 128;
-	}
-
-	pgsize = sysconf(_SC_PAGE_SIZE);
-	dprintf("physmem = %llu pages (%.2f GB)\n",
-	    physmem / pgsize, (double)physmem / (1ULL << 30));
-
-	snprintf(hw_serial, sizeof (hw_serial), "%ld", gethostid());
-
-	system_taskq_init();
-
-	spa_init(mode);
-}
-
-void
-kernel_fini(void)
-{
-	spa_fini();
-}
-
-int
-z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen)
-{
-	int ret;
-	uLongf len = *dstlen;
-
-	if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK)
-		*dstlen = (size_t)len;
-
-	return (ret);
-}
-
-int
-z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen,
-    int level)
-{
-	int ret;
-	uLongf len = *dstlen;
-
-	if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK)
-		*dstlen = (size_t)len;
-
-	return (ret);
-}
-
-uid_t
-crgetuid(cred_t *cr)
-{
-	return (0);
-}
-
-gid_t
-crgetgid(cred_t *cr)
-{
-	return (0);
-}
-
-int
-crgetngroups(cred_t *cr)
-{
-	return (0);
-}
-
-gid_t *
-crgetgroups(cred_t *cr)
-{
-	return (NULL);
-}
-
-int
-zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
-{
-	return (0);
-}
-
-int
-zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
-{
-	return (0);
-}
-
-int
-zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
-{
-	return (0);
-}
-
-ksiddomain_t *
-ksid_lookupdomain(const char *dom)
-{
-	ksiddomain_t *kd;
-
-	kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL);
-	kd->kd_name = spa_strdup(dom);
-	return (kd);
-}
-
-void
-ksiddomain_rele(ksiddomain_t *ksid)
-{
-	spa_strfree(ksid->kd_name);
-	umem_free(ksid, sizeof (ksiddomain_t));
-}
-
-size_t
-ptob(size_t npg)
-{
-
-	return npg * pgsize;
-}
-
-void
-print_timestamp(int fmt)
-{
-
-	return;
-}
-
-/*
- * Do not change the length of the returned string; it must be freed
- * with strfree().
- */
-char *
-kmem_asprintf(const char *fmt, ...)
-{
-	int size;
-	va_list adx;
-	char *buf;
-
-	va_start(adx, fmt);
-	size = vsnprintf(NULL, 0, fmt, adx) + 1;
-	va_end(adx);
-
-	buf = kmem_alloc(size, KM_SLEEP);
-
-	va_start(adx, fmt);
-	size = vsnprintf(buf, size, fmt, adx);
-	va_end(adx);
-
-	return (buf);
-}
Index: src/external/cddl/osnet/sbin/zfs/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sbin/zfs/Makefile,v
retrieving revision 1.3
diff -u -p -r1.3 Makefile
--- src/external/cddl/osnet/sbin/zfs/Makefile	28 Feb 2010 14:45:47 -0000	1.3
+++ src/external/cddl/osnet/sbin/zfs/Makefile	22 Apr 2017 08:52:02 -0000
@@ -18,6 +18,9 @@ LDADD+=         -L${LIBUUTIL_OBJDIR} -lu
 LIBZFS_OBJDIR!=  cd ${LIBZFS_SRCDIR} && ${PRINTOBJDIR}
 LDADD+=         -L${LIBZFS_OBJDIR} -lzfs
 
+LIBZFS_CORE_OBJDIR!=  cd ${LIBZFS_CORE_SRCDIR} && ${PRINTOBJDIR}
+LDADD+=         -L${LIBZFS_CORE_OBJDIR} -lzfs_core
+
 LIBUMEM_OBJDIR!=  cd ${LIBUMEM_SRCDIR} && ${PRINTOBJDIR}
 LDADD+=         -L${LIBUMEM_OBJDIR} -lumem
 
Index: src/external/cddl/osnet/sbin/zpool/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sbin/zpool/Makefile,v
retrieving revision 1.5
diff -u -p -r1.5 Makefile
--- src/external/cddl/osnet/sbin/zpool/Makefile	10 Apr 2015 20:34:16 -0000	1.5
+++ src/external/cddl/osnet/sbin/zpool/Makefile	22 Apr 2017 08:52:50 -0000
@@ -20,6 +20,9 @@ LDADD+=         -L${LIBNVPAIR_OBJDIR} -l
 LIBUUTIL_OBJDIR!=  cd ${LIBUUTIL_SRCDIR} && ${PRINTOBJDIR}
 LDADD+=         -L${LIBUUTIL_OBJDIR} -luutil
 
+LIBZFS_CORE_OBJDIR!=  cd ${LIBZFS_CORE_SRCDIR} && ${PRINTOBJDIR}
+LDADD+=         -L${LIBZFS_CORE_OBJDIR} -lzfs_core
+
 LIBZFS_OBJDIR!=  cd ${LIBZFS_SRCDIR} && ${PRINTOBJDIR}
 LDADD+=         -L${LIBZFS_OBJDIR} -lzfs
 
Index: src/external/cddl/osnet/sys/kern/ddi.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/kern/ddi.c,v
retrieving revision 1.5
diff -u -p -r1.5 ddi.c
--- src/external/cddl/osnet/sys/kern/ddi.c	10 Sep 2015 19:56:13 -0000	1.5
+++ src/external/cddl/osnet/sys/kern/ddi.c	10 Jun 2017 05:28:56 -0000
@@ -28,10 +28,34 @@
 /*	Copyright (c) 1988 AT&T	*/
 /*	All Rights Reserved */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+/*-
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
 
 #include <sys/types.h>
-#include <sys/ddi.h>
+#include <sys/sunddi.h>
 #include <sys/debug.h>
 #include <sys/errno.h>
 #include <sys/param.h>
@@ -187,6 +211,18 @@ overflow:
 	return (ERANGE);
 }
 
+int
+ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result)
+{
+
+	*result = (unsigned long long)strtoull(str, nptr, base);
+	if (*result == 0)
+		return (EINVAL);
+	else if (*result == ULLONG_MAX)
+		return (ERANGE);
+	return (0);
+}
+
 /*
  * Find first bit set in a mask (returned counting from 1 up)
  */
@@ -570,6 +606,7 @@ ddi_remove_minor_node(dev_info_t *dip, c
 	PNBUF_PUT(pn);
 }
 
+#if 0
 clock_t
 ddi_get_lbolt()
 {
@@ -583,3 +620,4 @@ ddi_get_lbolt64()
 
 	return hardclock_ticks;
 }
+#endif
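
Note: the new ddi_strtoull() follows the FreeBSD shim, including its quirk that a cleanly parsed 0 still reports EINVAL and a clamped ULLONG_MAX reports ERANGE, so the return value alone cannot distinguish a literal "0" from a parse failure. A hypothetical caller that wants to accept "0" has to special-case it, e.g.:

	#include <sys/errno.h>
	#include <sys/sunddi.h>

	static int
	parse_count(const char *str, unsigned long long *vp)
	{
		char *ep;
		int error;

		error = ddi_strtoull(str, &ep, 10, vp);
		/* Accept a literal "0", which ddi_strtoull() rejects. */
		if (error == EINVAL && str[0] == '0' && str[1] == '\0')
			error = 0;
		return (error);
	}
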
Index: src/external/cddl/osnet/sys/kern/fm.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/kern/fm.c,v
retrieving revision 1.1
diff -u -p -r1.1 fm.c
--- src/external/cddl/osnet/sys/kern/fm.c	7 Aug 2009 20:57:57 -0000	1.1
+++ src/external/cddl/osnet/sys/kern/fm.c	7 May 2017 00:34:49 -0000
@@ -359,6 +359,7 @@ fm_ereport_post(nvlist_t *ereport, int e
 	size_t nvl_size = 0;
 	evchan_t *error_chan;
 
+#if 0
 	(void) nvlist_size(ereport, &nvl_size, NV_ENCODE_NATIVE);
 	if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) {
 		printf("fm_ereport_post: dropped report\n");
@@ -367,7 +368,7 @@ fm_ereport_post(nvlist_t *ereport, int e
 
 	fm_banner();
 	fm_nvprint(ereport);
-
+#endif
 }
 
 /*
Index: src/external/cddl/osnet/sys/kern/kmem.c
===================================================================
RCS file: src/external/cddl/osnet/sys/kern/kmem.c
diff -N src/external/cddl/osnet/sys/kern/kmem.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/sys/kern/kmem.c	17 Jun 2017 02:32:18 -0000
@@ -0,0 +1,67 @@
+/*	$NetBSD$	*/
+
+/*-
+ * Copyright (c) 2017 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/kmem.h>
+
+#undef kmem_alloc
+#undef kmem_zalloc
+#undef kmem_free
+
+/*
+ * Solaris allows allocating zero bytes (which returns NULL)
+ * and freeing a NULL pointer (which does nothing),
+ * so allow that here with wrappers around the native functions
+ * (which do not allow those things).
+ */
+
+void *
+solaris_kmem_alloc(size_t size, int flags)
+{
+
+	if (size == 0)
+		return NULL;
+	return kmem_alloc(size, flags);
+}
+
+void *
+solaris_kmem_zalloc(size_t size, int flags)
+{
+
+	if (size == 0)
+		return NULL;
+	return kmem_zalloc(size, flags);
+}
+
+void
+solaris_kmem_free(void *buf, size_t size)
+{
+
+	if (buf == NULL)
+		return;
+	kmem_free(buf, size);
+}
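
Aside, to make the wrapper semantics concrete: a user-space analogue with
calloc/free standing in for the native kernel allocator (illustrative
only, not part of the patch):

#include <stdlib.h>

/* user-space analogues of the wrappers, for illustration only */
static void *
solaris_kmem_zalloc(size_t size, int flags)
{

	(void)flags;	/* KM_* flags are meaningless in user space */
	return (size == 0) ? NULL : calloc(1, size);
}

static void
solaris_kmem_free(void *buf, size_t size)
{

	(void)size;
	if (buf != NULL)
		free(buf);
}

int
main(void)
{
	/* Solaris-style callers may allocate zero bytes and free NULL */
	void *p = solaris_kmem_zalloc(0, 0);	/* returns NULL, no panic */

	solaris_kmem_free(p, 0);		/* NULL free is a no-op */
	return 0;
}
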
Index: src/external/cddl/osnet/sys/kern/kobj.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/kern/kobj.c,v
retrieving revision 1.4
diff -u -p -r1.4 kobj.c
--- src/external/cddl/osnet/sys/kern/kobj.c	11 Oct 2012 08:01:23 -0000	1.4
+++ src/external/cddl/osnet/sys/kern/kobj.c	9 Jun 2017 22:11:53 -0000
@@ -56,7 +56,7 @@
  */
 
 #include <sys/cdefs.h>
-/* __FBSDID("$FreeBSD: src/sys/compat/opensolaris/kern/opensolaris_kobj.c,v 1.4 2007/05/31 11:51:49 kib Exp $"); */
+/* __FBSDID("$FreeBSD: head/sys/cddl/compat/opensolaris/kern/opensolaris_kobj.c 285391 2015-07-11 16:22:48Z mjg $"); */
 __KERNEL_RCSID(0, "$NetBSD: kobj.c,v 1.4 2012/10/11 08:01:23 njoly Exp $");
 
 #include <sys/types.h>
@@ -122,7 +122,7 @@ kobj_get_filesize_vnode(struct _buf *fil
 	struct vattr va;
 	int error;
 
-	error = VOP_GETATTR(file->ptr, &va, 0, kauth_cred_get(), NULL);
+	error = VOP_GETATTR(file->ptr, &va, 0, kcred, NULL);
 	if (error == 0)
 		*size = (uint64_t)va.va_size;
 	return (error);
@@ -142,7 +142,7 @@ kobj_read_file_vnode(struct _buf *file, 
 	size_t resid;
 
 	error = vn_rdwr(UIO_READ, file->ptr, buf, size, off, UIO_SYSSPACE, 0,
-	    RLIM64_INFINITY, kauth_cred_get(), &resid);
+	    RLIM64_INFINITY, kcred, &resid);
 	return (error != 0 ? -1 : size - resid);
 }
 
@@ -158,7 +158,7 @@ kobj_close_file(struct _buf *file)
 {
 
 	if (file->mounted) {
-		vn_close(file->ptr, FREAD, kauth_cred_get());
+		vn_close(file->ptr, FREAD, kcred);
 	}
 	kmem_free(file, sizeof(*file));
 }
Index: src/external/cddl/osnet/sys/kern/kstat.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/kern/kstat.c,v
retrieving revision 1.2
diff -u -p -r1.2 kstat.c
--- src/external/cddl/osnet/sys/kern/kstat.c	19 Jun 2012 21:25:26 -0000	1.2
+++ src/external/cddl/osnet/sys/kern/kstat.c	23 Apr 2017 18:01:19 -0000
@@ -126,3 +126,19 @@ kstat_delete(kstat_t *ks)
 	sysctl_teardown(&ks->ks_clog);
 	kmem_free(ks, sizeof(*ks));
 }
+
+void
+kstat_set_string(char *dst, const char *src)
+{
+
+	memset(dst, 0, KSTAT_STRLEN);
+	(void) strncpy(dst, src, KSTAT_STRLEN - 1);
+}
+
+void
+kstat_named_init(kstat_named_t *knp, const char *name, uchar_t data_type)
+{
+
+	kstat_set_string(knp->name, name);
+	knp->data_type = data_type;
+}
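
To make the truncation behaviour concrete, a user-space sketch of
kstat_set_string(): zero-filling before the bounded copy guarantees the
result is NUL-terminated even when the source is too long.  The
KSTAT_STRLEN value below is illustrative; the real one comes from
<sys/kstat.h>:

#include <stdio.h>
#include <string.h>

#define	KSTAT_STRLEN	31		/* illustrative value */

static void
kstat_set_string(char *dst, const char *src)
{

	/* zero-fill first so the copy is always NUL-terminated */
	memset(dst, 0, KSTAT_STRLEN);
	(void) strncpy(dst, src, KSTAT_STRLEN - 1);
}

int
main(void)
{
	char name[KSTAT_STRLEN];

	kstat_set_string(name, "a-name-much-longer-than-the-field-allows");
	printf("%s\n", name);	/* truncated, but still NUL-terminated */
	return 0;
}
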
Index: src/external/cddl/osnet/sys/kern/misc.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/kern/misc.c,v
retrieving revision 1.4
diff -u -p -r1.4 misc.c
--- src/external/cddl/osnet/sys/kern/misc.c	5 Jun 2012 22:51:47 -0000	1.4
+++ src/external/cddl/osnet/sys/kern/misc.c	4 Jul 2017 00:32:44 -0000
@@ -56,7 +56,8 @@
  */
 
 #include <sys/cdefs.h>
-/* __FBSDID("$FreeBSD: src/sys/compat/opensolaris/kern/opensolaris_misc.c,v 1.2 2007/04/23 00:52:06 pjd Exp $"); */
+/* __FBSDID("$FreeBSD: head/sys/cddl/compat/opensolaris/kern/opensolaris_misc.c 219089 2011-02-27 19:41:40Z pjd $"); */
+
 
 #include <sys/mount.h>
 #include <sys/param.h>
@@ -72,9 +73,18 @@
 
 char hw_serial[11] = "0";
 
-struct utsname utsname = {
-	.nodename = "XXXNETBSD"
-};
+struct utsname utsname;
+
+void
+opensolaris_utsname_init(void)
+{
+
+	strlcpy(utsname.sysname, ostype, sizeof(utsname.sysname));
+	strlcpy(utsname.nodename, hostname, sizeof(utsname.nodename));
+	strlcpy(utsname.release, osrelease, sizeof(utsname.release));
+	strlcpy(utsname.version, "OpenSolaris", sizeof(utsname.version));
+	strlcpy(utsname.machine, machine, sizeof(utsname.machine));
+}
 
 int
 vn_is_readonly(vnode_t *vp)
@@ -114,21 +124,6 @@ thread_join(uint64_t kid)
 	return;
 }
 
-int
-newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
-    pid_t pid)
-{
-	int error;
-
-	ASSERT(cid == PRI_NONE);
-	
-	error = kthread_create(pri, KTHREAD_MPSAFE, NULL,
-	    pc, arg, NULL, "zfs_proc");
-	KASSERT(error == 0);
-
-	return 0;
-}
-
 void
 kmem_reap(void)
 {
@@ -138,6 +133,7 @@ kmem_reap(void)
 	bufcnt = uvmexp.freetarg - uvmexp.free;
 	if (bufcnt < 0)
 		bufcnt = 0;
+
 	/*
 	 * kill unused metadata buffers.
 	 */
@@ -149,7 +145,6 @@ kmem_reap(void)
 	 * drain the pools.
 	 */
 	pool_drain(&pp);
-//	printf("XXXNETBSD kmem_reap called, write me\n");
 }
 
 /*
@@ -180,3 +175,10 @@ xva_getxoptattr(xvattr_t *xvap)
 		xoap = &xvap->xva_xoptattrs;
 	return (xoap);
 }
+
+uint64_t
+kmem_size(void)
+{
+
+	return (uint64_t)1 << 31;
+}
Index: src/external/cddl/osnet/sys/kern/mod.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/kern/mod.c,v
retrieving revision 1.2
diff -u -p -r1.2 mod.c
--- src/external/cddl/osnet/sys/kern/mod.c	12 Mar 2010 21:37:37 -0000	1.2
+++ src/external/cddl/osnet/sys/kern/mod.c	4 Jul 2017 00:33:25 -0000
@@ -35,6 +35,7 @@ __KERNEL_RCSID(0, "$NetBSD: mod.c,v 1.2 
 #include <sys/kthread.h>
 #include <sys/callb.h>
 
+void opensolaris_utsname_init(void);
 void opensolaris_init(void *);
 void opensolaris_fini(void *);
 
@@ -48,6 +49,7 @@ solaris_modcmd(modcmd_t cmd, void *arg)
 
 	switch (cmd) {
 	case MODULE_CMD_INIT:
+		opensolaris_utsname_init();
 		callb_init(NULL);
 		taskq_init();
 		opensolaris_init(NULL);
Index: src/external/cddl/osnet/sys/kern/opensolaris.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/kern/opensolaris.c,v
retrieving revision 1.1
diff -u -p -r1.1 opensolaris.c
--- src/external/cddl/osnet/sys/kern/opensolaris.c	12 Mar 2010 21:37:37 -0000	1.1
+++ src/external/cddl/osnet/sys/kern/opensolaris.c	9 Jun 2017 22:06:31 -0000
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/cddl/compat/opensolaris/kern/opensolaris.c,v 1.4.2.2 2009/08/13 13:56:05 trasz Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/kern/opensolaris.c 222670 2011-06-04 07:02:06Z avg $
  *
  */
 
Index: src/external/cddl/osnet/sys/kern/policy.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/kern/policy.c,v
retrieving revision 1.6
diff -u -p -r1.6 policy.c
--- src/external/cddl/osnet/sys/kern/policy.c	19 Oct 2012 22:19:15 -0000	1.6
+++ src/external/cddl/osnet/sys/kern/policy.c	16 May 2017 01:13:35 -0000
@@ -88,28 +88,35 @@
 #include <sys/policy.h>
 
 int
-secpolicy_zfs(kauth_cred_t cred)
+secpolicy_nfs(cred_t *cr)
+{
+
+	return kauth_authorize_generic(cr, KAUTH_GENERIC_ISSUSER, NULL);
+}
+
+int
+secpolicy_zfs(cred_t *cred)
 {
 
 	return kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL);
 }
 
 int
-secpolicy_sys_config(kauth_cred_t cred, int checkonly __unused)
+secpolicy_sys_config(cred_t *cred, int checkonly __unused)
 {
 
 	return kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL);
 }
 
 int
-secpolicy_zinject(kauth_cred_t cred)
+secpolicy_zinject(cred_t *cred)
 {
 
 	return kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL);
 }
 
 int
-secpolicy_fs_mount(kauth_cred_t cred, struct vnode *mvp, struct mount *vfsp)
+secpolicy_fs_mount(cred_t *cred, vnode_t *mvp, struct mount *vfsp)
 {
 
 	return kauth_authorize_system(cred, KAUTH_SYSTEM_MOUNT,
@@ -117,32 +124,39 @@ secpolicy_fs_mount(kauth_cred_t cred, st
 }
 
 int
-secpolicy_fs_unmount(kauth_cred_t cred, struct mount *vfsp)
+secpolicy_fs_unmount(cred_t *cred, struct mount *vfsp)
 {
 
 	return kauth_authorize_system(cred, KAUTH_SYSTEM_MOUNT,
 	    KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, vfsp, NULL, NULL);
 }
 
+int
+secpolicy_fs_owner(struct mount *mp, cred_t *cr)
+{
+
+	return (EPERM);
+}
+
 /*
  * This check is done in kern_link(), so we could just return 0 here.
  */
 int
-secpolicy_basic_link(kauth_cred_t cred)
+secpolicy_basic_link(vnode_t *vp, cred_t *cred)
 {
 
 	return kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL);
 }
 
 int
-secpolicy_vnode_stky_modify(kauth_cred_t cred)
+secpolicy_vnode_stky_modify(cred_t *cred)
 {
 
 	return kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL);
 }
 
 int
-secpolicy_vnode_remove(kauth_cred_t cred)
+secpolicy_vnode_remove(vnode_t *vp, cred_t *cred)
 {
 
 	return kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL);
@@ -150,39 +164,67 @@ secpolicy_vnode_remove(kauth_cred_t cred
 
 
 int
-secpolicy_vnode_owner(kauth_cred_t cred, uid_t owner)
+secpolicy_vnode_owner(vnode_t *vp, cred_t *cred, uid_t owner)
 {
 
 	if (owner == kauth_cred_getuid(cred))
 		return (0);
 
+	if (secpolicy_fs_owner(vp->v_mount, cred) == 0)
+		return (0);
+
 	return kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL);
 }
 
 int
-secpolicy_vnode_access(kauth_cred_t cred, struct vnode *vp, uid_t owner,
+secpolicy_vnode_access(cred_t *cred, vnode_t *vp, uid_t owner,
     int mode)
 {
 
 	return kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL);
 }
 
+/*
+ * Like secpolicy_vnode_access() but we get the actual wanted mode and the
+ * current mode of the file, not the missing bits.
+ */
 int
-secpolicy_xvattr(xvattr_t *xvap, uid_t owner, kauth_cred_t cred, vtype_t vtype)
+secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner,
+    accmode_t curmode, accmode_t wantmode)
+{
+	accmode_t mode;
+
+	mode = ~curmode & wantmode;
+
+	if (mode == 0)
+		return (0);
+
+	return (secpolicy_vnode_access(cr, vp, owner, mode));
+}
+
+int
+secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner)
+{
+
+	return kauth_authorize_generic(cr, KAUTH_GENERIC_ISSUSER, NULL);
+}
+
+int
+secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cred, vtype_t vtype)
 {
 
 	return kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL);
 }
 
 int
-secpolicy_vnode_setid_retain(kauth_cred_t cred, boolean_t issuidroot __unused)
+secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cred, boolean_t issuidroot __unused)
 {
 
 	return kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL);
 }
 
 int
-secpolicy_vnode_setids_setgids(kauth_cred_t cred, gid_t gid)
+secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cred, gid_t gid)
 {
 
 	if (groupmember(gid, cred))
@@ -192,28 +234,28 @@ secpolicy_vnode_setids_setgids(kauth_cre
 }
 
 int
-secpolicy_vnode_chown(kauth_cred_t cred, boolean_t check_self)
+secpolicy_vnode_chown(vnode_t *vp, cred_t *cred, uid_t owner)
 {
 
 	return kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL);
 }
 
 int
-secpolicy_vnode_create_gid(kauth_cred_t cred)
+secpolicy_vnode_create_gid(cred_t *cred)
 {
 
 	return kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL);
 }
 
 int
-secpolicy_vnode_utime_modify(kauth_cred_t cred)
+secpolicy_vnode_utime_modify(cred_t *cred)
 {
 
 	return kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL);
 }
 
 int
-secpolicy_vnode_setdac(kauth_cred_t cred, uid_t owner)
+secpolicy_vnode_setdac(vnode_t *vp, cred_t *cred, uid_t owner)
 {
 
 	if (owner == kauth_cred_getuid(cred))
@@ -223,8 +265,8 @@ secpolicy_vnode_setdac(kauth_cred_t cred
 }
 
 int
-secpolicy_setid_setsticky_clear(struct vnode *vp, struct vattr *vap,
-    const struct vattr *ovap, kauth_cred_t cred)
+secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap,
+    const struct vattr *ovap, cred_t *cred)
 {
 	/*
 	 * Privileged processes may set the sticky bit on non-directories,
@@ -240,7 +282,7 @@ secpolicy_setid_setsticky_clear(struct v
 	 * group-id bit.
 	 */
 	if ((vap->va_mode & S_ISGID) != 0)
-		return (secpolicy_vnode_setids_setgids(cred, ovap->va_gid));
+		return (secpolicy_vnode_setids_setgids(vp, cred, ovap->va_gid));
 
 	return (0);
 }
@@ -251,9 +293,9 @@ secpolicy_setid_setsticky_clear(struct v
  * policy.c rather than somewhere in vnode.c or something.
  */
 int
-secpolicy_vnode_setattr(kauth_cred_t cred, struct vnode *vp, struct vattr *vap,
+secpolicy_vnode_setattr(cred_t *cred, vnode_t *vp, struct vattr *vap,
     const struct vattr *ovap, int flags,
-    int unlocked_access(void *, int, kauth_cred_t), void *node)
+    int unlocked_access(void *, int, cred_t *), void *node)
 {
 	int mask = vap->va_mask;
 	int error = 0;
@@ -285,7 +327,7 @@ secpolicy_vnode_setattr(kauth_cred_t cre
 		 * In the specific case of creating a set-uid root
 		 * file, we need even more permissions.
 		 */
-		if ((error = secpolicy_vnode_setdac(cred, ovap->va_uid)) != 0)
+		if ((error = secpolicy_vnode_setdac(vp, cred, ovap->va_uid)) != 0)
 			goto out;
 
 		if ((error = secpolicy_setid_setsticky_clear(vp, vap,
@@ -326,7 +368,7 @@ secpolicy_vnode_setattr(kauth_cred_t cre
 		 * If necessary, check privilege to see if update can be done.
 		 */
 		if (checkpriv &&
-		    (error = secpolicy_vnode_chown(cred, ovap->va_uid)) != 0) {
+		    (error = secpolicy_vnode_chown(vp, cred, ovap->va_uid)) != 0) {
 			goto out;
 		}
 
@@ -334,7 +376,7 @@ secpolicy_vnode_setattr(kauth_cred_t cre
 		 * If the file has either the set UID or set GID bits
 		 * set and the caller can set the bits, then leave them.
 		 */
-		secpolicy_setid_clear(vap, cred);
+		secpolicy_setid_clear(vap, vp, cred);
 	}
 	if (mask & (AT_ATIME|AT_MTIME)) {
 		/*
@@ -362,14 +404,14 @@ secpolicy_vnode_setattr(kauth_cred_t cre
 	 * Check for optional attributes here by checking the following:
 	 */
 	if (mask & AT_XVATTR)
-		error = secpolicy_xvattr((xvattr_t *)vap, ovap->va_uid, cred,
-		    vp->v_type);
+		error = secpolicy_xvattr(vp, (xvattr_t *)vap, ovap->va_uid,
+		    cred, vp->v_type);
 out:
 	return (error);
 }
 
 void
-secpolicy_setid_clear(struct vattr *vap, kauth_cred_t cred)
+secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cred)
 {
 	if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL) != 0)
 		return;
@@ -382,116 +424,16 @@ secpolicy_setid_clear(struct vattr *vap,
 	return;
 }
 
-#ifdef notyet
-int
-secpolicy_vnode_setdac(kauth_cred_t cred, uid_t owner)
-{
-
-	if (owner == cred->cr_uid)
-		return (0);
-	return (priv_check_cred(cred, PRIV_VFS_ADMIN, 0));
-}
-
-int
-secpolicy_vnode_setattr(kauth_cred_t cred, struct vnode *vp, struct vattr *vap,
-    const struct vattr *ovap, int flags,
-    int unlocked_access(void *, int, kauth_cred_t), void *node)
-{
-	int mask = vap->va_mask;
-	int error;
-
-	if (mask & AT_SIZE) {
-		if (vp->v_type == VDIR)
-			return (EISDIR);
-		error = unlocked_access(node, VWRITE, cred);
-		if (error)
-			return (error);
-	}
-	if (mask & AT_MODE) {
-		/*
-		 * If not the owner of the file then check privilege
-		 * for two things: the privilege to set the mode at all
-		 * and, if we're setting setuid, we also need permissions
-		 * to add the set-uid bit, if we're not the owner.
-		 * In the specific case of creating a set-uid root
-		 * file, we need even more permissions.
-		 */
-		error = secpolicy_vnode_setdac(cred, ovap->va_uid);
-		if (error)
-			return (error);
-		error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cred);
-		if (error)
-			return (error);
-	} else {
-		vap->va_mode = ovap->va_mode;
-	}
-	if (mask & (AT_UID | AT_GID)) {
-		error = secpolicy_vnode_setdac(cred, ovap->va_uid);
-		if (error)
-			return (error);
-
-		/*
-		 * To change the owner of a file, or change the group of a file to a
-		 * group of which we are not a member, the caller must have
-		 * privilege.
-		 */
-		if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
-		    ((mask & AT_GID) && vap->va_gid != ovap->va_gid &&
-		     !groupmember(vap->va_gid, cred))) {
-			error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0);
-			if (error)
-				return (error);
-		}
-
-		if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
-		    ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) {
-			secpolicy_setid_clear(vap, cred);
-		}
-	}
-	if (mask & (AT_ATIME | AT_MTIME)) {
-		/*
-		 * From utimes(2):
-		 * If times is NULL, ... The caller must be the owner of
-		 * the file, have permission to write the file, or be the
-		 * super-user.
-		 * If times is non-NULL, ... The caller must be the owner of
-		 * the file or be the super-user.
-		 */
-		error = secpolicy_vnode_setdac(cred, ovap->va_uid);
-		if (error && (vap->va_vaflags & VA_UTIMES_NULL))
-			error = unlocked_access(node, VWRITE, cred);
-		if (error)
-			return (error);
-	}
-	return (0);
-}
-
 int
-secpolicy_vnode_create_gid(kauth_cred_t cred)
+secpolicy_smb(cred_t *cr)
 {
 
-	return (EPERM);
-}
-
-int
-secpolicy_vnode_setid_retain(kauth_cred_t cred, boolean_t issuidroot __unused)
-{
-
-	return (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0));
+	return kauth_authorize_generic(cr, KAUTH_GENERIC_ISSUSER, NULL);
 }
 
 void
-secpolicy_setid_clear(struct vattr *vap, kauth_cred_t cred)
+secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp)
 {
 
-	if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL))
-		return;
-
-	if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) {
-		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) {
-			vap->va_mask |= AT_MODE;
-			vap->va_mode &= ~(S_ISUID|S_ISGID);
-		}
-	}
+	printf("%s writeme\n", __func__);
 }
-#endif
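
A worked example of the bit arithmetic in secpolicy_vnode_access2() above:
~curmode & wantmode isolates exactly the access bits the caller wants but
does not already hold, so nothing is checked when the current mode already
covers the request:

#include <stdio.h>

int
main(void)
{
	unsigned curmode = 0005;	/* caller already has read+execute */
	unsigned wantmode = 0007;	/* caller wants read+write+execute */
	unsigned missing = ~curmode & wantmode;

	/* only the write bit (02) is missing, so only it is checked */
	printf("missing bits: %o\n", missing);	/* prints 2 */
	return 0;
}
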
Index: src/external/cddl/osnet/sys/kern/printf.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/kern/printf.c,v
retrieving revision 1.1
diff -u -p -r1.1 printf.c
--- src/external/cddl/osnet/sys/kern/printf.c	7 Aug 2009 20:57:57 -0000	1.1
+++ src/external/cddl/osnet/sys/kern/printf.c	16 May 2017 00:51:45 -0000
@@ -39,6 +39,7 @@ void
 vcmn_err(int ce, const char *fmt, va_list adx)
 {
 	char buf[256];
+	size_t len;
 
 	if (ce == CE_PANIC) {
 		vprintf(fmt, adx);
@@ -47,7 +48,8 @@ vcmn_err(int ce, const char *fmt, va_lis
 
 	if ((uint_t)ce < CE_IGNORE) {
 		strcpy(buf, ce_prefix[ce]);
-		vsnprintf(buf + strlen(ce_prefix[ce]), sizeof(buf),
+		len = strlen(buf);
+		vsnprintf(buf + len, sizeof(buf) - len,
 		    fmt, adx);
 		strlcat(buf, ce_suffix[ce], sizeof(buf));
 		printf("%s", buf);
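
To see why the old bound was wrong: vsnprintf() was given sizeof(buf)
while writing at buf + strlen(prefix), so it could write up to the prefix
length past the end of the buffer.  A user-space sketch of the corrected
bounded-append pattern (prefixed_msg is an invented name):

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

static void
prefixed_msg(char *buf, size_t bufsz, const char *prefix,
    const char *fmt, ...)
{
	va_list ap;
	size_t len;

	strcpy(buf, prefix);	/* assumes prefix fits, as vcmn_err() does */
	len = strlen(buf);
	va_start(ap, fmt);
	/* the remaining space is bufsz - len, not bufsz */
	vsnprintf(buf + len, bufsz - len, fmt, ap);
	va_end(ap);
}

int
main(void)
{
	char buf[32];

	prefixed_msg(buf, sizeof(buf), "WARNING: ", "code %d", 42);
	printf("%s\n", buf);
	return 0;
}
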
Index: src/external/cddl/osnet/sys/kern/sysevent.c
===================================================================
RCS file: src/external/cddl/osnet/sys/kern/sysevent.c
diff -N src/external/cddl/osnet/sys/kern/sysevent.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/sys/kern/sysevent.c	16 Jun 2017 16:45:54 -0000
@@ -0,0 +1,344 @@
+/*-
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#ifdef __FreeBSD__
+__FBSDID("$FreeBSD: head/sys/cddl/compat/opensolaris/kern/opensolaris_sysevent.c 222343 2011-05-27 08:34:31Z pjd $");
+#endif
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/kmem.h>
+#ifdef __FreeBSD__
+#include <sys/sbuf.h>
+#endif
+#include <sys/nvpair.h>
+#include <sys/sunddi.h>
+#include <sys/sysevent.h>
+#include <sys/fm/protocol.h>
+
+struct sysevent {
+	nvlist_t	*se_nvl;
+	char		 se_class[128];
+	char		 se_subclass[128];
+	char		 se_pub[128];
+};
+
+sysevent_t *
+sysevent_alloc(char *class, char *subclass, char *pub, int flag)
+{
+	struct sysevent *ev;
+
+	ASSERT(class != NULL);
+	ASSERT(subclass != NULL);
+	ASSERT(pub != NULL);
+	ASSERT(flag == SE_SLEEP);
+
+	ev = kmem_alloc(sizeof(*ev), KM_SLEEP);
+	ev->se_nvl = NULL;
+	strlcpy(ev->se_class, class, sizeof(ev->se_class));
+	strlcpy(ev->se_subclass, subclass, sizeof(ev->se_subclass));
+	strlcpy(ev->se_pub, pub, sizeof(ev->se_pub));
+
+	return ((sysevent_t *)ev);
+}
+
+void
+sysevent_free(sysevent_t *evp)
+{
+	struct sysevent *ev = (struct sysevent *)evp;
+
+	ASSERT(evp != NULL);
+
+	if (ev->se_nvl != NULL)
+		sysevent_free_attr(ev->se_nvl);
+	kmem_free(ev, sizeof(*ev));
+}
+
+int
+sysevent_add_attr(sysevent_attr_list_t **ev_attr_list, char *name,
+    sysevent_value_t *se_value, int flag)
+{
+	nvlist_t *nvl;
+	int error;
+
+	ASSERT(ev_attr_list != NULL);
+	ASSERT(name != NULL);
+	ASSERT(se_value != NULL);
+	ASSERT(flag == SE_SLEEP);
+
+	if (strlen(name) >= MAX_ATTR_NAME)
+		return (SE_EINVAL);
+
+	nvl = *ev_attr_list;
+	if (nvl == NULL) {
+		if (nvlist_alloc(&nvl, NV_UNIQUE_NAME_TYPE, KM_SLEEP) != 0)
+			return (SE_ENOMEM);
+	}
+
+	error = 0;
+
+	switch (se_value->value_type) {
+	case SE_DATA_TYPE_UINT64:
+		error = nvlist_add_uint64(nvl, name, se_value->value.sv_uint64);
+		break;
+	case SE_DATA_TYPE_STRING:
+		if (strlen(se_value->value.sv_string) >= MAX_STRING_SZ)
+			error = SE_EINVAL;
+		if (error == 0) {
+			error = nvlist_add_string(nvl, name,
+			    se_value->value.sv_string);
+		}
+		break;
+	default:
+#if 0
+		printf("%s: type %d is not implemented\n", __func__,
+		    se_value->value_type);
+#endif
+		break;
+	}
+
+	if (error != 0) {
+		nvlist_free(nvl);
+		return (error);
+	}
+
+	*ev_attr_list = nvl;
+
+	return (0);
+}
+
+void
+sysevent_free_attr(sysevent_attr_list_t *ev_attr_list)
+{
+
+	nvlist_free(ev_attr_list);
+}
+
+int
+sysevent_attach_attributes(sysevent_t *evp, sysevent_attr_list_t *ev_attr_list)
+{
+	struct sysevent *ev = (struct sysevent *)evp;
+
+	ASSERT(ev->se_nvl == NULL);
+
+	ev->se_nvl = ev_attr_list;
+
+	return (0);
+}
+
+void
+sysevent_detach_attributes(sysevent_t *evp)
+{
+	struct sysevent *ev = (struct sysevent *)evp;
+
+	ASSERT(ev->se_nvl != NULL);
+
+	ev->se_nvl = NULL;
+}
+
+int
+log_sysevent(sysevent_t *evp, int flag, sysevent_id_t *eid)
+{
+#ifdef __NetBSD__
+	/* XXXNETBSD we ought to do something here */
+#else
+	struct sysevent *ev = (struct sysevent *)evp;
+	struct sbuf *sb;
+	const char *type;
+	char typestr[128];
+	nvpair_t *elem = NULL;
+
+	ASSERT(evp != NULL);
+	ASSERT(ev->se_nvl != NULL);
+	ASSERT(flag == SE_SLEEP);
+	ASSERT(eid != NULL);
+
+	sb = sbuf_new_auto();
+	if (sb == NULL)
+		return (SE_ENOMEM);
+	type = NULL;
+
+	while ((elem = nvlist_next_nvpair(ev->se_nvl, elem)) != NULL) {
+		switch (nvpair_type(elem)) {
+		case DATA_TYPE_BOOLEAN:
+		    {
+			boolean_t value;
+
+			(void) nvpair_value_boolean_value(elem, &value);
+			sbuf_printf(sb, " %s=%s", nvpair_name(elem),
+			    value ? "true" : "false");
+			break;
+		    }
+		case DATA_TYPE_UINT8:
+		    {
+			uint8_t value;
+
+			(void) nvpair_value_uint8(elem, &value);
+			sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value);
+			break;
+		    }
+		case DATA_TYPE_INT32:
+		    {
+			int32_t value;
+
+			(void) nvpair_value_int32(elem, &value);
+			sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
+			    (intmax_t)value);
+			break;
+		    }
+		case DATA_TYPE_UINT32:
+		    {
+			uint32_t value;
+
+			(void) nvpair_value_uint32(elem, &value);
+			sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
+			    (uintmax_t)value);
+			break;
+		    }
+		case DATA_TYPE_INT64:
+		    {
+			int64_t value;
+
+			(void) nvpair_value_int64(elem, &value);
+			sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
+			    (intmax_t)value);
+			break;
+		    }
+		case DATA_TYPE_UINT64:
+		    {
+			uint64_t value;
+
+			(void) nvpair_value_uint64(elem, &value);
+			sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
+			    (uintmax_t)value);
+			break;
+		    }
+		case DATA_TYPE_STRING:
+		    {
+			char *value;
+
+			(void) nvpair_value_string(elem, &value);
+			sbuf_printf(sb, " %s=%s", nvpair_name(elem), value);
+			if (strcmp(FM_CLASS, nvpair_name(elem)) == 0)
+				type = value;
+			break;
+		    }
+		case DATA_TYPE_UINT8_ARRAY:
+		    {
+			uint8_t *value;
+			uint_t ii, nelem;
+
+			(void) nvpair_value_uint8_array(elem, &value, &nelem);
+			sbuf_printf(sb, " %s=", nvpair_name(elem));
+			for (ii = 0; ii < nelem; ii++)
+				sbuf_printf(sb, "%02hhx", value[ii]);
+			break;
+		    }
+		case DATA_TYPE_UINT16_ARRAY:
+		    {
+			uint16_t *value;
+			uint_t ii, nelem;
+
+			(void) nvpair_value_uint16_array(elem, &value, &nelem);
+			sbuf_printf(sb, " %s=", nvpair_name(elem));
+			for (ii = 0; ii < nelem; ii++)
+				sbuf_printf(sb, "%04hx", value[ii]);
+			break;
+		    }
+		case DATA_TYPE_UINT32_ARRAY:
+		    {
+			uint32_t *value;
+			uint_t ii, nelem;
+
+			(void) nvpair_value_uint32_array(elem, &value, &nelem);
+			sbuf_printf(sb, " %s=", nvpair_name(elem));
+			for (ii = 0; ii < nelem; ii++)
+				sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]);
+			break;
+		    }
+		case DATA_TYPE_UINT64_ARRAY:
+		    {
+			uint64_t *value;
+			uint_t ii, nelem;
+
+			(void) nvpair_value_uint64_array(elem, &value, &nelem);
+			sbuf_printf(sb, " %s=", nvpair_name(elem));
+			for (ii = 0; ii < nelem; ii++)
+				sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]);
+			break;
+		    }
+		default:
+#if 0
+			printf("%s: type %d is not implemented\n", __func__,
+			    nvpair_type(elem));
+#endif
+			break;
+		}
+	}
+
+	if (sbuf_finish(sb) != 0) {
+		sbuf_delete(sb);
+		return (SE_ENOMEM);
+	}
+
+	if (type == NULL)
+		type = ev->se_subclass;
+	if (strncmp(type, "ESC_ZFS_", 8) == 0) {
+		snprintf(typestr, sizeof(typestr), "misc.fs.zfs.%s", type + 8);
+		type = typestr;
+	}
+	devctl_notify("ZFS", "ZFS", type, sbuf_data(sb));
+	sbuf_delete(sb);
+#endif
+
+	return (0);
+}
+
+int
+_ddi_log_sysevent(char *vendor, char *class, char *subclass,
+    nvlist_t *attr_list, sysevent_id_t *eidp, int flag)
+{
+	sysevent_t *ev;
+	int ret;
+
+	ASSERT(vendor != NULL);
+	ASSERT(class != NULL);
+	ASSERT(subclass != NULL);
+	ASSERT(attr_list != NULL);
+	ASSERT(eidp != NULL);
+	ASSERT(flag == DDI_SLEEP);
+
+	ev = sysevent_alloc(class, subclass, vendor, SE_SLEEP);
+	ASSERT(ev != NULL);
+	(void)sysevent_attach_attributes(ev, attr_list);
+	ret = log_sysevent(ev, SE_SLEEP, eidp);
+	sysevent_detach_attributes(ev);
+	sysevent_free(ev);
+
+	return (ret);
+}
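
A hypothetical producer, sketching how kernel code is expected to drive
the _ddi_log_sysevent() shim above.  This is a sketch against the osnet
nvlist API and is not compilable on its own; the vendor/class/subclass
strings and the "pool" attribute are invented for the example.  Note that
_ddi_log_sysevent() detaches the attribute list before freeing the event,
so the caller keeps ownership and must free it:

static void
post_pool_event(const char *pool)
{
	nvlist_t *attr;
	sysevent_id_t eid;

	if (nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) != 0)
		return;
	(void) nvlist_add_string(attr, "pool", pool);

	(void) _ddi_log_sysevent("SUNW", "EC_zfs", "ESC_ZFS_pool_create",
	    attr, &eid, DDI_SLEEP);
	nvlist_free(attr);	/* still ours: the list was detached */
}
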
Index: src/external/cddl/osnet/sys/kern/taskq.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/kern/taskq.c,v
retrieving revision 1.6
diff -u -p -r1.6 taskq.c
--- src/external/cddl/osnet/sys/kern/taskq.c	7 Jan 2017 21:39:53 -0000	1.6
+++ src/external/cddl/osnet/sys/kern/taskq.c	5 May 2017 22:33:53 -0000
@@ -26,7 +26,9 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+/*
+ * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
+ */
 
 /*
  * Kernel task queues: general-purpose asynchronous task scheduling.
@@ -60,7 +62,7 @@
  *	the same list managed by the same thread.
  *
  * (3) Some tasks may block for a long time, and this should not block other
- * 	tasks in the queue.
+ *	tasks in the queue.
  *
  * To provide useful service in such cases we define a "dynamic task queue"
  * which has an individual thread for each of the tasks. These threads are
@@ -77,23 +79,23 @@
  * The backing task queue is also used for scheduling internal tasks needed for
  * dynamic task queue maintenance.
  *
- * INTERFACES:
+ * INTERFACES ==================================================================
  *
- * taskq_t *taskq_create(name, nthreads, pri_t pri, minalloc, maxall, flags);
+ * taskq_t *taskq_create(name, nthreads, pri, minalloc, maxalloc, flags);
  *
  *	Create a taskq with specified properties.
  *	Possible 'flags':
  *
  *	  TASKQ_DYNAMIC: Create task pool for task management. If this flag is
- * 		specified, 'nthreads' specifies the maximum number of threads in
+ *		specified, 'nthreads' specifies the maximum number of threads in
  *		the task queue. Task execution order for dynamic task queues is
  *		not predictable.
  *
  *		If this flag is not specified (default case) a
- * 		single-list task queue is created with 'nthreads' threads
- * 		servicing it. Entries in this queue are managed by
- * 		taskq_ent_alloc() and taskq_ent_free() which try to keep the
- * 		task population between 'minalloc' and 'maxalloc', but the
+ *		single-list task queue is created with 'nthreads' threads
+ *		servicing it. Entries in this queue are managed by
+ *		taskq_ent_alloc() and taskq_ent_free() which try to keep the
+ *		task population between 'minalloc' and 'maxalloc', but the
  *		latter limit is only advisory for TQ_SLEEP dispatches and the
  *		former limit is only advisory for TQ_NOALLOC dispatches. If
  *		TASKQ_PREPOPULATE is set in 'flags', the taskq will be
@@ -106,13 +108,47 @@
  *	  TASKQ_PREPOPULATE: Prepopulate task queue with threads.
  *		Also prepopulate the task queue with 'minalloc' task structures.
  *
+ *	  TASKQ_THREADS_CPU_PCT: This flag specifies that 'nthreads' should be
+ *		interpreted as a percentage of the # of online CPUs on the
+ *		system.  The taskq subsystem will automatically adjust the
+ *		number of threads in the taskq in response to CPU online
+ *		and offline events, to keep the ratio.  nthreads must be in
+ *		the range [0,100].
+ *
+ *		The calculation used is:
+ *
+ *			MAX((ncpus_online * percentage)/100, 1)
+ *
+ *		This flag is not supported for DYNAMIC task queues.
+ *		This flag is not compatible with TASKQ_CPR_SAFE.
+ *
  *	  TASKQ_CPR_SAFE: This flag specifies that users of the task queue will
- * 		use their own protocol for handling CPR issues. This flag is not
- *		supported for DYNAMIC task queues.
+ *		use their own protocol for handling CPR issues. This flag is not
+ *		supported for DYNAMIC task queues.  This flag is not compatible
+ *		with TASKQ_THREADS_CPU_PCT.
  *
  *	The 'pri' field specifies the default priority for the threads that
  *	service all scheduled tasks.
  *
+ * taskq_t *taskq_create_instance(name, instance, nthreads, pri, minalloc,
+ *    maxalloc, flags);
+ *
+ *	Like taskq_create(), but takes an instance number (or -1 to indicate
+ *	no instance).
+ *
+ * taskq_t *taskq_create_proc(name, nthreads, pri, minalloc, maxalloc, proc,
+ *    flags);
+ *
+ *	Like taskq_create(), but creates the taskq threads in the specified
+ *	system process.  If proc != &p0, this must be called from a thread
+ *	in that process.
+ *
+ * taskq_t *taskq_create_sysdc(name, nthreads, minalloc, maxalloc, proc,
+ *    dc, flags);
+ *
+ *	Like taskq_create_proc(), but the taskq threads will use the
+ *	System Duty Cycle (SDC) scheduling class with a duty cycle of dc.
+ *
  * void taskq_destroy(tap):
  *
  *	Waits for any scheduled tasks to complete, then destroys the taskq.
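
The TASKQ_THREADS_CPU_PCT thread-target rule above, written out as a
stand-alone function so the arithmetic is easy to check (the real code
expresses the same rule via the TASKQ_THREADS_PCT() macro; the function
name here is invented):

#include <stdio.h>

static unsigned
threads_pct_target(unsigned ncpus_online, unsigned pct)
{
	unsigned n = (ncpus_online * pct) / 100;

	return (n >= 1 ? n : 1);	/* MAX(..., 1): never zero threads */
}

int
main(void)
{
	/* 4 CPUs at 75% -> 3 threads; 1 CPU at 10% still gets 1 thread */
	printf("%u %u\n", threads_pct_target(4, 75),
	    threads_pct_target(1, 10));
	return 0;
}
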
@@ -137,7 +173,7 @@
  *
  *	  TQ_NOQUEUE: Do not enqueue a task if it can't dispatch it due to
  *		lack of available resources and fail. If this flag is not
- * 		set, and the task pool is exhausted, the task may be scheduled
+ *		set, and the task pool is exhausted, the task may be scheduled
  *		in the backing queue. This flag may ONLY be used with dynamic
  *		task queues.
  *
@@ -146,13 +182,26 @@
  *		Enqueueing dependent tasks may create deadlocks.
  *
  *	  TQ_SLEEP:   May block waiting for resources. May still fail for
- * 		dynamic task queues if TQ_NOQUEUE is also specified, otherwise
+ *		dynamic task queues if TQ_NOQUEUE is also specified, otherwise
  *		always succeed.
  *
+ *	  TQ_FRONT:   Puts the new task at the front of the queue.  Be careful.
+ *
  *	NOTE: Dynamic task queues are much more likely to fail in
  *		taskq_dispatch() (especially if TQ_NOQUEUE was specified), so it
  *		is important to have backup strategies handling such failures.
  *
+ * void taskq_dispatch_ent(tq, func, arg, flags, tqent)
+ *
+ *	This is a light-weight form of taskq_dispatch(), that uses a
+ *	preallocated taskq_ent_t structure for scheduling.  As a
+ *	result, it does not perform allocations and cannot ever fail.
+ *	Note especially that it cannot be used with TASKQ_DYNAMIC
+ *	taskqs.  The memory for the tqent must not be modified or used
+ *	until the function (func) is called.  (However, func itself
+ *	may safely modify or free this memory, once it is called.)
+ *	Note that the taskq framework will NOT free this memory.
+ *
  * void taskq_wait(tq):
  *
  *	Waits for all previously scheduled tasks to complete.
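
A hypothetical caller of the taskq_dispatch_ent() interface described
above (struct my_job and both functions are invented for the example).
The taskq_ent_t is embedded in the caller's own structure, assumed
zero-initialized before first use, so dispatch never allocates and cannot
fail; the entry must not be touched again until the function has run:

struct my_job {
	taskq_ent_t	mj_tqent;	/* preallocated entry, owned by us */
	int		mj_arg;
};

static void
my_job_func(void *arg)
{
	struct my_job *job = arg;

	/* do the work; from here on the tqent may be reused or freed */
	(void)job;
}

static void
my_job_submit(taskq_t *tq, struct my_job *job)
{

	taskq_dispatch_ent(tq, my_job_func, job, 0, &job->mj_tqent);
}
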
@@ -187,13 +236,13 @@
  *	any subsystem that needs to schedule tasks and does not need to manage
  *	its own task queues. It is initialized quite early during system boot.
  *
- * IMPLEMENTATION.
+ * IMPLEMENTATION ==============================================================
  *
  * This is schematic representation of the task queue structures.
  *
  *   taskq:
  *   +-------------+
- *   |tq_lock      | +---< taskq_ent_free()
+ *   | tq_lock     | +---< taskq_ent_free()
  *   +-------------+ |
  *   |...          | | tqent:                  tqent:
  *   +-------------+ | +------------+          +------------+
@@ -224,7 +273,7 @@
  *   +-------------+  |
  *                    |   DYNAMIC TASK QUEUES:
  *                    |
- *                    +-> taskq_bucket[nCPU]       	taskq_bucket_dispatch()
+ *                    +-> taskq_bucket[nCPU]		taskq_bucket_dispatch()
  *                        +-------------------+                    ^
  *                   +--->| tqbucket_lock     |                    |
  *                   |    +-------------------+   +--------+      +--------+
@@ -239,7 +288,7 @@
  *                   |    +-------------------+<--+--------+<--...+--------+
  *                   |    | ...               |   | thread |      | thread |
  *                   |    +-------------------+   +--------+      +--------+
- *		     +---> 	...
+ *		     +--->	...
  *
  *
  * Task queues use tq_task field to link new entry in the queue. The queue is a
@@ -251,7 +300,58 @@
  *	All threads used by task queues mark t_taskq field of the thread to
  *	point to the task queue.
  *
- * Dynamic Task Queues Implementation.
+ * Taskq Thread Management -----------------------------------------------------
+ *
+ * Taskq's non-dynamic threads are managed with several variables and flags:
+ *
+ *	* tq_nthreads	- The number of threads in taskq_thread() for the
+ *			  taskq.
+ *
+ *	* tq_active	- The number of threads not waiting on a CV in
+ *			  taskq_thread(); includes newly created threads
+ *			  not yet counted in tq_nthreads.
+ *
+ *	* tq_nthreads_target
+ *			- The number of threads desired for the taskq.
+ *
+ *	* tq_flags & TASKQ_CHANGING
+ *			- Indicates that tq_nthreads != tq_nthreads_target.
+ *
+ *	* tq_flags & TASKQ_THREAD_CREATED
+ *			- Indicates that a thread is being created in the taskq.
+ *
+ * During creation, tq_nthreads and tq_active are set to 0, and
+ * tq_nthreads_target is set to the number of threads desired.  The
+ * TASKQ_CHANGING flag is set, and taskq_thread_create() is called to
+ * create the first thread. taskq_thread_create() increments tq_active,
+ * sets TASKQ_THREAD_CREATED, and creates the new thread.
+ *
+ * Each thread starts in taskq_thread(), clears the TASKQ_THREAD_CREATED
+ * flag, and increments tq_nthreads.  It stores the new value of
+ * tq_nthreads as its "thread_id", and stores its thread pointer in the
+ * tq_threadlist at index (thread_id - 1).  We keep the thread_id space
+ * densely packed by requiring that only the largest thread_id can exit during
+ * normal adjustment.  The exception is during the destruction of the
+ * taskq; once tq_nthreads_target is set to zero, no new threads will be
+ * created for the taskq, so every thread can exit without any ordering being
+ * necessary.
+ *
+ * Threads will only process work if their thread id is <= tq_nthreads_target.
+ *
+ * When TASKQ_CHANGING is set, threads will check the current thread target
+ * whenever they wake up, and do whatever they can to apply its effects.
+ *
+ * TASKQ_THREADS_CPU_PCT -------------------------------------------------------
+ *
+ * When a taskq is created with TASKQ_THREADS_CPU_PCT, we store its requested
+ * percentage in tq_threads_ncpus_pct, start it off with the correct thread
+ * target, and add it to the taskq_cpupct_list for later adjustment.
+ *
+ * We register taskq_cpu_setup() to be called whenever a CPU changes state.  It
+ * walks the TASKQ_THREADS_CPU_PCT taskq list, adjusts their tq_nthreads_target
+ * if need be, and wakes up all of the threads to process the change.
+ *
+ * Dynamic Task Queues Implementation ------------------------------------------
  *
  * For a dynamic task queues there is a 1-to-1 mapping between a thread and
  * taskq_ent_structure. Each entry is serviced by its own thread and each thread
@@ -280,7 +380,7 @@
  * itself. The thread sleep time is controlled by a tunable variable
  * `taskq_thread_timeout'.
  *
- * There is various statistics kept in the bucket which allows for later
+ * There are various statistics kept in the bucket which allow for later
  * analysis of taskq usage patterns. Also, a global copy of taskq creation and
  * death statistics is kept in the global taskq data structure. Since thread
  * creation and death happen rarely, updating such global data does not present
@@ -302,7 +402,7 @@
  *	 memory. One solution may be allocation of buckets when they are first
  *	 touched, but it is not clear how useful it is.
  *
- * SUSPEND/RESUME implementation.
+ * SUSPEND/RESUME implementation -----------------------------------------------
  *
  *	Before executing a task taskq_thread() (executing non-dynamic task
  *	queues) obtains taskq's thread lock as a reader. The taskq_suspend()
@@ -323,18 +423,24 @@
  *	taskq_member() function works by comparing a thread t_taskq pointer with
  *	the passed thread pointer.
  *
- * LOCKS and LOCK Hierarchy:
+ * LOCKS and LOCK Hierarchy ----------------------------------------------------
  *
- *   There are two locks used in task queues.
+ *   There are three locks used in task queues:
  *
- *   1) Task queue structure has a lock, protecting global task queue state.
+ *   1) The taskq_t's tq_lock, protecting global task queue state.
  *
  *   2) Each per-CPU bucket has a lock for bucket management.
  *
- *   If both locks are needed, task queue lock should be taken only after bucket
+ *   3) The global taskq_cpupct_lock, which protects the list of
+ *      TASKQ_THREADS_CPU_PCT taskqs.
+ *
+ *   If both (1) and (2) are needed, tq_lock should be taken *after* the bucket
  *   lock.
  *
- * DEBUG FACILITIES.
+ *   If both (1) and (3) are needed, tq_lock should be taken *after*
+ *   taskq_cpupct_lock.
+ *
+ * DEBUG FACILITIES ------------------------------------------------------------
  *
  * For DEBUG kernels it is possible to induce random failures to
  * taskq_dispatch() function when it is given TQ_NOSLEEP argument. The value of
@@ -343,13 +449,18 @@
  *
  * Setting TASKQ_STATISTIC to 0 will disable per-bucket statistics.
  *
- * TUNABLES
+ * TUNABLES --------------------------------------------------------------------
  *
  *	system_taskq_size	- Size of the global system_taskq.
  *				  This value is multiplied by nCPUs to determine
  *				  actual size.
  *				  Default value: 64
  *
+ *	taskq_minimum_nthreads_max
+ *				- Minimum size of the thread list for a taskq.
+ *				  Useful for testing different thread pool
+ *				  sizes by overwriting tq_nthreads_target.
+ *
  *	taskq_thread_timeout	- Maximum idle time for taskq_d_thread()
  *				  Default value: 5 minutes
  *
@@ -367,7 +478,7 @@
  *				  for static task queues.
  *				  Default value: UINT_MAX (no induced failures)
  *
- * CONDITIONAL compilation.
+ * CONDITIONAL compilation -----------------------------------------------------
  *
  *    TASKQ_STATISTIC	- If set will enable bucket statistic (default).
  *
@@ -399,7 +510,23 @@ taskq_t *system_taskq;
 #define SYSTEM_TASKQ_SIZE 1
 int system_taskq_size = SYSTEM_TASKQ_SIZE;
 
-#define	TASKQ_ACTIVE 	0x00010000
+/*
+ * Minimum size for tq_nthreads_max; useful for those who want to play around
+ * with increasing a taskq's tq_nthreads_target.
+ */
+int taskq_minimum_nthreads_max = 1;
+
+/*
+ * We want to ensure that when taskq_create() returns, there is at least
+ * one thread ready to handle requests.  To guarantee this, we have to wait
+ * for the second thread, since the first one cannot process requests until
+ * the second thread has been created.
+ */
+#define	TASKQ_CREATE_ACTIVE_THREADS	2
+
+/* Maximum percentage allowed for TASKQ_THREADS_CPU_PCT */
+#define	TASKQ_CPUPCT_MAX_PERCENT	1000
+int taskq_cpupct_max_percent = TASKQ_CPUPCT_MAX_PERCENT;
 
 /*
  * Dynamic task queue threads that don't get any work within
@@ -445,6 +572,24 @@ static void taskq_ent_destructor(void *,
 static taskq_ent_t *taskq_ent_alloc(taskq_t *, int);
 static void taskq_ent_free(taskq_t *, taskq_ent_t *);
 
+void system_taskq_init(void);
+void system_taskq_fini(void);
+
+/*
+ * List of all TASKQ_THREADS_CPU_PCT taskqs.
+ */
+static list_t taskq_cpupct_list;	/* protected by cpu_lock */
+
+#ifdef __NetBSD__
+#define cpu_lock taskq_cpu_lock
+static kmutex_t cpu_lock;
+
+typedef struct {
+	int	cp_id;
+	int	cp_ncpus;
+} cpupart_t;
+#endif
+
 /*
  * Collect per-bucket statistic when TASKQ_STATISTIC is defined.
  */
@@ -508,22 +653,42 @@ uint_t taskq_smtbf = UINT_MAX;    /* mea
 	tqe->tqent_next->tqent_prev = tqe;			\
 	tqe->tqent_prev->tqent_next = tqe;			\
 }
+/*
+ * Prepend 'tqe' to the beginning of l
+ */
+#define	TQ_PREPEND(l, tqe) {					\
+	tqe->tqent_next = l.tqent_next;				\
+	tqe->tqent_prev = &l;					\
+	tqe->tqent_next->tqent_prev = tqe;			\
+	tqe->tqent_prev->tqent_next = tqe;			\
+}
 
 /*
  * Schedule a task specified by func and arg into the task queue entry tqe.
  */
-#define	TQ_ENQUEUE(tq, tqe, func, arg) {			\
-	ASSERT(MUTEX_HELD(&tq->tq_lock));			\
-	TQ_APPEND(tq->tq_task, tqe);				\
-	tqe->tqent_func = (func);				\
-	tqe->tqent_arg = (arg);					\
-	tq->tq_tasks++;						\
-	if (tq->tq_tasks - tq->tq_executed > tq->tq_maxtasks)	\
+#define	TQ_DO_ENQUEUE(tq, tqe, func, arg, front) {			\
+	ASSERT(MUTEX_HELD(&tq->tq_lock));				\
+	_NOTE(CONSTCOND)						\
+	if (front) {							\
+		TQ_PREPEND(tq->tq_task, tqe);				\
+	} else {							\
+		TQ_APPEND(tq->tq_task, tqe);				\
+	}								\
+	tqe->tqent_func = (func);					\
+	tqe->tqent_arg = (arg);						\
+	tq->tq_tasks++;							\
+	if (tq->tq_tasks - tq->tq_executed > tq->tq_maxtasks)		\
 		tq->tq_maxtasks = tq->tq_tasks - tq->tq_executed;	\
-	cv_signal(&tq->tq_dispatch_cv);				\
+	cv_signal(&tq->tq_dispatch_cv);					\
 	DTRACE_PROBE2(taskq__enqueue, taskq_t *, tq, taskq_ent_t *, tqe); \
 }
 
+#define	TQ_ENQUEUE(tq, tqe, func, arg)					\
+	TQ_DO_ENQUEUE(tq, tqe, func, arg, 0)
+
+#define	TQ_ENQUEUE_FRONT(tq, tqe, func, arg)				\
+	TQ_DO_ENQUEUE(tq, tqe, func, arg, 1)
+
 /*
  * Do-nothing task which may be used to prepopulate thread caches.
  */
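
The tqent list that TQ_APPEND and the new TQ_PREPEND manipulate is a
circular doubly-linked list whose head (tq_task) doubles as a sentinel.
A runnable user-space model of the two macros (struct ent and the helper
names are invented for the example):

#include <stdio.h>

struct ent {
	struct ent *next, *prev;
	int val;
};

static void
prepend(struct ent *l, struct ent *e)	/* modeled on TQ_PREPEND */
{

	e->next = l->next;
	e->prev = l;
	e->next->prev = e;
	e->prev->next = e;
}

static void
append(struct ent *l, struct ent *e)	/* modeled on TQ_APPEND */
{

	e->prev = l->prev;
	e->next = l;
	e->prev->next = e;
	e->next->prev = e;
}

int
main(void)
{
	struct ent head = { &head, &head, 0 };
	struct ent a = { NULL, NULL, 1 }, b = { NULL, NULL, 2 };

	append(&head, &a);	/* queue order: a */
	prepend(&head, &b);	/* TQ_FRONT: b jumps ahead -> b, a */
	for (struct ent *e = head.next; e != &head; e = e->next)
		printf("%d\n", e->val);
	return 0;
}
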
@@ -546,6 +711,7 @@ taskq_constructor(void *arg, void *obj, 
 	rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL);
 	cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&tq->tq_exit_cv, NULL, CV_DEFAULT, NULL);
 
 	tq->tq_task.tqent_next = &tq->tq_task;
 	tq->tq_task.tqent_prev = &tq->tq_task;
@@ -563,6 +729,7 @@ taskq_destructor(void *arg, void *obj)
 	rw_destroy(&tq->tq_threadlock);
 	cv_destroy(&tq->tq_dispatch_cv);
 	cv_destroy(&tq->tq_wait_cv);
+	cv_destroy(&tq->tq_exit_cv);
 }
 
 /*ARGSUSED*/
@@ -587,23 +754,6 @@ taskq_ent_destructor(void *arg, void *ob
 	cv_destroy(&tqe->tqent_cv);
 }
 
-/*
- * Create global system dynamic task queue.
- */
-void
-system_taskq_init(void)
-{
-	system_taskq = taskq_create_common("system_taskq", 0,
-	    system_taskq_size * max_ncpus, minclsyspri, 4, 512,
-	    TASKQ_PREPOPULATE);
-}
-
-void
-system_taskq_fini(void)
-{
-	taskq_destroy(system_taskq);
-}
-
 void
 taskq_init(void)
 {
@@ -612,17 +762,147 @@ taskq_init(void)
 	    taskq_ent_destructor, NULL, NULL, NULL, 0);
 	taskq_cache = kmem_cache_create("taskq_cache", sizeof (taskq_t),
 	    0, taskq_constructor, taskq_destructor, NULL, NULL, NULL, 0);
+
+	list_create(&taskq_cpupct_list, sizeof (taskq_t),
+	    offsetof(taskq_t, tq_cpupct_link));
+
+#ifdef __NetBSD__
 	system_taskq_init();
+	mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL);
+#endif
 }
 
 void
 taskq_fini(void)
 {
 	system_taskq_fini();
+	mutex_destroy(&cpu_lock);
 	kmem_cache_destroy(taskq_cache);
 	kmem_cache_destroy(taskq_ent_cache);
 }
 
+static void
+taskq_update_nthreads(taskq_t *tq, uint_t ncpus)
+{
+	uint_t newtarget = TASKQ_THREADS_PCT(ncpus, tq->tq_threads_ncpus_pct);
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+	ASSERT(MUTEX_HELD(&tq->tq_lock));
+
+	/* We must be going from non-zero to non-zero; no exiting. */
+	ASSERT3U(tq->tq_nthreads_target, !=, 0);
+	ASSERT3U(newtarget, !=, 0);
+
+	ASSERT3U(newtarget, <=, tq->tq_nthreads_max);
+	if (newtarget != tq->tq_nthreads_target) {
+		tq->tq_flags |= TASKQ_CHANGING;
+		tq->tq_nthreads_target = newtarget;
+		cv_broadcast(&tq->tq_dispatch_cv);
+		cv_broadcast(&tq->tq_exit_cv);
+	}
+}
+
+/* called during task queue creation */
+static void
+taskq_cpupct_install(taskq_t *tq, cpupart_t *cpup)
+{
+	ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
+
+	mutex_enter(&cpu_lock);
+	mutex_enter(&tq->tq_lock);
+	tq->tq_cpupart = cpup->cp_id;
+	taskq_update_nthreads(tq, cpup->cp_ncpus);
+	mutex_exit(&tq->tq_lock);
+
+	list_insert_tail(&taskq_cpupct_list, tq);
+	mutex_exit(&cpu_lock);
+}
+
+static void
+taskq_cpupct_remove(taskq_t *tq)
+{
+	ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
+
+	mutex_enter(&cpu_lock);
+	list_remove(&taskq_cpupct_list, tq);
+	mutex_exit(&cpu_lock);
+}
+
+#ifdef illumos
+/*ARGSUSED*/
+static int
+taskq_cpu_setup(cpu_setup_t what, int id, void *arg)
+{
+	taskq_t *tq;
+	cpupart_t *cp = cpu[id]->cpu_part;
+	uint_t ncpus = cp->cp_ncpus;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+	ASSERT(ncpus > 0);
+
+	switch (what) {
+	case CPU_OFF:
+	case CPU_CPUPART_OUT:
+		/* offlines are called *before* the cpu is offlined. */
+		if (ncpus > 1)
+			ncpus--;
+		break;
+
+	case CPU_ON:
+	case CPU_CPUPART_IN:
+		break;
+
+	default:
+		return (0);		/* doesn't affect cpu count */
+	}
+
+	for (tq = list_head(&taskq_cpupct_list); tq != NULL;
+	    tq = list_next(&taskq_cpupct_list, tq)) {
+
+		mutex_enter(&tq->tq_lock);
+		/*
+		 * If the taskq is part of the cpuset which is changing,
+		 * update its nthreads_target.
+		 */
+		if (tq->tq_cpupart == cp->cp_id) {
+			taskq_update_nthreads(tq, ncpus);
+		}
+		mutex_exit(&tq->tq_lock);
+	}
+	return (0);
+}
+
+void
+taskq_mp_init(void)
+{
+	mutex_enter(&cpu_lock);
+	register_cpu_setup_func(taskq_cpu_setup, NULL);
+	/*
+	 * Make sure we're up to date.  At this point in boot, there is only
+	 * one processor set, so we only have to update the current CPU.
+	 */
+	(void) taskq_cpu_setup(CPU_ON, CPU->cpu_id, NULL);
+	mutex_exit(&cpu_lock);
+}
+#endif /* illumos */
+
+/*
+ * Create global system dynamic task queue.
+ */
+void
+system_taskq_init(void)
+{
+	system_taskq = taskq_create_common("system_taskq", 0,
+	    system_taskq_size * max_ncpus, minclsyspri, 4, 512,
+	    TASKQ_PREPOPULATE);
+}
+
+void
+system_taskq_fini(void)
+{
+	taskq_destroy(system_taskq);
+}
+
 /*
  * taskq_ent_alloc()
  *
@@ -632,10 +912,9 @@ taskq_fini(void)
  * Assumes: tq->tq_lock is held.
  */
 static taskq_ent_t *
-taskq_ent_alloc(taskq_t *tq, int flags)
+taskq_ent_alloc(struct taskq *tq, int flags)
 {
-	int kmflags = KM_NOSLEEP;
-	
+	int kmflags = (flags & TQ_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
 	taskq_ent_t *tqe;
 
 	ASSERT(MUTEX_HELD(&tq->tq_lock));
@@ -664,7 +943,7 @@ taskq_ent_alloc(taskq_t *tq, int flags)
 			 * the caller.  So, we just delay for one second
 			 * to throttle the allocation rate.
 			 */
-			xdelay(hz);
+			delay(hz);
 		}
 		tqe = kmem_cache_alloc(taskq_ent_cache, kmflags);
 		mutex_enter(&tq->tq_lock);
@@ -718,33 +997,59 @@ taskq_dispatch(taskq_t *tq, task_func_t 
 	ASSERT(func != NULL);
 	ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC));
 
+	if (!(tq->tq_flags & TASKQ_DYNAMIC)) {
+		/*
+		 * TQ_NOQUEUE flag can't be used with non-dynamic task queues.
+		 */
+		ASSERT(!(flags & TQ_NOQUEUE));
+		/*
+		 * Enqueue the task to the underlying queue.
+		 */
+		mutex_enter(&tq->tq_lock);
+
+		TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flags);
+
+		if ((tqe = taskq_ent_alloc(tq, flags)) == NULL) {
+			mutex_exit(&tq->tq_lock);
+			return (0);
+		}
+		/* Make sure we start without any flags */
+		tqe->tqent_flags = 0;
+
+		if (flags & TQ_FRONT) {
+			TQ_ENQUEUE_FRONT(tq, tqe, func, arg);
+		} else {
+			TQ_ENQUEUE(tq, tqe, func, arg);
+		}
+		mutex_exit(&tq->tq_lock);
+		return ((taskqid_t)tqe);
+	}
+	panic("taskq_dispatch TASKQ_DYNAMIC");
+}
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+    taskq_ent_t *tqe)
+{
+	ASSERT(func != NULL);
+	ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC));
+
 	/*
-	 * TQ_NOQUEUE flag can't be used with non-dynamic task queues.
-	 */
-#ifdef __NetBSD__
-	/*
-	 * Dynamic task queues didn't seem to get imported.  Caller
-	 * must be prepared to handle failure anyway, so just fail.
+	 * Mark it as a prealloc'd task.  This is important
+	 * to ensure that we don't free it later.
 	 */
-	if (flags & TQ_NOQUEUE)
-		return ((taskqid_t)NULL);
-#endif
-	ASSERT(! (flags & TQ_NOQUEUE));
-
+	tqe->tqent_flags |= TQENT_FLAG_PREALLOC;
 	/*
 	 * Enqueue the task to the underlying queue.
 	 */
 	mutex_enter(&tq->tq_lock);
 
-	TASKQ_S_RANDOM_DISPATCH_FAILURE(tq, flags);
-
-	if ((tqe = taskq_ent_alloc(tq, flags)) == NULL) {
-		mutex_exit(&tq->tq_lock);
-		return ((taskqid_t)NULL);
+	if (flags & TQ_FRONT) {
+		TQ_ENQUEUE_FRONT(tq, tqe, func, arg);
+	} else {
+		TQ_ENQUEUE(tq, tqe, func, arg);
 	}
-	TQ_ENQUEUE(tq, tqe, func, arg);
 	mutex_exit(&tq->tq_lock);
-	return ((taskqid_t)tqe);
 }
 
 /*
@@ -828,37 +1133,206 @@ taskq_member(taskq_t *tq, kthread_t *thr
 }
 
 /*
+ * Creates a thread in the taskq.  We only allow one outstanding create at
+ * a time.  We drop and reacquire the tq_lock in order to avoid blocking other
+ * taskq activity while thread_create() or lwp_kernel_create() run.
+ *
+ * The first time we're called, we do some additional setup, and do not
+ * return until there are enough threads to start servicing requests.
+ */
+static void
+taskq_thread_create(taskq_t *tq)
+{
+	kthread_t	*t;
+	const boolean_t	first = (tq->tq_nthreads == 0);
+
+	ASSERT(MUTEX_HELD(&tq->tq_lock));
+	ASSERT(tq->tq_flags & TASKQ_CHANGING);
+	ASSERT(tq->tq_nthreads < tq->tq_nthreads_target);
+	ASSERT(!(tq->tq_flags & TASKQ_THREAD_CREATED));
+
+
+	tq->tq_flags |= TASKQ_THREAD_CREATED;
+	tq->tq_active++;
+	mutex_exit(&tq->tq_lock);
+
+	/*
+	 * With TASKQ_DUTY_CYCLE the new thread must have an LWP
+	 * as explained in ../disp/sysdc.c (for the msacct data).
+	 * Otherwise simple kthreads are preferred.
+	 */
+	if ((tq->tq_flags & TASKQ_DUTY_CYCLE) != 0) {
+#ifdef illumos
+		/* Enforced in taskq_create_common */
+		ASSERT3P(tq->tq_proc, !=, &p0);
+		t = lwp_kernel_create(tq->tq_proc, taskq_thread, tq, TS_RUN,
+		    tq->tq_pri);
+#endif
+	} else {
+		t = thread_create(NULL, 0, taskq_thread, tq, 0, tq->tq_proc,
+		    TS_RUN, tq->tq_pri);
+	}
+
+	if (!first) {
+		mutex_enter(&tq->tq_lock);
+		return;
+	}
+
+	/*
+	 * We know the thread cannot go away, since tq cannot be
+	 * destroyed until creation has completed.  We can therefore
+	 * safely dereference t.
+	 */
+	if (tq->tq_flags & TASKQ_THREADS_CPU_PCT) {
+#ifdef __NetBSD__
+		cpupart_t cpupart = { 0, ncpu };
+		taskq_cpupct_install(tq, &cpupart);
+#else
+		taskq_cpupct_install(tq, t->t_cpupart);
+#endif
+	}
+	mutex_enter(&tq->tq_lock);
+
+	/* Wait until we can service requests. */
+	while (tq->tq_nthreads != tq->tq_nthreads_target &&
+	    tq->tq_nthreads < TASKQ_CREATE_ACTIVE_THREADS) {
+		cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
+	}
+}
+
+/*
+ * Common "sleep taskq thread" function, which handles CPR stuff, as well
+ * as giving a nice common point for debuggers to find inactive threads.
+ */
+static clock_t
+taskq_thread_wait(taskq_t *tq, kmutex_t *mx, kcondvar_t *cv,
+    callb_cpr_t *cprinfo, clock_t timeout)
+{
+	clock_t ret = 0;
+
+	if (!(tq->tq_flags & TASKQ_CPR_SAFE)) {
+		CALLB_CPR_SAFE_BEGIN(cprinfo);
+	}
+	if (timeout < 0)
+		cv_wait(cv, mx);
+	else
+		ret = cv_reltimedwait(cv, mx, timeout, TR_CLOCK_TICK);
+
+	if (!(tq->tq_flags & TASKQ_CPR_SAFE)) {
+		CALLB_CPR_SAFE_END(cprinfo, mx);
+	}
+
+	return (ret);
+}
+
+/*
  * Worker thread for processing task queue.
  */
 static void
 taskq_thread(void *arg)
 {
+	int thread_id;
+
 	taskq_t *tq = arg;
 	taskq_ent_t *tqe;
 	callb_cpr_t cprinfo;
 	hrtime_t start, end;
+	boolean_t freeit;
 
-	CALLB_CPR_INIT(&cprinfo, &tq->tq_lock, callb_generic_cpr, tq->tq_name);
-
+	if (tq->tq_flags & TASKQ_CPR_SAFE) {
+		CALLB_CPR_INIT_SAFE(curthread, tq->tq_name);
+	} else {
+		CALLB_CPR_INIT(&cprinfo, &tq->tq_lock, callb_generic_cpr,
+		    tq->tq_name);
+	}
 	mutex_enter(&tq->tq_lock);
-	while (tq->tq_flags & TASKQ_ACTIVE) {
+	thread_id = ++tq->tq_nthreads;
+	ASSERT(tq->tq_flags & TASKQ_THREAD_CREATED);
+	ASSERT(tq->tq_flags & TASKQ_CHANGING);
+	tq->tq_flags &= ~TASKQ_THREAD_CREATED;
+
+	VERIFY3S(thread_id, <=, tq->tq_nthreads_max);
+
+	if (tq->tq_nthreads_max == 1)
+		tq->tq_thread = curthread;
+	else
+		tq->tq_threadlist[thread_id - 1] = curthread;
+
+	/* Allow taskq_create_common()'s taskq_thread_create() to return. */
+	if (tq->tq_nthreads == TASKQ_CREATE_ACTIVE_THREADS)
+		cv_broadcast(&tq->tq_wait_cv);
+
+	for (;;) {
+		if (tq->tq_flags & TASKQ_CHANGING) {
+			/* See if we're no longer needed */
+			if (thread_id > tq->tq_nthreads_target) {
+				/*
+				 * To preserve the one-to-one mapping between
+				 * thread_id and thread, we must exit from
+				 * highest thread ID to least.
+				 *
+				 * However, if everyone is exiting, the order
+				 * doesn't matter, so just exit immediately.
+				 * (this is safe, since you must wait for
+				 * nthreads to reach 0 after setting
+				 * tq_nthreads_target to 0)
+				 */
+				if (thread_id == tq->tq_nthreads ||
+				    tq->tq_nthreads_target == 0)
+					break;
+
+				/* Wait for higher thread_ids to exit */
+				(void) taskq_thread_wait(tq, &tq->tq_lock,
+				    &tq->tq_exit_cv, &cprinfo, -1);
+				continue;
+			}
+
+			/*
+			 * If no thread is starting taskq_thread(), we can
+			 * do some bookkeeping.
+			 */
+			if (!(tq->tq_flags & TASKQ_THREAD_CREATED)) {
+				/* Check if we've reached our target */
+				if (tq->tq_nthreads == tq->tq_nthreads_target) {
+					tq->tq_flags &= ~TASKQ_CHANGING;
+					cv_broadcast(&tq->tq_wait_cv);
+				}
+				/* Check if we need to create a thread */
+				if (tq->tq_nthreads < tq->tq_nthreads_target) {
+					taskq_thread_create(tq);
+					continue; /* tq_lock was dropped */
+				}
+			}
+		}
 		if ((tqe = tq->tq_task.tqent_next) == &tq->tq_task) {
 			if (--tq->tq_active == 0)
 				cv_broadcast(&tq->tq_wait_cv);
-			if (tq->tq_flags & TASKQ_CPR_SAFE) {
-				cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
-			} else {
-				CALLB_CPR_SAFE_BEGIN(&cprinfo);
-				cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
-				CALLB_CPR_SAFE_END(&cprinfo, &tq->tq_lock);
-			}
+			(void) taskq_thread_wait(tq, &tq->tq_lock,
+			    &tq->tq_dispatch_cv, &cprinfo, -1);
 			tq->tq_active++;
 			continue;
 		}
+
 		tqe->tqent_prev->tqent_next = tqe->tqent_next;
 		tqe->tqent_next->tqent_prev = tqe->tqent_prev;
 		mutex_exit(&tq->tq_lock);
 
+		/*
+		 * For prealloc'd tasks, we don't free anything.  We
+		 * have to check this now, because once we call the
+		 * function for a prealloc'd taskq, we can't touch the
+		 * tqent any longer (calling the function returns the
+		 * ownership of the tqent back to the caller of
+		 * taskq_dispatch.)
+		 */
+		if ((!(tq->tq_flags & TASKQ_DYNAMIC)) &&
+		    (tqe->tqent_flags & TQENT_FLAG_PREALLOC)) {
+			/* clear pointers to assist assertion checks */
+			tqe->tqent_next = tqe->tqent_prev = NULL;
+			freeit = B_FALSE;
+		} else {
+			freeit = B_TRUE;
+		}
 		rw_enter(&tq->tq_threadlock, RW_READER);
 		start = gethrtime();
 		DTRACE_PROBE2(taskq__exec__start, taskq_t *, tq,
@@ -873,12 +1347,33 @@ taskq_thread(void *arg)
 		tq->tq_totaltime += end - start;
 		tq->tq_executed++;
 
-		taskq_ent_free(tq, tqe);
+		if (freeit)
+			taskq_ent_free(tq, tqe);
 	}
+	if (tq->tq_nthreads_max == 1)
+		tq->tq_thread = NULL;
+	else
+		tq->tq_threadlist[thread_id - 1] = NULL;
+
+	/* We're exiting, and therefore no longer active */
+	ASSERT(tq->tq_active > 0);
+	tq->tq_active--;
+
+	ASSERT(tq->tq_nthreads > 0);
 	tq->tq_nthreads--;
-	cv_broadcast(&tq->tq_wait_cv);
+
+	/* Wake up anyone waiting for us to exit */
+	cv_broadcast(&tq->tq_exit_cv);
+	if (tq->tq_nthreads == tq->tq_nthreads_target) {
+		if (!(tq->tq_flags & TASKQ_THREAD_CREATED))
+			tq->tq_flags &= ~TASKQ_CHANGING;
+
+		cv_broadcast(&tq->tq_wait_cv);
+	}
+
 	ASSERT(!(tq->tq_flags & TASKQ_CPR_SAFE));
-	CALLB_CPR_EXIT(&cprinfo);
+	CALLB_CPR_EXIT(&cprinfo);		/* drops tq->tq_lock */
+
 	thread_exit();
 }
 
@@ -903,6 +1398,7 @@ taskq_create_common(const char *name, in
 	taskq_t *tq = kmem_cache_alloc(taskq_cache, KM_NOSLEEP);
 	uint_t ncpus = ((boot_max_ncpus == -1) ? max_ncpus : boot_max_ncpus);
 	uint_t bsize;	/* # of buckets - always power of 2 */
+	int max_nthreads;
 
 	ASSERT(instance == 0);
 	ASSERT(!ISSET(flags, TASKQ_CPR_SAFE));
@@ -921,47 +1417,69 @@ taskq_create_common(const char *name, in
 	bsize = MIN(bsize, taskq_maxbuckets);
 
 	ASSERT(!(flags & TASKQ_DYNAMIC));
-	if (flags & TASKQ_THREADS_CPU_PCT)
-		/* nthreads is % of CPUs we want to use.  */
-		nthreads = (ncpus*nthreads)/100;
 
+	if (flags & TASKQ_THREADS_CPU_PCT) {
+		uint_t pct;
+		ASSERT3S(nthreads, >=, 0);
+		pct = nthreads;
+
+		if (pct > taskq_cpupct_max_percent)
+			pct = taskq_cpupct_max_percent;
+
+		/*
+		 * If you're using THREADS_CPU_PCT, the process for the
+		 * taskq threads must be curproc.  This allows any pset
+		 * binding to be inherited correctly.  If proc is &p0,
+		 * we won't be creating LWPs, so new threads will be assigned
+		 * to the default processor set.
+		 */
+		tq->tq_threads_ncpus_pct = pct;
+		nthreads = 1;		/* corrected in taskq_thread_create() */
+		max_nthreads = TASKQ_THREADS_PCT(max_ncpus, pct);
+
+	} else {
+		ASSERT3S(nthreads, >=, 1);
+		max_nthreads = nthreads;
+	}
+
+	if (max_nthreads < taskq_minimum_nthreads_max)
+		max_nthreads = taskq_minimum_nthreads_max;
+
+	/*
+	 * Make sure the name is 0-terminated, and conforms to the rules for
+	 * C identifiers.
+	 */
 	(void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1);
-	tq->tq_name[TASKQ_NAMELEN] = '\0';
-	/* Make sure the name conforms to the rules for C indentifiers */
 	strident_canon(tq->tq_name, TASKQ_NAMELEN);
 
-	tq->tq_flags = flags | TASKQ_ACTIVE;
-	tq->tq_active = nthreads;
-	tq->tq_nthreads = nthreads;
+	tq->tq_flags = flags | TASKQ_CHANGING;
+	tq->tq_active = 0;
+	tq->tq_instance = instance;
+	tq->tq_nthreads_target = nthreads;
+	tq->tq_nthreads_max = max_nthreads;
 	tq->tq_minalloc = minalloc;
 	tq->tq_maxalloc = maxalloc;
 	tq->tq_nbuckets = bsize;
 	tq->tq_pri = pri;
+	list_link_init(&tq->tq_cpupct_link);
 
+	if (max_nthreads > 1)
+		tq->tq_threadlist = kmem_alloc(
+		    sizeof (kthread_t *) * max_nthreads, KM_SLEEP);
+
+	mutex_enter(&tq->tq_lock);
 	if (flags & TASKQ_PREPOPULATE) {
-		mutex_enter(&tq->tq_lock);
 		while (minalloc-- > 0)
 			taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP));
-		mutex_exit(&tq->tq_lock);
 	}
 
-	if (nthreads == 1) {
-		tq->tq_thread = thread_create(NULL, 0, taskq_thread, tq,
-		    0, NULL, TS_RUN, pri);
-	} else {
-		kthread_t **tpp = kmem_alloc(sizeof (kthread_t *) * nthreads,
-		    KM_SLEEP);
-
-		tq->tq_threadlist = tpp;
-
-		mutex_enter(&tq->tq_lock);
-		while (nthreads-- > 0) {
-			*tpp = thread_create(NULL, 0, taskq_thread, tq,
-			    0, NULL, TS_RUN, pri);
-			tpp++;
-		}
-		mutex_exit(&tq->tq_lock);
-	}
+	/*
+	 * Create the first thread, which will create any other threads
+	 * necessary.  taskq_thread_create will not return until we have
+	 * enough threads to be able to process requests.
+	 */
+	taskq_thread_create(tq);
+	mutex_exit(&tq->tq_lock);
 
 	return (tq);
 }
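[Editor's note: in the TASKQ_THREADS_CPU_PCT path above, nthreads is reinterpreted as a percentage of CPUs and only the thread ceiling is computed up front; the first thread corrects the count later. A small sketch of the arithmetic, assuming TASKQ_THREADS_PCT() is the usual (ncpus * pct) / 100 macro -- the definition here is an assumption, not quoted from the patch:

	#include <stdio.h>

	/* Assumed definition; the real macro lives in the taskq header. */
	#define TASKQ_THREADS_PCT(ncpus, pct)	(((ncpus) * (pct)) / 100)

	int
	main(void)
	{
		int max_ncpus = 8;

		/* 50% of 8 CPUs gives a 4-thread ceiling; creation starts at 1. */
		printf("max_nthreads = %d\n", TASKQ_THREADS_PCT(max_ncpus, 50));
		return 0;
	}
]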
@@ -981,6 +1499,21 @@ taskq_destroy(taskq_t *tq)
 	ASSERT(! (tq->tq_flags & TASKQ_CPR_SAFE));
 
 	/*
+	 * Destroy kstats.
+	 */
+	if (tq->tq_kstat != NULL) {
+		kstat_delete(tq->tq_kstat);
+		tq->tq_kstat = NULL;
+	}
+
+	/*
+	 * Unregister from the cpupct list.
+	 */
+	if (tq->tq_flags & TASKQ_THREADS_CPU_PCT) {
+		taskq_cpupct_remove(tq);
+	}
+
+	/*
 	 * Wait for any pending entries to complete.
 	 */
 	taskq_wait(tq);
@@ -989,15 +1522,20 @@ taskq_destroy(taskq_t *tq)
 	ASSERT((tq->tq_task.tqent_next == &tq->tq_task) &&
 	    (tq->tq_active == 0));
 
-	if ((tq->tq_nthreads > 1) && (tq->tq_threadlist != NULL))
-		kmem_free(tq->tq_threadlist, sizeof (kthread_t *) *
-		    tq->tq_nthreads);
+	/* notify all the threads that they need to exit */
+	tq->tq_nthreads_target = 0;
 
-	tq->tq_flags &= ~TASKQ_ACTIVE;
+	tq->tq_flags |= TASKQ_CHANGING;
 	cv_broadcast(&tq->tq_dispatch_cv);
+	cv_broadcast(&tq->tq_exit_cv);
+
 	while (tq->tq_nthreads != 0)
 		cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
 
+	if (tq->tq_nthreads_max != 1)
+		kmem_free(tq->tq_threadlist, sizeof (kthread_t *) *
+		    tq->tq_nthreads_max);
+
 	tq->tq_minalloc = 0;
 	while (tq->tq_nalloc != 0)
 		taskq_ent_free(tq, taskq_ent_alloc(tq, TQ_SLEEP));
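[Editor's note: the shutdown handshake in taskq_destroy() above -- set the thread target to zero, mark the queue as changing, broadcast, then wait for tq_nthreads to drain -- is a generic pattern. A self-contained pthread rendition, purely illustrative; all names are hypothetical and independent of the kernel code:

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t dispatch_cv = PTHREAD_COND_INITIALIZER;
	static pthread_cond_t wait_cv = PTHREAD_COND_INITIALIZER;
	static int nthreads;
	static int nthreads_target = 4;

	static void *
	worker(void *arg)
	{
		(void)arg;
		pthread_mutex_lock(&lock);
		nthreads++;
		while (nthreads_target != 0)	/* stand-in for "wait for work" */
			pthread_cond_wait(&dispatch_cv, &lock);
		nthreads--;			/* no longer counted */
		pthread_cond_broadcast(&wait_cv);	/* wake the destroyer */
		pthread_mutex_unlock(&lock);
		return NULL;
	}

	int
	main(void)
	{
		pthread_t tid[4];
		int i;

		for (i = 0; i < 4; i++)
			pthread_create(&tid[i], NULL, worker, NULL);

		pthread_mutex_lock(&lock);
		nthreads_target = 0;		/* ask every worker to exit */
		pthread_cond_broadcast(&dispatch_cv);
		while (nthreads != 0)		/* wait for them to drain */
			pthread_cond_wait(&wait_cv, &lock);
		pthread_mutex_unlock(&lock);

		for (i = 0; i < 4; i++)
			pthread_join(tid[i], NULL);
		printf("all workers exited\n");
		return 0;
	}
]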
Index: src/external/cddl/osnet/sys/kern/vfs.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/kern/vfs.c,v
retrieving revision 1.6
diff -u -p -r1.6 vfs.c
--- src/external/cddl/osnet/sys/kern/vfs.c	6 May 2015 15:57:07 -0000	1.6
+++ src/external/cddl/osnet/sys/kern/vfs.c	19 Apr 2017 22:57:46 -0000
@@ -27,7 +27,8 @@
  */
 
 #include <sys/cdefs.h>
-/* __FBSDID("$FreeBSD: src/sys/compat/opensolaris/kern/opensolaris_vfs.c,v 1.7 2007/11/01 08:58:29 pjd Exp $"); */
+#define __FBSDID(x)
+__FBSDID("$FreeBSD: head/sys/cddl/compat/opensolaris/kern/opensolaris_lookup.c 314194 2017-02-24 07:53:56Z avg $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -40,24 +41,14 @@
 #include <lib/libkern/libkern.h>
 
 int
-lookupname(char *dirname, enum uio_seg seg, vnode_t **dirvpp, vnode_t **compvpp)
+lookupname(char *dirname, enum uio_seg seg, enum symfollow follow, vnode_t **dirvpp, vnode_t **compvpp)
 {
-	struct nameidata nd;
-	int error;
-
-	error = 0;
-
-	KASSERT(dirvpp == NULL);
-
-	error = namei_simple_kernel(dirname,  NSM_FOLLOW_NOEMULROOT, compvpp);
-	
-	return error;
+	return (lookupnameat(dirname, seg, follow, dirvpp, compvpp, NULL));
 }
 
-
 int
-lookupnameat(char *dirname, enum uio_seg seg, vnode_t **dirvpp, 
-	vnode_t **compvpp, vnode_t *startvp)
+lookupnameat(char *dirname, enum uio_seg seg, enum symfollow follow,
+    vnode_t **dirvpp, vnode_t **compvpp, vnode_t *startvp)
 {
 
 	struct nameidata nd;
@@ -179,170 +170,3 @@ vfs_optionisset(const vfs_t *vfsp, const
 	
 	return 0;
 }
-
-#ifdef PORT_FREEBSD
-int
-traverse(vnode_t **cvpp, int lktype)
-{
-	kthread_t *td = curthread;
-	vnode_t *cvp;
-	vnode_t *tvp;
-	vfs_t *vfsp;
-	int error;
-
-	cvp = *cvpp;
-	tvp = NULL;
-
-	/*
-	 * If this vnode is mounted on, then we transparently indirect
-	 * to the vnode which is the root of the mounted file system.
-	 * Before we do this we must check that an unmount is not in
-	 * progress on this vnode.
-	 */
-
-	for (;;) {
-		/*
-		 * Reached the end of the mount chain?
-		 */
-		vfsp = vn_mountedvfs(cvp);
-		if (vfsp == NULL)
-			break;
-		/*
-		 * tvp is NULL for *cvpp vnode, which we can't unlock.
-		 */
-		if (tvp != NULL)
-			vput(cvp);
-		else
-			vrele(cvp);
-
-		/*
-		 * The read lock must be held across the call to VFS_ROOT() to
-		 * prevent a concurrent unmount from destroying the vfs.
-		 */
-		error = VFS_ROOT(vfsp, &tvp);
-		if (error != 0)
-			return (error);
-		cvp = tvp;
-	}
-
-	*cvpp = cvp;
-	return (0);
-}
-
-int
-domount(kthread_t *td, vnode_t *vp, const char *fstype, char *fspath,
-    char *fspec, int fsflags)
-{
-	struct mount *mp;
-	struct vfsconf *vfsp;
-	struct ucred *newcr, *oldcr;
-	int error;
-	
-	/*
-	 * Be ultra-paranoid about making sure the type and fspath
-	 * variables will fit in our mp buffers, including the
-	 * terminating NUL.
-	 */
-	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
-		return (ENAMETOOLONG);
-
-	vfsp = vfs_byname_kld(fstype, td, &error);
-	if (vfsp == NULL)
-		return (ENODEV);
-
-	if (vp->v_type != VDIR)
-		return (ENOTDIR);
-	simple_lock(&vp->v_interlock);
-	if ((vp->v_iflag & VI_MOUNT) != 0 ||
-	    vp->v_mountedhere != NULL) {
-		simple_unlock(&vp->v_interlock);
-		return (EBUSY);
-	}
-	vp->v_iflag |= VI_MOUNT;
-	simple_unlock(&vp->v_interlock);
-
-	/*
-	 * Allocate and initialize the filesystem.
-	 */
-	vn_lock(vp, LK_SHARED | LK_RETRY);
-	mp = vfs_mount_alloc(vp, vfsp, fspath, td);
-	VOP_UNLOCK(vp);
-
-	mp->mnt_optnew = NULL;
-	vfs_setmntopt(mp, "from", fspec, 0);
-	mp->mnt_optnew = mp->mnt_opt;
-	mp->mnt_opt = NULL;
-
-	/*
-	 * Set the mount level flags.
-	 * crdup() can sleep, so do it before acquiring a mutex.
-	 */
-	newcr = crdup(kcred);
-	MNT_ILOCK(mp);
-	if (fsflags & MNT_RDONLY)
-		mp->mnt_flag |= MNT_RDONLY;
-	mp->mnt_flag &=~ MNT_UPDATEMASK;
-	mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS);
-	/*
-	 * Unprivileged user can trigger mounting a snapshot, but we don't want
-	 * him to unmount it, so we switch to privileged credentials.
-	 */
-	oldcr = mp->mnt_cred;
-	mp->mnt_cred = newcr;
-	mp->mnt_stat.f_owner = mp->mnt_cred->cr_uid;
-	MNT_IUNLOCK(mp);
-	crfree(oldcr);
-	/*
-	 * Mount the filesystem.
-	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
-	 * get.  No freeing of cn_pnbuf.
-	 */
-	error = VFS_MOUNT(mp, td);
-
-	if (!error) {
-		if (mp->mnt_opt != NULL)
-			vfs_freeopts(mp->mnt_opt);
-		mp->mnt_opt = mp->mnt_optnew;
-		(void)VFS_STATFS(mp, &mp->mnt_stat, td);
-	}
-	/*
-	 * Prevent external consumers of mount options from reading
-	 * mnt_optnew.
-	*/
-	mp->mnt_optnew = NULL;
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-	/*
-	 * Put the new filesystem on the mount list after root.
-	 */
-#ifdef FREEBSD_NAMECACHE
-	cache_purge(vp);
-#endif
-	if (!error) {
-		vnode_t *mvp;
-
-		simple_lock(&vp->v_interlock);
-		vp->v_iflag &= ~VI_MOUNT;
-		simple_unlock(&vp->v_interlock);
-		vp->v_mountedhere = mp;
-		mountlist_append(mp);
-		vfs_event_signal(NULL, VQ_MOUNT, 0);
-		if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp, td))
-			panic("mount: lost mount");
-		mountcheckdirs(vp, mvp);
-		vput(mvp);
-		VOP_UNLOCK(vp);
-		if ((mp->mnt_flag & MNT_RDONLY) == 0)
-			vfs_syncer_add_to_worklist(mp);
-		vfs_unbusy(mp, td);
-		vfs_mountedfrom(mp, fspec);
-	} else {
-		simple_lock(&vp->v_interlock);
-		vp->v_iflag &= ~VI_MOUNT;
-		simple_unlock(&vp->v_interlock);
-		VOP_UNLOCK(vp);
-		vfs_unbusy(mp, td);
-		vfs_mount_destroy(mp);
-	}
-	return (error);
-}
-#endif
Index: src/external/cddl/osnet/sys/kern/zfs_stub.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/kern/zfs_stub.c,v
retrieving revision 1.1
diff -u -p -r1.1 zfs_stub.c
--- src/external/cddl/osnet/sys/kern/zfs_stub.c	7 Aug 2009 20:57:57 -0000	1.1
+++ src/external/cddl/osnet/sys/kern/zfs_stub.c	11 May 2017 23:14:06 -0000
@@ -1,7 +1,7 @@
+#include <sys/pathname.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_ctldir.h>
 
-
 /* zfs_ctldir.c stub routtines */
 
 void
@@ -14,11 +14,11 @@ zfsctl_fini(void)
 {
 }
 
-vnode_t *
-zfsctl_root(znode_t *znode)
+int
+zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **znode)
 {
 
-	return NULL;
+	return 0;
 }
 
 int
@@ -57,124 +57,3 @@ zfsctl_umount_snapshots(vfs_t *vfsp, int
 
 	return 0;
 }
-
-/* zfs_acl.c */
-#if 0
-/*
- * Convert unix access mask to v4 access mask
- */
-static uint32_t
-zfs_unix_to_v4(uint32_t access_mask)
-{
-	uint32_t new_mask = 0;
-
-/*	if (access_mask & S_IXOTH)
-		new_mask |= ACE_EXECUTE;
-	if (access_mask & S_IWOTH)
-		new_mask |= ACE_WRITE_DATA;
-	if (access_mask & S_IROTH)
-	new_mask |= ACE_READ_DATA;*/
-	return (new_mask);
-}
-
-int
-zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
-    znode_t *tzp, cred_t *cr)
-{
-
-	return 0;
-}
-
-int
-zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
-{
-
-	return ENOTSUP;
-}
-
-int
-zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
-{
-
-	return ENOTSUP;
-}
-
-int
-zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr,
-    zfs_fuid_info_t **fuidp, dmu_tx_t *tx)
-{
-
-	return 0;
-}
-
-int
-zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
-{
-	return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
-}
-
-int
-zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
-{
-	int v4_mode = zfs_unix_to_v4((uint32_t)mode >> 6);
-
-	return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
-}
-
-int
-zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
-{
-
-	return 0;
-}
-
-void
-zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
-    vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
-    zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp)
-{
-		
-	return;
-}
-
-void
-zfs_acl_free(zfs_acl_t *aclp)
-{
-	
-	kmem_free(aclp, sizeof(zfs_acl_t));
-	return;
-}
-
-int
-zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
-{
-	
-	/* return ok or EACCES here ? */
-	return 0;
-}
-
-int
-zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type,
-    vsecattr_t *vsecp, zfs_acl_t **zaclp)
-{
-	zfs_acl_t *aclp;
-	*zaclp = NULL;
-
-	aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
-
-	*zaclp = aclp;
-
-	return 0;
-}
-
-int
-zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
-{
-	int error;
-	error = 0;
-
-	*aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);;
-
-	return (error);
-}
-#endif /* 0 */
Index: src/external/cddl/osnet/sys/kern/zone.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/kern/zone.c,v
retrieving revision 1.1
diff -u -p -r1.1 zone.c
--- src/external/cddl/osnet/sys/kern/zone.c	7 Aug 2009 20:57:57 -0000	1.1
+++ src/external/cddl/osnet/sys/kern/zone.c	7 May 2017 03:38:14 -0000
@@ -34,14 +34,14 @@
 #include <sys/zone.h>
 
 int
-zone_dataset_attach(struct kauth_cred *cred, const char *dataset, int jailid)
+zone_dataset_attach(cred_t *cred, const char *dataset, int jailid)
 {
 
 	return 0;
 }
 
 int
-zone_dataset_detach(struct kauth_cred *cred, const char *dataset, int jailid)
+zone_dataset_detach(cred_t *cred, const char *dataset, int jailid)
 {
 
 	return 0;
Index: src/external/cddl/osnet/sys/machine/endian.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/machine/endian.h,v
retrieving revision 1.3
diff -u -p -r1.3 endian.h
--- src/external/cddl/osnet/sys/machine/endian.h	28 Feb 2010 14:45:47 -0000	1.3
+++ src/external/cddl/osnet/sys/machine/endian.h	11 Jun 2017 12:28:32 -0000
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/machine/endian.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/machine/endian.h 177698 2008-03-28 22:16:18Z jb $
  */
 
 #ifndef	_OPENSOLARIS_MACHINE_ENDIAN_H_
Index: src/external/cddl/osnet/sys/sys/acl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/acl.h,v
retrieving revision 1.2
diff -u -p -r1.2 acl.h
--- src/external/cddl/osnet/sys/sys/acl.h	21 Feb 2010 01:46:35 -0000	1.2
+++ src/external/cddl/osnet/sys/sys/acl.h	26 Apr 2017 15:10:34 -0000
@@ -1,9 +1,10 @@
-/*	$NetBSD: acl.h,v 1.2 2010/02/21 01:46:35 darran Exp $	*/
-
 /*-
- * Copyright (c) 2008, 2009 Edward Tomasz Napierała <trasz@FreeBSD.org>
+ * Copyright (c) 1999-2001 Robert N. M. Watson
+ * Copyright (c) 2008 Edward Tomasz Napierała <trasz@FreeBSD.org>
  * All rights reserved.
  *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -25,17 +26,406 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/cddl/compat/opensolaris/sys/acl.h,v 1.6.2.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/sys/acl.h 287445 2015-09-04 00:14:20Z delphij $
+ */
+/*
+ * Developed by the TrustedBSD Project.
+ * Support for POSIX.1e and NFSv4 access control lists.
  */
 
-#ifndef OPENSOLARIS_SYS_ACL_H
-#define OPENSOLARIS_SYS_ACL_H
+#ifndef _FREEBSD_SYS_ACL_H_
+#define	_FREEBSD_SYS_ACL_H_
 
+/*
+ * This file contains the FreeBSD native <sys/acl.h>.
+ * In FreeBSD, this file is a wrapper which pulls in the osnet one
+ * and that one pulls in the FreeBSD native one for the kernel.
+ * But since NetBSD has no native ACLs yet, the contents are
+ * swapped around for now.
+ */
 #include_next <sys/acl.h>
 
-struct acl;
+#ifdef _KERNEL
+
+#include <sys/param.h>
+#include <sys/queue.h>
+
+/*
+ * POSIX.1e and NFSv4 ACL types and related constants.
+ */
+
+typedef uint32_t	acl_tag_t;
+typedef uint32_t	acl_perm_t;
+typedef uint16_t	acl_entry_type_t;
+typedef uint16_t	acl_flag_t;
+typedef int		acl_type_t;
+typedef int		*acl_permset_t;
+typedef uint16_t	*acl_flagset_t;
+
+/*
+ * With 254 entries, "struct acl_t_struct" is exactly one 4kB page big.
+ * Note that with NFSv4 ACLs, the maximum number of ACL entries one
+ * may set on a file or directory is about half of ACL_MAX_ENTRIES.
+ *
+ * If you increase this, you might also need to increase
+ * _ACL_T_ALIGNMENT_BITS in lib/libc/posix1e/acl_support.h.
+ *
+ * The maximum number of POSIX.1e ACLs is controlled
+ * by OLDACL_MAX_ENTRIES.  Changing that one will break binary
+ * compatibility with pre-8.0 userland and change on-disk ACL layout.
+ */
+#define	ACL_MAX_ENTRIES				254
+
+#if defined(_KERNEL) || defined(_ACL_PRIVATE)
+
+#define	POSIX1E_ACL_ACCESS_EXTATTR_NAMESPACE	EXTATTR_NAMESPACE_SYSTEM
+#define	POSIX1E_ACL_ACCESS_EXTATTR_NAME		"posix1e.acl_access"
+#define	POSIX1E_ACL_DEFAULT_EXTATTR_NAMESPACE	EXTATTR_NAMESPACE_SYSTEM
+#define	POSIX1E_ACL_DEFAULT_EXTATTR_NAME	"posix1e.acl_default"
+#define	NFS4_ACL_EXTATTR_NAMESPACE		EXTATTR_NAMESPACE_SYSTEM
+#define	NFS4_ACL_EXTATTR_NAME			"nfs4.acl"
+#define	OLDACL_MAX_ENTRIES			32
+
+/*
+ * "struct oldacl" is used in compatibility ACL syscalls and for on-disk
+ * storage of POSIX.1e ACLs.
+ */
+typedef int	oldacl_tag_t;
+typedef mode_t	oldacl_perm_t;
+
+struct oldacl_entry {
+	oldacl_tag_t	ae_tag;
+	uid_t		ae_id;
+	oldacl_perm_t	ae_perm;
+};
+typedef struct oldacl_entry	*oldacl_entry_t;
+
+struct oldacl {
+	int			acl_cnt;
+	struct oldacl_entry	acl_entry[OLDACL_MAX_ENTRIES];
+};
+
+/*
+ * Current "struct acl".
+ */
+struct acl_entry {
+	acl_tag_t		ae_tag;
+	uid_t			ae_id;
+	acl_perm_t		ae_perm;
+	/* NFSv4 entry type, "allow" or "deny".  Unused in POSIX.1e ACLs. */
+	acl_entry_type_t	ae_entry_type;
+	/* NFSv4 ACL inheritance.  Unused in POSIX.1e ACLs. */
+	acl_flag_t		ae_flags;
+};
+typedef struct acl_entry	*acl_entry_t;
+
+/*
+ * Internal ACL structure, used in libc, kernel APIs and for on-disk
+ * storage of NFSv4 ACLs.  POSIX.1e ACLs use "struct oldacl" for on-disk
+ * storage.
+ */
+struct acl {
+	unsigned int		acl_maxcnt;
+	unsigned int		acl_cnt;
+	/* Will be required e.g. to implement NFSv4.1 ACL inheritance. */
+	int			acl_spare[4];
+	struct acl_entry	acl_entry[ACL_MAX_ENTRIES];
+};
+
+/*
+ * ACL structure internal to libc.
+ */
+struct acl_t_struct {
+	struct acl		ats_acl;
+	int			ats_cur_entry;
+	/*
+	 * ats_brand is for libc internal bookkeeping only.
+	 * Applications should use acl_get_brand_np(3).
+	 * Kernel code should use the "type" argument passed
+	 * to VOP_SETACL, VOP_GETACL or VOP_ACLCHECK calls;
+	 * ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT mean POSIX.1e
+	 * ACL, ACL_TYPE_NFS4 means NFSv4 ACL.
+	 */
+	int			ats_brand;
+};
+typedef struct acl_t_struct *acl_t;
+
+#else /* _KERNEL || _ACL_PRIVATE */
+
+typedef void *acl_entry_t;
+typedef void *acl_t;
+
+#endif /* !_KERNEL && !_ACL_PRIVATE */
+
+/*
+ * Possible valid values for ats_brand field.
+ */
+#define	ACL_BRAND_UNKNOWN	0
+#define	ACL_BRAND_POSIX		1
+#define	ACL_BRAND_NFS4		2
+
+/*
+ * Possible valid values for ae_tag field.  For explanation, see acl(9).
+ */
+#define	ACL_UNDEFINED_TAG	0x00000000
+#define	ACL_USER_OBJ		0x00000001
+#define	ACL_USER		0x00000002
+#define	ACL_GROUP_OBJ		0x00000004
+#define	ACL_GROUP		0x00000008
+#define	ACL_MASK		0x00000010
+#define	ACL_OTHER		0x00000020
+#define	ACL_OTHER_OBJ		ACL_OTHER
+#define	ACL_EVERYONE		0x00000040
+
+/*
+ * Possible valid values for ae_entry_type field, valid only for NFSv4 ACLs.
+ */
+#define	ACL_ENTRY_TYPE_ALLOW	0x0100
+#define	ACL_ENTRY_TYPE_DENY	0x0200
+#define	ACL_ENTRY_TYPE_AUDIT	0x0400
+#define	ACL_ENTRY_TYPE_ALARM	0x0800
+
+/*
+ * Possible valid values for acl_type_t arguments.  First two
+ * are provided only for backwards binary compatibility.
+ */
+#define	ACL_TYPE_ACCESS_OLD	0x00000000
+#define	ACL_TYPE_DEFAULT_OLD	0x00000001
+#define	ACL_TYPE_ACCESS		0x00000002
+#define	ACL_TYPE_DEFAULT	0x00000003
+#define	ACL_TYPE_NFS4		0x00000004
+
+/*
+ * Possible bits in ae_perm field for POSIX.1e ACLs.  Note
+ * that ACL_EXECUTE may be used in both NFSv4 and POSIX.1e ACLs.
+ */
+#define	ACL_EXECUTE		0x0001
+#define	ACL_WRITE		0x0002
+#define	ACL_READ		0x0004
+#define	ACL_PERM_NONE		0x0000
+#define	ACL_PERM_BITS		(ACL_EXECUTE | ACL_WRITE | ACL_READ)
+#define	ACL_POSIX1E_BITS	(ACL_EXECUTE | ACL_WRITE | ACL_READ)
+
+/*
+ * Possible bits in ae_perm field for NFSv4 ACLs.
+ */
+#define	ACL_READ_DATA		0x00000008
+#define	ACL_LIST_DIRECTORY	0x00000008
+#define	ACL_WRITE_DATA		0x00000010
+#define	ACL_ADD_FILE		0x00000010
+#define	ACL_APPEND_DATA		0x00000020
+#define	ACL_ADD_SUBDIRECTORY	0x00000020
+#define	ACL_READ_NAMED_ATTRS	0x00000040
+#define	ACL_WRITE_NAMED_ATTRS	0x00000080
+/* ACL_EXECUTE is defined above. */
+#define	ACL_DELETE_CHILD	0x00000100
+#define	ACL_READ_ATTRIBUTES	0x00000200
+#define	ACL_WRITE_ATTRIBUTES	0x00000400
+#define	ACL_DELETE		0x00000800
+#define	ACL_READ_ACL		0x00001000
+#define	ACL_WRITE_ACL		0x00002000
+#define	ACL_WRITE_OWNER		0x00004000
+#define	ACL_SYNCHRONIZE		0x00008000
+
+#define	ACL_FULL_SET		(ACL_READ_DATA | ACL_WRITE_DATA | \
+    ACL_APPEND_DATA | ACL_READ_NAMED_ATTRS | ACL_WRITE_NAMED_ATTRS | \
+    ACL_EXECUTE | ACL_DELETE_CHILD | ACL_READ_ATTRIBUTES | \
+    ACL_WRITE_ATTRIBUTES | ACL_DELETE | ACL_READ_ACL | ACL_WRITE_ACL | \
+    ACL_WRITE_OWNER | ACL_SYNCHRONIZE)
+
+#define	ACL_MODIFY_SET		(ACL_FULL_SET & \
+    ~(ACL_WRITE_ACL | ACL_WRITE_OWNER))
+
+#define	ACL_READ_SET		(ACL_READ_DATA | ACL_READ_NAMED_ATTRS | \
+    ACL_READ_ATTRIBUTES | ACL_READ_ACL)
+
+#define	ACL_WRITE_SET		(ACL_WRITE_DATA | ACL_APPEND_DATA | \
+    ACL_WRITE_NAMED_ATTRS | ACL_WRITE_ATTRIBUTES)
+
+#define	ACL_NFS4_PERM_BITS	ACL_FULL_SET
+
+/*
+ * Possible entry_id values for acl_get_entry(3).
+ */
+#define	ACL_FIRST_ENTRY		0
+#define	ACL_NEXT_ENTRY		1
+
+/*
+ * Possible values in ae_flags field; valid only for NFSv4 ACLs.
+ */
+#define	ACL_ENTRY_FILE_INHERIT		0x0001
+#define	ACL_ENTRY_DIRECTORY_INHERIT	0x0002
+#define	ACL_ENTRY_NO_PROPAGATE_INHERIT	0x0004
+#define	ACL_ENTRY_INHERIT_ONLY		0x0008
+#define	ACL_ENTRY_SUCCESSFUL_ACCESS	0x0010
+#define	ACL_ENTRY_FAILED_ACCESS		0x0020
+#define	ACL_ENTRY_INHERITED		0x0080
+
+#define	ACL_FLAGS_BITS			(ACL_ENTRY_FILE_INHERIT | \
+    ACL_ENTRY_DIRECTORY_INHERIT | ACL_ENTRY_NO_PROPAGATE_INHERIT | \
+    ACL_ENTRY_INHERIT_ONLY | ACL_ENTRY_SUCCESSFUL_ACCESS | \
+    ACL_ENTRY_FAILED_ACCESS | ACL_ENTRY_INHERITED)
+
+/*
+ * Undefined value in ae_id field.  ae_id should be set to this value
+ * iff ae_tag is ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_OTHER or ACL_EVERYONE.
+ */
+#define	ACL_UNDEFINED_ID	((uid_t)-1)
+
+/*
+ * Possible values for _flags parameter in acl_to_text_np(3).
+ */
+#define	ACL_TEXT_VERBOSE	0x01
+#define	ACL_TEXT_NUMERIC_IDS	0x02
+#define	ACL_TEXT_APPEND_ID	0x04
+
+/*
+ * POSIX.1e ACLs are capable of expressing the read, write, and execute bits
+ * of the POSIX mode field.  We provide two masks: one that defines the bits
+ * the ACL will replace in the mode, and the other that defines the bits that
+ * must be preserved when an ACL is updating a mode.
+ */
+#define	ACL_OVERRIDE_MASK	(S_IRWXU | S_IRWXG | S_IRWXO)
+#define	ACL_PRESERVE_MASK	(~ACL_OVERRIDE_MASK)
+
+#ifdef _KERNEL
+
+/*
+ * Filesystem-independent code to move back and forth between POSIX mode and
+ * POSIX.1e ACL representations.
+ */
+acl_perm_t		acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode);
+struct acl_entry	acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid,
+			    gid_t gid, mode_t mode);
+mode_t			acl_posix1e_perms_to_mode(
+			    struct acl_entry *acl_user_obj_entry,
+			    struct acl_entry *acl_group_obj_entry,
+			    struct acl_entry *acl_other_entry);
+mode_t			acl_posix1e_acl_to_mode(struct acl *acl);
+mode_t			acl_posix1e_newfilemode(mode_t cmode,
+			    struct acl *dacl);
+struct acl		*acl_alloc(int flags);
+void			acl_free(struct acl *aclp);
+
+void			acl_nfs4_sync_acl_from_mode(struct acl *aclp,
+			    mode_t mode, int file_owner_id);
+void			acl_nfs4_sync_mode_from_acl(mode_t *mode,
+			    const struct acl *aclp);
+int			acl_nfs4_is_trivial(const struct acl *aclp,
+			    int file_owner_id);
+void			acl_nfs4_compute_inherited_acl(
+			    const struct acl *parent_aclp,
+			    struct acl *child_aclp, mode_t mode,
+			    int file_owner_id, int is_directory);
+int			acl_copy_oldacl_into_acl(const struct oldacl *source,
+			    struct acl *dest);
+int			acl_copy_acl_into_oldacl(const struct acl *source,
+			    struct oldacl *dest);
+
+#if 0
+/*
+ * To allocate 'struct acl', use acl_alloc()/acl_free() instead of this.
+ */
+MALLOC_DECLARE(M_ACL);
+#endif
+/*
+ * Filesystem-independent syntax check for a POSIX.1e ACL.
+ */
+int			acl_posix1e_check(struct acl *acl);
+int 			acl_nfs4_check(const struct acl *aclp, int is_directory);
+
+#else /* !_KERNEL */
+
+#if defined(_ACL_PRIVATE)
+
+/*
+ * Syscall interface -- use the library calls instead as the syscalls have
+ * strict ACL entry ordering requirements.
+ */
+__BEGIN_DECLS
+int	__acl_aclcheck_fd(int _filedes, acl_type_t _type, struct acl *_aclp);
+int	__acl_aclcheck_file(const char *_path, acl_type_t _type,
+	    struct acl *_aclp);
+int	__acl_aclcheck_link(const char *_path, acl_type_t _type,
+	    struct acl *_aclp);
+int	__acl_delete_fd(int _filedes, acl_type_t _type);
+int	__acl_delete_file(const char *_path_p, acl_type_t _type);
+int	__acl_delete_link(const char *_path_p, acl_type_t _type);
+int	__acl_get_fd(int _filedes, acl_type_t _type, struct acl *_aclp);
+int	__acl_get_file(const char *_path, acl_type_t _type, struct acl *_aclp);
+int	__acl_get_link(const char *_path, acl_type_t _type, struct acl *_aclp);
+int	__acl_set_fd(int _filedes, acl_type_t _type, struct acl *_aclp);
+int	__acl_set_file(const char *_path, acl_type_t _type, struct acl *_aclp);
+int	__acl_set_link(const char *_path, acl_type_t _type, struct acl *_aclp);
+__END_DECLS
+
+#endif /* _ACL_PRIVATE */
+
+/*
+ * Supported POSIX.1e ACL manipulation and assignment/retrieval API _np calls
+ * are local extensions that reflect an environment capable of opening file
+ * descriptors of directories, and allowing additional ACL type for different
+ * filesystems (i.e., AFS).
+ */
+__BEGIN_DECLS
+int	acl_add_flag_np(acl_flagset_t _flagset_d, acl_flag_t _flag);
+int	acl_add_perm(acl_permset_t _permset_d, acl_perm_t _perm);
+int	acl_calc_mask(acl_t *_acl_p);
+int	acl_clear_flags_np(acl_flagset_t _flagset_d);
+int	acl_clear_perms(acl_permset_t _permset_d);
+int	acl_copy_entry(acl_entry_t _dest_d, acl_entry_t _src_d);
+ssize_t	acl_copy_ext(void *_buf_p, acl_t _acl, ssize_t _size);
+acl_t	acl_copy_int(const void *_buf_p);
+int	acl_create_entry(acl_t *_acl_p, acl_entry_t *_entry_p);
+int	acl_create_entry_np(acl_t *_acl_p, acl_entry_t *_entry_p, int _index);
+int	acl_delete_entry(acl_t _acl, acl_entry_t _entry_d);
+int	acl_delete_entry_np(acl_t _acl, int _index);
+int	acl_delete_fd_np(int _filedes, acl_type_t _type);
+int	acl_delete_file_np(const char *_path_p, acl_type_t _type);
+int	acl_delete_link_np(const char *_path_p, acl_type_t _type);
+int	acl_delete_def_file(const char *_path_p);
+int	acl_delete_def_link_np(const char *_path_p);
+int	acl_delete_flag_np(acl_flagset_t _flagset_d, acl_flag_t _flag);
+int	acl_delete_perm(acl_permset_t _permset_d, acl_perm_t _perm);
+acl_t	acl_dup(acl_t _acl);
+int	acl_free(void *_obj_p);
+acl_t	acl_from_text(const char *_buf_p);
+int	acl_get_brand_np(acl_t _acl, int *_brand_p);
+int	acl_get_entry(acl_t _acl, int _entry_id, acl_entry_t *_entry_p);
+acl_t	acl_get_fd(int _fd);
+acl_t	acl_get_fd_np(int fd, acl_type_t _type);
+acl_t	acl_get_file(const char *_path_p, acl_type_t _type);
+int	acl_get_entry_type_np(acl_entry_t _entry_d, acl_entry_type_t *_entry_type_p);
+acl_t	acl_get_link_np(const char *_path_p, acl_type_t _type);
+void	*acl_get_qualifier(acl_entry_t _entry_d);
+int	acl_get_flag_np(acl_flagset_t _flagset_d, acl_flag_t _flag);
+int	acl_get_perm_np(acl_permset_t _permset_d, acl_perm_t _perm);
+int	acl_get_flagset_np(acl_entry_t _entry_d, acl_flagset_t *_flagset_p);
+int	acl_get_permset(acl_entry_t _entry_d, acl_permset_t *_permset_p);
+int	acl_get_tag_type(acl_entry_t _entry_d, acl_tag_t *_tag_type_p);
+acl_t	acl_init(int _count);
+int	acl_set_fd(int _fd, acl_t _acl);
+int	acl_set_fd_np(int _fd, acl_t _acl, acl_type_t _type);
+int	acl_set_file(const char *_path_p, acl_type_t _type, acl_t _acl);
+int	acl_set_entry_type_np(acl_entry_t _entry_d, acl_entry_type_t _entry_type);
+int	acl_set_link_np(const char *_path_p, acl_type_t _type, acl_t _acl);
+int	acl_set_flagset_np(acl_entry_t _entry_d, acl_flagset_t _flagset_d);
+int	acl_set_permset(acl_entry_t _entry_d, acl_permset_t _permset_d);
+int	acl_set_qualifier(acl_entry_t _entry_d, const void *_tag_qualifier_p);
+int	acl_set_tag_type(acl_entry_t _entry_d, acl_tag_t _tag_type);
+ssize_t	acl_size(acl_t _acl);
+char	*acl_to_text(acl_t _acl, ssize_t *_len_p);
+char	*acl_to_text_np(acl_t _acl, ssize_t *_len_p, int _flags);
+int	acl_valid(acl_t _acl);
+int	acl_valid_fd_np(int _fd, acl_type_t _type, acl_t _acl);
+int	acl_valid_file_np(const char *_path_p, acl_type_t _type, acl_t _acl);
+int	acl_valid_link_np(const char *_path_p, acl_type_t _type, acl_t _acl);
+int	acl_is_trivial_np(const acl_t _acl, int *_trivialp);
+acl_t	acl_strip_np(const acl_t _acl, int recalculate_mask);
+__END_DECLS
+
+#endif /* !_KERNEL */
 
-void aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp);
-int acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries);
+#endif /* _KERNEL */
 
-#endif /* OPENSOLARIS_SYS_ACL_H */
+#endif /* !_FREEBSD_SYS_ACL_H_ */
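[Editor's note: ACL_OVERRIDE_MASK/ACL_PRESERVE_MASK above encode which mode bits an ACL may rewrite. A short userland sketch of a mode update that keeps setuid/setgid/sticky intact; the acl_bits value is a made-up input, and the macros are copied from the header above:

	#include <stdio.h>
	#include <sys/stat.h>

	#define ACL_OVERRIDE_MASK	(S_IRWXU | S_IRWXG | S_IRWXO)
	#define ACL_PRESERVE_MASK	(~ACL_OVERRIDE_MASK)

	int
	main(void)
	{
		mode_t oldmode = S_ISUID | 0755;	/* setuid + rwxr-xr-x */
		mode_t acl_bits = 0700;		/* rwx bits derived from an ACL */
		mode_t newmode = (oldmode & ACL_PRESERVE_MASK) |
		    (acl_bits & ACL_OVERRIDE_MASK);

		printf("%04o\n", (unsigned)newmode);	/* 4700: setuid survives */
		return 0;
	}
]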
Index: src/external/cddl/osnet/sys/sys/byteorder.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/byteorder.h,v
retrieving revision 1.4
diff -u -p -r1.4 byteorder.h
--- src/external/cddl/osnet/sys/sys/byteorder.h	28 Feb 2010 14:45:47 -0000	1.4
+++ src/external/cddl/osnet/sys/sys/byteorder.h	22 Apr 2017 09:36:25 -0000
@@ -87,4 +87,6 @@
 #define	BE_64(x)	BSWAP_64(x)
 #endif
 
+#define	BE_IN32(xa)	htonl(*((uint32_t *)(void *)(xa)))
+
 #endif /* _OPENSOLARIS_SYS_BYTEORDER_H_ */
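[Editor's note: BE_IN32() reads a 32-bit big-endian value from memory regardless of host byte order; the (void *) cast silences alignment warnings but xa must still be 4-byte aligned. A quick check, reusing the macro as defined above:

	#include <stdio.h>
	#include <stdint.h>
	#include <arpa/inet.h>

	#define BE_IN32(xa)	htonl(*((uint32_t *)(void *)(xa)))

	int
	main(void)
	{
		/* 0x01020304 stored big-endian; the backing store is aligned. */
		uint32_t aligned;
		uint8_t *buf = (uint8_t *)&aligned;

		buf[0] = 0x01; buf[1] = 0x02; buf[2] = 0x03; buf[3] = 0x04;
		printf("0x%08x\n", (unsigned)BE_IN32(buf));	/* 0x01020304 */
		return 0;
	}
]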
Index: src/external/cddl/osnet/sys/sys/callb.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/callb.h,v
retrieving revision 1.1
diff -u -p -r1.1 callb.h
--- src/external/cddl/osnet/sys/sys/callb.h	7 Aug 2009 20:57:57 -0000	1.1
+++ src/external/cddl/osnet/sys/sys/callb.h	16 Apr 2017 21:12:38 -0000
@@ -4,9 +4,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -22,15 +21,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_CALLB_H
 #define	_SYS_CALLB_H
 
-#pragma ident	"@(#)callb.h	1.29	05/06/23 SMI"
-
 #include <sys/kcondvar.h>
 
 #ifdef	__cplusplus
@@ -70,7 +67,8 @@ extern "C" {
 #define	CB_CL_MDBOOT		CB_CL_UADMIN
 #define	CB_CL_ENTER_DEBUGGER	14
 #define	CB_CL_CPR_POST_KERNEL	15
-#define	NCBCLASS		16 /* CHANGE ME if classes are added/removed */
+#define	CB_CL_CPU_DEEP_IDLE	16
+#define	NCBCLASS		17 /* CHANGE ME if classes are added/removed */
 
 /*
  * CB_CL_CPR_DAEMON class specific definitions are given below:
Index: src/external/cddl/osnet/sys/sys/cdefs.h
===================================================================
RCS file: src/external/cddl/osnet/sys/sys/cdefs.h
diff -N src/external/cddl/osnet/sys/sys/cdefs.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/sys/sys/cdefs.h	29 Apr 2017 21:31:40 -0000
@@ -0,0 +1,10 @@
+/*	$NetBSD$	*/
+
+#ifndef _FREEBSD_SYS_CDEFS_H_
+#define _FREEBSD_SYS_CDEFS_H_
+
+#include_next <sys/cdefs.h>
+
+#define __FBSDID(x) /* nothing */
+
+#endif /* !_FREEBSD_SYS_CDEFS_H_ */
Index: src/external/cddl/osnet/sys/sys/cmn_err.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/cmn_err.h,v
retrieving revision 1.4
diff -u -p -r1.4 cmn_err.h
--- src/external/cddl/osnet/sys/sys/cmn_err.h	17 Jul 2011 20:54:33 -0000	1.4
+++ src/external/cddl/osnet/sys/sys/cmn_err.h	10 Jun 2017 00:13:40 -0000
@@ -1,35 +1,6 @@
 /*	$NetBSD: cmn_err.h,v 1.4 2011/07/17 20:54:33 joerg Exp $	*/
 
 /*-
- * Copyright (c) 2009 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by Andrew Doran.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*-
  * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
@@ -54,17 +25,13 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/cmn_err.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/cmn_err.h 179202 2008-05-22 08:33:24Z jb $
  */
 
 #ifndef _OPENSOLARIS_SYS_CMN_ERR_H_
 #define	_OPENSOLARIS_SYS_CMN_ERR_H_
 
-#ifdef _KERNEL
 #include <sys/systm.h>
-#else
-#include <stdio.h>
-#endif
 
 #ifdef	__cplusplus
 extern "C" {
@@ -81,44 +48,6 @@ extern "C" {
 void	vcmn_err(int, const char *, va_list);
 void	cmn_err(int, const char *, ...);
 
-#ifdef XXXAD
-
-{
-	char buf[256];
-
-	switch (ce) {
-	case CE_CONT:
-		snprintf(buf, sizeof(buf), "ZFS(cont): %s\n", fmt);
-		break;
-	case CE_NOTE:
-		snprintf(buf, sizeof(buf), "ZFS: NOTICE: %s\n", fmt);
-		break;
-	case CE_WARN:
-		snprintf(buf, sizeof(buf), "ZFS: WARNING: %s\n", fmt);
-		break;
-	case CE_PANIC:
-		snprintf(buf, sizeof(buf), "ZFS(panic): %s\n", fmt);
-		break;
-	case CE_IGNORE:
-		break;
-	default:
-		panic("unknown severity level");
-	}
-	if (ce != CE_IGNORE)
-		vprintf(buf, adx);
-	if (ce == CE_PANIC)
-		panic("ZFS");
-}
-
-{
-	va_list adx;
-
-	va_start(adx, fmt);
-	vcmn_err(ce, fmt, adx);
-	va_end(adx);
-}
-#endif
-
 #ifdef	__cplusplus
 }
 #endif
Index: src/external/cddl/osnet/sys/sys/cpuvar.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/cpuvar.h,v
retrieving revision 1.7
diff -u -p -r1.7 cpuvar.h
--- src/external/cddl/osnet/sys/sys/cpuvar.h	13 Mar 2010 22:31:15 -0000	1.7
+++ src/external/cddl/osnet/sys/sys/cpuvar.h	19 Apr 2017 22:39:40 -0000
@@ -29,6 +29,7 @@
 #ifndef _COMPAT_OPENSOLARIS_SYS_CPUVAR_H
 #define	_COMPAT_OPENSOLARIS_SYS_CPUVAR_H
 
+#include <sys/processor.h>
 #include <sys/mutex.h>
 #include <sys/cpuvar_defs.h>
 
Index: src/external/cddl/osnet/sys/sys/cred.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/cred.h,v
retrieving revision 1.4
diff -u -p -r1.4 cred.h
--- src/external/cddl/osnet/sys/sys/cred.h	13 Mar 2010 22:31:15 -0000	1.4
+++ src/external/cddl/osnet/sys/sys/cred.h	10 Jun 2017 00:22:00 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/cred.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/cred.h 248571 2013-03-21 08:38:03Z mm $
  */
 
 #ifndef _OPENSOLARIS_SYS_CRED_H_
@@ -38,16 +38,17 @@
 #include <sys/kauth.h>
 
 #define	CRED()		(kauth_cred_get())
-#define KCRED()		(cred0)
 #define	kcred		cred0
 
 extern kauth_cred_t	cred0;
 
-#define	crget(cr)		cr = kauth_cred_get()
-#define	crgetuid(cr)		kauth_cred_getuid(cr)
-#define	crgetgid(cr)		kauth_cred_getgid(cr)
+#define	crgetuid(cr)		kauth_cred_geteuid(cr)
+#define	crgetruid(cr)		kauth_cred_getuid(cr)
+#define	crgetgid(cr)		kauth_cred_getegid(cr)
+#define	crgetrgid(cr)		kauth_cred_getgid(cr)
 #define	crgetngroups(cr) 	kauth_cred_ngroups(cr)
 #define	cralloc()		kauth_cred_alloc()
+#define crhold(cr)		kauth_cred_hold(cr)
 #define	crfree(cr)		kauth_cred_free(cr)
 #define	crsetugid(cr, u, g)	( \
 	kauth_cred_setuid(cr, u), \
@@ -75,11 +76,10 @@ crgetgroups(cred_t *cr)
 static __inline int
 groupmember(gid_t gid, cred_t *cr) 
 {
-  int result;
+	int result;
 
-  kauth_cred_ismember_gid(cr, gid, &result);
-
-  return result;
+	kauth_cred_ismember_gid(cr, gid, &result);
+	return result;
 }
 
 #endif	/* _KERNEL */
Index: src/external/cddl/osnet/sys/sys/cyclic.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/cyclic.h,v
retrieving revision 1.4
diff -u -p -r1.4 cyclic.h
--- src/external/cddl/osnet/sys/sys/cyclic.h	2 Dec 2012 00:05:39 -0000	1.4
+++ src/external/cddl/osnet/sys/sys/cyclic.h	11 Jun 2017 12:30:23 -0000
@@ -21,7 +21,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/cyclic.h 179202 2008-05-22 08:33:24Z jb $
  *
  */
 /*
@@ -40,8 +40,15 @@ typedef	void	cpu_t;
 #ifndef _ASM
 #include <sys/time.h>
 #include <sys/cpuvar.h>
+#include <sys/cpupart.h>
 #endif /* !_ASM */
 
+#define	CY_LOW_LEVEL		0
+#define	CY_LOCK_LEVEL		1
+#define	CY_HIGH_LEVEL		2
+#define	CY_SOFT_LEVELS		2
+#define	CY_LEVELS		3
+
 #ifndef _ASM
 
 typedef uintptr_t cyclic_id_t;
@@ -55,6 +62,7 @@ typedef void *cyb_arg_t;
 typedef struct cyc_handler {
 	cyc_func_t cyh_func;
 	void *cyh_arg;
+	cyc_level_t cyh_level;
 } cyc_handler_t;
 
 typedef struct cyc_time {
@@ -68,11 +76,14 @@ typedef struct cyc_omni_handler {
 	void *cyo_arg;
 } cyc_omni_handler_t;
 
+#define	CY_INFINITY	INT64_MAX
+
 #ifdef _KERNEL
 
 cyclic_id_t cyclic_add(cyc_handler_t *, cyc_time_t *);
 cyclic_id_t cyclic_add_omni(cyc_omni_handler_t *);
 void cyclic_remove(cyclic_id_t);
+int cyclic_reprogram(cyclic_id_t, hrtime_t);
 
 #endif /* _KERNEL */
 
Index: src/external/cddl/osnet/sys/sys/cyclic_impl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/cyclic_impl.h,v
retrieving revision 1.4
diff -u -p -r1.4 cyclic_impl.h
--- src/external/cddl/osnet/sys/sys/cyclic_impl.h	2 Dec 2012 01:05:16 -0000	1.4
+++ src/external/cddl/osnet/sys/sys/cyclic_impl.h	11 Jun 2017 12:30:42 -0000
@@ -21,7 +21,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/cyclic_impl.h 216254 2010-12-07 12:25:26Z avg $
  *
  */
 /*
Index: src/external/cddl/osnet/sys/sys/ddi.h
===================================================================
RCS file: src/external/cddl/osnet/sys/sys/ddi.h
diff -N src/external/cddl/osnet/sys/sys/ddi.h
--- src/external/cddl/osnet/sys/sys/ddi.h	7 Aug 2009 20:57:57 -0000	1.1
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,63 +0,0 @@
-/*	$NetBSD: ddi.h,v 1.1 2009/08/07 20:57:57 haad Exp $	*/
-
-/*-
- * Copyright (c) 2009 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by Andrew Doran.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _UTS_DDI_H
-#define	_UTS_DDI_H
-
-#include <sys/u8_textprep.h>
-
-#define ddi_name_to_major(name) devsw_name2blk(name, NULL, 0)
-
-int	ddi_strtoul(const char *, char **, int, unsigned long *);
-int	ddi_strtol(const char *, char **, int, long *);
-int	ddi_soft_state_init(void **, size_t, size_t);
-int	ddi_soft_state_zalloc(void *, int);
-void	*ddi_get_soft_state(void *, int);
-void	ddi_soft_state_free(void *, int);
-void	ddi_soft_state_fini(void **);
-int	ddi_create_minor_node(dev_info_t *, char *, int,
-                              minor_t, char *, int);
-void	ddi_remove_minor_node(dev_info_t *, char *);
-
-typedef void 	*ldi_ident_t;
-
-#define	DDI_SUCCESS	0
-#define	DDI_FAILURE	-1
-
-#define	DDI_PSEUDO	""
-
-#define	ddi_prop_update_int64(a, b, c, d)	DDI_SUCCESS
-#define	ddi_prop_update_string(a, b, c, d)	DDI_SUCCESS
-
-#define	bioerror(bp, er)	((bp)->b_error = (er))
-#define	b_edev			b_dev
-
-#endif	/* _UTS_DDI_H_ */
Index: src/external/cddl/osnet/sys/sys/debug.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/debug.h,v
retrieving revision 1.3
diff -u -p -r1.3 debug.h
--- src/external/cddl/osnet/sys/sys/debug.h	21 Feb 2010 01:46:35 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/debug.h	11 Jun 2017 12:21:15 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/debug.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/debug.h 240415 2012-09-12 18:05:43Z mm $
  */
 
 #ifndef _OPENSOLARIS_SYS_DEBUG_H_
@@ -44,7 +44,44 @@
 	panic("solaris assert: %s (0x%jx %s 0x%jx), file: %s, line: %d", \
 	    (a), (uintmax_t)(lv), (op), (uintmax_t)(rv), (f), (l))
 #else	/* !_KERNEL */
+
+#include <stdio.h>
+#include <stdlib.h>
+
 #include_next <sys/debug.h>
+
+#ifndef HAVE_ASSFAIL
+extern int aok;
+
+static __inline int
+__assfail(const char *expr, const char *file, int line)
+{
+
+	(void)fprintf(stderr, "Assertion failed: (%s), file %s, line %d.\n",
+	    expr, file, line);
+	if (!aok)
+		abort();
+	return (0);
+}
+#define assfail __assfail
 #endif
 
+#ifndef HAVE_ASSFAIL3
+extern int aok;
+
+static __inline void
+__assfail3(const char *expr, uintmax_t lv, const char *op, uintmax_t rv,
+    const char *file, int line)
+{
+
+	(void)fprintf(stderr,
+	    "Assertion failed: %s (0x%jx %s 0x%jx), file %s, line %d.\n",
+	    expr, lv, op, rv, file, line);
+	if (!aok)
+		abort();
+}
+#define assfail3 __assfail3
+#endif
+
+#endif	/* !_KERNEL */
+
 #endif	/* _OPENSOLARIS_SYS_DEBUG_H_ */
Index: src/external/cddl/osnet/sys/sys/dirent.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/dirent.h,v
retrieving revision 1.3
diff -u -p -r1.3 dirent.h
--- src/external/cddl/osnet/sys/sys/dirent.h	21 Feb 2010 01:46:35 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/dirent.h	10 Jun 2017 00:30:43 -0000
@@ -25,22 +25,25 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/dirent.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/dirent.h 219089 2011-02-27 19:41:40Z pjd $
  */
 
 #ifndef _OPENSOLARIS_SYS_DIRENT_H_
 #define	_OPENSOLARIS_SYS_DIRENT_H_
 
+#include <sys/types.h>
+
 #include_next <sys/dirent.h>
 
 typedef	struct dirent	dirent64_t;
+typedef ino_t		ino64_t;
+
 #define	dirent64	dirent
-#define	ino64_t		ino_t
 
 #define	d_ino	d_fileno
 
-#define	DIRENT64_RECLEN(len)	((sizeof(struct dirent) -		\
-				 sizeof(((struct dirent *)NULL)->d_name) + \
-				 (len) + 1 + 3) & ~3)
+#define	__DIRENT64_NAMEOFF	__builtin_offsetof(struct dirent, d_name)
+#define	DIRENT64_RECLEN(len)	\
+	roundup2(__DIRENT64_NAMEOFF + (len) + 1, sizeof(ino_t))
 
 #endif	/* !_OPENSOLARIS_SYS_DIRENT_H_ */
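[Editor's note: the reworked DIRENT64_RECLEN above rounds the name offset plus the NUL-terminated name up to sizeof(ino_t). A sketch of the same computation in userland, assuming the kernel's roundup2() has the usual power-of-two definition (my_roundup2 here is an assumed equivalent):

	#include <stdio.h>
	#include <stddef.h>
	#include <sys/types.h>
	#include <dirent.h>

	/* Assumed equivalent of roundup2(); m must be a power of 2. */
	#define my_roundup2(x, m)	(((x) + (m) - 1) & ~((size_t)(m) - 1))

	int
	main(void)
	{
		size_t nameoff = offsetof(struct dirent, d_name);
		size_t len = 7;			/* strlen("example") */

		/* Record length: header + name + NUL, padded to ino_t alignment. */
		printf("reclen = %zu\n",
		    my_roundup2(nameoff + len + 1, sizeof(ino_t)));
		return 0;
	}
]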
Index: src/external/cddl/osnet/sys/sys/dnlc.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/dnlc.h,v
retrieving revision 1.3
diff -u -p -r1.3 dnlc.h
--- src/external/cddl/osnet/sys/sys/dnlc.h	21 Feb 2010 01:46:35 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/dnlc.h	10 Jun 2017 00:14:01 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/dnlc.h,v 1.2 2007/05/24 13:44:45 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/dnlc.h 301997 2016-06-17 17:34:28Z kib $
  */
 
 #ifndef _OPENSOLARIS_SYS_DNLC_H_
Index: src/external/cddl/osnet/sys/sys/elf.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/elf.h,v
retrieving revision 1.5
diff -u -p -r1.5 elf.h
--- src/external/cddl/osnet/sys/sys/elf.h	1 Mar 2010 11:19:40 -0000	1.5
+++ src/external/cddl/osnet/sys/sys/elf.h	10 Jun 2017 00:31:30 -0000
@@ -20,7 +20,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/elf.h,v 1.1 2007/11/28 21:50:40 jb Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/elf.h 177698 2008-03-28 22:16:18Z jb $
  *
  * ELF compatibility definitions for OpenSolaris source.
  *
Index: src/external/cddl/osnet/sys/sys/feature_tests.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/feature_tests.h,v
retrieving revision 1.2
diff -u -p -r1.2 feature_tests.h
--- src/external/cddl/osnet/sys/sys/feature_tests.h	21 Feb 2010 01:46:36 -0000	1.2
+++ src/external/cddl/osnet/sys/sys/feature_tests.h	10 Jun 2017 00:31:58 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/cddl/compat/opensolaris/sys/feature_tests.h,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/feature_tests.h 179204 2008-05-22 09:08:41Z jb $
  *
  */
 
Index: src/external/cddl/osnet/sys/sys/kcondvar.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/kcondvar.h,v
retrieving revision 1.3
diff -u -p -r1.3 kcondvar.h
--- src/external/cddl/osnet/sys/sys/kcondvar.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/kcondvar.h	28 Apr 2017 17:21:59 -0000
@@ -41,8 +41,35 @@ typedef enum {
 	CV_DRIVER
 } kcv_type_t;
 
+typedef enum {
+	TR_CLOCK_TICK,
+} time_res_t;
+
 #define	cv_init(a, b, c, d)	cv_init(a, "zfscv")
 
+static inline clock_t
+cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+    int flag)
+{
+	extern int hz;
+	int ticks = ((uint64_t)tim * hz) / 1000000000;
+
+	return cv_timedwait(cvp, mp, ticks);
+}
+
+static inline clock_t
+cv_reltimedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t delta, time_res_t res)
+{
+
+	cv_wait(cvp, mp);
+	return 0;
+}
+
+#else
+
+extern	clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, hrtime_t,
+    int);
+
 #endif	/* _KERNEL */
 
 #endif	/* _OPENSOLARIS_SYS_KCONDVAR_H_ */
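[Editor's note: the cv_timedwait_hires() shim above converts a nanosecond timeout to clock ticks with integer arithmetic, so timeouts shorter than one tick truncate to zero. The conversion in isolation; hz = 100 is an assumed tick rate:

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		int hz = 100;			/* assumed tick rate */
		int64_t tim = 250000000;	/* 250 ms, in nanoseconds */
		int ticks = (int)(((uint64_t)tim * hz) / 1000000000);

		printf("%d ticks\n", ticks);	/* 25; under 10 ms would give 0 */
		return 0;
	}
]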
Index: src/external/cddl/osnet/sys/sys/kidmap.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/kidmap.h,v
retrieving revision 1.3
diff -u -p -r1.3 kidmap.h
--- src/external/cddl/osnet/sys/sys/kidmap.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/kidmap.h	10 Jun 2017 16:09:35 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD$
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/kidmap.h 185029 2008-11-17 20:49:29Z pjd $
  */
 
 #ifndef _OPENSOLARIS_SYS_KIDMAP_H_
Index: src/external/cddl/osnet/sys/sys/kmem.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/kmem.h,v
retrieving revision 1.6
diff -u -p -r1.6 kmem.h
--- src/external/cddl/osnet/sys/sys/kmem.h	21 Feb 2010 01:46:36 -0000	1.6
+++ src/external/cddl/osnet/sys/sys/kmem.h	16 Jun 2017 17:51:59 -0000
@@ -34,14 +34,16 @@
 #define	_OPENSOLARIS_SYS_KMEM_H_
 
 #include_next <sys/kmem.h>
-#include_next <sys/pool.h>
-#include_next <sys/vmem.h>
+#include <sys/pool.h>
+#include <sys/vmem.h>
 
 typedef void kmem_cache_t;
 
-u_long	kmem_size(void);
-u_long	kmem_used(void);
-void	kmem_reap(void);
+#define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
+#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
+
+uint64_t	kmem_size(void);
+void		kmem_reap(void);
 
 void	*calloc(size_t n, size_t s);
 
@@ -51,10 +53,11 @@ kmem_cache_create(char *name, size_t buf
     void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags)
 {
 	pool_cache_t pc;
+	int flags = bufsize > PAGESIZE ? PR_NOALIGN : 0;
 
 	KASSERT(vmp == NULL);
 
-	pc = pool_cache_init(bufsize, align, 0, 0, name, NULL, IPL_NONE,
+	pc = pool_cache_init(bufsize, align, 0, flags, name, NULL, IPL_NONE,
 	    constructor, destructor, private);
 	if (pc != NULL && reclaim != NULL) {
 		pool_cache_set_drain_hook(pc, (void *)reclaim, private);
@@ -67,7 +70,23 @@ kmem_cache_create(char *name, size_t buf
 #define	kmem_cache_free(cache, buf)		pool_cache_put(cache, buf)
 #define	kmem_cache_reap_now(cache)		pool_cache_invalidate(cache)
 
-#define	KM_PUSHPAGE	0x00	/* XXXNETBSD */
-#define	KMC_NODEBUG	0x00
+#define	KM_PUSHPAGE	0	/* XXXNETBSD */
+#define	KM_NORMALPRI	0
+#define	KM_NODEBUG	0
+
+#define	KMC_NODEBUG	0
+#define	KMC_NOTOUCH	0
+
+#define	heap_arena			kmem_arena
+
+#define	kmem_cache_set_move(cache, movefunc)	do { } while (0)
+
+#define kmem_alloc solaris_kmem_alloc
+#define kmem_zalloc solaris_kmem_zalloc
+#define kmem_free solaris_kmem_free
+
+void *solaris_kmem_alloc(size_t, int);
+void *solaris_kmem_zalloc(size_t, int);
+void solaris_kmem_free(void *, size_t);
 
 #endif	/* _OPENSOLARIS_SYS_KMEM_H_ */
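[Editor's note: POINTER_IS_VALID()/POINTER_INVALIDATE() above implement low-bit pointer tagging: setting bit 0 marks a pointer as unusable without losing the address. A userland sketch with the macros copied from the header; the untagging step before free() is mine, not part of the patch:

	#include <assert.h>
	#include <stdint.h>
	#include <stdlib.h>

	#define POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
	#define POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))

	int
	main(void)
	{
		void *p = malloc(16);	/* malloc results are at least 4-byte aligned */

		assert(POINTER_IS_VALID(p));
		POINTER_INVALIDATE(&p);		/* tagged: must not be dereferenced */
		assert(!POINTER_IS_VALID(p));

		/* Strip the tag bit before handing the pointer back to free(). */
		free((void *)((uintptr_t)p & ~(uintptr_t)0x1));
		return 0;
	}
]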
Index: src/external/cddl/osnet/sys/sys/kobj.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/kobj.h,v
retrieving revision 1.3
diff -u -p -r1.3 kobj.h
--- src/external/cddl/osnet/sys/sys/kobj.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/kobj.h	10 Jun 2017 00:32:32 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/kobj.h,v 1.2 2007/04/08 23:57:08 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/kobj.h 177698 2008-03-28 22:16:18Z jb $
  */
 
 #ifndef _OPENSOLARIS_SYS_KOBJ_H_
@@ -33,6 +33,7 @@
 
 #include <sys/types.h>
 #include <sys/kmem.h>
+#include_next <sys/kobj.h>
 #ifdef AT_UID
 #undef AT_UID
 #endif
Index: src/external/cddl/osnet/sys/sys/kobj_impl.h
===================================================================
RCS file: src/external/cddl/osnet/sys/sys/kobj_impl.h
diff -N src/external/cddl/osnet/sys/sys/kobj_impl.h
--- src/external/cddl/osnet/sys/sys/kobj_impl.h	7 Aug 2009 20:57:57 -0000	1.1
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,30 +0,0 @@
-/*	$NetBSD: kobj_impl.h,v 1.1 2009/08/07 20:57:57 haad Exp $	*/
-
-/*-
- * Copyright (c) 2009 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by Andrew Doran.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
Index: src/external/cddl/osnet/sys/sys/kstat.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/kstat.h,v
retrieving revision 1.3
diff -u -p -r1.3 kstat.h
--- src/external/cddl/osnet/sys/sys/kstat.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/kstat.h	23 Apr 2017 18:02:16 -0000
@@ -54,7 +54,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/kstat.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/kstat.h 315896 2017-03-24 09:37:00Z mav $
  */
 
 #ifndef _OPENSOLARIS_SYS_KSTAT_H_
@@ -66,11 +66,18 @@
 
 #define	KSTAT_FLAG_VIRTUAL	0x01
 
+#define	KSTAT_READ	0
+#define	KSTAT_WRITE	1
+
 typedef struct kstat {
 	void	*ks_data;
 	u_int	 ks_ndata;
+#ifdef _KERNEL
 	struct sysctllog *ks_clog;
 	const struct sysctlnode *ks_node;
+#endif
+	int		(*ks_update)(struct kstat *, int); /* dynamic update */
+	void		*ks_private;	/* arbitrary provider-private data */
 } kstat_t;
 
 typedef struct kstat_named {
@@ -82,6 +89,8 @@ typedef struct kstat_named {
 #define	KSTAT_DATA_INT64	3
 #define	KSTAT_DATA_UINT64	4
 	uchar_t	data_type;
+#define	KSTAT_DESCLEN		128
+	char	desc[KSTAT_DESCLEN];
 	union {
 		uint64_t	ui64;
 	} value;
@@ -91,5 +100,7 @@ kstat_t *kstat_create(char *module, int 
     uchar_t type, ulong_t ndata, uchar_t flags);
 void kstat_install(kstat_t *ksp);
 void kstat_delete(kstat_t *ksp);
+void kstat_set_string(char *, const char *);
+void kstat_named_init(kstat_named_t *, const char *, uchar_t);
 
 #endif	/* _OPENSOLARIS_SYS_KSTAT_H_ */
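
Note on the kstat.h hunk above: the new ks_update/ks_private fields and the
KSTAT_READ/KSTAT_WRITE constants follow the FreeBSD compat kstat, where a
provider may register a callback that refreshes ks_data on demand.  A minimal
sketch of the intended usage, assuming the Solaris-style kstat_create()
argument list (module, instance, name, class, type, ndata, flags); the arc_*
names are illustrative only:

static int
arc_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);	/* statistics are read-only */
	/* refresh ksp->ks_data from the ksp->ks_private cookie */
	return (0);
}

void
arc_kstat_attach(void *cookie)
{
	kstat_t *ksp;

	ksp = kstat_create("zfs", 0, "arcstats", "misc", 0, 0,
	    KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = arc_kstat_update;
		ksp->ks_private = cookie;
		kstat_install(ksp);
	}
}
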
Index: src/external/cddl/osnet/sys/sys/lock.h
===================================================================
RCS file: src/external/cddl/osnet/sys/sys/lock.h
diff -N src/external/cddl/osnet/sys/sys/lock.h
--- src/external/cddl/osnet/sys/sys/lock.h	21 Feb 2010 01:46:36 -0000	1.3
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,48 +0,0 @@
-/*	$NetBSD: lock.h,v 1.3 2010/02/21 01:46:36 darran Exp $	*/
-
-/*-
- * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD: src/sys/compat/opensolaris/sys/lock.h,v 1.1 2007/04/09 01:05:31 pjd Exp $
- */
-
-#ifndef _OPENSOLARIS_SYS_LOCK_H_
-#define	_OPENSOLARIS_SYS_LOCK_H_
-
-#include_next <sys/lock.h>
-
-#ifdef _KERNEL
-
-#define	LO_ALLMASK	(LO_INITIALIZED | LO_WITNESS | LO_QUIET |	\
-			 LO_RECURSABLE | LO_SLEEPABLE | LO_UPGRADABLE |	\
-			 LO_DUPOK | LO_ENROLLPEND | LO_CLASSMASK |	\
-			 LO_NOPROFILE)
-#define	LO_EXPECTED	(LO_INITIALIZED | LO_WITNESS | LO_RECURSABLE |	\
-			 LO_SLEEPABLE | LO_UPGRADABLE | LO_DUPOK |	\
-			 /* sx lock class */(2 << LO_CLASSSHIFT))
-
-#endif	/* defined(_KERNEL) */
-
-#endif	/* _OPENSOLARIS_SYS_LOCK_H_ */
Index: src/external/cddl/osnet/sys/sys/misc.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/misc.h,v
retrieving revision 1.3
diff -u -p -r1.3 misc.h
--- src/external/cddl/osnet/sys/sys/misc.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/misc.h	10 Jun 2017 00:43:44 -0000
@@ -25,12 +25,21 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/misc.h,v 1.2 2007/04/23 00:52:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/misc.h 219089 2011-02-27 19:41:40Z pjd $
  */
 
 #ifndef _OPENSOLARIS_SYS_MISC_H_
 #define	_OPENSOLARIS_SYS_MISC_H_
 
+#include <sys/syslimits.h>
+
+#define	MAXUID	UID_MAX
+
+#define	SPEC_MAXOFFSET_T	LLONG_MAX
+
+#define _ACL_ACLENT_ENABLED     0x1
+#define _ACL_ACE_ENABLED        0x2
+
 #define	_FIOFFS		(INT_MIN)
 #define	_FIOGDIO	(INT_MIN+1)
 #define	_FIOSDIO	(INT_MIN+2)
Index: src/external/cddl/osnet/sys/sys/mkdev.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/mkdev.h,v
retrieving revision 1.1
diff -u -p -r1.1 mkdev.h
--- src/external/cddl/osnet/sys/sys/mkdev.h	7 Aug 2009 20:57:58 -0000	1.1
+++ src/external/cddl/osnet/sys/sys/mkdev.h	18 Apr 2017 13:21:46 -0000
@@ -1,31 +1,45 @@
 /*	$NetBSD: mkdev.h,v 1.1 2009/08/07 20:57:58 haad Exp $	*/
 
-/*-
- * Copyright (c) 2009 The NetBSD Foundation, Inc.
- * All rights reserved.
+/*
+ * CDDL HEADER START
  *
- * This code is derived from software contributed to The NetBSD Foundation
- * by Andrew Doran.
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
  *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ *
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
  */
 
+/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
+/*	  All Rights Reserved  	*/
+
+
+#define	MAXMIN32	0x3fffful	/* SVR4 max minor value */
+
+#ifdef _LP64
+
+#define	MAXMIN64	0xfffffffful	/* max minor value */
+#define	MAXMIN		MAXMIN64
+
+#else /* !_LP64 */
+
+#define	MAXMIN		MAXMIN32
+
+#endif /* !_LP64 */
Index: src/external/cddl/osnet/sys/sys/mman.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/mman.h,v
retrieving revision 1.3
diff -u -p -r1.3 mman.h
--- src/external/cddl/osnet/sys/sys/mman.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/mman.h	10 Jun 2017 01:07:18 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/mman.h,v 1.1 2007/11/28 21:50:40 jb Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/mman.h 177698 2008-03-28 22:16:18Z jb $
  *
  */
 
Index: src/external/cddl/osnet/sys/sys/mntent.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/mntent.h,v
retrieving revision 1.2
diff -u -p -r1.2 mntent.h
--- src/external/cddl/osnet/sys/sys/mntent.h	21 Feb 2010 01:46:36 -0000	1.2
+++ src/external/cddl/osnet/sys/sys/mntent.h	10 Jun 2017 01:07:34 -0000
@@ -21,7 +21,7 @@
  *
  * CDDL HEADER END
  *
- * $FreeBSD: src/sys/cddl/compat/opensolaris/sys/mntent.h,v 1.4.2.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/mntent.h 185029 2008-11-17 20:49:29Z pjd $
  */
 /*
  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
Index: src/external/cddl/osnet/sys/sys/mnttab.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/mnttab.h,v
retrieving revision 1.4
diff -u -p -r1.4 mnttab.h
--- src/external/cddl/osnet/sys/sys/mnttab.h	14 Dec 2010 01:00:26 -0000	1.4
+++ src/external/cddl/osnet/sys/sys/mnttab.h	20 Apr 2017 22:35:54 -0000
@@ -33,6 +33,12 @@
 #ifndef _SYS_MNTTAB_H
 #define	_SYS_MNTTAB_H
 
+#ifndef	_KERNEL
+#include <mnttab.h>
+#endif
+
+#if 0
+
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/types.h>
@@ -102,4 +108,6 @@ extern char	*mntopt();
 }
 #endif
 
+#endif
+
 #endif	/* _SYS_MNTTAB_H */
Index: src/external/cddl/osnet/sys/sys/modctl.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/modctl.h,v
retrieving revision 1.5
diff -u -p -r1.5 modctl.h
--- src/external/cddl/osnet/sys/sys/modctl.h	1 Mar 2010 11:19:40 -0000	1.5
+++ src/external/cddl/osnet/sys/sys/modctl.h	10 Jun 2017 01:07:55 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/modctl.h,v 1.1 2007/11/28 21:50:40 jb Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/modctl.h 177698 2008-03-28 22:16:18Z jb $
  *
  */
 
@@ -35,6 +35,6 @@
 #include <sys/param.h>
 #include <sys/linker.h>
 
-typedef struct module dtrace_modctl_t;
+typedef struct module modctl_t;
 
 #endif /* _COMPAT_OPENSOLARIS_SYS_MODCTL_H */
Index: src/external/cddl/osnet/sys/sys/mount.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/mount.h,v
retrieving revision 1.7
diff -u -p -r1.7 mount.h
--- src/external/cddl/osnet/sys/sys/mount.h	4 Mar 2014 09:24:42 -0000	1.7
+++ src/external/cddl/osnet/sys/sys/mount.h	10 Jun 2017 01:08:47 -0000
@@ -25,15 +25,16 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/mount.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/mount.h 219089 2011-02-27 19:41:40Z pjd $
  */
 
-#include_next <sys/mount.h>
-
 #ifndef _OPENSOLARIS_SYS_MOUNT_H_
 #define	_OPENSOLARIS_SYS_MOUNT_H_
 
+#include_next <sys/mount.h>
+
 #define	MS_OVERLAY	0
+#define MS_RDONLY	MNT_RDONLY
 #define	MS_FORCE	MNT_FORCE
 #define	MS_REMOUNT	MNT_UPDATE
 #define	MS_OPTIONSTR	__MNT_UNUSED1
Index: src/external/cddl/osnet/sys/sys/mutex.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/mutex.h,v
retrieving revision 1.3
diff -u -p -r1.3 mutex.h
--- src/external/cddl/osnet/sys/sys/mutex.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/mutex.h	2 May 2017 00:32:27 -0000
@@ -32,13 +32,13 @@
 #ifndef _OPENSOLARIS_SYS_MUTEX_H_
 #define	_OPENSOLARIS_SYS_MUTEX_H_
 
-#include_next <sys/mutex.h>
-
 #ifdef _KERNEL
 
+#include_next <sys/mutex.h>
+
 #define	MUTEX_HELD(x)		(mutex_owned(x))
 #define	MUTEX_NOT_HELD(x)	(!mutex_owned(x) || panicstr != NULL)
-#define	mutex_init(a, b, c, d)	mutex_init(a, c, IPL_NONE)
+#define	mutex_init(a, b, c, d)	mutex_init(a, MUTEX_DEFAULT, IPL_NONE)
 
 #endif	/* _KERNEL */
 
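
The mutex_init() remapping deserves a note: under the Solaris API the third
argument is a Solaris mutex type, and the old expansion forwarded it verbatim
as NetBSD's type argument, which only happened to work for call sites passing
MUTEX_DEFAULT.  The new expansion pins the type down regardless of what the
Solaris code passes.  A hedged illustration (the z_lock field is invented):

	/* Solaris-style call site, as the ZFS code writes it: */
	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);

	/* old macro expanded to: mutex_init(&zp->z_lock, <3rd arg>, IPL_NONE) */
	/* new macro expands to:  mutex_init(&zp->z_lock, MUTEX_DEFAULT, IPL_NONE) */
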
Index: src/external/cddl/osnet/sys/sys/opentypes.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/opentypes.h,v
retrieving revision 1.1
diff -u -p -r1.1 opentypes.h
--- src/external/cddl/osnet/sys/sys/opentypes.h	1 Feb 2016 02:12:55 -0000	1.1
+++ src/external/cddl/osnet/sys/sys/opentypes.h	17 Apr 2017 02:58:27 -0000
@@ -5,8 +5,8 @@
 #define	FMNAMESZ	8
 
 #if defined(__APPLE__) || defined(HAVE_NBTOOL_CONFIG_H)
-typedef int64_t longlong_t;
-typedef uint64_t u_longlong_t;
+typedef long long longlong_t;
+typedef unsigned long long u_longlong_t;
 typedef unsigned long vsize_t;
 #endif
 
@@ -14,6 +14,7 @@ typedef unsigned int	size32_t;
 typedef unsigned int	caddr32_t;
 
 typedef	struct timespec	timestruc_t;
+typedef	struct timespec	timespec_t;
 typedef unsigned int	uint_t;
 typedef unsigned char	uchar_t;
 typedef unsigned short	ushort_t;
Index: src/external/cddl/osnet/sys/sys/param.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/param.h,v
retrieving revision 1.3
diff -u -p -r1.3 param.h
--- src/external/cddl/osnet/sys/sys/param.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/param.h	10 Jun 2017 01:09:40 -0000
@@ -25,13 +25,16 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/param.h,v 1.1 2007/11/28 21:50:40 jb Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/param.h 185029 2008-11-17 20:49:29Z pjd $
  *
  */
 
+#include <sys/proc.h>
+
 #ifndef _COMPAT_OPENSOLARIS_SYS_PARAM_H_
 #define _COMPAT_OPENSOLARIS_SYS_PARAM_H_
 
+#define __HIDE_DELAY
 #include_next <sys/param.h>
 
 #ifdef _KERNEL
Index: src/external/cddl/osnet/sys/sys/pathname.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/pathname.h,v
retrieving revision 1.3
diff -u -p -r1.3 pathname.h
--- src/external/cddl/osnet/sys/sys/pathname.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/pathname.h	10 Jun 2017 01:09:55 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/cddl/compat/opensolaris/sys/pathname.h,v 1.1 2008/11/17 20:49:29 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/pathname.h 314194 2017-02-24 07:53:56Z avg $
  */
 
 #ifndef _OPENSOLARIS_SYS_PATHNAME_H_
@@ -43,15 +43,11 @@ typedef struct pathname {
 	size_t	pn_bufsize;		/* total size of pn_buf */
 } pathname_t;
 
-#define	pn_alloc(pnp)	panic("pn_alloc() called")
-#define	pn_free(pnp)	panic("pn_free() called")
-
 #define	NO_FOLLOW	NOFOLLOW
 
-int lookupname(char *, enum uio_seg, vnode_t **, vnode_t **);
-int lookupnameat(char *, enum uio_seg, vnode_t **, vnode_t **,
+int lookupname(char *, enum uio_seg, enum symfollow, vnode_t **, vnode_t **);
+int lookupnameat(char *, enum uio_seg, enum symfollow, vnode_t **, vnode_t **,
     vnode_t *);
-int traverse(vnode_t **, int);
 
 #endif	/* _KERNEL */
 
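
lookupname() and lookupnameat() now take an explicit enum symfollow argument,
matching the newer FreeBSD prototypes, with NO_FOLLOW mapped to NetBSD's
NOFOLLOW above.  A hedged call-site sketch (path handling elided, and vrele()
assumed as the release path):

	vnode_t *vp;
	int error;

	/* resolve a kernel-space path without following a trailing symlink */
	error = lookupname(path, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);
	if (error == 0)
		vrele(vp);
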
Index: src/external/cddl/osnet/sys/sys/pcpu.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/pcpu.h,v
retrieving revision 1.3
diff -u -p -r1.3 pcpu.h
--- src/external/cddl/osnet/sys/sys/pcpu.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/pcpu.h	10 Jun 2017 01:10:24 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/pcpu.h,v 1.1 2007/11/28 21:50:40 jb Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/pcpu.h 179202 2008-05-22 08:33:24Z jb $
  *
  */
 
Index: src/external/cddl/osnet/sys/sys/policy.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/policy.h,v
retrieving revision 1.7
diff -u -p -r1.7 policy.h
--- src/external/cddl/osnet/sys/sys/policy.h	19 Oct 2012 19:58:33 -0000	1.7
+++ src/external/cddl/osnet/sys/sys/policy.h	10 Jun 2017 16:09:58 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- $ $FreeBSD: src/sys/compat/opensolaris/sys/policy.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ $ $FreeBSD: head/sys/cddl/compat/opensolaris/sys/policy.h 219089 2011-02-27 19:41:40Z pjd $
  */
 
 #ifndef _OPENSOLARIS_SYS_POLICY_H_
@@ -39,35 +39,43 @@
 
 struct mount;
 struct vattr;
-struct vnode;
 
-int	secpolicy_zfs(kauth_cred_t cred);
-int	secpolicy_sys_config(kauth_cred_t cred, int checkonly);
-int	secpolicy_zinject(kauth_cred_t cred);
-int 	secpolicy_fs_mount(kauth_cred_t cred, struct vnode *mvp,
-	    struct mount *vfsp);
-int	secpolicy_fs_unmount(kauth_cred_t cred, struct mount *vfsp);
-int	secpolicy_basic_link(kauth_cred_t cred);
-int	secpolicy_vnode_stky_modify(kauth_cred_t cred);
-int 	secpolicy_vnode_owner(kauth_cred_t cred, uid_t owner);
-int	secpolicy_vnode_remove(kauth_cred_t cred);
-int	secpolicy_vnode_access(kauth_cred_t cred, struct vnode *vp,
-	    uid_t owner, int mode);
-int 	secpolicy_vnode_chown(kauth_cred_t cred,
-            boolean_t check_self);
-int	secpolicy_vnode_setdac(kauth_cred_t cred, uid_t owner);
-int	secpolicy_vnode_setattr(kauth_cred_t cred, struct vnode *vp,
-	    struct vattr *vap, const struct vattr *ovap, int flags,
-	    int unlocked_access(void *, int, kauth_cred_t), void *node);
-int	secpolicy_vnode_create_gid(kauth_cred_t cred);
-int	secpolicy_vnode_utime_modify(kauth_cred_t cred);
-int	secpolicy_vnode_setids_setgids(kauth_cred_t cred, gid_t gid);
-int	secpolicy_vnode_setid_retain(kauth_cred_t cred, boolean_t issuidroot);
-void	secpolicy_setid_clear(struct vattr *vap, kauth_cred_t cred);
-int	secpolicy_setid_setsticky_clear(struct vnode *vp, struct vattr *vap,
-	    const struct vattr *ovap, kauth_cred_t cred);
-int     secpolicy_xvattr(xvattr_t *xvap, uid_t owner, kauth_cred_t cred,
+typedef int accmode_t;
+
+int	secpolicy_nfs(cred_t *cr);
+int	secpolicy_zfs(cred_t *cr);
+int	secpolicy_sys_config(cred_t *cr, int checkonly);
+int	secpolicy_zinject(cred_t *cr);
+int	secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp);
+int	secpolicy_basic_link(vnode_t *vp, cred_t *cr);
+int	secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner);
+int	secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner);
+int	secpolicy_vnode_stky_modify(cred_t *cr);
+int	secpolicy_vnode_remove(vnode_t *vp, cred_t *cr);
+int	secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner,
+	    accmode_t accmode);
+int	secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner,
+	    accmode_t curmode, accmode_t wantmode);
+int	secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner);
+int	secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner);
+int	secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap,
+	    const struct vattr *ovap, int flags,
+	    int unlocked_access(void *, int, cred_t *), void *node);
+int	secpolicy_vnode_create_gid(cred_t *cr);
+int	secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid);
+int	secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr,
+	    boolean_t issuidroot);
+void	secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr);
+int	secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap,
+	    const struct vattr *ovap, cred_t *cr);
+int	secpolicy_fs_owner(struct mount *vfsp, cred_t *cr);
+int	secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp);
+void	secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp);
+int	secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr,
 	    vtype_t vtype);
+int	secpolicy_smb(cred_t *cr);
+
+int	secpolicy_vnode_utime_modify(cred_t *cr);
 
 #endif	/* _KERNEL */
 
Index: src/external/cddl/osnet/sys/sys/proc.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/proc.h,v
retrieving revision 1.5
diff -u -p -r1.5 proc.h
--- src/external/cddl/osnet/sys/sys/proc.h	13 Mar 2010 22:31:15 -0000	1.5
+++ src/external/cddl/osnet/sys/sys/proc.h	11 Jun 2017 04:07:26 -0000
@@ -54,7 +54,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/proc.h,v 1.4 2007/11/28 21:50:05 jb Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/proc.h 306304 2016-09-24 21:40:14Z markj $
  */
 
 #ifndef _OPENSOLARIS_SYS_PROC_H_
@@ -99,8 +99,6 @@ kthread_t *thread_create(void *, size_t,
 void	thread_exit(void);
 void	thread_join(uint64_t);
 
-int newproc(void (*)(void *), caddr_t, id_t, int, struct contract **, pid_t);
-
 #endif	/* _KERNEL */
 
 #endif	/* _OPENSOLARIS_SYS_PROC_H_ */
Index: src/external/cddl/osnet/sys/sys/procset.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/procset.h,v
retrieving revision 1.1
diff -u -p -r1.1 procset.h
--- src/external/cddl/osnet/sys/sys/procset.h	7 Aug 2009 20:57:58 -0000	1.1
+++ src/external/cddl/osnet/sys/sys/procset.h	20 Apr 2017 14:59:37 -0000
@@ -1,30 +1,133 @@
-/*	$NetBSD: procset.h,v 1.1 2009/08/07 20:57:58 haad Exp $	*/
+/*	$NetBSD$	*/
 
-/*-
- * Copyright (c) 2009 The NetBSD Foundation, Inc.
- * All rights reserved.
+/*
+ * CDDL HEADER START
  *
- * This code is derived from software contributed to The NetBSD Foundation
- * by Andrew Doran.
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
  *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
+/*	  All Rights Reserved  	*/
+
+
+#ifndef _SYS_PROCSET_H
+#define	_SYS_PROCSET_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/feature_tests.h>
+#include <sys/types.h>
+#include <sys/signal.h>
+
+/*
+ *	This file defines the data needed to specify a set of
+ *	processes.  These types are used by the sigsend, sigsendset,
+ *	priocntl, priocntlset, waitid, evexit, and evexitset system
+ *	calls.
+ */
+#define	P_INITPID	1
+#define	P_INITUID	0
+#define	P_INITPGID	0
+
+#include <sys/idtype.h>
+
+/*
+ *	The following defines the operations which can be performed to
+ *	combine two simple sets of processes to form another set of
+ *	processes.
+ */
+#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
+typedef enum idop {
+	POP_DIFF,	/* Set difference.  The processes which	*/
+			/* are in the left operand set and not	*/
+			/* in the right operand set.		*/
+	POP_AND,	/* Set conjunction.  The processes	*/
+			/* which are in both the left and right	*/
+			/* operand sets.			*/
+	POP_OR,		/* Set disjunction.  The processes	*/
+			/* which are in either the left or the	*/
+			/* right operand sets (or both).	*/
+	POP_XOR		/* Set exclusive or.  The processes 	*/
+			/* which are in either the left or	*/
+			/* right operand sets but not in both.	*/
+} idop_t;
+
+
+/*
+ *	The following structure is used to define a set of processes.
+ *	The set is defined in terms of two simple sets of processes
+ *	and an operator which operates on these two operand sets.
  */
+typedef struct procset {
+	idop_t		p_op;	/* The operator connecting the	*/
+				/* following two operands each	*/
+				/* of which is a simple set of	*/
+				/* processes.			*/
+
+	idtype_t	p_lidtype;
+				/* The type of the left operand	*/
+				/* simple set.			*/
+	id_t		p_lid;	/* The id of the left operand.	*/
+
+	idtype_t	p_ridtype;
+				/* The type of the right	*/
+				/* operand simple set.		*/
+	id_t		p_rid;	/* The id of the right operand.	*/
+} procset_t;
+
+/*
+ *	The following macro can be used to initialize a procset_t
+ *	structure.
+ */
+#define	setprocset(psp, op, ltype, lid, rtype, rid) \
+			(psp)->p_op		= (op); \
+			(psp)->p_lidtype	= (ltype); \
+			(psp)->p_lid		= (lid); \
+			(psp)->p_ridtype	= (rtype); \
+			(psp)->p_rid		= (rid);
+
+#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
+
+#ifdef illumos
+#ifdef _KERNEL
+
+struct proc;
+
+extern int dotoprocs(procset_t *, int (*)(), char *);
+extern int dotolwp(procset_t *, int (*)(), char *);
+extern int procinset(struct proc *, procset_t *);
+extern int sigsendproc(struct proc *, sigsend_t *);
+extern int sigsendset(procset_t *, sigsend_t *);
+extern boolean_t cur_inset_only(procset_t *);
+extern id_t getmyid(idtype_t);
+
+#endif	/* _KERNEL */
+#endif
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_PROCSET_H */
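
Note that setprocset() expands to five bare assignment statements with no
do { ... } while (0) wrapper, so it cannot be used as the body of an unbraced
if.  A minimal usage sketch, assuming P_PGID and P_UID arrive via the
<sys/idtype.h> include above:

	procset_t pset;

	/* processes in process group 42 that are not owned by uid 1000 */
	setprocset(&pset, POP_DIFF, P_PGID, 42, P_UID, 1000);
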
Index: src/external/cddl/osnet/sys/sys/racct.h
===================================================================
RCS file: src/external/cddl/osnet/sys/sys/racct.h
diff -N src/external/cddl/osnet/sys/sys/racct.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/external/cddl/osnet/sys/sys/racct.h	5 May 2017 22:37:19 -0000
@@ -0,0 +1 @@
+/* This file intentionally left blank. */
Index: src/external/cddl/osnet/sys/sys/random.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/random.h,v
retrieving revision 1.5
diff -u -p -r1.5 random.h
--- src/external/cddl/osnet/sys/sys/random.h	3 Jan 2013 17:12:05 -0000	1.5
+++ src/external/cddl/osnet/sys/sys/random.h	10 Jun 2017 01:11:42 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/random.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/random.h 305679 2016-09-10 09:37:41Z mav $
  */
 
 #ifndef _OPENSOLARIS_SYS_RANDOM_H_
@@ -33,7 +33,8 @@
 
 #include <sys/cprng.h>
 
-#define	random_get_bytes(p, s)		cprng_fast(p, s)
-#define	random_get_pseudo_bytes(p, s)	cprng_fast(p, s)
+#define	random_get_bytes(p, s)		cprng_fast((p), (s))
+#define	random_get_pseudo_bytes(p, s)	cprng_fast((p), (s))
+#define	read_random(p, s)		cprng_fast((p), (s))
 
 #endif	/* !_OPENSOLARIS_SYS_RANDOM_H_ */
Index: src/external/cddl/osnet/sys/sys/refstr.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/refstr.h,v
retrieving revision 1.2
diff -u -p -r1.2 refstr.h
--- src/external/cddl/osnet/sys/sys/refstr.h	21 Feb 2010 01:46:36 -0000	1.2
+++ src/external/cddl/osnet/sys/sys/refstr.h	10 Jun 2017 01:12:47 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- $ $FreeBSD: src/sys/cddl/compat/opensolaris/sys/refstr.h,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ $ $FreeBSD: head/sys/cddl/compat/opensolaris/sys/refstr.h 185029 2008-11-17 20:49:29Z pjd $
  */
 
 #ifndef _OPENSOLARIS_SYS_REFSTR_H_
Index: src/external/cddl/osnet/sys/sys/rwlock.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/rwlock.h,v
retrieving revision 1.3
diff -u -p -r1.3 rwlock.h
--- src/external/cddl/osnet/sys/sys/rwlock.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/rwlock.h	10 Jun 2017 01:13:59 -0000
@@ -54,7 +54,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/rwlock.h,v 1.4 2007/11/05 18:40:55 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/rwlock.h 253996 2013-08-06 15:51:56Z avg $
  */
 
 #ifndef _OPENSOLARIS_SYS_RWLOCK_H_
Index: src/external/cddl/osnet/sys/sys/sdt.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/sdt.h,v
retrieving revision 1.7
diff -u -p -r1.7 sdt.h
--- src/external/cddl/osnet/sys/sys/sdt.h	2 Oct 2015 17:28:57 -0000	1.7
+++ src/external/cddl/osnet/sys/sys/sdt.h	17 Apr 2017 20:52:10 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/sdt.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/sdt.h 271802 2014-09-18 20:00:36Z smh $
  */
 
 #ifndef _OPENSOLARIS_SYS_SDT_H_
@@ -55,4 +55,15 @@
 #define	DTRACE_PROBE7(name, type1, arg1, type2, arg2, type3, arg3, \
 	type4, arg4, type5, arg5, type6, arg6, type7, arg7)
 
+#ifdef KDTRACE_HOOKS
+SDT_PROBE_DECLARE(sdt, , , set__error);
+
+#define SET_ERROR(err) \
+	((sdt_sdt___set__error->id ? \
+	(*sdt_probe_func)(sdt_sdt___set__error->id, \
+	    (uintptr_t)err, 0, 0, 0, 0) : 0), err)
+#else
+#define SET_ERROR(err) (err)
+#endif
+
 #endif	/* _OPENSOLARIS_SYS_SDT_H_ */
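
SET_ERROR() is the upstream ZFS idiom for returning errno values: with
KDTRACE_HOOKS it fires the sdt set__error probe with the errno as its first
argument before yielding the value (via the comma operator), so the origin of
an error can be traced; without the hooks it collapses to the bare value.
A typical call site, with SPA_MAXBLOCKSIZE used purely as an illustration:

	if (size > SPA_MAXBLOCKSIZE)
		return (SET_ERROR(EINVAL));
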
Index: src/external/cddl/osnet/sys/sys/sema.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/sema.h,v
retrieving revision 1.2
diff -u -p -r1.2 sema.h
--- src/external/cddl/osnet/sys/sys/sema.h	21 Feb 2010 01:46:36 -0000	1.2
+++ src/external/cddl/osnet/sys/sys/sema.h	10 Jun 2017 01:14:27 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/cddl/compat/opensolaris/sys/sema.h,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/sema.h 179203 2008-05-22 08:35:03Z jb $
  *
  */
 
Index: src/external/cddl/osnet/sys/sys/sid.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/sid.h,v
retrieving revision 1.3
diff -u -p -r1.3 sid.h
--- src/external/cddl/osnet/sys/sys/sid.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/sid.h	10 Jun 2017 16:10:55 -0000
@@ -1,35 +1,6 @@
 /*	$NetBSD: sid.h,v 1.3 2010/02/21 01:46:36 darran Exp $	*/
 
 /*-
- * Copyright (c) 2009 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by Andrew Doran.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*-
  * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
@@ -54,14 +25,15 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/cddl/compat/opensolaris/sys/sid.h,v 1.1 2008/11/17 20:49:29 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/sid.h 240162 2012-09-06 13:43:48Z mm $
  */
 
 #ifndef _OPENSOLARIS_SYS_SID_H_
 #define	_OPENSOLARIS_SYS_SID_H_
 
 typedef struct ksiddomain {
-	char	kd_name[16];	/* Domain part of SID */
+	char	*kd_name;	/* Domain part of SID */
+	uint_t	kd_len;
 } ksiddomain_t;
 typedef void	ksid_t;
 
@@ -69,9 +41,13 @@ static __inline ksiddomain_t *
 ksid_lookupdomain(const char *domain)
 {
 	ksiddomain_t *kd;
+	size_t len;
 
+	len = strlen(domain) + 1;
 	kd = kmem_alloc(sizeof(*kd), KM_SLEEP);
-	strlcpy(kd->kd_name, "FreeBSD", sizeof(kd->kd_name));
+	kd->kd_len = (uint_t)len;
+	kd->kd_name = kmem_alloc(len, KM_SLEEP);
+	strcpy(kd->kd_name, domain);
 	return (kd);
 }
 
@@ -79,10 +55,32 @@ static __inline void
 ksiddomain_rele(ksiddomain_t *kd)
 {
 
+	kmem_free(kd->kd_name, kd->kd_len);
 	kmem_free(kd, sizeof(*kd));
 }
 
-#define	kidmap_getuidbysid(a, b, c, d)	panic("XXXNETBSD kidmap_getuidbysid")
-#define	kidmap_getgidbysid(a, b, c, d)	panic("XXXNETBSD kidmap_getgidbysid")
+static __inline uint_t
+ksid_getid(ksid_t *ks)
+{
+
+	panic("%s has been unexpectedly called", __func__);
+}
+
+static __inline const char *
+ksid_getdomain(ksid_t *ks)
+{
+
+	panic("%s has been unexpectedly called", __func__);
+}
+
+static __inline uint_t
+ksid_getrid(ksid_t *ks)
+{
+
+	panic("%s has been unexpectedly called", __func__);
+}
+
+#define	kidmap_getsidbyuid(zone, uid, sid_prefix, rid)	(1)
+#define	kidmap_getsidbygid(zone, gid, sid_prefix, rid)	(1)
 
 #endif	/* _OPENSOLARIS_SYS_SID_H_ */
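
In the sid.h rewrite, kd_name is now heap-allocated and its size recorded in
the new kd_len field, so ksiddomain_rele() can free exactly what
ksid_lookupdomain() allocated; the lookup also preserves the caller's domain
string instead of the old hard-coded "FreeBSD".  A round-trip sketch:

	ksiddomain_t *kd;

	kd = ksid_lookupdomain("BUILTIN");	/* copies "BUILTIN" + NUL, 8 bytes */
	/* ... read kd->kd_name ... */
	ksiddomain_rele(kd);			/* frees kd_len bytes, then the struct */
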
Index: src/external/cddl/osnet/sys/sys/sig.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/sig.h,v
retrieving revision 1.2
diff -u -p -r1.2 sig.h
--- src/external/cddl/osnet/sys/sys/sig.h	21 Feb 2010 01:46:36 -0000	1.2
+++ src/external/cddl/osnet/sys/sys/sig.h	10 Jun 2017 01:14:57 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/cddl/compat/opensolaris/sys/sig.h,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/sig.h 248470 2013-03-18 17:23:58Z jhb $
  */
 
 #ifndef _OPENSOLARIS_SYS_SIG_H_
Index: src/external/cddl/osnet/sys/sys/stat.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/stat.h,v
retrieving revision 1.3
diff -u -p -r1.3 stat.h
--- src/external/cddl/osnet/sys/sys/stat.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/stat.h	10 Jun 2017 01:16:21 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/stat.h,v 1.1 2007/11/28 21:50:40 jb Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/stat.h 219089 2011-02-27 19:41:40Z pjd $
  *
  */
 
Index: src/external/cddl/osnet/sys/sys/string.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/string.h,v
retrieving revision 1.4
diff -u -p -r1.4 string.h
--- src/external/cddl/osnet/sys/sys/string.h	14 Dec 2010 01:01:41 -0000	1.4
+++ src/external/cddl/osnet/sys/sys/string.h	10 Jun 2017 01:16:54 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/string.h,v 1.2 2007/04/10 21:42:12 wkoszek Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/string.h 219089 2011-02-27 19:41:40Z pjd $
  */
 
 #ifndef _OPENSOLARIS_SYS_STRING_H_
Index: src/external/cddl/osnet/sys/sys/sunddi.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/sunddi.h,v
retrieving revision 1.4
diff -u -p -r1.4 sunddi.h
--- src/external/cddl/osnet/sys/sys/sunddi.h	19 May 2010 18:01:26 -0000	1.4
+++ src/external/cddl/osnet/sys/sys/sunddi.h	10 Jun 2017 05:41:17 -0000
@@ -25,12 +25,18 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/sunddi.h,v 1.1 2007/04/23 00:52:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/sunddi.h 277448 2015-01-20 22:27:45Z will $
  */
 
 #ifndef _OPENSOLARIS_SYS_SUNDDI_H_
 #define	_OPENSOLARIS_SYS_SUNDDI_H_
 
+#ifdef _KERNEL
+
+#include <sys/kmem.h>
+#include <lib/libkern/libkern.h>
+#include <sys/sysevent.h>
+
 #ifndef _KERNEL
 #define	ddi_copyin(from, to, size, flag)	(bcopy((from), (to), (size)), 0)
 #define	ddi_copyout(from, to, size, flag)	(bcopy((from), (to), (size)), 0)
@@ -38,6 +44,40 @@
 #define	ddi_copyin(from, to, size, flag)	(ioctl_copyin((flag), (from), (to), (size)))
 #define	ddi_copyout(from, to, size, flag)	(ioctl_copyout((flag), (from), (to), (size)))
 #endif
+int ddi_strtol(const char *str, char **nptr, int base, long *result);
 int ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result);
+int ddi_strtoull(const char *str, char **nptr, int base,
+    unsigned long long *result);
+
+#define	DDI_SUCCESS	(0)
+#define	DDI_FAILURE	(-1)
+#define	DDI_SLEEP	0x666
+
+int ddi_soft_state_init(void **statep, size_t size, size_t nitems);
+void ddi_soft_state_fini(void **statep);
+
+void *ddi_get_soft_state(void *state, int item);
+int ddi_soft_state_zalloc(void *state, int item);
+void ddi_soft_state_free(void *state, int item);
+
+int _ddi_log_sysevent(char *vendor, char *class_name, char *subclass_name,
+    nvlist_t *attr_list, sysevent_id_t *eidp, int flag);
+#define	ddi_log_sysevent(dip, vendor, class_name, subclass_name,	\
+	    attr_list, eidp, flag)					\
+	_ddi_log_sysevent((vendor), (class_name), (subclass_name),	\
+	    (attr_list), (eidp), (flag))
+
+
+#define	DDI_PSEUDO	""
+
+typedef void	*dev_info_t;
+int	ddi_create_minor_node(dev_info_t *, char *, int,
+                              minor_t, char *, int);
+void	ddi_remove_minor_node(dev_info_t *, char *);
+
+#define ddi_name_to_major(name) devsw_name2blk(name, NULL, 0)
+#define	ddi_prop_update_string(a, b, c, d)	DDI_SUCCESS
+
+#endif	/* _KERNEL */
 
 #endif	/* _OPENSOLARIS_SYS_SUNDDI_H_ */
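
The soft-state additions follow the Solaris DDI pattern: initialize a state
anchor once, zalloc/get per item, free per item, and fini at unload.  A
hedged sketch; zfsdev_state and struct zfs_soft are invented names:

	static void *zfsdev_state;

	/* module load */
	(void) ddi_soft_state_init(&zfsdev_state, sizeof (struct zfs_soft), 1);

	/* per-minor setup */
	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS)
		return (ENXIO);
	sc = ddi_get_soft_state(zfsdev_state, minor);

	/* per-minor teardown, then module unload */
	ddi_soft_state_free(zfsdev_state, minor);
	ddi_soft_state_fini(&zfsdev_state);
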
Index: src/external/cddl/osnet/sys/sys/sysevent.h
===================================================================
RCS file: src/external/cddl/osnet/sys/sys/sysevent.h
diff -N src/external/cddl/osnet/sys/sys/sysevent.h
--- src/external/cddl/osnet/sys/sys/sysevent.h	28 Feb 2010 15:07:14 -0000	1.1
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,276 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef	_SYS_SYSEVENT_H
-#define	_SYS_SYSEVENT_H
-
-#include <sys/nvpair.h>
-
-#ifdef	__cplusplus
-extern "C" {
-#endif
-
-#ifndef	NULL
-#if defined(_LP64) && !defined(__cplusplus)
-#define	NULL    0L
-#else
-#define	NULL	0
-#endif
-#endif
-
-/* Internal registration class and subclass */
-#define	EC_ALL		"register_all_classes"
-#define	EC_SUB_ALL	"register_all_subclasses"
-
-/*
- * Event allocation/enqueuing sleep/nosleep flags
- */
-#define	SE_SLEEP		0
-#define	SE_NOSLEEP		1
-
-/* Framework error codes */
-#define	SE_EINVAL		1	/* Invalid argument */
-#define	SE_ENOMEM		2	/* Unable to allocate memory */
-#define	SE_EQSIZE		3	/* Maximum event q size exceeded */
-#define	SE_EFAULT		4	/* Copy fault */
-#define	SE_NOTFOUND		5	/* Attribute not found */
-#define	SE_NO_TRANSPORT		6	/* sysevent transport down */
-
-/* Internal data types */
-
-#define	SE_DATA_TYPE_BYTE	DATA_TYPE_BYTE
-#define	SE_DATA_TYPE_INT16	DATA_TYPE_INT16
-#define	SE_DATA_TYPE_UINT16	DATA_TYPE_UINT16
-#define	SE_DATA_TYPE_INT32	DATA_TYPE_INT32
-#define	SE_DATA_TYPE_UINT32	DATA_TYPE_UINT32
-#define	SE_DATA_TYPE_INT64	DATA_TYPE_INT64
-#define	SE_DATA_TYPE_UINT64	DATA_TYPE_UINT64
-#define	SE_DATA_TYPE_STRING	DATA_TYPE_STRING
-#define	SE_DATA_TYPE_BYTES	DATA_TYPE_BYTE_ARRAY
-#define	SE_DATA_TYPE_TIME	DATA_TYPE_HRTIME
-
-#define	SE_KERN_PID	0
-
-#define	SUNW_VENDOR	"SUNW"
-#define	SE_USR_PUB	"usr:"
-#define	SE_KERN_PUB	"kern:"
-#define	SUNW_KERN_PUB	SUNW_VENDOR":"SE_KERN_PUB
-#define	SUNW_USR_PUB	SUNW_VENDOR":"SE_USR_PUB
-
-/*
- * Event header and attribute value limits
- */
-#define	MAX_ATTR_NAME	1024
-#define	MAX_STRING_SZ	1024
-#define	MAX_BYTE_ARRAY	1024
-
-#define	MAX_CLASS_LEN		64
-#define	MAX_SUBCLASS_LEN	64
-#define	MAX_PUB_LEN		128
-#define	MAX_CHNAME_LEN		128
-#define	MAX_SUBID_LEN		16
-
-/*
- * Limit for the event payload size
- */
-#define	MAX_EV_SIZE_LEN		(SHRT_MAX/4)
-
-/* Opaque sysevent_t data type */
-typedef void *sysevent_t;
-
-/* Opaque channel bind data type */
-typedef void evchan_t;
-
-/* sysevent attribute list */
-typedef nvlist_t sysevent_attr_list_t;
-
-/* sysevent attribute name-value pair */
-typedef nvpair_t sysevent_attr_t;
-
-/* Unique event identifier */
-typedef struct sysevent_id {
-	uint64_t eid_seq;
-	hrtime_t eid_ts;
-} sysevent_id_t;
-
-/* Event attribute value structures */
-typedef struct sysevent_bytes {
-	int32_t	size;
-	uchar_t	*data;
-} sysevent_bytes_t;
-
-typedef struct sysevent_value {
-	int32_t		value_type;		/* data type */
-	union {
-		uchar_t		sv_byte;
-		int16_t		sv_int16;
-		uint16_t	sv_uint16;
-		int32_t		sv_int32;
-		uint32_t	sv_uint32;
-		int64_t		sv_int64;
-		uint64_t	sv_uint64;
-		hrtime_t	sv_time;
-		char		*sv_string;
-		sysevent_bytes_t	sv_bytes;
-	} value;
-} sysevent_value_t;
-
-/*
- * The following flags determine the memory allocation semantics to use for
- * kernel event buffer allocation by userland and kernel versions of
- * sysevent_evc_publish().
- *
- * EVCH_SLEEP and EVCH_NOSLEEP respectively map to KM_SLEEP and KM_NOSLEEP.
- * EVCH_TRYHARD is a kernel-only publish flag that allow event allocation
- * routines to use use alternate kmem caches in situations where free memory
- * may be low.  Kernel callers of sysevent_evc_publish() must set flags to
- * one of EVCH_SLEEP, EVCH_NOSLEEP or EVCH_TRYHARD.  Userland callers of
- * sysevent_evc_publish() must set flags to one of EVCH_SLEEP or EVCH_NOSLEEP.
- *
- * EVCH_QWAIT determines whether or not we should wait for slots in the event
- * queue at publication time.  EVCH_QWAIT may be used by kernel and userland
- * publishers and must be used in conjunction with any of one of EVCH_SLEEP,
- * EVCH_NOSLEEP or EVCH_TRYHARD (kernel-only).
- */
-
-#define	EVCH_NOSLEEP	0x0001	/* No sleep on kmem_alloc() */
-#define	EVCH_SLEEP	0x0002	/* Sleep on kmem_alloc() */
-#define	EVCH_TRYHARD	0x0004	/* May use alternate kmem cache for alloc */
-#define	EVCH_QWAIT	0x0008	/* Wait for slot in event queue */
-
-/*
- * Meaning of flags for subscribe. Bits 8 to 15 are dedicated to
- * the consolidation private interface, so flags defined here are restricted
- * to the LSB.
- *
- * EVCH_SUB_KEEP indicates that this subscription should persist even if
- * this subscriber id should die unexpectedly; matching events will be
- * queued (up to a limit) and will be delivered if/when we restart again
- * with the same subscriber id.
- */
-#define	EVCH_SUB_KEEP		0x01
-
-/*
- * Subscriptions may be wildcarded, but we limit the number of
- * wildcards permitted.
- */
-#define	EVCH_WILDCARD_MAX	10
-
-/*
- * Used in unsubscribe to indicate all subscriber ids for a channel.
- */
-#define	EVCH_ALLSUB		"all_subs"
-
-/*
- * Meaning of flags parameter of channel bind function
- *
- * EVCH_CREAT indicates to create a channel if not already present.
- *
- * EVCH_HOLD_PEND indicates that events should be published to this
- * channel even if there are no matching subscribers present; when
- * a subscriber belatedly binds to the channel and registers their
- * subscriptions they will receive events that predate their bind.
- * If the channel is closed, however, with no remaining bindings then
- * the channel is destroyed.
- *
- * EVCH_HOLD_PEND_INDEF is a stronger version of EVCH_HOLD_PEND -
- * even if the channel has no remaining bindings it will not be
- * destroyed so long as events remain unconsumed.  This is suitable for
- * use with short-lived event producers that may bind to (create) the
- * channel and exit before the intended consumer has started.
- */
-#define	EVCH_CREAT		0x0001
-#define	EVCH_HOLD_PEND		0x0002
-#define	EVCH_HOLD_PEND_INDEF	0x0004
-#define	EVCH_B_FLAGS		0x0007	/* All valid bits */
-
-/*
- * Meaning of commands of evc_control function
- */
-#define	EVCH_GET_CHAN_LEN_MAX	 1	/* Get event queue length limit */
-#define	EVCH_GET_CHAN_LEN	 2	/* Get event queue length */
-#define	EVCH_SET_CHAN_LEN	 3	/* Set event queue length */
-#define	EVCH_CMD_LAST		 EVCH_SET_CHAN_LEN	/* Last command */
-
-/*
- * Shared user/kernel event channel interface definitions
- */
-
-/*int sysevent_evc_bind(const char *, evchan_t **, uint32_t);*/
-/*void sysevent_evc_unbind(evchan_t *);*/
-int sysevent_evc_subscribe(evchan_t *, const char *, const char *,
-int (*)(sysevent_t *, void *), void *, uint32_t);
-extern int sysevent_evc_unsubscribe(evchan_t *, const char *);
-extern int sysevent_evc_publish(evchan_t *, const char *, const char *,
-    const char *, const char *, nvlist_t *, uint32_t);
-/*int sysevent_evc_control(evchan_t *, int, ...);*/
-
-#ifndef	_KERNEL
-
-/*
- * Userland-only event channel interfaces
- */
-
-typedef struct sysevent_subattr sysevent_subattr_t;
-
-extern sysevent_subattr_t *sysevent_subattr_alloc(void);
-extern void sysevent_subattr_free(sysevent_subattr_t *);
-
-extern void sysevent_subattr_thrattr(sysevent_subattr_t *, pthread_attr_t *);
-extern void sysevent_subattr_sigmask(sysevent_subattr_t *, sigset_t *);
-
-extern int sysevent_evc_xsubscribe(evchan_t *, const char *, const char *,
-    int (*)(sysevent_t *, void *), void *, uint32_t, sysevent_subattr_t *);
-
-#else
-
-/*
- * Kernel log_event interfaces.
- */
-extern int log_sysevent(sysevent_t *, int, sysevent_id_t *);
-
-extern sysevent_t *sysevent_alloc(char *, char *, char *, int);
-extern void sysevent_free(sysevent_t *);
-extern int sysevent_add_attr(sysevent_attr_list_t **, char *,
-    sysevent_value_t *, int);
-extern void sysevent_free_attr(sysevent_attr_list_t *);
-extern int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *);
-extern void sysevent_detach_attributes(sysevent_t *);
-extern char *sysevent_get_class_name(sysevent_t *);
-extern char *sysevent_get_subclass_name(sysevent_t *);
-extern uint64_t sysevent_get_seq(sysevent_t *);
-extern void sysevent_get_time(sysevent_t *, hrtime_t *);
-extern size_t sysevent_get_size(sysevent_t *);
-extern char *sysevent_get_pub(sysevent_t *);
-extern int sysevent_get_attr_list(sysevent_t *, nvlist_t **);
-
-#endif	/* _KERNEL */
-
-#ifdef	__cplusplus
-}
-#endif
-
-#endif	/* _SYS_SYSEVENT_H */
Index: src/external/cddl/osnet/sys/sys/sysmacros.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/sysmacros.h,v
retrieving revision 1.7
diff -u -p -r1.7 sysmacros.h
--- src/external/cddl/osnet/sys/sys/sysmacros.h	1 Feb 2016 02:12:55 -0000	1.7
+++ src/external/cddl/osnet/sys/sys/sysmacros.h	22 Apr 2017 17:55:06 -0000
@@ -77,8 +77,6 @@ extern unsigned char bcd_to_byte[256];
 #define	BYTE_TO_BCD(x)	byte_to_bcd[(x) & 0xff]
 #define	BCD_TO_BYTE(x)	bcd_to_byte[(x) & 0xff]
 
-#define MAXMIN        0xfffffffful /* max minor value with 64b dev_t*/
-	
 #endif	/* _KERNEL */
 
 #ifndef __NetBSD__
@@ -400,6 +398,9 @@ extern unsigned char bcd_to_byte[256];
 static __inline int
 highbit(ulong_t i)
 {
+#if defined(__FreeBSD__) && defined(_KERNEL) && defined(HAVE_INLINE_FLSL)
+	return (flsl(i));
+#else
 	register int h = 1;
 
 	if (i == 0)
@@ -425,6 +426,43 @@ highbit(ulong_t i)
 		h += 1;
 	}
 	return (h);
+#endif
+}
+
+/*
+ * Find highest one bit set.
+ *	Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ */
+static __inline int
+highbit64(uint64_t i)
+{
+#if defined(__FreeBSD__) && defined(_KERNEL) && defined(HAVE_INLINE_FLSLL)
+	return (flsll(i));
+#else
+	int h = 1;
+
+	if (i == 0)
+		return (0);
+	if (i & 0xffffffff00000000ULL) {
+		h += 32; i >>= 32;
+	}
+	if (i & 0xffff0000) {
+		h += 16; i >>= 16;
+	}
+	if (i & 0xff00) {
+		h += 8; i >>= 8;
+	}
+	if (i & 0xf0) {
+		h += 4; i >>= 4;
+	}
+	if (i & 0xc) {
+		h += 2; i >>= 2;
+	}
+	if (i & 0x2) {
+		h += 1;
+	}
+	return (h);
+#endif
 }
 
 #ifdef	__cplusplus
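
highbit64() returns the 1-based index of the most significant set bit, or 0
for a zero argument, so highbit64(x) - 1 is floor(log2(x)) for x > 0.  A few
checks of the contract (illustration only):

	assert(highbit64(0) == 0);
	assert(highbit64(1) == 1);			/* bit 0 -> 1 */
	assert(highbit64(0x8000000000000000ULL) == 64);	/* bit 63 -> 64 */
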
Index: src/external/cddl/osnet/sys/sys/systm.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/systm.h,v
retrieving revision 1.4
diff -u -p -r1.4 systm.h
--- src/external/cddl/osnet/sys/sys/systm.h	21 Jun 2013 16:22:46 -0000	1.4
+++ src/external/cddl/osnet/sys/sys/systm.h	10 Jun 2017 05:48:07 -0000
@@ -54,13 +54,16 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/systm.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/systm.h 296510 2016-03-08 17:27:13Z mav $
  */
 
+#include <sys/proc.h>
+
 #ifndef _OPENSOLARIS_SYS_SYSTM_H_
 #define	_OPENSOLARIS_SYS_SYSTM_H_
 
 #ifdef _KERNEL
+#define __HIDE_DELAY
 #include_next <sys/systm.h>
 #endif
 
@@ -73,10 +76,10 @@
 #define	PAGEOFFSET	(PAGESIZE - 1)
 #define	PAGEMASK	(~PAGEOFFSET)
 
-#define	xdelay(x)	kpause("soldelay", false, (x), NULL)
+#define	delay(x)	kpause("soldelay", false, (x), NULL)
 
-#define	xcopyin(u, k, s)	copyin(u, k, s)
-#define	xcopyout(k, u, s)	copyout(k, u, s)
+#define	timeout_generic(type, fn, arg, t, r, f) \
+    panic("timeout_generic() not implemented")
 
 #endif	/* _KERNEL */
 
Index: src/external/cddl/osnet/sys/sys/thread.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/thread.h,v
retrieving revision 1.1
diff -u -p -r1.1 thread.h
--- src/external/cddl/osnet/sys/sys/thread.h	7 Aug 2009 20:57:58 -0000	1.1
+++ src/external/cddl/osnet/sys/sys/thread.h	22 Apr 2017 08:09:40 -0000
@@ -28,4 +28,3 @@
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
-
Index: src/external/cddl/osnet/sys/sys/time.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/time.h,v
retrieving revision 1.7
diff -u -p -r1.7 time.h
--- src/external/cddl/osnet/sys/sys/time.h	28 Apr 2016 11:45:02 -0000	1.7
+++ src/external/cddl/osnet/sys/sys/time.h	10 Jun 2017 05:50:50 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/time.h,v 1.2 2007/11/28 21:44:17 jb Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/time.h 296530 2016-03-08 18:28:24Z mav $
  */
 
 #ifndef _OPENSOLARIS_SYS_TIME_H_
@@ -37,11 +37,26 @@
 #define MILLISEC	1000
 #define MICROSEC	1000000
 #define NANOSEC		1000000000
+#define TIME_MAX	LLONG_MAX
 
-#define	LBOLT	((gethrtime() * hz) / NANOSEC)
+#define	MSEC2NSEC(m)	((hrtime_t)(m) * (NANOSEC / MILLISEC))
+#define	NSEC2MSEC(n)	((n) / (NANOSEC / MILLISEC))
 
-#define TIMESPEC_OVERFLOW(ts)                       \
+#define	NSEC2SEC(n)	((n) / (NANOSEC / SEC))
+#define	SEC2NSEC(m)	((hrtime_t)(m) * (NANOSEC / SEC))
+
+typedef longlong_t	hrtime_t;
+
+#if defined(__i386__) || defined(__powerpc__)
+#define	TIMESPEC_OVERFLOW(ts)						\
 	((ts)->tv_sec < INT32_MIN || (ts)->tv_sec > INT32_MAX)
+#else
+#define	TIMESPEC_OVERFLOW(ts)						\
+	((ts)->tv_sec < INT64_MIN || (ts)->tv_sec > INT64_MAX)
+#endif
+
+#define	SEC_TO_TICK(sec)	((sec) * hz)
+#define	NSEC_TO_TICK(nsec)	((nsec) / (NANOSEC / hz))
 
 #ifdef _KERNEL
 #include <sys/systm.h>
@@ -52,42 +67,41 @@ gethrtime(void) {
 	struct timespec ts;
 	hrtime_t nsec;
 
-#if 1
 	getnanouptime(&ts);
-#else
-	nanouptime(&ts);
-#endif
 	nsec = (hrtime_t)ts.tv_sec * NANOSEC + ts.tv_nsec;
 	return (nsec);
 }
 
 #define	gethrestime_sec()	(time_second)
 #define	gethrestime(ts)		getnanotime(ts)
+#define	gethrtime_waitfree()	gethrtime()
+
+static inline int64_t
+ddi_get_lbolt64(void)
+{
+	struct timespec ts;
+	const int hz = 100;
+
+	getnanouptime(&ts);
+	return (int64_t)(SEC_TO_TICK(ts.tv_sec) + NSEC_TO_TICK(ts.tv_nsec));
+}
+
+#define ddi_get_lbolt()		(clock_t)ddi_get_lbolt64()
 
 #else
 
-#ifdef CLOCK_REALTIME
 int clock_gettime(clockid_t, struct timespec *)
-#ifdef __RENAME
-    __RENAME(__clock_gettime50)
-#endif
-;
-#else
-#include <stdio.h>	/* For NULL */
-#endif
+    __RENAME(__clock_gettime50);
 
 static __inline hrtime_t gethrtime(void) {
-#ifdef CLOCK_REALTIME
 	struct timespec ts;
 	clock_gettime(CLOCK_REALTIME,&ts);
 	return (((u_int64_t) ts.tv_sec) * NANOSEC + ts.tv_nsec);
-#else
-	struct timeval tv;
-	gettimeofday(&tv, NULL);
-	return (((u_int64_t) tv.tv_sec) * MICROSEC + tv.tv_usec) * 1000;
-#endif
 }
 
+#define	ddi_get_lbolt()		(gethrtime() >> 23)
+#define	ddi_get_lbolt64()	(gethrtime() >> 23)
+
 #endif	/* _KERNEL */
 
 #endif	/* !_OPENSOLARIS_SYS_TIME_H_ */
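
The new conversion macros are straight scale factors, and the lbolt emulation
differs by environment: in the kernel, ddi_get_lbolt64() as written scales
the uptime by a local hz of 100, while the userland fallback shifts
gethrtime() right by 23 bits, making one "tick" 2^23 ns (about 8.4 ms, or
roughly 119 ticks per second).  A small illustration:

	hrtime_t ns = MSEC2NSEC(10);	/* 10 ms -> 10000000 ns */
	long ms = NSEC2MSEC(ns);	/* back to 10 */
	clock_t now = ddi_get_lbolt();	/* ticks since boot */
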
Index: src/external/cddl/osnet/sys/sys/types.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/types.h,v
retrieving revision 1.17
diff -u -p -r1.17 types.h
--- src/external/cddl/osnet/sys/sys/types.h	1 Feb 2016 02:12:55 -0000	1.17
+++ src/external/cddl/osnet/sys/sys/types.h	10 Jun 2017 05:54:11 -0000
@@ -54,7 +54,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/types.h,v 1.3 2007/11/28 21:49:16 jb Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/types.h 299459 2016-05-11 16:05:32Z cem $
  */
 
 #ifndef _OPENSOLARIS_SYS_TYPES_H_
@@ -63,6 +63,8 @@
 /*
  * This is a bag of dirty hacks to keep things compiling.
  */
+#define longlong_t __hide_longlong_t
+#define u_longlong_t __hide_u_longlong_t
 #ifndef _KERNEL
 #include <stdint.h>
 #else
@@ -79,6 +81,10 @@
 #undef _NETBSD_SOURCE
 #endif
 #endif
+#undef longlong_t
+#undef u_longlong_t
+typedef long long longlong_t;
+typedef unsigned long long u_longlong_t;
 
 #ifndef _KERNEL
 #include <stdarg.h>
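
The #define dance in this types.h hunk is needed because C forbids redefining
a typedef to a different type, and the native headers may already declare
longlong_t (for example as int64_t, which is plain long on LP64).  Defining
the name as a macro before the native headers are read renames their typedef
out of the way; after the #undef, the Solaris-expected long long definitions
can be introduced.  In outline:

	#define	longlong_t	__hide_longlong_t
	/* ... native headers: any longlong_t typedef now names the hidden alias ... */
	#undef	longlong_t
	typedef long long longlong_t;	/* no conflicting typedef remains */
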
Index: src/external/cddl/osnet/sys/sys/uio.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/uio.h,v
retrieving revision 1.9
diff -u -p -r1.9 uio.h
--- src/external/cddl/osnet/sys/sys/uio.h	26 Sep 2015 03:32:17 -0000	1.9
+++ src/external/cddl/osnet/sys/sys/uio.h	10 Jun 2017 05:54:27 -0000
@@ -54,15 +54,14 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/uio.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/uio.h 219089 2011-02-27 19:41:40Z pjd $
  */
 
 #ifndef _OPENSOLARIS_SYS_UIO_H_
 #define	_OPENSOLARIS_SYS_UIO_H_
 
 #include_next <sys/uio.h>
-#include <sys/sysmacros.h>
-
+#include <sys/debug.h>
 
 #ifndef _KERNEL
 #include <assert.h>
Index: src/external/cddl/osnet/sys/sys/unistd.h
===================================================================
RCS file: src/external/cddl/osnet/sys/sys/unistd.h
diff -N src/external/cddl/osnet/sys/sys/unistd.h
--- src/external/cddl/osnet/sys/sys/unistd.h	1 Mar 2010 00:41:35 -0000	1.1
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,17 +0,0 @@
-
-#include_next <sys/unistd.h>
-
-#define _PC_ACL_ENABLED         20
-#define _PC_SATTR_ENABLED       23
-#define _PC_SATTR_EXISTS        24
-#define _PC_ACCESS_FILTERING    25
-/* UNIX 08 names */
-#define _PC_TIMESTAMP_RESOLUTION 26
-
-
-/*
- * The value of 0 is returned when
- * ACL's are not supported
- */
-#define _ACL_ACLENT_ENABLED     0x1
-#define _ACL_ACE_ENABLED        0x2
Index: src/external/cddl/osnet/sys/sys/varargs.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/varargs.h,v
retrieving revision 1.4
diff -u -p -r1.4 varargs.h
--- src/external/cddl/osnet/sys/sys/varargs.h	17 Jul 2011 20:54:33 -0000	1.4
+++ src/external/cddl/osnet/sys/sys/varargs.h	10 Jun 2017 05:56:56 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/varargs.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/varargs.h 177698 2008-03-28 22:16:18Z jb $
  */
 
 #ifndef _OPENSOLARIS_SYS_VARARGS_H_
Index: src/external/cddl/osnet/sys/sys/vfs.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/vfs.h,v
retrieving revision 1.4
diff -u -p -r1.4 vfs.h
--- src/external/cddl/osnet/sys/sys/vfs.h	28 Feb 2010 14:45:47 -0000	1.4
+++ src/external/cddl/osnet/sys/sys/vfs.h	26 Apr 2017 13:07:48 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/vfs.h,v 1.2 2007/06/04 11:31:45 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/vfs.h 297513 2016-04-02 16:25:46Z avg $
  */
 
 #ifndef _OPENSOLARIS_SYS_VFS_H_
@@ -45,29 +45,17 @@ typedef	struct mount	vfs_t;
 #define	vfs_flag	mnt_flag
 #define	vfs_data	mnt_data
 #define	vfs_count	mnt_refcnt
-#define	vfs_fsid 	mnt_stat.f_fsid
 #define	vfs_bsize	mnt_stat.f_bsize
+#define	vfs_resource	mnt_stat.f_mntfromname
 
 #define	v_flag		v_vflag
 #define	v_vfsp		v_mount
 
-#define f_basetype      f_fstypename
-
-#define vfs_setfsops(a, b, c)    vfs_attach(&b)
-#define vfs_freevfsops_by_type(zfsfstype) do { } while (0)
-
 #define	VFS_RDONLY	MNT_RDONLY
 #define	VFS_NOSETUID	MNT_NOSUID
 #define	VFS_NOEXEC	MNT_NOEXEC
-#define VFS_NODEVICES   MNT_NODEV
-#define VFS_NOTRUNC     0
 
-#define	VFS_HOLD(vfsp)	do {						\
-	/* XXXNETBSD nothing */						\
-} while (0)
-#define	VFS_RELE(vfsp)	do {						\
-	/* XXXNETBSD nothing */						\
-} while (0)
+#define fs_vscan(vp, cr, async) (0)
 
 #define	VROOT		VV_ROOT
 
@@ -101,9 +89,6 @@ typedef struct mntopt {
 #define	VFS_CREATEOPT	0x08		/* Create the opt if it's not there */
 #define VFS_UNMOUNTED   0x100           /* file system has been unmounted */
 
-#define MS_SYSSPACE     0x0008          /* Mounta already in kernel space */
-#define MS_DATA         0
-
 /*
  * Structure holding mount option strings for the mounted file system.
  */
@@ -133,21 +118,38 @@ struct mounta {
 };
 
 #define vfs_devismounted(dev) 0
-#define fs_vscan(vp, cr, async) (0)
 
 void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
     int flags __unused);
 void vfs_clearmntopt(vfs_t *vfsp, const char *name);
 int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp);
-int traverse(vnode_t **cvpp, int lktype);
-int domount(kthread_t *td, vnode_t *vp, const char *fstype, char *fspath,
-    char *fspec, int fsflags);
+int mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype,
+    char *fspath, char *fspec, int fsflags);
+
+typedef	uint64_t	vfs_feature_t;
 
-#define vfs_set_feature(vfsp, feature)  do { } while (0)
-#define vfs_has_feature(vfsp, feature)  (0)
+#define	VFSFT_XVATTR		0x100000001	/* Supports xvattr for attrs */
+#define	VFSFT_CASEINSENSITIVE	0x100000002	/* Supports case-insensitive */
+#define	VFSFT_NOCASESENSITIVE	0x100000004	/* NOT case-sensitive */
+#define	VFSFT_DIRENTFLAGS	0x100000008	/* Supports dirent flags */
+#define	VFSFT_ACLONCREATE	0x100000010	/* Supports ACL on create */
+#define	VFSFT_ACEMASKONACCESS	0x100000020	/* Can use ACEMASK for access */
+#define	VFSFT_SYSATTR_VIEWS	0x100000040	/* Supports sysattr view i/f */
+#define	VFSFT_ACCESS_FILTER	0x100000080	/* dirents filtered by access */
+#define	VFSFT_REPARSE		0x100000100	/* Supports reparse point */
+#define	VFSFT_ZEROCOPY_SUPPORTED	0x100000200
+				/* Supports loaning/returning cache buffer */
+
+#define	vfs_set_feature(vfsp, feature)		do { } while (0)
+#define	vfs_clear_feature(vfsp, feature)	do { } while (0)
+#define	vfs_has_feature(vfsp, feature)		(0)
 
-int zfs_vfsinit(int fstype, char *name);
-int zfs_vfsfini(void);
+#define	VFS_HOLD(vfsp)	do {						\
+	/* XXXNETBSD nothing */						\
+} while (0)
+#define	VFS_RELE(vfsp)	do {						\
+	/* XXXNETBSD nothing */						\
+} while (0)
 
 #endif	/* _KERNEL */
 
Index: src/external/cddl/osnet/sys/sys/vm.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/vm.h,v
retrieving revision 1.1
diff -u -p -r1.1 vm.h
--- src/external/cddl/osnet/sys/sys/vm.h	7 Aug 2009 20:57:58 -0000	1.1
+++ src/external/cddl/osnet/sys/sys/vm.h	2 May 2017 16:57:33 -0000
@@ -1,5 +1,6 @@
 #ifndef _SYS_VFS_VM_H
 #define _SYS_VFS_VM_H
 
+#include <uvm/uvm.h>
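+/*
+ * ZFS's paging glue presumably needs the full uvm(9) interface,
+ * not just uvm_extern.h.
+ */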
 
 #endif
Index: src/external/cddl/osnet/sys/sys/vnode.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/vnode.h,v
retrieving revision 1.13
diff -u -p -r1.13 vnode.h
--- src/external/cddl/osnet/sys/sys/vnode.h	9 Apr 2015 05:32:53 -0000	1.13
+++ src/external/cddl/osnet/sys/sys/vnode.h	28 Jun 2017 00:41:28 -0000
@@ -22,8 +22,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
@@ -93,31 +92,43 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/vnode.h,v 1.3 2007/05/31 11:51:49 kib Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/vnode.h 308691 2016-11-15 18:22:50Z alc $
  */
 
-#include <sys/mount.h>
-#include_next <sys/vnode.h>
-
 #ifndef _OPENSOLARIS_SYS_VNODE_H_
 #define	_OPENSOLARIS_SYS_VNODE_H_
 
+#ifdef _KERNEL
+
+struct vnode;
+struct vattr;
+
+typedef	struct vnode	vnode_t;
+typedef	struct vattr	vattr_t;
+typedef enum vtype vtype_t;
+
+#include <sys/namei.h>
+enum symfollow { NO_FOLLOW = NOFOLLOW };
+
+/*
+ * We'll use the two-parameter FreeBSD VOP_UNLOCK() in ZFS code.
+ */
+
+#define VOP_UNLOCK __hide_VOP_UNLOCK
+#include_next <sys/vnode.h>
+/* verify that the real definition is what we expect */
+int __hide_VOP_UNLOCK(struct vnode *);
+#undef VOP_UNLOCK
+
+int VOP_UNLOCK(struct vnode *, int);
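+/*
+ * The two-argument wrapper is presumably defined elsewhere in osnet
+ * and simply discards the flags argument.
+ */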
+
 #include <sys/cred.h>
 #include <sys/fcntl.h>
-#include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/buf.h>
-#include <sys/debug.h>
 
-
-#ifdef _KERNEL
 #include <sys/vfs_syscalls.h>
-#endif
-
-typedef	struct vattr	vattr_t;
-typedef	enum vtype	vtype_t;
-typedef	void		caller_context_t;
 
 typedef int (**vnodeops_t)(void *);
 
@@ -133,196 +144,52 @@ struct vop_vptofh_args {
 	struct fid *a_fid;
 };
 
-/*
- * Structure used on VOP_GETSECATTR and VOP_SETSECATTR operations
- */
-
-/*
- * Flags to VOP_SETATTR/VOP_GETATTR.
- */
-#define ATTR_UTIME  0x01    /* non-default utime(2) request */
-#define ATTR_EXEC   0x02    /* invocation from exec(2) */
-#define ATTR_COMM   0x04    /* yield common vp attributes */
-#define ATTR_HINT   0x08    /* information returned will be `hint' */
-#define ATTR_REAL   0x10    /* yield attributes of the real vp */
-#define ATTR_NOACLCHECK 0x20    /* Don't check ACL when checking permissions */
-#define ATTR_TRIGGER    0x40    /* Mount first if vnode is a trigger mount */
-
-typedef struct vsecattr {
-	uint_t		vsa_mask;	/* See below */
-	int		vsa_aclcnt;	/* ACL entry count */
-	void		*vsa_aclentp;	/* pointer to ACL entries */
-	int		vsa_dfaclcnt;	/* default ACL entry count */
-	void		*vsa_dfaclentp;	/* pointer to default ACL entries */
-	size_t		vsa_aclentsz;	/* ACE size in bytes of vsa_aclentp */
-	uint_t		vsa_aclflags;	/* ACE ACL flags */
-} vsecattr_t;
-
-#define V_XATTRDIR      0x0000  /* attribute unnamed directory */
 #define IS_XATTRDIR(vp)	(0)
 
-#define	AV_SCANSTAMP_SZ	32		/* length of anti-virus scanstamp */
-
-/*
- * Structure of all optional attributes.
- */
-typedef struct xoptattr {
-	timestruc_t	xoa_createtime;	/* Create time of file */
-	uint8_t		xoa_archive;
-	uint8_t		xoa_system;
-	uint8_t		xoa_readonly;
-	uint8_t		xoa_hidden;
-	uint8_t		xoa_nounlink;
-	uint8_t		xoa_immutable;
-	uint8_t		xoa_appendonly;
-	uint8_t		xoa_nodump;
-	uint8_t		xoa_opaque;
-	uint8_t		xoa_av_quarantined;
-	uint8_t		xoa_av_modified;
-	uint8_t		xoa_av_scanstamp[AV_SCANSTAMP_SZ];
-	uint8_t 	xoa_reparse;
-} xoptattr_t;
-
-
-/*
- * The xvattr structure is really a variable length structure that
- * is made up of:
- * - The classic vattr_t (xva_vattr)
- * - a 32 bit quantity (xva_mapsize) that specifies the size of the
- *   attribute bitmaps in 32 bit words.
- * - A pointer to the returned attribute bitmap (needed because the
- *   previous element, the requested attribute bitmap) is variable lenth.
- * - The requested attribute bitmap, which is an array of 32 bit words.
- *   Callers use the XVA_SET_REQ() macro to set the bits corresponding to
- *   the attributes that are being requested.
- * - The returned attribute bitmap, which is an array of 32 bit words.
- *   File systems that support optional attributes use the XVA_SET_RTN()
- *   macro to set the bits corresponding to the attributes that are being
- *   returned.
- * - The xoptattr_t structure which contains the attribute values
- *
- * xva_mapsize determines how many words in the attribute bitmaps.
- * Immediately following the attribute bitmaps is the xoptattr_t.
- * xva_getxoptattr() is used to get the pointer to the xoptattr_t
- * section.
- */
-
-#define	XVA_MAPSIZE	3		/* Size of attr bitmaps */
-#define	XVA_MAGIC	0x78766174	/* Magic # for verification */
-
-/*
- * The xvattr structure is an extensible structure which permits optional
- * attributes to be requested/returned.  File systems may or may not support
- * optional attributes.  They do so at their own discretion but if they do
- * support optional attributes, they must register the VFSFT_XVATTR feature
- * so that the optional attributes can be set/retrived.
- *
- * The fields of the xvattr structure are:
- *
- * xva_vattr - The first element of an xvattr is a legacy vattr structure
- * which includes the common attributes.  If AT_XVATTR is set in the va_mask
- * then the entire structure is treated as an xvattr.  If AT_XVATTR is not
- * set, then only the xva_vattr structure can be used.
- *
- * xva_magic - 0x78766174 (hex for "xvat"). Magic number for verification.
- *
- * xva_mapsize - Size of requested and returned attribute bitmaps.
- *
- * xva_rtnattrmapp - Pointer to xva_rtnattrmap[].  We need this since the
- * size of the array before it, xva_reqattrmap[], could change which means
- * the location of xva_rtnattrmap[] could change.  This will allow unbundled
- * file systems to find the location of xva_rtnattrmap[] when the sizes change.
- *
- * xva_reqattrmap[] - Array of requested attributes.  Attributes are
- * represented by a specific bit in a specific element of the attribute
- * map array.  Callers set the bits corresponding to the attributes
- * that the caller wants to get/set.
- *
- * xva_rtnattrmap[] - Array of attributes that the file system was able to
- * process.  Not all file systems support all optional attributes.  This map
- * informs the caller which attributes the underlying file system was able
- * to set/get.  (Same structure as the requested attributes array in terms
- * of each attribute  corresponding to specific bits and array elements.)
- *
- * xva_xoptattrs - Structure containing values of optional attributes.
- * These values are only valid if the corresponding bits in xva_reqattrmap
- * are set and the underlying file system supports those attributes.
- */
-typedef struct xvattr {
-	vattr_t		xva_vattr;	/* Embedded vattr structure */
-	uint32_t	xva_magic;	/* Magic Number */
-	uint32_t	xva_mapsize;	/* Size of attr bitmap (32-bit words) */
-	uint32_t	*xva_rtnattrmapp;	/* Ptr to xva_rtnattrmap[] */
-	uint32_t	xva_reqattrmap[XVA_MAPSIZE];	/* Requested attrs */
-	uint32_t	xva_rtnattrmap[XVA_MAPSIZE];	/* Returned attrs */
-	xoptattr_t	xva_xoptattrs;	/* Optional attributes */
-} xvattr_t;
-
-/* vsa_mask values */
-#define	VSA_ACL			0x0001
-#define	VSA_ACLCNT		0x0002
-#define	VSA_DFACL		0x0004
-#define	VSA_DFACLCNT		0x0008
-#define	VSA_ACE			0x0010
-#define	VSA_ACECNT		0x0020
-#define	VSA_ACE_ALLTYPES	0x0040
-#define	VSA_ACE_ACLFLAGS	0x0080	/* get/set ACE ACL flags */
-
 #define v_lock v_interlock
 
-/*
- * vnode flags.
- */
-#define VROOT		VV_ROOT/* root of its file system */
-#define VNOCACHE	0x00/* don't keep cache pages on vnode */
-#define VNOMAP		VV_MAPPED/* file cannot be mapped/faulted */
-#define VDUP		0x00/* file should be dup'ed rather then opened */
-#define VNOSWAP		0x00/* file cannot be used as virtual swap device */
-#define VNOMOUNT	0x00/* file cannot be covered by mount */
-#define VISSWAP		0x00/* vnode is being used for swap */
-#define VSWAPLIKE	0x00/* vnode acts like swap (but may not be) */
+#define SAVENAME 0
 
 int	vn_is_readonly(vnode_t *);
 
-#define	vn_free(vp)		vrele((vp))
-#define	vn_setops(vp, ops)	(0)
 #define	vn_vfswlock(vp)		(0)
 #define	vn_vfsunlock(vp)	do { } while (0)
 #define	vn_ismntpt(vp)		((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL)
 #define	vn_mountedvfs(vp)	((vp)->v_mountedhere)
-#define	vn_has_cached_data(vp)	((vp)->v_uobj.uo_npages > 0)
+#define vn_has_cached_data(vp)	((vp)->v_uobj.uo_npages != 0)
+#define	vn_exists(vp)		do { } while (0)
+#define	vn_invalid(vp)		do { } while (0)
+#define	vn_free(vp)		do { } while (0)
 #define vn_renamepath(tdvp, svp, tnm, lentnm)   do { } while (0)
+#define	vn_matchops(vp, vops)	((vp)->v_op == &(vops))
 
 #define	VN_HOLD(v)	vref(v)
-#define	VN_RELE(v)	do { \
-	    if ((v)->v_usecount == 0) \
-		    printf("%s, %d: %p unused\n", __FILE__, __LINE__, v); \
-	    else \
-		    vrele(v); \
-    } while (/*CONSTCOND*/0)
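+/*
+ * Releasing a vnode whose v_usecount is already 0 indicates a missing
+ * VN_HOLD(); panic loudly instead of silently corrupting the count.
+ */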
+#define	VN_RELE(v)						      \
+do {								      \
+	if ((v)->v_usecount == 0) {				      \
+		printf("VN_RELE(%s,%d): %p unused\n", __FILE__, __LINE__, v); \
+		vprint("VN_RELE", (v));				      \
+		panic("VN_RELE");				      \
+	} else {						      \
+		vrele(v);					      \
+	}							      \
+} while (/*CONSTCOND*/0)
 #define	VN_URELE(v)	vput(v)
-#define	VN_SET_VFS_TYPE_DEV(vp, vfs, type, flag)	(0)
-
-#define VI_LOCK(vp)     mutex_enter((vp)->v_interlock)
-#define VI_UNLOCK(vp)   mutex_exit((vp)->v_interlock)
-
-#define	VOP_REALVP(vp, vpp, ct)	(*(vpp) = (vp), 0)
+#undef VN_RELE_ASYNC
+#define VN_RELE_ASYNC(vp, taskq) 	vrele_async((vp))
+#define VN_RELE_CLEANER(vp, taskq)	/* nothing */
 
-#define vnevent_remove(vp, dvp, name, ct) do { } while (0)
-#define vnevent_rmdir(vp, dvp, name, ct) do { } while (0)
-#define vnevent_rename_src(vp, dvp, name, ct) do { } while (0)
+#define	vnevent_create(vp, ct)			do { } while (0)
+#define	vnevent_link(vp, ct)			do { } while (0)
+#define	vnevent_remove(vp, dvp, name, ct)	do { } while (0)
+#define	vnevent_rmdir(vp, dvp, name, ct)	do { } while (0)
+#define	vnevent_rename_src(vp, dvp, name, ct)	do { } while (0)
 #define	vnevent_rename_dest(vp, dvp, name, ct)	do { } while (0)
-#define vnevent_rename_dest_dir(vp, ct)     do { } while (0)
-#define vnevent_create(vp, ct)  do { } while (0)
-#define vnevent_link(vp, ct)            do { } while (0)
-
-#define	IS_DEVVP(vp)	\
-	((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO)
-
-#define	MODEMASK	ALLPERMS
+#define	vnevent_rename_dest_dir(vp, ct)		do { } while (0)
 
 #define	specvp(vp, rdev, type, cr)	(VN_HOLD(vp), (vp))
 #define	MANDMODE(mode)	(0)
+#define	MANDLOCK(vp, mode)	(0)
 #define	chklock(vp, op, offset, size, mode, ct)	(0)
 #define	cleanlocks(vp, pid, foo)	do { } while (0)
 #define	cleanshares(vp, pid)		do { } while (0)
@@ -341,169 +208,11 @@ int	vn_is_readonly(vnode_t *);
 
 #define	EXCL		0
 
-#define	AT_TYPE		0x0001
-#define	AT_MODE		0x0002
-#define	AT_UID		0x0004
-#define	AT_GID		0x0008
-#define	AT_FSID		0x0010
-#define	AT_NODEID	0x0020
-#define	AT_NLINK	0x0040
-#define	AT_SIZE		0x0080
-#define	AT_ATIME	0x0100
-#define	AT_MTIME	0x0200
-#define	AT_CTIME	0x0400
-#define	AT_RDEV		0x0800
-#define	AT_BLKSIZE	0x1000
-#define	AT_NBLOCKS	0x2000
-#define	AT_SEQ		0x4000
-#define	AT_NOSET	(AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\
-			 AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
-
 #define	ACCESSED		(AT_ATIME)
 #define	STATE_CHANGED		(AT_CTIME)
 #define	CONTENT_MODIFIED	(AT_MTIME | AT_CTIME)
 
-/*
- * If AT_XVATTR is set then there are additional bits to process in
- * the xvattr_t's attribute bitmap.  If this is not set then the bitmap
- * MUST be ignored.  Note that this bit must be set/cleared explicitly.
- * That is, setting AT_ALL will NOT set AT_XVATTR.
- */
-#define	AT_XVATTR	0x10000
-
-#define	AT_ALL		(AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\
-			AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|\
-			AT_RDEV|AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
-
-#define	AT_STAT		(AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\
-			AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|AT_TYPE)
-
-#define	AT_TIMES	(AT_ATIME|AT_MTIME|AT_CTIME)
-
-#define	AT_NOSET	(AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\
-			AT_BLKSIZE|AT_NBLOCKS|AT_SEQ)
-
-/*
- * Attribute bits used in the extensible attribute's (xva's) attribute
- * bitmaps.  Note that the bitmaps are made up of a variable length number
- * of 32-bit words.  The convention is to use XAT{n}_{attrname} where "n"
- * is the element in the bitmap (starting at 1).  This convention is for
- * the convenience of the maintainer to keep track of which element each
- * attribute belongs to.
- *
- * NOTE THAT CONSUMERS MUST *NOT* USE THE XATn_* DEFINES DIRECTLY.  CONSUMERS
- * MUST USE THE XAT_* DEFINES.
- */
-#define	XAT0_INDEX	0LL		/* Index into bitmap for XAT0 attrs */
-#define	XAT0_CREATETIME	0x00000001	/* Create time of file */
-#define	XAT0_ARCHIVE	0x00000002	/* Archive */
-#define	XAT0_SYSTEM	0x00000004	/* System */
-#define	XAT0_READONLY	0x00000008	/* Readonly */
-#define	XAT0_HIDDEN	0x00000010	/* Hidden */
-#define	XAT0_NOUNLINK	0x00000020	/* Nounlink */
-#define	XAT0_IMMUTABLE	0x00000040	/* immutable */
-#define	XAT0_APPENDONLY	0x00000080	/* appendonly */
-#define	XAT0_NODUMP	0x00000100	/* nodump */
-#define	XAT0_OPAQUE	0x00000200	/* opaque */
-#define	XAT0_AV_QUARANTINED	0x00000400	/* anti-virus quarantine */
-#define	XAT0_AV_MODIFIED	0x00000800	/* anti-virus modified */
-#define	XAT0_AV_SCANSTAMP	0x00001000	/* anti-virus scanstamp */
-#define XAT0_REPARSE 	0x00002000 	/* FS reparse point */
-
-#define	XAT0_ALL_ATTRS	(XAT0_CREATETIME|XAT0_ARCHIVE|XAT0_SYSTEM| \
-    XAT0_READONLY|XAT0_HIDDEN|XAT0_NOUNLINK|XAT0_IMMUTABLE|XAT0_APPENDONLY| \
-    XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED| \
-    XAT0_AV_MODIFIED|XAT0_AV_SCANSTAMP)
-
-/* Support for XAT_* optional attributes */
-#define	XVA_MASK		0xffffffff	/* Used to mask off 32 bits */
-#define	XVA_SHFT		32		/* Used to shift index */
-
-/*
- * Used to pry out the index and attribute bits from the XAT_* attributes
- * defined below.  Note that we're masking things down to 32 bits then
- * casting to uint32_t.
- */
-#define	XVA_INDEX(attr)		((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK))
-#define	XVA_ATTRBIT(attr)	((uint32_t)((attr) & XVA_MASK))
-
-/*
- * The following defines present a "flat namespace" so that consumers don't
- * need to keep track of which element belongs to which bitmap entry.
- *
- * NOTE THAT THESE MUST NEVER BE OR-ed TOGETHER
- */
-#define	XAT_CREATETIME		((XAT0_INDEX << XVA_SHFT) | XAT0_CREATETIME)
-#define	XAT_ARCHIVE		((XAT0_INDEX << XVA_SHFT) | XAT0_ARCHIVE)
-#define	XAT_SYSTEM		((XAT0_INDEX << XVA_SHFT) | XAT0_SYSTEM)
-#define	XAT_READONLY		((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY)
-#define	XAT_HIDDEN		((XAT0_INDEX << XVA_SHFT) | XAT0_HIDDEN)
-#define	XAT_NOUNLINK		((XAT0_INDEX << XVA_SHFT) | XAT0_NOUNLINK)
-#define	XAT_IMMUTABLE		((XAT0_INDEX << XVA_SHFT) | XAT0_IMMUTABLE)
-#define	XAT_APPENDONLY		((XAT0_INDEX << XVA_SHFT) | XAT0_APPENDONLY)
-#define	XAT_NODUMP		((XAT0_INDEX << XVA_SHFT) | XAT0_NODUMP)
-#define	XAT_OPAQUE		((XAT0_INDEX << XVA_SHFT) | XAT0_OPAQUE)
-#define	XAT_AV_QUARANTINED	((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED)
-#define	XAT_AV_MODIFIED		((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED)
-#define	XAT_AV_SCANSTAMP	((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP)
-#define XAT_REPARSE 		((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE)
-
-/*
- * The returned attribute map array (xva_rtnattrmap[]) is located past the
- * requested attribute map array (xva_reqattrmap[]).  Its location changes
- * when the array sizes change.  We use a separate pointer in a known location
- * (xva_rtnattrmapp) to hold the location of xva_rtnattrmap[].  This is
- * set in xva_init()
- */
-#define	XVA_RTNATTRMAP(xvap)	((xvap)->xva_rtnattrmapp)
-
-/*
- * XVA_SET_REQ() sets an attribute bit in the proper element in the bitmap
- * of requested attributes (xva_reqattrmap[]).
- */
-#define	XVA_SET_REQ(xvap, attr)					\
-	ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR);		\
-	ASSERT((xvap)->xva_magic == XVA_MAGIC);			\
-	(xvap)->xva_reqattrmap[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr)
-
-/*
- * XVA_CLR_REQ() clears an attribute bit in the proper element in the bitmap
- * of requested attributes (xva_reqattrmap[]).
- */
-#define XVA_CLR_REQ(xvap, attr)                                 \
-	ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR);          \
-	ASSERT((xvap)->xva_magic == XVA_MAGIC);                 \
-	(xvap)->xva_reqattrmap[XVA_INDEX(attr)] &= ~XVA_ATTRBIT(attr)
-/*
- * XVA_SET_RTN() sets an attribute bit in the proper element in the bitmap
- * of returned attributes (xva_rtnattrmap[]).
- */
-#define	XVA_SET_RTN(xvap, attr)					\
-	ASSERT((xvap)->xva_vattr.va_mask | AT_XVATTR);		\
-	ASSERT((xvap)->xva_magic == XVA_MAGIC);			\
-	(XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr)
-
-/*
- * XVA_ISSET_REQ() checks the requested attribute bitmap (xva_reqattrmap[])
- * to see of the corresponding attribute bit is set.  If so, returns non-zero.
- */
-#define	XVA_ISSET_REQ(xvap, attr)					\
-	((((xvap)->xva_vattr.va_mask | AT_XVATTR) &&			\
-		((xvap)->xva_magic == XVA_MAGIC) &&			\
-		((xvap)->xva_mapsize > XVA_INDEX(attr))) ?		\
-	((xvap)->xva_reqattrmap[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) :	0)
-
-/*
- * XVA_ISSET_RTN() checks the returned attribute bitmap (xva_rtnattrmap[])
- * to see of the corresponding attribute bit is set.  If so, returns non-zero.
- */
-#define	XVA_ISSET_RTN(xvap, attr)					\
-	((((xvap)->xva_vattr.va_mask | AT_XVATTR) &&			\
-		((xvap)->xva_magic == XVA_MAGIC) &&			\
-		((xvap)->xva_mapsize > XVA_INDEX(attr))) ?		\
-	((XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0)
-
-static __inline void
+static inline void
 vattr_init_mask(vattr_t *vap)
 {
 
@@ -530,9 +239,7 @@ vattr_init_mask(vattr_t *vap)
 #define	FSYNC	FFSYNC
 #define	FOFFMAX	0x00
 
-enum create	{ CRCREAT };
-
-static __inline int
+static inline int
 zfs_vn_open(const char *pnamep, enum uio_seg seg, int filemode, int createmode,
     vnode_t **vpp, enum create crwhy, mode_t umask)
 {
@@ -546,13 +253,10 @@ zfs_vn_open(const char *pnamep, enum uio
 	ASSERT(umask == 0);
 
 	pb = pathbuf_create(pnamep);
-	if (pb == NULL) {
-		return ENOMEM;
-	}
 	NDINIT(&nd, LOOKUP, NOFOLLOW, pb);
 	error = vn_open(&nd, filemode, createmode);
 	if (error == 0) {
-		VOP_UNLOCK(nd.ni_vp);
+		VOP_UNLOCK(nd.ni_vp, 0);
 		*vpp = nd.ni_vp;
 	}
 	pathbuf_destroy(pb);
@@ -565,7 +269,7 @@ zfs_vn_open(const char *pnamep, enum uio
 	zfs_vn_open((pnamep), (seg), (filemode), (createmode), (vpp), (crwhy), (umask))
 
 #define	RLIM64_INFINITY	0
-static __inline int
+static inline int
 zfs_vn_rdwr(enum uio_rw rw, vnode_t *vp, caddr_t base, ssize_t len,
     offset_t offset, enum uio_seg seg, int ioflag, uint64_t ulimit, cred_t *cr,
     ssize_t *residp)
@@ -588,7 +292,7 @@ zfs_vn_rdwr(enum uio_rw rw, vnode_t *vp,
 #define	vn_rdwr(rw, vp, base, len, offset, seg, ioflag, ulimit, cr, residp) \
 	zfs_vn_rdwr((rw), (vp), (base), (len), (offset), (seg), (ioflag), (ulimit), (cr), (residp))
 
-static __inline int
+static inline int
 zfs_vop_fsync(vnode_t *vp, int flag, cred_t *cr)
 {
 	int error;
@@ -597,12 +301,12 @@ zfs_vop_fsync(vnode_t *vp, int flag, cre
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_FSYNC(vp, cr, FSYNC_WAIT, 0, 0);
-	VOP_UNLOCK(vp);
+	VOP_UNLOCK(vp, 0);
 	return (error);
 }
 #define	VOP_FSYNC(vp, flag, cr, unk)	zfs_vop_fsync((vp), (flag), (cr))
 
-static __inline int
+static inline int
 zfs_vop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
 {
 
@@ -615,33 +319,33 @@ zfs_vop_close(vnode_t *vp, int flag, int
 #define	VOP_CLOSE(vp, oflags, count, offset, cr, unk) \
 	zfs_vop_close((vp), (oflags), (count), (offset), (cr))
 
-static __inline int
+static inline int
 zfs_vop_getattr(vnode_t *vp, vattr_t *ap, int flag, cred_t *cr)
 {
 	int error;
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_GETATTR(vp, ap, cr);
-	VOP_UNLOCK(vp);
+	VOP_UNLOCK(vp, 0);
 	return (error);
 }
 #define	VOP_GETATTR(vp, ap, flag, cr, unk)	zfs_vop_getattr((vp), (ap), (flag), (cr))
 
-static __inline int
+static inline int
 zfs_vop_seek(vnode_t *vp, off_t off, off_t *offp)
 {
 	int error;
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-	error = VOP_SEEK(vp, off, *offp, kauth_cred_get());
-	VOP_UNLOCK(vp);
+	error = VOP_SEEK(vp, off, *offp, CRED());
+	VOP_UNLOCK(vp, 0);
 	return (error);
 }
 #define	VOP_SEEK(vp, off, offp, unk)	zfs_vop_seek(vp, off, offp)
 
 #define	B_INVAL		BC_INVAL
 
-static __inline int
+static inline int
 zfs_vop_putpage(vnode_t *vp, off_t off, size_t len, int flag)
 {
 	int nbflag;
@@ -664,7 +368,7 @@ zfs_vop_putpage(vnode_t *vp, off_t off, 
 }
 #define	VOP_PUTPAGE(vp, off, len, flag, cr, ct)	zfs_vop_putpage((vp), (off), (len), (flag))
 
-static __inline int
+static inline int
 vn_rename(char *from, char *to, enum uio_seg seg)
 {
 
@@ -673,8 +377,7 @@ vn_rename(char *from, char *to, enum uio
 	return (do_sys_rename(from, to, seg, 0));
 }
 
-enum rm	{ RMFILE };
-static __inline int
+static inline int
 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
 {
 
@@ -684,35 +387,22 @@ vn_remove(char *fnamep, enum uio_seg seg
 	return (do_sys_unlink(fnamep, seg));
 }
 
-#define VN_RELE_ASYNC(vp, taskq) 	vrele_async((vp))
-#define vn_exists(a) 	do { } while(0)
-
 /*
- * Flags for VOP_LOOKUP
- *
- * Defined in file.h, but also possible, FIGNORECASE
- *
+ * VOP_ACCESS flags
  */
-#define LOOKUP_XATTR 		0x02	/* lookup up extended attr dir */
+#define V_APPEND    0x2 /* want to do append only check */
 
-/*
- * Flags for VOP_READDIR
- */
-#define V_RDDIR_ENTFLAGS 	0x01    /* request dirent flags */
-#define	V_RDDIR_ACCFILTER 	0x02	/* filter out inaccessible dirents */
+#define VV_NOSYNC	0
 
-/*
- * Extensible vnode attribute (xva) routines:
- * xva_init() initializes an xvattr_t (zero struct, init mapsize, set AT_XATTR)
- * xva_getxoptattr() returns a ponter to the xoptattr_t section of xvattr_t
- */
-void            xva_init(xvattr_t *);
-xoptattr_t	*xva_getxoptattr(xvattr_t *);
+#define VI_LOCK(vp)     mutex_enter((vp)->v_interlock)
+#define VI_UNLOCK(vp)   mutex_exit((vp)->v_interlock)
 
-/*
- * VOP_ACCESS flags
- */
-#define V_ACE_MASK  0x1 /* mask represents  NFSv4 ACE permissions */
-#define V_APPEND    0x2 /* want to do append only check */
+#define VATTR_NULL(x) vattr_null(x)
+
+#define getnewvnode_reserve(x)
+#define getnewvnode_drop_reserve()
+#define cache_purge_negative(vp) cache_purge(vp)
+
+#endif	/* _KERNEL */
 
 #endif	/* _OPENSOLARIS_SYS_VNODE_H_ */
Index: src/external/cddl/osnet/sys/sys/zfs_context.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/zfs_context.h,v
retrieving revision 1.18
diff -u -p -r1.18 zfs_context.h
--- src/external/cddl/osnet/sys/sys/zfs_context.h	10 Jan 2017 19:20:35 -0000	1.18
+++ src/external/cddl/osnet/sys/sys/zfs_context.h	16 Jun 2017 16:42:28 -0000
@@ -39,37 +39,50 @@ extern "C" {
 
 #ifndef _KERNEL
 
-#include <sys/mutex.h>
-#include <sys/rwlock.h>
-#include <sys/condvar.h>
-	
+#define _SYS_MUTEX_H
+#define _SYS_MUTEX_H_
+#define _SYS_RWLOCK_H
+#define _SYS_RWLOCK_H_
+#define _SYS_CONDVAR_H
+#define _SYS_CONDVAR_H_
 #define	_SYS_SYSTM_H
-#define	_SYS_DEBUG_H
+#define	_SYS_SYSTM_H_
 #define	_SYS_T_LOCK_H
 #define	_SYS_VNODE_H
 #define	_SYS_VFS_H
 #define	_SYS_SUNDDI_H
 #define	_SYS_CALLB_H
-#define	_SYS_SCHED_H
-#define	_SYS_TASKQ_
 
-#include <thread.h>
-#include <umem.h>
-#include <paths.h>
-#include <strings.h>
-#include <string.h>
+#include <solaris.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <errno.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <fcntl.h>
 #include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <strings.h>
+#include <thread.h>
+#include <assert.h>
+#include <dirent.h>
+#include <time.h>
+#include <math.h>
+#include <umem.h>
+#include <inttypes.h>
+#include <fsshare.h>
+#include <pthread.h>
+#include <paths.h>
 
+#include <sys/debug.h>
+#include <sys/proc.h>
 #include <sys/kstat.h>
 #include <sys/param.h>
 #include <sys/stdint.h>
 #include <sys/note.h>
 #include <sys/uio.h>
+#include <sys/fcntl.h>
 #include <sys/atomic.h>
-#include <sys/zfs_debug.h>
 #include <sys/misc.h>
 #include <sys/sdt.h>
 #include <sys/list.h>
@@ -77,11 +90,12 @@ extern "C" {
 #include <sys/byteorder.h>
 #include <sys/utsname.h>
 #include <sys/cred.h>
-#include <sys/pset.h>
 #include <sys/u8_textprep.h>
 #include <sys/zone.h>
 #include <sys/pathname.h>
 #include <sys/sysevent.h>
+#include <sys/zfs_debug.h>
+#include <sys/cyclic.h>
 
 extern void panic(const char *, ...);
 
@@ -89,14 +103,12 @@ extern void panic(const char *, ...);
 
 #include <sys/sysevent/eventdefs.h>
 
-/* #include <libzfs.h> */
-
 #ifndef ABS
 #define	ABS(a) ((a) < 0 ? -(a) : (a))
 #endif
 
 extern int aok;
-	
+
 /*
  * Debugging
  */
@@ -130,46 +142,6 @@ extern void vcmn_err(int, const char *, 
 
 #define	fm_panic	panic
 
-#if !defined(_LIBZFS_H) && !defined(_ZFS_CONTEXT_NO_VERIFY)
-#define	verify(EX) (void)((EX) || \
-	(__assert13(__FILE__, __LINE__, __func__, #EX), 0))
-#define	VERIFY	verify
-#endif
-
-#define	ASSERT	assert
-
-#ifdef lint
-#define	VERIFY3_IMPL(x, y, z, t)	if (x == z) ((void)0)
-#else
-/* BEGIN CSTYLED */
-#define	VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE) do { \
-	const TYPE __left = (TYPE)(LEFT); \
-	const TYPE __right = (TYPE)(RIGHT); \
-	if (!(__left OP __right)) { \
-		char __buf[256]; \
-		(void) snprintf(__buf, 256, "%s %s %s (0x%llx %s 0x%llx)", \
-			#LEFT, #OP, #RIGHT, \
-			(u_longlong_t)__left, #OP, (u_longlong_t)__right); \
-		__assert13(__FILE__, __LINE__, __func__,__buf); \
-	} \
-_NOTE(CONSTCOND) } while (0)
-/* END CSTYLED */
-#endif /* lint */
-
-#define	VERIFY3S(x, y, z)	VERIFY3_IMPL(x, y, z, int64_t)
-#define	VERIFY3U(x, y, z)	VERIFY3_IMPL(x, y, z, uint64_t)
-#define	VERIFY3P(x, y, z)	VERIFY3_IMPL(x, y, z, uintptr_t)
-
-#ifdef NDEBUG
-#define	ASSERT3S(x, y, z)	((void)0)
-#define	ASSERT3U(x, y, z)	((void)0)
-#define	ASSERT3P(x, y, z)	((void)0)
-#else
-#define	ASSERT3S(x, y, z)	VERIFY3S(x, y, z)
-#define	ASSERT3U(x, y, z)	VERIFY3U(x, y, z)
-#define	ASSERT3P(x, y, z)	VERIFY3P(x, y, z)
-#endif
-
 /*
  * Dtrace SDT probes have different signatures in userland than they do in
  * kernel.  If they're being used in kernel code, re-define them out of
@@ -201,55 +173,99 @@ _NOTE(CONSTCOND) } while (0)
  */
 #define	curthread	((void *)(uintptr_t)thr_self())
 
-typedef pthread_t kthread_t;
+#define	kpreempt(x)	yield()
+
+typedef struct kthread kthread_t;
 
 #define	thread_create(stk, stksize, func, arg, len, pp, state, pri)	\
 	zk_thread_create(func, arg)
 #define	thread_exit() thr_exit(NULL)
-#define thread_join(t)  panic("libzpool cannot join threads")
+#define	thread_join(t)	panic("libzpool cannot join threads")
 
-#define newproc(f, a, cid, pri, ctp, pid)   (ENOSYS)
-	
-extern kthread_t *zk_thread_create(void (*func)(void *), void *arg);
-
-/* In NetBSD struct proc may be visible in userspace therefore we use it's original
-   definition. */
-#if !defined(p_startzero)
+#define	newproc(f, a, cid, pri, ctp, pid)	(ENOSYS)
+
+/* in libzpool, p0 exists only to have its address taken */
 struct proc {
-	uintptr_t   this_is_never_used_dont_dereference_it;
-	};
-#endif
+	uintptr_t	this_is_never_used_dont_dereference_it;
+};
 
 extern struct proc p0;
-	
-#define	issig(why)		(FALSE)
-#define	ISSIG(thr, why)		(FALSE)
-#define	makedevice(min, maj)	makedev((min), (maj))
+#define	curproc		(&p0)
+
+#define	PS_NONE		-1
+
+extern kthread_t *zk_thread_create(void (*func)(), void *arg);
+
+#define	issig(why)	(FALSE)
+#define	ISSIG(thr, why)	(FALSE)
 
 /*
  * Mutexes
  */
 
-#define	MUTEX_DEFAULT	USYNC_THREAD
-#define	MUTEX_HELD(m)	(mutex_owned(m))
+typedef struct kmutex {
+	void		*m_owner;
+	boolean_t	initialized;
+	mutex_t		m_lock;
+} kmutex_t;
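+/*
+ * m_owner is presumably maintained by zmutex_enter()/zmutex_exit() so
+ * that MUTEX_HELD() can compare it against curthread.
+ */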
 
-extern void mutex_init(kmutex_t *mp, void *, kmutex_type_t, void *);
-extern void mutex_destroy(kmutex_t *mp);
-extern void mutex_enter(kmutex_t *mp);
-extern void mutex_exit(kmutex_t *mp);
+#define	MUTEX_DEFAULT	USYNC_THREAD
+#undef	MUTEX_HELD
+#undef	MUTEX_NOT_HELD
+#define	MUTEX_HELD(m)	((m)->m_owner == curthread)
+#define	MUTEX_NOT_HELD(m) (!MUTEX_HELD(m))
+#define	_mutex_held(m)	pthread_mutex_isowned_np(m)
+
+/*
+ * Argh -- we have to get cheesy here because the kernel and userland
+ * have different signatures for the same routine.
+ */
+/*
+ * extern int _mutex_init(mutex_t *mp, int type, void *arg);
+ * extern int _mutex_destroy(mutex_t *mp);
+ * extern int _mutex_owned(mutex_t *mp);
+ */
+
+#define	mutex_init(mp, b, c, d)		zmutex_init((kmutex_t *)(mp))
+#define	mutex_destroy(mp)		zmutex_destroy((kmutex_t *)(mp))
+#define	mutex_enter(mp)			zmutex_enter(mp)
+#define	mutex_exit(mp)			zmutex_exit(mp)
+#define	mutex_owned(mp)			zmutex_owned((kmutex_t *)(mp))
+
+extern void zmutex_init(kmutex_t *mp);
+extern void zmutex_destroy(kmutex_t *mp);
+extern void zmutex_enter(kmutex_t *mp);
+extern void zmutex_exit(kmutex_t *mp);
 extern int mutex_tryenter(kmutex_t *mp);
 extern void *mutex_owner(kmutex_t *mp);
-extern int mutex_owned(kmutex_t *mp);
 
 /*
  * RW locks
  */
+typedef struct krwlock {
+	int		rw_count;
+	void		*rw_owner;
+	boolean_t	initialized;
+	rwlock_t	rw_lock;
+} krwlock_t;
+
+typedef int krw_t;
+
+#define	RW_READER	0
+#define	RW_WRITER	1
 #define	RW_DEFAULT	USYNC_THREAD
 
 #define	RW_LOCK_HELD(x)		rw_lock_held(x)
 #define	RW_WRITE_HELD(x)	rw_write_held(x)
 #define	RW_READ_HELD(x)		rw_read_held(x)
 
+#undef RW_READ_HELD
+#define RW_READ_HELD(x)		((x)->rw_owner == NULL && (x)->rw_count > 0)
+
+#undef RW_WRITE_HELD
+#define	RW_WRITE_HELD(x)	((x)->rw_owner == curthread)
+
+#undef RW_LOCK_HELD
+#define	RW_LOCK_HELD(x)		(RW_READ_HELD(x) || RW_WRITE_HELD(x))
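+/*
+ * Convention: rw_owner stays NULL while readers hold the lock, so
+ * "read held" means a positive count with no owner, and "write held"
+ * means rw_owner == curthread.
+ */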
+
 extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg);
 extern void rw_destroy(krwlock_t *rwlp);
 extern void rw_enter(krwlock_t *rwlp, krw_t rw);
@@ -269,39 +285,56 @@ extern gid_t *crgetgroups(cred_t *cr);
 /*
  * Condition variables
  */
+typedef cond_t kcondvar_t;
+
 #define	CV_DEFAULT	USYNC_THREAD
+#define	CALLOUT_FLAG_ABSOLUTE	0x2
 
 extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg);
 extern void cv_destroy(kcondvar_t *cv);
 extern void cv_wait(kcondvar_t *cv, kmutex_t *mp);
 extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime);
+extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+    hrtime_t res, int flag);
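+/*
+ * With CALLOUT_FLAG_ABSOLUTE, "tim" is an absolute deadline rather
+ * than a relative timeout.
+ */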
 extern void cv_signal(kcondvar_t *cv);
 extern void cv_broadcast(kcondvar_t *cv);
 
 /*
+ * Thread-specific data
+ */
+#define	tsd_get(k) pthread_getspecific(k)
+#define	tsd_set(k, v) pthread_setspecific(k, v)
+#define	tsd_create(kp, d) pthread_key_create(kp, d)
+#define	tsd_destroy(kp) /* nothing */
+
+/*
  * Kernel memory
  */
 #define	KM_SLEEP		UMEM_NOFAIL
+#define	KM_PUSHPAGE		KM_SLEEP
 #define	KM_NOSLEEP		UMEM_DEFAULT
-#define	KM_PUSHPAGE		UMEM_NOFAIL
 #define	KMC_NODEBUG		UMC_NODEBUG
+#define	KMC_NOTOUCH		0	/* not needed for userland caches */
+#define	KM_NODEBUG		0
 #define	kmem_alloc(_s, _f)	umem_alloc(_s, _f)
 #define	kmem_zalloc(_s, _f)	umem_zalloc(_s, _f)
 #define	kmem_free(_b, _s)	umem_free(_b, _s)
-#define	kmem_size()		(physmem * PAGESIZE)
+#define	kmem_size()		((uint64_t)physmem * PAGESIZE)
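+/* The cast keeps physmem * PAGESIZE from overflowing on 32-bit hosts. */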
 #define	kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \
 	umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i)
 #define	kmem_cache_destroy(_c)	umem_cache_destroy(_c)
 #define	kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f)
 #define	kmem_cache_free(_c, _b)	umem_cache_free(_c, _b)
 #define	kmem_debugging()	0
-#define	kmem_cache_reap_now(c)
+#define	kmem_cache_reap_now(_c)		/* nothing */
+#define	kmem_cache_set_move(_c, _cb)	/* nothing */
+#define	vmem_qcache_reap(_v)		/* nothing */
+#define	POINTER_INVALIDATE(_pp)		/* nothing */
+#define	POINTER_IS_VALID(_p)	0
 
 typedef umem_cache_t kmem_cache_t;
 typedef struct vmem vmem_t;
 
-#define	ZFS_EXPORTS_PATH	"/etc/zfs/exports"
-
 /*
  * Task queues
  */
@@ -309,16 +342,27 @@ typedef struct taskq taskq_t;
 typedef uintptr_t taskqid_t;
 typedef void (task_func_t)(void *);
 
+typedef struct taskq_ent {
+	struct taskq_ent	*tqent_next;
+	struct taskq_ent	*tqent_prev;
+	task_func_t		*tqent_func;
+	void			*tqent_arg;
+	uintptr_t		tqent_flags;
+} taskq_ent_t;
+
+#define	TQENT_FLAG_PREALLOC	0x1	/* taskq_dispatch_ent used */
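+/*
+ * Preallocated entries are embedded in the caller's structure, so
+ * taskq_dispatch_ent() never allocates and cannot fail.
+ */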
+
 #define TASKQ_PREPOPULATE   0x0001
 #define TASKQ_CPR_SAFE      0x0002  /* Use CPR safe protocol */
 #define TASKQ_DYNAMIC       0x0004  /* Use dynamic thread scheduling */
 #define TASKQ_THREADS_CPU_PCT   0x0008  /* Scale # threads by # cpus */
 #define TASKQ_DC_BATCH      0x0010  /* Mark threads as batch */
 
-#define TQ_SLEEP    KM_SLEEP    /* Can block for memory */
-#define TQ_NOSLEEP  KM_NOSLEEP  /* cannot block for memory; may fail */
-#define TQ_NOQUEUE  0x02        /* Do not enqueue if can't dispatch */
-#define TQ_FRONT    0x08        /* Queue in front */
+#define	TQ_SLEEP	KM_SLEEP	/* Can block for memory */
+#define	TQ_NOSLEEP	KM_NOSLEEP	/* cannot block for memory; may fail */
+#define	TQ_NOQUEUE	0x02		/* Do not enqueue if can't dispatch */
+#define	TQ_FRONT	0x08		/* Queue in front */
 
 extern taskq_t *system_taskq;
 
@@ -333,7 +377,7 @@ extern void taskq_wait(taskq_t *);
 extern int  taskq_member(taskq_t *, void *);
 extern void system_taskq_init(void);
 extern void system_taskq_fini(void);
-	
+
 #define	XVA_MAPSIZE	3
 #define	XVA_MAGIC	0x78766174
 
@@ -344,8 +388,12 @@ typedef struct vnode {
 	uint64_t	v_size;
 	int		v_fd;
 	char		*v_path;
+	int		v_dump_fd;
 } vnode_t;
 
+extern char *vn_dumpdir;
+#define	AV_SCANSTAMP_SZ	32		/* length of anti-virus scanstamp */
+
 typedef struct vattr {
 	uint_t		va_mask;	/* bit-mask of attributes */
 	u_offset_t	va_size;	/* file size in bytes */
@@ -371,10 +419,14 @@ typedef struct vattr {
 
 #define	VOP_CLOSE(vp, f, c, o, cr, unk)	0
 #define	VOP_PUTPAGE(vp, of, sz, fl, cr, unk)	0
-#define	VOP_GETATTR(vp, vap, fl, cr, unk) vn_getattr(vp, vap)
+#define	VOP_GETATTR(vp, vap, fl, cr, unk) fop_getattr(vp, vap)
 #define	VOP_FSYNC(vp, f, cr, unk)	fsync((vp)->v_fd)
 
 #define	VN_RELE(vp)	vn_close(vp)
+#define	VN_RELE_ASYNC(vp)	vn_close(vp)
+
+#define vn_lock(vp, type)
+#define VOP_UNLOCK(vp, type)
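+/* Userland "vnodes" here are plain fd wrappers, so locking is a no-op. */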
 
 extern int vn_open(char *path, int x1, int oflags, int mode, vnode_t **vpp,
     int x2, int x3);
@@ -382,7 +434,7 @@ extern int vn_openat(char *path, int x1,
     int x2, int x3, vnode_t *vp, int unk);
 extern int vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len,
     offset_t offset, int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp);
-extern void vn_close(vnode_t *vp);
+extern void vn_close(vnode_t *vp, int, cred_t *, kthread_t *);
 extern int vn_getattr(vnode_t *vp, vattr_t *va);
 
 #define	vn_remove(path, x1, x2)		remove(path)
@@ -393,8 +445,6 @@ extern vnode_t *rootdir;
 
 #include <sys/file.h>		/* for FREAD, FWRITE, etc */
 #define	FTRUNC	O_TRUNC
-#define	openat64	openat
-
 
 /*
  * Random stuff
@@ -402,8 +452,8 @@ extern vnode_t *rootdir;
 #define	lbolt	(gethrtime() >> 23)
 #define	lbolt64	(gethrtime() >> 23)
 #define	hz	119	/* frequency when using gethrtime() >> 23 for lbolt */
-	
-extern void xdelay(clock_t ticks);
+
+extern void delay(clock_t ticks);
 
 #define	gethrestime_sec() time(NULL)
 #define gethrestime(t) \
@@ -422,6 +472,11 @@ extern void xdelay(clock_t ticks);
 #define	kcred		NULL
 #define	CRED()		NULL
 
+#ifdef ptob
+#undef ptob
+#endif
+#define	ptob(x)		((x) * PAGESIZE)
+
 extern uint64_t physmem;
 
 extern int random_get_bytes(uint8_t *ptr, size_t len);
@@ -431,7 +486,7 @@ extern void kernel_init(int);
 extern void kernel_fini(void);
 
 struct spa;
-extern void nicenum(uint64_t, char *, size_t);
+extern void nicenum(uint64_t, char *);
 extern void show_pool_stats(struct spa *);
 
 typedef struct callb_cpr {
@@ -458,9 +513,6 @@ typedef struct callb_cpr {
 #define	zone_dataset_visible(x, y)	(1)
 #define	INGLOBALZONE(z)			(1)
 
-#define	open64	open
-int	mkdirp(const char *, mode_t);
-
 /*
  * Hostname information
  */
@@ -491,11 +543,11 @@ extern int kobj_read_file(struct _buf *f
     unsigned off);
 extern void kobj_close_file(struct _buf *file);
 extern int kobj_get_filesize(struct _buf *file, uint64_t *size);
+
 /* Random compatibility stuff. */
 #define	lbolt	(gethrtime() >> 23)
 #define	lbolt64	(gethrtime() >> 23)
 
-/* extern int hz; */
 extern uint64_t physmem;
 
 #define	gethrestime_sec()	time(NULL)
@@ -513,29 +565,13 @@ struct file {
 #define	FCREAT	O_CREAT
 #define	FOFFMAX	0x0
 
-#define	SYSCTL_DECL(...)
-#define	SYSCTL_NODE(...)
-#define	SYSCTL_INT(...)
-#define	SYSCTL_ULONG(...)
-#ifdef TUNABLE_INT
-#undef TUNABLE_INT
-#undef TUNABLE_ULONG
-#endif
-#define	TUNABLE_INT(...)
-#define	TUNABLE_ULONG(...)
-
 /* Errors */
 
 #ifndef	ERESTART
 #define	ERESTART	(-1)
 #endif
 
-#ifdef ptob
-#undef ptob
-#endif
-
-size_t	ptob(size_t);
-
+/* SID stuff */
 
 typedef struct ksiddomain {
 	uint_t	kd_ref;
@@ -546,6 +582,9 @@ typedef struct ksiddomain {
 ksiddomain_t *ksid_lookupdomain(const char *);
 void ksiddomain_rele(ksiddomain_t *);
 
+#define	DDI_SLEEP	KM_SLEEP
+#define	ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g)	(0)
+
 typedef struct ace_object {
  	uid_t		a_who;
  	uint32_t	a_access_mask;
@@ -575,13 +614,13 @@ extern char *kmem_asprintf(const char *f
 
 #define DEV_PHYS_PATH "phys_path"
 
-#define DDI_SLEEP KM_SLEEP
-#define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) 0	
+typedef	uint32_t	idmap_rid_t;
+
+#define	ZFS_EXPORTS_PATH	"/etc/zfs/exports"
 
 #else	/* _KERNEL */
 
 #include <sys/systm.h>
-#include <sys/kcondvar.h>
 #include <sys/syslimits.h>
 #include <sys/kmem.h>
 #include <sys/param.h>
@@ -595,9 +634,11 @@ extern char *kmem_asprintf(const char *f
 #include <sys/bitmap.h>
 #include <sys/cmn_err.h>
 #include <sys/taskq.h>
+#include <sys/taskq_impl.h>
 #include <sys/conf.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
+#include <sys/kcondvar.h>
 #include <sys/list.h>
 #include <sys/uio.h>
 #include <sys/dirent.h>
@@ -611,6 +652,7 @@ extern char *kmem_asprintf(const char *f
 #include <sys/file.h>
 #include <sys/vfs.h>
 #include <sys/lockf.h>
+#include <sys/refstr.h>
 #include <sys/zone.h>
 #include <sys/misc.h>
 #include <sys/zfs_debug.h>
@@ -621,16 +663,17 @@ extern char *kmem_asprintf(const char *f
 #include <sys/u8_textprep.h>
 #include <sys/sysevent.h>
 #include <sys/sysevent/eventdefs.h>
+#include <sys/sunddi.h>
+#include <sys/cyclic.h>
 
+#include <miscfs/specfs/specdev.h>
 #include <uvm/uvm_extern.h>
 
 #define	CPU_SEQID	(curcpu()->ci_data.cpu_index)
 #define	lbolt		hardclock_ticks
 #define	lbolt64		lbolt
 
-clock_t ddi_get_lbolt(void);
-int64_t ddi_get_lbolt64(void);
-
 #ifdef	__cplusplus
 }
 #endif
@@ -664,8 +707,6 @@ extern struct utsname utsname;
 #define ptob(x)			((x) * PAGE_SIZE)
 
 #define	strncat(a, b, c)	strlcat(a, b, c)
-#define	tsd_get(x)		lwp_getspecific(x)
-#define	tsd_set(x, y)		(lwp_setspecific(x, y), 0)
 #define	kmem_debugging()	0
 
 #define zone_get_hostid(a)      0
@@ -673,23 +714,106 @@ extern struct utsname utsname;
 extern char *kmem_asprintf(const char *fmt, ...);
 #define strfree(str) kmem_free((str), strlen(str)+1)
 
+static inline void
+tsd_create(uint_t *keyp, void (*func)(void *))
+{
+	int error __unused;
+
+	error = lwp_specific_key_create(keyp, func);
+	KASSERT(error == 0);
+}
+
+static inline void
+tsd_destroy(uint_t *keyp)
+{
+
+	lwp_specific_key_delete(*keyp);
+}
+
+static inline void *
+tsd_get(uint_t key)
+{
+
+	return lwp_getspecific(key);
+}
+
+static inline int
+tsd_set(uint_t key, void *value)
+{
+
+	lwp_setspecific(key, value);
+	return 0;
+}
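+/*
+ * Usage sketch (hypothetical key):
+ *
+ *	static uint_t my_key;
+ *	tsd_create(&my_key, my_destructor);
+ *	(void)tsd_set(my_key, arg);
+ *	arg = tsd_get(my_key);
+ *	tsd_destroy(&my_key);
+ */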
+
+#define td_ru l_ru
+
 /* NetBSD doesn't need these routines in zfs code, yet */
 #define	taskq_create_proc(a, b, c, d, e, p, f) \
 	(taskq_create(a, b, c, d, e, f))
 #define	taskq_create_sysdc(a, b, d, e, p, dc, f) \
 	(taskq_create(a, b, maxclsyspri, d, e, f))
 
-#define	MAXUID			UID_MAX
 #define	FIGNORECASE		0
-#define	CREATE_XATTR_DIR	0
 
 #define DEV_PHYS_PATH "phys_path"
 
-#define DDI_SLEEP KM_SLEEP
-#define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) 0
-
 #define sys_shutdown	0
 
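+/*
+ * Bounds-checked stand-ins for sprintf/vsprintf.  The 1024-byte limit
+ * is an assumption that no kernel ZFS caller formats anything larger;
+ * oversized output is truncated rather than overflowing the buffer.
+ */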
+static inline int
+sprintf(char * __restrict buf, const char * __restrict fmt, ...)
+{
+	va_list ap;
+	int rv;
+
+	va_start(ap, fmt);
+	rv = vsnprintf(buf, 1024, fmt, ap);
+	va_end(ap);
+	return rv;
+}
+
+static inline int
+vsprintf(char * __restrict buf, const char * __restrict fmt, va_list ap)
+{
+
+	return vsnprintf(buf, 1024, fmt, ap);
+}
+
+/*
+ * FreeBSD interfaces
+ */
+
+void zfs_netbsd_setsize(vnode_t *, off_t);
+#define vnode_pager_setsize(vp, size) zfs_netbsd_setsize(vp, size)
+#define vn_pages_remove(a, b, c)
+
+#define getf		fd_getfile
+#define releasef	fd_putfile
+
+#define callout_drain(x) callout_stop(x)
+
 #endif	/* _KERNEL */
 
+#define	SYSCTL_DECL(...)
+#define	SYSCTL_NODE(...)
+#define	SYSCTL_INT(...)
+#define	SYSCTL_UINT(...)
+#define	SYSCTL_LONG(...)
+#define	SYSCTL_ULONG(...)
+#define	SYSCTL_QUAD(...)
+#define	SYSCTL_UQUAD(...)
+#ifdef TUNABLE_INT
+#undef TUNABLE_INT
+#undef TUNABLE_ULONG
+#undef TUNABLE_INT_FETCH
+#endif
+#define	TUNABLE_INT(...)
+#define	TUNABLE_ULONG(...)
+#define	TUNABLE_INT_FETCH(...)		0
+
+#define ASSERT_VOP_LOCKED(vp, name) KASSERT(VOP_ISLOCKED(vp) != 0)
+#define ASSERT_VOP_ELOCKED(vp, name) KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
+
+#define UID_NOBODY (-2)
+#define GID_NOBODY (-2)
+
 #endif	/* _SYS_ZFS_CONTEXT_H */
Index: src/external/cddl/osnet/sys/sys/zone.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/sys/sys/zone.h,v
retrieving revision 1.3
diff -u -p -r1.3 zone.h
--- src/external/cddl/osnet/sys/sys/zone.h	21 Feb 2010 01:46:36 -0000	1.3
+++ src/external/cddl/osnet/sys/sys/zone.h	10 Jun 2017 05:57:06 -0000
@@ -25,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/compat/opensolaris/sys/zone.h,v 1.1 2007/04/06 01:09:06 pjd Exp $
+ * $FreeBSD: head/sys/cddl/compat/opensolaris/sys/zone.h 219089 2011-02-27 19:41:40Z pjd $
  */
 
 #ifndef _OPENSOLARIS_SYS_ZONE_H_
@@ -47,12 +47,12 @@ struct ucred;
 /*
  * Attach the given dataset to the given jail.
  */
-extern int zone_dataset_attach(struct kauth_cred *, const char *, int);
+extern int zone_dataset_attach(cred_t *, const char *, int);
 
 /*
  * Detach the given dataset to the given jail.
  */
-extern int zone_dataset_detach(struct kauth_cred *, const char *, int);
+extern int zone_dataset_detach(cred_t *, const char *, int);
 
 /*
  * Returns true if the named pool/dataset is visible in the current zone.
Index: src/external/cddl/osnet/sys/sys/acl/acl_common.h
===================================================================
RCS file: src/external/cddl/osnet/sys/sys/acl/acl_common.h
diff -N src/external/cddl/osnet/sys/sys/acl/acl_common.h
--- src/external/cddl/osnet/sys/sys/acl/acl_common.h	28 Feb 2010 14:45:47 -0000	1.3
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,66 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ACL_COMMON_H
-#define _ACL_COMMON_H
-
-#pragma ident   "%Z%%M% %I% %E% SMI"
-
-
-#include <sys/types.h>
-#include <sys/acl.h>
-#include <sys/stat.h>
-
-#define _ACL_ACLENT_ENABLED 0x1
-#define _ACL_ACE_ENABLED    0x2
-
-#ifdef  __cplusplus
-extern "C" {
-	#endif
-
-	extern ace_t trivial_acl[6];
-
-	extern int acltrivial(const char *);
-	extern void adjust_ace_pair(ace_t *pair, mode_t mode);
-	extern void adjust_ace_pair_common(void *, size_t, size_t, mode_t);
-	extern int ace_trivial(ace_t *acep, int aclcnt);
-	extern int ace_trivial_common(void *, int,
-	    uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *,
-		uint32_t *mask));
-#if !defined(_KERNEL)
-	extern acl_t *acl_alloc(acl_type_t);
-	extern void acl_free(acl_t *aclp);
-	extern int acl_translate(acl_t *aclp, int target_flavor,
-	    int isdir, uid_t owner, gid_t group);
-	void ksort(caddr_t v, int n, int s, int (*f)());
-	int cmp2acls(void *a, void *b);
-
-#endif /* _KERNEL */
-
-	#ifdef  __cplusplus
-}
-#endif
-
-#endif /* _ACL_COMMON_H */
Index: src/external/cddl/osnet/usr.bin/ctfconvert/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/usr.bin/ctfconvert/Makefile,v
retrieving revision 1.7
diff -u -p -r1.7 Makefile
--- src/external/cddl/osnet/usr.bin/ctfconvert/Makefile	4 Feb 2016 17:42:34 -0000	1.7
+++ src/external/cddl/osnet/usr.bin/ctfconvert/Makefile	10 Jun 2017 06:00:40 -0000
@@ -1,6 +1,6 @@
 #	$NetBSD: Makefile,v 1.7 2016/02/04 17:42:34 roy Exp $
 
-# $FreeBSD: src/cddl/usr.bin/ctfconvert/Makefile,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+# $FreeBSD: head/cddl/usr.bin/ctfconvert/Makefile 314654 2017-03-04 11:30:04Z ngie $
 
 .include <bsd.own.mk>
 
Index: src/external/cddl/osnet/usr.bin/ctfconvert/ctfconvert.1
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/usr.bin/ctfconvert/ctfconvert.1,v
retrieving revision 1.3
diff -u -p -r1.3 ctfconvert.1
--- src/external/cddl/osnet/usr.bin/ctfconvert/ctfconvert.1	20 Jan 2013 23:51:40 -0000	1.3
+++ src/external/cddl/osnet/usr.bin/ctfconvert/ctfconvert.1	10 Jun 2017 05:58:08 -0000
@@ -27,7 +27,7 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.\" $FreeBSD: src/cddl/usr.bin/ctfconvert/ctfconvert.1,v 1.1 2010/08/11 18:00:45 rpaulo Exp $
+.\" $FreeBSD: head/cddl/usr.bin/ctfconvert/ctfconvert.1 276294 2014-12-27 08:31:52Z joel $
 .\"
 .Dd July 7, 2010
 .Dt CTFCONVERT 1
Index: src/external/cddl/osnet/usr.bin/ctfdump/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/usr.bin/ctfdump/Makefile,v
retrieving revision 1.8
diff -u -p -r1.8 Makefile
--- src/external/cddl/osnet/usr.bin/ctfdump/Makefile	4 Feb 2016 18:47:14 -0000	1.8
+++ src/external/cddl/osnet/usr.bin/ctfdump/Makefile	10 Jun 2017 06:01:53 -0000
@@ -1,6 +1,6 @@
 #	$NetBSD: Makefile,v 1.8 2016/02/04 18:47:14 roy Exp $
 
-# $FreeBSD: src/cddl/usr.bin/ctfdump/Makefile,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+# $FreeBSD: head/cddl/usr.bin/ctfdump/Makefile 314654 2017-03-04 11:30:04Z ngie $
 
 .include <bsd.own.mk>
 
Index: src/external/cddl/osnet/usr.bin/ctfdump/ctfdump.1
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/usr.bin/ctfdump/ctfdump.1,v
retrieving revision 1.2
diff -u -p -r1.2 ctfdump.1
--- src/external/cddl/osnet/usr.bin/ctfdump/ctfdump.1	19 Jan 2013 01:22:13 -0000	1.2
+++ src/external/cddl/osnet/usr.bin/ctfdump/ctfdump.1	10 Jun 2017 06:01:32 -0000
@@ -27,7 +27,7 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.\" $FreeBSD: src/cddl/usr.bin/ctfdump/ctfdump.1,v 1.1 2010/08/11 18:00:45 rpaulo Exp $
+.\" $FreeBSD: head/cddl/usr.bin/ctfdump/ctfdump.1 211187 2010-08-11 18:00:45Z rpaulo $
 .\"
 .Dd July 7, 2010
 .Dt CTFDUMP 1
Index: src/external/cddl/osnet/usr.bin/ctfmerge/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/usr.bin/ctfmerge/Makefile,v
retrieving revision 1.9
diff -u -p -r1.9 Makefile
--- src/external/cddl/osnet/usr.bin/ctfmerge/Makefile	4 Feb 2016 17:47:17 -0000	1.9
+++ src/external/cddl/osnet/usr.bin/ctfmerge/Makefile	10 Jun 2017 06:02:24 -0000
@@ -1,6 +1,6 @@
 #	$NetBSD: Makefile,v 1.9 2016/02/04 17:47:17 roy Exp $
 
-# $FreeBSD: src/cddl/usr.bin/ctfmerge/Makefile,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+# $FreeBSD: head/cddl/usr.bin/ctfmerge/Makefile 314654 2017-03-04 11:30:04Z ngie $
 
 .include <bsd.own.mk>
 
Index: src/external/cddl/osnet/usr.bin/ctfmerge/ctfmerge.1
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/usr.bin/ctfmerge/ctfmerge.1,v
retrieving revision 1.2
diff -u -p -r1.2 ctfmerge.1
--- src/external/cddl/osnet/usr.bin/ctfmerge/ctfmerge.1	19 Jan 2013 01:22:13 -0000	1.2
+++ src/external/cddl/osnet/usr.bin/ctfmerge/ctfmerge.1	10 Jun 2017 06:02:13 -0000
@@ -27,7 +27,7 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.\" $FreeBSD: src/cddl/usr.bin/ctfmerge/ctfmerge.1,v 1.1 2010/08/11 18:00:45 rpaulo Exp $
+.\" $FreeBSD: head/cddl/usr.bin/ctfmerge/ctfmerge.1 239969 2012-09-01 06:23:13Z joel $
 .\"
 .Dd July 7, 2010
 .Dt CTFMERGE 1
Index: src/external/cddl/osnet/usr.sbin/dtrace/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/usr.sbin/dtrace/Makefile,v
retrieving revision 1.9
diff -u -p -r1.9 Makefile
--- src/external/cddl/osnet/usr.sbin/dtrace/Makefile	5 Feb 2016 10:14:49 -0000	1.9
+++ src/external/cddl/osnet/usr.sbin/dtrace/Makefile	10 Jun 2017 06:02:54 -0000
@@ -1,6 +1,6 @@
 #	$NetBSD: Makefile,v 1.9 2016/02/05 10:14:49 roy Exp $
 
-# $FreeBSD: src/cddl/usr.sbin/dtrace/Makefile,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $
+# $FreeBSD: head/cddl/usr.sbin/dtrace/Makefile 314654 2017-03-04 11:30:04Z ngie $
 
 .include <bsd.own.mk>
 .include	"../../Makefile.inc"
@@ -31,6 +31,12 @@ CFLAGS+=	-I${OSNETDIR}/sys \
 COPTS.dtrace.c += -Wno-stack-protector
 COPTS.dtrace.c += -Wno-format-extra-args
 
+COPTS.dtrace.c += -Wno-shadow
+COPTS.dtrace.c += -Wno-discarded-qualifiers
+COPTS.dtrace.c += -Wno-missing-field-initializers
+COPTS.dtrace.c += -Wno-unused-but-set-variable
+COPTS.dtrace.c += -Wno-format
+
 LDFLAGS+=	-pthread
 
 PROGDPLIBS+=	ctf ${.CURDIR}/../../lib/libctf
Index: src/external/cddl/osnet/usr.sbin/zdb/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/usr.sbin/zdb/Makefile,v
retrieving revision 1.3
diff -u -p -r1.3 Makefile
--- src/external/cddl/osnet/usr.sbin/zdb/Makefile	24 Feb 2010 11:56:35 -0000	1.3
+++ src/external/cddl/osnet/usr.sbin/zdb/Makefile	22 Apr 2017 17:24:09 -0000
@@ -24,6 +24,9 @@ LDADD+=         -L${LIBUUTIL_OBJDIR} -lu
 LIBZFS_OBJDIR!=  cd ${LIBZFS_SRCDIR} && ${PRINTOBJDIR}
 LDADD+=         -L${LIBZFS_OBJDIR} -lzfs
 
+LIBZFS_CORE_OBJDIR!=  cd ${LIBZFS_CORE_SRCDIR} && ${PRINTOBJDIR}
+LDADD+=         -L${LIBZFS_CORE_OBJDIR} -lzfs_core
+
 LIBZPOOL_OBJDIR!=  cd ${LIBZPOOL_SRCDIR} && ${PRINTOBJDIR}
 LDADD+=         -L${LIBZPOOL_OBJDIR} -lzpool
 
Index: src/lib/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/lib/Makefile,v
retrieving revision 1.253
diff -u -p -r1.253 Makefile
--- src/lib/Makefile	10 Oct 2017 19:28:33 -0000	1.253
+++ src/lib/Makefile	10 Oct 2017 20:17:09 -0000
@@ -141,7 +141,7 @@ SUBDIR+=	../external/bsd/iscsi/lib	# dep
 .endif
 
 .if (${MKZFS} != "no")
-SUBDIR+=        ../external/cddl/osnet/lib/libzfs
+SUBDIR+=        ../external/cddl/osnet/lib/libzfs_core
 SUBDIR+=        ../external/cddl/osnet/lib/libzpool
 .endif
 
@@ -210,6 +210,10 @@ SUBDIR+=	../external/bsd/fetch/lib	# dep
 SUBDIR+=	../external/bsd/openldap/lib	# depends on libcrypto, ...
 .endif
 
+.if (${MKZFS} != "no")
+SUBDIR+=        ../external/cddl/osnet/lib/libzfs
+.endif
+
 #==================== 3rd library dependency barrier ====================
 SUBDIR+=	.WAIT
 
Index: src/lib/libpthread/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/lib/libpthread/Makefile,v
retrieving revision 1.87
diff -u -p -r1.87 Makefile
--- src/lib/libpthread/Makefile	3 Jul 2016 14:24:58 -0000	1.87
+++ src/lib/libpthread/Makefile	27 Dec 2016 23:24:35 -0000
@@ -90,31 +90,13 @@ SRCS+=		pthread_compat.c
 
 ALIGN_FUNCTIONS=	${${ACTIVE_CC} == "gcc":? -falign-functions=32 :}
 
-.if ${MACHINE_CPU} != "m68k" && ${MACHINE_CPU} != "sh3" && ${MACHINE_ARCH} != "vax"
-OMIT_FRAME_POINTER=	-fomit-frame-pointer
-.else
-OMIT_FRAME_POINTER=
-.endif
-
 # The TSD routines are used in the implementation of profiling, and so
 # can't be profiled themselves.
-COPTS.pthread_specific.c+=	${OMIT_FRAME_POINTER} ${ALIGN_FUNCTIONS}
+COPTS.pthread_specific.c+=	${ALIGN_FUNCTIONS}
 pthread_specific.po: pthread_specific.o
 	${_MKTARGET_CREATE}
 	cp pthread_specific.o pthread_specific.po
 
-# Internal spinlock routines are performance critical.  Don't profile them,
-# it's incompatibile with -fomit-frame-pointer.
-COPTS.pthread_lock.c+=	${OMIT_FRAME_POINTER} ${ALIGN_FUNCTIONS}
-pthread_lock.po: pthread_lock.o
-	${_MKTARGET_CREATE}
-	cp pthread_lock.o pthread_lock.po
-
-COPTS.pthread_mutex.c+=	${OMIT_FRAME_POINTER} ${ALIGN_FUNCTIONS}
-pthread_mutex.po: pthread_mutex.o
-	${_MKTARGET_CREATE}
-	cp pthread_mutex.o pthread_mutex.po
-
 COPTS.pthread.c += -Wno-stack-protector -Wno-format-nonliteral
 COPTS.pthread_attr.c += -Wno-format-nonliteral
 
Index: src/share/mk/bsd.host.mk
===================================================================
RCS file: /home/chs/netbsd/cvs/src/share/mk/bsd.host.mk,v
retrieving revision 1.3
diff -u -p -r1.3 bsd.host.mk
--- src/share/mk/bsd.host.mk	20 Feb 2016 15:18:46 -0000	1.3
+++ src/share/mk/bsd.host.mk	9 May 2017 09:08:57 -0000
@@ -8,12 +8,13 @@ HOST_DBG?= -g
 .else
 HOST_DBG?= -O
 .endif
+HOST_DTRACE_OPTS?=	-fno-omit-frame-pointer -fno-optimize-sibling-calls -fno-ipa-sra
 
 # Helpers for cross-compiling
 HOST_CC?=	cc
 HOST_CFLAGS?=	${HOST_DBG}
-HOST_COMPILE.c?=${HOST_CC} ${HOST_CFLAGS} ${HOST_CPPFLAGS} -c
-HOST_COMPILE.cc?=      ${HOST_CXX} ${HOST_CXXFLAGS} ${HOST_CPPFLAGS} -c
+HOST_COMPILE.c?=${HOST_CC} ${HOST_CFLAGS} ${HOST_DTRACE_OPTS} ${HOST_CPPFLAGS} -c
+HOST_COMPILE.cc?=      ${HOST_CXX} ${HOST_CXXFLAGS} ${HOST_DTRACE_OPTS} ${HOST_CPPFLAGS} -c
 HOST_LINK.cc?=  ${HOST_CXX} ${HOST_CXXFLAGS} ${HOST_CPPFLAGS} ${HOST_LDFLAGS}
 .if defined(HOSTPROG_CXX)
 HOST_LINK.c?=   ${HOST_LINK.cc}
Index: src/share/mk/sys.mk
===================================================================
RCS file: /home/chs/netbsd/cvs/src/share/mk/sys.mk,v
retrieving revision 1.129
diff -u -p -r1.129 sys.mk
--- src/share/mk/sys.mk	30 Mar 2016 17:35:43 -0000	1.129
+++ src/share/mk/sys.mk	8 May 2017 14:30:36 -0000
@@ -37,9 +37,10 @@ DBG?=	-O2 ${"${.TARGET:M*.po}" == "":? -
 .else
 DBG?=	-O2
 .endif
+DTRACE_OPTS?=	-fno-omit-frame-pointer -fno-optimize-sibling-calls -fno-ipa-sra -fno-ipa-icf
 CFLAGS?=	${DBG}
 LDFLAGS?=
-COMPILE.c?=	${CC} ${CFLAGS} ${CPPFLAGS} -c
+COMPILE.c?=	${CC} ${CFLAGS} ${DTRACE_OPTS} ${CPPFLAGS} -c
 LINK.c?=	${CC} ${CFLAGS} ${CPPFLAGS} ${LDFLAGS}
 
 # C Type Format data is required for DTrace
@@ -55,7 +56,7 @@ __ALLSRC3=	${empty(NETBSDSRCDIR):?${__AL
 __BUILDSEED=	${BUILDSEED}/${__ALLSRC3:O}/${.TARGET}
 _CXXSEED?=	${BUILDSEED:D-frandom-seed=${__BUILDSEED:hash}}
 
-COMPILE.cc?=	${CXX} ${_CXXSEED} ${CXXFLAGS} ${CPPFLAGS} -c
+COMPILE.cc?=	${CXX} ${_CXXSEED} ${CXXFLAGS} ${DTRACE_OPTS} ${CPPFLAGS} -c
 LINK.cc?=	${CXX} ${CXXFLAGS} ${CPPFLAGS} ${LDFLAGS}
 
 OBJC?=		${CC}
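
The DTRACE_OPTS additions above keep generated code walkable by DTrace: -fno-omit-frame-pointer preserves the frame chain that the stack() action follows, while -fno-optimize-sibling-calls and the -fno-ipa-* flags stop GCC from eliding or merging functions that FBT probes would otherwise attach to. A minimal C illustration of the sibling-call case (function names are hypothetical):

	/*
	 * With -foptimize-sibling-calls the call below may be compiled as
	 * a plain jump, so handle() never appears as a distinct frame in a
	 * DTrace stack() trace; -fno-optimize-sibling-calls keeps the frame.
	 */
	int log_event(int);

	int
	handle(int ev)
	{

		return log_event(ev);	/* tail call: a sibling-call candidate */
	}

Likewise, -fno-ipa-icf prevents identical-code folding, which could otherwise leave two functions sharing one address and confuse per-function probes.
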
Index: src/sys/arch/arm/arm/undefined.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/arch/arm/arm/undefined.c,v
retrieving revision 1.60
diff -u -p -r1.60 undefined.c
--- src/sys/arch/arm/arm/undefined.c	2 Jul 2017 16:16:44 -0000	1.60
+++ src/sys/arch/arm/arm/undefined.c	3 Jul 2017 21:56:29 -0000
@@ -225,34 +225,26 @@ static struct undefined_handler gdb_uh_t
 dtrace_doubletrap_func_t	dtrace_doubletrap_func = NULL;
 dtrace_trap_func_t		dtrace_trap_func = NULL;
 
-int (* dtrace_invop_jump_addr)(uintptr_t, struct trapframe *, uintptr_t);
-void (* dtrace_emulation_jump_addr)(int, struct trapframe *);
+int (* dtrace_invop_jump_addr)(struct trapframe *);
 
 static int
 dtrace_trapper(u_int addr, struct trapframe *frame)
 {
-	int op;
-	struct trapframe back;
 	u_int insn = read_insn(addr, false);
 
-	if (dtrace_invop_jump_addr == NULL || dtrace_emulation_jump_addr == NULL)
+	if (dtrace_invop_jump_addr == NULL)
 		return 1;
 
 	if (!DTRACE_IS_BREAKPOINT(insn))
 		return 1;
 
-	/* cond value is encoded in the first byte */
+	/* cond value is encoded in the low nibble */
 	if (!arm_cond_ok_p(__SHIFTIN(insn, INSN_COND_MASK), frame->tf_spsr)) {
 		frame->tf_pc += INSN_SIZE;
 		return 0;
 	}
 
-	back = *frame;
-	op = dtrace_invop_jump_addr(addr, frame, frame->tf_r0);
-	*frame = back;
-
-	dtrace_emulation_jump_addr(op, frame);
-
+	dtrace_invop_jump_addr(frame);
 	return 0;
 }
 #endif
@@ -333,10 +325,11 @@ undefinedinstruction(trapframe_t *tf)
 	l = curlwp;
 
 #ifdef __PROG26
-	if ((tf->tf_r15 & R15_MODE) == R15_MODE_USR) {
+	if ((tf->tf_r15 & R15_MODE) == R15_MODE_USR)
 #else
-	if ((tf->tf_spsr & PSR_MODE) == PSR_USR32_MODE) {
+	if ((tf->tf_spsr & PSR_MODE) == PSR_USR32_MODE)
 #endif
+	{
 		user = 1;
 		LWP_CACHE_CREDS(l, l->l_proc);
 	} else
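
With this change the trapper delegates everything to one hook: the fbt module both dispatches the probe and emulates or skips the displaced instruction itself, so the kernel no longer needs to save and restore the trapframe around the call or run a separate emulation hook. A sketch of a handler matching the new int (*)(struct trapframe *) signature; fbt_lookup_probe() and the probe structure are hypothetical stand-ins for what the fbt module actually does:

	static int
	example_invop(struct trapframe *frame)
	{
		struct example_probe *p;

		/* hypothetical lookup of a probe registered at this PC */
		p = fbt_lookup_probe(frame->tf_pc);
		if (p == NULL)
			return 0;			/* not ours */
		dtrace_probe(p->ep_id, frame->tf_r0, frame->tf_r1,
		    frame->tf_r2, frame->tf_r3, 0);
		frame->tf_pc += INSN_SIZE;	/* or emulate the displaced insn */
		return 1;
	}
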
Index: src/sys/arch/arm/arm32/db_interface.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/arch/arm/arm32/db_interface.c,v
retrieving revision 1.56
diff -u -p -r1.56 db_interface.c
--- src/sys/arch/arm/arm32/db_interface.c	30 Jun 2017 08:10:50 -0000	1.56
+++ src/sys/arch/arm/arm32/db_interface.c	3 Jul 2017 21:56:29 -0000
@@ -251,87 +251,8 @@ db_read_bytes(vaddr_t addr, size_t size,
 static void
 db_write_text(vaddr_t addr, size_t size, const char *data)
 {
-	struct pmap *pmap = pmap_kernel();
-	pd_entry_t *pde, oldpde, tmppde;
-	pt_entry_t *pte, oldpte, tmppte;
-	vaddr_t pgva;
-	size_t limit, savesize;
-	char *dst;
-
-	/* XXX: gcc */
-	oldpte = 0;
-
-	if ((savesize = size) == 0)
-		return;
-
-	dst = (char *) addr;
-
-	do {
-		/* Get the PDE of the current VA. */
-		if (pmap_get_pde_pte(pmap, (vaddr_t) dst, &pde, &pte) == false)
-			goto no_mapping;
-		switch ((oldpde = *pde) & L1_TYPE_MASK) {
-		case L1_TYPE_S:
-			pgva = (vaddr_t)dst & L1_S_FRAME;
-			limit = L1_S_SIZE - ((vaddr_t)dst & L1_S_OFFSET);
-
-			tmppde = l1pte_set_writable(oldpde);
-			*pde = tmppde;
-			PTE_SYNC(pde);
-			break;
-
-		case L1_TYPE_C:
-			pgva = (vaddr_t)dst & L2_S_FRAME;
-			limit = L2_S_SIZE - ((vaddr_t)dst & L2_S_OFFSET);
-
-			if (pte == NULL)
-				goto no_mapping;
-			oldpte = *pte;
-			tmppte = l2pte_set_writable(oldpte);
-			*pte = tmppte;
-			PTE_SYNC(pte);
-			break;
-
-		default:
-		no_mapping:
-			printf(" address 0x%08lx not a valid page\n",
-			    (vaddr_t) dst);
-			return;
-		}
-		cpu_tlb_flushD_SE(pgva);
-		cpu_cpwait();
-
-		if (limit > size)
-			limit = size;
-		size -= limit;
-
-		/*
-		 * Page is now writable.  Do as much access as we
-		 * can in this page.
-		 */
-		for (; limit > 0; limit--)
-			*dst++ = *data++;
-
-		/*
-		 * Restore old mapping permissions.
-		 */
-		switch (oldpde & L1_TYPE_MASK) {
-		case L1_TYPE_S:
-			*pde = oldpde;
-			PTE_SYNC(pde);
-			break;
-
-		case L1_TYPE_C:
-			*pte = oldpte;
-			PTE_SYNC(pte);
-			break;
-		}
-		cpu_tlb_flushD_SE(pgva);
-		cpu_cpwait();
-	} while (size != 0);
 
-	/* Sync the I-cache. */
-	cpu_icache_sync_range(addr, savesize);
+	ktext_write((void *)addr, data, size);
 }
 
 /*
Index: src/sys/arch/arm/arm32/vm_machdep.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/arch/arm/arm32/vm_machdep.c,v
retrieving revision 1.71
diff -u -p -r1.71 vm_machdep.c
--- src/sys/arch/arm/arm32/vm_machdep.c	8 Oct 2017 12:09:44 -0000	1.71
+++ src/sys/arch/arm/arm32/vm_machdep.c	9 Oct 2017 19:43:08 -0000
@@ -288,4 +288,92 @@ vunmapbuf(struct buf *bp, vsize_t len)
 	bp->b_saveaddr = 0;
 }
 
+void
+ktext_write(void *to, const void *from, size_t size)
+{
+	struct pmap *pmap = pmap_kernel();
+	pd_entry_t *pde, oldpde, tmppde;
+	pt_entry_t *pte, oldpte, tmppte;
+	vaddr_t pgva;
+	size_t limit, savesize;
+	const char *src;
+	char *dst;
+
+	/* XXX: gcc */
+	oldpte = 0;
+
+	if ((savesize = size) == 0)
+		return;
+
+	dst = (char *) to;
+	src = (const char *) from;
+
+	do {
+		/* Get the PDE of the current VA. */
+		if (pmap_get_pde_pte(pmap, (vaddr_t) dst, &pde, &pte) == false)
+			goto no_mapping;
+		switch ((oldpde = *pde) & L1_TYPE_MASK) {
+		case L1_TYPE_S:
+			pgva = (vaddr_t)dst & L1_S_FRAME;
+			limit = L1_S_SIZE - ((vaddr_t)dst & L1_S_OFFSET);
+
+			tmppde = l1pte_set_writable(oldpde);
+			*pde = tmppde;
+			PTE_SYNC(pde);
+			break;
+
+		case L1_TYPE_C:
+			pgva = (vaddr_t)dst & L2_S_FRAME;
+			limit = L2_S_SIZE - ((vaddr_t)dst & L2_S_OFFSET);
+
+			if (pte == NULL)
+				goto no_mapping;
+			oldpte = *pte;
+			tmppte = l2pte_set_writable(oldpte);
+			*pte = tmppte;
+			PTE_SYNC(pte);
+			break;
+
+		default:
+		no_mapping:
+			printf(" address 0x%08lx not a valid page\n",
+			    (vaddr_t) dst);
+			return;
+		}
+		cpu_tlb_flushD_SE(pgva);
+		cpu_cpwait();
+
+		if (limit > size)
+			limit = size;
+		size -= limit;
+
+		/*
+		 * Page is now writable.  Do as much access as we
+		 * can in this page.
+		 */
+		for (; limit > 0; limit--)
+			*dst++ = *src++;
+
+		/*
+		 * Restore old mapping permissions.
+		 */
+		switch (oldpde & L1_TYPE_MASK) {
+		case L1_TYPE_S:
+			*pde = oldpde;
+			PTE_SYNC(pde);
+			break;
+
+		case L1_TYPE_C:
+			*pte = oldpte;
+			PTE_SYNC(pte);
+			break;
+		}
+		cpu_tlb_flushD_SE(pgva);
+		cpu_cpwait();
+	} while (size != 0);
+
+	/* Sync the I-cache. */
+	cpu_icache_sync_range((vaddr_t)to, savesize);
+}
+
 /* End of vm_machdep.c */
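
The body of db_write_text() moves here unchanged as ktext_write(), so DDB and the DTrace FBT code can share one routine that temporarily makes a kernel text mapping writable, copies the bytes, restores the old permissions, and syncs the I-cache. A hedged sketch of a caller; the breakpoint encoding below is hypothetical, not taken from this patch:

	#define EXAMPLE_BREAKPOINT	0xe7ffffffU	/* hypothetical undefined insn */

	static void
	example_patch_probe(uint32_t *patchpoint)
	{
		uint32_t insn = EXAMPLE_BREAKPOINT;

		ktext_write(patchpoint, &insn, sizeof(insn));
	}
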
Index: src/sys/arch/evbarm/conf/CUBIETRUCK
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/arch/evbarm/conf/CUBIETRUCK,v
retrieving revision 1.11
diff -u -p -r1.11 CUBIETRUCK
--- src/sys/arch/evbarm/conf/CUBIETRUCK	28 May 2017 18:39:58 -0000	1.11
+++ src/sys/arch/evbarm/conf/CUBIETRUCK	26 Jun 2017 16:49:17 -0000
@@ -37,3 +37,9 @@ awintve0 	at awinio0
 cinclude	"arch/evbarm/conf/CUBIETRUCK.local"
 
 #options 	VERBOSE_INIT_ARM # verbose bootstraping messages
+
+
+options INSECURE
+makeoptions	COPTS="-O2 -fno-omit-frame-pointer -fno-optimize-sibling-calls -fno-ipa-sra -fno-ipa-icf"
+#makeoptions	COPTS="-O2 -fno-optimize-sibling-calls -fno-ipa-sra -fno-ipa-icf"
+#config		netbsd		root on awge0 type nfs
Index: src/sys/arch/evbarm/conf/TEGRA
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/arch/evbarm/conf/TEGRA,v
retrieving revision 1.33
diff -u -p -r1.33 TEGRA
--- src/sys/arch/evbarm/conf/TEGRA	26 Sep 2017 16:12:45 -0000	1.33
+++ src/sys/arch/evbarm/conf/TEGRA	5 Oct 2017 14:28:08 -0000
@@ -198,3 +198,11 @@ include "dev/usb/usbdevices.config"
 midi*		at midibus?
 
 cinclude "arch/evbarm/conf/TEGRA.local"
+
+
+options INSECURE
+#makeoptions	COPTS="-O2 -fno-omit-frame-pointer -fno-optimize-sibling-calls -fno-ipa-sra -fno-ipa-icf"
+makeoptions	COPTS="-O2 -fno-optimize-sibling-calls -fno-ipa-sra -fno-ipa-icf"
+options 	DIAGNOSTIC	# internal consistency checks
+options 	DEBUG
+options 	LOCKDEBUG
Index: src/sys/fs/efs/efs_genfs.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/fs/efs/efs_genfs.c,v
retrieving revision 1.1
diff -u -p -r1.1 efs_genfs.c
--- src/sys/fs/efs/efs_genfs.c	29 Jun 2007 23:30:28 -0000	1.1
+++ src/sys/fs/efs/efs_genfs.c	10 Jul 2017 22:19:22 -0000
@@ -42,6 +42,7 @@ const struct genfs_ops efs_genfsops = {
 	.gop_size  = genfs_size,
 	.gop_alloc = efs_gop_alloc,
 	.gop_write = genfs_gop_write,
+	.gop_putrange = genfs_gop_putrange,
 };
 
 int
Index: src/sys/fs/msdosfs/msdosfs_denode.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/fs/msdosfs/msdosfs_denode.c,v
retrieving revision 1.56
diff -u -p -r1.56 msdosfs_denode.c
--- src/sys/fs/msdosfs/msdosfs_denode.c	26 May 2017 14:34:19 -0000	1.56
+++ src/sys/fs/msdosfs/msdosfs_denode.c	10 Jul 2017 22:19:30 -0000
@@ -125,6 +125,7 @@ static const struct genfs_ops msdosfs_ge
 	.gop_alloc = msdosfs_gop_alloc,
 	.gop_write = genfs_gop_write,
 	.gop_markupdate = msdosfs_gop_markupdate,
+	.gop_putrange = genfs_gop_putrange,
 };
 
 MALLOC_DECLARE(M_MSDOSFSFAT);
Index: src/sys/fs/nilfs/nilfs_vfsops.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/fs/nilfs/nilfs_vfsops.c,v
retrieving revision 1.23
diff -u -p -r1.23 nilfs_vfsops.c
--- src/sys/fs/nilfs/nilfs_vfsops.c	17 Feb 2017 08:31:24 -0000	1.23
+++ src/sys/fs/nilfs/nilfs_vfsops.c	10 Jul 2017 22:19:34 -0000
@@ -148,6 +148,7 @@ static const struct genfs_ops nilfs_genf
 	.gop_alloc = nilfs_gop_alloc,
 	.gop_write = genfs_gop_write_rwmap,
 	.gop_markupdate = nilfs_gop_markupdate,
+	.gop_putrange = genfs_gop_putrange,
 };
 
 /* --------------------------------------------------------------------- */
Index: src/sys/fs/puffs/puffs_vfsops.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/fs/puffs/puffs_vfsops.c,v
retrieving revision 1.120
diff -u -p -r1.120 puffs_vfsops.c
--- src/sys/fs/puffs/puffs_vfsops.c	1 Apr 2017 19:35:56 -0000	1.120
+++ src/sys/fs/puffs/puffs_vfsops.c	10 Jul 2017 22:19:53 -0000
@@ -76,6 +76,7 @@ static const struct genfs_ops puffs_genf
 #if 0
 	.gop_alloc, should ask userspace
 #endif
+	.gop_putrange = genfs_gop_putrange,
 };
 
 /*
Index: src/sys/fs/sysvbfs/sysvbfs.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/fs/sysvbfs/sysvbfs.c,v
retrieving revision 1.16
diff -u -p -r1.16 sysvbfs.c
--- src/sys/fs/sysvbfs/sysvbfs.c	17 Feb 2017 08:31:24 -0000	1.16
+++ src/sys/fs/sysvbfs/sysvbfs.c	10 Jul 2017 22:20:03 -0000
@@ -108,6 +108,7 @@ const struct genfs_ops sysvbfs_genfsops 
 	.gop_size = genfs_size,
 	.gop_alloc = sysvbfs_gop_alloc,
 	.gop_write = genfs_gop_write,
+	.gop_putrange = genfs_gop_putrange,
 };
 
 struct vfsops sysvbfs_vfsops = {
Index: src/sys/fs/tmpfs/tmpfs_subr.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/fs/tmpfs/tmpfs_subr.c,v
retrieving revision 1.102
diff -u -p -r1.102 tmpfs_subr.c
--- src/sys/fs/tmpfs/tmpfs_subr.c	4 Jan 2017 10:06:43 -0000	1.102
+++ src/sys/fs/tmpfs/tmpfs_subr.c	27 Apr 2017 04:44:30 -0000
@@ -275,7 +275,7 @@ tmpfs_newvnode(struct mount *mp, struct 
 	case VREG:
 		/* Regular file.  Create an underlying UVM object. */
 		node->tn_spec.tn_reg.tn_aobj =
-		    uao_create(INT32_MAX - PAGE_SIZE, 0);
+		    uao_create(((uint64_t)1 << 63) - PAGE_SIZE, 0);
 		node->tn_spec.tn_reg.tn_aobj_pages = 0;
 		break;
 	default:
Index: src/sys/fs/udf/udf_subr.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/fs/udf/udf_subr.c,v
retrieving revision 1.139
diff -u -p -r1.139 udf_subr.c
--- src/sys/fs/udf/udf_subr.c	1 Apr 2017 19:35:56 -0000	1.139
+++ src/sys/fs/udf/udf_subr.c	10 Jul 2017 22:20:07 -0000
@@ -4028,6 +4028,7 @@ static const struct genfs_ops udf_genfso
 	.gop_alloc = udf_gop_alloc,
 	.gop_write = genfs_gop_write_rwmap,
 	.gop_markupdate = udf_gop_markupdate,
+	.gop_putrange = genfs_gop_putrange,
 };
 
 
Index: src/sys/fs/v7fs/v7fs_extern.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/fs/v7fs/v7fs_extern.c,v
retrieving revision 1.5
diff -u -p -r1.5 v7fs_extern.c
--- src/sys/fs/v7fs/v7fs_extern.c	17 Feb 2017 08:31:25 -0000	1.5
+++ src/sys/fs/v7fs/v7fs_extern.c	10 Jul 2017 22:20:13 -0000
@@ -222,6 +222,7 @@ const struct genfs_ops v7fs_genfsops = {
 	.gop_size = genfs_size,
 	.gop_alloc = v7fs_gop_alloc,
 	.gop_write = genfs_gop_write,
+	.gop_putrange = genfs_gop_putrange,
 };
 
 struct vfsops v7fs_vfsops = {
Index: src/sys/kern/kern_ctf.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/kern/kern_ctf.c,v
retrieving revision 1.6
diff -u -p -r1.6 kern_ctf.c
--- src/sys/kern/kern_ctf.c	7 Jul 2016 06:55:43 -0000	1.6
+++ src/sys/kern/kern_ctf.c	5 Jul 2017 22:56:45 -0000
@@ -28,6 +28,7 @@
  */
 
 #define ELFSIZE ARCH_ELFSIZE
+#include <sys/proc.h>
 #include <sys/module.h>
 #include <sys/exec.h>
 #include <sys/exec_elf.h>
@@ -69,9 +70,9 @@ z_free(void *nil, void *ptr)
 }
 
 int
-mod_ctf_get(struct module *mod, mod_ctf_t *mc)
+mod_ctf_get(struct module *mod, mod_ctf_t **mcp)
 {
-	mod_ctf_t *cmc;
+	mod_ctf_t *mc;
 	struct ksyms_symtab *st;
 	void * ctftab = NULL;
 	size_t sz;
@@ -82,22 +83,24 @@ mod_ctf_get(struct module *mod, mod_ctf_
 	uint8_t *ctfaddr;
 	size_t ctfsize;
 
-	if (mc == NULL) {
-		return EINVAL;
-	}
+	/*
+	 * Return the cached mc if there is one already.
+	 */
 
-	/* Set the defaults for no CTF present. That's not a crime! */
-	memset(mc, 0, sizeof(*mc));
+	extern specificdata_key_t fbt_module_key;
 
-	/* cached mc? */
-	if (mod->mod_ctf != NULL) {
-		cmc = mod->mod_ctf;
-		*mc = *cmc;
+	mc = module_getspecific(mod, fbt_module_key);
+	if (mc != NULL) {
+		*mcp = mc;
 		return (0);
 	}
 
-	st = ksyms_get_mod(mod->mod_info->mi_name);
+	/*
+	 * Allocate and initialize a new mc.
+	 */
 
+	mc = kmem_zalloc(sizeof(mod_ctf_t), KM_SLEEP);
+	st = ksyms_get_mod(module_name(mod));
 	if (st != NULL) {
 		mc->nmap     = st->sd_nmap;
 		mc->nmapsize = st->sd_nmapsize;
@@ -106,7 +109,8 @@ mod_ctf_get(struct module *mod, mod_ctf_
 	if (mod->mod_kobj == NULL) {
 	    	/* no kobj entry, try building from ksyms list */
 		if (st == NULL) {
-			return ENOENT;
+			error = ENOENT;
+			goto out;
 		}
 
 		ctfaddr = st->sd_ctfstart;
@@ -118,7 +122,8 @@ mod_ctf_get(struct module *mod, mod_ctf_
 		mc->nsym   = st->sd_symsize / sizeof(Elf_Sym);
 	} else {
 		if (kobj_find_section(mod->mod_kobj, ".SUNW_ctf", (void **)&ctfaddr, &ctfsize)) {
-			return ENOENT;
+			error = ENOENT;
+			goto out;
 		}
 
 		mc->symtab = mod->mod_kobj->ko_symtab;
@@ -216,21 +221,19 @@ mod_ctf_get(struct module *mod, mod_ctf_
 	}
 
 	/* Got the CTF data! */
-	mc->ctftab = ctftab;
 	mc->ctfcnt = ctfsize;
-
-	/* cache it */
-	cmc = kmem_alloc(sizeof(mod_ctf_t), KM_SLEEP);
-
-	*cmc = *mc;
-	mod->mod_ctf = cmc;
-
-	/* We'll retain the memory allocated for the CTF data. */
+	mc->ctftab = ctftab;
 	ctfbuf = NULL;
 
+	module_setspecific(mod, fbt_module_key, mc);
+	*mcp = mc;
+	mc = NULL;
+
 out:
 	if (ctfbuf != NULL)
 		free(ctfbuf, M_TEMP);
+	if (mc != NULL)
+		kmem_free(mc, sizeof(*mc));
 
 	return (error);
 }
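
mod_ctf_get() now hands back a pointer to a per-module mod_ctf_t cached in module specificdata (under the fbt module's fbt_module_key) instead of copying into a caller-supplied structure, and the error paths free a partially built mc at the out: label. A sketch of the new calling convention, on the assumption that the returned structure is owned by the module's specificdata and must not be freed by the caller:

	static int
	example_use_ctf(struct module *mod)
	{
		mod_ctf_t *mc;
		int error;

		error = mod_ctf_get(mod, &mc);	/* returns the cached copy */
		if (error != 0)
			return error;
		/* mc lives in the module's specificdata; do not free it */
		return mc->ctfcnt > 0 ? 0 : ENOENT;
	}
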
Index: src/sys/kern/kern_module.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/kern/kern_module.c,v
retrieving revision 1.125
diff -u -p -r1.125 kern_module.c
--- src/sys/kern/kern_module.c	1 Jun 2017 02:45:13 -0000	1.125
+++ src/sys/kern/kern_module.c	2 Jun 2017 23:04:38 -0000
@@ -65,6 +65,21 @@ struct modlist        module_list = TAIL
 struct modlist        module_builtins = TAILQ_HEAD_INITIALIZER(module_builtins);
 static struct modlist module_bootlist = TAILQ_HEAD_INITIALIZER(module_bootlist);
 
+struct module_callbacks {
+	TAILQ_ENTRY(module_callbacks) modcb_list;
+	void (*modcb_load)(struct module *);
+	void (*modcb_unload)(struct module *);
+};
+TAILQ_HEAD(modcblist, module_callbacks);
+static struct modcblist modcblist;
+
+static module_t *module_netbsd;
+static const modinfo_t module_netbsd_modinfo = {
+	.mi_version = __NetBSD_Version__,
+	.mi_class = MODULE_CLASS_MISC,
+	.mi_name = "netbsd"
+};
+
 static module_t	*module_active;
 bool		module_verbose_on;
 #ifdef MODULAR_DEFAULT_AUTOLOAD
@@ -84,11 +99,14 @@ int (*module_load_vfs_vec)(const char *,
 
 static kauth_listener_t	module_listener;
 
+static specificdata_domain_t module_specificdata_domain;
+
 /* Ensure that the kernel's link set isn't empty. */
 static modinfo_t module_dummy;
 __link_set_add_rodata(modules, module_dummy);
 
 static module_t	*module_newmodule(modsrc_t);
+static void	module_free(module_t *);
 static void	module_require_force(module_t *);
 static int	module_do_load(const char *, bool, int, prop_dictionary_t,
 		    module_t **, modclass_t modclass, bool);
@@ -106,6 +124,9 @@ static bool	module_merge_dicts(prop_dict
 static void	sysctl_module_setup(void);
 static int	sysctl_module_autotime(SYSCTLFN_PROTO);
 
+static void	module_callback_load(struct module *);
+static void	module_callback_unload(struct module *);
+
 #define MODULE_CLASS_MATCH(mi, modclass) \
 	((modclass) == MODULE_CLASS_ANY || (modclass) == (mi)->mi_class)
 
@@ -116,6 +137,13 @@ module_incompat(const modinfo_t *mi, int
 	    mi->mi_name, modclass, mi->mi_class);
 }
 
+struct module *
+module_kernel(void)
+{
+
+	return module_netbsd;
+}
+
 /*
  * module_error:
  *
@@ -152,6 +180,30 @@ module_print(const char *fmt, ...)
 	}
 }
 
+/*
+ * module_name:
+ *
+ *	Utility function: return the module's name.
+ */
+const char *
+module_name(struct module *mod)
+{
+
+	return mod->mod_info->mi_name;
+}
+
+/*
+ * module_source:
+ *
+ *	Utility function: return the module's source.
+ */
+modsrc_t
+module_source(struct module *mod)
+{
+
+	return mod->mod_source;
+}
+
 static int
 module_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
     void *arg0, void *arg1, void *arg2, void *arg3)
@@ -179,12 +231,22 @@ module_newmodule(modsrc_t source)
 
 	mod = kmem_zalloc(sizeof(*mod), KM_SLEEP);
 	mod->mod_source = source;
-	mod->mod_info = NULL;
-	mod->mod_flags = 0;
+	specificdata_init(module_specificdata_domain, &mod->mod_sdref);
 	return mod;
 }
 
 /*
+ * Free a module_t
+ */
+static void
+module_free(module_t *mod)
+{
+
+	specificdata_fini(module_specificdata_domain, &mod->mod_sdref);
+	kmem_free(mod, sizeof(*mod));
+}
+
+/*
  * Require the -f (force) flag to load a module
  */
 static void
@@ -280,7 +342,7 @@ module_builtin_add(modinfo_t *const *mip
 	if (rv != 0) {
 		for (i = 0; i < nmodinfo; i++) {
 			if (modp[i])
-				kmem_free(modp[i], sizeof(*modp[i]));
+				module_free(modp[i]);
 		}
 	}
 	kmem_free(modp, sizeof(*modp) * nmodinfo);
@@ -347,6 +409,7 @@ module_init(void)
 	}
 	cv_init(&module_thread_cv, "mod_unld");
 	mutex_init(&module_thread_lock, MUTEX_DEFAULT, IPL_NONE);
+	TAILQ_INIT(&modcblist);
 
 #ifdef MODULAR	/* XXX */
 	module_init_md();
@@ -373,6 +436,11 @@ module_init(void)
 	}
 
 	sysctl_module_setup();
+	module_specificdata_domain = specificdata_domain_create();
+
+	module_netbsd = module_newmodule(MODULE_SOURCE_KERNEL);
+	module_netbsd->mod_refcnt = 1;
+	module_netbsd->mod_info = &module_netbsd_modinfo;
 }
 
 /*
@@ -685,21 +753,13 @@ module_lookup(const char *name)
  *	responsibility to ensure that the reference is dropped
  *	later.
  */
-int
-module_hold(const char *name)
+void
+module_hold(module_t *mod)
 {
-	module_t *mod;
 
 	kernconfig_lock();
-	mod = module_lookup(name);
-	if (mod == NULL) {
-		kernconfig_unlock();
-		return ENOENT;
-	}
 	mod->mod_refcnt++;
 	kernconfig_unlock();
-
-	return 0;
 }
 
 /*
@@ -708,16 +768,11 @@ module_hold(const char *name)
  *	Release a reference acquired with module_hold().
  */
 void
-module_rele(const char *name)
+module_rele(module_t *mod)
 {
-	module_t *mod;
 
 	kernconfig_lock();
-	mod = module_lookup(name);
-	if (mod == NULL) {
-		kernconfig_unlock();
-		panic("%s: gone", __func__);
-	}
+	KASSERT(mod->mod_refcnt > 0);
 	mod->mod_refcnt--;
 	kernconfig_unlock();
 }
@@ -982,7 +1037,7 @@ module_do_load(const char *name, bool is
 				module_error("vfs load failed for `%s', "
 				    "error %d", name, error);
 #endif
-			kmem_free(mod, sizeof(*mod));
+			module_free(mod);
 			depth--;
 			return error;
 		}
@@ -1152,6 +1207,7 @@ module_do_load(const char *name, bool is
 	}
 	depth--;
 	module_print("module `%s' loaded successfully", mi->mi_name);
+	module_callback_load(mod);
 	return 0;
 
  fail:
@@ -1162,7 +1218,7 @@ module_do_load(const char *name, bool is
 		filedict = NULL;
 	}
 	TAILQ_REMOVE(pending, mod, mod_chain);
-	kmem_free(mod, sizeof(*mod));
+	module_free(mod);
 	depth--;
 	return error;
 }
@@ -1207,6 +1263,7 @@ module_do_unload(const char *name, bool 
 
 	prev_active = module_active;
 	module_active = mod;
+	module_callback_unload(mod);
 	error = (*mod->mod_info->mi_modcmd)(MODULE_CMD_FINI, NULL);
 	module_active = prev_active;
 	if (error != 0) {
@@ -1230,7 +1287,7 @@ module_do_unload(const char *name, bool 
 		TAILQ_INSERT_TAIL(&module_builtins, mod, mod_chain);
 		module_builtinlist++;
 	} else {
-		kmem_free(mod, sizeof(*mod));
+		module_free(mod);
 	}
 	module_gen++;
 
@@ -1280,7 +1337,7 @@ module_prime(const char *name, void *bas
 
 	error = kobj_load_mem(&mod->mod_kobj, name, base, size);
 	if (error != 0) {
-		kmem_free(mod, sizeof(*mod));
+		module_free(mod);
 		module_error("unable to load `%s' pushed by boot loader, "
 		    "error %d", name, error);
 		return error;
@@ -1288,7 +1345,7 @@ module_prime(const char *name, void *bas
 	error = module_fetch_info(mod);
 	if (error != 0) {
 		kobj_unload(mod->mod_kobj);
-		kmem_free(mod, sizeof(*mod));
+		module_free(mod);
 		module_error("unable to fetch_info for `%s' pushed by boot "
 		    "loader, error %d", name, error);
 		return error;
@@ -1534,3 +1591,89 @@ out:
 
 	return !error;
 }
+
+
+int
+module_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
+{
+
+	return specificdata_key_create(module_specificdata_domain, keyp, dtor);
+}
+
+void
+module_specific_key_delete(specificdata_key_t key)
+{
+
+	specificdata_key_delete(module_specificdata_domain, key);
+}
+
+void *
+module_getspecific(module_t *mod, specificdata_key_t key)
+{
+
+	return specificdata_getspecific(module_specificdata_domain,
+					&mod->mod_sdref, key);
+}
+
+void
+module_setspecific(module_t *mod, specificdata_key_t key, void *data)
+{
+
+	specificdata_setspecific(module_specificdata_domain,
+				 &mod->mod_sdref, key, data);
+}
+
+void *
+module_register_callbacks(void (*load)(struct module *), void (*unload)(struct module *))
+{
+	struct module_callbacks *modcb;
+	struct module *mod;
+
+	modcb = kmem_alloc(sizeof(*modcb), KM_SLEEP);
+	modcb->modcb_load = load;
+	modcb->modcb_unload = unload;
+
+	kernconfig_lock();
+
+	TAILQ_INSERT_TAIL(&modcblist, modcb, modcb_list);
+	TAILQ_FOREACH(mod, &module_list, mod_chain)
+		load(mod);
+
+	kernconfig_unlock();
+	return modcb;
+}
+
+void
+module_unregister_callbacks(void *opaque)
+{
+	struct module_callbacks *modcb;
+	struct module *mod;
+
+	modcb = opaque;
+	kernconfig_lock();
+	TAILQ_FOREACH(mod, &module_list, mod_chain)
+		modcb->modcb_unload(mod);
+	TAILQ_REMOVE(&modcblist, modcb, modcb_list);
+	kernconfig_unlock();
+	kmem_free(modcb, sizeof(*modcb));
+}
+
+static void
+module_callback_load(struct module *mod)
+{
+	struct module_callbacks *modcb;
+
+	TAILQ_FOREACH(modcb, &modcblist, modcb_list) {
+		modcb->modcb_load(mod);
+	}
+}
+
+static void
+module_callback_unload(struct module *mod)
+{
+	struct module_callbacks *modcb;
+
+	TAILQ_FOREACH(modcb, &modcblist, modcb_list) {
+		modcb->modcb_unload(mod);
+	}
+}
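
The new callback and specificdata interfaces let a subsystem such as DTrace hook module load and unload without kern_module.c knowing about it: module_register_callbacks() replays the load callback for every module already present (under kernconfig_lock), and per-module state hangs off mod_sdref instead of dedicated module_t fields like the old mod_ctf. A sketch of a client, with hypothetical names:

	static specificdata_key_t example_key;
	static void *example_cb_handle;

	static void
	example_load_cb(struct module *mod)
	{
		/* set up per-module state keyed by example_key */
		module_setspecific(mod, example_key, NULL);
	}

	static void
	example_unload_cb(struct module *mod)
	{
		/* tear down whatever was stored under example_key */
	}

	static void
	example_init(void)
	{

		module_specific_key_create(&example_key, NULL);
		example_cb_handle = module_register_callbacks(example_load_cb,
		    example_unload_cb);
	}
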
Index: src/sys/kern/kern_mutex.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/kern/kern_mutex.c,v
retrieving revision 1.67
diff -u -p -r1.67 kern_mutex.c
--- src/sys/kern/kern_mutex.c	16 Sep 2017 23:55:33 -0000	1.67
+++ src/sys/kern/kern_mutex.c	5 Oct 2017 14:28:18 -0000
@@ -59,6 +59,9 @@ __KERNEL_RCSID(0, "$NetBSD: kern_mutex.c
 
 #include <machine/lock.h>
 
+#define MUTEX_PANIC_SKIP_SPIN 0
+#define MUTEX_PANIC_SKIP_ADAPTIVE 1
+
 /*
  * When not running a debug kernel, spin mutexes are not much
  * more than an splraiseipl() and splx() pair.
@@ -480,8 +483,10 @@ mutex_vector_enter(kmutex_t *mtx)
 		 * to reduce cache line ping-ponging between CPUs.
 		 */
 		do {
+#if MUTEX_PANIC_SKIP_SPIN
 			if (panicstr != NULL)
 				break;
+#endif
 			while (MUTEX_SPINBIT_LOCKED_P(mtx)) {
 				SPINLOCK_BACKOFF(count);
 #ifdef LOCKDEBUG
@@ -537,10 +542,12 @@ mutex_vector_enter(kmutex_t *mtx)
 			owner = mtx->mtx_owner;
 			continue;
 		}
+#if MUTEX_PANIC_SKIP_ADAPTIVE
 		if (__predict_false(panicstr != NULL)) {
 			KPREEMPT_ENABLE(curlwp);
 			return;
 		}
+#endif
 		if (__predict_false(MUTEX_OWNER(owner) == curthread)) {
 			MUTEX_ABORT(mtx, "locking against myself");
 		}
@@ -716,8 +723,10 @@ mutex_vector_exit(kmutex_t *mtx)
 	if (MUTEX_SPIN_P(mtx)) {
 #ifdef FULL
 		if (__predict_false(!MUTEX_SPINBIT_LOCKED_P(mtx))) {
+#if MUTEX_PANIC_SKIP_SPIN
 			if (panicstr != NULL)
 				return;
+#endif
 			MUTEX_ABORT(mtx, "exiting unheld spin mutex");
 		}
 		MUTEX_UNLOCKED(mtx);
@@ -727,11 +736,13 @@ mutex_vector_exit(kmutex_t *mtx)
 		return;
 	}
 
+#if MUTEX_PANIC_SKIP_ADAPTIVE
 	if (__predict_false((uintptr_t)panicstr | cold)) {
 		MUTEX_UNLOCKED(mtx);
 		MUTEX_RELEASE(mtx);
 		return;
 	}
+#endif
 
 	curthread = (uintptr_t)curlwp;
 	MUTEX_DASSERT(mtx, curthread != 0);
@@ -922,8 +933,10 @@ mutex_spin_retry(kmutex_t *mtx)
 	 * to reduce cache line ping-ponging between CPUs.
 	 */
 	do {
+#if MUTEX_PANIC_SKIP_SPIN
 		if (panicstr != NULL)
 			break;
+#endif
 		while (MUTEX_SPINBIT_LOCKED_P(mtx)) {
 			SPINLOCK_BACKOFF(count);
 #ifdef LOCKDEBUG
Index: src/sys/kern/subr_copy.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/kern/subr_copy.c,v
retrieving revision 1.7
diff -u -p -r1.7 subr_copy.c
--- src/sys/kern/subr_copy.c	25 May 2016 17:43:58 -0000	1.7
+++ src/sys/kern/subr_copy.c	12 Jun 2017 15:04:47 -0000
@@ -300,6 +300,33 @@ copyout_proc(struct proc *p, const void 
 }
 
 /*
+ * Like copyin(), but operates on an arbitrary pid.
+ */
+int
+copyin_pid(pid_t pid, const void *uaddr, void *kaddr, size_t len)
+{
+	struct proc *p;
+	struct vmspace *vm;
+	int error;
+
+	mutex_enter(proc_lock);
+	p = proc_find(pid);
+	if (p == NULL) {
+		mutex_exit(proc_lock);
+		return ESRCH;
+	}
+	mutex_enter(p->p_lock);
+	proc_vmspace_getref(p, &vm);
+	mutex_exit(p->p_lock);
+	mutex_exit(proc_lock);
+
+	error = copyin_vmspace(vm, uaddr, kaddr, len);
+
+	uvmspace_free(vm);
+	return error;
+}
+
+/*
  * Like copyin(), except it operates on kernel addresses when the FKIOCTL
  * flag is passed in `ioctlflags' from the ioctl call.
  */
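
copyin_pid() resolves the pid under proc_lock, takes a reference on the target vmspace so it cannot disappear mid-copy, and then defers to copyin_vmspace(); this suits consumers such as the DTrace syscall providers, which identify the traced process by pid rather than by proc pointer. A minimal usage sketch (the wrapper is hypothetical):

	static int
	example_peek(pid_t pid, const void *uaddr, long *valp)
	{

		return copyin_pid(pid, uaddr, valp, sizeof(*valp));
	}
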
Index: src/sys/miscfs/genfs/genfs_io.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/miscfs/genfs/genfs_io.c,v
retrieving revision 1.70
diff -u -p -r1.70 genfs_io.c
--- src/sys/miscfs/genfs/genfs_io.c	27 Jun 2017 08:40:53 -0000	1.70
+++ src/sys/miscfs/genfs/genfs_io.c	11 Jul 2017 00:51:32 -0000
@@ -1115,6 +1115,67 @@ retry:
 			pg->flags |= PG_BUSY;
 			UVM_PAGE_OWN(pg, "genfs_putpages");
 
+#if 1
+			/*
+			 * let the fs constrain the offset range of the cluster.
+			 * we additionally constrain the range here such that
+			 * it fits in the "pgs" pages array.
+			 */
+
+			off_t fslo, fshi, genlo, lo;
+			GOP_PUTRANGE(vp, off, &fslo, &fshi);
+			KASSERT(fslo == trunc_page(fslo));
+			KASSERT(fslo <= off);
+			KASSERT(fshi == trunc_page(fshi));
+			KASSERT(fshi == 0 || off < fshi);
+
+			if (off > MAXPHYS / 2)
+				genlo = trunc_page(off - (MAXPHYS / 2));
+			else
+				genlo = 0;
+			lo = MAX(fslo, genlo);
+
+			/*
+			 * first look backward.
+			 */
+
+			npages = (off - lo) >> PAGE_SHIFT;
+			nback = npages;
+			uvn_findpages(uobj, off - PAGE_SIZE, &nback, &pgs[0],
+			    UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY|UFP_BACKWARD);
+			if (nback) {
+				memmove(&pgs[0], &pgs[npages - nback],
+				    nback * sizeof(pgs[0]));
+				if (npages - nback < nback)
+					memset(&pgs[nback], 0,
+					    (npages - nback) * sizeof(pgs[0]));
+				else
+					memset(&pgs[npages - nback], 0,
+					    nback * sizeof(pgs[0]));
+			}
+
+			/*
+			 * then plug in our page of interest.
+			 */
+
+			pgs[nback] = pg;
+
+			/*
+			 * then look forward to fill in the remaining space in
+			 * the array of pages.
+			 */
+
+			npages = MAXPAGES - nback - 1;
+			if (fshi)
+				npages = MIN(npages,
+					     (fshi - off - 1) >> PAGE_SHIFT);
+			uvn_findpages(uobj, off + PAGE_SIZE, &npages,
+			    &pgs[nback + 1],
+			    UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY);
+			npages += nback + 1;
+
+#else
+
 			/*
 			 * first look backward.
 			 */
@@ -1150,6 +1211,7 @@ retry:
 			    &pgs[nback + 1],
 			    UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY);
 			npages += nback + 1;
+#endif
 		} else {
 			pgs[0] = pg;
 			npages = 1;
@@ -1310,6 +1372,18 @@ skip_scan:
 	return (error);
 }
 
+/*
+ * Default putrange method for file systems that do not care
+ * how many pages are given to one GOP_WRITE() call.
+ */
+void
+genfs_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
+{
+
+	*lop = 0;
+	*hip = 0;
+}
+
 int
 genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
 {
Index: src/sys/miscfs/genfs/genfs_node.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/miscfs/genfs/genfs_node.h,v
retrieving revision 1.21
diff -u -p -r1.21 genfs_node.h
--- src/sys/miscfs/genfs/genfs_node.h	6 Jun 2013 02:00:59 -0000	1.21
+++ src/sys/miscfs/genfs/genfs_node.h	12 Oct 2017 21:16:21 -0000
@@ -46,6 +46,7 @@ struct genfs_ops {
 	    struct kauth_cred *);
 	int	(*gop_write)(struct vnode *, struct vm_page **, int, int);
 	void	(*gop_markupdate)(struct vnode *, int);
+	void	(*gop_putrange)(struct vnode *, off_t, off_t *, off_t *);
 };
 
 #define GOP_SIZE(vp, size, eobp, flags) \
@@ -54,6 +55,8 @@ struct genfs_ops {
 	(*VTOG(vp)->g_op->gop_alloc)((vp), (off), (len), (flags), (cred))
 #define GOP_WRITE(vp, pgs, npages, flags) \
 	(*VTOG(vp)->g_op->gop_write)((vp), (pgs), (npages), (flags))
+#define GOP_PUTRANGE(vp, off, lop, hip) \
+	(*VTOG(vp)->g_op->gop_putrange)((vp), (off), (lop), (hip))
 
 /*
  * GOP_MARKUPDATE: mark vnode's timestamps for update.
@@ -85,6 +88,7 @@ struct genfs_node {
 void	genfs_size(struct vnode *, off_t, off_t *, int);
 void	genfs_node_init(struct vnode *, const struct genfs_ops *);
 void	genfs_node_destroy(struct vnode *);
+void	genfs_gop_putrange(struct vnode *, off_t, off_t *, off_t *);
 int	genfs_gop_write(struct vnode *, struct vm_page **, int, int);
 int	genfs_gop_write_rwmap(struct vnode *, struct vm_page **, int, int);
 int	genfs_compat_gop_write(struct vnode *, struct vm_page **, int, int);
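
GOP_PUTRANGE lets a file system bound the cluster that genfs_putpages() builds around a dirty page; the default genfs_gop_putrange() returns [0, 0), where a high bound of 0 means "unconstrained" (hence the fshi == 0 tests in genfs_io.c). A hedged sketch of a constrained implementation for a file system that wants each cluster to stay inside one fixed-size block; EXAMPLE_BLKSIZE is illustrative only (a ZFS-style implementation would presumably use its record size):

	#define EXAMPLE_BLKSIZE	(128 * 1024)	/* hypothetical block size */

	static void
	example_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
	{

		*lop = rounddown(off, EXAMPLE_BLKSIZE);
		*hip = *lop + EXAMPLE_BLKSIZE;
	}

This satisfies the assertions in genfs_putpages(): both bounds are page aligned as long as EXAMPLE_BLKSIZE is a multiple of PAGE_SIZE, and off always falls inside [*lop, *hip).
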
Index: src/sys/modules/dtrace/dtrace/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/modules/dtrace/dtrace/Makefile,v
retrieving revision 1.4
diff -u -p -r1.4 Makefile
--- src/sys/modules/dtrace/dtrace/Makefile	5 Mar 2014 06:40:45 -0000	1.4
+++ src/sys/modules/dtrace/dtrace/Makefile	10 May 2017 11:01:03 -0000
@@ -21,23 +21,41 @@ ARCH=${MACHINE_CPU}
 
 KMOD=		dtrace
 SRCS=		dtrace.c \
+		dtrace_xoroshiro128_plus.c \
 		dtrace_asm.S \
 		dtrace_subr.c
 
 .if ${MACHINE} == "amd64" || ${MACHINE} == "i386"
+.PATH: ${OSNETDIR}/dev/dtrace/x86
 SRCS+=		dis_tables.c \
 		instr_size.c
+CPPFLAGS+=	-I${OSNETDIR}/dev/dtrace/x86
 .endif
 
 CPPFLAGS+=	-I${OSNETDIR}/sys \
 		-I${OSNETDIR}/dev/dtrace \
 		-I${OSNETDIR}/dev/dtrace/${ARCH} \
 		-I${OSNETDIR}/dist/uts/common \
+		-I${OSNETDIR}/dist/common/util \
 		-DDIS_MEM
 
-CPPFLAGS+=	-DSMP -DDEBUG
+CPPFLAGS+=	-Wno-unknown-pragmas \
+		-Wno-shadow \
+		-Wno-unused-variable \
+		-Wno-discarded-qualifiers \
+		-Wno-unused-function
+
+COPTS.dis_tables.c+= \
+		-Wno-missing-field-initializers \
+		-Wno-cast-qual \
+		-Wno-parentheses \
+		-Wno-uninitialized
 
-CPPFLAGS+=	-Wno-unknown-pragmas
+COPTS.dtrace.c+= \
+		-Wno-unused-but-set-variable
+
+COPTS.dtrace_subr.c+= \
+		-Wno-unused-value
 
 EXPORT_SYMS=	dtrace_register \
 		dtrace_unregister \
Index: src/sys/modules/dtrace/fbt/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/modules/dtrace/fbt/Makefile,v
retrieving revision 1.6
diff -u -p -r1.6 Makefile
--- src/sys/modules/dtrace/fbt/Makefile	16 Sep 2016 03:10:45 -0000	1.6
+++ src/sys/modules/dtrace/fbt/Makefile	10 May 2017 02:13:39 -0000
@@ -3,17 +3,27 @@
 .include "../../Makefile.inc"
 
 .PATH: ${NETBSDSRCDIR}/external/cddl/osnet/dev/fbt
+.PATH: ${NETBSDSRCDIR}/external/cddl/osnet/dev/fbt/${MACHINE_CPU}
+
+.if ${MACHINE} == "amd64" || ${MACHINE} == "i386"
+.PATH: ${NETBSDSRCDIR}/external/cddl/osnet/dev/fbt/x86
+CPPFLAGS+=	-I${NETBSDSRCDIR}/external/cddl/osnet/dev/fbt/x86
+.endif
 
 KMOD=		dtrace_fbt
-SRCS=		fbt.c
+SRCS=		fbt.c fbt_isa.c
 
 .PATH:	${S}/kern
 
 SRCS+=		kern_ctf.c
 
-CPPFLAGS.fbt.c+=	-I${NETBSDSRCDIR}/external/cddl/osnet/sys \
-		-I${NETBSDSRCDIR}/external/cddl/osnet/dist/uts/common 
+CPPFLAGS+=	-I${NETBSDSRCDIR}/external/cddl/osnet/sys \
+		-I${NETBSDSRCDIR}/external/cddl/osnet/dist/uts/common \
+		-I${NETBSDSRCDIR}/external/cddl/osnet/dev/fbt \
+		-I${NETBSDSRCDIR}/external/cddl/osnet/dev/fbt/${MACHINE_CPU}
 
 CPPFLAGS+=	-Wno-unknown-pragmas
 
+CPPFLAGS+=	-DDIAGNOSTIC
+
 .include <bsd.kmodule.mk>
Index: src/sys/modules/dtrace/linux32_syscall/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/modules/dtrace/linux32_syscall/Makefile,v
retrieving revision 1.1
diff -u -p -r1.1 Makefile
--- src/sys/modules/dtrace/linux32_syscall/Makefile	8 Mar 2015 17:11:57 -0000	1.1
+++ src/sys/modules/dtrace/linux32_syscall/Makefile	20 Apr 2017 14:32:58 -0000
@@ -8,8 +8,10 @@ KMOD=		dtrace_syscall_linux32
 SRCS=		systrace.c
 
 CPPFLAGS+=	-I${NETBSDSRCDIR}/external/cddl/osnet/sys \
-		-I${NETBSDSRCDIR}/external/cddl/osnet/dist/uts/common 
+		-I${NETBSDSRCDIR}/external/cddl/osnet/dist/uts/common \
+		-I${NETBSDSRCDIR}/external/cddl/osnet/dev/dtrace
 
-CPPFLAGS+=	-Wno-unknown-pragmas
+CPPFLAGS+=	-Wno-unknown-pragmas \
+		-Wno-discarded-qualifiers
 
 .include <bsd.kmodule.mk>
Index: src/sys/modules/dtrace/linux_syscall/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/modules/dtrace/linux_syscall/Makefile,v
retrieving revision 1.1
diff -u -p -r1.1 Makefile
--- src/sys/modules/dtrace/linux_syscall/Makefile	7 Mar 2015 15:13:04 -0000	1.1
+++ src/sys/modules/dtrace/linux_syscall/Makefile	20 Apr 2017 14:32:15 -0000
@@ -8,8 +8,10 @@ KMOD=		dtrace_syscall_linux
 SRCS=		systrace.c
 
 CPPFLAGS+=	-I${NETBSDSRCDIR}/external/cddl/osnet/sys \
-		-I${NETBSDSRCDIR}/external/cddl/osnet/dist/uts/common 
+		-I${NETBSDSRCDIR}/external/cddl/osnet/dist/uts/common \
+		-I${NETBSDSRCDIR}/external/cddl/osnet/dev/dtrace
 
-CPPFLAGS+=	-Wno-unknown-pragmas
+CPPFLAGS+=	-Wno-unknown-pragmas \
+		-Wno-discarded-qualifiers
 
 .include <bsd.kmodule.mk>
Index: src/sys/modules/dtrace/netbsd32_syscall/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/modules/dtrace/netbsd32_syscall/Makefile,v
retrieving revision 1.1
diff -u -p -r1.1 Makefile
--- src/sys/modules/dtrace/netbsd32_syscall/Makefile	7 Mar 2015 15:13:04 -0000	1.1
+++ src/sys/modules/dtrace/netbsd32_syscall/Makefile	20 Apr 2017 14:32:35 -0000
@@ -8,8 +8,10 @@ KMOD=		dtrace_syscall_netbsd32
 SRCS=		systrace.c
 
 CPPFLAGS+=	-I${NETBSDSRCDIR}/external/cddl/osnet/sys \
-		-I${NETBSDSRCDIR}/external/cddl/osnet/dist/uts/common 
+		-I${NETBSDSRCDIR}/external/cddl/osnet/dist/uts/common \
+		-I${NETBSDSRCDIR}/external/cddl/osnet/dev/dtrace
 
-CPPFLAGS+=	-Wno-unknown-pragmas
+CPPFLAGS+=	-Wno-unknown-pragmas \
+		-Wno-discarded-qualifiers
 
 .include <bsd.kmodule.mk>
Index: src/sys/modules/dtrace/profile/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/modules/dtrace/profile/Makefile,v
retrieving revision 1.2
diff -u -p -r1.2 Makefile
--- src/sys/modules/dtrace/profile/Makefile	18 Sep 2014 19:45:24 -0000	1.2
+++ src/sys/modules/dtrace/profile/Makefile	20 Apr 2017 14:20:30 -0000
@@ -11,6 +11,6 @@ CPPFLAGS+=	-I${NETBSDSRCDIR}/external/cd
 		-I${NETBSDSRCDIR}/external/cddl/osnet/dev/profile \
 		-I${NETBSDSRCDIR}/external/cddl/osnet/dist/uts/common
 
-CPPFLAGS+=	-Wno-unknown-pragmas -Wno-cast-qual
+CPPFLAGS+=	-Wno-unknown-pragmas -Wno-cast-qual -Wno-discarded-qualifiers
 
 .include <bsd.kmodule.mk>
Index: src/sys/modules/dtrace/syscall/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/modules/dtrace/syscall/Makefile,v
retrieving revision 1.1
diff -u -p -r1.1 Makefile
--- src/sys/modules/dtrace/syscall/Makefile	7 Mar 2015 15:13:04 -0000	1.1
+++ src/sys/modules/dtrace/syscall/Makefile	20 Apr 2017 14:31:33 -0000
@@ -8,8 +8,10 @@ KMOD=		dtrace_syscall
 SRCS=		systrace.c
 
 CPPFLAGS+=	-I${NETBSDSRCDIR}/external/cddl/osnet/sys \
-		-I${NETBSDSRCDIR}/external/cddl/osnet/dist/uts/common 
+		-I${NETBSDSRCDIR}/external/cddl/osnet/dist/uts/common \
+		-I${NETBSDSRCDIR}/external/cddl/osnet/dev/dtrace
 
-CPPFLAGS+=	-Wno-unknown-pragmas
+CPPFLAGS+=	-Wno-unknown-pragmas \
+		-Wno-discarded-qualifiers
 
 .include <bsd.kmodule.mk>
Index: src/sys/modules/solaris/Makefile.solmod
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/modules/solaris/Makefile.solmod,v
retrieving revision 1.4
diff -u -p -r1.4 Makefile.solmod
--- src/sys/modules/solaris/Makefile.solmod	21 Nov 2012 17:48:05 -0000	1.4
+++ src/sys/modules/solaris/Makefile.solmod	16 Jun 2017 17:49:01 -0000
@@ -8,6 +8,7 @@ SRCS=	avl.c
 
 .PATH:	${ZFSDIR}/dist/common/nvpair
 
+SRCS+=	fnvpair.c
 SRCS+=	nvpair.c
 SRCS+=	nvpair_alloc_fixed.c
 
@@ -46,19 +47,21 @@ SRCS+= acl_common.c
 
 .PATH:	${ZFSDIR}/sys/kern
 
+SRCS+=	callb.c
+SRCS+=	ddi.c
+SRCS+=	kmem.c
 SRCS+=	kobj.c
 SRCS+=	kstat.c
 SRCS+=	misc.c
-SRCS+=	policy.c
-SRCS+=	string.c
-SRCS+=	zone.c
-SRCS+=	callb.c
-SRCS+=	ddi.c
 SRCS+=	mod.c
+SRCS+=	opensolaris.c
+SRCS+=	policy.c
 SRCS+=	printf.c
+SRCS+=	string.c
+SRCS+=	sysevent.c
 SRCS+=	taskq.c
 SRCS+=	vfs.c
-SRCS+=	opensolaris.c
+SRCS+=	zone.c
 
 WARNS=		0
 NOGCCERROR=	yes
@@ -76,14 +79,14 @@ CWARNFLAGS+=	-Wno-format
 
 CPPFLAGS+=	-I${ZFSDIR}/sys
 CPPFLAGS+=	-I${ZFSDIR}/dist/common/acl
+CPPFLAGS+=	-I${ZFSDIR}/dist/common
 CPPFLAGS+=	-I${ZFSDIR}/dist/uts/common/zmod
 CPPFLAGS+=	-I${ZFSDIR}/dist/uts/common
 CPPFLAGS+=      -I${ZFSDIR}/sys/sys
 
-CPPFLAGS+=	-Dcaddr_t=__caddr_t "-D__va_list=va_list"
-CPPFLAGS+=	-std=c99
-# CPPFLAGS+=	-D_NFS_NFS_H_
-# CPPFLAGS+=	-D_PROPLIB_ZFS_CONFLICT
 CPPFLAGS+=	-DDIAGNOSTIC
 
+# make dtrace work better
+CFLAGS+=	-fno-inline -fno-optimize-sibling-calls -fno-ipa-sra -fno-ipa-icf
+
 NEED_ISYSTEM=   # duplicate headers, depends on -isystem to build
Index: src/sys/modules/zfs/Makefile.zfsmod
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/modules/zfs/Makefile.zfsmod,v
retrieving revision 1.6
diff -u -p -r1.6 Makefile.zfsmod
--- src/sys/modules/zfs/Makefile.zfsmod	18 Jun 2016 21:45:05 -0000	1.6
+++ src/sys/modules/zfs/Makefile.zfsmod	10 Jul 2017 20:08:06 -0000
@@ -12,14 +12,6 @@ SRCS+=	zfs_prop.c
 SRCS+=	zpool_prop.c
 SRCS+=	zprop_common.c
 
-.PATH:	${ZFSDIR}/dist/uts/common/fs
-
-#SRCS+=	gfs.c
-
-.PATH: ${ZFSDIR}/dist/uts/common/zmod
-
-#SRCS+= trees.c
-
 .PATH:	${ZFSDIR}/lib/libzpool
 
 SRCS+=	atomic.c
@@ -27,11 +19,16 @@ SRCS+=	atomic.c
 .PATH:	${ZFSDIR}/dist/uts/common/fs/zfs
 
 SRCS+=	arc.c
+SRCS+=	blkptr.c
 SRCS+=	bplist.c
+SRCS+=	bpobj.c
+SRCS+=	bptree.c
+SRCS+=	bqueue.c
 SRCS+=	dbuf.c
 SRCS+=	ddt.c
 SRCS+=	ddt_zap.c
 SRCS+=	dmu.c
+SRCS+=	dmu_diff.c
 SRCS+=	dmu_object.c
 SRCS+=	dmu_objset.c
 SRCS+=	dmu_send.c
@@ -40,25 +37,37 @@ SRCS+=	dmu_tx.c
 SRCS+=	dmu_zfetch.c
 SRCS+=	dnode.c
 SRCS+=	dnode_sync.c
+SRCS+=	dsl_bookmark.c
 SRCS+=	dsl_dataset.c
+SRCS+=	dsl_deadlist.c
 SRCS+=	dsl_deleg.c
+SRCS+=	dsl_destroy.c
 SRCS+=	dsl_dir.c
 SRCS+=	dsl_pool.c
 SRCS+=	dsl_prop.c
-SRCS+=	dsl_scrub.c
+SRCS+=	dsl_scan.c
 SRCS+=	dsl_synctask.c
+SRCS+=	dsl_userhold.c
+#SRCS+=	edonr_zfs.c
 SRCS+=	gzip.c
+SRCS+=	lz4.c
 SRCS+=	lzjb.c
 SRCS+=	metaslab.c
+SRCS+=	multilist.c
+SRCS+=	range_tree.c
 SRCS+=	refcount.c
 SRCS+=	rrwlock.c
+SRCS+=	sa.c
 SRCS+=	sha256.c
+#SRCS+=	skein_zfs.c
 SRCS+=	spa.c
 SRCS+=	spa_config.c
 SRCS+=	spa_errlog.c
 SRCS+=	spa_history.c
 SRCS+=	spa_misc.c
 SRCS+=	space_map.c
+SRCS+=	space_reftree.c
+SRCS+=	trim_map.c
 SRCS+=	txg.c
 SRCS+=	uberblock.c
 SRCS+=	unique.c
@@ -75,31 +84,38 @@ SRCS+=	vdev_root.c
 SRCS+=	zap.c
 SRCS+=	zap_leaf.c
 SRCS+=	zap_micro.c
+SRCS+=	zfeature.c
+SRCS+=	zfeature_common.c
 SRCS+=	zfs_byteswap.c
-#SRCS+=	zfs_ctldir.c
+SRCS+=	zfs_ctldir.c
+SRCS+=	zfs_debug.c
 SRCS+=	zfs_dir.c
-SRCS+=	zfs_fuid.c
 SRCS+=	zfs_fm.c
+SRCS+=	zfs_fuid.c
 SRCS+=	zfs_ioctl.c
+SRCS+=	zfs_ioctl_compat.c
 SRCS+=	zfs_log.c
+SRCS+=	zfs_onexit.c
 SRCS+=	zfs_replay.c
 SRCS+=	zfs_rlock.c
+SRCS+=	zfs_sa.c
 SRCS+=	zfs_vfsops.c
 SRCS+=	zfs_vnops.c
 SRCS+=	zfs_znode.c
-SRCS+= 	zfs_acl.c
 SRCS+=	zil.c
 SRCS+=	zio.c
 SRCS+=	zio_checksum.c
 SRCS+=	zio_compress.c
 SRCS+=	zio_inject.c
+SRCS+=	zle.c
+SRCS+=	zrlock.c
 #SRCS+=	zutil.c
 SRCS+=	zvol.c
-SRCS+=	zle.c
+SRCS+= 	zfs_acl.c
 
 .PATH: ${ZFSDIR}/sys/kern
 
-SRCS+= zfs_stub.c
+SRCS+=	zfs_stub.c
 
 WARNS=		0
 NOGCCERROR=	yes
@@ -108,12 +124,10 @@ CWARNFLAGS+=	-Wall
 CWARNFLAGS+=	-Wno-unknown-pragmas
 CWARNFLAGS+=	-Wno-missing-braces
 CWARNFLAGS+=	-Wno-parentheses
-CWARNFLAGS+=	-Wno-uninitialized
 CWARNFLAGS+=	-Wno-unused
 CWARNFLAGS+=	-Wno-switch
 CWARNFLAGS+=	-Wno-strict-prototypes
 CWARNFLAGS+=	-Wno-missing-prototypes
-CWARNFLAGS+=	-Wno-format
 
 CPPFLAGS+=	-I${ZFSDIR}/sys
 CPPFLAGS+=      -I${ZFSDIR}/dist/common
@@ -125,12 +139,13 @@ CPPFLAGS+=	-I${ZFSDIR}/dist/uts/common
 CPPFLAGS+=	-I${ZFSDIR}/dist/uts/common/zfs
 CPPFLAGS+=	-I${ZFSDIR}/dist/uts/common
 
-CPPFLAGS+=	-Dcaddr_t=__caddr_t "-D__va_list=va_list" -D__NetBSD__
-CPPFLAGS+=	-std=c99
-CPPFLAGS+=	-D_NFS_NFS_H_
 CPPFLAGS+=	-D_PROPLIB_ZFS_CONFLICT
 
 CPPFLAGS+=	-DDIAGNOSTIC
-# -DZFS_DEBUG
+CPPFLAGS+=	-DDEBUG -DZFS_DEBUG
+CPPFLAGS+=	-DUVM_PAGE_TRKOWN
+
+# make dtrace work better
+CFLAGS+=	-fno-inline -fno-optimize-sibling-calls -fno-ipa-sra -fno-ipa-icf
 
 NEED_ISYSTEM=	# duplicate headers, depends on -isystem to build
Index: src/sys/nfs/nfs_node.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/nfs/nfs_node.c,v
retrieving revision 1.122
diff -u -p -r1.122 nfs_node.c
--- src/sys/nfs/nfs_node.c	26 May 2017 14:34:20 -0000	1.122
+++ src/sys/nfs/nfs_node.c	10 Jul 2017 22:20:18 -0000
@@ -73,6 +73,7 @@ static const struct genfs_ops nfs_genfso
 	.gop_size = nfs_gop_size,
 	.gop_alloc = nfs_gop_alloc,
 	.gop_write = nfs_gop_write,
+	.gop_putrange = genfs_gop_putrange,
 };
 
 /*
Index: src/sys/rump/kern/lib/libsolaris/Makefile
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/rump/kern/lib/libsolaris/Makefile,v
retrieving revision 1.3
diff -u -p -r1.3 Makefile
--- src/sys/rump/kern/lib/libsolaris/Makefile	19 Oct 2015 16:16:37 -0000	1.3
+++ src/sys/rump/kern/lib/libsolaris/Makefile	23 Apr 2017 11:44:54 -0000
@@ -8,7 +8,5 @@ S!=             cd ${.PARSEDIR}/../../..
 LIB=	rumpkern_solaris
 COMMENT=Solaris compatibility layer (for ZFS)
 
-CPPFLAGS+=	-DASSERT=KASSERT
-
 .include <bsd.lib.mk>
 .include <bsd.klinks.mk>
Index: src/sys/sys/filio.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/sys/filio.h,v
retrieving revision 1.10
diff -u -p -r1.10 filio.h
--- src/sys/sys/filio.h	11 Dec 2005 12:25:20 -0000	1.10
+++ src/sys/sys/filio.h	27 Jun 2017 17:40:24 -0000
@@ -44,6 +44,9 @@
 /* Generic file-descriptor ioctl's. */
 #define	FIOCLEX		 _IO('f', 1)		/* set close on exec on fd */
 #define	FIONCLEX	 _IO('f', 2)		/* remove close on exec */
+/* Handle lseek SEEK_DATA and SEEK_HOLE for holey file knowledge. */
+#define	FIOSEEKDATA	_IOWR('f', 97, off_t)	/* SEEK_DATA */
+#define	FIOSEEKHOLE	_IOWR('f', 98, off_t)	/* SEEK_HOLE */
 #define	FIONREAD	_IOR('f', 127, int)	/* get # bytes to read */
 #define	FIONBIO		_IOW('f', 126, int)	/* set/clear non-blocking i/o */
 #define	FIOASYNC	_IOW('f', 125, int)	/* set/clear async i/o */
@@ -55,7 +58,6 @@
 						 * in send queue. */
 #define	FIONSPACE	_IOR('f', 120, int)	/* get space in send queue. */
 
-
 /* Ugly symbol for compatibility with other operating systems */
 #define	FIBMAP		FIOGETBMAP
 
Index: src/sys/sys/kern_ctf.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/sys/kern_ctf.h,v
retrieving revision 1.1
diff -u -p -r1.1 kern_ctf.h
--- src/sys/sys/kern_ctf.h	12 Mar 2010 21:43:10 -0000	1.1
+++ src/sys/sys/kern_ctf.h	15 May 2017 23:37:19 -0000
@@ -33,7 +33,7 @@
  * Modules CTF section
  */
 typedef struct mod_ctf {
-	const uint8_t 	*ctftab;	/* Decompressed CTF data. */
+	uint8_t 	*ctftab;	/* Decompressed CTF data. */
 	int 		ctfcnt;		/* Number of CTF data bytes. */
 	const Elf_Sym	*symtab;	/* Ptr to the symbol table. */
 	int		nsym;		/* Number of symbols. */
@@ -45,9 +45,11 @@ typedef struct mod_ctf {
 	uint32_t	*typoffp;	/* Ptr to array of type offsets. */
 	long		typlen;		/* number of type data entries. */
 	int		ctfalloc;	/* ctftab is alloced */
+	bool		fbt_provided;
+	int		nenabled;
 } mod_ctf_t;
 
 int
-mod_ctf_get(struct module *, mod_ctf_t *);
+mod_ctf_get(struct module *, mod_ctf_t **);
 
 #endif
Index: src/sys/sys/module.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/sys/module.h,v
retrieving revision 1.41
diff -u -p -r1.41 module.h
--- src/sys/sys/module.h	16 Nov 2016 10:42:14 -0000	1.41
+++ src/sys/sys/module.h	16 May 2017 15:32:33 -0000
@@ -32,7 +32,6 @@
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/cdefs.h>
-#include <sys/queue.h>
 #include <sys/uio.h>
 
 #define	MAXMODNAME	32
@@ -68,6 +67,8 @@ typedef enum modcmd {
 
 #include <sys/kernel.h>
 #include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/specificdata.h>
 
 #include <prop/proplib.h>
 
@@ -83,6 +84,9 @@ typedef struct modinfo {
 /* Per module information, maintained by kern_module.c */ 
 typedef struct module {
 	u_int			mod_refcnt;
+	int			mod_flags;
+#define MODFLG_MUST_FORCE	0x01
+#define MODFLG_AUTO_LOADED	0x02
 	const modinfo_t		*mod_info;
 	struct kobj		*mod_kobj;
 	TAILQ_ENTRY(module)	mod_chain;
@@ -90,12 +94,7 @@ typedef struct module {
 	u_int			mod_nrequired;
 	modsrc_t		mod_source;
 	time_t			mod_autotime;
-	void 			*mod_ctf;
-	u_int			mod_fbtentries;	/* DTrace FBT entry count */
-	int			mod_flags;
-#define MODFLG_MUST_FORCE	0x01
-#define MODFLG_AUTO_LOADED	0x02
-
+	specificdata_reference	mod_sdref;
 } module_t;
 
 /*
@@ -177,18 +176,29 @@ void	module_init_md(void);
 void	module_init_class(modclass_t);
 int	module_prime(const char *, void *, size_t);
 
+module_t *module_kernel(void);
+const char *module_name(struct module *);
+modsrc_t module_source(struct module *);
 bool	module_compatible(int, int);
 int	module_load(const char *, int, prop_dictionary_t, modclass_t);
 int	module_builtin_add(modinfo_t * const *, size_t, bool);
 int	module_builtin_remove(modinfo_t *, bool);
 int	module_autoload(const char *, modclass_t);
 int	module_unload(const char *);
-int	module_hold(const char *);
-void	module_rele(const char *);
+void	module_hold(module_t *);
+void	module_rele(module_t *);
 int	module_find_section(const char *, void **, size_t *);
 void	module_thread_kick(void);
 void	module_load_vfs_init(void);
 
+int	module_specific_key_create(specificdata_key_t *, specificdata_dtor_t);
+void	module_specific_key_delete(specificdata_key_t);
+void	*module_getspecific(module_t *, specificdata_key_t);
+void	module_setspecific(module_t *, specificdata_key_t, void *);
+void	*module_register_callbacks(void (*)(struct module *),
+				  void (*)(struct module *));
+void	module_unregister_callbacks(void *);
+
 void	module_whatis(uintptr_t, void (*)(const char *, ...)
     __printflike(1, 2));
 void	module_print_list(void (*)(const char *, ...) __printflike(1, 2));
@@ -224,12 +234,12 @@ typedef struct modctl_load {
 	size_t ml_propslen;
 } modctl_load_t;
 
-typedef enum modctl {
+enum modctl {
 	MODCTL_LOAD,		/* modctl_load_t *ml */
 	MODCTL_UNLOAD,		/* char *name */
 	MODCTL_STAT,		/* struct iovec *buffer */
 	MODCTL_EXISTS		/* enum: 0: load, 1: autoload */
-} modctl_t;
+};
 
 /*
  * This structure intentionally has the same layout for 32 and 64
Index: src/sys/sys/pset.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/sys/pset.h,v
retrieving revision 1.5
diff -u -p -r1.5 pset.h
--- src/sys/sys/pset.h	2 Apr 2016 20:38:40 -0000	1.5
+++ src/sys/sys/pset.h	17 Apr 2017 01:07:14 -0000
@@ -39,9 +39,6 @@
 #define	PS_MYID			-1
 #define	PS_QUERY		-2
 
-/* For compatibility only */
-typedef	cpuid_t		processorid_t;
-
 __BEGIN_DECLS
 int	pset_assign(psetid_t, cpuid_t, psetid_t *);
 int	pset_bind(psetid_t, idtype_t, id_t, psetid_t *);
Index: src/sys/sys/systm.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/sys/systm.h,v
retrieving revision 1.272
diff -u -p -r1.272 systm.h
--- src/sys/sys/systm.h	22 Dec 2016 16:05:15 -0000	1.272
+++ src/sys/sys/systm.h	11 Jun 2017 04:08:00 -0000
@@ -266,6 +266,7 @@ typedef int	(*copyout_t)(const void *, v
 
 int	copyin_proc(struct proc *, const void *, void *, size_t);
 int	copyout_proc(struct proc *, const void *, void *, size_t);
+int	copyin_pid(pid_t, const void *, void *, size_t);
 int	copyin_vmspace(struct vmspace *, const void *, void *, size_t);
 int	copyout_vmspace(struct vmspace *, const void *, void *, size_t);
 
Index: src/sys/ufs/chfs/chfs_vfsops.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/ufs/chfs/chfs_vfsops.c,v
retrieving revision 1.16
diff -u -p -r1.16 chfs_vfsops.c
--- src/sys/ufs/chfs/chfs_vfsops.c	17 Feb 2017 08:31:26 -0000	1.16
+++ src/sys/ufs/chfs/chfs_vfsops.c	10 Jul 2017 22:20:28 -0000
@@ -99,6 +99,7 @@ const struct genfs_ops chfs_genfsops = {
 	.gop_alloc = chfs_gop_alloc,
 	.gop_write = genfs_gop_write,
 	.gop_markupdate = ufs_gop_markupdate,
+	.gop_putrange = genfs_gop_putrange,
 };
 
 struct pool chfs_inode_pool;
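
[This hunk, and the matching ones for ext2fs, ffs, and lfs below, wire each
filesystem to the new gop_putrange hook, whose genfs side lives elsewhere in
this patch. Judging from the name and its placement among the other gop_*
operations, it lets genfs_putpages ask the filesystem which range to flush
around a given offset. A hypothetical per-fs implementation, assuming a
(vp, off, &lo, &hi) shape like the other hooks; nothing below is from the
patch itself:

	/* cluster flushes to the enclosing 64 KB chunk (illustrative only) */
	static void
	example_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
	{
		const off_t chunk = 64 * 1024;

		*lop = off - (off % chunk);
		*hip = *lop + chunk;
	}
]
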
Index: src/sys/ufs/ext2fs/ext2fs_vfsops.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/ufs/ext2fs/ext2fs_vfsops.c,v
retrieving revision 1.210
diff -u -p -r1.210 ext2fs_vfsops.c
--- src/sys/ufs/ext2fs/ext2fs_vfsops.c	30 Jul 2017 14:23:54 -0000	1.210
+++ src/sys/ufs/ext2fs/ext2fs_vfsops.c	12 Sep 2017 16:25:19 -0000
@@ -152,6 +152,7 @@ static const struct genfs_ops ext2fs_gen
 	.gop_alloc = ext2fs_gop_alloc,
 	.gop_write = genfs_gop_write,
 	.gop_markupdate = ufs_gop_markupdate,
+	.gop_putrange = genfs_gop_putrange,
 };
 
 static const struct ufs_ops ext2fs_ufsops = {
Index: src/sys/ufs/ffs/ffs_vfsops.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/ufs/ffs/ffs_vfsops.c,v
retrieving revision 1.354
diff -u -p -r1.354 ffs_vfsops.c
--- src/sys/ufs/ffs/ffs_vfsops.c	20 Aug 2017 12:51:38 -0000	1.354
+++ src/sys/ufs/ffs/ffs_vfsops.c	12 Sep 2017 16:25:19 -0000
@@ -178,6 +178,7 @@ static const struct genfs_ops ffs_genfso
 	.gop_alloc = ufs_gop_alloc,
 	.gop_write = genfs_gop_write,
 	.gop_markupdate = ufs_gop_markupdate,
+	.gop_putrange = genfs_gop_putrange,
 };
 
 static const struct ufs_ops ffs_ufsops = {
Index: src/sys/ufs/lfs/lfs_vfsops.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/ufs/lfs/lfs_vfsops.c,v
retrieving revision 1.360
diff -u -p -r1.360 lfs_vfsops.c
--- src/sys/ufs/lfs/lfs_vfsops.c	26 Jul 2017 16:42:37 -0000	1.360
+++ src/sys/ufs/lfs/lfs_vfsops.c	12 Sep 2017 16:25:19 -0000
@@ -176,6 +176,7 @@ const struct genfs_ops lfs_genfsops = {
 	.gop_alloc = ulfs_gop_alloc,
 	.gop_write = lfs_gop_write,
 	.gop_markupdate = ulfs_gop_markupdate,
+	.gop_putrange = genfs_gop_putrange,
 };
 
 struct shortlong {
Index: src/sys/uvm/uvm_aobj.c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/uvm/uvm_aobj.c,v
retrieving revision 1.125
diff -u -p -r1.125 uvm_aobj.c
--- src/sys/uvm/uvm_aobj.c	30 May 2017 17:09:17 -0000	1.125
+++ src/sys/uvm/uvm_aobj.c	2 Jun 2017 23:03:43 -0000
@@ -408,12 +408,12 @@ uao_free(struct uvm_aobj *aobj)
  */
 
 struct uvm_object *
-uao_create(vsize_t size, int flags)
+uao_create(voff_t size, int flags)
 {
 	static struct uvm_aobj kernel_object_store;
 	static kmutex_t kernel_object_lock;
 	static int kobj_alloced __diagused = 0;
-	pgoff_t pages = round_page(size) >> PAGE_SHIFT;
+	pgoff_t pages = round_page((uint64_t)size) >> PAGE_SHIFT;
 	struct uvm_aobj *aobj;
 	int refs;
 
@@ -699,9 +699,11 @@ uao_put(struct uvm_object *uobj, voff_t 
 		} else {
 			stop = round_page(stop);
 		}
-		if (stop > (aobj->u_pages << PAGE_SHIFT)) {
-			printf("uao_flush: strange, got an out of range "
-			    "flush (fixed)\n");
+		if (stop > ((uint64_t)aobj->u_pages << PAGE_SHIFT)) {
+			printf("uao_put: strange, got an out of range "
+			    "flush 0x%jx > 0x%jx (fixed)\n",
+			    (uintmax_t)stop,
+			    ((uintmax_t)aobj->u_pages << PAGE_SHIFT));
 			stop = aobj->u_pages << PAGE_SHIFT;
 		}
 		by_list = (uobj->uo_npages <=
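
[Widening the uao_create() size argument from vsize_t to voff_t matters on
32-bit ports, where vsize_t is only 32 bits: an anonymous object of 4 GB or
more would previously have had its size silently truncated. voff_t is 64 bits
everywhere, so for example:

	/* representable as voff_t on ILP32; would truncate as vsize_t */
	voff_t size = (voff_t)5 << 30;			/* 5 GiB */
	struct uvm_object *uobj = uao_create(size, 0);
]
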
Index: src/sys/uvm/uvm_extern.h
===================================================================
RCS file: /home/chs/netbsd/cvs/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.206
diff -u -p -r1.206 uvm_extern.h
--- src/sys/uvm/uvm_extern.h	20 May 2017 07:27:15 -0000	1.206
+++ src/sys/uvm/uvm_extern.h	20 Jun 2017 19:07:41 -0000
@@ -599,9 +599,10 @@ extern struct vm_map *phys_map;
 /* vm_machdep.c */
 int		vmapbuf(struct buf *, vsize_t);
 void		vunmapbuf(struct buf *, vsize_t);
+void		ktext_write(void *, const void *, size_t);
 
 /* uvm_aobj.c */
-struct uvm_object	*uao_create(vsize_t, int);
+struct uvm_object	*uao_create(voff_t, int);
 void			uao_set_pgfl(struct uvm_object *, int);
 void			uao_detach(struct uvm_object *);
 void			uao_reference(struct uvm_object *);
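
[ktext_write() sits with the other vm_machdep.c entry points, which suggests
an MD routine for safely writing into normally read-only kernel text; DTrace
FBT, which this patch also touches, needs exactly that to patch probe points.
Hypothetical usage, with EXAMPLE_BREAK_INSN standing in for whatever
instruction encoding an MD consumer would actually use:

	/* overwrite one instruction at the probe site (illustrative) */
	uint32_t insn = EXAMPLE_BREAK_INSN;
	ktext_write(probe_addr, &insn, sizeof(insn));
]
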
Index: src/tools/Makefile.host
===================================================================
RCS file: /home/chs/netbsd/cvs/src/tools/Makefile.host,v
retrieving revision 1.31
diff -u -p -r1.31 Makefile.host
--- src/tools/Makefile.host	14 Jun 2013 16:10:02 -0000	1.31
+++ src/tools/Makefile.host	22 Apr 2017 18:52:02 -0000
@@ -48,6 +48,8 @@ HOST_CPPFLAGS:=	${HOST_CPPFLAGS:N-Wp,-ir
 HOST_INSTALLPROG?=${HOST_BINDIR}/${HOSTPROGNAME}${HOSTEXEEXT}
 .undef LINKS
 
+HOST_CFLAGS+=	-g
+
 SRCS?=		${HOSTPROG}.c
 SRCS+=		${HOST_SRCS}
 
Index: src/usr.bin/kdump/Makefile.ioctl-c
===================================================================
RCS file: /home/chs/netbsd/cvs/src/usr.bin/kdump/Makefile.ioctl-c,v
retrieving revision 1.33
diff -u -p -r1.33 Makefile.ioctl-c
--- src/usr.bin/kdump/Makefile.ioctl-c	18 Aug 2016 08:02:58 -0000	1.33
+++ src/usr.bin/kdump/Makefile.ioctl-c	10 Jun 2017 15:49:51 -0000
@@ -44,6 +44,7 @@ CPPFLAGS+=	-I${DESTDIR}/usr/X11R7/includ
 .if ${MKDTRACE} != "no"
 CPPFLAGS+=	-I${NETBSDSRCDIR}/external/cddl/osnet/sys
 CPPFLAGS+=	-I${NETBSDSRCDIR}/external/cddl/osnet/dist/uts/common
+CWARNFLAGS+=	-Wno-unknown-pragmas
 .endif
 CPPFLAGS+=	-D_DEV_IC_BT8XX_H_
 CPPFLAGS+=	-D_ALTQ_ALTQ_JOBS_H_	# redefinition of inline
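
[The new warning override is presumably for the Sun-style pragmas carried by
the CDDL osnet headers now on the include path, which GCC otherwise flags,
for example:

	#pragma ident	"%Z%%M%	%I%	%E%	SMI"	/* trips -Wunknown-pragmas */
]
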
Index: src/usr.bin/kdump/mkioctls
===================================================================
RCS file: /home/chs/netbsd/cvs/src/usr.bin/kdump/mkioctls,v
retrieving revision 1.45
diff -u -p -r1.45 mkioctls
--- src/usr.bin/kdump/mkioctls	22 Oct 2015 00:29:58 -0000	1.45
+++ src/usr.bin/kdump/mkioctls	5 May 2017 22:59:40 -0000
@@ -49,6 +49,7 @@ echo "typedef struct crap videomemory_t;
 echo "typedef struct crap RF_AccTotals_t;" # XXX Raidframe lossage
 echo "struct rf_test_acc { int _xxx; };" # XXX Raidframe lossage
 echo "struct map_info { int _xxx[22]; };" # XXX Xorg lossage
+echo "typedef void *kmutex_t;"
 echo "#include <stdio.h>"
 echo "#include <sys/types.h>"
 echo "#include <sys/param.h>"