From 42462d23e60b89a3c2f7d8d63f5f4e464ba77727 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 23 Mar 2017 18:24:19 +0100
Subject: KVM: kvm_io_bus_unregister_dev() should never fail

commit 90db10434b163e46da413d34db8d0e77404cc645 upstream.

No caller currently checks the return value of
kvm_io_bus_unregister_dev(). This is evil, as all callers silently go on
freeing their device. A stale reference will remain in the io_bus,
getting at least used again, when the iobus gets teared down on
kvm_destroy_vm() - leading to use after free errors.

There is nothing the callers could do, except retrying over and over
again.

So let's simply remove the bus altogether, print an error and make
sure no one can access this broken bus again (returning -ENOMEM on any
attempt to access it).

Fixes: e93f8a0f821e ("KVM: convert io_bus to SRCU")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kvm_host.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c923350ca20a..d7ce4e3280db 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -182,8 +182,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 		    int len, void *val);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			    int len, struct kvm_io_device *dev);
-int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-			      struct kvm_io_device *dev);
+void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+			       struct kvm_io_device *dev);
 
 #ifdef CONFIG_KVM_ASYNC_PF
 struct kvm_async_pf {
-- 
cgit v1.2.3


From ad4ae2feef4f65b860f139e0d8455e2a16efb93c Mon Sep 17 00:00:00 2001
From: Thomas Hellstrom <thellstrom@vmware.com>
Date: Mon, 27 Mar 2017 11:21:25 +0200
Subject: drm/ttm, drm/vmwgfx: Relax permission checking when opening surfaces

commit fe25deb7737ce6c0879ccf79c99fa1221d428bf2 upstream.

Previously, when a surface was opened using a legacy (non prime) handle,
it was verified to have been created by a client in the same master realm.
Relax this so that opening is also allowed recursively if the client
already has the surface open.

This works around a regression in svga mesa where opening of a shared
surface is used recursively to obtain surface information.

Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Sinclair Yeh <syeh@vmware.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/gpu/drm/ttm/ttm_object.c         | 10 +++++++---
 drivers/gpu/drm/vmwgfx/vmwgfx_fence.c    |  6 ++----
 drivers/gpu/drm/vmwgfx/vmwgfx_resource.c |  4 ++--
 drivers/gpu/drm/vmwgfx/vmwgfx_surface.c  | 22 +++++++++-------------
 include/drm/ttm/ttm_object.h             |  5 ++++-
 5 files changed, 24 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/ttm/ttm_object.c b/drivers/gpu/drm/ttm/ttm_object.c
index 4f5fa8d65fe9..144367c0c28f 100644
--- a/drivers/gpu/drm/ttm/ttm_object.c
+++ b/drivers/gpu/drm/ttm/ttm_object.c
@@ -179,7 +179,7 @@ int ttm_base_object_init(struct ttm_object_file *tfile,
 	if (unlikely(ret != 0))
 		goto out_err0;
 
-	ret = ttm_ref_object_add(tfile, base, TTM_REF_USAGE, NULL);
+	ret = ttm_ref_object_add(tfile, base, TTM_REF_USAGE, NULL, false);
 	if (unlikely(ret != 0))
 		goto out_err1;
 
@@ -318,7 +318,8 @@ EXPORT_SYMBOL(ttm_ref_object_exists);
 
 int ttm_ref_object_add(struct ttm_object_file *tfile,
 		       struct ttm_base_object *base,
-		       enum ttm_ref_type ref_type, bool *existed)
+		       enum ttm_ref_type ref_type, bool *existed,
+		       bool require_existed)
 {
 	struct drm_open_hash *ht = &tfile->ref_hash[ref_type];
 	struct ttm_ref_object *ref;
@@ -345,6 +346,9 @@ int ttm_ref_object_add(struct ttm_object_file *tfile,
 		}
 
 		rcu_read_unlock();
+		if (require_existed)
+			return -EPERM;
+
 		ret = ttm_mem_global_alloc(mem_glob, sizeof(*ref),
 					   false, false);
 		if (unlikely(ret != 0))
@@ -635,7 +639,7 @@ int ttm_prime_fd_to_handle(struct ttm_object_file *tfile,
 	prime = (struct ttm_prime_object *) dma_buf->priv;
 	base = &prime->base;
 	*handle = base->hash.key;
-	ret = ttm_ref_object_add(tfile, base, TTM_REF_USAGE, NULL);
+	ret = ttm_ref_object_add(tfile, base, TTM_REF_USAGE, NULL, false);
 
 	dma_buf_put(dma_buf);
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c
index b2f329917eda..6c649f7b5929 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c
@@ -1144,10 +1144,8 @@ int vmw_fence_event_ioctl(struct drm_device *dev, void *data,
 		(void) vmw_fence_obj_reference(fence);
 
 		if (user_fence_rep != NULL) {
-			bool existed;
-
-			ret = ttm_ref_object_add(tfile, base,
-						 TTM_REF_USAGE, &existed);
+			ret = ttm_ref_object_add(vmw_fp->tfile, base,
+						 TTM_REF_USAGE, NULL, false);
 			if (unlikely(ret != 0)) {
 				DRM_ERROR("Failed to reference a fence "
 					  "object.\n");
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
index e57667ca7557..dbca128a9aa6 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
@@ -591,7 +591,7 @@ static int vmw_user_dmabuf_synccpu_grab(struct vmw_user_dma_buffer *user_bo,
 		return ret;
 
 	ret = ttm_ref_object_add(tfile, &user_bo->prime.base,
-				 TTM_REF_SYNCCPU_WRITE, &existed);
+				 TTM_REF_SYNCCPU_WRITE, &existed, false);
 	if (ret != 0 || existed)
 		ttm_bo_synccpu_write_release(&user_bo->dma.base);
 
@@ -775,7 +775,7 @@ int vmw_user_dmabuf_reference(struct ttm_object_file *tfile,
 
 	*handle = user_bo->prime.base.hash.key;
 	return ttm_ref_object_add(tfile, &user_bo->prime.base,
-				  TTM_REF_USAGE, NULL);
+				  TTM_REF_USAGE, NULL, false);
 }
 
 /*
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
index b363f0be6512..79f78a68d92d 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
@@ -904,17 +904,16 @@ vmw_surface_handle_reference(struct vmw_private *dev_priv,
 	uint32_t handle;
 	struct ttm_base_object *base;
 	int ret;
+	bool require_exist = false;
 
 	if (handle_type == DRM_VMW_HANDLE_PRIME) {
 		ret = ttm_prime_fd_to_handle(tfile, u_handle, &handle);
 		if (unlikely(ret != 0))
 			return ret;
 	} else {
-		if (unlikely(drm_is_render_client(file_priv))) {
-			DRM_ERROR("Render client refused legacy "
-				  "surface reference.\n");
-			return -EACCES;
-		}
+		if (unlikely(drm_is_render_client(file_priv)))
+			require_exist = true;
+
 		if (ACCESS_ONCE(vmw_fpriv(file_priv)->locked_master)) {
 			DRM_ERROR("Locked master refused legacy "
 				  "surface reference.\n");
@@ -942,17 +941,14 @@ vmw_surface_handle_reference(struct vmw_private *dev_priv,
 
 		/*
 		 * Make sure the surface creator has the same
-		 * authenticating master.
+		 * authenticating master, or is already registered with us.
 		 */
 		if (drm_is_primary_client(file_priv) &&
-		    user_srf->master != file_priv->master) {
-			DRM_ERROR("Trying to reference surface outside of"
-				  " master domain.\n");
-			ret = -EACCES;
-			goto out_bad_resource;
-		}
+		    user_srf->master != file_priv->master)
+			require_exist = true;
 
-		ret = ttm_ref_object_add(tfile, base, TTM_REF_USAGE, NULL);
+		ret = ttm_ref_object_add(tfile, base, TTM_REF_USAGE, NULL,
+					 require_exist);
 		if (unlikely(ret != 0)) {
 			DRM_ERROR("Could not add a reference to a surface.\n");
 			goto out_bad_resource;
diff --git a/include/drm/ttm/ttm_object.h b/include/drm/ttm/ttm_object.h
index ed953f98f0e1..1487011fe057 100644
--- a/include/drm/ttm/ttm_object.h
+++ b/include/drm/ttm/ttm_object.h
@@ -229,6 +229,8 @@ extern void ttm_base_object_unref(struct ttm_base_object **p_base);
  * @ref_type: The type of reference.
  * @existed: Upon completion, indicates that an identical reference object
  * already existed, and the refcount was upped on that object instead.
+ * @require_existed: Fail with -EPERM if an identical ref object didn't
+ * already exist.
  *
  * Checks that the base object is shareable and adds a ref object to it.
  *
@@ -243,7 +245,8 @@ extern void ttm_base_object_unref(struct ttm_base_object **p_base);
  */
 extern int ttm_ref_object_add(struct ttm_object_file *tfile,
 			      struct ttm_base_object *base,
-			      enum ttm_ref_type ref_type, bool *existed);
+			      enum ttm_ref_type ref_type, bool *existed,
+			      bool require_existed);
 
 extern bool ttm_ref_object_exists(struct ttm_object_file *tfile,
 				  struct ttm_base_object *base);
-- 
cgit v1.2.3


From 9e242ec92f951165b52de630ca8ecc6a2748f655 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 22 Feb 2016 11:49:09 -0500
Subject: BACKPORT: [UPSTREAM] mbcache2: reimplement mbcache

(Cherry-pick from commit f9a61eb4e2471c56a63cd804c7474128138c38ac)

Original mbcache was designed to have more features than what ext?
filesystems ended up using. It supported entry being in more hashes, it
had a home-grown rwlocking of each entry, and one cache could cache
entries from multiple filesystems. This genericity also resulted in more
complex locking, larger cache entries, and generally more code
complexity.

This is reimplementation of the mbcache functionality to exactly fit the
purpose ext? filesystems use it for. Cache entries are now considerably
smaller (7 instead of 13 longs), the code is considerably smaller as
well (414 vs 913 lines of code), and IMO also simpler. The new code is
also much more lightweight.

I have measured the speed using artificial xattr-bench benchmark, which
spawns P processes, each process sets xattr for F different files, and
the value of xattr is randomly chosen from a pool of V values. Averages
of runtimes for 5 runs for various combinations of parameters are below.
The first value in each cell is old mbache, the second value is the new
mbcache.

V=10
F\P	1		2		4		8		16		32		64
10	0.158,0.157	0.208,0.196	0.500,0.277	0.798,0.400	3.258,0.584	13.807,1.047	61.339,2.803
100	0.172,0.167	0.279,0.222	0.520,0.275	0.825,0.341	2.981,0.505	12.022,1.202	44.641,2.943
1000	0.185,0.174	0.297,0.239	0.445,0.283	0.767,0.340	2.329,0.480	6.342,1.198	16.440,3.888

V=100
F\P	1		2		4		8		16		32		64
10	0.162,0.153	0.200,0.186	0.362,0.257	0.671,0.496	1.433,0.943	3.801,1.345	7.938,2.501
100	0.153,0.160	0.221,0.199	0.404,0.264	0.945,0.379	1.556,0.485	3.761,1.156	7.901,2.484
1000	0.215,0.191	0.303,0.246	0.471,0.288	0.960,0.347	1.647,0.479	3.916,1.176	8.058,3.160

V=1000
F\P	1		2		4		8		16		32		64
10	0.151,0.129	0.210,0.163	0.326,0.245	0.685,0.521	1.284,0.859	3.087,2.251	6.451,4.801
100	0.154,0.153	0.211,0.191	0.276,0.282	0.687,0.506	1.202,0.877	3.259,1.954	8.738,2.887
1000	0.145,0.179	0.202,0.222	0.449,0.319	0.899,0.333	1.577,0.524	4.221,1.240	9.782,3.579

V=10000
F\P	1		2		4		8		16		32		64
10	0.161,0.154	0.198,0.190	0.296,0.256	0.662,0.480	1.192,0.818	2.989,2.200	6.362,4.746
100	0.176,0.174	0.236,0.203	0.326,0.255	0.696,0.511	1.183,0.855	4.205,3.444	19.510,17.760
1000	0.199,0.183	0.240,0.227	1.159,1.014	2.286,2.154	6.023,6.039	---,10.933	---,36.620

V=100000
F\P	1		2		4		8		16		32		64
10	0.171,0.162	0.204,0.198	0.285,0.230	0.692,0.500	1.225,0.881	2.990,2.243	6.379,4.771
100	0.151,0.171	0.220,0.210	0.295,0.255	0.720,0.518	1.226,0.844	3.423,2.831	19.234,17.544
1000	0.192,0.189	0.249,0.225	1.162,1.043	2.257,2.093	5.853,4.997	---,10.399	---,32.198

We see that the new code is faster in pretty much all the cases and
starting from 4 processes there are significant gains with the new code
resulting in upto 20-times shorter runtimes. Also for large numbers of
cached entries all values for the old code could not be measured as the
kernel started hitting softlockups and died before the test completed.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Bug: 32461228
---
 fs/Makefile              |   2 +-
 fs/mbcache2.c            | 359 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mbcache2.h |  50 +++++++
 3 files changed, 410 insertions(+), 1 deletion(-)
 create mode 100644 fs/mbcache2.c
 create mode 100644 include/linux/mbcache2.h

(limited to 'include')

diff --git a/fs/Makefile b/fs/Makefile
index 3b54070cd629..dee237540bc0 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -41,7 +41,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF)	+= compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
 obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
 
-obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
+obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o mbcache2.o
 obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_COREDUMP)		+= coredump.o
diff --git a/fs/mbcache2.c b/fs/mbcache2.c
new file mode 100644
index 000000000000..5c3e1a8c38f6
--- /dev/null
+++ b/fs/mbcache2.c
@@ -0,0 +1,359 @@
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/list_bl.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mbcache2.h>
+
+/*
+ * Mbcache is a simple key-value store. Keys need not be unique, however
+ * key-value pairs are expected to be unique (we use this fact in
+ * mb2_cache_entry_delete_block()).
+ *
+ * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
+ * They use hash of a block contents as a key and block number as a value.
+ * That's why keys need not be unique (different xattr blocks may end up having
+ * the same hash). However block number always uniquely identifies a cache
+ * entry.
+ *
+ * We provide functions for creation and removal of entries, search by key,
+ * and a special "delete entry with given key-value pair" operation. Fixed
+ * size hash table is used for fast key lookups.
+ */
+
+struct mb2_cache {
+	/* Hash table of entries */
+	struct hlist_bl_head	*c_hash;
+	/* log2 of hash table size */
+	int			c_bucket_bits;
+	/* Protects c_lru_list, c_entry_count */
+	spinlock_t		c_lru_list_lock;
+	struct list_head	c_lru_list;
+	/* Number of entries in cache */
+	unsigned long		c_entry_count;
+	struct shrinker		c_shrink;
+};
+
+static struct kmem_cache *mb2_entry_cache;
+
+/*
+ * mb2_cache_entry_create - create entry in cache
+ * @cache - cache where the entry should be created
+ * @mask - gfp mask with which the entry should be allocated
+ * @key - key of the entry
+ * @block - block that contains data
+ *
+ * Creates entry in @cache with key @key and records that data is stored in
+ * block @block. The function returns -EBUSY if entry with the same key
+ * and for the same block already exists in cache. Otherwise 0 is returned.
+ */
+int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key,
+			   sector_t block)
+{
+	struct mb2_cache_entry *entry, *dup;
+	struct hlist_bl_node *dup_node;
+	struct hlist_bl_head *head;
+
+	entry = kmem_cache_alloc(mb2_entry_cache, mask);
+	if (!entry)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&entry->e_lru_list);
+	/* One ref for hash, one ref returned */
+	atomic_set(&entry->e_refcnt, 1);
+	entry->e_key = key;
+	entry->e_block = block;
+	head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
+	entry->e_hash_list_head = head;
+	hlist_bl_lock(head);
+	hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
+		if (dup->e_key == key && dup->e_block == block) {
+			hlist_bl_unlock(head);
+			kmem_cache_free(mb2_entry_cache, entry);
+			return -EBUSY;
+		}
+	}
+	hlist_bl_add_head(&entry->e_hash_list, head);
+	hlist_bl_unlock(head);
+
+	spin_lock(&cache->c_lru_list_lock);
+	list_add_tail(&entry->e_lru_list, &cache->c_lru_list);
+	/* Grab ref for LRU list */
+	atomic_inc(&entry->e_refcnt);
+	cache->c_entry_count++;
+	spin_unlock(&cache->c_lru_list_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(mb2_cache_entry_create);
+
+void __mb2_cache_entry_free(struct mb2_cache_entry *entry)
+{
+	kmem_cache_free(mb2_entry_cache, entry);
+}
+EXPORT_SYMBOL(__mb2_cache_entry_free);
+
+static struct mb2_cache_entry *__entry_find(struct mb2_cache *cache,
+					    struct mb2_cache_entry *entry,
+					    u32 key)
+{
+	struct mb2_cache_entry *old_entry = entry;
+	struct hlist_bl_node *node;
+	struct hlist_bl_head *head;
+
+	if (entry)
+		head = entry->e_hash_list_head;
+	else
+		head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
+	hlist_bl_lock(head);
+	if (entry && !hlist_bl_unhashed(&entry->e_hash_list))
+		node = entry->e_hash_list.next;
+	else
+		node = hlist_bl_first(head);
+	while (node) {
+		entry = hlist_bl_entry(node, struct mb2_cache_entry,
+				       e_hash_list);
+		if (entry->e_key == key) {
+			atomic_inc(&entry->e_refcnt);
+			goto out;
+		}
+		node = node->next;
+	}
+	entry = NULL;
+out:
+	hlist_bl_unlock(head);
+	if (old_entry)
+		mb2_cache_entry_put(cache, old_entry);
+
+	return entry;
+}
+
+/*
+ * mb2_cache_entry_find_first - find the first entry in cache with given key
+ * @cache: cache where we should search
+ * @key: key to look for
+ *
+ * Search in @cache for entry with key @key. Grabs reference to the first
+ * entry found and returns the entry.
+ */
+struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache,
+						   u32 key)
+{
+	return __entry_find(cache, NULL, key);
+}
+EXPORT_SYMBOL(mb2_cache_entry_find_first);
+
+/*
+ * mb2_cache_entry_find_next - find next entry in cache with the same
+ * @cache: cache where we should search
+ * @entry: entry to start search from
+ *
+ * Finds next entry in the hash chain which has the same key as @entry.
+ * If @entry is unhashed (which can happen when deletion of entry races
+ * with the search), finds the first entry in the hash chain. The function
+ * drops reference to @entry and returns with a reference to the found entry.
+ */
+struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache,
+						  struct mb2_cache_entry *entry)
+{
+	return __entry_find(cache, entry, entry->e_key);
+}
+EXPORT_SYMBOL(mb2_cache_entry_find_next);
+
+/* mb2_cache_entry_delete_block - remove information about block from cache
+ * @cache - cache we work with
+ * @key - key of the entry to remove
+ * @block - block containing data for @key
+ *
+ * Remove entry from cache @cache with key @key with data stored in @block.
+ */
+void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key,
+				  sector_t block)
+{
+	struct hlist_bl_node *node;
+	struct hlist_bl_head *head;
+	struct mb2_cache_entry *entry;
+
+	head = &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
+	hlist_bl_lock(head);
+	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
+		if (entry->e_key == key && entry->e_block == block) {
+			/* We keep hash list reference to keep entry alive */
+			hlist_bl_del_init(&entry->e_hash_list);
+			hlist_bl_unlock(head);
+			spin_lock(&cache->c_lru_list_lock);
+			if (!list_empty(&entry->e_lru_list)) {
+				list_del_init(&entry->e_lru_list);
+				cache->c_entry_count--;
+				atomic_dec(&entry->e_refcnt);
+			}
+			spin_unlock(&cache->c_lru_list_lock);
+			mb2_cache_entry_put(cache, entry);
+			return;
+		}
+	}
+	hlist_bl_unlock(head);
+}
+EXPORT_SYMBOL(mb2_cache_entry_delete_block);
+
+/* mb2_cache_entry_touch - cache entry got used
+ * @cache - cache the entry belongs to
+ * @entry - entry that got used
+ *
+ * Move entry in lru list to reflect the fact that it was used.
+ */
+void mb2_cache_entry_touch(struct mb2_cache *cache,
+			   struct mb2_cache_entry *entry)
+{
+	spin_lock(&cache->c_lru_list_lock);
+	if (!list_empty(&entry->e_lru_list))
+		list_move_tail(&cache->c_lru_list, &entry->e_lru_list);
+	spin_unlock(&cache->c_lru_list_lock);
+}
+EXPORT_SYMBOL(mb2_cache_entry_touch);
+
+static unsigned long mb2_cache_count(struct shrinker *shrink,
+				     struct shrink_control *sc)
+{
+	struct mb2_cache *cache = container_of(shrink, struct mb2_cache,
+					       c_shrink);
+
+	return cache->c_entry_count;
+}
+
+/* Shrink number of entries in cache */
+static unsigned long mb2_cache_scan(struct shrinker *shrink,
+				    struct shrink_control *sc)
+{
+	int nr_to_scan = sc->nr_to_scan;
+	struct mb2_cache *cache = container_of(shrink, struct mb2_cache,
+					      c_shrink);
+	struct mb2_cache_entry *entry;
+	struct hlist_bl_head *head;
+	unsigned int shrunk = 0;
+
+	spin_lock(&cache->c_lru_list_lock);
+	while (nr_to_scan-- && !list_empty(&cache->c_lru_list)) {
+		entry = list_first_entry(&cache->c_lru_list,
+					 struct mb2_cache_entry, e_lru_list);
+		list_del_init(&entry->e_lru_list);
+		cache->c_entry_count--;
+		/*
+		 * We keep LRU list reference so that entry doesn't go away
+		 * from under us.
+		 */
+		spin_unlock(&cache->c_lru_list_lock);
+		head = entry->e_hash_list_head;
+		hlist_bl_lock(head);
+		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
+			hlist_bl_del_init(&entry->e_hash_list);
+			atomic_dec(&entry->e_refcnt);
+		}
+		hlist_bl_unlock(head);
+		if (mb2_cache_entry_put(cache, entry))
+			shrunk++;
+		cond_resched();
+		spin_lock(&cache->c_lru_list_lock);
+	}
+	spin_unlock(&cache->c_lru_list_lock);
+
+	return shrunk;
+}
+
+/*
+ * mb2_cache_create - create cache
+ * @bucket_bits: log2 of the hash table size
+ *
+ * Create cache for keys with 2^bucket_bits hash entries.
+ */
+struct mb2_cache *mb2_cache_create(int bucket_bits)
+{
+	struct mb2_cache *cache;
+	int bucket_count = 1 << bucket_bits;
+	int i;
+
+	if (!try_module_get(THIS_MODULE))
+		return NULL;
+
+	cache = kzalloc(sizeof(struct mb2_cache), GFP_KERNEL);
+	if (!cache)
+		goto err_out;
+	cache->c_bucket_bits = bucket_bits;
+	INIT_LIST_HEAD(&cache->c_lru_list);
+	spin_lock_init(&cache->c_lru_list_lock);
+	cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head),
+				GFP_KERNEL);
+	if (!cache->c_hash) {
+		kfree(cache);
+		goto err_out;
+	}
+	for (i = 0; i < bucket_count; i++)
+		INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
+
+	cache->c_shrink.count_objects = mb2_cache_count;
+	cache->c_shrink.scan_objects = mb2_cache_scan;
+	cache->c_shrink.seeks = DEFAULT_SEEKS;
+	register_shrinker(&cache->c_shrink);
+
+	return cache;
+
+err_out:
+	module_put(THIS_MODULE);
+	return NULL;
+}
+EXPORT_SYMBOL(mb2_cache_create);
+
+/*
+ * mb2_cache_destroy - destroy cache
+ * @cache: the cache to destroy
+ *
+ * Free all entries in cache and cache itself. Caller must make sure nobody
+ * (except shrinker) can reach @cache when calling this.
+ */
+void mb2_cache_destroy(struct mb2_cache *cache)
+{
+	struct mb2_cache_entry *entry, *next;
+
+	unregister_shrinker(&cache->c_shrink);
+
+	/*
+	 * We don't bother with any locking. Cache must not be used at this
+	 * point.
+	 */
+	list_for_each_entry_safe(entry, next, &cache->c_lru_list, e_lru_list) {
+		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
+			hlist_bl_del_init(&entry->e_hash_list);
+			atomic_dec(&entry->e_refcnt);
+		} else
+			WARN_ON(1);
+		list_del(&entry->e_lru_list);
+		WARN_ON(atomic_read(&entry->e_refcnt) != 1);
+		mb2_cache_entry_put(cache, entry);
+	}
+	kfree(cache->c_hash);
+	kfree(cache);
+	module_put(THIS_MODULE);
+}
+EXPORT_SYMBOL(mb2_cache_destroy);
+
+static int __init mb2cache_init(void)
+{
+	mb2_entry_cache = kmem_cache_create("mbcache",
+				sizeof(struct mb2_cache_entry), 0,
+				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+	BUG_ON(!mb2_entry_cache);
+	return 0;
+}
+
+static void __exit mb2cache_exit(void)
+{
+	kmem_cache_destroy(mb2_entry_cache);
+}
+
+module_init(mb2cache_init)
+module_exit(mb2cache_exit)
+
+MODULE_AUTHOR("Jan Kara <jack@suse.cz>");
+MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mbcache2.h b/include/linux/mbcache2.h
new file mode 100644
index 000000000000..b6f160ff2533
--- /dev/null
+++ b/include/linux/mbcache2.h
@@ -0,0 +1,50 @@
+#ifndef _LINUX_MB2CACHE_H
+#define _LINUX_MB2CACHE_H
+
+#include <linux/hash.h>
+#include <linux/list_bl.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
+#include <linux/fs.h>
+
+struct mb2_cache;
+
+struct mb2_cache_entry {
+	/* LRU list - protected by cache->c_lru_list_lock */
+	struct list_head	e_lru_list;
+	/* Hash table list - protected by bitlock in e_hash_list_head */
+	struct hlist_bl_node	e_hash_list;
+	atomic_t		e_refcnt;
+	/* Key in hash - stable during lifetime of the entry */
+	u32			e_key;
+	/* Block number of hashed block - stable during lifetime of the entry */
+	sector_t		e_block;
+	/* Head of hash list (for list bit lock) - stable */
+	struct hlist_bl_head	*e_hash_list_head;
+};
+
+struct mb2_cache *mb2_cache_create(int bucket_bits);
+void mb2_cache_destroy(struct mb2_cache *cache);
+
+int mb2_cache_entry_create(struct mb2_cache *cache, gfp_t mask, u32 key,
+			   sector_t block);
+void __mb2_cache_entry_free(struct mb2_cache_entry *entry);
+static inline int mb2_cache_entry_put(struct mb2_cache *cache,
+				      struct mb2_cache_entry *entry)
+{
+	if (!atomic_dec_and_test(&entry->e_refcnt))
+		return 0;
+	__mb2_cache_entry_free(entry);
+	return 1;
+}
+
+void mb2_cache_entry_delete_block(struct mb2_cache *cache, u32 key,
+				  sector_t block);
+struct mb2_cache_entry *mb2_cache_entry_find_first(struct mb2_cache *cache,
+						   u32 key);
+struct mb2_cache_entry *mb2_cache_entry_find_next(struct mb2_cache *cache,
+						  struct mb2_cache_entry *entry);
+void mb2_cache_entry_touch(struct mb2_cache *cache,
+			   struct mb2_cache_entry *entry);
+
+#endif	/* _LINUX_MB2CACHE_H */
-- 
cgit v1.2.3


From 3144d81a77352a3934ff0f60dccb38dbf462da39 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 16 Mar 2017 16:54:24 -0400
Subject: cgroup, kthread: close race window where new kthreads can be migrated
 to non-root cgroups

commit 77f88796cee819b9c4562b0b6b44691b3b7755b1 upstream.

Creation of a kthread goes through a couple interlocked stages between
the kthread itself and its creator.  Once the new kthread starts
running, it initializes itself and wakes up the creator.  The creator
then can further configure the kthread and then let it start doing its
job by waking it up.

In this configuration-by-creator stage, the creator is the only one
that can wake it up but the kthread is visible to userland.  When
altering the kthread's attributes from userland is allowed, this is
fine; however, for cases where CPU affinity is critical,
kthread_bind() is used to first disable affinity changes from userland
and then set the affinity.  This also prevents the kthread from being
migrated into non-root cgroups as that can affect the CPU affinity and
many other things.

Unfortunately, the cgroup side of protection is racy.  While the
PF_NO_SETAFFINITY flag prevents further migrations, userland can win
the race before the creator sets the flag with kthread_bind() and put
the kthread in a non-root cgroup, which can lead to all sorts of
problems including incorrect CPU affinity and starvation.

This bug got triggered by userland which periodically tries to migrate
all processes in the root cpuset cgroup to a non-root one.  Per-cpu
workqueue workers got caught while being created and ended up with
incorrected CPU affinity breaking concurrency management and sometimes
stalling workqueue execution.

This patch adds task->no_cgroup_migration which disallows the task to
be migrated by userland.  kthreadd starts with the flag set making
every child kthread start in the root cgroup with migration
disallowed.  The flag is cleared after the kthread finishes
initialization by which time PF_NO_SETAFFINITY is set if the kthread
should stay in the root cgroup.

It'd be better to wait for the initialization instead of failing but I
couldn't think of a way of implementing that without adding either a
new PF flag, or sleeping and retrying from waiting side.  Even if
userland depends on changing cgroup membership of a kthread, it either
has to be synchronized with kthread_create() or periodically repeat,
so it's unlikely that this would break anything.

v2: Switch to a simpler implementation using a new task_struct bit
    field suggested by Oleg.

Signed-off-by: Tejun Heo <tj@kernel.org>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Reported-and-debugged-by: Chris Mason <clm@fb.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/cgroup.h | 21 +++++++++++++++++++++
 include/linux/sched.h  |  4 ++++
 kernel/cgroup.c        |  9 +++++----
 kernel/kthread.c       |  3 +++
 4 files changed, 33 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index cb91b44f5f78..ad2bcf647b9a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -528,6 +528,25 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
 	pr_cont_kernfs_path(cgrp->kn);
 }
 
+static inline void cgroup_init_kthreadd(void)
+{
+	/*
+	 * kthreadd is inherited by all kthreads, keep it in the root so
+	 * that the new kthreads are guaranteed to stay in the root until
+	 * initialization is finished.
+	 */
+	current->no_cgroup_migration = 1;
+}
+
+static inline void cgroup_kthread_ready(void)
+{
+	/*
+	 * This kthread finished initialization.  The creator should have
+	 * set PF_NO_SETAFFINITY if this kthread should stay in the root.
+	 */
+	current->no_cgroup_migration = 0;
+}
+
 #else /* !CONFIG_CGROUPS */
 
 struct cgroup_subsys_state;
@@ -551,6 +570,8 @@ static inline void cgroup_free(struct task_struct *p) {}
 
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
+static inline void cgroup_init_kthreadd(void) {}
+static inline void cgroup_kthread_ready(void) {}
 
 #endif /* !CONFIG_CGROUPS */
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ce0f61dcd887..352213b360d7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1475,6 +1475,10 @@ struct task_struct {
 #ifdef CONFIG_COMPAT_BRK
 	unsigned brk_randomized:1;
 #endif
+#ifdef CONFIG_CGROUPS
+	/* disallow userland-initiated cgroup migration */
+	unsigned no_cgroup_migration:1;
+#endif
 
 	unsigned long atomic_flags; /* Flags needing atomic access. */
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 127c63e02d52..4cb94b678e9f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2752,11 +2752,12 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 		tsk = tsk->group_leader;
 
 	/*
-	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
-	 * trapped in a cpuset, or RT worker may be born in a cgroup
-	 * with no rt_runtime allocated.  Just say no.
+	 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
+	 * If userland migrates such a kthread to a non-root cgroup, it can
+	 * become trapped in a cpuset, or RT kthread may be born in a
+	 * cgroup with no rt_runtime allocated.  Just say no.
 	 */
-	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
 		goto out_unlock_rcu;
 	}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9ff173dca1ae..850b255649a2 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -18,6 +18,7 @@
 #include <linux/freezer.h>
 #include <linux/ptrace.h>
 #include <linux/uaccess.h>
+#include <linux/cgroup.h>
 #include <trace/events/sched.h>
 
 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -205,6 +206,7 @@ static int kthread(void *_create)
 	ret = -EINTR;
 
 	if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
+		cgroup_kthread_ready();
 		__kthread_parkme(&self);
 		ret = threadfn(data);
 	}
@@ -510,6 +512,7 @@ int kthreadd(void *unused)
 	set_mems_allowed(node_states[N_MEMORY]);
 
 	current->flags |= PF_NOFREEZE;
+	cgroup_init_kthreadd();
 
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
-- 
cgit v1.2.3


From 2673d1c5122ee2492e24d9a135e230b2d0b2e630 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 10 Apr 2017 17:27:57 +0800
Subject: crypto: ahash - Fix EINPROGRESS notification callback

commit ef0579b64e93188710d48667cb5e014926af9f1b upstream.

The ahash API modifies the request's callback function in order
to clean up after itself in some corner cases (unaligned final
and missing finup).

When the request is complete ahash will restore the original
callback and everything is fine.  However, when the request gets
an EBUSY on a full queue, an EINPROGRESS callback is made while
the request is still ongoing.

In this case the ahash API will incorrectly call its own callback.

This patch fixes the problem by creating a temporary request
object on the stack which is used to relay EINPROGRESS back to
the original completion function.

This patch also adds code to preserve the original flags value.

Fixes: ab6bf4e5e5e4 ("crypto: hash - Fix the pointer voodoo in...")
Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Tested-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 crypto/ahash.c                 | 79 ++++++++++++++++++++++++++----------------
 include/crypto/internal/hash.h | 10 ++++++
 2 files changed, 60 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/crypto/ahash.c b/crypto/ahash.c
index dac1c24e9c3e..f9caf0f74199 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -31,6 +31,7 @@ struct ahash_request_priv {
 	crypto_completion_t complete;
 	void *data;
 	u8 *result;
+	u32 flags;
 	void *ubuf[] CRYPTO_MINALIGN_ATTR;
 };
 
@@ -270,6 +271,8 @@ static int ahash_save_req(struct ahash_request *req, crypto_completion_t cplt)
 	priv->result = req->result;
 	priv->complete = req->base.complete;
 	priv->data = req->base.data;
+	priv->flags = req->base.flags;
+
 	/*
 	 * WARNING: We do not backup req->priv here! The req->priv
 	 *          is for internal use of the Crypto API and the
@@ -284,38 +287,44 @@ static int ahash_save_req(struct ahash_request *req, crypto_completion_t cplt)
 	return 0;
 }
 
-static void ahash_restore_req(struct ahash_request *req)
+static void ahash_restore_req(struct ahash_request *req, int err)
 {
 	struct ahash_request_priv *priv = req->priv;
 
+	if (!err)
+		memcpy(priv->result, req->result,
+		       crypto_ahash_digestsize(crypto_ahash_reqtfm(req)));
+
 	/* Restore the original crypto request. */
 	req->result = priv->result;
-	req->base.complete = priv->complete;
-	req->base.data = priv->data;
+
+	ahash_request_set_callback(req, priv->flags,
+				   priv->complete, priv->data);
 	req->priv = NULL;
 
 	/* Free the req->priv.priv from the ADJUSTED request. */
 	kzfree(priv);
 }
 
-static void ahash_op_unaligned_finish(struct ahash_request *req, int err)
+static void ahash_notify_einprogress(struct ahash_request *req)
 {
 	struct ahash_request_priv *priv = req->priv;
+	struct crypto_async_request oreq;
 
-	if (err == -EINPROGRESS)
-		return;
-
-	if (!err)
-		memcpy(priv->result, req->result,
-		       crypto_ahash_digestsize(crypto_ahash_reqtfm(req)));
+	oreq.data = priv->data;
 
-	ahash_restore_req(req);
+	priv->complete(&oreq, -EINPROGRESS);
 }
 
 static void ahash_op_unaligned_done(struct crypto_async_request *req, int err)
 {
 	struct ahash_request *areq = req->data;
 
+	if (err == -EINPROGRESS) {
+		ahash_notify_einprogress(areq);
+		return;
+	}
+
 	/*
 	 * Restore the original request, see ahash_op_unaligned() for what
 	 * goes where.
@@ -326,7 +335,7 @@ static void ahash_op_unaligned_done(struct crypto_async_request *req, int err)
 	 */
 
 	/* First copy req->result into req->priv.result */
-	ahash_op_unaligned_finish(areq, err);
+	ahash_restore_req(areq, err);
 
 	/* Complete the ORIGINAL request. */
 	areq->base.complete(&areq->base, err);
@@ -342,7 +351,12 @@ static int ahash_op_unaligned(struct ahash_request *req,
 		return err;
 
 	err = op(req);
-	ahash_op_unaligned_finish(req, err);
+	if (err == -EINPROGRESS ||
+	    (err == -EBUSY && (ahash_request_flags(req) &
+			       CRYPTO_TFM_REQ_MAY_BACKLOG)))
+		return err;
+
+	ahash_restore_req(req, err);
 
 	return err;
 }
@@ -377,25 +391,14 @@ int crypto_ahash_digest(struct ahash_request *req)
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_digest);
 
-static void ahash_def_finup_finish2(struct ahash_request *req, int err)
+static void ahash_def_finup_done2(struct crypto_async_request *req, int err)
 {
-	struct ahash_request_priv *priv = req->priv;
+	struct ahash_request *areq = req->data;
 
 	if (err == -EINPROGRESS)
 		return;
 
-	if (!err)
-		memcpy(priv->result, req->result,
-		       crypto_ahash_digestsize(crypto_ahash_reqtfm(req)));
-
-	ahash_restore_req(req);
-}
-
-static void ahash_def_finup_done2(struct crypto_async_request *req, int err)
-{
-	struct ahash_request *areq = req->data;
-
-	ahash_def_finup_finish2(areq, err);
+	ahash_restore_req(areq, err);
 
 	areq->base.complete(&areq->base, err);
 }
@@ -406,11 +409,15 @@ static int ahash_def_finup_finish1(struct ahash_request *req, int err)
 		goto out;
 
 	req->base.complete = ahash_def_finup_done2;
-	req->base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
 	err = crypto_ahash_reqtfm(req)->final(req);
+	if (err == -EINPROGRESS ||
+	    (err == -EBUSY && (ahash_request_flags(req) &
+			       CRYPTO_TFM_REQ_MAY_BACKLOG)))
+		return err;
 
 out:
-	ahash_def_finup_finish2(req, err);
+	ahash_restore_req(req, err);
 	return err;
 }
 
@@ -418,7 +425,16 @@ static void ahash_def_finup_done1(struct crypto_async_request *req, int err)
 {
 	struct ahash_request *areq = req->data;
 
+	if (err == -EINPROGRESS) {
+		ahash_notify_einprogress(areq);
+		return;
+	}
+
+	areq->base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
 	err = ahash_def_finup_finish1(areq, err);
+	if (areq->priv)
+		return;
 
 	areq->base.complete(&areq->base, err);
 }
@@ -433,6 +449,11 @@ static int ahash_def_finup(struct ahash_request *req)
 		return err;
 
 	err = tfm->update(req);
+	if (err == -EINPROGRESS ||
+	    (err == -EBUSY && (ahash_request_flags(req) &
+			       CRYPTO_TFM_REQ_MAY_BACKLOG)))
+		return err;
+
 	return ahash_def_finup_finish1(req, err);
 }
 
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index 3b4af1d7c7e9..a25414ce2898 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -173,6 +173,16 @@ static inline struct ahash_instance *ahash_alloc_instance(
 	return crypto_alloc_instance2(name, alg, ahash_instance_headroom());
 }
 
+static inline void ahash_request_complete(struct ahash_request *req, int err)
+{
+	req->base.complete(&req->base, err);
+}
+
+static inline u32 ahash_request_flags(struct ahash_request *req)
+{
+	return req->base.flags;
+}
+
 static inline struct crypto_ahash *crypto_spawn_ahash(
 	struct crypto_ahash_spawn *spawn)
 {
-- 
cgit v1.2.3