[for-4.14.y PATCH] libnvdimm/namespace: Fix label tracking error
by Dan Williams
commit c4703ce11c23423d4b46e3d59aef7979814fd608 upstream.
Users have reported intermittent occurrences of DIMM initialization
failures due to duplicate allocations of address capacity detected in
the labels, or errors of the form below, both have the same root cause.
nd namespace1.4: failed to track label: 0
WARNING: CPU: 17 PID: 1381 at drivers/nvdimm/label.c:863
RIP: 0010:__pmem_label_update+0x56c/0x590 [libnvdimm]
Call Trace:
? nd_pmem_namespace_label_update+0xd6/0x160 [libnvdimm]
nd_pmem_namespace_label_update+0xd6/0x160 [libnvdimm]
uuid_store+0x17e/0x190 [libnvdimm]
kernfs_fop_write+0xf0/0x1a0
vfs_write+0xb7/0x1b0
ksys_write+0x57/0xd0
do_syscall_64+0x60/0x210
Unfortunately those reports were typically with a busy parallel
namespace creation / destruction loop making it difficult to see the
components of the bug. However, Jane provided a simple reproducer using
the work-in-progress sub-section implementation.
When ndctl is reconfiguring a namespace it may take an existing defunct
/ disabled namespace and reconfigure it with a new uuid and other
parameters. Critically namespace_update_uuid() takes existing address
resources and renames them for the new namespace to use / reconfigure as
it sees fit. The bug is that this rename only happens in the resource
tracking tree. Existing labels with the old uuid are not reaped leading
to a scenario where multiple active labels reference the same span of
address range.
Teach namespace_update_uuid() to flag any references to the old uuid for
reaping at the next label update attempt.
Cc: <stable(a)vger.kernel.org>
Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation")
Link: https://github.com/pmem/ndctl/issues/91
Reported-by: Jane Chu <jane.chu(a)oracle.com>
Reported-by: Jeff Moyer <jmoyer(a)redhat.com>
Reported-by: Erwin Tsaur <erwin.tsaur(a)oracle.com>
Cc: Johannes Thumshirn <jthumshirn(a)suse.de>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
---
drivers/nvdimm/label.c | 29 ++++++++++++++++-------------
drivers/nvdimm/namespace_devs.c | 15 +++++++++++++++
drivers/nvdimm/nd.h | 4 ++++
3 files changed, 35 insertions(+), 13 deletions(-)
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 184149a49b02..6a16017cc0d9 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -614,6 +614,17 @@ static const guid_t *to_abstraction_guid(enum nvdimm_claim_class claim_class,
return &guid_null;
}
+static void reap_victim(struct nd_mapping *nd_mapping,
+ struct nd_label_ent *victim)
+{
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ u32 slot = to_slot(ndd, victim->label);
+
+ dev_dbg(ndd->dev, "free: %d\n", slot);
+ nd_label_free_slot(ndd, slot);
+ victim->label = NULL;
+}
+
static int __pmem_label_update(struct nd_region *nd_region,
struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm,
int pos, unsigned long flags)
@@ -621,9 +632,9 @@ static int __pmem_label_update(struct nd_region *nd_region,
struct nd_namespace_common *ndns = &nspm->nsio.common;
struct nd_interleave_set *nd_set = nd_region->nd_set;
struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
- struct nd_label_ent *label_ent, *victim = NULL;
struct nd_namespace_label *nd_label;
struct nd_namespace_index *nsindex;
+ struct nd_label_ent *label_ent;
struct nd_label_id label_id;
struct resource *res;
unsigned long *free;
@@ -692,18 +703,10 @@ static int __pmem_label_update(struct nd_region *nd_region,
list_for_each_entry(label_ent, &nd_mapping->labels, list) {
if (!label_ent->label)
continue;
- if (memcmp(nspm->uuid, label_ent->label->uuid,
- NSLABEL_UUID_LEN) != 0)
- continue;
- victim = label_ent;
- list_move_tail(&victim->list, &nd_mapping->labels);
- break;
- }
- if (victim) {
- dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot);
- slot = to_slot(ndd, victim->label);
- nd_label_free_slot(ndd, slot);
- victim->label = NULL;
+ if (test_and_clear_bit(ND_LABEL_REAP, &label_ent->flags)
+ || memcmp(nspm->uuid, label_ent->label->uuid,
+ NSLABEL_UUID_LEN) == 0)
+ reap_victim(nd_mapping, label_ent);
}
/* update index */
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index e3f228af59d1..ace9958f2905 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1229,12 +1229,27 @@ static int namespace_update_uuid(struct nd_region *nd_region,
for (i = 0; i < nd_region->ndr_mappings; i++) {
struct nd_mapping *nd_mapping = &nd_region->mapping[i];
struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_label_ent *label_ent;
struct resource *res;
for_each_dpa_resource(ndd, res)
if (strcmp(res->name, old_label_id.id) == 0)
sprintf((void *) res->name, "%s",
new_label_id.id);
+
+ mutex_lock(&nd_mapping->lock);
+ list_for_each_entry(label_ent, &nd_mapping->labels, list) {
+ struct nd_namespace_label *nd_label = label_ent->label;
+ struct nd_label_id label_id;
+
+ if (!nd_label)
+ continue;
+ nd_label_gen_id(&label_id, nd_label->uuid,
+ __le32_to_cpu(nd_label->flags));
+ if (strcmp(old_label_id.id, label_id.id) == 0)
+ set_bit(ND_LABEL_REAP, &label_ent->flags);
+ }
+ mutex_unlock(&nd_mapping->lock);
}
kfree(*old_uuid);
out:
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 156be00e1f76..e3f060f0b83e 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -120,8 +120,12 @@ struct nd_percpu_lane {
spinlock_t lock;
};
+enum nd_label_flags {
+ ND_LABEL_REAP,
+};
struct nd_label_ent {
struct list_head list;
+ unsigned long flags;
struct nd_namespace_label *label;
};
3 years, 2 months
[PATCH] mm/nvdimm: Use correct alignment when looking at first pfn from a region
by Aneesh Kumar K.V
We already add the start_pad to the resource->start but fail to section
align the start. This makes sure that, with altmap, we compute the right first
pfn when start_pad is zero and we are doing an align down of the start address.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
---
kernel/memremap.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/memremap.c b/kernel/memremap.c
index a856cb5ff192..23d77b60e728 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -59,9 +59,9 @@ static unsigned long pfn_first(struct dev_pagemap *pgmap)
{
const struct resource *res = &pgmap->res;
struct vmem_altmap *altmap = &pgmap->altmap;
- unsigned long pfn;
+ unsigned long pfn = PHYS_PFN(res->start);
- pfn = res->start >> PAGE_SHIFT;
+ pfn = SECTION_ALIGN_DOWN(pfn);
if (pgmap->altmap_valid)
pfn += vmem_altmap_offset(altmap);
return pfn;
--
2.21.0
3 years, 2 months
[for-4.9.y PATCH] libnvdimm/namespace: Fix label tracking error
by Dan Williams
commit c4703ce11c23423d4b46e3d59aef7979814fd608 upstream.
Users have reported intermittent occurrences of DIMM initialization
failures due to duplicate allocations of address capacity detected in
the labels, or errors of the form below, both have the same root cause.
nd namespace1.4: failed to track label: 0
WARNING: CPU: 17 PID: 1381 at drivers/nvdimm/label.c:863
RIP: 0010:__pmem_label_update+0x56c/0x590 [libnvdimm]
Call Trace:
? nd_pmem_namespace_label_update+0xd6/0x160 [libnvdimm]
nd_pmem_namespace_label_update+0xd6/0x160 [libnvdimm]
uuid_store+0x17e/0x190 [libnvdimm]
kernfs_fop_write+0xf0/0x1a0
vfs_write+0xb7/0x1b0
ksys_write+0x57/0xd0
do_syscall_64+0x60/0x210
Unfortunately those reports were typically with a busy parallel
namespace creation / destruction loop making it difficult to see the
components of the bug. However, Jane provided a simple reproducer using
the work-in-progress sub-section implementation.
When ndctl is reconfiguring a namespace it may take an existing defunct
/ disabled namespace and reconfigure it with a new uuid and other
parameters. Critically namespace_update_uuid() takes existing address
resources and renames them for the new namespace to use / reconfigure as
it sees fit. The bug is that this rename only happens in the resource
tracking tree. Existing labels with the old uuid are not reaped leading
to a scenario where multiple active labels reference the same span of
address range.
Teach namespace_update_uuid() to flag any references to the old uuid for
reaping at the next label update attempt.
Cc: <stable(a)vger.kernel.org>
Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation")
Link: https://github.com/pmem/ndctl/issues/91
Reported-by: Jane Chu <jane.chu(a)oracle.com>
Reported-by: Jeff Moyer <jmoyer(a)redhat.com>
Reported-by: Erwin Tsaur <erwin.tsaur(a)oracle.com>
Cc: Johannes Thumshirn <jthumshirn(a)suse.de>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
---
drivers/nvdimm/label.c | 29 ++++++++++++++++-------------
drivers/nvdimm/namespace_devs.c | 15 +++++++++++++++
drivers/nvdimm/nd.h | 4 ++++
3 files changed, 35 insertions(+), 13 deletions(-)
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 66a089d561cf..9108004a0d9b 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -490,15 +490,26 @@ static unsigned long nd_label_offset(struct nvdimm_drvdata *ndd,
- (unsigned long) to_namespace_index(ndd, 0);
}
+static void reap_victim(struct nd_mapping *nd_mapping,
+ struct nd_label_ent *victim)
+{
+ struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ u32 slot = to_slot(ndd, victim->label);
+
+ dev_dbg(ndd->dev, "free: %d\n", slot);
+ nd_label_free_slot(ndd, slot);
+ victim->label = NULL;
+}
+
static int __pmem_label_update(struct nd_region *nd_region,
struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm,
int pos, unsigned long flags)
{
u64 cookie = nd_region_interleave_set_cookie(nd_region);
struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
- struct nd_label_ent *label_ent, *victim = NULL;
struct nd_namespace_label *nd_label;
struct nd_namespace_index *nsindex;
+ struct nd_label_ent *label_ent;
struct nd_label_id label_id;
struct resource *res;
unsigned long *free;
@@ -551,18 +562,10 @@ static int __pmem_label_update(struct nd_region *nd_region,
list_for_each_entry(label_ent, &nd_mapping->labels, list) {
if (!label_ent->label)
continue;
- if (memcmp(nspm->uuid, label_ent->label->uuid,
- NSLABEL_UUID_LEN) != 0)
- continue;
- victim = label_ent;
- list_move_tail(&victim->list, &nd_mapping->labels);
- break;
- }
- if (victim) {
- dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot);
- slot = to_slot(ndd, victim->label);
- nd_label_free_slot(ndd, slot);
- victim->label = NULL;
+ if (test_and_clear_bit(ND_LABEL_REAP, &label_ent->flags)
+ || memcmp(nspm->uuid, label_ent->label->uuid,
+ NSLABEL_UUID_LEN) == 0)
+ reap_victim(nd_mapping, label_ent);
}
/* update index */
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index cf4a90b50f8b..e83453e1b308 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1210,12 +1210,27 @@ static int namespace_update_uuid(struct nd_region *nd_region,
for (i = 0; i < nd_region->ndr_mappings; i++) {
struct nd_mapping *nd_mapping = &nd_region->mapping[i];
struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+ struct nd_label_ent *label_ent;
struct resource *res;
for_each_dpa_resource(ndd, res)
if (strcmp(res->name, old_label_id.id) == 0)
sprintf((void *) res->name, "%s",
new_label_id.id);
+
+ mutex_lock(&nd_mapping->lock);
+ list_for_each_entry(label_ent, &nd_mapping->labels, list) {
+ struct nd_namespace_label *nd_label = label_ent->label;
+ struct nd_label_id label_id;
+
+ if (!nd_label)
+ continue;
+ nd_label_gen_id(&label_id, nd_label->uuid,
+ __le32_to_cpu(nd_label->flags));
+ if (strcmp(old_label_id.id, label_id.id) == 0)
+ set_bit(ND_LABEL_REAP, &label_ent->flags);
+ }
+ mutex_unlock(&nd_mapping->lock);
}
kfree(*old_uuid);
out:
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index d869236b474f..bd29e598bac1 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -113,8 +113,12 @@ struct nd_percpu_lane {
spinlock_t lock;
};
+enum nd_label_flags {
+ ND_LABEL_REAP,
+};
struct nd_label_ent {
struct list_head list;
+ unsigned long flags;
struct nd_namespace_label *label;
};
3 years, 2 months
[GIT PULL] libnvdimm fixes for v5.2-rc2
by Dan Williams
Hi Linus, please pull from:
git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
tags/libnvdimm-fixes-5.2-rc2
...to receive a regression fix, a small (2 line code change)
performance enhancement, and some miscellaneous compilation warning
fixes. These have soaked in -next the past week with no known issues.
The device-mapper touches have Mike's ack, and the hardened user-copy
bypass was reviewed with Kees.
---
The following changes since commit a188339ca5a396acc588e5851ed7e19f66b0ebd9:
Linux 5.2-rc1 (2019-05-19 15:47:09 -0700)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
tags/libnvdimm-fixes-5.2-rc2
for you to fetch changes up to 52f476a323f9efc959be1c890d0cdcf12e1582e0:
libnvdimm/pmem: Bypass CONFIG_HARDENED_USERCOPY overhead (2019-05-20
20:43:32 -0700)
----------------------------------------------------------------
libnvdimm fixes v5.2-rc2
- Fix a regression that disabled device-mapper dax support
- Remove unnecessary hardened-user-copy overhead (>30%) for dax
read(2)/write(2).
- Fix some compilation warnings.
----------------------------------------------------------------
Dan Williams (2):
dax: Arrange for dax_supported check to span multiple devices
libnvdimm/pmem: Bypass CONFIG_HARDENED_USERCOPY overhead
Qian Cai (1):
libnvdimm: Fix compilation warnings with W=1
drivers/dax/super.c | 88 ++++++++++++++++++++++++++++----------------
drivers/md/dm-table.c | 17 ++++++---
drivers/md/dm.c | 20 ++++++++++
drivers/md/dm.h | 1 +
drivers/nvdimm/bus.c | 4 +-
drivers/nvdimm/label.c | 2 +
drivers/nvdimm/label.h | 2 -
drivers/nvdimm/pmem.c | 11 +++++-
drivers/s390/block/dcssblk.c | 1 +
include/linux/dax.h | 26 +++++++++++++
10 files changed, 129 insertions(+), 43 deletions(-)
3 years, 2 months
[PATCH v2 00/17] kunit: introduce KUnit, the Linux kernel unit testing framework
by Brendan Higgins
## TLDR
I rebased the last patchset on 5.1-rc7 in hopes that we can get this in
5.2.
Shuah, I think you, Greg KH, and myself talked off thread, and we agreed
we would merge through your tree when the time came? Am I remembering
correctly?
## Background
This patch set proposes KUnit, a lightweight unit testing and mocking
framework for the Linux kernel.
Unlike Autotest and kselftest, KUnit is a true unit testing framework;
it does not require installing the kernel on a test machine or in a VM
and does not require tests to be written in userspace running on a host
kernel. Additionally, KUnit is fast: From invocation to completion KUnit
can run several dozen tests in under a second. Currently, the entire
KUnit test suite for KUnit runs in under a second from the initial
invocation (build time excluded).
KUnit is heavily inspired by JUnit, Python's unittest.mock, and
Googletest/Googlemock for C++. KUnit provides facilities for defining
unit test cases, grouping related test cases into test suites, providing
common infrastructure for running tests, mocking, spying, and much more.
## What's so special about unit testing?
A unit test is supposed to test a single unit of code in isolation,
hence the name. There should be no dependencies outside the control of
the test; this means no external dependencies, which makes tests orders
of magnitude faster. Likewise, since there are no external dependencies,
there are no hoops to jump through to run the tests. Additionally, this
makes unit tests deterministic: a failing unit test always indicates a
problem. Finally, because unit tests necessarily have finer granularity,
they are able to test all code paths easily, solving the classic problem
of difficulty in exercising error handling code.
## Is KUnit trying to replace other testing frameworks for the kernel?
No. Most existing tests for the Linux kernel are end-to-end tests, which
have their place. A well tested system has lots of unit tests, a
reasonable number of integration tests, and some end-to-end tests. KUnit
is just trying to address the unit test space which is currently not
being addressed.
## More information on KUnit
There is a bunch of documentation near the end of this patch set that
describes how to use KUnit and best practices for writing unit tests.
For convenience I am hosting the compiled docs here:
https://google.github.io/kunit-docs/third_party/kernel/docs/
Additionally for convenience, I have applied these patches to a branch:
https://kunit.googlesource.com/linux/+/kunit/rfc/v5.1-rc7/v1
The repo may be cloned with:
git clone https://kunit.googlesource.com/linux
This patchset is on the kunit/rfc/v5.1-rc7/v1 branch.
## Changes Since Last Version
None. I just rebased the last patchset on v5.1-rc7.
--
2.21.0.593.g511ec345e18-goog
3 years, 2 months
[ndctl PATCH v3 00/10] daxctl: add a new reconfigure-device command
by Vishal Verma
Changes in v3:
- In daxctl_dev_get_mode(), remove the subsystem warning, detect dax-class
and simply make it return devdax
Changes in v2:
- Add examples to the documentation page (Dave Hansen)
- Clarify documentation regarding the conversion from system-ram to devdax
- Remove any references to a persistent config from the documentation -
those can be added when the feature is added.
- device.c: validate option compatibility
- daxctl-list: display numa_node for device listings
- daxctl-list: display mode for device listings
- make the options more consistent by adding a '-O' short option
for --attempt-offline
Add a new daxctl-reconfigure-device command that lets us reconfigure DAX
devices back and forth between 'system-ram' and 'device-dax' modes. It
also includes facilities to online any newly hot-plugged memory
(default), and attempt to offline memory before converting away from the
system-ram mode (not default, requires a --attempt-offline option).
Currently missing from this series is a way to persistently store which
devices have been 'marked' for use as system-ram. This depends on a
config system overhaul in ndctl, and patches for those will follow
separately and are independent of this work.
Example invocations:
1. Reconfigure dax0.0 to system-ram mode, don’t online the memory
# daxctl reconfigure-device --mode=system-ram --no-online dax0.0
[
{
"chardev":"dax0.0",
"size":16777216000,
"numa_node":2,
"mode":"system-ram"
}
]
2. Reconfigure dax0.0 to devdax mode, attempt to offline the memory
# daxctl reconfigure-device --human --mode=devdax --attempt-offline dax0.0
{
"chardev":"dax0.0",
"size":"15.63 GiB (16.78 GB)",
"numa_node":2,
"mode":"devdax"
}
3. Reconfigure all dax devices on region0 to system-ram mode
# daxctl reconfigure-device --mode=system-ram --region=0 all
[
{
"chardev":"dax0.0",
"size":16777216000,
"numa_node":2,
"mode":"system-ram"
},
{
"chardev":"dax0.1",
"size":16777216000,
"numa_node":3,
"mode":"system-ram"
}
]
These patches can also be found in the 'kmem-pending' branch on github:
https://github.com/pmem/ndctl/tree/kmem-pending
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Pavel Tatashin <pasha.tatashin(a)soleen.com>
Vishal Verma (10):
libdaxctl: add interfaces in support of device modes
libdaxctl: cache 'subsystem' in daxctl_ctx
libdaxctl: add interfaces to enable/disable devices
libdaxctl: add interfaces to get/set the online state for a node
daxctl/list: add numa_node for device listings
libdaxctl: add an interface to get the mode for a dax device
daxctl: add a new reconfigure-device command
Documentation/daxctl: add a man page for daxctl-reconfigure-device
contrib/ndctl: fix region-id completions for daxctl
contrib/ndctl: add bash-completion for daxctl-reconfigure-device
Documentation/daxctl/Makefile.am | 3 +-
.../daxctl/daxctl-reconfigure-device.txt | 118 ++++
contrib/ndctl | 34 +-
daxctl/Makefile.am | 2 +
daxctl/builtin.h | 1 +
daxctl/daxctl.c | 1 +
daxctl/device.c | 237 ++++++++
daxctl/lib/Makefile.am | 3 +-
daxctl/lib/libdaxctl-private.h | 21 +
daxctl/lib/libdaxctl.c | 552 +++++++++++++++++-
daxctl/lib/libdaxctl.sym | 14 +
daxctl/libdaxctl.h | 16 +
util/json.c | 22 +
13 files changed, 1013 insertions(+), 11 deletions(-)
create mode 100644 Documentation/daxctl/daxctl-reconfigure-device.txt
create mode 100644 daxctl/device.c
--
2.20.1
3 years, 2 months
[RFC PATCH] mm/nvdimm: Fix kernel crash on devm_memremap_pages_release
by Aneesh Kumar K.V
When we initialize the namespace, if we support altmap, we don't initialize all the
backing struct pages, whereas while releasing the namespace we look at some of
these uninitialized struct pages. This results in a kernel crash as below.
kernel BUG at include/linux/mm.h:1034!
cpu 0x2: Vector: 700 (Program Check) at [c00000024146b870]
pc: c0000000003788f8: devm_memremap_pages_release+0x258/0x3a0
lr: c0000000003788f4: devm_memremap_pages_release+0x254/0x3a0
sp: c00000024146bb00
msr: 800000000282b033
current = 0xc000000241382f00
paca = 0xc00000003fffd680 irqmask: 0x03 irq_happened: 0x01
pid = 4114, comm = ndctl
c0000000009bf8c0 devm_action_release+0x30/0x50
c0000000009c0938 release_nodes+0x268/0x2d0
c0000000009b95b4 device_release_driver_internal+0x164/0x230
c0000000009b638c unbind_store+0x13c/0x190
c0000000009b4f44 drv_attr_store+0x44/0x60
c00000000058ccc0 sysfs_kf_write+0x70/0xa0
c00000000058b52c kernfs_fop_write+0x1ac/0x290
c0000000004a415c __vfs_write+0x3c/0x70
c0000000004a85ac vfs_write+0xec/0x200
c0000000004a8920 ksys_write+0x80/0x130
c00000000000bee4 system_call+0x5c/0x70
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
---
mm/page_alloc.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 59661106da16..892eabe1ec13 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5740,8 +5740,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
#ifdef CONFIG_ZONE_DEVICE
/*
- * Honor reservation requested by the driver for this ZONE_DEVICE
- * memory. We limit the total number of pages to initialize to just
+ * We limit the total number of pages to initialize to just
* those that might contain the memory mapping. We will defer the
* ZONE_DEVICE page initialization until after we have released
* the hotplug lock.
@@ -5750,8 +5749,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
if (!altmap)
return;
- if (start_pfn == altmap->base_pfn)
- start_pfn += altmap->reserve;
end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
}
#endif
--
2.21.0
3 years, 2 months
[RFC PATCH 1/3] mm/nvdimm: Add PFN_MIN_VERSION support
by Aneesh Kumar K.V
This allows us to make changes in a backward-incompatible way. I have
kept PFN_MIN_VERSION at '0' in this patch because we are not introducing
any incompatible changes in this patch. We also may want to backport this
to older kernels.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
---
drivers/nvdimm/pfn.h | 9 ++++++++-
drivers/nvdimm/pfn_devs.c | 4 ++++
drivers/nvdimm/pmem.c | 26 ++++++++++++++++++++++----
3 files changed, 34 insertions(+), 5 deletions(-)
diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index dde9853453d3..1b10ae5773b6 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -20,6 +20,12 @@
#define PFN_SIG_LEN 16
#define PFN_SIG "NVDIMM_PFN_INFO\0"
#define DAX_SIG "NVDIMM_DAX_INFO\0"
+/*
+ * increment this when we are making changes such that older
+ * kernel should fail to initialize that namespace.
+ */
+
+#define PFN_MIN_VERSION 0
struct nd_pfn_sb {
u8 signature[PFN_SIG_LEN];
@@ -36,7 +42,8 @@ struct nd_pfn_sb {
__le32 end_trunc;
/* minor-version-2 record the base alignment of the mapping */
__le32 align;
- u8 padding[4000];
+ __le16 min_verison;
+ u8 padding[3998];
__le64 checksum;
};
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 01f40672507f..3250de70a7b3 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -439,6 +439,9 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0))
return -ENXIO;
+ if (le16_to_cpu(pfn_sb->min_version > PFN_MIN_VERSION))
+ return -EOPNOTSUPP;
+
if (memcmp(pfn_sb->signature, sig, PFN_SIG_LEN) != 0)
return -ENODEV;
@@ -769,6 +772,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
pfn_sb->version_major = cpu_to_le16(1);
pfn_sb->version_minor = cpu_to_le16(2);
+ pfn_sb->min_version = cpu_to_le16(PFN_MIN_VERSION);
pfn_sb->start_pad = cpu_to_le32(start_pad);
pfn_sb->end_trunc = cpu_to_le32(end_trunc);
pfn_sb->align = cpu_to_le32(nd_pfn->align);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 845c5b430cdd..406427c064d9 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -490,6 +490,7 @@ static int pmem_attach_disk(struct device *dev,
static int nd_pmem_probe(struct device *dev)
{
+ int ret;
struct nd_namespace_common *ndns;
ndns = nvdimm_namespace_common_probe(dev);
@@ -505,12 +506,29 @@ static int nd_pmem_probe(struct device *dev)
if (is_nd_pfn(dev))
return pmem_attach_disk(dev, ndns);
- /* if we find a valid info-block we'll come back as that personality */
- if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
- || nd_dax_probe(dev, ndns) == 0)
+ ret = nd_btt_probe(dev, ndns);
+ if (ret == 0)
return -ENXIO;
+ else if (ret == -EOPNOTSUPP)
+ return ret;
- /* ...otherwise we're just a raw pmem device */
+ ret = nd_pfn_probe(dev, ndns);
+ if (ret == 0)
+ return -ENXIO;
+ else if (ret == -EOPNOTSUPP)
+ return ret;
+
+ ret = nd_dax_probe(dev, ndns);
+ if (ret == 0)
+ return -ENXIO;
+ else if (ret == -EOPNOTSUPP)
+ return ret;
+ /*
+ * We have two failure conditions here, there is no
+ * info reserver block or we found a valid info reserve block
+ * but failed to initialize the pfn superblock.
+ * Don't create a raw pmem disk for the second case.
+ */
return pmem_attach_disk(dev, ndns);
}
--
2.21.0
3 years, 2 months
[PATCH] mm/nvdimm: Use correct #defines instead of opencoding
by Aneesh Kumar K.V
The npfns-related change is needed to fix the kernel message
"number of pfns truncated from 2617344 to 163584"
The change makes sure the npfns value stored in the superblock is correct.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
---
drivers/nvdimm/pfn_devs.c | 6 +++---
drivers/nvdimm/region_devs.c | 8 ++++----
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 347cab166376..6751ff0296ef 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -777,8 +777,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
* when populating the vmemmap. This *should* be equal to
* PMD_SIZE for most architectures.
*/
- offset = ALIGN(start + reserve + 64 * npfns,
- max(nd_pfn->align, PMD_SIZE)) - start;
+ offset = ALIGN(start + reserve + sizeof(struct page) * npfns,
+ max(nd_pfn->align, PMD_SIZE)) - start;
} else if (nd_pfn->mode == PFN_MODE_RAM)
offset = ALIGN(start + reserve, nd_pfn->align) - start;
else
@@ -790,7 +790,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
return -ENXIO;
}
- npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
+ npfns = (size - offset - start_pad - end_trunc) / PAGE_SIZE;
pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
pfn_sb->dataoff = cpu_to_le64(offset);
pfn_sb->npfns = cpu_to_le64(npfns);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index b4ef7d9ff22e..2d8facea5a03 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -994,10 +994,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
struct nd_mapping_desc *mapping = &ndr_desc->mapping[i];
struct nvdimm *nvdimm = mapping->nvdimm;
- if ((mapping->start | mapping->size) % SZ_4K) {
- dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n",
- caller, dev_name(&nvdimm->dev), i);
-
+ if ((mapping->start | mapping->size) % PAGE_SIZE) {
+ dev_err(&nvdimm_bus->dev,
+ "%s: %s mapping%d is not 4K aligned\n",
+ caller, dev_name(&nvdimm->dev), i);
return NULL;
}
--
2.21.0
3 years, 2 months
[PATCH v2] libnvdimm/pmem: Bypass CONFIG_HARDENED_USERCOPY overhead
by Dan Williams
Jeff discovered that performance improves from ~375K iops to ~519K iops
on a simple psync-write fio workload when moving the location of 'struct
page' from the default PMEM location to DRAM. This result is surprising
because the expectation is that 'struct page' for dax is only needed for
third party references to dax mappings. For example, a dax-mapped buffer
passed to another system call for direct-I/O requires 'struct page' for
sending the request down the driver stack and pinning the page. There is
no usage of 'struct page' for first party access to a file via
read(2)/write(2) and friends.
However, this "no page needed" expectation is violated by
CONFIG_HARDENED_USERCOPY and the check_copy_size() performed in
copy_from_iter_full_nocache() and copy_to_iter_mcsafe(). The
check_heap_object() helper routine assumes the buffer is backed by a
slab allocator (DRAM) page and applies some checks. Those checks are
invalid, dax pages do not originate from the slab, and redundant,
dax_iomap_actor() has already validated that the I/O is within bounds.
Specifically that routine validates that the logical file offset is
within bounds of the file, then it does a sector-to-pfn translation
which validates that the physical mapping is within bounds of the block
device.
Bypass additional hardened usercopy overhead and call the 'no check'
versions of the copy_{to,from}_iter operations directly.
Fixes: 0aed55af8834 ("x86, uaccess: introduce copy_from_iter_flushcache...")
Cc: <stable(a)vger.kernel.org>
Cc: Jeff Moyer <jmoyer(a)redhat.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Al Viro <viro(a)zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Matthew Wilcox <willy(a)infradead.org>
Reported-and-tested-by: Jeff Smits <jeff.smits(a)intel.com>
Acked-by: Kees Cook <keescook(a)chromium.org>
Acked-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
---
Changes since v1 [1]:
* Update the changelog to clarify which checks in dax_iomap_actor()
obviate the need for "hardened" checks. (Jan)
* Update the code comment in drivers/nvdimm/pmem.c to reflect the same.
* Collect some Acks from Kees and Jan.
[1]: https://lore.kernel.org/lkml/155805321833.867447.3864104616303535270.stgi...
drivers/nvdimm/pmem.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 845c5b430cdd..c894f45e5077 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -281,16 +281,21 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
}
+/*
+ * Use the 'no check' versions of copy_from_iter_flushcache() and
+ * copy_to_iter_mcsafe() to bypass HARDENED_USERCOPY overhead. Bounds
+ * checking is handled by dax_iomap_actor()
+ */
static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
- return copy_from_iter_flushcache(addr, bytes, i);
+ return _copy_from_iter_flushcache(addr, bytes, i);
}
static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
- return copy_to_iter_mcsafe(addr, bytes, i);
+ return _copy_to_iter_mcsafe(addr, bytes, i);
}
static const struct dax_operations pmem_dax_ops = {
3 years, 2 months