[PATCH v2] mm, memory-failure: clarify error message
by Jane Chu
Some users who install a SIGBUS handler that does a longjmp out,
thereby keeping the process alive, are confused by the error
message
"[188988.765862] Memory failure: 0x1840200: Killing
cellsrv:33395 due to hardware memory corruption"
Slightly modify the error message to improve clarity.
Signed-off-by: Jane Chu <jane.chu(a)oracle.com>
---
mm/memory-failure.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fc8b517..c4f4bcd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -216,7 +216,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
short addr_lsb = tk->size_shift;
int ret;
- pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
+ pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
--
1.8.3.1
3 years, 1 month
[linux-nvdimm:libnvdimm-fixes 2/3] drivers//dax/super.c:76:6: error: redefinition of 'generic_fsdax_supported'
by kbuild test robot
tree: https://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git libnvdimm-fixes
head: ac09ef3d6de21992ad8e372dc97017cc3c46e15e
commit: 17a1362dd849b1c01dedcda38a6c0f5e26582407 [2/3] dax: Arrange for dax_supported check to span multiple devices
config: i386-randconfig-x006-201920 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
git checkout 17a1362dd849b1c01dedcda38a6c0f5e26582407
# save the attached .config to linux build tree
make ARCH=i386
If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp(a)intel.com>
All errors (new ones prefixed by >>):
>> drivers//dax/super.c:76:6: error: redefinition of 'generic_fsdax_supported'
bool generic_fsdax_supported(struct dax_device *dax_dev,
^~~~~~~~~~~~~~~~~~~~~~~
In file included from drivers//dax/super.c:23:0:
include/linux/dax.h:112:20: note: previous definition of 'generic_fsdax_supported' was here
static inline bool generic_fsdax_supported(struct dax_device *dax_dev,
^~~~~~~~~~~~~~~~~~~~~~~
vim +/generic_fsdax_supported +76 drivers//dax/super.c
75
> 76 bool generic_fsdax_supported(struct dax_device *dax_dev,
77 struct block_device *bdev, int blocksize, sector_t start,
78 sector_t sectors)
79 {
80 bool dax_enabled = false;
81 pgoff_t pgoff, pgoff_end;
82 char buf[BDEVNAME_SIZE];
83 void *kaddr, *end_kaddr;
84 pfn_t pfn, end_pfn;
85 sector_t last_page;
86 long len, len2;
87 int err, id;
88
89 if (blocksize != PAGE_SIZE) {
90 pr_debug("%s: error: unsupported blocksize for dax\n",
91 bdevname(bdev, buf));
92 return false;
93 }
94
95 err = bdev_dax_pgoff(bdev, start, PAGE_SIZE, &pgoff);
96 if (err) {
97 pr_debug("%s: error: unaligned partition for dax\n",
98 bdevname(bdev, buf));
99 return false;
100 }
101
102 last_page = PFN_DOWN((start + sectors - 1) * 512) * PAGE_SIZE / 512;
103 err = bdev_dax_pgoff(bdev, last_page, PAGE_SIZE, &pgoff_end);
104 if (err) {
105 pr_debug("%s: error: unaligned partition for dax\n",
106 bdevname(bdev, buf));
107 return false;
108 }
109
110 id = dax_read_lock();
111 len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
112 len2 = dax_direct_access(dax_dev, pgoff_end, 1, &end_kaddr, &end_pfn);
113 dax_read_unlock(id);
114
115 if (len < 1 || len2 < 1) {
116 pr_debug("%s: error: dax access failed (%ld)\n",
117 bdevname(bdev, buf), len < 1 ? len : len2);
118 return false;
119 }
120
121 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) {
122 /*
123 * An arch that has enabled the pmem api should also
124 * have its drivers support pfn_t_devmap()
125 *
126 * This is a developer warning and should not trigger in
127 * production. dax_flush() will crash since it depends
128 * on being able to do (page_address(pfn_to_page())).
129 */
130 WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
131 dax_enabled = true;
132 } else if (pfn_t_devmap(pfn) && pfn_t_devmap(end_pfn)) {
133 struct dev_pagemap *pgmap, *end_pgmap;
134
135 pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
136 end_pgmap = get_dev_pagemap(pfn_t_to_pfn(end_pfn), NULL);
137 if (pgmap && pgmap == end_pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX
138 && pfn_t_to_page(pfn)->pgmap == pgmap
139 && pfn_t_to_page(end_pfn)->pgmap == pgmap
140 && pfn_t_to_pfn(pfn) == PHYS_PFN(__pa(kaddr))
141 && pfn_t_to_pfn(end_pfn) == PHYS_PFN(__pa(end_kaddr)))
142 dax_enabled = true;
143 put_dev_pagemap(pgmap);
144 put_dev_pagemap(end_pgmap);
145
146 }
147
148 if (!dax_enabled) {
149 pr_debug("%s: error: dax support not enabled\n",
150 bdevname(bdev, buf));
151 return false;
152 }
153 return true;
154 }
155 EXPORT_SYMBOL_GPL(generic_fsdax_supported);
156
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
3 years, 1 month
[PATCH] mm, memory-failure: clarify error message
by Jane Chu
Some users who install a SIGBUS handler that does a longjmp out,
thereby keeping the process alive, are confused by the error
message
"[188988.765862] Memory failure: 0x1840200: Killing
cellsrv:33395 due to hardware memory corruption"
Slightly modify the error message to improve clarity.
Signed-off-by: Jane Chu <jane.chu(a)oracle.com>
---
mm/memory-failure.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fc8b517..14de5e2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -216,10 +216,9 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
short addr_lsb = tk->size_shift;
int ret;
- pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
- pfn, t->comm, t->pid);
-
if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
+ pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory "
+ "corruption\n", pfn, t->comm, t->pid);
ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
addr_lsb, current);
} else {
@@ -229,6 +228,8 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
* This could cause a loop when the user sets SIGBUS
* to SIG_IGN, but hopefully no one will do that?
*/
+ pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware "
+ "memory corruption\n", pfn, t->comm, t->pid);
ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
addr_lsb, t); /* synchronous? */
}
--
1.8.3.1
3 years, 1 month
[PATCH] dax: Fix last_page check in __bdev_dax_supported()
by Vaibhav Jain
Presently __bdev_dax_supported() checks if first sector of last
page ( last_page ) on the block device is aligned to page
boundary. However the code to compute 'last_page' assumes that there
are 8 sectors/page assuming a 4K page-size.
This assumption breaks on architectures which use a different page
size specifically PPC64 where page-size == 64K. Hence a warning is
seen while trying to mount a xfs/ext4 file-system with dax enabled:
$ sudo mount -o dax /dev/pmem0 /mnt/pmem
XFS (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your own risk
XFS (pmem0): DAX unsupported by block device. Turning off DAX.
The patch fixes this issue by updating the calculation of 'last_page' to
take into account the number of sectors per page instead of assuming it
to be '8'.
Fixes: ad428cdb525a ("dax: Check the end of the block-device capacity with dax_direct_access()")
Signed-off-by: Chandan Rajendra <chandan(a)linux.ibm.com>
Signed-off-by: Vaibhav Jain <vaibhav(a)linux.ibm.com>
---
drivers/dax/super.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index bbd57ca0634a..6b50ed3673d3 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -116,7 +116,9 @@ bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
return false;
}
- last_page = PFN_DOWN(i_size_read(bdev->bd_inode) - 1) * 8;
+ /* Calculate the first sector of last page on the block device */
+ last_page = PFN_DOWN(i_size_read(bdev->bd_inode) - 1) *
+ (PAGE_SIZE >> SECTOR_SHIFT);
err = bdev_dax_pgoff(bdev, last_page, PAGE_SIZE, &pgoff_end);
if (err) {
pr_debug("%s: error: unaligned partition for dax\n",
--
2.21.0
3 years, 1 month
[PATCH] libnvdimm/pmem: Bypass CONFIG_HARDENED_USERCOPY overhead
by Dan Williams
Jeff discovered that performance improves from ~375K iops to ~519K iops
on a simple psync-write fio workload when moving the location of 'struct
page' from the default PMEM location to DRAM. This result is surprising
because the expectation is that 'struct page' for dax is only needed for
third party references to dax mappings. For example, a dax-mapped buffer
passed to another system call for direct-I/O requires 'struct page' for
sending the request down the driver stack and pinning the page. There is
no usage of 'struct page' for first party access to a file via
read(2)/write(2) and friends.
However, this "no page needed" expectation is violated by
CONFIG_HARDENED_USERCOPY and the check_copy_size() performed in
copy_from_iter_full_nocache() and copy_to_iter_mcsafe(). The
check_heap_object() helper routine assumes the buffer is backed by a
page-allocator DRAM page and applies some checks. Those checks are
invalid, dax pages are not from the heap, and redundant,
dax_iomap_actor() has already validated that the I/O is within bounds.
Bypass this overhead and call the 'no check' versions of the
copy_{to,from}_iter operations directly.
Fixes: 0aed55af8834 ("x86, uaccess: introduce copy_from_iter_flushcache...")
Cc: Jan Kara <jack(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Cc: Jeff Moyer <jmoyer(a)redhat.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Al Viro <viro(a)zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Matthew Wilcox <willy(a)infradead.org>
Reported-and-tested-by: Jeff Smits <jeff.smits(a)intel.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
---
drivers/nvdimm/pmem.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 845c5b430cdd..c894f45e5077 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -281,16 +281,21 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev,
return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn);
}
+/*
+ * Use the 'no check' versions of copy_from_iter_flushcache() and
+ * copy_to_iter_mcsafe() to bypass HARDENED_USERCOPY overhead. Bounds
+ * checking is handled by dax_iomap_actor()
+ */
static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
- return copy_from_iter_flushcache(addr, bytes, i);
+ return _copy_from_iter_flushcache(addr, bytes, i);
}
static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
- return copy_to_iter_mcsafe(addr, bytes, i);
+ return _copy_to_iter_mcsafe(addr, bytes, i);
}
static const struct dax_operations pmem_dax_ops = {
3 years, 1 month
[PATCH v2] mm/nvdimm: Pick the right alignment default when creating dax devices
by Aneesh Kumar K.V
Allow arch to provide the supported alignments and use hugepage alignment only
if we support hugepage. Right now we depend on compile time configs whereas this
patch switch this to runtime discovery.
Architectures like ppc64 can have THP enabled in code, but then can have
hugepage size disabled by the hypervisor. This allows us to create dax devices
with PAGE_SIZE alignment in this case.
Existing dax namespace with alignment larger than PAGE_SIZE will fail to
initialize in this specific case. We still allow fsdax namespace initialization.
With respect to identifying whether to enable hugepage fault for a dax device,
if THP is enabled during compile, we default to taking hugepage fault and in dax
fault handler if we find the fault size > alignment we retry with PAGE_SIZE
fault size.
This also addresses the below failure scenario on ppc64
ndctl create-namespace --mode=devdax | grep align
"align":16777216,
"align":16777216
cat /sys/devices/ndbus0/region0/dax0.0/supported_alignments
65536 16777216
daxio.static-debug -z -o /dev/dax0.0
Bus error (core dumped)
$ dmesg | tail
lpar: Failed hash pte insert with error -4
hash-mmu: mm: Hashing failure ! EA=0x7fff17000000 access=0x8000000000000006 current=daxio
hash-mmu: trap=0x300 vsid=0x22cb7a3 ssize=1 base psize=2 psize 10 pte=0xc000000501002b86
daxio[3860]: bus error (7) at 7fff17000000 nip 7fff973c007c lr 7fff973bff34 code 2 in libpmem.so.1.0.0[7fff973b0000+20000]
daxio[3860]: code: 792945e4 7d494b78 e95f0098 7d494b78 f93f00a0 4800012c e93f0088 f93f0120
daxio[3860]: code: e93f00a0 f93f0128 e93f0120 e95f0128 <f9490000> e93f0088 39290008 f93f0110
The failure was due to guest kernel using wrong page size.
The namespaces created with 16M alignment will appear as below on a config with
16M page size disabled.
$ ndctl list -Ni
[
{
"dev":"namespace0.1",
"mode":"fsdax",
"map":"dev",
"size":5351931904,
"uuid":"fc6e9667-461a-4718-82b4-69b24570bddb",
"align":16777216,
"blockdev":"pmem0.1",
"supported_alignments":[
65536
]
},
{
"dev":"namespace0.0",
"mode":"fsdax", <==== devdax 16M alignment marked disabled.
"map":"mem",
"size":5368709120,
"uuid":"a4bdf81a-f2ee-4bc6-91db-7b87eddd0484",
"state":"disabled"
}
]
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
---
Changes from V1:
* Return -EOPNOTSUPP if we find that we can't initialize the namespace
* Compare fix for DAX signature.
arch/powerpc/include/asm/libnvdimm.h | 9 ++++++++
arch/powerpc/mm/Makefile | 1 +
arch/powerpc/mm/nvdimm.c | 34 ++++++++++++++++++++++++++++
arch/x86/include/asm/libnvdimm.h | 19 ++++++++++++++++
drivers/nvdimm/nd.h | 6 -----
drivers/nvdimm/pfn_devs.c | 32 +++++++++++++++++++++++++-
drivers/nvdimm/pmem.c | 26 +++++++++++++++++----
include/linux/huge_mm.h | 7 +++++-
8 files changed, 122 insertions(+), 12 deletions(-)
create mode 100644 arch/powerpc/include/asm/libnvdimm.h
create mode 100644 arch/powerpc/mm/nvdimm.c
create mode 100644 arch/x86/include/asm/libnvdimm.h
diff --git a/arch/powerpc/include/asm/libnvdimm.h b/arch/powerpc/include/asm/libnvdimm.h
new file mode 100644
index 000000000000..d35fd7f48603
--- /dev/null
+++ b/arch/powerpc/include/asm/libnvdimm.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_LIBNVDIMM_H
+#define _ASM_POWERPC_LIBNVDIMM_H
+
+#define nd_pfn_supported_alignments nd_pfn_supported_alignments
+extern unsigned long *nd_pfn_supported_alignments(void);
+extern unsigned long nd_pfn_default_alignment(void);
+
+#endif
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 0f499db315d6..42e4a399ba5d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_HIGHMEM) += highmem.o
obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
obj-$(CONFIG_PPC_PTDUMP) += ptdump/
obj-$(CONFIG_KASAN) += kasan/
+obj-$(CONFIG_NVDIMM_PFN) += nvdimm.o
diff --git a/arch/powerpc/mm/nvdimm.c b/arch/powerpc/mm/nvdimm.c
new file mode 100644
index 000000000000..a29a4510715e
--- /dev/null
+++ b/arch/powerpc/mm/nvdimm.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+
+#include <linux/mm.h>
+/*
+ * We support only pte and pmd mappings for now.
+ */
+const unsigned long *nd_pfn_supported_alignments(void)
+{
+ static unsigned long supported_alignments[3];
+
+ supported_alignments[0] = PAGE_SIZE;
+
+ if (has_transparent_hugepage())
+ supported_alignments[1] = HPAGE_PMD_SIZE;
+ else
+ supported_alignments[1] = 0;
+
+ supported_alignments[2] = 0;
+ return supported_alignments;
+}
+
+/*
+ * Use pmd mapping if supported as default alignment
+ */
+unsigned long nd_pfn_default_alignment(void)
+{
+
+ if (has_transparent_hugepage())
+ return HPAGE_PMD_SIZE;
+ return PAGE_SIZE;
+}
diff --git a/arch/x86/include/asm/libnvdimm.h b/arch/x86/include/asm/libnvdimm.h
new file mode 100644
index 000000000000..3d5361db9164
--- /dev/null
+++ b/arch/x86/include/asm/libnvdimm.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_LIBNVDIMM_H
+#define _ASM_X86_LIBNVDIMM_H
+
+static inline unsigned long nd_pfn_default_alignment(void)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ return HPAGE_PMD_SIZE;
+#else
+ return PAGE_SIZE;
+#endif
+}
+
+static inline unsigned long nd_altmap_align_size(unsigned long nd_align)
+{
+ return PMD_SIZE;
+}
+
+#endif
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index a5ac3b240293..44fe923b2ee3 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -292,12 +292,6 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region)
struct nd_pfn *to_nd_pfn(struct device *dev);
#if IS_ENABLED(CONFIG_NVDIMM_PFN)
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PFN_DEFAULT_ALIGNMENT HPAGE_PMD_SIZE
-#else
-#define PFN_DEFAULT_ALIGNMENT PAGE_SIZE
-#endif
-
int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns);
bool is_nd_pfn(struct device *dev);
struct device *nd_pfn_create(struct nd_region *nd_region);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 01f40672507f..cd0969ccbe30 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <asm/libnvdimm.h>
#include "nd-core.h"
#include "pfn.h"
#include "nd.h"
@@ -111,6 +112,8 @@ static ssize_t align_show(struct device *dev,
return sprintf(buf, "%ld\n", nd_pfn->align);
}
+#ifndef nd_pfn_supported_alignments
+#define nd_pfn_supported_alignments nd_pfn_supported_alignments
static const unsigned long *nd_pfn_supported_alignments(void)
{
/*
@@ -133,6 +136,7 @@ static const unsigned long *nd_pfn_supported_alignments(void)
return data;
}
+#endif
static ssize_t align_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
@@ -310,7 +314,7 @@ struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
return NULL;
nd_pfn->mode = PFN_MODE_NONE;
- nd_pfn->align = PFN_DEFAULT_ALIGNMENT;
+ nd_pfn->align = nd_pfn_default_alignment();
dev = &nd_pfn->dev;
device_initialize(&nd_pfn->dev);
if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) {
@@ -420,6 +424,20 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn)
return 0;
}
+static bool nd_supported_alignment(unsigned long align)
+{
+ int i;
+ const unsigned long *supported = nd_pfn_supported_alignments();
+
+ if (align == 0)
+ return false;
+
+ for (i = 0; supported[i]; i++)
+ if (align == supported[i])
+ return true;
+ return false;
+}
+
int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
{
u64 checksum, offset;
@@ -474,6 +492,18 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
align = 1UL << ilog2(offset);
mode = le32_to_cpu(pfn_sb->mode);
+ /*
+ * Check whether the we support the alignment. For Dax if the
+ * superblock alignment is not matching, we won't initialize
+ * the device.
+ */
+ if (!nd_supported_alignment(align) &&
+ !memcmp(pfn_sb->signature, DAX_SIG, PFN_SIG_LEN)) {
+ dev_err(&nd_pfn->dev, "init failed, settings mismatch\n");
+ dev_dbg(&nd_pfn->dev, "align: %lx:%lx\n", nd_pfn->align, align);
+ return -EOPNOTSUPP;
+ }
+
if (!nd_pfn->uuid) {
/*
* When probing a namepace via nd_pfn_probe() the uuid
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 845c5b430cdd..406427c064d9 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -490,6 +490,7 @@ static int pmem_attach_disk(struct device *dev,
static int nd_pmem_probe(struct device *dev)
{
+ int ret;
struct nd_namespace_common *ndns;
ndns = nvdimm_namespace_common_probe(dev);
@@ -505,12 +506,29 @@ static int nd_pmem_probe(struct device *dev)
if (is_nd_pfn(dev))
return pmem_attach_disk(dev, ndns);
- /* if we find a valid info-block we'll come back as that personality */
- if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
- || nd_dax_probe(dev, ndns) == 0)
+ ret = nd_btt_probe(dev, ndns);
+ if (ret == 0)
return -ENXIO;
+ else if (ret == -EOPNOTSUPP)
+ return ret;
- /* ...otherwise we're just a raw pmem device */
+ ret = nd_pfn_probe(dev, ndns);
+ if (ret == 0)
+ return -ENXIO;
+ else if (ret == -EOPNOTSUPP)
+ return ret;
+
+ ret = nd_dax_probe(dev, ndns);
+ if (ret == 0)
+ return -ENXIO;
+ else if (ret == -EOPNOTSUPP)
+ return ret;
+ /*
+ * We have two failure conditions here, there is no
+ * info reserver block or we found a valid info reserve block
+ * but failed to initialize the pfn superblock.
+ * Don't create a raw pmem disk for the second case.
+ */
return pmem_attach_disk(dev, ndns);
}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 381e872bfde0..d5cfea3d8b86 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -110,7 +110,12 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
return true;
-
+ /*
+ * For dax let's try to do hugepage fault always. If we don't support
+ * hugepages we will not have enabled namespaces with hugepage alignment.
+ * This also means we try to handle hugepage fault on device with
+ * smaller alignment. But for then we will return with VM_FAULT_FALLBACK
+ */
if (vma_is_dax(vma))
return true;
--
2.21.0
3 years, 1 month
[v5 0/3] "Hotremove" persistent memory
by Pavel Tatashin
Changelog:
v5
- Addressed comments from Dan Williams: made remove_memory() to return
an error code, and use this function from dax.
v4
- Addressed comments from Dave Hansen
v3
- Addressed comments from David Hildenbrand. Don't release
lock_device_hotplug after checking memory status, and rename
memblock_offlined_cb() to check_memblock_offlined_cb()
v2
- Dan Williams mentioned that drv->remove() return is ignored
by unbind. Unbind always succeeds. Because we cannot guarantee
that memory can be offlined from the driver, don't even
attempt to do so. Simply check that every section is offlined
beforehand and only then proceed with removing dax memory.
---
Recently, support for adding persistent memory to be used like regular
RAM was added to Linux. This work extends that functionality to also
allow hot removing persistent memory.
We (Microsoft) have an important use case for this functionality.
The requirement is for physical machines with small amount of RAM (~8G)
to be able to reboot in a very short period of time (<1s). Yet, there is
a userland state that is expensive to recreate (~2G).
The solution is to boot machines with 2G preserved for persistent
memory.
Copy the state, and hotadd the persistent memory so machine still has
all 8G available for runtime. Before reboot, offline and hotremove
device-dax 2G, copy the memory that is needed to be preserved to pmem0
device, and reboot.
The series of operations look like this:
1. After boot restore /dev/pmem0 to ramdisk to be consumed by apps.
and free ramdisk.
2. Convert raw pmem0 to devdax
ndctl create-namespace --mode devdax --map mem -e namespace0.0 -f
3. Hotadd to System RAM
echo dax0.0 > /sys/bus/dax/drivers/device_dax/unbind
echo dax0.0 > /sys/bus/dax/drivers/kmem/new_id
echo online_movable > /sys/devices/system/memoryXXX/state
4. Before reboot hotremove device-dax memory from System RAM
echo offline > /sys/devices/system/memoryXXX/state
echo dax0.0 > /sys/bus/dax/drivers/kmem/unbind
5. Create raw pmem0 device
ndctl create-namespace --mode raw -e namespace0.0 -f
6. Copy the state that was stored by apps to ramdisk to pmem device
7. Do kexec reboot or reboot through firmware if firmware does not
zero memory in pmem0 region (These machines have only regular
volatile memory). So to have pmem0 device either memmap kernel
parameter is used, or devices nodes in dtb are specified.
Pavel Tatashin (3):
device-dax: fix memory and resource leak if hotplug fails
mm/hotplug: make remove_memory() interface useable
device-dax: "Hotremove" persistent memory that is used like normal RAM
drivers/dax/dax-private.h | 2 ++
drivers/dax/kmem.c | 46 ++++++++++++++++++++++---
include/linux/memory_hotplug.h | 8 +++--
mm/memory_hotplug.c | 61 ++++++++++++++++++++++------------
4 files changed, 89 insertions(+), 28 deletions(-)
--
2.21.0
3 years, 1 month
NULL pointer dereference during memory hotremove
by Pavel Tatashin
This panic is unrelated to circular lock issue that I reported in a
separate thread, that also happens during memory hotremove.
xakep ~/x/linux$ git describe
v5.1-12317-ga6a4b66bd8f4
Config is attached, qemu script is following:
qemu-system-x86_64 \
-enable-kvm \
-cpu host \
-parallel none \
-echr 1 \
-serial none \
-chardev stdio,id=console,signal=off,mux=on \
-serial chardev:console \
-mon chardev=console \
-vga none \
-display none \
-kernel pmem/native/arch/x86/boot/bzImage \
-m 8G,slots=1,maxmem=16G \
-smp 8 \
-fsdev local,id=virtfs1,path=/,security_model=none \
-device virtio-9p-pci,fsdev=virtfs1,mount_tag=hostfs \
-append 'earlyprintk=serial,ttyS0,115200 console=ttyS0
TERM=xterm ip=dhcp memmap=2G!6G loglevel=7'
The unusual case with this script is that 2G reserved for pmem device:
memmap=2G!6G. Otherwise, it is a normal layout. Unfortunately, it does
not happen every time, but I have hit it a couple times.
# QEMU 4.0.0 monitor - type 'help' for more information
(qemu) object_add memory-backend-ram,id=mem1,size=1G
(qemu) device_add pc-dimm,id=dimm1,memdev=mem1
# echo online_movable > /sys/devices/system/memory/memory79/state
[ 40.219090] Built 1 zonelists, mobility grouping on. Total pages: 1529279
[ 40.223258] Policy zone: Normal
# (qemu) device_del dimm1
(qemu) [ 49.624600] Offlined Pages 32768
[ 49.625796] Built 1 zonelists, mobility grouping on. Total pages: 1516352
[ 49.627841] Policy zone: Normal
[ 49.630932] BUG: kernel NULL pointer dereference, address: 0000000000000698
[ 49.633704] #PF: supervisor read access in kernel mode
[ 49.635689] #PF: error_code(0x0000) - not-present page
[ 49.637620] PGD 8000000236b59067 P4D 8000000236b59067 PUD 2358fe067 PMD 0
[ 49.640163] Oops: 0000 [#1] SMP PTI
[ 49.641223] CPU: 0 PID: 7 Comm: kworker/u16:0 Not tainted 5.1.0_pt_pmem #38
[ 49.643183] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.12.0-20181126_142135-anatol 04/01/2014
[ 49.645858] Workqueue: kacpi_hotplug acpi_hotplug_work_fn
[ 49.647101] RIP: 0010:__remove_pages+0x1a/0x460
[ 49.648165] Code: e9 bb a9 fd ff 0f 0b 66 0f 1f 84 00 00 00 00 00
41 57 48 89 f8 49 89 ff 41 56 49 89 f6 41 55 41 54 55 53 48 89 d3 48
83 ec 50 <48> 2b 47 58 48 89 4c 24 48 48 3d 00 19 00 00 75 09 48 85 c9
0f 85
[ 49.651925] RSP: 0018:ffffbd1000c8fcb8 EFLAGS: 00010286
[ 49.652857] RAX: 0000000000000640 RBX: 0000000000040000 RCX: 0000000000000000
[ 49.654139] RDX: 0000000000040000 RSI: 0000000000240000 RDI: 0000000000000640
[ 49.655393] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000040000000
[ 49.656523] R10: 0000000040000000 R11: 0000000240000000 R12: 0000000040000000
[ 49.657654] R13: 0000000240000000 R14: 0000000000240000 R15: 0000000000000640
[ 49.658828] FS: 0000000000000000(0000) GS:ffff9b4bf9800000(0000)
knlGS:0000000000000000
[ 49.660178] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 49.661033] CR2: 0000000000000698 CR3: 00000002382e0006 CR4: 0000000000360ef0
[ 49.662114] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 49.663172] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 49.664243] Call Trace:
[ 49.664622] ? memblock_isolate_range+0xc4/0x139
[ 49.665290] ? firmware_map_add_hotplug+0x7e/0xde
[ 49.665908] ? memblock_remove_region+0x30/0x74
[ 49.666498] arch_remove_memory+0x6f/0xa0
[ 49.667012] __remove_memory+0xab/0x130
[ 49.667492] ? walk_memory_range+0xa1/0xe0
[ 49.668008] acpi_memory_device_remove+0x67/0xe0
[ 49.668595] acpi_bus_trim+0x50/0x90
[ 49.669051] acpi_device_hotplug+0x2fa/0x3e0
[ 49.669590] acpi_hotplug_work_fn+0x15/0x20
[ 49.670116] process_one_work+0x2a0/0x650
[ 49.670577] worker_thread+0x34/0x3d0
[ 49.670997] ? process_one_work+0x650/0x650
[ 49.671503] kthread+0x118/0x130
[ 49.671879] ? kthread_create_on_node+0x60/0x60
[ 49.672411] ret_from_fork+0x3a/0x50
[ 49.672836] Modules linked in:
[ 49.673190] CR2: 0000000000000698
[ 49.673583] ---[ end trace 6b727d3a8ce48aa1 ]---
[ 49.674120] RIP: 0010:__remove_pages+0x1a/0x460
[ 49.674624] Code: e9 bb a9 fd ff 0f 0b 66 0f 1f 84 00 00 00 00 00
41 57 48 89 f8 49 89 ff 41 56 49 89 f6 41 55 41 54 55 53 48 89 d3 48
83 ec 50 <48> 2b 47 58 48 89 4c 24 48 48 3d 00 19 00 00 75 09 48 85 c9
0f 85
[ 49.676600] RSP: 0018:ffffbd1000c8fcb8 EFLAGS: 00010286
[ 49.677159] RAX: 0000000000000640 RBX: 0000000000040000 RCX: 0000000000000000
[ 49.677960] RDX: 0000000000040000 RSI: 0000000000240000 RDI: 0000000000000640
[ 49.678813] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000040000000
[ 49.679633] R10: 0000000040000000 R11: 0000000240000000 R12: 0000000040000000
[ 49.680455] R13: 0000000240000000 R14: 0000000000240000 R15: 0000000000000640
[ 49.681243] FS: 0000000000000000(0000) GS:ffff9b4bf9800000(0000)
knlGS:0000000000000000
[ 49.682168] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 49.682813] CR2: 0000000000000698 CR3: 00000002382e0006 CR4: 0000000000360ef0
[ 49.683573] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 49.684239] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 49.684901] BUG: sleeping function called from invalid context at
include/linux/percpu-rwsem.h:34
[ 49.685690] in_atomic(): 0, irqs_disabled(): 1, pid: 7, name: kworker/u16:0
[ 49.686314] INFO: lockdep is turned off.
[ 49.686684] irq event stamp: 22546
[ 49.687003] hardirqs last enabled at (22545): [<ffffffff8c1fe5ba>]
kfree+0xba/0x230
[ 49.687692] hardirqs last disabled at (22546): [<ffffffff8c001b53>]
trace_hardirqs_off_thunk+0x1a/0x1c
[ 49.688561] softirqs last enabled at (22526): [<ffffffff8ce0033e>]
__do_softirq+0x33e/0x455
[ 49.689348] softirqs last disabled at (22519): [<ffffffff8c06ea36>]
irq_exit+0xb6/0xc0
[ 49.690088] CPU: 0 PID: 7 Comm: kworker/u16:0 Tainted: G D
5.1.0_pt_pmem #38
[ 49.690811] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.12.0-20181126_142135-anatol 04/01/2014
[ 49.691704] Workqueue: kacpi_hotplug acpi_hotplug_work_fn
[ 49.692185] Call Trace:
[ 49.692398] dump_stack+0x67/0x90
[ 49.692690] ___might_sleep.cold.87+0x9f/0xaf
[ 49.693099] exit_signals+0x2b/0x240
[ 49.693453] do_exit+0xab/0xc10
[ 49.693770] ? process_one_work+0x650/0x650
[ 49.694160] ? kthread+0x118/0x130
[ 49.694487] rewind_stack_do_exit+0x17/0x20
[ 77.418619] random: fast init done
3 years, 1 month
[PATCH v9 0/7] virtio pmem driver
by Pankaj Gupta
Hi Dan,
Proposing the patch series to be merged via nvdimm tree
as kindly agreed by you. We have ack/review on XFS, EXT4 &
VIRTIO patches.
Incorporated all the changes suggested in v8. This version
added a new patch 4 for dax for device mapper change and some
minor style changes in patch 2. Kept all the reviews. Request
to please merge the series.
---
This patch series has implementation for "virtio pmem".
"virtio pmem" is fake persistent memory(nvdimm) in guest
which allows to bypass the guest page cache. This also
implements a VIRTIO based asynchronous flush mechanism.
Sharing guest kernel driver in this patchset with the
changes suggested in v4. Tested with Qemu side device
emulation [6] for virtio-pmem. Documented the impact of
possible page cache side channel attacks with suggested
countermeasures.
Details of project idea for 'virtio pmem' flushing interface
is shared [3] & [4].
Implementation is divided into two parts:
New virtio pmem guest driver and qemu code changes for new
virtio pmem paravirtualized device.
1. Guest virtio-pmem kernel driver
---------------------------------
- Reads persistent memory range from paravirt device and
registers with 'nvdimm_bus'.
- 'nvdimm/pmem' driver uses this information to allocate
persistent memory region and setup filesystem operations
to the allocated memory.
- virtio pmem driver implements asynchronous flushing
interface to flush from guest to host.
2. Qemu virtio-pmem device
---------------------------------
- Creates virtio pmem device and exposes a memory range to
KVM guest.
- At host side this is file backed memory which acts as
persistent memory.
- Qemu side flush uses aio thread pool API's and virtio
for asynchronous guest multi request handling.
David Hildenbrand CCed also posted a modified version[7] of
qemu virtio-pmem code based on updated Qemu memory device API.
Virtio-pmem security implications and countermeasures:
-----------------------------------------------------
In previous posting of kernel driver, there was discussion [9]
on possible implications of page cache side channel attacks with
virtio pmem. After thorough analysis of details of known side
channel attacks, below are the suggestions:
- Depends entirely on how host backing image file is mapped
into guest address space.
- virtio-pmem device emulation, by default shared mapping is used
to map host backing file. It is recommended to use separate
backing file at host side for every guest. This will prevent
any possibility of executing common code from multiple guests
and any chance of inferring guest local data based on
execution time.
- If backing file is required to be shared among multiple guests
it is recommended not to support host page cache eviction
commands from the guest driver. This will avoid any possibility
of inferring guest local data or host data from another guest.
- Proposed device specification [8] for virtio-pmem device with
details of possible security implications and suggested
countermeasures for device emulation.
Virtio-pmem errors handling:
----------------------------------------
Checked behaviour of virtio-pmem for below types of errors.
Need suggestions on expected behaviour for handling these errors:
- Hardware Errors: Uncorrectable recoverable Errors:
a] virtio-pmem:
- As per current logic if error page belongs to Qemu process,
host MCE handler isolates (hwpoison) that page and sends SIGBUS.
Qemu SIGBUS handler injects exception to KVM guest.
- KVM guest then isolates the page and sends SIGBUS to the guest
userspace process which has mapped the page.
b] Existing implementation for ACPI pmem driver:
- Handles such errors with MCE notifier and creates a list
of bad blocks. Read/direct access DAX operations return EIO
if the accessed memory page falls in the bad block list.
- It also starts background scrubbing.
- Similar functionality can be reused in virtio-pmem with MCE
notifier but without scrubbing(no ACPI/ARS)? Need inputs to
confirm if this behaviour is ok or needs any change?
Changes from PATCH v8: [1]
- Set device mapper synchronous if all target devices support - Dan
- Move virtio_pmem.h to nvdimm directory - Dan
- Style, indentation & better error messages in patch 2 - DavidH
- Added MST's ack in patch 2.
Changes from PATCH v7: [2]
- Corrected pending request queue logic (patch 2) - Jakub Staroń
- Used unsigned long flags for passing DAXDEV_F_SYNC (patch 3) - Dan
- Fixed typo => vma 'flag' to 'vm_flag' (patch 4)
- Added rob in patch 6 & patch 2
Changes from PATCH v6:
- Corrected comment format in patch 5 & patch 6. [Dave]
- Changed variable declaration indentation in patch 6 [Darrick]
- Add Reviewed-by tag by 'Jan Kara' in patch 4 & patch 5
Changes from PATCH v5:
Changes suggested in by - [Cornelia, Yuval]
- Remove assignment chaining in virtio driver
- Better error message and remove not required free
- Check nd_region before use
Changes suggested by - [Jan Kara]
- dax_synchronous() for !CONFIG_DAX
- Correct 'daxdev_mapping_supported' comment and non-dax implementation
Changes suggested by - [Dan Williams]
- Pass meaningful flag 'DAXDEV_F_SYNC' to alloc_dax
- Gate nvdimm_flush instead of additional async parameter
- Move block chaining logic to flush callback than common nvdimm_flush
- Use NULL flush callback for generic flush for better readability [Dan, Jan]
- Use virtio device id 27 from 25(already used) - [MST]
Changes from PATCH v4:
- Factor out MAP_SYNC supported functionality to a common helper
[Dave, Darrick, Jan]
- Comment, indentation and virtqueue_kick failure handle - Yuval Shaia
Changes from PATCH v3:
- Use generic dax_synchronous() helper to check for DAXDEV_SYNC
flag - [Dan, Darrick, Jan]
- Add 'is_nvdimm_async' function
- Document page cache side channel attacks implications &
countermeasures - [Dave Chinner, Michael]
Changes from PATCH v2:
- Disable MAP_SYNC for ext4 & XFS filesystems - [Dan]
- Use name 'virtio pmem' in place of 'fake dax'
Changes from PATCH v1:
- 0-day build test for build dependency on libnvdimm
Changes suggested by - [Dan Williams]
- Split the driver into two parts virtio & pmem
- Move queuing of async block request to block layer
- Add "sync" parameter in nvdimm_flush function
- Use indirect call for nvdimm_flush
- Don’t move declarations to common global header e.g nd.h
- nvdimm_flush() return 0 or -EIO if it fails
- Teach nsio_rw_bytes() that the flush can fail
- Rename nvdimm_flush() to generic_nvdimm_flush()
- Use 'nd_region->provider_data' for long dereferencing
- Remove virtio_pmem_freeze/restore functions
- Remove BSD license text with SPDX license text
- Add might_sleep() in virtio_pmem_flush - [Luiz]
- Make spin_lock_irqsave() narrow
Pankaj Gupta (7):
libnvdimm: nd_region flush callback support
virtio-pmem: Add virtio-pmem guest driver
libnvdimm: add nd_region buffered dax_dev flag
dax: check synchronous mapping is supported
dm: dm: Enable synchronous dax
ext4: disable map_sync for virtio pmem
xfs: disable map_sync for virtio pmem
[1] https://lkml.org/lkml/2019/5/10/447
[2] https://lkml.org/lkml/2019/4/26/36
[3] https://www.spinics.net/lists/kvm/msg149761.html
[4] https://www.spinics.net/lists/kvm/msg153095.html
[5] https://lkml.org/lkml/2018/8/31/413
[6] https://marc.info/?l=linux-kernel&m=153572228719237&w=2
[7] https://marc.info/?l=qemu-devel&m=153555721901824&w=2
[8] https://lists.oasis-open.org/archives/virtio-dev/201903/msg00083.html
[9] https://lkml.org/lkml/2019/1/9/1191
drivers/acpi/nfit/core.c | 4 -
drivers/dax/bus.c | 2
drivers/dax/super.c | 19 +++++
drivers/md/dm-table.c | 14 ++++
drivers/md/dm.c | 3
drivers/nvdimm/Makefile | 1
drivers/nvdimm/claim.c | 6 +
drivers/nvdimm/nd.h | 1
drivers/nvdimm/nd_virtio.c | 126 +++++++++++++++++++++++++++++++++++++++
drivers/nvdimm/pmem.c | 18 +++--
drivers/nvdimm/region_devs.c | 33 +++++++++-
drivers/nvdimm/virtio_pmem.c | 122 +++++++++++++++++++++++++++++++++++++
drivers/nvdimm/virtio_pmem.h | 60 ++++++++++++++++++
drivers/virtio/Kconfig | 11 +++
fs/ext4/file.c | 10 +--
fs/xfs/xfs_file.c | 9 +-
include/linux/dax.h | 26 +++++++-
include/linux/libnvdimm.h | 9 ++
include/uapi/linux/virtio_ids.h | 1
include/uapi/linux/virtio_pmem.h | 10 +++
20 files changed, 460 insertions(+), 25 deletions(-)
3 years, 1 month
[PATCH] mm/nvdimm: Pick the right alignment default when creating dax devices
by Aneesh Kumar K.V
Allow arch to provide the supported alignments and use hugepage alignment only
if we support hugepage. Right now we depend on compile time configs whereas this
patch switch this to runtime discovery.
Architectures like ppc64 can have THP enabled in code, but then can have
hugepage size disabled by the hypervisor. This allows us to create dax devices
with PAGE_SIZE alignment in this case.
Existing dax namespace with alignment larger than PAGE_SIZE will fail to
initialize in this specific case. We still allow fsdax namespace initialization.
With respect to identifying whether to enable hugepage fault for a dax device,
if THP is enabled during compile, we default to taking hugepage fault and in dax
fault handler if we find the fault size > alignment we retry with PAGE_SIZE
fault size.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
---
arch/powerpc/include/asm/libnvdimm.h | 9 ++++++++
arch/powerpc/mm/Makefile | 1 +
arch/powerpc/mm/nvdimm.c | 34 ++++++++++++++++++++++++++++
arch/x86/include/asm/libnvdimm.h | 19 ++++++++++++++++
drivers/nvdimm/nd.h | 6 -----
drivers/nvdimm/pfn_devs.c | 32 +++++++++++++++++++++++++-
include/linux/huge_mm.h | 7 +++++-
7 files changed, 100 insertions(+), 8 deletions(-)
create mode 100644 arch/powerpc/include/asm/libnvdimm.h
create mode 100644 arch/powerpc/mm/nvdimm.c
create mode 100644 arch/x86/include/asm/libnvdimm.h
diff --git a/arch/powerpc/include/asm/libnvdimm.h b/arch/powerpc/include/asm/libnvdimm.h
new file mode 100644
index 000000000000..d35fd7f48603
--- /dev/null
+++ b/arch/powerpc/include/asm/libnvdimm.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_LIBNVDIMM_H
+#define _ASM_POWERPC_LIBNVDIMM_H
+
+#define nd_pfn_supported_alignments nd_pfn_supported_alignments
+extern unsigned long *nd_pfn_supported_alignments(void);
+extern unsigned long nd_pfn_default_alignment(void);
+
+#endif
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 0f499db315d6..42e4a399ba5d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_HIGHMEM) += highmem.o
obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
obj-$(CONFIG_PPC_PTDUMP) += ptdump/
obj-$(CONFIG_KASAN) += kasan/
+obj-$(CONFIG_NVDIMM_PFN) += nvdimm.o
diff --git a/arch/powerpc/mm/nvdimm.c b/arch/powerpc/mm/nvdimm.c
new file mode 100644
index 000000000000..a29a4510715e
--- /dev/null
+++ b/arch/powerpc/mm/nvdimm.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+
+#include <linux/mm.h>
+/*
+ * We support only pte and pmd mappings for now.
+ */
+const unsigned long *nd_pfn_supported_alignments(void)
+{
+ static unsigned long supported_alignments[3];
+
+ supported_alignments[0] = PAGE_SIZE;
+
+ if (has_transparent_hugepage())
+ supported_alignments[1] = HPAGE_PMD_SIZE;
+ else
+ supported_alignments[1] = 0;
+
+ supported_alignments[2] = 0;
+ return supported_alignments;
+}
+
+/*
+ * Use pmd mapping if supported as default alignment
+ */
+unsigned long nd_pfn_default_alignment(void)
+{
+
+ if (has_transparent_hugepage())
+ return HPAGE_PMD_SIZE;
+ return PAGE_SIZE;
+}
diff --git a/arch/x86/include/asm/libnvdimm.h b/arch/x86/include/asm/libnvdimm.h
new file mode 100644
index 000000000000..3d5361db9164
--- /dev/null
+++ b/arch/x86/include/asm/libnvdimm.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_LIBNVDIMM_H
+#define _ASM_X86_LIBNVDIMM_H
+
+static inline unsigned long nd_pfn_default_alignment(void)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ return HPAGE_PMD_SIZE;
+#else
+ return PAGE_SIZE;
+#endif
+}
+
+static inline unsigned long nd_altmap_align_size(unsigned long nd_align)
+{
+ return PMD_SIZE;
+}
+
+#endif
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index a5ac3b240293..44fe923b2ee3 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -292,12 +292,6 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region)
struct nd_pfn *to_nd_pfn(struct device *dev);
#if IS_ENABLED(CONFIG_NVDIMM_PFN)
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PFN_DEFAULT_ALIGNMENT HPAGE_PMD_SIZE
-#else
-#define PFN_DEFAULT_ALIGNMENT PAGE_SIZE
-#endif
-
int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns);
bool is_nd_pfn(struct device *dev);
struct device *nd_pfn_create(struct nd_region *nd_region);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 01f40672507f..347cab166376 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <asm/libnvdimm.h>
#include "nd-core.h"
#include "pfn.h"
#include "nd.h"
@@ -111,6 +112,8 @@ static ssize_t align_show(struct device *dev,
return sprintf(buf, "%ld\n", nd_pfn->align);
}
+#ifndef nd_pfn_supported_alignments
+#define nd_pfn_supported_alignments nd_pfn_supported_alignments
static const unsigned long *nd_pfn_supported_alignments(void)
{
/*
@@ -133,6 +136,7 @@ static const unsigned long *nd_pfn_supported_alignments(void)
return data;
}
+#endif
static ssize_t align_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
@@ -310,7 +314,7 @@ struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
return NULL;
nd_pfn->mode = PFN_MODE_NONE;
- nd_pfn->align = PFN_DEFAULT_ALIGNMENT;
+ nd_pfn->align = nd_pfn_default_alignment();
dev = &nd_pfn->dev;
device_initialize(&nd_pfn->dev);
if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) {
@@ -420,6 +424,20 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn)
return 0;
}
+static bool nd_supported_alignment(unsigned long align)
+{
+ int i;
+ const unsigned long *supported = nd_pfn_supported_alignments();
+
+ if (align == 0)
+ return false;
+
+ for (i = 0; supported[i]; i++)
+ if (align == supported[i])
+ return true;
+ return false;
+}
+
int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
{
u64 checksum, offset;
@@ -474,6 +492,18 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
align = 1UL << ilog2(offset);
mode = le32_to_cpu(pfn_sb->mode);
+ /*
+ * Check whether we support the alignment. For Dax if the
+ * superblock alignment is not matching, we won't initialize
+ * the device.
+ */
+ if (!nd_supported_alignment(align) &&
+ memcmp(pfn_sb->signature, DAX_SIG, PFN_SIG_LEN)) {
+ dev_err(&nd_pfn->dev, "init failed, settings mismatch\n");
+ dev_dbg(&nd_pfn->dev, "align: %lx:%lx\n", nd_pfn->align, align);
+ return -EINVAL;
+ }
+
if (!nd_pfn->uuid) {
/*
* When probing a namepace via nd_pfn_probe() the uuid
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 381e872bfde0..d5cfea3d8b86 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -110,7 +110,12 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
return true;
-
+ /*
+ * For dax let's try to do hugepage fault always. If we don't support
+ * hugepages we will not have enabled namespaces with hugepage alignment.
+ * This also means we try to handle hugepage fault on device with
+ * smaller alignment. But then we will return with VM_FAULT_FALLBACK
+ */
if (vma_is_dax(vma))
return true;
--
2.21.0
3 years, 1 month