[RFC PATCH 1/3] fs: dax.c: move fs hole signifier from DAX_ZERO_PAGE
to XA_ZERO_ENTRY
by Amy Parker
DAX uses the DAX_ZERO_PAGE bit to represent holes in files. It could also use
a single entry, such as XArray's XA_ZERO_ENTRY. This distinguishes zero pages
and allows us to shift DAX_EMPTY down (see patch 2/3).
Signed-off-by: Amy Parker <enbyamy(a)gmail.com>
---
fs/dax.c | 15 ++++++++++-----
1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 5b47834f2e1b..fa8ca1a71bbd 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -77,9 +77,14 @@ fs_initcall(init_dax_wait_table);
#define DAX_SHIFT (4)
#define DAX_LOCKED (1UL << 0)
#define DAX_PMD (1UL << 1)
-#define DAX_ZERO_PAGE (1UL << 2)
#define DAX_EMPTY (1UL << 3)
+/*
+ * A zero entry, XA_ZERO_ENTRY, is used to represent a zero page. This
+ * definition helps with checking if an entry is a PMD size.
+ */
+#define XA_ZERO_PMD_ENTRY DAX_PMD | (unsigned long)XA_ZERO_ENTRY
+
static unsigned long dax_to_pfn(void *entry)
{
return xa_to_value(entry) >> DAX_SHIFT;
@@ -114,7 +119,7 @@ static bool dax_is_pte_entry(void *entry)
static int dax_is_zero_entry(void *entry)
{
- return xa_to_value(entry) & DAX_ZERO_PAGE;
+ return xa_to_value(entry) & (unsigned long)XA_ZERO_ENTRY;
}
static int dax_is_empty_entry(void *entry)
@@ -738,7 +743,7 @@ static void *dax_insert_entry(struct xa_state *xas,
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+ if (dax_is_zero_entry(entry) && !(flags & (unsigned long)XA_ZERO_ENTRY)) {
unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
@@ -1047,7 +1052,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas,
vm_fault_t ret;
*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_ZERO_PAGE, false);
+ XA_ZERO_ENTRY, false);
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
trace_dax_load_hole(inode, vmf, ret);
@@ -1434,7 +1439,7 @@ static vm_fault_t dax_pmd_load_hole(struct
xa_state *xas, struct vm_fault *vmf,
pfn = page_to_pfn_t(zero_page);
*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE, false);
+ XA_ZERO_PMD_ENTRY, false);
if (arch_needs_pgtable_deposit()) {
pgtable = pte_alloc_one(vma->vm_mm);
--
2.29.2
1 year, 3 months
[PATCH v2] nvdimm: Avoid race between probe and reading device attributes
by Richard Palethorpe
It is possible to cause a division error and use-after-free by querying the
nmem device before the driver data is fully initialised in nvdimm_probe. E.g
by doing
(while true; do
cat /sys/bus/nd/devices/nmem*/available_slots 2>&1 > /dev/null
done) &
while true; do
for i in $(seq 0 4); do
echo nmem$i > /sys/bus/nd/drivers/nvdimm/bind
done
for i in $(seq 0 4); do
echo nmem$i > /sys/bus/nd/drivers/nvdimm/unbind
done
done
On 5.7-rc3 this causes:
[ 12.711578] divide error: 0000 [#1] SMP KASAN PTI
[ 12.712321] CPU: 0 PID: 231 Comm: cat Not tainted 5.7.0-rc3 #48
[ 12.713188] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
[ 12.714857] RIP: 0010:nd_label_nfree+0x134/0x1a0 [libnvdimm]
[ 12.715772] Code: ba 00 00 00 00 00 fc ff df 48 89 f9 48 c1 e9 03 0f b6 14 11 84 d2 74 05 80 fa 03 7e 52 8b 73 08 31 d2 89 c1 48 83 c4 08 5b 5d <f7> f6 31 d2 41 5c 83 c0 07 c1 e8 03 48 8d 84 00 8e 02 00 00 25 00
[ 12.718311] RSP: 0018:ffffc9000046fd08 EFLAGS: 00010282
[ 12.719030] RAX: 0000000000000000 RBX: ffffffffc0073aa0 RCX: 0000000000000000
[ 12.720005] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff888060931808
[ 12.720970] RBP: ffff88806609d018 R08: 0000000000000001 R09: ffffed100cc0a2b1
[ 12.721889] R10: ffff888066051587 R11: ffffed100cc0a2b0 R12: ffff888060931800
[ 12.722744] R13: ffff888064362000 R14: ffff88806609d018 R15: ffffffff8b1a2520
[ 12.723602] FS: 00007fd16f3d5580(0000) GS:ffff88806b400000(0000) knlGS:0000000000000000
[ 12.724600] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 12.725308] CR2: 00007fd16f1ec000 CR3: 0000000064322006 CR4: 0000000000160ef0
[ 12.726268] Call Trace:
[ 12.726633] available_slots_show+0x4e/0x120 [libnvdimm]
[ 12.727380] dev_attr_show+0x42/0x80
[ 12.727891] ? memset+0x20/0x40
[ 12.728341] sysfs_kf_seq_show+0x218/0x410
[ 12.728923] seq_read+0x389/0xe10
[ 12.729415] vfs_read+0x101/0x2d0
[ 12.729891] ksys_read+0xf9/0x1d0
[ 12.730361] ? kernel_write+0x120/0x120
[ 12.730915] do_syscall_64+0x95/0x4a0
[ 12.731435] entry_SYSCALL_64_after_hwframe+0x49/0xb3
[ 12.732163] RIP: 0033:0x7fd16f2fe4be
[ 12.732685] Code: c0 e9 c6 fe ff ff 50 48 8d 3d 2e 12 0a 00 e8 69 e9 01 00 66 0f 1f 84 00 00 00 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 5a c3 66 0f 1f 84 00 00 00 00 00 48 83 ec 28
[ 12.735207] RSP: 002b:00007ffd3177b838 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[ 12.736261] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fd16f2fe4be
[ 12.737233] RDX: 0000000000020000 RSI: 00007fd16f1ed000 RDI: 0000000000000003
[ 12.738203] RBP: 00007fd16f1ed000 R08: 00007fd16f1ec010 R09: 0000000000000000
[ 12.739172] R10: 00007fd16f3f4f70 R11: 0000000000000246 R12: 00007ffd3177ce23
[ 12.740144] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000
[ 12.741139] Modules linked in: nfit libnvdimm
[ 12.741783] ---[ end trace 99532e4b82410044 ]---
[ 12.742452] RIP: 0010:nd_label_nfree+0x134/0x1a0 [libnvdimm]
[ 12.743167] Code: ba 00 00 00 00 00 fc ff df 48 89 f9 48 c1 e9 03 0f b6 14 11 84 d2 74 05 80 fa 03 7e 52 8b 73 08 31 d2 89 c1 48 83 c4 08 5b 5d <f7> f6 31 d2 41 5c 83 c0 07 c1 e8 03 48 8d 84 00 8e 02 00 00 25 00
[ 12.745709] RSP: 0018:ffffc9000046fd08 EFLAGS: 00010282
[ 12.746340] RAX: 0000000000000000 RBX: ffffffffc0073aa0 RCX: 0000000000000000
[ 12.747209] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff888060931808
[ 12.748081] RBP: ffff88806609d018 R08: 0000000000000001 R09: ffffed100cc0a2b1
[ 12.748977] R10: ffff888066051587 R11: ffffed100cc0a2b0 R12: ffff888060931800
[ 12.749849] R13: ffff888064362000 R14: ffff88806609d018 R15: ffffffff8b1a2520
[ 12.750729] FS: 00007fd16f3d5580(0000) GS:ffff88806b400000(0000) knlGS:0000000000000000
[ 12.751708] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 12.752441] CR2: 00007fd16f1ec000 CR3: 0000000064322006 CR4: 0000000000160ef0
[ 12.821357] ==================================================================
[ 12.822284] BUG: KASAN: use-after-free in __mutex_lock+0x111c/0x11a0
[ 12.823084] Read of size 4 at addr ffff888065c26238 by task reproducer/218
[ 12.823968]
[ 12.824183] CPU: 2 PID: 218 Comm: reproducer Tainted: G D 5.7.0-rc3 #48
[ 12.825167] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
[ 12.826595] Call Trace:
[ 12.826926] dump_stack+0x97/0xe0
[ 12.827362] print_address_description.constprop.0+0x1b/0x210
[ 12.828111] ? __mutex_lock+0x111c/0x11a0
[ 12.828645] __kasan_report.cold+0x37/0x92
[ 12.829179] ? __mutex_lock+0x111c/0x11a0
[ 12.829706] kasan_report+0x38/0x50
[ 12.830158] __mutex_lock+0x111c/0x11a0
[ 12.830666] ? ftrace_graph_stop+0x10/0x10
[ 12.831193] ? is_nvdimm_bus+0x40/0x40 [libnvdimm]
[ 12.831820] ? mutex_trylock+0x2b0/0x2b0
[ 12.832333] ? nvdimm_probe+0x259/0x420 [libnvdimm]
[ 12.832975] ? mutex_trylock+0x2b0/0x2b0
[ 12.833500] ? nvdimm_probe+0x259/0x420 [libnvdimm]
[ 12.834122] ? prepare_ftrace_return+0xa1/0xf0
[ 12.834724] ? ftrace_graph_caller+0x6b/0xa0
[ 12.835269] ? acpi_label_write+0x390/0x390 [nfit]
[ 12.835909] ? nvdimm_probe+0x259/0x420 [libnvdimm]
[ 12.836558] ? nvdimm_probe+0x259/0x420 [libnvdimm]
[ 12.837179] nvdimm_probe+0x259/0x420 [libnvdimm]
[ 12.837802] nvdimm_bus_probe+0x110/0x6b0 [libnvdimm]
[ 12.838470] really_probe+0x212/0x9a0
[ 12.838954] driver_probe_device+0x1cd/0x300
[ 12.839511] ? driver_probe_device+0x5/0x300
[ 12.840063] device_driver_attach+0xe7/0x120
[ 12.840623] bind_store+0x18d/0x230
[ 12.841075] kernfs_fop_write+0x200/0x420
[ 12.841606] vfs_write+0x154/0x450
[ 12.842047] ksys_write+0xf9/0x1d0
[ 12.842497] ? __ia32_sys_read+0xb0/0xb0
[ 12.843010] do_syscall_64+0x95/0x4a0
[ 12.843495] entry_SYSCALL_64_after_hwframe+0x49/0xb3
[ 12.844140] RIP: 0033:0x7f5b235d3563
[ 12.844607] Code: 0c 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 55 c3 0f 1f 40 00 48 83 ec 28 48 89 54 24 18
[ 12.846877] RSP: 002b:00007fff1c3bc578 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[ 12.847822] RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00007f5b235d3563
[ 12.848717] RDX: 0000000000000006 RSI: 000055f9576710d0 RDI: 0000000000000001
[ 12.849594] RBP: 000055f9576710d0 R08: 000000000000000a R09: 0000000000000000
[ 12.850470] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000006
[ 12.851333] R13: 00007f5b236a3500 R14: 0000000000000006 R15: 00007f5b236a3700
[ 12.852247]
[ 12.852466] Allocated by task 225:
[ 12.852893] save_stack+0x1b/0x40
[ 12.853310] __kasan_kmalloc.constprop.0+0xc2/0xd0
[ 12.853918] kmem_cache_alloc_node+0xef/0x270
[ 12.854475] copy_process+0x485/0x6130
[ 12.854945] _do_fork+0xf1/0xb40
[ 12.855353] __do_sys_clone+0xc3/0x100
[ 12.855843] do_syscall_64+0x95/0x4a0
[ 12.856302] entry_SYSCALL_64_after_hwframe+0x49/0xb3
[ 12.856939]
[ 12.857140] Freed by task 0:
[ 12.857522] save_stack+0x1b/0x40
[ 12.857940] __kasan_slab_free+0x12c/0x170
[ 12.858464] kmem_cache_free+0xb0/0x330
[ 12.858945] rcu_core+0x55f/0x19f0
[ 12.859385] __do_softirq+0x228/0x944
[ 12.859869]
[ 12.860075] The buggy address belongs to the object at ffff888065c26200
[ 12.860075] which belongs to the cache task_struct of size 6016
[ 12.861638] The buggy address is located 56 bytes inside of
[ 12.861638] 6016-byte region [ffff888065c26200, ffff888065c27980)
[ 12.863084] The buggy address belongs to the page:
[ 12.863702] page:ffffea0001970800 refcount:1 mapcount:0 mapping:0000000021ee3712 index:0x0 head:ffffea0001970800 order:3 compound_mapcount:0 compound_pincount:0
[ 12.865478] flags: 0x80000000010200(slab|head)
[ 12.866039] raw: 0080000000010200 0000000000000000 0000000100000001 ffff888066c0f980
[ 12.867010] raw: 0000000000000000 0000000080050005 00000001ffffffff 0000000000000000
[ 12.867986] page dumped because: kasan: bad access detected
[ 12.868696]
[ 12.868900] Memory state around the buggy address:
[ 12.869514] ffff888065c26100: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 12.870414] ffff888065c26180: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[ 12.871318] >ffff888065c26200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 12.872238] ^
[ 12.872870] ffff888065c26280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 12.873754] ffff888065c26300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[ 12.874640]
==================================================================
This can be prevented by setting the driver data after initialisation is
complete.
Fixes: 4d88a97aa9e8 ("libnvdimm, nvdimm: dimm driver and base libnvdimm device-driver infrastructure")
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Cc: linux-nvdimm(a)lists.01.org
Cc: linux-kernel(a)vger.kernel.org
Cc: Coly Li <colyli(a)suse.com>
Signed-off-by: Richard Palethorpe <rpalethorpe(a)suse.com>
---
V2:
+ Reviewed by Coly and removed unnecessary lock
drivers/nvdimm/dimm.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/nvdimm/dimm.c b/drivers/nvdimm/dimm.c
index 7d4ddc4d9322..3d3988e1d9a0 100644
--- a/drivers/nvdimm/dimm.c
+++ b/drivers/nvdimm/dimm.c
@@ -43,7 +43,6 @@ static int nvdimm_probe(struct device *dev)
if (!ndd)
return -ENOMEM;
- dev_set_drvdata(dev, ndd);
ndd->dpa.name = dev_name(dev);
ndd->ns_current = -1;
ndd->ns_next = -1;
@@ -106,6 +105,8 @@ static int nvdimm_probe(struct device *dev)
if (rc)
goto err;
+ dev_set_drvdata(dev, ndd);
+
return 0;
err:
--
2.26.2
1 year, 3 months
[PATCH 0/3] cdev: Generic shutdown handling
by Dan Williams
After reviewing driver submissions with new cdev + ioctl usages one
common stumbling block is coordinating the shutdown of the ioctl path,
or other file operations, at driver ->remove() time. While cdev_del()
guarantees that no new file descriptors will be established, operations
on existing file descriptors can proceed indefinitely.
Given the observation that the kernel spends the resources for a percpu_ref
per request_queue shared with all block_devices on a gendisk, do the
same for all the cdev instances that share the same
cdev_add()-to-cdev_del() lifetime.
With this in place cdev_del() not only guarantees 'no new opens', but it
also guarantees 'no new operations invocations' and 'all threads running
in an operation handler have exited that handler'.
As a proof point of the way driver implementations open-code around this
gap in the api the libnvdimm ioctl path is reworked with a result of:
4 files changed, 101 insertions(+), 153 deletions(-)
---
Dan Williams (3):
cdev: Finish the cdev api with queued mode support
libnvdimm/ida: Switch to non-deprecated ida helpers
libnvdimm/ioctl: Switch to cdev_register_queued()
drivers/nvdimm/btt_devs.c | 6 +
drivers/nvdimm/bus.c | 177 +++++++++------------------------------
drivers/nvdimm/core.c | 14 ++-
drivers/nvdimm/dax_devs.c | 4 -
drivers/nvdimm/dimm_devs.c | 53 +++++++++---
drivers/nvdimm/namespace_devs.c | 14 +--
drivers/nvdimm/nd-core.h | 14 ++-
drivers/nvdimm/pfn_devs.c | 4 -
fs/char_dev.c | 108 ++++++++++++++++++++++--
include/linux/cdev.h | 21 ++++-
10 files changed, 238 insertions(+), 177 deletions(-)
1 year, 3 months
[PATCH 1/1] ndctl/namespace: Fix disable-namespace accounting relative to seed devices
by Redhairer Li
Seed namespaces are included in "ndctl disable-namespace all". However
since the user never "creates" them it is surprising to see
"disable-namespace" report 1 more namespace relative to the number that
have been created. Catch attempts to disable a zero-sized namespace:
Before:
{
"dev":"namespace1.0",
"size":"492.00 MiB (515.90 MB)",
"blockdev":"pmem1"
}
{
"dev":"namespace1.1",
"size":"492.00 MiB (515.90 MB)",
"blockdev":"pmem1.1"
}
{
"dev":"namespace1.2",
"size":"492.00 MiB (515.90 MB)",
"blockdev":"pmem1.2"
}
disabled 4 namespaces
After:
{
"dev":"namespace1.0",
"size":"492.00 MiB (515.90 MB)",
"blockdev":"pmem1"
}
{
"dev":"namespace1.3",
"size":"492.00 MiB (515.90 MB)",
"blockdev":"pmem1.3"
}
{
"dev":"namespace1.1",
"size":"492.00 MiB (515.90 MB)",
"blockdev":"pmem1.1"
}
disabled 3 namespaces
Signed-off-by: Redhairer Li <redhairer.li(a)intel.com>
---
ndctl/lib/libndctl.c | 11 ++++++++---
ndctl/region.c | 4 +++-
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/ndctl/lib/libndctl.c b/ndctl/lib/libndctl.c
index ee737cb..49f362b 100644
--- a/ndctl/lib/libndctl.c
+++ b/ndctl/lib/libndctl.c
@@ -4231,6 +4231,7 @@ NDCTL_EXPORT int ndctl_namespace_disable_safe(struct ndctl_namespace *ndns)
const char *bdev = NULL;
char path[50];
int fd;
+ unsigned long long size = ndctl_namespace_get_size(ndns);
if (pfn && ndctl_pfn_is_enabled(pfn))
bdev = ndctl_pfn_get_block_device(pfn);
@@ -4260,9 +4261,13 @@ NDCTL_EXPORT int ndctl_namespace_disable_safe(struct ndctl_namespace *ndns)
devname, bdev, strerror(errno));
return -errno;
}
- } else
- ndctl_namespace_disable_invalidate(ndns);
-
+ } else {
+ if (size == 0)
+ /* Don't try to disable idle namespace (no capacity allocated) */
+ return -ENXIO;
+ else
+ ndctl_namespace_disable_invalidate(ndns);
+ }
return 0;
}
diff --git a/ndctl/region.c b/ndctl/region.c
index 7945007..0014bb9 100644
--- a/ndctl/region.c
+++ b/ndctl/region.c
@@ -72,6 +72,7 @@ static int region_action(struct ndctl_region *region, enum device_action mode)
{
struct ndctl_namespace *ndns;
int rc = 0;
+ unsigned long long size;
switch (mode) {
case ACTION_ENABLE:
@@ -80,7 +81,8 @@ static int region_action(struct ndctl_region *region, enum device_action mode)
case ACTION_DISABLE:
ndctl_namespace_foreach(region, ndns) {
rc = ndctl_namespace_disable_safe(ndns);
- if (rc)
+ size = ndctl_namespace_get_size(ndns);
+ if (rc && size != 0)
return rc;
}
rc = ndctl_region_disable_invalidate(region);
--
2.20.1.windows.1
1 year, 3 months
[PATCH] fs/dax: fix compile problem on parisc and mips
by James Bottomley
These platforms define PMD_ORDER in asm/pgtable.h
This means that as soon as dax.c included asm/pgtable.h in commit
11cf9d863dcb ("fs/dax: Deposit pagetable even when installing zero
page") we clash with PMD_ORDER introduced by cfc93c6c6c96 ("dax:
Convert dax_insert_pfn_mkwrite to XArray") and we get this problem:
/home/jejb/git/linux-build/fs/dax.c:53: warning: "PMD_ORDER" redefined
53 | #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
|
In file included from /home/jejb/git/linux-build/include/linux/pgtable.h:6,
from /home/jejb/git/linux-build/include/linux/mm.h:33,
from /home/jejb/git/linux-build/include/linux/bvec.h:14,
from /home/jejb/git/linux-build/include/linux/blk_types.h:10,
from /home/jejb/git/linux-build/include/linux/genhd.h:19,
from /home/jejb/git/linux-build/include/linux/blkdev.h:8,
from /home/jejb/git/linux-build/fs/dax.c:10:
/home/jejb/git/linux-build/arch/parisc/include/asm/pgtable.h:124: note: this is the location of the previous definition
124 | #define PMD_ORDER 1 /* Number of pages per pmd */
|
make[2]: *** Deleting file 'fs/dax.o'
Fix by renaming dax's PMD_ORDER to DAX_PMD_ORDER
Signed-off-by: James Bottomley <James.Bottomley(a)HansenPartnership.com>
---
fs/dax.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 5b47834f2e1b..4d3b0db5c321 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -50,7 +50,7 @@ static inline unsigned int pe_order(enum page_entry_size pe_size)
#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
/* The order of a PMD entry */
-#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
+#define DAX_PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
@@ -98,7 +98,7 @@ static bool dax_is_locked(void *entry)
static unsigned int dax_entry_order(void *entry)
{
if (xa_to_value(entry) & DAX_PMD)
- return PMD_ORDER;
+ return DAX_PMD_ORDER;
return 0;
}
@@ -1471,7 +1471,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
{
struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
- XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
+ XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, DAX_PMD_ORDER);
unsigned long pmd_addr = vmf->address & PMD_MASK;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync;
@@ -1530,7 +1530,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* entry is already in the array, for instance), it will return
* VM_FAULT_FALLBACK.
*/
- entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
+ entry = grab_mapping_entry(&xas, mapping, DAX_PMD_ORDER);
if (xa_is_internal(entry)) {
result = xa_to_internal(entry);
goto fallback;
@@ -1696,7 +1696,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
if (order == 0)
ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
#ifdef CONFIG_FS_DAX_PMD
- else if (order == PMD_ORDER)
+ else if (order == DAX_PMD_ORDER)
ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
#endif
else
--
2.29.2
1 year, 3 months
[RFC v5 0/7] PMEM device emulation without nfit dependency
by Santosh Sivaraj
The current test module cannot be used for testing platforms (make check)
that do not have support for NFIT. In order to get the ndctl tests working,
we need a module which can emulate NVDIMM devices without relying on
ACPI/NFIT.
The emulated PMEM device is made part of the PAPR family.
Corresponding changes for ndctl are also required, to add attributes needed
for the test, which will be sent as a reply to this patch.
The following is the test result, run on a x86 guest:
PASS: libndctl
PASS: dsm-fail
PASS: dpa-alloc
PASS: parent-uuid
PASS: multi-pmem
PASS: create.sh
FAIL: clear.sh
FAIL: pmem-errors.sh
FAIL: daxdev-errors.sh
PASS: multi-dax.sh
PASS: btt-check.sh
FAIL: label-compat.sh
PASS: blk-exhaust.sh
PASS: sector-mode.sh
FAIL: inject-error.sh
SKIP: btt-errors.sh
PASS: hugetlb
PASS: btt-pad-compat.sh
SKIP: firmware-update.sh
FAIL: ack-shutdown-count-set
PASS: rescan-partitions.sh
FAIL: inject-smart.sh
FAIL: monitor.sh
PASS: max_available_extent_ns.sh
FAIL: pfn-meta-errors.sh
PASS: track-uuid.sh
============================================================================
Testsuite summary for ndctl 70.10.g7ecd11c
============================================================================
# TOTAL: 26
# PASS: 15
# SKIP: 2
# XFAIL: 0
# FAIL: 9
# XPASS: 0
# ERROR: 0
The following is the test result from a PowerPC 64 guest.
PASS: libndctl
PASS: dsm-fail
PASS: dpa-alloc
PASS: parent-uuid
PASS: multi-pmem
PASS: create.sh
FAIL: clear.sh
FAIL: pmem-errors.sh
FAIL: daxdev-errors.sh
PASS: multi-dax.sh
PASS: btt-check.sh
FAIL: label-compat.sh
PASS: blk-exhaust.sh
PASS: sector-mode.sh
FAIL: inject-error.sh
SKIP: btt-errors.sh
SKIP: hugetlb
PASS: btt-pad-compat.sh
SKIP: firmware-update.sh
FAIL: ack-shutdown-count-set
PASS: rescan-partitions.sh
FAIL: inject-smart.sh
FAIL: monitor.sh
PASS: max_available_extent_ns.sh
FAIL: pfn-meta-errors.sh
PASS: track-uuid.sh
============================================================================
Testsuite summary for ndctl 70.git94a00679
============================================================================
# TOTAL: 26
# PASS: 14
# SKIP: 3
# XFAIL: 0
# FAIL: 9
# XPASS: 0
# ERROR: 0
Error injection tests and SMART are not yet implemented.
Changes from V4:
- Split the driver patches into smaller chunks for ease of review
- [ndctl] Adding dimm attributes for nfit and papr, make it into one function [Dan]
- [ndctl] If nfit modules are missing, check for acpi support before failing, slightly
different approach than what Dan commented.
Santosh Sivaraj (7):
testing/nvdimm: Add test module for non-nfit platforms
ndtest: Add compatability string to treat it as PAPR family
ndtest: Add dimms to the two buses
ndtest: Add dimm attributes
ndtest: Add regions and mappings to the test buses
ndtest: Add nvdimm control functions
ndtest: Add papr health related flags
tools/testing/nvdimm/config_check.c | 3 +-
tools/testing/nvdimm/test/Kbuild | 6 +-
tools/testing/nvdimm/test/ndtest.c | 1138 +++++++++++++++++++++++++++
tools/testing/nvdimm/test/ndtest.h | 109 +++
4 files changed, 1254 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/nvdimm/test/ndtest.c
create mode 100644 tools/testing/nvdimm/test/ndtest.h
--
2.26.2
1 year, 3 months
[PATCH 1/3] device-dax: Prevent registering drivers without probe or remove callback
by Uwe Kleine-König
The bus probe and remove functions (dax_bus_probe and dax_bus_remove
respectively) call these functions without checking them to be non-NULL.
Add a check to prevent a NULL pointer exception.
Signed-off-by: Uwe Kleine-König <uwe(a)kleine-koenig.org>
---
drivers/dax/bus.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 737b207c9e30..618d462975ba 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -1392,6 +1392,14 @@ int __dax_driver_register(struct dax_device_driver *dax_drv,
struct device_driver *drv = &dax_drv->drv;
int rc = 0;
+ /*
+ * dax_bus_probe() calls dax_drv->probe() unconditionally and
+ * dax_bus_remove() calls dax_drv->remove() unconditionally. So better
+ * be safe than sorry and ensure these are provided.
+ */
+ if (!dax_drv->probe || !dax_drv->remove)
+ return -EINVAL;
+
INIT_LIST_HEAD(&dax_drv->ids);
drv->owner = module;
drv->name = mod_name;
base-commit: 5c8fe583cce542aa0b84adc939ce85293de36e5e
--
2.29.2
1 year, 3 months
[PATCH v15 00/11] mm: introduce memfd_secret system call to create "secret" memory areas
by Mike Rapoport
From: Mike Rapoport <rppt(a)linux.ibm.com>
Hi,
@Andrew, this is based on v5.11-rc4-mmots-2021-01-19-13-54 with secretmem
patches dropped from there, I can rebase whatever way you prefer.
This is an implementation of "secret" mappings backed by a file descriptor.
The file descriptor backing secret memory mappings is created using a
dedicated memfd_secret system call. The desired protection mode for the
memory is configured using flags parameter of the system call. The mmap()
of the file descriptor created with memfd_secret() will create a "secret"
memory mapping. The pages in that mapping will be marked as not present in
the direct map and will be present only in the page table of the owning mm.
Although normally Linux userspace mappings are protected from other users,
such secret mappings are useful for environments where a hostile tenant is
trying to trick the kernel into giving them access to other tenants
mappings.
Additionally, in the future the secret mappings may be used as a means to
protect guest memory in a virtual machine host.
For demonstration of secret memory usage we've created a userspace library
https://git.kernel.org/pub/scm/linux/kernel/git/jejb/secret-memory-preloa...
that does two things: the first is act as a preloader for openssl to
redirect all the OPENSSL_malloc calls to secret memory meaning any secret
keys get automatically protected this way and the other thing it does is
expose the API to the user who needs it. We anticipate that a lot of the
use cases would be like the openssl one: many toolkits that deal with
secret keys already have special handling for the memory to try to give
them greater protection, so this would simply be pluggable into the
toolkits without any need for user application modification.
Hiding secret memory mappings behind an anonymous file allows (ab)use of
the page cache for tracking pages allocated for the "secret" mappings as
well as using address_space_operations for e.g. page migration callbacks.
The anonymous file may be also used implicitly, like hugetlb files, to
implement mmap(MAP_SECRET) and use the secret memory areas with "native" mm
ABIs in the future.
To limit fragmentation of the direct map to splitting only PUD-size pages,
I've added an amortizing cache of PMD-size pages to each file descriptor
that is used as an allocation pool for the secret memory areas.
As the memory allocated by secretmem becomes unmovable, we use CMA to back
large page caches so that the page allocator won't be surprised by a failing attempt
to migrate these pages.
v15:
* Add riscv/Kconfig update to disable set_memory operations for nommu
builds (patch 3)
* Update the code around add_to_page_cache() per Matthew's comments
(patches 6,7)
* Add fixups for build/checkpatch errors discovered by CI systems
v14: https://lore.kernel.org/lkml/20201203062949.5484-1-rppt@kernel.org
* Finally s/mod_node_page_state/mod_lruvec_page_state/
v13: https://lore.kernel.org/lkml/20201201074559.27742-1-rppt@kernel.org
* Added Reviewed-by, thanks Catalin and David
* s/mod_node_page_state/mod_lruvec_page_state/ as Shakeel suggested
v12: https://lore.kernel.org/lkml/20201125092208.12544-1-rppt@kernel.org
* Add detection of whether set_direct_map has actual effect on arm64 and bail
out of CMA allocation for secretmem and the memfd_secret() syscall if pages
would not be removed from the direct map
v11: https://lore.kernel.org/lkml/20201124092556.12009-1-rppt@kernel.org
* Drop support for uncached mappings
Older history:
v10: https://lore.kernel.org/lkml/20201123095432.5860-1-rppt@kernel.org
v9: https://lore.kernel.org/lkml/20201117162932.13649-1-rppt@kernel.org
v8: https://lore.kernel.org/lkml/20201110151444.20662-1-rppt@kernel.org
v7: https://lore.kernel.org/lkml/20201026083752.13267-1-rppt@kernel.org
v6: https://lore.kernel.org/lkml/20200924132904.1391-1-rppt@kernel.org
v5: https://lore.kernel.org/lkml/20200916073539.3552-1-rppt@kernel.org
v4: https://lore.kernel.org/lkml/20200818141554.13945-1-rppt@kernel.org
v3: https://lore.kernel.org/lkml/20200804095035.18778-1-rppt@kernel.org
v2: https://lore.kernel.org/lkml/20200727162935.31714-1-rppt@kernel.org
v1: https://lore.kernel.org/lkml/20200720092435.17469-1-rppt@kernel.org
Mike Rapoport (11):
mm: add definition of PMD_PAGE_ORDER
mmap: make mlock_future_check() global
riscv/Kconfig: make direct map manipulation options depend on MMU
set_memory: allow set_direct_map_*_noflush() for multiple pages
set_memory: allow querying whether set_direct_map_*() is actually enabled
mm: introduce memfd_secret system call to create "secret" memory areas
secretmem: use PMD-size pages to amortize direct map fragmentation
secretmem: add memcg accounting
PM: hibernate: disable when there are active secretmem users
arch, mm: wire up memfd_secret system call where relevant
secretmem: test: add basic selftest for memfd_secret(2)
arch/arm64/include/asm/Kbuild | 1 -
arch/arm64/include/asm/cacheflush.h | 6 -
arch/arm64/include/asm/set_memory.h | 17 +
arch/arm64/include/uapi/asm/unistd.h | 1 +
arch/arm64/kernel/machine_kexec.c | 1 +
arch/arm64/mm/mmu.c | 6 +-
arch/arm64/mm/pageattr.c | 23 +-
arch/riscv/Kconfig | 4 +-
arch/riscv/include/asm/set_memory.h | 4 +-
arch/riscv/include/asm/unistd.h | 1 +
arch/riscv/mm/pageattr.c | 8 +-
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
arch/x86/include/asm/set_memory.h | 4 +-
arch/x86/mm/pat/set_memory.c | 8 +-
fs/dax.c | 11 +-
include/linux/pgtable.h | 3 +
include/linux/secretmem.h | 30 ++
include/linux/set_memory.h | 16 +-
include/linux/syscalls.h | 1 +
include/uapi/asm-generic/unistd.h | 6 +-
include/uapi/linux/magic.h | 1 +
kernel/power/hibernate.c | 5 +-
kernel/power/snapshot.c | 4 +-
kernel/sys_ni.c | 2 +
mm/Kconfig | 5 +
mm/Makefile | 1 +
mm/filemap.c | 3 +-
mm/gup.c | 10 +
mm/internal.h | 3 +
mm/mmap.c | 5 +-
mm/secretmem.c | 444 ++++++++++++++++++++++
mm/vmalloc.c | 5 +-
scripts/checksyscalls.sh | 4 +
tools/testing/selftests/vm/.gitignore | 1 +
tools/testing/selftests/vm/Makefile | 3 +-
tools/testing/selftests/vm/memfd_secret.c | 296 +++++++++++++++
tools/testing/selftests/vm/run_vmtests | 17 +
38 files changed, 910 insertions(+), 52 deletions(-)
create mode 100644 arch/arm64/include/asm/set_memory.h
create mode 100644 include/linux/secretmem.h
create mode 100644 mm/secretmem.c
create mode 100644 tools/testing/selftests/vm/memfd_secret.c
--
2.28.0
1 year, 3 months
[PATCH v2 00/10] fsdax: introduce fs query to support reflink
by Shiyang Ruan
This patchset is aimed to support shared pages tracking for fsdax.
Change from V1:
- Add the old memory-failure handler back for rolling back
- Add callback in MD's ->rmap() to support multiple mapping of dm device
- Add judgement for CONFIG_SYSFS
- Add pfn_valid() judgement in hwpoison_filter()
- Rebased to v5.11-rc5
Change from RFC v3:
- Do not lock dax entry in memory failure handler
- Add a helper function for corrupted_range
- Add restrictions in xfs code
- Fix code style
- remove the useless association and lock in fsdax
Change from RFC v2:
- Adjust the order of patches
- Divide the infrastructure and the drivers that use it
- Rebased to v5.10
Change from RFC v1:
- Introduce ->block_lost() for block device
- Support mapped device
- Add 'not available' warning for realtime device in XFS
- Rebased to v5.10-rc1
This patchset moves owner tracking from dax_associate_entry() to pmem
device driver, by introducing an interface ->memory_failure() of struct
pagemap. This interface is called by memory_failure() in mm, and
implemented by pmem device. Then pmem device calls its ->corrupted_range()
to find the filesystem in which the corrupted data is located, and calls the
filesystem handler to track files or metadata associated with this page.
Finally we are able to try to fix the corrupted data in filesystem and do
other necessary processing, such as killing processes who are using the
files affected.
The call trace is like this:
memory_failure()
pgmap->ops->memory_failure() => pmem_pgmap_memory_failure()
gendisk->fops->corrupted_range() => - pmem_corrupted_range()
- md_blk_corrupted_range()
sb->s_ops->corrupted_range() => xfs_fs_corrupted_range()
xfs_rmap_query_range()
xfs_currupt_helper()
* corrupted on metadata
try to recover data, call xfs_force_shutdown()
* corrupted on file data
try to recover data, call mf_dax_mapping_kill_procs()
The fsdax & reflink support for XFS is not contained in this patchset.
(Rebased on v5.11-rc5)
Shiyang Ruan (10):
pagemap: Introduce ->memory_failure()
blk: Introduce ->corrupted_range() for block device
fs: Introduce ->corrupted_range() for superblock
mm, fsdax: Refactor memory-failure handler for dax mapping
mm, pmem: Implement ->memory_failure() in pmem driver
pmem: Implement ->corrupted_range() for pmem driver
dm: Introduce ->rmap() to find bdev offset
md: Implement ->corrupted_range()
xfs: Implement ->corrupted_range() for XFS
fs/dax: Remove useless functions
block/genhd.c | 6 ++
drivers/md/dm-linear.c | 20 ++++
drivers/md/dm.c | 61 +++++++++++
drivers/nvdimm/pmem.c | 44 ++++++++
fs/block_dev.c | 42 +++++++-
fs/dax.c | 63 ++++-------
fs/xfs/xfs_fsops.c | 5 +
fs/xfs/xfs_mount.h | 1 +
fs/xfs/xfs_super.c | 109 +++++++++++++++++++
include/linux/blkdev.h | 2 +
include/linux/dax.h | 1 +
include/linux/device-mapper.h | 5 +
include/linux/fs.h | 2 +
include/linux/genhd.h | 3 +
include/linux/memremap.h | 8 ++
include/linux/mm.h | 9 ++
mm/memory-failure.c | 190 +++++++++++++++++++++++-----------
17 files changed, 466 insertions(+), 105 deletions(-)
--
2.30.0
1 year, 3 months
[PATCH 4.14 15/50] arch/arc: add copy_user_page() to <asm/page.h> to fix build error on ARC
by Greg Kroah-Hartman
From: Randy Dunlap <rdunlap(a)infradead.org>
[ Upstream commit 8a48c0a3360bf2bf4f40c980d0ec216e770e58ee ]
fs/dax.c uses copy_user_page() but ARC does not provide that interface,
resulting in a build error.
Provide copy_user_page() in <asm/page.h>.
../fs/dax.c: In function 'copy_cow_page_dax':
../fs/dax.c:702:2: error: implicit declaration of function 'copy_user_page'; did you mean 'copy_to_user_page'? [-Werror=implicit-function-declaration]
Reported-by: kernel test robot <lkp(a)intel.com>
Signed-off-by: Randy Dunlap <rdunlap(a)infradead.org>
Cc: Vineet Gupta <vgupta(a)synopsys.com>
Cc: linux-snps-arc(a)lists.infradead.org
Cc: Dan Williams <dan.j.williams(a)intel.com>
#Acked-by: Vineet Gupta <vgupta(a)synopsys.com> # v1
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Jan Kara <jack(a)suse.cz>
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-nvdimm(a)lists.01.org
#Reviewed-by: Ira Weiny <ira.weiny(a)intel.com> # v2
Signed-off-by: Vineet Gupta <vgupta(a)synopsys.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
arch/arc/include/asm/page.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h
index 09ddddf71cc50..a70fef79c4055 100644
--- a/arch/arc/include/asm/page.h
+++ b/arch/arc/include/asm/page.h
@@ -13,6 +13,7 @@
#ifndef __ASSEMBLY__
#define clear_page(paddr) memset((paddr), 0, PAGE_SIZE)
+#define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
#define copy_page(to, from) memcpy((to), (from), PAGE_SIZE)
struct vm_area_struct;
--
2.27.0
1 year, 3 months