Re: [RFC Qemu PATCH v2 1/2] spapr: drc: Add support for async hcalls at the drc level
by Greg Kurz
Hi Shiva,
On Mon, 30 Nov 2020 09:16:39 -0600
Shivaprasad G Bhat <sbhat(a)linux.ibm.com> wrote:
> This patch adds support for async hcalls at the DRC level for spapr
> devices, to be used by spapr-scm devices in the patches that follow.
>
> Signed-off-by: Shivaprasad G Bhat <sbhat(a)linux.ibm.com>
> ---
The overall idea looks good but I think you should consider using
a thread pool to implement it. See below.
> hw/ppc/spapr_drc.c | 149 ++++++++++++++++++++++++++++++++++++++++++++
> include/hw/ppc/spapr_drc.h | 25 +++++++
> 2 files changed, 174 insertions(+)
>
> diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
> index 77718cde1f..4ecd04f686 100644
> --- a/hw/ppc/spapr_drc.c
> +++ b/hw/ppc/spapr_drc.c
> @@ -15,6 +15,7 @@
> #include "qapi/qmp/qnull.h"
> #include "cpu.h"
> #include "qemu/cutils.h"
> +#include "qemu/guest-random.h"
> #include "hw/ppc/spapr_drc.h"
> #include "qom/object.h"
> #include "migration/vmstate.h"
> @@ -421,6 +422,148 @@ void spapr_drc_detach(SpaprDrc *drc)
> spapr_drc_release(drc);
> }
>
> +
> +/*
> + * @drc: the device DRC against which the async hcalls are to be made.
> + *
> + * All subsequent requests to run/query the status should use the
> + * unique token returned here.
> + */
> +uint64_t spapr_drc_get_new_async_hcall_token(SpaprDrc *drc)
> +{
> + Error *err = NULL;
> + uint64_t token;
> + SpaprDrcDeviceAsyncHCallState *tmp, *next, *state;
> +
> + state = g_malloc0(sizeof(*state));
> + state->pending = true;
> +
> + qemu_mutex_lock(&drc->async_hcall_states_lock);
> +retry:
> + if (qemu_guest_getrandom(&token, sizeof(token), &err) < 0) {
> + error_report_err(err);
> + g_free(state);
> + qemu_mutex_unlock(&drc->async_hcall_states_lock);
> + return 0;
> + }
> +
> + if (!token) /* Token should be non-zero */
> + goto retry;
> +
> + if (!QLIST_EMPTY(&drc->async_hcall_states)) {
> + QLIST_FOREACH_SAFE(tmp, &drc->async_hcall_states, node, next) {
> + if (tmp->continue_token == token) {
> + /* If the token is already in use, get a new one */
> + goto retry;
> + }
> + }
> + }
> +
> + state->continue_token = token;
> + QLIST_INSERT_HEAD(&drc->async_hcall_states, state, node);
> +
> + qemu_mutex_unlock(&drc->async_hcall_states_lock);
> +
> + return state->continue_token;
> +}
> +
> +static void *spapr_drc_async_hcall_runner(void *opaque)
> +{
> + int response = -1;
> + SpaprDrcDeviceAsyncHCallState *state = opaque;
> +
> + /*
> + * state is freed only after this thread finishes (after pthread_join()),
> + * so don't worry about it becoming NULL.
> + */
> +
> + response = state->func(state->data);
> +
> + state->hcall_ret = response;
> + state->pending = 0;
> +
> + return NULL;
> +}
> +
> +/*
> + * @drc: the device DRC against which the async hcalls are to be made.
> + * @token: the continue token to be used for tracking, as received from
> + * spapr_drc_get_new_async_hcall_token
> + * @func(): the worker function which needs to be executed asynchronously
> + * @data: data to be passed to the asynchronous function. The worker is
> + * expected to free/clean up the data passed here
It'd be cleaner to pass a completion callback and have free/cleanup handled there.
> + */
> +void spapr_drc_run_async_hcall(SpaprDrc *drc, uint64_t token,
> + SpaprDrcAsyncHcallWorkerFunc *func, void *data)
> +{
> + SpaprDrcDeviceAsyncHCallState *state;
> +
> + qemu_mutex_lock(&drc->async_hcall_states_lock);
> + QLIST_FOREACH(state, &drc->async_hcall_states, node) {
> + if (state->continue_token == token) {
> + state->func = func;
> + state->data = data;
> + qemu_thread_create(&state->thread, "sPAPR Async HCALL",
> + spapr_drc_async_hcall_runner, state,
> + QEMU_THREAD_JOINABLE);
qemu_thread_create() exits on failure, so it shouldn't be called on a
guest-triggerable path, e.g. a buggy guest could keep calling it until
pthread_create() returns EAGAIN.
Please use a thread pool (see thread_pool_submit_aio()). This takes care
of all the thread housekeeping for you in a safe way, and it provides a
completion callback API. The implementation could then be just about
having two lists: one for pending requests (fed here) and one for
completed requests (fed by the completion callback).
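Roughly, with a completion callback (untested sketch; the 'drc' back-pointer
and the completed list below don't exist in this patch and would need to be
added):

    static void spapr_drc_async_hcall_cb(void *opaque, int ret)
    {
        SpaprDrcDeviceAsyncHCallState *state = opaque;
        SpaprDrc *drc = state->drc;        /* back-pointer to be added */

        qemu_mutex_lock(&drc->async_hcall_states_lock);
        state->hcall_ret = ret;
        state->pending = false;
        QLIST_REMOVE(state, node);         /* off the pending list */
        QLIST_INSERT_HEAD(&drc->async_hcall_completed, state, node);
        qemu_mutex_unlock(&drc->async_hcall_states_lock);
    }

and then in spapr_drc_run_async_hcall(), instead of qemu_thread_create():

    thread_pool_submit_aio(aio_get_thread_pool(qemu_get_aio_context()),
                           state->func, state->data,
                           spapr_drc_async_hcall_cb, state);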
> + break;
> + }
> + }
> + qemu_mutex_unlock(&drc->async_hcall_states_lock);
> +}
> +
> +/*
> + * spapr_drc_finish_async_hcalls
> + * Waits for all pending async requests to complete
> + * their execution and frees the states
> + */
> +static void spapr_drc_finish_async_hcalls(SpaprDrc *drc)
> +{
> + SpaprDrcDeviceAsyncHCallState *state, *next;
> +
> + if (QLIST_EMPTY(&drc->async_hcall_states)) {
> + return;
> + }
> +
> + qemu_mutex_lock(&drc->async_hcall_states_lock);
> + QLIST_FOREACH_SAFE(state, &drc->async_hcall_states, node, next) {
> + qemu_thread_join(&state->thread);
With a thread-pool, you'd just need to aio_poll() until the pending list
is empty and then clear the completed list.
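i.e. something like (untested):

    qemu_mutex_lock(&drc->async_hcall_states_lock);
    while (!QLIST_EMPTY(&drc->async_hcall_states)) {
        qemu_mutex_unlock(&drc->async_hcall_states_lock);
        aio_poll(qemu_get_aio_context(), true);
        qemu_mutex_lock(&drc->async_hcall_states_lock);
    }
    /* then free everything sitting on the completed list */
    qemu_mutex_unlock(&drc->async_hcall_states_lock);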
> + QLIST_REMOVE(state, node);
> + g_free(state);
> + }
> + qemu_mutex_unlock(&drc->async_hcall_states_lock);
> +}
> +
> +/*
> + * spapr_drc_get_async_hcall_status
> + * Fetches the status of the hcall worker and returns H_BUSY
> + * if the worker is still running.
> + */
> +int spapr_drc_get_async_hcall_status(SpaprDrc *drc, uint64_t token)
> +{
> + int ret = H_PARAMETER;
> + SpaprDrcDeviceAsyncHCallState *state, *node;
> +
> + qemu_mutex_lock(&drc->async_hcall_states_lock);
> + QLIST_FOREACH_SAFE(state, &drc->async_hcall_states, node, node) {
> + if (state->continue_token == token) {
> + if (state->pending) {
> + ret = H_BUSY;
> + break;
> + } else {
> + ret = state->hcall_ret;
> + qemu_thread_join(&state->thread);
Like for qemu_thread_create(), the guest shouldn't be responsible for
thread housekeeping. Getting the hcall status should just be about
finding the token in the pending or completed lists.
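With the pending/completed split suggested above, this boils down to
(untested):

    qemu_mutex_lock(&drc->async_hcall_states_lock);
    QLIST_FOREACH(state, &drc->async_hcall_states, node) {   /* pending */
        if (state->continue_token == token) {
            ret = H_BUSY;
            goto out;
        }
    }
    QLIST_FOREACH(state, &drc->async_hcall_completed, node) {
        if (state->continue_token == token) {
            ret = state->hcall_ret;
            QLIST_REMOVE(state, node);
            g_free(state);
            break;
        }
    }
out:
    qemu_mutex_unlock(&drc->async_hcall_states_lock);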
> + QLIST_REMOVE(state, node);
> + g_free(state);
> + break;
> + }
> + }
> + }
> + qemu_mutex_unlock(&drc->async_hcall_states_lock);
> +
> + return ret;
> +}
> +
> void spapr_drc_reset(SpaprDrc *drc)
> {
> SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> @@ -448,6 +591,7 @@ void spapr_drc_reset(SpaprDrc *drc)
> drc->ccs_offset = -1;
> drc->ccs_depth = -1;
> }
> + spapr_drc_finish_async_hcalls(drc);
> }
>
> static bool spapr_drc_unplug_requested_needed(void *opaque)
> @@ -558,6 +702,7 @@ SpaprDrc *spapr_dr_connector_new(Object *owner, const char *type,
> drc->owner = owner;
> prop_name = g_strdup_printf("dr-connector[%"PRIu32"]",
> spapr_drc_index(drc));
> +
Unrelated change.
> object_property_add_child(owner, prop_name, OBJECT(drc));
> object_unref(OBJECT(drc));
> qdev_realize(DEVICE(drc), NULL, NULL);
> @@ -577,6 +722,10 @@ static void spapr_dr_connector_instance_init(Object *obj)
> object_property_add(obj, "fdt", "struct", prop_get_fdt,
> NULL, NULL, NULL);
> drc->state = drck->empty_state;
> +
> + qemu_mutex_init(&drc->async_hcall_states_lock);
> + QLIST_INIT(&drc->async_hcall_states);
> +
Empty line not needed.
> }
>
> static void spapr_dr_connector_class_init(ObjectClass *k, void *data)
> diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h
> index 165b281496..77f6e4386c 100644
> --- a/include/hw/ppc/spapr_drc.h
> +++ b/include/hw/ppc/spapr_drc.h
> @@ -18,6 +18,7 @@
> #include "sysemu/runstate.h"
> #include "hw/qdev-core.h"
> #include "qapi/error.h"
> +#include "block/thread-pool.h"
>
> #define TYPE_SPAPR_DR_CONNECTOR "spapr-dr-connector"
> #define SPAPR_DR_CONNECTOR_GET_CLASS(obj) \
> @@ -168,6 +169,21 @@ typedef enum {
> SPAPR_DRC_STATE_PHYSICAL_CONFIGURED = 8,
> } SpaprDrcState;
>
> +typedef struct SpaprDrc SpaprDrc;
> +
> +typedef int SpaprDrcAsyncHcallWorkerFunc(void *opaque);
> +typedef struct SpaprDrcDeviceAsyncHCallState {
> + uint64_t continue_token;
> + bool pending;
> +
> + int hcall_ret;
> + SpaprDrcAsyncHcallWorkerFunc *func;
> + void *data;
> +
> + QemuThread thread;
> +
> + QLIST_ENTRY(SpaprDrcDeviceAsyncHCallState) node;
> +} SpaprDrcDeviceAsyncHCallState;
> typedef struct SpaprDrc {
> /*< private >*/
> DeviceState parent;
> @@ -182,6 +198,10 @@ typedef struct SpaprDrc {
> int ccs_offset;
> int ccs_depth;
>
> + /* async hcall states */
> + QemuMutex async_hcall_states_lock;
> + QLIST_HEAD(, SpaprDrcDeviceAsyncHCallState) async_hcall_states;
> +
> /* device pointer, via link property */
> DeviceState *dev;
> bool unplug_requested;
> @@ -241,6 +261,11 @@ void spapr_drc_detach(SpaprDrc *drc);
> /* Returns true if a hot plug/unplug request is pending */
> bool spapr_drc_transient(SpaprDrc *drc);
>
> +uint64_t spapr_drc_get_new_async_hcall_token(SpaprDrc *drc);
> +void spapr_drc_run_async_hcall(SpaprDrc *drc, uint64_t token,
> + SpaprDrcAsyncHcallWorkerFunc, void *data);
> +int spapr_drc_get_async_hcall_status(SpaprDrc *drc, uint64_t token);
> +
> static inline bool spapr_drc_unplug_requested(SpaprDrc *drc)
> {
> return drc->unplug_requested;
>
>
>
[PATCH v2 0/4] Remove nrexceptional tracking
by Matthew Wilcox (Oracle)
We actually use nrexceptional for very little these days. It's a minor
pain to keep in sync with nrpages, but the pain becomes much bigger
with the THP patches because we don't know how many indices a shadow
entry occupies. It's easier to just remove it than keep it accurate.
Also, we save 8 bytes per inode which is nothing to sneeze at; on my
laptop, it would improve shmem_inode_cache from 22 to 23 objects per
16kB, and inode_cache from 26 to 27 objects. Together, that saves
a megabyte of memory out of a combined usage of 25MB for both caches.
Unfortunately, ext4 doesn't cross a magic boundary, so it doesn't save
any memory for ext4.
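For reference, the check that replaces the nrexceptional/nrpages pair is
essentially a one-liner over the XArray (sketch of patch 1's helper; see
the patch for the final form):

	static inline bool mapping_empty(struct address_space *mapping)
	{
		return xa_empty(&mapping->i_pages);
	}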
Matthew Wilcox (Oracle) (4):
mm: Introduce and use mapping_empty
mm: Stop accounting shadow entries
dax: Account DAX entries as nrpages
mm: Remove nrexceptional from inode
fs/block_dev.c | 2 +-
fs/dax.c | 8 ++++----
fs/gfs2/glock.c | 3 +--
fs/inode.c | 2 +-
include/linux/fs.h | 2 --
include/linux/pagemap.h | 5 +++++
mm/filemap.c | 16 ----------------
mm/swap_state.c | 4 ----
mm/truncate.c | 19 +++----------------
mm/workingset.c | 1 -
10 files changed, 15 insertions(+), 47 deletions(-)
--
2.28.0
[RFC 0/2] virtio-pmem: Asynchronous flush
by Pankaj Gupta
Jeff reported a preflush ordering issue with the existing implementation
of virtio pmem preflush. Dan suggested[1] implementing asynchronous flush
for virtio pmem using a work queue, as done in md/RAID. This patch series
intends to solve the preflush ordering issue and also makes the flush
asynchronous from the submitting thread's POV.
Submitting this patch series for feedback; it is still a work in progress.
I have done basic testing and am currently doing more.
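To sketch the shape of the idea (illustrative only -- the names and the
device context below are placeholders, not the actual patch):

	/* defer the host flush off the submitting thread, md/RAID style */
	struct vpmem_flush_req {
		struct work_struct work;
		struct bio *bio;		/* preflush bio, completed later */
		struct virtio_pmem *vpmem;	/* hypothetical device context */
	};

	static void vpmem_do_flush(struct work_struct *work)
	{
		struct vpmem_flush_req *req =
			container_of(work, struct vpmem_flush_req, work);

		/* ... issue the flush virtqueue command and wait here ... */
		bio_endio(req->bio);	/* complete the preflush only now */
		kfree(req);
	}

	static int vpmem_submit_flush(struct virtio_pmem *vpmem, struct bio *bio)
	{
		struct vpmem_flush_req *req = kmalloc(sizeof(*req), GFP_ATOMIC);

		if (!req)
			return -ENOMEM;
		req->bio = bio;
		req->vpmem = vpmem;
		INIT_WORK(&req->work, vpmem_do_flush);
		queue_work(system_wq, &req->work);	/* returns immediately */
		return 0;
	}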
Pankaj Gupta (2):
pmem: make nvdimm_flush asynchronous
virtio_pmem: Async virtio-pmem flush
drivers/nvdimm/nd_virtio.c | 66 ++++++++++++++++++++++++++----------
drivers/nvdimm/pmem.c | 15 ++++----
drivers/nvdimm/region_devs.c | 3 +-
drivers/nvdimm/virtio_pmem.c | 9 +++++
drivers/nvdimm/virtio_pmem.h | 12 +++++++
5 files changed, 78 insertions(+), 27 deletions(-)
[1] https://marc.info/?l=linux-kernel&m=157446316409937&w=2
--
2.20.1
[PATCH RFC 0/9] mm, sparse-vmemmap: Introduce compound pagemaps
by Joao Martins
Hey,
This small series attempts to minimize 'struct page' overhead by
pursuing an approach similar to Muchun Song's series "Free some vmemmap
pages of hugetlb page"[0], but applied to devmap/ZONE_DEVICE.
[0] https://lore.kernel.org/linux-mm/20201130151838.11208-1-songmuchun@byteda...
The link above describes it quite nicely, but the idea is to reuse tail
page vmemmap areas, in particular the areas which only describe tail pages.
A vmemmap page describes 64 struct pages, so the first vmemmap page for a
given ZONE_DEVICE range contains the head page and 63 tail pages, while the
second vmemmap page contains only tail pages; that second page is what gets
reused across the rest of the subsection/section. For example, a 2M hugepage
spans 512 struct pages, i.e. 8 vmemmap pages, and the 6 tail-only pages
beyond the second can all be mapped to the same physical page. The bigger
the page size, the bigger the savings (2M hpage -> save 6 vmemmap pages;
1G hpage -> save 4094 vmemmap pages).
In terms of savings, per 1Tb of memory, the struct page cost would go down
with compound pagemap:
* with 2M pages we lose 4G instead of 16G (0.39% instead of 1.5% of total memory)
* with 1G pages we lose 8MB instead of 16G (0.0007% instead of 1.5% of total memory)
Along the way I've extended it past 'struct page' overhead, *trying* to
address a few performance issues we knew about for pmem, specifically in
the {pin,get}_user_pages* function family with device-dax vmas, which is
really slow even for the fast variants. THP is great on the -fast variants,
but everything except hugetlbfs performs rather poorly on non-fast gup.
So to summarize what the series does:
Patches 1-5: Much like Muchun's series, we reuse tail page areas across a
given page size (namely @align, as it is referred to by the rest of the
memremap/dax code) and enable memremap to initialize the ZONE_DEVICE pages
as compound pages of a given @align order. The main difference, though, is
that contrary to the hugetlbfs series there is no vmemmap for the area yet,
because we are onlining it. IOW there is no freeing of pages of an already
initialized vmemmap as in the hugetlbfs case, which simplifies the logic
(besides not being arch-specific). After these patches, region bootstrap of
the pmem memmap is visibly faster given that we initialize fewer struct
pages depending on the page size.
NVDIMM namespace bootstrap improves from ~750ms to ~190ms/<=1ms on emulated
NVDIMMs with 2M and 1G respectively. The net gain is observed in similar
proportion when running on actual NVDIMMs.
Patches 6-8: Optimize grabbing/releasing the page refcount given that we
are working with compound pages, i.e. we do 1 increment/decrement on the
head page for a given set of N subpages, as opposed to N individual writes.
{get,pin}_user_pages_fast() for zone_device with a compound pagemap
consequently improves considerably, and unpin_user_pages() improves as well
when passed a set of consecutive pages:

                                          before         after
 (get_user_pages_fast 1G;2M page size)   ~75k us   ->   ~3.2k us ; ~5.2k us
 (pin_user_pages_fast 1G;2M page size)   ~125k us  ->   ~3.4k us ; ~5.5k us
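The batching itself boils down to taking a single reference on the head
page that covers all N subpages (sketch -- the helper name here is
illustrative, not necessarily what the patches use):

	static struct page *grab_compound_head(struct page *page, int refs)
	{
		struct page *head = compound_head(page);

		/* one atomic add covering @refs subpages, not @refs ops */
		if (!page_ref_add_unless(head, refs, 0))
			return NULL;
		return head;
	}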
The RDMA patch (patch 8/9) is to demonstrate the improvement for an existing
user. For unpin_user_pages() we have an additional test to demonstrate the
improvement. The test performs MR reg/unreg continuously, measuring the
rate over a given period. So essentially ib_mem_get and ib_mem_release are
being stress tested, which at the end of the day means
pin_user_pages_longterm() and unpin_user_pages() for a scatterlist:
Before:
159 rounds in 5.027 sec: 31617.923 usec / round (device-dax)
466 rounds in 5.009 sec: 10748.456 usec / round (hugetlbfs)
After:
305 rounds in 5.010 sec: 16426.047 usec / round (device-dax)
1073 rounds in 5.004 sec: 4663.622 usec / round (hugetlbfs)
Patch 9: Improves {pin,get}_user_pages() and its longterm counterpart. It
is very experimental, and I imported most of follow_hugetlb_page(), except
that we do the same trick as gup-fast. In doing the patch I feel this batching
should live in follow_page_mask(), with that function changed to return a set
of pages/something-else when walking over PMDs/PUDs for THP / devmap pages.
This patch then brings the previous mr reg/unreg test (above) to parity
between device-dax and hugetlbfs.
Some of the patches are a little fresh/WIP (especially patches 3 and 9) and we are
still running tests. Hence the RFC, asking for comments and general direction
of the work before continuing.
Patches apply on top of linux-next tag next-20201208 (commit a9e26cb5f261).
Comments and suggestions very much appreciated!
Thanks,
Joao
Joao Martins (9):
memremap: add ZONE_DEVICE support for compound pages
sparse-vmemmap: Consolidate arguments in vmemmap section populate
sparse-vmemmap: Reuse vmemmap areas for a given page size
mm/page_alloc: Reuse tail struct pages for compound pagemaps
device-dax: Compound pagemap support
mm/gup: Grab head page refcount once for group of subpages
mm/gup: Decrement head page once for group of subpages
RDMA/umem: batch page unpin in __ib_mem_release()
mm: Add follow_devmap_page() for devdax vmas
drivers/dax/device.c | 54 ++++++---
drivers/infiniband/core/umem.c | 25 +++-
include/linux/huge_mm.h | 4 +
include/linux/memory_hotplug.h | 16 ++-
include/linux/memremap.h | 2 +
include/linux/mm.h | 6 +-
mm/gup.c | 130 ++++++++++++++++-----
mm/huge_memory.c | 202 +++++++++++++++++++++++++++++++++
mm/memory_hotplug.c | 13 ++-
mm/memremap.c | 13 ++-
mm/page_alloc.c | 28 ++++-
mm/sparse-vmemmap.c | 97 +++++++++++++---
mm/sparse.c | 16 +--
13 files changed, 531 insertions(+), 75 deletions(-)
--
2.17.1
[0/7] PMEM device emulation without nfit depenency
by Santosh Sivaraj
The current test module cannot be used for testing platforms (make check)
that do not have support for NFIT. In order to get the ndctl tests working,
we need a module which can emulate NVDIMM devices without relying on
ACPI/NFIT.
The emulated PMEM device is made part of the PAPR family.
Corresponding changes for ndctl is also required, to add attributes needed
for the test, which will be sent as a reply to this patch.
None of the tests passed on PAPR before; now 16 tests pass. Error
injection tests and SMART are not yet implemented.
Santosh Sivaraj (7):
testing/nvdimm: Add test module for non-nfit platforms
ndtest: Add compatibility string to treat it as PAPR family
ndtest: Add dimms to the two buses
ndtest: Add dimm attributes
ndtest: Add regions and mappings to the test buses
ndtest: Add nvdimm control functions
ndtest: Add papr health related flags
tools/testing/nvdimm/config_check.c | 3 +-
tools/testing/nvdimm/test/Kbuild | 6 +-
tools/testing/nvdimm/test/ndtest.c | 1138 +++++++++++++++++++++++++++
tools/testing/nvdimm/test/ndtest.h | 109 +++
4 files changed, 1254 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/nvdimm/test/ndtest.c
create mode 100644 tools/testing/nvdimm/test/ndtest.h
--
2.26.2
[PATCH] dax: fix default return code of range_parse()
by Shiyang Ruan
The return value of range_parse() indicates the size when it is
positive. The error code should be negative.
Signed-off-by: Shiyang Ruan <ruansy.fnst(a)cn.fujitsu.com>
---
drivers/dax/bus.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 737b207c9e30..3003558c1a8b 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -1038,7 +1038,7 @@ static ssize_t range_parse(const char *opt, size_t len, struct range *range)
{
unsigned long long addr = 0;
char *start, *end, *str;
- ssize_t rc = EINVAL;
+ ssize_t rc = -EINVAL;
str = kstrdup(opt, GFP_KERNEL);
if (!str)
--
2.30.0
[PATCH ndctl] dimm: re-fix potential fd leakage in dimm_action()
by Michal Suchanek
There are cases not covered by the original fix, as well as new cases
added by a later patch.
Also, one usage_with_options() call was added without returning from
the function afterwards.
Fixes: ff434d87ccbd ("dimm: fix potential fd leakage in dimm_action()")
Fixes: 41a7e24af5db ("ndctl/dimm: Auto-arm firmware activation")
Signed-off-by: Michal Suchanek <msuchanek(a)suse.de>
---
ndctl/dimm.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/ndctl/dimm.c b/ndctl/dimm.c
index 09ce49e1d2ca..1c177b5494ec 100644
--- a/ndctl/dimm.c
+++ b/ndctl/dimm.c
@@ -1333,12 +1333,15 @@ static int dimm_action(int argc, const char **argv, struct ndctl_ctx *ctx,
if (param.arm_set && param.disarm_set) {
fprintf(stderr, "set either --arm, or --disarm, not both\n");
usage_with_options(u, options);
+ rc = -EINVAL;
+ goto out_close_fout;
}
if (param.disarm_set && !param.disarm) {
fprintf(stderr, "--no-disarm syntax not supported\n");
usage_with_options(u, options);
- return -EINVAL;
+ rc = -EINVAL;
+ goto out_close_fout;
}
if (!param.infile) {
@@ -1351,13 +1354,15 @@ static int dimm_action(int argc, const char **argv, struct ndctl_ctx *ctx,
if (!param.arm_set && !param.disarm_set) {
fprintf(stderr, "require --arm, or --disarm\n");
usage_with_options(u, options);
- return -EINVAL;
+ rc = -EINVAL;
+ goto out_close_fout;
}
if (param.arm_set && !param.arm) {
fprintf(stderr, "--no-arm syntax not supported\n");
usage_with_options(u, options);
- return -EINVAL;
+ rc = -EINVAL;
+ goto out_close_fout;
}
}
actx.f_in = stdin;
@@ -1425,7 +1430,8 @@ static int dimm_action(int argc, const char **argv, struct ndctl_ctx *ctx,
if (count > 1) {
error("write-labels only supports writing a single dimm\n");
usage_with_options(u, options);
- return -EINVAL;
+ rc = -EINVAL;
+ goto out_close_fin_fout;
} else if (single)
rc = action(single, &actx);
}
--
2.26.2
[PATCH 00/14] CXL 2.0 Support
by Ben Widawsky
# Changes since RFC v3 [1]
* Added error message when payload size is too small. (Ben)
* Fix includes in UAPI for Clang (LKP)
* Reorder CXL in MAINTAINERS (Joe Perches)
* Kconfig whitespace and spelling fixes (Randy)
* Remove excess frees controlled by devm, introduced in v3 (Jonathan, Dan)
* Use 'PCI Express' instead of 'PCI-E' in Kconfig (Jonathan)
* Fail when mailbox commands return value is an error (Jonathan)
* Add comment to mailbox protocol to explain ordering of operations
(Jonathan, Ben)
* Fail mailbox xfer when doorbell is busy. (Jonathan)
* Remove extraneous SHIFT defines. (Jonathan)
* Change kdocs for mbox_cmd size_out to output only. (Jonathan)
* Fix transient bug (ENOTTY) in CXL_MEM_QUERY_COMMANDS (Jonathan)
* Add some comments and code beautification to mbox commands (Jonathan)
* Add some comments and code beautification to user commands (Jonathan)
* Fix bogus check of memcpy return value (Ben)
* Add concept of blocking certain RAW opcodes (Dan)
* Add debugfs knob to allow all RAW opcodes (Vishal)
* Move docs to driver-api/ (Dan)
* Use bounce buffer again like in v2 (Jonathan)
* Use kvzalloc instead of memdup (Ben)
* Wordsmith some changelogs and documentation (Dan)
* Use a percpu_ref counter to protect devm allocated data in the ioctl path
(Dan)
* Rework cdev registration and lookup to use inode->i_cdev (Dan)
* Drop mutex_lock_interruptible() from ioctl path (Dan)
* Convert add_taint() to WARN_TAINT_ONCE()
* Drop ACPI coordination for pure mailbox driver milestone (Dan)
* Permit GET_LOG with CEL_UUID (Ben)
* Cover letter overhaul (Ben)
* Use info.id instead of CXL_COMMAND_INDEX (Dan)
* Add several new commands to the mailbox interface (Ben)
---
In addition to the mailing list, please feel free to use #cxl on oftc IRC for
discussion.
---
# Summary
Introduce support for “type-3” memory devices defined in the Compute Express
Link (CXL) 2.0 specification [2]. Specifically, these are the memory devices
defined by section 8.2.8.5 of the CXL 2.0 spec. A reference implementation
emulating these devices has been submitted to the QEMU mailing list [3] and is
available on gitlab [4], but will move to a shared tree on kernel.org after
initial acceptance. “Type-3” is a CXL device that acts as a memory expander for
RAM or Persistent Memory. The device might be interleaved with other CXL devices
in a given physical address range.
In addition to the core functionality of discovering the spec defined registers
and resources, introduce a CXL device model that will be the foundation for
translating CXL capabilities into existing Linux infrastructure for Persistent
Memory and other memory devices. For now, this only includes support for the
management command mailbox and the surfacing of type-3 devices. These control
devices fill the role of “DIMMs” / nmemX memory-devices in LIBNVDIMM terms.
## Userspace Interaction
Interaction with the driver and type-3 devices via the CXL drivers is introduced
in this patch series and considered stable ABI. They include
* sysfs - Documentation/ABI/testing/sysfs-bus-cxl
* IOCTL - Documentation/driver-api/cxl/memory-devices.rst
* debugfs - Documentation/ABI/testing/debugfs-cxl
Work is in progress to add support for CXL interactions to the ndctl project [5]
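As a rough illustration of the IOCTL flow (the structure and macro names
follow this series' include/uapi/linux/cxl_mem.h, but treat the details as
provisional while the ABI is under review):

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/cxl_mem.h>

	int main(void)
	{
		struct cxl_mem_query_commands probe = { 0 }, *q;
		int fd = open("/dev/cxl/mem0", O_RDWR);

		/* a zero n_commands asks the driver how many it supports */
		if (fd < 0 || ioctl(fd, CXL_MEM_QUERY_COMMANDS, &probe) < 0)
			return 1;
		q = calloc(1, sizeof(*q) +
			   probe.n_commands * sizeof(q->commands[0]));
		if (!q)
			return 1;
		q->n_commands = probe.n_commands;
		if (ioctl(fd, CXL_MEM_QUERY_COMMANDS, q) == 0) {
			for (__u32 i = 0; i < q->n_commands; i++)
				printf("command %u: size_in %d size_out %d\n",
				       q->commands[i].id,
				       q->commands[i].size_in,
				       q->commands[i].size_out);
		}
		free(q);
		return 0;
	}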
### Development plans
One of the unique challenges that CXL imposes on the Linux driver model is that
it requires the operating system to perform physical address space management
interleaved across devices and bridges. Whereas LIBNVDIMM handles a list of
established static persistent memory address ranges (for example from the ACPI
NFIT), CXL introduces hotplug and the concept of allocating address space to
instantiate persistent memory ranges. This is similar to PCI in the sense that
the platform establishes the MMIO range for PCI BARs to be allocated, but it is
significantly complicated by the fact that a given device can optionally be
interleaved with other devices and can participate in several interleave-sets at
once. LIBNVDIMM handled something like this with the aliasing between PMEM and
BLOCK-WINDOW mode, but CXL adds flexibility to alias DEVICE MEMORY through up to
10 decoders per device.
All of the above needs to be enabled with respect to PCI hotplug events on
Type-3 memory devices, which need hooks to determine if a given device is
contributing to a "System RAM" address range that cannot be unplugged. In
other words CXL ties PCI hotplug to Memory Hotplug and PCI hotplug needs to be
able to negotiate with memory hotplug. In the medium term the implications of
CXL hotplug vs ACPI SRAT/SLIT/HMAT need to be reconciled. One capability that
seems to be needed is either the dynamic allocation of new memory nodes, or
default initializing extra pgdat instances beyond what is enumerated in ACPI
SRAT to accommodate hot-added CXL memory.
Patches welcome, questions welcome as the development effort on the post v5.12
capabilities proceeds.
## Running in QEMU
The incantation to get CXL support in QEMU [4] is considered unstable at this
time. Future readers of this cover letter should verify if any changes are
needed. For the novice QEMU user, the following can be copy/pasted into a
working QEMU commandline. It is enough to make the simplest topology possible.
The topology would consist of a single memory window, single type3 device,
single root port, and single host bridge.
+-------------+
| CXL PXB |
| |
| +-------+ |<----------+
| |CXL RP | | |
+--+-------+--+ v
| +----------+
| | "window" |
| +----------+
v ^
+-------------+ |
| CXL Type 3 | |
| Device |<----------+
+-------------+
// Memory backend
-object memory-backend-file,id=cxl-mem1,share,mem-path=cxl-type3,size=512M
// Host Bridge
-device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52,uid=0,len-window-base=1,window-base[0]=0x4c0000000,memdev[0]=cxl-mem1
// Single root port
-device cxl-rp,id=rp0,bus=cxl.0,addr=0.0,chassis=0,slot=0,memdev=cxl-mem1
// Single type3 device
-device cxl-type3,bus=rp0,memdev=cxl-mem1,id=cxl-pmem0,size=256M
---
[1]: https://lore.kernel.org/linux-cxl/20201209002418.1976362-1-ben.widawsky@i...
[2]: https://www.computeexpresslink.org/](https://www.computeexpresslink.org/
[3]: https://lore.kernel.org/qemu-devel/20210105165323.783725-1-ben.widawsky@i...
[4]: https://gitlab.com/bwidawsk/qemu/-/tree/cxl-2.0v*
[5]: https://github.com/pmem/ndctl/tree/cxl-2.0v*
Ben Widawsky (12):
cxl/mem: Map memory device registers
cxl/mem: Find device capabilities
cxl/mem: Implement polled mode mailbox
cxl/mem: Add basic IOCTL interface
cxl/mem: Add send command
taint: add taint for direct hardware access
cxl/mem: Add a "RAW" send command
cxl/mem: Create concept of enabled commands
cxl/mem: Use CEL for enabling commands
cxl/mem: Add set of informational commands
cxl/mem: Add limited Get Log command (0401h)
MAINTAINERS: Add maintainers of the CXL driver
Dan Williams (2):
cxl/mem: Introduce a driver for CXL-2.0-Type-3 endpoints
cxl/mem: Register CXL memX devices
.clang-format | 1 +
Documentation/ABI/testing/debugfs-cxl | 10 +
Documentation/ABI/testing/sysfs-bus-cxl | 26 +
Documentation/admin-guide/sysctl/kernel.rst | 1 +
Documentation/admin-guide/tainted-kernels.rst | 6 +-
Documentation/driver-api/cxl/index.rst | 12 +
.../driver-api/cxl/memory-devices.rst | 46 +
Documentation/driver-api/index.rst | 1 +
.../userspace-api/ioctl/ioctl-number.rst | 1 +
MAINTAINERS | 11 +
drivers/Kconfig | 1 +
drivers/Makefile | 1 +
drivers/base/core.c | 14 +
drivers/cxl/Kconfig | 49 +
drivers/cxl/Makefile | 7 +
drivers/cxl/bus.c | 29 +
drivers/cxl/cxl.h | 140 ++
drivers/cxl/mem.c | 1603 +++++++++++++++++
drivers/cxl/pci.h | 34 +
include/linux/device.h | 1 +
include/linux/kernel.h | 3 +-
include/uapi/linux/cxl_mem.h | 180 ++
kernel/panic.c | 1 +
23 files changed, 2176 insertions(+), 2 deletions(-)
create mode 100644 Documentation/ABI/testing/debugfs-cxl
create mode 100644 Documentation/ABI/testing/sysfs-bus-cxl
create mode 100644 Documentation/driver-api/cxl/index.rst
create mode 100644 Documentation/driver-api/cxl/memory-devices.rst
create mode 100644 drivers/cxl/Kconfig
create mode 100644 drivers/cxl/Makefile
create mode 100644 drivers/cxl/bus.c
create mode 100644 drivers/cxl/cxl.h
create mode 100644 drivers/cxl/mem.c
create mode 100644 drivers/cxl/pci.h
create mode 100644 include/uapi/linux/cxl_mem.h
--
2.30.0
[PATCH v16 00/11] mm: introduce memfd_secret system call to create "secret" memory areas
by Mike Rapoport
From: Mike Rapoport <rppt(a)linux.ibm.com>
Hi,
@Andrew, this is based on v5.11-rc4-mmots-2021-01-19-13-54 with secretmem
patches dropped from there, I can rebase whatever way you prefer.
This is an implementation of "secret" mappings backed by a file descriptor.
The file descriptor backing secret memory mappings is created using a
dedicated memfd_secret system call The desired protection mode for the
memory is configured using flags parameter of the system call. The mmap()
of the file descriptor created with memfd_secret() will create a "secret"
memory mapping. The pages in that mapping will be marked as not present in
the direct map and will be present only in the page table of the owning mm.
Although normally Linux userspace mappings are protected from other users,
such secret mappings are useful for environments where a hostile tenant is
trying to trick the kernel into giving them access to other tenants'
mappings.
Additionally, in the future the secret mappings may be used as a means to
protect guest memory in a virtual machine host.
For demonstration of secret memory usage we've created a userspace library
https://git.kernel.org/pub/scm/linux/kernel/git/jejb/secret-memory-preloa...
that does two things: the first is act as a preloader for openssl to
redirect all the OPENSSL_malloc calls to secret memory meaning any secret
keys get automatically protected this way and the other thing it does is
expose the API to the user who needs it. We anticipate that a lot of the
use cases would be like the openssl one: many toolkits that deal with
secret keys already have special handling for the memory to try to give
them greater protection, so this would simply be pluggable into the
toolkits without any need for user application modification.
Hiding secret memory mappings behind an anonymous file allows (ab)use of
the page cache for tracking pages allocated for the "secret" mappings as
well as using address_space_operations for e.g. page migration callbacks.
The anonymous file may be also used implicitly, like hugetlb files, to
implement mmap(MAP_SECRET) and use the secret memory areas with "native" mm
ABIs in the future.
To limit fragmentation of the direct map to splitting only PUD-size pages,
I've added an amortizing cache of PMD-size pages to each file descriptor
that is used as an allocation pool for the secret memory areas.
As the memory allocated by secretmem becomes unmovable, we use CMA to back
large page caches so that the page allocator won't be surprised by a failing
attempt to migrate these pages.
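For illustration, userspace consumption looks roughly like this (a minimal
sketch; the syscall number below is an assumption for kernels carrying this
series, and flags are simply 0):

	#include <string.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>

	#ifndef __NR_memfd_secret
	#define __NR_memfd_secret 447	/* assumed; check your kernel headers */
	#endif

	int main(void)
	{
		void *p;
		int fd = syscall(__NR_memfd_secret, 0);

		if (fd < 0)
			return 1;
		if (ftruncate(fd, 4096) < 0)	/* size the backing file */
			return 1;
		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
			 fd, 0);
		if (p == MAP_FAILED)
			return 1;
		/* these pages are now absent from the kernel direct map */
		memcpy(p, "secret key material", 20);
		munmap(p, 4096);
		close(fd);
		return 0;
	}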
v16:
* Fix memory leak introduced in v15
* Clean the data left from previous page user before handing the page to
the userspace
v15: https://lore.kernel.org/lkml/20210120180612.1058-1-rppt@kernel.org
* Add riscv/Kconfig update to disable set_memory operations for nommu
builds (patch 3)
* Update the code around add_to_page_cache() per Matthew's comments
(patches 6,7)
* Add fixups for build/checkpatch errors discovered by CI systems
v14: https://lore.kernel.org/lkml/20201203062949.5484-1-rppt@kernel.org
* Finally s/mod_node_page_state/mod_lruvec_page_state/
v13: https://lore.kernel.org/lkml/20201201074559.27742-1-rppt@kernel.org
* Added Reviewed-by, thanks Catalin and David
* s/mod_node_page_state/mod_lruvec_page_state/ as Shakeel suggested
v12: https://lore.kernel.org/lkml/20201125092208.12544-1-rppt@kernel.org
* Add detection of whether set_direct_map has actual effect on arm64 and bail
out of CMA allocation for secretmem and the memfd_secret() syscall if pages
would not be removed from the direct map
Older history:
v11: https://lore.kernel.org/lkml/20201124092556.12009-1-rppt@kernel.org
v10: https://lore.kernel.org/lkml/20201123095432.5860-1-rppt@kernel.org
v9: https://lore.kernel.org/lkml/20201117162932.13649-1-rppt@kernel.org
v8: https://lore.kernel.org/lkml/20201110151444.20662-1-rppt@kernel.org
v7: https://lore.kernel.org/lkml/20201026083752.13267-1-rppt@kernel.org
v6: https://lore.kernel.org/lkml/20200924132904.1391-1-rppt@kernel.org
v5: https://lore.kernel.org/lkml/20200916073539.3552-1-rppt@kernel.org
v4: https://lore.kernel.org/lkml/20200818141554.13945-1-rppt@kernel.org
v3: https://lore.kernel.org/lkml/20200804095035.18778-1-rppt@kernel.org
v2: https://lore.kernel.org/lkml/20200727162935.31714-1-rppt@kernel.org
v1: https://lore.kernel.org/lkml/20200720092435.17469-1-rppt@kernel.org
Mike Rapoport (11):
mm: add definition of PMD_PAGE_ORDER
mmap: make mlock_future_check() global
riscv/Kconfig: make direct map manipulation options depend on MMU
set_memory: allow set_direct_map_*_noflush() for multiple pages
set_memory: allow querying whether set_direct_map_*() is actually enabled
mm: introduce memfd_secret system call to create "secret" memory areas
secretmem: use PMD-size pages to amortize direct map fragmentation
secretmem: add memcg accounting
PM: hibernate: disable when there are active secretmem users
arch, mm: wire up memfd_secret system call where relevant
secretmem: test: add basic selftest for memfd_secret(2)
arch/arm64/include/asm/Kbuild | 1 -
arch/arm64/include/asm/cacheflush.h | 6 -
arch/arm64/include/asm/set_memory.h | 17 +
arch/arm64/include/uapi/asm/unistd.h | 1 +
arch/arm64/kernel/machine_kexec.c | 1 +
arch/arm64/mm/mmu.c | 6 +-
arch/arm64/mm/pageattr.c | 23 +-
arch/riscv/Kconfig | 4 +-
arch/riscv/include/asm/set_memory.h | 4 +-
arch/riscv/include/asm/unistd.h | 1 +
arch/riscv/mm/pageattr.c | 8 +-
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
arch/x86/include/asm/set_memory.h | 4 +-
arch/x86/mm/pat/set_memory.c | 8 +-
fs/dax.c | 11 +-
include/linux/pgtable.h | 3 +
include/linux/secretmem.h | 30 ++
include/linux/set_memory.h | 16 +-
include/linux/syscalls.h | 1 +
include/uapi/asm-generic/unistd.h | 6 +-
include/uapi/linux/magic.h | 1 +
kernel/power/hibernate.c | 5 +-
kernel/power/snapshot.c | 4 +-
kernel/sys_ni.c | 2 +
mm/Kconfig | 5 +
mm/Makefile | 1 +
mm/filemap.c | 3 +-
mm/gup.c | 10 +
mm/internal.h | 3 +
mm/mmap.c | 5 +-
mm/secretmem.c | 451 ++++++++++++++++++++++
mm/vmalloc.c | 5 +-
scripts/checksyscalls.sh | 4 +
tools/testing/selftests/vm/.gitignore | 1 +
tools/testing/selftests/vm/Makefile | 3 +-
tools/testing/selftests/vm/memfd_secret.c | 296 ++++++++++++++
tools/testing/selftests/vm/run_vmtests | 17 +
38 files changed, 917 insertions(+), 52 deletions(-)
create mode 100644 arch/arm64/include/asm/set_memory.h
create mode 100644 include/linux/secretmem.h
create mode 100644 mm/secretmem.c
create mode 100644 tools/testing/selftests/vm/memfd_secret.c
--
2.28.0
[PATCH RESEND v2 00/10] fsdax: introduce fs query to support reflink
by Shiyang Ruan
This patchset aims to support shared page tracking for fsdax.
Resend V2:
- Cc dm-devel instead of linux-raid
Change from V1:
- Add the old memory-failure handler back for rolling back
- Add callback in MD's ->rmap() to support multiple mapping of dm device
- Add judgement for CONFIG_SYSFS
- Add pfn_valid() judgement in hwpoison_filter()
- Rebased to v5.11-rc5
Change from RFC v3:
- Do not lock dax entry in memory failure handler
- Add a helper function for corrupted_range
- Add restrictions in xfs code
- Fix code style
- remove the useless association and lock in fsdax
Change from RFC v2:
- Adjust the order of patches
- Divide the infrastructure and the drivers that use it
- Rebased to v5.10
Change from RFC v1:
- Introduce ->block_lost() for block device
- Support mapped device
- Add 'not available' warning for realtime device in XFS
- Rebased to v5.10-rc1
This patchset moves owner tracking from dax_associate_entry() to the pmem
device driver, by introducing an interface ->memory_failure() on struct
dev_pagemap. This interface is called by memory_failure() in mm, and
implemented by the pmem device. The pmem device then calls its
->corrupted_range() to find the filesystem in which the corrupted data is
located, and calls the filesystem handler to track files or metadata
associated with this page. Finally we are able to try to fix the corrupted
data in the filesystem and do other necessary processing, such as killing
the processes that are using the affected files.
The call trace is like this:
memory_failure()
 pgmap->ops->memory_failure()      => pmem_pgmap_memory_failure()
  gendisk->fops->corrupted_range() => - pmem_corrupted_range()
                                      - md_blk_corrupted_range()
   sb->s_ops->corrupted_range()    => xfs_fs_corrupted_range()
    xfs_rmap_query_range()
     xfs_corrupt_helper()
      * corrupted on metadata
          try to recover data, call xfs_force_shutdown()
      * corrupted on file data
          try to recover data, call mf_dax_mapping_kill_procs()
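To make the hand-off concrete, the pmem end of that trace could look
roughly like this (a sketch inferred from the call trace above; the
signatures and the pmem_device fields used here are assumptions, not the
literal patches):

	static int pmem_pgmap_memory_failure(struct dev_pagemap *pgmap,
					     unsigned long pfn, int flags)
	{
		struct pmem_device *pmem =
			container_of(pgmap, struct pmem_device, pgmap);
		/* translate the poisoned pfn into a device-relative offset */
		loff_t offset = PFN_PHYS(pfn) - pmem->phys_addr
				- pmem->data_offset;

		/* let the block device resolve the owning filesystem;
		 * assumes pmem_device keeps a gendisk pointer */
		return pmem->disk->fops->corrupted_range(pmem->disk, NULL,
							 offset, PAGE_SIZE,
							 &flags);
	}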
The fsdax & reflink support for XFS is not contained in this patchset.
(Rebased on v5.11-rc5)
Shiyang Ruan (10):
pagemap: Introduce ->memory_failure()
blk: Introduce ->corrupted_range() for block device
fs: Introduce ->corrupted_range() for superblock
mm, fsdax: Refactor memory-failure handler for dax mapping
mm, pmem: Implement ->memory_failure() in pmem driver
pmem: Implement ->corrupted_range() for pmem driver
dm: Introduce ->rmap() to find bdev offset
md: Implement ->corrupted_range()
xfs: Implement ->corrupted_range() for XFS
fs/dax: Remove useless functions
block/genhd.c | 6 ++
drivers/md/dm-linear.c | 20 ++++
drivers/md/dm.c | 61 +++++++++++
drivers/nvdimm/pmem.c | 44 ++++++++
fs/block_dev.c | 42 +++++++-
fs/dax.c | 63 ++++-------
fs/xfs/xfs_fsops.c | 5 +
fs/xfs/xfs_mount.h | 1 +
fs/xfs/xfs_super.c | 109 +++++++++++++++++++
include/linux/blkdev.h | 2 +
include/linux/dax.h | 1 +
include/linux/device-mapper.h | 5 +
include/linux/fs.h | 2 +
include/linux/genhd.h | 3 +
include/linux/memremap.h | 8 ++
include/linux/mm.h | 9 ++
mm/memory-failure.c | 190 +++++++++++++++++++++++-----------
17 files changed, 466 insertions(+), 105 deletions(-)
--
2.30.0