[ndctl PATCH] ndctl: hide null uuids
by Dan Williams
Clean up the namespace listing to hide the 'raw_uuid' field when it is
zero.
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
---
util/json.c | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/util/json.c b/util/json.c
index 1772177a8abd..c606e1cdf770 100644
--- a/util/json.c
+++ b/util/json.c
@@ -652,10 +652,12 @@ static struct json_object *util_dax_badblocks_to_json(struct ndctl_dax *dax,
static struct json_object *util_raw_uuid(struct ndctl_namespace *ndns)
{
- uuid_t raw_uuid;
char buf[40];
+ uuid_t raw_uuid;
ndctl_namespace_get_uuid(ndns, raw_uuid);
+ if (uuid_is_null(raw_uuid))
+ return NULL;
uuid_unparse(raw_uuid, buf);
return json_object_new_string(buf);
}
@@ -734,9 +736,8 @@ struct json_object *util_namespace_to_json(struct ndctl_namespace *ndns,
json_object_object_add(jndns, "uuid", jobj);
jobj = util_raw_uuid(ndns);
- if (!jobj)
- goto err;
- json_object_object_add(jndns, "raw_uuid", jobj);
+ if (jobj)
+ json_object_object_add(jndns, "raw_uuid", jobj);
bdev = ndctl_btt_get_block_device(btt);
} else if (pfn) {
ndctl_pfn_get_uuid(pfn, uuid);
@@ -746,9 +747,8 @@ struct json_object *util_namespace_to_json(struct ndctl_namespace *ndns,
goto err;
json_object_object_add(jndns, "uuid", jobj);
jobj = util_raw_uuid(ndns);
- if (!jobj)
- goto err;
- json_object_object_add(jndns, "raw_uuid", jobj);
+ if (jobj)
+ json_object_object_add(jndns, "raw_uuid", jobj);
bdev = ndctl_pfn_get_block_device(pfn);
} else if (dax) {
struct daxctl_region *dax_region;
@@ -761,9 +761,8 @@ struct json_object *util_namespace_to_json(struct ndctl_namespace *ndns,
goto err;
json_object_object_add(jndns, "uuid", jobj);
jobj = util_raw_uuid(ndns);
- if (!jobj)
- goto err;
- json_object_object_add(jndns, "raw_uuid", jobj);
+ if (jobj)
+ json_object_object_add(jndns, "raw_uuid", jobj);
if ((flags & UTIL_JSON_DAX) && dax_region) {
jobj = util_daxctl_region_to_json(dax_region, NULL,
flags);
[ndctl PATCH] ndctl, test: fix sector-mode.sh to work with label support
by Vishal Verma
The nfit_test.1 grew support for the label DSMs. Fix sector-mode.sh to
emulate labelless mode by zeroing the labels at the start, and supplying
--no-autolabel to create-namespace commands.
Cc: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma(a)intel.com>
---
test/sector-mode.sh | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)
diff --git a/test/sector-mode.sh b/test/sector-mode.sh
index ee364eb..54b16a5 100755
--- a/test/sector-mode.sh
+++ b/test/sector-mode.sh
@@ -32,11 +32,7 @@ $NDCTL zero-labels $BUS all
$NDCTL enable-region $BUS all
$NDCTL disable-region $BUS1 all
-if $NDCTL zero-labels $BUS1 all; then
- echo "DIMMs on $BUS1 support labels, skip..."
- $NDCTL enable-region $BUS1 all
- false
-fi
+$NDCTL zero-labels $BUS1 all
$NDCTL enable-region $BUS1 all
rc=1
@@ -44,9 +40,9 @@ query=". | sort_by(.size) | reverse | .[0].dev"
NAMESPACE=$($NDCTL list $BUS1 -N | jq -r "$query")
REGION=$($NDCTL list -R --namespace=$NAMESPACE | jq -r ".dev")
echo 0 > /sys/bus/nd/devices/$REGION/read_only
-$NDCTL create-namespace -e $NAMESPACE -m sector -f -l 4K
-$NDCTL create-namespace -e $NAMESPACE -m dax -f -a 4K
-$NDCTL create-namespace -e $NAMESPACE -m sector -f -l 4K
+$NDCTL create-namespace --no-autolabel -e $NAMESPACE -m sector -f -l 4K
+$NDCTL create-namespace --no-autolabel -e $NAMESPACE -m dax -f -a 4K
+$NDCTL create-namespace --no-autolabel -e $NAMESPACE -m sector -f -l 4K
$NDCTL disable-region $BUS all
$NDCTL disable-region $BUS1 all
--
2.17.0
[PATCH v3 0/9] Series short description
by Dan Williams
Changes since v2 [1]:
* Fix source address increment in mcsafe_handle_tail() (Mika)
* Extend the unit test to inject simulated write faults and validate
that data is properly transferred.
* Rename MCSAFE_DEBUG to MCSAFE_TEST
[1]: https://lists.01.org/pipermail/linux-nvdimm/2018-May/015583.html
---
Currently memcpy_mcsafe() is only deployed in the pmem driver when
reading through a /dev/pmemX block device. However, a filesystem in dax
mode mounted on a /dev/pmemX block device will bypass the block layer
and the driver for reads. The filesystem-dax (fsdax) read case uses
dax_direct_access() and copy_to_iter() to bypass the block layer.
The result of the bypass is that the kernel treats machine checks during
read as system fatal (reboot) when they could simply be flagged as an
I/O error, similar to performing reads through the pmem driver. Prevent
this fatal condition by deploying memcpy_mcsafe() in the fsdax read
path.
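As a rough sketch of what that deployment could look like on the pmem side, via the new ->copy_to_iter dax operation named in the patch list (the exact signatures here are an assumption, not copied verbatim from the patches):
/* drivers/nvdimm/pmem.c -- sketch only, not the literal patch */
static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
		void *addr, size_t bytes, struct iov_iter *i)
{
	/* returns the number of bytes actually copied; a short copy lets
	 * the fsdax read path report an I/O error instead of a fatal
	 * machine check */
	return copy_to_iter_mcsafe(addr, bytes, i);
}
static const struct dax_operations pmem_dax_ops = {
	.direct_access	= pmem_dax_direct_access,
	/* .copy_from_iter and other ops omitted for brevity */
	.copy_to_iter	= pmem_copy_to_iter,
};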
The main differences between this copy_to_user_mcsafe() and
copy_user_generic_unrolled() are:
* Typical tail/residue handling after a fault retries the copy
byte-by-byte until the fault happens again. Re-triggering machine
checks is potentially fatal so the implementation uses source alignment
and poison alignment assumptions to avoid re-triggering machine
checks.
* SMAP coordination is handled external to the assembly with
__uaccess_begin() and __uaccess_end().
* ITER_KVEC and ITER_BVEC can now end prematurely with an error.
The new MCSAFE_TEST facility is proposed as a way to unit test the
exception handling without requiring an ACPI EINJ capable platform.
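For reference, the SMAP coordination point above boils down to a thin wrapper along these lines (a sketch under the assumption that memcpy_mcsafe() now returns bytes remaining; not a verbatim copy of the patch):
/* arch/x86/include/asm/uaccess_64.h -- sketch only */
static __always_inline __must_check unsigned long
copy_to_user_mcsafe(void *to, const void *from, unsigned len)
{
	unsigned long ret;

	__uaccess_begin();			/* STAC: open the SMAP window */
	ret = memcpy_mcsafe(to, from, len);	/* returns bytes left uncopied */
	__uaccess_end();			/* CLAC: close it again */
	return ret;
}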
---
Dan Williams (9):
x86, memcpy_mcsafe: remove loop unrolling
x86, memcpy_mcsafe: add labels for write fault handling
x86, memcpy_mcsafe: return bytes remaining
x86, memcpy_mcsafe: add write-protection-fault handling
x86, memcpy_mcsafe: define copy_to_iter_mcsafe()
dax: introduce a ->copy_to_iter dax operation
dax: report bytes remaining in dax_iomap_actor()
pmem: switch to copy_to_iter_mcsafe()
x86, nfit_test: unit test for memcpy_mcsafe()
arch/x86/Kconfig | 1
arch/x86/Kconfig.debug | 3 +
arch/x86/include/asm/mcsafe_test.h | 75 ++++++++++++++++++++++++
arch/x86/include/asm/string_64.h | 10 ++-
arch/x86/include/asm/uaccess_64.h | 14 +++++
arch/x86/lib/memcpy_64.S | 112 +++++++++++++++++-------------------
arch/x86/lib/usercopy_64.c | 21 +++++++
drivers/dax/super.c | 10 +++
drivers/md/dm-linear.c | 16 +++++
drivers/md/dm-log-writes.c | 15 +++++
drivers/md/dm-stripe.c | 21 +++++++
drivers/md/dm.c | 25 ++++++++
drivers/nvdimm/claim.c | 3 +
drivers/nvdimm/pmem.c | 13 +++-
drivers/s390/block/dcssblk.c | 7 ++
fs/dax.c | 21 ++++---
include/linux/dax.h | 5 ++
include/linux/device-mapper.h | 5 +-
include/linux/string.h | 4 +
include/linux/uio.h | 15 +++++
lib/iov_iter.c | 61 ++++++++++++++++++++
tools/testing/nvdimm/test/nfit.c | 104 +++++++++++++++++++++++++++++++++
22 files changed, 482 insertions(+), 79 deletions(-)
create mode 100644 arch/x86/include/asm/mcsafe_test.h
Re: Draft NVDIMM proposal
by Dan Williams
[ adding linux-nvdimm ]
Great write up! Some comments below...
On Wed, May 9, 2018 at 10:35 AM, George Dunlap <george.dunlap(a)citrix.com> wrote:
> Dan,
>
> I understand that you're the NVDIMM maintainer for Linux. I've been
> working with your colleagues to try to sort out an architecture to allow
> NVRAM to be passed to guests under the Xen hypervisor.
>
> If you have time, I'd appreciate it if you could skim through at least
> the first section of the document below ("NVDIMM Overview"), concerning
> NVDIMM devices and Linux, to see if I've made any mistakes.
>
> If you're up for it, additional early feedback on the proposed Xen
> architecture, from a Linux perspective, would be awesome as well.
>
> Thanks,
> -George
>
> On 05/09/2018 06:29 PM, George Dunlap wrote:
>> Below is an initial draft of an NVDIMM proposal. I'll submit a patch to
>> include it in the tree at some point, but I thought for initial
>> discussion it would be easier if it were copied in-line.
>>
>> I've done a fair amount of investigation, but it's quite likely I've
>> made mistakes. Please send me corrections where necessary.
>>
>> -George
>>
>> ---
>> % NVDIMMs and Xen
>> % George Dunlap
>> % Revision 0.1
>>
>> # NVDIMM overview
>>
>> It's very difficult, from the various specs, to actually get a
>> complete enough picture of what's going on to make a good design.
>> This section is meant as an overview of the current hardware,
>> firmware, and Linux interfaces sufficient to inform a discussion of
>> the issues in designing a Xen interface for NVDIMMs.
>>
>> ## DIMMs, Namespaces, and access methods
>>
>> An NVDIMM is a DIMM (_dual in-line memory module_ -- a physical form
>> factor) that contains _non-volatile RAM_ (NVRAM). Individual bytes of
>> memory on a DIMM are specified by a _DIMM physical address_ or DPA.
>> Each DIMM is attached to an NVDIMM controller.
>>
>> Memory on the DIMMs is divided up into _namespaces_. The word
>> "namespace" is rather misleading though; a namespace in this context
>> is not actually a space of names (contrast, for example "C++
>> namespaces"); rather, it's more like a SCSI LUN, or a volume, or a
>> partition on a drive: a set of data which is meant to be viewed and
>> accessed as a unit. (The name was apparently carried over from NVMe
>> devices, which were precursors of the NVDIMM spec.)
Unlike NVMe, an NVDIMM itself has no concept of namespaces. Some DIMMs
provide a "label area" which is an out-of-band non-volatile memory
area where the OS can store whatever it likes. The UEFI 2.7
specification defines a data format for the definition of namespaces
on top of persistent memory ranges advertised to the OS via the ACPI
NFIT structure.
There is no obligation for an NVDIMM to provide a label area, and as
far as I know all NVDIMMs on the market today do not provide a label
area. That said, QEMU has the ability to associate a virtual label
area with its virtual NVDIMM representation.
>> The NVDIMM controller allows two ways to access the DIMM. One is
>> mapped 1-1 in _system physical address space_ (SPA), much like normal
>> RAM. This method of access is called _PMEM_. The other method is
>> similar to that of a PCI device: you have a control and status
>> register which controls an 8k aperture window into the DIMM. This
>> method of access is called _PBLK_.
>>
>> In the case of PMEM, as in the case of DRAM, addresses from the SPA
>> are interleaved across a set of DIMMs (an _interleave set_) for
>> performance reasons. A specific PMEM namespace will be a single
>> contiguous DPA range across all DIMMs in its interleave set. For
>> example, you might have a namespace for DPAs `0-0x50000000` on DIMMs 0
>> and 1; and another namespace for DPAs `0x80000000-0xa0000000` on DIMMs
>> 0, 1, 2, and 3.
>>
>> In the case of PBLK, a namespace always resides on a single DIMM.
>> However, that namespace can be made up of multiple discontiguous
>> chunks of space on that DIMM. For instance, in our example above, we
>> might have a namespace on DIMM 0 consisting of DPAs
>> `0x50000000-0x60000000`, `0x80000000-0x90000000`, and
>> `0xa0000000-0xf0000000`.
>>
>> The interleaving of PMEM has implications for the speed and
>> reliability of the namespace: Much like RAID 0, it maximizes speed,
>> but it means that if any one DIMM fails, the data from the entire
>> namespace is corrupted. PBLK makes it slightly less straightforward
>> to access, but it allows OS software to apply RAID-like logic to
>> balance redundancy and speed.
>>
>> Furthermore, PMEM requires one byte of SPA for every byte of NVDIMM;
>> for large systems without 5-level paging, this is actually becoming a
>> limitation. Using PBLK allows existing 4-level paged systems to
>> access an arbitrary amount of NVDIMM.
>>
>> ## Namespaces, labels, and the label area
>>
>> A namespace is a mapping from the SPA and MMIO space into the DIMM.
>>
>> The firmware and/or operating system can talk to the NVDIMM controller
>> to set up mappings from SPA and MMIO space into the DIMM. Because the
>> memory and PCI devices are separate, it would be possible for buggy
>> firmware or NVDIMM controller drivers to misconfigure things such that
>> the same DPA is exposed in multiple places; if so, the results are
>> undefined.
>>
>> Namespaces are constructed out of "labels". Each DIMM has a Label
>> Storage Area, which is persistent but logically separate from the
>> device-addressable areas on the DIMM. A label on a DIMM describes a
>> single contiguous region of DPA on that DIMM. A PMEM namespace is
>> made up of one label from each of the DIMMs which make its interleave
>> set; a PBLK namespace is made up of one label for each chunk of range.
>>
>> In our examples above, the first PMEM namespace would be made of two
>> labels (one on DIMM 0 and one on DIMM 1, each describing DPA
>> `0-0x50000000`), and the second namespace would be made of four labels
>> (one on DIMM 0, one on DIMM 1, and so on). Similarly, in the PBLK
>> example, the namespace would consist of three labels; one describing
>> `0x50000000-0x60000000`, one describing `0x80000000-0x90000000`, and
>> so on.
>>
>> The namespace definition includes not only information about the DPAs
>> which make up the namespace and how they fit together; it also
>> includes a UUID for the namespace (to allow it to be identified
>> uniquely), a 64-character "name" field for a human-friendly
>> description, and Type and Address Abstraction GUIDs to inform the
>> operating system how the data inside the namespace should be
>> interpreted. Additionally, it can have an `ROLABEL` flag, which
>> indicates to the OS that "device drivers and manageability software
>> should refuse to make changes to the namespace labels", because
>> "attempting to make configuration changes that affect the namespace
>> labels will fail (i.e. because the VM guest is not in a position to
>> make the change correctly)".
>>
>> See the [UEFI Specification][uefi-spec], section 13.19, "NVDIMM Label
>> Protocol", for more information.
>>
>> [uefi-spec]:
>> http://www.uefi.org/sites/default/files/resources/UEFI%20Spec%202_7_A%20S...
>>
>> ## NVDIMMs and ACPI
>>
>> The [ACPI Specification][acpi-spec] breaks down information in two ways.
>>
>> The first is about physical devices (see section 9.20, "NVDIMM
>> Devices"). The NVDIMM controller is called the _NVDIMM Root Device_.
>> There will generally be only a single NVDIMM root device on a system.
>>
>> Individual NVDIMMs are referred to by the spec as _NVDIMM Devices_.
>> Each separate DIMM will have its own device listed as being under the
>> Root Device. Each DIMM will have an _NVDIMM Device Handle_ which
>> describes the physical DIMM (its location within the memory channel,
>> the channel number within the memory controller, the memory controller
>> ID within the socket, and so on).
>>
>> The second is about the data on those devices, and how the operating
>> system can access it. This information is exposed in the NFIT table
>> (see section 5.2.25).
>>
>> Because namespace labels allow NVDIMMs to be partitioned in fairly
>> arbitrary ways, exposing information about how the operating system
>> can access it is a bit complicated. It consists of several tables,
>> whose information must be correlated to make sense out of it.
>>
>> These tables include:
>> 1. A table of DPA ranges on individual NVDIMM devices
>> 2. A table of SPA ranges where PMEM regions are mapped, along with
>> interleave sets
>> 3. Tables for control and data addresses for PBLK regions
>>
>> NVRAM on a given NVDIMM device will be broken down into one or more
>> _regions_. These regions are enumerated in the NVDIMM Region Mapping
>> Structure. Each entry in this table contains the NVDIMM Device Handle
>> for the device the region is in, as well as the DPA range for the
>> region (called "NVDIMM Physical Address Base" and "NVDIMM Region Size"
>> in the spec). Regions which are part of a PMEM namespace will have
>> references into SPA tables and interleave set tables; regions which
>> are part of PBLK namespaces will have references into control region
>> and block data window region structures.
>>
>> [acpi-spec]:
>> http://www.uefi.org/sites/default/files/resources/ACPI%206_2_A_Sept29.pdf
>>
>> ## Namespaces and the OS
>>
>> At boot time, the firmware will read the label regions from the NVDIMM
>> device and set up the memory controllers appropriately. It will then
>> construct a table describing the resulting regions, called the NFIT
>> table, and expose that table via ACPI.
Labels are not involved in the creation of the NFIT. The NFIT only
defines PMEM ranges and interleave sets, the rest is left to the OS.
>> To use a namespace, an operating system needs at a minimum two pieces
>> of information: The UUID and/or Name of the namespace, and the SPA
>> range where that namespace is mapped; and ideally also the Type and
>> Abstraction Type to know how to interpret the data inside.
Not necessarily, no. Linux supports "label-less" mode where it exposes
the raw capacity of a region in a 1:1 mapped namespace without a label.
This is how Linux supports "legacy" NVDIMMs that do not support
labels.
>> Unfortunately, the information needed to understand namespaces is
>> somewhat disjoint. The namespace labels themselves contain the UUID,
>> Name, Type, and Abstraction Type, but don't contain any information
>> about SPA or block control / status registers and windows. The NFIT
>> table contains a list of SPA Range Structures, which list the
>> NVDIMM-related SPA ranges and their Type GUID; as well as a table
>> containing individual DPA ranges, which specifies which SPAs they
>> correspond to. But the NFIT does not contain the UUID or other
>> identifying information from the Namespace labels. In order to
>> actually discover that namespace with UUID _X_ is mapped at SPA _Y-Z_,
>> an operating system must:
>>
>> 1. Read the label areas of all NVDIMMs and discover the DPA range and
>> Interleave Set for namespace _X_
>> 2. Read the Region Mapping Structures from the NFIT table, and find
>> out which structures match the DPA ranges for namespace _X_
>> 3. Find the System Physical Address Range Structure Index associated
>> with the Region Mapping
>> 4. Look up the SPA Range Structure in the NFIT table using the SPA
>> Range Structure Index
>> 5. Read the SPA range _Y-Z_
I'm not sure I'm grokking your distinction between 2, 3, 4? In any
event we do the DIMM to SPA association first before reading labels.
The OS calculates a so called "Interleave Set Cookie" from the NFIT
information to compare against a similar value stored in the labels.
This lets the OS determine that the Interleave Set composition has not
changed from when the labels were initially written. An Interleave Set
Cookie mismatch indicates the labels are stale, corrupted, or that the
physical composition of the Interleave Set has changed.
>> An OS driver can modify the namespaces by modifying the Label Storage
>> Areas of the corresponding DIMMs. The NFIT table describes how the OS
>> can access the Label Storage Areas. Label Storage Areas may be
>> "isolated", in which case the area would be accessed via
>> device-specific AML methods (DSM), or they may be exposed directly
>> using a well-known location. AML methods to access the label areas
>> are "dumb": they are essentially a memcpy() which copies into or out
>> of a given {DIMM, Label Area Offset} address. No checking for
>> validity of reads and writes is done, and simply modifying the labels
>> does not change the mapping immediately -- this must be done either by
>> the OS driver reprogramming the NVDIMM memory controller, or by
>> rebooting and allowing the firmware to do it.
There are checksums in the Namespace definition to account for label
validity. Starting with ACPI 6.2, DSMs for labels are deprecated in
favor of the new named methods for label access: _LSI, _LSR, and
_LSW.
>> Modifying labels is tricky, due to an issue that will be somewhat of a
>> recurring theme when discussing NVDIMMs: The necessity of assuming
>> that, at any given point in time, power may be suddenly cut, and the
>> system needing to be able to recover sensible data in such a
>> circumstance. The [UEFI Specification][uefi-spec] chapter on the
>> NVDIMM label protocol specifies how the label area is to be modified
>> such that a consistent "view" is always available; and how firmware
>> and the operating system should respond consistently to labels which
>> appear corrupt.
Not that tricky :-).
>> ## NVDIMMs and filesystems
>>
>> Along the same line, most filesystems are written with the assumption
>> that a given write to a block device will either finish completely, or
>> be entirely reverted. Since access to NVDIMMs (even in PBLK mode) are
>> essentially `memcpy`s, writes may well be interrupted halfway through,
>> resulting in _sector tearing_. In order to help with this, the UEFI
>> spec defines a method of reading and writing NVRAM which is capable of
>> emulating sector-atomic write semantics via a _block translation
>> layer_ (BTT) ([UEFI spec][uefi-spec], chapter 6, "Block Translation
>> Table (BTT) Layout"). Namespaces accessed via this discipline will
>> have a _BTT info block_ at the beginning of the namespace (similar to
>> a superblock on a traditional hard disk). Additionally, the
>> AddressAbstraction GUID in the namespace label(s) should be set to
>> `EFI_BTT_ABSTRACTION_GUID`.
>>
>> ## Linux
>>
>> Linux has a _direct access_ (DAX) filesystem mount mode for block
>> devices which are "memory-like" ^[kernel-dax]. If both the filesystem
>> and the underlying device support DAX, and the `dax` mount option is
>> enabled, then when a file on that filesystem is `mmap`ed, the page
>> cache is bypassed and the underlying storage is mapped directly into
>> the user process. (?)
>>
>> [kernel-dax]: https://www.kernel.org/doc/Documentation/filesystems/dax.txt
>>
>> Linux has a tool called `ndctl` to manage NVDIMM namespaces. From the
>> documentation it looks fairly well abstracted: you don't typically
>> specify individual DPAs when creating PBLK or PMEM regions: you
>> specify the type you want and the size and it works out the layout
>> details (?).
>>
>> The `ndctl` tool allows you to make PMEM namespaces in one of four
>> modes: `raw`, `sector`, `fsdax` (or `memory`), and `devdax` (or,
>> confusingly, `dax`).
Yes, apologies on the confusion. Going forward from ndctl-v60 we have
deprecated 'memory' and 'dax'.
>> The `raw`, `sector`, and `fsdax` modes all result in a block device in
>> the pattern of `/dev/pmemN[.M]`, in which a filesystem can be stored.
>> `devdax` results in a character device in the pattern of
>> `/dev/daxN[.M]`.
>>
>> It's not clear from the documentation exactly what `raw` mode is or
>> when it would be safe to use it.
We'll add some documentation to the man page, but 'raw' mode is
effectively just a ramdisk. No dax support.
>>
>> `sector` mode implements `BTT`; it is thus safe against sector
>> tearing, but does not support mapping files in DAX mode. The
>> namespace can be either PMEM or PBLK (?). As described above, the
>> first block of the namespace will be a BTT info block.
The info block is not exposed in the user-accessible data space as
this comment seems to imply. It's similar to a partition table: it's
on-media metadata that specifies an encapsulation.
>> `fsdax` and `devdax` mode are both designed to make it possible for
>> user processes to have direct mapping of NVRAM. As such, both are
>> only suitable for PMEM namespaces (?). Both also need to have kernel
>> page structures allocated for each page of NVRAM; this amounts to 64
>> bytes for every 4k of NVRAM. Memory for these page structures can
>> either be allocated out of normal "system" memory, or inside the PMEM
>> namespace itself.
>>
>> In both cases, an "info block", very similar to the BTT info block, is
>> written to the beginning of the namespace when created. This info
>> block specifies whether the page structures come from system memory or
>> from the namespace itself. If from the namespace itself, it contains
>> information about what parts of the namespace have been set aside for
>> Linux to use for this purpose.
>>
>> Linux has also defined "Type GUIDs" for these two types of namespace
>> to be stored in the namespace label, although these are not yet in the
>> ACPI spec.
They never will be. One of the motivations for GUIDs is that an OS can
define private ones without needing to go back and standardize them.
Only GUIDs that are needed for inter-OS / pre-OS compatibility would
need to be defined in ACPI, and there is no expectation that other
OSes understand Linux's format for reserving page structure space.
>> Documentation seems to indicate that both `pmem` and `dax` devices can
>> be further subdivided (by mentioning `/dev/pmemN.M` and
>> `/dev/daxN.M`), but don't mention specifically how. `pmem` devices,
>> being block devices, can presumably be partitioned like a block
>> device can. `dax` devices may have something similar, or may have
>> their own subdivision mechanism. The rest of this document will
>> assume that this is the case.
You can create multiple namespaces in a given region. Subsequent
namespaces after the first get the .1, .2, .3 etc suffix.
>>
>> # Xen considerations
>>
>> ## RAM and MMIO in Xen
>>
>> Xen generally has two types of things that can go into a pagetable or
>> p2m. The first is RAM or "system memory". RAM has a page struct,
>> which allows it to be accounted for on a page-by-page basis: Assigned
>> to a specific domain, reference counted, and so on.
>>
>> The second is MMIO. MMIO areas do not have page structures, and thus
>> cannot be accounted on a page-by-page basis. Xen knows about PCI
>> devices and the associated MMIO ranges, and makes sure that PV
>> pagetables or HVM p2m tables only contain MMIO mappings for devices
>> which have been assigned to a guest.
>>
>> ## Page structures
>>
>> To begin with, Xen, like Linux, needs page structs for NVDIMM
>> memory. Without page structs, we don't have reference counts; which
>> means there's no safe way, for instance, for a guest to ask a PV
>> device to write into NVRAM owned by a guest; and no real way to be
>> confident that the same memory hadn't been mapped multiple times.
>>
>> Page structures in Xen are 32 bytes for non-BIGMEM systems (<5 TiB),
>> and 40 bytes for BIGMEM systems.
>>
>> ### Page structure allocation
>>
>> There are three potential places we could store page structs:
>>
>> 1. **System memory** Allocated from the host RAM
>>
>> 2. **Inside the namespace** Like Linux, there could be memory set
>> aside inside the namespace specifically for mapping that
>> namespace. This could be 2a) As a user-visible separate partition,
>> or 2b) allocated by `ndctl` from the namespace "superblock". As
>> the page frame areas of the namespace can be discontiguous (?), it
>> would be possible to enable or disable this extra space on an
>> existing namespace, to allow users with existing vNVDIMM images to
>> switch to or from Xen.
I think a Xen mode namespace makes sense. If I understand correctly it
would also need to house the struct page array at the same time in
case dom0 needs to do a get_user_pages() operation when assigning pmem
to a guest?
The other consideration is how to sub-divide that namespace for
handing it out to guests. We are currently working through problems
with virtualization and device-assignment when the guest is given
memory for a dax mapped file on a filesystem in dax mode. Given that
the filesystem can do physical layout rearrangement at will, it means
that it is not suitable to give to a guest. For now we require a devdax
mode namespace for mapping pmem to a guest so that we do not collide
with filesystem block map mutations.
I assume this "xen-mode" namespace would be something like devdax + mfn array.
>>
>> 3. **A different namespace** NVRAM could be set aside for use by
>> arbitrary namespaces. This could be a 3a) specially-selected
>> partition from a normal namespace, or it could be 3b) a namespace
>> specifically designed to be used for Xen (perhaps with its own Type
>> GUID).
>>
>> 2b has the advantage that we should be able to unilaterally allocate a
>> Type GUID and start using it for that purpose. It also has the
>> advantage that it should be somewhat easier for someone with existing
>> vNVDIMM images to switch into (or away from) using Xen. It has the
>> disadvantage of being less transparent to the user.
>>
>> 3b has the advantage of being invisible to the user once being set up.
>> It has the slight disadvantage of having more gatekeepers to get
>> through; and if those gatekeepers aren't happy with enabling or
>> disabling extra frametable space for Xen after creation (or if I've
>> misunderstood and such functionality isn't straightforward to
>> implement) then it will be difficult for people with existing images
>> to switch to Xen.
>>
>> ### Dealing with changing frame tables
>>
>> Another potential issue to consider is the monolithic nature of the
>> current frame table. At the moment, to find a page struct given an
>> mfn, you use the mfn as an index into a single large array.
>>
>> I think we can assume that NVDIMM SPA ranges will be separate from
>> normal system RAM. There's no reason the frame table couldn't be
>> "sparse": i.e., only the sections of it that actually contain valid
>> pages need to have ram backing them.
>>
>> However, if we pursue a solution like Linux, where each namespace
>> contains memory set aside to use for its own pagetables, we may have a
>> situation where boundary between two namespaces falls in the middle of
>> a frame table page; in that case, from where should such a frame table
>> page be allocated?
It's already the case that the minimum alignment for multiple
namespaces is 128MB given the current "section size" assumptions of
the core mm. Can we make a similar alignment restriction for Xen to
eliminate this problem?
>> A simple answer would be to use system RAM to "cover the gap": There
>> would only ever need to be a single page per boundary.
>>
>> ## Page tracking for domain 0
>>
>> When domain 0 adds or removes entries from its pagetables, it does not
>> explicitly store the memory type (i.e., whether RAM or MMIO); Xen
>> infers this from its knowledge of where RAM is and is not. Below we
>> will explore design choices that involve domain 0 telling Xen about
>> NVDIMM namespaces, SPAs, and what it can use for page structures. In
>> such a scenario, NVRAM pages essentially transition from being MMIO
>> (before Xen knows about them) to being RAM (after Xen knows about
>> them), which in turn has implications for any mappings which domain 0
>> has in its pagetables.
>>
>> ## PVH and QEMU
>>
>> A number of solutions have suggested using QEMU to provide emulated
>> NVDIMM support to guests. This is a workable solution for HVM guests,
>> but for PVH guests we would like to avoid introducing a device model
>> if at all possible.
>>
>> ## FS DAX and DMA in Linux
>>
>> There is [an issue][linux-fs-dax-dma-issue] with DAX and filesystems,
>> in that filesystems (even those claiming to support DAX) may want to
>> rearrange the block<->file mapping "under the feet" of running
>> processes with mapped files. Unfortunately, this is more tricky with
>> DAX than with a page cache, and as of [early 2018][linux-fs-dax-dma-2]
>> was essentially incompatible with virtualization. ("I think we need to
>> enforce this in the host kernel. I.e. do not allow file backed DAX
>> pages to be mapped in EPT entries unless / until we have a solution to
>> the DMA synchronization problem.")
>>
>> More needs to be discussed and investigated here; but for the time
>> being, mapping a file in a DAX filesystem into a guest's p2m is
>> probably not going to be possible.
Ah, you have the fsdax issue captured here, great.
>>
>> [linux-fs-dax-dma-issue]:
>> https://lists.01.org/pipermail/linux-nvdimm/2017-December/013704.html
>> [linux-fs-dax-dma-2]:
>> https://lists.nongnu.org/archive/html/qemu-devel/2018-01/msg07347.html
>>
>> # Target functionality
>>
>> The above sets the stage, but to actually determine on an architecture
>> we have to decide what kind of final functionality we're looking for.
>> The functionality falls into two broad areas: Functionality from the
>> host administrator's point of view (accessed from domain 0), and
>> functionality from the guest administrator's point of view.
>>
>> ## Domain 0 functionality
>>
>> For the purposes of this section, I shall be distinguishing between
>> "native Linux" functionality and "domain 0" functionality. By "native
>> Linux" functionality I mean functionality which is available when
>> Linux is running on bare metal -- `ndctl`, `/dev/pmem`, `/dev/dax`,
>> and so on. By "dom0 functionality" I mean functionality which is
>> available in domain 0 when Linux is running under Xen.
>>
>> 1. **Disjoint functionality** Have dom0 and native Linux
>> functionality completely separate: namespaces created when booted
>> on native Linux would not be accessible when booted under domain 0,
>> and vice versa. Some Xen-specific tool similar to `ndctl` would
>> need to be developed for accessing functionality.
I'm open to teaching ndctl about Xen needs if that helps.
>>
>> 2. **Shared data but no dom0 functionality** Another option would be
>> to have Xen and Linux have shared access to the same namespaces,
>> but dom0 essentially have no direct access to the NVDIMM. Xen
>> would read the NFIT, parse namespaces, and expose those namespaces
>> to dom0 like any other guest; but dom0 would not be able to create
>> or modify namespaces. To manage namespaces, an administrator would
>> need to boot into native Linux, modify the namespaces, and then
>> reboot into Xen again.
Ugh.
>>
>> 3. **Dom0 fully functional, Manual Xen frame table** Another level of
>> functionality would be to make it possible for dom0 to have full
>> parity with native Linux in terms of using `ndctl` to manage
>> namespaces, but to require the host administrator to manually set
>> aside NVRAM for Xen to use for frame tables.
>>
>> 4. **Dom0 fully functional, automatic Xen frame table** This is like
>> the above, but with the Xen frame table space automatically
>> managed, similar to Linux's: You'd simply specify that you wanted
>> the Xen frametable somehow when you create the namespace, and from
>> then on forget about it.
>>
>> Number 1 should be avoided if at all possible, in my opinion.
>>
>> Given that the NFIT table doesn't currently have namespace UUIDs or
>> other key pieces of information to fully understand the namespaces, it
>> seems like #2 would likely not be able to be made functional enough.
>>
>> Number 3 should be achievable under our control. Obviously #4 would
>> be ideal, but might depend on getting cooperation from the Linux
>> NVDIMM maintainers to be able to set aside Xen frame table memory in
>> addition to Linux frame table memory.
"xen-mode" namespace?
>>
>> ## Guest functionality
>>
>> 1. **No remapping** The guest can take the PMEM device as-is. It's
>> mapped by the toolstack at a specific place in _guest physical
>> address_ (GPA) space and cannot be moved. There is no controller
>> emulation (which would allow remapping) and minimal label area
>> functionality.
>>
>> 2. **Full controller access for PMEM**. The guest has full
>> controller access for PMEM: it can carve up namespaces, change
>> mappings in GPA space, and so on.
In its own virtual label area, right?
>> 3. **Full controller access for both PMEM and PBLK**. A guest has
>> full controller access, and can carve up its NVRAM into arbitrary
>> PMEM or PBLK regions, as it wants.
I'd forget about giving PBLK to guests, just use standard virtio or
equivalent to route requests to the dom0 driver. Unless the PBLK
control registers are mapped on 4K boundaries there's no safe way to
give individual guests their own direct PBLK access.
>>
>> Numbers 2 and 3 would of course be nice-to-have, but would almost
>> certainly involve having a QEMU qprocess to emulate them. Since we'd
>> like to have PVH use NVDIMMs, we should at least make #1 an option.
>>
>> # Proposed design / roadmap
>>
>> Initially, dom0 accesses the NVRAM as normal, using static ACPI tables
>> and the DSM methods; mappings are treated by Xen during this phase as
>> MMIO.
>>
>> Once dom0 is ready to pass parts of a namespace through to a guest, it
>> makes a hypercall to tell Xen about the namespace. It includes any
>> regions of the namespace which Xen may use for 'scratch'; it also
>> includes a flag to indicate whether this 'scratch' space may be used
>> for frame tables from other namespaces.
>>
>> Frame tables are then created for this SPA range. They will be
>> allocated from, in this order: 1) designated 'scratch' range from
>> within this namespace 2) designated 'scratch' range from other
>> namespaces which has been marked as sharable 3) system RAM.
>>
>> Xen will either verify that dom0 has no existing mappings, or promote
>> the mappings to full pages (taking appropriate reference counts for
>> mappings). Dom0 must ensure that this namespace is not unmapped,
>> modified, or relocated until it asks Xen to unmap it.
>>
>> For Xen frame tables, to begin with, set aside a partition inside a
>> namespace to be used by Xen. Pass this in to Xen when activating the
>> namespace; this could be either 2a or 3a from "Page structure
>> allocation". After that, we could decide which of the two more
>> streamlined approaches (2b or 3b) to pursue.
>>
>> At this point, dom0 can pass parts of the mapped namespace into
>> guests. Unfortunately, passing files on a fsdax filesystem is
>> probably not safe; but we can pass in full dev-dax or fsdax
>> partitions.
>>
>> From a guest perspective, I propose we provide static NFIT only, no
>> access to labels to begin with. This can be generated in hvmloader
>> and/or the toolstack acpi code.
I'm ignorant of Xen internals, but can you not reuse the existing QEMU
emulation for labels and NFIT?
Thanks for this thorough write up, it's always nice to see the tradeoffs.
[PATCH v4 00/14] Copy Offload in NVMe Fabrics with P2P PCI Memory
by Logan Gunthorpe
Hi Everyone,
Here's v4 of our series to introduce P2P based copy offload to NVMe
fabrics. This version has been rebased onto v4.17-rc2. A git repo
is here:
https://github.com/sbates130272/linux-p2pmem pci-p2p-v4
Thanks,
Logan
Changes in v4:
* Change the original upstream_bridges_match() function to
upstream_bridge_distance() which calculates the distance between two
devices as long as they are behind the same root port. This should
address Bjorn's concerns that the code was too focused on
being behind a single switch.
* The disable ACS function now disables ACS for all bridge ports instead
of switch ports (ie. those that had two upstream_bridge ports).
* Change the pci_p2pmem_alloc_sgl() and pci_p2pmem_free_sgl()
API to be more like sgl_alloc() in that the alloc function returns
the allocated scatterlist and nents is not required by the free
function.
* Moved the new documentation into the driver-api tree as requested
by Jonathan
* Add SGL alloc and free helpers in the nvmet code so that the
individual drivers can share the code that allocates P2P memory.
As requested by Christoph.
* Cleanup the nvmet_p2pmem_store() function as Christoph
thought my first attempt was ugly.
* Numerous commit message and comment fix-ups
Changes in v3:
* Many more fixes and minor cleanups that were spotted by Bjorn
* Additional explanation of the ACS change in both the commit message
and Kconfig doc. Also, the code that disables the ACS bits is surrounded
explicitly by an #ifdef
* Removed the flag we added to rdma_rw_ctx() in favour of using
is_pci_p2pdma_page(), as suggested by Sagi.
* Adjust pci_p2pmem_find() so that it prefers P2P providers that
are closest to (or the same as) the clients using them. In cases
of ties, the provider is randomly chosen.
* Modify the NVMe Target code so that the PCI device name of the provider
may be explicitly specified, bypassing the logic in pci_p2pmem_find().
(Note: it's still enforced that the provider must be behind the
same switch as the clients).
* As requested by Bjorn, added documentation for driver writers.
Changes in v2:
* Renamed everything to 'p2pdma' per the suggestion from Bjorn as well
as a bunch of cleanup and spelling fixes he pointed out in the last
series.
* To address Alex's ACS concerns, we change to a simpler method of
just disabling ACS behind switches for any kernel that has
CONFIG_PCI_P2PDMA.
* We also reject using devices that employ 'dma_virt_ops' which should
fairly simply handle Jason's concerns that this work might break with
the HFI, QIB and rxe drivers that use the virtual ops to implement
their own special DMA operations.
--
This is a continuation of our work to enable using Peer-to-Peer PCI
memory in the kernel with initial support for the NVMe fabrics target
subsystem. Many thanks go to Christoph Hellwig who provided valuable
feedback to get these patches to where they are today.
The concept here is to use memory that's exposed on a PCI BAR as
data buffers in the NVMe target code such that data can be transferred
from an RDMA NIC to the special memory and then directly to an NVMe
device avoiding system memory entirely. The upside of this is better
QoS for applications running on the CPU utilizing memory and lower
PCI bandwidth required to the CPU (such that systems could be designed
with fewer lanes connected to the CPU).
Due to these trade-offs we've designed the system to only enable using
the PCI memory in cases where the NIC, NVMe devices and memory are all
behind the same PCI switch hierarchy. This will mean many setups that
could likely work well will not be supported so that we can be more
confident it will work and not place any responsibility on the user to
understand their topology. (We chose to go this route based on feedback
we received at the last LSF). Future work may enable these transfers
using a white list of known good root complexes. However, at this time,
there is no reliable way to ensure that Peer-to-Peer transactions are
permitted between PCI Root Ports.
In order to enable this functionality, we introduce a few new PCI
functions such that a driver can register P2P memory with the system.
Struct pages are created for this memory using devm_memremap_pages()
and the PCI bus offset is stored in the corresponding pagemap structure.
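For a provider driver, that registration might look roughly like the following sketch; the function names and signatures here are an assumption based on the p2pdma interface as it eventually landed upstream and may differ from this v4:
/* p2pmem provider sketch; names/signatures assumed, see note above */
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>

static int p2pmem_example_probe(struct pci_dev *pdev,
		const struct pci_device_id *id)
{
	int rc;

	/* create ZONE_DEVICE pages for BAR 4 and record the bus offset */
	rc = pci_p2pdma_add_resource(pdev, 4, pci_resource_len(pdev, 4), 0);
	if (rc)
		return rc;

	/* make the memory visible to pci_p2pmem_find() users */
	pci_p2pmem_publish(pdev, true);
	return 0;
}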
When the PCI P2PDMA config option is selected the ACS bits in every
bridge port in the system are turned off to allow traffic to
pass freely behind the root port. At this time, the bit must be disabled
at boot so the IOMMU subsystem can correctly create the groups, though
this could be addressed in the future. There is no way to dynamically
disable the bit and alter the groups.
Another set of functions allow a client driver to create a list of
client devices that will be used in a given P2P transaction and then
use that list to find any P2P memory that is supported by all the
client devices.
In the block layer, we also introduce a P2P request flag to indicate a
given request targets P2P memory as well as a flag for a request queue
to indicate a given queue supports targeting P2P memory. P2P requests
will only be accepted by queues that support it. Also, P2P requests
are marked to not be merged, seeing as a non-homogeneous request would
complicate the DMA mapping requirements.
In the PCI NVMe driver, we modify the existing CMB support to utilize
the new PCI P2P memory infrastructure and also add support for P2P
memory in its request queue. When a P2P request is received it uses the
pci_p2pmem_map_sg() function which applies the necessary transformation
to get the correct pci_bus_addr_t for the DMA transactions.
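On the client side, the flow described above amounts to something like this sketch; only the function names come from this cover letter, the exact signatures are guesses and the real v4 API may differ:
/* p2pmem client sketch; signatures are guesses, see note above */
static int p2p_setup_example(struct list_head *clients, size_t length)
{
	struct pci_dev *p2p_dev;
	struct scatterlist *sgl;

	/* find a provider usable by every device in the client list */
	p2p_dev = pci_p2pmem_find(clients);
	if (!p2p_dev)
		return -ENODEV;

	/* v4: the alloc returns the scatterlist directly, like sgl_alloc() */
	sgl = pci_p2pmem_alloc_sgl(p2p_dev, length);
	if (!sgl)
		return -ENOMEM;

	/* rewrite the addresses with the PCI bus offset before issuing DMA */
	pci_p2pmem_map_sg(p2p_dev, sgl);

	/* ... issue the transfer here ... */

	pci_p2pmem_free_sgl(p2p_dev, sgl);
	return 0;
}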
In the RDMA core, we also adjust rdma_rw_ctx_init() and
rdma_rw_ctx_destroy() to take a flags argument which indicates whether
to use the PCI P2P mapping functions or not. To avoid odd RDMA devices
that don't use the proper DMA infrastructure this code rejects using
any device that employs the virt_dma_ops implementation.
Finally, in the NVMe fabrics target port we introduce a new
configuration boolean: 'allow_p2pmem'. When set, the port will attempt
to find P2P memory supported by the RDMA NIC and all namespaces. If
supported memory is found, it will be used in all IO transfers. And if
a port is using P2P memory, adding new namespaces that are not supported
by that memory will fail.
These patches have been tested on a number of Intel based systems and
for a variety of RDMA NICs (Mellanox, Broadcom, Chelsio) and NVMe
SSDs (Intel, Seagate, Samsung) and p2pdma devices (Eideticom,
Microsemi, Chelsio and Everspin) using switches from both Microsemi
and Broadcom.
Logan Gunthorpe (14):
PCI/P2PDMA: Support peer-to-peer memory
PCI/P2PDMA: Add sysfs group to display p2pmem stats
PCI/P2PDMA: Add PCI p2pmem dma mappings to adjust the bus offset
PCI/P2PDMA: Clear ACS P2P flags for all devices behind switches
docs-rst: Add a new directory for PCI documentation
PCI/P2PDMA: Add P2P DMA driver writer's documentation
block: Introduce PCI P2P flags for request and request queue
IB/core: Ensure we map P2P memory correctly in
rdma_rw_ctx_[init|destroy]()
nvme-pci: Use PCI p2pmem subsystem to manage the CMB
nvme-pci: Add support for P2P memory in requests
nvme-pci: Add a quirk for a pseudo CMB
nvmet: Introduce helper functions to allocate and free request SGLs
nvmet-rdma: Use new SGL alloc/free helper for requests
nvmet: Optionally use PCI P2P memory
Documentation/ABI/testing/sysfs-bus-pci | 25 +
Documentation/PCI/index.rst | 14 +
Documentation/driver-api/index.rst | 2 +-
Documentation/driver-api/pci/index.rst | 20 +
Documentation/driver-api/pci/p2pdma.rst | 166 ++++++
Documentation/driver-api/{ => pci}/pci.rst | 0
Documentation/index.rst | 3 +-
block/blk-core.c | 3 +
drivers/infiniband/core/rw.c | 13 +-
drivers/nvme/host/core.c | 4 +
drivers/nvme/host/nvme.h | 8 +
drivers/nvme/host/pci.c | 118 +++--
drivers/nvme/target/configfs.c | 67 +++
drivers/nvme/target/core.c | 143 ++++-
drivers/nvme/target/io-cmd.c | 3 +
drivers/nvme/target/nvmet.h | 15 +
drivers/nvme/target/rdma.c | 22 +-
drivers/pci/Kconfig | 26 +
drivers/pci/Makefile | 1 +
drivers/pci/p2pdma.c | 814 +++++++++++++++++++++++++++++
drivers/pci/pci.c | 6 +
include/linux/blk_types.h | 18 +-
include/linux/blkdev.h | 3 +
include/linux/memremap.h | 19 +
include/linux/pci-p2pdma.h | 118 +++++
include/linux/pci.h | 4 +
26 files changed, 1579 insertions(+), 56 deletions(-)
create mode 100644 Documentation/PCI/index.rst
create mode 100644 Documentation/driver-api/pci/index.rst
create mode 100644 Documentation/driver-api/pci/p2pdma.rst
rename Documentation/driver-api/{ => pci}/pci.rst (100%)
create mode 100644 drivers/pci/p2pdma.c
create mode 100644 include/linux/pci-p2pdma.h
--
2.11.0
[qemu PATCH v2 0/4] support NFIT platform capabilities
by Ross Zwisler
The first 2 patches in this series clean up some things I noticed while
coding.
Patch 3 adds support for the new Platform Capabilities Structure, which
was added to the NFIT in ACPI 6.2 Errata A. We add a machine command
line option "nvdimm-cap":
-machine pc,accel=kvm,nvdimm,nvdimm-cap=2
which allows the user to pass in a value for this structure. When such
a value is passed in we will generate the new NFIT subtable.
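For context, the subtable in question is the Platform Capabilities Structure (type 7) from ACPI 6.2 Errata A. Its layout is roughly the following; this is reconstructed from the spec rather than from the patch, so treat the field widths and bit names as an assumption:
/* NFIT Platform Capabilities Structure sketch, per ACPI 6.2 Errata A */
#include <stdint.h>

struct nfit_platform_caps {
    uint16_t type;            /* 7 */
    uint16_t length;          /* 16 bytes */
    uint8_t  highest_cap;     /* highest valid capability bit */
    uint8_t  reserved[3];
    uint32_t capabilities;    /* bit 0: CPU cache flush to NVDIMM on power loss
                               * bit 1: memory controller flush on power loss
                               * bit 2: persistent memory hardware mirroring */
    uint8_t  reserved2[4];
} __attribute__((packed));
With that layout, the "nvdimm-cap=2" example above would advertise bit 1, i.e. memory controller flush to NVDIMM durability on power loss.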
Patch 4 adds code to the "make check" self test infrastructure so that
we generate the new Platform Capabilities Structure, and adds it to the
expected NFIT output so that we test for it.
Thanks to Igor Mammedov for his feedback on v1 of this set.
Ross Zwisler (4):
nvdimm: fix typo in label-size definition
tests/.gitignore: add entry for generated file
nvdimm, acpi: support NFIT platform capabilities
ACPI testing: test NFIT platform capabilities
docs/nvdimm.txt | 18 ++++++++++++++
hw/acpi/nvdimm.c | 44 ++++++++++++++++++++++++++++++----
hw/i386/pc.c | 31 ++++++++++++++++++++++++
hw/mem/nvdimm.c | 2 +-
include/hw/i386/pc.h | 1 +
include/hw/mem/nvdimm.h | 7 +++++-
tests/.gitignore | 1 +
tests/acpi-test-data/pc/NFIT.dimmpxm | Bin 224 -> 240 bytes
tests/acpi-test-data/q35/NFIT.dimmpxm | Bin 224 -> 240 bytes
tests/bios-tables-test.c | 2 +-
10 files changed, 99 insertions(+), 7 deletions(-)
--
2.14.3
Re: [dm-devel] [PATCH] dm-writecache
by Christoph Hellwig
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ linux-2.6/drivers/md/dm-writecache.c 2018-03-08 14:23:31.059999000 +0100
> @@ -0,0 +1,2417 @@
> +#include <linux/device-mapper.h>
missing copyright statement, or, for the new-fashioned, an SPDX statement.
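For reference, the kind of header being asked for is just a couple of lines at the top of the file (placeholder holder/year; GPL-2.0 is the typical choice for a device-mapper target):
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) <year> <copyright holder>
 */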
> +#define WRITEBACK_FUA true
no business having this around.
> +#ifndef bio_set_dev
> +#define bio_set_dev(bio, dev) ((bio)->bi_bdev = (dev))
> +#endif
> +#ifndef timer_setup
> +#define timer_setup(t, c, f) setup_timer(t, c, (unsigned long)(t))
> +#endif
no business in mainline.
> +/*
> + * On X86, non-temporal stores are more efficient than cache flushing.
> + * On ARM64, cache flushing is more efficient.
> + */
> +#if defined(CONFIG_X86_64)
> +#define NT_STORE(dest, src) \
> +do { \
> + typeof(src) val = (src); \
> + memcpy_flushcache(&(dest), &val, sizeof(src)); \
> +} while (0)
> +#define COMMIT_FLUSHED() wmb()
> +#else
> +#define NT_STORE(dest, src) WRITE_ONCE(dest, src)
> +#define FLUSH_RANGE dax_flush
> +#define COMMIT_FLUSHED() do { } while (0)
> +#endif
Please use proper APIs for this; this has no business in a driver.
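One way to read "proper APIs" here, as a hedged sketch rather than the definitive fix: call the generic helpers unconditionally and let each architecture supply the efficient implementation, e.g.:
/* sketch: memcpy_flushcache() and wmb() are already portable, so the
 * per-arch NT_STORE()/COMMIT_FLUSHED() macros could collapse to this */
static inline void wc_nt_store_commit(void *pmem_dst, const void *src,
		size_t len)
{
	memcpy_flushcache(pmem_dst, src, len);	/* durable copy to pmem */
	wmb();					/* order it ahead of later stores */
}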
And that's it for now. This is clearly not submission ready, and I
should get back to my backlog of other things.
[ndctl PATCH] ndctl: refactor validation of the ars_status command
by Vishal Verma
The APIs that iterate over the information contained in an ars_status
command require a prior, successfully completed ars_status command
struct. We were neglecting to verify that the firmware status too
indicates a success. Refactor this checking to mimic validate_ars_cap(),
which checks both the command status as well as the firmware status.
Reported-by: Tomasz Rochumski <tomasz.rochumski(a)intel.com>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma(a)intel.com>
---
ndctl/lib/ars.c | 38 ++++++++++++++++++++++++++------------
1 file changed, 26 insertions(+), 12 deletions(-)
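For context (not part of the patch), a typical caller of these iteration APIs looks roughly like the sketch below, which is why the missing firmware-status check mattered:
/* usage sketch; assumes <stdio.h> and <ndctl/libndctl.h>, and an
 * ars_status command that has already been submitted */
static void dump_ars_records(struct ndctl_cmd *ars_stat)
{
	unsigned int i, count = ndctl_cmd_ars_num_records(ars_stat);

	/* with this refactor, count is 0 unless both the command status
	 * and the firmware status indicate success */
	for (i = 0; i < count; i++)
		printf("poison: %#llx, length: %llu\n",
				ndctl_cmd_ars_get_record_addr(ars_stat, i),
				ndctl_cmd_ars_get_record_len(ars_stat, i));
}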
diff --git a/ndctl/lib/ars.c b/ndctl/lib/ars.c
index 1ff6cf7..ef30617 100644
--- a/ndctl/lib/ars.c
+++ b/ndctl/lib/ars.c
@@ -216,15 +216,31 @@ NDCTL_EXPORT int ndctl_cmd_ars_in_progress(struct ndctl_cmd *cmd)
return 0;
}
+static bool __validate_ars_stat(struct ndctl_cmd *ars_stat)
+{
+ if (ars_stat->type != ND_CMD_ARS_STATUS || ars_stat->status != 0)
+ return false;
+ if ((*ars_stat->firmware_status & ARS_STATUS_MASK) != 0)
+ return false;
+ return true;
+}
+
+#define validate_ars_stat(ctx, ars_stat) \
+({ \
+ bool __valid = __validate_ars_stat(ars_stat); \
+ if (!__valid) \
+ dbg(ctx, "expected sucessfully completed ars_stat command\n"); \
+ __valid; \
+})
+
NDCTL_EXPORT unsigned int ndctl_cmd_ars_num_records(struct ndctl_cmd *ars_stat)
{
struct ndctl_ctx *ctx = ndctl_bus_get_ctx(cmd_to_bus(ars_stat));
- if (ars_stat->type == ND_CMD_ARS_STATUS && ars_stat->status == 0)
- return ars_stat->ars_status->num_records;
+ if (!validate_ars_stat(ctx, ars_stat))
+ return 0;
- dbg(ctx, "invalid ars_status\n");
- return 0;
+ return ars_stat->ars_status->num_records;
}
NDCTL_EXPORT unsigned long long ndctl_cmd_ars_get_record_addr(
@@ -237,11 +253,10 @@ NDCTL_EXPORT unsigned long long ndctl_cmd_ars_get_record_addr(
return 0;
}
- if (ars_stat->type == ND_CMD_ARS_STATUS && ars_stat->status == 0)
- return ars_stat->ars_status->records[rec_index].err_address;
+ if (!validate_ars_stat(ctx, ars_stat))
+ return 0;
- dbg(ctx, "invalid ars_status\n");
- return 0;
+ return ars_stat->ars_status->records[rec_index].err_address;
}
NDCTL_EXPORT unsigned long long ndctl_cmd_ars_get_record_len(
@@ -254,11 +269,10 @@ NDCTL_EXPORT unsigned long long ndctl_cmd_ars_get_record_len(
return 0;
}
- if (ars_stat->type == ND_CMD_ARS_STATUS && ars_stat->status == 0)
- return ars_stat->ars_status->records[rec_index].length;
+ if (!validate_ars_stat(ctx, ars_stat))
+ return 0;
- dbg(ctx, "invalid ars_status\n");
- return 0;
+ return ars_stat->ars_status->records[rec_index].length;
}
NDCTL_EXPORT struct ndctl_cmd *ndctl_bus_cmd_new_clear_error(
--
2.14.3