[PATCH v9 17/50] KVM: arm64: nv: Handle shadow stage 2 page faults

Tue May 2 02:35:40 PDT 2023

Hi Marc,

On 05-04-2023 09:09 pm, Marc Zyngier wrote:
> If we are faulting on a shadow stage 2 translation, we first walk the
> guest hypervisor's stage 2 page table to see if it has a mapping. If
> not, we inject a stage 2 page fault to the virtual EL2. Otherwise, we
> create a mapping in the shadow stage 2 page table.
> 
> Note that we have to deal with two IPAs when we get a shadow stage 2
> page fault. One is the address we faulted on, and is in the L2 guest
> phys space. The other is from the guest stage-2 page table walk, and is
> in the L1 guest phys space.  To differentiate them, we rename variables
> so that fault_ipa is used for the former and ipa is used for the latter.
> 
> Co-developed-by: Christoffer Dall <christoffer.dall at linaro.org>
> Co-developed-by: Jintack Lim <jintack.lim at linaro.org>
> Signed-off-by: Christoffer Dall <christoffer.dall at linaro.org>
> Signed-off-by: Jintack Lim <jintack.lim at linaro.org>
> [maz: rewrote this multiple times...]
> Signed-off-by: Marc Zyngier <maz at kernel.org>
> ---
>   arch/arm64/include/asm/kvm_emulate.h |  6 ++
>   arch/arm64/include/asm/kvm_nested.h  | 19 ++++++
>   arch/arm64/kvm/mmu.c                 | 89 ++++++++++++++++++++++++----
>   arch/arm64/kvm/nested.c              | 48 +++++++++++++++
>   4 files changed, 152 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index fe4b4b893fb8..1aa059ebb569 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -646,4 +646,10 @@ static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature)
>   	return test_bit(feature, vcpu->arch.features);
>   }
>   
> +static inline bool kvm_is_shadow_s2_fault(struct kvm_vcpu *vcpu)
> +{
> +	return (vcpu->arch.hw_mmu != &vcpu->kvm->arch.mmu &&
> +		vcpu->arch.hw_mmu->nested_stage2_enabled);
> +}
> +
>   #endif /* __ARM64_KVM_EMULATE_H__ */
> diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
> index 19796d4b0798..33a25ac0e258 100644
> --- a/arch/arm64/include/asm/kvm_nested.h
> +++ b/arch/arm64/include/asm/kvm_nested.h
> @@ -76,9 +76,28 @@ struct kvm_s2_trans {
>   	u64 upper_attr;
>   };
>   
> +static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans)
> +{
> +	return trans->output;
> +}
> +
> +static inline unsigned long kvm_s2_trans_size(struct kvm_s2_trans *trans)
> +{
> +	return trans->block_size;
> +}
> +
> +static inline u32 kvm_s2_trans_esr(struct kvm_s2_trans *trans)
> +{
> +	return trans->esr;
> +}
> +
>   extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
>   			      struct kvm_s2_trans *result);
>   
> +extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
> +				    struct kvm_s2_trans *trans);
> +extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
> +int handle_wfx_nested(struct kvm_vcpu *vcpu, bool is_wfe);
>   extern bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg,
>   			    u64 control_bit);
>   extern bool forward_traps(struct kvm_vcpu *vcpu, u64 control_bit);
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index b2612763abc1..e08001a45a89 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1251,14 +1251,16 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
>   }
>   
>   static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> -			  struct kvm_memory_slot *memslot, unsigned long hva,
> -			  unsigned long fault_status)
> +			  struct kvm_s2_trans *nested,
> +			  struct kvm_memory_slot *memslot,
> +			  unsigned long hva, unsigned long fault_status)
>   {
>   	int ret = 0;
>   	bool write_fault, writable, force_pte = false;
>   	bool exec_fault, mte_allowed;
>   	bool device = false;
>   	unsigned long mmu_seq;
> +	phys_addr_t ipa = fault_ipa;
>   	struct kvm *kvm = vcpu->kvm;
>   	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
>   	struct vm_area_struct *vma;
> @@ -1343,10 +1345,38 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>   	}
>   
>   	vma_pagesize = 1UL << vma_shift;
> +
> +	if (nested) {
> +		unsigned long max_map_size;
> +
> +		max_map_size = force_pte ? PUD_SIZE : PAGE_SIZE;
> +
> +		ipa = kvm_s2_trans_output(nested);
> +
> +		/*
> +		 * If we're about to create a shadow stage 2 entry, then we
> +		 * can only create a block mapping if the guest stage 2 page
> +		 * table uses at least as big a mapping.
> +		 */
> +		max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
> +
> +		/*
> +		 * Be careful that if the mapping size falls between
> +		 * two host sizes, take the smallest of the two.
> +		 */
> +		if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
> +			max_map_size = PMD_SIZE;
> +		else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
> +			max_map_size = PAGE_SIZE;
> +

Thanks for folding the fix [1] into this patch.
Please feel free to add:

Reviewed-by: Ganapatrao Kulkarni <gankulkarni at os.amperecomputing.com>

[1] 
https://lore.kernel.org/linux-arm-kernel/20220824060304.21128-1-gankulkarni@os.amperecomputing.com/T/#m2d0d950604009f0ab3f8217b3b1daf6f34385c7e

> +		force_pte = (max_map_size == PAGE_SIZE);
> +		vma_pagesize = min(vma_pagesize, (long)max_map_size);
> +	}
> +
>   	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
>   		fault_ipa &= ~(vma_pagesize - 1);
>   
> -	gfn = fault_ipa >> PAGE_SHIFT;
> +	gfn = ipa >> PAGE_SHIFT;
>   	mte_allowed = kvm_vma_mte_allowed(vma);
>   
>   	/* Don't use the VMA after the unlock -- it may have vanished */
> @@ -1497,8 +1527,10 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
>    */
>   int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>   {
> +	struct kvm_s2_trans nested_trans, *nested = NULL;
>   	unsigned long fault_status;
> -	phys_addr_t fault_ipa;
> +	phys_addr_t fault_ipa; /* The address we faulted on */
> +	phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
>   	struct kvm_memory_slot *memslot;
>   	unsigned long hva;
>   	bool is_iabt, write_fault, writable;
> @@ -1507,7 +1539,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>   
>   	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
>   
> -	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
> +	ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
>   	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
>   
>   	if (fault_status == ESR_ELx_FSC_FAULT) {
> @@ -1548,6 +1580,12 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>   	if (fault_status != ESR_ELx_FSC_FAULT &&
>   	    fault_status != ESR_ELx_FSC_PERM &&
>   	    fault_status != ESR_ELx_FSC_ACCESS) {
> +		/*
> +		 * We must never see an address size fault on shadow stage 2
> +		 * page table walk, because we would have injected an addr
> +		 * size fault when we walked the nested s2 page and not
> +		 * create the shadow entry.
> +		 */
>   		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
>   			kvm_vcpu_trap_get_class(vcpu),
>   			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
> @@ -1557,7 +1595,37 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>   
>   	idx = srcu_read_lock(&vcpu->kvm->srcu);
>   
> -	gfn = fault_ipa >> PAGE_SHIFT;
> +	/*
> +	 * We may have faulted on a shadow stage 2 page table if we are
> +	 * running a nested guest.  In this case, we have to resolve the L2
> +	 * IPA to the L1 IPA first, before knowing what kind of memory should
> +	 * back the L1 IPA.
> +	 *
> +	 * If the shadow stage 2 page table walk faults, then we simply inject
> +	 * this to the guest and carry on.
> +	 */
> +	if (kvm_is_shadow_s2_fault(vcpu)) {
> +		u32 esr;
> +
> +		ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
> +		if (ret) {
> +			esr = kvm_s2_trans_esr(&nested_trans);
> +			kvm_inject_s2_fault(vcpu, esr);
> +			goto out_unlock;
> +		}
> +
> +		ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
> +		if (ret) {
> +			esr = kvm_s2_trans_esr(&nested_trans);
> +			kvm_inject_s2_fault(vcpu, esr);
> +			goto out_unlock;
> +		}
> +
> +		ipa = kvm_s2_trans_output(&nested_trans);
> +		nested = &nested_trans;
> +	}
> +
> +	gfn = ipa >> PAGE_SHIFT;
>   	memslot = gfn_to_memslot(vcpu->kvm, gfn);
>   	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
>   	write_fault = kvm_is_write_fault(vcpu);
> @@ -1601,13 +1669,13 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>   		 * faulting VA. This is always 12 bits, irrespective
>   		 * of the page size.
>   		 */
> -		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
> -		ret = io_mem_abort(vcpu, fault_ipa);
> +		ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
> +		ret = io_mem_abort(vcpu, ipa);
>   		goto out_unlock;
>   	}
>   
>   	/* Userspace should not be able to register out-of-bounds IPAs */
> -	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
> +	VM_BUG_ON(ipa >= kvm_phys_size(vcpu->kvm));
>   
>   	if (fault_status == ESR_ELx_FSC_ACCESS) {
>   		handle_access_fault(vcpu, fault_ipa);
> @@ -1615,7 +1683,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>   		goto out_unlock;
>   	}
>   
> -	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
> +	ret = user_mem_abort(vcpu, fault_ipa, nested,
> +			     memslot, hva, fault_status);
>   	if (ret == 0)
>   		ret = 1;
>   out:
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index 1e5eb8140012..1cf2ad18a5cd 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -112,6 +112,15 @@ static u32 compute_fsc(int level, u32 fsc)
>   	return fsc | (level & 0x3);
>   }
>   
> +static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc)
> +{
> +	u32 esr;
> +
> +	esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC;
> +	esr |= compute_fsc(level, fsc);
> +	return esr;
> +}
> +
>   static int check_base_s2_limits(struct s2_walk_info *wi,
>   				int level, int input_size, int stride)
>   {
> @@ -478,6 +487,45 @@ void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
>   	}
>   }
>   
> +/*
> + * Returns non-zero if permission fault is handled by injecting it to the next
> + * level hypervisor.
> + */
> +int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans)
> +{
> +	unsigned long fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
> +	bool forward_fault = false;
> +
> +	trans->esr = 0;
> +
> +	if (fault_status != ESR_ELx_FSC_PERM)
> +		return 0;
> +
> +	if (kvm_vcpu_trap_is_iabt(vcpu)) {
> +		forward_fault = (trans->upper_attr & BIT(54));
> +	} else {
> +		bool write_fault = kvm_is_write_fault(vcpu);
> +
> +		forward_fault = ((write_fault && !trans->writable) ||
> +				 (!write_fault && !trans->readable));
> +	}
> +
> +	if (forward_fault) {
> +		trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM);
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
> +{
> +	vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2);
> +	vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2);
> +
> +	return kvm_inject_nested_sync(vcpu, esr_el2);
> +}
> +
>   void kvm_arch_flush_shadow_all(struct kvm *kvm)
>   {
>   	int i;

Thanks,
Ganapat