qemu-patch-raspberry4/include/exec/cpu_ldst_template.h
Emilio G. Cota 403f290c06 cputlb: read CPUTLBEntry.addr_write atomically
Updates can come from other threads, so readers that do not
take tlb_lock must use atomic_read to avoid undefined
behaviour (UB).

This completes the conversion to tlb_lock. This conversion results
on average in no performance loss, as the following experiments
(run on an Intel i7-6700K CPU @ 4.00GHz) show.

1. aarch64 bootup+shutdown test:

- Before:
 Performance counter stats for 'taskset -c 0 ../img/aarch64/die.sh' (10 runs):

       7487.087786      task-clock (msec)         #    0.998 CPUs utilized            ( +-  0.12% )
    31,574,905,303      cycles                    #    4.217 GHz                      ( +-  0.12% )
    57,097,908,812      instructions              #    1.81  insns per cycle          ( +-  0.08% )
    10,255,415,367      branches                  # 1369.747 M/sec                    ( +-  0.08% )
       173,278,962      branch-misses             #    1.69% of all branches          ( +-  0.18% )

       7.504481349 seconds time elapsed                                          ( +-  0.14% )

- After:
 Performance counter stats for 'taskset -c 0 ../img/aarch64/die.sh' (10 runs):

       7462.441328      task-clock (msec)         #    0.998 CPUs utilized            ( +-  0.07% )
    31,478,476,520      cycles                    #    4.218 GHz                      ( +-  0.07% )
    57,017,330,084      instructions              #    1.81  insns per cycle          ( +-  0.05% )
    10,251,929,667      branches                  # 1373.804 M/sec                    ( +-  0.05% )
       173,023,787      branch-misses             #    1.69% of all branches          ( +-  0.11% )

       7.474970463 seconds time elapsed                                          ( +-  0.07% )

2. SPEC06int:
                                              SPEC06int (test set)
                                           [Y axis: Speedup over master]
  1.15 +-+----+------+------+------+------+------+-------+------+------+------+------+------+------+----+-+
       |                                                                                                  |
   1.1 +-+.................................+++.............................+  tlb-lock-v2 (m+++x)       +-+
       |                                +++ |                   +++        tlb-lock-v3 (spinl|ck)         |
       |                    +++          |  |     +++    +++     |                           |            |
  1.05 +-+....+++...........####.........|####.+++.|......|.....###....+++...........+++....###.........+-+
       |      ###         ++#| #         |# |# ***### +++### +++#+#     |     +++     |     #|#    ###    |
     1 +-+++***+#++++####+++#++#++++++++++#++#+*+*++#++++#+#+****+#++++###++++###++++###++++#+#++++#+#+++-+
       |    *+* #    #++# ***  #   #### ***  # * *++# ****+# *| * # ****|#   |# #    #|#    #+#    # #    |
  0.95 +-+..*.*.#....#..#.*|*..#...#..#.*|*..#.*.*..#.*|.*.#.*++*.#.*++*+#.****.#....#+#....#.#..++#.#..+-+
       |    * * #    #  # *|*  #   #  # *|*  # * *  # *++* # *  * # *  * # * |* #  ++# #    # #  *** #    |
       |    * * #  ++#  # *+*  #   #  # *|*  # * *  # *  * # *  * # *  * # *++* # **** #  ++# #  * * #    |
   0.9 +-+..*.*.#...|#..#.*.*..#.++#..#.*|*..#.*.*..#.*..*.#.*..*.#.*..*.#.*..*.#.*.|*.#...|#.#..*.*.#..+-+
       |    * * #  ***  # * *  #  |#  # *+*  # * *  # *  * # *  * # *  * # *  * # *++* #   |# #  * * #    |
  0.85 +-+..*.*.#..*|*..#.*.*..#.***..#.*.*..#.*.*..#.*..*.#.*..*.#.*..*.#.*..*.#.*..*.#.****.#..*.*.#..+-+
       |    * * #  *+*  # * *  # *|*  # * *  # * *  # *  * # *  * # *  * # *  * # *  * # * |* #  * * #    |
       |    * * #  * *  # * *  # *+*  # * *  # * *  # *  * # *  * # *  * # *  * # *  * # * |* #  * * #    |
   0.8 +-+..*.*.#..*.*..#.*.*..#.*.*..#.*.*..#.*.*..#.*..*.#.*..*.#.*..*.#.*..*.#.*..*.#.*++*.#..*.*.#..+-+
       |    * * #  * *  # * *  # * *  # * *  # * *  # *  * # *  * # *  * # *  * # *  * # *  * #  * * #    |
  0.75 +-+--***##--***###-***###-***###-***###-***###-****##-****##-****##-****##-****##-****##--***##--+-+
 400.perlben401.bzip2403.gcc429.m445.gob456.hmme45462.libqua464.h26471.omnet473483.xalancbmkgeomean

  png: https://imgur.com/a/BHzpPTW

Notes:
- tlb-lock-v2 corresponds to an implementation with a mutex.
- tlb-lock-v3 corresponds to the current implementation, i.e.
  a spinlock and a single lock acquisition in tlb_set_page_with_attrs.

Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <20181016153840.25877-1-cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2018-10-18 19:46:53 -07:00

211 lines
5.7 KiB
C

/*
* Software MMU support
*
* Generate inline load/store functions for one MMU mode and data
* size.
*
* Generate a store function as well as signed and unsigned loads.
*
* Not used directly but included from cpu_ldst.h.
*
* Copyright (c) 2003 Fabrice Bellard
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#if !defined(SOFTMMU_CODE_ACCESS)
#include "trace-root.h"
#endif
#include "trace/mem.h"
#if DATA_SIZE == 8
#define SUFFIX q
#define USUFFIX q
#define DATA_TYPE uint64_t
#define SHIFT 3
#elif DATA_SIZE == 4
#define SUFFIX l
#define USUFFIX l
#define DATA_TYPE uint32_t
#define SHIFT 2
#elif DATA_SIZE == 2
#define SUFFIX w
#define USUFFIX uw
#define DATA_TYPE uint16_t
#define DATA_STYPE int16_t
#define SHIFT 1
#elif DATA_SIZE == 1
#define SUFFIX b
#define USUFFIX ub
#define DATA_TYPE uint8_t
#define DATA_STYPE int8_t
#define SHIFT 0
#else
#error unsupported data size
#endif
#if DATA_SIZE == 8
#define RES_TYPE uint64_t
#else
#define RES_TYPE uint32_t
#endif
#ifdef SOFTMMU_CODE_ACCESS
#define ADDR_READ addr_code
#define MMUSUFFIX _cmmu
#define URETSUFFIX SUFFIX
#define SRETSUFFIX SUFFIX
#else
#define ADDR_READ addr_read
#define MMUSUFFIX _mmu
#define URETSUFFIX USUFFIX
#define SRETSUFFIX glue(s, SUFFIX)
#endif
/* generic load/store macros */
static inline RES_TYPE
glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
target_ulong ptr,
uintptr_t retaddr)
{
CPUTLBEntry *entry;
RES_TYPE res;
target_ulong addr;
int mmu_idx;
TCGMemOpIdx oi;
#if !defined(SOFTMMU_CODE_ACCESS)
trace_guest_mem_before_exec(
ENV_GET_CPU(env), ptr,
trace_mem_build_info(SHIFT, false, MO_TE, false));
#endif
addr = ptr;
mmu_idx = CPU_MMU_INDEX;
entry = tlb_entry(env, mmu_idx, addr);
if (unlikely(entry->ADDR_READ !=
(addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
oi = make_memop_idx(SHIFT, mmu_idx);
res = glue(glue(helper_ret_ld, URETSUFFIX), MMUSUFFIX)(env, addr,
oi, retaddr);
} else {
uintptr_t hostaddr = addr + entry->addend;
res = glue(glue(ld, USUFFIX), _p)((uint8_t *)hostaddr);
}
return res;
}
static inline RES_TYPE
glue(glue(cpu_ld, USUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr)
{
return glue(glue(glue(cpu_ld, USUFFIX), MEMSUFFIX), _ra)(env, ptr, 0);
}
#if DATA_SIZE <= 2
static inline int
glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
target_ulong ptr,
uintptr_t retaddr)
{
CPUTLBEntry *entry;
int res;
target_ulong addr;
int mmu_idx;
TCGMemOpIdx oi;
#if !defined(SOFTMMU_CODE_ACCESS)
trace_guest_mem_before_exec(
ENV_GET_CPU(env), ptr,
trace_mem_build_info(SHIFT, true, MO_TE, false));
#endif
addr = ptr;
mmu_idx = CPU_MMU_INDEX;
entry = tlb_entry(env, mmu_idx, addr);
if (unlikely(entry->ADDR_READ !=
(addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
oi = make_memop_idx(SHIFT, mmu_idx);
res = (DATA_STYPE)glue(glue(helper_ret_ld, SRETSUFFIX),
MMUSUFFIX)(env, addr, oi, retaddr);
} else {
uintptr_t hostaddr = addr + entry->addend;
res = glue(glue(lds, SUFFIX), _p)((uint8_t *)hostaddr);
}
return res;
}
static inline int
glue(glue(cpu_lds, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr)
{
return glue(glue(glue(cpu_lds, SUFFIX), MEMSUFFIX), _ra)(env, ptr, 0);
}
#endif
#ifndef SOFTMMU_CODE_ACCESS
/* generic store macro */
static inline void
glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(CPUArchState *env,
target_ulong ptr,
RES_TYPE v, uintptr_t retaddr)
{
CPUTLBEntry *entry;
target_ulong addr;
int mmu_idx;
TCGMemOpIdx oi;
#if !defined(SOFTMMU_CODE_ACCESS)
trace_guest_mem_before_exec(
ENV_GET_CPU(env), ptr,
trace_mem_build_info(SHIFT, false, MO_TE, true));
#endif
addr = ptr;
mmu_idx = CPU_MMU_INDEX;
entry = tlb_entry(env, mmu_idx, addr);
if (unlikely(tlb_addr_write(entry) !=
(addr & (TARGET_PAGE_MASK | (DATA_SIZE - 1))))) {
oi = make_memop_idx(SHIFT, mmu_idx);
glue(glue(helper_ret_st, SUFFIX), MMUSUFFIX)(env, addr, v, oi,
retaddr);
} else {
uintptr_t hostaddr = addr + entry->addend;
glue(glue(st, SUFFIX), _p)((uint8_t *)hostaddr, v);
}
}
static inline void
glue(glue(cpu_st, SUFFIX), MEMSUFFIX)(CPUArchState *env, target_ulong ptr,
RES_TYPE v)
{
glue(glue(glue(cpu_st, SUFFIX), MEMSUFFIX), _ra)(env, ptr, v, 0);
}
#endif /* !SOFTMMU_CODE_ACCESS */
#undef RES_TYPE
#undef DATA_TYPE
#undef DATA_STYPE
#undef SUFFIX
#undef USUFFIX
#undef DATA_SIZE
#undef MMUSUFFIX
#undef ADDR_READ
#undef URETSUFFIX
#undef SRETSUFFIX
#undef SHIFT