diff mbox

[RFC] tcg: add ability to dump /tmp/perf-<pid>.map files

Message ID 1395938253-9225-1-git-send-email-alex.bennee@linaro.org
State Superseded
Headers show

Commit Message

Alex Bennée March 27, 2014, 4:37 p.m. UTC
From: Alex Bennée <alex.bennee@linaro.org>

This allows the perf tool to map samples to each individual translation
block. This could be expanded for user space but currently it gives
enough information to find any hotblocks by other means.
---
 qemu-options.hx | 10 ++++++++++
 tcg/tcg.c       | 21 +++++++++++++++++++++
 vl.c            |  6 ++++++
 3 files changed, 37 insertions(+)

Comments

Richard Henderson March 27, 2014, 7:51 p.m. UTC | #1
On 03/27/2014 09:37 AM, alex.bennee@linaro.org wrote:
> From: Alex Bennée <alex.bennee@linaro.org>
> 
> This allows the perf tool to map samples to each individual translation
> block. This could be expanded for user space but currently it gives
> enough information to find any hotblocks by other means.

Plausible, I suppose.

>  TCGOpDef tcg_op_defs[] = {
>  #define DEF(s, oargs, iargs, cargs, flags) { #s, oargs, iargs, cargs, iargs + oargs + cargs, flags },
> @@ -2575,6 +2579,8 @@ static inline int tcg_gen_code_common(TCGContext *s, uint64_t target_pc,
>   the_end:
>      /* Generate TB finalization at the end of block */
>      tcg_out_tb_finalize(s);
> +
> +    tcg_write_perfmap(gen_code_buf, s->code_ptr - gen_code_buf, target_pc);
>      return -1;
>  }
>  
> @@ -2666,6 +2672,21 @@ void tcg_dump_info(FILE *f, fprintf_function cpu_fprintf)
>  }
>  #endif
>  
> +static FILE *tcg_perfmap = NULL;
> +void qemu_tcg_enable_perfmap(void) {
> +    gchar * map_file = g_strdup_printf("/tmp/perf-%d.map", getpid());
> +    tcg_perfmap = g_fopen(map_file, "w");
> +    g_free(map_file);
> +}
> +
> +static void tcg_write_perfmap(uint8_t *start, uint64_t size, uint64_t target_pc)
> +{
> +    if (tcg_perfmap) {
> +        g_fprintf(tcg_perfmap, "%lx %lx subject-0x%lx\n",
> +                  (uint64_t) start, size, target_pc);
> +    }
> +}

Why oh why do you feel the need to use g_fprintf?  That's just gratuitous glib
obfuscation, that.

For this small of a single-use function, I think it would be clearer to just do
the printf directly in tcg_gen_code_common.  Certainly no need to use uint64_t
all over the place; ptrdiff_t and target_ulong are exactly the right thing.


r~
Alex Bennée March 28, 2014, 11:12 a.m. UTC | #2
Richard Henderson <rth@twiddle.net> writes:

> On 03/27/2014 09:37 AM, alex.bennee@linaro.org wrote:
>> From: Alex Bennée <alex.bennee@linaro.org>
>> 
>> This allows the perf tool to map samples to each individual translation
>> block. This could be expanded for user space but currently it gives
>> enough information to find any hotblocks by other means.
>
> Plausible, I suppose.

It works fine on my toy test case (kernel + single computation
user-space /init task) but I've not really used it in anger for any
performance tweaking hence it's only an RFC patch. Hopefully it will
save re-inventing the wheel should anyone actually want to do serious
tcg optimisation.

I've had a brief poke around the TCG profiling and seen it track
generation cost. Do we have any hotblock tracking in the built-in profiler?

<snip>
>> +static void tcg_write_perfmap(uint8_t *start, uint64_t size, uint64_t target_pc)
>> +{
>> +    if (tcg_perfmap) {
>> +        g_fprintf(tcg_perfmap, "%lx %lx subject-0x%lx\n",
>> +                  (uint64_t) start, size, target_pc);
>> +    }
>> +}
>
> Why oh why do you feel the need to use g_fprintf?  That's just gratuitous glib
> obfuscation, that.

Sorry my bad, force of habit given that we have glib as a dependency.
But your right in this case it's just a dumb wrapper unlike when your
doing more string mangling where glib save a lot of time.

> For this small of a single-use function, I think it would be clearer to just do
> the printf directly in tcg_gen_code_common.  Certainly no need to use uint64_t
> all over the place; ptrdiff_t and target_ulong are exactly the right
> thing.

Do we have a format macro for target_ulong? The uint64_t was just for
simplicity. I found when I was messing with the trace-event stuff some
of the target types where barred from the common code although I guess
in this case the tcg is very aware of the execution target.
Kirill Batuzov March 28, 2014, 12:29 p.m. UTC | #3
On Thu, 27 Mar 2014, alex.bennee@linaro.org wrote:

> From: Alex Bennée <alex.bennee@linaro.org>
> 
> This allows the perf tool to map samples to each individual translation
> block. This could be expanded for user space but currently it gives
> enough information to find any hotblocks by other means.

I'm in favor of this patch. Being able to profile guest code with perf
will be very helpful for developing and tweaking optimizations in tcg.

<snip>

> @@ -2575,6 +2579,8 @@ static inline int tcg_gen_code_common(TCGContext *s, uint64_t target_pc,
>   the_end:
>      /* Generate TB finalization at the end of block */
>      tcg_out_tb_finalize(s);
> +
> +    tcg_write_perfmap(gen_code_buf, s->code_ptr - gen_code_buf, target_pc);
>      return -1;
>  }

I think a part of the patch is missing here. tcg_gen_code_common does not
have target_pc argument in current master.

<snip>

> diff --git a/vl.c b/vl.c
> index c036367..f1c3c3d 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -123,6 +123,9 @@ int main(int argc, char **argv)
>  #define MAX_VIRTIO_CONSOLES 1
>  #define MAX_SCLP_CONSOLES 1
>  
> +/* seems better than pulling in all the tcg headers? */
> +extern void qemu_tcg_enable_perfmap(void);
> +
>  static const char *data_dir[16];
>  static int data_dir_idx;
>  const char *bios_name = NULL;
> @@ -3345,6 +3348,9 @@ int main(int argc, char **argv, char **envp)
>              case QEMU_OPTION_DFILTER:
>                  qemu_set_dfilter_ranges(optarg);
>                  break;
> +            case QEMU_OPTION_PERFMAP:
> +                qemu_tcg_enable_perfmap();
> +                break;
>              case QEMU_OPTION_s:
>                  add_device_config(DEV_GDB, "tcp::" DEFAULT_GDBSTUB_PORT);
>                  break;
> 

Is there any particular reason it is not enabled for linux-user mode? I
think it should work just fine in both modes. linux-user handles command
line options by itself in a different manner (and it has it's own set of
options, different from softmmu targets). The corresponding code is in
linux-user/main.c.
Richard Henderson March 28, 2014, 1:29 p.m. UTC | #4
On 03/28/2014 04:12 AM, Alex Bennée wrote:
> Do we have a format macro for target_ulong?

TCG_PRIlx or TCG_PRIld.


r~
Richard Henderson March 28, 2014, 1:30 p.m. UTC | #5
On 03/28/2014 04:12 AM, Alex Bennée wrote:
> I've had a brief poke around the TCG profiling and seen it track
> generation cost. Do we have any hotblock tracking in the built-in profiler?

No.


r~
Alex Bennée March 28, 2014, 4:34 p.m. UTC | #6
Kirill Batuzov <batuzovk@ispras.ru> writes:

> On Thu, 27 Mar 2014, alex.bennee@linaro.org wrote:
>
>> From: Alex Bennée <alex.bennee@linaro.org>
>> 
>> This allows the perf tool to map samples to each individual translation
>> block. This could be expanded for user space but currently it gives
>> enough information to find any hotblocks by other means.
>
> I'm in favor of this patch. Being able to profile guest code with perf
> will be very helpful for developing and tweaking optimizations in tcg.

OK I'll roll in Richard's comments and re-submit.

>
> <snip>
>
>> @@ -2575,6 +2579,8 @@ static inline int tcg_gen_code_common(TCGContext *s, uint64_t target_pc,
>>   the_end:
>>      /* Generate TB finalization at the end of block */
>>      tcg_out_tb_finalize(s);
>> +
>> +    tcg_write_perfmap(gen_code_buf, s->code_ptr - gen_code_buf, target_pc);
>>      return -1;
>>  }
>
> I think a part of the patch is missing here. tcg_gen_code_common does not
> have target_pc argument in current master.

Ahh yes - it's currently sitting on-top of my qemu-log series which
pulled target_pc into the tcg_gen_code_common loop for the -dfilter
patch. I'll be sending the updated series in a moment.

> <snip>
>
>> diff --git a/vl.c b/vl.c
>> index c036367..f1c3c3d 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -123,6 +123,9 @@ int main(int argc, char **argv)
>>  #define MAX_VIRTIO_CONSOLES 1
>>  #define MAX_SCLP_CONSOLES 1
>>  
>> +/* seems better than pulling in all the tcg headers? */
>> +extern void qemu_tcg_enable_perfmap(void);
>> +

I'm surprised no one has pulled me up on this yet, perhaps it is just
less ugly!

>>  static const char *data_dir[16];
>>  static int data_dir_idx;
>>  const char *bios_name = NULL;
>> @@ -3345,6 +3348,9 @@ int main(int argc, char **argv, char **envp)
>>              case QEMU_OPTION_DFILTER:
>>                  qemu_set_dfilter_ranges(optarg);
>>                  break;
>> +            case QEMU_OPTION_PERFMAP:
>> +                qemu_tcg_enable_perfmap();
>> +                break;
>>              case QEMU_OPTION_s:
>>                  add_device_config(DEV_GDB, "tcp::" DEFAULT_GDBSTUB_PORT);
>>                  break;
>> 
>
> Is there any particular reason it is not enabled for linux-user mode? I
> think it should work just fine in both modes. linux-user handles command
> line options by itself in a different manner (and it has it's own set of
> options, different from softmmu targets). The corresponding code is in
> linux-user/main.c.

No real reason. I was only experimenting with system emulation so that's
all I'd tested it with. In fact for linux-user we should be able to make
a better stab at giving the blocks decent names thanks to symbol-lookup.

I'll update the patch to do it.
Kirill Batuzov March 28, 2014, 8:04 p.m. UTC | #7
On 28.03.2014 20:34, Alex Bennée wrote:

>>> @@ -2575,6 +2579,8 @@ static inline int 
>>> tcg_gen_code_common(TCGContext *s, uint64_t target_pc,
>>>    the_end:
>>>       /* Generate TB finalization at the end of block */
>>>       tcg_out_tb_finalize(s);
>>> +
>>> +    tcg_write_perfmap(gen_code_buf, s->code_ptr - gen_code_buf, 
>>> target_pc);
>>>       return -1;
>>>   }
>> 
>> I think a part of the patch is missing here. tcg_gen_code_common does 
>> not
>> have target_pc argument in current master.
> 
> Ahh yes - it's currently sitting on-top of my qemu-log series which
> pulled target_pc into the tcg_gen_code_common loop for the -dfilter
> patch. I'll be sending the updated series in a moment.
> 
>> <snip>
>> 
>>> diff --git a/vl.c b/vl.c
>>> index c036367..f1c3c3d 100644
>>> --- a/vl.c
>>> +++ b/vl.c
>>> @@ -123,6 +123,9 @@ int main(int argc, char **argv)
>>>   #define MAX_VIRTIO_CONSOLES 1
>>>   #define MAX_SCLP_CONSOLES 1
>>> 
>>> +/* seems better than pulling in all the tcg headers? */
>>> +extern void qemu_tcg_enable_perfmap(void);
>>> +
> 
> I'm surprised no one has pulled me up on this yet, perhaps it is just
> less ugly!
> 
I think I have a better solution for this. You can move 
qemu_tcg_enable_refmap and tcg_write_refmap away from tcg to 
translate-all.c (you'll need to rename them obviously). write_refmap 
should be called from cpu_gen_code. It has number of advantages:

  * For writing refmap you do not need any backend details. The only 
thing you need is to know that some guest address has been translated to 
some host range. That is exactly what cpu_gen_code does.
  * In cpu_gen_code you have all needed information. No need to pull 
target_pc specifically.
  * Declaration could be moved somewhere into exec folder. To exec-all.h 
probably, but I'm not sure here. Anyway, you will not need tcg staff in 
unrelated areas any more.
  * You'll avoid writing map entries for tcg_gen_code_search_pc calls. 
This is good because tcg_gen_code_search_pc does not create new 
translation blocks.
diff mbox

Patch

diff --git a/qemu-options.hx b/qemu-options.hx
index c5577be..09fb1d0 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2683,6 +2683,16 @@  Will dump output for any code in the 0x1000 sized block starting at 0x8000 and
 the 0x200 sized block starting at 0xffffffc000080000.
 ETEXI
 
+DEF("perfmap", 0, QEMU_OPTION_PERFMAP, \
+    "-perfmap  generate a /tmp/perf-${pid}.map file for perf\n",
+    QEMU_ARCH_ALL)
+STEXI
+@item -perfmap
+@findex -perfmap
+This will cause QEMU to generate a map file for Linux perf tools that will allow
+basic profiling information to be broken down into basic blocks.
+ETEXI
+
 DEF("L", HAS_ARG, QEMU_OPTION_L, \
     "-L path         set the directory for the BIOS, VGA BIOS and keymaps\n",
     QEMU_ARCH_ALL)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 57d2b82..a24f581 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -27,6 +27,8 @@ 
 #define USE_TCG_OPTIMIZATIONS
 
 #include "config.h"
+#include <glib.h>
+#include <glib/gstdio.h>
 
 /* Define to jump the ELF file used to communicate with GDB.  */
 #undef DEBUG_JIT
@@ -106,6 +108,8 @@  static int tcg_target_const_match(tcg_target_long val,
 static void tcg_out_tb_init(TCGContext *s);
 static void tcg_out_tb_finalize(TCGContext *s);
 
+static void tcg_write_perfmap(uint8_t *start, uint64_t size, uint64_t target_pc);
+void qemu_tcg_enable_perfmap(void);
 
 TCGOpDef tcg_op_defs[] = {
 #define DEF(s, oargs, iargs, cargs, flags) { #s, oargs, iargs, cargs, iargs + oargs + cargs, flags },
@@ -2575,6 +2579,8 @@  static inline int tcg_gen_code_common(TCGContext *s, uint64_t target_pc,
  the_end:
     /* Generate TB finalization at the end of block */
     tcg_out_tb_finalize(s);
+
+    tcg_write_perfmap(gen_code_buf, s->code_ptr - gen_code_buf, target_pc);
     return -1;
 }
 
@@ -2666,6 +2672,21 @@  void tcg_dump_info(FILE *f, fprintf_function cpu_fprintf)
 }
 #endif
 
+static FILE *tcg_perfmap = NULL;
+void qemu_tcg_enable_perfmap(void) {
+    gchar * map_file = g_strdup_printf("/tmp/perf-%d.map", getpid());
+    tcg_perfmap = g_fopen(map_file, "w");
+    g_free(map_file);
+}
+
+static void tcg_write_perfmap(uint8_t *start, uint64_t size, uint64_t target_pc)
+{
+    if (tcg_perfmap) {
+        g_fprintf(tcg_perfmap, "%lx %lx subject-0x%lx\n",
+                  (uint64_t) start, size, target_pc);
+    }
+}
+
 #ifdef ELF_HOST_MACHINE
 /* In order to use this feature, the backend needs to do three things:
 
diff --git a/vl.c b/vl.c
index c036367..f1c3c3d 100644
--- a/vl.c
+++ b/vl.c
@@ -123,6 +123,9 @@  int main(int argc, char **argv)
 #define MAX_VIRTIO_CONSOLES 1
 #define MAX_SCLP_CONSOLES 1
 
+/* seems better than pulling in all the tcg headers? */
+extern void qemu_tcg_enable_perfmap(void);
+
 static const char *data_dir[16];
 static int data_dir_idx;
 const char *bios_name = NULL;
@@ -3345,6 +3348,9 @@  int main(int argc, char **argv, char **envp)
             case QEMU_OPTION_DFILTER:
                 qemu_set_dfilter_ranges(optarg);
                 break;
+            case QEMU_OPTION_PERFMAP:
+                qemu_tcg_enable_perfmap();
+                break;
             case QEMU_OPTION_s:
                 add_device_config(DEV_GDB, "tcp::" DEFAULT_GDBSTUB_PORT);
                 break;