Skip to content

Commit

Permalink
Use llvm-symbolizer's JSON output for symbolizing (#879)
Browse files Browse the repository at this point in the history
In some edge cases (e.g. injected JIT symbols), function names can contain
newlines. This breaks the parsing of llvm-symbolizer's line-based output and
makes pprof hang.

Conveniently, as of LLVM 13, llvm-symbolizer has a JSON output mode,
which is robust against all kinds of weirdness such as newlines. We can
use this instead of the line-based parsing, and as a bonus we get much
simpler handling of multiple frames in a stack, as the JSON output
already returns these as an array.

This also requires splitting the CODE and DATA processing into separate
functions, since the JSON output formats for the two differ. For now, we
keep the DATA result as before, a slightly hacky but functional concatenation
of start + size in the frame's file name, but this could be improved.

Co-authored-by: Alexey Alexandrov <aalexand@users.noreply.github.com>
  • Loading branch information
LeszekSwirski and aalexand authored Jul 27, 2024
1 parent 304e4f0 commit 813a5fb
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 71 deletions.
118 changes: 57 additions & 61 deletions internal/binutils/addr2liner_llvm.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package binutils

import (
"bufio"
"encoding/json"
"fmt"
"io"
"os/exec"
Expand All @@ -37,6 +38,7 @@ type llvmSymbolizer struct {
filename string
rw lineReaderWriter
base uint64
isData bool
}

type llvmSymbolizerJob struct {
Expand Down Expand Up @@ -76,7 +78,7 @@ func newLLVMSymbolizer(cmd, file string, base uint64, isData bool) (*llvmSymboli
}

j := &llvmSymbolizerJob{
cmd: exec.Command(cmd, "--inlining", "-demangle=false"),
cmd: exec.Command(cmd, "--inlining", "-demangle=false", "--output-style=JSON"),
symType: "CODE",
}
if isData {
Expand All @@ -102,63 +104,67 @@ func newLLVMSymbolizer(cmd, file string, base uint64, isData bool) (*llvmSymboli
filename: file,
rw: j,
base: base,
isData: isData,
}

return a, nil
}

// readFrame parses the llvm-symbolizer output for a single address. It
// returns a populated plugin.Frame and whether it has reached the end of the
// data.
func (d *llvmSymbolizer) readFrame() (plugin.Frame, bool) {
funcname, err := d.rw.readLine()
// readDataFrames parses the llvm-symbolizer DATA output for a single address. It
// returns a slice holding a single populated plugin.Frame, or an error if the output cannot be read or parsed.
func (d *llvmSymbolizer) readDataFrames() ([]plugin.Frame, error) {
line, err := d.rw.readLine()
if err != nil {
return plugin.Frame{}, true
return nil, err
}

switch funcname {
case "":
return plugin.Frame{}, true
case "??":
funcname = ""
var frame struct {
Address string `json:"Address"`
ModuleName string `json:"ModuleName"`
Data struct {
Start string `json:"Start"`
Size string `json:"Size"`
Name string `json:"Name"`
} `json:"Data"`
}
if err := json.Unmarshal([]byte(line), &frame); err != nil {
return nil, err
}
// Match non-JSON output behaviour of stuffing the start/size into the filename of a single frame,
// with the size being a decimal value.
size, err := strconv.ParseInt(frame.Data.Size, 0, 0)
if err != nil {
return nil, err
}
var stack []plugin.Frame
stack = append(stack, plugin.Frame{Func: frame.Data.Name, File: fmt.Sprintf("%s %d", frame.Data.Start, size)})
return stack, nil
}

fileline, err := d.rw.readLine()
// readCodeFrames parses the llvm-symbolizer CODE output for a single address. It
// returns one plugin.Frame per entry of the JSON "Symbol" array, or an error if the output cannot be read or parsed.
func (d *llvmSymbolizer) readCodeFrames() ([]plugin.Frame, error) {
line, err := d.rw.readLine()
if err != nil {
return plugin.Frame{Func: funcname}, true
}

linenumber := 0
columnnumber := 0
// The llvm-symbolizer outputs the <file_name>:<line_number>:<column_number>.
// When it cannot identify the source code location, it outputs "??:0:0".
// Older versions output just the filename and line number, so we check for
// both conditions here.
if fileline == "??:0" || fileline == "??:0:0" {
fileline = ""
} else {
switch split := strings.Split(fileline, ":"); len(split) {
case 3:
// filename:line:column
if col, err := strconv.Atoi(split[2]); err == nil {
columnnumber = col
}
fallthrough
case 2:
// filename:line
if line, err := strconv.Atoi(split[1]); err == nil {
linenumber = line
}
fallthrough
case 1:
// filename
fileline = split[0]
default:
// Unrecognized, ignore
}
}

return plugin.Frame{Func: funcname, File: fileline, Line: linenumber, Column: columnnumber}, false
return nil, err
}
var frame struct {
Address string `json:"Address"`
ModuleName string `json:"ModuleName"`
Symbol []struct {
Line int `json:"Line"`
Column int `json:"Column"`
FunctionName string `json:"FunctionName"`
FileName string `json:"FileName"`
} `json:"Symbol"`
}
if err := json.Unmarshal([]byte(line), &frame); err != nil {
return nil, err
}
var stack []plugin.Frame
for _, s := range frame.Symbol {
stack = append(stack, plugin.Frame{Func: s.FunctionName, File: s.FileName, Line: s.Line, Column: s.Column})
}
return stack, nil
}

// addrInfo returns the stack frame information for a specific program
Expand All @@ -170,18 +176,8 @@ func (d *llvmSymbolizer) addrInfo(addr uint64) ([]plugin.Frame, error) {
if err := d.rw.write(fmt.Sprintf("%s 0x%x", d.filename, addr-d.base)); err != nil {
return nil, err
}

var stack []plugin.Frame
for {
frame, end := d.readFrame()
if end {
break
}

if frame != (plugin.Frame{}) {
stack = append(stack, frame)
}
if d.isData {
return d.readDataFrames()
}

return stack, nil
return d.readCodeFrames()
}
16 changes: 6 additions & 10 deletions internal/binutils/testdata/fake-llvm-symbolizer
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,18 @@ IFS=" "
while read line; do
# line has form:
# filename 0xaddr
# Emit dummy output that matches llvm-symbolizer output format.
# Emit dummy output that matches llvm-symbolizer JSON output format.
set -- ${line}
kind=$1
fname=$2
addr=$3
case ${kind} in
CODE)
echo "Inlined_${addr}"
echo "${fname}.h"
echo "Func_${addr}"
echo "${fname}.c:2:1"
echo;;
echo "{\"Address\":\"${addr}\",\"ModuleName\":\"${fname}\",\"Symbol\":[{\"Column\":0,\"FileName\":\"${fname}.h\",\"FunctionName\":\"Inlined_${addr}\",\"Line\":0},{\"Column\":1,\"FileName\":\"${fname}.c\",\"FunctionName\":\"Func_${addr}\",\"Line\":2}]}"
;;
DATA)
echo "${fname}_${addr}"
echo "${addr} 8"
echo;;
*) echo ${kind} ${fname} ${addr};;
echo "{\"Address\":\"${addr}\",\"ModuleName\":\"${fname}\",\"Data\":{\"Name\":\"${fname}_${addr}\",\"Size\":\"0x8\",\"Start\":\"${addr}\"}}"
;;
*) exit 1;;
esac
done

0 comments on commit 813a5fb

Please sign in to comment.