...

Source file src/github.com/letsencrypt/boulder/test/integration/otel_test.go

Documentation: github.com/letsencrypt/boulder/test/integration

     1  //go:build integration
     2  
     3  package integration
     4  
     5  import (
     6  	"context"
     7  	"crypto/ecdsa"
     8  	"crypto/elliptic"
     9  	"crypto/rand"
    10  	"encoding/json"
    11  	"fmt"
    12  	"io"
    13  	"net/http"
    14  	"os"
    15  	"strings"
    16  	"testing"
    17  	"time"
    18  
    19  	"github.com/eggsampler/acme/v3"
    20  	"go.opentelemetry.io/otel"
    21  	"go.opentelemetry.io/otel/propagation"
    22  	sdktrace "go.opentelemetry.io/otel/sdk/trace"
    23  	"go.opentelemetry.io/otel/trace"
    24  
    25  	"github.com/letsencrypt/boulder/cmd"
    26  	blog "github.com/letsencrypt/boulder/log"
    27  	"github.com/letsencrypt/boulder/test"
    28  )
    29  
    30  // TraceResponse is the list of traces returned from Jaeger's trace search API
    31  // We always search for a single trace by ID, so this should be length 1.
    32  // This is a specialization of Jaeger's structuredResponse type which
    33  // uses []interface{} upstream.
    34  type TraceResponse struct {
    35  	Data []Trace
    36  }
    37  
    38  // Trace represents a single trace in Jaeger's API
    39  // See https://pkg.go.dev/github.com/jaegertracing/jaeger/model/json#Trace
    40  type Trace struct {
    41  	TraceID   string
    42  	Spans     []Span
    43  	Processes map[string]struct {
    44  		ServiceName string
    45  	}
    46  	Warnings []string
    47  }
    48  
    49  // Span represents a single span in Jaeger's API
    50  // See https://pkg.go.dev/github.com/jaegertracing/jaeger/model/json#Span
    51  type Span struct {
    52  	SpanID        string
    53  	OperationName string
    54  	Warnings      []string
    55  	ProcessID     string
    56  	References    []struct {
    57  		RefType string
    58  		TraceID string
    59  		SpanID  string
    60  	}
    61  }
    62  
    63  func getTraceFromJaeger(t *testing.T, traceID trace.TraceID) Trace {
    64  	t.Helper()
    65  	traceURL := "http://bjaeger:16686/api/traces/" + traceID.String()
    66  	resp, err := http.Get(traceURL)
    67  	test.AssertNotError(t, err, "failed to trace from jaeger: "+traceID.String())
    68  	if resp.StatusCode == http.StatusNotFound {
    69  		t.Fatalf("jaeger returned 404 for trace %s", traceID)
    70  	}
    71  	test.AssertEquals(t, resp.StatusCode, http.StatusOK)
    72  
    73  	body, err := io.ReadAll(resp.Body)
    74  	test.AssertNotError(t, err, "failed to read trace body")
    75  
    76  	var parsed TraceResponse
    77  	err = json.Unmarshal(body, &parsed)
    78  	test.AssertNotError(t, err, "failed to decode traces body")
    79  
    80  	if len(parsed.Data) != 1 {
    81  		t.Fatalf("expected to get exactly one trace from jaeger for %s: %v", traceID, parsed)
    82  	}
    83  
    84  	return parsed.Data[0]
    85  }
    86  
    87  type expectedSpans struct {
    88  	Operation string
    89  	Service   string
    90  	Children  []expectedSpans
    91  }
    92  
    93  // isParent returns true if the given span has a parent of ParentID
    94  // The empty string means no ParentID
    95  func isParent(parentID string, span Span) bool {
    96  	if len(span.References) == 0 {
    97  		return parentID == ""
    98  	}
    99  	for _, ref := range span.References {
   100  		// In OpenTelemetry, CHILD_OF is the only reference, but Jaeger supports other systems.
   101  		if ref.RefType == "CHILD_OF" {
   102  			return ref.SpanID == parentID
   103  		}
   104  	}
   105  	return false
   106  }
   107  
   108  func missingChildren(trace Trace, spanID string, children []expectedSpans) bool {
   109  	for _, child := range children {
   110  		if !findSpans(trace, spanID, child) {
   111  			// Missing Child
   112  			return true
   113  		}
   114  	}
   115  	return false
   116  }
   117  
   118  // findSpans checks if the expectedSpan and its expected children are found in trace
   119  func findSpans(trace Trace, parentSpan string, expectedSpan expectedSpans) bool {
   120  	for _, span := range trace.Spans {
   121  		if !isParent(parentSpan, span) {
   122  			continue
   123  		}
   124  		if trace.Processes[span.ProcessID].ServiceName != expectedSpan.Service {
   125  			continue
   126  		}
   127  		if span.OperationName != expectedSpan.Operation {
   128  			continue
   129  		}
   130  		if missingChildren(trace, span.SpanID, expectedSpan.Children) {
   131  			continue
   132  		}
   133  
   134  		// This span has the correct parent, service, operation, and children
   135  		return true
   136  	}
   137  	fmt.Printf("did not find span %s::%s with parent '%s'\n", expectedSpan.Service, expectedSpan.Operation, parentSpan)
   138  	return false
   139  }
   140  
   141  // ContextInjectingRoundTripper holds a context that is added to every request
   142  // sent through this RoundTripper, propagating the OpenTelemetry trace through
   143  // the requests made with it.
   144  //
   145  // This is useful for tracing HTTP clients which don't pass through a context,
   146  // notably including the eggsampler ACME client used in this test.
   147  //
   148  // This test uses a trace started in the test to connect all the outgoing
   149  // requests into a trace that is retrieved from Jaeger's API to make assertions
   150  // about the spans from Boulder.
   151  type ContextInjectingRoundTripper struct {
   152  	ctx context.Context
   153  }
   154  
   155  // RoundTrip implements http.RoundTripper, injecting c.ctx and the OpenTelemetry
   156  // propagation headers into the request. This ensures all requests are traced.
   157  func (c *ContextInjectingRoundTripper) RoundTrip(request *http.Request) (*http.Response, error) {
   158  	// RoundTrip is not permitted to modify the request, so we clone with this context
   159  	r := request.Clone(c.ctx)
   160  	// Inject the otel propagation headers
   161  	otel.GetTextMapPropagator().Inject(c.ctx, propagation.HeaderCarrier(r.Header))
   162  	return http.DefaultTransport.RoundTrip(r)
   163  }
   164  
   165  // rpcSpan is a helper for constructing an RPC span where we have both a client and server rpc operation
   166  func rpcSpan(op, client, server string, children ...expectedSpans) expectedSpans {
   167  	return expectedSpans{
   168  		Operation: op,
   169  		Service:   client,
   170  		Children: []expectedSpans{
   171  			{
   172  				Operation: op,
   173  				Service:   server,
   174  				Children:  children,
   175  			},
   176  		},
   177  	}
   178  }
   179  
   180  func httpSpan(endpoint string, children ...expectedSpans) expectedSpans {
   181  	return expectedSpans{
   182  		Operation: endpoint,
   183  		Service:   "boulder-wfe2",
   184  		Children: append(children,
   185  			rpcSpan("nonce.NonceService/Nonce", "boulder-wfe2", "nonce-service"),
   186  			rpcSpan("nonce.NonceService/Redeem", "boulder-wfe2", "nonce-service"),
   187  		),
   188  	}
   189  }
   190  
   191  // TestTraces tests that all the expected spans are present and properly connected
   192  func TestTraces(t *testing.T) {
   193  	t.Parallel()
   194  	if !strings.Contains(os.Getenv("BOULDER_CONFIG_DIR"), "test/config-next") {
   195  		t.Skip("OpenTelemetry is only configured in config-next")
   196  	}
   197  
   198  	traceID := traceIssuingTestCert(t)
   199  
   200  	wfe := "boulder-wfe2"
   201  	sa := "boulder-sa"
   202  	ra := "boulder-ra"
   203  	ca := "boulder-ca"
   204  
   205  	expectedSpans := expectedSpans{
   206  		Operation: "TraceTest",
   207  		Service:   "integration.test",
   208  		Children: []expectedSpans{
   209  			{Operation: "/directory", Service: wfe},
   210  			{Operation: "/acme/new-nonce", Service: wfe, Children: []expectedSpans{
   211  				rpcSpan("nonce.NonceService/Nonce", wfe, "nonce-service")}},
   212  			httpSpan("/acme/new-acct",
   213  				rpcSpan("sa.StorageAuthorityReadOnly/KeyBlocked", wfe, sa),
   214  				rpcSpan("sa.StorageAuthorityReadOnly/GetRegistrationByKey", wfe, sa),
   215  				rpcSpan("ra.RegistrationAuthority/NewRegistration", wfe, ra,
   216  					rpcSpan("sa.StorageAuthority/KeyBlocked", ra, sa),
   217  					rpcSpan("sa.StorageAuthority/CountRegistrationsByIP", ra, sa),
   218  					rpcSpan("sa.StorageAuthority/NewRegistration", ra, sa))),
   219  			httpSpan("/acme/new-order",
   220  				rpcSpan("sa.StorageAuthorityReadOnly/GetRegistration", wfe, sa),
   221  				rpcSpan("ra.RegistrationAuthority/NewOrder", wfe, ra,
   222  					rpcSpan("sa.StorageAuthority/GetOrderForNames", ra, sa),
   223  					// 8 ra -> sa rate limit spans omitted here
   224  					rpcSpan("sa.StorageAuthority/NewOrderAndAuthzs", ra, sa))),
   225  			httpSpan("/acme/authz-v3/",
   226  				rpcSpan("sa.StorageAuthorityReadOnly/GetAuthorization2", wfe, sa)),
   227  			httpSpan("/acme/chall-v3/",
   228  				rpcSpan("sa.StorageAuthorityReadOnly/GetAuthorization2", wfe, sa),
   229  				rpcSpan("ra.RegistrationAuthority/PerformValidation", wfe, ra,
   230  					rpcSpan("sa.StorageAuthority/GetRegistration", ra, sa))),
   231  			httpSpan("/acme/finalize/",
   232  				rpcSpan("sa.StorageAuthorityReadOnly/GetOrder", wfe, sa),
   233  				rpcSpan("ra.RegistrationAuthority/FinalizeOrder", wfe, ra,
   234  					rpcSpan("sa.StorageAuthority/KeyBlocked", ra, sa),
   235  					rpcSpan("sa.StorageAuthority/GetRegistration", ra, sa),
   236  					rpcSpan("sa.StorageAuthority/GetValidOrderAuthorizations2", ra, sa),
   237  					rpcSpan("sa.StorageAuthority/SetOrderProcessing", ra, sa),
   238  					rpcSpan("ca.CertificateAuthority/IssuePrecertificate", ra, ca),
   239  					rpcSpan("Publisher/SubmitToSingleCTWithResult", ra, "boulder-publisher"),
   240  					rpcSpan("ca.CertificateAuthority/IssueCertificateForPrecertificate", ra, ca),
   241  					rpcSpan("sa.StorageAuthority/FinalizeOrder", ra, sa))),
   242  			httpSpan("/acme/order/", rpcSpan("sa.StorageAuthorityReadOnly/GetOrder", wfe, sa)),
   243  			httpSpan("/acme/cert/", rpcSpan("sa.StorageAuthorityReadOnly/GetCertificate", wfe, sa)),
   244  		},
   245  	}
   246  
   247  	// Retry checking for spans. Span submission is batched asynchronously, so we
   248  	// may have to wait for the DefaultScheduleDelay (5 seconds) for results to
   249  	// be available. Rather than always waiting, we retry a few times.
   250  	// Empirically, this test passes on the second or third try.
   251  	var trace Trace
   252  	found := false
   253  	const retries = 10
   254  	for try := 0; try < retries; try++ {
   255  		trace := getTraceFromJaeger(t, traceID)
   256  		if findSpans(trace, "", expectedSpans) {
   257  			found = true
   258  			break
   259  		}
   260  		time.Sleep(sdktrace.DefaultScheduleDelay / 5 * time.Millisecond)
   261  	}
   262  	test.Assert(t, found, fmt.Sprintf("Failed to find expected spans in Jaeger for trace %s", traceID))
   263  
   264  	test.AssertEquals(t, len(trace.Warnings), 0)
   265  	for _, span := range trace.Spans {
   266  		for _, warning := range span.Warnings {
   267  			if strings.Contains(warning, "clock skew adjustment disabled; not applying calculated delta") {
   268  				continue
   269  			}
   270  			t.Errorf("Span %s (%s) warning: %v", span.SpanID, span.OperationName, warning)
   271  		}
   272  	}
   273  }
   274  
   275  func traceIssuingTestCert(t *testing.T) trace.TraceID {
   276  	domains := []string{random_domain()}
   277  
   278  	// Configure this integration test to trace to jaeger:4317 like Boulder will
   279  	shutdown := cmd.NewOpenTelemetry(cmd.OpenTelemetryConfig{
   280  		Endpoint:    "bjaeger:4317",
   281  		SampleRatio: 1,
   282  	}, blog.Get())
   283  	defer shutdown(context.Background())
   284  
   285  	tracer := otel.GetTracerProvider().Tracer("TraceTest")
   286  	ctx, span := tracer.Start(context.Background(), "TraceTest")
   287  	defer span.End()
   288  
   289  	// Provide an HTTP client with otel spans.
   290  	// The acme client doesn't pass contexts through, so we inject one.
   291  	option := acme.WithHTTPClient(&http.Client{
   292  		Timeout:   60 * time.Second,
   293  		Transport: &ContextInjectingRoundTripper{ctx},
   294  	})
   295  
   296  	c, err := acme.NewClient("http://boulder.service.consul:4001/directory", option)
   297  	test.AssertNotError(t, err, "acme.NewClient failed")
   298  
   299  	privKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
   300  	test.AssertNotError(t, err, "Generating ECDSA key failed")
   301  
   302  	account, err := c.NewAccount(privKey, false, true)
   303  	test.AssertNotError(t, err, "newAccount failed")
   304  
   305  	_, err = authAndIssue(&client{account, c}, nil, domains, true)
   306  	test.AssertNotError(t, err, "authAndIssue failed")
   307  
   308  	return span.SpanContext().TraceID()
   309  }
   310  

View as plain text