...

Source file src/github.com/letsencrypt/boulder/va/va.go

Documentation: github.com/letsencrypt/boulder/va

     1  package va
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"crypto/tls"
     7  	"encoding/json"
     8  	"errors"
     9  	"fmt"
    10  	"math/rand"
    11  	"net"
    12  	"net/url"
    13  	"os"
    14  	"regexp"
    15  	"strings"
    16  	"syscall"
    17  	"time"
    18  
    19  	"github.com/jmhodges/clock"
    20  	"github.com/letsencrypt/boulder/bdns"
    21  	"github.com/letsencrypt/boulder/canceled"
    22  	"github.com/letsencrypt/boulder/core"
    23  	berrors "github.com/letsencrypt/boulder/errors"
    24  	"github.com/letsencrypt/boulder/features"
    25  	bgrpc "github.com/letsencrypt/boulder/grpc"
    26  	"github.com/letsencrypt/boulder/identifier"
    27  	blog "github.com/letsencrypt/boulder/log"
    28  	"github.com/letsencrypt/boulder/metrics"
    29  	"github.com/letsencrypt/boulder/probs"
    30  	vapb "github.com/letsencrypt/boulder/va/proto"
    31  	"github.com/prometheus/client_golang/prometheus"
    32  )
    33  
    34  var (
    35  	// badTLSHeader contains the string 'HTTP /' which is returned when
    36  	// we try to talk TLS to a server that only talks HTTP
    37  	badTLSHeader = []byte{0x48, 0x54, 0x54, 0x50, 0x2f}
    38  	// h2SettingsFrameErrRegex is a regex against a net/http error indicating
    39  	// a malformed HTTP response that matches the initial SETTINGS frame of an
    40  	// HTTP/2 connection. This happens when a server configures HTTP/2 on port
    41  	// :80, failing HTTP-01 challenges.
    42  	//
    43  	// The regex first matches the error string prefix and then matches the raw
    44  	// bytes of an arbitrarily sized HTTP/2 SETTINGS frame:
    45  	//   0x00 0x00 0x?? 0x04 0x00 0x00 0x00 0x00
    46  	//
    47  	// The third byte is variable and indicates the frame size. Typically
    48  	// this will be 0x12.
    49  	// The 0x04 in the fourth byte indicates that the frame is SETTINGS type.
    50  	//
    51  	// See:
    52  	//   * https://tools.ietf.org/html/rfc7540#section-4.1
    53  	//   * https://tools.ietf.org/html/rfc7540#section-6.5
    54  	//
    55  	// NOTE(@cpu): Using a regex is a hack but unfortunately for this case
    56  	// http.Client.Do() will return a url.Error err that wraps
    57  	// a errors.ErrorString instance. There isn't much else to do with one of
    58  	// those except match the encoded byte string with a regex. :-X
    59  	//
    60  	// NOTE(@cpu): The first component of this regex is optional to avoid an
    61  	// integration test flake. In some (fairly rare) conditions the malformed
    62  	// response error will be returned simply as a http.badStringError without
    63  	// the broken transport prefix. Most of the time the error is returned with
    64  	// a transport connection error prefix.
    65  	h2SettingsFrameErrRegex = regexp.MustCompile(`(?:net\/http\: HTTP\/1\.x transport connection broken: )?malformed HTTP response \"\\x00\\x00\\x[a-f0-9]{2}\\x04\\x00\\x00\\x00\\x00\\x00.*"`)
    66  )
    67  
    68  // RemoteVA wraps the vapb.VAClient interface and adds a field containing the
    69  // address of the remote gRPC server since the underlying gRPC client doesn't
    70  // provide a way to extract this metadata which is useful for debugging gRPC
    71  // connection issues.
    72  type RemoteVA struct {
    73  	vapb.VAClient
    74  	Address string
    75  }
    76  
    77  type vaMetrics struct {
    78  	validationTime                      *prometheus.HistogramVec
    79  	localValidationTime                 *prometheus.HistogramVec
    80  	remoteValidationTime                *prometheus.HistogramVec
    81  	remoteValidationFailures            prometheus.Counter
    82  	prospectiveRemoteValidationFailures prometheus.Counter
    83  	tlsALPNOIDCounter                   *prometheus.CounterVec
    84  	http01Fallbacks                     prometheus.Counter
    85  	http01Redirects                     prometheus.Counter
    86  	caaCounter                          *prometheus.CounterVec
    87  	ipv4FallbackCounter                 prometheus.Counter
    88  }
    89  
    90  func initMetrics(stats prometheus.Registerer) *vaMetrics {
    91  	validationTime := prometheus.NewHistogramVec(
    92  		prometheus.HistogramOpts{
    93  			Name:    "validation_time",
    94  			Help:    "Total time taken to validate a challenge and aggregate results",
    95  			Buckets: metrics.InternetFacingBuckets,
    96  		},
    97  		[]string{"type", "result", "problem_type"})
    98  	stats.MustRegister(validationTime)
    99  	localValidationTime := prometheus.NewHistogramVec(
   100  		prometheus.HistogramOpts{
   101  			Name:    "local_validation_time",
   102  			Help:    "Time taken to locally validate a challenge",
   103  			Buckets: metrics.InternetFacingBuckets,
   104  		},
   105  		[]string{"type", "result"})
   106  	stats.MustRegister(localValidationTime)
   107  	remoteValidationTime := prometheus.NewHistogramVec(
   108  		prometheus.HistogramOpts{
   109  			Name:    "remote_validation_time",
   110  			Help:    "Time taken to remotely validate a challenge",
   111  			Buckets: metrics.InternetFacingBuckets,
   112  		},
   113  		[]string{"type", "result"})
   114  	stats.MustRegister(remoteValidationTime)
   115  	remoteValidationFailures := prometheus.NewCounter(
   116  		prometheus.CounterOpts{
   117  			Name: "remote_validation_failures",
   118  			Help: "Number of validations failed due to remote VAs returning failure when consensus is enforced",
   119  		})
   120  	stats.MustRegister(remoteValidationFailures)
   121  	prospectiveRemoteValidationFailures := prometheus.NewCounter(
   122  		prometheus.CounterOpts{
   123  			Name: "prospective_remote_validation_failures",
   124  			Help: "Number of validations that would have failed due to remote VAs returning failure if consesus were enforced",
   125  		})
   126  	stats.MustRegister(prospectiveRemoteValidationFailures)
   127  	tlsALPNOIDCounter := prometheus.NewCounterVec(
   128  		prometheus.CounterOpts{
   129  			Name: "tls_alpn_oid_usage",
   130  			Help: "Number of TLS ALPN validations using either of the two OIDs",
   131  		},
   132  		[]string{"oid"},
   133  	)
   134  	stats.MustRegister(tlsALPNOIDCounter)
   135  	http01Fallbacks := prometheus.NewCounter(
   136  		prometheus.CounterOpts{
   137  			Name: "http01_fallbacks",
   138  			Help: "Number of IPv6 to IPv4 HTTP-01 fallback requests made",
   139  		})
   140  	stats.MustRegister(http01Fallbacks)
   141  	http01Redirects := prometheus.NewCounter(
   142  		prometheus.CounterOpts{
   143  			Name: "http01_redirects",
   144  			Help: "Number of HTTP-01 redirects followed",
   145  		})
   146  	stats.MustRegister(http01Redirects)
   147  	caaCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
   148  		Name: "caa_sets_processed",
   149  		Help: "A counter of CAA sets processed labelled by result",
   150  	}, []string{"result"})
   151  	stats.MustRegister(caaCounter)
   152  	ipv4FallbackCounter := prometheus.NewCounter(prometheus.CounterOpts{
   153  		Name: "tls_alpn_ipv4_fallback",
   154  		Help: "A counter of IPv4 fallbacks during TLS ALPN validation",
   155  	})
   156  	stats.MustRegister(ipv4FallbackCounter)
   157  
   158  	return &vaMetrics{
   159  		validationTime:                      validationTime,
   160  		remoteValidationTime:                remoteValidationTime,
   161  		localValidationTime:                 localValidationTime,
   162  		remoteValidationFailures:            remoteValidationFailures,
   163  		prospectiveRemoteValidationFailures: prospectiveRemoteValidationFailures,
   164  		tlsALPNOIDCounter:                   tlsALPNOIDCounter,
   165  		http01Fallbacks:                     http01Fallbacks,
   166  		http01Redirects:                     http01Redirects,
   167  		caaCounter:                          caaCounter,
   168  		ipv4FallbackCounter:                 ipv4FallbackCounter,
   169  	}
   170  }
   171  
   172  // PortConfig specifies what ports the VA should call to on the remote
   173  // host when performing its checks.
   174  type portConfig struct {
   175  	HTTPPort  int
   176  	HTTPSPort int
   177  	TLSPort   int
   178  }
   179  
   180  // newDefaultPortConfig is a constructor which returns a portConfig with default
   181  // settings.
   182  //
   183  // CABF BRs section 1.6.1: Authorized Ports: One of the following ports: 80
   184  // (http), 443 (https), 25 (smtp), 22 (ssh).
   185  //
   186  // RFC 8555 section 8.3: Dereference the URL using an HTTP GET request. This
   187  // request MUST be sent to TCP port 80 on the HTTP server.
   188  //
   189  // RFC 8737 section 3: The ACME server initiates a TLS connection to the chosen
   190  // IP address. This connection MUST use TCP port 443.
   191  func newDefaultPortConfig() *portConfig {
   192  	return &portConfig{
   193  		HTTPPort:  80,
   194  		HTTPSPort: 443,
   195  		TLSPort:   443,
   196  	}
   197  }
   198  
   199  // ValidationAuthorityImpl represents a VA
   200  type ValidationAuthorityImpl struct {
   201  	vapb.UnimplementedVAServer
   202  	vapb.UnimplementedCAAServer
   203  	log                blog.Logger
   204  	dnsClient          bdns.Client
   205  	issuerDomain       string
   206  	httpPort           int
   207  	httpsPort          int
   208  	tlsPort            int
   209  	userAgent          string
   210  	clk                clock.Clock
   211  	remoteVAs          []RemoteVA
   212  	maxRemoteFailures  int
   213  	accountURIPrefixes []string
   214  	singleDialTimeout  time.Duration
   215  
   216  	metrics *vaMetrics
   217  }
   218  
   219  // NewValidationAuthorityImpl constructs a new VA
   220  func NewValidationAuthorityImpl(
   221  	resolver bdns.Client,
   222  	remoteVAs []RemoteVA,
   223  	maxRemoteFailures int,
   224  	userAgent string,
   225  	issuerDomain string,
   226  	stats prometheus.Registerer,
   227  	clk clock.Clock,
   228  	logger blog.Logger,
   229  	accountURIPrefixes []string,
   230  ) (*ValidationAuthorityImpl, error) {
   231  
   232  	if len(accountURIPrefixes) == 0 {
   233  		return nil, errors.New("no account URI prefixes configured")
   234  	}
   235  
   236  	pc := newDefaultPortConfig()
   237  
   238  	va := &ValidationAuthorityImpl{
   239  		log:                logger,
   240  		dnsClient:          resolver,
   241  		issuerDomain:       issuerDomain,
   242  		httpPort:           pc.HTTPPort,
   243  		httpsPort:          pc.HTTPSPort,
   244  		tlsPort:            pc.TLSPort,
   245  		userAgent:          userAgent,
   246  		clk:                clk,
   247  		metrics:            initMetrics(stats),
   248  		remoteVAs:          remoteVAs,
   249  		maxRemoteFailures:  maxRemoteFailures,
   250  		accountURIPrefixes: accountURIPrefixes,
   251  		// singleDialTimeout specifies how long an individual `DialContext` operation may take
   252  		// before timing out. This timeout ignores the base RPC timeout and is strictly
   253  		// used for the DialContext operations that take place during an
   254  		// HTTP-01 challenge validation.
   255  		singleDialTimeout: 10 * time.Second,
   256  	}
   257  
   258  	return va, nil
   259  }
   260  
   261  // Used for audit logging
   262  type verificationRequestEvent struct {
   263  	ID                string         `json:",omitempty"`
   264  	Requester         int64          `json:",omitempty"`
   265  	Hostname          string         `json:",omitempty"`
   266  	Challenge         core.Challenge `json:",omitempty"`
   267  	ValidationLatency float64
   268  	Error             string `json:",omitempty"`
   269  }
   270  
   271  // ipError is an error type used to pass though the IP address of the remote
   272  // host when an error occurs during HTTP-01 and TLS-ALPN domain validation.
   273  type ipError struct {
   274  	ip  net.IP
   275  	err error
   276  }
   277  
   278  // Unwrap returns the underlying error.
   279  func (i ipError) Unwrap() error {
   280  	return i.err
   281  }
   282  
   283  // Error returns a string representation of the error.
   284  func (i ipError) Error() string {
   285  	return fmt.Sprintf("%s: %s", i.ip, i.err)
   286  }
   287  
   288  // detailedError returns a ProblemDetails corresponding to an error
   289  // that occurred during HTTP-01 or TLS-ALPN domain validation. Specifically it
   290  // tries to unwrap known Go error types and present something a little more
   291  // meaningful. It additionally handles `berrors.ConnectionFailure` errors by
   292  // passing through the detailed message.
   293  func detailedError(err error) *probs.ProblemDetails {
   294  	var ipErr ipError
   295  	if errors.As(err, &ipErr) {
   296  		detailedErr := detailedError(ipErr.err)
   297  		if ipErr.ip == nil {
   298  			// This should never happen.
   299  			return detailedErr
   300  		}
   301  		// Prefix the error message with the IP address of the remote host.
   302  		detailedErr.Detail = fmt.Sprintf("%s: %s", ipErr.ip, detailedErr.Detail)
   303  		return detailedErr
   304  	}
   305  	// net/http wraps net.OpError in a url.Error. Unwrap them.
   306  	var urlErr *url.Error
   307  	if errors.As(err, &urlErr) {
   308  		prob := detailedError(urlErr.Err)
   309  		prob.Detail = fmt.Sprintf("Fetching %s: %s", urlErr.URL, prob.Detail)
   310  		return prob
   311  	}
   312  
   313  	var tlsErr tls.RecordHeaderError
   314  	if errors.As(err, &tlsErr) && bytes.Equal(tlsErr.RecordHeader[:], badTLSHeader) {
   315  		return probs.Malformed("Server only speaks HTTP, not TLS")
   316  	}
   317  
   318  	var netOpErr *net.OpError
   319  	if errors.As(err, &netOpErr) {
   320  		if fmt.Sprintf("%T", netOpErr.Err) == "tls.alert" {
   321  			// All the tls.alert error strings are reasonable to hand back to a
   322  			// user. Confirmed against Go 1.8.
   323  			return probs.TLS(netOpErr.Error())
   324  		} else if netOpErr.Timeout() && netOpErr.Op == "dial" {
   325  			return probs.Connection("Timeout during connect (likely firewall problem)")
   326  		} else if netOpErr.Timeout() {
   327  			return probs.Connection(fmt.Sprintf("Timeout during %s (your server may be slow or overloaded)", netOpErr.Op))
   328  		}
   329  	}
   330  	var syscallErr *os.SyscallError
   331  	if errors.As(err, &syscallErr) {
   332  		switch syscallErr.Err {
   333  		case syscall.ECONNREFUSED:
   334  			return probs.Connection("Connection refused")
   335  		case syscall.ENETUNREACH:
   336  			return probs.Connection("Network unreachable")
   337  		case syscall.ECONNRESET:
   338  			return probs.Connection("Connection reset by peer")
   339  		}
   340  	}
   341  	var netErr net.Error
   342  	if errors.As(err, &netErr) && netErr.Timeout() {
   343  		return probs.Connection("Timeout after connect (your server may be slow or overloaded)")
   344  	}
   345  	if errors.Is(err, berrors.ConnectionFailure) {
   346  		return probs.Connection(err.Error())
   347  	}
   348  	if errors.Is(err, berrors.Unauthorized) {
   349  		return probs.Unauthorized(err.Error())
   350  	}
   351  	if errors.Is(err, berrors.DNS) {
   352  		return probs.DNS(err.Error())
   353  	}
   354  
   355  	if h2SettingsFrameErrRegex.MatchString(err.Error()) {
   356  		return probs.Connection("Server is speaking HTTP/2 over HTTP")
   357  	}
   358  	return probs.Connection("Error getting validation data")
   359  }
   360  
   361  // validate performs a challenge validation and, in parallel,
   362  // checks CAA and GSB for the identifier. If any of those steps fails, it
   363  // returns a ProblemDetails plus the validation records created during the
   364  // validation attempt.
   365  func (va *ValidationAuthorityImpl) validate(
   366  	ctx context.Context,
   367  	identifier identifier.ACMEIdentifier,
   368  	regid int64,
   369  	challenge core.Challenge,
   370  ) ([]core.ValidationRecord, *probs.ProblemDetails) {
   371  
   372  	// If the identifier is a wildcard domain we need to validate the base
   373  	// domain by removing the "*." wildcard prefix. We create a separate
   374  	// `baseIdentifier` here before starting the `va.checkCAA` goroutine with the
   375  	// `identifier` to avoid a data race.
   376  	baseIdentifier := identifier
   377  	if strings.HasPrefix(identifier.Value, "*.") {
   378  		baseIdentifier.Value = strings.TrimPrefix(identifier.Value, "*.")
   379  	}
   380  
   381  	// Create this channel outside of the feature-conditional block so that it is
   382  	// also in scope for the matching block below.
   383  	ch := make(chan *probs.ProblemDetails, 1)
   384  	if !features.Enabled(features.CAAAfterValidation) {
   385  		// va.checkCAA accepts wildcard identifiers and handles them appropriately so
   386  		// we can dispatch `checkCAA` with the provided `identifier` instead of
   387  		// `baseIdentifier`
   388  		go func() {
   389  			params := &caaParams{
   390  				accountURIID:     regid,
   391  				validationMethod: challenge.Type,
   392  			}
   393  			ch <- va.checkCAA(ctx, identifier, params)
   394  		}()
   395  	}
   396  
   397  	// TODO(#1292): send into another goroutine
   398  	validationRecords, prob := va.validateChallenge(ctx, baseIdentifier, challenge)
   399  	if prob != nil {
   400  		// The ProblemDetails will be serialized through gRPC, which requires UTF-8.
   401  		// It will also later be serialized in JSON, which defaults to UTF-8. Make
   402  		// sure it is UTF-8 clean now.
   403  		prob = filterProblemDetails(prob)
   404  
   405  		return validationRecords, prob
   406  	}
   407  
   408  	if !features.Enabled(features.CAAAfterValidation) {
   409  		for i := 0; i < cap(ch); i++ {
   410  			if extraProblem := <-ch; extraProblem != nil {
   411  				return validationRecords, extraProblem
   412  			}
   413  		}
   414  	} else {
   415  		params := &caaParams{
   416  			accountURIID:     regid,
   417  			validationMethod: challenge.Type,
   418  		}
   419  		prob := va.checkCAA(ctx, identifier, params)
   420  		if prob != nil {
   421  			return validationRecords, prob
   422  		}
   423  	}
   424  
   425  	return validationRecords, nil
   426  }
   427  
   428  func (va *ValidationAuthorityImpl) validateChallenge(ctx context.Context, identifier identifier.ACMEIdentifier, challenge core.Challenge) ([]core.ValidationRecord, *probs.ProblemDetails) {
   429  	err := challenge.CheckConsistencyForValidation()
   430  	if err != nil {
   431  		return nil, probs.Malformed("Challenge failed consistency check: %s", err)
   432  	}
   433  	switch challenge.Type {
   434  	case core.ChallengeTypeHTTP01:
   435  		return va.validateHTTP01(ctx, identifier, challenge)
   436  	case core.ChallengeTypeDNS01:
   437  		return va.validateDNS01(ctx, identifier, challenge)
   438  	case core.ChallengeTypeTLSALPN01:
   439  		return va.validateTLSALPN01(ctx, identifier, challenge)
   440  	}
   441  	return nil, probs.Malformed("invalid challenge type %s", challenge.Type)
   442  }
   443  
   444  // performRemoteValidation calls `PerformValidation` for each of the configured
   445  // remoteVAs in a random order. The provided `results` chan should have an equal
   446  // size to the number of remote VAs. The validations will be performed in
   447  // separate go-routines. If the result `error` from a remote
   448  // `PerformValidation` RPC is nil or a nil `ProblemDetails` instance it is
   449  // written directly to the `results` chan. If the err is a cancelled error it is
   450  // treated as a nil error. Otherwise the error/problem is written to the results
   451  // channel as-is.
   452  func (va *ValidationAuthorityImpl) performRemoteValidation(
   453  	ctx context.Context,
   454  	req *vapb.PerformValidationRequest,
   455  	results chan *remoteValidationResult) {
   456  	for _, i := range rand.Perm(len(va.remoteVAs)) {
   457  		remoteVA := va.remoteVAs[i]
   458  		go func(rva RemoteVA) {
   459  			result := &remoteValidationResult{
   460  				VAHostname: rva.Address,
   461  			}
   462  			res, err := rva.PerformValidation(ctx, req)
   463  			if err != nil && canceled.Is(err) {
   464  				// If the non-nil err was a canceled error, ignore it. That's fine: it
   465  				// just means we cancelled the remote VA request before it was
   466  				// finished because we didn't care about its result. Don't log to avoid
   467  				// spamming the logs.
   468  				result.Problem = probs.ServerInternal("Remote PerformValidation RPC canceled")
   469  			} else if err != nil {
   470  				// This is a real error, not just a problem with the validation.
   471  				va.log.Errf("Remote VA %q.PerformValidation failed: %s", rva.Address, err)
   472  				result.Problem = probs.ServerInternal("Remote PerformValidation RPC failed")
   473  			} else if res.Problems != nil {
   474  				prob, err := bgrpc.PBToProblemDetails(res.Problems)
   475  				if err != nil {
   476  					va.log.Infof("Remote VA %q.PerformValidation returned malformed problem: %s", rva.Address, err)
   477  					result.Problem = probs.ServerInternal(
   478  						fmt.Sprintf("Remote PerformValidation RPC returned malformed result: %s", err))
   479  				} else {
   480  					va.log.Infof("Remote VA %q.PerformValidation returned problem: %s", rva.Address, prob)
   481  					result.Problem = prob
   482  				}
   483  			}
   484  			results <- result
   485  		}(remoteVA)
   486  	}
   487  }
   488  
   489  // processRemoteResults evaluates a primary VA result, and a channel of remote
   490  // VA problems to produce a single overall validation result based on configured
   491  // feature flags. The overall result is calculated based on the VA's configured
   492  // `maxRemoteFailures` value.
   493  //
   494  // If the `MultiVAFullResults` feature is enabled then `processRemoteResults`
   495  // will expect to read a result from the `remoteErrors` channel for each VA and
   496  // will not produce an overall result until all remote VAs have responded. In
   497  // this case `logRemoteFailureDifferentials` will also be called to describe the
   498  // differential between the primary and all of the remote VAs.
   499  //
   500  // If the `MultiVAFullResults` feature flag is not enabled then
   501  // `processRemoteResults` will potentially return before all remote VAs have had
   502  // a chance to respond. This happens if the success or failure threshold is met.
   503  // This doesn't allow for logging the differential between the primary and
   504  // remote VAs but is more performant.
   505  func (va *ValidationAuthorityImpl) processRemoteResults(
   506  	domain string,
   507  	acctID int64,
   508  	challengeType string,
   509  	primaryResult *probs.ProblemDetails,
   510  	remoteResultsChan chan *remoteValidationResult,
   511  	numRemoteVAs int) *probs.ProblemDetails {
   512  
   513  	state := "failure"
   514  	start := va.clk.Now()
   515  
   516  	defer func() {
   517  		va.metrics.remoteValidationTime.With(prometheus.Labels{
   518  			"type":   challengeType,
   519  			"result": state,
   520  		}).Observe(va.clk.Since(start).Seconds())
   521  	}()
   522  
   523  	required := numRemoteVAs - va.maxRemoteFailures
   524  	good := 0
   525  	bad := 0
   526  
   527  	var remoteResults []*remoteValidationResult
   528  	var firstProb *probs.ProblemDetails
   529  	// Due to channel behavior this could block indefinitely and we rely on gRPC
   530  	// honoring the context deadline used in client calls to prevent that from
   531  	// happening.
   532  	for result := range remoteResultsChan {
   533  		// Add the result to the slice
   534  		remoteResults = append(remoteResults, result)
   535  		if result.Problem == nil {
   536  			good++
   537  		} else {
   538  			bad++
   539  		}
   540  
   541  		// Store the first non-nil problem to return later (if `MultiVAFullResults`
   542  		// is enabled).
   543  		if firstProb == nil && result.Problem != nil {
   544  			firstProb = result.Problem
   545  		}
   546  
   547  		// If MultiVAFullResults isn't enabled then return early whenever the
   548  		// success or failure threshold is met.
   549  		if !features.Enabled(features.MultiVAFullResults) {
   550  			if good >= required {
   551  				state = "success"
   552  				return nil
   553  			} else if bad > va.maxRemoteFailures {
   554  				modifiedProblem := *result.Problem
   555  				modifiedProblem.Detail = "During secondary validation: " + firstProb.Detail
   556  				return &modifiedProblem
   557  			}
   558  		}
   559  
   560  		// If we haven't returned early because of MultiVAFullResults being enabled
   561  		// we need to break the loop once all of the VAs have returned a result.
   562  		if len(remoteResults) == numRemoteVAs {
   563  			break
   564  		}
   565  	}
   566  
   567  	// If we are using `features.MultiVAFullResults` then we haven't returned
   568  	// early and can now log the differential between what the primary VA saw and
   569  	// what all of the remote VAs saw.
   570  	va.logRemoteValidationDifferentials(
   571  		domain,
   572  		acctID,
   573  		challengeType,
   574  		primaryResult,
   575  		remoteResults)
   576  
   577  	// Based on the threshold of good/bad return nil or a problem.
   578  	if good >= required {
   579  		state = "success"
   580  		return nil
   581  	} else if bad > va.maxRemoteFailures {
   582  		modifiedProblem := *firstProb
   583  		modifiedProblem.Detail = "During secondary validation: " + firstProb.Detail
   584  		return &modifiedProblem
   585  	}
   586  
   587  	// This condition should not occur - it indicates the good/bad counts didn't
   588  	// meet either the required threshold or the maxRemoteFailures threshold.
   589  	return probs.ServerInternal("Too few remote PerformValidation RPC results")
   590  }
   591  
   592  // logRemoteValidationDifferentials is called by `processRemoteResults` when the
   593  // `MultiVAFullResults` feature flag is enabled. It produces a JSON log line
   594  // that contains the primary VA result and the results each remote VA returned.
   595  func (va *ValidationAuthorityImpl) logRemoteValidationDifferentials(
   596  	domain string,
   597  	acctID int64,
   598  	challengeType string,
   599  	primaryResult *probs.ProblemDetails,
   600  	remoteResults []*remoteValidationResult) {
   601  
   602  	var successes []*remoteValidationResult
   603  	var failures []*remoteValidationResult
   604  
   605  	allEqual := true
   606  	for _, result := range remoteResults {
   607  		if result.Problem != primaryResult {
   608  			allEqual = false
   609  		}
   610  		if result.Problem == nil {
   611  			successes = append(successes, result)
   612  		} else {
   613  			failures = append(failures, result)
   614  		}
   615  	}
   616  	if allEqual {
   617  		// There's no point logging a differential line if the primary VA and remote
   618  		// VAs all agree.
   619  		return
   620  	}
   621  
   622  	// If the primary result was OK and there were more failures than the allowed
   623  	// threshold increment a stat that indicates this overall validation will have
   624  	// failed if features.EnforceMultiVA is enabled.
   625  	if primaryResult == nil && len(failures) > va.maxRemoteFailures {
   626  		va.metrics.prospectiveRemoteValidationFailures.Inc()
   627  	}
   628  
   629  	logOb := struct {
   630  		Domain          string
   631  		AccountID       int64
   632  		ChallengeType   string
   633  		PrimaryResult   *probs.ProblemDetails
   634  		RemoteSuccesses int
   635  		RemoteFailures  []*remoteValidationResult
   636  	}{
   637  		Domain:          domain,
   638  		AccountID:       acctID,
   639  		ChallengeType:   challengeType,
   640  		PrimaryResult:   primaryResult,
   641  		RemoteSuccesses: len(successes),
   642  		RemoteFailures:  failures,
   643  	}
   644  
   645  	logJSON, err := json.Marshal(logOb)
   646  	if err != nil {
   647  		// log a warning - a marshaling failure isn't expected given the data and
   648  		// isn't critical enough to break validation for by returning an error to
   649  		// the caller.
   650  		va.log.Warningf("Could not marshal log object in "+
   651  			"logRemoteValidationDifferentials: %s", err)
   652  		return
   653  	}
   654  
   655  	va.log.Infof("remoteVADifferentials JSON=%s", string(logJSON))
   656  }
   657  
   658  // remoteValidationResult is a struct that combines a problem details instance
   659  // (that may be nil) with the remote VA hostname that produced it.
   660  type remoteValidationResult struct {
   661  	VAHostname string
   662  	Problem    *probs.ProblemDetails
   663  }
   664  
   665  // PerformValidation validates the challenge for the domain in the request.
   666  // The returned result will always contain a list of validation records, even
   667  // when it also contains a problem.
   668  func (va *ValidationAuthorityImpl) PerformValidation(ctx context.Context, req *vapb.PerformValidationRequest) (*vapb.ValidationResult, error) {
   669  	if core.IsAnyNilOrZero(req, req.Domain, req.Challenge, req.Authz) {
   670  		return nil, berrors.InternalServerError("Incomplete validation request")
   671  	}
   672  	logEvent := verificationRequestEvent{
   673  		ID:        req.Authz.Id,
   674  		Requester: req.Authz.RegID,
   675  		Hostname:  req.Domain,
   676  	}
   677  	vStart := va.clk.Now()
   678  
   679  	var remoteResults chan *remoteValidationResult
   680  	if remoteVACount := len(va.remoteVAs); remoteVACount > 0 {
   681  		remoteResults = make(chan *remoteValidationResult, remoteVACount)
   682  		go va.performRemoteValidation(ctx, req, remoteResults)
   683  	}
   684  
   685  	challenge, err := bgrpc.PBToChallenge(req.Challenge)
   686  	if err != nil {
   687  		return nil, probs.ServerInternal("Challenge failed to deserialize")
   688  	}
   689  
   690  	records, prob := va.validate(ctx, identifier.DNSIdentifier(req.Domain), req.Authz.RegID, challenge)
   691  	challenge.ValidationRecord = records
   692  	localValidationLatency := time.Since(vStart)
   693  
   694  	// Check for malformed ValidationRecords
   695  	if !challenge.RecordsSane() && prob == nil {
   696  		prob = probs.ServerInternal("Records for validation failed sanity check")
   697  	}
   698  
   699  	var problemType string
   700  	if prob != nil {
   701  		problemType = string(prob.Type)
   702  		challenge.Status = core.StatusInvalid
   703  		challenge.Error = prob
   704  		logEvent.Error = prob.Error()
   705  	} else if remoteResults != nil {
   706  		if !features.Enabled(features.EnforceMultiVA) && features.Enabled(features.MultiVAFullResults) {
   707  			// If we're not going to enforce multi VA but we are logging the
   708  			// differentials then collect and log the remote results in a separate go
   709  			// routine to avoid blocking the primary VA.
   710  			go func() {
   711  				_ = va.processRemoteResults(
   712  					req.Domain,
   713  					req.Authz.RegID,
   714  					string(challenge.Type),
   715  					prob,
   716  					remoteResults,
   717  					len(va.remoteVAs))
   718  			}()
   719  			// Since prob was nil and we're not enforcing the results from
   720  			// `processRemoteResults` set the challenge status to valid so the
   721  			// validationTime metrics increment has the correct result label.
   722  			challenge.Status = core.StatusValid
   723  		} else if features.Enabled(features.EnforceMultiVA) {
   724  			remoteProb := va.processRemoteResults(
   725  				req.Domain,
   726  				req.Authz.RegID,
   727  				string(challenge.Type),
   728  				prob,
   729  				remoteResults,
   730  				len(va.remoteVAs))
   731  
   732  			// If the remote result was a non-nil problem then fail the validation
   733  			if remoteProb != nil {
   734  				prob = remoteProb
   735  				challenge.Status = core.StatusInvalid
   736  				challenge.Error = remoteProb
   737  				logEvent.Error = remoteProb.Error()
   738  				va.log.Infof("Validation failed due to remote failures: identifier=%v err=%s",
   739  					req.Domain, remoteProb)
   740  				va.metrics.remoteValidationFailures.Inc()
   741  			} else {
   742  				challenge.Status = core.StatusValid
   743  			}
   744  		}
   745  	} else {
   746  		challenge.Status = core.StatusValid
   747  	}
   748  
   749  	logEvent.Challenge = challenge
   750  
   751  	validationLatency := time.Since(vStart)
   752  	logEvent.ValidationLatency = validationLatency.Round(time.Millisecond).Seconds()
   753  
   754  	va.metrics.localValidationTime.With(prometheus.Labels{
   755  		"type":   string(challenge.Type),
   756  		"result": string(challenge.Status),
   757  	}).Observe(localValidationLatency.Seconds())
   758  	va.metrics.validationTime.With(prometheus.Labels{
   759  		"type":         string(challenge.Type),
   760  		"result":       string(challenge.Status),
   761  		"problem_type": problemType,
   762  	}).Observe(validationLatency.Seconds())
   763  
   764  	va.log.AuditObject("Validation result", logEvent)
   765  
   766  	return bgrpc.ValidationResultToPB(records, prob)
   767  }
   768  

View as plain text