diff --git a/director/cache_ads.go b/director/cache_ads.go index 9b19cc686..2dea53d93 100644 --- a/director/cache_ads.go +++ b/director/cache_ads.go @@ -33,6 +33,7 @@ import ( log "github.com/sirupsen/logrus" "golang.org/x/sync/errgroup" + "github.com/pelicanplatform/pelican/metrics" "github.com/pelicanplatform/pelican/param" "github.com/pelicanplatform/pelican/server_structs" "github.com/pelicanplatform/pelican/utils" @@ -81,7 +82,11 @@ func (f filterType) String() string { // 4. Set up utilities for collecting origin/health server file transfer test status // 5. Return the updated ServerAd. The ServerAd passed in will not be modified func recordAd(ctx context.Context, sAd server_structs.ServerAd, namespaceAds *[]server_structs.NamespaceAdV2) (updatedAd server_structs.ServerAd) { - if err := updateLatLong(ctx, &sAd); err != nil { + if err := updateLatLong(&sAd); err != nil { + if geoIPError, ok := err.(geoIPError); ok { + labels := geoIPError.labels + metrics.PelicanDirectorGeoIPErrors.With(labels).Inc() + } log.Debugln("Failed to lookup GeoIP coordinates for host", sAd.URL.Host) } @@ -239,7 +244,7 @@ func recordAd(ctx context.Context, sAd server_structs.ServerAd, namespaceAds *[] return sAd } -func updateLatLong(ctx context.Context, ad *server_structs.ServerAd) error { +func updateLatLong(ad *server_structs.ServerAd) error { if ad == nil { return errors.New("Cannot provide a nil ad to UpdateLatLong") } @@ -257,7 +262,7 @@ func updateLatLong(ctx context.Context, ad *server_structs.ServerAd) error { } // NOTE: If GeoIP resolution of this address fails, lat/long are set to 0.0 (the null lat/long) // This causes the server to be sorted to the end of the list whenever the Director requires distance-aware sorting. - lat, long, err := getLatLong(ctx, addr) + lat, long, err := getLatLong(addr) if err != nil { return err } diff --git a/director/sort.go b/director/sort.go index 6a11e3361..cc7054cc1 100644 --- a/director/sort.go +++ b/director/sort.go @@ -21,6 +21,7 @@ package director import ( "cmp" "context" + "fmt" "math/rand" "net" "net/netip" @@ -59,8 +60,17 @@ type ( IP string `mapstructure:"IP"` Coordinate Coordinate `mapstructure:"Coordinate"` } + + geoIPError struct { + labels prometheus.Labels + errorMsg string + } ) +func (e geoIPError) Error() string { + return e.errorMsg +} + var ( invalidOverrideLogOnce = map[string]bool{} geoIPOverrides []GeoIPOverride @@ -148,7 +158,16 @@ func checkOverrides(addr net.IP) (coordinate *Coordinate) { return nil } -func getLatLong(ctx context.Context, addr netip.Addr) (lat float64, long float64, err error) { +func setProjectLabel(ctx context.Context, labels *prometheus.Labels) { + project, ok := ctx.Value(ProjectContextKey{}).(string) + if !ok || project == "" { + (*labels)["proj"] = "unknown" + } else { + (*labels)["proj"] = project + } +} + +func getLatLong(addr netip.Addr) (lat float64, long float64, err error) { ip := net.IP(addr.AsSlice()) override := checkOverrides(ip) if override != nil { @@ -159,7 +178,7 @@ func getLatLong(ctx context.Context, addr netip.Addr) (lat float64, long float64 labels := prometheus.Labels{ "network": "", "source": "", - "proj": "", + "proj": "", // this will be set in the setProjectLabel function } network, ok := utils.ApplyIPMask(addr.String()) @@ -170,26 +189,16 @@ func getLatLong(ctx context.Context, addr netip.Addr) (lat float64, long float64 labels["network"] = network } - project, ok := ctx.Value(ProjectContextKey{}).(string) - if !ok || project == "" { - log.Warningf("Failed to get project from context") - labels["proj"] = "unknown" - labels["source"] = "server" - } else { - labels["proj"] = project - } - reader := maxMindReader.Load() if reader == nil { - err = errors.New("No GeoIP database is available") labels["source"] = "server" - metrics.PelicanDirectorGeoIPErrors.With(labels).Inc() + err = geoIPError{labels: labels, errorMsg: "No GeoIP database is available"} return } record, err := reader.City(ip) if err != nil { labels["source"] = "server" - metrics.PelicanDirectorGeoIPErrors.With(labels).Inc() + err = geoIPError{labels: labels, errorMsg: err.Error()} return } lat = record.Location.Latitude @@ -199,9 +208,10 @@ func getLatLong(ctx context.Context, addr netip.Addr) (lat float64, long float64 // There's likely a problem with the GeoIP database or the IP address. Usually this just means the IP address // comes from a private range. if lat == 0 && long == 0 { - log.Warningf("GeoIP Resolution of the address %s resulted in the null lat/long. This will result in random server sorting.", ip.String()) + errMsg := fmt.Sprintf("GeoIP Resolution of the address %s resulted in the null lat/long. This will result in random server sorting.", ip.String()) + log.Warning(errMsg) labels["source"] = "client" - metrics.PelicanDirectorGeoIPErrors.With(labels).Inc() + err = geoIPError{labels: labels, errorMsg: errMsg} } // MaxMind provides an accuracy radius in kilometers. When it actually has no clue how to resolve a valid, public @@ -209,12 +219,13 @@ func getLatLong(ctx context.Context, addr netip.Addr) (lat float64, long float64 // should be very suspicious of the data, and mark it as appearing at the null lat/long (and provide a warning in // the Director), which also triggers random weighting in our sort algorithms. if record.Location.AccuracyRadius >= 900 { - log.Warningf("GeoIP resolution of the address %s resulted in a suspiciously large accuracy radius of %d km. "+ + errMsg := fmt.Sprintf("GeoIP resolution of the address %s resulted in a suspiciously large accuracy radius of %d km. "+ "This will be treated as GeoIP resolution failure and result in random server sorting. Setting lat/long to null.", ip.String(), record.Location.AccuracyRadius) + log.Warning(errMsg) lat = 0 long = 0 labels["source"] = "client" - metrics.PelicanDirectorGeoIPErrors.With(labels).Inc() + err = geoIPError{labels: labels, errorMsg: errMsg} } return @@ -229,8 +240,7 @@ func assignRandBoundedCoord(minLat, maxLat, minLong, maxLong float64) (lat, long // Given a client address, attempt to get the lat/long of the client. If the address is invalid or // the lat/long is not resolvable, assign a random location in the contiguous US. -func getClientLatLong(ctx context.Context, addr netip.Addr) (coord Coordinate) { - var err error +func getClientLatLong(addr netip.Addr) (coord Coordinate, err error) { if !addr.IsValid() { log.Warningf("Unable to sort servers based on client-server distance. Invalid client IP address: %s", addr.String()) coord.Lat, coord.Long = assignRandBoundedCoord(usLatMin, usLatMax, usLongMin, usLongMax) @@ -244,7 +254,7 @@ func getClientLatLong(ctx context.Context, addr netip.Addr) (coord Coordinate) { return } - coord.Lat, coord.Long, err = getLatLong(ctx, addr) + coord.Lat, coord.Long, err = getLatLong(addr) if err != nil || (coord.Lat == 0 && coord.Long == 0) { if err != nil { log.Warningf("Error while getting the client IP address: %v", err) @@ -286,7 +296,17 @@ func sortServerAds(ctx context.Context, clientAddr netip.Addr, ads []server_stru weights := make(SwapMaps, len(ads)) sortMethod := param.Director_CacheSortMethod.GetString() // This will handle the case where the client address is invalid or the lat/long is not resolvable. - clientCoord := getClientLatLong(ctx, clientAddr) + clientCoord, err := getClientLatLong(clientAddr) + if err != nil { + // If it is a geoIP error, then we get the labels and increment the error counter + // Otherwise we log the error and continue + if geoIPError, ok := err.(geoIPError); ok { + labels := geoIPError.labels + setProjectLabel(ctx, &labels) + metrics.PelicanDirectorGeoIPErrors.With(labels).Inc() + } + log.Warningf("Error while getting the client IP address: %v", err) + } // For each ad, we apply the configured sort method to determine a priority weight. for idx, ad := range ads { diff --git a/director/sort_test.go b/director/sort_test.go index ae1b64fbc..753ba752d 100644 --- a/director/sort_test.go +++ b/director/sort_test.go @@ -470,9 +470,7 @@ func TestGetClientLatLong(t *testing.T) { clientIp := netip.Addr{} assert.False(t, clientIpCache.Has(clientIp)) - ctx := context.Background() - ctx = context.WithValue(ctx, ProjectContextKey{}, "pelican-client/1.0.0 project/test") - coord1 := getClientLatLong(ctx, clientIp) + coord1, _ := getClientLatLong(clientIp) assert.True(t, coord1.Lat <= usLatMax && coord1.Lat >= usLatMin) assert.True(t, coord1.Long <= usLongMax && coord1.Long >= usLongMin) @@ -480,9 +478,7 @@ func TestGetClientLatLong(t *testing.T) { assert.NotContains(t, logOutput.String(), "Retrieving pre-assigned lat/long") // Get it again to make sure it's coming from the cache - ctx = context.Background() - ctx = context.WithValue(ctx, ProjectContextKey{}, "pelican-client/1.0.0 project/test") - coord2 := getClientLatLong(ctx, clientIp) + coord2, _ := getClientLatLong(clientIp) assert.Equal(t, coord1.Lat, coord2.Lat) assert.Equal(t, coord1.Long, coord2.Long) assert.Contains(t, logOutput.String(), "Retrieving pre-assigned lat/long for unresolved client IP") @@ -496,9 +492,7 @@ func TestGetClientLatLong(t *testing.T) { clientIp := netip.MustParseAddr("192.168.0.1") assert.False(t, clientIpCache.Has(clientIp)) - ctx := context.Background() - ctx = context.WithValue(ctx, ProjectContextKey{}, "pelican-client/1.0.0 project/test") - coord1 := getClientLatLong(ctx, clientIp) + coord1, _ := getClientLatLong(clientIp) assert.True(t, coord1.Lat <= usLatMax && coord1.Lat >= usLatMin) assert.True(t, coord1.Long <= usLongMax && coord1.Long >= usLongMin) @@ -506,9 +500,7 @@ func TestGetClientLatLong(t *testing.T) { assert.NotContains(t, logOutput.String(), "Retrieving pre-assigned lat/long") // Get it again to make sure it's coming from the cache - ctx = context.Background() - ctx = context.WithValue(ctx, ProjectContextKey{}, "pelican-client/1.0.0 project/test") - coord2 := getClientLatLong(ctx, clientIp) + coord2, _ := getClientLatLong(clientIp) assert.Equal(t, coord1.Lat, coord2.Lat) assert.Equal(t, coord1.Long, coord2.Long) assert.Contains(t, logOutput.String(), "Retrieving pre-assigned lat/long for client IP")