Skip to content
11 changes: 8 additions & 3 deletions director/cache_ads.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
log "github.com/sirupsen/logrus"
"golang.org/x/sync/errgroup"

"github.com/pelicanplatform/pelican/metrics"
"github.com/pelicanplatform/pelican/param"
"github.com/pelicanplatform/pelican/server_structs"
"github.com/pelicanplatform/pelican/utils"
Expand Down Expand Up @@ -81,7 +82,11 @@ func (f filterType) String() string {
// 4. Set up utilities for collecting origin/health server file transfer test status
// 5. Return the updated ServerAd. The ServerAd passed in will not be modified
func recordAd(ctx context.Context, sAd server_structs.ServerAd, namespaceAds *[]server_structs.NamespaceAdV2) (updatedAd server_structs.ServerAd) {
if err := updateLatLong(ctx, &sAd); err != nil {
if err := updateLatLong(&sAd); err != nil {
if geoIPError, ok := err.(geoIPError); ok {
labels := geoIPError.labels
metrics.PelicanDirectorGeoIPErrors.With(labels).Inc()
}
log.Debugln("Failed to lookup GeoIP coordinates for host", sAd.URL.Host)
}

Expand Down Expand Up @@ -239,7 +244,7 @@ func recordAd(ctx context.Context, sAd server_structs.ServerAd, namespaceAds *[]
return sAd
}

func updateLatLong(ctx context.Context, ad *server_structs.ServerAd) error {
func updateLatLong(ad *server_structs.ServerAd) error {
if ad == nil {
return errors.New("Cannot provide a nil ad to UpdateLatLong")
}
Expand All @@ -257,7 +262,7 @@ func updateLatLong(ctx context.Context, ad *server_structs.ServerAd) error {
}
// NOTE: If GeoIP resolution of this address fails, lat/long are set to 0.0 (the null lat/long)
// This causes the server to be sorted to the end of the list whenever the Director requires distance-aware sorting.
lat, long, err := getLatLong(ctx, addr)
lat, long, err := getLatLong(addr)
if err != nil {
return err
}
Expand Down
64 changes: 42 additions & 22 deletions director/sort.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ package director
import (
"cmp"
"context"
"fmt"
"math/rand"
"net"
"net/netip"
Expand Down Expand Up @@ -59,8 +60,17 @@ type (
IP string `mapstructure:"IP"`
Coordinate Coordinate `mapstructure:"Coordinate"`
}

geoIPError struct {
labels prometheus.Labels
errorMsg string
}
)

func (e geoIPError) Error() string {
return e.errorMsg
}

var (
invalidOverrideLogOnce = map[string]bool{}
geoIPOverrides []GeoIPOverride
Expand Down Expand Up @@ -148,7 +158,16 @@ func checkOverrides(addr net.IP) (coordinate *Coordinate) {
return nil
}

func getLatLong(ctx context.Context, addr netip.Addr) (lat float64, long float64, err error) {
func setProjectLabel(ctx context.Context, labels *prometheus.Labels) {
project, ok := ctx.Value(ProjectContextKey{}).(string)
if !ok || project == "" {
(*labels)["proj"] = "unknown"
} else {
(*labels)["proj"] = project
}
}

func getLatLong(addr netip.Addr) (lat float64, long float64, err error) {
ip := net.IP(addr.AsSlice())
override := checkOverrides(ip)
if override != nil {
Expand All @@ -159,7 +178,7 @@ func getLatLong(ctx context.Context, addr netip.Addr) (lat float64, long float64
labels := prometheus.Labels{
"network": "",
"source": "",
"proj": "",
"proj": "", // this will be set in the setProjectLabel function
}

network, ok := utils.ApplyIPMask(addr.String())
Expand All @@ -170,26 +189,16 @@ func getLatLong(ctx context.Context, addr netip.Addr) (lat float64, long float64
labels["network"] = network
}

project, ok := ctx.Value(ProjectContextKey{}).(string)
if !ok || project == "" {
log.Warningf("Failed to get project from context")
labels["proj"] = "unknown"
labels["source"] = "server"
} else {
labels["proj"] = project
}

reader := maxMindReader.Load()
if reader == nil {
err = errors.New("No GeoIP database is available")
labels["source"] = "server"
metrics.PelicanDirectorGeoIPErrors.With(labels).Inc()
err = geoIPError{labels: labels, errorMsg: "No GeoIP database is available"}
return
}
record, err := reader.City(ip)
if err != nil {
labels["source"] = "server"
metrics.PelicanDirectorGeoIPErrors.With(labels).Inc()
err = geoIPError{labels: labels, errorMsg: err.Error()}
return
}
lat = record.Location.Latitude
Expand All @@ -199,22 +208,24 @@ func getLatLong(ctx context.Context, addr netip.Addr) (lat float64, long float64
// There's likely a problem with the GeoIP database or the IP address. Usually this just means the IP address
// comes from a private range.
if lat == 0 && long == 0 {
log.Warningf("GeoIP Resolution of the address %s resulted in the null lat/long. This will result in random server sorting.", ip.String())
errMsg := fmt.Sprintf("GeoIP Resolution of the address %s resulted in the null lat/long. This will result in random server sorting.", ip.String())
log.Warning(errMsg)
labels["source"] = "client"
metrics.PelicanDirectorGeoIPErrors.With(labels).Inc()
err = geoIPError{labels: labels, errorMsg: errMsg}
}

// MaxMind provides an accuracy radius in kilometers. When it actually has no clue how to resolve a valid, public
// IP, it sets the radius to 1000. If we get a radius of 900 or more (probably even much less than this...), we
// should be very suspicious of the data, and mark it as appearing at the null lat/long (and provide a warning in
// the Director), which also triggers random weighting in our sort algorithms.
if record.Location.AccuracyRadius >= 900 {
log.Warningf("GeoIP resolution of the address %s resulted in a suspiciously large accuracy radius of %d km. "+
errMsg := fmt.Sprintf("GeoIP resolution of the address %s resulted in a suspiciously large accuracy radius of %d km. "+
"This will be treated as GeoIP resolution failure and result in random server sorting. Setting lat/long to null.", ip.String(), record.Location.AccuracyRadius)
log.Warning(errMsg)
lat = 0
long = 0
labels["source"] = "client"
metrics.PelicanDirectorGeoIPErrors.With(labels).Inc()
err = geoIPError{labels: labels, errorMsg: errMsg}
}

return
Expand All @@ -229,8 +240,7 @@ func assignRandBoundedCoord(minLat, maxLat, minLong, maxLong float64) (lat, long

// Given a client address, attempt to get the lat/long of the client. If the address is invalid or
// the lat/long is not resolvable, assign a random location in the contiguous US.
func getClientLatLong(ctx context.Context, addr netip.Addr) (coord Coordinate) {
var err error
func getClientLatLong(addr netip.Addr) (coord Coordinate, err error) {
if !addr.IsValid() {
log.Warningf("Unable to sort servers based on client-server distance. Invalid client IP address: %s", addr.String())
coord.Lat, coord.Long = assignRandBoundedCoord(usLatMin, usLatMax, usLongMin, usLongMax)
Expand All @@ -244,7 +254,7 @@ func getClientLatLong(ctx context.Context, addr netip.Addr) (coord Coordinate) {
return
}

coord.Lat, coord.Long, err = getLatLong(ctx, addr)
coord.Lat, coord.Long, err = getLatLong(addr)
if err != nil || (coord.Lat == 0 && coord.Long == 0) {
if err != nil {
log.Warningf("Error while getting the client IP address: %v", err)
Expand Down Expand Up @@ -286,7 +296,17 @@ func sortServerAds(ctx context.Context, clientAddr netip.Addr, ads []server_stru
weights := make(SwapMaps, len(ads))
sortMethod := param.Director_CacheSortMethod.GetString()
// This will handle the case where the client address is invalid or the lat/long is not resolvable.
clientCoord := getClientLatLong(ctx, clientAddr)
clientCoord, err := getClientLatLong(clientAddr)
if err != nil {
// If it is a geoIP error, then we get the labels and increment the error counter
// Otherwise we log the error and continue
if geoIPError, ok := err.(geoIPError); ok {
labels := geoIPError.labels
setProjectLabel(ctx, &labels)
metrics.PelicanDirectorGeoIPErrors.With(labels).Inc()
}
log.Warningf("Error while getting the client IP address: %v", err)
}

// For each ad, we apply the configured sort method to determine a priority weight.
for idx, ad := range ads {
Expand Down
16 changes: 4 additions & 12 deletions director/sort_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -470,19 +470,15 @@ func TestGetClientLatLong(t *testing.T) {

clientIp := netip.Addr{}
assert.False(t, clientIpCache.Has(clientIp))
ctx := context.Background()
ctx = context.WithValue(ctx, ProjectContextKey{}, "pelican-client/1.0.0 project/test")
coord1 := getClientLatLong(ctx, clientIp)
coord1, _ := getClientLatLong(clientIp)

assert.True(t, coord1.Lat <= usLatMax && coord1.Lat >= usLatMin)
assert.True(t, coord1.Long <= usLongMax && coord1.Long >= usLongMin)
assert.Contains(t, logOutput.String(), "Unable to sort servers based on client-server distance. Invalid client IP address")
assert.NotContains(t, logOutput.String(), "Retrieving pre-assigned lat/long")

// Get it again to make sure it's coming from the cache
ctx = context.Background()
ctx = context.WithValue(ctx, ProjectContextKey{}, "pelican-client/1.0.0 project/test")
coord2 := getClientLatLong(ctx, clientIp)
coord2, _ := getClientLatLong(clientIp)
assert.Equal(t, coord1.Lat, coord2.Lat)
assert.Equal(t, coord1.Long, coord2.Long)
assert.Contains(t, logOutput.String(), "Retrieving pre-assigned lat/long for unresolved client IP")
Expand All @@ -496,19 +492,15 @@ func TestGetClientLatLong(t *testing.T) {

clientIp := netip.MustParseAddr("192.168.0.1")
assert.False(t, clientIpCache.Has(clientIp))
ctx := context.Background()
ctx = context.WithValue(ctx, ProjectContextKey{}, "pelican-client/1.0.0 project/test")
coord1 := getClientLatLong(ctx, clientIp)
coord1, _ := getClientLatLong(clientIp)

assert.True(t, coord1.Lat <= usLatMax && coord1.Lat >= usLatMin)
assert.True(t, coord1.Long <= usLongMax && coord1.Long >= usLongMin)
assert.Contains(t, logOutput.String(), "Client IP 192.168.0.1 has been re-assigned a random location in the contiguous US to lat/long")
assert.NotContains(t, logOutput.String(), "Retrieving pre-assigned lat/long")

// Get it again to make sure it's coming from the cache
ctx = context.Background()
ctx = context.WithValue(ctx, ProjectContextKey{}, "pelican-client/1.0.0 project/test")
coord2 := getClientLatLong(ctx, clientIp)
coord2, _ := getClientLatLong(clientIp)
assert.Equal(t, coord1.Lat, coord2.Lat)
assert.Equal(t, coord1.Long, coord2.Long)
assert.Contains(t, logOutput.String(), "Retrieving pre-assigned lat/long for client IP")
Expand Down
Loading