Skip to content

Commit

Permalink
Merge pull request NVIDIA#330 from tariq1890/nvidia-dev-maj-num-lookup
Browse files Browse the repository at this point in the history
add fallback logic when retrieving major number of the nvidia control device
  • Loading branch information
elezar authored Feb 12, 2024
2 parents ab7693a + e64b723 commit a2a1a78
Show file tree
Hide file tree
Showing 5 changed files with 213 additions and 53 deletions.
62 changes: 62 additions & 0 deletions internal/info/proc/devices/builder.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package devices

// builder accumulates the settings applied by Option functions before New
// assembles the resulting device collection.
type builder struct {
// asMap holds the candidate device name to major number entries.
asMap devices
// filter reports whether a device name should be excluded; New skips every
// entry for which it returns true.
filter func(string) bool
}

// New constructs a Devices collection from the supplied options.
//
// Each option is applied in order to a fresh builder. Entries of the
// resulting name-to-major map are then copied into the returned collection,
// leaving out every entry whose name the configured filter reports as
// excluded. When no filter has been configured, nothing is excluded.
func New(opts ...Option) Devices {
	b := &builder{}
	for _, apply := range opts {
		apply(b)
	}

	// Fall back to a predicate that excludes nothing.
	exclude := b.filter
	if exclude == nil {
		exclude = func(string) bool { return false }
	}

	selected := make(devices)
	for name, major := range b.asMap {
		if !exclude(string(name)) {
			selected[name] = major
		}
	}
	return selected
}

// Option is a functional option that configures the builder used by New.
type Option func(*builder)

// WithDeviceToMajor returns an Option that seeds the builder with the given
// device name to major number mapping. Any mapping set by an earlier option
// is replaced.
func WithDeviceToMajor(deviceToMajor map[string]int) Option {
	return func(b *builder) {
		converted := make(devices)
		for deviceName, majorNumber := range deviceToMajor {
			converted[Name(deviceName)] = Major(majorNumber)
		}
		b.asMap = converted
	}
}

// WithFilter returns an Option that installs filter as the exclusion
// predicate: devices whose name makes filter return true are left out of the
// collection built by New.
func WithFilter(filter func(string) bool) Option {
	apply := func(b *builder) {
		b.filter = filter
	}
	return apply
}
59 changes: 38 additions & 21 deletions internal/info/proc/devices/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ const (
NVIDIAModesetMinor = 254

NVIDIAFrontend = Name("nvidia-frontend")
NVIDIAGPU = NVIDIAFrontend
NVIDIAGPU = Name("nvidia")
NVIDIACaps = Name("nvidia-caps")
NVIDIAUVM = Name("nvidia-uvm")

Expand All @@ -53,22 +53,43 @@ type Major int
type Devices interface {
Exists(Name) bool
Get(Name) (Major, bool)
Count() int
}

type devices map[Name]Major

var _ Devices = devices(nil)

// Count returns the number of devices defined, i.e. the number of entries in
// the underlying name-to-major map.
func (d devices) Count() int {
return len(d)
}

// Exists reports whether a device with the given name is defined.
//
// The lookup is delegated to Get so that both methods agree: a name that Get
// can resolve through its fallback names (for example NVIDIAGPU and
// NVIDIAFrontend, which refer to the same device under different driver
// versions) is also reported as existing here.
func (d devices) Exists(name Name) bool {
	_, exists := d.Get(name)
	return exists
}

// Get a Device from Devices
// Get returns the major number of the named device. It includes fallback logic so that device name changes in /proc/devices are handled.
// For example, for GPU drivers 550.40.x or greater, the GPU device has been renamed from "nvidia-frontend" to "nvidia".
func (d devices) Get(name Name) (Major, bool) {
device, exists := d[name]
return device, exists
for _, n := range name.getWithFallback() {
device, exists := d[n]
if exists {
return device, true
}
}
return 0, false
}

// getWithFallback lists, in order of preference, the device names that may
// be used to look up n. Because the GPU device is named differently across
// driver versions, asking for either NVIDIAGPU or NVIDIAFrontend yields both
// names (NVIDIAGPU first); every other name maps only to itself.
func (n Name) getWithFallback() []Name {
	if n != NVIDIAGPU && n != NVIDIAFrontend {
		return []Name{n}
	}
	return []Name{NVIDIAGPU, NVIDIAFrontend}
}

// GetNVIDIADevices returns the set of NVIDIA Devices on the machine
Expand All @@ -94,27 +115,23 @@ func nvidiaDevices(devicesPath string) (Devices, error) {

var errNoNvidiaDevices = errors.New("no NVIDIA devices found")

func nvidiaDeviceFrom(reader io.Reader) (devices, error) {
func nvidiaDeviceFrom(reader io.Reader) (Devices, error) {
allDevices := devicesFrom(reader)
nvidiaDevices := make(devices)

var hasNvidiaDevices bool
for n, d := range allDevices {
if !strings.HasPrefix(string(n), nvidiaDevicePrefix) {
continue
}
nvidiaDevices[n] = d
hasNvidiaDevices = true
}

if !hasNvidiaDevices {
nvidiaDevices := New(
WithDeviceToMajor(allDevices),
WithFilter(func(n string) bool {
return !strings.HasPrefix(n, nvidiaDevicePrefix)
}),
)
if nvidiaDevices.Count() == 0 {
return nil, errNoNvidiaDevices
}
return nvidiaDevices, nil
}

func devicesFrom(reader io.Reader) devices {
allDevices := make(devices)
func devicesFrom(reader io.Reader) map[string]int {
allDevices := make(map[string]int)
scanner := bufio.NewScanner(reader)
for scanner.Scan() {
device, major, err := processProcDeviceLine(scanner.Text())
Expand All @@ -126,11 +143,11 @@ func devicesFrom(reader io.Reader) devices {
return allDevices
}

func processProcDeviceLine(line string) (Name, Major, error) {
func processProcDeviceLine(line string) (string, int, error) {
trimmed := strings.TrimSpace(line)

var name Name
var major Major
var name string
var major int

n, _ := fmt.Sscanf(trimmed, "%d %s", &major, &name)
if n == 2 {
Expand Down
40 changes: 40 additions & 0 deletions internal/info/proc/devices/devices_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

65 changes: 44 additions & 21 deletions internal/info/proc/devices/devices_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,46 @@ import (
)

func TestNvidiaDevices(t *testing.T) {
devices := map[Name]Major{
"nvidia-frontend": 195,
"nvidia-nvlink": 234,
"nvidia-caps": 235,
"nvidia-uvm": 510,
"nvidia-nvswitch": 511,
perDriverDeviceMaps := map[string]map[string]int{
"pre550": {
"nvidia-frontend": 195,
"nvidia-nvlink": 234,
"nvidia-caps": 235,
"nvidia-uvm": 510,
"nvidia-nvswitch": 511,
},
"post550": {
"nvidia": 195,
"nvidia-nvlink": 234,
"nvidia-caps": 235,
"nvidia-uvm": 510,
"nvidia-nvswitch": 511,
},
}

nvidiaDevices := testDevices(devices)
for name, major := range devices {
device, exists := nvidiaDevices.Get(name)
require.True(t, exists, "Unexpected missing device")
require.Equal(t, device, major, "Unexpected device major")
for k, devices := range perDriverDeviceMaps {
nvidiaDevices := New(WithDeviceToMajor(devices))
t.Run(k, func(t *testing.T) {
// Each of the expected devices needs to exist.
for name, major := range devices {
device, exists := nvidiaDevices.Get(Name(name))
require.True(t, exists)
require.Equal(t, device, Major(major))
}
// An unexpected device cannot exist
_, exists := nvidiaDevices.Get("bogus")
require.False(t, exists)

// Regardless of the driver version, the nvidia and nvidia-frontend
// names are supported and have the same value.
nvidia, exists := nvidiaDevices.Get(NVIDIAGPU)
require.True(t, exists)
nvidiaFrontend, exists := nvidiaDevices.Get(NVIDIAFrontend)
require.True(t, exists)
require.Equal(t, nvidia, nvidiaFrontend)
})

}
_, exists := nvidiaDevices.Get("bogus")
require.False(t, exists, "Unexpected 'bogus' device found")
}

func TestProcessDeviceFile(t *testing.T) {
Expand All @@ -52,6 +76,7 @@ func TestProcessDeviceFile(t *testing.T) {
{lines: []string{}, expectedError: errNoNvidiaDevices},
{lines: []string{"Not a valid line:"}, expectedError: errNoNvidiaDevices},
{lines: []string{"195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}},
{lines: []string{"195 nvidia"}, expected: devices{"nvidia": 195}},
{lines: []string{"195 nvidia-frontend", "235 nvidia-caps"}, expected: devices{"nvidia-frontend": 195, "nvidia-caps": 235}},
{lines: []string{" 195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}},
{lines: []string{"Not a valid line:", "", "195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}},
Expand All @@ -63,16 +88,19 @@ func TestProcessDeviceFile(t *testing.T) {
d, err := nvidiaDeviceFrom(contents)
require.ErrorIs(t, err, tc.expectedError)

require.EqualValues(t, tc.expected, d)
if tc.expectedError == nil {
require.EqualValues(t, tc.expected, d.(devices))
}

})
}
}

func TestProcessDeviceFileLine(t *testing.T) {
testCases := []struct {
line string
name Name
major Major
name string
major int
err bool
}{
{"", "", 0, true},
Expand All @@ -97,8 +125,3 @@ func TestProcessDeviceFileLine(t *testing.T) {
})
}
}

// testDevices creates a set of test NVIDIA devices
func testDevices(d map[Name]Major) Devices {
return devices(d)
}
Loading

0 comments on commit a2a1a78

Please sign in to comment.