1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package sysfs
18
19 import (
20 "errors"
21 "fmt"
22 "os"
23 "path/filepath"
24 "strconv"
25 "strings"
26
27 "github.com/prometheus/procfs/internal/util"
28 )
29
// infinibandClassPath is the location of the InfiniBand class directory. It is
// always resolved through fs.sys.Path, i.e. relative to the sysfs root.
const infinibandClassPath = "class/infiniband"
31
32
33
34
35
// InfiniBandCounters holds the counter values of a single port of one
// InfiniBand device, as read by parseInfiniBandCounters from the files in
// class/infiniband/<Name>/ports/<Port>/counters and
// class/infiniband/<Name>/ports/<Port>/counters_ext.
//
// Each field is a pointer; a nil pointer means that no value was stored for
// that counter (for example because the file does not exist).
type InfiniBandCounters struct {
	LegacyPortMulticastRcvPackets  *uint64 // counters_ext/port_multicast_rcv_packets
	LegacyPortMulticastXmitPackets *uint64 // counters_ext/port_multicast_xmit_packets
	LegacyPortRcvData64            *uint64 // counters_ext/port_rcv_data_64, multiplied by 4
	LegacyPortRcvPackets64         *uint64 // counters_ext/port_rcv_packets_64
	LegacyPortUnicastRcvPackets    *uint64 // counters_ext/port_unicast_rcv_packets
	LegacyPortUnicastXmitPackets   *uint64 // counters_ext/port_unicast_xmit_packets
	LegacyPortXmitData64           *uint64 // counters_ext/port_xmit_data_64, multiplied by 4
	LegacyPortXmitPackets64        *uint64 // counters_ext/port_xmit_packets_64

	ExcessiveBufferOverrunErrors *uint64 // counters/excessive_buffer_overrun_errors
	LinkDowned                   *uint64 // counters/link_downed
	LinkErrorRecovery            *uint64 // counters/link_error_recovery
	LocalLinkIntegrityErrors     *uint64 // counters/local_link_integrity_errors
	MulticastRcvPackets          *uint64 // counters/multicast_rcv_packets
	MulticastXmitPackets         *uint64 // counters/multicast_xmit_packets
	PortRcvConstraintErrors      *uint64 // counters/port_rcv_constraint_errors
	PortRcvData                  *uint64 // counters/port_rcv_data, multiplied by 4
	PortRcvDiscards              *uint64 // counters/port_rcv_discards
	PortRcvErrors                *uint64 // counters/port_rcv_errors
	PortRcvPackets               *uint64 // counters/port_rcv_packets
	PortRcvRemotePhysicalErrors  *uint64 // counters/port_rcv_remote_physical_errors
	PortRcvSwitchRelayErrors     *uint64 // counters/port_rcv_switch_relay_errors
	PortXmitConstraintErrors     *uint64 // counters/port_xmit_constraint_errors
	PortXmitData                 *uint64 // counters/port_xmit_data, multiplied by 4
	PortXmitDiscards             *uint64 // counters/port_xmit_discards
	PortXmitPackets              *uint64 // counters/port_xmit_packets
	PortXmitWait                 *uint64 // counters/port_xmit_wait
	SymbolError                  *uint64 // counters/symbol_error
	UnicastRcvPackets            *uint64 // counters/unicast_rcv_packets
	UnicastXmitPackets           *uint64 // counters/unicast_xmit_packets
	VL15Dropped                  *uint64 // counters/VL15_dropped
}
69
70
71
72
// InfiniBandHwCounters holds the hardware counter values of a single port of
// one InfiniBand device, as read by parseInfiniBandHwCounters from the files in
// class/infiniband/<Name>/ports/<Port>/hw_counters.
//
// Each field is a pointer; a nil pointer means that no value was stored for
// that counter.
type InfiniBandHwCounters struct {
	DuplicateRequest        *uint64 // hw_counters/duplicate_request
	ImpliedNakSeqErr        *uint64 // hw_counters/implied_nak_seq_err
	Lifespan                *uint64 // hw_counters/lifespan
	LocalAckTimeoutErr      *uint64 // hw_counters/local_ack_timeout_err
	NpCnpSent               *uint64 // hw_counters/np_cnp_sent
	NpEcnMarkedRocePackets  *uint64 // hw_counters/np_ecn_marked_roce_packets
	OutOfBuffer             *uint64 // hw_counters/out_of_buffer
	OutOfSequence           *uint64 // hw_counters/out_of_sequence
	PacketSeqErr            *uint64 // hw_counters/packet_seq_err
	ReqCqeError             *uint64 // hw_counters/req_cqe_error
	ReqCqeFlushError        *uint64 // hw_counters/req_cqe_flush_error
	ReqRemoteAccessErrors   *uint64 // hw_counters/req_remote_access_errors
	ReqRemoteInvalidRequest *uint64 // hw_counters/req_remote_invalid_request
	RespCqeError            *uint64 // hw_counters/resp_cqe_error
	RespCqeFlushError       *uint64 // hw_counters/resp_cqe_flush_error
	RespLocalLengthError    *uint64 // hw_counters/resp_local_length_error
	RespRemoteAccessErrors  *uint64 // hw_counters/resp_remote_access_errors
	RnrNakRetryErr          *uint64 // hw_counters/rnr_nak_retry_err
	RoceAdpRetrans          *uint64 // hw_counters/roce_adp_retrans
	RoceAdpRetransTo        *uint64 // hw_counters/roce_adp_retrans_to
	RoceSlowRestart         *uint64 // hw_counters/roce_slow_restart
	RoceSlowRestartCnps     *uint64 // hw_counters/roce_slow_restart_cnps
	RoceSlowRestartTrans    *uint64 // hw_counters/roce_slow_restart_trans
	RpCnpHandled            *uint64 // hw_counters/rp_cnp_handled
	RpCnpIgnored            *uint64 // hw_counters/rp_cnp_ignored
	RxAtomicRequests        *uint64 // hw_counters/rx_atomic_requests
	RxDctConnect            *uint64 // hw_counters/rx_dct_connect
	RxIcrcEncapsulated      *uint64 // hw_counters/rx_icrc_encapsulated
	RxReadRequests          *uint64 // hw_counters/rx_read_requests
	RxWriteRequests         *uint64 // hw_counters/rx_write_requests
}
105
106
107
108
// InfiniBandPort holds the information read from the files in
// class/infiniband/<Name>/ports/<Port> for a single port of one InfiniBand
// device.
//
// Name is the name of the owning device and Port the port number taken from
// the directory name. Counters is left at its zero value for devices whose
// name starts with "irdma"; HwCounters is only filled for devices whose name
// starts with "irdma" or "mlx5_".
type InfiniBandPort struct {
	Name        string
	Port        uint
	State       string // name part of ports/<Port>/state
	StateID     uint   // numeric ID part of ports/<Port>/state
	PhysState   string // name part of ports/<Port>/phys_state
	PhysStateID uint   // numeric ID part of ports/<Port>/phys_state
	Rate        uint64 // leading number of ports/<Port>/rate multiplied by 125000000
	Counters    InfiniBandCounters
	HwCounters  InfiniBandHwCounters
}
120
121
122
// InfiniBandDevice holds the information read from class/infiniband/<Name>
// for a single InfiniBand device.
//
// Name is the name of the device directory and Ports is keyed by port number.
type InfiniBandDevice struct {
	Name            string
	BoardID         string // class/infiniband/<Name>/board_id, empty if the file does not exist
	FirmwareVersion string // class/infiniband/<Name>/fw_ver
	HCAType         string // class/infiniband/<Name>/hca_type, empty if the file does not exist
	Ports           map[uint]InfiniBandPort
}
130
131
132
133
134
// InfiniBandClass is the collection of all InfiniBand devices found in
// class/infiniband.
//
// The map keys are the names of the InfiniBand devices.
type InfiniBandClass map[string]InfiniBandDevice
136
137
138
139 func (fs FS) InfiniBandClass() (InfiniBandClass, error) {
140 path := fs.sys.Path(infinibandClassPath)
141
142 dirs, err := os.ReadDir(path)
143 if err != nil {
144 return nil, err
145 }
146
147 ibc := make(InfiniBandClass, len(dirs))
148 for _, d := range dirs {
149 device, err := fs.parseInfiniBandDevice(d.Name())
150 if err != nil {
151 return nil, err
152 }
153
154 ibc[device.Name] = *device
155 }
156
157 return ibc, nil
158 }
159
160
161
162 func (fs FS) parseInfiniBandDevice(name string) (*InfiniBandDevice, error) {
163 path := fs.sys.Path(infinibandClassPath, name)
164 device := InfiniBandDevice{Name: name}
165
166
167 value, err := util.SysReadFile(filepath.Join(path, "fw_ver"))
168 if err != nil {
169 return nil, fmt.Errorf("failed to read HCA firmware version: %w", err)
170 }
171 device.FirmwareVersion = value
172
173
174 for _, f := range [...]string{"board_id", "hca_type"} {
175 name := filepath.Join(path, f)
176 value, err := util.SysReadFile(name)
177 if err != nil {
178 if os.IsNotExist(err) {
179 continue
180 }
181 return nil, fmt.Errorf("failed to read file %q: %w", name, err)
182 }
183
184 switch f {
185 case "board_id":
186 device.BoardID = value
187 case "hca_type":
188 device.HCAType = value
189 }
190 }
191
192 portsPath := filepath.Join(path, "ports")
193 ports, err := os.ReadDir(portsPath)
194 if err != nil {
195 return nil, fmt.Errorf("failed to list InfiniBand ports at %q: %w", portsPath, err)
196 }
197
198 device.Ports = make(map[uint]InfiniBandPort, len(ports))
199 for _, d := range ports {
200 port, err := fs.parseInfiniBandPort(name, d.Name())
201 if err != nil {
202 return nil, err
203 }
204
205 device.Ports[port.Port] = *port
206 }
207
208 return &device, nil
209 }
210
211
// parseState splits a string of the form "ID: NAME" (for example "4: ACTIVE")
// into its numeric ID and its name. Whitespace surrounding either part,
// including a trailing newline, is discarded.
//
// An error is returned when s does not contain exactly one colon or when the
// ID is not a base-10 unsigned integer that fits into 32 bits. In the latter
// case the strconv error is wrapped, so callers can inspect it with errors.Is
// or errors.As. On failure the returned ID and name are always zero values.
func parseState(s string) (uint, string, error) {
	parts := strings.Split(s, ":")
	if len(parts) != 2 {
		return 0, "", fmt.Errorf("failed to split %s into 'ID: NAME'", s)
	}

	idField := strings.TrimSpace(parts[0])
	id, err := strconv.ParseUint(idField, 10, 32)
	if err != nil {
		return 0, "", fmt.Errorf("failed to convert %q into uint: %w", idField, err)
	}

	return uint(id), strings.TrimSpace(parts[1]), nil
}
225
226
// parseRate extracts the number in front of the first space of s (for example
// the "100" of "100 Gb/sec (4X EDR)") and returns it multiplied by 125000000,
// which turns a figure given in Gb/sec into bytes per second. Fractions of the
// result are truncated.
//
// An error is returned when s contains no space, when the leading field is not
// a valid floating-point number (the strconv error is wrapped), or when the
// resulting rate is negative, NaN, or too large for a uint64.
func parseRate(s string) (uint64, error) {
	parts := strings.SplitAfterN(s, " ", 2)
	if len(parts) != 2 {
		return 0, fmt.Errorf("failed to split %q", s)
	}

	number := strings.TrimSpace(parts[0])
	value, err := strconv.ParseFloat(number, 32)
	if err != nil {
		return 0, fmt.Errorf("failed to convert %q into float: %w", number, err)
	}

	const (
		bytesPerSecPerGbit = 125000000
		uint64Bound        = 1 << 64
	)
	rate := value * bytesPerSecPerGbit
	// Converting a negative, NaN, or too large float to uint64 has an
	// implementation-defined result. The condition is negated as a whole so
	// that NaN, for which every comparison is false, is rejected as well.
	if !(rate >= 0 && rate < uint64Bound) {
		return 0, fmt.Errorf("rate %q is out of range", number)
	}

	return uint64(rate), nil
}
240
241
242
243 func (fs FS) parseInfiniBandPort(name string, port string) (*InfiniBandPort, error) {
244 portNumber, err := strconv.ParseUint(port, 10, 32)
245 if err != nil {
246 return nil, fmt.Errorf("failed to convert %s into uint", port)
247 }
248 ibp := InfiniBandPort{Name: name, Port: uint(portNumber)}
249
250 portPath := fs.sys.Path(infinibandClassPath, name, "ports", port)
251 content, err := os.ReadFile(filepath.Join(portPath, "state"))
252 if err != nil {
253 return nil, err
254 }
255 id, name, err := parseState(string(content))
256 if err != nil {
257 return nil, fmt.Errorf("could not parse state file in %q: %w", portPath, err)
258 }
259 ibp.State = name
260 ibp.StateID = id
261
262 content, err = os.ReadFile(filepath.Join(portPath, "phys_state"))
263 if err != nil {
264 return nil, err
265 }
266 id, name, err = parseState(string(content))
267 if err != nil {
268 return nil, fmt.Errorf("could not parse phys_state file in %q: %w", portPath, err)
269 }
270 ibp.PhysState = name
271 ibp.PhysStateID = id
272
273 content, err = os.ReadFile(filepath.Join(portPath, "rate"))
274 if err != nil {
275 return nil, err
276 }
277 ibp.Rate, err = parseRate(string(content))
278 if err != nil {
279 return nil, fmt.Errorf("could not parse rate file in %q: %w", portPath, err)
280 }
281
282
283 if !strings.HasPrefix(ibp.Name, "irdma") {
284 counters, err := parseInfiniBandCounters(portPath)
285 if err != nil {
286 return nil, err
287 }
288 ibp.Counters = *counters
289 }
290
291 if strings.HasPrefix(ibp.Name, "irdma") || strings.HasPrefix(ibp.Name, "mlx5_") {
292 hwCounters, err := parseInfiniBandHwCounters(portPath)
293 if err != nil {
294 return nil, err
295 }
296 ibp.HwCounters = *hwCounters
297 }
298
299 return &ibp, nil
300 }
301
302
303
304
305 func parseInfiniBandCounters(portPath string) (*InfiniBandCounters, error) {
306 var counters InfiniBandCounters
307
308 path := filepath.Join(portPath, "counters")
309 files, err := os.ReadDir(path)
310 if err != nil {
311 return nil, err
312 }
313
314 for _, f := range files {
315 if !f.Type().IsRegular() {
316 continue
317 }
318
319 name := filepath.Join(path, f.Name())
320 value, err := util.SysReadFile(name)
321 if err != nil {
322 if os.IsNotExist(err) || os.IsPermission(err) || err.Error() == "operation not supported" || errors.Is(err, os.ErrInvalid) {
323 continue
324 }
325 return nil, fmt.Errorf("failed to read file %q: %w", name, err)
326 }
327
328
329
330
331
332
333
334 vp := util.NewValueParser(value)
335
336 switch f.Name() {
337 case "excessive_buffer_overrun_errors":
338 counters.ExcessiveBufferOverrunErrors = vp.PUInt64()
339 case "link_downed":
340 counters.LinkDowned = vp.PUInt64()
341 case "link_error_recovery":
342 counters.LinkErrorRecovery = vp.PUInt64()
343 case "local_link_integrity_errors":
344 counters.LocalLinkIntegrityErrors = vp.PUInt64()
345 case "multicast_rcv_packets":
346 counters.MulticastRcvPackets = vp.PUInt64()
347 case "multicast_xmit_packets":
348 counters.MulticastXmitPackets = vp.PUInt64()
349 case "port_rcv_constraint_errors":
350 counters.PortRcvConstraintErrors = vp.PUInt64()
351 case "port_rcv_data":
352 counters.PortRcvData = vp.PUInt64()
353 if counters.PortRcvData != nil {
354 *counters.PortRcvData *= 4
355 }
356 case "port_rcv_discards":
357 counters.PortRcvDiscards = vp.PUInt64()
358 case "port_rcv_errors":
359 counters.PortRcvErrors = vp.PUInt64()
360 case "port_rcv_packets":
361 counters.PortRcvPackets = vp.PUInt64()
362 case "port_rcv_remote_physical_errors":
363 counters.PortRcvRemotePhysicalErrors = vp.PUInt64()
364 case "port_rcv_switch_relay_errors":
365 counters.PortRcvSwitchRelayErrors = vp.PUInt64()
366 case "port_xmit_constraint_errors":
367 counters.PortXmitConstraintErrors = vp.PUInt64()
368 case "port_xmit_data":
369 counters.PortXmitData = vp.PUInt64()
370 if counters.PortXmitData != nil {
371 *counters.PortXmitData *= 4
372 }
373 case "port_xmit_discards":
374 counters.PortXmitDiscards = vp.PUInt64()
375 case "port_xmit_packets":
376 counters.PortXmitPackets = vp.PUInt64()
377 case "port_xmit_wait":
378 counters.PortXmitWait = vp.PUInt64()
379 case "symbol_error":
380 counters.SymbolError = vp.PUInt64()
381 case "unicast_rcv_packets":
382 counters.UnicastRcvPackets = vp.PUInt64()
383 case "unicast_xmit_packets":
384 counters.UnicastXmitPackets = vp.PUInt64()
385 case "VL15_dropped":
386 counters.VL15Dropped = vp.PUInt64()
387 }
388
389 if err := vp.Err(); err != nil {
390
391
392
393
394
395 if strings.Contains(value, "N/A (no PMA)") {
396 continue
397 }
398 return nil, err
399 }
400 }
401
402
403 path = filepath.Join(portPath, "counters_ext")
404 files, err = os.ReadDir(path)
405 if err != nil && !os.IsNotExist(err) {
406 return nil, err
407 }
408
409 for _, f := range files {
410 if !f.Type().IsRegular() {
411 continue
412 }
413
414 name := filepath.Join(path, f.Name())
415 value, err := util.SysReadFile(name)
416 if err != nil {
417 if os.IsNotExist(err) || os.IsPermission(err) || err.Error() == "operation not supported" || errors.Is(err, os.ErrInvalid) {
418 continue
419 }
420 return nil, fmt.Errorf("failed to read file %q: %w", name, err)
421 }
422
423 vp := util.NewValueParser(value)
424
425 switch f.Name() {
426 case "port_multicast_rcv_packets":
427 counters.LegacyPortMulticastRcvPackets = vp.PUInt64()
428 case "port_multicast_xmit_packets":
429 counters.LegacyPortMulticastXmitPackets = vp.PUInt64()
430 case "port_rcv_data_64":
431 counters.LegacyPortRcvData64 = vp.PUInt64()
432 if counters.LegacyPortRcvData64 != nil {
433 *counters.LegacyPortRcvData64 *= 4
434 }
435 case "port_rcv_packets_64":
436 counters.LegacyPortRcvPackets64 = vp.PUInt64()
437 case "port_unicast_rcv_packets":
438 counters.LegacyPortUnicastRcvPackets = vp.PUInt64()
439 case "port_unicast_xmit_packets":
440 counters.LegacyPortUnicastXmitPackets = vp.PUInt64()
441 case "port_xmit_data_64":
442 counters.LegacyPortXmitData64 = vp.PUInt64()
443 if counters.LegacyPortXmitData64 != nil {
444 *counters.LegacyPortXmitData64 *= 4
445 }
446 case "port_xmit_packets_64":
447 counters.LegacyPortXmitPackets64 = vp.PUInt64()
448 }
449
450 if err := vp.Err(); err != nil {
451
452
453
454
455
456 if strings.Contains(value, "N/A (no PMA)") {
457 continue
458 }
459 return nil, err
460 }
461 }
462
463 return &counters, nil
464 }
465
466
467
468 func parseInfiniBandHwCounters(portPath string) (*InfiniBandHwCounters, error) {
469 var hwCounters InfiniBandHwCounters
470
471 path := filepath.Join(portPath, "hw_counters")
472 files, err := os.ReadDir(path)
473 if err != nil {
474 return nil, err
475 }
476
477 for _, f := range files {
478 if !f.Type().IsRegular() {
479 continue
480 }
481
482 name := filepath.Join(path, f.Name())
483 value, err := util.SysReadFile(name)
484 if err != nil {
485 if os.IsNotExist(err) || os.IsPermission(err) || err.Error() == "operation not supported" || errors.Is(err, os.ErrInvalid) {
486 continue
487 }
488 return nil, fmt.Errorf("failed to read file %q: %w", name, err)
489 }
490
491 vp := util.NewValueParser(value)
492
493 switch f.Name() {
494 case "duplicate_request":
495 hwCounters.DuplicateRequest = vp.PUInt64()
496 case "implied_nak_seq_err":
497 hwCounters.ImpliedNakSeqErr = vp.PUInt64()
498 case "lifespan":
499 hwCounters.Lifespan = vp.PUInt64()
500 case "local_ack_timeout_err":
501 hwCounters.LocalAckTimeoutErr = vp.PUInt64()
502 case "np_cnp_sent":
503 hwCounters.NpCnpSent = vp.PUInt64()
504 case "np_ecn_marked_roce_packets":
505 hwCounters.NpEcnMarkedRocePackets = vp.PUInt64()
506 case "out_of_buffer":
507 hwCounters.OutOfBuffer = vp.PUInt64()
508 case "out_of_sequence":
509 hwCounters.OutOfSequence = vp.PUInt64()
510 case "packet_seq_err":
511 hwCounters.PacketSeqErr = vp.PUInt64()
512 case "req_cqe_error":
513 hwCounters.ReqCqeError = vp.PUInt64()
514 case "req_cqe_flush_error":
515 hwCounters.ReqCqeFlushError = vp.PUInt64()
516 case "req_remote_access_errors":
517 hwCounters.ReqRemoteAccessErrors = vp.PUInt64()
518 case "req_remote_invalid_request":
519 hwCounters.ReqRemoteInvalidRequest = vp.PUInt64()
520 case "resp_cqe_error":
521 hwCounters.RespCqeError = vp.PUInt64()
522 case "resp_cqe_flush_error":
523 hwCounters.RespCqeFlushError = vp.PUInt64()
524 case "resp_local_length_error":
525 hwCounters.RespLocalLengthError = vp.PUInt64()
526 case "resp_remote_access_errors":
527 hwCounters.RespRemoteAccessErrors = vp.PUInt64()
528 case "rnr_nak_retry_err":
529 hwCounters.RnrNakRetryErr = vp.PUInt64()
530 case "roce_adp_retrans":
531 hwCounters.RoceAdpRetrans = vp.PUInt64()
532 case "roce_adp_retrans_to":
533 hwCounters.RoceAdpRetransTo = vp.PUInt64()
534 case "roce_slow_restart":
535 hwCounters.RoceSlowRestart = vp.PUInt64()
536 case "roce_slow_restart_cnps":
537 hwCounters.RoceSlowRestartCnps = vp.PUInt64()
538 case "roce_slow_restart_trans":
539 hwCounters.RoceSlowRestartTrans = vp.PUInt64()
540 case "rp_cnp_handled":
541 hwCounters.RpCnpHandled = vp.PUInt64()
542 case "rp_cnp_ignored":
543 hwCounters.RpCnpIgnored = vp.PUInt64()
544 case "rx_atomic_requests":
545 hwCounters.RxAtomicRequests = vp.PUInt64()
546 case "rx_dct_connect":
547 hwCounters.RxDctConnect = vp.PUInt64()
548 case "rx_icrc_encapsulated":
549 hwCounters.RxIcrcEncapsulated = vp.PUInt64()
550 case "rx_read_requests":
551 hwCounters.RxReadRequests = vp.PUInt64()
552 case "rx_write_requests":
553 hwCounters.RxWriteRequests = vp.PUInt64()
554 }
555
556 if err := vp.Err(); err != nil {
557
558
559
560
561
562 if strings.Contains(value, "N/A (no PMA)") {
563 continue
564 }
565 return nil, err
566 }
567 }
568 return &hwCounters, nil
569 }
570
View as plain text