From de3df446812cf1f0f21031e38156441b7e8dcfcf Mon Sep 17 00:00:00 2001 From: Jaspal Singh Date: Tue, 21 Apr 2026 21:11:36 +0000 Subject: [PATCH 1/3] nodescraper/plugins/inband/rdma/rdma_collector.py --- nodescraper/plugins/inband/rdma/rdmadata.py | 379 ++++++++++++++++++-- test/unit/plugin/test_rdma_analyzer.py | 187 ++++++---- 2 files changed, 467 insertions(+), 99 deletions(-) diff --git a/nodescraper/plugins/inband/rdma/rdmadata.py b/nodescraper/plugins/inband/rdma/rdmadata.py index cb26b5b1..965cb5b5 100644 --- a/nodescraper/plugins/inband/rdma/rdmadata.py +++ b/nodescraper/plugins/inband/rdma/rdmadata.py @@ -23,44 +23,365 @@ # SOFTWARE. # ############################################################################### -from typing import Optional +from typing import ClassVar, Optional, Union -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import BaseModel, Field, model_validator from typing_extensions import Self from nodescraper.models import DataModel -class RdmaDevice(BaseModel): - """RDMA device from 'rdma dev' (text output).""" +class PollaraRdmaStatistics(BaseModel): + """ifname ionic""" - device: str - node_type: Optional[str] = None - transport: Optional[str] = None - node_guid: Optional[str] = None - sys_image_guid: Optional[str] = None - state: Optional[str] = None - attributes: dict[str, str] = Field(default_factory=dict) + tx_rdma_ucast_bytes: Optional[int] = None + tx_rdma_ucast_pkts: Optional[int] = None + tx_rdma_mcast_bytes: Optional[int] = None + tx_rdma_mcast_pkts: Optional[int] = None + tx_rdma_cnp_pkts: Optional[int] = None + rx_rdma_ucast_bytes: Optional[int] = None + rx_rdma_ucast_pkts: Optional[int] = None + rx_rdma_mcast_bytes: Optional[int] = None + rx_rdma_mcast_pkts: Optional[int] = None + rx_rdma_cnp_pkts: Optional[int] = None + rx_rdma_ecn_pkts: Optional[int] = None + req_rx_pkt_seq_err: Optional[int] = None + req_rx_rnr_retry_err: Optional[int] = None + req_rx_rmt_acc_err: Optional[int] = None + 
req_rx_rmt_req_err: Optional[int] = None + req_rx_oper_err: Optional[int] = None + req_rx_impl_nak_seq_err: Optional[int] = None + req_rx_cqe_err: Optional[int] = None + req_rx_cqe_flush: Optional[int] = None + req_rx_dup_response: Optional[int] = None + req_rx_inval_pkts: Optional[int] = None + req_tx_loc_acc_err: Optional[int] = None + req_tx_loc_oper_err: Optional[int] = None + req_tx_mem_mgmt_err: Optional[int] = None + req_tx_retry_excd_err: Optional[int] = None + req_tx_loc_sgl_inv_err: Optional[int] = None + resp_rx_dup_request: Optional[int] = None + resp_rx_outof_buf: Optional[int] = None + resp_rx_outouf_seq: Optional[int] = None + resp_rx_cqe_err: Optional[int] = None + resp_rx_cqe_flush: Optional[int] = None + resp_rx_loc_len_err: Optional[int] = None + resp_rx_inval_request: Optional[int] = None + resp_rx_loc_oper_err: Optional[int] = None + resp_rx_outof_atomic: Optional[int] = None + resp_tx_pkt_seq_err: Optional[int] = None + resp_tx_rmt_inval_req_err: Optional[int] = None + resp_tx_rmt_acc_err: Optional[int] = None + resp_tx_rmt_oper_err: Optional[int] = None + resp_tx_rnr_retry_err: Optional[int] = None + resp_tx_loc_sgl_inv_err: Optional[int] = None + resp_rx_s0_table_err: Optional[int] = None + tx_rdma_ccl_cts_bytes: Optional[int] = None + tx_rdma_ccl_cts_pkts: Optional[int] = None + rx_rdma_ccl_cts_bytes: Optional[int] = None + rx_rdma_ccl_cts_pkts: Optional[int] = None + resp_rx_ccl_cts_outouf_seq: Optional[int] = None + tx_rdma_ack_timeout: Optional[int] = None + tx_rdma_ccl_cts_ack_timeout: Optional[int] = None + tx_rdma_retx_bytes: Optional[int] = None + tx_rdma_retx_pkts: Optional[int] = None + tx_rdma_ccl_cts_retx_bytes: Optional[int] = None + tx_rdma_ccl_cts_retx_pkts: Optional[int] = None + rx_rdma_mtu_discard_pkts: Optional[int] = None + error_fields: ClassVar[list[str]] = [ + "req_rx_pkt_seq_err", + "req_rx_rnr_retry_err", + "req_rx_rmt_acc_err", + "req_rx_rmt_req_err", + "req_rx_oper_err", + "req_rx_impl_nak_seq_err", + 
"req_rx_cqe_err", + "req_rx_cqe_flush", + "req_rx_dup_response", + "req_rx_inval_pkts", + "req_tx_loc_acc_err", + "req_tx_loc_oper_err", + "req_tx_mem_mgmt_err", + "req_tx_retry_excd_err", + "req_tx_loc_sgl_inv_err", + "resp_rx_dup_request", + "resp_rx_outof_buf", + "resp_rx_outouf_seq", + "resp_rx_cqe_err", + "resp_rx_cqe_flush", + "resp_rx_loc_len_err", + "resp_rx_inval_request", + "resp_rx_loc_oper_err", + "resp_rx_outof_atomic", + "resp_tx_pkt_seq_err", + "resp_tx_rmt_inval_req_err", + "resp_tx_rmt_acc_err", + "resp_tx_rmt_oper_err", + "resp_tx_rnr_retry_err", + "resp_tx_loc_sgl_inv_err", + "resp_rx_s0_table_err", + "resp_rx_ccl_cts_outouf_seq", + "tx_rdma_ack_timeout", + "tx_rdma_ccl_cts_ack_timeout", + "tx_rdma_retx_bytes", + "tx_rdma_retx_pkts", + "tx_rdma_ccl_cts_retx_bytes", + "tx_rdma_ccl_cts_retx_pkts", + "rx_rdma_mtu_discard_pkts", + ] -class RdmaStatistics(BaseModel): - """RDMA statistic entry from 'rdma statistic -j'.""" + critial_error_fields: ClassVar[list[str]] = [] + + +class Thor2RdmaStatistics(BaseModel): + """ifname bnxt""" + + active_pds: Optional[int] = None + active_ahs: Optional[int] = None + active_qps: Optional[int] = None + active_rc_qps: Optional[int] = None + active_ud_qps: Optional[int] = None + active_srqs: Optional[int] = None + active_cqs: Optional[int] = None + active_mrs: Optional[int] = None + active_mws: Optional[int] = None + watermark_pds: Optional[int] = None + watermark_ahs: Optional[int] = None + watermark_qps: Optional[int] = None + watermark_rc_qps: Optional[int] = None + watermark_ud_qps: Optional[int] = None + watermark_srqs: Optional[int] = None + watermark_cqs: Optional[int] = None + watermark_mrs: Optional[int] = None + watermark_mws: Optional[int] = None + rx_pkts: Optional[int] = None + rx_bytes: Optional[int] = None + tx_pkts: Optional[int] = None + tx_bytes: Optional[int] = None + recoverable_errors: Optional[int] = None + tx_roce_errors: Optional[int] = None + tx_roce_discards: Optional[int] = None + 
rx_roce_errors: Optional[int] = None + rx_roce_discards: Optional[int] = None + local_ack_timeout_err: Optional[int] = None + packet_seq_err: Optional[int] = None + max_retry_exceeded: Optional[int] = None + rnr_nak_retry_err: Optional[int] = None + implied_nak_seq_err: Optional[int] = None + unrecoverable_err: Optional[int] = None + bad_resp_err: Optional[int] = None + local_qp_op_err: Optional[int] = None + local_protection_err: Optional[int] = None + mem_mgmt_op_err: Optional[int] = None + req_remote_invalid_request: Optional[int] = None + req_remote_access_errors: Optional[int] = None + remote_op_err: Optional[int] = None + duplicate_request: Optional[int] = None + res_exceed_max: Optional[int] = None + resp_local_length_error: Optional[int] = None + res_exceeds_wqe: Optional[int] = None + res_opcode_err: Optional[int] = None + res_rx_invalid_rkey: Optional[int] = None + res_rx_domain_err: Optional[int] = None + res_rx_no_perm: Optional[int] = None + res_rx_range_err: Optional[int] = None + res_tx_invalid_rkey: Optional[int] = None + res_tx_domain_err: Optional[int] = None + res_tx_no_perm: Optional[int] = None + res_tx_range_err: Optional[int] = None + res_irrq_oflow: Optional[int] = None + res_unsup_opcode: Optional[int] = None + res_unaligned_atomic: Optional[int] = None + res_rem_inv_err: Optional[int] = None + res_mem_err: Optional[int] = None + res_srq_err: Optional[int] = None + res_cmp_err: Optional[int] = None + res_invalid_dup_rkey: Optional[int] = None + res_wqe_format_err: Optional[int] = None + res_cq_load_err: Optional[int] = None + res_srq_load_err: Optional[int] = None + res_tx_pci_err: Optional[int] = None + res_rx_pci_err: Optional[int] = None + tx_atomic_req: Optional[int] = None + tx_read_req: Optional[int] = None + tx_read_resp: Optional[int] = None + tx_write_req: Optional[int] = None + tx_send_req: Optional[int] = None + rx_atomic_requests: Optional[int] = None + rx_read_requests: Optional[int] = None + rx_read_resp: Optional[int] = None 
+ rx_write_requests: Optional[int] = None + rx_send_req: Optional[int] = None + rx_good_pkts: Optional[int] = None + rx_good_bytes: Optional[int] = None + out_of_buffer: Optional[int] = None + np_cnp_sent: Optional[int] = None + rp_cnp_handled: Optional[int] = None + np_ecn_marked_roce_packets: Optional[int] = None + out_of_sequence: Optional[int] = None + pacing_reschedule: Optional[int] = None + pacing_complete: Optional[int] = None + pacing_alerts: Optional[int] = None + db_fifo_register: Optional[int] = None + req_cqe_error: Optional[int] = None + req_cqe_flush_error: Optional[int] = None + resp_cqe_error: Optional[int] = None + resp_cqe_flush_error: Optional[int] = None + resp_remote_access_errors: Optional[int] = None + roce_adp_retrans: Optional[int] = None + roce_adp_retrans_to: Optional[int] = None + roce_slow_restart: Optional[int] = None + roce_slow_restart_cnps: Optional[int] = None + roce_slow_restart_trans: Optional[int] = None + rp_cnp_ignored: Optional[int] = None + rx_icrc_encapsulated: Optional[int] = None + + error_fields: ClassVar[list[str]] = [ + "recoverable_errors", + "tx_roce_errors", + "tx_roce_discards", + "rx_roce_errors", + "rx_roce_discards", + "local_ack_timeout_err", + "packet_seq_err", + "max_retry_exceeded", + "rnr_nak_retry_err", + "implied_nak_seq_err", + "bad_resp_err", + "local_qp_op_err", + "local_protection_err", + "mem_mgmt_op_err", + "req_remote_invalid_request", + "req_remote_access_errors", + "remote_op_err", + "duplicate_request", + "res_exceed_max", + "resp_local_length_error", + "res_exceeds_wqe", + "res_opcode_err", + "res_rx_invalid_rkey", + "res_rx_domain_err", + "res_rx_no_perm", + "res_rx_range_err", + "res_tx_invalid_rkey", + "res_tx_domain_err", + "res_tx_no_perm", + "res_tx_range_err", + "res_irrq_oflow", + "res_unsup_opcode", + "res_unaligned_atomic", + "res_rem_inv_err", + "res_srq_err", + "res_cmp_err", + "res_invalid_dup_rkey", + "res_wqe_format_err", + "res_cq_load_err", + "res_srq_load_err", + 
"out_of_buffer", + "out_of_sequence", + "req_cqe_error", + "req_cqe_flush_error", + "resp_cqe_error", + "resp_cqe_flush_error", + "resp_remote_access_errors", + "roce_adp_retrans", + "roce_adp_retrans_to", + "rp_cnp_ignored", + "rx_icrc_encapsulated", + ] + + critial_error_fields: ClassVar[list[str]] = [ + "unrecoverable_err", + "res_tx_pci_err", + "res_rx_pci_err", + "res_mem_err", + ] + + +class Cx7RdmaStatistics(BaseModel): + """ifname mlx""" + + rx_write_requests: Optional[int] = None + rx_read_requests: Optional[int] = None + rx_atomic_requests: Optional[int] = None + rx_dct_connect: Optional[int] = None + out_of_buffer: Optional[int] = None + out_of_sequence: Optional[int] = None + duplicate_request: Optional[int] = None + rnr_nak_retry_err: Optional[int] = None + packet_seq_err: Optional[int] = None + implied_nak_seq_err: Optional[int] = None + local_ack_timeout_err: Optional[int] = None + resp_local_length_error: Optional[int] = None + resp_cqe_error: Optional[int] = None + req_cqe_error: Optional[int] = None + req_remote_invalid_request: Optional[int] = None + req_remote_access_errors: Optional[int] = None + resp_remote_access_errors: Optional[int] = None + resp_cqe_flush_error: Optional[int] = None + req_cqe_flush_error: Optional[int] = None + roce_adp_retrans: Optional[int] = None + roce_adp_retrans_to: Optional[int] = None + roce_slow_restart: Optional[int] = None + roce_slow_restart_cnps: Optional[int] = None + roce_slow_restart_trans: Optional[int] = None + rp_cnp_ignored: Optional[int] = None + rp_cnp_handled: Optional[int] = None + np_ecn_marked_roce_packets: Optional[int] = None + np_cnp_sent: Optional[int] = None + rx_icrc_encapsulated: Optional[int] = None + + error_fields: ClassVar[list[str]] = [ + "out_of_buffer", + "out_of_sequence", + "duplicate_request", + "rnr_nak_retry_err", + "packet_seq_err", + "implied_nak_seq_err", + "local_ack_timeout_err", + "resp_local_length_error", + "resp_cqe_error", + "req_cqe_error", + 
"req_remote_invalid_request", + "req_remote_access_errors", + "resp_remote_access_errors", + "resp_cqe_flush_error", + "req_cqe_flush_error", + "roce_adp_retrans", + "roce_adp_retrans_to", + "rp_cnp_ignored", + "rx_icrc_encapsulated", + ] - model_config = ConfigDict(extra="allow") + critial_error_fields: ClassVar[list[str]] = [] + +RdmaVendorStatistics = Union[PollaraRdmaStatistics, Thor2RdmaStatistics, Cx7RdmaStatistics] + +# Map ifname prefixes to vendor-specific statistic models +VENDOR_PREFIX_MAP: dict[str, type[RdmaVendorStatistics]] = { + "ionic": PollaraRdmaStatistics, + "bnxt": Thor2RdmaStatistics, + "mlx": Cx7RdmaStatistics, +} + + +class RdmaStatistics(BaseModel): + # Interface information ifname: Optional[str] = None port: Optional[int] = None + vendor_statistics: Optional[RdmaVendorStatistics] = None @model_validator(mode="after") - def validate_at_least_one_field(self) -> Self: + def validate_atleast_one_field(self) -> Self: if not self.model_fields_set: raise ValueError("At least one field must be set in RdmaStatistics") return self class RdmaLink(BaseModel): - """RDMA link entry from 'rdma link -j' (JSON).""" - + # Interface and port information ifindex: Optional[int] = None ifname: Optional[str] = None port: Optional[int] = None @@ -70,12 +391,24 @@ class RdmaLink(BaseModel): netdev_index: Optional[int] = None @model_validator(mode="after") - def validate_at_least_one_field(self) -> Self: + def validate_atleast_one_field(self) -> Self: if not self.model_fields_set: raise ValueError("At least one field must be set in RdmaLink") return self +class RdmaDevice(BaseModel): + """RDMA device from 'rdma dev' (text output).""" + + device: str + node_type: Optional[str] = None + transport: Optional[str] = None + node_guid: Optional[str] = None + sys_image_guid: Optional[str] = None + state: Optional[str] = None + attributes: dict[str, str] = Field(default_factory=dict) + + class RdmaLinkText(BaseModel): """RDMA link from 'rdma link' (text output).""" @@ 
-92,10 +425,12 @@ class RdmaDataModel(DataModel): Data model for RDMA (Remote Direct Memory Access) statistics and link information. Attributes: - statistic_list: List of RDMA statistics from 'rdma statistic -j'. - link_list: List of RDMA links from 'rdma link -j' (JSON). - dev_list: List of RDMA devices from 'rdma dev' (text). - link_list_text: List of RDMA links from 'rdma link' (text). + statistic_list: RDMA statistics from 'rdma statistic -j'. Each entry has + ifname, port, and vendor_statistics (ionic/bnxt/mlx counters) when the + interface prefix matches a known vendor. + link_list: RDMA links from 'rdma link -j' (JSON). + dev_list: RDMA devices from 'rdma dev' (text). + link_list_text: RDMA links from 'rdma link' (text). """ link_list: list[RdmaLink] = Field(default_factory=list) diff --git a/test/unit/plugin/test_rdma_analyzer.py b/test/unit/plugin/test_rdma_analyzer.py index 2f477b11..ada67c04 100644 --- a/test/unit/plugin/test_rdma_analyzer.py +++ b/test/unit/plugin/test_rdma_analyzer.py @@ -25,15 +25,21 @@ ############################################################################### import json from pathlib import Path +from typing import Optional import pytest from nodescraper.enums import EventPriority, ExecutionStatus from nodescraper.plugins.inband.rdma.rdma_analyzer import RdmaAnalyzer from nodescraper.plugins.inband.rdma.rdmadata import ( + VENDOR_PREFIX_MAP, + Cx7RdmaStatistics, + PollaraRdmaStatistics, RdmaDataModel, RdmaLink, RdmaStatistics, + RdmaVendorStatistics, + Thor2RdmaStatistics, ) @@ -48,91 +54,108 @@ def plugin_fixtures_path(): @pytest.fixture -def clean_rdma_model(plugin_fixtures_path): - """RDMA data with no errors (all counters zero).""" +def example_stat_dicts(plugin_fixtures_path): path = plugin_fixtures_path / "rdma_statistic_example_data.json" - data = json.loads(path.read_text()) - stats = [RdmaStatistics(**s) for s in data] - return RdmaDataModel(statistic_list=stats) + return json.loads(path.read_text()) + + +def 
_build_stats(data: list[dict]) -> list[RdmaStatistics]: + """Build RdmaStatistics list from raw dicts using vendor prefix map.""" + stats = [] + for entry in data: + ifname = entry.get("ifname", "") + vendor_stats: Optional[RdmaVendorStatistics] = None + for prefix, vendor_cls in VENDOR_PREFIX_MAP.items(): + if ifname.startswith(prefix): + vendor_stats = vendor_cls(**entry) + break + stats.append( + RdmaStatistics( + ifname=entry.get("ifname"), + port=entry.get("port"), + vendor_statistics=vendor_stats, + ) + ) + return stats @pytest.fixture -def clean_stats(plugin_fixtures_path): - """List of clean RdmaStatistics (no errors) for building models with links.""" - path = plugin_fixtures_path / "rdma_statistic_example_data.json" - data = json.loads(path.read_text()) - return [RdmaStatistics(**s) for s in data] +def clean_rdma_model(example_stat_dicts): + return RdmaDataModel(statistic_list=_build_stats(example_stat_dicts)) + + +@pytest.fixture +def clean_stats(example_stat_dicts): + return _build_stats(example_stat_dicts) def test_no_errors_detected(rdma_analyzer, clean_rdma_model): - """Test with nominal data that has no errors.""" result = rdma_analyzer.analyze_data(clean_rdma_model) assert result.status == ExecutionStatus.OK assert len(result.events) == 0 -def test_single_error_detected(rdma_analyzer, clean_rdma_model): - """Test with data containing a single error.""" - stats = list(clean_rdma_model.statistic_list) - stats[0].tx_roce_errors = 5 - model = RdmaDataModel(statistic_list=stats) +def test_single_error_detected(rdma_analyzer, example_stat_dicts): + stats_with_error = _build_stats(example_stat_dicts) + stats_with_error[0].vendor_statistics.req_rx_pkt_seq_err = 5 + model = RdmaDataModel(statistic_list=stats_with_error) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.ERROR assert "RDMA errors detected in statistics" in result.message assert len(result.events) == 1 - assert result.events[0].description == "RDMA error 
detected on bnxt_re0: [tx_roce_errors]" + assert result.events[0].description == "RDMA error detected: req_rx_pkt_seq_err" assert result.events[0].priority == EventPriority.ERROR - assert result.events[0].data["errors"] == {"tx_roce_errors": 5} - assert result.events[0].data["interface"] == "bnxt_re0" + assert result.events[0].data["error_count"] == 5 + assert result.events[0].data["interface"] == "ionic_0" -def test_multiple_errors_detected(rdma_analyzer, clean_rdma_model): - """Test with data containing multiple errors (grouped per interface).""" - stats = list(clean_rdma_model.statistic_list) - stats[0].tx_roce_errors = 10 - stats[0].rx_roce_errors = 3 - stats[1].packet_seq_err = 7 - model = RdmaDataModel(statistic_list=stats) +def test_multiple_errors_detected(rdma_analyzer, example_stat_dicts): + stats_with_errors = _build_stats(example_stat_dicts) + stats_with_errors[0].vendor_statistics.req_rx_rmt_acc_err = 10 + stats_with_errors[0].vendor_statistics.req_tx_loc_oper_err = 3 + stats_with_errors[8].vendor_statistics.packet_seq_err = 7 + model = RdmaDataModel(statistic_list=stats_with_errors) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.ERROR assert "RDMA errors detected in statistics" in result.message - assert len(result.events) == 2 # one per interface + assert len(result.events) == 3 for event in result.events: assert event.priority == EventPriority.ERROR - # Total 3 errors across 2 interfaces - assert sum(len(e.data["errors"]) for e in result.events) == 3 -def test_critical_error_detected(rdma_analyzer, clean_rdma_model): - """Test with data containing a critical error (grouped per interface).""" - stats = list(clean_rdma_model.statistic_list) - stats[0].unrecoverable_err = 1 - stats[0].res_tx_pci_err = 2 +def test_critical_error_detected(rdma_analyzer): + stats = [ + RdmaStatistics( + ifname="bnxt_re_test", + port=1, + vendor_statistics=Thor2RdmaStatistics( + unrecoverable_err=1, + res_tx_pci_err=2, + ), + ) + ] model 
= RdmaDataModel(statistic_list=stats) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.ERROR assert "RDMA errors detected in statistics" in result.message - assert len(result.events) == 1 # one event per interface - assert result.events[0].priority == EventPriority.CRITICAL - assert "unrecoverable_err" in result.events[0].data["errors"] - assert "res_tx_pci_err" in result.events[0].data["errors"] + assert len(result.events) == 2 + critical_events = [e for e in result.events if e.priority == EventPriority.CRITICAL] + assert len(critical_events) == 2 def test_empty_statistics(rdma_analyzer): - """Test with empty statistics list: WARNING and message logged.""" model = RdmaDataModel(statistic_list=[], link_list=[]) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.WARNING assert result.message == "No RDMA devices found" -def test_multiple_interfaces_with_errors(rdma_analyzer, clean_rdma_model): - """Test with errors across multiple interfaces.""" - stats = list(clean_rdma_model.statistic_list) - stats[0].max_retry_exceeded = 15 - stats[2].local_ack_timeout_err = 8 - stats[4].out_of_buffer = 100 - model = RdmaDataModel(statistic_list=stats) +def test_multiple_interfaces_with_errors(rdma_analyzer, example_stat_dicts): + stats_multi_errors = _build_stats(example_stat_dicts) + stats_multi_errors[0].vendor_statistics.req_rx_pkt_seq_err = 15 + stats_multi_errors[2].vendor_statistics.tx_rdma_ack_timeout = 8 + stats_multi_errors[8].vendor_statistics.out_of_buffer = 100 + model = RdmaDataModel(statistic_list=stats_multi_errors) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.ERROR assert len(result.events) == 3 @@ -141,44 +164,58 @@ def test_multiple_interfaces_with_errors(rdma_analyzer, clean_rdma_model): def test_all_error_types(rdma_analyzer): - """Test that all error fields are properly detected (grouped in one event).""" - stats = RdmaStatistics( - ifname="bnxt_re_test", - 
port=1, - recoverable_errors=1, - tx_roce_errors=1, - unrecoverable_err=1, - ) - model = RdmaDataModel(statistic_list=[stats]) + stats = [ + RdmaStatistics( + ifname="ionic_test", + port=1, + vendor_statistics=PollaraRdmaStatistics( + req_rx_pkt_seq_err=1, + req_tx_loc_oper_err=1, + ), + ), + RdmaStatistics( + ifname="mlx5_test", + port=1, + vendor_statistics=Cx7RdmaStatistics( + packet_seq_err=1, + ), + ), + ] + model = RdmaDataModel(statistic_list=stats) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.ERROR - assert len(result.events) == 1 # one event per interface - assert "unrecoverable_err" in result.events[0].data["errors"] - assert result.events[0].priority == EventPriority.CRITICAL - assert set(result.events[0].data["errors"].keys()) == { - "recoverable_errors", - "tx_roce_errors", - "unrecoverable_err", - } + assert len(result.events) == 3 + interfaces = {event.data["interface"] for event in result.events} + assert interfaces == {"ionic_test", "mlx5_test"} def test_zero_errors_are_ignored(rdma_analyzer): - """Test that zero-value errors are not reported.""" - stats = RdmaStatistics( - ifname="bnxt_re_test", - port=1, - tx_roce_errors=0, - rx_roce_errors=0, - unrecoverable_err=0, - ) - model = RdmaDataModel(statistic_list=[stats]) + stats = [ + RdmaStatistics( + ifname="ionic_test", + port=1, + vendor_statistics=PollaraRdmaStatistics( + req_rx_pkt_seq_err=0, + req_rx_rnr_retry_err=0, + tx_rdma_ack_timeout=0, + ), + ), + RdmaStatistics( + ifname="mlx5_test", + port=1, + vendor_statistics=Cx7RdmaStatistics( + packet_seq_err=0, + out_of_buffer=0, + ), + ), + ] + model = RdmaDataModel(statistic_list=stats) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.OK assert len(result.events) == 0 def test_rdma_link_all_active(rdma_analyzer, clean_stats): - """Test with RDMA links that are all active and up.""" links = [ RdmaLink( ifindex=0, @@ -207,7 +244,6 @@ def 
test_rdma_link_all_active(rdma_analyzer, clean_stats): def test_rdma_link_down_detected(rdma_analyzer, clean_stats): - """Test with RDMA links that are down""" links = [ RdmaLink( ifindex=0, @@ -230,12 +266,10 @@ def test_rdma_link_down_detected(rdma_analyzer, clean_stats): ] model = RdmaDataModel(statistic_list=clean_stats, link_list=links) result = rdma_analyzer.analyze_data(model) - # Current implementation only checks statistics, not link state assert result.status == ExecutionStatus.OK def test_rdma_link_empty_list(rdma_analyzer, clean_stats): - """Test with empty RDMA link list.""" model = RdmaDataModel(statistic_list=clean_stats, link_list=[]) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.OK @@ -243,7 +277,6 @@ def test_rdma_link_empty_list(rdma_analyzer, clean_stats): def test_rdma_link_multiple_interfaces(rdma_analyzer, clean_stats): - """Test with multiple RDMA interfaces with different link states.""" links = [ RdmaLink( ifindex=0, From 193980843583847ed21e00637307bc06de631045 Mon Sep 17 00:00:00 2001 From: Jaspal Singh Date: Tue, 21 Apr 2026 21:13:29 +0000 Subject: [PATCH 2/3] rdma fix --- .../plugins/inband/rdma/rdma_analyzer.py | 156 +- .../plugins/inband/rdma/rdma_collector.py | 81 +- .../fixtures/rdma_statistic_example_data.json | 1276 +++++++---------- test/unit/plugin/test_rdma_collector.py | 130 +- 4 files changed, 754 insertions(+), 889 deletions(-) diff --git a/nodescraper/plugins/inband/rdma/rdma_analyzer.py b/nodescraper/plugins/inband/rdma/rdma_analyzer.py index 163602b0..00f6977f 100644 --- a/nodescraper/plugins/inband/rdma/rdma_analyzer.py +++ b/nodescraper/plugins/inband/rdma/rdma_analyzer.py @@ -37,106 +37,12 @@ class RdmaAnalyzer(DataAnalyzer[RdmaDataModel, None]): DATA_MODEL = RdmaDataModel - # Error fields checked from rdma statistic output (bnxt_re, mlx5, ionic, etc.) 
- ERROR_FIELDS = [ - "recoverable_errors", - "tx_roce_errors", - "tx_roce_discards", - "rx_roce_errors", - "rx_roce_discards", - "local_ack_timeout_err", - "packet_seq_err", - "max_retry_exceeded", - "rnr_nak_retry_err", - "implied_nak_seq_err", - "unrecoverable_err", - "bad_resp_err", - "local_qp_op_err", - "local_protection_err", - "mem_mgmt_op_err", - "req_remote_invalid_request", - "req_remote_access_errors", - "remote_op_err", - "duplicate_request", - "res_exceed_max", - "resp_local_length_error", - "res_exceeds_wqe", - "res_opcode_err", - "res_rx_invalid_rkey", - "res_rx_domain_err", - "res_rx_no_perm", - "res_rx_range_err", - "res_tx_invalid_rkey", - "res_tx_domain_err", - "res_tx_no_perm", - "res_tx_range_err", - "res_irrq_oflow", - "res_unsup_opcode", - "res_unaligned_atomic", - "res_rem_inv_err", - "res_mem_err", - "res_srq_err", - "res_cmp_err", - "res_invalid_dup_rkey", - "res_wqe_format_err", - "res_cq_load_err", - "res_srq_load_err", - "res_tx_pci_err", - "res_rx_pci_err", - "out_of_buffer", - "out_of_sequence", - "req_cqe_error", - "req_cqe_flush_error", - "resp_cqe_error", - "resp_cqe_flush_error", - "resp_remote_access_errors", - "req_rx_pkt_seq_err", - "req_rx_rnr_retry_err", - "req_rx_rmt_acc_err", - "req_rx_rmt_req_err", - "req_rx_oper_err", - "req_rx_impl_nak_seq_err", - "req_rx_cqe_err", - "req_rx_cqe_flush", - "req_rx_dup_response", - "req_rx_inval_pkts", - "req_tx_loc_acc_err", - "req_tx_loc_oper_err", - "req_tx_mem_mgmt_err", - "req_tx_retry_excd_err", - "req_tx_loc_sgl_inv_err", - "resp_rx_dup_request", - "resp_rx_outof_buf", - "resp_rx_outouf_seq", - "resp_rx_cqe_err", - "resp_rx_cqe_flush", - "resp_rx_loc_len_err", - "resp_rx_inval_request", - "resp_rx_loc_oper_err", - "resp_rx_outof_atomic", - "resp_tx_pkt_seq_err", - "resp_tx_rmt_inval_req_err", - "resp_tx_rmt_acc_err", - "resp_tx_rmt_oper_err", - "resp_tx_rnr_retry_err", - "resp_tx_loc_sgl_inv_err", - "resp_rx_s0_table_err", - "resp_rx_ccl_cts_outouf_seq", - "tx_rdma_ack_timeout", - 
"tx_rdma_ccl_cts_ack_timeout", - "rx_rdma_mtu_discard_pkts", - ] - - CRITICAL_ERROR_FIELDS = [ - "unrecoverable_err", - "res_tx_pci_err", - "res_rx_pci_err", - "res_mem_err", - ] - def analyze_data(self, data: RdmaDataModel, args: Optional[None] = None) -> TaskResult: """Analyze RDMA statistics for non-zero error counters. + Error and critical counter names come from each vendor's statistics model + (ionic / bnxt / mlx prefixes). + Args: data: RDMA data model with statistic_list (and optionally link_list). args: Unused (analyzer has no configurable args). @@ -150,32 +56,36 @@ def analyze_data(self, data: RdmaDataModel, args: Optional[None] = None) -> Task return self.result error_state = False - for idx, stat in enumerate(data.statistic_list): - errors_on_interface = [] # (error_field, value, is_critical) - for error_field in self.ERROR_FIELDS: - value = getattr(stat, error_field, None) - if value is not None and value > 0: - is_critical = error_field in self.CRITICAL_ERROR_FIELDS - errors_on_interface.append((error_field, value, is_critical)) - if errors_on_interface: - error_state = True - interface_label = stat.ifname or "unknown" - error_names = [e[0] for e in errors_on_interface] - any_critical = any(e[2] for e in errors_on_interface) - priority = EventPriority.CRITICAL if any_critical else EventPriority.ERROR - errors_data = {field: value for field, value, _ in errors_on_interface} - self._log_event( - category=EventCategory.IO, - description=f"RDMA error detected on {interface_label}: [{', '.join(error_names)}]", - data={ - "interface": stat.ifname, - "port": stat.port, - "errors": errors_data, - "statistic_index": idx, - }, - priority=priority, - console_log=True, - ) + + for stat in data.statistic_list: + if stat.vendor_statistics is None: + continue + + error_fields = stat.vendor_statistics.error_fields + critical_fields = stat.vendor_statistics.critial_error_fields + + for error_field in error_fields + critical_fields: + error_value = 
getattr(stat.vendor_statistics, error_field, None) + + if error_value is not None and error_value > 0: + priority = ( + EventPriority.CRITICAL + if error_field in critical_fields + else EventPriority.ERROR + ) + self._log_event( + category=EventCategory.NETWORK, + description=f"RDMA error detected: {error_field}", + data={ + "interface": stat.ifname, + "port": stat.port, + "error_field": error_field, + "error_count": error_value, + }, + priority=priority, + console_log=True, + ) + error_state = True if error_state: self.result.message = "RDMA errors detected in statistics" diff --git a/nodescraper/plugins/inband/rdma/rdma_collector.py b/nodescraper/plugins/inband/rdma/rdma_collector.py index b5e01b5c..a719a334 100644 --- a/nodescraper/plugins/inband/rdma/rdma_collector.py +++ b/nodescraper/plugins/inband/rdma/rdma_collector.py @@ -34,7 +34,15 @@ from nodescraper.models import TaskResult from nodescraper.utils import get_exception_traceback -from .rdmadata import RdmaDataModel, RdmaDevice, RdmaLink, RdmaLinkText, RdmaStatistics +from .rdmadata import ( + VENDOR_PREFIX_MAP, + RdmaDataModel, + RdmaDevice, + RdmaLink, + RdmaLinkText, + RdmaStatistics, + RdmaVendorStatistics, +) class RdmaCollector(InBandDataCollector[RdmaDataModel, None]): @@ -61,7 +69,7 @@ def _run_rdma_command(self, cmd: str) -> Optional[list[dict]]: if res.exit_code != 0: self._log_event( - category=EventCategory.NETWORK, + category=EventCategory.APPLICATION, description=f"Error running rdma command: {cmd}", data={ "command": cmd, @@ -80,7 +88,7 @@ def _run_rdma_command(self, cmd: str) -> Optional[list[dict]]: return json.loads(res.stdout) except json.JSONDecodeError as e: self._log_event( - category=EventCategory.NETWORK, + category=EventCategory.APPLICATION, description=f"Error parsing command: {cmd} json data", data={ "cmd": cmd, @@ -172,7 +180,11 @@ def _parse_rdma_link_text(self, output: str) -> list[RdmaLinkText]: return links def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: - 
"""Get RDMA statistics from 'rdma statistic -j'.""" + """Get RDMA statistics from 'rdma statistic -j'. + + Warns on unexpected or missing fields relative to the vendor-specific model + for the interface prefix (ionic / bnxt / mlx). + """ stat_data = self._run_rdma_command(self.CMD_STATISTIC) if stat_data is None: return None @@ -184,21 +196,70 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: for stat in stat_data: if not isinstance(stat, dict): self._log_event( - category=EventCategory.NETWORK, + category=EventCategory.APPLICATION, description="Invalid data type for RDMA statistic", data={"data_type": type(stat).__name__}, priority=EventPriority.WARNING, ) continue - statistics.append(RdmaStatistics(**stat)) + + ifname = stat.get("ifname", "") + vendor_stats: Optional[RdmaVendorStatistics] = None + for prefix, vendor_cls in VENDOR_PREFIX_MAP.items(): + if ifname.startswith(prefix): + vendor_fields = set(vendor_cls.model_fields.keys()) + stat_fields = set(stat.keys()) - {"ifname", "port"} + + extra_fields = stat_fields - vendor_fields + if extra_fields: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Unexpected fields in RDMA statistic for {ifname}", + data={ + "interface": ifname, + "extra_fields": sorted(extra_fields), + }, + priority=EventPriority.WARNING, + ) + + missing_fields = vendor_fields - stat_fields + if missing_fields: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Missing fields in RDMA statistic for {ifname}", + data={ + "interface": ifname, + "missing_fields": sorted(missing_fields), + }, + priority=EventPriority.WARNING, + ) + + try: + vendor_stats = vendor_cls(**stat) + except ValidationError as ve: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Failed to build vendor model for {ifname}", + data={"exception": get_exception_traceback(ve)}, + priority=EventPriority.WARNING, + ) + break + + rdma_stat = RdmaStatistics( + ifname=stat.get("ifname"), + 
port=stat.get("port"), + vendor_statistics=vendor_stats, + ) + statistics.append(rdma_stat) + return statistics except ValidationError as e: self._log_event( - category=EventCategory.NETWORK, + category=EventCategory.APPLICATION, description="Failed to build RdmaStatistics model", data={"exception": get_exception_traceback(e)}, priority=EventPriority.WARNING, ) - return statistics + return None def _get_rdma_link(self) -> Optional[list[RdmaLink]]: """Get RDMA link data from 'rdma link -j'.""" @@ -213,7 +274,7 @@ def _get_rdma_link(self) -> Optional[list[RdmaLink]]: for link in link_data: if not isinstance(link, dict): self._log_event( - category=EventCategory.NETWORK, + category=EventCategory.APPLICATION, description="Invalid data type for RDMA link", data={"data_type": type(link).__name__}, priority=EventPriority.WARNING, @@ -223,7 +284,7 @@ def _get_rdma_link(self) -> Optional[list[RdmaLink]]: return links except ValidationError as e: self._log_event( - category=EventCategory.NETWORK, + category=EventCategory.APPLICATION, description="Failed to build RdmaLink model", data={"exception": get_exception_traceback(e)}, priority=EventPriority.WARNING, diff --git a/test/unit/plugin/fixtures/rdma_statistic_example_data.json b/test/unit/plugin/fixtures/rdma_statistic_example_data.json index e338e41a..6f0a33ed 100644 --- a/test/unit/plugin/fixtures/rdma_statistic_example_data.json +++ b/test/unit/plugin/fixtures/rdma_statistic_example_data.json @@ -1,826 +1,598 @@ [ { - "ifname": "bnxt_re0", + "ifname": "ionic_0", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 12, - "watermark_ahs": 8, - "watermark_qps": 229, - "watermark_rc_qps": 220, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 94, - "watermark_mrs": 305, - "watermark_mws": 0, - "rx_pkts": 3504998440, - "rx_bytes": 2966950848, - "tx_pkts": 
2747190987, - "tx_bytes": 912073550, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 3324056122, - "tx_read_resp": 3324056122, - "tx_write_req": 622240024, - "tx_send_req": 97500, - "rx_atomic_requests": 0, - "rx_read_requests": 3324056122, - "rx_read_resp": 3324056122, - "rx_write_requests": 626374468, - "rx_send_req": 97500, - "rx_good_pkts": 1401322762, - "rx_good_bytes": 2966950848, - "out_of_buffer": 0, - "np_cnp_sent": 2873487760, - "rp_cnp_handled": 2103675678, - "np_ecn_marked_roce_packets": 2873487760, - "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, - "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, - "resp_remote_access_errors": 0, - "roce_adp_retrans": 0, - "roce_adp_retrans_to": 0, - "roce_slow_restart": 0, - "roce_slow_restart_cnps": 0, 
- "roce_slow_restart_trans": 0, - "rp_cnp_ignored": 0, - "rx_icrc_encapsulated": 0 + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 }, { - "ifname": "bnxt_re1", + "ifname": "ionic_1", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 14, - "watermark_ahs": 3, - "watermark_qps": 228, - "watermark_rc_qps": 219, - "watermark_ud_qps": 8, - "watermark_srqs": 
8, - "watermark_cqs": 94, - "watermark_mrs": 287, - "watermark_mws": 0, - "rx_pkts": 1509751895, - "rx_bytes": 3099873130, - "tx_pkts": 692925073, - "tx_bytes": 2068663286, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 3322387232, - "tx_read_resp": 3322387232, - "tx_write_req": 620621144, - "tx_send_req": 0, - "rx_atomic_requests": 0, - "rx_read_requests": 3322387232, - "rx_read_resp": 3322387232, - "rx_write_requests": 621181433, - "rx_send_req": 0, - "rx_good_pkts": 3507768689, - "rx_good_bytes": 3099873130, - "out_of_buffer": 0, - "np_cnp_sent": 1097578610, - "rp_cnp_handled": 2296950502, - "np_ecn_marked_roce_packets": 1097578610, - "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, - "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, - 
"resp_remote_access_errors": 0, - "roce_adp_retrans": 0, - "roce_adp_retrans_to": 0, - "roce_slow_restart": 0, - "roce_slow_restart_cnps": 0, - "roce_slow_restart_trans": 0, - "rp_cnp_ignored": 0, - "rx_icrc_encapsulated": 0 + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 }, { - "ifname": "bnxt_re2", + "ifname": "ionic_2", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 
0, - "watermark_pds": 13, - "watermark_ahs": 4, - "watermark_qps": 230, - "watermark_rc_qps": 221, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 95, - "watermark_mrs": 294, - "watermark_mws": 0, - "rx_pkts": 2328181128, - "rx_bytes": 79750872, - "tx_pkts": 1404869338, - "tx_bytes": 644434628, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 3212760135, - "tx_read_resp": 3212760135, - "tx_write_req": 1995861174, - "tx_send_req": 0, - "rx_atomic_requests": 0, - "rx_read_requests": 3212760135, - "rx_read_resp": 3212760135, - "rx_write_requests": 1995579948, - "rx_send_req": 0, - "rx_good_pkts": 4025638368, - "rx_good_bytes": 79750872, - "out_of_buffer": 0, - "np_cnp_sent": 4174752904, - "rp_cnp_handled": 2597510056, - "np_ecn_marked_roce_packets": 4174752904, - "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - 
"db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, - "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, - "resp_remote_access_errors": 0, - "roce_adp_retrans": 0, - "roce_adp_retrans_to": 0, - "roce_slow_restart": 0, - "roce_slow_restart_cnps": 0, - "roce_slow_restart_trans": 0, - "rp_cnp_ignored": 0, - "rx_icrc_encapsulated": 0 + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 }, { - "ifname": "bnxt_re3", + "ifname": "ionic_3", "port": 1, - "active_pds": 1, - "active_ahs": 0, - 
"active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 12, - "watermark_ahs": 7, - "watermark_qps": 229, - "watermark_rc_qps": 220, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 95, - "watermark_mrs": 292, - "watermark_mws": 0, - "rx_pkts": 3888070733, - "rx_bytes": 3748987850, - "tx_pkts": 2265082996, - "tx_bytes": 3715380316, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 3103369202, - "tx_read_resp": 3103369202, - "tx_write_req": 3370635080, - "tx_send_req": 0, - "rx_atomic_requests": 0, - "rx_read_requests": 3103369202, - "rx_read_resp": 3103369202, - "rx_write_requests": 3368547249, - "rx_send_req": 0, - "rx_good_pkts": 2688805201, - "rx_good_bytes": 3748987850, - "out_of_buffer": 0, - "np_cnp_sent": 134598312, - "rp_cnp_handled": 1199265532, - 
"np_ecn_marked_roce_packets": 134598312, - "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, - "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, - "resp_remote_access_errors": 0, - "roce_adp_retrans": 0, - "roce_adp_retrans_to": 0, - "roce_slow_restart": 0, - "roce_slow_restart_cnps": 0, - "roce_slow_restart_trans": 0, - "rp_cnp_ignored": 0, - "rx_icrc_encapsulated": 0 + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 
0, + "rx_rdma_mtu_discard_pkts": 0 }, { - "ifname": "bnxt_re4", + "ifname": "ionic_4", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 12, - "watermark_ahs": 6, - "watermark_qps": 230, - "watermark_rc_qps": 221, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 95, - "watermark_mrs": 302, - "watermark_mws": 0, - "rx_pkts": 986831570, - "rx_bytes": 1185181414, - "tx_pkts": 1975828812, - "tx_bytes": 2763928250, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 2993618119, - "tx_read_resp": 2993618119, - "tx_write_req": 449606302, - "tx_send_req": 37687, + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 
0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 + }, + { + "ifname": "ionic_5", + "port": 1, + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + 
"req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 + }, + { + "ifname": "ionic_6", + "port": 1, + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + 
"resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + "tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 + }, + { + "ifname": "ionic_7", + "port": 1, + "tx_rdma_ucast_bytes": 0, + "tx_rdma_ucast_pkts": 0, + "tx_rdma_mcast_bytes": 0, + "tx_rdma_mcast_pkts": 0, + "tx_rdma_cnp_pkts": 0, + "rx_rdma_ucast_bytes": 0, + "rx_rdma_ucast_pkts": 0, + "rx_rdma_mcast_bytes": 0, + "rx_rdma_mcast_pkts": 0, + "rx_rdma_cnp_pkts": 0, + "rx_rdma_ecn_pkts": 0, + "req_rx_pkt_seq_err": 0, + "req_rx_rnr_retry_err": 0, + "req_rx_rmt_acc_err": 0, + "req_rx_rmt_req_err": 0, + "req_rx_oper_err": 0, + "req_rx_impl_nak_seq_err": 0, + "req_rx_cqe_err": 0, + "req_rx_cqe_flush": 0, + "req_rx_dup_response": 0, + "req_rx_inval_pkts": 0, + "req_tx_loc_acc_err": 0, + "req_tx_loc_oper_err": 0, + "req_tx_mem_mgmt_err": 0, + "req_tx_retry_excd_err": 0, + "req_tx_loc_sgl_inv_err": 0, + "resp_rx_dup_request": 0, + "resp_rx_outof_buf": 0, + "resp_rx_outouf_seq": 0, + "resp_rx_cqe_err": 0, + "resp_rx_cqe_flush": 0, + "resp_rx_loc_len_err": 0, + "resp_rx_inval_request": 0, + "resp_rx_loc_oper_err": 0, + "resp_rx_outof_atomic": 0, + "resp_tx_pkt_seq_err": 0, + "resp_tx_rmt_inval_req_err": 0, + "resp_tx_rmt_acc_err": 0, + "resp_tx_rmt_oper_err": 0, + "resp_tx_rnr_retry_err": 0, + "resp_tx_loc_sgl_inv_err": 0, + "resp_rx_s0_table_err": 0, + "tx_rdma_ccl_cts_bytes": 0, + "tx_rdma_ccl_cts_pkts": 0, + "rx_rdma_ccl_cts_bytes": 0, + "rx_rdma_ccl_cts_pkts": 0, + "resp_rx_ccl_cts_outouf_seq": 0, + "tx_rdma_ack_timeout": 0, + "tx_rdma_ccl_cts_ack_timeout": 0, + 
"tx_rdma_retx_bytes": 0, + "tx_rdma_retx_pkts": 0, + "tx_rdma_ccl_cts_retx_bytes": 0, + "tx_rdma_ccl_cts_retx_pkts": 0, + "rx_rdma_mtu_discard_pkts": 0 + }, + { + "ifname": "mlx5_0", + "port": 1, + "rx_write_requests": 0, + "rx_read_requests": 0, "rx_atomic_requests": 0, - "rx_read_requests": 2993618119, - "rx_read_resp": 2993618119, - "rx_write_requests": 448485514, - "rx_send_req": 37687, - "rx_good_pkts": 2876478595, - "rx_good_bytes": 1185181414, + "rx_dct_connect": 0, "out_of_buffer": 0, - "np_cnp_sent": 3525492995, - "rp_cnp_handled": 2405320271, - "np_ecn_marked_roce_packets": 3525492995, "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, + "duplicate_request": 0, + "rnr_nak_retry_err": 0, + "packet_seq_err": 0, + "implied_nak_seq_err": 0, + "local_ack_timeout_err": 0, + "resp_local_length_error": 0, "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, + "req_cqe_error": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, "resp_remote_access_errors": 0, + "resp_cqe_flush_error": 0, + "req_cqe_flush_error": 0, "roce_adp_retrans": 0, "roce_adp_retrans_to": 0, "roce_slow_restart": 0, "roce_slow_restart_cnps": 0, "roce_slow_restart_trans": 0, "rp_cnp_ignored": 0, + "rp_cnp_handled": 0, + "np_ecn_marked_roce_packets": 0, + "np_cnp_sent": 0, "rx_icrc_encapsulated": 0 }, { - "ifname": "bnxt_re5", + "ifname": "mlx5_1", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 13, - "watermark_ahs": 7, - "watermark_qps": 228, - "watermark_rc_qps": 219, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 94, - "watermark_mrs": 287, - "watermark_mws": 0, - "rx_pkts": 3602164391, - "rx_bytes": 515322372, - "tx_pkts": 3498885620, - "tx_bytes": 3601952844, - 
"recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 2883798845, - "tx_read_resp": 2883798845, - "tx_write_req": 1822414941, - "tx_send_req": 0, + "rx_write_requests": 0, + "rx_read_requests": 0, "rx_atomic_requests": 0, - "rx_read_requests": 2883798845, - "rx_read_resp": 2883798845, - "rx_write_requests": 1819507161, - "rx_send_req": 0, - "rx_good_pkts": 1576292710, - "rx_good_bytes": 515322372, + "rx_dct_connect": 0, "out_of_buffer": 0, - "np_cnp_sent": 4093842522, - "rp_cnp_handled": 2025871681, - "np_ecn_marked_roce_packets": 4093842522, "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, + "duplicate_request": 0, + "rnr_nak_retry_err": 0, + "packet_seq_err": 0, + "implied_nak_seq_err": 0, + "local_ack_timeout_err": 0, + "resp_local_length_error": 0, 
"resp_cqe_error": 0, - "resp_cqe_flush_error": 0, + "req_cqe_error": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, "resp_remote_access_errors": 0, + "resp_cqe_flush_error": 0, + "req_cqe_flush_error": 0, "roce_adp_retrans": 0, "roce_adp_retrans_to": 0, "roce_slow_restart": 0, "roce_slow_restart_cnps": 0, "roce_slow_restart_trans": 0, "rp_cnp_ignored": 0, + "rp_cnp_handled": 0, + "np_ecn_marked_roce_packets": 0, + "np_cnp_sent": 0, "rx_icrc_encapsulated": 0 }, { - "ifname": "bnxt_re6", + "ifname": "mlx5_2", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 13, - "watermark_ahs": 7, - "watermark_qps": 230, - "watermark_rc_qps": 221, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 95, - "watermark_mrs": 294, - "watermark_mws": 0, - "rx_pkts": 2577272275, - "rx_bytes": 2249875450, - "tx_pkts": 2452138468, - "tx_bytes": 700557582, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - 
"res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 2775090592, - "tx_read_resp": 2775090592, - "tx_write_req": 3201764210, - "tx_send_req": 0, + "rx_write_requests": 0, + "rx_read_requests": 0, "rx_atomic_requests": 0, - "rx_read_requests": 2775090592, - "rx_read_resp": 2775090592, - "rx_write_requests": 3201655162, - "rx_send_req": 0, - "rx_good_pkts": 1197866395, - "rx_good_bytes": 2249875450, + "rx_dct_connect": 0, "out_of_buffer": 0, - "np_cnp_sent": 2401103251, - "rp_cnp_handled": 1379405880, - "np_ecn_marked_roce_packets": 2401103251, "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, + "duplicate_request": 0, + "rnr_nak_retry_err": 0, + "packet_seq_err": 0, + "implied_nak_seq_err": 0, + "local_ack_timeout_err": 0, + "resp_local_length_error": 0, "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, + "req_cqe_error": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, "resp_remote_access_errors": 0, + "resp_cqe_flush_error": 0, + "req_cqe_flush_error": 0, "roce_adp_retrans": 0, "roce_adp_retrans_to": 0, "roce_slow_restart": 0, "roce_slow_restart_cnps": 0, "roce_slow_restart_trans": 0, "rp_cnp_ignored": 0, + "rp_cnp_handled": 0, + "np_ecn_marked_roce_packets": 0, + "np_cnp_sent": 0, "rx_icrc_encapsulated": 0 }, { - "ifname": "bnxt_re7", + "ifname": "mlx5_3", "port": 1, - "active_pds": 1, - "active_ahs": 0, - "active_qps": 1, - "active_rc_qps": 0, - "active_ud_qps": 0, - "active_srqs": 0, - "active_cqs": 1, - "active_mrs": 0, - "active_mws": 0, - "watermark_pds": 13, - "watermark_ahs": 6, - "watermark_qps": 228, - "watermark_rc_qps": 219, - "watermark_ud_qps": 8, - "watermark_srqs": 8, - "watermark_cqs": 94, - "watermark_mrs": 287, - "watermark_mws": 0, - "rx_pkts": 
1606921676, - "rx_bytes": 4007942950, - "tx_pkts": 1249198409, - "tx_bytes": 25134278, - "recoverable_errors": 0, - "tx_roce_errors": 0, - "tx_roce_discards": 0, - "rx_roce_errors": 0, - "rx_roce_discards": 0, - "local_ack_timeout_err": 0, - "packet_seq_err": 0, - "max_retry_exceeded": 0, - "rnr_nak_retry_err": 0, - "implied_nak_seq_err": 0, - "unrecoverable_err": 0, - "bad_resp_err": 0, - "local_qp_op_err": 0, - "local_protection_err": 0, - "mem_mgmt_op_err": 0, - "req_remote_invalid_request": 0, - "req_remote_access_errors": 0, - "remote_op_err": 0, - "duplicate_request": 0, - "res_exceed_max": 0, - "resp_local_length_error": 0, - "res_exceeds_wqe": 0, - "res_opcode_err": 0, - "res_rx_invalid_rkey": 0, - "res_rx_domain_err": 0, - "res_rx_no_perm": 0, - "res_rx_range_err": 0, - "res_tx_invalid_rkey": 0, - "res_tx_domain_err": 0, - "res_tx_no_perm": 0, - "res_tx_range_err": 0, - "res_irrq_oflow": 0, - "res_unsup_opcode": 0, - "res_unaligned_atomic": 0, - "res_rem_inv_err": 0, - "res_mem_err": 0, - "res_srq_err": 0, - "res_cmp_err": 0, - "res_invalid_dup_rkey": 0, - "res_wqe_format_err": 0, - "res_cq_load_err": 0, - "res_srq_load_err": 0, - "res_tx_pci_err": 0, - "res_rx_pci_err": 0, - "tx_atomic_req": 0, - "tx_read_req": 2665758274, - "tx_read_resp": 2665758274, - "tx_write_req": 284646587, - "tx_send_req": 0, + "rx_write_requests": 0, + "rx_read_requests": 0, "rx_atomic_requests": 0, - "rx_read_requests": 2665758274, - "rx_read_resp": 2665758274, - "rx_write_requests": 284542358, - "rx_send_req": 0, - "rx_good_pkts": 253070639, - "rx_good_bytes": 4007942950, + "rx_dct_connect": 0, "out_of_buffer": 0, - "np_cnp_sent": 2670842510, - "rp_cnp_handled": 1353851037, - "np_ecn_marked_roce_packets": 2670842510, "out_of_sequence": 0, - "pacing_reschedule": 0, - "pacing_complete": 0, - "pacing_alerts": 0, - "db_fifo_register": 2147450881, - "req_cqe_error": 0, - "req_cqe_flush_error": 0, + "duplicate_request": 0, + "rnr_nak_retry_err": 0, + "packet_seq_err": 0, + 
"implied_nak_seq_err": 0, + "local_ack_timeout_err": 0, + "resp_local_length_error": 0, "resp_cqe_error": 0, - "resp_cqe_flush_error": 0, + "req_cqe_error": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, "resp_remote_access_errors": 0, + "resp_cqe_flush_error": 0, + "req_cqe_flush_error": 0, "roce_adp_retrans": 0, "roce_adp_retrans_to": 0, "roce_slow_restart": 0, "roce_slow_restart_cnps": 0, "roce_slow_restart_trans": 0, "rp_cnp_ignored": 0, + "rp_cnp_handled": 0, + "np_ecn_marked_roce_packets": 0, + "np_cnp_sent": 0, "rx_icrc_encapsulated": 0 } ] diff --git a/test/unit/plugin/test_rdma_collector.py b/test/unit/plugin/test_rdma_collector.py index 595e7e33..eb687c54 100644 --- a/test/unit/plugin/test_rdma_collector.py +++ b/test/unit/plugin/test_rdma_collector.py @@ -23,12 +23,13 @@ # SOFTWARE. # ############################################################################### +import json from pathlib import Path import pytest from nodescraper.connection.inband.inband import CommandArtifact -from nodescraper.enums import ExecutionStatus, OSFamily +from nodescraper.enums import EventPriority, ExecutionStatus, OSFamily from nodescraper.enums.systeminteraction import SystemInteractionLevel from nodescraper.plugins.inband.rdma.rdma_collector import RdmaCollector from nodescraper.plugins.inband.rdma.rdmadata import RdmaDataModel @@ -70,9 +71,10 @@ def test_collect_success(collector, conn_mock, rdma_link_output, rdma_statistic_ assert res.status == ExecutionStatus.OK assert data is not None assert isinstance(data, RdmaDataModel) - # Full statistic fixture has 8 devices (bnxt_re0..bnxt_re7) with full stats - assert len(data.statistic_list) == 8 - assert data.statistic_list[0].ifname == "bnxt_re0" + assert len(data.statistic_list) == 12 + assert data.statistic_list[0].ifname == "ionic_0" + assert data.statistic_list[0].vendor_statistics is not None + assert data.statistic_list[0].vendor_statistics.tx_rdma_ucast_bytes == 0 # Full link fixture has 4 
ionic links assert len(data.link_list) == 4 assert data.link_list[0].ifname == "ionic_0" @@ -154,3 +156,123 @@ def test_parse_rdma_link_text_empty(collector): """Test parsing empty rdma link (text) output.""" links = collector._parse_rdma_link_text("") assert len(links) == 0 + + +def test_collect_extra_fields_warning(collector, conn_mock): + """Extra keys in a statistic row produce a warning event but collection succeeds.""" + collector.system_info.os_family = OSFamily.LINUX + stat_data = [ + { + "ifname": "ionic_0", + "port": 1, + "tx_rdma_ucast_bytes": 0, + "unknown_field_1": 42, + "unknown_field_2": 99, + } + ] + conn_mock.run_command.side_effect = [ + CommandArtifact(exit_code=0, stdout="[]", stderr="", command="rdma link -j"), + CommandArtifact( + exit_code=0, + stdout=json.dumps(stat_data), + stderr="", + command="rdma statistic -j", + ), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma dev"), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma link"), + ] + res, data = collector.collect_data() + assert res.status == ExecutionStatus.OK + assert data is not None + assert len(data.statistic_list) == 1 + assert data.statistic_list[0].vendor_statistics is not None + extra_events = [e for e in res.events if "Unexpected fields" in e.description] + assert len(extra_events) == 1 + assert extra_events[0].priority == EventPriority.WARNING + assert "unknown_field_1" in extra_events[0].data["extra_fields"] + assert "unknown_field_2" in extra_events[0].data["extra_fields"] + + +def test_collect_missing_fields_warning(collector, conn_mock): + """Missing vendor fields produce a warning event.""" + collector.system_info.os_family = OSFamily.LINUX + stat_data = [{"ifname": "ionic_0", "port": 1, "tx_rdma_ucast_bytes": 0}] + conn_mock.run_command.side_effect = [ + CommandArtifact(exit_code=0, stdout="[]", stderr="", command="rdma link -j"), + CommandArtifact( + exit_code=0, + stdout=json.dumps(stat_data), + stderr="", + command="rdma statistic 
-j", + ), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma dev"), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma link"), + ] + res, data = collector.collect_data() + assert res.status == ExecutionStatus.OK + assert data is not None + assert len(data.statistic_list) == 1 + missing_events = [e for e in res.events if "Missing fields" in e.description] + assert len(missing_events) == 1 + assert missing_events[0].priority == EventPriority.WARNING + assert "tx_rdma_ucast_pkts" in missing_events[0].data["missing_fields"] + + +def test_collect_extra_and_missing_fields_warning(collector, conn_mock): + """Both extra and missing vendor keys produce separate warnings (mlx).""" + collector.system_info.os_family = OSFamily.LINUX + stat_data = [ + { + "ifname": "mlx5_0", + "port": 1, + "rx_write_requests": 0, + "brand_new_counter": 7, + } + ] + conn_mock.run_command.side_effect = [ + CommandArtifact(exit_code=0, stdout="[]", stderr="", command="rdma link -j"), + CommandArtifact( + exit_code=0, + stdout=json.dumps(stat_data), + stderr="", + command="rdma statistic -j", + ), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma dev"), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma link"), + ] + res, data = collector.collect_data() + assert res.status == ExecutionStatus.OK + assert data is not None + extra_events = [e for e in res.events if "Unexpected fields" in e.description] + missing_events = [e for e in res.events if "Missing fields" in e.description] + assert len(extra_events) == 1 + assert len(missing_events) == 1 + assert "brand_new_counter" in extra_events[0].data["extra_fields"] + assert "rx_read_requests" in missing_events[0].data["missing_fields"] + + +def test_collect_no_field_warnings_when_fixture_matches( + collector, conn_mock, rdma_statistic_output +): + """Full fixture rows match vendor models: no missing/extra field warnings.""" + collector.system_info.os_family = OSFamily.LINUX + 
conn_mock.run_command.side_effect = [ + CommandArtifact(exit_code=0, stdout="[]", stderr="", command="rdma link -j"), + CommandArtifact( + exit_code=0, + stdout=rdma_statistic_output, + stderr="", + command="rdma statistic -j", + ), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma dev"), + CommandArtifact(exit_code=0, stdout="", stderr="", command="rdma link"), + ] + res, data = collector.collect_data() + assert res.status == ExecutionStatus.OK + assert data is not None + drift = [ + e + for e in res.events + if "Unexpected fields" in e.description or "Missing fields" in e.description + ] + assert drift == [] From a0a5449034c6e14d20bf9a2cd26a36f37b662afc Mon Sep 17 00:00:00 2001 From: jaspals Date: Thu, 23 Apr 2026 16:00:30 -0500 Subject: [PATCH 3/3] event category fix --- .../plugins/inband/rdma/rdma_collector.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/nodescraper/plugins/inband/rdma/rdma_collector.py b/nodescraper/plugins/inband/rdma/rdma_collector.py index a719a334..67f4073b 100644 --- a/nodescraper/plugins/inband/rdma/rdma_collector.py +++ b/nodescraper/plugins/inband/rdma/rdma_collector.py @@ -69,7 +69,7 @@ def _run_rdma_command(self, cmd: str) -> Optional[list[dict]]: if res.exit_code != 0: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description=f"Error running rdma command: {cmd}", data={ "command": cmd, @@ -88,7 +88,7 @@ def _run_rdma_command(self, cmd: str) -> Optional[list[dict]]: return json.loads(res.stdout) except json.JSONDecodeError as e: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description=f"Error parsing command: {cmd} json data", data={ "cmd": cmd, @@ -196,7 +196,7 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: for stat in stat_data: if not isinstance(stat, dict): self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description="Invalid 
data type for RDMA statistic", data={"data_type": type(stat).__name__}, priority=EventPriority.WARNING, @@ -213,7 +213,7 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: extra_fields = stat_fields - vendor_fields if extra_fields: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description=f"Unexpected fields in RDMA statistic for {ifname}", data={ "interface": ifname, @@ -225,7 +225,7 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: missing_fields = vendor_fields - stat_fields if missing_fields: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description=f"Missing fields in RDMA statistic for {ifname}", data={ "interface": ifname, @@ -238,7 +238,7 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: vendor_stats = vendor_cls(**stat) except ValidationError as ve: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description=f"Failed to build vendor model for {ifname}", data={"exception": get_exception_traceback(ve)}, priority=EventPriority.WARNING, @@ -254,7 +254,7 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: return statistics except ValidationError as e: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description="Failed to build RdmaStatistics model", data={"exception": get_exception_traceback(e)}, priority=EventPriority.WARNING, @@ -274,7 +274,7 @@ def _get_rdma_link(self) -> Optional[list[RdmaLink]]: for link in link_data: if not isinstance(link, dict): self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description="Invalid data type for RDMA link", data={"data_type": type(link).__name__}, priority=EventPriority.WARNING, @@ -284,7 +284,7 @@ def _get_rdma_link(self) -> Optional[list[RdmaLink]]: return links except ValidationError as e: self._log_event( - 
category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description="Failed to build RdmaLink model", data={"exception": get_exception_traceback(e)}, priority=EventPriority.WARNING,