From ad9d567d2ce3544d153af514c335899878f3d63c Mon Sep 17 00:00:00 2001 From: "zero@thinky" Date: Sun, 5 Apr 2026 08:19:34 +0300 Subject: [PATCH] feat(backend): add root cause calculation --- backend/cmd/main.go | 3 +- backend/internal/handlers/graph.go | 221 ++++++++++++++++++++++++++++- 2 files changed, 217 insertions(+), 7 deletions(-) diff --git a/backend/cmd/main.go b/backend/cmd/main.go index 02ea424..3f9f679 100644 --- a/backend/cmd/main.go +++ b/backend/cmd/main.go @@ -110,7 +110,7 @@ func main() { if graphPath == "" { graphPath = "/etc/hellreign/services.yaml" } - graphHandlers := handlers.NewGraphHandlers(graphPath) + graphHandlers := handlers.NewGraphHandlers(graphPath, coll) agents := handlers.NewAgentsGroup(h, coll) auth := handlers.AuthGroup{Handlers: h} @@ -226,6 +226,7 @@ func main() { graphGroup.PUT("", graphHandlers.UpdateYAML) graphGroup.GET("/order", graphHandlers.StartupOrder) graphGroup.GET("/cycle", graphHandlers.CycleCheck) + graphGroup.GET("/failure", graphHandlers.GetFailureRootCause) } // Agent registration diff --git a/backend/internal/handlers/graph.go b/backend/internal/handlers/graph.go index 63b74c0..7b1273e 100644 --- a/backend/internal/handlers/graph.go +++ b/backend/internal/handlers/graph.go @@ -5,23 +5,26 @@ import ( "log" "net/http" "os" + "strings" "sync" "gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/graph" + "gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/grpcsrv/collector" "github.com/gin-gonic/gin" ) // GraphHandlers manages the service dependency graph. type GraphHandlers struct { - path string - mu sync.RWMutex - yamlData []byte - loaded *graph.Graph + path string + mu sync.RWMutex + yamlData []byte + loaded *graph.Graph + collector *collector.Collector } // NewGraphHandlers loads the graph from the given YAML file path. 
-func NewGraphHandlers(yamlPath string) *GraphHandlers {
-	h := &GraphHandlers{path: yamlPath}
+func NewGraphHandlers(yamlPath string, coll *collector.Collector) *GraphHandlers {
+	h := &GraphHandlers{path: yamlPath, collector: coll}
 	if err := h.reload(); err != nil {
 		if _, ok := err.(*os.PathError); ok {
 			log.Printf("[graph] no graph file at %q, starting with empty graph", yamlPath)
@@ -147,3 +150,209 @@ func (h *GraphHandlers) CycleCheck(c *gin.Context) {
 
 	c.JSON(http.StatusOK, gin.H{"has_cycle": g.HasCycle()})
 }
+
+// ServiceStatusOut is the JSON view of a single service and its reported state.
+type ServiceStatusOut struct {
+	NodeID  string `json:"node_id"` // graph node the service belongs to
+	Name    string `json:"name"`    // service name within that node
+	Status  string `json:"status"`  // raw status string taken from the status map
+	Healthy bool   `json:"healthy"` // health verdict derived from Status
+}
+
+// FailureRootCauseOut is the response body produced by the failure analysis.
+type FailureRootCauseOut struct {
+	Affected        ServiceStatusOut  `json:"affected"`                   // the service that was asked about
+	RootCause       *ServiceStatusOut `json:"root_cause,omitempty"`       // omitted when no unhealthy dependency is found
+	DependencyChain []string          `json:"dependency_chain,omitempty"` // "node:service" keys walked by the analysis
+}
+
+// GetFailureRootCause analyzes the dependency graph and current service
+// statuses to find the root cause of a service failure.
+// If the requested service is unhealthy, it follows its unhealthy
+// dependencies and reports the deepest one reached as the root cause.
+// @Summary Find failure root cause
+// @Description Analyzes dependencies and service statuses to find the root cause of a failure
+// @Tags graph
+// @Param node_id query string false "Node ID (agent label)"
+// @Param service query string true "Service name"
+// @Produce json
+// @Success 200 {object} FailureRootCauseOut
+// @Failure 400 {object} map[string]string
+// @Failure 404 {object} map[string]string
+// @Security Bearer
+// @Router /graph/failure [get]
+func (h *GraphHandlers) GetFailureRootCause(c *gin.Context) {
+	svcName := c.Query("service")
+	if svcName == "" {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "service query param is required"})
+		return
+	}
+
+	// Snapshot the graph once so that service lookup, status mapping and the
+	// traversal all work on the same version, even if it is reloaded meanwhile.
+	h.mu.RLock()
+	g := h.loaded
+	h.mu.RUnlock()
+	if g == nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "no graph loaded"})
+		return
+	}
+
+	nodeID := c.Query("node_id")
+	if nodeID == "" {
+		// No node given: use the first graph node that hosts the service.
+		for _, node := range g.Nodes() {
+			if _, ok := g.GetService(node.ID, svcName); ok {
+				nodeID = node.ID
+				break
+			}
+		}
+		if nodeID == "" {
+			c.JSON(http.StatusNotFound, gin.H{"error": "service not found in graph"})
+			return
+		}
+	} else if _, ok := g.GetService(nodeID, svcName); !ok {
+		c.JSON(http.StatusNotFound, gin.H{"error": "service not found in node"})
+		return
+	}
+
+	// Statuses are collected only after validation so rejected requests stay cheap.
+	svcStatus := h.serviceStatusMapFor(g)
+	status := svcStatus[nodeID+":"+svcName]
+	out := FailureRootCauseOut{
+		Affected: ServiceStatusOut{
+			NodeID:  nodeID,
+			Name:    svcName,
+			Status:  status.status,
+			Healthy: status.healthy,
+		},
+	}
+
+	// Only an unhealthy service has a failure to explain; a service nobody
+	// reports on has the zero entry and is therefore treated as unhealthy.
+	if !status.healthy {
+		out.RootCause, out.DependencyChain = findRootCause(g, nodeID, svcName, svcStatus)
+	}
+	c.JSON(http.StatusOK, out)
+}
+
+// svcStatusEntry pairs a service's raw status string with its health verdict.
+// Looking up an unknown key yields the zero value: empty status, not healthy.
+type svcStatusEntry struct {
+	status  string
+	healthy bool
+}
+
+// buildServiceStatusMap creates a map of "nodeID:serviceName" → status for the
+// currently loaded graph. With no graph loaded only the agents' own keys are set.
+func (h *GraphHandlers) buildServiceStatusMap() map[string]svcStatusEntry {
+	h.mu.RLock()
+	g := h.loaded
+	h.mu.RUnlock()
+	return h.serviceStatusMapFor(g)
+}
+
+// serviceStatusMapFor builds the "nodeID:serviceName" → status map for graph g
+// from the services every collector agent reports. A nil collector yields an
+// empty map and a nil graph disables the per-node fallback.
+func (h *GraphHandlers) serviceStatusMapFor(g *graph.Graph) map[string]svcStatusEntry {
+	result := make(map[string]svcStatusEntry)
+	if h.collector == nil {
+		return result
+	}
+
+	var nodeIDs []string
+	if g != nil {
+		for _, node := range g.Nodes() {
+			nodeIDs = append(nodeIDs, node.ID)
+		}
+	}
+
+	for _, agent := range h.collector.Agents() {
+		for _, svc := range agent.Services {
+			entry := svcStatusEntry{status: svc.Status, healthy: isHealthyStatus(svc.Status)}
+
+			// An agent's own label is authoritative: this overwrites any fallback entry.
+			result[agent.Label+":"+svc.Name] = entry
+
+			// Fallback: graph nodes that have no status for this service name yet
+			// borrow this one; an existing entry is never replaced here.
+			for _, id := range nodeIDs {
+				key := id + ":" + svc.Name
+				if _, exists := result[key]; !exists {
+					result[key] = entry
+				}
+			}
+		}
+	}
+	return result
+}
+
+// findRootCause follows the chain of unhealthy dependencies and returns the deepest one with the path walked.
+func findRootCause(g *graph.Graph, nodeID, svcName string, statusMap map[string]svcStatusEntry) (*ServiceStatusOut, []string) {
+	// Each step descends into at most one dependency (the first unvisited,
+	// unhealthy one), so the walk is a single path and a plain loop suffices.
+	var (
+		culprit *ServiceStatusOut // deepest unhealthy dependency found so far
+		path    []string          // keys visited, in order, starting at the input
+	)
+	seen := make(map[string]bool)
+
+	curNode, curName := nodeID, svcName
+	for {
+		curKey := curNode + ":" + curName
+		path = append(path, curKey)
+		seen[curKey] = true
+
+		svc, found := g.GetService(curNode, curName)
+		if !found {
+			break // not described in the graph: nothing further to follow
+		}
+
+		descended := false
+		for _, dep := range svc.Dependencies {
+			depNode := dep.Target.NodeID
+			if depNode == "" {
+				depNode = curNode // an empty node ID falls back to the dependent's node
+			}
+			depKey := depNode + ":" + dep.Target.Name
+			if seen[depKey] {
+				continue // already on the path; skipping it avoids loops
+			}
+
+			// A key missing from statusMap yields the zero entry, i.e. unhealthy.
+			depState := statusMap[depKey]
+			if depState.healthy {
+				continue
+			}
+
+			culprit = &ServiceStatusOut{
+				NodeID:  depNode,
+				Name:    dep.Target.Name,
+				Status:  depState.status,
+				Healthy: false,
+			}
+			curNode, curName = depNode, dep.Target.Name
+			descended = true
+			break
+		}
+		if !descended {
+			break // every remaining dependency is healthy or already visited
+		}
+	}
+
+	// culprit is nil when no unhealthy dependency was found; path always holds
+	// at least the starting service's key.
+	return culprit, path
+}
+
+// isHealthyStatus reports whether status, compared case-insensitively, is
+// one of "running", "up" or "healthy".
+func isHealthyStatus(status string) bool {
+	switch strings.ToLower(status) {
+	case "running", "up", "healthy":
+		return true
+	default:
+		return false
+	}
+}