feat(backend): add root cause calculation
ci-agent / build (push) Has been cancelled

This commit is contained in:
2026-04-05 08:19:34 +03:00
parent c6c46aee68
commit ad9d567d2c
2 changed files with 217 additions and 7 deletions
+2 -1
View File
@@ -110,7 +110,7 @@ func main() {
if graphPath == "" { if graphPath == "" {
graphPath = "/etc/hellreign/services.yaml" graphPath = "/etc/hellreign/services.yaml"
} }
graphHandlers := handlers.NewGraphHandlers(graphPath) graphHandlers := handlers.NewGraphHandlers(graphPath, coll)
agents := handlers.NewAgentsGroup(h, coll) agents := handlers.NewAgentsGroup(h, coll)
auth := handlers.AuthGroup{Handlers: h} auth := handlers.AuthGroup{Handlers: h}
@@ -226,6 +226,7 @@ func main() {
graphGroup.PUT("", graphHandlers.UpdateYAML) graphGroup.PUT("", graphHandlers.UpdateYAML)
graphGroup.GET("/order", graphHandlers.StartupOrder) graphGroup.GET("/order", graphHandlers.StartupOrder)
graphGroup.GET("/cycle", graphHandlers.CycleCheck) graphGroup.GET("/cycle", graphHandlers.CycleCheck)
graphGroup.GET("/failure", graphHandlers.GetFailureRootCause)
} }
// Agent registration // Agent registration
+215 -6
View File
@@ -5,23 +5,26 @@ import (
"log" "log"
"net/http" "net/http"
"os" "os"
"strings"
"sync" "sync"
"gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/graph" "gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/graph"
"gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/grpcsrv/collector"
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
) )
// GraphHandlers manages the service dependency graph. // GraphHandlers manages the service dependency graph.
type GraphHandlers struct { type GraphHandlers struct {
path string path string
mu sync.RWMutex mu sync.RWMutex
yamlData []byte yamlData []byte
loaded *graph.Graph loaded *graph.Graph
collector *collector.Collector
} }
// NewGraphHandlers loads the graph from the given YAML file path. // NewGraphHandlers loads the graph from the given YAML file path.
func NewGraphHandlers(yamlPath string) *GraphHandlers { func NewGraphHandlers(yamlPath string, coll *collector.Collector) *GraphHandlers {
h := &GraphHandlers{path: yamlPath} h := &GraphHandlers{path: yamlPath, collector: coll}
if err := h.reload(); err != nil { if err := h.reload(); err != nil {
if _, ok := err.(*os.PathError); ok { if _, ok := err.(*os.PathError); ok {
log.Printf("[graph] no graph file at %q, starting with empty graph", yamlPath) log.Printf("[graph] no graph file at %q, starting with empty graph", yamlPath)
@@ -147,3 +150,209 @@ func (h *GraphHandlers) CycleCheck(c *gin.Context) {
c.JSON(http.StatusOK, gin.H{"has_cycle": g.HasCycle()}) c.JSON(http.StatusOK, gin.H{"has_cycle": g.HasCycle()})
} }
// ServiceStatusOut is the JSON representation of a single service together
// with the status the failure analysis currently knows for it.
type ServiceStatusOut struct {
// NodeID is the ID of the graph node the service was resolved on.
NodeID string `json:"node_id"`
// Name is the service name as used in the dependency graph.
Name string `json:"name"`
// Status is the raw status string taken from the status map; it is empty
// when the map has no entry for the service.
Status string `json:"status"`
// Healthy tells whether Status counts as a healthy state.
Healthy bool `json:"healthy"`
}
// FailureRootCauseOut is the JSON response of the failure analysis endpoint.
type FailureRootCauseOut struct {
// Affected is the service the analysis was requested for.
Affected ServiceStatusOut `json:"affected"`
// RootCause is the unhealthy dependency identified as the origin of the
// failure. It is left out of the JSON when it is nil.
RootCause *ServiceStatusOut `json:"root_cause,omitempty"`
// DependencyChain lists "nodeID:serviceName" keys. It is left out of the
// JSON when it is empty.
DependencyChain []string `json:"dependency_chain,omitempty"`
}
// GetFailureRootCause analyzes the dependency graph and the current service
// statuses to find the root cause of a service failure.
//
// The service is looked up on the node named by node_id; when node_id is
// omitted, the first graph node that declares the service is used. A healthy
// service is returned as-is, without a root cause. For an unhealthy service
// the handler follows its unhealthy dependencies and reports the one found by
// that traversal together with the chain of "node:service" keys visited.
//
// A service that has no entry in the status map is reported with an empty
// status and is treated as unhealthy.
//
// @Summary Find failure root cause
// @Description Analyzes dependencies and service statuses to find the root cause of a failure
// @Tags graph
// @Param node_id query string false "Node ID (agent label)"
// @Param service query string true "Service name"
// @Produce json
// @Success 200 {object} FailureRootCauseOut
// @Failure 400 {object} map[string]string
// @Failure 404 {object} map[string]string
// @Security Bearer
// @Router /graph/failure [get]
func (h *GraphHandlers) GetFailureRootCause(c *gin.Context) {
	nodeID := c.Query("node_id")
	svcName := c.Query("service")
	if svcName == "" {
		c.JSON(http.StatusBadRequest, gin.H{"error": "service query param is required"})
		return
	}

	// Take the graph pointer under the lock and work on that value afterwards.
	h.mu.RLock()
	g := h.loaded
	h.mu.RUnlock()
	if g == nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": "no graph loaded"})
		return
	}

	if nodeID == "" {
		// No node given: pick the first node that declares the service.
		for _, node := range g.Nodes() {
			if _, ok := g.GetService(node.ID, svcName); ok {
				nodeID = node.ID
				break
			}
		}
		if nodeID == "" {
			c.JSON(http.StatusNotFound, gin.H{"error": "service not found in graph"})
			return
		}
	} else if _, ok := g.GetService(nodeID, svcName); !ok {
		c.JSON(http.StatusNotFound, gin.H{"error": "service not found in node"})
		return
	}

	// Build the status map only once the request is known to be valid, so
	// that rejected requests do not pay for walking every agent and node.
	svcStatus := h.buildServiceStatusMap()

	// A missing entry yields the zero value: empty status, not healthy.
	status := svcStatus[nodeID+":"+svcName]
	affected := ServiceStatusOut{
		NodeID:  nodeID,
		Name:    svcName,
		Status:  status.status,
		Healthy: status.healthy,
	}

	// A healthy service has no failure to analyze.
	if status.healthy {
		c.JSON(http.StatusOK, FailureRootCauseOut{Affected: affected})
		return
	}

	// A nil root cause is dropped from the JSON by omitempty, so it can be
	// assigned unconditionally.
	rootCause, chain := findRootCause(g, nodeID, svcName, svcStatus)
	c.JSON(http.StatusOK, FailureRootCauseOut{
		Affected:        affected,
		RootCause:       rootCause,
		DependencyChain: chain,
	})
}
// svcStatusEntry holds parsed status info for one service.
// The zero value (what a lookup of a missing map key returns) has an empty
// status and healthy == false.
type svcStatusEntry struct {
// status is the raw status string as reported for the service.
status string
// healthy is the result of classifying status with isHealthyStatus.
healthy bool
}
// buildServiceStatusMap creates a map of "nodeID:serviceName" → status from
// the services reported by every agent the collector returns.
//
// Each reported service is stored under its agent's label, overwriting any
// earlier entry for that key, so a graph node whose ID equals an agent label
// always ends up with that agent's own report. The same entry is also stored
// for every graph node that has no entry for that service name yet, which
// acts as a fallback for nodes without a matching agent label.
//
// The result is always non-nil. It is empty when the handler has no
// collector, and it contains only the agent-label keys when no graph is
// loaded.
func (h *GraphHandlers) buildServiceStatusMap() map[string]svcStatusEntry {
	result := make(map[string]svcStatusEntry)
	if h.collector == nil {
		return result
	}

	h.mu.RLock()
	g := h.loaded
	h.mu.RUnlock()

	// The graph may be absent (nothing loaded yet); in that case there are
	// simply no nodes to fall back to.
	var nodeIDs []string
	if g != nil {
		for _, node := range g.Nodes() {
			nodeIDs = append(nodeIDs, node.ID)
		}
	}

	for _, agent := range h.collector.Agents() {
		for _, svc := range agent.Services {
			entry := svcStatusEntry{
				status:  svc.Status,
				healthy: isHealthyStatus(svc.Status),
			}

			// The agent's own key always takes the agent's own report.
			result[agent.Label+":"+svc.Name] = entry

			// Fallback: nodes that have no status for this service yet
			// inherit this report; existing entries are left untouched.
			for _, id := range nodeIDs {
				nodeKey := id + ":" + svc.Name
				if _, exists := result[nodeKey]; !exists {
					result[nodeKey] = entry
				}
			}
		}
	}
	return result
}
// findRootCause walks the dependency graph starting at nodeID/svcName and
// follows, at every step, the first dependency that is not healthy according
// to statusMap and has not been visited before.
//
// It returns the last unhealthy dependency reached this way (nil when the
// starting service has none) and the "nodeID:serviceName" keys of every
// service visited, in visiting order, beginning with the starting service.
// A dependency without a NodeID is resolved on the node of the service that
// depends on it. A dependency missing from statusMap counts as unhealthy.
func findRootCause(g *graph.Graph, nodeID, svcName string, statusMap map[string]svcStatusEntry) (*ServiceStatusOut, []string) {
	var (
		path    []string
		culprit *ServiceStatusOut
	)
	seen := make(map[string]bool)

	curNode, curName := nodeID, svcName
	for {
		curKey := curNode + ":" + curName
		path = append(path, curKey)
		seen[curKey] = true

		svc, found := g.GetService(curNode, curName)
		if !found {
			// Not described in the graph: nothing further to follow.
			break
		}

		descended := false
		for _, dep := range svc.Dependencies {
			targetNode := dep.Target.NodeID
			if targetNode == "" {
				targetNode = curNode
			}

			targetKey := targetNode + ":" + dep.Target.Name
			if seen[targetKey] {
				continue // already on the path; prevents looping on cycles
			}

			targetStatus := statusMap[targetKey]
			if targetStatus.healthy {
				continue
			}

			// First unhealthy dependency: it is the best candidate so far,
			// and the walk continues from it to look for a deeper one.
			culprit = &ServiceStatusOut{
				NodeID:  targetNode,
				Name:    dep.Target.Name,
				Status:  targetStatus.status,
				Healthy: false,
			}
			curNode, curName = targetNode, dep.Target.Name
			descended = true
			break
		}

		if !descended {
			break
		}
	}

	return culprit, path
}
// isHealthyStatus reports whether status, once lower-cased, is one of the
// values treated as healthy: "running", "up" or "healthy".
func isHealthyStatus(status string) bool {
	switch strings.ToLower(status) {
	case "running", "up", "healthy":
		return true
	default:
		return false
	}
}