This commit is contained in:
+2
-1
@@ -110,7 +110,7 @@ func main() {
|
||||
if graphPath == "" {
|
||||
graphPath = "/etc/hellreign/services.yaml"
|
||||
}
|
||||
graphHandlers := handlers.NewGraphHandlers(graphPath)
|
||||
graphHandlers := handlers.NewGraphHandlers(graphPath, coll)
|
||||
|
||||
agents := handlers.NewAgentsGroup(h, coll)
|
||||
auth := handlers.AuthGroup{Handlers: h}
|
||||
@@ -226,6 +226,7 @@ func main() {
|
||||
graphGroup.PUT("", graphHandlers.UpdateYAML)
|
||||
graphGroup.GET("/order", graphHandlers.StartupOrder)
|
||||
graphGroup.GET("/cycle", graphHandlers.CycleCheck)
|
||||
graphGroup.GET("/failure", graphHandlers.GetFailureRootCause)
|
||||
}
|
||||
|
||||
// Agent registration
|
||||
|
||||
@@ -5,23 +5,26 @@ import (
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/graph"
|
||||
"gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/grpcsrv/collector"
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
// GraphHandlers manages the service dependency graph.
|
||||
type GraphHandlers struct {
|
||||
path string
|
||||
mu sync.RWMutex
|
||||
yamlData []byte
|
||||
loaded *graph.Graph
|
||||
path string
|
||||
mu sync.RWMutex
|
||||
yamlData []byte
|
||||
loaded *graph.Graph
|
||||
collector *collector.Collector
|
||||
}
|
||||
|
||||
// NewGraphHandlers loads the graph from the given YAML file path.
|
||||
func NewGraphHandlers(yamlPath string) *GraphHandlers {
|
||||
h := &GraphHandlers{path: yamlPath}
|
||||
func NewGraphHandlers(yamlPath string, coll *collector.Collector) *GraphHandlers {
|
||||
h := &GraphHandlers{path: yamlPath, collector: coll}
|
||||
if err := h.reload(); err != nil {
|
||||
if _, ok := err.(*os.PathError); ok {
|
||||
log.Printf("[graph] no graph file at %q, starting with empty graph", yamlPath)
|
||||
@@ -147,3 +150,209 @@ func (h *GraphHandlers) CycleCheck(c *gin.Context) {
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{"has_cycle": g.HasCycle()})
|
||||
}
|
||||
|
||||
// ServiceStatusOut is the JSON representation of one service on one node
// together with its current status.
type ServiceStatusOut struct {
	// NodeID is the ID of the graph node the service belongs to.
	NodeID string `json:"node_id"`
	// Name is the service name.
	Name string `json:"name"`
	// Status is the status string as reported for the service.
	Status string `json:"status"`
	// Healthy tells whether Status was classified as a healthy state.
	Healthy bool `json:"healthy"`
}
|
||||
|
||||
// FailureRootCauseOut represents the result of a failure analysis.
|
||||
type FailureRootCauseOut struct {
|
||||
Affected ServiceStatusOut `json:"affected"`
|
||||
RootCause *ServiceStatusOut `json:"root_cause,omitempty"`
|
||||
DependencyChain []string `json:"dependency_chain,omitempty"`
|
||||
}
|
||||
|
||||
// GetFailureRootCause analyzes the dependency graph and current service
|
||||
// statuses to find the root cause of a service failure.
|
||||
// If the specified service is unhealthy, it traverses its dependencies
|
||||
// to find the first unhealthy dependency — the one that is the root cause.
|
||||
// @Summary Find failure root cause
|
||||
// @Description Analyzes dependencies and service statuses to find the root cause of a failure
|
||||
// @Tags graph
|
||||
// @Param node_id query string false "Node ID (agent label)"
|
||||
// @Param service query string true "Service name"
|
||||
// @Produce json
|
||||
// @Success 200 {object} FailureRootCauseOut
|
||||
// @Failure 400 {object} map[string]string
|
||||
// @Security Bearer
|
||||
// @Router /graph/failure [get]
|
||||
func (h *GraphHandlers) GetFailureRootCause(c *gin.Context) {
|
||||
nodeID := c.Query("node_id")
|
||||
svcName := c.Query("service")
|
||||
if svcName == "" {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "service query param is required"})
|
||||
return
|
||||
}
|
||||
|
||||
h.mu.RLock()
|
||||
g := h.loaded
|
||||
h.mu.RUnlock()
|
||||
|
||||
if g == nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "no graph loaded"})
|
||||
return
|
||||
}
|
||||
|
||||
// Build a map of service statuses from all agents
|
||||
svcStatus := h.buildServiceStatusMap()
|
||||
|
||||
// If no node specified, search all nodes for the service
|
||||
if nodeID == "" {
|
||||
for _, node := range g.Nodes() {
|
||||
if _, ok := g.GetService(node.ID, svcName); ok {
|
||||
nodeID = node.ID
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if nodeID == "" {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "service not found in graph"})
|
||||
return
|
||||
}
|
||||
|
||||
if _, ok := g.GetService(nodeID, svcName); !ok {
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": "service not found in node"})
|
||||
return
|
||||
}
|
||||
|
||||
// Get current status
|
||||
status := svcStatus[nodeID+":"+svcName]
|
||||
affected := ServiceStatusOut{
|
||||
NodeID: nodeID,
|
||||
Name: svcName,
|
||||
Status: status.status,
|
||||
Healthy: status.healthy,
|
||||
}
|
||||
|
||||
// If the service is healthy, no failure to analyze
|
||||
if status.healthy {
|
||||
c.JSON(http.StatusOK, FailureRootCauseOut{
|
||||
Affected: affected,
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// Find root cause: traverse dependencies to find the first unhealthy one
|
||||
rootCause, chain := findRootCause(g, nodeID, svcName, svcStatus)
|
||||
|
||||
out := FailureRootCauseOut{
|
||||
Affected: affected,
|
||||
DependencyChain: chain,
|
||||
}
|
||||
if rootCause != nil {
|
||||
out.RootCause = rootCause
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, out)
|
||||
}
|
||||
|
||||
// svcStatusEntry is the parsed status of a single service: the status string
// it was built from and the health verdict derived from it.
type svcStatusEntry struct {
	// status is the status string the entry was created with.
	status string
	// healthy is true when status was classified as a healthy state.
	healthy bool
}
|
||||
|
||||
// buildServiceStatusMap creates a map of "nodeID:serviceName" → status.
|
||||
// Matches graph nodes to agent labels in the collector.
|
||||
func (h *GraphHandlers) buildServiceStatusMap() map[string]svcStatusEntry {
|
||||
result := make(map[string]svcStatusEntry)
|
||||
|
||||
h.mu.RLock()
|
||||
nodes := h.loaded.Nodes()
|
||||
h.mu.RUnlock()
|
||||
|
||||
for _, agent := range h.collector.Agents() {
|
||||
for _, svc := range agent.Services {
|
||||
healthy := isHealthyStatus(svc.Status)
|
||||
entry := svcStatusEntry{status: svc.Status, healthy: healthy}
|
||||
|
||||
// Try exact node match first
|
||||
key := agent.Label + ":" + svc.Name
|
||||
result[key] = entry
|
||||
|
||||
// Also register under all nodes that don't have a status yet
|
||||
for _, node := range nodes {
|
||||
nodeKey := node.ID + ":" + svc.Name
|
||||
if _, exists := result[nodeKey]; !exists {
|
||||
result[nodeKey] = entry
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// findRootCause traverses the dependency graph to find the first unhealthy dependency.
|
||||
func findRootCause(g *graph.Graph, nodeID, svcName string, statusMap map[string]svcStatusEntry) (*ServiceStatusOut, []string) {
|
||||
visited := make(map[string]bool)
|
||||
var chain []string
|
||||
|
||||
var dfs func(string, string) *ServiceStatusOut
|
||||
dfs = func(nid, sname string) *ServiceStatusOut {
|
||||
key := nid + ":" + sname
|
||||
chain = append(chain, key)
|
||||
visited[key] = true
|
||||
|
||||
svc, ok := g.GetService(nid, sname)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Check each dependency
|
||||
for _, dep := range svc.Dependencies {
|
||||
depNodeID := dep.Target.NodeID
|
||||
if depNodeID == "" {
|
||||
depNodeID = nid
|
||||
}
|
||||
depKey := depNodeID + ":" + dep.Target.Name
|
||||
|
||||
if visited[depKey] {
|
||||
continue // avoid loops
|
||||
}
|
||||
|
||||
depStatus := statusMap[depKey]
|
||||
|
||||
if !depStatus.healthy {
|
||||
// This dependency is unhealthy — check if IT has an unhealthy dependency
|
||||
// (to find the true root cause)
|
||||
if deeper := dfs(depNodeID, dep.Target.Name); deeper != nil {
|
||||
return deeper
|
||||
}
|
||||
// This is the root cause
|
||||
return &ServiceStatusOut{
|
||||
NodeID: depNodeID,
|
||||
Name: dep.Target.Name,
|
||||
Status: depStatus.status,
|
||||
Healthy: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
root := dfs(nodeID, svcName)
|
||||
|
||||
// Deduplicate chain
|
||||
seen := make(map[string]bool)
|
||||
var deduped []string
|
||||
for _, k := range chain {
|
||||
if !seen[k] {
|
||||
seen[k] = true
|
||||
deduped = append(deduped, k)
|
||||
}
|
||||
}
|
||||
|
||||
return root, deduped
|
||||
}
|
||||
|
||||
// isHealthyStatus reports whether status, compared case-insensitively,
// is one of "running", "up" or "healthy".
func isHealthyStatus(status string) bool {
	switch strings.ToLower(status) {
	case "running", "up", "healthy":
		return true
	default:
		return false
	}
}
|
||||
|
||||
Reference in New Issue
Block a user