This commit is contained in:
+2
-1
@@ -110,7 +110,7 @@ func main() {
|
|||||||
if graphPath == "" {
|
if graphPath == "" {
|
||||||
graphPath = "/etc/hellreign/services.yaml"
|
graphPath = "/etc/hellreign/services.yaml"
|
||||||
}
|
}
|
||||||
graphHandlers := handlers.NewGraphHandlers(graphPath)
|
graphHandlers := handlers.NewGraphHandlers(graphPath, coll)
|
||||||
|
|
||||||
agents := handlers.NewAgentsGroup(h, coll)
|
agents := handlers.NewAgentsGroup(h, coll)
|
||||||
auth := handlers.AuthGroup{Handlers: h}
|
auth := handlers.AuthGroup{Handlers: h}
|
||||||
@@ -226,6 +226,7 @@ func main() {
|
|||||||
graphGroup.PUT("", graphHandlers.UpdateYAML)
|
graphGroup.PUT("", graphHandlers.UpdateYAML)
|
||||||
graphGroup.GET("/order", graphHandlers.StartupOrder)
|
graphGroup.GET("/order", graphHandlers.StartupOrder)
|
||||||
graphGroup.GET("/cycle", graphHandlers.CycleCheck)
|
graphGroup.GET("/cycle", graphHandlers.CycleCheck)
|
||||||
|
graphGroup.GET("/failure", graphHandlers.GetFailureRootCause)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Agent registration
|
// Agent registration
|
||||||
|
|||||||
@@ -5,9 +5,11 @@ import (
|
|||||||
"log"
|
"log"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
"gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/graph"
|
"gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/graph"
|
||||||
|
"gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/grpcsrv/collector"
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -17,11 +19,12 @@ type GraphHandlers struct {
|
|||||||
mu sync.RWMutex
|
mu sync.RWMutex
|
||||||
yamlData []byte
|
yamlData []byte
|
||||||
loaded *graph.Graph
|
loaded *graph.Graph
|
||||||
|
collector *collector.Collector
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewGraphHandlers loads the graph from the given YAML file path.
|
// NewGraphHandlers loads the graph from the given YAML file path.
|
||||||
func NewGraphHandlers(yamlPath string) *GraphHandlers {
|
func NewGraphHandlers(yamlPath string, coll *collector.Collector) *GraphHandlers {
|
||||||
h := &GraphHandlers{path: yamlPath}
|
h := &GraphHandlers{path: yamlPath, collector: coll}
|
||||||
if err := h.reload(); err != nil {
|
if err := h.reload(); err != nil {
|
||||||
if _, ok := err.(*os.PathError); ok {
|
if _, ok := err.(*os.PathError); ok {
|
||||||
log.Printf("[graph] no graph file at %q, starting with empty graph", yamlPath)
|
log.Printf("[graph] no graph file at %q, starting with empty graph", yamlPath)
|
||||||
@@ -147,3 +150,209 @@ func (h *GraphHandlers) CycleCheck(c *gin.Context) {
|
|||||||
|
|
||||||
c.JSON(http.StatusOK, gin.H{"has_cycle": g.HasCycle()})
|
c.JSON(http.StatusOK, gin.H{"has_cycle": g.HasCycle()})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ServiceStatusOut is the JSON view of a single service together with the
// status that was looked up for it.
type ServiceStatusOut struct {
	// NodeID is the ID of the graph node the service was resolved on.
	NodeID string `json:"node_id"`
	// Name is the service name.
	Name string `json:"name"`
	// Status is the raw status string; it is empty when no status was found.
	Status string `json:"status"`
	// Healthy tells whether Status was classified as a healthy state.
	Healthy bool `json:"healthy"`
}
|
||||||
|
|
||||||
|
// FailureRootCauseOut represents the result of a failure analysis.
|
||||||
|
type FailureRootCauseOut struct {
|
||||||
|
Affected ServiceStatusOut `json:"affected"`
|
||||||
|
RootCause *ServiceStatusOut `json:"root_cause,omitempty"`
|
||||||
|
DependencyChain []string `json:"dependency_chain,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetFailureRootCause analyzes the dependency graph and current service
|
||||||
|
// statuses to find the root cause of a service failure.
|
||||||
|
// If the specified service is unhealthy, it traverses its dependencies
|
||||||
|
// to find the first unhealthy dependency — the one that is the root cause.
|
||||||
|
// @Summary Find failure root cause
|
||||||
|
// @Description Analyzes dependencies and service statuses to find the root cause of a failure
|
||||||
|
// @Tags graph
|
||||||
|
// @Param node_id query string false "Node ID (agent label)"
|
||||||
|
// @Param service query string true "Service name"
|
||||||
|
// @Produce json
|
||||||
|
// @Success 200 {object} FailureRootCauseOut
|
||||||
|
// @Failure 400 {object} map[string]string
|
||||||
|
// @Security Bearer
|
||||||
|
// @Router /graph/failure [get]
|
||||||
|
func (h *GraphHandlers) GetFailureRootCause(c *gin.Context) {
|
||||||
|
nodeID := c.Query("node_id")
|
||||||
|
svcName := c.Query("service")
|
||||||
|
if svcName == "" {
|
||||||
|
c.JSON(http.StatusBadRequest, gin.H{"error": "service query param is required"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
h.mu.RLock()
|
||||||
|
g := h.loaded
|
||||||
|
h.mu.RUnlock()
|
||||||
|
|
||||||
|
if g == nil {
|
||||||
|
c.JSON(http.StatusBadRequest, gin.H{"error": "no graph loaded"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build a map of service statuses from all agents
|
||||||
|
svcStatus := h.buildServiceStatusMap()
|
||||||
|
|
||||||
|
// If no node specified, search all nodes for the service
|
||||||
|
if nodeID == "" {
|
||||||
|
for _, node := range g.Nodes() {
|
||||||
|
if _, ok := g.GetService(node.ID, svcName); ok {
|
||||||
|
nodeID = node.ID
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if nodeID == "" {
|
||||||
|
c.JSON(http.StatusNotFound, gin.H{"error": "service not found in graph"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, ok := g.GetService(nodeID, svcName); !ok {
|
||||||
|
c.JSON(http.StatusNotFound, gin.H{"error": "service not found in node"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get current status
|
||||||
|
status := svcStatus[nodeID+":"+svcName]
|
||||||
|
affected := ServiceStatusOut{
|
||||||
|
NodeID: nodeID,
|
||||||
|
Name: svcName,
|
||||||
|
Status: status.status,
|
||||||
|
Healthy: status.healthy,
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the service is healthy, no failure to analyze
|
||||||
|
if status.healthy {
|
||||||
|
c.JSON(http.StatusOK, FailureRootCauseOut{
|
||||||
|
Affected: affected,
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find root cause: traverse dependencies to find the first unhealthy one
|
||||||
|
rootCause, chain := findRootCause(g, nodeID, svcName, svcStatus)
|
||||||
|
|
||||||
|
out := FailureRootCauseOut{
|
||||||
|
Affected: affected,
|
||||||
|
DependencyChain: chain,
|
||||||
|
}
|
||||||
|
if rootCause != nil {
|
||||||
|
out.RootCause = rootCause
|
||||||
|
}
|
||||||
|
|
||||||
|
c.JSON(http.StatusOK, out)
|
||||||
|
}
|
||||||
|
|
||||||
|
// svcStatusEntry pairs a service's raw status string with its health
// classification. The zero value reads as "unhealthy, status unknown".
type svcStatusEntry struct {
	status  string // status text as stored by the producer of the entry
	healthy bool   // true when status was classified as healthy
}
|
||||||
|
|
||||||
|
// buildServiceStatusMap creates a map of "nodeID:serviceName" → status.
|
||||||
|
// Matches graph nodes to agent labels in the collector.
|
||||||
|
func (h *GraphHandlers) buildServiceStatusMap() map[string]svcStatusEntry {
|
||||||
|
result := make(map[string]svcStatusEntry)
|
||||||
|
|
||||||
|
h.mu.RLock()
|
||||||
|
nodes := h.loaded.Nodes()
|
||||||
|
h.mu.RUnlock()
|
||||||
|
|
||||||
|
for _, agent := range h.collector.Agents() {
|
||||||
|
for _, svc := range agent.Services {
|
||||||
|
healthy := isHealthyStatus(svc.Status)
|
||||||
|
entry := svcStatusEntry{status: svc.Status, healthy: healthy}
|
||||||
|
|
||||||
|
// Try exact node match first
|
||||||
|
key := agent.Label + ":" + svc.Name
|
||||||
|
result[key] = entry
|
||||||
|
|
||||||
|
// Also register under all nodes that don't have a status yet
|
||||||
|
for _, node := range nodes {
|
||||||
|
nodeKey := node.ID + ":" + svc.Name
|
||||||
|
if _, exists := result[nodeKey]; !exists {
|
||||||
|
result[nodeKey] = entry
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
// findRootCause traverses the dependency graph to find the first unhealthy dependency.
|
||||||
|
func findRootCause(g *graph.Graph, nodeID, svcName string, statusMap map[string]svcStatusEntry) (*ServiceStatusOut, []string) {
|
||||||
|
visited := make(map[string]bool)
|
||||||
|
var chain []string
|
||||||
|
|
||||||
|
var dfs func(string, string) *ServiceStatusOut
|
||||||
|
dfs = func(nid, sname string) *ServiceStatusOut {
|
||||||
|
key := nid + ":" + sname
|
||||||
|
chain = append(chain, key)
|
||||||
|
visited[key] = true
|
||||||
|
|
||||||
|
svc, ok := g.GetService(nid, sname)
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check each dependency
|
||||||
|
for _, dep := range svc.Dependencies {
|
||||||
|
depNodeID := dep.Target.NodeID
|
||||||
|
if depNodeID == "" {
|
||||||
|
depNodeID = nid
|
||||||
|
}
|
||||||
|
depKey := depNodeID + ":" + dep.Target.Name
|
||||||
|
|
||||||
|
if visited[depKey] {
|
||||||
|
continue // avoid loops
|
||||||
|
}
|
||||||
|
|
||||||
|
depStatus := statusMap[depKey]
|
||||||
|
|
||||||
|
if !depStatus.healthy {
|
||||||
|
// This dependency is unhealthy — check if IT has an unhealthy dependency
|
||||||
|
// (to find the true root cause)
|
||||||
|
if deeper := dfs(depNodeID, dep.Target.Name); deeper != nil {
|
||||||
|
return deeper
|
||||||
|
}
|
||||||
|
// This is the root cause
|
||||||
|
return &ServiceStatusOut{
|
||||||
|
NodeID: depNodeID,
|
||||||
|
Name: dep.Target.Name,
|
||||||
|
Status: depStatus.status,
|
||||||
|
Healthy: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
root := dfs(nodeID, svcName)
|
||||||
|
|
||||||
|
// Deduplicate chain
|
||||||
|
seen := make(map[string]bool)
|
||||||
|
var deduped []string
|
||||||
|
for _, k := range chain {
|
||||||
|
if !seen[k] {
|
||||||
|
seen[k] = true
|
||||||
|
deduped = append(deduped, k)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return root, deduped
|
||||||
|
}
|
||||||
|
|
||||||
|
// isHealthyStatus reports whether status, after lower-casing, is exactly one
// of "running", "up" or "healthy". Surrounding whitespace is not trimmed.
func isHealthyStatus(status string) bool {
	switch strings.ToLower(status) {
	case "running", "up", "healthy":
		return true
	default:
		return false
	}
}
|
||||||
|
|||||||
Reference in New Issue
Block a user