386 lines
9.6 KiB
Go
386 lines
9.6 KiB
Go
package handlers
|
|
|
|
import (
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"strings"
|
|
"sync"
|
|
|
|
"gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/graph"
|
|
"gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/grpcsrv/collector"
|
|
"github.com/gin-gonic/gin"
|
|
)
|
|
|
|
// GraphHandlers manages the service dependency graph.
|
|
type GraphHandlers struct {
|
|
path string
|
|
mu sync.RWMutex
|
|
yamlData []byte
|
|
loaded *graph.Graph
|
|
collector *collector.Collector
|
|
}
|
|
|
|
// NewGraphHandlers loads the graph from the given YAML file path.
|
|
func NewGraphHandlers(yamlPath string, coll *collector.Collector) *GraphHandlers {
|
|
h := &GraphHandlers{path: yamlPath, collector: coll}
|
|
if err := h.reload(); err != nil {
|
|
if _, ok := err.(*os.PathError); ok {
|
|
log.Printf("[graph] no graph file at %q, starting with empty graph", yamlPath)
|
|
h.loaded = graph.New()
|
|
h.yamlData = []byte("nodes: {}\n")
|
|
} else {
|
|
log.Fatalf("[graph] failed to load graph from %q: %v", yamlPath, err)
|
|
}
|
|
}
|
|
return h
|
|
}
|
|
|
|
func (h *GraphHandlers) reload() error {
|
|
data, err := os.ReadFile(h.path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
g, err := graph.ParseYAML(data)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
h.mu.Lock()
|
|
h.yamlData = data
|
|
h.loaded = g
|
|
h.mu.Unlock()
|
|
return nil
|
|
}
|
|
|
|
// LoadedGraph returns the current parsed graph.
|
|
func (h *GraphHandlers) LoadedGraph() *graph.Graph {
|
|
h.mu.RLock()
|
|
defer h.mu.RUnlock()
|
|
return h.loaded
|
|
}
|
|
|
|
// GetGraph returns the current dependency graph as JSON.
|
|
// @Summary Get dependency graph
|
|
// @Description Returns the service dependency graph as JSON
|
|
// @Tags graph
|
|
// @Produce json
|
|
// @Success 200 {object} map[string]interface{} "Dependency graph"
|
|
// @Security Bearer
|
|
// @Router /graph [get]
|
|
func (h *GraphHandlers) GetGraph(c *gin.Context) {
|
|
h.mu.RLock()
|
|
defer h.mu.RUnlock()
|
|
|
|
g := h.loaded
|
|
if g == nil {
|
|
c.JSON(http.StatusOK, gin.H{"nodes": map[string]interface{}{}})
|
|
return
|
|
}
|
|
|
|
nodes := make(map[string]interface{})
|
|
for _, node := range g.Nodes() {
|
|
services := make(map[string]interface{})
|
|
for _, svc := range node.Services {
|
|
deps := make([]map[string]interface{}, 0)
|
|
for _, dep := range svc.Dependencies {
|
|
deps = append(deps, map[string]interface{}{
|
|
"target": dep.Target,
|
|
"condition": dep.Condition,
|
|
})
|
|
}
|
|
services[svc.Name] = map[string]interface{}{
|
|
"dependencies": deps,
|
|
}
|
|
}
|
|
nodes[node.ID] = map[string]interface{}{
|
|
"services": services,
|
|
}
|
|
}
|
|
|
|
c.JSON(http.StatusOK, gin.H{"nodes": nodes})
|
|
}
|
|
|
|
// UpdateYAML updates the graph from new YAML text.
|
|
// @Summary Update dependency graph YAML
|
|
// @Description Replaces the service dependency graph YAML and reloads it
|
|
// @Tags graph
|
|
// @Accept plain
|
|
// @Produce json
|
|
// @Param body body string true "New YAML content"
|
|
// @Success 200 {object} map[string]string
|
|
// @Failure 400 {object} map[string]string
|
|
// @Security Bearer
|
|
// @Router /graph [put]
|
|
func (h *GraphHandlers) UpdateYAML(c *gin.Context) {
|
|
body, err := io.ReadAll(c.Request.Body)
|
|
if err != nil {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": "failed to read body"})
|
|
return
|
|
}
|
|
|
|
g, err := graph.ParseYAML(body)
|
|
if err != nil {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
|
return
|
|
}
|
|
|
|
if err := os.WriteFile(h.path, body, 0o644); err != nil {
|
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to write graph file"})
|
|
return
|
|
}
|
|
|
|
h.mu.Lock()
|
|
h.yamlData = body
|
|
h.loaded = g
|
|
h.mu.Unlock()
|
|
|
|
log.Printf("[graph] updated graph from admin, saved to %s", h.path)
|
|
c.JSON(http.StatusOK, gin.H{"message": "graph updated"})
|
|
}
|
|
|
|
// StartupOrder returns the computed service startup order.
|
|
// @Summary Get startup order
|
|
// @Description Returns the topologically sorted service startup order
|
|
// @Tags graph
|
|
// @Produce json
|
|
// @Success 200 {array} string
|
|
// @Failure 400 {object} map[string]string
|
|
// @Security Bearer
|
|
// @Router /graph/order [get]
|
|
func (h *GraphHandlers) StartupOrder(c *gin.Context) {
|
|
h.mu.RLock()
|
|
g := h.loaded
|
|
h.mu.RUnlock()
|
|
|
|
order, err := g.TopologicalSort()
|
|
if err != nil {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
|
return
|
|
}
|
|
|
|
c.JSON(http.StatusOK, order)
|
|
}
|
|
|
|
// CycleCheck checks if the graph has cycles.
|
|
// @Summary Check for cycles
|
|
// @Description Returns whether the dependency graph contains cycles
|
|
// @Tags graph
|
|
// @Produce json
|
|
// @Success 200 {object} map[string]bool
|
|
// @Security Bearer
|
|
// @Router /graph/cycle [get]
|
|
func (h *GraphHandlers) CycleCheck(c *gin.Context) {
|
|
h.mu.RLock()
|
|
g := h.loaded
|
|
h.mu.RUnlock()
|
|
|
|
c.JSON(http.StatusOK, gin.H{"has_cycle": g.HasCycle()})
|
|
}
|
|
|
|
// ServiceStatusOut is the JSON view of one service together with the status
// last known for it.
type ServiceStatusOut struct {
	// NodeID identifies the graph node the service belongs to.
	NodeID string `json:"node_id"`
	// Name is the service name.
	Name string `json:"name"`
	// Status is the raw status string; it is empty when no status is known.
	Status string `json:"status"`
	// Healthy tells whether Status counts as a healthy state.
	Healthy bool `json:"healthy"`
}
|
|
|
|
// FailureRootCauseOut represents the result of a failure analysis.
|
|
type FailureRootCauseOut struct {
|
|
Affected ServiceStatusOut `json:"affected"`
|
|
RootCause *ServiceStatusOut `json:"root_cause,omitempty"`
|
|
DependencyChain []string `json:"dependency_chain,omitempty"`
|
|
}
|
|
|
|
// GetFailureRootCause analyzes the dependency graph and current service
|
|
// statuses to find the root cause of a service failure.
|
|
// If the specified service is unhealthy, it traverses its dependencies
|
|
// to find the first unhealthy dependency — the one that is the root cause.
|
|
// @Summary Find failure root cause
|
|
// @Description Analyzes dependencies and service statuses to find the root cause of a failure
|
|
// @Tags graph
|
|
// @Param node_id query string false "Node ID (agent label)"
|
|
// @Param service query string true "Service name"
|
|
// @Produce json
|
|
// @Success 200 {object} FailureRootCauseOut
|
|
// @Failure 400 {object} map[string]string
|
|
// @Security Bearer
|
|
// @Router /graph/failure [get]
|
|
func (h *GraphHandlers) GetFailureRootCause(c *gin.Context) {
|
|
nodeID := c.Query("node_id")
|
|
svcName := c.Query("service")
|
|
if svcName == "" {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": "service query param is required"})
|
|
return
|
|
}
|
|
|
|
h.mu.RLock()
|
|
g := h.loaded
|
|
h.mu.RUnlock()
|
|
|
|
if g == nil {
|
|
c.JSON(http.StatusBadRequest, gin.H{"error": "no graph loaded"})
|
|
return
|
|
}
|
|
|
|
// Build a map of service statuses from all agents
|
|
svcStatus := h.buildServiceStatusMap()
|
|
|
|
// If no node specified, search all nodes for the service
|
|
if nodeID == "" {
|
|
for _, node := range g.Nodes() {
|
|
if _, ok := g.GetService(node.ID, svcName); ok {
|
|
nodeID = node.ID
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
if nodeID == "" {
|
|
c.JSON(http.StatusNotFound, gin.H{"error": "service not found in graph"})
|
|
return
|
|
}
|
|
|
|
if _, ok := g.GetService(nodeID, svcName); !ok {
|
|
c.JSON(http.StatusNotFound, gin.H{"error": "service not found in node"})
|
|
return
|
|
}
|
|
|
|
// Get current status
|
|
status := svcStatus[nodeID+":"+svcName]
|
|
affected := ServiceStatusOut{
|
|
NodeID: nodeID,
|
|
Name: svcName,
|
|
Status: status.status,
|
|
Healthy: status.healthy,
|
|
}
|
|
|
|
// If the service is healthy, no failure to analyze
|
|
if status.healthy {
|
|
c.JSON(http.StatusOK, FailureRootCauseOut{
|
|
Affected: affected,
|
|
})
|
|
return
|
|
}
|
|
|
|
// Find root cause: traverse dependencies to find the first unhealthy one
|
|
rootCause, chain := findRootCause(g, nodeID, svcName, svcStatus)
|
|
|
|
out := FailureRootCauseOut{
|
|
Affected: affected,
|
|
DependencyChain: chain,
|
|
}
|
|
if rootCause != nil {
|
|
out.RootCause = rootCause
|
|
}
|
|
|
|
c.JSON(http.StatusOK, out)
|
|
}
|
|
|
|
// svcStatusEntry is the status recorded for one service in the status map.
type svcStatusEntry struct {
	// status is the raw status string as reported for the service.
	status string
	// healthy caches whether status counts as a healthy state.
	healthy bool
}
|
|
|
|
// buildServiceStatusMap creates a map of "nodeID:serviceName" → status.
|
|
// Matches graph nodes to agent labels in the collector.
|
|
func (h *GraphHandlers) buildServiceStatusMap() map[string]svcStatusEntry {
|
|
result := make(map[string]svcStatusEntry)
|
|
|
|
h.mu.RLock()
|
|
nodes := h.loaded.Nodes()
|
|
h.mu.RUnlock()
|
|
|
|
for _, agent := range h.collector.Agents() {
|
|
for _, svc := range agent.Services {
|
|
healthy := isHealthyStatus(svc.Status)
|
|
entry := svcStatusEntry{status: svc.Status, healthy: healthy}
|
|
|
|
// Try exact node match first
|
|
key := agent.Label + ":" + svc.Name
|
|
result[key] = entry
|
|
|
|
// Also register under all nodes that don't have a status yet
|
|
for _, node := range nodes {
|
|
nodeKey := node.ID + ":" + svc.Name
|
|
if _, exists := result[nodeKey]; !exists {
|
|
result[nodeKey] = entry
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// findRootCause traverses the dependency graph to find the first unhealthy dependency.
|
|
func findRootCause(g *graph.Graph, nodeID, svcName string, statusMap map[string]svcStatusEntry) (*ServiceStatusOut, []string) {
|
|
visited := make(map[string]bool)
|
|
var chain []string
|
|
|
|
var dfs func(string, string) *ServiceStatusOut
|
|
dfs = func(nid, sname string) *ServiceStatusOut {
|
|
key := nid + ":" + sname
|
|
chain = append(chain, key)
|
|
visited[key] = true
|
|
|
|
svc, ok := g.GetService(nid, sname)
|
|
if !ok {
|
|
return nil
|
|
}
|
|
|
|
// Check each dependency
|
|
for _, dep := range svc.Dependencies {
|
|
depNodeID := dep.Target.NodeID
|
|
if depNodeID == "" {
|
|
depNodeID = nid
|
|
}
|
|
depKey := depNodeID + ":" + dep.Target.Name
|
|
|
|
if visited[depKey] {
|
|
continue // avoid loops
|
|
}
|
|
|
|
depStatus := statusMap[depKey]
|
|
|
|
if !depStatus.healthy {
|
|
// This dependency is unhealthy — check if IT has an unhealthy dependency
|
|
// (to find the true root cause)
|
|
if deeper := dfs(depNodeID, dep.Target.Name); deeper != nil {
|
|
return deeper
|
|
}
|
|
// This is the root cause
|
|
return &ServiceStatusOut{
|
|
NodeID: depNodeID,
|
|
Name: dep.Target.Name,
|
|
Status: depStatus.status,
|
|
Healthy: false,
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
root := dfs(nodeID, svcName)
|
|
|
|
// Deduplicate chain
|
|
seen := make(map[string]bool)
|
|
var deduped []string
|
|
for _, k := range chain {
|
|
if !seen[k] {
|
|
seen[k] = true
|
|
deduped = append(deduped, k)
|
|
}
|
|
}
|
|
|
|
return root, deduped
|
|
}
|
|
|
|
// isHealthyStatus reports whether status, compared case-insensitively, is one
// of the states treated as healthy: "running", "up" or "healthy".
func isHealthyStatus(status string) bool {
	switch strings.ToLower(status) {
	case "running", "up", "healthy":
		return true
	default:
		return false
	}
}
|