Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ad9d567d2c | |||
| c6c46aee68 |
@@ -1,6 +1,22 @@
|
|||||||
package models
|
package models
|
||||||
|
|
||||||
|
// ServiceStatus is the monitor-agnostic state of a service. Every monitor
// type translates its native state into one of the Status* values below.
type ServiceStatus string

// Unified status values shared by all monitor types.
const (
	StatusRunning  ServiceStatus = "running"
	StatusStopped  ServiceStatus = "stopped"
	StatusDegraded ServiceStatus = "degraded"
	StatusPending  ServiceStatus = "pending"
	StatusUnknown  ServiceStatus = "unknown"
)

// IsHealthy reports whether the service is stable enough for dependents to
// rely on. Only StatusRunning counts as healthy; every other value,
// including values outside the declared constants, is unhealthy.
func (s ServiceStatus) IsHealthy() bool {
	switch s {
	case StatusRunning:
		return true
	default:
		return false
	}
}
|
||||||
|
|
||||||
type Service struct {
|
type Service struct {
|
||||||
Name string
|
Name string
|
||||||
Status string
|
Status ServiceStatus
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,7 +36,23 @@ func (self *DockerMonitor) CheckServices(ctx context.Context) ([]models.Service,
|
|||||||
return lo.Map(ctrs.Items, func(item container.Summary, _ int) models.Service {
|
return lo.Map(ctrs.Items, func(item container.Summary, _ int) models.Service {
|
||||||
return models.Service{
|
return models.Service{
|
||||||
Name: lo.If(len(item.Names) > 0, item.Names[0]).Else(item.ID),
|
Name: lo.If(len(item.Names) > 0, item.Names[0]).Else(item.ID),
|
||||||
Status: string(item.State), // TODO: map to standartized states enum
|
Status: mapContainerState(string(item.State)),
|
||||||
}
|
}
|
||||||
}), nil
|
}), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// mapContainerState maps Docker container states to unified ServiceStatus.
|
||||||
|
func mapContainerState(state string) models.ServiceStatus {
|
||||||
|
switch state {
|
||||||
|
case "running":
|
||||||
|
return models.StatusRunning
|
||||||
|
case "exited", "dead":
|
||||||
|
return models.StatusStopped
|
||||||
|
case "paused":
|
||||||
|
return models.StatusDegraded
|
||||||
|
case "restarting", "created", "removing":
|
||||||
|
return models.StatusPending
|
||||||
|
default:
|
||||||
|
return models.StatusUnknown
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -44,7 +44,23 @@ func (self *KubesMonitor) CheckServices(ctx context.Context) ([]models.Service,
|
|||||||
return lo.Map(pods.Items, func(item corev1.Pod, _ int) models.Service {
|
return lo.Map(pods.Items, func(item corev1.Pod, _ int) models.Service {
|
||||||
return models.Service{
|
return models.Service{
|
||||||
Name: item.Name,
|
Name: item.Name,
|
||||||
Status: string(item.Status.Phase), // TODO: map to standartized states enum
|
Status: mapPodPhase(item.Status.Phase),
|
||||||
}
|
}
|
||||||
}), nil
|
}), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// mapPodPhase maps K8s pod phases to unified ServiceStatus.
|
||||||
|
func mapPodPhase(phase corev1.PodPhase) models.ServiceStatus {
|
||||||
|
switch phase {
|
||||||
|
case corev1.PodRunning:
|
||||||
|
return models.StatusRunning
|
||||||
|
case corev1.PodSucceeded:
|
||||||
|
return models.StatusStopped
|
||||||
|
case corev1.PodFailed:
|
||||||
|
return models.StatusStopped
|
||||||
|
case corev1.PodPending:
|
||||||
|
return models.StatusPending
|
||||||
|
default:
|
||||||
|
return models.StatusUnknown
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
+2
-1
@@ -110,7 +110,7 @@ func main() {
|
|||||||
if graphPath == "" {
|
if graphPath == "" {
|
||||||
graphPath = "/etc/hellreign/services.yaml"
|
graphPath = "/etc/hellreign/services.yaml"
|
||||||
}
|
}
|
||||||
graphHandlers := handlers.NewGraphHandlers(graphPath)
|
graphHandlers := handlers.NewGraphHandlers(graphPath, coll)
|
||||||
|
|
||||||
agents := handlers.NewAgentsGroup(h, coll)
|
agents := handlers.NewAgentsGroup(h, coll)
|
||||||
auth := handlers.AuthGroup{Handlers: h}
|
auth := handlers.AuthGroup{Handlers: h}
|
||||||
@@ -226,6 +226,7 @@ func main() {
|
|||||||
graphGroup.PUT("", graphHandlers.UpdateYAML)
|
graphGroup.PUT("", graphHandlers.UpdateYAML)
|
||||||
graphGroup.GET("/order", graphHandlers.StartupOrder)
|
graphGroup.GET("/order", graphHandlers.StartupOrder)
|
||||||
graphGroup.GET("/cycle", graphHandlers.CycleCheck)
|
graphGroup.GET("/cycle", graphHandlers.CycleCheck)
|
||||||
|
graphGroup.GET("/failure", graphHandlers.GetFailureRootCause)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Agent registration
|
// Agent registration
|
||||||
|
|||||||
@@ -5,23 +5,26 @@ import (
|
|||||||
"log"
|
"log"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
"gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/graph"
|
"gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/graph"
|
||||||
|
"gitea.d3m0k1d.ru/d3m0k1d/HellreigN/backend/internal/grpcsrv/collector"
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
)
|
)
|
||||||
|
|
||||||
// GraphHandlers manages the service dependency graph.
|
// GraphHandlers manages the service dependency graph.
|
||||||
type GraphHandlers struct {
|
type GraphHandlers struct {
|
||||||
path string
|
path string
|
||||||
mu sync.RWMutex
|
mu sync.RWMutex
|
||||||
yamlData []byte
|
yamlData []byte
|
||||||
loaded *graph.Graph
|
loaded *graph.Graph
|
||||||
|
collector *collector.Collector
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewGraphHandlers loads the graph from the given YAML file path.
|
// NewGraphHandlers loads the graph from the given YAML file path.
|
||||||
func NewGraphHandlers(yamlPath string) *GraphHandlers {
|
func NewGraphHandlers(yamlPath string, coll *collector.Collector) *GraphHandlers {
|
||||||
h := &GraphHandlers{path: yamlPath}
|
h := &GraphHandlers{path: yamlPath, collector: coll}
|
||||||
if err := h.reload(); err != nil {
|
if err := h.reload(); err != nil {
|
||||||
if _, ok := err.(*os.PathError); ok {
|
if _, ok := err.(*os.PathError); ok {
|
||||||
log.Printf("[graph] no graph file at %q, starting with empty graph", yamlPath)
|
log.Printf("[graph] no graph file at %q, starting with empty graph", yamlPath)
|
||||||
@@ -147,3 +150,209 @@ func (h *GraphHandlers) CycleCheck(c *gin.Context) {
|
|||||||
|
|
||||||
c.JSON(http.StatusOK, gin.H{"has_cycle": g.HasCycle()})
|
c.JSON(http.StatusOK, gin.H{"has_cycle": g.HasCycle()})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ServiceStatusOut is the JSON view of a single service together with the
// status that was looked up for it.
type ServiceStatusOut struct {
	// NodeID is the graph node the service belongs to.
	NodeID string `json:"node_id"`
	// Name is the service name within that node.
	Name string `json:"name"`
	// Status is the raw status string; it is empty when no status was found.
	Status string `json:"status"`
	// Healthy tells whether Status was classified as healthy.
	Healthy bool `json:"healthy"`
}
|
||||||
|
|
||||||
|
// FailureRootCauseOut represents the result of a failure analysis.
|
||||||
|
type FailureRootCauseOut struct {
|
||||||
|
Affected ServiceStatusOut `json:"affected"`
|
||||||
|
RootCause *ServiceStatusOut `json:"root_cause,omitempty"`
|
||||||
|
DependencyChain []string `json:"dependency_chain,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetFailureRootCause analyzes the dependency graph and current service
|
||||||
|
// statuses to find the root cause of a service failure.
|
||||||
|
// If the specified service is unhealthy, it traverses its dependencies
|
||||||
|
// to find the first unhealthy dependency — the one that is the root cause.
|
||||||
|
// @Summary Find failure root cause
|
||||||
|
// @Description Analyzes dependencies and service statuses to find the root cause of a failure
|
||||||
|
// @Tags graph
|
||||||
|
// @Param node_id query string false "Node ID (agent label)"
|
||||||
|
// @Param service query string true "Service name"
|
||||||
|
// @Produce json
|
||||||
|
// @Success 200 {object} FailureRootCauseOut
|
||||||
|
// @Failure 400 {object} map[string]string
|
||||||
|
// @Security Bearer
|
||||||
|
// @Router /graph/failure [get]
|
||||||
|
func (h *GraphHandlers) GetFailureRootCause(c *gin.Context) {
|
||||||
|
nodeID := c.Query("node_id")
|
||||||
|
svcName := c.Query("service")
|
||||||
|
if svcName == "" {
|
||||||
|
c.JSON(http.StatusBadRequest, gin.H{"error": "service query param is required"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
h.mu.RLock()
|
||||||
|
g := h.loaded
|
||||||
|
h.mu.RUnlock()
|
||||||
|
|
||||||
|
if g == nil {
|
||||||
|
c.JSON(http.StatusBadRequest, gin.H{"error": "no graph loaded"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build a map of service statuses from all agents
|
||||||
|
svcStatus := h.buildServiceStatusMap()
|
||||||
|
|
||||||
|
// If no node specified, search all nodes for the service
|
||||||
|
if nodeID == "" {
|
||||||
|
for _, node := range g.Nodes() {
|
||||||
|
if _, ok := g.GetService(node.ID, svcName); ok {
|
||||||
|
nodeID = node.ID
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if nodeID == "" {
|
||||||
|
c.JSON(http.StatusNotFound, gin.H{"error": "service not found in graph"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, ok := g.GetService(nodeID, svcName); !ok {
|
||||||
|
c.JSON(http.StatusNotFound, gin.H{"error": "service not found in node"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get current status
|
||||||
|
status := svcStatus[nodeID+":"+svcName]
|
||||||
|
affected := ServiceStatusOut{
|
||||||
|
NodeID: nodeID,
|
||||||
|
Name: svcName,
|
||||||
|
Status: status.status,
|
||||||
|
Healthy: status.healthy,
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the service is healthy, no failure to analyze
|
||||||
|
if status.healthy {
|
||||||
|
c.JSON(http.StatusOK, FailureRootCauseOut{
|
||||||
|
Affected: affected,
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find root cause: traverse dependencies to find the first unhealthy one
|
||||||
|
rootCause, chain := findRootCause(g, nodeID, svcName, svcStatus)
|
||||||
|
|
||||||
|
out := FailureRootCauseOut{
|
||||||
|
Affected: affected,
|
||||||
|
DependencyChain: chain,
|
||||||
|
}
|
||||||
|
if rootCause != nil {
|
||||||
|
out.RootCause = rootCause
|
||||||
|
}
|
||||||
|
|
||||||
|
c.JSON(http.StatusOK, out)
|
||||||
|
}
|
||||||
|
|
||||||
|
// svcStatusEntry pairs a raw status string with its health classification.
// The zero value (empty status, not healthy) is what a map lookup returns
// for a service that has no recorded status.
type svcStatusEntry struct {
	status  string // raw status string as stored by the caller
	healthy bool   // whether that status was classified as healthy
}
|
||||||
|
|
||||||
|
// buildServiceStatusMap creates a map of "nodeID:serviceName" → status.
|
||||||
|
// Matches graph nodes to agent labels in the collector.
|
||||||
|
func (h *GraphHandlers) buildServiceStatusMap() map[string]svcStatusEntry {
|
||||||
|
result := make(map[string]svcStatusEntry)
|
||||||
|
|
||||||
|
h.mu.RLock()
|
||||||
|
nodes := h.loaded.Nodes()
|
||||||
|
h.mu.RUnlock()
|
||||||
|
|
||||||
|
for _, agent := range h.collector.Agents() {
|
||||||
|
for _, svc := range agent.Services {
|
||||||
|
healthy := isHealthyStatus(svc.Status)
|
||||||
|
entry := svcStatusEntry{status: svc.Status, healthy: healthy}
|
||||||
|
|
||||||
|
// Try exact node match first
|
||||||
|
key := agent.Label + ":" + svc.Name
|
||||||
|
result[key] = entry
|
||||||
|
|
||||||
|
// Also register under all nodes that don't have a status yet
|
||||||
|
for _, node := range nodes {
|
||||||
|
nodeKey := node.ID + ":" + svc.Name
|
||||||
|
if _, exists := result[nodeKey]; !exists {
|
||||||
|
result[nodeKey] = entry
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
// findRootCause traverses the dependency graph to find the first unhealthy dependency.
|
||||||
|
func findRootCause(g *graph.Graph, nodeID, svcName string, statusMap map[string]svcStatusEntry) (*ServiceStatusOut, []string) {
|
||||||
|
visited := make(map[string]bool)
|
||||||
|
var chain []string
|
||||||
|
|
||||||
|
var dfs func(string, string) *ServiceStatusOut
|
||||||
|
dfs = func(nid, sname string) *ServiceStatusOut {
|
||||||
|
key := nid + ":" + sname
|
||||||
|
chain = append(chain, key)
|
||||||
|
visited[key] = true
|
||||||
|
|
||||||
|
svc, ok := g.GetService(nid, sname)
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check each dependency
|
||||||
|
for _, dep := range svc.Dependencies {
|
||||||
|
depNodeID := dep.Target.NodeID
|
||||||
|
if depNodeID == "" {
|
||||||
|
depNodeID = nid
|
||||||
|
}
|
||||||
|
depKey := depNodeID + ":" + dep.Target.Name
|
||||||
|
|
||||||
|
if visited[depKey] {
|
||||||
|
continue // avoid loops
|
||||||
|
}
|
||||||
|
|
||||||
|
depStatus := statusMap[depKey]
|
||||||
|
|
||||||
|
if !depStatus.healthy {
|
||||||
|
// This dependency is unhealthy — check if IT has an unhealthy dependency
|
||||||
|
// (to find the true root cause)
|
||||||
|
if deeper := dfs(depNodeID, dep.Target.Name); deeper != nil {
|
||||||
|
return deeper
|
||||||
|
}
|
||||||
|
// This is the root cause
|
||||||
|
return &ServiceStatusOut{
|
||||||
|
NodeID: depNodeID,
|
||||||
|
Name: dep.Target.Name,
|
||||||
|
Status: depStatus.status,
|
||||||
|
Healthy: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
root := dfs(nodeID, svcName)
|
||||||
|
|
||||||
|
// Deduplicate chain
|
||||||
|
seen := make(map[string]bool)
|
||||||
|
var deduped []string
|
||||||
|
for _, k := range chain {
|
||||||
|
if !seen[k] {
|
||||||
|
seen[k] = true
|
||||||
|
deduped = append(deduped, k)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return root, deduped
|
||||||
|
}
|
||||||
|
|
||||||
|
// isHealthyStatus reports whether a raw status string denotes a healthy
// service. The comparison ignores letter case; the accepted values are
// "running", "up" and "healthy".
func isHealthyStatus(status string) bool {
	switch strings.ToLower(status) {
	case "running", "up", "healthy":
		return true
	}
	return false
}
|
||||||
|
|||||||
Reference in New Issue
Block a user