diff --git a/cmd/node/nodeCreate.go b/cmd/node/nodeCreate.go
index f6a14897..26b648c0 100644
--- a/cmd/node/nodeCreate.go
+++ b/cmd/node/nodeCreate.go
@@ -122,6 +122,7 @@ func parseCreateNodeCmd(cmd *cobra.Command, args []string) ([]*k3d.Node, *k3d.Cl
 			Labels: map[string]string{
 				k3d.LabelRole: roleStr,
 			},
+			Restart: true,
 		}
 		nodes = append(nodes, node)
 	}
diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go
index 8ef65198..48a4c027 100644
--- a/pkg/cluster/cluster.go
+++ b/pkg/cluster/cluster.go
@@ -180,7 +180,7 @@ func ClusterCreate(ctx context.Context, runtime k3drt.Runtime, cluster *k3d.Clus
 
 		node.Name = generateNodeName(cluster.Name, node.Role, suffix)
 		node.Network = cluster.Network.Name
-
+		node.Restart = true
 		node.GPURequest = cluster.CreateClusterOpts.GPURequest
 
 		// create node
@@ -343,6 +343,7 @@ func ClusterCreate(ctx context.Context, runtime k3drt.Runtime, cluster *k3d.Clus
 			Role:    k3d.LoadBalancerRole,
 			Labels:  k3d.DefaultObjectLabels, // TODO: createLoadBalancer: add more expressive labels
 			Network: cluster.Network.Name,
+			Restart: true,
 		}
 		cluster.Nodes = append(cluster.Nodes, lbNode) // append lbNode to list of cluster nodes, so it will be considered during rollback
 		log.Infof("Creating LoadBalancer '%s'", lbNode.Name)
diff --git a/pkg/cluster/node.go b/pkg/cluster/node.go
index 24ec2550..230897e4 100644
--- a/pkg/cluster/node.go
+++ b/pkg/cluster/node.go
@@ -324,7 +324,7 @@ func NodeGet(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node) (*k3
 	return node, nil
 }
 
-// NodeWaitForLogMessage follows the logs of a node container and returns if it finds a specific line in there (or timeout is reached)
+//NodeWaitForLogMessage follows the logs of a node container and returns if it finds a specific line in there (or timeout is reached)
 func NodeWaitForLogMessage(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node, message string, since time.Time) error {
 	for {
 		select {
@@ -353,8 +353,15 @@ func NodeWaitForLogMessage(ctx context.Context, runtime runtimes.Runtime, node *
 		if nRead > 0 && strings.Contains(output, message) {
 			break
 		}
+
+		// check if the container is restarting
+		running, status, _ := runtime.GetNodeStatus(ctx, node)
+		if running && status == k3d.NodeStatusRestarting {
+			return fmt.Errorf("Node %s is restarting, early exit to avoid crash loop", node.Name)
+		}
+
+		time.Sleep(500 * time.Millisecond) // wait for half a second to avoid overloading docker (error `socket: too many open files`)
 	}
-	time.Sleep(500 * time.Millisecond) // wait for half a second to avoid overloading docker (error `socket: too many open files`)
 	log.Debugf("Finished waiting for log message '%s' from node '%s'", message, node.Name)
 	return nil
 }
diff --git a/pkg/runtimes/containerd/node.go b/pkg/runtimes/containerd/node.go
index a6fd19a7..366a64b8 100644
--- a/pkg/runtimes/containerd/node.go
+++ b/pkg/runtimes/containerd/node.go
@@ -118,6 +118,11 @@ func (d Containerd) GetNode(ctx context.Context, node *k3d.Node) (*k3d.Node, err
 	return nil, nil
 }
 
+// GetNodeStatus returns the status of a node (Running, Started, etc.)
+func (d Containerd) GetNodeStatus(ctx context.Context, node *k3d.Node) (bool, string, error) {
+	return true, "", nil
+}
+
 // GetNodeLogs returns the logs from a given node
 func (d Containerd) GetNodeLogs(ctx context.Context, node *k3d.Node, since time.Time) (io.ReadCloser, error) {
 	return nil, nil
diff --git a/pkg/runtimes/runtime.go b/pkg/runtimes/runtime.go
index ebc14674..cc7f3a98 100644
--- a/pkg/runtimes/runtime.go
+++ b/pkg/runtimes/runtime.go
@@ -55,6 +55,7 @@ type Runtime interface {
 	DeleteNode(context.Context, *k3d.Node) error
 	GetNodesByLabel(context.Context, map[string]string) ([]*k3d.Node, error)
 	GetNode(context.Context, *k3d.Node) (*k3d.Node, error)
+	GetNodeStatus(context.Context, *k3d.Node) (bool, string, error)
 	CreateNetworkIfNotPresent(context.Context, string) (string, bool, error) // @return NETWORK_NAME, EXISTS, ERROR
 	GetKubeconfig(context.Context, *k3d.Node) (io.ReadCloser, error)
 	DeleteNetwork(context.Context, string) error
diff --git a/pkg/types/types.go b/pkg/types/types.go
index 7efa6c1e..bc7e3284 100644
--- a/pkg/types/types.go
+++ b/pkg/types/types.go
@@ -54,6 +54,9 @@ var ReadyLogMessageByRole = map[Role]string{
 	LoadBalancerRole: "start worker processes",
 }
 
+// NodeStatusRestarting defines the status string that signals the node container is restarting
+const NodeStatusRestarting = "restarting"
+
 // Role defines a k3d node role
 type Role string
 