Merge pull request #252 from rancher/feature/start-cluster-wait-for-master

[Feature] v3/startCluster: add --wait and --timeout
Thorsten Klein 4 years ago committed by GitHub
commit 637c48e63c
Changed files (lines changed):
  1. cmd/start/startCluster.go (9)
  2. docs/usage/commands.md (2)
  3. pkg/cluster/cluster.go (41)
  4. pkg/cluster/node.go (4)
  5. pkg/runtimes/containerd/node.go (3)
  6. pkg/runtimes/docker/node.go (8)
  7. pkg/runtimes/runtime.go (3)
  8. pkg/types/types.go (6)
  9. tests/test_full_lifecycle.sh (5)

cmd/start/startCluster.go
@@ -22,8 +22,11 @@ THE SOFTWARE.
 package start

 import (
+    "time"
+
     "github.com/rancher/k3d/pkg/cluster"
     "github.com/rancher/k3d/pkg/runtimes"
+    "github.com/rancher/k3d/pkg/types"

     "github.com/spf13/cobra"

     k3d "github.com/rancher/k3d/pkg/types"
@@ -34,6 +37,8 @@ import (
 // NewCmdStartCluster returns a new cobra command
 func NewCmdStartCluster() *cobra.Command {
+    startClusterOpts := types.StartClusterOpts{}
+
     // create new command
     cmd := &cobra.Command{
         Use: "cluster (NAME [NAME...] | --all)",
@@ -45,7 +50,7 @@ func NewCmdStartCluster() *cobra.Command {
                 log.Infoln("No clusters found")
             } else {
                 for _, c := range clusters {
-                    if err := cluster.StartCluster(c, runtimes.SelectedRuntime); err != nil {
+                    if err := cluster.StartCluster(cmd.Context(), c, runtimes.SelectedRuntime, startClusterOpts); err != nil {
                         log.Fatalln(err)
                     }
                 }
@@ -55,6 +60,8 @@ func NewCmdStartCluster() *cobra.Command {
     // add flags
     cmd.Flags().BoolP("all", "a", false, "Start all existing clusters")
+    cmd.Flags().BoolVar(&startClusterOpts.WaitForMaster, "wait", false, "Wait for the master(s) (and loadbalancer) to be ready before returning.")
+    cmd.Flags().DurationVar(&startClusterOpts.Timeout, "timeout", 0*time.Second, "Maximum waiting time for '--wait' before canceling/returning.")

     // add subcommands

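For context while reading the hunks above: the two new flags bind directly into the StartClusterOpts struct, so cobra parses a value like 360s into a time.Duration without any manual string handling. Below is a minimal, self-contained sketch of the same binding pattern; the command and struct names are hypothetical stand-ins, not the PR's code.

    package main

    import (
        "fmt"
        "time"

        "github.com/spf13/cobra"
    )

    // startOpts is a hypothetical stand-in for k3d's types.StartClusterOpts.
    type startOpts struct {
        WaitForMaster bool
        Timeout       time.Duration
    }

    func main() {
        opts := startOpts{}
        cmd := &cobra.Command{
            Use: "start",
            Run: func(cmd *cobra.Command, args []string) {
                // cobra has already parsed e.g. "--timeout 360s" into a time.Duration here
                fmt.Printf("wait=%v timeout=%v\n", opts.WaitForMaster, opts.Timeout)
            },
        }
        cmd.Flags().BoolVar(&opts.WaitForMaster, "wait", false, "wait for readiness before returning")
        cmd.Flags().DurationVar(&opts.Timeout, "timeout", 0*time.Second, "maximum waiting time for --wait")
        if err := cmd.Execute(); err != nil {
            fmt.Println(err)
        }
    }

Binding the flags into one struct also keeps the options easy to hand down to pkg/cluster as a single value instead of re-reading individual flags inside the business logic.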
docs/usage/commands.md
@@ -33,6 +33,8 @@ k3d
   start
     cluster CLUSTERNAME     # start a (stopped) cluster
       -a, --all             # start all clusters
+      --wait                # wait for all masters and master-loadbalancer to be up before returning
+      --timeout             # maximum waiting time for '--wait' before canceling/returning
     node NODENAME           # start a (stopped) node
   stop
     cluster CLUSTERNAME     # stop a cluster

pkg/cluster/cluster.go
@@ -30,6 +30,7 @@ import (
     "time"

     k3drt "github.com/rancher/k3d/pkg/runtimes"
+    "github.com/rancher/k3d/pkg/types"
     k3d "github.com/rancher/k3d/pkg/types"
     "github.com/rancher/k3d/pkg/util"
     log "github.com/sirupsen/logrus"
@@ -191,7 +192,7 @@ func CreateCluster(ctx context.Context, cluster *k3d.Cluster, runtime k3drt.Runt
         default:
         }
         log.Debugln("Waiting for initializing master node...")
-        logreader, err := runtime.GetNodeLogs(cluster.InitNode)
+        logreader, err := runtime.GetNodeLogs(cluster.InitNode, time.Time{})
         if err != nil {
             if logreader != nil {
                 logreader.Close()
@@ -253,7 +254,7 @@ func CreateCluster(ctx context.Context, cluster *k3d.Cluster, runtime k3drt.Runt
             // TODO: avoid `level=fatal msg="starting kubernetes: preparing server: post join: a configuration change is already in progress (5)"`
             // ... by scanning for this line in logs and restarting the container in case it appears
             log.Debugf("Starting to wait for master node '%s'", masterNode.Name)
-            return WaitForNodeLogMessage(ctx, runtime, masterNode, "Wrote kubeconfig")
+            return WaitForNodeLogMessage(ctx, runtime, masterNode, "Wrote kubeconfig", time.Time{})
         })
     }
 }
@@ -471,9 +472,20 @@ func generateNodeName(cluster string, role k3d.Role, suffix int) string {
 }

 // StartCluster starts a whole cluster (i.e. all nodes of the cluster)
-func StartCluster(cluster *k3d.Cluster, runtime k3drt.Runtime) error {
+func StartCluster(ctx context.Context, cluster *k3d.Cluster, runtime k3drt.Runtime, startClusterOpts types.StartClusterOpts) error {
     log.Infof("Starting cluster '%s'", cluster.Name)

+    start := time.Now()
+
+    if startClusterOpts.Timeout > 0*time.Second {
+        var cancel context.CancelFunc
+        ctx, cancel = context.WithTimeout(ctx, startClusterOpts.Timeout)
+        defer cancel()
+    }
+
+    // vars to support waiting for master nodes to be ready
+    waitForMasterWaitgroup, ctx := errgroup.WithContext(ctx)
+
     failed := 0
     var masterlb *k3d.Node
     for _, node := range cluster.Nodes {
@@ -490,6 +502,17 @@ func StartCluster(cluster *k3d.Cluster, runtime k3drt.Runtime) error {
             failed++
             continue
         }
+
+        // asynchronously wait for this master node to be ready (by checking the logs for a specific log message)
+        if node.Role == k3d.MasterRole && startClusterOpts.WaitForMaster {
+            masterNode := node
+            waitForMasterWaitgroup.Go(func() error {
+                // TODO: avoid `level=fatal msg="starting kubernetes: preparing server: post join: a configuration change is already in progress (5)"`
+                // ... by scanning for this line in logs and restarting the container in case it appears
+                log.Debugf("Starting to wait for master node '%s'", masterNode.Name)
+                return WaitForNodeLogMessage(ctx, runtime, masterNode, "Wrote kubeconfig", start)
+            })
+        }
     }

     // start masterlb
@@ -499,6 +522,18 @@ func StartCluster(cluster *k3d.Cluster, runtime k3drt.Runtime) error {
             log.Warningf("Failed to start masterlb '%s': Try to start it manually", masterlb.Name)
             failed++
         }
+        waitForMasterWaitgroup.Go(func() error {
+            // TODO: avoid `level=fatal msg="starting kubernetes: preparing server: post join: a configuration change is already in progress (5)"`
+            // ... by scanning for this line in logs and restarting the container in case it appears
+            log.Debugf("Starting to wait for loadbalancer node '%s'", masterlb.Name)
+            return WaitForNodeLogMessage(ctx, runtime, masterlb, "start worker processes", start)
+        })
     }

+    if err := waitForMasterWaitgroup.Wait(); err != nil {
+        log.Errorln("Failed to bring up all nodes in time. Check the logs:")
+        log.Errorln(">>> ", err)
+        return fmt.Errorf("Failed to bring up cluster")
+    }
+
     if failed > 0 {

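The heart of the change is the pattern visible above: context.WithTimeout bounds the whole start operation, and an errgroup runs one waiter per master (plus the loadbalancer); the first failure or the deadline cancels the shared context, which unblocks every other waiter. A stripped-down sketch of that pattern, with a hypothetical waitForReady standing in for WaitForNodeLogMessage:

    package main

    import (
        "context"
        "fmt"
        "time"

        "golang.org/x/sync/errgroup"
    )

    // waitForReady is a hypothetical stand-in for WaitForNodeLogMessage.
    func waitForReady(ctx context.Context, name string) error {
        select {
        case <-time.After(100 * time.Millisecond): // pretend the node became ready
            return nil
        case <-ctx.Done():
            return ctx.Err() // deadline hit, or a sibling waiter failed
        }
    }

    func main() {
        // bound the whole operation, as StartCluster does when Timeout > 0
        ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
        defer cancel()

        g, ctx := errgroup.WithContext(ctx)
        for _, n := range []string{"master-0", "master-1", "masterlb"} {
            n := n // copy the loop variable for the closure
            g.Go(func() error { return waitForReady(ctx, n) })
        }
        if err := g.Wait(); err != nil {
            fmt.Println("failed to bring up all nodes in time:", err)
            return
        }
        fmt.Println("all nodes ready")
    }

Note the n := n copy inside the loop: the PR does the same with masterNode := node, since a closure passed to Go() would otherwise capture the loop variable itself rather than the element for the current iteration.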
pkg/cluster/node.go
@@ -233,7 +233,7 @@ func GetNode(node *k3d.Node, runtime runtimes.Runtime) (*k3d.Node, error) {
 }

 // WaitForNodeLogMessage follows the logs of a node container and returns if it finds a specific line in there (or timeout is reached)
-func WaitForNodeLogMessage(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node, message string) error {
+func WaitForNodeLogMessage(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node, message string, since time.Time) error {
     for {
         select {
         case <-ctx.Done():
@@ -242,7 +242,7 @@ func WaitForNodeLogMessage(ctx context.Context, runtime runtimes.Runtime, node *
         }

         // read the logs
-        out, err := runtime.GetNodeLogs(node)
+        out, err := runtime.GetNodeLogs(node, since)
         if err != nil {
             if out != nil {
                 out.Close()

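Threading since through to GetNodeLogs matters on restart: without it, a "Wrote kubeconfig" line left over from the cluster's first boot would satisfy the wait immediately, even if the node isn't ready yet. A rough, self-contained sketch of this kind of polling loop; getLogs is a hypothetical stand-in for runtime.GetNodeLogs(node, since):

    package main

    import (
        "bytes"
        "context"
        "fmt"
        "io"
        "strings"
        "time"
    )

    // waitForLogMessage polls a log source until message shows up or ctx expires.
    func waitForLogMessage(ctx context.Context, getLogs func(since time.Time) (io.ReadCloser, error), message string, since time.Time) error {
        for {
            select {
            case <-ctx.Done():
                return fmt.Errorf("context canceled while waiting for log message '%s'", message)
            default:
            }

            out, err := getLogs(since)
            if err != nil {
                if out != nil {
                    out.Close()
                }
                return err
            }

            buf := new(bytes.Buffer)
            if _, err := buf.ReadFrom(out); err != nil {
                out.Close()
                return err
            }
            out.Close()

            if strings.Contains(buf.String(), message) {
                return nil // found the marker line
            }
            time.Sleep(500 * time.Millisecond) // back off before re-reading the logs
        }
    }

    func main() {
        ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
        defer cancel()

        start := time.Now()
        fakeLogs := func(since time.Time) (io.ReadCloser, error) {
            return io.NopCloser(strings.NewReader("...\nWrote kubeconfig\n")), nil
        }
        fmt.Println(waitForLogMessage(ctx, fakeLogs, "Wrote kubeconfig", start)) // <nil>
    }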
pkg/runtimes/containerd/node.go
@@ -25,6 +25,7 @@ package containerd
 import (
     "context"
     "io"
+    "time"

     "github.com/containerd/containerd"
     "github.com/containerd/containerd/containers"
@@ -119,7 +120,7 @@ func (d Containerd) GetNode(node *k3d.Node) (*k3d.Node, error) {
 }

 // GetNodeLogs returns the logs from a given node
-func (d Containerd) GetNodeLogs(node *k3d.Node) (io.ReadCloser, error) {
+func (d Containerd) GetNodeLogs(node *k3d.Node, since time.Time) (io.ReadCloser, error) {
     return nil, nil
 }

pkg/runtimes/docker/node.go
@@ -218,7 +218,7 @@ func (d Docker) GetNode(node *k3d.Node) (*k3d.Node, error) {
 }

 // GetNodeLogs returns the logs from a given node
-func (d Docker) GetNodeLogs(node *k3d.Node) (io.ReadCloser, error) {
+func (d Docker) GetNodeLogs(node *k3d.Node, since time.Time) (io.ReadCloser, error) {
     // get the container for the given node
     container, err := getNodeContainer(node)
     if err != nil {
@@ -244,7 +244,11 @@ func (d Docker) GetNodeLogs(node *k3d.Node) (io.ReadCloser, error) {
         return nil, fmt.Errorf("Node '%s' (container '%s') not running", node.Name, containerInspectResponse.ID)
     }

-    logreader, err := docker.ContainerLogs(ctx, container.ID, types.ContainerLogsOptions{ShowStdout: true, ShowStderr: true})
+    sinceStr := ""
+    if !since.IsZero() {
+        sinceStr = since.Format("2006-01-02T15:04:05")
+    }
+    logreader, err := docker.ContainerLogs(ctx, container.ID, types.ContainerLogsOptions{ShowStdout: true, ShowStderr: true, Since: sinceStr})
     if err != nil {
         log.Errorf("Failed to get logs from node '%s' (container '%s')", node.Name, container.ID)
         return nil, err

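The Docker SDK's types.ContainerLogsOptions.Since field is a plain string, which is why the diff formats the time.Time before passing it. A minimal sketch of that call in isolation, assuming a local Docker daemon and a placeholder CONTAINER_ID environment variable (illustrative, not the PR's code):

    package main

    import (
        "context"
        "io"
        "os"
        "time"

        "github.com/docker/docker/api/types"
        "github.com/docker/docker/client"
    )

    func main() {
        cli, err := client.NewClientWithOpts(client.FromEnv)
        if err != nil {
            panic(err)
        }

        // ask only for log lines written in the last 10 minutes
        since := time.Now().Add(-10 * time.Minute).Format("2006-01-02T15:04:05")

        logs, err := cli.ContainerLogs(context.Background(), os.Getenv("CONTAINER_ID"),
            types.ContainerLogsOptions{ShowStdout: true, ShowStderr: true, Since: since})
        if err != nil {
            panic(err)
        }
        defer logs.Close()

        // the stream multiplexes stdout/stderr; a raw copy is fine for a sketch
        io.Copy(os.Stdout, logs)
    }

One caveat worth noting about the diff's layout string: 2006-01-02T15:04:05 carries no timezone, so the daemon interprets the value in local time; formatting with time.RFC3339 would be unambiguous.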
pkg/runtimes/runtime.go
@@ -24,6 +24,7 @@ package runtimes
 import (
     "fmt"
     "io"
+    "time"

     "github.com/rancher/k3d/pkg/runtimes/containerd"
     "github.com/rancher/k3d/pkg/runtimes/docker"
@@ -56,7 +57,7 @@ type Runtime interface {
     GetRuntimePath() string // returns e.g. '/var/run/docker.sock' for a default docker setup
     ExecInNode(*k3d.Node, []string) error
     // DeleteContainer() error
-    GetNodeLogs(*k3d.Node) (io.ReadCloser, error)
+    GetNodeLogs(*k3d.Node, time.Time) (io.ReadCloser, error)
 }

 // GetRuntime checks, if a given name is represented by an implemented k3d runtime and returns it

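Because GetNodeLogs is part of the Runtime interface, the Docker implementation and the containerd stub both have to pick up the extra time.Time parameter in the same commit, or the package stops compiling. A side benefit is that the signature stays easy to fake in tests; a hypothetical test double could look like this (the *k3d.Node parameter is elided for brevity):

    package main

    import (
        "fmt"
        "io"
        "strings"
        "time"
    )

    // logSource captures just the changed slice of the Runtime interface
    // (purely illustrative, not the k3d API).
    type logSource interface {
        GetNodeLogs(since time.Time) (io.ReadCloser, error)
    }

    // fakeRuntime is a hypothetical test double for log-waiting code.
    type fakeRuntime struct{}

    func (f fakeRuntime) GetNodeLogs(since time.Time) (io.ReadCloser, error) {
        return io.NopCloser(strings.NewReader("Wrote kubeconfig\n")), nil
    }

    func main() {
        var src logSource = fakeRuntime{}
        r, _ := src.GetNodeLogs(time.Time{}) // the zero time means "from the beginning"
        defer r.Close()

        var sb strings.Builder
        io.Copy(&sb, r)
        fmt.Print(sb.String())
    }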
pkg/types/types.go
@@ -106,6 +106,12 @@ type CreateClusterOpts struct {
     K3sAgentArgs []string
 }

+// StartClusterOpts describes a set of options one can set when (re-)starting a cluster
+type StartClusterOpts struct {
+    WaitForMaster bool
+    Timeout       time.Duration
+}
+
 // ClusterNetwork describes a network which a cluster is running in
 type ClusterNetwork struct {
     Name string `yaml:"name" json:"name,omitempty"`

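The zero values of the new struct line up with the flag defaults: WaitForMaster false means StartCluster returns as soon as the containers are started, and a zero Timeout means no deadline is applied, since StartCluster only wraps the context when Timeout > 0. A small illustrative snippet, mirroring the type rather than importing it:

    package main

    import (
        "fmt"
        "time"
    )

    // startClusterOpts mirrors the new k3d type for illustration only.
    type startClusterOpts struct {
        WaitForMaster bool
        Timeout       time.Duration
    }

    func main() {
        // the zero value matches the flag defaults: don't wait, no deadline
        var opts startClusterOpts
        fmt.Println(opts.WaitForMaster, opts.Timeout) // false 0s

        // equivalent of `k3d start cluster mycluster --wait --timeout 2m`
        opts = startClusterOpts{WaitForMaster: true, Timeout: 2 * time.Minute}
        fmt.Println(opts.Timeout > 0*time.Second) // true -> StartCluster adds a deadline
    }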
tests/test_full_lifecycle.sh
@@ -30,10 +30,7 @@ check_clusters "$clustername" && failed "cluster was not stopped, since we still
 # 3. start the cluster
 info "Starting cluster..."
-$EXE start cluster "$clustername"
-info "Sleeping for 5 seconds to give the cluster enough time to get ready..."
-sleep 5
+$EXE start cluster "$clustername" --wait --timeout 360s || failed "cluster didn't come back in time"

 info "Checking that we have access to the cluster..."
 check_clusters "$clustername" || failed "error checking cluster"
