diff -Nru golang-github-thejerf-suture-2.0.3/debian/changelog golang-github-thejerf-suture-3.0.0/debian/changelog --- golang-github-thejerf-suture-2.0.3/debian/changelog 2018-07-29 07:52:55.000000000 +0000 +++ golang-github-thejerf-suture-3.0.0/debian/changelog 2018-10-11 03:23:06.000000000 +0000 @@ -1,3 +1,9 @@ +golang-github-thejerf-suture (3.0.0-1) unstable; urgency=medium + + * New upstream version. + + -- Alexandre Viau Wed, 10 Oct 2018 23:23:06 -0400 + golang-github-thejerf-suture (2.0.3-1) unstable; urgency=medium * New upstream version. diff -Nru golang-github-thejerf-suture-2.0.3/messages.go golang-github-thejerf-suture-3.0.0/messages.go --- golang-github-thejerf-suture-2.0.3/messages.go 2018-01-03 15:47:05.000000000 +0000 +++ golang-github-thejerf-suture-3.0.0/messages.go 2018-09-21 18:57:38.000000000 +0000 @@ -46,7 +46,7 @@ } type serviceEnded struct { - id serviceID + id serviceID complete bool } diff -Nru golang-github-thejerf-suture-2.0.3/README.md golang-github-thejerf-suture-3.0.0/README.md --- golang-github-thejerf-suture-2.0.3/README.md 2018-01-03 15:47:05.000000000 +0000 +++ golang-github-thejerf-suture-3.0.0/README.md 2018-09-21 18:57:38.000000000 +0000 @@ -48,21 +48,29 @@ suture uses semantic versioning. -1. 2.0.3 +* 3.0: + * Added a default jitter of up to 50% on the restart intervals. While + this is a backwards-compatible change from a source perspective, this + does represent a non-trivial behavior change. It should generally be a + good thing, but this is released as a major version as a warning. +* 2.0.4 + * Added option PassThroughPanics, to allow panics to propagate up through + the supervisor. +* 2.0.3 * Accepted PR #23, making the logging functions in the supervisor public. * Added a new Supervisor method RemoveAndWait, allowing you to make a best effort way to wait for a service to terminate. * Accepted PR #24, adding an optional IsCompletable interface that Services can implement that indicates they do not need to be restarted upon a normal return. -1. 2.0.2 +* 2.0.2 * Fixed issue #21. gccgo doesn't like `case (<-c)`, with the parentheses. Of course the parens aren't doing anything useful anyhow. No behavior changes. -1. 2.0.1 +* 2.0.1 * __Test code change only__. Addresses the possibility that one of the tests can spuriously fail if they run in a certain order. -1. 2.0.0 +* 2.0.0 * Major version due to change to the signature of the logging methods: A race condition could occur when the Supervisor rendered the service @@ -72,20 +80,20 @@ to the logging methods. * Removal of use of sync/atomic due to possible brokenness in the Debian architecture. -1. 1.1.2 +* 1.1.2 * TravisCI showed that the fix for 1.1.1 induced a deadlock in Go 1.4 and before. * If the supervisor is terminated before a service, the service goroutine could be orphaned trying the shutdown notification to the supervisor. This should no longer occur. -1. 1.1.1 +* 1.1.1 * Per #14, the fix in 1.1.0 did not actually wait for the Supervisor to stop. -1. 1.1.0 +* 1.1.0 * Per #12, Supervisor.stop now tries to wait for its children before returning. A careful reading of the original .Stop() contract says this is the correct behavior. -1. 1.0.1 +* 1.0.1 * Fixed data race on the .state variable. -1. 1.0.0 +* 1.0.0 * Initial release. diff -Nru golang-github-thejerf-suture-2.0.3/supervisor.go golang-github-thejerf-suture-3.0.0/supervisor.go --- golang-github-thejerf-suture-2.0.3/supervisor.go 2018-01-03 15:47:05.000000000 +0000 +++ golang-github-thejerf-suture-3.0.0/supervisor.go 2018-09-21 18:57:38.000000000 +0000 @@ -5,6 +5,7 @@ "fmt" "log" "math" + "math/rand" "runtime" "sync" "time" @@ -61,6 +62,36 @@ name string } +// Jitter returns the sum of the input duration and a random jitter. It is +// compatible with the jitter functions in github.com/lthibault/jitterbug. +type Jitter interface { + Jitter(time.Duration) time.Duration +} + +// NoJitter does not apply any jitter to the input duration +type NoJitter struct{} + +// Jitter leaves the input duration d unchanged. +func (NoJitter) Jitter(d time.Duration) time.Duration { return d } + +// DefaultJitter is the jitter function that is applied when spec.BackoffJitter +// is set to nil. +type DefaultJitter struct { + rand *rand.Rand +} + +// Jitter will jitter the backoff time by uniformly distributing it into +// the range [FailureBackoff, 1.5 * FailureBackoff). +func (dj *DefaultJitter) Jitter(d time.Duration) time.Duration { + // this is only called by the core supervisor loop, so it is + // single-thread safe. + if dj.rand == nil { + dj.rand = rand.New(rand.NewSource(time.Now().UnixNano())) + } + jitter := dj.rand.Float64() / 2 + return d + time.Duration(float64(d)*jitter) +} + /* Supervisor is the core type of the module that represents a Supervisor. @@ -92,11 +123,11 @@ */ type Supervisor struct { Name string - id supervisorID failureDecay float64 failureThreshold float64 failureBackoff time.Duration + backoffJitter Jitter timeout time.Duration log func(string) services map[serviceID]serviceWithName @@ -119,20 +150,26 @@ getAfterChan func(time.Duration) <-chan time.Time sync.Mutex - state uint8 + + // malign leftovers + id supervisorID + state uint8 + recoverPanics bool } // Spec is used to pass arguments to the New function to create a // supervisor. See the New function for full documentation. type Spec struct { - Log func(string) - FailureDecay float64 - FailureThreshold float64 - FailureBackoff time.Duration - Timeout time.Duration - LogBadStop BadStopLogger - LogFailure FailureLogger - LogBackoff BackoffLogger + Log func(string) + FailureDecay float64 + FailureThreshold float64 + FailureBackoff time.Duration + BackoffJitter Jitter + Timeout time.Duration + LogBadStop BadStopLogger + LogFailure FailureLogger + LogBackoff BackoffLogger + PassThroughPanics bool } /* @@ -149,6 +186,7 @@ * FailureThreshold: 5 failures * FailureBackoff: 15 seconds * Timeout: 10 seconds + * BackoffJitter: DefaultJitter The Log function will be called when errors occur. Suture will log the following: @@ -176,6 +214,9 @@ Timeout is how long Suture will wait for a service to properly terminate. +The PassThroughPanics options can be set to let panics in services propagate +and crash the program, should this be desirable. + */ func New(name string, spec Spec) (s *Supervisor) { s = new(Supervisor) @@ -209,11 +250,17 @@ } else { s.failureBackoff = spec.FailureBackoff } + if spec.BackoffJitter == nil { + s.backoffJitter = &DefaultJitter{} + } else { + s.backoffJitter = spec.BackoffJitter + } if spec.Timeout == 0 { s.timeout = time.Second * 10 } else { s.timeout = spec.Timeout } + s.recoverPanics = !spec.PassThroughPanics // overriding these allows for testing the threshold behavior s.getNow = time.Now @@ -441,7 +488,7 @@ // used only by tests panic("Panicking as requested!") } - case _ = <-s.resumeTimer: + case <-s.resumeTimer: // We're resuming normal operation after a pause due to // excessive thrashing // FIXME: Ought to permit some spacing of these functions, rather @@ -490,7 +537,7 @@ s.state = paused s.Unlock() s.LogBackoff(s, true) - s.resumeTimer = s.getAfterChan(s.failureBackoff) + s.resumeTimer = s.getAfterChan(s.backoffJitter.Jitter(s.failureBackoff)) } s.lastFail = now @@ -520,14 +567,16 @@ func (s *Supervisor) runService(service Service, id serviceID) { go func() { - defer func() { - if r := recover(); r != nil { - buf := make([]byte, 65535, 65535) - written := runtime.Stack(buf, false) - buf = buf[:written] - s.fail(id, r, buf) - } - }() + if s.recoverPanics { + defer func() { + if r := recover(); r != nil { + buf := make([]byte, 65535) + written := runtime.Stack(buf, false) + buf = buf[:written] + s.fail(id, r, buf) + } + }() + } service.Serve() @@ -546,10 +595,10 @@ delete(s.services, id) s.servicesShuttingDown[id] = namedService go func() { - successChan := make(chan bool) + successChan := make(chan struct{}) go func() { namedService.Service.Stop() - successChan <- true + close(successChan) if notificationChan != nil { notificationChan <- struct{}{} } @@ -602,7 +651,6 @@ } close(s.liveness) - return } // String implements the fmt.Stringer interface. @@ -614,7 +662,7 @@ select { case s.control <- sm: return true - case _, _ = <-s.liveness: + case <-s.liveness: return false } } @@ -639,7 +687,10 @@ to Stop() it. It will wait up to the given timeout value for the service to terminate. A timeout value of 0 means to wait forever. -If a nil error is returned from this function +If a nil error is returned from this function, then the service was +terminated normally. If either the supervisor terminates or the timeout +passes, ErrTimeout is returned. (If this isn't even the right supervisor +ErrWrongSupervisor is returned.) */ func (s *Supervisor) RemoveAndWait(id ServiceToken, timeout time.Duration) error { sID := supervisorID(id.id >> 32) @@ -659,7 +710,7 @@ sentControl := s.sendControl(removeService{serviceID(id.id & 0xffffffff), notificationC}) - if sentControl == false { + if !sentControl { return ErrTimeout } @@ -671,7 +722,7 @@ // This occurs if the entire supervisor ends without the service // having terminated, and includes the timeout the supervisor // itself waited before closing the liveness channel. - case _, _ = <-s.liveness: + case <-s.liveness: return ErrTimeout // The local timeout. diff -Nru golang-github-thejerf-suture-2.0.3/suture_test.go golang-github-thejerf-suture-3.0.0/suture_test.go --- golang-github-thejerf-suture-2.0.3/suture_test.go 2018-01-03 15:47:05.000000000 +0000 +++ golang-github-thejerf-suture-3.0.0/suture_test.go 2018-09-21 18:57:38.000000000 +0000 @@ -589,6 +589,26 @@ if err != ErrTimeout { t.Fatal("Unexpected result for RemoveAndWait on a stopped service: " + err.Error()) } + + // Abnormal case: The service takes long to terminate, which takes more than the timeout of the spec, but + // if the service eventually terminates, this does not hang RemoveAndWait. + s = NewSimple("main") + s.timeout = time.Millisecond + s.ServeBackground() + service = NewService("A1") + token = s.Add(service) + <-service.started + service.take <- Hang + + go func() { + time.Sleep(10 * time.Millisecond) + service.release <- true + }() + + err = s.RemoveAndWait(token, 0) + if err != nil { + t.Fatal("Unexpected result of RemoveAndWait: " + err.Error()) + } } func TestCoverage(t *testing.T) {