Initial commit: Open sourcing all of the Maple Open Technologies code.
This commit is contained in:
commit
755d54a99d
2010 changed files with 448675 additions and 0 deletions
411
cloud/maplefile-backend/pkg/leaderelection/README.md
Normal file
411
cloud/maplefile-backend/pkg/leaderelection/README.md
Normal file
|
|
@ -0,0 +1,411 @@
|
|||
# Leader Election Package
|
||||
|
||||
Distributed leader election for MapleFile backend instances using Redis.
|
||||
|
||||
## Overview
|
||||
|
||||
This package provides leader election functionality for multiple backend instances running behind a load balancer. It ensures that only one instance acts as the "leader" at any given time, with automatic failover if the leader crashes.
|
||||
|
||||
## Features
|
||||
|
||||
- ✅ **Redis-based**: Fast, reliable leader election using Redis
|
||||
- ✅ **Automatic Failover**: New leader elected automatically if current leader crashes
|
||||
- ✅ **Heartbeat Mechanism**: Leader maintains lock with periodic renewals
|
||||
- ✅ **Callbacks**: Execute custom code when becoming/losing leadership
|
||||
- ✅ **Graceful Shutdown**: Clean leadership handoff on shutdown
|
||||
- ✅ **Thread-Safe**: Safe for concurrent use
|
||||
- ✅ **Observable**: Query leader status and information
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Election**: Instances compete to acquire a Redis lock (key)
|
||||
2. **Leadership**: First instance to acquire the lock becomes the leader
|
||||
3. **Heartbeat**: Leader renews the lock every `HeartbeatInterval` (default: 3s)
|
||||
4. **Lock TTL**: Lock expires after `LockTTL` if not renewed (default: 10s)
|
||||
5. **Failover**: If leader crashes, lock expires → followers compete for leadership
|
||||
6. **Re-election**: New leader elected within seconds of previous leader failure
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
||||
│ Instance 1 │ │ Instance 2 │ │ Instance 3 │
|
||||
│ (Leader) │ │ (Follower) │ │ (Follower) │
|
||||
└──────┬──────┘ └──────┬──────┘ └──────┬──────┘
|
||||
│ │ │
|
||||
│ Heartbeat │ Try Acquire │ Try Acquire
|
||||
│ (Renew Lock) │ (Check Lock) │ (Check Lock)
|
||||
│ │ │
|
||||
└───────────────────┴───────────────────┘
|
||||
│
|
||||
┌────▼────┐
|
||||
│ Redis │
|
||||
│ Lock │
|
||||
└─────────┘
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Basic Setup
|
||||
|
||||
```go
|
||||
import (
|
||||
"context"
|
||||
"github.com/redis/go-redis/v9"
|
||||
"go.uber.org/zap"
|
||||
"codeberg.org/mapleopentech/monorepo/cloud/maplefile-backend/pkg/leaderelection"
|
||||
)
|
||||
|
||||
// Create Redis client (you likely already have this)
|
||||
redisClient := redis.NewClient(&redis.Options{
|
||||
Addr: "localhost:6379",
|
||||
})
|
||||
|
||||
// Create logger
|
||||
logger, _ := zap.NewProduction()
|
||||
|
||||
// Create leader election configuration
|
||||
config := leaderelection.DefaultConfig()
|
||||
|
||||
// Create leader election instance
|
||||
election, err := leaderelection.NewRedisLeaderElection(config, redisClient, logger)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
// Start leader election in a goroutine
|
||||
ctx := context.Background()
|
||||
go func() {
|
||||
if err := election.Start(ctx); err != nil {
|
||||
logger.Error("Leader election failed", zap.Error(err))
|
||||
}
|
||||
}()
|
||||
|
||||
// Check if this instance is the leader
|
||||
if election.IsLeader() {
|
||||
logger.Info("I am the leader! 👑")
|
||||
}
|
||||
|
||||
// Graceful shutdown
|
||||
defer election.Stop()
|
||||
```
|
||||
|
||||
### With Callbacks
|
||||
|
||||
```go
|
||||
// Register callback when becoming leader
|
||||
election.OnBecomeLeader(func() {
|
||||
logger.Info("🎉 I became the leader!")
|
||||
|
||||
// Start leader-only tasks
|
||||
go startBackgroundJobs()
|
||||
go startMetricsAggregation()
|
||||
})
|
||||
|
||||
// Register callback when losing leadership
|
||||
election.OnLoseLeadership(func() {
|
||||
logger.Info("😢 I lost leadership")
|
||||
|
||||
// Stop leader-only tasks
|
||||
stopBackgroundJobs()
|
||||
stopMetricsAggregation()
|
||||
})
|
||||
```
|
||||
|
||||
### Integration with Application Startup
|
||||
|
||||
```go
|
||||
// In your main.go or app startup
|
||||
func (app *Application) Start() error {
|
||||
// Start leader election
|
||||
go func() {
|
||||
if err := app.leaderElection.Start(app.ctx); err != nil {
|
||||
app.logger.Error("Leader election error", zap.Error(err))
|
||||
}
|
||||
}()
|
||||
|
||||
// Wait a moment for election to complete
|
||||
time.Sleep(1 * time.Second)
|
||||
|
||||
if app.leaderElection.IsLeader() {
|
||||
app.logger.Info("This instance is the leader")
|
||||
// Start leader-only services
|
||||
} else {
|
||||
app.logger.Info("This instance is a follower")
|
||||
// Start follower-only services (if any)
|
||||
}
|
||||
|
||||
// Start your HTTP server, etc.
|
||||
return app.httpServer.Start()
|
||||
}
|
||||
```
|
||||
|
||||
### Conditional Logic Based on Leadership
|
||||
|
||||
```go
|
||||
// Only leader executes certain tasks
|
||||
func (s *Service) PerformTask() {
|
||||
if s.leaderElection.IsLeader() {
|
||||
// Only leader does this expensive operation
|
||||
s.aggregateMetrics()
|
||||
}
|
||||
}
|
||||
|
||||
// Get information about the current leader
|
||||
func (s *Service) GetLeaderStatus() (*leaderelection.LeaderInfo, error) {
|
||||
info, err := s.leaderElection.GetLeaderInfo()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
fmt.Printf("Leader: %s (%s)\n", info.InstanceID, info.Hostname)
|
||||
fmt.Printf("Started: %s\n", info.StartedAt)
|
||||
fmt.Printf("Last Heartbeat: %s\n", info.LastHeartbeat)
|
||||
|
||||
return info, nil
|
||||
}
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Default Configuration
|
||||
|
||||
```go
|
||||
config := leaderelection.DefaultConfig()
|
||||
// Returns:
|
||||
// {
|
||||
// RedisKeyName: "maplefile:leader:lock",
|
||||
// RedisInfoKeyName: "maplefile:leader:info",
|
||||
// LockTTL: 10 * time.Second,
|
||||
// HeartbeatInterval: 3 * time.Second,
|
||||
// RetryInterval: 2 * time.Second,
|
||||
// }
|
||||
```
|
||||
|
||||
### Custom Configuration
|
||||
|
||||
```go
|
||||
config := &leaderelection.Config{
|
||||
RedisKeyName: "my-app:leader",
|
||||
RedisInfoKeyName: "my-app:leader:info",
|
||||
LockTTL: 30 * time.Second, // Lock expires after 30s
|
||||
HeartbeatInterval: 10 * time.Second, // Renew every 10s
|
||||
RetryInterval: 5 * time.Second, // Check for leadership every 5s
|
||||
InstanceID: "instance-1", // Custom instance ID
|
||||
Hostname: "server-01", // Custom hostname
|
||||
}
|
||||
```
|
||||
|
||||
### Configuration in Application Config
|
||||
|
||||
Add to your `config/config.go`:
|
||||
|
||||
```go
|
||||
type Config struct {
|
||||
// ... existing fields ...
|
||||
|
||||
LeaderElection struct {
|
||||
LockTTL time.Duration `env:"LEADER_ELECTION_LOCK_TTL" envDefault:"10s"`
|
||||
HeartbeatInterval time.Duration `env:"LEADER_ELECTION_HEARTBEAT_INTERVAL" envDefault:"3s"`
|
||||
RetryInterval time.Duration `env:"LEADER_ELECTION_RETRY_INTERVAL" envDefault:"2s"`
|
||||
InstanceID string `env:"LEADER_ELECTION_INSTANCE_ID" envDefault:""`
|
||||
Hostname string `env:"LEADER_ELECTION_HOSTNAME" envDefault:""`
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Use Cases
|
||||
|
||||
### 1. Background Job Processing
|
||||
Only the leader runs scheduled jobs:
|
||||
|
||||
```go
|
||||
election.OnBecomeLeader(func() {
|
||||
go func() {
|
||||
ticker := time.NewTicker(1 * time.Hour)
|
||||
defer ticker.Stop()
|
||||
|
||||
for range ticker.C {
|
||||
if election.IsLeader() {
|
||||
processScheduledJobs()
|
||||
}
|
||||
}
|
||||
}()
|
||||
})
|
||||
```
|
||||
|
||||
### 2. Database Migrations
|
||||
Only the leader runs migrations on startup:
|
||||
|
||||
```go
|
||||
if election.IsLeader() {
|
||||
logger.Info("Leader instance - running database migrations")
|
||||
if err := migrator.Up(); err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
logger.Info("Follower instance - skipping migrations")
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Cache Warming
|
||||
Only the leader pre-loads caches:
|
||||
|
||||
```go
|
||||
election.OnBecomeLeader(func() {
|
||||
logger.Info("Warming caches as leader")
|
||||
warmApplicationCache()
|
||||
})
|
||||
```
|
||||
|
||||
### 4. Metrics Aggregation
|
||||
Only the leader aggregates and sends metrics:
|
||||
|
||||
```go
|
||||
election.OnBecomeLeader(func() {
|
||||
go func() {
|
||||
ticker := time.NewTicker(1 * time.Minute)
|
||||
defer ticker.Stop()
|
||||
|
||||
for range ticker.C {
|
||||
if election.IsLeader() {
|
||||
aggregateAndSendMetrics()
|
||||
}
|
||||
}
|
||||
}()
|
||||
})
|
||||
```
|
||||
|
||||
### 5. Cleanup Tasks
|
||||
Only the leader runs periodic cleanup:
|
||||
|
||||
```go
|
||||
election.OnBecomeLeader(func() {
|
||||
go func() {
|
||||
ticker := time.NewTicker(24 * time.Hour)
|
||||
defer ticker.Stop()
|
||||
|
||||
for range ticker.C {
|
||||
if election.IsLeader() {
|
||||
cleanupOldRecords()
|
||||
purgeExpiredSessions()
|
||||
}
|
||||
}
|
||||
}()
|
||||
})
|
||||
```
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Health Check Endpoint
|
||||
|
||||
```go
|
||||
func (h *HealthHandler) LeaderElectionHealth(w http.ResponseWriter, r *http.Request) {
|
||||
info, err := h.leaderElection.GetLeaderInfo()
|
||||
if err != nil {
|
||||
http.Error(w, "Failed to get leader info", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
response := map[string]interface{}{
|
||||
"is_leader": h.leaderElection.IsLeader(),
|
||||
"instance_id": h.leaderElection.GetInstanceID(),
|
||||
"leader_info": info,
|
||||
}
|
||||
|
||||
json.NewEncoder(w).Encode(response)
|
||||
}
|
||||
```
|
||||
|
||||
### Logging
|
||||
|
||||
The package logs important events:
|
||||
- `🎉 Became the leader!` - When instance becomes leader
|
||||
- `Heartbeat sent` - When leader renews lock (DEBUG level)
|
||||
- `Failed to send heartbeat, lost leadership` - When leader loses lock
|
||||
- `Releasing leadership voluntarily` - On graceful shutdown
|
||||
|
||||
## Testing
|
||||
|
||||
### Local Testing with Multiple Instances
|
||||
|
||||
```bash
|
||||
# Terminal 1
|
||||
LEADER_ELECTION_INSTANCE_ID=instance-1 ./maplefile-backend
|
||||
|
||||
# Terminal 2
|
||||
LEADER_ELECTION_INSTANCE_ID=instance-2 ./maplefile-backend
|
||||
|
||||
# Terminal 3
|
||||
LEADER_ELECTION_INSTANCE_ID=instance-3 ./maplefile-backend
|
||||
```
|
||||
|
||||
### Failover Testing
|
||||
|
||||
1. Start 3 instances
|
||||
2. Check logs - one will become leader
|
||||
3. Kill the leader instance (Ctrl+C)
|
||||
4. Watch logs - another instance becomes leader within seconds
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Always check leadership before expensive operations**
|
||||
```go
|
||||
if election.IsLeader() {
|
||||
// expensive operation
|
||||
}
|
||||
```
|
||||
|
||||
2. **Use callbacks for starting/stopping leader-only services**
|
||||
```go
|
||||
election.OnBecomeLeader(startLeaderServices)
|
||||
election.OnLoseLeadership(stopLeaderServices)
|
||||
```
|
||||
|
||||
3. **Set appropriate timeouts**
|
||||
- `LockTTL` should be at least 3x `HeartbeatInterval` (the defaults follow this: 10s TTL with a 3s heartbeat), so a single missed or delayed heartbeat does not cost leadership
|
||||
- Shorter TTL = faster failover but more Redis traffic
|
||||
- Longer TTL = slower failover but less Redis traffic
|
||||
|
||||
4. **Handle callback panics**
|
||||
- Callbacks run in goroutines and panics are caught
|
||||
- But you should still handle errors gracefully
|
||||
|
||||
5. **Always call Stop() on shutdown**
|
||||
```go
|
||||
defer election.Stop()
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Leader keeps changing
|
||||
- Increase `LockTTL` (network might be slow)
|
||||
- Check Redis connectivity
|
||||
- Check for clock skew between instances
|
||||
|
||||
### No leader elected
|
||||
- Check Redis is running and accessible
|
||||
- Check Redis key permissions
|
||||
- Check logs for errors
|
||||
|
||||
### Leader doesn't release on shutdown
|
||||
- Ensure `Stop()` is called
|
||||
- Check for blocking operations preventing shutdown
|
||||
- TTL will eventually expire the lock
|
||||
|
||||
## Performance
|
||||
|
||||
- **Election time**: < 100ms
|
||||
- **Failover time**: up to `LockTTL` + `RetryInterval` — followers only re-attempt acquisition on their retry interval after the expired lock is released (defaults: ≤ ~12s)
|
||||
- **Redis operations per second**: `1 / HeartbeatInterval` (default: 0.33/s)
|
||||
- **Memory overhead**: Minimal (~1KB per instance)
|
||||
|
||||
## Thread Safety
|
||||
|
||||
All methods are thread-safe and can be called from multiple goroutines:
|
||||
- `IsLeader()`
|
||||
- `GetLeaderID()`
|
||||
- `GetLeaderInfo()`
|
||||
- `OnBecomeLeader()`
|
||||
- `OnLoseLeadership()`
|
||||
- `Stop()`
|
||||
Loading…
Add table
Add a link
Reference in a new issue