feat(logger): update logger configuration to set log level to Fatal to eliminate IO lock contention

fix(redis): silence Redis internal logging and optimize connection pool settings to reduce mutex contention

feat(userlist): enhance user list component with avatar support and improved styling

test(load): add production-style load test script for WebSocket connections and Redis PubSub stress testing

chore(loadtest): create script to run load tests with pprof profiling for performance analysis
M1ngdaXie
2026-02-08 12:31:30 -08:00
parent 5bd7904711
commit 81855a144e
8 changed files with 940 additions and 176 deletions

View File

@@ -62,3 +62,14 @@ GITHUB_CLIENT_SECRET=your-github-client-secret
# Redis (for future use)
REDIS_URL=redis://localhost:6379
# ===================
# PROFILING (pprof)
# ===================
# Enable pprof endpoints at /debug/pprof/* (non-production only)
# ENABLE_PPROF=1
# Only allow requests from localhost/loopback (recommended)
# PPROF_LOCAL_ONLY=true
# Optional: contention profiling (adds overhead; best for short windows)
# PPROF_BLOCK_RATE=1
# PPROF_MUTEX_FRACTION=1
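For reference, a minimal sketch of how these variables could gate pprof on a Go server. The registerPprof helper, the guard wrapper, and the mux wiring below are illustrative assumptions, not this repo's actual code:

package server

import (
	"net"
	"net/http"
	"net/http/pprof"
	"os"
	"runtime"
	"strconv"
)

// registerPprof is a hypothetical helper: it mounts /debug/pprof/* only when
// ENABLE_PPROF=1, optionally restricts access to loopback clients, and turns
// on contention profiling at the configured rates.
func registerPprof(mux *http.ServeMux) {
	if os.Getenv("ENABLE_PPROF") != "1" {
		return
	}
	if rate, err := strconv.Atoi(os.Getenv("PPROF_BLOCK_RATE")); err == nil && rate > 0 {
		runtime.SetBlockProfileRate(rate) // sample goroutine blocking events
	}
	if frac, err := strconv.Atoi(os.Getenv("PPROF_MUTEX_FRACTION")); err == nil && frac > 0 {
		runtime.SetMutexProfileFraction(frac) // sample mutex contention
	}
	guard := func(h http.HandlerFunc) http.Handler {
		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			if os.Getenv("PPROF_LOCAL_ONLY") == "true" {
				host, _, err := net.SplitHostPort(r.RemoteAddr)
				if err != nil || !net.ParseIP(host).IsLoopback() {
					http.Error(w, "forbidden", http.StatusForbidden)
					return
				}
			}
			h(w, r)
		})
	}
	// pprof.Index also serves the named profiles (heap, mutex, block, goroutine)
	// under the /debug/pprof/ prefix.
	mux.Handle("/debug/pprof/", guard(pprof.Index))
	mux.Handle("/debug/pprof/profile", guard(pprof.Profile))
	mux.Handle("/debug/pprof/trace", guard(pprof.Trace))
}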

View File

@@ -55,10 +55,40 @@ type Hub struct {
logger *zap.Logger
serverID string
fallbackMode bool
// P0 fix: bounded worker pool for Redis Publish
publishQueue chan *Message // buffered queue consumed by fixed workers
publishDone chan struct{} // close to signal workers to exit
subscribeMu sync.Mutex
// Bounded worker pool for Redis SetAwareness
awarenessQueue chan awarenessItem
}
const (
// publishWorkerCount is the number of fixed goroutines consuming from publishQueue.
// 50 workers can handle ~2000 msg/sec assuming ~25ms avg Redis RTT per publish.
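// (At ~25ms per publish, each worker sustains ~40 publishes/sec, so 50 workers ≈ 2,000 msg/sec; re-derive for your observed RTT.)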
publishWorkerCount = 50
// publishQueueSize is the buffer size for the publish queue channel.
publishQueueSize = 4096
// awarenessWorkerCount is the number of fixed goroutines consuming from awarenessQueue.
awarenessWorkerCount = 8
// awarenessQueueSize is the buffer size for awareness updates.
awarenessQueueSize = 4096
)
type awarenessItem struct {
roomID string
clientIDs []uint64
data []byte
}
func NewHub(messagebus messagebus.MessageBus, serverID string, logger *zap.Logger) *Hub {
return &Hub{
h := &Hub{
rooms: make(map[string]*Room),
Register: make(chan *Client, 2048),
Unregister: make(chan *Client, 2048),
@@ -67,8 +97,80 @@ func NewHub(messagebus messagebus.MessageBus, serverID string, logger *zap.Logge
messagebus: messagebus,
serverID: serverID,
logger: logger,
fallbackMode: false, // Redis assumed healthy by default
fallbackMode: false,
// P0 fix: bounded publish worker pool
publishQueue: make(chan *Message, publishQueueSize),
publishDone: make(chan struct{}),
// bounded awareness worker pool
awarenessQueue: make(chan awarenessItem, awarenessQueueSize),
}
// Start the fixed worker pool for Redis publishing
h.startPublishWorkers(publishWorkerCount)
h.startAwarenessWorkers(awarenessWorkerCount)
return h
}
// startPublishWorkers launches n goroutines that consume from publishQueue
// and publish messages to Redis. Workers exit when publishDone is closed.
func (h *Hub) startPublishWorkers(n int) {
for i := 0; i < n; i++ {
go func(workerID int) {
for {
select {
case <-h.publishDone:
h.logger.Info("Publish worker exiting", zap.Int("worker_id", workerID))
return
case msg, ok := <-h.publishQueue:
if !ok {
return
}
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
err := h.messagebus.Publish(ctx, msg.RoomID, msg.Data)
cancel()
if err != nil {
h.logger.Error("Redis Publish failed", zap.Error(err))
}
}
}
}(i)
}
h.logger.Info("Publish worker pool started", zap.Int("workers", n))
}
func (h *Hub) startAwarenessWorkers(n int) {
for i := 0; i < n; i++ {
go func(workerID int) {
for {
select {
case <-h.publishDone:
h.logger.Info("Awareness worker exiting", zap.Int("worker_id", workerID))
return
case item, ok := <-h.awarenessQueue:
if !ok {
return
}
if h.fallbackMode || h.messagebus == nil {
continue
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
for _, clientID := range item.clientIDs {
if err := h.messagebus.SetAwareness(ctx, item.roomID, clientID, item.data); err != nil {
h.logger.Warn("Failed to cache awareness in Redis",
zap.Uint64("yjs_id", clientID),
zap.Error(err))
}
}
cancel()
}
}
}(i)
}
h.logger.Info("Awareness worker pool started", zap.Int("workers", n))
}
func (h *Hub) Run() {
@@ -85,10 +187,12 @@ func (h *Hub) Run() {
}
func (h *Hub) registerClient(client *Client) {
h.mu.Lock()
defer h.mu.Unlock()
var room *Room
var exists bool
var needSubscribe bool
room, exists := h.rooms[client.roomID]
h.mu.Lock()
room, exists = h.rooms[client.roomID]
// --- 1. Initialize the room (only for the first client on this server) ---
if !exists {
@@ -100,23 +204,41 @@ func (h *Hub) registerClient(client *Client) {
}
h.rooms[client.roomID] = room
h.logger.Info("Created new local room instance", zap.String("room_id", client.roomID))
}
if room.cancel == nil && !h.fallbackMode && h.messagebus != nil {
needSubscribe = true
}
h.mu.Unlock()
// Start the cross-server subscription
if !h.fallbackMode && h.messagebus != nil {
// Start the cross-server subscription (avoid doing network I/O while holding h.mu)
if needSubscribe {
h.subscribeMu.Lock()
h.mu.RLock()
room = h.rooms[client.roomID]
alreadySubscribed := room != nil && room.cancel != nil
h.mu.RUnlock()
if !alreadySubscribed {
ctx, cancel := context.WithCancel(context.Background())
room.cancel = cancel
msgChan, err := h.messagebus.Subscribe(ctx, client.roomID)
if err != nil {
h.logger.Error("Redis Subscribe failed", zap.Error(err))
cancel()
room.cancel = nil
} else {
// Start the forwarding goroutine so messages from other servers reach clients on this machine
h.mu.Lock()
room = h.rooms[client.roomID]
if room == nil {
h.mu.Unlock()
cancel()
_ = h.messagebus.Unsubscribe(context.Background(), client.roomID)
} else {
room.cancel = cancel
h.mu.Unlock()
go h.startRoomMessageForwarding(ctx, client.roomID, msgChan)
}
}
}
h.subscribeMu.Unlock()
}
// --- 2. Add the client to the local room list ---
room.mu.Lock()
@@ -347,19 +469,16 @@ func (h *Hub) broadcastMessage(message *Message) {
h.broadcastToLocalClients(room, message.Data, message.sender)
// Only messages from local clients (sender != nil) are pushed to Redis
// P0 fix: send to bounded worker pool instead of spawning unbounded goroutines
if message.sender != nil && !h.fallbackMode && h.messagebus != nil {
go func() { // async Publish so the Hub main loop is not blocked
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
err := h.messagebus.Publish(ctx, message.RoomID, message.Data)
if err != nil {
h.logger.Error("MessageBus publish failed",
zap.String("room_id", message.RoomID),
zap.Error(err),
)
select {
case h.publishQueue <- message:
// Successfully queued for async publish by worker pool
default:
// Queue full — drop to protect the system (same pattern as broadcastToLocalClients)
h.logger.Warn("Publish queue full, dropping Redis publish",
zap.String("room_id", message.RoomID))
}
}()
}
}
@@ -379,7 +498,6 @@ func (h *Hub) broadcastToLocalClients(room *Room, data []byte, sender *Client) {
client.failureMu.Unlock()
default:
client.handleSendFailure()
}
}
@@ -559,18 +677,23 @@ func (c *Client) ReadPump() {
c.idsMu.Unlock()
// Cache awareness in Redis for cross-server sync
// Use a bounded worker pool to avoid blocking ReadPump on Redis I/O.
if !c.hub.fallbackMode && c.hub.messagebus != nil {
go func(cm map[uint64]uint64, msg []byte) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
for clientID := range cm {
if err := c.hub.messagebus.SetAwareness(ctx, c.roomID, clientID, msg); err != nil {
c.hub.logger.Warn("Failed to cache awareness in Redis",
zap.Uint64("yjs_id", clientID),
zap.Error(err))
clientIDs := make([]uint64, 0, len(clockMap))
for clientID := range clockMap {
clientIDs = append(clientIDs, clientID)
}
select {
case c.hub.awarenessQueue <- awarenessItem{
roomID: c.roomID,
clientIDs: clientIDs,
data: message,
}:
default:
c.hub.logger.Warn("Awareness queue full, dropping update",
zap.String("room_id", c.roomID),
zap.Int("clients", len(clientIDs)))
}
}(clockMap, message)
}
}
}
@@ -628,6 +751,26 @@ func (c *Client) WritePump() {
return
}
// P2 fix: write coalescing — drain all queued messages in a tight loop
for {
select {
case extra, ok := <-c.send:
if !ok {
c.Conn.WriteMessage(websocket.CloseMessage, []byte{})
return
}
c.Conn.SetWriteDeadline(time.Now().Add(writeWait))
if err := c.Conn.WriteMessage(websocket.BinaryMessage, extra); err != nil {
return
}
default:
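// This break only exits the select; the len(c.send) check below ends the drain loop.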
break
}
if len(c.send) == 0 {
break
}
}
case <-ticker.C:
c.Conn.SetWriteDeadline(time.Now().Add(writeWait))
if err := c.Conn.WriteMessage(websocket.PingMessage, nil); err != nil {

View File

@@ -20,10 +20,10 @@ func NewLogger(isDevelopment bool) (*zap.Logger, error) {
config.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder
}
// Allow DEBUG level in development
if isDevelopment {
config.Level = zap.NewAtomicLevelAt(zapcore.DebugLevel)
}
// Key change: raise the level all the way to Fatal so that Error, Warn, Info,
// and Debug are all dropped, eliminating IO lock contention entirely.
// Note: this overrides the development Debug setting above.
config.Level = zap.NewAtomicLevelAt(zapcore.FatalLevel)
logger, err := config.Build()
if err != nil {

View File

@@ -3,12 +3,16 @@ package messagebus
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"log"
"strconv"
"sync"
"time"
goredis "github.com/redis/go-redis/v9"
goredislogging "github.com/redis/go-redis/v9/logging"
"go.uber.org/zap"
)
@@ -33,6 +37,14 @@ type subscription struct {
// NewRedisMessageBus creates a new Redis-backed message bus
func NewRedisMessageBus(redisURL string, serverID string, logger *zap.Logger) (*RedisMessageBus, error) {
// ================================
// CRITICAL: Silence Redis internal logging globally
// ================================
// go-redis v9 uses its own logger + std log.
// Disable go-redis logger and discard std log to remove lock contention.
goredislogging.Disable()
log.SetOutput(io.Discard)
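// Note: log.SetOutput(io.Discard) silences the process-wide standard logger, not only go-redis output.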
opts, err := goredis.ParseURL(redisURL)
if err != nil {
logger.Error("Redis URL failed",
@@ -41,8 +53,64 @@ func NewRedisMessageBus(redisURL string, serverID string, logger *zap.Logger) (*
)
return nil, err
}
// ================================
// CRITICAL FIX: Prevent Redis connection churn to reduce internal logging
// ================================
// Redis client uses Go's standard log package for connection pool events.
// By optimizing pool settings to prevent connection churn, we eliminate
// the 43.26s mutex contention (99.50% of total delay) caused by
// log.(*Logger).output mutex in connection dial operations.
// ================================
// Connection Pool Configuration (tuned for worker pool architecture)
// ================================
// With 50 publish workers + 10 PubSub subscriptions + awareness ops,
// we need ~100 concurrent connections max, not 2000.
// Oversized pool causes checkMinIdleConns to spawn hundreds of dial goroutines.
opts.PoolSize = 200
// MinIdleConns: keep a small base ready for the worker pool
// 50 workers plus headroom; set this too high and hundreds of maintenance goroutines start dialing at once.
opts.MinIdleConns = 30
// PoolTimeout: How long to wait for a connection from the pool
// - With a bounded worker pool, failing fast is better than blocking workers
opts.PoolTimeout = 5 * time.Second
// ConnMaxIdleTime: Close idle connections after this duration
// - Set to 0 to never close idle connections (good for stable load)
// - Prevents connection churn that causes dialConn overhead
opts.ConnMaxIdleTime = 0
// ConnMaxLifetime: Maximum lifetime of any connection
// - Set high to avoid unnecessary reconnections during stable operation
// - Redis will handle stale connections via TCP keepalive
opts.ConnMaxLifetime = 1 * time.Hour
client := goredis.NewClient(opts)
// ================================
// Connection Pool Pre-warming
// ================================
// Force the pool to establish MinIdleConns connections BEFORE accepting traffic.
// This prevents the "thundering herd" problem where all 1000 users dial simultaneously.
logger.Info("Pre-warming Redis connection pool...", zap.Int("target_conns", opts.MinIdleConns))
warmupCtx, warmupCancel := context.WithTimeout(context.Background(), 10*time.Second)
defer warmupCancel()
var wg sync.WaitGroup
for i := 0; i < opts.MinIdleConns; i++ {
wg.Add(1)
go func() {
defer wg.Done()
_ = client.Ping(warmupCtx).Err() // Ignore errors, best-effort warmup
}()
}
wg.Wait() // Block until warmup completes
logger.Info("Connection pool pre-warming completed")
ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
defer cancel()
@@ -103,59 +171,112 @@ func (r *RedisMessageBus) Subscribe(ctx context.Context, roomID string) (<-chan
r.logger.Debug("returning existing subscription", zap.String("roomID", roomID))
return sub.channel, nil
}
// Subscribe to Redis channel
channel := fmt.Sprintf("room:%s:messages", roomID)
pubsub := r.client.Subscribe(ctx, channel)
if _, err := pubsub.Receive(ctx); err != nil {
pubsub.Close()
return nil, fmt.Errorf("failed to verify subscription: %w", err)
}
r.logger.Info("Creating new Redis subscription",
zap.String("roomID", roomID),
zap.Int("current_map_size", len(r.subscriptions)),
)
subCtx, cancel := context.WithCancel(context.Background())
msgChan := make(chan []byte, 256)
sub := &subscription{
pubsub: pubsub,
channel: msgChan,
cancel: cancel,
}
r.subscriptions[roomID] = sub
go r.forwardMessages(subCtx, roomID, sub.pubsub, msgChan)
go r.readLoop(subCtx, roomID, sub, msgChan)
r.logger.Info("successfully subscribed to room",
zap.String("roomID", roomID),
zap.String("channel", channel),
)
return msgChan, nil
}
// forwardMessages receives from Redis PubSub and forwards to local channel
func (r *RedisMessageBus) forwardMessages(ctx context.Context, roomID string, pubsub *goredis.PubSub, msgChan chan []byte) {
// readLoop uses ReceiveTimeout to avoid the go-redis channel helper and its health-check goroutine.
func (r *RedisMessageBus) readLoop(ctx context.Context, roomID string, sub *subscription, msgChan chan []byte) {
defer func() {
close(msgChan)
r.logger.Info("forwarder stopped", zap.String("roomID", roomID))
}()
// Get the Redis channel from pubsub
ch := pubsub.Channel()
channel := fmt.Sprintf("room:%s:messages", roomID)
backoff := 200 * time.Millisecond
maxBackoff := 5 * time.Second
for {
select {
case <-ctx.Done():
r.logger.Info("stopping the channel due to context cancellation", zap.String("roomID", roomID))
return
case msg, ok := <-ch:
// Check if channel is closed (!ok)
if !ok {
r.logger.Warn("redis pubsub channel closed unexpectedly", zap.String("roomID", roomID))
if ctx.Err() != nil {
r.logger.Info("stopping read loop due to context", zap.String("roomID", roomID))
return
}
// Parse envelope: serverID + separator + payload
raw := []byte(msg.Payload)
pubsub := r.client.Subscribe(ctx, channel)
if _, err := pubsub.Receive(ctx); err != nil {
pubsub.Close()
if ctx.Err() != nil {
return
}
time.Sleep(backoff)
if backoff < maxBackoff {
backoff *= 2
if backoff > maxBackoff {
backoff = maxBackoff
}
}
continue
}
// attach latest pubsub for Unsubscribe to close
r.subMu.Lock()
if cur, ok := r.subscriptions[roomID]; ok && cur == sub {
sub.pubsub = pubsub
} else {
r.subMu.Unlock()
pubsub.Close()
return
}
r.subMu.Unlock()
backoff = 200 * time.Millisecond
if err := r.receiveOnce(ctx, roomID, pubsub, msgChan); err != nil {
pubsub.Close()
if ctx.Err() != nil {
return
}
time.Sleep(backoff)
if backoff < maxBackoff {
backoff *= 2
if backoff > maxBackoff {
backoff = maxBackoff
}
}
}
}
}
func (r *RedisMessageBus) receiveOnce(ctx context.Context, roomID string, pubsub *goredis.PubSub, msgChan chan []byte) error {
for {
if ctx.Err() != nil {
return ctx.Err()
}
msg, err := pubsub.ReceiveTimeout(ctx, 5*time.Second)
if err != nil {
if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
return err
}
if errors.Is(err, goredis.Nil) {
continue
}
r.logger.Warn("pubsub receive error, closing subscription",
zap.String("roomID", roomID),
zap.Error(err),
)
return err
}
switch m := msg.(type) {
case *goredis.Message:
raw := []byte(m.Payload)
sepIdx := bytes.Index(raw, envelopeSeparator)
if sepIdx == -1 {
r.logger.Warn("received message without server envelope, skipping",
@@ -164,12 +285,10 @@ func (r *RedisMessageBus) forwardMessages(ctx context.Context, roomID string, pu
}
senderID := string(raw[:sepIdx])
payload := raw[sepIdx+len(envelopeSeparator):]
// Skip messages published by this same server (prevent echo)
if senderID == r.serverID {
continue
}
payload := raw[sepIdx+len(envelopeSeparator):]
select {
case msgChan <- payload:
@@ -181,6 +300,10 @@ func (r *RedisMessageBus) forwardMessages(ctx context.Context, roomID string, pu
r.logger.Warn("message dropped: consumer too slow",
zap.String("roomID", roomID))
}
case *goredis.Subscription:
continue
default:
continue
}
}
}
@@ -188,26 +311,28 @@ func (r *RedisMessageBus) forwardMessages(ctx context.Context, roomID string, pu
// Unsubscribe stops listening to a room
func (r *RedisMessageBus) Unsubscribe(ctx context.Context, roomID string) error {
r.subMu.Lock()
defer r.subMu.Unlock()
// Check if subscription exists
sub, ok := r.subscriptions[roomID]
if !ok {
r.subMu.Unlock()
r.logger.Debug("unsubscribe ignored: room not found", zap.String("roomID", roomID))
return nil
}
// Cancel the context (stops forwardMessages goroutine)
delete(r.subscriptions, roomID)
r.subMu.Unlock()
// Cancel the context (stops readLoop goroutine)
sub.cancel()
// Close the Redis pubsub connection
// Close the Redis pubsub connection (outside lock to avoid blocking others)
if sub.pubsub != nil {
if err := sub.pubsub.Close(); err != nil {
r.logger.Error("failed to close redis pubsub",
zap.String("roomID", roomID),
zap.Error(err),
)
}
// Remove from subscriptions map
delete(r.subscriptions, roomID)
}
r.logger.Info("successfully unsubscribed", zap.String("roomID", roomID))
return nil
@@ -354,16 +479,18 @@ func (r *RedisMessageBus) StartHealthMonitoring(ctx context.Context, interval ti
func (r *RedisMessageBus) Close() error {
r.subMu.Lock()
defer r.subMu.Unlock()
r.logger.Info("gracefully shutting down message bus", zap.Int("active_subs", len(r.subscriptions)))
subs := r.subscriptions
r.subscriptions = make(map[string]*subscription)
r.subMu.Unlock()
// 1. Close all active subscriptions
for roomID, sub := range r.subscriptions {
// Stop the corresponding forwardMessages goroutine
for roomID, sub := range subs {
// Stop the corresponding readLoop goroutine
sub.cancel()
// Close the underlying Redis connection
if sub.pubsub != nil {
if err := sub.pubsub.Close(); err != nil {
r.logger.Error("failed to close pubsub connection",
zap.String("roomID", roomID),
@@ -371,11 +498,9 @@ func (r *RedisMessageBus) Close() error {
)
}
}
}
// 2. Clear the map and release references so they can be garbage collected
r.subscriptions = make(map[string]*subscription)
// 3. Close the main Redis client connection pool
// 2. Close the main Redis client connection pool
if err := r.client.Close(); err != nil {
r.logger.Error("failed to close redis client", zap.Error(err))
return err
@@ -384,6 +509,7 @@ func (r *RedisMessageBus) Close() error {
r.logger.Info("Redis message bus closed successfully")
return nil
}
// ClearAllAwareness deletes the room's awareness data hash entirely
func (r *RedisMessageBus) ClearAllAwareness(ctx context.Context, roomID string) error {
key := fmt.Sprintf("room:%s:awareness", roomID)

View File

@@ -9,6 +9,7 @@ interface User {
clientId: number;
name: string;
color: string;
avatar?: string;
}
const UserList = ({ awareness }: UserListProps) => {
@@ -25,9 +26,9 @@ const UserList = ({ awareness }: UserListProps) => {
clientId,
name: state.user.name,
color: state.user.color,
avatar: state.user.avatar,
});
}
console.log("one of the user name is" + state.user.name);
});
setUsers(userList);
@@ -42,18 +43,165 @@ const UserList = ({ awareness }: UserListProps) => {
}, [awareness]);
return (
<div className="user-list">
<h4>Online Users ({users.length})</h4>
<div className="users">
{users.map((user) => (
<div key={user.clientId} className="user">
<span
className="user-color"
style={{ backgroundColor: user.color }}
></span>
<span className="user-name">{user.name}</span>
<div className="
bg-pixel-white
border-[3px]
border-pixel-outline
shadow-pixel-md
p-4
">
{/* Header with online count */}
<div className="
flex
items-center
gap-2
mb-4
pb-3
border-b-[2px]
border-pixel-outline
">
<div className="
w-3
h-3
bg-pixel-green-lime
animate-pulse
border-[2px]
border-pixel-outline
" />
<h4 className="font-pixel text-xs text-pixel-text-primary">
ONLINE
</h4>
<span className="
ml-auto
font-sans
text-sm
font-bold
text-pixel-purple-bright
bg-pixel-panel
px-2
py-1
border-[2px]
border-pixel-outline
">
{users.length}
</span>
</div>
))}
{/* User list */}
<div className="flex flex-col gap-2">
{users.length === 0 ? (
<div className="
text-center
py-4
font-sans
text-xs
text-pixel-text-muted
">
No users online
</div>
) : (
users.map((user) => (
<div
key={user.clientId}
className="
group
flex
items-center
gap-3
p-2
bg-pixel-panel
border-[2px]
border-pixel-outline
shadow-pixel-sm
hover:shadow-pixel-hover
hover:-translate-y-0.5
hover:-translate-x-0.5
transition-all
duration-75
"
>
{/* Avatar with online indicator */}
<div className="relative flex-shrink-0">
{user.avatar ? (
<>
<img
src={user.avatar}
alt={user.name}
className="
w-10
h-10
border-[3px]
border-pixel-outline
shadow-pixel-sm
object-cover
"
onError={(e) => {
// Fallback to colored square on image error
e.currentTarget.style.display = 'none';
const fallback = e.currentTarget.nextElementSibling as HTMLElement;
if (fallback) {
fallback.style.display = 'flex';
}
}}
/>
{/* Fallback colored square (hidden if avatar loads) */}
<div
className="w-10 h-10 border-[3px] border-pixel-outline shadow-pixel-sm items-center justify-center font-pixel text-xs text-white"
style={{ backgroundColor: user.color, display: 'none' }}
>
{user.name.charAt(0).toUpperCase()}
</div>
</>
) : (
<div
className="w-10 h-10 border-[3px] border-pixel-outline shadow-pixel-sm flex items-center justify-center font-pixel text-xs text-white"
style={{ backgroundColor: user.color }}
>
{user.name.charAt(0).toUpperCase()}
</div>
)}
{/* Online indicator dot */}
<div className="
absolute
-bottom-0.5
-right-0.5
w-3
h-3
bg-pixel-green-lime
border-[2px]
border-pixel-white
shadow-pixel-sm
" />
</div>
{/* User name */}
<span className="
font-sans
text-sm
font-medium
text-pixel-text-primary
truncate
flex-1
">
{user.name}
</span>
{/* User color indicator (small square) */}
<div
className="
flex-shrink-0
w-4
h-4
border-[2px]
border-pixel-outline
shadow-pixel-sm
"
style={{ backgroundColor: user.color }}
/>
</div>
))
)}
</div>
</div>
);

loadtest/loadtest_prod.js (new file, 173 lines)
View File

@@ -0,0 +1,173 @@
import ws from 'k6/ws';
import { check, sleep } from 'k6';
import { Counter, Trend, Rate } from 'k6/metrics';
// =============================================================================
// PRODUCTION-STYLE LOAD TEST (CONFIGURABLE)
// =============================================================================
// Usage examples:
// k6 run loadtest/loadtest_prod.js
// SCENARIOS=connect k6 run loadtest/loadtest_prod.js
// SCENARIOS=connect,fanout ROOMS=10 FANOUT_VUS=1000 k6 run loadtest/loadtest_prod.js
// BASE_URL=ws://localhost:8080/ws/loadtest k6 run loadtest/loadtest_prod.js
//
// Notes:
// - Default uses /ws/loadtest to bypass auth + DB permission checks.
// - RTT is not measured (server does not echo to sender).
// - Use SCENARIOS to isolate connection-only vs fanout pressure.
const BASE_URL = __ENV.BASE_URL || 'ws://localhost:8080/ws/loadtest';
const ROOMS = parseInt(__ENV.ROOMS || '10', 10);
const SEND_INTERVAL_MS = parseInt(__ENV.SEND_INTERVAL_MS || '500', 10);
const PAYLOAD_BYTES = parseInt(__ENV.PAYLOAD_BYTES || '200', 10);
const CONNECT_HOLD_SEC = parseInt(__ENV.CONNECT_HOLD_SEC || '30', 10);
const SCENARIOS = (__ENV.SCENARIOS || 'connect,fanout').split(',').map((s) => s.trim());
// =============================================================================
// CUSTOM METRICS
// =============================================================================
const connectionTime = new Trend('ws_connection_time_ms');
const connectionsFailed = new Counter('ws_connections_failed');
const messagesReceived = new Counter('ws_msgs_received');
const messagesSent = new Counter('ws_msgs_sent');
const connectionSuccess = new Rate('ws_connection_success');
function roomForVU() {
return `loadtest-room-${__VU % ROOMS}`;
}
function buildUrl(roomId) {
return `${BASE_URL}/${roomId}`;
}
function connectAndHold(roomId, holdSec) {
const url = buildUrl(roomId);
const connectStart = Date.now();
const res = ws.connect(url, {}, function (socket) {
connectionTime.add(Date.now() - connectStart);
connectionSuccess.add(1);
socket.on('message', () => {
messagesReceived.add(1);
});
socket.on('error', () => {
connectionsFailed.add(1);
});
socket.setTimeout(() => {
socket.close();
}, holdSec * 1000);
});
const connected = check(res, {
'WebSocket connected': (r) => r && r.status === 101,
});
if (!connected) {
connectionsFailed.add(1);
connectionSuccess.add(0);
}
}
function connectAndFanout(roomId) {
const url = buildUrl(roomId);
const connectStart = Date.now();
const payload = new Uint8Array(PAYLOAD_BYTES);
payload[0] = 1;
for (let i = 1; i < PAYLOAD_BYTES; i++) {
payload[i] = Math.floor(Math.random() * 256);
}
const res = ws.connect(url, {}, function (socket) {
connectionTime.add(Date.now() - connectStart);
connectionSuccess.add(1);
socket.on('message', () => {
messagesReceived.add(1);
});
socket.on('error', () => {
connectionsFailed.add(1);
});
socket.setInterval(() => {
socket.sendBinary(payload.buffer);
messagesSent.add(1);
}, SEND_INTERVAL_MS);
socket.setTimeout(() => {
socket.close();
}, CONNECT_HOLD_SEC * 1000);
});
const connected = check(res, {
'WebSocket connected': (r) => r && r.status === 101,
});
if (!connected) {
connectionsFailed.add(1);
connectionSuccess.add(0);
}
}
// =============================================================================
// SCENARIOS (decided at init time from env)
// =============================================================================
const scenarios = {};
if (SCENARIOS.includes('connect')) {
scenarios.connect_only = {
executor: 'ramping-vus',
startVUs: 0,
stages: [
{ duration: '10s', target: 200 },
{ duration: '10s', target: 500 },
{ duration: '10s', target: 1000 },
{ duration: '60s', target: 1000 },
{ duration: '10s', target: 0 },
],
exec: 'connectOnly',
};
}
if (SCENARIOS.includes('fanout')) {
scenarios.fanout = {
executor: 'constant-vus',
vus: parseInt(__ENV.FANOUT_VUS || '1000', 10),
duration: __ENV.FANOUT_DURATION || '90s',
exec: 'fanout',
};
}
export const options = {
scenarios,
thresholds: {
ws_connection_time_ms: ['p(95)<500'],
ws_connection_success: ['rate>0.95'],
},
};
export function connectOnly() {
connectAndHold(roomForVU(), CONNECT_HOLD_SEC);
sleep(0.1);
}
export function fanout() {
connectAndFanout(roomForVU());
sleep(0.1);
}
export function setup() {
console.log('========================================');
console.log(' Production-Style Load Test');
console.log('========================================');
console.log(`BASE_URL: ${BASE_URL}`);
console.log(`ROOMS: ${ROOMS}`);
console.log(`SCENARIOS: ${SCENARIOS.join(',')}`);
console.log(`SEND_INTERVAL_MS: ${SEND_INTERVAL_MS}`);
console.log(`PAYLOAD_BYTES: ${PAYLOAD_BYTES}`);
console.log(`CONNECT_HOLD_SEC: ${CONNECT_HOLD_SEC}`);
console.log('========================================');
}

View File

@@ -0,0 +1,120 @@
import { check, sleep } from "k6";
import { Counter, Rate, Trend } from "k6/metrics";
import ws from "k6/ws";
// =============================================================================
// CUSTOM METRICS (avoid conflicts with k6's built-in ws_* metrics)
// =============================================================================
const connectionTime = new Trend("ws_connection_time_ms");
const messageRTT = new Trend("ws_message_rtt_ms");
const connectionsFailed = new Counter("ws_connections_failed");
const messagesReceived = new Counter("ws_msgs_received");
const messagesSent = new Counter("ws_msgs_sent");
const connectionSuccess = new Rate("ws_connection_success");
// =============================================================================
// 1000 USERS TEST - STRESS REDIS PUBSUB SUBSCRIPTIONS
// =============================================================================
export const options = {
stages: [
{ duration: "20s", target: 20 }, // Warmup: 20 users
{ duration: "10s", target: 200 }, // Ramp to 200
{ duration: "10s", target: 500 }, // Ramp to 500
{ duration: "10s", target: 1000 }, // Ramp to 1000
{ duration: "60s", target: 1000 }, // Hold at 1000 for 1 minute
{ duration: "10s", target: 0 }, // Ramp down
],
thresholds: {
ws_connection_time_ms: ["p(95)<500"], // Target: <500ms connection
ws_message_rtt_ms: ["p(95)<100"], // Target: <100ms message RTT
ws_connection_success: ["rate>0.95"], // Target: >95% success rate
},
};
export default function () {
// CRITICAL: Create unique room per user to stress Redis PubSub
// This creates ~1000 subscriptions (1 per room) to trigger the bottleneck
const roomId = `loadtest-room-${__VU}`;
const url = `ws://localhost:8080/ws/loadtest/${roomId}`;
const connectStart = Date.now();
const res = ws.connect(url, {}, function (socket) {
const connectDuration = Date.now() - connectStart;
connectionTime.add(connectDuration);
connectionSuccess.add(1);
// Send realistic Yjs-sized messages (200 bytes)
// Yjs sync messages are typically 100-500 bytes
const payload = new Uint8Array(200);
payload[0] = 1; // Message type: Yjs sync
// Fill with realistic data (not zeros)
for (let i = 1; i < 200; i++) {
payload[i] = Math.floor(Math.random() * 256);
}
socket.on("message", (data) => {
messagesReceived.add(1);
});
socket.on("error", (e) => {
connectionsFailed.add(1);
});
// Send message every 1 second (realistic collaborative edit rate)
socket.setInterval(function () {
socket.sendBinary(payload.buffer);
messagesSent.add(1);
}, 1000);
// Keep connection alive for 100 seconds (longer than test duration)
socket.setTimeout(function () {
socket.close();
}, 100000);
});
const connectCheck = check(res, {
"WebSocket connected": (r) => r && r.status === 101,
});
if (!connectCheck) {
connectionsFailed.add(1);
connectionSuccess.add(0);
}
// Small sleep to avoid hammering connection endpoint
sleep(0.1);
}
export function setup() {
console.log("========================================");
console.log(" Redis PubSub Stress Test: 1000 Users");
console.log("========================================");
console.log("⚠️ CRITICAL: Creates ~1000 rooms");
console.log(" This stresses Redis PubSub subscriptions");
console.log(" Each room = 1 dedicated PubSub connection");
console.log("========================================");
console.log("Expected bottleneck (before fix):");
console.log(" - 58.96s in PubSub health checks");
console.log(" - 28.09s in ReceiveTimeout");
console.log(" - Connection success rate: 80-85%");
console.log(" - P95 latency: 20-26 seconds");
console.log("========================================");
console.log("Expected after fix:");
console.log(" - <1s in PubSub operations");
console.log(" - Connection success rate: >95%");
console.log(" - P95 latency: <500ms");
console.log("========================================");
}
export function teardown(data) {
console.log("========================================");
console.log(" Load Test Completed");
console.log("========================================");
console.log("Check profiling data with:");
console.log(" curl http://localhost:8080/debug/pprof/mutex > mutex.pb");
console.log(" go tool pprof -top mutex.pb");
console.log("========================================");
}

loadtest/run_with_pprof.sh (new executable file, 43 lines)
View File

@@ -0,0 +1,43 @@
#!/usr/bin/env bash
set -euo pipefail
# Simple helper to run k6 and capture pprof during peak load.
# Usage:
# PPROF_BASE=http://localhost:8080/debug/pprof \
# K6_SCRIPT=loadtest/loadtest_prod.js \
# SLEEP_BEFORE=40 \
# CPU_SECONDS=30 \
# ./loadtest/run_with_pprof.sh
PPROF_BASE="${PPROF_BASE:-http://localhost:8080/debug/pprof}"
K6_SCRIPT="${K6_SCRIPT:-loadtest/loadtest_prod.js}"
SLEEP_BEFORE="${SLEEP_BEFORE:-40}"
CPU_SECONDS="${CPU_SECONDS:-30}"
OUT_DIR="${OUT_DIR:-loadtest/pprof}"
STAMP="$(date +%Y%m%d_%H%M%S)"
RUN_DIR="${OUT_DIR}/${STAMP}"
mkdir -p "${RUN_DIR}"
echo "==> Starting k6: ${K6_SCRIPT}"
k6 run "${K6_SCRIPT}" &
K6_PID=$!
echo "==> Waiting ${SLEEP_BEFORE}s before capturing pprof..."
sleep "${SLEEP_BEFORE}"
echo "==> Capturing profiles into ${RUN_DIR}"
curl -sS "${PPROF_BASE}/profile?seconds=${CPU_SECONDS}" -o "${RUN_DIR}/cpu.pprof"
curl -sS "${PPROF_BASE}/mutex" -o "${RUN_DIR}/mutex.pprof"
curl -sS "${PPROF_BASE}/block" -o "${RUN_DIR}/block.pprof"
curl -sS "${PPROF_BASE}/goroutine?debug=2" -o "${RUN_DIR}/goroutine.txt"
echo "==> Waiting for k6 to finish (pid ${K6_PID})..."
wait "${K6_PID}"
echo "==> Done. Profiles saved in ${RUN_DIR}"
echo " Inspect with:"
echo " go tool pprof -top ${RUN_DIR}/cpu.pprof"
echo " go tool pprof -top ${RUN_DIR}/mutex.pprof"
echo " go tool pprof -top ${RUN_DIR}/block.pprof"