最后一点信息,然后我想我要下线几个小时了。
root@discourse_app:/# ps aux --sort=-%mem | head -20
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
postgres 4398 17.0 22.0 8216352 6797764 ? Ss 11:14 9:53 postgres: 15/main: discourse discourse [local] UPDATE
postgres 2729 15.3 21.1 8123888 6517308 ? Ss 11:13 9:03 postgres: 15/main: discourse discourse [local] UPDATE
postgres 2501 15.9 19.9 8079700 6148812 ? Ds 11:13 9:25 postgres: 15/main: discourse discourse [local] UPDATE
postgres 22777 16.9 19.5 8084888 6012052 ? Ds 11:42 4:58 postgres: 15/main: discourse discourse [local] UPDATE
postgres 2753 28.5 11.3 8055000 3482260 ? Ss 11:13 16:50 postgres: 15/main: discourse discourse [local] idle
postgres 25715 2.9 6.9 7884064 2135536 ? Ss 11:47 0:44 postgres: 15/main: discourse discourse [local] idle
postgres 20487 2.9 6.6 7885300 2061088 ? Ss 11:39 0:59 postgres: 15/main: discourse discourse [local] idle
postgres 22055 3.3 6.5 7887336 2012504 ? Ss 11:41 1:02 postgres: 15/main: discourse discourse [local] idle
postgres 25883 2.5 6.0 7884096 1848424 ? Ss 11:47 0:38 postgres: 15/main: discourse discourse [local] idle
postgres 28126 2.4 5.6 7883848 1744912 ? Ss 11:50 0:31 postgres: 15/main: discourse discourse [local] idle
postgres 29365 1.0 4.5 7883084 1386544 ? Ss 11:52 0:12 postgres: 15/main: discourse discourse [local] idle
postgres 27172 1.6 4.4 7884288 1384664 ? Ss 11:49 0:22 postgres: 15/main: discourse discourse [local] idle
postgres 25896 2.1 4.4 8034236 1357264 ? Ss 11:47 0:31 postgres: 15/main: discourse discourse [local] idle
postgres 89 1.7 4.3 7864156 1342760 ? Ss 11:11 1:04 postgres: 15/main: checkpointer
postgres 28505 1.0 4.2 7884360 1315360 ? Ss 11:51 0:13 postgres: 15/main: discourse discourse [local] idle
postgres 27175 1.6 4.1 7882780 1277612 ? Ss 11:49 0:23 postgres: 15/main: discourse discourse [local] idle
postgres 28553 0.9 3.4 7883976 1064964 ? Ss 11:51 0:11 postgres: 15/main: discourse discourse [local] idle
postgres 30409 1.0 3.3 7882892 1034860 ? Ss 11:54 0:10 postgres: 15/main: discourse discourse [local] idle
postgres 40651 4.6 1.9 7872036 592152 ? Ss 12:11 0:03 postgres: 15/main: discourse discourse [local] idle
root@discourse_app:/# redis-cli info memory
# Memory
used_memory:179899224
used_memory_human:171.57M
used_memory_rss:47591424
used_memory_rss_human:45.39M
used_memory_peak:184509776
used_memory_peak_human:175.96M
used_memory_peak_perc:97.50%
used_memory_overhead:3681093
used_memory_startup:948600
used_memory_dataset:176218131
used_memory_dataset_perc:98.47%
allocator_allocated:181437808
allocator_active:182353920
allocator_resident:188317696
allocator_muzzy:0
total_system_memory:31537295360
total_system_memory_human:29.37G
used_memory_lua:58368
used_memory_vm_eval:58368
used_memory_lua_human:57.00K
used_memory_scripts_eval:10304
number_of_cached_scripts:13
number_of_functions:0
number_of_libraries:0
used_memory_vm_functions:33792
used_memory_vm_total:92160
used_memory_vm_total_human:90.00K
used_memory_functions:192
used_memory_scripts:10496
used_memory_scripts_human:10.25K
maxmemory:0
maxmemory_human:0B
maxmemory_policy:noeviction
allocator_frag_ratio:1.00
allocator_frag_bytes:700208
allocator_rss_ratio:1.03
allocator_rss_bytes:5963776
rss_overhead_ratio:0.25
rss_overhead_bytes:-140726272
mem_fragmentation_ratio:0.26
mem_fragmentation_bytes:-132268896
mem_not_counted_for_evict:0
mem_replication_backlog:0
mem_total_replication_buffers:0
mem_clients_slaves:0
mem_clients_normal:498197
mem_cluster_links:0
mem_aof_buffer:0
mem_allocator:jemalloc-5.3.0
mem_overhead_db_hashtable_rehashing:0
active_defrag_running:0
lazyfree_pending_objects:0
lazyfreed_objects:0
root@discourse_app:/# cat /etc/postgresql/15/main/postgresql.conf | grep shared_buffers
shared_buffers = 7424MB
#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers
root@discourse_app:/# su - postgres -c "psql discourse -c \"SELECT pid, query_start, state, wait_event_type, wait_event, left(query, 100) as query FROM pg_stat_activity WHERE state != 'idle' ORDER BY query_start;\""
pid | query_start | state | wait_event_type | wait_event | query
-------+-------------------------------+--------+-----------------+--------------+------------------------------------------------------------------------------------------------------
2501 | 2026-02-07 11:25:01.028892+00 | active | IO | DataFileRead | UPDATE posts +
| | | | | SET percent_rank = X.percent_rank +
| | | | | FROM ( +
| | | | | SELECT posts.id, Y.percent_rank +
| | | | | FROM posts
4398 | 2026-02-07 11:52:53.108942+00 | active | IPC | BufferIO | WITH eligible_users AS ( +
| | | | | SELECT id +
| | | | | FROM users +
| | | | | WHERE id > 0 AND active AND silenced_till IS NUL
2729 | 2026-02-07 11:54:27.666129+00 | active | IPC | BufferIO | UPDATE topics AS topics +
| | | | | SET has_summary = (topics.like_count >= 1 AND +
| | | | | topics.post
22777 | 2026-02-07 11:59:27.040575+00 | active | IO | DataFileRead | UPDATE posts +
| | | | | SET percent_rank = X.percent_rank +
| | | | | FROM ( +
| | | | | SELECT posts.id, Y.percent_rank +
| | | | | FROM posts
27172 | 2026-02-07 12:15:42.50553+00 | active | IO | DataFileRead | SELECT "posts"."id" FROM "posts" WHERE "posts"."deleted_at" IS NULL AND "posts"."topic_id" = 792311
25883 | 2026-02-07 12:15:52.665883+00 | active | | | SELECT "posts"."id" FROM "posts" WHERE "posts"."deleted_at" IS NULL AND "posts"."topic_id" = 829626
20487 | 2026-02-07 12:16:09.733384+00 | active | IO | DataFileRead | SELECT "posts"."id" FROM "posts" WHERE "posts"."deleted_at" IS NULL AND "posts"."topic_id" = 653216
42185 | 2026-02-07 12:16:21.053706+00 | active | IO | DataFileRead | SELECT "posts"."id", "posts"."user_id", "posts"."topic_id", "posts"."post_number", "posts"."raw", "p
43940 | 2026-02-07 12:16:21.925505+00 | active | | | SELECT pid, query_start, state, wait_event_type, wait_event, left(query, 100) as query FROM pg_stat_
28126 | 2026-02-07 12:16:21.96218+00 | active | IO | DataFileRead | SELECT "posts"."id" FROM "posts" WHERE "posts"."deleted_at" IS NULL AND "posts"."topic_id" = 818063
42323 | 2026-02-07 12:16:21.966689+00 | active | Client | ClientRead | SELECT "discourse_post_event_events"."id", "discourse_post_event_events"."status", "discourse_post_e
(11 rows)
我的问题基本上可以归结为:“什么操作会导致一个 UPDATE 查询挂起 9 个小时?”
我猜是没有足够的内存:查询进入了交换空间(swap)。
一个 40GB 的 posts 表会是一个潜在问题吗?
root@discourse_app:/# su - postgres -c "psql discourse -c \"SELECT schemaname, tablename, pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size FROM pg_tables WHERE schemaname = 'public' ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC LIMIT 10;\""
schemaname | tablename | size
------------+-------------------+---------
public | posts | 40 GB
public | post_search_data | 4326 MB
public | topic_users | 1306 MB
public | topics | 837 MB
public | topic_search_data | 702 MB
public | post_replies | 567 MB
public | top_topics | 512 MB
public | user_actions | 417 MB
public | topic_links | 285 MB
public | directory_items | 243 MB

