问题现象

起初是有一个通过阿里云k8s集群部署的node应用出现OOM kill的情况,排查后发现是因为写入文件时在write系统调用上返回了ENOMEM的错误码,node程序在这个写入异常后出现内存问题, 内存问题由应用开发去排查.针对文件写入问题,除去这个node应用写入文件日志时在最近两周发生了两次这样的情况, 在后续的排查过程中, 还发现了dockerd在写入容器stdout日志到文件时也出现了一样的问题.以上5个案例都集中在集群中的两台机器上.

经过trace内核调用堆栈后, 判定是ext4文件系统相关的问题.下面会说明详细的排查过程.

排查过程

内核版本: 3.10.0-693.2.2.el7.x86_64

trace使用的是对function tracer和kprobe进行了友好封装的perf-tools(https://github.com/brendangregg/perf-tools)

# 问题表现, 使用O_APPEND模式写入文件会报ENOMEM
[root@ali-k8s-10-111-12-136 ~]# strace -e open,dup2,write -s 200 sh -c "echo hello >> /dockermount/docker/containers/b7e6c58c68f49c3512f0e243ecb7b48ed0990192de488b2498f598b965749a80/b7e6c58c68f49c3512f0e243ecb7b48ed0990192de488b2498f598b965749a80-json.log"
open("/dockermount/docker/containers/b7e6c58c68f49c3512f0e243ecb7b48ed0990192de488b2498f598b965749a80/b7e6c58c68f49c3512f0e243ecb7b48ed0990192de488b2498f598b965749a80-json.log", O_WRONLY|O_CREAT|O_APPEND, 0666) = 3
dup2(3, 1)
write(1, "hello\n", 6)                  = -1 ENOMEM (Cannot allocate memory)
# 如果使用O_TRUNC(echo >)写入文件就可以恢复这个文件

# 使用function graph tracer结合代码,定位到返回ENOMEM的堆栈
24139252.586203 |   32) dockerd-29145  |               |  ext4_mb_init_cache [ext4]() {
24139252.586204 |   32) dockerd-29145  |               |    ext4_read_block_bitmap_nowait [ext4]() {
24139252.586204 |   32) dockerd-29145  |   0.048 us    |      ext4_get_group_desc [ext4]();
24139252.586204 |   32) dockerd-29145  |   0.035 us    |      ext4_block_bitmap [ext4]();
24139252.586204 |   32) dockerd-29145  |               |      __getblk() {
24139252.586204 |   32) dockerd-29145  |               |        __find_get_block() {
24139252.586205 |   32) dockerd-29145  |   0.036 us    |          mark_page_accessed();
24139252.586205 |   32) dockerd-29145  |   0.303 us    |        }
24139252.586205 |   32) dockerd-29145  |   0.036 us    |        _cond_resched();
24139252.586205 |   32) dockerd-29145  |   0.830 us    |      }
24139252.586205 |   32) dockerd-29145  |   0.038 us    |      ext4_validate_block_bitmap [ext4]();
24139252.586206 |   32) dockerd-29145  |   0.036 us    |      __smp_mb__before_atomic();
24139252.586206 |   32) dockerd-29145  |   2.270 us    |    }
24139252.586206 |   32) dockerd-29145  |   2.668 us    |  }

# 使用kprobe确定ext4_mb_init_cache的返回值为-ENOMEM
[root@ali-k8s-10-111-12-136 ~]# ./kprobe -s -d 1 -p 29038 'r:ext4_mb_init_cache $retval'
         dockerd-29158 [020] d... 24138804.044626: ext4_mb_init_cache: (ext4_mb_load_buddy+0x2b6/0x340 [ext4] <- ext4_mb_init_cache) arg1=0xfffffff4
         dockerd-29158 [020] d... 24138804.044638: <stack trace>
 => ext4_mb_find_by_goal
 => ext4_mb_regular_allocator
 => ext4_mb_new_blocks
 => ext4_ext_map_blocks
 => ext4_map_blocks
 => _ext4_get_block
 => ext4_get_block
 => __block_write_begin
 => ext4_write_begin
 => generic_file_buffered_write
 => __generic_file_aio_write
 => generic_file_aio_write
 => ext4_file_write
 => do_sync_write
 => vfs_write
 => SyS_write
 => system_call_fastpath
#define	ENOMEM		12	/* Out of memory */
# ENOMEM=12, ext4_mb_init_cache返回值 0xfffffff4 是-12补码表示形式

# 返回ENOMEM的代码段在 -> https://elixir.bootlin.com/linux/v3.10/source/fs/ext4/mballoc.c#L871, 下面是摘录:
if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
	err = -ENOMEM;
	goto out;
}

# 再次确认了ext4_read_block_bitmap_nowait的返回值为NULL
[root@ali-k8s-10-111-12-136 ~]# ./kprobe -d 1 -p 29038 'r:ext4_read_block_bitmap_nowait $retval'
ext4_read_block_bitmap_nowait: (ext4_mb_init_cache+0x181/0x6e0 [ext4] <- ext4_read_block_bitmap_nowait) arg1=0x0

# 继续trace了ext4_read_block_bitmap_nowait, 发现了__getblk一直有返回同一个地址, 所以推断问题出在ext4_validate_block_bitmap方法上
[root@ali-k8s-10-111-12-136 ~]# ./kprobe -d 1 -p 29038 'r:__getblk $retval'
 dockerd-29066 [024] d... 24140654.894696: __getblk: (ext4_read_block_bitmap_nowait+0x52/0x5f0 [ext4] <- __getblk) arg1=0xffff8819e4d7da28
 dockerd-29066 [024] d... 24140654.894731: __getblk: (ext4_read_block_bitmap_nowait+0x52/0x5f0 [ext4] <- __getblk) arg1=0xffff8819e4d7da28
 dockerd-29139 [020] d... 24140654.894790: __getblk: (ext4_read_block_bitmap_nowait+0x52/0x5f0 [ext4] <- __getblk) arg1=0xffff8819e4d7da28

# ext4_validate_block_bitmap代码段
void ext4_validate_block_bitmap(struct super_block *sb,
			       struct ext4_group_desc *desc,
			       unsigned int block_group,
			       struct buffer_head *bh)
{
	ext4_fsblk_t	blk;

	if (buffer_verified(bh))
		return;

	ext4_lock_group(sb, block_group);
	blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
	if (unlikely(blk != 0)) {
		ext4_unlock_group(sb, block_group);
		ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
			   block_group, blk);
		return;
	}
	if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
			desc, bh))) {
		ext4_unlock_group(sb, block_group);
		ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
		return;
	}
	set_buffer_verified(bh);
	ext4_unlock_group(sb, block_group);
}
# ext4_validate_block_bitmap 这个函数内调用的函数都不是可以被trace的,所以不能跟进一步确认是哪个分支出现的问题.还有两个不明白的点: 
# 1. bh指针__getblk()调用后赋值了,但没看不出来在ext4_validate_block_bitmap的哪里被改成NULL的 
# 2. 两个异常分支都有对应的error日志,但在异常的机器的/var/log/message中没有找到相关的日志

故障现场

现存三个dockerd写入stdout日志失败的现场,可以通过文件中最后一条日志推测开始出问题的时间点.