三 Low Memory Killer
Android 的 Low Memory Killer 是在標準的 Linux kernel 的 OOM 基礎上修改而來的一種內存管理機制。當系統內存不足時,殺死不必要的進程釋放其內存。不必要的進程的選擇根據有2個:oom_adj和占用的內存的大小。oom_adj 代表進程的優先級,數值越高,優先級越低,越容易被殺死;對應每個oom_adj都可以有一個空閑內存的閥值。Android Kernel每隔一段時間會檢測當前空閑內存是否低于某個閥值。假如是,則殺死oom_adj最大的不必要的進程,如果有多個,就優先殺死其中占用內存最大的進程,直到空閑內存恢復到高于閥值的狀態。
LowMemoryKiller 的閾值的設定,主要保存在2個文件之中,分別是:
/sys/module/lowmemorykiller/parameters/adj
/sys/module/lowmemorykiller/parameters/minfree
adj保存著當前系統殺進程的等級,minfree則是保存著對應的內存閥值。
Nexus6 Android7.0 系統的設置(源碼編譯的 OS,可能和最終設備不一樣):
shamu:/ # cat /sys/module/lowmemorykiller/parameters/adj
0,100,200,300,900,906
shamu:/ # cat /sys/module/lowmemorykiller/parameters/minfree
18432,23040,27648,32256,36864,46080
例如:將 1,6 寫入節點 /sys/module/lowmemorykiller/parameters/adj,將 1024,8192 寫入節點 /sys/module/lowmemorykiller/parameters/minfree。
策略:當系統可用內存低于8192個pages時,則會殺掉oom_score_adj>=6的進程;當系統可用內存低于1024個pages時,則會殺掉oom_score_adj>=1的進程。
3.1 lmkd 守護進程
LMK 的進程是 lmkd 守護進程,隨著系統的啟動而啟動。實現源碼位于 system/core/lmkd/lmkd.c。
lmkd 會創建名為 lmkd 的 socket,節點位于 /dev/socket/lmkd,該 socket 用于跟上層 framework 交互。
service lmkd /system/bin/lmkd
class core
critical
socket lmkd seqpacket 0660 system system
writepid /dev/cpuset/system-background/tasks
lmkd
會接收 Framework 的命令,進行相應的操作:
命令 | 功能 | 對應方法 |
---|---|---|
LMK_PROCPRIO | 設置進程adj | PL.setOomAdj() |
LMK_TARGET | 更新oom_adj | PL.updateOomLevels() |
LMK_PROCREMOVE | 移除進程 | PL.remove() |
lmkd
socket 命令處理
/*
 * Read one packet from the lmkd control socket and dispatch it.
 *
 * The packet is treated as an array of ints: word 0 is the command code and
 * the remaining words are its arguments. The command word, and the arguments
 * of LMK_PROCPRIO / LMK_PROCREMOVE, are converted from network to host byte
 * order here; LMK_TARGET arguments are handed to cmd_target() unconverted.
 *
 * A packet whose argument count does not fit the command is logged and
 * dropped, as is a packet carrying an unknown command code.
 */
static void ctrl_command_handler(void) {
    int packet[CTRL_PACKET_MAX / sizeof(int)];
    int cmd = -1;
    int nbytes;
    int argc;
    int pairs;

    nbytes = ctrl_data_read((char *)packet, CTRL_PACKET_MAX);
    if (nbytes <= 0) {
        return;
    }

    /* Everything after the leading command word counts as an argument. */
    argc = nbytes / sizeof(int) - 1;
    if (argc < 0) {
        goto bad_length;
    }

    /* Convert the command word from network to host byte order. */
    cmd = ntohl(packet[0]);

    if (cmd == LMK_TARGET) {
        /* Arguments are consumed two at a time, so the count must be even
         * and the number of pairs must fit in lowmem_adj. */
        pairs = argc / 2;
        if ((argc & 0x1) || pairs > (int)ARRAY_SIZE(lowmem_adj)) {
            goto bad_length;
        }
        cmd_target(pairs, &packet[1]);
    } else if (cmd == LMK_PROCPRIO) {
        /* Exactly three arguments, passed on as (pid, uid, oomadj). */
        if (argc != 3) {
            goto bad_length;
        }
        cmd_procprio(ntohl(packet[1]), ntohl(packet[2]), ntohl(packet[3]));
    } else if (cmd == LMK_PROCREMOVE) {
        /* Exactly one argument. */
        if (argc != 1) {
            goto bad_length;
        }
        cmd_procremove(ntohl(packet[1]));
    } else {
        ALOGE("Received unknown command code %d", cmd);
    }
    return;

bad_length:
    ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, nbytes);
}
設置進程 adj
/*
 * Apply a new oom adjustment value to a process.
 *
 * pid    - process whose /proc/<pid>/oom_score_adj node is written
 * uid    - stored in the bookkeeping record when a new one is created
 * oomadj - value written to the node and stored in the record
 *
 * The value is always written to the proc node. When use_inkernel_interface
 * is set nothing else happens; otherwise the process record is created (if
 * pid_lookup() finds none) or moved to match its new oomadj.
 * NOTE: this excerpt elides part of the original body (the "..." line below).
 */
static void cmd_procprio(int pid, int uid, int oomadj) {
struct proc *procp;
char path[80];
char val[20];
...
snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", pid);
snprintf(val, sizeof(val), "%d", oomadj);
// Write oomadj to the /proc/<pid>/oom_score_adj node
writefilestring(path, val);
// When the in-kernel interface is in use, return right away
if (use_inkernel_interface)
return;
// Userspace bookkeeping: find the record for this pid, if any
procp = pid_lookup(pid);
if (!procp) {
// No record yet: allocate one, fill it in and insert it
procp = malloc(sizeof(struct proc));
if (!procp) {
// Oh, the irony. May need to rebuild our state.
return;
}
procp->pid = pid;
procp->uid = uid;
procp->oomadj = oomadj;
proc_insert(procp);
} else {
// Existing record: unslot it, update oomadj, then slot it again
proc_unslot(procp);
procp->oomadj = oomadj;
proc_slot(procp);
}
}
向節點 /proc/<pid>/oom_score_adj 寫入 oomadj。由于 use_inkernel_interface=1,那么接下來需要看看 kernel 的情況。
小結:
use_inkernel_interface
該值后續應該會逐漸采用用戶空間策略。不過目前仍為use_inkernel_interface=1
則有:
- LMK_TARGET:AMS.updateConfiguration()的過程中調用updateOomLevels()方法, 分別向/sys/module/lowmemorykiller/parameters目錄下的minfree和adj節點寫入相應信息;
- LMK_PROCPRIO: AMS.applyOomAdjLocked()的過程中調用setOomAdj(),向/proc/<pid>/oom_score_adj寫入oomadj 后直接返回;
- LMK_PROCREMOVE:AMS.handleAppDiedLocked或者 AMS.cleanUpApplicationRecordLocked()的過程,調用remove(),目前不做任何事,直接返回;
3.2 LowMemoryKiller Kernel driver
lowmemorykiller
driver 位于 drivers/staging/android/lowmemorykiller.c
lowmemorykiller
/*
 * Shrinker descriptor for the low memory killer: lowmem_shrink is the
 * callback, and the seek cost is set to sixteen times DEFAULT_SEEKS.
 * Registered in lowmem_init() and unregistered in lowmem_exit().
 */
static struct shrinker lowmem_shrinker = {
	.seeks = DEFAULT_SEEKS * 16,
	.shrink = lowmem_shrink,
};
/*
 * Module init: register lowmem_shrinker with the shrinker framework and
 * register lmk_vmpr_nb as a vmpressure notifier.
 *
 * Return: always 0.
 */
static int __init lowmem_init(void)
{
	/* Hook the killer into memory reclaim. */
	register_shrinker(&lowmem_shrinker);

	/* Subscribe to vmpressure notifications as well. */
	vmpressure_notifier_register(&lmk_vmpr_nb);

	return 0;
}
/*
 * Module exit: remove lowmem_shrinker from the shrinker framework.
 *
 * NOTE(review): lowmem_init() also registers lmk_vmpr_nb as a vmpressure
 * notifier, but nothing here unregisters it - confirm whether that is
 * intentional.
 */
static void __exit lowmem_exit(void)
{
	unregister_shrinker(&lowmem_shrinker);
}
register_shrinker 和 unregister_shrinker 分別用于在初始化和退出時注冊、注銷 shrinker。
shrinker
LMK驅動通過注冊 shrinker 來實現的,shrinker是linux kernel標準的回收內存page的機制,由內核線程kswapd負責監控。
當內存不足時kswapd線程會遍歷一張shrinker鏈表,并回調已注冊的shrinker函數來回收內存page,kswapd還會周期性喚醒來執行內存操作。每個zone維護active_list和inactive_list鏈表,內核根據頁面活動狀態將page在這兩個鏈表之間移動,最終通過shrink_slab和shrink_zone來回收內存頁。
lowmem_shrink
觸發 shrink 操作:
/*
 * lowmem_shrink - shrinker callback of the low memory killer.
 * @s:  the shrinker being invoked (not referenced in this body)
 * @sc: scan request; sc->nr_to_scan and sc->gfp_mask are read here
 *
 * When sc->nr_to_scan is 0 the function only reports: it returns the sum of
 * the active/inactive anon and file page counters without taking scan_mutex.
 *
 * Otherwise it takes scan_mutex, derives a minimum oom_score_adj from the
 * lowmem_minfree / lowmem_adj tables, walks all processes under RCU and picks
 * the one with the highest oom_score_adj at or above that minimum (ties go to
 * the larger RSS), then sends it SIGKILL.
 *
 * Return: 0 if taking the mutex was interrupted, or if a task flagged
 * TIF_MEMDIE is found while lowmem_deathpending_timeout has not passed;
 * otherwise the page sum described above, reduced by the victim's RSS when a
 * task was killed.
 */
static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
{
struct task_struct *tsk;
struct task_struct *selected = NULL;
int rem = 0;
int tasksize;
int i;
int ret = 0;
/* Sentinel one above OOM_SCORE_ADJ_MAX: "no threshold matched" (1001 per the original note) */
short min_score_adj = OOM_SCORE_ADJ_MAX + 1;
int minfree = 0;
int selected_tasksize = 0;
int selected_oom_score_adj;
int array_size = ARRAY_SIZE(lowmem_adj);
int other_free;
int other_file;
unsigned long nr_to_scan = sc->nr_to_scan;
/* The mutex is only taken for real scan requests; bail out with 0 if interrupted. */
if (nr_to_scan > 0) {
if (mutex_lock_interruptible(&scan_mutex) < 0)
return 0;
}
// Free pages remaining
other_free = global_page_state(NR_FREE_PAGES);
/* File pages minus shmem and swap cache, clamped so it never goes negative. */
if (global_page_state(NR_SHMEM) + total_swapcache_pages <
global_page_state(NR_FILE_PAGES))
other_file = global_page_state(NR_FILE_PAGES) -
global_page_state(NR_SHMEM) -
total_swapcache_pages;
else
other_file = 0;
/* NOTE(review): tune_lmk_param() may rewrite both counters; its rules are not visible here. */
tune_lmk_param(&other_free, &other_file, sc);
/* Only use as many table entries as both lowmem_adj and lowmem_minfree provide. */
if (lowmem_adj_size < array_size)
array_size = lowmem_adj_size;
if (lowmem_minfree_size < array_size)
array_size = lowmem_minfree_size;
/* First entry whose minfree exceeds both counters decides min_score_adj. */
for (i = 0; i < array_size; i++) {
minfree = lowmem_minfree[i];
if (other_free < minfree && other_file < minfree) {
min_score_adj = lowmem_adj[i];
break;
}
}
if (nr_to_scan > 0) {
/* NOTE(review): adjust_minadj() may change min_score_adj; its result is only passed to tracing below. */
ret = adjust_minadj(&min_score_adj);
lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %hd\n",
nr_to_scan, sc->gfp_mask, other_free,
other_file, min_score_adj);
}
/* Value reported back to the caller: active + inactive anon and file pages. */
rem = global_page_state(NR_ACTIVE_ANON) +
global_page_state(NR_ACTIVE_FILE) +
global_page_state(NR_INACTIVE_ANON) +
global_page_state(NR_INACTIVE_FILE);
/* Query-only call (nr_to_scan is unsigned, so "<= 0" means 0) or no threshold hit: no kill. */
if (nr_to_scan <= 0 || min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
lowmem_print(5, "lowmem_shrink %lu, %x, return %d\n",
nr_to_scan, sc->gfp_mask, rem);
/* Release the mutex only if it was taken above. */
if (nr_to_scan > 0)
mutex_unlock(&scan_mutex);
if ((min_score_adj == OOM_SCORE_ADJ_MAX + 1) &&
(nr_to_scan > 0))
trace_almk_shrink(0, ret, other_free, other_file, 0);
return rem;
}
selected_oom_score_adj = min_score_adj;
rcu_read_lock();
for_each_process(tsk) {
struct task_struct *p;
int oom_score_adj;
/* Kernel threads are never candidates. */
if (tsk->flags & PF_KTHREAD)
continue;
/* if task no longer has any memory ignore it */
if (test_task_flag(tsk, TIF_MM_RELEASED))
continue;
/* While the death-pending timeout has not passed, a task flagged TIF_MEMDIE aborts this pass. */
if (time_before_eq(jiffies, lowmem_deathpending_timeout)) {
if (test_task_flag(tsk, TIF_MEMDIE)) {
rcu_read_unlock();
/* give the system time to free up the memory */
msleep_interruptible(20);
mutex_unlock(&scan_mutex);
return 0;
}
}
p = find_lock_task_mm(tsk);
if (!p)
continue;
oom_score_adj = p->signal->oom_score_adj;
// oom_score_adj below the minimum: ignore this task
if (oom_score_adj < min_score_adj) {
task_unlock(p);
continue;
}
// RSS of the process
tasksize = get_mm_rss(p->mm);
task_unlock(p);
if (tasksize <= 0)
continue;
/* Keep the current pick unless this task has a higher adj, or the same adj and a larger RSS. */
if (selected) {
if (oom_score_adj < selected_oom_score_adj)
continue;
if (oom_score_adj == selected_oom_score_adj &&
tasksize <= selected_tasksize)
continue;
}
selected = p;
selected_tasksize = tasksize;
selected_oom_score_adj = oom_score_adj;
lowmem_print(3, "select '%s' (%d), adj %hd, size %d, to kill\n",
p->comm, p->pid, oom_score_adj, tasksize);
}
if (selected) {
/* Log the victim and a memory snapshot; page counts are scaled to kB via PAGE_SIZE / 1024. */
lowmem_print(1, "Killing '%s' (%d), adj %d,\n" \
" to free %ldkB on behalf of '%s' (%d) because\n" \
" cache %ldkB is below limit %ldkB for oom_score_adj %hd\n" \
" Free memory is %ldkB above reserved.\n" \
" Free CMA is %ldkB\n" \
" Total reserve is %ldkB\n" \
" Total free pages is %ldkB\n" \
" Total file cache is %ldkB\n" \
" Slab Reclaimable is %ldkB\n" \
" Slab UnReclaimable is %ldkB\n" \
" Total Slab is %ldkB\n" \
" GFP mask is 0x%x\n",
selected->comm, selected->pid,
selected_oom_score_adj,
selected_tasksize * (long)(PAGE_SIZE / 1024),
current->comm, current->pid,
other_file * (long)(PAGE_SIZE / 1024),
minfree * (long)(PAGE_SIZE / 1024),
min_score_adj,
other_free * (long)(PAGE_SIZE / 1024),
global_page_state(NR_FREE_CMA_PAGES) *
(long)(PAGE_SIZE / 1024),
totalreserve_pages * (long)(PAGE_SIZE / 1024),
global_page_state(NR_FREE_PAGES) *
(long)(PAGE_SIZE / 1024),
global_page_state(NR_FILE_PAGES) *
(long)(PAGE_SIZE / 1024),
global_page_state(NR_SLAB_RECLAIMABLE) *
(long)(PAGE_SIZE / 1024),
global_page_state(NR_SLAB_UNRECLAIMABLE) *
(long)(PAGE_SIZE / 1024),
global_page_state(NR_SLAB_RECLAIMABLE) *
(long)(PAGE_SIZE / 1024) +
global_page_state(NR_SLAB_UNRECLAIMABLE) *
(long)(PAGE_SIZE / 1024),
sc->gfp_mask);
/* Extra dumps only at debug level >= 2 when the victim's adj is 0. */
if (lowmem_debug_level >= 2 && selected_oom_score_adj == 0) {
show_mem(SHOW_MEM_FILTER_NODES);
dump_tasks(NULL, NULL);
show_mem_call_notifiers();
}
/* Start a one-second (HZ jiffies) window consulted by the TIF_MEMDIE check above. */
lowmem_deathpending_timeout = jiffies + HZ;
send_sig(SIGKILL, selected, 0);
set_tsk_thread_flag(selected, TIF_MEMDIE);
/* The victim's RSS is subtracted from the returned page count. */
rem -= selected_tasksize;
rcu_read_unlock();
/* give the system time to free up the memory */
msleep_interruptible(20);
trace_almk_shrink(selected_tasksize, ret,
other_free, other_file, selected_oom_score_adj);
} else {
/* No eligible task found. */
trace_almk_shrink(1, ret, other_free, other_file, 0);
rcu_read_unlock();
}
lowmem_print(4, "lowmem_shrink %lu, %x, return %d\n",
nr_to_scan, sc->gfp_mask, rem);
mutex_unlock(&scan_mutex);
return rem;
}
- 選擇 oom_score_adj 最大的進程中,rss 內存最大的進程作為選中要殺的進程。
- 殺進程方式:send_sig(SIGKILL, selected, 0) 向選中的目標進程發送 signal 9 來殺掉目標進程。
lmkd參數
- oom_adj:代表進程的優先級, 數值越大,優先級越低,越容易被殺. 取值范圍[-16, 15];另有特殊值 -17(OOM_DISABLE),下面換算公式中的除數 17 即來源于此
- oom_score_adj: 取值范圍[-1000, 1000]
- oom_score:lmk策略中貌似并沒有看到使用的地方,這個應該是oom才會使用。
lowmem_oom_adj_to_oom_score_adj 計算:
/*
 * Convert a legacy oom_adj value to the oom_score_adj scale.
 *
 * OOM_ADJUST_MAX maps exactly to OOM_SCORE_ADJ_MAX; every other value is
 * scaled linearly by OOM_SCORE_ADJ_MAX / -OOM_DISABLE using integer
 * arithmetic (multiply first, then divide).
 */
static int lowmem_oom_adj_to_oom_score_adj(int oom_adj)
{
	if (oom_adj != OOM_ADJUST_MAX)
		return (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;

	return OOM_SCORE_ADJ_MAX;
}
- 當oom_adj = 15, 則 oom_score_adj = 1000;
- 當oom_adj < 15, 則 oom_score_adj = oom_adj * 1000/17;
四 總結
以上整個過程可以簡單總結如下:
- 系統 Framework 層根據不同類型進程生命周期控制,動態分配不同的 adj 值,并且在一定的時機會對所有進程的 adj 進行更新;
- 更新 adj 時,Framework 層會和 lmkd 守護進程進行通信,修改系統 lmk driver 配置的參數,同時設置 /proc/pid/oom_score_adj;
- lowmemorykiller 驅動會被 linux 內核的內存 shrinker 機制調度,在 shrinker 操作中,計算進程 adj 和 rss,依據 driver 的 oom_adj 和 minfree 配置,進行 kill 進程操作。
所以,后臺應用被回收的問題,需要額外關注:
- 進程的生命周期及5大優先級分類
- 減小內存占用,在 trimmemory 時能及時釋放內存