稀土掘金地址:https://juejin.cn/post/7274046488752619579
matrix 對io的監控包括四個方面
- 監控在主線程執行 IO 操作的問題
- 監控緩沖區過小的問題
- 監控重復讀同一文件
- 監控 Closeable 資源泄漏問題(IO 資源打開后未關閉)
IOCanaryPlugin,內部由IOCanaryCore完成真正的操作。
start方法
根據配置進行hook的安裝
// Install the native file-I/O hooks when any of the three file-I/O detections is enabled
if (ioConfig.isDetectFileIOInMainThread() || ioConfig.isDetectFileIOBufferTooSmall() || ioConfig.isDetectFileIORepeatReadSameFile()) {
IOCanaryJniBridge.install(ioConfig, this);
}
// Install the CloseGuard hook when closable-leak detection is enabled
if (ioConfig.isDetectIOClosableLeak()) {
this.mCloseGuardHooker = new CloseGuardHooker(this);
this.mCloseGuardHooker.hook();
}
stop方法
取消hook
// Remove the CloseGuard hook if one was installed, then uninstall the native I/O hooks
if (this.mCloseGuardHooker != null) {
this.mCloseGuardHooker.unHook();
}
IOCanaryJniBridge.uninstall();
IOCanaryJniBridge.install()
底層hook安裝包含幾個步驟:加載so、設置hook內容,分別對應了下面幾個方法
loadJni
System.loadLibrary("io-canary")
執行了System.loadLibrary("io-canary"),此時會進入io_canary_jni.cc中的JNI_OnLoad方法,在這個方法中有兩項關鍵操作,1.獲取到java層的一些信息,2.設置一個回調接口,用于上傳監控信息.
InitJniEnv()
// Excerpt of the JNI environment setup; "...." marks code omitted by the article.
static bool InitJniEnv(JavaVM *vm) {
....
// Look up the Java-side bridge class by its JNI name
jclass temp_cls = env->FindClass("com/tencent/matrix/iocanary/core/IOCanaryJniBridge");
....
}
SetIssuedCallback()
iocanary::IOCanary::Get().SetIssuedCallback(OnIssuePublish)
其中OnIssuePublish是在拿到信息之后將信息組裝成java層的對象IOIssue,然后放入List中,通過調用java層IOCanaryJniBridge類的onIssuePublish實現信息的拋出。
com/tencent/matrix/iocanary/core/IOIssue
com/tencent/matrix/iocanary/core/IOCanaryJniBridge
enableDetector
通過傳入定義好的type類型到底層,實現此類型的io監控,代碼如下
iocanary::IOCanary::Get().RegisterDetector(static_cast<DetectorType>(detector_type));
可以看到最終是往detectors_這個vector集合中存入了對應的Detector,每個Detector都是FileIODetector的子類。
- FileIOMainThreadDetector
- FileIORepeatReadDetector
- FileIOSmallBufferDetector
// Appends a newly allocated detector matching `type` to detectors_.
// Unrecognised types are ignored.
void IOCanary::RegisterDetector(DetectorType type) {
    if (type == DetectorType::kDetectorMainThreadIO) {
        detectors_.push_back(new FileIOMainThreadDetector());
    } else if (type == DetectorType::kDetectorSmallBuffer) {
        detectors_.push_back(new FileIOSmallBufferDetector());
    } else if (type == DetectorType::kDetectorRepeatRead) {
        detectors_.push_back(new FileIORepeatReadDetector());
    }
}
setConfig
給對應的io監控設置監控閾值,存入configs_數組,配置和對應的默認值如下,超過閾值則觸發監控
- kMainThreadThreshold = 500 毫秒
- kSmallBufferThreshold = 4096 字節(byte)
- kRepeatReadThreshold = 20 次
iocanary::IOCanary::Get().SetConfig(static_cast<IOCanaryConfigKey>(key), val);
// Stores `val` in the configs_ slot for `key`.
// Keys at or beyond kConfigKeysLen are ignored.
void IOCanaryEnv::SetConfig(IOCanaryConfigKey key, long val) {
    if (key < IOCanaryConfigKey::kConfigKeysLen) {
        configs_[key] = val;
    }
}
dohook
dohook是核心方法,前邊配置信息準備好后,這里開始進行對應方法的hook。被hook的so文件為
// Names of the shared libraries that are targeted by the hook
const static char* TARGET_MODULES[] = {
"libopenjdkjvm.so",
"libjavacore.so",
"libopenjdk.so"
};
關于GOT hook,可以查看愛奇藝的開源框架XHook,這里不再描述細節。https://github.com/iqiyi/xHook/blob/master/docs/overview/android_plt_hook_overview.zh-CN.md
被hook的方法如下
open、open64、close、android_fdsan_close_with_tag,
如果so是libjavacore.so,會嘗試hook它內部的這幾個方法
read、__read_chk、write、__write_chk
open
當一個文件被打開時,回調到設置好的方法ProxyOpen中,在這里會檢測是否是主線程操作,如不是則不做處理,如是主線程,則執行DoProxyOpenLogic邏輯。
int ProxyOpen(const char *pathname, int flags, mode_t mode) {
if(!IsMainThread()) {
return original_open(pathname, flags, mode);
}
int ret = original_open(pathname, flags, mode);
if (ret != -1) {
DoProxyOpenLogic(pathname, flags, mode, ret);
}
return ret;
}
在DoProxyOpenLogic方法中會獲取到當前堆棧信息
// Excerpt: fetches the Java-side context (stack trace and thread name) and
// forwards the open event to IOCanary. "...." marks code omitted by the article.
static void DoProxyOpenLogic(const char *pathname, int flags, mode_t mode, int ret) {
....
//kJavaBridgeClass = com/tencent/matrix/iocanary/core/IOCanaryJniBridge
//kMethodIDGetJavaContext = getJavaContext(), which returns a JavaContext.
//JavaContext is an inner class with a `stack` field; the stack trace is
//captured when the JavaContext is created and stored in that field.
jobject java_context_obj = env->CallStaticObjectMethod(kJavaBridgeClass, kMethodIDGetJavaContext);
if (NULL == java_context_obj) {
return;
}
// Stack trace
jstring j_stack = (jstring) env->GetObjectField(java_context_obj, kFieldIDStack);
jstring j_thread_name = (jstring) env->GetObjectField(java_context_obj, kFieldIDThreadName);
// Current thread name
// NOTE(review): ownership of the buffers returned by jstringToChars is not
// visible in this excerpt — confirm the elided code releases them.
char* thread_name = jstringToChars(env, j_thread_name);
char* stack = jstringToChars(env, j_stack);
JavaContext java_context(GetCurrentThreadId(), thread_name == NULL ? "" : thread_name, stack == NULL ? "" : stack);
....
// pathname is the file being opened; java_context carries the stack and thread name.
// flags and mode are the values passed to the system open call; ret is its result.
// Hand everything over to IOCanary::OnOpen.
iocanary::IOCanary::Get().OnOpen(pathname, flags, mode, ret, java_context);
....
}
open64
同open()
close
檢測是主線程,進入IOCanary OnClose方法
int ProxyClose(int fd) {
if(!IsMainThread()) {
return original_close(fd);
}
int ret = original_close(fd);
iocanary::IOCanary::Get().OnClose(fd, ret);
return ret;
}
android_fdsan_close_with_tag
同close()
read
主要是獲取到read消耗的時長,然后攜帶信息進入IOCanary OnRead
// Replacement for read(). Off the main thread it only forwards to the original
// function; on the main thread it also measures how long the call took
// (microseconds) and reports it to IOCanary::OnRead.
ssize_t ProxyRead(int fd, void *buf, size_t size) {
    if (!IsMainThread()) {
        return original_read(fd, buf, size);
    }
    // Timestamp before the real read
    const int64_t start = GetTickCountMicros();
    // Keep the result signed: read() reports failure as -1, which an unsigned
    // size_t would turn into a huge positive value.
    const ssize_t ret = original_read(fd, buf, size);
    // Elapsed time of the read
    const long read_cost_us = GetTickCountMicros() - start;
    // Hand the measurement to IOCanary::OnRead
    iocanary::IOCanary::Get().OnRead(fd, buf, size, ret, read_cost_us);
    return ret;
}
__read_chk
同read
write
// Replacement for write(). Off the main thread it only forwards to the original
// function; on the main thread it also measures how long the call took
// (microseconds) and reports it to IOCanary::OnWrite.
ssize_t ProxyWrite(int fd, const void *buf, size_t size) {
    if (!IsMainThread()) {
        return original_write(fd, buf, size);
    }
    // Timestamp before the real write
    const int64_t start = GetTickCountMicros();
    // Keep the result signed: write() reports failure as -1, which an unsigned
    // size_t would turn into a huge positive value.
    const ssize_t ret = original_write(fd, buf, size);
    // Elapsed time of the write
    const long write_cost_us = GetTickCountMicros() - start;
    // Hand the measurement to IOCanary::OnWrite
    iocanary::IOCanary::Get().OnWrite(fd, buf, size, ret, write_cost_us);
    return ret;
}
__write_chk
同write
IOCanary
從上邊open、close、read、write方法的流向可知,最終都還是匯集到了IOCanary這個C++類中,進入對應的方法可知,IOCanary內部又調用了IOInfoCollector這個類。
OnOpen
// Forwards an open event unchanged to the info collector.
void IOCanary::OnOpen(const char *pathname, int flags, mode_t mode,
int open_ret, const JavaContext& java_context) {
collector_.OnOpen(pathname, flags, mode, open_ret, java_context);
}
方法內部邏輯也很清晰,直接將文件名和相關信息組裝成info,然后以文件描述符為key,info為value存入了c++的info_map_(一個std::unordered_map)中,信息存起來肯定是要用的,我們后邊會看到。文件打開之后,下一步就是或讀或寫,繼續去看read方法。
void IOInfoCollector::OnOpen(const char *pathname, int flags, mode_t mode
, int open_ret, const JavaContext& java_context) {
if (open_ret == -1) {
return;
}
//open_ret參數指的是open方法調用后的結果,也就是當前被打開的文件的文件描述符,
//如果已存在,則返回
if (info_map_.find(open_ret) != info_map_.end()) {
return;
}
std::shared_ptr<IOInfo> info = std::make_shared<IOInfo>(pathname, java_context);
info_map_.insert(std::make_pair(open_ret, info));
}
OnRead
// Forwards a read event unchanged to the info collector.
void IOCanary::OnRead(int fd, const void *buf, size_t size,
ssize_t read_ret, long read_cost) {
collector_.OnRead(fd, buf, size, read_ret, read_cost);
}
看起來關鍵內容在CountRWInfo中,從方法名上可以看出,讀和寫都與此方法有關,所以我們先不看CountRWInfo方法內容,看完write后再去深入CountRWInfo方法。
// Records one read against a tracked descriptor.
// Failed reads (-1), negative durations and untracked descriptors are ignored.
void IOInfoCollector::OnRead(int fd, const void *buf, size_t size,
        ssize_t read_ret, long read_cost) {
    const bool unusable_sample = (read_ret == -1) || (read_cost < 0);
    if (unusable_sample) {
        return;
    }
    if (info_map_.count(fd) == 0) {
        return;
    }
    CountRWInfo(fd, FileOpType::kRead, size, read_cost);
}
OnWrite
// Forwards a write event unchanged to the info collector.
void IOCanary::OnWrite(int fd, const void *buf, size_t size,
ssize_t write_ret, long write_cost) {
collector_.OnWrite(fd, buf, size, write_ret, write_cost);
}
和read一樣,進入了CountRWInfo方法
// Records one write against a tracked descriptor.
// Failed writes (-1), negative durations and untracked descriptors are ignored.
void IOInfoCollector::OnWrite(int fd, const void *buf, size_t size,
        ssize_t write_ret, long write_cost) {
    const bool unusable_sample = (write_ret == -1) || (write_cost < 0);
    if (unusable_sample) {
        return;
    }
    if (info_map_.count(fd) == 0) {
        return;
    }
    CountRWInfo(fd, FileOpType::kWrite, size, write_cost);
}
CountRWInfo
CountRWInfo將每個文件對應的信息封裝到IOInfo這個類中,封裝的信息包含:
- 讀(寫)次數
- 累計讀(寫)的字節數(op_size_,真正的文件大小file_size_在close時才獲取)
- 讀(寫)消耗的時長
- 單次讀(寫)最大時長
- 讀(寫)間隔小于8000微秒的總時長
- 緩存區大小
- 讀寫類型,讀還是寫
在一個文件被讀寫過程中,這個方法會不斷的被調用,并更新對應的信息,讀寫完成之后,得到最終的信息,執行close方法。
// Folds one read/write operation into the statistics kept for `fd`.
//   fd         - descriptor the operation ran on; untracked descriptors are ignored
//   fileOpType - whether this operation was a read or a write
//   op_size    - size argument of the operation
//   rw_cost    - duration of the operation
// The descriptor is looked up once and the entry is reused for every update.
void IOInfoCollector::CountRWInfo(int fd, const FileOpType &fileOpType, long op_size, long rw_cost) {
    const auto entry = info_map_.find(fd);
    if (entry == info_map_.end()) {
        return;
    }
    auto& info = *entry->second;
    const int64_t now = GetSysTimeMicros();

    // Number of operations
    info.op_cnt_++;
    // Cumulative size of all operations (not the size of the file)
    info.op_size_ += op_size;
    // Cumulative time spent in operations
    info.rw_cost_us_ += rw_cost;
    // Longest single operation
    if (rw_cost > info.max_once_rw_cost_time_μs_) {
        info.max_once_rw_cost_time_μs_ = rw_cost;
    }
    // Operations that follow the previous one within kContinualThreshold extend
    // the current "continual" run; otherwise a new run starts with this cost.
    if (info.last_rw_time_μs_ > 0 && (now - info.last_rw_time_μs_) < kContinualThreshold) {
        info.current_continual_rw_time_μs_ += rw_cost;
    } else {
        info.current_continual_rw_time_μs_ = rw_cost;
    }
    // Longest continual run seen so far
    if (info.current_continual_rw_time_μs_ > info.max_continual_rw_cost_time_μs_) {
        info.max_continual_rw_cost_time_μs_ = info.current_continual_rw_time_μs_;
    }
    info.last_rw_time_μs_ = now;
    // Buffer size: the largest single operation size seen
    if (info.buffer_size_ < op_size) {
        info.buffer_size_ = op_size;
    }
    // Operation type is fixed by the first operation after open
    if (info.op_type_ == FileOpType::kInit) {
        info.op_type_ = fileOpType;
    }
}
OnClose
void IOCanary::OnClose(int fd, int close_ret) {
std::shared_ptr<IOInfo> info = collector_.OnClose(fd, close_ret);
if (info == nullptr) {
return;
}
OfferFileIOInfo(info);
}
close時記錄總時長,文件大小,然后返回,返回后進入OfferFileIOInfo方法
std::shared_ptr<IOInfo> IOInfoCollector::OnClose(int fd, int close_ret) {
if (info_map_.find(fd) == info_map_.end()) {
return nullptr;
}
//從打開到關閉的總時長
info_map_[fd]->total_cost_μs_ = GetSysTimeMicros() - info_map_[fd]->start_time_μs_;
//獲取到文件大小
info_map_[fd]->file_size_ = GetFileSize(info_map_[fd]->path_.c_str());
std::shared_ptr<IOInfo> info = info_map_[fd];
//從map中移除
info_map_.erase(fd);
//返回信息
return info;
}
OfferFileIOInfo將info放入隊列,并調用notify_one方法通知消費者消費,這里用到了生產消費模式,生產者將生產果實放在隊列中,消費者從隊列取出進行消費,我們找下消費者在哪。
// Producer side of the queue: appends the record under queue_mutex_ and wakes
// one waiter on queue_cv_ before the lock is released.
void IOCanary::OfferFileIOInfo(std::shared_ptr<IOInfo> file_io_info) {
    {
        std::lock_guard<std::mutex> guard(queue_mutex_);
        queue_.push_back(file_io_info);
        queue_cv_.notify_one();
    }
}
可以看到,IOCanary在創建的時候,啟動了一個線程
// Clears the exit flag and launches a detached thread running Detect().
IOCanary::IOCanary() {
    exit_ = false;
    std::thread(&IOCanary::Detect, this).detach();
}
線程中有一個無限循環,它負責不停的從隊列中拿info,如果隊列為空則掛起線程等待。
前邊我們看到了拿到一條info之后,將info放入到隊列中,然后通知消費者消費,此時消費者線程會從TakeFileIOInfo方法中被喚醒,并拿到一條info,交給各個detector去檢測。
檢測完成之后,滿足條件的信息會被放入published_issues中,然后issued_callback_將信息回調出去。前邊提到有三個detector接下來具體看下他們的內部邏輯。
void IOCanary::Detect() {
std::vector<Issue> published_issues;
std::shared_ptr<IOInfo> file_io_info;
while (true) {
published_issues.clear();
int ret = TakeFileIOInfo(file_io_info);
if (ret != 0) {
break;
}
for (auto detector : detectors_) {
detector->Detect(env_, *file_io_info, published_issues);
}
if (issued_callback_ && !published_issues.empty()) {
issued_callback_(published_issues);
}
file_io_info = nullptr;
}
}
FileIOMainThreadDetector
檢測主線程io
// Flags main-thread file I/O.
// Only records whose thread id equals GetMainThreadId() are examined. The
// resulting bit mask (1 = slow single operation, 2 = long continual run) is
// carried in the issue's repeat_read_cnt_ field.
void FileIOMainThreadDetector::Detect(const IOCanaryEnv &env, const IOInfo &file_io_info,
        std::vector<Issue>& issues) {
    if (GetMainThreadId() != file_io_info.java_context_.thread_id_) {
        return;
    }

    int type = 0;
    // Longest single operation exceeds kPossibleNegativeThreshold
    // (noted in the original as: constexpr static const int kPossibleNegativeThreshold = 13*1000;)
    if (file_io_info.max_once_rw_cost_time_μs_ > IOCanaryEnv::kPossibleNegativeThreshold) {
        type |= 1;
    }
    // Longest continual run exceeds the configured main-thread threshold
    if (file_io_info.max_continual_rw_cost_time_μs_ > env.GetMainThreadThreshold()) {
        type |= 2;
    }
    if (type == 0) {
        return;
    }

    Issue issue(kType, file_io_info);
    issue.repeat_read_cnt_ = type;
    PublishIssue(issue, issues);
}
FileIORepeatReadDetector
監聽重復讀取同一文件
void FileIORepeatReadDetector::Detect(const IOCanaryEnv &env,
const IOInfo &file_io_info,
std::vector<Issue>& issues) {
const std::string& path = file_io_info.path_;
if (observing_map_.find(path) == observing_map_.end()) {
if (file_io_info.max_continual_rw_cost_time_μs_ < env.kPossibleNegativeThreshold) {
return;
}
observing_map_.insert(std::make_pair(path, std::vector<RepeatReadInfo>()));
}
std::vector<RepeatReadInfo>& repeat_infos = observing_map_[path];
if (file_io_info.op_type_ == FileOpType::kWrite) {
repeat_infos.clear();
return;
}
RepeatReadInfo repeat_read_info(file_io_info.path_, file_io_info.java_context_.stack_, file_io_info.java_context_.thread_id_,
file_io_info.op_size_, file_io_info.file_size_);
if (repeat_infos.size() == 0) {
repeat_infos.push_back(repeat_read_info);
return;
}
if((GetTickCount() - repeat_infos[repeat_infos.size() - 1].op_timems) > 17) { //17ms todo astrozhou add to params
repeat_infos.clear();
}
bool found = false;
int repeatCnt;
for (auto& info : repeat_infos) {
if (info == repeat_read_info) {
found = true;
info.IncRepeatReadCount();
repeatCnt = info.GetRepeatReadCount();
break;
}
}
if (!found) {
repeat_infos.push_back(repeat_read_info);
return;
}
if (repeatCnt >= env.GetRepeatReadThreshold()) {
Issue issue(kType, file_io_info);
issue.repeat_read_cnt_ = repeatCnt;
issue.stack = repeat_read_info.GetStack();
PublishIssue(issue, issues);
}
}
FileIOSmallBufferDetector
監聽緩存區過小
// Flags files accessed with too small a buffer: more operations than
// kSmallBufferOpTimesThreshold, an average operation size below the
// configured small-buffer threshold, and a continual run of at least
// kPossibleNegativeThreshold.
void FileIOSmallBufferDetector::Detect(const IOCanaryEnv &env, const IOInfo &file_io_info,
        std::vector<Issue>& issues) {
    // Checked first so the average below is only computed when this test passes.
    if (file_io_info.op_cnt_ <= env.kSmallBufferOpTimesThreshold) {
        return;
    }
    const auto average_op_size = file_io_info.op_size_ / file_io_info.op_cnt_;
    if (average_op_size >= env.GetSmallBufferThreshold()) {
        return;
    }
    if (file_io_info.max_continual_rw_cost_time_μs_ < env.kPossibleNegativeThreshold) {
        return;
    }
    PublishIssue(Issue(kType, file_io_info), issues);
}
OnIssuePublish
所有信息都拿到之后就開始要回調了,也就回到了我們最開始看到的
iocanary::IOCanary::Get().SetIssuedCallback(OnIssuePublish)
// Excerpt: converts the native issues into Java IOIssue objects, collects them
// in a Java List and passes that list to IOCanaryJniBridge.onIssuePublish.
// "...." marks code omitted by the article.
void OnIssuePublish(const std::vector<Issue>& published_issues) {
....
// Create a Java-side List
jobject j_issues = env->NewObject(kListClass, kMethodIDListConstruct);
// For every issue, copy its fields and build one Java IOIssue object from them
for (const auto& issue : published_issues) {
jint type = issue.type_;
jstring path = env->NewStringUTF(issue.file_io_info_.path_.c_str());
jlong file_size = issue.file_io_info_.file_size_;
jint op_cnt = issue.file_io_info_.op_cnt_;
jlong buffer_size = issue.file_io_info_.buffer_size_;
jlong op_cost_time = issue.file_io_info_.rw_cost_us_/1000;
jint op_type = issue.file_io_info_.op_type_;
jlong op_size = issue.file_io_info_.op_size_;
jstring thread_name = env->NewStringUTF(issue.file_io_info_.java_context_.thread_name_.c_str());
jstring stack = env->NewStringUTF(issue.stack.c_str());
jint repeat_read_cnt = issue.repeat_read_cnt_;
jobject issue_obj = env->NewObject(kIssueClass, kMethodIDIssueConstruct, type, path, file_size, op_cnt, buffer_size,
op_cost_time, op_type, op_size, thread_name, stack, repeat_read_cnt);
// Add the IOIssue object to the List
env->CallBooleanMethod(j_issues, kMethodIDListAdd, issue_obj);
....
}
// Call back into the static Java method IOCanaryJniBridge.onIssuePublish
env->CallStaticVoidMethod(kJavaBridgeClass, kMethodIDOnIssuePublish, j_issues);
....
}
后邊在Java層onIssuePublish中就開始拼接信息轉為json打印到控制臺或上傳服務器,流程至此就算結束了。
總結
IOCanaryPlugin通過hook底層io方法open、read、write、close來實現對io操作的攔截。從上面的Proxy方法可以看到,只有主線程上的io操作會被記錄(非主線程直接調用原方法后返回),這樣就可以在每一個被記錄的io操作過程中記錄操作的信息,并分析io操作是否超過設定閾值,如滿足條件則進行上報。