systemtap
systemtap安装
systemtap是一个强大的内核开发调试工具。
stap 执行编写的脚本动作。流程如下:
- 1.parse:分析脚本语法
- 2.elaborate:展开脚本中定义的探针,连接预定义的脚本库,并分析内核和内核模块的调试信息
- 3.translate:将脚本翻译成C语言内核模块源文件,放在$HOME/.systemtap/cache/下缓存起来,避免同一脚本多次编译
- 4.build:将c语言模块文件编译成.ko的内核模块,也缓存起来。
- 5.run:把模块交给staprun,staprun加载内核模块到内核空间,stapio连接内核模块和用户空间,提供交互IO通道,采集数据。
systemtap 运行需要内核的调试信息支撑,默认发行版的内核在配置时这些调试开关没有打开,所以安装完systemtap也是无法去探测内核信息的
安装systemtap:
方法一:
编译内核以支持systemtap //不合适
方法二:
安装调试内核镜像
如果先安装了systemtap,有stap-prep命令,可以快速安装内核镜像,prep是通过调用yum来安装,如果yum中没有相应的内核镜像软件包,也是无法安装的
stap-prep
如果prep不能安装则需要手动安装,必须安装内核对应的内核镜像版本,内核信息包
[root@VM_11_7_centos systemtap-3.0]# uname -a
Linux VM_11_7_centos 3.10.0-327.el7.x86_64 #1 SMP Thu Nov 19 22:10:57 UTC 2015 x86_64 x86_64 x86_64 GNU/Linux
kernel-devel.x86_64 0:3.10.0-327.el7
kernel-debuginfo-3.10.0-327.el7.x86_64
systemtap运行还需要elfutils的支持,最新的systemtap需要使用最新的elfutils
elfutils
部署 SystemTap,需要安装以下两个 RPM 包:
systemtap
systemtap-runtime
TEST
[root@localhost ~]# stap -ve 'probe begin { log("hello world") exit() }'
Pass 1: parsed user script and 115 library script(s) using 230072virt/47472res/3092shr/44892data kb, in 80usr/120sys/197real ms.
Pass 2: analyzed script: 1 probe(s), 2 function(s), 0 embed(s), 0 global(s) using 231128virt/48792res/3212shr/45948data kb, in 0usr/10sys/8real ms.
Pass 3: using cached /root/.systemtap/cache/7f/stap_7f72777f8257e9a432ad3115abd8362f_1030.c
Pass 4: using cached /root/.systemtap/cache/7f/stap_7f72777f8257e9a432ad3115abd8362f_1030.ko
Pass 5: starting run.
hello world
Pass 5: run completed in 10usr/80sys/819real ms.
正常情况下都能看到hello world,但这个测试只能说明软件安装正常;要确认能正常使用,还需要做一些涉及内核模块的测试
#!/usr/bin/stap
# Trace open() system calls for 4 seconds, printing one line per call with
# the calling process name, its pid and the call's argument string.

# Announce that tracing has started.
probe begin
{
log("begin to probe")
}
# One line per open() call; argstr is the argument string supplied for the
# syscall probe (e.g. "/proc/self/stat", O_RDONLY in the sample output).
probe syscall.open
{
printf ("%s(%d) open (%s)\n", execname(), pid(), argstr)
}
probe timer.ms(4000) # after 4 seconds
{
exit ()
}
# Announce that tracing has finished.
probe end
{
log("end to probe")
}
这个脚本检测4秒内,调用open的进程
[root@localhost vinny]# stap open.stp
begin to probe
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
^Cgnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
vmtoolsd(866) open ("/proc/net/dev", O_RDONLY)
vmtoolsd(866) open ("/proc/net/if_inet6", O_RDONLY)
vmtoolsd(866) open ("/proc/net/if_inet6", O_RDONLY)
vmtoolsd(866) open ("/proc/net/if_inet6", O_RDONLY)
vmtoolsd(866) open ("/proc/net/if_inet6", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
end to probe
正常的输出如上,但是在tlinux下面,总会出现如下报错
Tlinux
ERROR: Couldn't insert module '/tmp/stapKOJEJH/stap_53172c9aef79113568db5a2ab948f356_1228.ko': Required key not available
这是由于公司安全考虑,在tlinux系统上,所有加载到内核的模块都要通过公司认证 http://sign.tlinux.oa.com/cgi-bin/sign_svr/cgi-bin/login.cgi?url=http://sign.tlinux.oa.com:80/cgi-bin/sign_svr/cgi-bin/index.cgi
参考
http://www.cnblogs.com/hazir/p/systemtap_introduction.html https://dirtysalt.github.io/systemtap.html
[chuba@my031045 ~] # stap -g viewcache.stp
在另外的shell里面
[chuba@my031045 ~] # dmesg
#!/usr/bin/stap
#
# This script scans the buffer/cache LRU pages and counts, per file (inode), how many file-backed pages are in memory
#
#
%{
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/nodemask.h>
#include <linux/gfp.h>
#include <linux/cpuset.h>
#include <linux/delay.h>
%}
%{
%}
%{
/*
 * Return the struct page whose 'lru' link is the entry just before _head,
 * i.e. the last page on the list headed by _head.
 */
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
/*
 * Link @page at the head of @zone's active list and account for it in
 * zone->nr_active. No locking is done here.
 */
static inline void add_page_to_active_list(struct zone *zone, struct page *page)
{
    struct list_head *head = &zone->active_list;

    list_add(&page->lru, head);
    zone->nr_active += 1;
}
/*
 * Link @page at the head of @zone's inactive list and account for it in
 * zone->nr_inactive. No locking is done here.
 */
static inline void add_page_to_inactive_list(struct zone *zone, struct page *page)
{
    struct list_head *head = &zone->inactive_list;

    list_add(&page->lru, head);
    zone->nr_inactive += 1;
}
%}
%{
/*
 * Upper bound on the number of pages taken off an LRU list in one batch
 * (viewcache() uses min(NR_TO_SCAN, remaining)). The value is 128 pages;
 * an earlier note here said 64, which did not match the code.
 */
unsigned long NR_TO_SCAN = 128;
%}
%{
/*
 * If @_page is not the entry directly after the list head @_base (its
 * lru.prev is not @_base), issue prefetchw() on member @_field of the page
 * that precedes it on the list.
 *
 * Every use of a macro argument as an expression is parenthesized so that
 * non-trivial argument expressions expand with the intended precedence.
 */
#define prefetchw_prev_lru_page(_page, _base, _field)       \
    do {                                                    \
        if ((_page)->lru.prev != (_base)) {                 \
            struct page *prev;                              \
                                                            \
            prev = lru_to_page(&((_page)->lru));            \
            prefetchw(&prev->_field);                       \
        }                                                   \
    } while (0)
%}
%{
/*
 * Move up to @nr_to_scan pages from the tail of @src onto @dst.
 *
 * Each page examined is unlinked from @src. If a reference can be taken
 * with get_page_unless_zero(), its PageLRU flag is cleared and it is added
 * to @dst; otherwise it is put back on @src.
 *
 * @scanned receives the number of pages examined.
 * Returns the number of pages actually moved to @dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        struct list_head *src, struct list_head *dst,
        unsigned long *scanned)
{
    unsigned long taken = 0;
    unsigned long seen = 0;

    while (seen < nr_to_scan && !list_empty(src)) {
        struct page *pg = lru_to_page(src);

        prefetchw_prev_lru_page(pg, src, flags);
        BUG_ON(!PageLRU(pg));
        list_del(&pg->lru);

        if (likely(get_page_unless_zero(pg))) {
            /*
             * PageLRU must stay set until we hold our own reference:
             * the page release code relies on it while the page may
             * still be freed elsewhere.
             */
            ClearPageLRU(pg);
            list_add(&pg->lru, dst);
            taken++;
        } else {
            /* The page is being freed elsewhere; leave it on the source list. */
            list_add(&pg->lru, src);
        }
        seen++;
    }

    *scanned = seen;
    return taken;
}
%}
// use a binary search tree, keyed by inode number, to store the per-inode page counts
%{
/*
 * One node of the binary search tree that holds the per-inode page counts.
 * Nodes are ordered by inode number.
 */
typedef struct cache_node {
    unsigned long ino;          /* inode number (the search key) */
    unsigned long nr;           /* pages of this inode seen in memory so far */
    struct cache_node *left;    /* subtree with smaller inode numbers */
    struct cache_node *right;   /* subtree with larger inode numbers */
} node;

/* Root of the tree; NULL while the tree is empty. */
node *root;
/*
 * Search the binary search tree rooted at @node for inode number @target.
 *
 * On a hit the matching node's page counter (nr) is incremented as a side
 * effect and 1 is returned. Returns 0 when the tree is empty or the inode
 * is not present.
 *
 * The walk is iterative rather than recursive so that stack usage stays
 * constant even when the tree degenerates into a long chain (for example
 * when inode numbers arrive in sorted order).
 */
static int
cache_node_lookup(node *node, unsigned long target)
{
    while (node != NULL) {
        if (target == node->ino) {
            node->nr++;
            return 1;
        }
        /* Descend into the subtree that can contain the target. */
        if (target < node->ino)
            node = node->left;
        else
            node = node->right;
    }
    return 0;
}
/*
 * Link @item into the binary search tree whose root pointer is *@tree,
 * ordered by item->ino.
 *
 * If a node with the same inode number already exists the tree is left
 * unchanged and @item is NOT linked; the caller keeps ownership of it in
 * that case (callers in this file look the key up first, so this is not
 * expected to happen).
 *
 * Implemented as a loop instead of recursion so that stack usage does not
 * grow with the depth of the tree.
 */
static void
insert_cache_node(node **tree, node *item)
{
    while (*tree) {
        if (item->ino < (*tree)->ino)
            tree = &(*tree)->left;
        else if (item->ino > (*tree)->ino)
            tree = &(*tree)->right;
        else
            return; /* duplicate key: nothing to insert */
    }
    *tree = item;
}
#ifdef SW
/*
 * Look up inode number @target and create a node for it when it is missing.
 *
 * Returns 1 if the inode was already present (cache_node_lookup() has then
 * incremented its counter), 0 if a new node with nr == 1 was inserted, or
 * -ENOMEM if the node could not be allocated.
 *
 * When @node is NULL the search and the insertion use the file-scope tree
 * 'root'. Previously the new node was linked only into the local copy of
 * the parameter in that case, so it never became reachable and was leaked.
 * NOTE(review): this assumes callers pass 'root' (or a subtree of it) -
 * confirm if a second tree is ever introduced.
 */
static int
find_get_cache_node(node * node, unsigned long target)
{
    struct cache_node *start = node ? node : root;
    struct cache_node *n;

    if (cache_node_lookup(start, target)) /* hit */
        return 1;

    /* miss: create a fresh node for this inode */
    n = kmalloc(sizeof(*n), GFP_KERNEL);
    if (!n)
        return -ENOMEM;
    n->ino = target;
    n->nr = 1;
    n->left = NULL;
    n->right = NULL;

    /*
     * A non-NULL start is only descended from, never reassigned, so the
     * address of the local is fine. An empty tree must be anchored at the
     * real root pointer.
     */
    if (start)
        insert_cache_node(&start, n);
    else
        insert_cache_node(&root, n);
    return 0;
}
#endif
/*
 * Print every node of the tree rooted at @node in ascending inode order
 * (in-order walk), one printk(KERN_ALERT) line per inode with its page
 * count. An empty tree (@node == NULL) prints nothing.
 *
 * NOTE(review): the walk is recursive, so stack depth follows the height
 * of the tree.
 */
static void
traverse_tree(node *node) {
    if (!node)
        return;
    traverse_tree(node->left);
    printk(KERN_ALERT "inode: %lu, num: %lu\n", node->ino, node->nr);
    traverse_tree(node->right);
}
/*
 * Free every node of the tree rooted at @node (post-order: children first,
 * then the node itself). An empty tree (@node == NULL) is a no-op.
 *
 * The children are released by recursing into destroy_tree(); recursing
 * into the printing routine instead would free only the top node and leak
 * the rest of the tree.
 *
 * The caller's pointer to the tree is not modified and must not be used
 * afterwards.
 */
static void
destroy_tree(node *node) {
    if (!node)
        return;
    destroy_tree(node->left);
    destroy_tree(node->right);
    kfree(node);
}
/*
 * Drain @src, account each file-backed page to its inode in the global
 * tree 'root', and move every page onto @dst.
 *
 * @src: list of isolated pages to examine (emptied on success)
 * @dst: list that receives every examined page
 * @nr:  counter incremented once for each page that is neither swap cache
 *       nor anonymous
 *
 * Returns 0 on success, or -ENOMEM if a tree node could not be allocated.
 * On -ENOMEM the page being processed is still moved to @dst, while the
 * unprocessed pages stay on @src; the caller remains responsible for the
 * pages on both lists.
 */
static int
scan_hold_list(struct list_head *src, struct list_head *dst, unsigned long *nr)
{
    struct page *page;

    while (!list_empty(src)) {
        page = lru_to_page(src);
        list_del(&page->lru);

        /* Only file-backed pages are counted; skip swap-cache and anonymous pages. */
        if (!PageSwapCache(page) &&
            !((unsigned long)page->mapping & PAGE_MAPPING_ANON)) {
            /*
             * Neither the page lock nor the inode lock is held here, so
             * check that the mapping and its inode are still present.
             */
            if (likely(page->mapping && page->mapping->host)) {
                unsigned long ino = page->mapping->host->i_ino;

                /* A hit bumps the existing node's counter; a miss needs a new node. */
                if (cache_node_lookup(root, ino) != 1) {
                    struct cache_node *curr;

                    curr = kmalloc(sizeof(*curr), GFP_KERNEL);
                    if (curr == NULL) {
                        /* Do not orphan the page we already unlinked. */
                        list_add(&page->lru, dst);
                        return -ENOMEM;
                    }
                    curr->ino = ino;
                    curr->nr = 1;
                    curr->left = NULL;
                    curr->right = NULL;
                    insert_cache_node(&root, curr);
                }
            }
            /* Increment the caller's counter (not the pointer). */
            (*nr)++;
        }

        /* Hand the page over to the destination list. */
        list_add(&page->lru, dst);
    }
    return 0;
}
%}
/*
 * viewcache(): walk the active and inactive LRU lists of every zone on
 * every online node, feed the pages through scan_hold_list() to build the
 * per-inode tree, put the pages back on the LRU, then print and free the
 * tree. Returns i + j, the two counters passed to scan_hold_list() for the
 * active and inactive passes.
 *
 * Pages are processed in batches of at most NR_TO_SCAN, with a 300 ms
 * sleep after each batch.
 *
 * NOTE(review): isolate_lru_pages() takes a page reference with
 * get_page_unless_zero(); no matching release is visible in this function -
 * confirm the reference count is balanced.
 * NOTE(review): on the 'goto done' error path the pages still sitting on
 * l_active / l_inactive / l_hold are not returned to the zone's LRU lists.
 * NOTE(review): 'root' is still NULL at 'done' if no node was ever
 * inserted - confirm traverse_tree()/destroy_tree() tolerate a NULL tree.
 */
function viewcache:long()
%{
pg_data_t *pgdat;
/* i / j: counters handed to scan_hold_list() for the active / inactive passes */
unsigned long i = 0, j = 0;
int zone_idx, node_idx;
/* NOTE(review): 'ret' is never used in this function */
int ret;
struct zone *zone;
struct page *page = NULL;
// struct address_space *mapping;
unsigned long pgmoved = 0;
unsigned long scaned;
unsigned long nr_to_scan = 0;
/*
 * l_active / l_inactive receive the pages taken off the zone's LRU lists;
 * l_hold receives them after scan_hold_list() has examined them.
 */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
LIST_HEAD(l_hold);
unsigned long nr_active ,nr_inactive;
/* start with the first zone of the first online node */
zone = NODE_DATA(first_online_node)->node_zones;
cond_resched();
for (;;) {
if (!zone) break;
cond_resched();
/* snapshot of the list sizes; used as the amount of work for this zone */
nr_active = zone->nr_active;
nr_inactive = zone->nr_inactive;
node_idx = zone->zone_pgdat->node_id;
zone_idx = zone - zone->zone_pgdat->node_zones;
/* ---- pass 1: active list ---- */
while(nr_active) {
nr_to_scan = min(NR_TO_SCAN, nr_active);
/* progress line: node, zone, pages left, batch size
 * (NOTE(review): the last two arguments are unsigned long but printed with %ld) */
printk(KERN_ALERT "%d %d %ld %ld\n", node_idx, zone_idx, nr_active, nr_to_scan);
spin_lock_irq(&zone->lru_lock);
/*
 * Try to take 'nr_to_scan' pages off the active list: 'scaned' receives
 * the number examined, 'pgmoved' the number actually moved to l_active.
 */
pgmoved = isolate_lru_pages(nr_to_scan, &zone->active_list,
&l_active, &scaned);
zone->pages_scanned += scaned;
zone->nr_active -= pgmoved;
spin_unlock_irq(&zone->lru_lock);
/* the remaining work shrinks by the batch size, not by the number moved */
nr_active -= nr_to_scan;
/* account the isolated pages per inode; they end up on l_hold */
if(scan_hold_list(&l_active, &l_hold, &i) < 0 ) goto done;
/* put every held page back on the zone's active list */
spin_lock_irq(&zone->lru_lock);
while(!list_empty(&l_hold)) {
page = lru_to_page(&l_hold);
prefetchw_prev_lru_page(page, &l_hold, flags);
list_del(&page->lru);
BUG_ON(PageLRU(page));
SetPageLRU(page);
BUG_ON(!PageActive(page));
add_page_to_active_list(zone, page);
}
spin_unlock_irq(&zone->lru_lock);
/* sleep 300 ms between batches */
msleep(300);
}
/* ---- pass 2: inactive list ---- */
while(nr_inactive) {
nr_to_scan = min(NR_TO_SCAN, nr_inactive);
printk(KERN_ALERT "%d %d %ld %ld\n", node_idx, zone_idx, nr_inactive, nr_to_scan);
spin_lock_irq(&zone->lru_lock);
pgmoved = isolate_lru_pages(nr_to_scan, &zone->inactive_list,
&l_inactive, &scaned);
zone->pages_scanned += scaned;
zone->nr_inactive -= pgmoved;
spin_unlock_irq(&zone->lru_lock);
nr_inactive -= nr_to_scan;
if (scan_hold_list(&l_inactive, &l_hold, &j)<0) goto done;
/* put held pages back: onto the active list if PageActive is set, else the inactive list */
spin_lock_irq(&zone->lru_lock);
while(!list_empty(&l_hold)) {
page = lru_to_page(&l_hold);
BUG_ON(PageLRU(page));
SetPageLRU(page);
list_del(&page->lru);
if (PageActive(page))
add_page_to_active_list(zone, page);
else
add_page_to_inactive_list(zone, page);
}
spin_unlock_irq(&zone->lru_lock);
/* sleep 300 ms between batches */
msleep(300);
}
/*
 * Advance to the next zone of this node, or, after the node's last zone,
 * to the first zone of the next online node; zone becomes NULL (ending
 * the loop) when there is no further node.
 */
if (zone < zone->zone_pgdat->node_zones + MAX_NR_ZONES - 1)
zone++;
else {
int nid = next_online_node(zone->zone_pgdat->node_id);
if (nid == MAX_NUMNODES)
pgdat = NULL;
else
pgdat = NODE_DATA(nid);
if (pgdat)
zone = pgdat->node_zones;
else
zone = NULL;
}
}
done:
/* print the per-inode results */
traverse_tree(root);
/* release the tree */
destroy_tree(root);
THIS->__retvalue = i+j;
%}
// Run the scan once when the script starts, report the total it returns, then quit.
probe begin {
    total = viewcache()
    printf("total file mapped LRU page = %d\n", total)
    exit()
}
来源: http://blog.yufeng.info/wp-content/uploads/2010/09/viewcache.stp_.txt
#! /usr/bin/env stap
# Print a line for every submit_bio() call whose bio targets one particular
# block device. The device number is taken from the script's first
# command-line argument ($1).
global device_of_interest
# Record the device number to watch and echo it in hex.
probe begin {
device_of_interest = $1
printf ("device of interest: 0x%x\n", device_of_interest)
}
# Fires on entry to the kernel's submit_bio(); $bio and $rw are read from the
# probed function's context (presumably its parameters - they depend on the
# kernel version providing them).
probe kernel.function("submit_bio")
{
dev = $bio->bi_bdev->bd_dev
# report only I/O aimed at the selected device
if (dev == device_of_interest)
printf ("[%s](%d) dev:0x%x rw:%d size:%d\n",
execname(), pid(), dev, $rw, $bio->bi_size)
}
来源: http://mp.weixin.qq.com/s?__biz=MzA3MzYwNjQ3NA==&mid=2651297322&idx=1&sn=21d124b5f0f8a724a9e080e40e8f841f&chksm=84ff420fb388cb1999464de59859432511b2c6691517b32ac2782b7580caae66e0ca8850828f&mpshare=1&scene=23&srcid=1116LPYnaLVoi0RXGT2bR74H#rd