systemtap

systemtap安装

systemtap是一个强大的内核开发调试工具。

stap 执行编写的脚本动作。流程如下:

  • 1.parse:分析脚本语法
  • 2.elaborate:展开脚本 中定义的探针和连接预定义脚本库,分析内核和内核模块的调试信息
  • 3.translate:将脚本翻译成C语言的内核模块源文件,并缓存起来(例如 $HOME/.systemtap/cache/ 目录下的 .c 文件),避免同一脚本多次编译
  • 4.build:将c语言模块文件编译成.ko的内核模块,也缓存起来。
  • 5.run:把模块交给staprun,staprun加载内核模块到内核空间,stapio连接内核模块和用户空间,提供交互IO通道,采集数据。

systemtap 运行需要内核的调试信息支撑,默认发行版的内核在配置时这些调试开关没有打开,所以安装完systemtap也是无法去探测内核信息的

安装systemtap:

方法一:

编译内核以支持systemtap //不合适

方法二:

安装调试内核镜像

如果先安装了systemtap,有stap-prep命令,可以快速安装内核镜像,prep是通过调用yum来安装,如果yum中没有相应的内核镜像软件包,也是无法安装的

stap-prep

如果prep不能安装则需要手动安装,必须安装内核对应的内核镜像版本,内核信息包

[root@VM_11_7_centos systemtap-3.0]# uname  -a
Linux VM_11_7_centos 3.10.0-327.el7.x86_64 #1 SMP Thu Nov 19 22:10:57 UTC 2015 x86_64 x86_64 x86_64 GNU/Linux

kernel-devel.x86_64 0:3.10.0-327.el7
kernel-debuginfo-3.10.0-327.el7.x86_64

systemtap运行还需要elfutils的支持,最新的systemtap需要使用最新的elfutils

elfutils

部署 SystemTap,需要安装以下两个 RPM 包:

systemtap
systemtap-runtime

TEST

[root@localhost ~]# stap -ve 'probe begin { log("hello world") exit() }'
Pass 1: parsed user script and 115 library script(s) using 230072virt/47472res/3092shr/44892data kb, in 80usr/120sys/197real ms.
Pass 2: analyzed script: 1 probe(s), 2 function(s), 0 embed(s), 0 global(s) using 231128virt/48792res/3212shr/45948data kb, in 0usr/10sys/8real ms.
Pass 3: using cached /root/.systemtap/cache/7f/stap_7f72777f8257e9a432ad3115abd8362f_1030.c
Pass 4: using cached /root/.systemtap/cache/7f/stap_7f72777f8257e9a432ad3115abd8362f_1030.ko
Pass 5: starting run.
hello world
Pass 5: run completed in 10usr/80sys/819real ms.

正常情况下能够看到hello world,但这个测试只能说明软件本身安装正常;要确认可以正常使用,还需要做一些涉及内核模块的测试

#!/usr/bin/stap
# Trace open system calls for four seconds, printing the calling process
# name, its pid and the call's argument string, then stop.

# Runs once when the script starts.
probe begin 
{
    log("begin to probe")
}

# Fires on every open system call; argstr is the formatted argument list.
probe syscall.open
{
    printf ("%s(%d) open (%s)\n", execname(), pid(), argstr)
}

# End the session once the 4000 ms timer fires.
probe timer.ms(4000) # after 4 seconds
{
    exit ()
}

# Runs once when the session ends.
probe end
{
    log("end to probe")
}

这个脚本检测4秒内,调用open的进程

[root@localhost vinny]# stap open.stp 
begin to probe
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
^Cgnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
vmtoolsd(866) open ("/proc/net/dev", O_RDONLY)
vmtoolsd(866) open ("/proc/net/if_inet6", O_RDONLY)
vmtoolsd(866) open ("/proc/net/if_inet6", O_RDONLY)
vmtoolsd(866) open ("/proc/net/if_inet6", O_RDONLY)
vmtoolsd(866) open ("/proc/net/if_inet6", O_RDONLY)
gnome-shell(12291) open ("/proc/self/stat", O_RDONLY)
end to probe

正常的输出如上,但是在tlinux下面,总会出现如下报错

Tlinux

ERROR: Couldn't insert module '/tmp/stapKOJEJH/stap_53172c9aef79113568db5a2ab948f356_1228.ko': Required key not available

这是由于公司安全考虑,在tlinux系统上,所有加载到内核的模块都要通过公司认证 http://sign.tlinux.oa.com/cgi-bin/sign_svr/cgi-bin/login.cgi?url=http://sign.tlinux.oa.com:80/cgi-bin/sign_svr/cgi-bin/index.cgi

参考

http://www.cnblogs.com/hazir/p/systemtap_introduction.html https://dirtysalt.github.io/systemtap.html

[chuba@my031045 ~] # stap -g viewcache.stp

在另外的shell里面

[chuba@my031045 ~] # dmesg


#!/usr/bin/stap
#
# This script scans the buffer/cache LRU lists and counts, per file (inode), how many file-mapped pages are cached
#
#

%{
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/nodemask.h>
#include <linux/gfp.h>
#include <linux/cpuset.h>
#include <linux/delay.h>
%}

%{

%}

%{
/* Map the list entry just before _head (i.e. _head->prev) back to the struct page that embeds it through its lru member. */
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

/*
 * Put a page on the zone's active list and account for it in the
 * zone's nr_active counter.  The callers in this file invoke it with
 * zone->lru_lock held.
 */
static inline void
add_page_to_active_list(struct zone *zone, struct page *page)
{
        struct list_head *active_head = &zone->active_list;

        zone->nr_active += 1;
        list_add(&page->lru, active_head);
}

/*
 * Put a page on the zone's inactive list and account for it in the
 * zone's nr_inactive counter.  The caller in this file invokes it with
 * zone->lru_lock held.
 */
static inline void
add_page_to_inactive_list(struct zone *zone, struct page *page)
{
        struct list_head *inactive_head = &zone->inactive_list;

        zone->nr_inactive += 1;
        list_add(&page->lru, inactive_head);
}

%}

%{
unsigned long NR_TO_SCAN = 128;  //upper bound on the number of pages requested from an LRU list in one batch
%}

%{
/*
 * If _page is not the last remaining candidate on the list headed at
 * _base (its lru.prev does not point back at _base), issue a
 * prefetch-for-write on member _field of the page linked just before
 * it, which is the page lru_to_page() would yield next.
 *
 * Every use of the _page and _base arguments is parenthesized so that
 * arbitrary expressions can be passed safely.
 */
#define prefetchw_prev_lru_page(_page, _base, _field)                   \
        do {                                                            \
                if ((_page)->lru.prev != (_base)) {                     \
                        struct page *prev;                              \
                                                                        \
                        prev = lru_to_page(&((_page)->lru));            \
                        prefetchw(&prev->_field);                       \
                }                                                       \
        } while (0)
%}


%{
/*
 * Move up to nr_to_scan pages from the tail of src onto dst.
 *
 * nr_to_scan - maximum number of pages to examine
 * src        - list to take pages from (walked from its tail)
 * dst        - list receiving the pages that were successfully pinned
 * scanned    - out: number of pages examined
 *
 * A page is moved only if a reference could be taken with
 * get_page_unless_zero(); otherwise it is being freed elsewhere and is
 * re-linked onto src.  Returns the number of pages moved to dst.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                struct list_head *src, struct list_head *dst,
                unsigned long *scanned)
{
        unsigned long examined = 0;
        unsigned long moved = 0;
        struct page *page;

        while (examined < nr_to_scan && !list_empty(src)) {
                page = lru_to_page(src);
                prefetchw_prev_lru_page(page, src, flags);

                BUG_ON(!PageLRU(page));

                list_del(&page->lru);
                if (likely(get_page_unless_zero(page))) {
                        /*
                         * PageLRU may only be cleared once we know the
                         * page is not being freed elsewhere -- the page
                         * release code relies on the flag.
                         */
                        ClearPageLRU(page);
                        list_add(&page->lru, dst);
                        moved++;
                } else {
                        /* being freed elsewhere: leave it on src */
                        list_add(&page->lru, src);
                }
                examined++;
        }

        *scanned = examined;
        return moved;
}
%}


//use a binary search tree (keyed by inode number) to store per-inode data
%{
/* One node of the per-inode page-count tree, ordered by inode number. */
struct cache_node {
        unsigned long ino;    // inode number (the search key)
        unsigned long nr;     // number of in-memory pages counted for this inode so far
        struct cache_node *left, *right;  // subtrees holding smaller / larger inode numbers
};

typedef struct cache_node node;

/* Root of the tree; scan_hold_list() looks up and inserts through it. */
node * root;
/*
 * Search the binary tree rooted at node for the inode number target,
 * descending left for smaller keys and right for larger ones.
 *
 * On a hit the matching node's page counter (nr) is incremented and 1
 * is returned.  If the search falls off the tree -- which includes the
 * empty-tree case -- 0 is returned and nothing is modified.
 */
static int 
cache_node_lookup(node *node, unsigned long target) {
        while (node != NULL) {
                if (target == node->ino) {
                        node->nr++;
                        return 1;
                }
                /* not here: continue in the subtree that can hold target */
                node = (target < node->ino) ? node->left : node->right;
        }
        return 0;
}

/*
 * Link item into the binary search tree whose root pointer is *tree.
 * The walk follows the left link for smaller inode numbers and the
 * right link for larger ones until an empty slot is found.  If a node
 * with the same inode number already exists the tree is left untouched
 * and item is not linked anywhere.
 */
static void
insert_cache_node (node **tree,node *item) 
{
        node **slot = tree;

        while (*slot) {
                if (item->ino < (*slot)->ino)
                        slot = &(*slot)->left;
                else if (item->ino > (*slot)->ino)
                        slot = &(*slot)->right;
                else
                        return;         /* duplicate key: nothing to do */
        }
        *slot = item;
}

#ifdef SW
/*
 * Look target up in the tree at node and, on a miss, allocate a new node
 * with nr = 1 and insert it.  Only compiled when SW is defined.
 *
 * Returns 1 on a hit (the lookup has already incremented the node's
 * counter), 0 after inserting a new node, -ENOMEM if allocation fails.
 *
 * NOTE(review): insert_cache_node() is given &node, the address of this
 * function's own parameter.  When node is NULL the new element is stored
 * only in that local copy, so the caller's tree pointer is not updated --
 * confirm before enabling this code.
 */
static int 
find_get_cache_node(node * node, unsigned long target)
{
        struct cache_node * n;
        if(cache_node_lookup(node, target)) //cache find
                return 1;
        else { // cache miss
                n = kmalloc(sizeof(struct cache_node), GFP_KERNEL);
                if (!n) return -ENOMEM;
                n->ino = target;
                n->nr = 1;
                n->left = NULL;
                n->right = NULL;
                insert_cache_node(&node, n);
                return 0;
        }
}
#endif

/*
 * Print every node of the tree in ascending inode order (in-order
 * walk), one "inode: <ino>, num: <nr>" line per node via printk.
 *
 * A NULL tree is accepted and prints nothing, so the function is safe
 * to call when no node was ever inserted.
 */
static void 
traverse_tree(node *node) {
        if (node == NULL)
                return;
        traverse_tree(node->left);
        printk(KERN_ALERT "inode: %lu, num: %lu\n", node->ino, node->nr);
        traverse_tree(node->right);
}

/*
 * Free every node of the tree with kfree().
 *
 * Both subtrees are released recursively before the node itself
 * (post-order), so no child pointer is read after its parent has been
 * freed.  A NULL tree is accepted and is a no-op.
 */
static void 
destroy_tree(node *node) {
        if (node == NULL)
                return;
        destroy_tree(node->left);
        destroy_tree(node->right);
        kfree(node);
}

/*
 * Drain src, accounting file-backed pages per inode, and move every
 * processed page onto dst.
 *
 * src - list of isolated pages; empty on successful return
 * dst - list that receives each processed page
 * nr  - running counter, incremented once for every page that is neither
 *       swap cache nor tagged PAGE_MAPPING_ANON
 *
 * For such a page that still has a mapping and a host inode, the inode's
 * node in the global tree gets its count bumped by the lookup, or a new
 * node with nr = 1 is inserted on a miss.
 *
 * Returns 0 on success, -ENOMEM if a tree node cannot be allocated.
 */
static int 
scan_hold_list(struct list_head *src, struct list_head *dst, unsigned long *nr)
{
        struct page *page;
        while(!list_empty(src)) {
                page = lru_to_page(src);
                list_del(&page->lru);
                //only file-mapped pages are counted, i.e. anonymous pages are skipped
                if(!PageSwapCache(page) && !((unsigned long)page->mapping & PAGE_MAPPING_ANON)) {
                        //neither the page lock nor the inode lock is held here, so
                        //make sure both the mapping and its inode are still present
                        if(likely(page->mapping && page->mapping->host)) {
                                struct cache_node * curr;
                                int ret;
                                unsigned long ino = page->mapping->host->i_ino ;
                                ret = cache_node_lookup(root, ino);
                                if(ret != 1)  {// not found, so create a new node;
                                        curr = kmalloc(sizeof(struct cache_node), GFP_KERNEL);
                                        //FIXME: on this path the current page is already unlinked
                                        //from src and the rest of src is left unprocessed; none of
                                        //them are put back on the LRU
                                        if (curr == NULL) return -ENOMEM;
                                        curr->ino = ino;
                                        curr->nr = 1;
                                        curr->left = NULL;
                                        curr->right = NULL;
                                        insert_cache_node(&root, curr);
                                }
                        }
                        //increment the counter itself; "*nr++" would only advance the pointer
                        (*nr)++;
                }
                //hand the page over to the destination list
                list_add(&page->lru, dst);
        }
        return 0;
}
%}

/*
 * viewcache: walk the zones of every online node and, for each zone, pull
 * pages off the active and then the inactive LRU list in batches of at most
 * NR_TO_SCAN.  Each batch is passed to scan_hold_list() for per-inode
 * accounting and then put back on the zone's lists.  At the end the
 * per-inode tree is printed with traverse_tree() and freed with
 * destroy_tree().
 *
 * Returns i + j, the two counters handed to scan_hold_list() for the active
 * and inactive passes respectively.
 *
 * NOTE(review): isolate_lru_pages() takes a page reference with
 * get_page_unless_zero(); no matching put_page() is visible in this
 * function -- confirm whether the references are meant to be dropped.
 * NOTE(review): on the "goto done" error path, pages still sitting on
 * l_active / l_inactive / l_hold are not returned to the zone lists.
 */
function viewcache:long()
%{
        pg_data_t *pgdat;

        unsigned long i = 0, j = 0;
        int zone_idx, node_idx;
        int ret;
        struct zone *zone;
        struct page *page = NULL;
        //        struct address_space *mapping;
        unsigned long pgmoved = 0;
        unsigned long scaned;
        unsigned long nr_to_scan = 0;
        //l_active and l_inactive temporarily hold the pages taken off the LRU lists;
        //l_hold receives them after scan_hold_list() has processed them
        LIST_HEAD(l_active);
        LIST_HEAD(l_inactive);
        LIST_HEAD(l_hold);
        unsigned long nr_active ,nr_inactive;
        
        //start with the first zone of the first online node
        zone = NODE_DATA(first_online_node)->node_zones;

        cond_resched();

        for (;;) {
                if (!zone) break;

                cond_resched();

                //snapshot of the list sizes; the loops below count these local copies down
                nr_active = zone->nr_active;
                nr_inactive = zone->nr_inactive;
                node_idx = zone->zone_pgdat->node_id;
                zone_idx = zone - zone->zone_pgdat->node_zones;

                while(nr_active) {
                        nr_to_scan = min(NR_TO_SCAN, nr_active);
                        //log node, zone, remaining count and batch size before taking the lock
                        printk(KERN_ALERT "%d %d %ld %ld\n", node_idx, zone_idx, nr_active, nr_to_scan);
                        spin_lock_irq(&zone->lru_lock);
                        //try to scan 'nr_to_scan' pages: the number examined is stored in 'scaned',
                        //the number moved to the temporary list is returned in 'pgmoved'
                        pgmoved = isolate_lru_pages(nr_to_scan, &zone->active_list,
                                                    &l_active, &scaned);
                        zone->pages_scanned += scaned;
                        zone->nr_active -= pgmoved;
                        spin_unlock_irq(&zone->lru_lock);
                        //account for this batch in the local remaining count
                        nr_active -= nr_to_scan;

                        //l_active now holds the pages taken from the active list; process them
                        //with the lock dropped, moving them onto l_hold
                        if(scan_hold_list(&l_active, &l_hold, &i) < 0 ) goto done;

                        //put the processed pages back on the zone's active list
                        spin_lock_irq(&zone->lru_lock);
                        while(!list_empty(&l_hold)) {
                                page = lru_to_page(&l_hold);
                                prefetchw_prev_lru_page(page, &l_hold, flags);
                                list_del(&page->lru);
                                BUG_ON(PageLRU(page));
                                SetPageLRU(page);
                                BUG_ON(!PageActive(page));
                                add_page_to_active_list(zone, page);
                        }
                        spin_unlock_irq(&zone->lru_lock);
                        //sleep 300 msec between batches
                        msleep(300);
                } 
                while(nr_inactive) {
                        nr_to_scan = min(NR_TO_SCAN, nr_inactive);
                        printk(KERN_ALERT "%d %d %ld %ld\n", node_idx, zone_idx, nr_inactive, nr_to_scan);
                        spin_lock_irq(&zone->lru_lock);
                        pgmoved = isolate_lru_pages(nr_to_scan, &zone->inactive_list,
                                                    &l_inactive, &scaned);
                        zone->pages_scanned += scaned;
                        zone->nr_inactive -= pgmoved;
                        spin_unlock_irq(&zone->lru_lock);
                        nr_inactive -= nr_to_scan;

                        if (scan_hold_list(&l_inactive, &l_hold, &j)<0) goto done;

                        //put each page back on the list that matches its PageActive flag
                        spin_lock_irq(&zone->lru_lock);
                        while(!list_empty(&l_hold)) {
                                page = lru_to_page(&l_hold);
                                BUG_ON(PageLRU(page));
                                SetPageLRU(page);
                                list_del(&page->lru);
                                if (PageActive(page))
                                        add_page_to_active_list(zone, page);
                                else
                                        add_page_to_inactive_list(zone, page);
                        }
                        spin_unlock_irq(&zone->lru_lock);
                        //sleep 300 msecs
                        msleep(300);
                }

                //advance to the next zone of this node, or to the first zone of the
                //next online node; zone becomes NULL after the last node
                if (zone < zone->zone_pgdat->node_zones + MAX_NR_ZONES - 1)
                        zone++;
                else {
                        int nid = next_online_node(zone->zone_pgdat->node_id);
                        if (nid == MAX_NUMNODES)
                                pgdat = NULL;
                        else
                                pgdat = NODE_DATA(nid);
                        if (pgdat)
                                zone = pgdat->node_zones;
                        else
                                zone = NULL;
                }
        }
done:   
        //print result (root is still NULL here if no node was ever inserted)
        traverse_tree(root);
        //free memory 
        destroy_tree(root);
        THIS->__retvalue = i+j;
%}

# Entry point: run the cache scan once, report the total it returns,
# then end the session.
probe begin {
        total = viewcache()
        printf("total file mapped LRU page = %d\n", total)
        exit()
}

来源:  http://blog.yufeng.info/wp-content/uploads/2010/09/viewcache.stp_.txt
#! /usr/bin/env stap
# Report block I/O submitted to one device.
# The device number to watch is taken from the first script argument ($1).
global device_of_interest

# Record the requested device number and echo it in hex.
probe begin {
  device_of_interest = $1
  printf ("device of interest: 0x%x\n", device_of_interest)
}

# On every submit_bio() call, print the submitting process, the device,
# the rw value and the bio size when the bio targets the watched device.
# NOTE(review): relies on submit_bio having an rw parameter and on the
# bio->bi_bdev / bio->bi_size fields -- confirm against the running kernel.
probe kernel.function("submit_bio")
{
  dev = $bio->bi_bdev->bd_dev
  if (dev == device_of_interest)
    printf ("[%s](%d) dev:0x%x rw:%d size:%d\n",
            execname(), pid(), dev, $rw, $bio->bi_size)
}

来源: http://mp.weixin.qq.com/s?__biz=MzA3MzYwNjQ3NA==&mid=2651297322&idx=1&sn=21d124b5f0f8a724a9e080e40e8f841f&chksm=84ff420fb388cb1999464de59859432511b2c6691517b32ac2782b7580caae66e0ca8850828f&mpshare=1&scene=23&srcid=1116LPYnaLVoi0RXGT2bR74H#rd