品牌推廣網站怎樣做關鍵詞優化排名查詢
一、rbd內核驅動寫入流程
1)初始化
首先是rbd驅動的初始化工作:包括驗證libceph的兼容性,分配內存,在sysfs中創建塊設備控制文件、創建工作隊列rbd_wq;另外,每個請求對應的work由rbd_init_request調用INIT_WORK進行初始化
module_init(rbd_init);

/*
 * rbd_init - module entry point of the rbd driver (excerpt).
 *
 * Checks libceph compatibility, sets up the slab caches, allocates the
 * rbd_wq workqueue and registers the sysfs control files.  Parts of the
 * function that the article elides are marked with an ellipsis comment;
 * the err_out_* labels live in the elided tail.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init rbd_init(void)
{
	int rc;

	/* Refuse to load against an incompatible libceph. */
	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	/*
	 * Slab caches come first: nothing has been acquired yet, so a
	 * failure here can simply return, and a later workqueue failure
	 * can unwind through err_out_slab with the slabs actually set up.
	 */
	rc = rbd_slab_init();
	if (rc)
		return rc;

	/* Create the workqueue that will run the per-request work items. */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	/* ... */

	/* Create /sys/bus/rbd/. */
	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	/* ... (success return and err_out_* unwind labels elided) */
}

/*
 * rbd_init_request - blk-mq .init_request callback.
 *
 * The per-request driver data (pdu) of @rq is a work_struct; bind it to
 * rbd_queue_workfn so the request can later be handed to a workqueue.
 * Nothing is queued here.  Always returns 0.
 */
static int rbd_init_request(void *data, struct request *rq,
			    unsigned int hctx_idx, unsigned int request_idx,
			    unsigned int numa_node)
{
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	INIT_WORK(work, rbd_queue_workfn);
	return 0;
}
2)塊設備創建、工作隊列中啟動work
添加塊設備,首先創建一個rbd client用來通信,然后選擇一個pool存儲池去創建rbd設備,創建完成后調用rbd_dev_device_setup初始化rbd設備,在初始化塊設備的時候會啟動工作隊列rbd_wq,并將通用塊設備層的請求轉化為一個work添加到rbd_wq工作隊列中,然后由cpu調度執行工作隊列rbd_wq中的work,work對應的處理函數為rbd_queue_workfn,該work用于處理通用塊設備層的IO請求。
啟動work的調用關係: rbd_dev_device_setup → rbd_init_disk → rbd_mq_ops → rbd_init_request → rbd_queue_workfn處理函數
static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,const struct blk_mq_queue_data *bd)
{struct request *rq = bd->rq;struct work_struct *work = blk_mq_rq_to_pdu(rq); //通用塊設(shè)備層請(qǐng)求轉(zhuǎn)為workqueue_work(rbd_wq, work); //將work加入到工作隊(duì)列,工作隊(duì)列中的work由cpu調(diào)度處理return BLK_MQ_RQ_QUEUE_OK;
}static ssize_t rbd_add(struct bus_type *bus,const char *buf,size_t count)
{if (single_major)return -EINVAL;return do_rbd_add(bus, buf, count);
}static ssize_t do_rbd_add(struct bus_type *bus,const char *buf,size_t count)
{.....rbdc = rbd_get_client(ceph_opts); //獲取或創(chuàng)建rbd_clientif (IS_ERR(rbdc)) {rc = PTR_ERR(rbdc);goto err_out_args;}/* pick the pool */rc = rbd_add_get_pool_id(rbdc, spec->pool_name); //選擇存儲(chǔ)池if (rc < 0) {if (rc == -ENOENT)pr_info("pool %s does not exist\n", spec->pool_name);goto err_out_client;}spec->pool_id = (u64)rc;rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); //創(chuàng)建rbd設(shè)備down_write(&rbd_dev->header_rwsem);
......rc = rbd_dev_image_probe(rbd_dev, 0); //探針更多的是檢查rbd image是否被mapif (rc < 0) {up_write(&rbd_dev->header_rwsem);goto err_out_rbd_dev;}
......rc = rbd_dev_device_setup(rbd_dev); //包括obj->pg映射等static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{int ret;
....../* Set up the blkdev mapping. */ret = rbd_init_disk(rbd_dev); ......
}static int rbd_init_disk(struct rbd_device *rbd_dev)
{struct gendisk *disk;struct request_queue *q;u64 segment_size;int err;
.....memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));rbd_dev->tag_set.ops = &rbd_mq_ops; //rbd_dev初始化rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
.....
}static struct blk_mq_ops rbd_mq_ops = {.queue_rq = rbd_queue_rq,.init_request = rbd_init_request, //調(diào)用rbd_init_request
};static int rbd_init_request(void *data, struct request *rq,unsigned int hctx_idx, unsigned int request_idx,unsigned int numa_node)
{struct work_struct *work = blk_mq_rq_to_pdu(rq);INIT_WORK(work, rbd_queue_workfn); //通過work_struct啟動(dòng)線程return 0;
}
3)work處理函數rbd_queue_workfn內流程分析
從上層取出通用塊設備層請求后,轉換為image對象,再從image對象批量轉為object對象,再計算出object到pg,pg到osd的映射關係。
3.1 獲取通用塊設備層信息
在rbd_queue_workfn中,通過blk_mq_rq_from_pdu獲取到通用塊設備層IO請求rq、通過blk_rq_bytes(rq)獲取到請求中需要寫入的數據長度length(length表示的是客戶端需要寫到磁盤總的數據長度),通過blk_rq_pos(rq)獲取以扇區為單位的起始位置,再左移SECTOR_SHIFT位換算成字節,得到塊設備寫入偏移量offset。
static void rbd_queue_workfn(struct work_struct *work)
{struct request *rq = blk_mq_rq_from_pdu(work); //通用塊設(shè)備層請(qǐng)求struct rbd_device *rbd_dev = rq->q->queuedata;struct rbd_img_request *img_request;struct ceph_snap_context *snapc = NULL;u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; //塊設(shè)備的偏移量u64 length = blk_rq_bytes(rq); //enum obj_operation_type op_type;
.....
}
3.2 通用塊設備層信息轉換image請求,image請求批量轉換為object
在rbd_queue_workfn中從通用塊設備層請求中獲取到塊設備偏移offset和長度length后,再使用這些指標來創建img_request并填充img_request→offset,然后調用rbd_img_request_fill函數,在該函數中,基于rados object的大小(4M)與rados對象在rbd中的segment排列,對請求進行拆分,最終將rbd_img_request拆分成多個rbd_obj_request對象,通過這樣的過程實現從linux內核的通用塊請求到ceph rados object的轉換。
static void rbd_queue_workfn(struct work_struct *work)
{struct request *rq = blk_mq_rq_from_pdu(work);struct rbd_device *rbd_dev = rq->q->queuedata;struct rbd_img_request *img_request;u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; //塊設(shè)備偏移u64 length = blk_rq_bytes(rq); //長(zhǎng)度
......img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, //創(chuàng)建img_requestsnapc); img_request->offset = offset; //填充img_request→offsetresult = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, //將rbd_img_request劃分為一個(gè)個(gè)rbd_obj_requestrq->bio);
.....
}static int rbd_img_request_fill(struct rbd_img_request *img_request,enum obj_request_type type,void *data_desc)
{struct rbd_obj_request *obj_request = NULL;u64 img_offset;img_offset = img_request->offset; //塊設(shè)備當(dāng)前寫入的偏移位置resid = img_request->length; //待寫入的長(zhǎng)度while (resid) {
......object_name = rbd_segment_name(rbd_dev, img_offset); //對(duì)象名length = rbd_segment_length(rbd_dev, img_offset, resid); //長(zhǎng)度obj_request = rbd_obj_request_create(object_name, //創(chuàng)建obj_request對(duì)象offset, length, type);
......img_offset += length; //偏移增加lengthresid -= length;
......
}
3.3 rbd塊設備offset到rados object的映射
rbd塊設備到rados對象的映射是根據rados對象的大小以及當前塊設備的偏移量來決定的,并且rados對象的命名方式為:前綴rbd_data.$image_id,后接16位16進制的序號。
3.3.1 rados對象大小與命名方式
每個rbd塊設備都定義了一個2為底的指數來表示每個rbd對象的大小,這個指數稱為rbd的obj order。obj order默認值為22,因此每個rbd對象大小為2^22 Bytes,即每個rados對象大小為4MB。