fid介绍
fid是lustre文件系统中文件(或对象)在整个集群内的唯一标识,总共128位,由三部分组成:64位的fid序列号(f_seq)、32位的序列内编号(f_oid)和32位的fid版本号(f_ver,目前未使用,默认为0)
/**
 * struct lu_fid - Lustre File IDentifier (64 + 32 + 32 = 128 bits).
 *
 * A FID names a file or an object (stripe) uniquely across the whole
 * cluster, and a FID is never reused.
 */
struct lu_fid {
	/**
	 * Sequence number. A sequence is the unit of migration: every
	 * file (object) whose FID belongs to a given sequence is stored
	 * on the same server. Lustre is meant to support 2^64 objects,
	 * so 2^64 objects can still be enumerated even if each sequence
	 * holds only a single object.
	 */
	__u64 f_seq;
	/** Object id: the FID number inside its sequence. */
	__u32 f_oid;
	/**
	 * Version, intended to tell apart different versions (snapshots
	 * and the like) of the same file system object. Not currently
	 * used.
	 */
	__u32 f_ver;
};
fid获取流程
fld:fid location database
sequence controller: 运行在MDT0上,拥有全量的fld信息
sequence server:运行在每个MDT(包括MDT0,MDT0上同时还运行sequence controller)和OST上;各sequence server持有的sequence范围互不重叠,其fld信息是MDT0上全量fld的子集
sequence client:每个客户端在挂载文件系统时会提前申请一部分sequence,每个客户端拿到的sequence不会有重叠
管理fid范围的结构体:
/**
 * struct lu_seq_range - a half-open range of sequence numbers.
 *
 * The range is [lsr_start, lsr_end): lsr_start belongs to it, lsr_end
 * does not. The fld module uses the same structure, in which case
 * lsr_index holds the MDT id of the home MDT.
 */
struct lu_seq_range {
	__u64 lsr_start;	/* first sequence number of the range (inclusive) */
	__u64 lsr_end;		/* end of the range (exclusive) */
	__u32 lsr_index;	/* in fld: MDT id of the home MDT */
	__u32 lsr_flags;	/* range flags; NOTE(review): values not shown here */
};
同一客户端在同一个MDT上创建的文件,如果当前序列号尚未用完,则这些文件的fid序列号相同,序列内编号依次递增。
如果序列号使用完,则客户端会向服务端申请下一批序列号
例:假设同一客户端依次在MDT0上创建test1和test2,那么test1的fid为[0x20001:0x1:0x0],test2的fid为[0x20001:0x2:0x0]
fid申请流程
服务端初始化阶段
mdt:
/*
 * Set up the sequence services hosted by an MDT.
 *
 * On the node whose ss_node_id is 0 (MDT0) a sequence controller is
 * created first; it is the one handing sequence ranges to the sequence
 * servers. Every MDT, MDT0 included, then creates its normal sequence
 * server and the sequence client that server uses to talk to the
 * controller. Nodes other than MDT0 finally register the controller
 * export through lwp.
 *
 * Returns 0 on success or a negative errno. Apart from a failed
 * controller allocation (which returns immediately), every failure
 * path calls mdt_seq_fini() before returning.
 */
static int mdt_seq_init(const struct lu_env *env, struct mdt_device *mdt)
{
	struct seq_server_site *site;
	int rc;
	ENTRY;

	site = mdt_seq_site(mdt);

	/* Sequence controller: only on MDT0. */
	if (site->ss_node_id == 0) {
		OBD_ALLOC_PTR(site->ss_control_seq);
		if (!site->ss_control_seq)
			RETURN(-ENOMEM);

		rc = seq_server_init(env, site->ss_control_seq,
				     mdt->mdt_bottom, mdt_obd_name(mdt),
				     LUSTRE_SEQ_CONTROLLER, site);
		if (rc != 0)
			GOTO(out_seq_fini, rc);
	}

	/*
	 * Normal sequence server: this part is unconditional, so it runs
	 * on MDT0 as well as on every other MDT.
	 */
	OBD_ALLOC_PTR(site->ss_server_seq);
	if (!site->ss_server_seq)
		GOTO(out_seq_fini, rc = -ENOMEM);

	rc = seq_server_init(env, site->ss_server_seq, mdt->mdt_bottom,
			     mdt_obd_name(mdt), LUSTRE_SEQ_SERVER, site);
	if (rc != 0)
		GOTO(out_seq_fini, rc);

	/* Sequence client through which the server talks to the controller. */
	rc = mdt_seq_init_cli(env, mdt);
	if (rc != 0)
		GOTO(out_seq_fini, rc);

	if (site->ss_node_id != 0) {
		/* register controller export through lwp */
		rc = mdt_register_seq_exp(mdt);
	}

	EXIT;
out_seq_fini:
	if (rc != 0)
		mdt_seq_fini(env, mdt);

	return rc;
}
ost:
/*
 * Set up the sequence service hosted by an OST (OFD device).
 *
 * Hooks the OFD's seq_server_site into its lu_site, creates the
 * sequence server, then creates the "<obd_name>-super" sequence client
 * and attaches it to the server with seq_server_set_cli().
 *
 * Returns 0 on success or a negative errno. On failure whatever was
 * set up so far is finalized and freed, and the matching
 * ss_client_seq / ss_server_seq pointers are reset to NULL. The
 * temporary client name buffer is released on every path.
 */
int ofd_fid_init(const struct lu_env *env, struct ofd_device *ofd)
{
	struct seq_server_site *ss = &ofd->ofd_seq_site;
	struct lu_device *lu = &ofd->ofd_dt_dev.dd_lu_dev;
	char *obd_name = ofd_name(ofd);
	char *name = NULL;
	/* obd_name + "-super" (6 characters) + terminating NUL */
	int len = strlen(obd_name) + 7;
	int rc = 0;

	lu->ld_site->ld_seq_site = ss;
	ss->ss_lu = lu->ld_site;
	ss->ss_node_id = ofd->ofd_lut.lut_lsd.lsd_osd_index;

	OBD_ALLOC(name, len);
	if (name == NULL)
		return -ENOMEM;

	OBD_ALLOC_PTR(ss->ss_server_seq);
	if (ss->ss_server_seq == NULL)
		GOTO(out_name, rc = -ENOMEM);

	/* Run a sequence server on the OST. */
	rc = seq_server_init(env, ss->ss_server_seq, ofd->ofd_osd, obd_name,
			     LUSTRE_SEQ_SERVER, ss);
	if (rc) {
		CERROR("%s: seq server init error: rc = %d\n", obd_name, rc);
		GOTO(out_server, rc);
	}
	ss->ss_server_seq->lss_space.lsr_index = ss->ss_node_id;

	OBD_ALLOC_PTR(ss->ss_client_seq);
	if (ss->ss_client_seq == NULL)
		GOTO(out_server, rc = -ENOMEM);

	snprintf(name, len, "%s-super", obd_name);
	/* Sequence client the OST's sequence server allocates through. */
	rc = seq_client_init(ss->ss_client_seq, NULL, LUSTRE_SEQ_DATA,
			     name, NULL);
	if (rc) {
		CERROR("%s: seq client init error: rc = %d\n", obd_name, rc);
		GOTO(out_client, rc);
	}

	rc = seq_server_set_cli(env, ss->ss_server_seq, ss->ss_client_seq);
	if (rc == 0)
		goto out_name;	/* success: only the name buffer is released */

	/* Error unwinding, in reverse order of setup. */
out_client:
	seq_client_fini(ss->ss_client_seq);
	OBD_FREE_PTR(ss->ss_client_seq);
	ss->ss_client_seq = NULL;
out_server:
	seq_server_fini(ss->ss_server_seq, env);
	OBD_FREE_PTR(ss->ss_server_seq);
	ss->ss_server_seq = NULL;
out_name:
	OBD_FREE(name, len);

	return rc;
}
当客户端创建新文件时,会先检查本地已申请到的sequence是否用尽;未用尽则直接在本地分配fid,并在向MDT发送创建请求时携带这个新分配的fid,由MDT进行处理
当客户端本地的sequence不够用时,会向sequence server申请新的sequence,如果sequence server上的sequence也不够用了,那么sequence server会向sequence controller申请新的sequence,最终返回新的sequence给客户端。
seq cli <–> seq svr
当client中申请的seq使用完之后会向server申请新的seq
/*
 * Excerpt (elided parts are marked "......"): client-side sequence
 * allocation.
 *
 * When the client's range seq->lcs_space is exhausted, a new range is
 * requested with seq_client_alloc_meta(). A failure other than
 * -EINPROGRESS is reported through CERROR, and any failure is returned
 * to the caller; on success the new range is logged at D_INFO.
 * How *seqnr is filled in is not part of this excerpt.
 */
static int seq_client_alloc_seq(const struct lu_env *env,
struct lu_client_seq *seq, u64 *seqnr)
{
......
// If the local range is exhausted, call seq_client_alloc_meta() to get a new one
if (lu_seq_range_is_exhausted(&seq->lcs_space)) {
rc = seq_client_alloc_meta(env, seq);
if (rc) {
if (rc != -EINPROGRESS)
CERROR("%s: Can't allocate new meta-sequence,"
"rc = %d\n", seq->lcs_name, rc);
RETURN(rc);
} else {
CDEBUG(D_INFO, "%s: New range - "DRANGE"\n",
seq->lcs_name, PRANGE(&seq->lcs_space));
}
} else {
rc = 0;
}
......
RETURN(rc);
}
/*
 * Excerpt (elided parts are marked "......"): send a sequence
 * allocation request and copy the range found in the reply to *output.
 *
 * The portals depend on seq->lcs_type: LUSTRE_SEQ_METADATA clients use
 * MDC_REPLY_PORTAL / SEQ_METADATA_PORTAL, all others use
 * OSC_REPLY_PORTAL / SEQ_DATA_PORTAL. If ptlrpc_queue_wait() fails the
 * function jumps to out_req (label not shown here) with that error.
 */
static int seq_client_rpc(struct lu_client_seq *seq,
struct lu_seq_range *output, __u32 opc,
const char *opcname)
{
......
if (seq->lcs_type == LUSTRE_SEQ_METADATA) {
req->rq_reply_portal = MDC_REPLY_PORTAL;
req->rq_request_portal = SEQ_METADATA_PORTAL;
} else {
req->rq_reply_portal = OSC_REPLY_PORTAL;
req->rq_request_portal = SEQ_DATA_PORTAL;
}
......
rc = ptlrpc_queue_wait(req);
if (rc)
GOTO(out_req, rc);
// Fetch the newly allocated range from the reply
// NOTE(review): 'out' is dereferenced without a NULL check in the
// visible lines - confirm the reply buffer is guaranteed to be present.
out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE);
*output = *out;
......
}
seq server对应处理函数:
/*
 * Request handler of the sequence service.
 *
 * Reads the opcode (RMF_SEQ_OPC) and the range sent by the client
 * (RMF_SEQ_RANGE), copies the client's lsr_index / lsr_flags into the
 * reply range and lets seq_server_handle() perform the allocation into
 * that reply range.
 *
 * Returns the result of seq_server_handle(), or err_serious(-EPROTO)
 * when the opcode, the reply range buffer or the client range is
 * missing from the request capsule.
 */
static int seq_handler(struct tgt_session_info *tsi)
{
	struct lu_seq_range *out, *tmp;
	struct lu_site *site;
	int rc;
	__u32 *opc;
	ENTRY;

	LASSERT(!(lustre_msg_get_flags(tgt_ses_req(tsi)->rq_reqmsg) & MSG_REPLAY));
	site = tsi->tsi_exp->exp_obd->obd_lu_dev->ld_site;
	LASSERT(site != NULL);

	opc = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_OPC);
	if (opc == NULL)
		RETURN(err_serious(-EPROTO));

	out = req_capsule_server_get(tsi->tsi_pill, &RMF_SEQ_RANGE);
	if (out == NULL)
		RETURN(err_serious(-EPROTO));

	/*
	 * The client-supplied range is checked like the other two
	 * capsule fields: it comes from the wire and must not be
	 * dereferenced when absent.
	 */
	tmp = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_RANGE);
	if (tmp == NULL)
		RETURN(err_serious(-EPROTO));

	/* seq client passed mdt id, we need to pass that using out
	 * range parameter */
	out->lsr_index = tmp->lsr_index;
	out->lsr_flags = tmp->lsr_flags;

	/* The actual allocation of the new range happens here. */
	rc = seq_server_handle(site, tsi->tsi_env, *opc, out);

	RETURN(rc);
}
/*
 * Excerpt (declarations, the other opcodes and the function tail are
 * elided as "......"): opcode dispatch as seen on a sequence server.
 *
 * SEQ_ALLOC_META fails with -EINVAL when the site has no sequence
 * server and with -EROFS when the dt device behind the server's lss_obj
 * is marked read-only (dd_rdonly); otherwise the range is allocated
 * into *out by seq_server_alloc_meta().
 */
static int seq_server_handle(struct lu_site *site,
const struct lu_env *env,
__u32 opc, struct lu_seq_range *out)
{
switch (opc) {
case SEQ_ALLOC_META:
if (!ss_site->ss_server_seq) {
CERROR("Sequence server is not "
"initialized\n");
RETURN(-EINVAL);
}
dev = lu2dt_dev(ss_site->ss_server_seq->lss_obj->do_lu.lo_dev);
if (dev->dd_rdonly)
RETURN(-EROFS);
rc = seq_server_alloc_meta(ss_site->ss_server_seq, out, env);
break;
......
}
/*
 * Excerpt (elided parts are marked "......"): locked entry point for
 * meta-sequence allocation. Runs __seq_server_alloc_meta() with
 * seq->lss_mutex held and returns its result.
 */
int seq_server_alloc_meta(struct lu_server_seq *seq,
struct lu_seq_range *out,
const struct lu_env *env)
{
......
mutex_lock(&seq->lss_mutex);
rc = __seq_server_alloc_meta(seq, out, env);
mutex_unlock(&seq->lss_mutex);
RETURN(rc);
}
/*
 * Annotated outline, not compilable C: the braces following the call to
 * seq_server_check_and_alloc_super() sketch what that callee does, and
 * "......" marks elided code.
 *
 * Flow shown: make sure the server still has sequence space (asking the
 * controller for more when it is exhausted), then allocate the range
 * for the requester with range_alloc_set().
 */
static int __seq_server_alloc_meta(struct lu_server_seq *seq,
struct lu_seq_range *out,
const struct lu_env *env)
{
struct lu_seq_range *space = &seq->lss_space;
int rc = 0;
......
// Check whether the server still has sequences left; if not, request more from the controller
rc = seq_server_check_and_alloc_super(env, seq){
// pseudo-code condition on the next line: "if exhausted"
if 耗尽{
// Ask MDT0 for a new usable sequence range
rc = seq_client_alloc_super(seq->lss_cli, env);
// Insert it into the fld of the local sequence server
rc = fld_insert_entry(env, fld, space);
}
}
......
// Update the server's sequence state, persist it (to ldiskfs, per the
// article author's note) and hand the allocated range back through 'out'
rc = range_alloc_set(env, out, seq);
......
}
seq svr <–> seq controller
当seq server中seq不足时,会向seq controller申请新的seq
seq server :
/*
 * Make sure the sequence server has sequence space to allocate from.
 *
 * If seq->lss_space is exhausted, a new super-sequence is requested
 * through the attached sequence client (seq->lss_cli) and becomes the
 * new allocation space. When that client has no local server
 * (lcs_srv == NULL) the new range is also inserted into the local FLDB
 * under fld->lsf_lock. Finally, the low-water set is initialized if it
 * is still zero.
 *
 * Returns 0 on success; -ENODEV when the space is exhausted but no
 * sequence client is attached yet; otherwise the error reported by
 * seq_client_alloc_super() or fld_insert_entry().
 */
int seq_server_check_and_alloc_super(const struct lu_env *env,
				     struct lu_server_seq *seq)
{
	struct lu_seq_range *space = &seq->lss_space;
	int rc = 0;
	ENTRY;

	/* Check if available space ends and allocate new super seq */
	if (lu_seq_range_is_exhausted(space)) {
		/*
		 * The client is attached after the server is
		 * initialized, so it may legitimately be missing; fail
		 * cleanly instead of dereferencing a NULL pointer.
		 */
		if (seq->lss_cli == NULL) {
			CERROR("%s: No sequence controller is attached.\n",
			       seq->lss_name);
			RETURN(-ENODEV);
		}

		/* Ask the controller (MDT0) for a new super-sequence. */
		rc = seq_client_alloc_super(seq->lss_cli, env);
		if (rc) {
			CDEBUG(D_HA, "%s: Can't allocate super-sequence:"
			       " rc %d\n", seq->lss_name, rc);
			RETURN(rc);
		}

		/* Saving new range to allocation space. */
		*space = seq->lss_cli->lcs_space;
		LASSERT(lu_seq_range_is_sane(space));

		if (seq->lss_cli->lcs_srv == NULL) {
			struct lu_server_fld *fld;

			/* Insert it to the local FLDB */
			fld = seq->lss_site->ss_server_fld;
			mutex_lock(&fld->lsf_lock);
			rc = fld_insert_entry(env, fld, space);
			mutex_unlock(&fld->lsf_lock);
		}
	}

	if (lu_seq_range_is_zero(&seq->lss_lowater_set))
		__seq_set_init(env, seq);

	RETURN(rc);
}
controller:
/*
 * Opcode dispatch as seen on the sequence controller (the
 * SEQ_ALLOC_META branch is elided as "......").
 *
 * SEQ_ALLOC_SUPER fails with -EINVAL when the site has no sequence
 * controller and with -EROFS when the dt device behind the controller's
 * lss_obj is marked read-only (dd_rdonly); otherwise the super-sequence
 * is allocated into *out by seq_server_alloc_super(). Unknown opcodes
 * yield -EINVAL.
 */
static int seq_server_handle(struct lu_site *site,
const struct lu_env *env,
__u32 opc, struct lu_seq_range *out)
{
int rc;
struct seq_server_site *ss_site;
struct dt_device *dev;
ENTRY;
ss_site = lu_site2seq(site);
switch (opc) {
case SEQ_ALLOC_META:
......
break;
case SEQ_ALLOC_SUPER:
/* Only a node running the sequence controller can serve this. */
if (!ss_site->ss_control_seq) {
CERROR("Sequence controller is not "
"initialized\n");
RETURN(-EINVAL);
}
dev = lu2dt_dev(ss_site->ss_control_seq->lss_obj->do_lu.lo_dev);
/* Refuse to allocate when the device is read-only. */
if (dev->dd_rdonly)
RETURN(-EROFS);
rc = seq_server_alloc_super(ss_site->ss_control_seq, out, env);
break;
default:
rc = -EINVAL;
break;
}
RETURN(rc);
}
/*
 * Locked entry point for super-sequence allocation: runs
 * __seq_server_alloc_super() with seq->lss_mutex held and returns its
 * result unchanged.
 */
int seq_server_alloc_super(struct lu_server_seq *seq,
			   struct lu_seq_range *out,
			   const struct lu_env *env)
{
	int result;
	ENTRY;

	mutex_lock(&seq->lss_mutex);
	result = __seq_server_alloc_super(seq, out, env);
	mutex_unlock(&seq->lss_mutex);

	RETURN(result);
}
/*
 * Allocate a super-sequence out of the sequence space owned by 'seq'.
 *
 * Fails with -ENOSPC when seq->lss_space is exhausted. Otherwise a
 * range of width seq->lss_width is allocated from that space into *out
 * and the change is stored synchronously with seq_store_update(),
 * whose return value becomes the result. The outcome is reported on
 * the console either way.
 */
static int __seq_server_alloc_super(struct lu_server_seq *seq,
				    struct lu_seq_range *out,
				    const struct lu_env *env)
{
	struct lu_seq_range *space = &seq->lss_space;
	int rc;
	ENTRY;

	LASSERT(lu_seq_range_is_sane(space));

	/* Nothing left to hand out. */
	if (lu_seq_range_is_exhausted(space)) {
		CERROR("%s: Sequences space is exhausted\n",
		       seq->lss_name);
		RETURN(-ENOSPC);
	}

	/* Allocate the requester's range from the local space. */
	range_alloc(out, space, seq->lss_width);

	/* Store the updated sequence state. */
	rc = seq_store_update(env, seq, out, 1 /* sync */);

	LCONSOLE_INFO("%s: super-sequence allocation rc = %d " DRANGE"\n",
		      seq->lss_name, rc, PRANGE(out));

	RETURN(rc);
}
上述内容如有理解不正确的地方,欢迎各位大佬指正[手动抱拳]
参考了一位大佬的文章,链接如下:https://cloud.tencent.com/developer/article/2074601