3

浅析linux kernel network之socket创建

 3 years ago
source link: https://blogread.cn/it/article/5080?f=hot1
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.

浅析linux kernel network之socket创建

浏览:4661次  出处信息

去年受@colyli指点,决定花些时间读一些linux kernel network部分的代码,准备把阅读代码的过程记录下来,也希望能有大牛前来指点,下面就先写一下创建socket对象的过程:

首先,创建socket需要执行socket系统调用:

int socket(int domain, int type, int protocol);

该系统调用有3个参数,在内核中由SYSCALL_DEFINE3定义,具体代码如下:

/*
 * socket() system call: validate the flag bits packed into @type, create the
 * socket object and map it to a file descriptor.
 * Returns the value produced by sock_map_fd() on success, or a negative
 * errno value on failure.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
 int retval;
 struct socket *sock;
 int flags;
 
 /* Everything outside SOCK_TYPE_MASK is treated as creation flags. */
 flags = type & ~SOCK_TYPE_MASK;
 /* Only SOCK_CLOEXEC and SOCK_NONBLOCK are accepted as flags. */
 if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
  return -EINVAL;
 type &= SOCK_TYPE_MASK;
 
 /* Where the two constants differ, rewrite SOCK_NONBLOCK as O_NONBLOCK. */
 if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
  flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
 
 retval = sock_create(family, type, protocol, &sock);
 if (retval < 0)
  goto out;
 
 /*
  * Only the O_CLOEXEC and O_NONBLOCK bits are forwarded.
  * NOTE(review): this presumes SOCK_CLOEXEC has the same value as
  * O_CLOEXEC, since no translation is done for it here -- confirm.
  */
 retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
 if (retval < 0)
  goto out_release;
 
out:
 /* It may be already another descriptor 8) Not kernel problem. */
 return retval;
 
out_release:
 /* Mapping failed: drop the socket created above. */
 sock_release(sock);
 return retval;
}

上述代码中首先做了一个类型检查,type即我们所熟知的SOCK_STREAM,SOCK_DGRAM等sock_type枚举,其取值范围为1-10,均位于type字段的低四位(SOCK_TYPE_MASK宏的值为0xF),而SOCK_CLOEXEC和SOCK_NONBLOCK位于高位。该检查的作用是检查除了基本的socket类型和SOCK_CLOEXEC、SOCK_NONBLOCK选项之外是否设置了其它选项,如果有,则返回-EINVAL。

之后的检查我理解为是一个兼容性检查:在SOCK_NONBLOCK与O_NONBLOCK取值不同的平台上,如果设置了SOCK_NONBLOCK选项,则将flags中的SOCK_NONBLOCK位复位,并将O_NONBLOCK位置位;若两者取值相同则无需转换。这样无论两者的定义是否相同,传给后续函数的都是O_NONBLOCK。

随后调用sock_create创建一个新的socket对象,之后再调用sock_map_fd将该socket对象映射为文件描述符retval,随即将其返回,从而函数得到一个socket描述符。sock_create函数直接调用了__sock_create函数来创建socket对象,具体看__sock_create函数的实现:

int __sock_create(struct net *net, int family, int type, int protocol,
    struct socket **res, int kern)
{
 int err;
 struct socket *sock;
 const struct net_proto_family *pf;
 
 if (family < 0 || family >= NPROTO)
  return -EAFNOSUPPORT;
 if (type < 0 || type >= SOCK_MAX)
  return -EINVAL;
 
 if (family == PF_INET && type == SOCK_PACKET) {
  static int warned;
  if (!warned) {
   warned = 1;
   printk(KERN_INFO \"%s uses obsolete (PF_INET,SOCK_PACKET)\\n\",
          current->comm);
  }
  family = PF_PACKET;
 }
 
 err = security_socket_create(family, type, protocol, kern);
 if (err)
  return err;
 
 sock = sock_alloc();
 if (!sock) {
  if (net_ratelimit())
   printk(KERN_WARNING \"socket: no more sockets\\n\");
  return -ENFILE; /* Not exactly a match, but its the
       closest posix thing */
 }
 
 sock->type = type;
 
#ifdef CONFIG_MODULES
 if (net_families[family] == NULL)
  request_module(\"net-pf-%d\", family);
#endif
 
 rcu_read_lock();
 pf = rcu_dereference(net_families[family]);
 err = -EAFNOSUPPORT;
 if (!pf)
  goto out_release;
 
 /*
  * We will call the ->create function, that possibly is in a loadable
  * module, so we have to bump that loadable module refcnt first.
  */
 if (!try_module_get(pf->owner))
  goto out_release;
 
 /* Now protected by module ref count */
 rcu_read_unlock();
 
 err = pf->create(net, sock, protocol, kern);
 if (err < 0)
  goto out_module_put;
 
 /*
  * Now to bump the refcnt of the [loadable] module that owns this
  * socket at sock_release time we decrement its refcnt.
  */
 if (!try_module_get(sock->ops->owner))
  goto out_module_busy;
 
 /*
  * Now that we\'re done with the ->create function, the [loadable]
  * module can have its refcnt decremented
  */
 module_put(pf->owner);
 err = security_socket_post_create(sock, family, type, protocol, kern);
 if (err)
  goto out_sock_release;
 *res = sock;
 
 return 0;
 
out_module_busy:
 err = -EAFNOSUPPORT;
out_module_put:
 sock->ops = NULL;
 module_put(pf->owner);
out_sock_release:
 sock_release(sock);
 return err;
 
out_release:
 rcu_read_unlock();
 goto out_sock_release;
}

开始先进行安全性检查和兼容性检查,security_socket_create()在未启用安全模块(CONFIG_SECURITY)时是个空函数,可以忽略。之后调用sock_alloc()函数在VFS上分配一个struct socket对象,所有的协议类型创建socket时创建的均为这个对象,可以理解为是所有网络层socket的模板或者说父类,上层协议栈在初始化socket时会根据这个已创建好的struct socket对象创建并初始化一个struct sock对象,这个对象包含更多上层协议栈的详细信息。

接下来的net_families数组是一个全局变量,在系统初始化时在inet_init()函数内进行初始化,其定义如下:

/* Registered protocol families, indexed by family number; NULL when unset. */
static const struct net_proto_family *net_families[NPROTO] __read_mostly;

每个协议族都会在该数组中对应一个net_proto_family结构体,当然,未实现的协议族中对应位置的结构体指针为空,我们只关心最常用的协议族即AF_INET,其值为2,而NPROTO的值等于AF_MAX,在2.6.37内核中值为38。刚才提到该数组在inet_init()中被初始化,查看该函数相关代码可知它调用sock_register()函数注册INET协议族,代码如下:

(void)sock_register(&inet_family_ops);

其中inet_family_ops是一个全局静态变量,其定义如下:

/* Family descriptor that inet_init() hands to sock_register() for PF_INET. */
static const struct net_proto_family inet_family_ops = {
 .family = PF_INET,
 .create = inet_create, /* plays a key role in the socket creation path */
 .owner = THIS_MODULE,
};

再继续查看sock_register()函数,其代码如下:

int sock_register(const struct net_proto_family *ops)
{
 int err;
 
 if (ops->family >= NPROTO) {
  printk(KERN_CRIT \"protocol %d >= NPROTO(%d)\\n\", ops->family,
         NPROTO);
  return -ENOBUFS;
 }
 
 spin_lock(&net_family_lock);
 if (net_families[ops->family])
  err = -EEXIST;
 else {
  net_families[ops->family] = ops;
  err = 0;
 }
 spin_unlock(&net_family_lock);
 
 printk(KERN_INFO \"NET: Registered protocol family %d\\n\", ops->family);
 return err;
}

代码很简单,即将net_proto_family对象插入到net_families数组对应的位置中,由此来完成对net_families的初始化。

OK,继续返回刚才的__sock_create()函数,看下面的代码:

/* Excerpt from __sock_create(): request the family module if none is registered. */
#ifdef CONFIG_MODULES
 if (net_families[family] == NULL)
  request_module("net-pf-%d", family);
#endif

如果编译时支持可安装模块,则首先检测net_families数组中对应的family是否存在,假设此处为AF_INET,如果不存在,即在系统初始化时没有通过sock_register()函数对该元素进行注册,则调用request_module()动态地安装模块net-pf-2。

再往下通过RCU机制获取net_families中对应的net_proto_family对象,RCU机制主要用于网络层和VFS中,关于RCU机制的更多细节可以参考@gnawux师兄的译文:What is RCU, Fundamentally?

之后由注释可知增加该模块的引用计数,重要的函数为pf->create(),当协议族为AF_INET时,由以上可知,create函数为inet_create(),由这个函数进一步创建该socket,我们再看inet_create()函数的实现:

static int inet_create(struct net *net, struct socket *sock, int protocol,
         int kern)
{
 struct sock *sk;
 struct inet_protosw *answer;
 struct inet_sock *inet;
 struct proto *answer_prot;
 unsigned char answer_flags;
 char answer_no_check;
 int try_loading_module = 0;
 int err;
 
        /* 检查是否有加密字符串,如果没有则查检socket类型,
           只有TCP需要加密字符串,如果协议类型不是SOCK_RAW和SOCK_DGRAM的话,
            调用build_ehash_secret()函数创建一个加密字符串 */
 if (unlikely(!inet_ehash_secret))
  if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
   build_ehash_secret();
 
        /* 将socket状态设置为未连接 */
 sock->state = SS_UNCONNECTED;
 
 /* Look for the requested type/protocol pair. */
lookup_protocol:
 err = -ESOCKTNOSUPPORT;
 rcu_read_lock();
 
        /* 遍历inetsw数组对应请求类型的链表元素,
           如果protocol不是通配符类型,即IPPROTO_IP,
           该宏的值为0,则在链表中搜索该协议类型,
           如未找到则返回协议类型不支持,如果是通配符类型,
           则选择一个适合的协议类型,至于inetsw数组,见下面分析 */
 list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
 
  err = 0;
  /* Check the non-wild match. */
  if (protocol == answer->protocol) {
   if (protocol != IPPROTO_IP)
    break;
  } else {
   /* Check for the two wild cases. */
   if (IPPROTO_IP == protocol) {
    protocol = answer->protocol;
    break;
   }
   if (IPPROTO_IP == answer->protocol)
    break;
  }
  err = -EPROTONOSUPPORT;
 }
        /* 动态加载相关协议模块 */
 if (unlikely(err)) {
  if (try_loading_module < 2) {
   rcu_read_unlock();
   /*
    * Be more specific, e.g. net-pf-2-proto-132-type-1
    * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
    */
   if (++try_loading_module == 1)
    request_module(\"net-pf-%d-proto-%d-type-%d\",
            PF_INET, protocol, sock->type);
   /*
    * Fall back to generic, e.g. net-pf-2-proto-132
    * (net-pf-PF_INET-proto-IPPROTO_SCTP)
    */
   else
    request_module(\"net-pf-%d-proto-%d\",
            PF_INET, protocol);
   goto lookup_protocol;
  } else
   goto out_rcu_unlock;
 }
 
 err = -EPERM;
        /* 众所周知,只有root权限用户才可以创始原始套接字,
           此处检查是否有权限创建SOCK_RAW类型的套接字,
           如果无权限,则返回-EPERM */
 if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
  goto out_rcu_unlock; 
 err = -EAFNOSUPPORT;
       /* 检查协议类型是否支持 */
 if (!inet_netns_ok(net, protocol))
  goto out_rcu_unlock;
 
 sock->ops = answer->ops;
 answer_prot = answer->prot;
 answer_no_check = answer->no_check;
 answer_flags = answer->flags;
 rcu_read_unlock();
 
 WARN_ON(answer_prot->slab == NULL);
 
 err = -ENOBUFS;
 sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
 if (sk == NULL)
  goto out;
 
 err = 0;
 sk->sk_no_check = answer_no_check;
 if (INET_PROTOSW_REUSE & answer_flags)
  sk->sk_reuse = 1;
 
 inet = inet_sk(sk);
        /* 是否是基于连接的socket,目前只有SOCK_STREAM是基于连接的socket */
 inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
 
 inet->nodefrag = 0;
 
 if (SOCK_RAW == sock->type) {
  inet->inet_num = protocol;
                /* 如果是原始套接字,则需要创建IP头部 */
  if (IPPROTO_RAW == protocol)
   inet->hdrincl = 1;
 }
 
        /* 是否启用路径MTU发现机制,可以在修改proc文件开启或关闭:
           /proc/sys/net/ipv4/ip_no_pmtu_disc */
 if (ipv4_config.no_pmtu_disc)
  inet->pmtudisc = IP_PMTUDISC_DONT;
 else
  inet->pmtudisc = IP_PMTUDISC_WANT;
 
 inet->inet_id = 0;
 
        /* 该函数利用sock的内容进一步初始化sk */
 sock_init_data(sock, sk);
 
 sk->sk_destruct    = inet_sock_destruct;
 sk->sk_protocol    = protocol;
 sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
 
 inet->uc_ttl = -1;
 inet->mc_loop = 1;
 inet->mc_ttl = 1;
 inet->mc_all = 1;
 inet->mc_index = 0;
 inet->mc_list = NULL;
 
 sk_refcnt_debug_inc(sk);
 
 if (inet->inet_num) {
  /* It assumes that any protocol which allows
   * the user to assign a number at socket
   * creation time automatically
   * shares.
   */
  inet->inet_sport = htons(inet->inet_num);
  /* Add to protocol hash chains. */
  sk->sk_prot->hash(sk);
 }
 
 if (sk->sk_prot->init) {
  err = sk->sk_prot->init(sk);
  if (err)
   sk_common_release(sk);
 }
out:
 return err;
out_rcu_unlock:
 rcu_read_unlock();
 goto out;
}

说一下inetsw这个数组,它也是针对于AF_INET协议族而言的,是一个全局静态变量,包含着创建一个新的socket所需要的所有信息,定义如下:

/* One list of inet_protosw entries per socket type, indexed by sock->type. */
static struct list_head inetsw[SOCK_MAX];

其中每个元素都是一个双向链表,该数组的初始化也是在inet_init()函数中完成的,看其中相关代码:

/* Initialise every per-type list head, then register each inetsw_array entry. */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
 INIT_LIST_HEAD(r);
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
 inet_register_protosw(q);

首先初始化每个链表元素的头,接下来将inetsw_array这个数组中的元素使用inet_register_protosw()函数注册到inetsw数组中,inetsw_array也是一个全局的静态变量,其定义如下:

/*
 * Built-in type/protocol entries registered into inetsw[] at init time:
 * TCP for SOCK_STREAM, UDP for SOCK_DGRAM and a wildcard entry for SOCK_RAW.
 */
static struct inet_protosw inetsw_array[] =
{
 {
  .type =       SOCK_STREAM,
  .protocol =   IPPROTO_TCP,
  .prot =       &tcp_prot,
  .ops =        &inet_stream_ops,
  .no_check =   0,
  .flags =      INET_PROTOSW_PERMANENT |
         INET_PROTOSW_ICSK,
 },
 
 {
  .type =       SOCK_DGRAM,
  .protocol =   IPPROTO_UDP,
  .prot =       &udp_prot,
  .ops =        &inet_dgram_ops,
  .no_check =   UDP_CSUM_DEFAULT,
  .flags =      INET_PROTOSW_PERMANENT,
       },
 
 
       {
        .type =       SOCK_RAW,
        .protocol =   IPPROTO_IP, /* wild card */
        .prot =       &raw_prot,
        .ops =        &inet_sockraw_ops,
        .no_check =   UDP_CSUM_DEFAULT,
        .flags =      INET_PROTOSW_REUSE,
       }
};

它包含了各种协议类型所需要的基本信息。

接下来很重要的一步,就是调用sk_alloc()函数创建一个新的struct sock对象,struct socket和struct sock的区别在于,struct socket是创建每个BSD socket所必须的,它描述BSD socket的一些基本信息,而struct sock则描述网络层的相关信息,它在struct socket的基础上构建。看一下sk_alloc()的定义:

/*
 * Allocate a struct sock and perform its basic initialisation.
 *
 * @net:      network namespace recorded on the sock
 * @family:   protocol family stored in sk_family
 * @priority: allocation flags; __GFP_ZERO is OR-ed in before allocating
 * @prot:     protocol descriptor stored in sk_prot and sk_prot_creator
 *
 * Returns the new sock, or whatever sk_prot_alloc() returned on failure
 * (callers in this file test it against NULL).
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
        struct proto *prot)
{
 struct sock *sk;
 
 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
 if (sk) {
  sk->sk_family = family;
  /*
   * See comment in struct sock definition to understand
   * why we need sk_prot_creator -acme
   */
  sk->sk_prot = sk->sk_prot_creator = prot;
  sock_lock_init(sk);
  /* NOTE(review): get_net() presumably takes a reference on @net -- confirm. */
  sock_net_set(sk, get_net(net));
  atomic_set(&sk->sk_wmem_alloc, 1);
 
  sock_update_classid(sk);
 }
 
 return sk;
}

最后一个参数为当前协议类型对应的struct proto对象,将刚创建的struct sock对象的sk_prot和sk_prot_creator成员初始化为prot。sk_prot_alloc()函数用于创建sock对象,可以在slab分配器上创建,也可以在普通缓存中创建。

接下来由sock_init_data()函数进一步初始化sk:

/*
 * Fill in the generic fields of a freshly allocated sock and, when @sock is
 * non-NULL, link the two objects to each other.
 *
 * @sock: owning socket, may be NULL
 * @sk:   sock to initialise
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
 /*
  * Initialise the receive, write and error queues. Each is a doubly
  * linked list of struct sk_buff, the structure that carries packet
  * data. sk_error_queue is rarely used.
  */
 skb_queue_head_init(&sk->sk_receive_queue);
 skb_queue_head_init(&sk->sk_write_queue);
 skb_queue_head_init(&sk->sk_error_queue);
#ifdef CONFIG_NET_DMA
 skb_queue_head_init(&sk->sk_async_wait_queue);
#endif
 
 sk->sk_send_head = NULL;
 
 init_timer(&sk->sk_timer);
 
 sk->sk_allocation = GFP_KERNEL;
 /* Maximum receive buffer size in bytes. */
 sk->sk_rcvbuf  = sysctl_rmem_default;
 /* Maximum send buffer size in bytes. */
 sk->sk_sndbuf  = sysctl_wmem_default;
 /*
  * Connection state. SOCK_DGRAM and SOCK_RAW socks reuse some of the
  * TCP states as well; every new sock starts out in TCP_CLOSE.
  */
 sk->sk_state  = TCP_CLOSE;
 sk_set_socket(sk, sock);
 
 sock_set_flag(sk, SOCK_ZAPPED);
 
 /* Cross-link sk and sock when a socket was supplied. */
 if (sock) {
  sk->sk_type = sock->type;
  sk->sk_wq = sock->wq;
  sock->sk = sk;
 } else
  sk->sk_wq = NULL;
 
 spin_lock_init(&sk->sk_dst_lock);
 rwlock_init(&sk->sk_callback_lock);
 lockdep_set_class_and_name(&sk->sk_callback_lock,
   af_callback_keys + sk->sk_family,
   af_family_clock_key_strings[sk->sk_family]);
 
 /* Default callbacks; a caller may replace them afterwards. */
 sk->sk_state_change = sock_def_wakeup;
 sk->sk_data_ready = sock_def_readable;
 sk->sk_write_space = sock_def_write_space;
 sk->sk_error_report = sock_def_error_report;
 sk->sk_destruct  = sock_def_destruct;
 
 sk->sk_sndmsg_page = NULL;
 sk->sk_sndmsg_off = 0;
 
 sk->sk_peer_pid  = NULL;
 sk->sk_peer_cred = NULL;
 sk->sk_write_pending = 0;
 sk->sk_rcvlowat  = 1;
 sk->sk_rcvtimeo  = MAX_SCHEDULE_TIMEOUT;
 sk->sk_sndtimeo  = MAX_SCHEDULE_TIMEOUT;
 
 sk->sk_stamp = ktime_set(-1L, 0);
 
 /*
  * Before updating sk_refcnt, we must commit prior changes to memory
  * (Documentation/RCU/rculist_nulls.txt for details)
  */
 smp_wmb();
 atomic_set(&sk->sk_refcnt, 1);
 atomic_set(&sk->sk_drops, 0);
}

在inet_create()函数的最后还有一个很重要的初始化,下面这段代码:

/* Excerpt from inet_create(): run the protocol's init hook, releasing sk on error. */
if (sk->sk_prot->init) {
 err = sk->sk_prot->init(sk);
 if (err)
  sk_common_release(sk);
}

前面在sk_alloc()函数中为sk初始化时将协议类型对应的proto对象指针赋给了sk->sk_prot,这里的init()函数即对应的proto中的init函数,当协议类型为SOCK_STREAM时,prot即为tcp_prot(见inetsw_array), 对应的init()函数即为:

static int tcp_v4_init_sock(struct sock *sk)

觉得文章有用?立即:

和朋友一起 共学习 共进步!

建议继续学习:

QQ技术交流群:445447336,欢迎加入!
扫一扫订阅我的微信号:IT技术博客大学习

About Joyk


Aggregate valuable and interesting links.
Joyk means Joy of geeK