tinyhttp源码分析

tinyhttp是一个用c写的轻量级的http server，相比较nginx，apache这类的server，它完全和他们不是一个量级的东西，像是小山包和喜马拉雅山的区别，但是看看这个源码，对了解http协议和 http server的原理是超级有帮助的！！！

tinyhttp是用c写的，相比较用python或者是其他高级语言实现的http server，c语言可以更加从底层地让读者明白一个浏览器的请求是如何被server响应的，以及server是如何response的。大概用一天的时间读完，真的是把我爽到了，强烈推荐！！！

另外，如果你有空，可以看一下web bench的源码，这个是http压测工具，模拟并发的生成http请求。可以结合起来看一下，还是可以学到很多东西的。

源码：https://github.com/zhaozhengcoder/rebuild-the-wheel/tree/master/tinyhttpd

tinyhttp的框架

下图是看源码的时候，手画的一个流程图。

微信图片_20171108143150.jpg

我把一些核心代码和相应的注释贴在这里，如果你对全部代码感兴趣，可以移步我的github。
https://github.com/zhaozhengcoder/rebuild-the-wheel/tree/master/tinyhttpd

main函数：

/*
 * Server entry point: listen on a port and hand every accepted connection
 * to its own thread running accept_request().
 *
 * The loop never terminates on its own, so the close()/return after it are
 * only reached if the loop is changed to break out.
 */
int main(void)
{
    int server_sock = -1;
    /* port to listen on; passed by address so startup() can update it */
    u_short port = 4000;
    int client_sock = -1;
    struct sockaddr_in client_name;
    socklen_t  client_name_len;
    pthread_t newthread;

    server_sock = startup(&port);
    printf("httpd running on port %d\n", port);
    while (1)
    {
        /* accept() treats the length as a value-result argument, so it has
         * to be reset to the full buffer size before every call. */
        client_name_len = sizeof(client_name);

        /* block until a client connects */
        client_sock = accept(server_sock,(struct sockaddr *)&client_name,&client_name_len);
        if (client_sock == -1)
            error_die("accept");

        /* one thread per connection (i.e. per HTTP request); the socket
         * descriptor travels inside the void * argument */
        if (pthread_create(&newthread , NULL, (void *)accept_request, (void *)(intptr_t)client_sock) != 0)
        {
            perror("pthread_create");
            /* no thread will ever own this socket, so release it here */
            close(client_sock);
        }
        else
        {
            /* nobody joins the worker; detach it so its resources are
             * reclaimed as soon as it finishes */
            pthread_detach(newthread);
        }
    }
    close(server_sock);
    return(0);
}

accept_request函数：

/*
 * Thread body: handle a single HTTP request on an accepted connection.
 *
 * arg carries the client socket descriptor, packed into the pointer value
 * through an intptr_t cast. This function reads only the request line
 * (e.g. "GET / HTTP/1.1") to extract the method and URL, then dispatches to
 * serve_file() for plain files or execute_cgi() for POST requests, GET
 * requests with a query string, and executable files. The socket is closed
 * before returning on every path.
 */
void accept_request(void *arg)
{
    int client = (intptr_t)arg;
    char buf[1024];
    size_t numchars;
    char method[255];
    char url[255];
    char path[512];
    size_t i, j;
    struct stat st;
    int cgi = 0;      /* becomes true if server decides this is a CGI */
    int found;        /* non-zero once stat() succeeded on the final path */
    char *query_string = NULL;

    /* first line of the request, typically: GET / HTTP/1.1 */
    numchars = get_line(client, buf, sizeof(buf));
    printf("buf : %s",buf);

    /* copy the method token; stop at the end of the received data so an
     * empty or unterminated line never reads uninitialised buffer bytes */
    i = 0;
    while ((i < numchars) && !ISspace(buf[i]) && (i < sizeof(method) - 1))
    {
        method[i] = buf[i];
        i++;
    }
    j = i;
    method[i] = '\0';

    /* only GET and POST are supported; anything else gets a 501 reply */
    if (strcasecmp(method, "GET") && strcasecmp(method, "POST"))
    {
        unimplemented(client);
        close(client);   /* do not leak the socket on the early exit */
        return;
    }

    /* a POST always needs a CGI (common gateway interface) program */
    if (strcasecmp(method, "POST") == 0)
        cgi = 1;

    /* skip the separator, then copy the URL token */
    i = 0;
    while (ISspace(buf[j]) && (j < numchars))
        j++;
    while (!ISspace(buf[j]) && (i < sizeof(url) - 1) && (j < numchars))
    {
        url[i] = buf[j];
        i++; j++;
    }
    url[i] = '\0';
    printf("url is %s \n",url);  /* e.g. /index.html or /index.html?id=100 */

    /* for GET, split "path?query" in place: url keeps the path part and
     * query_string points just past the '?' */
    if (strcasecmp(method, "GET") == 0)
    {
        query_string = url;
        while ((*query_string != '?') && (*query_string != '\0'))
            query_string++;
        if (*query_string == '?')
        {
            cgi = 1;
            *query_string = '\0';
            query_string++;
        }
    }

    /* map the URL onto the document root */
    snprintf(path, sizeof(path), "htdocs%s", url);
    printf("path is :%s \n",path);
    /* a URL ending in '/' names a directory: serve its index.html */
    if (path[strlen(path) - 1] == '/')
        strcat(path, "index.html");

    found = (stat(path, &st) == 0);
    if (found && ((st.st_mode & S_IFMT) == S_IFDIR))
    {
        /* directory requested without a trailing '/'. Re-stat the index
         * file: the directory's own mode bits (usually executable) must not
         * decide whether index.html is treated as a CGI program. */
        strcat(path, "/index.html");
        found = (stat(path, &st) == 0);
    }

    if (!found) {
        while ((numchars > 0) && strcmp("\n", buf))  /* read & discard headers */
            numchars = get_line(client, buf, sizeof(buf));
        not_found(client);
    }
    else
    {
        /* any execute permission bit marks the file as a CGI program */
        if ((st.st_mode & S_IXUSR) ||
                (st.st_mode & S_IXGRP) ||
                (st.st_mode & S_IXOTH)    )
            cgi = 1;
        if (!cgi){
            /* plain GET without parameters on a non-executable file */
            printf("\n to execute server_file \n");
            serve_file(client, path);
        }
        else{
            /* POST, GET with parameters, or an executable file */
            printf("\n to execute execute_cgi \n");
            execute_cgi(client, path, method, query_string);
        }
    }
    close(client);
}

execute_cgi 函数

/*
 * Run the CGI (common gateway interface) program at `path` and relay its
 * output to the client.
 *
 * Used for requests that cannot be answered with a static file: POST
 * requests and GET requests carrying a query string (for example
 * /index?uid=100, where the page has to be generated for that user).
 *
 * client        connected socket; the request line has already been read by
 *               the caller, the remaining headers are read here
 * path          program to execute
 * method        "GET" or "POST" (compared case-insensitively)
 * query_string  text after '?' in the URL; only used for GET
 *
 * For POST, the Content-Length header gives the body size; the body is
 * forwarded to the child's stdin. The request parameters reach the child as
 * the REQUEST_METHOD and QUERY_STRING / CONTENT_LENGTH environment
 * variables. The client socket itself is left open for the caller to close.
 */
void execute_cgi(int client, const char *path,const char *method, const char *query_string)
{
    printf ("\n in function execute cgi ! \n");
    char buf[1024];
    int cgi_output[2];
    int cgi_input[2];
    pid_t pid;
    int status;
    int i;
    char c;
    int numchars = 1;
    int content_length = -1;

    buf[0] = 'A'; buf[1] = '\0';
    if (strcasecmp(method, "GET") == 0)
    {
        while ((numchars > 0) && strcmp("\n", buf))  /* read & discard headers */
            numchars = get_line(client, buf, sizeof(buf));
    }
    else if (strcasecmp(method, "POST") == 0) /*POST*/
    {
        numchars = get_line(client, buf, sizeof(buf));
        while ((numchars > 0) && strcmp("\n", buf))
        {
            /* compare the header name only, without modifying the line, so
             * "Content-Length:42" (no space) keeps its first digit; atoi()
             * skips the optional leading whitespace itself */
            if (strncasecmp(buf, "Content-Length:", 15) == 0)
                content_length = atoi(&(buf[15]));
            numchars = get_line(client, buf, sizeof(buf));
            printf("buf : %s",buf);
        }
        /* missing or negative length: the body cannot be read reliably */
        if (content_length < 0) {
            bad_request(client);
            return;
        }
    }

    if (pipe(cgi_output) < 0) {
        cannot_execute(client);
        return;
    }
    if (pipe(cgi_input) < 0) {
        /* release the first pipe before bailing out */
        close(cgi_output[0]);
        close(cgi_output[1]);
        cannot_execute(client);
        return;
    }

    if ( (pid = fork()) < 0 ) {
        close(cgi_output[0]);
        close(cgi_output[1]);
        close(cgi_input[0]);
        close(cgi_input[1]);
        cannot_execute(client);
        return;
    }

    if (pid == 0)  /* child: CGI script */
    {
        char meth_env[255];
        /* url holds at most 254 bytes, plus the "QUERY_STRING=" prefix */
        char query_env[512];
        char length_env[255];

        /* child's stdout feeds the parent, child's stdin is fed by it */
        dup2(cgi_output[1], STDOUT);
        dup2(cgi_input[0], STDIN);
        close(cgi_output[0]);
        close(cgi_input[1]);
        snprintf(meth_env, sizeof(meth_env), "REQUEST_METHOD=%s", method);
        putenv(meth_env);
        if (strcasecmp(method, "GET") == 0) {
            snprintf(query_env, sizeof(query_env), "QUERY_STRING=%s",
                     query_string ? query_string : "");
            /* stdout is the response pipe now, so diagnostics go to stderr */
            fprintf(stderr, "qery_env : %s  ",query_env);
            putenv(query_env);
        }
        else {   /* POST */
            snprintf(length_env, sizeof(length_env), "CONTENT_LENGTH=%d", content_length);
            putenv(length_env);
        }
        fprintf(stderr, "\npath :  %s",path);
        /* run the external script; only returns on failure */
        execl(path,path, NULL);
        /* _exit: report the failure and avoid flushing stdio buffers
         * inherited from the parent into the response pipe */
        _exit(127);
    } else {    /* parent */
        static const char status_line[] = "HTTP/1.0 200 OK\r\n";

        close(cgi_output[1]);
        close(cgi_input[0]);

        /* sent by the parent only, so the status line goes out exactly once */
        send(client, status_line, sizeof(status_line) - 1, 0);

        /* The headers have been consumed above, so what follows on the
         * socket is the POST body; pass it byte by byte to the child. */
        if (strcasecmp(method, "POST") == 0)
            for (i = 0; i < content_length; i++) {
                /* stop if the client went away instead of forwarding junk */
                if (recv(client, &c, 1, 0) <= 0)
                    break;
                printf("c: %c \n",c);
                if (write(cgi_input[1], &c, 1) != 1)
                    break;
            }
        /* signal end-of-input so a script reading until EOF can finish */
        close(cgi_input[1]);

        /* relay everything the script prints back to the client */
        while (read(cgi_output[0], &c, 1) > 0)
            send(client, &c, 1, 0);

        close(cgi_output[0]);
        waitpid(pid, &status, 0);
    }
}

others

假如不借鉴源码的思路，自己去写一个http server，那么下面的几个问题，可能需要考虑如何去解决。

问题1：如何去处理并发的http请求？
源码给出的思路是，对每一个来的请求，创建一个线程去处理并发的请求。如果换成你，你会怎么做？你的解决方案可以支持高并发吗？
问题2：对于一个http请求，如何从请求里面解析出关键的字段信息，比如
http method，是get，post，put，delete，还是head?
url是什么？
如果是post类型的请求，post的参数是在http请求的正文里面的，那么怎么读取出来他们？他们的长度是如何确定的？
问题3：对于带有参数的get方法，和post方法，你的服务器如何去处理？

对于问题2，
这个需要了解http的格式，http请求的格式，如下图：

image.png

判断http 请求header的每一行的标志是：\r\n
区分http请求header和请求正文的标志是：连续两个\r\n，即header之后的一个空行 （如上图）
对于，一个post请求，请求的正文里面是post的请求数据，header里面的content-length指明了post请求的数据的长度。

可以参考：http://www.jianshu.com/p/f5a5db039737

对于问题3：
带有参数的get请求，和post请求，服务器没有办法简单的返回一个静态的文件，服务器需要在服务器端将相应的页面“计算”出来，然后返回给浏览器。

假如，浏览器发送了一个请求 /index?id=100，请求id=100的人的主页
那么，服务器需要“计算”出来id=100的这个人的主页的页面。这个就需要cgi来帮忙了，cgi可以理解为在服务器端可以执行的小脚本。服务器收到这个请求之后，执行index.cgi （这个文件是提前写好的，专门用来处理这样的请求) ，执行时的参数是id=100，然后“计算”出网页的数据，返回给浏览器。

举个例子，cgi的小脚本长什么样子：

image.png

可以参考：http://www.runoob.com/python/python-cgi.html

这是我在看源码的时候，给自己提的几个问题。看懂tinyhttp应该是理解http server的第一步吧，nginx正在前面等你，一起前进吧，少年~

tinyhttp源码分析