七 PROXY LAB

这个LAB 是上完CMU CSAPP的21-25 LECTURE之后，就可以做了。
csapp 课程观看地址：https://search.bilibili.com/all?keyword=csapp&from_source=banner_search
lab 6 下载地址: http://csapp.cs.cmu.edu/3e/labs.html
选择PROXY LAB，点击SELF-STUDY HANDOUT

恭喜你，已经来到了最后一个LAB。我的系列也已经到了尾声。纪念这一个月来的努力。把自己所有的CODE，放到了GITHUB。
https://github.com/yixuaz/csapp-labs

这次的作业主要分三个部分(详情参见WRITE-UP http://csapp.cs.cmu.edu/3e/proxylab.pdf ）：

Sequential Proxy: 接收客户端发送的 HTTP 请求，解析之后向目标服务器转发，获得响应之后再转发回客户端
Concurrent Proxy: 在第一步的基础上，支持多线程
Cache Web Objects: 使用 LRU 缓存单独的对象，而不是整个页面

PART 1

第一部分，我的思考笔记如下。
第一步，看懂TINY SERVER（HANDOUT里赠送）的代码。就大概知道如何写一个SERVER。

第二步，根据WRITE-UP 4 Part I: Implementing a sequential web proxy
大概需要做如下编程工作。服务器端接受请求，解析GET http://www.cmu.edu/hub/index.html HTTP/1.1 转换为 GET /hub/index.html HTTP/1.0, 同时拿到HOST 和 PORT，代理服务器自己作为CLIENT向目标发送HTTP 1.0请求.

header 部分，先全部保持不变，随后改4个值，
分别为
Host: www.cmu.edu
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:10.0.3) Gecko/20120305 Firefox/10.0.3
Connection: close
Proxy-Connection: close

转发之后，把从目标服务器接收到的响应作为代理服务器的输出，再转发给原客户端。
第一部分就大功告成。

第三步代码实现

3.1 抄TINY SERVER的框架，把一些常量定义掉

#include <stdio.h>
#include "csapp.h"
/* Recommended max cache and object sizes */
/* Both limits are byte counts: MAX_CACHE_SIZE bounds the whole cache,
 * MAX_OBJECT_SIZE bounds a single cached object. */
#define MAX_CACHE_SIZE 1049000
#define MAX_OBJECT_SIZE 102400

/* You won't lose style points for including this long line in your code */
/* Fixed header lines that the proxy substitutes for whatever the client sent;
 * each one already ends in CRLF so it can be appended verbatim. */
static const char *user_agent_hdr = "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:10.0.3) Gecko/20120305 Firefox/10.0.3\r\n";
static const char *conn_hdr = "Connection: close\r\n";
static const char *prox_hdr = "Proxy-Connection: close\r\n";


/* Handle one client request on an already-accepted descriptor. */
void doit(int fd);
/* Send an HTML error page (status errnum/shortmsg) to the client on fd. */
void clienterror(int fd, char *cause, char *errnum, 
         char *shortmsg, char *longmsg);
/* Split an absolute URI into host name, path and port number. */
void parse_uri(char *uri,char *hostname,char *path,int *port);
/* Append the forwarded and rewritten request headers to newreq.
 * NOTE(review): the definition shown later in this write-up takes a fourth
 * `char *port` parameter; this prototype and the call in doit() use three.
 * The two must be reconciled before the code will compile. */
void build_requesthdrs(rio_t *rp, char *newreq, char *hostname);
/* Thread entry point: serves the connection whose descriptor vargp points to. */
void *thread(void *vargp);


/*
 * main - listen on the port named by argv[1] and start one thread per
 *        accepted connection.
 *
 * Each connected descriptor is stored in its own heap slot so that the next
 * Accept cannot overwrite it before the new thread has read it; the thread
 * routine releases the slot.
 */
int main(int argc, char **argv)
{
    int listen_sock;
    int *conn_slot;
    pthread_t worker;
    socklen_t addr_len;
    struct sockaddr_storage peer_addr;
    char peer_host[MAXLINE], peer_port[MAXLINE];

    /* Exactly one command line argument (the port) is expected */
    if (argc != 2) {
        fprintf(stderr, "usage: %s <port>\n", argv[0]);
        exit(1);
    }

    /* Ignore SIGPIPE instead of taking its default action */
    signal(SIGPIPE, SIG_IGN);

    listen_sock = Open_listenfd(argv[1]);
    for (;;) {
        printf("listening..\n");

        addr_len = sizeof(peer_addr);
        conn_slot = Malloc(sizeof(int));
        *conn_slot = Accept(listen_sock, (SA *)&peer_addr, &addr_len);

        /* Log who connected before handing the descriptor off */
        Getnameinfo((SA *)&peer_addr, addr_len, peer_host, MAXLINE,
                    peer_port, MAXLINE, 0);
        printf("Accepted connection from (%s, %s)\n", peer_host, peer_port);

        Pthread_create(&worker, NULL, thread, conn_slot);
    }
}
/*
 * thread - per-connection thread routine.
 *
 * vargp points to a heap-allocated int holding the connected descriptor.
 * The value is copied out, the thread detaches itself, the heap slot is
 * freed, and the connection is served and then closed.  Always returns NULL.
 */
void *thread(void *vargp)
{
    int *fd_slot = vargp;
    int client_sock = *fd_slot;

    Pthread_detach(pthread_self());
    Free(fd_slot);

    doit(client_sock);
    Close(client_sock);
    return NULL;
}

/*
 * doit - handle one HTTP request/response transaction
 *
 * Reads the request line from client_fd, accepts only GET, parses the
 * absolute URI into host/path/port, connects to that server, sends an
 * HTTP/1.0 request built from the client's headers, and relays the
 * server's response back to the client.
 *
 * The caller keeps ownership of client_fd; the connection to the end
 * server is opened and closed here on every path.
 */
/* $begin doit */
void doit(int client_fd)
{
    int endserver_fd;
    char buf[MAXLINE], method[MAXLINE], uri[MAXLINE], version[MAXLINE];
    rio_t from_client, to_endserver;
    /* store the request line arguments */
    char hostname[MAXLINE], path[MAXLINE]; /* path eg /hub/index.html */
    int port;
    char port_str[16];
    char newreq[MAXLINE]; /* request line + headers for the end server */
    int reqline_len;
    ssize_t n;

    /* Read request line and headers */
    Rio_readinitb(&from_client, client_fd);
    if (Rio_readlineb(&from_client, buf, MAXLINE) <= 0)
        return;

    /* Each token is shorter than buf, so it fits its MAXLINE destination.
     * A line with fewer than three tokens would leave some of the buffers
     * uninitialised, so reject it instead of inspecting them. */
    if (sscanf(buf, "%s %s %s", method, uri, version) != 3) {
        clienterror(client_fd, buf, "400", "Bad Request",
                    "Proxy Server could not parse the request line");
        return;
    }
    if (strcasecmp(method, "GET")) {
        clienterror(client_fd, method, "501", "Not Implemented",
                    "Proxy Server does not implement this method");
        return;
    }

    /* parse uri, then build the request line, e.g. GET /hub/index.html HTTP/1.0 */
    parse_uri(uri, hostname, path, &port);
    snprintf(port_str, sizeof port_str, "%d", port);

    /* An empty path would produce the malformed line "GET  HTTP/1.0" */
    reqline_len = snprintf(newreq, sizeof newreq, "GET %s HTTP/1.0\r\n",
                           path[0] != '\0' ? path : "/");
    if (reqline_len < 0 || (size_t)reqline_len >= sizeof newreq) {
        clienterror(client_fd, uri, "414", "URI Too Long",
                    "Proxy Server cannot forward a request line this long");
        return;
    }

    endserver_fd = Open_clientfd(hostname, port_str);
    if (endserver_fd < 0) {
        printf("connection failed\n");
        return;
    }
    Rio_readinitb(&to_endserver, endserver_fd);

    /* NOTE(review): this call matches the three-parameter forward
     * declaration; the build_requesthdrs definition shown later in the
     * write-up also takes a port string - reconcile the two. */
    build_requesthdrs(&from_client, newreq, hostname);

    /* send the rewritten request to the real server */
    Rio_writen(endserver_fd, newreq, strlen(newreq));

    /* Relay the response.  The byte count returned by the read is what gets
     * forwarded, so the payload is not treated as a C string. */
    while ((n = Rio_readlineb(&to_endserver, buf, MAXLINE)) > 0) {
        Rio_writen(client_fd, buf, n);
    }

    /* Without this every request leaked one descriptor */
    Close(endserver_fd);
}
/* $end doit */



/*
 * clienterror - returns an error message to the client
 *
 * fd        descriptor the response is written to
 * cause     text shown after the long message (e.g. the offending method)
 * errnum    status code as a string, e.g. "501"
 * shortmsg  reason phrase for the status line
 * longmsg   human-readable explanation placed in the HTML body
 *
 * The body and the header block are each formatted with a single bounded
 * snprintf.  The original appended with sprintf(body, "%s...", body), which
 * passes overlapping source and destination (undefined behaviour) and had
 * no size limit.  Output that does not fit is truncated, and the advertised
 * Content-length always equals the number of body bytes actually sent.
 */
/* $begin clienterror */
void clienterror(int fd, char *cause, char *errnum, 
         char *shortmsg, char *longmsg) 
{
    char buf[MAXLINE], body[MAXBUF];
    int body_len, hdr_len;

    /* Build the HTTP response body.  Adjacent string literals concatenate,
     * so the bgcolor attribute is emitted unquoted, exactly as before. */
    body_len = snprintf(body, sizeof body,
                        "<html><title>Proxy Error</title>"
                        "<body bgcolor=""ffffff"">\r\n"
                        "%s: %s\r\n"
                        "<p>%s: %s\r\n"
                        "<hr><em>The Proxy Web server</em>\r\n",
                        errnum, shortmsg, longmsg, cause);
    if (body_len < 0) {
        body[0] = '\0';
        body_len = 0;
    } else if ((size_t)body_len >= sizeof body) {
        body_len = (int)sizeof body - 1;    /* snprintf truncated the text */
    }

    /* Build the status line and headers */
    hdr_len = snprintf(buf, sizeof buf,
                       "HTTP/1.0 %s %s\r\n"
                       "Content-type: text/html\r\n"
                       "Content-length: %d\r\n\r\n",
                       errnum, shortmsg, body_len);
    if (hdr_len < 0) {
        return;                             /* nothing sensible to send */
    }
    if ((size_t)hdr_len >= sizeof buf) {
        hdr_len = (int)sizeof buf - 1;
    }

    /* Print the HTTP response */
    Rio_writen(fd, buf, (size_t)hdr_len);
    Rio_writen(fd, body, (size_t)body_len);
}
/* $end clienterror */

3.2 实现2个辅助函数

在写PARSE URI方法前，我们得回顾下C 的STR的用法
https://www.cs.cmu.edu/~213/activities/cbootcamp/cbootcamp_s19.pdf

/* csapp.h supplies MAXLINE in the proxy build; this fallback only keeps the
 * helper self-contained (value assumed to match csapp.h - confirm). */
#ifndef MAXLINE
#define MAXLINE 8192
#endif

/*
 * parse_uri - split a request URI into host name, path and port.
 *
 * uri       NUL-terminated URI, e.g. http://www.cmu.edu:8080/hub/index.html;
 *           a missing "scheme://" prefix is tolerated.  It is not modified.
 * hostname  out: host part; the buffer must hold at least MAXLINE bytes
 * path      out: path starting at the first '/', or "/" when the URI has
 *           none; the buffer must hold at least MAXLINE bytes
 * port      out: port from the URI, or 80 when absent or not a valid
 *           number in 1..65535
 *
 * A ':' is treated as a port separator only inside the host part, so a
 * colon further along in the path no longer corrupts the result.  Both
 * output strings are always NUL-terminated.
 */
void parse_uri(char *uri,char *hostname,char *path,int *port) {
    const char *scheme_sep;
    const char *host_begin;
    const char *host_end;
    const char *path_begin;
    const char *colon;
    size_t host_len;

    *port = 80;

    /* skip "scheme://" when present: http://www.cmu.edu/... -> www.cmu.edu/... */
    scheme_sep = strstr(uri, "//");
    host_begin = (scheme_sep != NULL) ? scheme_sep + 2 : uri;

    /* the host[:port] part ends at the first '/' or at the end of the string */
    path_begin = strchr(host_begin, '/');
    if (path_begin == NULL) {
        path_begin = host_begin + strlen(host_begin);
    }

    host_end = path_begin;
    colon = memchr(host_begin, ':', (size_t)(path_begin - host_begin));
    if (colon != NULL) {
        char *num_end;
        long value = strtol(colon + 1, &num_end, 10);

        host_end = colon;
        /* accept the port only if digits fill everything up to the path */
        if (num_end != colon + 1 && num_end == path_begin &&
            value > 0 && value <= 65535) {
            *port = (int)value;
        }
    }

    host_len = (size_t)(host_end - host_begin);
    if (host_len >= MAXLINE) {
        host_len = MAXLINE - 1;
    }
    memcpy(hostname, host_begin, host_len);
    hostname[host_len] = '\0';

    if (*path_begin == '\0') {
        strcpy(path, "/");      /* no path in the URI: request the root */
    } else {
        snprintf(path, MAXLINE, "%s", path_begin);
    }
}

/* Return non-zero when `line` is one of the headers the proxy always
 * rewrites itself.  Field names are compared case-insensitively and only
 * at the start of the line. */
static int is_proxy_owned_header(const char *line)
{
    return strncasecmp(line, "Host:", 5) == 0 ||
           strncasecmp(line, "User-Agent:", 11) == 0 ||
           strncasecmp(line, "Connection:", 11) == 0 ||
           strncasecmp(line, "Proxy-Connection:", 17) == 0;
}

/*
 * build_requesthdrs - finish the request that will be sent to the end server.
 *
 * rp        buffered reader positioned just after the client's request line
 * newreq    in/out: already holds the request line ("GET <path> HTTP/1.0\r\n");
 *           the headers are appended to it.  The buffer is assumed to hold
 *           MAXLINE bytes, which is how the caller in this file declares it.
 * hostname  host used for the rebuilt Host header
 * port      port (as text) used for the rebuilt Host header
 *
 * Client headers are copied through until the blank line, except Host,
 * User-Agent, Connection and Proxy-Connection, which are replaced by the
 * proxy's own values.  The result ends with an empty line.
 *
 * Every append is bounded.  The original used sprintf(newreq, "%s...", newreq),
 * which passes overlapping source and destination (undefined behaviour) and
 * could run past the end of newreq.  Space for the proxy's own headers is
 * reserved first; a client header that no longer fits is dropped.
 */
void build_requesthdrs(rio_t *rp, char *newreq, char *hostname, char* port) {
    //already have sprintf(newreq, "GET %s HTTP/1.0\r\n", path);
    char buf[MAXLINE];
    char trailer[MAXLINE];
    size_t used = strlen(newreq);
    size_t trailer_len;
    int n;

    /* Format the mandatory tail up front so its size can be reserved */
    n = snprintf(trailer, sizeof trailer, "Host: %s:%s\r\n%s%s%s\r\n",
                 hostname, port, user_agent_hdr, conn_hdr, prox_hdr);
    if (n < 0) {
        trailer[0] = '\0';
        trailer_len = 0;
    } else if ((size_t)n >= sizeof trailer) {
        trailer_len = sizeof trailer - 1;       /* truncated by snprintf */
    } else {
        trailer_len = (size_t)n;
    }

    while (Rio_readlineb(rp, buf, MAXLINE) > 0) {
        size_t line_len;

        if (!strcmp(buf, "\r\n")) break;        /* end of client headers */
        if (is_proxy_owned_header(buf)) continue;

        line_len = strlen(buf);
        /* keep reading to the blank line even when a header is dropped */
        if (used + line_len + trailer_len >= MAXLINE) continue;

        memcpy(newreq + used, buf, line_len);
        used += line_len;
    }

    /* Clamp in case the request line alone left too little room */
    if (used + trailer_len >= MAXLINE) {
        trailer_len = (used < MAXLINE) ? (size_t)MAXLINE - 1 - used : 0;
    }
    memcpy(newreq + used, trailer, trailer_len);
    newreq[used + trailer_len] = '\0';
}

3.3 测试

image.png

第一部分40分拿齐了。

image.png

PART 2

首先阅读ECHO MULTI THREAD的代码
http://www.cs.cmu.edu/afs/cs/academic/class/15213-f18/www/lectures/23-concprog.pdf

随后就根据PPT里的思路用多线程的方式实现。

image.png

进行测试

image.png

依然PART 2

本来想写PART 3了，但是突然发现WRITE-UP里有2个HINT，我之前都没有注意到，也没有用上。于是打算先试试自己的PROXY的健壮性，结果发现用浏览器测试时，连百度都上不去。

image.png

随后根据这篇博客，和一版新的HINT 对我的代码进行优化
https://www.keblog.me/2014/12/writing-proxy-lab-csapp/

image.png

依然PART 2.1 修改CSAPP.C做错误保护

这里一律注释掉

image.png

如果有错，一律return 0

image.png

设置方法

image.png

开PROXY SERVER前

image.png

开之后

image.png

依然PART 2.2 测试有没有File Descriptor泄漏

下面红框的，不应该存在，看来我有FD没有做释放。

image.png

在DOIT里面补上这个

image.png

PART 3

要实现CACHE的方法，
决定使用数组的方法，为了不浪费空间，决定采用分级数组的思想。（和MALLOC LAB很像）
因为最大缓存对象是100KB，一共有1M的缓存空间。
我可以用5个100KB （500 KB）
25 KB 可以用12个。(300 KB）
随后10KB 可以用10个。（100KB）
还有5KB的用20个，（100 KB）
1 KB 用 20个（20 KB）
100B的用40个（4KB）

第一步定义数据结构

//cache.h
/* Interface of the proxy's object cache: fixed-size blocks grouped into
 * TYPES size classes, each block guarded by its own reader/writer lock. */
#ifndef PROXY_CACHE_H
#define PROXY_CACHE_H   /* guard: a second inclusion would redefine the types */

#include "csapp.h"
#include <stdint.h>     /* int64_t */
#include <sys/time.h>

/* Number of size classes; also the length of the two tables below */
#define TYPES 6
/* Per size class: capacity in bytes of one block's data buffer */
extern const int cache_block_size[];
/* Per size class: number of blocks */
extern const int cache_cnt[];

/* One cached object */
typedef struct cache_block{
    char* url;                 /* key: URL of the cached object */
    char* data;                /* object bytes */
    int datasize;              /* number of valid bytes in data */
    int64_t time;              /* last-use timestamp in ms; 0 marks an unused block */
    pthread_rwlock_t rwlock;   /* guards the fields of this block */
} cache_block;

/* One size class: an array of `size` blocks */
typedef struct cache_type{
    cache_block *cacheobjs;  
    int size;
} cache_type;


/* NOTE(review): this defines an object in a header.  Every translation unit
 * that includes the header gets its own tentative definition, which
 * toolchains defaulting to -fno-common reject at link time once more than
 * one .c file includes it.  The usual remedy is `extern` here plus a single
 * definition in cache.c; left unchanged because that needs a paired edit. */
cache_type caches[TYPES];

//initialize cache with malloc
void init_cache();
//if miss cache return 0, hit cache write content to fd
int read_cache(char* url, int fd);
//save value to cache
void write_cache(char* url, char* data, int len);
//free cache
void free_cache();

#endif /* PROXY_CACHE_H */

第二步实现方法

这里我们用了读者-写者模式，并且根据提示，不用严格地按照LRU。这是什么意思呢？其实就是暗示我们：在读的时候需要去更新时间戳，如果有别的线程也在更新同一个CACHE BLOCK，那么就以那个线程的更新为准，TRY LOCK失败了不必强求。

image.png

//cache.c
#include "cache.h"

/* Size classes, smallest first.  cache_block_size[i] is the data capacity in
 * bytes of each block in class i and cache_cnt[i] is how many such blocks
 * exist.  The products sum to 1,048,560 bytes of object storage. */
const int cache_block_size[] = {102, 1024, 5120 ,10240,25600, 102400};
const int cache_cnt[] = {40,20,20,10,12,5};
/* Wall-clock time in milliseconds; defined at the end of this file. */
int64_t currentTimeMillis();

void init_cache()
{
    int i = 0;
    for (; i < TYPES; i++) {
        caches[i].size = cache_cnt[i];
        caches[i].cacheobjs 
              = (cache_block *)malloc(cache_cnt[i] * sizeof(cache_block));
        cache_block *j = caches[i].cacheobjs;
        int k;
        for (k = 0; k < cache_cnt[i]; j++, k++) {
            j->time = 0;
            j->datasize = 0;
            j->url = malloc(sizeof(char) * MAXLINE);
            strcpy(j->url,"");
            j->data = malloc(sizeof(char) * cache_block_size[i]);
            memset(j->data,0,cache_block_size[i]);
            pthread_rwlock_init(&j->rwlock,NULL);
        }
    }
}

/*
 * free_cache - release everything init_cache set up: each block's URL and
 * data buffers and its rwlock, then the block array of each size class.
 */
void free_cache() {
    int cls;
    int idx;

    for (cls = 0; cls < TYPES; cls++) {
        cache_block *blocks = caches[cls].cacheobjs;

        for (idx = 0; idx < cache_cnt[cls]; idx++) {
            free(blocks[idx].url);
            free(blocks[idx].data);
            pthread_rwlock_destroy(&blocks[idx].rwlock);
        }
        free(blocks);
    }
}

/*
 * read_cache - look url up in the cache and, on a hit, send the object to fd.
 *
 * url  key to look for (compared with strcmp)
 * fd   descriptor the cached bytes are written to on a hit
 *
 * Returns 1 on a hit (the data has been written to fd) and 0 on a miss.
 *
 * Each block is examined under its read lock, and on a match the data is
 * sent while that same read lock is still held.  The original scanned
 * without any lock and released the lock between validating the URL and
 * sending the data, so a concurrent write_cache could replace the block in
 * between and the wrong object would be served.
 *
 * The last-use timestamp is refreshed on a best-effort basis afterwards:
 * if the write lock cannot be taken immediately the refresh is skipped,
 * which only makes the eviction order approximate.
 */
int read_cache(char *url,int fd){
    int tar;
    int i;

    printf("read cache %s \n", url);
    for (tar = 0; tar < TYPES; tar++) {
        cache_block *blocks = caches[tar].cacheobjs;

        for (i = 0; i < caches[tar].size; i++) {
            cache_block *p = &blocks[i];

            pthread_rwlock_rdlock(&p->rwlock);
            if (p->time == 0 || strcmp(url, p->url) != 0) {
                pthread_rwlock_unlock(&p->rwlock);
                continue;
            }

            /* hit: send while readers are still excluding any writer */
            Rio_writen(fd, p->data, p->datasize);
            pthread_rwlock_unlock(&p->rwlock);

            /* refresh the timestamp only if the block still holds this url */
            if (!pthread_rwlock_trywrlock(&p->rwlock)) {
                if (strcmp(url, p->url) == 0) {
                    p->time = currentTimeMillis();
                }
                pthread_rwlock_unlock(&p->rwlock);
            }
            printf("read cache successful\n");
            return 1;
        }
    }

    printf("read cache fail\n");
    return 0;
}

/*
 * write_cache - store an object in the smallest size class that can hold it.
 *
 * url   key for the object; at most MAXLINE-1 characters are kept, because
 *       init_cache allocates MAXLINE bytes for each block's URL
 * data  object bytes
 * len   number of bytes in data
 *
 * An unused block (timestamp 0) of the chosen class is preferred; otherwise
 * the block with the oldest timestamp is overwritten.  Objects that are
 * larger than the largest class, or have a negative length, are ignored.
 *
 * Fixes relative to the original:
 *  - an object bigger than every class indexed caches[TYPES], out of bounds;
 *  - the URL was copied with memcpy(..., MAXLINE), reading past the end of
 *    shorter source strings;
 *  - the victim pointer could stay one past the end of the array when no
 *    timestamp was <= the current time; it now always starts on a real block;
 *  - timestamps are sampled under the block's read lock.
 * Victim selection and the write are still separate steps, so two
 * concurrent writers may pick the same block; the later one wins.
 */
void write_cache(char *url, char *data, int len){
    int tar = 0;
    int i;
    cache_block *blocks;
    cache_block *victim;
    int64_t oldest = 0;

    if (len < 0) {
        return;
    }
    for (; tar < TYPES && len > cache_block_size[tar]; tar++) ;
    if (tar == TYPES) {
        return;     /* larger than the biggest block: not cacheable */
    }
    printf("write cache %s %d\n", url, tar);

    /* find an empty block, or failing that the least recently used one */
    blocks = caches[tar].cacheobjs;
    victim = &blocks[0];
    for (i = 0; i < caches[tar].size; i++) {
        int64_t stamp;

        pthread_rwlock_rdlock(&blocks[i].rwlock);
        stamp = blocks[i].time;
        pthread_rwlock_unlock(&blocks[i].rwlock);

        if (stamp == 0) {           /* first unused block wins outright */
            victim = &blocks[i];
            break;
        }
        if (i == 0 || stamp <= oldest) {
            oldest = stamp;
            victim = &blocks[i];
        }
    }

    pthread_rwlock_wrlock(&victim->rwlock);
    victim->time = currentTimeMillis();
    victim->datasize = len;
    snprintf(victim->url, MAXLINE, "%s", url);  /* bounded, always terminated */
    memcpy(victim->data, data, (size_t)len);
    pthread_rwlock_unlock(&victim->rwlock);
    printf("write Cache\n");
}

/*
 * currentTimeMillis - time reported by gettimeofday, converted to whole
 * milliseconds (seconds * 1000 plus microseconds / 1000).
 */
int64_t currentTimeMillis() {
    struct timeval now;

    gettimeofday(&now, NULL);
    return (int64_t)(now.tv_sec) * 1000 + (int64_t)(now.tv_usec / 1000);
}

第三步整合进现有CODE

3.1 修改MAKE FILE

image.png

3.2 增加INIT CACHE

image.png

第四步测试

image.png

用浏览器测试前，需要BAN掉浏览器自带的CACHE。

image.png

这里我访问的是
http://home.baidu.com/home/index/contact_us

里面会加载很多资料，试了几次，基本都CACHE下来了。

image.png

测试是否内存泄漏

valgrind --leak-check=full --show-leak-kinds=all ./proxy 45161

只有一个我的代码无法控制的。

image.png

七 PROXY LAB

PART 1

3.1 抄TINY SERVER的框架，把一些常量定义掉

3.2 实现2个辅助函数

3.3 测试

PART 2

依然PART 2

依然PART 2.1 修改CSAPP.C做错误保护

依然PART 2.2 测试有没有File Descriptor泄漏

PART 3

第一步 定义数据结构

第二步 实现方法

第三步 整合进现有CODE

第四步 测试

第一步定义数据结构

第二步实现方法

第三步整合进现有CODE

第四步测试