
Intro to Computer Science

Tags (space separated): Udacity


[TOC]

Lesson 2 problem set

find last

# Define a procedure, find_last, that takes as input
# two strings, a search string and a target string,
# and returns the last position in the search string
# where the target string appears, or -1 if there
# are no occurrences.
#
# Example: find_last('aaaa', 'a') returns 3

# Make sure your procedure has a return statement.

# About find()'s optional start argument: s.find(t, -2) starts the search at the second-to-last character of s

def find_last(s, t):    # s means search, t means target
    last_pos = -1
    while True:
        pos = s.find(t, last_pos+1)  # the +1 here is essential -- it is the key part
        if pos == -1:
            return last_pos
        last_pos = pos
  


print find_last('aaaa', 'a')
#>>> 3

print find_last('aaaaa', 'aa')
#>>> 3

print find_last('aaaa', 'b')
#>>> -1

#print find_last("111111111", "1")
#>>> 8

#print find_last("222222222", "")
#>>> 9

#print find_last("", "3")
#>>> -1

#print find_last("", "")
#>>> 0
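
To illustrate the note above about find()'s optional start argument, here is a small example of my own (not from the course):

s = 'banana'
print s.find('a')        #>>> 1   first occurrence
print s.find('a', 2)     #>>> 3   search starts at index 2
print s.find('a', -2)    #>>> 5   a negative start counts from the end (second-to-last character)
print s.find('z')        #>>> -1  not found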

Lesson 2.5: how to solve problems

how old

# By Websten from forums
#
# Given your birthday and the current date, calculate your age in days. 
# Account for leap days. 
#
# Assume that the birthday and current date are correct dates (and no 
# time travel). 

# My version below has too many holes in it; it is wrong.
def daysBetweenDates(year1, month1, day1, year2, month2, day2):
    ##
    # Your code here.
    ##
    year = year2 - year1
    if month2 < month1:
        month = month2 + 12 - month1
        year -= 1
    else:
        month = month2 - month1
    if day2 < day1:
        day = day2+30 - day1
        month -= 1
    else:
        day = day2 - day1
    return year*365 + month*30 + day + 1

# Test routine

def test():
    test_cases = [((2012,1,1,2012,2,28), 58), 
                  ((2012,1,1,2012,3,1), 60),
                  ((2011,6,30,2012,6,30), 366),
                  ((2011,1,1,2012,8,8), 585 ),
                  ((1900,1,1,1999,12,31), 36523)]
    for (args, answer) in test_cases:
        result = daysBetweenDates(*args)
        if result != answer:
            print "Test with data:", args, "failed"
        else:
            print "Test case passed!"

test()

First, write a simple example, nextDay, assuming every month has only 30 days:

def nextDay(year, month, day):
    if day < 30:
        return year, month, day+1
    else:
        if month < 12:
            return year, month+1, 1
        else:
            return year+1, 1, 1
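
A minimal sketch of my own (not the course's official solution) showing how nextDay could be iterated to count the days between two dates, still under the simplified assumption that every month has 30 days:

def dateIsBefore(year1, month1, day1, year2, month2, day2):
    # True if the first date is strictly before the second
    if year1 < year2:
        return True
    if year1 == year2:
        if month1 < month2:
            return True
        if month1 == month2:
            return day1 < day2
    return False

def daysBetweenDates(year1, month1, day1, year2, month2, day2):
    # step forward one day at a time with nextDay, counting the steps
    days = 0
    while dateIsBefore(year1, month1, day1, year2, month2, day2):
        year1, month1, day1 = nextDay(year1, month1, day1)
        days = days + 1
    return days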

Joining two lists

My incorrect attempt:

def union(a, b):
    for e in b:
        if e in a:
            pass
        else:
            a + e
            
# To test, uncomment all lines 
# below except those beginning with >>>.

a = [1,2,3]
b = [2,4,6]
union(a,b)
print a 
#>>> [1,2,3,4,6]
#print b
#>>> [2,4,6]

The error says a list can only be concatenated with another list, so this + operation cannot be used here (and even a + [e] would not modify a in place).

The correct approach is to use append():

def union(a, b):
    for e in b:
        if e not in a: 
            a.append(e)
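
Running the earlier test against the corrected version now gives the expected result:

a = [1,2,3]
b = [2,4,6]
union(a,b)
print a
#>>> [1, 2, 3, 4, 6]
print b
#>>> [2, 4, 6]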

Lesson 3 problem set

Max Pages solution

This is the unbounded version of the crawler:

def crawl_web(seed, max_pages):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            union(tocrawl, get_all_links(get_page(page))) 
            crawled.append(page)
    return crawled

This is the code adjusted for max_pages:

def crawl_web(seed, max_pages):
    tocrawl = [seed]
    crawled = []  # len(crawled) is the number of pages crawled so far
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled and len(crawled) < max_pages:  # only keep crawling while the number of crawled pages is below max_pages
            union(tocrawl, get_all_links(get_page(page))) 
            crawled.append(page)
    return crawled

Max Depth Solution

Again, the unbounded version of the crawler:

def crawl_web(seed, max_depth):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            union(tocrawl, get_all_links(get_page(page))) 
            crawled.append(page)
    return crawled

And the version adjusted for max_depth:

def crawl_web(seed, max_depth):    
    tocrawl = [seed]
    crawled = []
    next_depth = []  # records the pages to crawl at the next depth
    depth = 0        # depth starts at 0
    while tocrawl and depth <= max_depth:   # stop when there is nothing left to crawl, or when depth exceeds max_depth
        page = tocrawl.pop()
        if page not in crawled:
            union(next_depth, get_all_links(get_page(page)))    # tocrawl is replaced by next_depth here: merge this page's links into the next depth
            crawled.append(page)
        if not tocrawl:  # key step: this only runs once the current depth is fully crawled, so we move on to the next depth
            tocrawl, next_depth = next_depth, []  # hand the next depth's pages to tocrawl and reset next_depth
            depth = depth + 1  # tocrawl now holds only next-depth pages, so increment depth
    return crawled

The version without comments:

def crawl_web(seed,max_depth):    
    tocrawl = [seed]
    crawled = []
    next_depth = []
    depth = 0
    while tocrawl and depth <= max_depth:
        page = tocrawl.pop()
        if page not in crawled:
            union(next_depth, get_all_links(get_page(page)))
            crawled.append(page)
        if not tocrawl:
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
    return crawled
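
To see the depth limit in action, here is a small self-contained check of my own; the link data and the get_page / get_all_links / union stand-ins below form a toy in-memory graph, not the course's real crawler helpers:

# hypothetical link structure: page -> list of pages it links to
toy_links = {
    'A': ['B', 'C'],
    'B': ['D'],
    'C': ['D'],
    'D': [],
}

def get_page(url):
    return url  # here a page's "content" is just its own name

def get_all_links(content):
    return toy_links.get(content, [])

def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)

print crawl_web('A', 0)
#>>> ['A']
print crawl_web('A', 1)
#>>> ['A', 'C', 'B']
print crawl_web('A', 2)
#>>> ['A', 'C', 'B', 'D']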

Problem set: sudoku

Problem description

# A valid sudoku square satisfies these
# two properties:

#   1. Each column of the square contains
#       each of the whole numbers from 1 to n exactly once.

#   2. Each row of the square contains each
#       of the whole numbers from 1 to n exactly once.

# You may assume that the input is square and contains at
# least one row and column.

My answer

def twice(p, m):    # check whether m appears in p more than once
    n = 0
    for i in p:
        if i == m:
            n = n + 1
    return n > 1


def check_sudoku(sudoku):
    for i in sudoku:    # check that the grid is square (n*n)
        if len(i) != len(sudoku):
            return False
    for n in sudoku:    # check that every entry is a whole number from 1 to n
        for m in n:     # (letters and fractions are not allowed)
            if m not in range(1, len(sudoku) + 1):
                return False
    for n in sudoku:    # check rows and columns for repeated digits
        for m in n:     # check whether the row contains m more than once
            if twice(n, m):
                return False
            column = n.index(m)   # get the column index of m
            # the next few lines collect the column containing m into m_column
            m_column = []
            i = 0
            while i < len(sudoku):
                m_column.append(sudoku[i][column])
                i = i + 1
            # check whether that column contains m more than once
            if twice(m_column, m):
                return False
    return True

The official answer.
The idea is not to read the value of each cell out of the array; instead, take each digit from 1 to n and go looking for it directly in every row and column.

def check_sudoku(p):
    n = len(p) # Extract size of grid
    digit = 1 # start with 1 and compare it against every entry in turn
    while digit <= n: # Go through each digit
        i = 0
        while i < n: # Go through each row and column        
            row_count = 0
            col_count = 0
            j = 0
            while j < n: # for each entry in ith row/column
                if p[i][j] == digit: # check row count
                    row_count = row_count + 1
                if p[j][i] == digit: # check column count
                    col_count = col_count + 1
                j = j + 1
            if row_count != 1 or col_count != 1:
                return False
            i = i + 1 # next row/column
        digit = digit + 1 # next digit
    return True # Nothing was wrong!
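
A couple of quick checks of my own (not the course's official test cases):

good = [[1,2,3],
        [2,3,1],
        [3,1,2]]
bad  = [[1,2,3],
        [2,3,1],
        [3,1,1]]   # the last row contains 1 twice
print check_sudoku(good)
#>>> True
print check_sudoku(bad)
#>>> False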

Symmetric Square

The zip() function can turn a matrix into its transpose:

q = [[0, 1, 2], 
     [-1, 0, 3], 
     [-2, -3, 0]]

print zip(*q)

[(0, -1, -2), 
 (1, 0, -3), 
 (2, 3, 0)]

mutated = [list(i) for i in(zip(*q))]
print mutated

[[0, -1, -2], 
 [1, 0, -3], 
 [2, 3, 0]]

list() converts each tuple produced by zip back into a list, which is why the ( ) rows become [ ] rows.
The full function:

def symmetric(q):
    original = q
    mutated = [list(i) for i in(zip(*q))]
    if original==mutated:
        return True
    return False
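
Quick checks of my own:

print symmetric([[1, 2, 3],
                 [2, 3, 1],
                 [3, 1, 2]])
#>>> True
print symmetric(q)   # the q above equals minus its transpose, so it is not symmetric
#>>> False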

Lesson 4 Responding to Queries

Add to Index

The requirements (also written up in my notes):

# Define a procedure, add_to_index,
# that takes 3 inputs:

# - an index: [[<keyword>,[<url>,...]],...]
# - a keyword: String
# - a url: String

# If the keyword is already
# in the index, add the url
# to the list of urls associated
# with that keyword.

# If the keyword is not in the index,
# add an entry to the index: [keyword,[url]]

index = []

def add_to_index(index_0,keyword,url):
    pass  # starter code: the body is the part to fill in
        
add_to_index(index,'udacity','http://udacity.com')
add_to_index(index,'computing','http://acm.org')
add_to_index(index,'udacity','http://npr.org')
print index
#>>> [['udacity', ['http://udacity.com', 'http://npr.org']], 
#>>> ['computing', ['http://acm.org']]]

My code:

def check_keyword(index, keyword):   # check whether keyword is already in the index
    for i in index:
        if i[0] == keyword:
            return True
    return False

# return the position of keyword's entry in the index
def return_keyword_index(index, keyword):  
    for i in index:
        if i[0] == keyword:
            return index.index(i)
        

def add_to_index(index,keyword,url):
    if not check_keyword(index, keyword):
        index.append([keyword, [url]])
    else:
        i = return_keyword_index(index, keyword)
        index[i][1].append(url)

The answer

def add_to_index(index, keyword, url):
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    index.append([keyword, [url]])

lookup

Return the list of urls for a keyword; if the keyword is not in the index, return an empty list.

# Define a procedure, lookup,
# that takes two inputs:

# - an index
# - keyword

# The procedure should return a list
# of the urls associated
# with the keyword. If the keyword
# is not in the index, the procedure
# should return an empty list.

index = [['udacity', ['http://udacity.com', 'http://npr.org']],
         ['computing', ['http://acm.org']]]
#my code
def lookup(index,keyword):
    for i in index:
        if i[0] == keyword:
            return i[1]
    return []
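
Usage examples of my own against the index defined above:

print lookup(index, 'udacity')
#>>> ['http://udacity.com', 'http://npr.org']
print lookup(index, 'fortran')
#>>> []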

Add Page to Index Solution

Split the content (for example, an article) into individual words, use each word as a keyword, and append the url to that keyword's url list.

# Define a procedure, add_page_to_index,
# that takes three inputs:

#   - index
#   - url (String)
#   - content (String)

# It should update the index to include
# all of the word occurences found in the
# page content by adding the url to the
# word's associated url list.

index = []


def add_to_index(index,keyword,url):
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    index.append([keyword,[url]])

# the procedure to be defined
def add_page_to_index(index,url,content):
    words = content.split()
    for word in words:
        add_to_index(index,word,url)

The test result:

add_page_to_index(index,'fake.text',"This is a test")
print index

[['This', ['fake.text']], ['is', ['fake.text']], ['a', ['fake.text']], ['test', ['fake.text']]]

Finish the web crawler

Modify the earlier crawl_web so that it builds the index, which completes the crawler:

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    index = [] #update
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page) #update, the page is the url
            add_page_to_index(index,page,content) # update
            union(tocrawl,get_all_links(content))
            crawled.append(page)
    return index

3 The Internet

The code of the get_page() function:

def get_page(url):
    try:
        import urllib
        return urllib.urlopen(url).read()
    except:
        return ""

Lesson 4 problem set

Better Splitting

# The built-in <string>.split() procedure works
# okay, but fails to find all the words on a page
# because it only uses whitespace to split the
# string. To do better, we should also use punctuation
# marks to split the page into words.

# Define a procedure, split_string, that takes two
# inputs: the string to split and a string containing
# all of the characters considered separators. The
# procedure should return a list of strings that break
# the source string up by the characters in the
# splitlist.


def split_string(source,splitlist):
    output = []
    atsplit = True          # True means the next non-separator char starts a new word
    for char in source:
        if char in splitlist:
            atsplit = True  # hit a separator; the current word (if any) is finished
        else:
            if atsplit:
                output.append(char)              # start a new word
                atsplit = False
            else:
                output[-1] = output[-1] + char   # extend the current word
    return output
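
A quick check of my own (the input string here is made up, not necessarily the course's official test case):

print split_string("This is a test-of the,string separation-code!", " ,!-")
#>>> ['This', 'is', 'a', 'test', 'of', 'the', 'string', 'separation', 'code']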

Improving the Index Solution

# The current index includes a url in the list of urls
# for a keyword multiple times if the keyword appears
# on that page more than once.

# It might be better to only include the same url
# once in the url list for a keyword, even if it appears
# many times.

# Modify add_to_index so that a given url is only
# included once in the url list for a keyword,
# no matter how many times that keyword appears.

def add_to_index(index, keyword, url):
    for entry in index:
        if entry[0] == keyword:
            if url not in entry[1]: # for this problem, this one extra line is all that is needed
                entry[1].append(url)
            return
    # not found, add new keyword to index
    index.append([keyword, [url]])


def get_page(url):
    try:
        if url == "http://www.udacity.com/cs101x/index.html":
            return '''<html> <body> This is a test page for learning to crawl!
<p> It is a good idea to
<a href="http://www.udacity.com/cs101x/crawling.html">
learn to crawl</a> before you try to
<a href="http://www.udacity.com/cs101x/walking.html">walk</a> or
<a href="http://www.udacity.com/cs101x/flying.html">fly</a>.</p></body>
</html>'''

        elif url == "http://www.udacity.com/cs101x/crawling.html":
            return '''<html> <body> I have not learned to crawl yet, but I am
quite good at  <a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.
</body> </html>'''

        elif url == "http://www.udacity.com/cs101x/walking.html":
            return '''<html> <body> I cant get enough
<a href="http://www.udacity.com/cs101x/index.html">crawling</a></body></html>'''

        elif url == "http://www.udacity.com/cs101x/flying.html":
            return '''<html>
<body>The magic words are Squeamish Ossifrage!</body></html>'''
    except:
        return ""
    return ""

def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)

def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    index = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            union(tocrawl, get_all_links(content))
            crawled.append(page)
    return index

def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)

def lookup(index, keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return None

#index = crawl_web("http://www.udacity.com/cs101x/index.html")
#print lookup(index,"is")
#>>> ['http://www.udacity.com/cs101x/index.html']

Counting Clicks Solution

Record how many times each url is clicked.
Every call to lookup() is a search for a keyword; when the user clicks one of the returned urls, record_user_click(index, keyword, url) increases that url's count by 1. This requires changing the data structure: each url is paired with a count of its clicks, so an index entry becomes [keyword, [[url, count], [url, count], ...]].

#urls = [[url, count],[url,count]...]

def record_user_click(index, keyword, url):
    urls = lookup(index, keyword)
    if urls:
        for entry in urls:
            if entry[0] == url:
                entry[1] = entry[1]+1

def add_to_index(index, keyword, url):
    # format of index: [[keyword, [[url, count], [url, count],..]],...]
    for entry in index:
        if entry[0] == keyword:
            for urls in entry[1]:
                if urls[0] == url:
                    return
            entry[1].append([url,0])
            return
    # not found, add new keyword to index
    index.append([keyword, [[url,0]]])
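
A small usage example of my own showing the new [url, count] structure (it reuses the list-based lookup defined earlier):

index = []
add_to_index(index, 'udacity', 'http://udacity.com')
add_to_index(index, 'udacity', 'http://npr.org')
record_user_click(index, 'udacity', 'http://npr.org')
print index
#>>> [['udacity', [['http://udacity.com', 0], ['http://npr.org', 1]]]]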

time_execution

import time #this is a Python library

def time_execution(code):
    start = time.clock()  # start the clock
    result = eval(code)  # evaluate any string as if it is a Python command
    run_time = time.clock() - start  # find difference in start and end time
    return result, run_time  # return the result of the code and time taken

eval() must be given a string, for example: time_execution('add_to_index(1,2,3)')
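
For example (my own illustration), timing a simple expression:

result, run_time = time_execution('sum(range(100000))')
print result
#>>> 4999950000
print run_time   # a small number of seconds; varies from machine to machine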

Lesson 6

6.8 Implementing URank

After learning about page rank, modify the earlier crawl_web() code so that it also returns the graph structure.
The earlier crawl_web():

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    index = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            union(tocrawl, get_all_links(content))
            crawled.append(page)
    return index

After the modification:

def crawl_web(seed): # returns index, graph of outlinks
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>:[list of pages it links to]
    index = {} 
    while tocrawl: 
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content) # get all the links in the content
            
            #Insert Code Here
            graph[page] = outlinks
            
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph

The complete code:

# Modify the crawl_web procedure so that instead of just returning the 
# index, it returns an index and a graph. The graph should be a 
# Dictionary where the key:value entries are:

#  url: [list of pages url links to] 


def crawl_web(seed): # returns index, graph of outlinks
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>:[list of pages it links to]
    index = {} 
    while tocrawl: 
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            
            #Insert Code Here
            if page not in graph:
                graph[page] = outlinks
            
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph


cache = {
   'http://udacity.com/cs101x/urank/index.html': """<html>
<body>
<h1>Dave's Cooking Algorithms</h1>
<p>
Here are my favorite recipes:
<ul>
<li> <a href="http://udacity.com/cs101x/urank/hummus.html">Hummus Recipe</a>
<li> <a href="http://udacity.com/cs101x/urank/arsenic.html">World's Best Hummus</a>
<li> <a href="http://udacity.com/cs101x/urank/kathleen.html">Kathleen's Hummus Recipe</a>
</ul>

For more expert opinions, check out the 
<a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a> 
and <a href="http://udacity.com/cs101x/urank/zinc.html">Zinc Chef</a>.
</body>
</html>






""", 
   'http://udacity.com/cs101x/urank/zinc.html': """<html>
<body>
<h1>The Zinc Chef</h1>
<p>
I learned everything I know from 
<a href="http://udacity.com/cs101x/urank/nickel.html">the Nickel Chef</a>.
</p>
<p>
For great hummus, try 
<a href="http://udacity.com/cs101x/urank/arsenic.html">this recipe</a>.

</body>
</html>






""", 
   'http://udacity.com/cs101x/urank/nickel.html': """<html>
<body>
<h1>The Nickel Chef</h1>
<p>
This is the
<a href="http://udacity.com/cs101x/urank/kathleen.html">
best Hummus recipe!
</a>

</body>
</html>






""", 
   'http://udacity.com/cs101x/urank/kathleen.html': """<html>
<body>
<h1>
Kathleen's Hummus Recipe
</h1>
<p>

<ol>
<li> Open a can of garbanzo beans.
<li> Crush them in a blender.
<li> Add 3 tablespoons of tahini sauce.
<li> Squeeze in one lemon.
<li> Add salt, pepper, and buttercream frosting to taste.
</ol>

</body>
</html>

""", 
   'http://udacity.com/cs101x/urank/arsenic.html': """<html>
<body>
<h1>
The Arsenic Chef's World Famous Hummus Recipe
</h1>
<p>

<ol>
<li> Kidnap the <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>.
<li> Force her to make hummus for you.
</ol>

</body>
</html>

""", 
   'http://udacity.com/cs101x/urank/hummus.html': """<html>
<body>
<h1>
Hummus Recipe
</h1>
<p>

<ol>
<li> Go to the store and buy a container of hummus.
<li> Open it.
</ol>

</body>
</html>




""", 
}

def get_page(url):
    if url in cache:
        return cache[url]
    else:
        return None
    
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1: 
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links


def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)

def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)
        
def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]

def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None



index , graph = crawl_web('http://udacity.com/cs101x/urank/index.html') 

if 'http://udacity.com/cs101x/urank/index.html' in graph:
    print graph['http://udacity.com/cs101x/urank/index.html']
#>>> ['http://udacity.com/cs101x/urank/hummus.html',
#'http://udacity.com/cs101x/urank/arsenic.html',
#'http://udacity.com/cs101x/urank/kathleen.html',
#'http://udacity.com/cs101x/urank/nickel.html',
#'http://udacity.com/cs101x/urank/zinc.html']

6.8.5 The complete page rank code

#Finishing the page ranking algorithm.

def compute_ranks(graph):
    d = 0.8 # damping factor
    numloops = 10
    
    ranks = {}
    npages = len(graph)
    for page in graph:
        ranks[page] = 1.0 / npages
    
    for i in range(0, numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages
            
            #Insert Code Here
            for node in graph:
                if page in graph[node]:
                    # each page that links to this page passes on a share of its
                    # rank, divided evenly among all of that page's outlinks
                    newrank = newrank + d * (ranks[node] / len(graph[node]))

            newranks[page] = newrank
        ranks = newranks
    return ranks



cache = {
   'http://udacity.com/cs101x/urank/index.html': """<html>
<body>
<h1>Dave's Cooking Algorithms</h1>
<p>
Here are my favorite recipies:
<ul>
<li> <a href="http://udacity.com/cs101x/urank/hummus.html">Hummus Recipe</a>
<li> <a href="http://udacity.com/cs101x/urank/arsenic.html">World's Best Hummus</a>
<li> <a href="http://udacity.com/cs101x/urank/kathleen.html">Kathleen's Hummus Recipe</a>
</ul>

For more expert opinions, check out the 
<a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a> 
and <a href="http://udacity.com/cs101x/urank/zinc.html">Zinc Chef</a>.
</body>
</html>






""", 
   'http://udacity.com/cs101x/urank/zinc.html': """<html>
<body>
<h1>The Zinc Chef</h1>
<p>
I learned everything I know from 
<a href="http://udacity.com/cs101x/urank/nickel.html">the Nickel Chef</a>.
</p>
<p>
For great hummus, try 
<a href="http://udacity.com/cs101x/urank/arsenic.html">this recipe</a>.

</body>
</html>






""", 
   'http://udacity.com/cs101x/urank/nickel.html': """<html>
<body>
<h1>The Nickel Chef</h1>
<p>
This is the
<a href="http://udacity.com/cs101x/urank/kathleen.html">
best Hummus recipe!
</a>

</body>
</html>






""", 
   'http://udacity.com/cs101x/urank/kathleen.html': """<html>
<body>
<h1>
Kathleen's Hummus Recipe
</h1>
<p>

<ol>
<li> Open a can of garbonzo beans.
<li> Crush them in a blender.
<li> Add 3 tablesppons of tahini sauce.
<li> Squeeze in one lemon.
<li> Add salt, pepper, and buttercream frosting to taste.
</ol>

</body>
</html>

""", 
   'http://udacity.com/cs101x/urank/arsenic.html': """<html>
<body>
<h1>
The Arsenic Chef's World Famous Hummus Recipe
</h1>
<p>

<ol>
<li> Kidnap the <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>.
<li> Force her to make hummus for you.
</ol>

</body>
</html>

""", 
   'http://udacity.com/cs101x/urank/hummus.html': """<html>
<body>
<h1>
Hummus Recipe
</h1>
<p>

<ol>
<li> Go to the store and buy a container of hummus.
<li> Open it.
</ol>

</body>
</html>




""", 
}

def crawl_web(seed): # returns index, graph of inlinks
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>, [list of pages it links to]
    index = {} 
    while tocrawl: 
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            
            
            graph[page] = outlinks
            
            
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph


def get_page(url):
    if url in cache:
        return cache[url]
    else:
        return None
    
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1: 
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links


def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)

def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)
        
def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]

def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None

index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
ranks = compute_ranks(graph)
print ranks

#>>> {'http://udacity.com/cs101x/urank/kathleen.html': 0.11661866666666663,
#'http://udacity.com/cs101x/urank/zinc.html': 0.038666666666666655,
#'http://udacity.com/cs101x/urank/hummus.html': 0.038666666666666655,
#'http://udacity.com/cs101x/urank/arsenic.html': 0.054133333333333325,
#'http://udacity.com/cs101x/urank/index.html': 0.033333333333333326,
#'http://udacity.com/cs101x/urank/nickel.html': 0.09743999999999997}



