본문 바로가기

Python/Parsing,Crawling

티스토리 파싱

#-*- coding: utf-8 -*-
import urllib,urllib2
import re,os
global path

def file_writeb(name,content,extension):
    """Write ``content`` as a binary file called ``name``.

    The file is created under the directory prefix held in the module-level
    global ``path`` (plain string concatenation, so ``path`` must already end
    with a separator).

    name      -- file name, used as given (nothing is appended to it)
    content   -- raw bytes to store
    extension -- accepted for the callers' sake but not used by this function
    """
    # The context manager closes the handle even when write() raises.
    with open(path+name,"wb") as f:
        f.write(content)
def file_writeh(name,content):
    """Write ``content`` to ``<name>.html`` in text mode.

    The file is created under the directory prefix held in the module-level
    global ``path`` (plain string concatenation, so ``path`` must already end
    with a separator). An existing file of the same name is overwritten.

    name    -- file name without the ".html" suffix
    content -- page text to store
    """
    # The context manager closes the handle even when write() raises.
    with open(path+name+".html","w") as f:
        f.write(content)

def _localize_images(html, kind):
    """Download the Tistory-hosted images of one kind and relink the page.

    ``kind`` is the path segment that follows the host name ("image" or
    "original"). Every matching URL is fetched and stored through
    ``file_writeb`` (so it lands under the directory currently held in the
    global ``path``), and the remote prefix inside ``html`` is rewritten to
    "./" so the saved page refers to the local copies.

    Returns the rewritten HTML.
    """
    prefix = r"http://cfile[\w\.-]+.uf.tistory.com/" + kind + "/"
    # set(): do not download a URL twice when the page references it repeatedly.
    for link in sorted(set(re.findall(prefix + r"[\w\.-]+", html))):
        data = urllib.urlopen(link).read()
        # Strip the remote prefix so only the bare file name remains.
        file_writeb(re.sub(prefix, "", link), data, "jpg")
    # re.sub rather than str.replace: the prefix is a pattern, not a literal,
    # and this also works when the page contains no image of this kind.
    return re.sub(prefix, "./", html)


# Mirror posts 0..39: one directory per post, named after the post title,
# holding the page as <title>.html plus the images it references.
for num in range(0,40):
    path = "F:\\python\\pharsing\\"
    # NOTE(review): the base URL is an empty string here, so the requested
    # URL is just the post number -- presumably the blog address has to be
    # filled in before this can fetch anything; confirm.
    url = "" + str(num)
    # Tistory answers 403 Forbidden unless a User-Agent header is sent.
    req = urllib2.Request(url,headers={'User-Agent':"Magic Browser"})

    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        print(e.code)
        continue

    # Read the title from the og:title meta tag; skip the post when the tag
    # is absent instead of slicing the page at a made-up offset.
    found = re.search(r'<meta property="og:title" content="([^"]*)"', html)
    if found is None:
        print("[%d] og:title not found" % num)
        continue
    title = found.group(1).replace("&quot;","")
    print("title : %s" % title)
    path = path+title

    try:
        os.mkdir(path)
    except OSError as why:
        print("%s already exists" % why.filename)
        continue
    print(path)
    # file_writeb/file_writeh concatenate onto ``path``, so it needs the
    # trailing separator from here on.
    path+='\\'

    for kind in ("image", "original"):
        html = _localize_images(html, kind)
    file_writeh(title,html)
    print("[%d].." % num)

# raw_input only waits for Enter; Python 2's input() would evaluate the typed
# text and fail on an empty line.
raw_input("done!")

코드하이라이팅 했더니 깨지네요


'Python > Parsing,Crawling' 카테고리의 다른 글

python3.x urllib urlopen  (0) 2014.10.09
python 3 beautifulsoup 윈도우 설치  (1) 2014.10.09
티스토리 파싱  (1) 2014.09.08