第一篇:网站图片爬虫
最近系统梳理 python 的知识,希望把零散的知识整理一遍,夯实基础。
思路
问题
最初写的正则表达式无法匹配中文,一遇到中文就报错。
解决方法:注意下面示例里的两个 u 前缀(正则模式和待匹配的字符串各一个),一个都不能少。
# -*- coding: utf-8 -*-
import re
# Both the pattern and the subject are unicode literals; that is what lets the
# CJK range match. The pattern uses a plain u"..." literal because the ur""
# prefix is a syntax error on Python 3, while the \uXXXX escapes in a u""
# literal expand to the same characters on Python 2 and Python 3.
m = re.match(u"([\u4e00-\u9fa5]+)", u'中文')
print(m.group(1))
import urllib,urllib2
import re
import os
class Spider(object):
    """Scrape one sc.chinaz.com listing page and download the images it links.

    The class targets Python 2 and relies on the ``urllib2``, ``re`` and
    ``os`` modules imported at the top of the script.
    """

    # Listing page that gets scraped.
    PAGE_URL = 'http://sc.chinaz.com/biaoqing/index_3.html'

    # Root folder for the raw page dump and for the downloaded images.
    BASE_DIR = u'D:\\Projects\\Test\\mytest'

    # One listing entry; the two groups capture (title, image URL).
    # The tags are joined with \s* so the match does not depend on how the
    # page happens to indent or wrap its markup.
    ITEM_PATTERN = re.compile(
        r"<div\sclass='num_1'>\s*"
        r"<p>\s*"
        r"<a\shref='.*?'\starget='_blank'\stitle='(.*?)'>\s*"
        r'<img\ssrc2="(.*?)">\s*'
        r"</a>\s*"
        r"</p>\s*"
        r"</div>"
    )

    def __init__(self):
        self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        self.headers = {"User-Agent": self.user_agent}

    def _fetch(self, url):
        """Return the raw body of *url*, requested with ``self.headers``."""
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        # Python 2 responses are not context managers, hence try/finally.
        try:
            return response.read()
        finally:
            response.close()

    def getPage(self):
        """Return the raw HTML of the listing page."""
        return self._fetch(self.PAGE_URL)

    def getContent(self):
        """Return a list of ``[title, image_url]`` pairs found on the page.

        As a debugging aid the raw page is also written to
        ``englishLetter.txt`` inside ``BASE_DIR``, and every pair is printed.
        """
        page = self.getPage()

        # Make sure the dump folder exists before writing into it.
        self.mk_dir(self.BASE_DIR)
        dump_path = os.path.join(self.BASE_DIR, 'englishLetter.txt')
        with open(dump_path, 'wb') as dump:
            dump.write(page)

        contents = []
        for title, image_url in self.ITEM_PATTERN.findall(page):
            print('%s %s' % (title, image_url))
            contents.append([title, image_url])
        return contents

    def mk_dir(self, path):
        """Create *path* if it is missing.

        Returns True when the directory was created and False when it
        already existed.
        """
        if os.path.exists(path):
            return False
        os.makedirs(path)
        return True

    def downTmage(self, url, dirname):
        """Download *url* into the sub-folder *dirname* of ``BASE_DIR``.

        The file keeps the last path segment of the URL as its name.
        """
        imageContents = self._fetch(url)

        # rsplit with a native-string separator keeps the URL's own string
        # type, so no implicit unicode coercion happens on Python 2.
        imageName = url.rsplit('/', 1)[-1]

        # NOTE(review): on Python 2 ``dirname`` is a byte string cut from the
        # page; joining it with the unicode BASE_DIR raises
        # UnicodeDecodeError for non-ASCII titles. Decoding the page with its
        # real charset before matching would fix that - TODO confirm charset.
        path = os.path.join(self.BASE_DIR, dirname)
        self.mk_dir(path)

        with open(os.path.join(path, imageName), 'wb') as image_file:
            image_file.write(imageContents)

    def downOnePage(self):
        """Download every image listed on the page, one folder per title."""
        for dirname, imageUrl in self.getContent():
            self.downTmage(imageUrl, dirname)
# Run the spider: download every image of the listing page, then report.
if __name__ == '__main__':
    spider = Spider()
    spider.downOnePage()
    print('done')
第二篇:批量修改目录下面的文件
思路
问题
from PIL import Image
import os
# Folder holding the pictures; the cropped copies are saved back into it.
PICTURE_DIR = 'D:\\Projects\\Test\\005picture'

# Pass 1: save the top-left 150x150 region of every picture as "new<name>".
for picture_name in os.listdir(PICTURE_DIR):
    source = Image.open(PICTURE_DIR + '\\' + picture_name, 'r')
    # The crop box is (left, upper, right, lower).
    cropped = source.crop((0, 0, 150, 150))
    cropped.save(PICTURE_DIR + '\\' + 'new' + picture_name)

# Pass 2: report format, size and mode of every file now in the folder.
for picture_name in os.listdir(PICTURE_DIR):
    picture = Image.open(PICTURE_DIR + '\\' + picture_name, 'r')
    print('%s %s %s' % (picture.format, picture.size, picture.mode))
第三篇:批量修改目录下面的文件二
思路
import os
# Print each file of the folder line by line, followed by its line count.
TXT_DIR = 'D:\\Projects\\Test\\mytestfile\\007txtfile'

for filename in os.listdir(TXT_DIR):
    print(filename)
    count = 0
    # The with-block releases the handle before the next file is opened.
    with open(os.path.join(TXT_DIR, filename), 'r') as f:
        for eachLine in f:
            # Each line keeps its own newline, so print leaves a blank line
            # after it.
            print(eachLine)
            count += 1
    print(count)
第四篇:找到 HTML 文件的正文
思路
import os
import re
# Print the <body> content of every .html file in the folder.
HTML_DIR = 'D:\\Projects\\Test\\mytestfile\\007txtfile'

# DOTALL lets the body span several lines, IGNORECASE accepts <BODY>, and
# [^>]* tolerates attributes on the opening tag. Compiled once, outside the
# loop.
BODY_PATTERN = re.compile(r'<body\b[^>]*>(.*?)</body>', re.S | re.I)

for filename in os.listdir(HTML_DIR):
    # Literal suffix test: a regex with an unescaped dot would also accept
    # names such as "notes_html".
    if not filename.endswith('.html'):
        continue

    with open(os.path.join(HTML_DIR, filename), 'r') as f:
        content = f.read()

    found = BODY_PATTERN.search(content)
    if found is None:
        # Report and move on instead of failing on a file without a body.
        print('%s: no <body> element found' % filename)
        continue
    print(found.group(1))
第五篇:随机验证码生成
思路
import string
import random
# Build a four-character verification code out of ASCII letters.
# string.ascii_letters is used because string.letters depends on the locale
# on Python 2 and does not exist on Python 3.
# NOTE(review): the random module is not a cryptographic generator; if the
# code must be unpredictable, draw from random.SystemRandom() instead.
word = [random.choice(string.ascii_letters) for _ in range(4)]
print(word)
第六篇:敏感词替换
思路
# Minimal list of sensitive words.
sensit = ['iam', 'love']

peopleInput = raw_input('enter:')

# Every listed word that occurs in the input, in list order. Working from
# the list itself keeps the check correct when words are added or removed.
hits = [word for word in sensit if word in peopleInput]

# b holds the censored text; it stays None when nothing had to be masked.
b = None
if hits:
    b = peopleInput
    for word in hits:
        print(word)
        # Mask every sensitive word, not only the first one found.
        b = b.replace(word, '**')
    print('Freedom')
else:
    print('Human rights')
print(b)
第七篇:字典到 excel 的存储
思路
问题:字典遍历的顺序不固定,写入 Excel 的行顺序可能与定义时的顺序不一致。
import xlwt
# Write a dict of records to an .xls sheet: one row per key, with the key in
# column 0 and the record's values in the columns after it.
tree = {'1': ['one', 100, 200, 300],
        '2': ['two', 200, 300, 400],
        '3': ['three', 600, 700, 800]}

wb = xlwt.Workbook()
ws = wb.add_sheet('student', cell_overwrite_ok=True)

# sorted() pins the row order; plain dict iteration order is arbitrary on
# Python 2. Keys are strings, so the order is lexicographic.
for row, key in enumerate(sorted(tree)):
    ws.write(row, 0, key)
    # Values start in column 1, right after the key.
    for col, value in enumerate(tree[key], 1):
        ws.write(row, col, value)

wb.save(r'D:\Projects\Test\mytestfile\14file\student.xls')