第一篇:网站图片爬虫
最近系统梳理 python 的知识,希望把零散的知识整理一遍,夯实基础。
思路

问题
暂时正则无法匹配中文,遇中文就报错
解决方法:仔细看,正则模式和待匹配的字符串前面各有一个 u(模式写成 ur"...",字符串写成 u'...'),两个 u 一个都不能少。

# -*- coding: utf-8 -*-  
import re
m = re.match(ur"([\u4e00-\u9fa5]+)", u'中文')
print m.group(1)
import urllib,urllib2
import re
import os

class Spider(object):
    """Scrape one listing page and download every image it references.

    The listing page is fetched with a browser-like User-Agent, a raw copy
    of it is written to ``PAGE_DUMP_PATH``, (title, image URL) pairs are
    extracted with a regular expression, and each image is saved under
    ``SAVE_ROOT/<title>/<image file name>``.
    """

    # Listing page fetched by getPage().
    PAGE_URL = 'http://sc.chinaz.com/biaoqing/index_3.html'
    # Raw copy of the listing page written by getContent().
    PAGE_DUMP_PATH = r'D:\Projects\Test\mytest\englishLetter.txt'
    # Directory under which one sub-directory per image title is created.
    SAVE_ROOT = u'D:\\Projects\\Test\\mytest'
    # NOTE(review): assumes the listing page is UTF-8 encoded -- confirm.
    PAGE_ENCODING = 'utf-8'

    # One listing entry.  Compiled with re.VERBOSE so the layout whitespace
    # below is ignored (literal spaces are written as \s); the explicit \s*
    # between tags tolerates any indentation or line breaks in the page.
    _ITEM_PATTERN = re.compile(r'''
        <div\sclass='num_1'>\s*
        <p>\s*
        <a\shref='.*?'\starget='_blank'\stitle='(.*?)'>\s*
        <img\ssrc2="(.*?)">\s*
        </a>\s*
        </p>\s*
        </div>
        ''', re.VERBOSE)

    def __init__(self):
        self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        self.headers = {"User-Agent": self.user_agent}

    def _fetch(self, url):
        """Return the raw response body of *url*, sent with ``self.headers``."""
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()

    def _parse(self, page):
        """Return ``[[title, image_url], ...]`` for every entry in *page*."""
        return [[title, src] for title, src in self._ITEM_PATTERN.findall(page)]

    def _to_text(self, value):
        """Decode a byte string with ``PAGE_ENCODING``; pass text through.

        Mixing undecoded non-ASCII bytes with a unicode path raises
        UnicodeDecodeError on Python 2, so values are decoded first.
        """
        if isinstance(value, bytes):
            return value.decode(self.PAGE_ENCODING, 'replace')
        return value

    def getPage(self):
        """Fetch the listing page and return its raw content."""
        return self._fetch(self.PAGE_URL)

    def getContent(self):
        """Fetch the listing page and return its ``[title, image_url]`` pairs.

        Side effects: writes the raw page to ``PAGE_DUMP_PATH`` and prints
        every extracted pair.
        """
        page = self.getPage()

        with open(self.PAGE_DUMP_PATH, 'wb') as dump:
            dump.write(page)

        contents = self._parse(page)
        for title, src in contents:
            print('%s %s' % (title, src))
        return contents

    def mk_dir(self, path):
        """Create *path* (with parents) if it is missing.

        Returns True when the directory was created, False when *path*
        already existed.
        """
        if os.path.exists(path):
            return False
        os.makedirs(path)
        return True

    def downTmage(self, url, dirname):
        """Download *url* and save it as ``SAVE_ROOT/<dirname>/<file name>``.

        The file name is the part of *url* after its last ``/``.
        """
        imageContents = self._fetch(url)

        imageName = self._to_text(url).split(u'/')[-1]

        # os.path.join supplies the separator between the root directory
        # and the per-title directory.
        path = os.path.join(self.SAVE_ROOT, self._to_text(dirname))
        self.mk_dir(path)

        imagePath = os.path.join(path, imageName)
        with open(imagePath, 'wb') as f:
            f.write(imageContents)

    def downOnePage(self):
        """Download every image referenced by the listing page."""
        for dirname, imageUrl in self.getContent():
            self.downTmage(imageUrl, dirname)
# 爬虫调用入口
if __name__ == '__main__':
    # Only crawl when the file is executed directly, not when imported.
    spider = Spider()
    spider.downOnePage()
    print('done')

第二篇:批量修改目录下面的文件

思路

问题

from PIL import Image
import os

# Directory whose files are cropped in pass 1 and inspected in pass 2.
pic_dir = 'D:\\Projects\\Test\\005picture'

# Pass 1: crop every file to the box (0, 0, 150, 150) and save the result
# next to the source as "new<filename>".
for filename in os.listdir(pic_dir):
    im = Image.open(pic_dir + '\\' + filename, 'r')
    region = im.crop((0, 0, 150, 150))
    region.save(pic_dir + '\\' + 'new' + filename)

# Pass 2: list the directory again (it now also holds the cropped copies)
# and print the format, size and mode of every file.
for filename in os.listdir(pic_dir):
    im = Image.open(pic_dir + '\\' + filename, 'r')
    print('%s %s %s' % (im.format, im.size, im.mode))

第三篇:批量修改目录下面的文件二

思路

import os

# Directory whose files are printed and line-counted.
txt_dir = r'D:\Projects\Test\mytestfile\007txtfile'

for filename in os.listdir(txt_dir):
    print(filename)
    count = 0

    # The with-block closes each file before the next one is opened;
    # previously every handle stayed open until garbage collection.
    with open(os.path.join(txt_dir, filename), 'r') as f:
        for eachLine in f:
            # eachLine keeps its own line ending, so the output is
            # double-spaced (unchanged behaviour).
            print(eachLine)
            count += 1

    print(count)

第四篇:找到 HTML 文件的正文

思路

import os
import re

# Directory scanned for .html files.
html_dir = r'D:\Projects\Test\mytestfile\007txtfile'

# Compiled once, outside the loop.  DOTALL lets the body span several lines,
# IGNORECASE accepts <BODY>, and [^>]* accepts attributes on the opening tag.
body_pattern = re.compile(r'<body\b[^>]*>(.*?)</body>',
                          re.DOTALL | re.IGNORECASE)

for filename in os.listdir(html_dir):
    # A plain suffix test: the old regex '.*?.html$' left the dot unescaped
    # and therefore also accepted names such as "fooxhtml".
    if not filename.endswith('.html'):
        continue

    with open(os.path.join(html_dir, filename), 'r') as f:
        content = f.read()

    found = body_pattern.search(content)
    if found is None:
        # No <body> element in this file: skip it instead of raising
        # IndexError on an empty result list.
        continue
    print(found.group(1))

第五篇:随机验证码生成

思路

import string
import random

# Build a 4-character code from ASCII letters.  string.ascii_letters is a
# fixed alphabet, whereas string.letters depends on the current locale and
# exists only on Python 2.
word = [random.choice(string.ascii_letters) for _ in range(4)]
# word is a list of four one-character strings; it is printed as a list.
print(word)

第六篇:敏感词替换

思路

# Minimal list of sensitive words.
sensit = ['iam','love']
peopleInput = raw_input('enter:')

# Every sensitive word contained in the input, in list order.  Deriving this
# from the list itself removes the hard-coded length (range(2) / flag == 2)
# that silently broke as soon as the list changed size.
found = [w for w in sensit if w in peopleInput]

if found:
    # Report the first hit, as before.
    print(found[0])
    # Mask every sensitive word, not only the first one that matched.
    b = peopleInput
    for w in found:
        b = b.replace(w, '**')
    print('Freedom')
else:
    # Nothing matched: there is no masked text to show.
    b = None
    print('Human rights')

print(b)

第七篇:字典到 excel 的存储

思路
问题:字典是无序的,遍历时取出键的顺序不固定,因此写入 Excel 的行顺序也不确定。

import xlwt

# Data to export: key -> list of cell values for that key's row.
tree = {'1':['one',100,200,300],
        '2':['two',200,300,400],
        '3':['three',600,700,800]}

wb = xlwt.Workbook()
ws = wb.add_sheet('student',cell_overwrite_ok=True)

# Iterate the keys in sorted order so the row order is the same on every
# run; plain dict iteration order is arbitrary.  The keys are strings, so
# the order is lexicographic.
for row, key in enumerate(sorted(tree)):
    # Column 0 holds the key; the values follow from column 1 onwards.
    ws.write(row, 0, key)
    for col, value in enumerate(tree[key], 1):
        ws.write(row, col, value)

wb.save(r'D:\Projects\Test\mytestfile\14file\student.xls')


↙↙↙阅读原文可查看相关链接,并与作者交流