五味子


A Small Crawler Project: Scraping Blog Post Titles and Timestamps

Use Python with the requests library, BeautifulSoup, regular expressions, etc. to scrape blog post titles, creation times, and modification times, and save them to an Excel file.

  • requests library
  • beautifulsoup4 library
  • openpyxl library
  • regular expressions

Approach and Skeleton

Approach

  1. Fetch the page's HTML source
  2. Parse the source and store what we need in a suitable data structure
  3. Write the results to an Excel file

Skeleton

import requests
import bs4
import re
import openpyxl
import os

def getHTMLText(url):  # fetch the page source
    return ""

def fillTitleList(titleList, html):  # fill the data structure
    pass

def storageTitleList(titleList):  # write to Excel
    pass

def main():  # main function
    titleList = []
    url = 'http://treasurew.com'
    try:
        html = getHTMLText(url)
        fillTitleList(titleList, html)
    except:
        return
    storageTitleList(titleList)

main()

Writing the getHTMLText() function

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except:
        return ""

Writing the fillTitleList() function (the core)

Analyzing the blog's page source

Right-click, then "View Page Source".

  1. Each post starts at an element with the class="post-block" attribute
  2. The post title is in the element with the class="post-title-link" attribute
  3. The creation and modification times are easy to grab directly with regular expressions
  4. All of these are annotated in the page source below
<div class="post-block"> #a post starts here
<link itemprop="mainEntityOfPage" href="http://blog.treasurew.com/2019/11/15/Acm知识树(持续点亮中orz)/">

<span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
<meta itemprop="name" content="treasurew">
<meta itemprop="description" content>
<meta itemprop="image" content="/images/dog.jpg">
</span>

<span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
<meta itemprop="name" content="treasurew">
</span>


<header class="post-header">



<h1 class="post-title" itemprop="name headline">


<a href="/2019/11/15/Acm知识树(持续点亮中orz)/" class="post-title-link" itemprop="url">Acm知识树(持续点亮中orz)</a> #the title is here


</h1>


<div class="post-meta">






<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="fa fa-calendar-o"></i>
</span>

<span class="post-meta-item-text">发表于</span>






<time title="创建时间:2019-11-15 10:09:45" itemprop="dateCreated datePublished" datetime="2019-11-15T10:09:45+08:00">2019-11-15</time> #creation time is here
</span>






<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="fa fa-calendar-check-o"></i>
</span>

<span class="post-meta-item-text">更新于</span>

<time title="修改时间:2020-01-03 14:17:10" itemprop="dateModified" datetime="2020-01-03T14:17:10+08:00">2020-01-03</time> #modification time is here
</span>




<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="fa fa-folder-o"></i>
</span>

<span class="post-meta-item-text">分类于</span>


<span itemprop="about" itemscope itemtype="http://schema.org/Thing"><a href="/categories/Acm/" itemprop="url" rel="index"><span itemprop="name">Acm</span></a></span>




</span>














<br>

<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="fa fa-file-word-o"></i>
</span>

<span class="post-meta-item-text">本文字数:</span>

<span title="本文字数">209</span>
</span>



<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="fa fa-clock-o"></i>
</span>

<span class="post-meta-item-text">阅读时长 &asymp;</span>

<span title="阅读时长">1 分钟</span>
</span>





<i class="fa fa-thumb-tack"></i>
<span class="post-meta-divider">|</span>
<font color="green">置顶</font>


</div>
</header>





<div class="post-body" itemprop="articleBody">






<blockquote>
<p>treasurew加油!</p>
</blockquote>
<!--noindex-->

<div class="post-button text-center">
<a class="btn" href="/2019/11/15/Acm知识树(持续点亮中orz)/#more" rel="contents">
阅读全文 &raquo;
</a>
</div>

<!--/noindex-->


</div>


















<footer class="post-footer">








<div class="post-eof"></div>

</footer>
</div>



</article>
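Before writing the parser, the two time regexes can be sanity-checked against a fragment of the markup above (abridged; note that the colon after 创建时间/修改时间 in the title attributes is full-width):

```python
import re

# Abridged from the page source above; the colon is the full-width ":"
snippet = ('<time title="创建时间:2019-11-15 10:09:45">2019-11-15</time>'
           '<time title="修改时间:2020-01-03 14:17:10">2020-01-03</time>')

created = re.search(r'创建时间:(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})', snippet)
modified = re.search(r'修改时间:(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})', snippet)

print(created.group(1))   # 2019-11-15 10:09:45
print(modified.group(1))  # 2020-01-03 14:17:10
```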

With the HTML source analyzed, we can now write the code.

Writing the code

def fillTitleList(titleList, html):
    soup = bs4.BeautifulSoup(html, 'html.parser')
    postBlock = soup.select('.post-block')

    for pb in postBlock:
        t = pb.select('.post-title-link')
        strpb = str(pb)
        rmt = re.compile(r'修改时间:(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})')
        rrt = re.compile(r'创建时间:(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})')
        try:
            modTime = rmt.search(strpb)
            modTime = modTime.group(1)
        except:
            modTime = None
        relTime = rrt.search(strpb).group(1)
        tList = [t[0].string, relTime, modTime]
        titleList.append(tList)
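To see the function end to end, here is a self-contained sketch run against a hypothetical miniature post block (same class names and title attributes as the real page; the block has no 修改时间, which exercises the except branch):

```python
import re
import bs4

def fillTitleList(titleList, html):
    soup = bs4.BeautifulSoup(html, 'html.parser')
    for pb in soup.select('.post-block'):
        t = pb.select('.post-title-link')
        strpb = str(pb)
        rmt = re.compile(r'修改时间:(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})')
        rrt = re.compile(r'创建时间:(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})')
        try:
            modTime = rmt.search(strpb).group(1)
        except AttributeError:   # no 修改时间 in this block
            modTime = None
        relTime = rrt.search(strpb).group(1)
        titleList.append([t[0].string, relTime, modTime])

# Hypothetical miniature post block: a title and a creation time, no 修改时间
html = ('<div class="post-block">'
        '<a class="post-title-link" href="/p/">Demo Post</a>'
        '<time title="创建时间:2019-11-15 10:09:45">2019-11-15</time>'
        '</div>')
result = []
fillTitleList(result, html)
print(result)  # [['Demo Post', '2019-11-15 10:09:45', None]]
```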

Writing the storageTitleList() function

def storageTitleList(TitleList):
    os.chdir('C:\\Users\\treasurew\\Desktop')
    try:
        wb = openpyxl.load_workbook('result.xlsx')
    except:
        wb = openpyxl.Workbook()
    sheet = wb.active
    sheet['A1'] = '文章名'
    sheet['B1'] = '创建时间'
    sheet['C1'] = '修改时间'
    for rowNum, row in zip(range(2, len(TitleList) + 2), TitleList):
        for column, cell in zip(range(1, 4), row):
            sheet.cell(row=rowNum, column=column, value=str(cell))
    wb.save('result.xlsx')
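The cell-filling loop can be exercised in memory without touching the desktop path. One side effect worth noticing: str(cell) turns a missing modification time (None) into the literal string 'None' in the sheet. The rows below are hypothetical:

```python
import openpyxl

# Hypothetical rows in the same [title, created, modified] shape as titleList
titleList = [['Post A', '2019-11-15 10:09:45', None],
             ['Post B', '2019-12-01 08:00:00', '2020-01-03 14:17:10']]

wb = openpyxl.Workbook()
sheet = wb.active
sheet['A1'] = '文章名'
sheet['B1'] = '创建时间'
sheet['C1'] = '修改时间'
for rowNum, row in zip(range(2, len(titleList) + 2), titleList):
    for column, cell in zip(range(1, 4), row):
        sheet.cell(row=rowNum, column=column, value=str(cell))

print(sheet['A2'].value, sheet['C2'].value)  # Post A None
```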

Writing the main() function and the new getPage() function

Since the blog spans several pages, we also need to find out how many pages there are.

Writing the getPage() function

First, the HTML source.

Ctrl+F for 'page' locates this snippet:

<nav class="pagination">
<span class="page-number current">1</span><a class="page-number" href="/page/2/">2</a><span class="space">&hellip;</span><a class="page-number" href="/page/4/">4</a><a class="extend next" rel="next" href="/page/2/"><i class="fa fa-angle-right" aria-label="下一页"></i></a>
</nav>

It's easy to see that the last element with the class="page-number" attribute holds the maximum page number.

The function is then easy to write:

def getPage(url):
    html = getHTMLText(url)
    soup = bs4.BeautifulSoup(html, 'html.parser')
    page = soup.select('.page-number')
    return page[-1].string
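The selector can be checked against the pagination markup shown above (abridged here to the elements that matter):

```python
import bs4

# Abridged pagination markup from the snippet above
nav = ('<nav class="pagination">'
       '<span class="page-number current">1</span>'
       '<a class="page-number" href="/page/2/">2</a>'
       '<a class="page-number" href="/page/4/">4</a>'
       '</nav>')

soup = bs4.BeautifulSoup(nav, 'html.parser')
page = soup.select('.page-number')[-1].string  # last .page-number element
print(page)  # 4
```

The `extend next` arrow link in the real markup has no page-number class, so it never pollutes the result.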

Writing the main() function

Observing the relationship between the page number and the URL (page 1 is the root URL, page i lives at page/i/), we can write the following function:

def main():
    titleList = []
    start_url = 'http://treasurew.com/'
    page = getPage(start_url)
    for i in range(1, int(page) + 1):
        try:
            if i == 1:
                url = start_url
            else:
                url = start_url + 'page/' + str(i) + '/'
            html = getHTMLText(url)
            fillTitleList(titleList, html)
        except:
            continue
    storageTitleList(titleList)

main()

Pitfalls and fixes

The try-except pitfall

Not every post has a modification time, so when the regex fails to match, fillTitleList() raises an exception, main() falls into its except branch, and everything on the first page after the first post gets skipped.

Fix: add a try-except inside fillTitleList() to handle the failure early, so the except branch in main() never has to fire.
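A toy sketch of the difference between letting the exception escape the loop and handling it inside (hypothetical helpers; integer division stands in for the regex match):

```python
def parse_all(items):
    out = []
    try:
        for x in items:
            out.append(10 // x)   # raises on x == 0
    except ZeroDivisionError:
        pass                      # the whole rest of the list is lost
    return out

def parse_each(items):
    out = []
    for x in items:
        try:
            out.append(10 // x)
        except ZeroDivisionError:
            out.append(None)      # handle locally, keep going
    return out

print(parse_all([5, 0, 2]))   # [2]
print(parse_each([5, 0, 2]))  # [2, None, 5]
```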

Errors from not knowing BeautifulSoup well

Not knowing BeautifulSoup well, I wasn't sure whether a call returns a list or a single object, which led to errors such as calling .string on a ResultSet.

In hindsight that shouldn't have happened: the purpose of the function implies the answer, since a function that can find more than one match must return a list.
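For reference, a quick contrast between select() and find() on some minimal made-up markup:

```python
import bs4

soup = bs4.BeautifulSoup('<p class="a">x</p><p class="a">y</p>', 'html.parser')

tags = soup.select('.a')            # select() always returns a list-like ResultSet
first = soup.find('p', class_='a')  # find() returns a single Tag (or None)

print(len(tags), first.string)  # 2 x
```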

When I have time I'll write up the whole BeautifulSoup workflow.

Full crawler source

One of these days I'll definitely study PEP 8 properly (

import requests
import bs4
import re
import openpyxl
import os

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        return r.text
    except:
        return ""

def getPage(url):
    html = getHTMLText(url)
    soup = bs4.BeautifulSoup(html, 'html.parser')
    page = soup.select('.page-number')
    return page[-1].string

def fillTitleList(titleList, html):
    soup = bs4.BeautifulSoup(html, 'html.parser')
    postBlock = soup.select('.post-block')

    for pb in postBlock:
        t = pb.select('.post-title-link')
        strpb = str(pb)
        rmt = re.compile(r'修改时间:(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})')
        rrt = re.compile(r'创建时间:(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})')
        try:
            modTime = rmt.search(strpb)
            modTime = modTime.group(1)
        except:
            modTime = None
        relTime = rrt.search(strpb).group(1)
        tList = [t[0].string, relTime, modTime]
        titleList.append(tList)

def storageTitleList(TitleList):
    os.chdir('C:\\Users\\treasurew\\Desktop')
    try:
        wb = openpyxl.load_workbook('result.xlsx')
    except:
        wb = openpyxl.Workbook()
    sheet = wb.active
    sheet['A1'] = '文章名'
    sheet['B1'] = '创建时间'
    sheet['C1'] = '修改时间'
    for rowNum, row in zip(range(2, len(TitleList) + 2), TitleList):
        for column, cell in zip(range(1, 4), row):
            sheet.cell(row=rowNum, column=column, value=str(cell))
    wb.save('result.xlsx')

def main():
    titleList = []
    start_url = 'http://treasurew.com/'
    page = getPage(start_url)
    for i in range(1, int(page) + 1):
        try:
            if i == 1:
                url = start_url
            else:
                url = start_url + 'page/' + str(i) + '/'
            html = getHTMLText(url)
            fillTitleList(titleList, html)
        except:
            continue
    storageTitleList(titleList)

main()