網絡爬蟲抓取頁面的一種存儲方法

本文轉載自查看原文 2012-04-02 17:38 8194 Web Crawler

前言：

網絡爬蟲抓取下來的頁面，都是大文本，應該如何存儲呢？我覺得，如果存儲在mysql 或是 sqlserver這種關系型數據庫當中，應該不是很恰當的。首先，頁面相對獨立，基本沒什么關系可言，只有url或是錨文本->頁面這種簡單的關系，而關系型數據庫系統為了支持關系以及高效查詢會增加很多額外的開銷，這樣得不償失。不僅如此，爬蟲在抓取頁面過程中，效率應該很高，如果用關系型數據庫存頁面的話，短時間內會有大量的數據插入I/O，插入肯定會是一個瓶頸問題，這對數據庫維護網絡以及物理磁盤來說，壓力也是比較大的。因此，我覺得直接存儲為數據文本比較合適，開源的larbin爬蟲，也采用的是文本方式的存儲，但是它默認是為每個頁面存儲一個物理文件，我個人覺得這樣的話，頻繁的文件創建、寫入、flush、關閉，系統開銷也比較大的。綜合考慮我設計了一個方案，即一個物理文件存儲多個頁面，為了支持適當的查找和分割、合並操作，數據文件會對應一個索引文件。這樣的話，在操作過程中，可以在索引文件中進行，索引文件相對數據文件要小得多，遍歷或是查詢會非常快。不僅如此，當進行數據合並的時候，只需要合並索引文件即可，這樣會方便得多。

設計方案：

索引項的格式： 頁面號碼，頁面大小，URL文本大小，時間文本大小，URL文本，時間文本 (之間以空格分割，當然也可以添加其他多種數據屬性)。
例子：60 14328 56 25 http://www.cnblogs.com:80//coser/archive/2012/02/23.html Fri Mar 30 08:31:34 2012 ；該記錄就表示，頁面號碼為60，頁面數據大小為14328字節，URL文本大小：56，時間文本大小：25（這兩個文本大小主要用於解析后面的真實文本的時候，截取比較方便而已）,URL： http://www.cnblogs.com:80//coser/archive/2012/02/23.html，抓取時間：Fri Mar 30 08:31:34 2012

對於數據文件，多少個頁面存儲為一個物理文件是可以自定義的，也就是說每個數據文件的最多頁面數是有一個閾值的。索引文件會根據這個閾值自動確定到頁面文件。爬蟲抓取下來的頁面，沒有任何的格式修飾，直接以Append的方式存入到文本中，只不過在存儲之前需要確定該頁面的大小和相關屬性。對於頁面的大小無須重計算，因為在網絡爬取過程中的read函數會自動返回字節的大小。接下來討論的就是，如何根據索引文件，比較高效的定位或是分割出指定的頁面數據。在linux系統編程中，存在幾個重要的文件操作函數open、read、write、lseek等等。前三個函數都比較常見就不解釋了，最后一個lseek說一下，lseek()是用來控制文件的讀寫位置的，其原型為off_t lseek(int fildes,off_t offset ,int whence); 使用該函數，就可以通過索引文件中的頁面文件大小的屬性，來控制數據文件指針的移動，從而高效的定位到要查找的指定文件位置，根據頁面size再通過read函數便可以將其讀取出來。

總結：

這是我目前畢業設計（分布式網絡爬蟲）中的一部分內容，由於自己能力有限，之前也沒過多接觸這方面的內容，所以不敢保證以上內容是完全正確且合理的。寫在這里，只是想記錄一下自己遇到的一些問題，以及因此產生的一些想法和解決方案，大家可以一起來探討，歡迎批評指正。

最后，貼一些關鍵代碼（C++），感興趣的朋友，可以參考下：

indexFile.h

/*
 * indexFile.h
 *
 *  Created on: Mar 30, 2012
 *      Author: mayday
 */

#ifndef INDEXFILE_H_
#define INDEXFILE_H_

#include<assert.h>
#include<fcntl.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<sys/stat.h>
#include<time.h>
#include<unistd.h>

#include<iostream>
#include<vector>

#define MAXFILELIMIT 10

using namespace std;

// One index record: the metadata kept for a single stored page.
struct indexer
{
    int num; // page number of the record
    char url [150]; // URL text (NUL-terminated)
    char timep [36]; // fetch-time text (NUL-terminated)
    int size; // size of the stored page, in bytes
};

/*
 * Helper class for reading and writing an index file.
 *
 * A record is the text
 *   "<num> <size> <url length> <time length> <url> <time>"
 * with the fields separated by single spaces.
 */
class indexFile
{
public:
    // Opens the index file file_name; pflag is one of R, W or A below.
    indexFile(const char * file_name , int pflag);
    // Stores the fields of the record to be written.
    // The parameter order matches the definition: page number first, then
    // page size (the names used to be declared the other way round here).
    void set(const int num_text , const int size_text ,const char * url_text , const char * time_text);
    // Splits one record text into its fields. purl / ptime may be NULL when
    // the caller is not interested in that field.
    void parse (const char * text , int & pnum , int & psize , char * purl, char * ptime);
    // Reads the next record of the index file into index; returns false when
    // no further record is available.
    bool next (indexer & index);
    // Writes the record stored by set() to the file.
    void writef();
    // Reads the file line by line.
    // NOTE(review): no definition of this member is visible in this file.
    void readnextline();
    // Closes the file.
    void closef();
    // Open modes accepted by the constructor: read, write (truncate), append.
    const static int R = 0;
    const static int W = 1;
    const static int A = 2;
private:
    int flag; // open mode the file was opened with (R, W or A)
    char filename [30]; // copy of the index file name
    char array [150 + 20 + 36]; // scratch buffer for one formatted record
    indexer idx; // record filled in by set()
    mode_t mode; // NOTE(review): not used by any member visible here
    FILE * fp; // stream of the open index file
};

/*
 * Reads pages back out of the page storage files, using the index file to
 * find where each page starts and how long it is.
 */
class scanFile
{
public:
    // base_name is the name prefix of the page storage files, index_name the
    // name of the index file describing them.
    scanFile(const char * base_name , const char * index_name);
    ~scanFile();
    // Locates the page with the given page number (as recorded in the index
    // file) and returns its data as a NUL-terminated buffer owned by this
    // object.
    char * locate(const int page_num);
private:
    // Not copyable: the object owns page_content and basename through raw
    // pointers that the destructor deletes, so a member-wise copy would free
    // them twice. Declared but intentionally never defined.
    scanFile(const scanFile &);
    scanFile & operator=(const scanFile &);
    // Fills offset and pagesize from the index file.
    void init();
    // Switches to the storage file with the given file number.
    void setnewfile(const int file_num);
    indexFile * index; // reader for the index file
    int filefd; // descriptor of the currently open storage file
    char * basename; // name prefix of the storage files (heap copy)
    char currentfilename [32]; // name of the currently open storage file
    int currentfilenumber; // number of the currently open storage file
    vector<int> offset; // per-page byte offsets inside the storage file
    vector<int> pagesize; // per-page sizes in bytes
    char * page_content; // buffer returned by locate()
};


#endif /* INDEXFILE_H_ */

indexFile.cpp

/*
 * indexFile.cpp
 *
 *  Created on: Mar 30, 2012
 *      Author: mayday
 */

#include"indexFile.h"

/*
 * Opens the index file.
 *
 * file_name  path of the index file; must not be NULL.
 * pflag      R opens for reading ("r"), W for writing ("w"), A for
 *            appending ("a").
 *
 * Asserts that the file was opened. Any other pflag leaves fp NULL and
 * therefore trips the same assertion.
 */
indexFile::indexFile(const char * file_name , int pflag)
{
    assert(file_name!=NULL);

    // filename is a fixed-size member: keep a bounded (possibly truncated)
    // copy instead of an unbounded strcpy that overflows on long paths.
    strncpy(filename, file_name, sizeof(filename) - 1);
    filename[sizeof(filename) - 1] = '\0';

    flag = pflag;

    // Start from NULL so that an unknown pflag is detected below rather than
    // leaving fp uninitialised.
    fp = NULL;

    // Open with the caller's full path, not the possibly truncated copy.
    if (pflag == R)
        fp = fopen(file_name,"r");
    else if (pflag == W)
        fp = fopen(file_name,"w");
    else if (pflag == A)
        fp = fopen(file_name,"a");
    assert(fp!=NULL);
}

/*
 * Stores the fields of the record that writef() will emit.
 *
 * num_text   page number
 * size_text  page size
 * url_text   URL text; must not be NULL
 * time_text  fetch-time text; must not be NULL
 *
 * Texts longer than the fixed-size fields of the record are truncated to
 * fit; they used to overflow the fields.
 */
void indexFile::set
    (const int num_text , const int size_text ,const char * url_text , const char * time_text)
{
    assert(url_text!=NULL&&time_text!=NULL);
    idx.num = num_text;
    idx.size = size_text;

    // Bounded copies, always NUL-terminated.
    strncpy(idx.url, url_text, sizeof(idx.url) - 1);
    idx.url[sizeof(idx.url) - 1] = '\0';
    strncpy(idx.timep, time_text, sizeof(idx.timep) - 1);
    idx.timep[sizeof(idx.timep) - 1] = '\0';
}

/*
 * Splits one index record into its fields.
 *
 * text   record text "<num> <size> <url length> <time length> <url> <time>";
 *        must not be NULL. The URL and time are located from the END of the
 *        text using the two stored lengths.
 * pnum   receives the page number
 * psize  receives the page size
 * purl   receives the URL text, or NULL to skip it. The caller must provide
 *        room for <url length> + 1 characters.
 * ptime  receives the time text, or NULL to skip it. The caller must provide
 *        room for <time length> + 1 characters.
 *
 * If the record is malformed (fewer than four numbers, negative lengths, or
 * lengths that do not fit inside the text) purl and ptime are set to empty
 * strings and nothing is copied, instead of reading outside the text.
 */
void indexFile::parse
    (const char * text , int & pnum , int & psize , char * purl, char * ptime)
{
    assert(text!=NULL);
    int size_url = 0;
    int size_time = 0;
    const int len = static_cast<int>(strlen(text));
    const int fields = sscanf(text,"%d%d%d%d", &pnum, &psize, &size_url, &size_time);

    // Reject lengths outside [0, len] first so the subtraction below cannot
    // overflow, then make sure url + separator + time fit inside the text.
    bool valid = fields == 4
              && size_url >= 0 && size_url <= len
              && size_time >= 0 && size_time <= len;
    int pos1 = 0;
    if (valid)
    {
        pos1 = len - size_time - size_url - 1; // first character of the URL
        valid = pos1 >= 0;
    }
    if (!valid)
    {
        if(purl!=NULL) purl[0] = '\0';
        if(ptime!=NULL) ptime[0] = '\0';
        return;
    }

    const int pos2 = pos1 + size_url; // separator between URL and time

    if(purl!=NULL)
    {
        memcpy(purl,text+pos1,size_url);
        purl[size_url] = '\0';
    }
    if(ptime!=NULL)
    {
        memcpy(ptime,text+pos2+1,size_time);
        ptime[size_time] = '\0';
    }
}

/*
 * Reads the next record of the index file into index.
 *
 * Returns true when a record was read and parsed, false at end of file, on
 * a read error, or when the line holds at most one character (an empty line).
 * The file must have been opened with R.
 */
bool indexFile::next(indexer & index)
{
    assert(flag == R);
    char *line = NULL;
    size_t capacity = 0;

    // getline() returns -1 at end of file or on error; the buffer contents
    // are not defined in that case, so the result must be checked before the
    // line is inspected.
    const ssize_t got = ::getline(&line,&capacity,fp);

    bool have_record = false;
    if (got >= 0 && strlen(line) > 1)
    {
        parse(line,index.num , index.size ,index.url , index.timep);
        have_record = true;
    }

    // getline() allocates the buffer (even when it fails); release it so that
    // iterating over the index does not leak one buffer per record.
    free(line);
    return have_record;
}

/*
 * Writes the record stored by set() to the index file as
 *   "<num> <size> <url length> <time length> <url> <time>".
 *
 * The file must have been opened with W or A.
 *
 * The record is formatted straight into the stream: formatting it into a
 * fixed-size buffer first could overflow that buffer for long URLs combined
 * with large numbers.
 *
 * NOTE(review): no line terminator is appended here, while next() reads the
 * file line by line. Presumably the time text is expected to end in '\n'
 * (as asctime() output does) - confirm against the caller of set().
 */
void indexFile::writef()
{
    assert(flag != R);
    const int size_url = static_cast<int>(strlen(idx.url));
    const int size_time = static_cast<int>(strlen(idx.timep));
    fprintf(fp,"%d %d %d %d %s %s",idx.num,idx.size,size_url,size_time,idx.url,idx.timep);
}

/*
 * Closes the index file. Safe to call more than once: the stream pointer is
 * reset after closing, so a repeated call is a no-op instead of a double
 * fclose().
 */
void indexFile::closef()
{
    if (fp == NULL)
        return;
    // fclose() flushes buffered output itself; no separate fflush() is needed.
    fclose(fp);
    fp = NULL;
}

///////////////////////////////////////////////////
/*
 * Opens the index file, opens storage file number 0 and loads the page
 * offsets and sizes from the index.
 *
 * base_name   name prefix of the page storage files; must not be NULL
 * index_name  name of the index file; must not be NULL
 */
scanFile::scanFile(const char * base_name , const char * index_name)
{
    // Check the PARAMETER base_name; the member basename is not set yet.
    assert(base_name!=NULL && index_name!=NULL);
    index = new indexFile(index_name,indexFile::R);

    const size_t len = strlen(base_name);
    basename = new char [len + 1];
    strcpy(basename,base_name);

    // -1 marks "no storage file open". Starting from 0 would make the first
    // setnewfile() call close descriptor 0, i.e. standard input.
    filefd = -1;
    setnewfile(0);

    // The first page starts at offset 0.
    offset.push_back(0);
    page_content = new char [96000];

    init();
}

void scanFile::setnewfile(const int page_num)
{
    close(filefd);
    //重新命名
    char num_str [7];
    sprintf(num_str,"%06d",page_num);
    strcpy(currentfilename,basename);
    strcat(currentfilename,num_str);

    currentfilenumber = page_num;

    //創建 or 打開新文件
    mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
    filefd = open(currentfilename,O_RDONLY, mode);
}

/*
 * Releases everything the object owns: the index reader (and its open file),
 * the storage file descriptor, and the two heap buffers.
 */
scanFile::~scanFile()
{
    // The index reader is created with new in the constructor and was never
    // released, leaking the object and its open FILE.
    if (index != NULL)
    {
        index->closef();
        delete index;
    }
    // filefd holds the result of the last open() call; -1 means that open
    // failed and there is nothing to close.
    if (filefd >= 0)
        close(filefd);
    delete [] page_content;
    delete [] basename;
}
/*
 * Walks the index file and records, for every page, its size and the byte
 * offset at which it starts inside its storage file.
 *
 * Expects offset to already hold one entry (0, the start of the first page).
 * After the call offset[i] is the start of the i-th page (0-based, in index
 * order) and pagesize[i] its size; offset has one extra trailing entry for
 * the position following the last page.
 *
 * Every MAXFILELIMIT pages a new storage file begins, so the running offset
 * restarts at 0 there. Previously the entry for the first page of each later
 * file held the end offset of the preceding file instead of 0.
 */
void scanFile::init()
{
    indexer idx;
    int counter = 0;    // pages seen so far
    int next_start = 0; // start offset of the page after the current one

    while(index->next(idx))
    {
        pagesize.push_back(idx.size);
        counter++;
        if(counter%MAXFILELIMIT==0)
        {
            // The following page is the first one of a new storage file.
            next_start = 0;
        }
        else
        {
            next_start += idx.size;
        }
        offset.push_back(next_start);
    }
}

/*
 * Returns the data of one page.
 *
 * page_num  1-based position of the page in the index file.
 *
 * Returns a NUL-terminated buffer owned by this object; it is overwritten
 * (and may be reallocated) by the next call. The buffer holds as many bytes
 * as could actually be read - it is empty if the storage file could not be
 * opened or positioned.
 *
 * Throws std::out_of_range (from vector::at) when page_num does not refer to
 * a page recorded in the index.
 */
char * scanFile::locate(const int page_num)
{
    // NOTE(review): the file number is derived from page_num itself while the
    // offset/size lookup below uses page_num-1, so every MAXFILELIMIT-th page
    // is looked up in the following file. Whether that matches the writer's
    // file layout cannot be told from this file - confirm.
    const int number = page_num / MAXFILELIMIT;
    if (number!=currentfilenumber)
    {
        // updates filefd, currentfilename and currentfilenumber
        setnewfile(number);
    }
    const int size_offset = offset.at(page_num-1);
    int size = pagesize.at(page_num-1);
    if (size < 0)
        size = 0;

    // The buffer is allocated with this capacity by the constructor. Pages
    // that do not fit (including the terminator) get a buffer of their own
    // instead of overflowing it; the buffer therefore never shrinks below
    // this capacity.
    const int kInitialCapacity = 96000;
    if (size >= kInitialCapacity)
    {
        char * bigger = new char [size + 1];
        delete [] page_content;
        page_content = bigger;
    }

    // Seek to the start of the page and read until the whole page has
    // arrived, the file ends, or an error occurs.
    int total = 0;
    if (filefd >= 0 && lseek(filefd,size_offset,SEEK_SET) != static_cast<off_t>(-1))
    {
        while (total < size)
        {
            const ssize_t got = read(filefd,page_content+total,size-total);
            if (got <= 0)
                break;
            total += static_cast<int>(got);
        }
    }
    // Terminate after the bytes actually read, not after the expected size.
    page_content[total] = '\0';
    return page_content;
}

/*
//Test
int main()
{
    time_t timep;
    time (&timep);
    char * t = asctime(gmtime(&timep));

    char filename [] = "fife10002.idx";
    indexFile index(filename , indexFile::R);
    indexer idx;
    while(index.next(idx))
    {
        cout<<"No.\t"<<idx.num<<"\tSize:"<<idx.size<<endl;
        cout<<"Fetched Url:\t"<<idx.url<<"Fetched Time:\t"<<idx.timep<<endl;
    }
    index.closef();

    return 0;
}
*/

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 JAVA爬蟲抓取頁面的URL數據一種基於自定義代碼的asp.net網站首頁根據IP自動跳轉指定頁面的方法！為何大量網站不能抓取?爬蟲突破封禁的6種常見方法站點頁面Service Unavailable 503的一種解決方法 PHP抓取網絡數據的6種常見方法一種計算π的方法 PHP抓取頁面的幾種方式使用Jsoup 抓取頁面的數據用C#抓取AJAX頁面的內容 php抓取一個頁面的圖片