c++ 實現https網頁上的圖片爬取


一.主要的原理

  我們通過發送一個HTTP請求,獲得目標網頁的HTML源代碼,然後通過正則表達式獲取到圖片的URL,再把該網頁的所有圖片都保存到一個文件夾,這就是整個軟件的流程。

二.具體的實踐

  現在很多網站都使用https協議,但是仍有一部分使用http協議。其實https就是http協議的安全版本,相當於http+ssl:SSL介於HTTP應用層和TCP傳輸層之間,和HTTP相比,HTTPS發送數據前需要先經過SSL加密,然後再發送。所以說我們想通過https協議發送數據給服務器,需要經歷以下這幾個步驟:

首先我們先和服務器進行socket連接,然後將SSL和創建的socket套接字進行綁定,之後我們發送數據都通過SSL發送即可,下面介紹一下具體的流程:

首先我們先進行socket連接:

//建立TCP連接
bool Connect()
{
	//初始化套接字
	WSADATA wsadata;
	if (0 != WSAStartup(MAKEWORD(2, 2), &wsadata)) return false;

	//創建套接字
	g_sock = socket(AF_INET, SOCK_STREAM, 0);
	if (g_sock == INVALID_SOCKET) return false;

	//將域名轉換為IP地址
	hostent *p = gethostbyname(g_Host);
	if (p == NULL) return false;

	sockaddr_in sa;
	memcpy(&sa.sin_addr, p->h_addr, 4);
	sa.sin_family = AF_INET;
	sa.sin_port = htons(443);

	if (SOCKET_ERROR == connect(g_sock, (sockaddr*)&sa, sizeof(sockaddr))) return false;
	return true;
}

  

HTTPS=HTTP + SSL,因此利用OpenSSL發送請求給HTTPS站點和第二章的SOCKET發送HTTP是非常相似的,只不過要在原生的套接字上套上SSL層,基本流程如下:

a. WSAStartup對Winsock服務進行初始化

b. 建立socket套接字

c. connect連接服務端

d. 建立SSL上下文

e. 建立SSL

f. 將SSL與前面建立的socket套接字綁定

g. SSL_write()發送數據

h. SSL_read()接收數據

bool SSL_Connect()
{
	// Register the error strings for libcrypto & libssl

	ERR_load_BIO_strings();
	// SSl庫的初始化,載入SSL的所有算法,載入所有的SSL錯誤信息
	SSL_library_init();
	OpenSSL_add_all_algorithms();
	SSL_load_error_strings();

	// New context saying we are a client, and using SSL 2 or 3
	sslContext = SSL_CTX_new(SSLv23_client_method());
	if (sslContext == NULL)
	{
		ERR_print_errors_fp(stderr);
		return false;
	}
	// Create an SSL struct for the connection
	sslHandle = SSL_new(sslContext);
	if (sslHandle == NULL)
	{
		ERR_print_errors_fp(stderr);
		return false;
	}
	// Connect the SSL struct to our connection
	if (!SSL_set_fd(sslHandle, g_sock))
	{
		ERR_print_errors_fp(stderr);
		return false;
	}
	// Initiate SSL handshake
	if (SSL_connect(sslHandle) != 1)
	{
		ERR_print_errors_fp(stderr);
		return false;
	}

	return true;
}

 

三.遇到的問題

1.首先VS控制台的編碼方式是GBK,但是有的網頁是UTF-8編碼,所以我們要進行這方面的轉換

// Convert a NUL-terminated UTF-8 string to the system ANSI code page
// (CP_ACP), going through UTF-16 as the intermediate form.
//
// utf8: NUL-terminated UTF-8 text; NULL is treated as empty.
// Returns the converted text, or an empty string when a conversion step fails.
string UtfToGbk(const char* utf8)
{
	if (utf8 == NULL) return string();

	// With cbMultiByte == -1 the reported length includes the terminating NUL
	const int wideLen = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
	if (wideLen <= 0) return string();
	wstring wide(wideLen, L'\0');
	if (MultiByteToWideChar(CP_UTF8, 0, utf8, -1, &wide[0], wideLen) <= 0) return string();

	const int ansiLen = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
	if (ansiLen <= 0) return string();
	string ansi(ansiLen, '\0');
	if (WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, &ansi[0], ansiLen, NULL, NULL) <= 0) return string();

	// The buffers are owned by the strings, so nothing is leaked; only the
	// terminating NUL written by the API has to be dropped from the result
	ansi.resize(ansiLen - 1);
	return ansi;
}

2.OPENSSL的安裝

  網上有很多自己下載源碼然後自己編譯的教程,但是在這個項目中我們可以直接使用別人編譯好的庫,不用自己進行編譯,直接在VS的項目目錄中添加即可

3.還有就是圖片的下載,URLDownloadToFile的參數是LPCWSTR這種數據類型的,我們需要把string轉換成LPCWSTR(不知道為什麼URLDownloadToFile裡面的兩個參數都要這樣進行轉換,不能用其他的函數)

		string savepath = "E:\\c++_file\\網絡爬蟲1\\網絡爬蟲\\網絡爬蟲\\img\\"+to_string(i)+".jpg";
		size_t len1 = savepath.length();
		wchar_t* imgsavepath = new wchar_t[len1];
		int nmlen1 = MultiByteToWideChar(CP_ACP, 0, savepath.c_str(), len1 + 1, imgsavepath, len1);

 這是完整的下載的代碼:

           // Build the URL from the first capture group of the regex match
		per = mat[1].str();
		// With cchWideChar == 0 the call returns the number of wide characters
		// required; passing -1 as the input length makes that count include the NUL
		int nmlen = MultiByteToWideChar(CP_ACP, 0, per.c_str(), -1, NULL, 0);
		wstring wideurl(nmlen > 0 ? nmlen : 1, L'\0');
		if (nmlen > 0) MultiByteToWideChar(CP_ACP, 0, per.c_str(), -1, &wideurl[0], nmlen);
		const wchar_t* buffer = wideurl.c_str();
		// Destination file for image number i, converted the same way
		string savepath = "E:\\c++_file\\網絡爬蟲1\\網絡爬蟲\\網絡爬蟲\\img\\"+to_string(i)+".jpg";
		int nmlen1 = MultiByteToWideChar(CP_ACP, 0, savepath.c_str(), -1, NULL, 0);
		wstring widesavepath(nmlen1 > 0 ? nmlen1 : 1, L'\0');
		if (nmlen1 > 0) MultiByteToWideChar(CP_ACP, 0, savepath.c_str(), -1, &widesavepath[0], nmlen1);
		// Both wide buffers are owned by the wstrings, so nothing has to be deleted
		const wchar_t* imgsavepath = widesavepath.c_str();
		cout << mat.str() << endl;
		cout << savepath << endl;
		// Download the file
		HRESULT hr = URLDownloadToFile(NULL, buffer, imgsavepath, 0, NULL);
		if (hr == S_OK)
		{
			cout << "-------ok" << endl;
		}

 

 四.完整代碼

#include "spider.h"



// Program entry point: prints the banner, creates the ./img directory and
// starts crawling from the hard-coded start page. Always returns 0.
int main()
{
	const char* const banner[] = {
		"*********************************************************",
		"***********************爬取圖片系統***********************",
		"*********************************************************"
	};
	for (size_t row = 0; row < sizeof(banner) / sizeof(banner[0]); ++row)
	{
		cout << banner[row] << endl;
	}

	// Create the directory meant for the downloaded images
	CreateDirectory(L"./img", NULL);

	// Start crawling
	const string starturl("https://www.shiyanlou.com/#sign-modal");
	StartCatch(starturl);

	return 0;
}


void StartCatch(string startUrl)
{
	queue<string> q;
	q.push(startUrl);

	while (!q.empty())
	{
		string cururl = q.front();
		q.pop();

		//解析URL
		if (false == Analyse(cururl))
		{
			cout << "解析URL失敗,錯誤碼:" << GetLastError() << endl;
			continue;
		}

		//連接服務器
		if (false == Connect())
		{
			cout << "連接服務器失敗,錯誤代碼:" << GetLastError() << endl;
			continue;
		}

		//建立ssl連接
		if (false == SSL_Connect())
		{
			cout << "建立SSL連接失敗,錯誤代碼:" << GetLastError() << endl;
			continue;
		}

		//獲取網頁
		string html;
		if (false == Gethtml(html))
		{
			cout << "獲取網頁數據失敗,錯誤代碼:" << GetLastError() << endl;
			continue;
		}
		if (false == RegexIamage(html))
		{
			cout << "獲取網頁數據失敗,錯誤代碼:" << GetLastError() << endl;
			continue;
		}
		//cout << html << endl;
	}
	//釋放
	SSL_shutdown(sslHandle);
	SSL_free(sslHandle);
	SSL_CTX_free(sslContext);
	closesocket(g_sock);
	WSACleanup();
}


//解析url
bool Analyse(string url)
{
	char *pUrl = new char[url.length() + 1];
	strcpy(pUrl, url.c_str());

	char *pos = strstr(pUrl, "https://");//找到http://開頭的字符串
	if (pos == NULL) return false;
	else pos += 8;//將http://開頭省略

	sscanf(pos, "%[^/]%s", g_Host, g_Object);

	delete[] pUrl;
	return true;
}


//建立TCP連接
bool Connect()
{
	//初始化套接字
	WSADATA wsadata;
	if (0 != WSAStartup(MAKEWORD(2, 2), &wsadata)) return false;

	//創建套接字
	g_sock = socket(AF_INET, SOCK_STREAM, 0);
	if (g_sock == INVALID_SOCKET) return false;

	//將域名轉換為IP地址
	hostent *p = gethostbyname(g_Host);
	if (p == NULL) return false;

	sockaddr_in sa;
	memcpy(&sa.sin_addr, p->h_addr, 4);
	sa.sin_family = AF_INET;
	sa.sin_port = htons(443);

	if (SOCKET_ERROR == connect(g_sock, (sockaddr*)&sa, sizeof(sockaddr))) return false;
	return true;
}


bool SSL_Connect()
{
	// Register the error strings for libcrypto & libssl

	ERR_load_BIO_strings();
	// SSl庫的初始化,載入SSL的所有算法,載入所有的SSL錯誤信息
	SSL_library_init();
	OpenSSL_add_all_algorithms();
	SSL_load_error_strings();

	// New context saying we are a client, and using SSL 2 or 3
	sslContext = SSL_CTX_new(SSLv23_client_method());
	if (sslContext == NULL)
	{
		ERR_print_errors_fp(stderr);
		return false;
	}
	// Create an SSL struct for the connection
	sslHandle = SSL_new(sslContext);
	if (sslHandle == NULL)
	{
		ERR_print_errors_fp(stderr);
		return false;
	}
	// Connect the SSL struct to our connection
	if (!SSL_set_fd(sslHandle, g_sock))
	{
		ERR_print_errors_fp(stderr);
		return false;
	}
	// Initiate SSL handshake
	if (SSL_connect(sslHandle) != 1)
	{
		ERR_print_errors_fp(stderr);
		return false;
	}

	return true;
}

// Send a GET request for g_Object to g_Host over the established TLS session
// and append the response, converted from UTF-8 to the ANSI code page, to html.
//
// html: receives the whole response (status line, headers and body).
// Returns false when the request could not be written, true otherwise.
bool Gethtml(string & html)
{
	string c_get;
	c_get = c_get
		+ "GET " + g_Object + " HTTP/1.1\r\n"
		+ "Host: " + g_Host + "\r\n"
		+ "Content-Type: text/html; charset=UTF-8\r\n"
		+ "Connection:Close\r\n\r\n";

	if (SSL_write(sslHandle, c_get.c_str(), static_cast<int>(c_get.length())) <= 0)
	{
		return false;
	}

	// Collect the raw bytes first. Converting each chunk on its own would cut
	// multi-byte UTF-8 sequences at chunk boundaries and garble the text.
	string raw;
	char buff[4096];
	int nreal = 0;
	while ((nreal = SSL_read(sslHandle, buff, static_cast<int>(sizeof(buff)))) > 0)
	{
		raw.append(buff, nreal);
	}

	html += UtfToGbk(raw.c_str());
	return true;
}

// Convert a NUL-terminated UTF-8 string to the system ANSI code page
// (CP_ACP), going through UTF-16 as the intermediate form.
//
// utf8: NUL-terminated UTF-8 text; NULL is treated as empty.
// Returns the converted text, or an empty string when a conversion step fails.
string UtfToGbk(const char* utf8)
{
	if (utf8 == NULL) return string();

	// With cbMultiByte == -1 the reported length includes the terminating NUL
	const int wideLen = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0);
	if (wideLen <= 0) return string();
	wstring wide(wideLen, L'\0');
	if (MultiByteToWideChar(CP_UTF8, 0, utf8, -1, &wide[0], wideLen) <= 0) return string();

	const int ansiLen = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
	if (ansiLen <= 0) return string();
	string ansi(ansiLen, '\0');
	if (WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, &ansi[0], ansiLen, NULL, NULL) <= 0) return string();

	// The buffers are owned by the strings, so nothing is leaked; only the
	// terminating NUL written by the API has to be dropped from the result
	ansi.resize(ansiLen - 1);
	return ansi;
}


bool RegexIamage(string & html)
{
	smatch mat;
	regex rgx("src=\"(.*(png|svg|jpg))\"");
	string::const_iterator start = html.begin();
	string::const_iterator end = html.end();
	string per;
	int i = 1;
	while (regex_search(start, end, mat, rgx))
	{
		//URL生成
		per = mat[1].str();
		size_t len = per.length();//獲取字符串長度
		int nmlen = MultiByteToWideChar(CP_ACP, 0, per.c_str(), len + 1, NULL, 0);//如果函數運行成功,並且cchWideChar為零 //返回值是接收到待轉換字符串的緩沖區所需求的寬字符數大小。
		wchar_t* buffer = new wchar_t[nmlen];
		MultiByteToWideChar(CP_ACP, 0, per.c_str(), len + 1, buffer, nmlen);
		//保存路徑
		string savepath = "E:\\c++_file\\網絡爬蟲1\\網絡爬蟲\\網絡爬蟲\\img\\"+to_string(i)+".jpg";
		size_t len1 = savepath.length();
		wchar_t* imgsavepath = new wchar_t[len1];
		int nmlen1 = MultiByteToWideChar(CP_ACP, 0, savepath.c_str(), len1 + 1, imgsavepath, len1);
		cout << mat.str() << endl;
		cout << savepath << endl;
		//下載文件
		HRESULT hr = URLDownloadToFile(NULL, buffer, imgsavepath, 0, NULL);
		if (hr == S_OK)
		{
			cout << "-------ok" << endl;
		}
		start = mat[0].second;
		i++;	
	}
	return true;
}

// Convert a multibyte string to a newly allocated wide-character string.
//
// orig: text in the current C locale's multibyte encoding.
// Returns a NUL-terminated buffer obtained from malloc(); the caller owns it
// and must release it with free(). Returns NULL when the allocation fails and
// an empty string when the text cannot be converted.
LPCWSTR stringToLPCWSTR(string orig)
{
	// One wide character per input byte plus the terminator is always enough.
	// The buffer must really have the size announced to mbstowcs_s below.
	const size_t origsize = orig.length() + 1;
	size_t convertedChars = 0;
	wchar_t *wcstring = static_cast<wchar_t *>(malloc(sizeof(wchar_t) * origsize));
	if (wcstring == NULL) return NULL;

	if (0 != mbstowcs_s(&convertedChars, wcstring, origsize, orig.c_str(), _TRUNCATE))
	{
		wcstring[0] = L'\0';
	}

	return wcstring;
}

  下面是頭文件:

#pragma once

#include <iostream>
#include <Windows.h>
#include <string>
#include <queue>
#include <regex>
#include <urlmon.h>

#include <openssl/rand.h>
#include <openssl/ssl.h>
#include <openssl/err.h>

// Import libraries: urlmon for URLDownloadToFile, libeay32/ssleay32 for OpenSSL
#pragma comment(lib, "urlmon.lib")
#pragma comment( lib, "libeay32.lib" )
#pragma comment( lib, "ssleay32.lib" )

#pragma comment(lib, "WS2_32")  // link against WS2_32.lib

// NOTE(review): a using-directive in a header leaks into every file that includes it.
using namespace std;

// NOTE(review): the variables below are defined (not merely declared) in this
// header, so including it from more than one source file would produce
// duplicate definitions at link time.

// Host name and request path of the URL being crawled; written by Analyse()
char g_Host[MAX_PATH];
char g_Object[MAX_PATH];

// Socket created by Connect() and the OpenSSL objects created by SSL_Connect()
SOCKET g_sock;
SSL *sslHandle;
SSL_CTX *sslContext;
// Not referenced by any function in the visible source
BIO * bio;


// Start crawling
void StartCatch(string startUrl);
// Parse the URL
bool Analyse(string url);
// Connect to the server
bool Connect();
// Establish the SSL connection
bool SSL_Connect();
// Fetch the html
bool Gethtml(string& html);
// UTF-8 to GBK
std::string UtfToGbk(const char* utf8);
// Regular-expression image extraction
bool RegexIamage(string & html);

// Convert a std::string to a wide-character string
LPCWSTR stringToLPCWSTR(std::string orig);

  

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM