做了好幾天,終於寫出來了。以前沒有想到過用C++也可以爬取網頁,經過這麼多天的努力終於做好了,還解決了亂碼問題。
從中學到很多,小到一個函數的參數,大到如何使用一個函數。
還有C++中一直讓人頭疼的編碼問題,尤其是Unicode編碼問題:研究了很多資料,又對MultiByteToWideChar和WideCharToMultiByte有了重新的認識。
一個重要的關鍵是Windows默認使用ANSI字符集;同時對HTML的格式進行了分析,以判斷頁面的編碼。
感覺那麼多天的辛苦沒有白費,付出有了收穫。不過在此,真的感謝那些牛人,期間也參考了他們的代碼。
代碼:
#include <winsock2.h>

#include <cstring>
#include <fstream>
#include <iostream>
#include <string>

#pragma comment(lib,"ws2_32.lib")
using namespace std;
void getWebPage(char *url)
{
SOCKET sock;
WSADATA wsa;
struct sockaddr_in addrclient;
ofstream of;
WSAStartup(MAKEWORD(2,2),&wsa);
of.open("temp.txt");
if(!of)
{
cout<<"open fail!"<<endl;
return;
}
static char content[100000]="";
char myurl[256];
char host[256];
char dom[256];
char header[256];
char type[512];
char *p;
memset(myurl,'\0',256);
memset(host,'\0',256);
memset(dom,'\0',256);
memset(header,'\0',256);
memset(type,'\0',512);
char *purl=0;
struct hostent *phost;
sock=socket(PF_INET,SOCK_STREAM,IPPROTO_TCP);
strcpy(myurl,url);
for(purl=myurl;*purl!='/'&&purl!='\0';++purl);
if(int(purl-myurl)==strlen(myurl))
strcpy(host,"/");
else
strcpy(host,purl);
*purl='\0';
strcpy(dom,myurl);
cout<<dom<<endl; //輸出域名
cout<<host<<endl; //輸出地址
of<<dom<<endl;
of<<host<<endl;
phost=gethostbyname(dom);
addrclient.sin_family=AF_INET;
addrclient.sin_port=htons(80);
addrclient.sin_addr.S_un.S_addr=*((unsigned long *)phost->h_addr);
connect(sock,(struct sockaddr*)&addrclient,sizeof(addrclient));
strcat(header, "GET ");
strcat(header, host);
strcat(header, " HTTP/1.1\r\n");
strcat(header, "Host: ");
strcat(header, dom);
strcat(header, "\r\nConnection: Close\r\n\r\n");
send(sock,header,strlen(header),0);
recv(sock,type,512,0);
cout<<type<<endl;
of<<type;
p=strstr(type,"utf-8");
if(p)
{
memset(content,'\0',100000);
while(recv(sock,content,100000,0)>0)
{
int len=MultiByteToWideChar(CP_UTF8, 0, content, -1, NULL,0);
unsigned short * wszGBK = new unsigned short[len+1];
memset(wszGBK, 0, len * 2 + 2);
MultiByteToWideChar(CP_UTF8, 0, content, -1, (LPWSTR)wszGBK, len);
len = WideCharToMultiByte(CP_ACP, 0, (LPCWSTR)wszGBK, -1, NULL, 0, NULL, NULL);
char *szGBK=new char[len + 1];
memset(szGBK, 0, len + 1);
WideCharToMultiByte (CP_ACP, 0, (LPCWSTR)wszGBK, -1, szGBK, len, NULL,NULL);
cout<<szGBK;
of<<szGBK;
strnset(content,'\0',100000);
delete []wszGBK;
delete [] szGBK;
}
}
else
{
memset(type,'\0',512);
recv(sock,type,512,0);
cout<<type;
of<<type;
p=strstr(type,"gb2312");
if(p)
{
while(recv(sock,content,100000,0))
{
cout<<content;
of<<content;
strnset(content,'\0',100000);
}
}
else
{
while(recv(sock,content,100000,0)>0)
{
int len=MultiByteToWideChar(CP_UTF8, 0, content, -1, NULL,0);
unsigned short * wszGBK = new unsigned short[len+1];
memset(wszGBK, 0, len * 2 + 2);
MultiByteToWideChar(CP_UTF8, 0, content, -1, (LPWSTR)wszGBK, len);
len = WideCharToMultiByte(CP_ACP, 0, (LPCWSTR)wszGBK, -1, NULL, 0, NULL, NULL);
char *szGBK=new char[len + 1];
memset(szGBK, 0, len + 1);
WideCharToMultiByte (CP_ACP, 0, (LPCWSTR)wszGBK, -1, szGBK, len, NULL,NULL);
cout<<szGBK;
of<<szGBK;
strnset(content,'\0',100000);
delete []wszGBK;
delete [] szGBK;
}
}
}
closesocket(sock);
WSACleanup();
of.close();
cout<<endl;
}
/// Prompts for a URL (the "http://" scheme is printed as the prompt and must
/// not be typed) and fetches it with getWebPage.
/// @return 0 on success, 1 when no URL could be read from standard input.
int main()
{
    char url[256];
    std::cout << "http://";
    // Bound the extraction to the buffer: an unbounded `cin >> char*` writes
    // past the end of `url` when the input is 256 characters or longer.
    std::cin.width(static_cast<std::streamsize>(sizeof(url)));
    if (!(std::cin >> url))
        return 1;
    getWebPage(url);
    return 0;
}
對此,又對socket編程產生了興趣,socket編程魅力無窮。
