要写出一个合格的爬虫程序,cookie 和 proxy 是必须解决的两个问题,相信很多朋友都遇到过类似的情况。
wininet.dll 中包含很多 Win32 下与网络有关的函数,涉及 Internet、FTP、cookie、proxy 等。比如百度知道和新浪微博的登录信息可以保存 N 天,如果你在登录后把系统时间改到 N 天之后,登录信息就失效了;而使用 InternetSetCookie 可以自己设置 cookie 的过期日期。首先在 IE 中登录,登录时选择信息保存 2 周,然后运行如下代码;运行之后你可以把日期调整到 2012 年看效果:
测试结果:应用以下代码之后,就不必再担心 cookie 过期的问题了。广大虫友们,让你的爬虫强大起来吧!
using System;
using System.Collections.Generic;using System.ComponentModel;using System.Data;using System.Drawing;using System.Linq;using System.Text;using System.Windows.Forms;using System.Runtime.InteropServices;using System.Text.RegularExpressions;using Common;using mshtml;namespace spider
{
    /// <summary>
    /// Demo form that reads the cookies of the page shown in the embedded
    /// WebBrowser control and re-writes them through WinINet with a far-future
    /// expiry date, so a saved login does not expire.
    /// </summary>
    public partial class WininetTest : Form
    {
        /// <summary>
        /// Gets the cookie data WinINet holds for <paramref name="url"/>.
        /// </summary>
        [DllImport("wininet.dll", CharSet = CharSet.Auto, SetLastError = true)]
        public static extern bool InternetGetCookie(string url, string name, StringBuilder data, ref int dataSize);

        /// <summary>
        /// Sets a cookie for <paramref name="lpszUrlName"/>.
        /// </summary>
        [DllImport("wininet.dll", CharSet = CharSet.Auto, SetLastError = true)]
        public static extern bool InternetSetCookie(string lpszUrlName, string lbszCookieName, string lpszCookieData);

        public WininetTest()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Navigates the embedded browser to the start page when the form loads.
        /// </summary>
        private void WininetTest_Load(object sender, EventArgs e)
        {
            // The URL literal is empty in this listing; set it to the site whose
            // login cookies should be extended.
            string url = "";
            this.webBrowser.Navigate(url);
        }

        /// <summary>
        /// Shows the current page's cookies in the cookie text box.
        /// </summary>
        private void btnGetCookie_Click(object sender, EventArgs e)
        {
            this.txtCookie.Text = GetCookie();
        }

        /// <summary>
        /// Deletes the old "zhidao" cookie files, then re-creates every cookie of
        /// the current page with a far-future expiry date and appends the new
        /// cookie strings to the "new cookie" text box.
        /// </summary>
        private void btnSetCookie_Click(object sender, EventArgs e)
        {
            // Delete the old cookie files.
            string cookieFolder = Environment.GetFolderPath(Environment.SpecialFolder.Cookies);
            foreach (string fileName in System.IO.Directory.GetFiles(cookieFolder))
            {
                // Match on the file name only, so the folder path cannot cause a hit.
                string shortName = System.IO.Path.GetFileName(fileName);
                if (shortName.IndexOf("zhidao", StringComparison.OrdinalIgnoreCase) < 0)
                {
                    continue;
                }

                try
                {
                    // Delete the file that matched, not a literal relative path.
                    System.IO.File.Delete(fileName);
                }
                catch (System.IO.IOException ex)
                {
                    // Best effort: the file may be in use; the cookie is rewritten below anyway.
                    System.Diagnostics.Trace.TraceWarning("Could not delete cookie file '" + fileName + "': " + ex.Message);
                }
                catch (UnauthorizedAccessException ex)
                {
                    System.Diagnostics.Trace.TraceWarning("Could not delete cookie file '" + fileName + "': " + ex.Message);
                }
            }

            // Create the new cookies.
            string currentCookies = GetCookie();
            if (string.IsNullOrEmpty(currentCookies) || webBrowser.Url == null)
            {
                // Nothing to extend (no page loaded, or the page has no readable cookies).
                return;
            }

            string pageUrl = webBrowser.Url.ToString();
            StringBuilder written = new StringBuilder();
            foreach (string c in currentCookies.Split(';'))
            {
                // Split on the FIRST '=' only: cookie values may themselves contain '='.
                int separator = c.IndexOf('=');
                if (separator <= 0)
                {
                    continue;
                }

                // Pairs are separated by "; ", so names carry a leading space after the split.
                string name = c.Substring(0, separator).Trim();
                if (name.Length == 0)
                {
                    continue;
                }

                string value = c.Substring(separator + 1) + ";expires=Sun,22-Feb-2099 00:00:00 GMT";
                if (!InternetSetCookie(pageUrl, name, value))
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "InternetSetCookie failed for '" + name + "' (Win32 error " + Marshal.GetLastWin32Error() + ").");
                }

                written.Append(name).Append('=').Append(value).Append(';');
            }

            this.txtNewCookie.Text += written.ToString();
        }

        /// <summary>
        /// Returns the cookie string of the document currently shown in the browser.
        /// </summary>
        /// <returns>
        /// The document's cookie string, or <c>null</c> when no page is loaded or
        /// WinINet reports no cookie data for the current URL.
        /// </returns>
        public string GetCookie()
        {
            if (webBrowser.Url == null || webBrowser.Document == null)
            {
                return null;
            }

            // Get the existing cookies; the WinINet call is used as a probe for
            // whether any cookie data exists for the current URL.
            StringBuilder cookie = new StringBuilder(new String(' ', 2048));
            int datasize = cookie.Length;
            bool found = InternetGetCookie(webBrowser.Url.ToString(), null, cookie, ref datasize);
            if (!found)
            {
                return null;
            }

            return webBrowser.Document.Cookie;
        }

        /// <summary>
        /// Placeholder for persisting the new cookie text; nothing is saved yet.
        /// </summary>
        private void btnSave_Click(object sender, EventArgs e)
        {
            // TODO: persist this.txtNewCookie.Text; the original handler only read it.
        }
    }
}
以下是 proxy 部分的关键代码:
/// <summary>
/// Sets the proxy used by WinINet (and therefore by the embedded IE/WebBrowser
/// control) through <c>InternetSetOption</c>.
/// </summary>
public class ProxyHelper
{
    [DllImport("wininet.dll", SetLastError = true)]
    private static extern bool InternetSetOption(IntPtr hInternet, int dwOption, IntPtr lpBuffer, int lpdwBufferLength);

    /// <summary>
    /// Applies <paramref name="strProxy"/> as the proxy server, bypassing it for
    /// "local" addresses.
    /// </summary>
    /// <param name="strProxy">Proxy server string passed to WinINet unchanged.</param>
    public void RefreshIESettings(string strProxy)
    {
        const int INTERNET_OPTION_PROXY = 38;
        const int INTERNET_OPEN_TYPE_PROXY = 3;

        Struct_INTERNET_PROXY_INFO struct_IPI;
        struct_IPI.dwAccessType = INTERNET_OPEN_TYPE_PROXY;
        struct_IPI.proxy = IntPtr.Zero;
        struct_IPI.proxyBypass = IntPtr.Zero;

        IntPtr intptrStruct = IntPtr.Zero;
        try
        {
            // Filling in structure
            struct_IPI.proxy = Marshal.StringToHGlobalAnsi(strProxy);
            struct_IPI.proxyBypass = Marshal.StringToHGlobalAnsi("local");

            // Allocating memory
            int structSize = Marshal.SizeOf(struct_IPI);
            intptrStruct = Marshal.AllocCoTaskMem(structSize);

            // Converting structure to IntPtr. fDeleteOld must be false: the block
            // was just allocated and holds no previous structure to destroy.
            Marshal.StructureToPtr(struct_IPI, intptrStruct, false);

            if (!InternetSetOption(IntPtr.Zero, INTERNET_OPTION_PROXY, intptrStruct, structSize))
            {
                // Kept non-throwing like the original, but no longer silent.
                System.Diagnostics.Trace.TraceWarning(
                    "InternetSetOption(INTERNET_OPTION_PROXY) failed (Win32 error " + Marshal.GetLastWin32Error() + ").");
            }
        }
        finally
        {
            // Release every unmanaged allocation made above, even on failure.
            if (intptrStruct != IntPtr.Zero)
            {
                Marshal.FreeCoTaskMem(intptrStruct);
            }

            if (struct_IPI.proxy != IntPtr.Zero)
            {
                Marshal.FreeHGlobal(struct_IPI.proxy);
            }

            if (struct_IPI.proxyBypass != IntPtr.Zero)
            {
                Marshal.FreeHGlobal(struct_IPI.proxyBypass);
            }
        }
    }
}
由于 soso 问问的 cookie 在服务端有独立的处理机制,上述方法对它不起作用,目前还没有找到合适的解决方案。