DotnetSpider is a .NET Standard web crawling library similar to WebMagic and Scrapy. It is a lightweight, efficient, and fast high-level web crawling & scraping framework for .NET.
- Visual Studio 2017(15.3 or later)
- .NET Core 2.0
Store data in MySQL. Download MySQL
grant all on *.* to 'root'@'localhost' IDENTIFIED BY '' with grant option; flush privileges;
Run a distributed crawler. Download Redis for Windows
Please see the project DotnetSpider.Sample in the solution.
// Sample: creates a Spider over a Site using an in-memory queue scheduler, a
// DefaultPageProcessor built from the regex "cnblogs\\.com", and a FilePipeline.
// NOTE(review): the seed-URL statement, the identity argument and the final run call
// that the comments below refer to are not visible in this excerpt.
public static void CrawlerPagesTraversal()
// Configure encoding, headers, cookies, proxy, etc. on the Site object to crawl.
var site = new Site { EncodingName = "UTF-8", RemoveOutboundLinks = true };
// Set the start/seed URL.
Spider spider = Spider.Create(site,
// Crawler identity.
// Use the in-memory queue scheduler.
new QueueDuplicateRemovedScheduler(),
// The default page processor saves the whole HTML and extracts target URLs via regex.
new DefaultPageProcessor(new[] { "cnblogs\\.com" }))
// Save crawler results to a file in the folder: \{running directory}\data\{crawler identity}\{guid}.dsd
.AddPipeline(new FilePipeline());
// Download HTML with the HTTP client downloader.
spider.Downloader = new HttpClientDownloader();
// Use 4 threads.
spider.ThreadNum = 4;
// Traversal depth.
spider.Deep = 3;
// Stop the crawler when it cannot get a URL from the scheduler for 30000 ms (30 seconds).
spider.EmptySleepTime = 30000;
// Start the crawler.
// Sample: creates a Spider that uses a custom page processor (YoukuPageProcessor)
// and a custom pipeline (YoukuPipeline).
// NOTE(review): the final run call that the last comment refers to is not visible in this excerpt.
public static void CustmizeProcessorAndPipeline()
// Configure encoding, headers, cookies, proxy, etc. on the Site object to crawl.
var site = new Site { EncodingName = "GB2312", RemoveOutboundLinks = true };
//for (int i = 1; i < 5; ++i)
// // Add start/seed URLs.
// site.AddStartUrl("http://" + $"{i}.html");
Spider spider = Spider.Create(site,
// Use the in-memory queue scheduler.
new QueueDuplicateRemovedScheduler(),
// Use the custom processor written for Youku.
new YoukuPageProcessor())
// Use the custom pipeline written for Youku.
.AddPipeline(new YoukuPipeline());
spider.Downloader = new HttpClientDownloader();
spider.ThreadNum = 1;
spider.EmptySleepTime = 3000;
// Start the crawler.
// Sample pipeline: for each ResultItems it reads the "VideoResult" entry and appends
// every video's name to a StringBuilder.
public class YoukuPipeline : BasePipeline
// NOTE(review): count is only read in the visible code, never incremented; confirm against the full sample.
private static long count = 0;
// Builds one string per ResultItems from the YoukuVideo entries stored under "VideoResult".
public override void Process(params ResultItems[] resultItems)
foreach (var resultItem in resultItems)
StringBuilder builder = new StringBuilder();
foreach (YoukuVideo entry in resultItem.Results["VideoResult"])
builder.Append($" [YoukuVideo {count}] {entry.Name}");
// Other actions, such as saving the data to a database or to a file, can be implemented here.
// Sample page processor: selects video elements from the page and stores a
// List<YoukuVideo> in the page under the key "VideoResult".
public class YoukuPageProcessor : BasePageProcessor
protected override void Handle(Page page)
// Use Selectable to query the page and build the data objects you need.
var totalVideoElements = page.Selectable.SelectList(Selectors.XPath("//div[@class='yk-pack pack-film']")).Nodes();
List<YoukuVideo> results = new List<YoukuVideo>();
foreach (var videoElement in totalVideoElements)
var video = new YoukuVideo();
// The name is taken from the alt attribute of the img element with class 'quic'.
// NOTE(review): video is never added to results in the visible code; confirm against the full sample.
video.Name = videoElement.Select(Selectors.XPath(".//img[@class='quic']/@alt")).GetValue();
// Save the data object under a custom key in the page so the pipeline can read it.
page.AddResultItem("VideoResult", results);
// Add target requests to the scheduler (the URLs that should be crawled next).
//foreach (var url in page.Selectable.SelectList(Selectors.XPath("//ul[@class='yk-pages']")).Links().Nodes())
// page.AddTargetRequest(new Request(url.GetValue(), null));
// Data object holding the name of a single video.
public class YoukuVideo
public string Name { get; set; }
// Sample entity spider: passes "JdSkuSample" and a Site to the EntitySpider base
// constructor and declares the Product entity that describes what to extract.
public class JdSkuSampleSpider : EntitySpider
public JdSkuSampleSpider() : base("JdSkuSample", new Site
//HttpProxyPool = new HttpProxyPool(new KuaidailiProxySupplier("Kuaidaili API"))
// Sets the thread count, downloader, pipeline and start URL for this spider.
protected override void MyInit(params string[] arguments)
ThreadNum = 1;
// Download HTML with the HTTP client downloader.
Downloader = new HttpClientDownloader();
// Store data in MySQL. The MySQL entity pipeline is the default, so this line can be commented out. Do not omit SslMode.
AddPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=;Port=3306;SslMode=None;"));
// NOTE(review): the URL literal starts with ",653,655" and appears truncated (no scheme/host); confirm against the full sample.
// The dictionary presumably supplies the "name" and "cat3" values used by the Enviroment-type properties of Product; verify.
AddStartUrl(",653,655&page=2&JL=6_0_0&ms=5#J_main", new Dictionary<string, object> { { "name", "手机" }, { "cat3", "655" } });
// Entity definition for one product.
// NOTE(review): the meaning of the Table arguments is not shown here; presumably database "test", table "jd_sku" with a Monday-based suffix. Confirm.
[Table("test", "jd_sku", TableSuffix.Monday, Indexs = new[] { "Category" }, Uniques = new[] { "Category,Sku", "Sku" })]
[EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")]
[TargetUrlsSelector(XPaths = new[] { "//span[@class=\"p-num\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
public class Product : SpiderEntity
// The relative expressions below are presumably evaluated against the element matched by EntitySelector; confirm.
[PropertyDefine(Expression = "./@data-sku", Length = 100)]
public string Sku { get; set; }
[PropertyDefine(Expression = "name", Type = SelectorType.Enviroment, Length = 100)]
public string Category { get; set; }
[PropertyDefine(Expression = "cat3", Type = SelectorType.Enviroment)]
public int CategoryId { get; set; }
[PropertyDefine(Expression = "./div[1]/a/@href")]
public string Url { get; set; }
[PropertyDefine(Expression = "./div[5]/strong/a")]
public long CommentsCount { get; set; }
[PropertyDefine(Expression = ".//div[@class='p-shop']/@data-shop_name", Length = 100)]
public string ShopName { get; set; }
[PropertyDefine(Expression = ".//div[@class='p-name']/a/em", Length = 100)]
public string Name { get; set; }
[PropertyDefine(Expression = "./@venderid", Length = 100)]
public string VenderId { get; set; }
[PropertyDefine(Expression = "./@jdzy_shop_id", Length = 100)]
public string JdzyShopId { get; set; }
[PropertyDefine(Expression = "Monday", Type = SelectorType.Enviroment)]
public DateTime RunId { get; set; }
// Entry point: calls Startup.Run with the command-line style arguments
// -s:JdSkuSample, -tid:JdSkuSample and -i:guid.
public static void Main()
Startup.Run(new string[] { "-s:JdSkuSample", "-tid:JdSkuSample", "-i:guid" });
Command: -s:[spider type name] -i:[identity] -a:[arg1,arg2...] -tid:[taskId] -n:[name] -e:[en1=value1,en2=value2,...]
- -s: Type name of the spider or its TaskNameAttribute, for example: DotnetSpider.Sample.BaiduSearchSpider
- -i: Set identity.
- -a: Pass arguments to spider's Run method.
- -tid: Set task id.
- -n: Set name.
- -c: Set the config file path, for example when you want to run with a customized config:
When you want to crawl a page whose content is loaded by JavaScript, there is only one thing to do: set the downloader to WebDriverDownloader.
// Assign a WebDriverDownloader created for Browser.Chrome as the downloader.
Downloader=new WebDriverDownloader(Browser.Chrome);
- Make sure there is a ChromeDriver.exe in the bin folder when you try to use Chrome. You can add it to your project via the NuGet package manager: Chromium.ChromeDriver
- Make sure you have already added a *.webdriver Firefox profile when you try to use Firefox:
- Make sure there is a PhantomJS.exe in the bin folder when you try to use PhantomJS. You can add it to your project via the NuGet package manager: PhantomJS
- Set SystemConnection in app.config
- Update nlog.config like
timeout 0
tcp-keepalive 60
- EntitySpider 定义的表名和列名全部小写化, 以备不同数据库间转换或者 MySQL win/linux 的切换
- 允许不添加Pipeline执行爬虫
QQ Group: 477731655 Email: [email protected]