Skip to content

Commit a645709

Browse files
committed
1. fix proxy issues 2. impl mysql queue 3. other updates
1 parent d21672e commit a645709

File tree

183 files changed

+3700
-2941
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

183 files changed

+3700
-2941
lines changed

Directory.Build.props

+7-7
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
<Project>
2-
3-
<PropertyGroup>
4-
<TargetFramework>netstandard2.0</TargetFramework>
5-
<LangVersion>latest</LangVersion>
6-
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
7-
</PropertyGroup>
8-
2+
3+
<!-- <PropertyGroup>-->
4+
<!-- <TargetFrameworks>netstandard2.1;netstandard2.0</TargetFrameworks>-->
5+
<!-- <LangVersion>latest</LangVersion>-->
6+
<!-- <AllowUnsafeBlocks>true</AllowUnsafeBlocks>-->
7+
<!-- </PropertyGroup>-->
8+
99
</Project>

DotnetSpider.sln

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ ProjectSection(SolutionItems) = preProject
1717
.editorconfig = .editorconfig
1818
.gitignore = .gitignore
1919
azure-pipelines.yml = azure-pipelines.yml
20-
Directory.Build.props = Directory.Build.props
2120
LICENSE.txt = LICENSE.txt
2221
package.props = package.props
2322
README.md = README.md
@@ -29,6 +28,7 @@ ProjectSection(SolutionItems) = preProject
2928
build_spiders.sh = build_spiders.sh
3029
build_agent.sh = build_agent.sh
3130
build_portal.sh = build_portal.sh
31+
publish_nuget.sh = publish_nuget.sh
3232
EndProjectSection
3333
EndProject
3434
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DotnetSpider.Agent", "src\DotnetSpider.Agent\DotnetSpider.Agent.csproj", "{FCCA37B0-FF8F-4C32-9286-26A39EFA94E5}"

package.props

+3-4
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
<Project>
22

33
<PropertyGroup>
4-
<TargetFramework>netstandard2.0</TargetFramework>
5-
<LangVersion>latest</LangVersion>
64
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
75
<PackageRequireLicenseAcceptance>true</PackageRequireLicenseAcceptance>
86
<PackageLicenseFile>LICENSE.txt</PackageLicenseFile>
9-
<Version>5.0.1-beta7</Version>
10-
<FileVersion>5.0.1.7</FileVersion>
7+
<Version>5.0.1-beta8</Version>
8+
<FileVersion>5.0.1.8</FileVersion>
9+
<AssemblyVersion>5.0.1.8</AssemblyVersion>
1110
<Authors>[email protected];</Authors>
1211
<Copyright>Copyright 2018 Lewis Zou</Copyright>
1312
<Description>DotnetSpider, a .NET Standard web crawling library. It is lightweight, efficient and fast high-level web crawling &amp; scraping framework</Description>

publish_nuget.sh

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Cleans, builds, packs and publishes the DotnetSpider NuGet packages.
# All paths are relative to the current directory, so this is presumably
# meant to be run from the repository root.

export NUGET_SERVER=https://api.nuget.org/v3/index.json
echo "$NUGET_SERVER"

# Projects whose packages are pushed, in push order.
packages=(
	DotnetSpider
	DotnetSpider.HBase
	DotnetSpider.Mongo
	DotnetSpider.MySql
	DotnetSpider.PostgreSql
	DotnetSpider.RabbitMQ
)

# Remove old Release output so that only freshly built packages can match the
# *.nupkg globs below. Every pushed project is cleaned (previously
# DotnetSpider.Mongo was pushed but never cleaned, so stale packages could be
# published). DotnetSpider.Mongo.AccessControl is not pushed, but the cleanup
# the script already did for it is kept.
for project in "${packages[@]}" DotnetSpider.Mongo.AccessControl; do
	rm -rf "src/$project/bin/Release"
done

# Do not attempt to publish anything if the build or pack step fails.
dotnet build -c Release || exit 1
dotnet pack -c Release || exit 1

# Pushes stay independent: a failure for one package does not stop the rest.
for project in "${packages[@]}"; do
	nuget push src/"$project"/bin/Release/*.nupkg -Source "$NUGET_SERVER"
done

src/DotnetSpider.Agent/DotnetSpider.Agent.csproj

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22

33
<PropertyGroup>
44
<OutputType>Exe</OutputType>
5-
<TargetFramework>netcoreapp3.1</TargetFramework>
5+
<TargetFramework>net5.0</TargetFramework>
66
</PropertyGroup>
77

88
<Import Project="../../package.props" />
9-
9+
1010
<ItemGroup>
1111
<ProjectReference Include="..\DotnetSpider.RabbitMQ\DotnetSpider.RabbitMQ.csproj" />
1212
<ProjectReference Include="..\DotnetSpider\DotnetSpider.csproj" />

src/DotnetSpider.Agent/Program.cs

+3-5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
using System;
22
using System.Threading.Tasks;
33
using DotnetSpider.Downloader;
4-
using DotnetSpider.Extensions;
54
using DotnetSpider.RabbitMQ;
65
using Microsoft.Extensions.Hosting;
76
using Serilog;
@@ -33,15 +32,14 @@ static void Main(string[] args)
3332
var builder = Host.CreateDefaultBuilder(args);
3433
var id = i2;
3534
builder.UseSerilog();
36-
builder.ConfigureServices(x =>
35+
builder.ConfigureServices((context, services) =>
3736
{
38-
x.AddAgent<HttpClientDownloader>(o =>
37+
services.AddAgent<HttpClientDownloader>(o =>
3938
{
4039
o.AgentId = "agent" + id;
4140
o.AgentName = o.AgentId;
4241
});
43-
var configuration = builder.GetConfiguration();
44-
x.AddRabbitMQ(configuration);
42+
services.AddRabbitMQ(context.Configuration);
4543
});
4644
await builder.Build().RunAsync();
4745
}

src/DotnetSpider.AgentCenter/DotnetSpider.AgentCenter.csproj

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22

33
<PropertyGroup>
44
<OutputType>Exe</OutputType>
5-
<TargetFramework>netcoreapp3.1</TargetFramework>
5+
<TargetFramework>net5.0</TargetFramework>
66
</PropertyGroup>
77

88
<Import Project="../../package.props" />
9-
9+
1010
<ItemGroup>
11-
<PackageReference Include="Microsoft.Extensions.Hosting" Version="3.1.8" />
11+
<PackageReference Include="Microsoft.Extensions.Hosting" Version="5.0.0" />
1212
<PackageReference Include="Serilog.AspNetCore" Version="3.4.0" />
1313
<PackageReference Include="Serilog.Sinks.Console" Version="3.1.1" />
1414
<PackageReference Include="Serilog.Sinks.RollingFile" Version="3.3.0" />

src/DotnetSpider.AgentCenter/Program.cs

+3-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
using System;
22
using System.Threading.Tasks;
3-
using DotnetSpider.Extensions;
43
using DotnetSpider.MySql.AgentCenter;
54
using DotnetSpider.RabbitMQ;
65
using DotnetSpider.Statistics;
@@ -26,14 +25,13 @@ static async Task Main(string[] args)
2625
.CreateLogger();
2726

2827
var builder = Host.CreateDefaultBuilder(args);
29-
builder.ConfigureServices(x =>
28+
builder.ConfigureServices((context, x) =>
3029
{
31-
var configuration = builder.GetConfiguration();
32-
x.Configure<AgentCenterOptions>(configuration);
30+
x.Configure<AgentCenterOptions>(context.Configuration);
3331
x.AddHttpClient();
3432
x.AddAgentCenter<MySqlAgentStore>();
3533
x.AddStatistics<MySqlStatisticsStore>();
36-
x.AddRabbitMQ(configuration);
34+
x.AddRabbitMQ(context.Configuration);
3735
});
3836
builder.UseSerilog();
3937
await builder.Build().RunAsync();
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
<Project Sdk="Microsoft.NET.Sdk">
22

3-
<Import Project="../../package.props" />
3+
<Import Project="../../package.props"/>
44
<PropertyGroup>
55
<PackageId>DotnetSpider.HBase</PackageId>
6+
<TargetFrameworks>netstandard2.1;netstandard2.0</TargetFrameworks>
67
</PropertyGroup>
78
<ItemGroup>
8-
<ProjectReference Include="..\DotnetSpider\DotnetSpider.csproj" />
9+
<ProjectReference Include="..\DotnetSpider\DotnetSpider.csproj"/>
910
</ItemGroup>
1011

1112
</Project>

src/DotnetSpider.HBase/HBaseStorage.cs

+6-7
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
using System.Text;
55
using System.Threading.Tasks;
66
using DotnetSpider.DataFlow;
7-
using DotnetSpider.DataFlow.Storage;
8-
using DotnetSpider.Extensions;
97
using Microsoft.Extensions.Configuration;
108
using Microsoft.Extensions.DependencyInjection;
119
using Microsoft.Extensions.Logging;
@@ -30,13 +28,14 @@ public HBaseOptions(IConfiguration configuration)
3028
/// $ hbase shell
3129
/// $ create_namespace 'dotnet_spider'
3230
/// </summary>
33-
public class HBaseStorage : StorageBase
31+
public class HBaseStorage : DataFlowBase
3432
{
3533
private readonly string _rest;
36-
private readonly string _columnName = "data:".ToBase64String();
34+
35+
private readonly string _columnName = Convert.ToBase64String(Encoding.UTF8.GetBytes("data:"));
3736

3837
private readonly ConcurrentDictionary<string, bool>
39-
_tableCreatedDict = new ConcurrentDictionary<string, bool>();
38+
_tableCreatedDict = new();
4039

4140
/// <summary>
4241
/// 根据配置返回存储器
@@ -56,7 +55,7 @@ public HBaseStorage(string restServer)
5655
_rest = uri.ToString();
5756
}
5857

59-
protected override async Task StoreAsync(DataFlowContext context)
58+
public override async Task HandleAsync(DataFlowContext context)
6059
{
6160
var id = context.Request.Owner;
6261
var table = $"dotnet_spider:response_{id}";
@@ -78,7 +77,7 @@ protected override async Task StoreAsync(DataFlowContext context)
7877
{
7978
var httpRequestMessage = new HttpRequestMessage(HttpMethod.Put, $"{_rest}{table}/row");
8079
httpRequestMessage.Headers.TryAddWithoutValidation("Accept", "application/json");
81-
var rowKey = hash.ToBase64String();
80+
var rowKey = Convert.ToBase64String(Encoding.UTF8.GetBytes(hash));
8281

8382
var body =
8483
"{\"Row\":[{\"key\":\"" + rowKey +

src/DotnetSpider.Mongo/DotnetSpider.Mongo.csproj

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
<Import Project="../../package.props" />
44
<PropertyGroup>
55
<PackageId>DotnetSpider.Mongo</PackageId>
6+
<TargetFrameworks>netstandard2.1;netstandard2.0</TargetFrameworks>
67
</PropertyGroup>
78

89
<ItemGroup>
9-
<PackageReference Include="MongoDB.Driver" Version="2.11.2" />
10+
<PackageReference Include="MongoDB.Driver" Version="2.11.6" />
1011
</ItemGroup>
1112

1213
<ItemGroup>

src/DotnetSpider.Mongo/MongoEntityStorage.cs

+16-25
Original file line numberDiff line numberDiff line change
@@ -14,64 +14,50 @@
1414

1515
namespace DotnetSpider.Mongo
1616
{
17-
public class MongoOptions
18-
{
19-
private readonly IConfiguration _configuration;
20-
21-
public MongoOptions(IConfiguration configuration)
22-
{
23-
_configuration = configuration;
24-
}
25-
26-
public string ConnectionString => _configuration["Mongo:ConnectionString"];
27-
}
28-
2917
/// <summary>
3018
/// MongoDB 保存解析(实体)结果 TODO: 是否要考虑存储模式:插入,新的插入旧的更新,更新 ETC
3119
/// </summary>
3220
public class MongoEntityStorage : EntityStorageBase
3321
{
3422
private readonly IMongoClient _client;
3523

36-
private readonly ConcurrentDictionary<Type, TableMetadata> _tableMetadatas =
37-
new ConcurrentDictionary<Type, TableMetadata>();
24+
private readonly ConcurrentDictionary<Type, TableMetadata> _tableMetadataDict =
25+
new();
3826

3927
private readonly ConcurrentDictionary<string, IMongoDatabase> _cache =
40-
new ConcurrentDictionary<string, IMongoDatabase>();
28+
new();
4129

4230
public static IDataFlow CreateFromOptions(IConfiguration configuration)
4331
{
4432
var options = new MongoOptions(configuration);
4533
return new MongoEntityStorage(options.ConnectionString);
4634
}
4735

36+
public string ConnectionString { get; }
37+
4838
/// <summary>
4939
/// 构造方法
5040
/// </summary>
5141
/// <param name="connectionString">连接字符串</param>
5242
public MongoEntityStorage(string connectionString)
5343
{
54-
ConnectionString = connectionString;
5544
_client = new MongoClient(connectionString);
45+
ConnectionString = connectionString;
5646
}
5747

5848
internal MongoEntityStorage(IMongoClient mongoClient)
5949
{
6050
_client = mongoClient;
6151
}
6252

63-
/// <summary>
64-
/// 连接字符串
65-
/// </summary>
66-
public string ConnectionString { get; }
67-
68-
protected override async Task StoreAsync(DataFlowContext context, Dictionary<Type, List<dynamic>> dict)
53+
protected override async Task HandleAsync(DataFlowContext context,
54+
IDictionary<Type, ICollection<dynamic>> entities)
6955
{
70-
foreach (var kv in dict)
56+
foreach (var kv in entities)
7157
{
7258
var list = (IList)kv.Value;
73-
var tableMetadata = _tableMetadatas.GetOrAdd(kv.Key,
74-
type => ((IEntity)list[0]).GetTableMetadata());
59+
var tableMetadata = _tableMetadataDict.GetOrAdd(kv.Key,
60+
_ => ((IEntity)list[0]).GetTableMetadata());
7561

7662
if (string.IsNullOrWhiteSpace(tableMetadata.Schema.Database))
7763
{
@@ -95,5 +81,10 @@ protected override async Task StoreAsync(DataFlowContext context, Dictionary<Typ
9581
await collection.InsertManyAsync(bsonDocs);
9682
}
9783
}
84+
85+
public override string ToString()
86+
{
87+
return $"{ConnectionString}";
88+
}
9889
}
9990
}
+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
using Microsoft.Extensions.Configuration;
2+
namespace DotnetSpider.Mongo
{
	/// <summary>
	/// Options for the MongoDB storage, backed by an <see cref="IConfiguration"/>.
	/// Values are looked up from the configuration each time they are read.
	/// </summary>
	public class MongoOptions
	{
		private readonly IConfiguration _source;

		/// <summary>
		/// Creates the options over the given configuration.
		/// </summary>
		/// <param name="configuration">Configuration the option values are read from.</param>
		public MongoOptions(IConfiguration configuration) => _source = configuration;

		/// <summary>
		/// The value stored under the <c>Mongo:ConnectionString</c> configuration key.
		/// </summary>
		public string ConnectionString
		{
			get { return _source["Mongo:ConnectionString"]; }
		}
	}
}

0 commit comments

Comments
 (0)