From f5f952d84cf16fb200b07a3a5b2ba4695695061c Mon Sep 17 00:00:00 2001 From: Joao Duarte Date: Thu, 20 Aug 2020 16:26:52 +0100 Subject: [PATCH 1/2] deeply sort hashes to ensure consistent fingerprints Fingerprinting is done by either fetching the .to_hash representation or by fetching fields with .get(field) The content of these often contain hashes, which we don't guarantee order during insertion, so this commit recursively sorts hashes. This happens in all modes: * concatenate_all_sources * concatenate_fields * normal single field fingerprint --- lib/logstash/filters/fingerprint.rb | 22 +++++++++--- spec/filters/fingerprint_spec.rb | 54 +++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 4 deletions(-) diff --git a/lib/logstash/filters/fingerprint.rb b/lib/logstash/filters/fingerprint.rb index ae08657..78a3063 100644 --- a/lib/logstash/filters/fingerprint.rb +++ b/lib/logstash/filters/fingerprint.rb @@ -119,12 +119,12 @@ def filter(event) if @concatenate_sources || @concatenate_all_fields to_string = "" if @concatenate_all_fields - event.to_hash.sort.map do |k,v| + deep_sort_hashes(event.to_hash).each do |k,v| to_string << "|#{k}|#{v}" end else @source.sort.each do |k| - to_string << "|#{k}|#{event.get(k)}" + to_string << "|#{k}|#{deep_sort_hashes(event.get(k))}" end end to_string << "|" @@ -134,9 +134,9 @@ def filter(event) @source.each do |field| next unless event.include?(field) if event.get(field).is_a?(Array) - event.set(@target, event.get(field).collect { |v| fingerprint(v) }) + event.set(@target, event.get(field).collect { |v| fingerprint(deep_sort_hashes(v)) }) else - event.set(@target, fingerprint(event.get(field))) + event.set(@target, fingerprint(deep_sort_hashes(event.get(field)))) end end end @@ -145,6 +145,20 @@ def filter(event) end private + def deep_sort_hashes(object) + case object + when Hash + sorted_hash = Hash.new + object.sort.each do |sorted_key, value| + sorted_hash[sorted_key] = deep_sort_hashes(value) + end + sorted_hash + when Array + object.map {|element| deep_sort_hashes(element) } + else + object + end + end def fingerprint_ipv4_network(ip_string) # in JRuby 1.7.11 outputs as US-ASCII diff --git a/spec/filters/fingerprint_spec.rb b/spec/filters/fingerprint_spec.rb index fea3c95..6fbba60 100644 --- a/spec/filters/fingerprint_spec.rb +++ b/spec/filters/fingerprint_spec.rb @@ -230,4 +230,58 @@ end end + describe "tolerance to hash order" do + # insertion order can influence the result of to_hash's keys + let(:data1) { { + "a" => {"a0" => 0, "a1" => 1}, + "b" => {"b0" => 0, "b1" => 1}, + } } + let(:event1) { LogStash::Event.new(data1) } + let(:data2) { { + "b" => {"b1" => 1, "b0" => 0}, + "a" => {"a1" => 1, "a0" => 0}, + } } + let(:event2) { LogStash::Event.new(data2) } + let(:config) { { "source" => [ "a" ] } } + + before(:each) do + # for testing purposes we want to ensure the hash order is different. + # since we can't easily control the order on the underlying Map, + # we're mocking the order here: + allow(event1).to receive(:to_hash).and_return(data1) + allow(event2).to receive(:to_hash).and_return(data2) + # by default event.get(key) fetches data from the event. + # mocking the default value has to be done first, and only + # then we can mock the getting "a" and "b" + allow(event1).to receive(:get).and_call_original + allow(event2).to receive(:get).and_call_original + # mock event.get("a") and event.get("b") for both events + # so we can inject an inconsistent order for the tests + allow(event1).to receive(:get).with("a") {|arg| data1["a"] } + allow(event1).to receive(:get).with("b") {|arg| data1["b"] } + allow(event2).to receive(:get).with("a") {|arg| data2["a"] } + allow(event2).to receive(:get).with("b") {|arg| data2["b"] } + plugin.filter(event1) + plugin.filter(event2) + end + it "computes the same hash" do + # confirm the order of the keys in the nested hash is different + # (of course it is, we just mocked the to_hash return) + expect(event1.to_hash["a"].keys).to_not eq(event2.to_hash["a"].keys) + # let's check that the fingerprint doesn't care about the insertion order + expect(event1.get("fingerprint")).to eq(event2.get("fingerprint")) + end + context "concatenate_sources" do + let("config") { { "source" => [ "a", "b"], "concatenate_sources" => true } } + it "computes the same hash" do + expect(event1.get("fingerprint")).to eq(event2.get("fingerprint")) + end + end + context "concatenate_all_fields => true" do + let(:config) { { "concatenate_all_fields" => true } } + it "computes the same hash" do + expect(event1.get("fingerprint")).to eq(event2.get("fingerprint")) + end + end + end end From 09529129335781b7f4fdbaa3096119e462582ed2 Mon Sep 17 00:00:00 2001 From: Joao Duarte Date: Mon, 24 Aug 2020 10:36:40 +0100 Subject: [PATCH 2/2] [skip ci] bump to 3.2.2 --- CHANGELOG.md | 3 +++ logstash-filter-fingerprint.gemspec | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f0e43fc..f7164be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +## 3.2.2 + - Fixed lack of consistent fingerprints on Hash/Map objects [#55](https://github.com/logstash-plugins/logstash-filter-fingerprint/pull/55) + ## 3.2.1 - Fixed concurrent SHA fingerprinting by making the instances thread local diff --git a/logstash-filter-fingerprint.gemspec b/logstash-filter-fingerprint.gemspec index e76822f..f92b05d 100644 --- a/logstash-filter-fingerprint.gemspec +++ b/logstash-filter-fingerprint.gemspec @@ -1,7 +1,7 @@ Gem::Specification.new do |s| s.name = 'logstash-filter-fingerprint' - s.version = '3.2.1' + s.version = '3.2.2' s.licenses = ['Apache-2.0'] s.summary = "Fingerprints fields by replacing values with a consistent hash" s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program"