diff --git a/docs/index.asciidoc b/docs/index.asciidoc index 3f9380a..0d34d66 100644 --- a/docs/index.asciidoc +++ b/docs/index.asciidoc @@ -63,6 +63,23 @@ filter plugins. When set to `true`, the `SHA1`, `SHA256`, `SHA384`, `SHA512` and `MD5` fingerprint methods will produce base64 encoded rather than hex encoded strings. +[id="plugins-{type}s-{plugin}-deep_sort"] +===== `deep_sort` + + * Value type is <<boolean,boolean>> + * Default value is `false` + +When set to `true` then the input fields will be deep sorted when +serializing prior to fingerprint calculation. +This is needed when you have nested hashes because otherwise the order of +the inner hashes is non-deterministic and is going to result in different +fingerprints for the same inputs. + +When you just start using this plugin, it should be safe to set `deep_sort` +to `true` from the beginning. The reason this is not the default is because +the serialization format changes with `deep_sort` even if there are no +nested hashes. + [id="plugins-{type}s-{plugin}-concatenate_sources"] ===== `concatenate_sources` diff --git a/lib/logstash/filters/fingerprint.rb b/lib/logstash/filters/fingerprint.rb index 6df1361..da20f6d 100644 --- a/lib/logstash/filters/fingerprint.rb +++ b/lib/logstash/filters/fingerprint.rb @@ -63,6 +63,18 @@ class LogStash::Filters::Fingerprint < LogStash::Filters::Base # be generated. The result will be random and thus not a consistent hash. config :method, :validate => ['SHA1', 'SHA256', 'SHA384', 'SHA512', 'MD5', "MURMUR3", "IPV4_NETWORK", "UUID", "PUNCTUATION"], :required => true, :default => 'SHA1' + # When set to `true` then the input fields will be deep sorted when + # serializing prior to fingerprint calculation. + # This is needed when you have nested hashes because otherwise the order of + # the inner hashes is non-deterministic and is going to result in different + # fingerprints for the same inputs. 
+ # + # When you just start using this plugin, it should be safe to set `deep_sort` + # to `true` from the beginning. The reason this is not the default is because + # the serialization format changes with `deep_sort` even if there are no + # nested hashes. + config :deep_sort, :validate => :boolean, :default => false + # When set to `true` and `method` isn't `UUID` or `PUNCTUATION`, the # plugin concatenates the names and values of all fields given in the # `source` option into one string (like the old checksum filter) before @@ -104,6 +116,22 @@ class << self; alias_method :fingerprint, :fingerprint_openssl; end end end + def serialize(event) + to_string = "" + if @deep_sort and event.respond_to?(:to_hash) + to_string << "{" + event.to_hash.sort.map do |k,v| + to_string << "#{k}:#{serialize(v)}," + end + to_string << "}" + else + # If not a hash and backwards compatibility. + to_string << "#{event}" + end + + return to_string + end + def filter(event) case @method when :UUID @@ -120,12 +148,16 @@ def filter(event) if @concatenate_sources || @concatenate_all_fields to_string = "" if @concatenate_all_fields - event.to_hash.sort.map do |k,v| - to_string << "|#{k}|#{v}" + if @deep_sort + to_string << serialize(event) + else + event.to_hash.sort.map do |k,v| + to_string << "|#{k}|#{v}" + end end else @source.sort.each do |k| - to_string << "|#{k}|#{event.get(k)}" + to_string << "|#{k}|#{serialize(event.get(k))}" end end to_string << "|" @@ -135,9 +167,9 @@ def filter(event) @source.each do |field| next unless event.include?(field) if event.get(field).is_a?(Array) - event.set(@target, event.get(field).collect { |v| fingerprint(v) }) + event.set(@target, event.get(field).collect { |v| fingerprint(serialize(v)) }) else - event.set(@target, fingerprint(event.get(field))) + event.set(@target, fingerprint(serialize(event.get(field)))) end end end diff --git a/spec/filters/fingerprint_spec.rb b/spec/filters/fingerprint_spec.rb index 24ebf3d..f5493fc 100644 --- 
a/spec/filters/fingerprint_spec.rb +++ b/spec/filters/fingerprint_spec.rb @@ -83,6 +83,24 @@ end end + describe "fingerprint string with SHA1 HMAC algorithm on all event fields with deep_sort" do + config <<-CONFIG + filter { + fingerprint { + concatenate_all_fields => true + key => "longencryptionkey" + method => 'SHA1' + deep_sort => true + } + } + CONFIG + + # The @timestamp field is specified in this sample event as we need the event contents to be constant for the tests + sample("@timestamp" => "2017-07-26T14:44:27.064Z", "clientip" => "123.123.123.123", "message" => "This is a test message", "log_level" => "INFO", "offset" => 123456789, "type" => "test", "beat" => {"hostname" => "gnu.example.com", "name" => "gnu.example.com", "version" => "5.2.2"}) do + insist { subject.get("fingerprint") } == "e39ef60e5fb431aa7f9847a4591bf1ffe49cd410" + end + end + describe "fingerprint string with SHA1 algorithm and base64 encoding" do config <<-CONFIG filter {