diff --git a/.ruby-version b/.ruby-version new file mode 100644 index 0000000..03463f3 --- /dev/null +++ b/.ruby-version @@ -0,0 +1 @@ +ruby-3.3.0 diff --git a/Gemfile.lock b/Gemfile.lock index dffdffa..6162ca4 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,37 +1,42 @@ GEM remote: http://rubygems.org/ specs: - addressable (2.4.0) - diff-lcs (1.2.5) - docile (1.1.5) + addressable (2.8.7) + public_suffix (>= 2.0.2, < 7.0) + diff-lcs (1.5.1) + docile (1.4.0) domainatrix (0.0.11) addressable - hoe (3.14.2) - rake (>= 0.8, < 11.0) - json (1.8.3) + hoe (4.2.2) + rake (>= 0.8, < 15.0) one_hundred_percent_coverage (0.0.2) simplecov (>= 0.3.7) - rake (10.5.0) - rdoc (4.2.2) - json (~> 1.4) - rspec (3.4.0) - rspec-core (~> 3.4.0) - rspec-expectations (~> 3.4.0) - rspec-mocks (~> 3.4.0) - rspec-core (3.4.3) - rspec-support (~> 3.4.0) - rspec-expectations (3.4.0) + psych (5.1.2) + stringio + public_suffix (6.0.1) + rake (13.2.1) + rdoc (6.7.0) + psych (>= 4.0.0) + rspec (3.13.0) + rspec-core (~> 3.13.0) + rspec-expectations (~> 3.13.0) + rspec-mocks (~> 3.13.0) + rspec-core (3.13.0) + rspec-support (~> 3.13.0) + rspec-expectations (3.13.1) diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.4.0) - rspec-mocks (3.4.1) + rspec-support (~> 3.13.0) + rspec-mocks (3.13.1) diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.4.0) - rspec-support (3.4.1) - simplecov (0.11.2) - docile (~> 1.1.0) - json (~> 1.8) - simplecov-html (~> 0.10.0) - simplecov-html (0.10.0) + rspec-support (~> 3.13.0) + rspec-support (3.13.1) + simplecov (0.22.0) + docile (~> 1.1) + simplecov-html (~> 0.11) + simplecov_json_formatter (~> 0.1) + simplecov-html (0.12.3) + simplecov_json_formatter (0.1.4) + stringio (3.1.1) PLATFORMS ruby @@ -45,4 +50,4 @@ DEPENDENCIES simplecov BUNDLED WITH - 1.11.2 + 2.5.10 diff --git a/README.rdoc b/README.rdoc index e4591c0..8e06020 100644 --- a/README.rdoc +++ b/README.rdoc @@ -1,6 +1,6 @@ = Despamilator -* http://github.com/moowahaha/despamilator +* home :: http://github.com/moowahaha/despamilator == DESCRIPTION: diff --git a/Rakefile b/Rakefile index 9e514c7..1449aa2 100644 --- a/Rakefile +++ b/Rakefile @@ -11,6 +11,7 @@ Hoe.plugin :newgem # Generate all the Rake tasks # Run 'rake -T' to see list of generated tasks (from gem root directory) $hoe = Hoe.spec 'despamilator' do + self.version = Despamilator::VERSION self.developer 'Stephen Hardisty', 'moowahaha@hotmail.com' self.post_install_message = 'PostInstall.txt' end diff --git a/lib/despamilator.rb b/lib/despamilator.rb index 41e5c96..07be72c 100644 --- a/lib/despamilator.rb +++ b/lib/despamilator.rb @@ -1,10 +1,12 @@ $:.unshift(File.dirname(__FILE__)) unless $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__))) +require 'despamilator/version' require 'despamilator/filter' Dir.glob(File.join(File.dirname(__FILE__), 'despamilator', 'filter', '*.rb')).each do |filter_file| require filter_file end require 'despamilator/subject' +require 'despamilator/text' require 'ostruct' #== SYNOPSIS: diff --git a/lib/despamilator/filter/html_tags.rb b/lib/despamilator/filter/html_tags.rb index a3f74cc..7ca6a29 100644 --- a/lib/despamilator/filter/html_tags.rb +++ b/lib/despamilator/filter/html_tags.rb @@ -6,8 +6,8 @@ def parse subject text = subject.text.downcase html_tags.each do |tag| - opening_elements = text.count(/<\s*#{tag}\W/) - closing_elements = text.count(/\W#{tag}\s*\/>/) + opening_elements = Despamilator::Text.count(text, /<\s*#{tag}\W/) + closing_elements = Despamilator::Text.count(text, /\W#{tag}\s*\/>/) if opening_elements > 0 or closing_elements > 0 safest_element_count = opening_elements > closing_elements ? opening_elements : closing_elements diff --git a/lib/despamilator/filter/ip_address_url.rb b/lib/despamilator/filter/ip_address_url.rb index a5b2cc8..46b6c28 100644 --- a/lib/despamilator/filter/ip_address_url.rb +++ b/lib/despamilator/filter/ip_address_url.rb @@ -15,7 +15,7 @@ def description def parse subject subject.register_match!({ :score => 0.5, :filter => self - }) if subject.text.downcase.count(/http:\/\/\d+\.\d+\.\d+\.\d+/) > 0 + }) if Despamilator::Text.count(subject.text.downcase, /http:\/\/\d+\.\d+\.\d+\.\d+/) > 0 end end diff --git a/lib/despamilator/filter/long_words.rb b/lib/despamilator/filter/long_words.rb index 60b88eb..3d38002 100644 --- a/lib/despamilator/filter/long_words.rb +++ b/lib/despamilator/filter/long_words.rb @@ -13,7 +13,10 @@ def description end def parse subject - subject.text.without_uris.words.each do |word| + without_uris = Despamilator::Text.without_uris(subject.text) + words = Despamilator::Text.words(without_uris) + + words.each do |word| subject.register_match!({ :score => 0.1, :filter => self }) if word.length > 20 diff --git a/lib/despamilator/filter/mixed_case.rb b/lib/despamilator/filter/mixed_case.rb index 54e3a2c..f7943a5 100644 --- a/lib/despamilator/filter/mixed_case.rb +++ b/lib/despamilator/filter/mixed_case.rb @@ -10,9 +10,12 @@ def description end def parse subject - text = subject.text.without_uris - count = text.remove_and_count!(/[a-z][A-Z]/) - count += text.remove_and_count!(/[a-z][A-Z][a-z]/) + text = Despamilator::Text.without_uris(subject.text) + text, count1 = Despamilator::Text.remove_and_count(text, /[a-z][A-Z]/) + _, count2 = Despamilator::Text.remove_and_count(text, /[a-z][A-Z][a-z]/) + + count = count1 + count2 + subject.register_match!({:score => 0.1 * count, :filter => self}) if count > 0 end diff --git a/lib/despamilator/filter/numbers_and_words.rb b/lib/despamilator/filter/numbers_and_words.rb index c5aebce..ae4e40e 100644 --- a/lib/despamilator/filter/numbers_and_words.rb +++ b/lib/despamilator/filter/numbers_and_words.rb @@ -35,7 +35,7 @@ def description private def tidy_text subject - text = subject.text.without_uris + text = Despamilator::Text.without_uris(subject.text) text.downcase! # strip out "good numbers" diff --git a/lib/despamilator/filter/obfuscated_urls.rb b/lib/despamilator/filter/obfuscated_urls.rb index c03335c..f2d838c 100644 --- a/lib/despamilator/filter/obfuscated_urls.rb +++ b/lib/despamilator/filter/obfuscated_urls.rb @@ -10,7 +10,7 @@ def description end def parse subject - text = subject.text.without_uris.downcase + text = Despamilator::Text.without_uris(subject.text.downcase) count = find_space_separated_parts text count += find_space_separated_characters text @@ -21,7 +21,7 @@ def parse subject private def find_space_separated_parts text - text.count(/www\s+\w+\s+com/) + Despamilator::Text.count(text, /www\s+\w+\s+com/) end def find_space_separated_characters text diff --git a/lib/despamilator/filter/prices.rb b/lib/despamilator/filter/prices.rb index b2447de..5e0f72e 100644 --- a/lib/despamilator/filter/prices.rb +++ b/lib/despamilator/filter/prices.rb @@ -10,7 +10,7 @@ def description end def parse subject - price_count = subject.text.count(/\$\s*\d+/) + price_count = Despamilator::Text.count(subject.text,/\$\s*\d+/) subject.register_match!({:score => 0.075 * price_count, :filter => self}) if price_count > 0 end diff --git a/lib/despamilator/filter/shouting.rb b/lib/despamilator/filter/shouting.rb index 26794c8..9b10681 100644 --- a/lib/despamilator/filter/shouting.rb +++ b/lib/despamilator/filter/shouting.rb @@ -19,7 +19,7 @@ def parse subject return if text.length < 20 uppercased = text.scan(/[A-Z][A-Z]+/).join.length - lowercased = text.count(/[a-z]/) + lowercased = Despamilator::Text.count(text, /[a-z]/) if uppercased > 0 subject.register_match!({ diff --git a/lib/despamilator/filter/spammy_tlds.rb b/lib/despamilator/filter/spammy_tlds.rb index c747be4..10d75e2 100644 --- a/lib/despamilator/filter/spammy_tlds.rb +++ b/lib/despamilator/filter/spammy_tlds.rb @@ -13,7 +13,7 @@ def description end def parse subject - matches = subject.text.count(/\w{5,}\.(info|biz|xxx)\b/) + matches = Despamilator::Text.count(subject.text, /\w{5,}\.(info|biz|xxx)\b/) subject.register_match!({:score => 0.05 * matches, :filter => self}) if matches > 0 end diff --git a/lib/despamilator/filter/trailing_number.rb b/lib/despamilator/filter/trailing_number.rb index 0653b6d..c08d77e 100644 --- a/lib/despamilator/filter/trailing_number.rb +++ b/lib/despamilator/filter/trailing_number.rb @@ -13,7 +13,7 @@ def description end def parse subject - subject.register_match!({:score => 0.1, :filter => self}) if subject.text.without_uris =~ /\b\d+\s*$/ + subject.register_match!({:score => 0.1, :filter => self}) if Despamilator::Text.without_uris(subject.text) =~ /\b\d+\s*$/ end end diff --git a/lib/despamilator/filter/unusual_characters.rb b/lib/despamilator/filter/unusual_characters.rb index 7fccc7c..a1f484c 100644 --- a/lib/despamilator/filter/unusual_characters.rb +++ b/lib/despamilator/filter/unusual_characters.rb @@ -14,7 +14,7 @@ def description def parse subject initialize_combos - tokenize(subject.text.without_uris).each do |token| + tokenize(Despamilator::Text.without_uris(subject.text)).each do |token| subject.register_match!({:score => 0.05, :filter => self}) if @@combos[token.to_sym] end end diff --git a/lib/despamilator/filter/urls.rb b/lib/despamilator/filter/urls.rb index 01bcba1..aa248d4 100644 --- a/lib/despamilator/filter/urls.rb +++ b/lib/despamilator/filter/urls.rb @@ -14,7 +14,7 @@ def description def parse subject text = subject.text.downcase.gsub(/http:\/\/\d+\.\d+\.\d+\.\d+/, '') - matches = text.count(/https?:\/\//) + matches = Despamilator::Text.count(text, /https?:\/\/\S+/) 1.upto(matches > 2 ? 2 : matches) do subject.register_match!({:score => 0.4, :filter => self}) end diff --git a/lib/despamilator/filter/weird_punctuation.rb b/lib/despamilator/filter/weird_punctuation.rb index 80c6f99..17d5567 100644 --- a/lib/despamilator/filter/weird_punctuation.rb +++ b/lib/despamilator/filter/weird_punctuation.rb @@ -13,18 +13,21 @@ def description end def parse subject - text = subject.text.without_uris.downcase + text = Despamilator::Text.without_uris(subject.text).downcase text.gsub!(/\w&\w/, 'xx') text.gsub!(/[a-z](!|\?)(\s|$)/, 'x') text.gsub!(/(?:#{punctuation}){20,}/, '') - matches = text.remove_and_count!(/(?:\W|\s|^)(#{punctuation})/) - matches += text.remove_and_count!(/\w,\w/) - matches += text.remove_and_count!(/\w\w\.\w/) - matches += text.remove_and_count!(/\w\.\w\w/) - matches += text.remove_and_count!(/(#{punctuation})(#{punctuation})/) - matches += text.remove_and_count!(/(#{punctuation})$/) - matches += text.remove_and_count!(/(?:\W|\s|^)\d+(#{punctuation})/) + + text, matches1 = Despamilator::Text.remove_and_count(text, /(?:\W|\s|^)(#{punctuation})/) + text, matches2 = Despamilator::Text.remove_and_count(text, /\w,\w/) + text, matches3 = Despamilator::Text.remove_and_count(text, /\w\w\.\w/) + text, matches4 = Despamilator::Text.remove_and_count(text, /\w\.\w\w/) + text, matches5 = Despamilator::Text.remove_and_count(text, /(#{punctuation})(#{punctuation})/) + text, matches6 = Despamilator::Text.remove_and_count(text, /(#{punctuation})$/) + _, matches7 = Despamilator::Text.remove_and_count(text, /(?:\W|\s|^)\d+(#{punctuation})/) + + matches = matches1 + matches2 + matches3 + matches4 + matches5 + matches6 + matches7 subject.register_match!({:score => 0.03 * matches, :filter => self}) if matches > 0 end diff --git a/lib/despamilator/subject.rb b/lib/despamilator/subject.rb index 48e6e3a..a478168 100644 --- a/lib/despamilator/subject.rb +++ b/lib/despamilator/subject.rb @@ -1,5 +1,3 @@ -require 'despamilator/subject/text' - class Despamilator class Subject attr_reader :score, :text @@ -7,7 +5,7 @@ class Subject def initialize text @score = 0.0 @matches = {} - @text = Despamilator::Subject::Text.new(text) + @text = text end def register_match! details diff --git a/lib/despamilator/subject/text.rb b/lib/despamilator/subject/text.rb deleted file mode 100644 index 8186f74..0000000 --- a/lib/despamilator/subject/text.rb +++ /dev/null @@ -1,32 +0,0 @@ -require 'uri' - -class Despamilator - class Subject - class Text < String - - def initialize text - super text if text - freeze - end - - def without_uris - gsub(/\b(?:https?|mailto|ftp):.+?(\s|$)/i, '') - end - - def words - split(/\W+/) - end - - def count pattern - scan(pattern).flatten.compact.length - end - - def remove_and_count! pattern - count = count(pattern) - gsub!(pattern, '') - count - end - - end - end -end diff --git a/lib/despamilator/text.rb b/lib/despamilator/text.rb new file mode 100644 index 0000000..94c4715 --- /dev/null +++ b/lib/despamilator/text.rb @@ -0,0 +1,22 @@ +require "uri" + +class Despamilator + module Text + def self.without_uris(text) + text.gsub(/\b(?:https?|mailto|ftp):.+?(\s|$)/i, "") + end + + def self.words(text) + text.split(/\W+/) + end + + def self.count(text, pattern) + text.scan(pattern).flatten.compact.length + end + + def self.remove_and_count(text, pattern) + count = count(text, pattern) + [text.gsub(pattern, ""), count] + end + end +end diff --git a/spec/despamilator_spec.rb b/spec/despamilator_spec.rb index 3f6b4da..c282246 100644 --- a/spec/despamilator_spec.rb +++ b/spec/despamilator_spec.rb @@ -1,3 +1,5 @@ +require "helpers/spec_helper" + describe Despamilator do before do @@ -15,7 +17,7 @@ context :matched_by do before do - @dspam.should_receive(:warn).with(/matched_by is deprecated/) + expect(@dspam).to receive(:warn).with(/matched_by is deprecated/) @gtubs = @dspam.matched_by { |f| f.class == DespamilatorFilter::GtubsTestFilter }.collect.first end diff --git a/spec/helpers/spec_helper.rb b/spec/helpers/spec_helper.rb index fc617e9..4d3d9c2 100644 --- a/spec/helpers/spec_helper.rb +++ b/spec/helpers/spec_helper.rb @@ -4,3 +4,9 @@ Dir.glob(File.join(File.dirname(__FILE__), '*.rb')).each do |file| require file end + +RSpec.configure do |config| + config.expect_with :rspec do |c| + c.syntax = [:should, :expect] + end +end diff --git a/spec/subject_text_spec.rb b/spec/subject_text_spec.rb index 70d5435..c988d0d 100644 --- a/spec/subject_text_spec.rb +++ b/spec/subject_text_spec.rb @@ -1,35 +1,27 @@ -describe Despamilator::Subject::Text do - it 'should be a kind of string' do - Despamilator::Subject::Text.new('aa').should be_kind_of(String) - end - - it 'should be immutable' do - text = Despamilator::Subject::Text.new('aa') - -> {text.gsub!('a', 'b')}.should raise_error(RuntimeError) - end - +describe Despamilator::Text do it 'should strip urls' do - Despamilator::Subject::Text.new( + Despamilator::Text.without_uris( 'blah https://www.google.com de.http://yahoo.com blah http://www.dcyder.com?x={abc} blah' - ).without_uris.should == 'blah de.blah blah' + ).should == 'blah de.blah blah' end it 'should split into words' do - Despamilator::Subject::Text.new( + Despamilator::Text.words( 'hello there you-rule' - ).words.should == %w{hello there you rule} + ).should == %w{hello there you rule} end it 'should count the matches for a regular expression' do - Despamilator::Subject::Text.new( - 'yXyXy' - ).count(/X/).should == 2 + Despamilator::Text.count( + 'yXyXy', + /X/ + ).should == 2 end it 'should count the matches for a regular expression' do - text = Despamilator::Subject::Text.new('yXyXy').dup - text.remove_and_count!(/X/).should == 2 + text, count = Despamilator::Text.remove_and_count('yXyXy', /X/) text.should == 'yyy' + count.should == 2 end end \ No newline at end of file