diff --git a/Makefile b/Makefile index 238674f7..91fd5b83 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ test-coverage: $(LINGUIST_PATH) tail -n +2 $(COVERAGE_PROFILE) >> $(COVERAGE_REPORT); \ rm $(COVERAGE_PROFILE); \ fi; \ - done; + done; code-generate: $(LINGUIST_PATH) mkdir -p data diff --git a/README.md b/README.md index 8e23bccc..d982e059 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ Note that even if enry's CLI is compatible with linguist's, its main point is th Development ------------ -*enry* re-uses parts of original [linguist](https://github.com/github/linguist) especially data in `languages.yml` to generate internal data structures. In oreder to update to latest upstream run +*enry* re-uses parts of original [linguist](https://github.com/github/linguist) especially data in `languages.yml` to generate internal data structures. In order to update to latest upstream run make clean code-generate @@ -140,6 +140,7 @@ Using [linguist/samples](https://github.com/github/linguist/tree/master/samples) * all files for SQL language fall to the classifier because we don't parse this [disambiguator expresion](https://github.com/github/linguist/blob/master/lib/linguist/heuristics.rb#L433) for `*.sql` files right. This expression doesn't comply with the pattern for the rest of [heuristics.rb](https://github.com/github/linguist/blob/master/lib/linguist/heuristics.rb) file. + Benchmarks ------------ @@ -172,6 +173,38 @@ to get time averages for main detection function and strategies for the whole sa if you want see measures by sample file + +.gitattributes +-------------- + +As in linguist, you can override the strategies via a `.gitattributes` file. +Add a `.gitattributes` file to the directory and use the same matchers that you would use in linguist (`linguist-documentation`, `linguist-generated`, `linguist-language` or `linguist-vendored`) to do the override. + +#### Vendored code + +Use the `linguist-vendored` attribute to vendor or un-vendor paths. 
+ +``` +$ cat .gitattributes +this-is-a-vendor-directory/ linguist-vendored +this-is-not/ linguist-vendored=false +``` +#### Documentation + +Documentation works the same way as vendored code but using `linguist-documentation` and `linguist-documentation=false`. + +#### Language assignment + +If you want some files to be classified as a certain language, use `linguist-language=[language]`. + +``` +$ cat .gitattributes +.*\.go linguist-language=MyFavouriteLanguage +``` + +Note that the pattern that matches the file name is a regular expression and must be valid Go `regexp` syntax; see [Golang regexp](https://golang.org/pkg/regexp/). + + Why Enry? ------------ diff --git a/cli/enry/main.go b/cli/enry/main.go index b48b27c8..c2402db2 100644 --- a/cli/enry/main.go +++ b/cli/enry/main.go @@ -29,6 +29,12 @@ func main() { log.Fatal(err) } + gitAttributes := enry.NewGitAttributes() + reader, err := os.Open(".gitattributes") + if err == nil { + gitAttributes.LoadGitAttributes("", reader) + } + errors := false out := make(map[string][]string, 0) err = filepath.Walk(root, func(path string, f os.FileInfo, err error) error { @@ -53,8 +59,9 @@ func main() { relativePath = relativePath + "/" } - if enry.IsVendor(relativePath) || enry.IsDotFile(relativePath) || - enry.IsDocumentation(relativePath) || enry.IsConfiguration(relativePath) { + if gitAttributes.IsVendor(relativePath) || enry.IsDotFile(relativePath) || + gitAttributes.IsDocumentation(relativePath) || enry.IsConfiguration(relativePath) || + gitAttributes.IsGenerated(path) { if f.IsDir() { return filepath.SkipDir } @@ -66,20 +73,18 @@ func main() { return nil } - language, ok := enry.GetLanguageByExtension(path) - if !ok { - if language, ok = enry.GetLanguageByFilename(path); !ok { - content, err := ioutil.ReadFile(path) - if err != nil { - errors = true - log.Println(err) - return nil - } - - language = enry.GetLanguage(filepath.Base(path), content) - if language == enry.OtherLanguage { - return nil - } + content, err := 
ioutil.ReadFile(path)
+		if err != nil {
+			errors = true
+			log.Println(err)
+			return nil
+		}
+
+		language := gitAttributes.GetLanguage(filepath.Base(path))
+		if language == enry.OtherLanguage {
+			language = enry.GetLanguage(filepath.Base(path), content)
+			if language == enry.OtherLanguage {
+				return nil
 			}
 		}
 
diff --git a/common.go b/common.go
index b299d364..83ca5f9f 100644
--- a/common.go
+++ b/common.go
@@ -3,6 +3,7 @@ package enry
 import (
 	"bufio"
 	"bytes"
+	"os"
 	"path/filepath"
 	"regexp"
 	"strings"
@@ -95,6 +96,12 @@ func GetLanguageByClassifier(content []byte, candidates []string) (language stri
 	return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
 }
 
+// GetLanguageByGitattributes returns the language assigned to a file for a given regular expression in .gitattributes.
+// It delegates to the GetLanguagesByGitAttributes strategy, which opens .gitattributes itself, so no prior load call is needed.
+func GetLanguageByGitattributes(filename string) (language string, safe bool) {
+	return getLanguageByStrategy(GetLanguagesByGitAttributes, filename, nil, nil)
+}
+
 func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) {
 	languages := strategy(filename, content, candidates)
 	return getFirstLanguageAndSafe(languages)
@@ -407,6 +414,25 @@ func GetLanguagesBySpecificClassifier(content []byte, candidates []string, class
 	return classifier.Classify(content, mapCandidates)
 }
 
+// GetLanguagesByGitAttributes returns either a string slice with the language
+// if the filename matches a regExp in .gitattributes, or an empty slice
+// in case no regExp matches the filename. It complies with the signature to be a Strategy type.
+func GetLanguagesByGitAttributes(filename string, content []byte, candidates []string) []string {
+	gitAttributes := NewGitAttributes()
+	// The path is relative, so the file is looked up in the current working directory.
+	reader, err := os.Open(".gitattributes")
+	if err != nil {
+		return nil
+	}
+	// Release the descriptor on every return path.
+	defer reader.Close()
+
+	// Load errors are not fatal here: entries that parsed correctly are still used.
+	gitAttributes.LoadGitAttributes("", reader)
+	lang := gitAttributes.GetLanguage(filename)
+	if lang == OtherLanguage {
+		// No pattern matched, so there is no candidate to report.
+		return nil
+	}
+
+	return []string{lang}
+}
+
 // GetLanguageExtensions returns the different extensions being used by the language.
 func GetLanguageExtensions(language string) []string {
 	return data.ExtensionsByLanguage[language]
diff --git a/gitattributes.go b/gitattributes.go
new file mode 100644
index 00000000..2b9867c1
--- /dev/null
+++ b/gitattributes.go
@@ -0,0 +1,234 @@
+package enry
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"regexp"
+	"strings"
+)
+
+// attrType identifies one kind of linguist attribute that can be set from .gitattributes.
+type attrType int
+
+const (
+	vendor attrType = iota
+	documentation
+	generated
+	language
+)
+
+// attrTypeName holds the attrType names back to back; attrTypeIndex[i] is the
+// offset at which the i-th name starts and attrTypeIndex[i+1] the offset at which it ends.
+const attrTypeName = "vendordocumentationgeneratedlanguage"
+
+var attrTypeIndex = [...]uint8{0, 6, 19, 28, 36}
+
+// String returns the name of the attribute kind, or "attrType(N)" for a value
+// outside the declared constants.
+func (i attrType) String() string {
+	if i < 0 || i >= attrType(len(attrTypeIndex)-1) {
+		return fmt.Sprintf("attrType(%d)", i)
+	}
+
+	return attrTypeName[attrTypeIndex[i]:attrTypeIndex[i+1]]
+}
+
+// boolAttribute stores the on/off overrides of one attribute kind, keyed by path.
+type boolAttribute struct {
+	kind       attrType
+	matchers   []string
+	attributes map[string]bool
+}
+
+// regExpAttribute stores the value assigned to each compiled pattern of one attribute kind.
+type regExpAttribute struct {
+	matchers   []string
+	attributes map[*regexp.Regexp]string
+}
+
+// GitAttributes is a struct that contains two maps:
+// boolAttributes contains all the attributes that work like a boolean condition,
+// regExpAttributes contains all the attributes that match a regExp to choose if an attribute is applied or not
+type GitAttributes struct {
+	boolAttributes   map[attrType]boolAttribute
+	regExpAttributes map[attrType]regExpAttribute
+}
+
+// overrideError reports that a path received the same kind of attribute more than once.
+type overrideError struct {
+	attribute attrType
+	path      string
+}
+
+// Error implements the error interface.
+func (e *overrideError) Error() string {
+	return fmt.Sprintf("gitattributes: You are overriding a %v attribute of one of 
your previous lines %s\n", e.attribute, e.path)
+}
+
+// IsVendor reports whether path is a vendor path. An exact-path override loaded
+// from .gitattributes wins; otherwise the package-level IsVendor decides.
+func (gitAttrs *GitAttributes) IsVendor(path string) bool {
+	if val, ok := gitAttrs.boolAttributes[vendor].attributes[path]; ok {
+		return val
+	}
+
+	return IsVendor(path)
+}
+
+// IsDocumentation reports whether path is a documentation path. An exact-path override
+// loaded from .gitattributes wins; otherwise the package-level IsDocumentation decides.
+func (gitAttrs *GitAttributes) IsDocumentation(path string) bool {
+	if val, ok := gitAttrs.boolAttributes[documentation].attributes[path]; ok {
+		return val
+	}
+
+	return IsDocumentation(path)
+}
+
+// IsGenerated reports whether path was marked as generated in .gitattributes.
+// Paths without an exact-path entry are reported as not generated.
+func (gitAttrs *GitAttributes) IsGenerated(path string) bool {
+	if val, ok := gitAttrs.boolAttributes[generated].attributes[path]; ok {
+		return val
+	}
+
+	return false
+}
+
+// GetLanguage returns the language assigned to a language pattern that matches filename,
+// or OtherLanguage when no pattern matches.
+// NOTE(review): the patterns live in a map, so the winner is not deterministic when several match.
+func (gitAttrs *GitAttributes) GetLanguage(filename string) string {
+	for regExp, language := range gitAttrs.regExpAttributes[language].attributes {
+		if regExp.MatchString(filename) {
+			return language
+		}
+	}
+
+	return OtherLanguage
+}
+
+// NewGitAttributes returns an empty GitAttributes with the matcher strings of every
+// supported attribute kind already filled in.
+func NewGitAttributes() *GitAttributes {
+	gitAttrs := GitAttributes{
+		boolAttributes: map[attrType]boolAttribute{
+			vendor:        boolAttribute{matchers: []string{"linguist-vendored", "linguist-vendored=false"}, attributes: map[string]bool{}},
+			documentation: boolAttribute{matchers: []string{"linguist-documentation", "linguist-documentation=false"}, attributes: map[string]bool{}},
+			generated:     boolAttribute{matchers: []string{"linguist-generated", "linguist-generated=false"}, attributes: map[string]bool{}},
+		},
+		regExpAttributes: map[attrType]regExpAttribute{
+			language: regExpAttribute{matchers: []string{"linguist-language="}, attributes: map[*regexp.Regexp]string{}},
+		},
+	}
+
+	return &gitAttrs
+}
+
+// LoadGitAttributes reads and parses the file .gitattributes
+// which overrides the standard strategies.
+// path is prepended verbatim to every path or pattern found in the file.
+// It returns the errors that occurred while reading and parsing; entries that
+// parsed correctly are kept even when other lines fail.
+func (gitAttrs *GitAttributes) LoadGitAttributes(path string, reader io.Reader) []error {
+	rawAttributes, errArr := loadRawGitAttributes(reader)
+	if len(rawAttributes) == 0 {
+		// Nothing to parse, but a read error or malformed-line errors may
+		// already have been collected and must still reach the caller.
+		return errArr
+	}
+
+	return append(gitAttrs.parseAttributes(path, rawAttributes), errArr...)
+}
+
+// loadRawGitAttributes reads all of reader and groups the second field of every
+// two-field line under its first field. A read failure yields a nil map and that
+// single error; malformed lines are reported but do not stop the scan.
+func loadRawGitAttributes(reader io.Reader) (map[string][]string, []error) {
+	rawAttributes := map[string][]string{}
+	var errArr []error
+	data, err := ioutil.ReadAll(reader)
+	if err != nil {
+		errArr = append(errArr, err)
+		return nil, errArr
+	}
+
+	if len(data) > 0 {
+		lines := strings.Split(string(data), "\n")
+		for _, line := range lines {
+			err := loadLine(line, rawAttributes)
+			if err != nil {
+				errArr = append(errArr, err)
+			}
+		}
+	}
+
+	return rawAttributes, errArr
+}
+
+// loadLine splits line on whitespace and records a "path attribute" pair in
+// gitattributes. Blank lines are ignored; any other field count is an error.
+func loadLine(line string, gitattributes map[string][]string) error {
+	tokens := strings.Fields(line)
+	if len(tokens) == 2 {
+		gitattributes[tokens[0]] = append(gitattributes[tokens[0]], tokens[1])
+		return nil
+	} else if len(tokens) != 0 {
+		err := errors.New("gitattributes: Each line only can have a pair of elements E.g. 
path/to/file attribute")
+		return err
+	}
+
+	return nil
+}
+
+// parseAttributes feeds every (key, value) pair to parseAttribute, prefixing each
+// key with path, and collects the errors without stopping at the first one.
+func (gitAttrs *GitAttributes) parseAttributes(path string, attributes map[string][]string) []error {
+	errArray := []error{}
+	for key, values := range attributes {
+		for _, val := range values {
+			err := gitAttrs.parseAttribute(path+key, val)
+			if err != nil {
+				errArray = append(errArray, err)
+			}
+		}
+	}
+
+	return errArray
+}
+
+// matches reports whether str contains the first matcher registered for kind,
+// looking at the boolean attributes first and the regExp attributes second.
+func (gitAttrs *GitAttributes) matches(kind attrType, str string) bool {
+	if boolAttrs, ok := gitAttrs.boolAttributes[kind]; ok && strings.Contains(str, boolAttrs.matchers[0]) {
+		return true
+	} else if regExpAttrs, ok := gitAttrs.regExpAttributes[kind]; ok && strings.Contains(str, regExpAttrs.matchers[0]) {
+		return true
+	}
+
+	return false
+}
+
+// parseAttribute dispatches attribute to the handler of every kind it matches.
+// An attribute that matches no kind is ignored and yields a nil error.
+func (gitAttrs *GitAttributes) parseAttribute(key string, attribute string) error {
+	var err error
+	for kind := vendor; kind <= language; kind++ {
+		if gitAttrs.matches(kind, attribute) {
+			if kind < language {
+				err = gitAttrs.processBoolAttr(kind, key, attribute)
+			} else {
+				err = gitAttrs.processRegExpAttr(kind, key, attribute)
+			}
+		}
+	}
+
+	return err
+}
+
+// processBoolAttr stores true or false for key depending on which matcher of
+// kind equals attribute. The value is stored even when key already had one, in
+// which case an overrideError is returned; an unknown spelling is an error.
+func (gitAttrs *GitAttributes) processBoolAttr(kind attrType, key string, attribute string) error {
+	// The struct is copied, but its map field still refers to the shared map.
+	attrs := gitAttrs.boolAttributes[kind]
+
+	var err error
+	if _, ok := attrs.attributes[key]; ok {
+		err = &overrideError{attribute: kind, path: key}
+	}
+
+	switch attribute {
+	case attrs.matchers[0]:
+		attrs.attributes[key] = true
+	case attrs.matchers[1]:
+		attrs.attributes[key] = false
+	default:
+		err = fmt.Errorf("gitattributes: The matcher %s doesn't exists\n", attribute)
+	}
+
+	return err
+}
+
+// processRegExpAttr compiles regExpString and maps the resulting pattern to the
+// language written after "=" in attribute. A known alias is stored under its
+// canonical name; any other value is stored as written.
+func (gitAttrs *GitAttributes) processRegExpAttr(kind attrType, regExpString string, attribute string) error {
+	tokens := strings.SplitN(attribute, "=", 2)
+	regExp, err := regexp.Compile(regExpString)
+	if err != nil {
+		return err
+	}
+
+	lang, _ := 
GetLanguageByAlias(tokens[1])
+	if lang != OtherLanguage {
+		gitAttrs.regExpAttributes[kind].attributes[regExp] = lang
+	} else {
+		gitAttrs.regExpAttributes[kind].attributes[regExp] = tokens[1]
+	}
+
+	return nil
+}
diff --git a/gitattributes_test.go b/gitattributes_test.go
new file mode 100644
index 00000000..e55469d9
--- /dev/null
+++ b/gitattributes_test.go
@@ -0,0 +1,196 @@
+package enry
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestLoadGitAttributes loads a file mixing valid, overriding and malformed
+// lines and checks the number of errors and of stored entries per attribute kind.
+func (s *EnryTestSuite) TestLoadGitAttributes() {
+	gitAttrs := NewGitAttributes()
+	// An empty dir makes TempFile use the default temporary directory.
+	tmpGitAttributes, err := ioutil.TempFile("", "gitattributes")
+	assert.NoError(s.T(), err)
+	data := []byte("path linguist-vendored\n path/foo linguist-vendored=false\n path/vendor linguist-vendored=false \n path/foo linguist-documentation\n path/generated linguist-generated\n" +
+		"path/bar linguist-vendored=fail\n path/foo linguist-documentation=false\n path/bar not-a-matcher\n path/a linguist-documentation linguist-vendored")
+	tmpGitAttributes.Write(data)
+	tmpGitAttributes.Close()
+	reader, err := os.Open(tmpGitAttributes.Name())
+	assert.NoError(s.T(), err)
+	errArr := gitAttrs.LoadGitAttributes("test/", reader)
+	if len(errArr) != 3 {
+		fmt.Println(errArr)
+		s.Fail("The error length it's not the expected")
+	}
+
+	// The slice index doubles as the attrType under test.
+	tests := []struct {
+		name     string
+		expected int
+	}{
+		{name: "TestLoadGitAttributes_1", expected: 3},
+		{name: "TestLoadGitAttributes_2", expected: 1},
+		{name: "TestLoadGitAttributes_3", expected: 1},
+		{name: "TestLoadGitAttributes_4", expected: 0},
+	}
+
+	for i, test := range tests {
+		if attrType(i) < language {
+			assert.Equal(s.T(), len(gitAttrs.boolAttributes[attrType(i)].attributes), test.expected, fmt.Sprintf("%v: is = %v, expected: %v", test.name, len(gitAttrs.boolAttributes[attrType(i)].attributes), test.expected))
+		} else {
+			assert.Equal(s.T(), len(gitAttrs.regExpAttributes[attrType(i)].attributes), test.expected, fmt.Sprintf("%v: is = %v, expected: %v", test.name, 
len(gitAttrs.regExpAttributes[attrType(i)].attributes), test.expected))
+		}
+	}
+
+	err = os.RemoveAll(tmpGitAttributes.Name())
+	assert.NoError(s.T(), err)
+}
+
+// TestLoadGitAttributesEmpty checks that loading an empty file reports no errors.
+func (s *EnryTestSuite) TestLoadGitAttributesEmpty() {
+	gitAttrs := NewGitAttributes()
+	// An empty dir makes TempFile use the default temporary directory.
+	tmpGitAttributes, err := ioutil.TempFile("", "gitattributes")
+	assert.NoError(s.T(), err)
+	tmpGitAttributes.Close()
+	reader, err := os.Open(tmpGitAttributes.Name())
+	assert.NoError(s.T(), err)
+	errArr := gitAttrs.LoadGitAttributes("test/", reader)
+	if len(errArr) != 0 {
+		fmt.Println(errArr)
+		s.Fail("The error length it's not the expected")
+	}
+
+	// Remove the temporary file, as the other tests in this file do.
+	err = os.RemoveAll(tmpGitAttributes.Name())
+	assert.NoError(s.T(), err)
+}
+
+// TestIsVendorGitAttributes checks that exact-path overrides win and that other
+// paths fall back to the package-level IsVendor.
+func (s *EnryTestSuite) TestIsVendorGitAttributes() {
+	gitAttrs := NewGitAttributes()
+	tmpGitAttributes, err := ioutil.TempFile("", "gitattributes")
+	assert.NoError(s.T(), err)
+	data := []byte("path linguist-vendored\n path/foo linguist-vendored=false\n path/vendor linguist-vendored=false")
+	tmpGitAttributes.Write(data)
+	tmpGitAttributes.Close()
+	reader, err := os.Open(tmpGitAttributes.Name())
+	assert.NoError(s.T(), err)
+	errArr := gitAttrs.LoadGitAttributes("", reader)
+	if len(errArr) != 0 {
+		fmt.Println(errArr)
+		s.Fail("The error length it's not the expected")
+	}
+
+	tests := []struct {
+		name     string
+		path     string
+		expected bool
+	}{
+		{name: "TestIsVendorGitAttributes_1", path: "path", expected: true},
+		{name: "TestIsVendorGitAttributes_2", path: "path/foo", expected: false},
+		{name: "TestIsVendorGitAttributes_3", path: "path/vendor", expected: false},
+		{name: "TestIsVendorGitAttributes_4", path: "vendor/", expected: true},
+		{name: "TestIsVendorGitAttributes_5", path: "dir/", expected: false},
+	}
+	for _, test := range tests {
+		is := gitAttrs.IsVendor(test.path)
+		assert.Equal(s.T(), is, test.expected, fmt.Sprintf("%v: is = %v, expected: %v", test.name, is, test.expected))
+	}
+
+	err = os.RemoveAll(tmpGitAttributes.Name())
+	assert.NoError(s.T(), err)
+}
+
+// TestIsDocumentationGitAttributes checks that exact-path overrides win and that
+// other paths fall back to the package-level IsDocumentation.
+func (s *EnryTestSuite) TestIsDocumentationGitAttributes() {
+	gitAttrs := NewGitAttributes()
+	// An empty dir makes TempFile use the default temporary directory.
+	tmpGitAttributes, err := ioutil.TempFile("", "gitattributes")
+	assert.NoError(s.T(), err)
+	data := []byte("path linguist-documentation\n path/foo linguist-documentation=false\n path/documentation linguist-vendored=false")
+	tmpGitAttributes.Write(data)
+	tmpGitAttributes.Close()
+	reader, err := os.Open(tmpGitAttributes.Name())
+	assert.NoError(s.T(), err)
+	errArr := gitAttrs.LoadGitAttributes("", reader)
+	if len(errArr) != 0 {
+		fmt.Println(errArr)
+		s.Fail("The error length it's not the expected")
+	}
+
+	tests := []struct {
+		name     string
+		path     string
+		expected bool
+	}{
+		{name: "TestIsDocumentationGitAttributes_1", path: "path", expected: true},
+		{name: "TestIsDocumentationGitAttributes_2", path: "path/foo", expected: false},
+		{name: "TestIsDocumentationGitAttributes_3", path: "path/documentation", expected: false},
+		{name: "TestIsDocumentationGitAttributes_4", path: "README", expected: true},
+		{name: "TestIsDocumentationGitAttributes_5", path: "dir/", expected: false},
+	}
+	for _, test := range tests {
+		is := gitAttrs.IsDocumentation(test.path)
+		assert.Equal(s.T(), is, test.expected, fmt.Sprintf("%v: is = %v, expected: %v", test.name, is, test.expected))
+	}
+
+	err = os.RemoveAll(tmpGitAttributes.Name())
+	assert.NoError(s.T(), err)
+}
+
+// TestIsGeneratedGitAttributes checks that only paths explicitly marked in
+// .gitattributes are reported as generated.
+func (s *EnryTestSuite) TestIsGeneratedGitAttributes() {
+	gitAttrs := NewGitAttributes()
+	tmpGitAttributes, err := ioutil.TempFile("", "gitattributes")
+	assert.NoError(s.T(), err)
+	data := []byte("path linguist-generated\n path/foo linguist-generated=false\n path/generated linguist-generated=false")
+	tmpGitAttributes.Write(data)
+	tmpGitAttributes.Close()
+	reader, err := os.Open(tmpGitAttributes.Name())
+	assert.NoError(s.T(), err)
+	errArr := gitAttrs.LoadGitAttributes("", reader)
+	if len(errArr) != 0 {
+		fmt.Println(errArr)
+		s.Fail("The error length it's not the expected")
+	}
+
+	tests := []struct {
+		name     string
+		path     string
+		expected bool
+	}{
+		{name: "TestIsGeneratedGitAttributes_1", 
path: "path", expected: true},
+		{name: "TestIsGeneratedGitAttributes_2", path: "path/foo", expected: false},
+		{name: "TestIsGeneratedGitAttributes_3", path: "path/generated", expected: false},
+		{name: "TestIsGeneratedGitAttributes_4", path: "path2", expected: false},
+	}
+	for _, test := range tests {
+		is := gitAttrs.IsGenerated(test.path)
+		assert.Equal(s.T(), is, test.expected, fmt.Sprintf("%v: is = %v, expected: %v", test.name, is, test.expected))
+	}
+
+	err = os.RemoveAll(tmpGitAttributes.Name())
+	assert.NoError(s.T(), err)
+}
+
+// TestGetLanguageGitAttributes checks that linguist-language patterns assign the
+// canonical name of a known alias, keep an unknown name as written, and leave
+// unmatched files unclassified.
+func (s *EnryTestSuite) TestGetLanguageGitAttributes() {
+	gitAttrs := NewGitAttributes()
+	// An empty dir makes TempFile use the default temporary directory.
+	tmpGitAttributes, err := ioutil.TempFile("", "gitattributes")
+	assert.NoError(s.T(), err)
+	data := []byte(".*\\.go linguist-language=GO\n path/not-java/.*\\.java linguist-language=notJava\n")
+	tmpGitAttributes.Write(data)
+	tmpGitAttributes.Close()
+	reader, err := os.Open(tmpGitAttributes.Name())
+	assert.NoError(s.T(), err)
+	errArr := gitAttrs.LoadGitAttributes("", reader)
+	if len(errArr) != 0 {
+		fmt.Println(errArr)
+		s.Fail("The error length it's not the expected")
+	}
+
+	tests := []struct {
+		name     string
+		path     string
+		expected string
+	}{
+		{name: "TestGetLanguageGitAttributes_1", path: "path/files/a.go", expected: "Go"},
+		{name: "TestGetLanguageGitAttributes_2", path: "path/files/subdir/b.go", expected: "Go"},
+		{name: "TestGetLanguageGitAttributes_3", path: "path/not-java/c.java", expected: "notJava"},
+		{name: "TestGetLanguageGitAttributes_4", path: "path/d.py", expected: ""},
+	}
+
+	for _, test := range tests {
+		is := gitAttrs.GetLanguage(test.path)
+		assert.Equal(s.T(), is, test.expected, fmt.Sprintf("%v: is = %v, expected: %v", test.name, is, test.expected))
+	}
+
+	// Remove the temporary file, as the other tests in this file do.
+	err = os.RemoveAll(tmpGitAttributes.Name())
+	assert.NoError(s.T(), err)
+}
diff --git a/utils_test.go b/utils_test.go
index b1d1e1f5..db9d95d6 100644
--- a/utils_test.go
+++ b/utils_test.go
@@ -79,3 +79,35 @@ func (s *EnryTestSuite) TestIsBinary() {
 		assert.Equal(s.T(), is, test.expected, fmt.Sprintf("%v: is = %v, expected: 
%v", test.name, is, test.expected))
 	}
 }
+
+// TestIsDot checks that IsDotFile recognises only files whose base name starts with a dot.
+func (s *EnryTestSuite) TestIsDot() {
+	tests := []struct {
+		name     string
+		path     string
+		expected bool
+	}{
+		{name: "TestIsDot_1", path: "foo/var/.dotfile", expected: true},
+		{name: "TestIsDot_2", path: "foo/var/file", expected: false},
+		{name: "TestIsDot_3", path: "foo/var/file.dot", expected: false},
+	}
+	for _, test := range tests {
+		is := IsDotFile(test.path)
+		assert.Equal(s.T(), is, test.expected, fmt.Sprintf("%v: is = %v, expected: %v", test.name, is, test.expected))
+	}
+}
+
+// TestIsAuxiliaryLanguage checks IsAuxiliaryLanguage against two data languages and one programming language.
+func (s *EnryTestSuite) TestIsAuxiliaryLanguage() {
+	tests := []struct {
+		name     string
+		lang     string
+		expected bool
+	}{
+		{name: "TestIsAuxilaryLang_1", lang: "YAML", expected: true},
+		{name: "TestIsAuxilaryLang_2", lang: "Go", expected: false},
+		{name: "TestIsAuxilaryLang_3", lang: "JSON", expected: true},
+	}
+	for _, test := range tests {
+		is := IsAuxiliaryLanguage(test.lang)
+		assert.Equal(s.T(), is, test.expected, fmt.Sprintf("%v: is = %v, expected: %v", test.name, is, test.expected))
+	}
+}