Skip to content

Commit

Permalink
Merge pull request #33337 from github/repo-sync
Browse files Browse the repository at this point in the history
Repo sync
  • Loading branch information
docs-bot authored Jun 4, 2024
2 parents 7650023 + a518d9d commit b5714a0
Show file tree
Hide file tree
Showing 7 changed files with 448 additions and 16 deletions.
23 changes: 23 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@
"dependencies": {
"@elastic/elasticsearch": "8.13.1",
"@github/failbot": "0.8.3",
"@horizon-rs/language-guesser": "0.1.1",
"@octokit/plugin-retry": "6.0.1",
"@octokit/request-error": "6.1.1",
"@primer/behaviors": "^1.5.1",
Expand All @@ -216,6 +217,7 @@
"connect-datadog": "0.0.9",
"connect-timeout": "1.9.0",
"cookie-parser": "^1.4.6",
"cuss": "2.2.0",
"dayjs": "^1.11.3",
"dotenv": "^16.4.5",
"escape-string-regexp": "5.0.0",
Expand Down Expand Up @@ -313,6 +315,7 @@
"commander": "^12.1.0",
"cross-env": "^7.0.3",
"csp-parse": "0.0.2",
"csv-parse": "5.5.6",
"eslint": "8.57.0",
"eslint-config-prettier": "9.1.0",
"eslint-config-standard": "17.1.0",
Expand Down
158 changes: 158 additions & 0 deletions src/events/analyze-comment.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import { cuss } from 'cuss'
import { cuss as cussPt } from 'cuss/pt'
import { cuss as cussFr } from 'cuss/fr'
import { cuss as cussEs } from 'cuss/es'
import { Language } from '@horizon-rs/language-guesser'

// Single shared guesser instance; isNotLanguage() calls `guessBest()` on it
// to guess which language a comment is written in.
const language = new Language()

/**
 * Ordered table of the heuristics ("signals") applied to a survey comment.
 *
 * Every entry carries:
 * - `reduction`: the amount subtracted from the rating when the signal fires
 * - `name`: the identifier reported for the signal
 * - `validator`: called as `validator(comment, language)`; a truthy result
 *   means the signal fires
 *
 * Exported for the debugging CLI script.
 */
export const SIGNAL_RATINGS = [
  { reduction: 1.0, name: 'email-only', validator: isEmailOnly },
  { reduction: 0.2, name: 'contains-email', validator: isContainingEmail },
  { reduction: 0.1, name: 'url-only', validator: isURL },
  { reduction: 0.1, name: 'numbers-only', validator: isNumbersOnly },
  { reduction: 0.1, name: 'all-uppercase', validator: isAllUppercase },
  { reduction: 0.1, name: 'too-short', validator: isTooShort },
  { reduction: 0.2, name: 'not-language', validator: isNotLanguage },
  {
    reduction: 0.3,
    name: 'cuss-words-likely',
    // Wrapped so that only (comment, language) is ever forwarded and the
    // function's third parameter keeps its default value.
    validator: (comment, language) => isLikelyCussWords(comment, language),
  },
  { reduction: 0.1, name: 'cuss-words-maybe', validator: isMaybeCussWords },
]

/**
 * Rates a comment by running it through every entry of SIGNAL_RATINGS.
 *
 * The rating starts at 1.0. For each signal whose validator returns a truthy
 * value, the signal's name is recorded and its `reduction` is subtracted.
 * No further signals are evaluated once the rating is zero or below.
 *
 * @param text - the comment to analyze
 * @param language - language code handed to each validator (default 'en')
 * @returns an object with `signals` (names of the signals that fired, in
 *   table order) and `rating` (what is left of the initial 1.0)
 */
export async function analyzeComment(text, language = 'en') {
  let rating = 1.0
  const signals = []
  for (const { reduction, name, validator } of SIGNAL_RATINGS) {
    // The rating is already used up; skip the remaining validators.
    if (rating <= 0) break
    const fired = validator(text, language)
    if (!fired) continue
    signals.push(name)
    rating -= reduction
  }

  return { signals, rating }
}

/**
 * Tells whether the whole comment looks like one email address and nothing
 * else.
 *
 * This is a deliberately simple shape check, not an RFC-grade validator:
 * after trimming, the text must contain no whitespace, must not contain
 * `://` (so URLs with credentials are not mistaken for emails) and must have
 * exactly one `@` with something on both sides of it. Requiring both sides
 * keeps a lone `@` or a handle such as `@octocat` from being treated as an
 * email address.
 *
 * @param text - the comment (or a single word of it) to inspect
 * @returns true if the text has the shape of a lone email address,
 *   otherwise false
 */
function isEmailOnly(text) {
  const trimmed = text.trim()
  if (/\s/.test(trimmed) || trimmed.includes('://')) return false

  const parts = trimmed.split('@')
  // Exactly one '@' yields exactly two parts.
  if (parts.length !== 2) return false

  const [localPart, domain] = parts
  return localPart.length > 0 && domain.length > 0
}

/**
 * Tells whether the comment has an email address somewhere inside it,
 * without the comment being nothing but that email address.
 *
 * @param text - the comment to inspect
 * @returns true if at least one whitespace-separated word of the comment
 *   passes isEmailOnly() while the comment as a whole does not
 */
function isContainingEmail(text) {
  const worthScanning = text.includes('@') && !isEmailOnly(text)
  if (!worthScanning) return false

  // Split on whitespace rather than with splitWords(), because splitWords()
  // would break `foo@example.com` up into ['foo', 'example.com'].
  const words = text.split(/\s+/g)
  return words.some((word) => isEmailOnly(word))
}

/**
 * Tells whether the whole comment is a single URL and nothing else.
 *
 * @param text - the comment to inspect
 * @returns true if the trimmed text contains no whitespace and parses as a
 *   URL, otherwise false
 */
function isURL(text) {
  const candidate = text.trim()
  // Reject every kind of inner whitespace, not just the space character:
  // the URL parser silently strips tabs and newlines from its input, so a
  // multi-line comment such as 'https://example.com\nthanks' would otherwise
  // still parse as one URL.
  if (/\s/.test(candidate)) return false
  return URL.canParse(candidate)
}

/**
 * Tells whether the comment consists of digits only, ignoring any
 * whitespace between or around them.
 *
 * @param text - the comment to inspect
 * @returns true if at least one digit, and nothing but digits, remains once
 *   all whitespace is removed
 */
function isNumbersOnly(text) {
  const withoutWhitespace = text.replace(/\s/g, '')
  return /^\d+$/.test(withoutWhitespace)
}

/**
 * Tells whether the comment is written entirely in upper case.
 *
 * @param text - the comment to inspect
 * @returns true if upper-casing the text changes nothing and the text
 *   contains at least one letter in the A-Z range
 */
function isAllUppercase(text) {
  // Anything that changes when upper-cased was not all upper case.
  if (text !== text.toUpperCase()) return false
  // Require an actual A-Z letter so that e.g. '123' does not qualify.
  return /[A-Z]/.test(text)
}

/**
 * Tells whether the comment is at most one whitespace-separated word long.
 *
 * Note that this does not exclude comments that are numbers-only, a URL, an
 * email address or all upper case, so a one-word comment can fire this
 * signal in addition to one of those.
 *
 * @param text - the comment to inspect
 * @returns true if the trimmed text holds one word or is empty, otherwise
 *   false
 */
function isTooShort(text) {
  const words = text.trim().split(/\s+/)
  return words.length <= 1
}

/**
 * Signals that the comment does not appear to be written in the expected
 * language.
 *
 * @param text - the comment to inspect
 * @param language_ - the expected language code; compared against the
 *   `alpha2` property of the guess
 * @returns true if no guess could be made or the guess differs from
 *   `language_`
 */
function isNotLanguage(text, language_) {
  const guess = language.guessBest(text.trim())

  // @horizon-rs/language-guesser is based on tri-grams, which can lead to
  // false positives. For example, it thinks that 'Thamk you ❤️🙏' is Haitian
  // and that 'I wanne robux 1000' is Polish. Those texts are short, so there
  // are not enough clues to guess the language from, even though a human
  // reader would recognize them as attempts at English despite the spelling.
  // But are they useful comments? Since this is only a signal and not a hard
  // blocker, treat the result as a clue rather than a fact.
  if (guess) return guess.alpha2 !== language_

  // No guess at all; can happen if the text is just whitespace.
  return true
}

/**
 * Picks the cuss word list to use for a language.
 *
 * @param lang - language code
 * @returns the Portuguese, French or Spanish list for 'pt', 'fr' or 'es';
 *   the default list exported by `cuss` for anything else
 */
function getCussWords(lang) {
  if (lang === 'pt') return cussPt
  if (lang === 'fr') return cussFr
  if (lang === 'es') return cussEs
  return cuss
}

/**
 * Tells whether the comment contains a word that the cuss word list for the
 * given language rates exactly `rating`.
 *
 * Words are lower-cased before the lookup because the word lists are keyed
 * by lower-case words; without that, 'Damn' or 'DAMN' would never match.
 *
 * @param text - the comment to inspect
 * @param language_ - language code used to pick the word list
 * @param rating - the rating a word must have to count (default 2)
 * @returns true if at least one word of the comment has that rating,
 *   otherwise false
 */
function isLikelyCussWords(text, language_, rating = 2) {
  const cussWords = getCussWords(language_)
  return splitWords(text).some((word) => {
    const wordRating = cussWords[word.toLowerCase()]
    // The truthiness check skips unknown words (undefined).
    return Boolean(wordRating) && wordRating === rating
  })
}

// Same word-list check as isLikelyCussWords(), but looks for words rated 1
// instead of that function's default rating of 2.
function isMaybeCussWords(text, language_) {
return isLikelyCussWords(text, language_, 1)
}

// One word-granularity segmenter, created once and reused for every call.
const segmenter = new Intl.Segmenter([], { granularity: 'word' })

/**
 * Splits a text into its words.
 *
 * @param text - the text to split
 * @returns the segments the segmenter marks as word-like, in order of
 *   appearance; punctuation and whitespace segments are left out
 */
function splitWords(text) {
  const words = []
  for (const { segment, isWordLike } of segmenter.segment(text)) {
    if (isWordLike) words.push(segment)
  }
  return words
}
14 changes: 3 additions & 11 deletions src/events/middleware.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { noCacheControl } from '#src/frame/middleware/cache-control.js'
import { getJsonValidator } from '#src/tests/lib/validate-json-schema.js'
import { formatErrors } from './lib/middleware-errors.js'
import { publish as _publish } from './lib/hydro.js'
import { analyzeComment } from './analyze-comment.js'

const router = express.Router()
const OMIT_FIELDS = ['type']
Expand Down Expand Up @@ -90,18 +91,9 @@ router.post(
return res.status(400).json({ message: 'Empty comment' })
}

const signals = []
const rating = 1.0
const { rating } = await analyzeComment(comment, locale)

// if (comment.includes('@') && !comment.includes(' ')) {
// // XXX Make it a simple email validator
// signals.push({
// email: 'Looks like an email address',
// })
// rating -= 0.1
// }

return res.json({ rating, signals })
return res.json({ rating })
}),
)

Expand Down
100 changes: 100 additions & 0 deletions src/events/scripts/analyze-comments-csv.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/**
* This script is used to analyze posted survey comments in a CSV file.
* The CSV file is expected to have come from the Azure Data Explorer
* after having queried the `docs_v0_survey_event` table.
*
*
*/

import fs from 'node:fs'
import util from 'node:util'

import chalk from 'chalk'
import { parse } from 'csv-parse'
import { program } from 'commander'

import { SIGNAL_RATINGS } from '../analyze-comment'

// Command-line options, as handed to `main` and passed on to `analyzeFile`.
type Options = {
// Value of `-o, --output-file` (default 'stdout').
// NOTE(review): nothing in this file reads it — confirm whether it is still needed.
outputFile: string
// Value of `--limit`, kept as a string (default 'Infinity'); it is
// converted to a number in `analyzeFile`.
limit: string
// Set by `--random`; when true, `analyzeFile` shuffles the records before
// the limit is applied.
random: boolean
}
// Declare the CLI: its options, the variadic <csv-files...> argument, and
// `main` as the action to run.
program
.description('Analyze survey comments in a CSV file')
.option('-o, --output-file <path>', 'path to the output', 'stdout')
.option('--limit <number>', 'limit number of records analyzed', 'Infinity')
.option(
'--random',
'randomize the lines analyzed (useful when limit is less than size of CSV)',
false,
)
.argument('<csv-files...>', 'path to the exported CSV file')
.action(main)

// Parse the actual command line; commander then runs the action registered
// above.
program.parse(process.argv)

/**
 * Action registered on the commander program: analyzes every CSV file given
 * on the command line, one after the other.
 *
 * @param csvFile - the paths of all CSV files to analyze (an array, despite
 *   the singular name)
 * @param options - the parsed command-line options, forwarded unchanged
 */
async function main(csvFile: string[], options: Options) {
  for (const path of csvFile) {
    // Awaited one at a time so the output of different files never interleaves.
    await analyzeFile(path, options)
  }
}

/**
 * One CSV row, keyed by the column names taken from the header row.
 *
 * Deliberately not called `Record`, which would shadow TypeScript's built-in
 * `Record<K, V>` utility type inside this module.
 */
type SurveyRecord = {
  [key: string]: string
}

/**
 * Reads one CSV file and prints, for each analyzed row, which signals from
 * SIGNAL_RATINGS fire on its `survey_comment` column, followed by the
 * resulting rating.
 *
 * The first row of the file is used as the header row; every later row
 * becomes an object keyed by those headers. The language handed to the
 * validators comes from the `survey_comment_language` column.
 *
 * @param csvFile - path of the CSV file to read
 * @param options - `limit` caps how many rows are analyzed and `random`
 *   shuffles the rows before that cap is applied
 * @throws Error if `options.limit` is not a non-negative number
 */
async function analyzeFile(csvFile: string, options: Options) {
  const parser = fs.createReadStream(csvFile).pipe(
    parse({
      // Needed when parsing CSVs from the Azure Data Explorer
      bom: true,
    }),
  )
  let headers: null | string[] = null
  const records: SurveyRecord[] = []
  for await (const record of parser) {
    if (headers === null) {
      headers = record as string[]
    } else {
      const obj: SurveyRecord = {}
      headers.forEach((header, i) => {
        obj[header] = record[i]
      })
      records.push(obj)
    }
  }

  // `Number()` rather than `parseInt()`: parseInt('Infinity') is NaN and
  // `slice(0, NaN)` is an empty array, so with the default limit nothing
  // would be analyzed at all.
  const limit = Number(options.limit)
  if (Number.isNaN(limit) || limit < 0) {
    throw new Error(`Invalid --limit value: ${options.limit}`)
  }

  // Sorting with a random comparator produces a biased order. Instead, give
  // every record its own random key and sort by that key.
  const ordered = options.random
    ? records
        .map((record) => ({ record, key: Math.random() }))
        .sort((a, b) => a.key - b.key)
        .map(({ record }) => record)
    : records

  for (const record of ordered.slice(0, limit)) {
    // `||` on purpose: an empty language cell should fall back to 'en' too.
    const language = record.survey_comment_language || 'en'
    let rating = 1.0
    let first = true
    for (const { reduction, name, validator } of SIGNAL_RATINGS) {
      const hit = validator(record.survey_comment, language)
      if (hit) {
        rating -= reduction
        // Print the comment itself once, just before its first signal.
        if (first) {
          console.log(util.inspect(record.survey_comment))
          first = false
        }
        console.log(name.padEnd(10), reduction)
        if (rating <= 0.0) {
          break
        }
      }
    }
    if (rating !== 1.0) {
      console.log(chalk.yellow(`Rating: ${rating}`))
    } else {
      console.log(chalk.green('No rating reduction'))
    }

    console.log('\n')
  }
}
Loading

0 comments on commit b5714a0

Please sign in to comment.