diff --git a/src/lib/philomena/search/QueryLexer.ts b/src/lib/philomena/search/QueryLexer.ts index 76d2d5f..b3fa598 100644 --- a/src/lib/philomena/search/QueryLexer.ts +++ b/src/lib/philomena/search/QueryLexer.ts @@ -41,21 +41,27 @@ export class QuotedTermToken extends Token { } static decode(value: string): string { - return value.replace(/\\([\\"])/g, "$1"); + return value + .replaceAll(/\\([\\"])/g, "$1") + .replaceAll(/^"|"$/g, ''); } static encode(value: string): string { - return value.replace(/[\\"]/g, "\\$&"); + return `"${value.replaceAll(/[\\"]/g, "\\$&")}"`; } } export class TermToken extends Token { } -type MatchResultCarry = { +interface MatchResultCarry { match?: RegExpMatchArray | null } +interface SuccessfulMatchResultCarry { + match: RegExpMatchArray; +} + /** * Search query tokenizer. Should mostly work for the cases of parsing and finding the selected term for * auto-completion. Follows the rules described in the Philomena booru engine. @@ -94,26 +100,26 @@ export class QueryLexer { } if (this.#match(QueryLexer.#negotiationOperator, result)) { - tokens.push(new NotToken(this.#index, result.match![0])); - this.#index += result.match![0].length; + tokens.push(new NotToken(this.#index, result.match[0])); + this.#index += result.match[0].length; continue; } if (this.#match(QueryLexer.#andOperator, result)) { - tokens.push(new AndToken(this.#index, result.match![0])); - this.#index += result.match![0].length; + tokens.push(new AndToken(this.#index, result.match[0])); + this.#index += result.match[0].length; continue; } if (this.#match(QueryLexer.#orOperator, result)) { - tokens.push(new OrToken(this.#index, result.match![0])); - this.#index += result.match![0].length; + tokens.push(new OrToken(this.#index, result.match[0])); + this.#index += result.match[0].length; continue; } if (this.#match(QueryLexer.#notOperator, result)) { - tokens.push(new NotToken(this.#index, result.match![0])); - this.#index += result.match![0].length; + tokens.push(new NotToken(this.#index, result.match[0])); + this.#index += result.match[0].length; continue; } @@ -130,26 +136,26 @@ export class QueryLexer { } if (this.#match(QueryLexer.#boostOperator, result)) { - tokens.push(new BoostToken(this.#index, result.match![0])); - this.#index += result.match![0].length; + tokens.push(new BoostToken(this.#index, result.match[0])); + this.#index += result.match[0].length; continue; } if (this.#match(QueryLexer.#whitespaces, result)) { - this.#index += result.match![0].length; + this.#index += result.match[0].length; continue; } if (this.#match(QueryLexer.#quotedText, result)) { - tokens.push(new QuotedTermToken(this.#index, result.match![0], result.match![1])); - this.#index += result.match![0].length; + tokens.push(new QuotedTermToken(this.#index, result.match[0], result.match[1])); + this.#index += result.match[0].length; continue; } dirtyText = this.#parseDirtyText(this.#index); if (dirtyText) { - tokens.push(new TermToken(this.#index, dirtyText)); + tokens.push(new TermToken(this.#index, dirtyText.trim())); this.#index += dirtyText.length; continue; } @@ -168,7 +174,7 @@ export class QueryLexer { * * @return Is there a match? */ - #match(targetRegExp: RegExp, resultCarrier: MatchResultCarry = {}): boolean { + #match(targetRegExp: RegExp, resultCarrier: MatchResultCarry = {}): resultCarrier is SuccessfulMatchResultCarry { return this.#matchAt(targetRegExp, this.#index, resultCarrier); } @@ -181,9 +187,9 @@ export class QueryLexer { * * @return Is there a match? */ - #matchAt(targetRegExp: RegExp, index: number, resultCarrier: MatchResultCarry = {}): boolean { + #matchAt(targetRegExp: RegExp, index: number, resultCarrier: MatchResultCarry = {}): resultCarrier is SuccessfulMatchResultCarry { targetRegExp.lastIndex = index; - resultCarrier.match = this.#value.match(targetRegExp); + resultCarrier.match = targetRegExp.exec(this.#value); return resultCarrier.match !== null; } @@ -207,16 +213,10 @@ export class QueryLexer { break; } - if (this.#matchAt(QueryLexer.#dirtyTextContent, index, result)) { - resultValue += result.match![0]; - index += result.match![0].length; - continue; - } - if (this.#value[index] === QueryLexer.#bracketsOpenCharacter) { let bracketsContent = QueryLexer.#bracketsOpenCharacter + this.#parseDirtyText(index + 1); - if (this.#value[index + bracketsContent.length + 1] === QueryLexer.#bracketsCloseCharacter) { + if (this.#value[index + bracketsContent.length] === QueryLexer.#bracketsCloseCharacter) { bracketsContent += QueryLexer.#bracketsCloseCharacter; } @@ -227,22 +227,28 @@ export class QueryLexer { continue; } + if (this.#matchAt(QueryLexer.#dirtyTextContent, index, result)) { + resultValue += result.match[0]; + index += result.match[0].length; + continue; + } + break; } return resultValue; } - static #commaCharacter = ','; - static #negotiationOperator = /[!-]/y; - static #andOperator = /\s+(?:AND|&&)\s+/y; - static #orOperator = /\s+(?:OR|\|\|)\s+/y; - static #notOperator = /NOT\s+/y; - static #bracketsOpenCharacter = "("; - static #bracketsCloseCharacter = ")"; - static #boostOperator = /\^[+-]?\d+(?:\.\d+)?/y; - static #whitespaces = /\s+/y; - static #quotedText = /"((?:\\.|[^\\"])+)"/y; - static #dirtyTextStopWords = /,|\s+(?:AND|&&|OR|\|\|)\s+|\s+(?:\)|\^[+-]?\d+(?:\.\d+)?)/y; - static #dirtyTextContent = /\\.|[^()]/y; + static readonly #commaCharacter = ','; + static readonly #negotiationOperator = /[!-]/y; + static readonly #andOperator = /\s+(?:AND|&&)\s+/y; + static readonly #orOperator = /\s+(?:OR|\|\|)\s+/y; + static readonly #notOperator = /NOT\s+/y; + static readonly #bracketsOpenCharacter = "("; + static readonly #bracketsCloseCharacter = ")"; + static readonly #boostOperator = /\^[+-]?\d+(?:\.\d+)?/y; + static readonly #whitespaces = /\s+/y; + static readonly #quotedText = /"\s*((?:\\.|[^\\"])+?)\s*"/y; + static readonly #dirtyTextStopWords = /,|\s+(?:AND|&&|OR|\|\|)\s+|\s*(?:\)|\^[+-]?\d+(?:\.\d+)?)/y; + static readonly #dirtyTextContent = /\\.|[^()]/y; } diff --git a/src/lib/philomena/tag-utils.ts b/src/lib/philomena/tag-utils.ts index 0890a2c..91b5464 100644 --- a/src/lib/philomena/tag-utils.ts +++ b/src/lib/philomena/tag-utils.ts @@ -42,7 +42,7 @@ const tagLinkRegExp = /\/tags\/(?[^/?#]+)/; * * @see https://github.com/philomena-dev/philomena/blob/6086757b654da8792ae52adb2a2f501ea6c30d12/lib/philomena/slug.ex#L52-L57 */ -const slugEncodedCharacters: Map = new Map([ +export const slugEncodedCharacters: Map = new Map([ ['-dash-', '-'], ['-fwslash-', '/'], ['-bwslash-', '\\'], @@ -101,9 +101,8 @@ export function resolveTagNameFromLink(tagLink: URL): string | null { } return decodeURIComponent(encodedTagName) - .replaceAll(/-[a-z]+-/gi, match => slugEncodedCharacters.get(match) ?? match) - .replaceAll('-', ' ') - .replaceAll('+', ' '); + .replaceAll('+', ' ') + .replaceAll(/-[a-z]+-/gi, match => slugEncodedCharacters.get(match) ?? match); } /** diff --git a/tests/lib/philomena/search/QueryLexer.spec.ts b/tests/lib/philomena/search/QueryLexer.spec.ts new file mode 100644 index 0000000..125708a --- /dev/null +++ b/tests/lib/philomena/search/QueryLexer.spec.ts @@ -0,0 +1,111 @@ +import { + AndToken, + BoostToken, + GroupEndToken, + GroupStartToken, + NotToken, + OrToken, + QueryLexer, + QuotedTermToken, + TermToken, + Token +} from "$lib/philomena/search/QueryLexer"; + +describe('QueryLexer', () => { + function parseQuery(query: string): Token[] { + return new QueryLexer(query).parse(); + } + + function parseQueryTypes(query: string): (typeof Token)[] { + return parseQuery(query) + .map(term => (term.constructor as any) as typeof Token); + } + + it('should properly parse different kinds of queries', () => { + expect(parseQueryTypes('safe')).toEqual([TermToken]); + expect(parseQueryTypes('safe^1')).toEqual([TermToken, BoostToken]); + expect(parseQueryTypes('safe, avali')).toEqual([TermToken, AndToken, TermToken]); + expect(parseQueryTypes('!avali')).toEqual([NotToken, TermToken]); + expect(parseQueryTypes('avali || 4 ears')).toEqual([TermToken, OrToken, TermToken]); + expect(parseQueryTypes('avali && !4 ears')).toEqual([TermToken, AndToken, NotToken, TermToken]); + + expect(parseQueryTypes('avali AND (NOT 4 ears OR -3 fingers)')).toEqual([ + TermToken, AndToken, GroupStartToken, NotToken, TermToken, OrToken, NotToken, TermToken, GroupEndToken, + ]); + }); + + it('should not treat parentheses as groups inside the term', () => { + expect(parseQueryTypes('!(experiment (casualties unknown) || milky (casualties unknown))')).toEqual([ + NotToken, GroupStartToken, TermToken, OrToken, TermToken, GroupEndToken, + ]); + }); + + it('should accept any amount of whitespaces between different tokens', () => { + expect(parseQueryTypes('! ( avali , experiment (casualties unknown) ) && safe')).toEqual([ + NotToken, GroupStartToken, TermToken, AndToken, TermToken, GroupEndToken, AndToken, TermToken, + ]); + }); + + it('should trim whitespaces inside the terms, even in quoted ones', () => { + const [termWithSpaces] = parseQuery(' avali '); + expect(termWithSpaces.value).toBe('avali'); + + const [quotedTermWithSpaces] = parseQuery(' " avali " '); + expect(quotedTermWithSpaces instanceof QuotedTermToken && quotedTermWithSpaces.decodedValue || new Error('Wrong token')).toBe('avali'); + }); + + it('should properly differentiate between word-like operators and parts of tags', () => { + expect(parseQueryTypes('safe AND sound')).toEqual([TermToken, AndToken, TermToken]); + expect(parseQueryTypes('NOT safe AND dangerous')).toEqual([NotToken, TermToken, AndToken, TermToken]); + }); + + it('should only detect word-like operators when spaces are in place', () => { + // Require whitespace between operator and other tokens + expect(parseQueryTypes('NOT safeANDsound')).toEqual([NotToken, TermToken]); + + // If none are there, just should treat it as a part of a term + expect(parseQuery('safeAND sound')[0].value).toEqual('safeAND sound'); + + // All operators should be in all caps, otherwise it's just a term + const [lowercaseOperatorWords] = parseQuery('avali are cute and you know it or else'); + expect(lowercaseOperatorWords.value).toBe('avali are cute and you know it or else'); + + // And if it in caps, but part of some word, then it's just a word + const [wordsInCapsContainingOperators] = parseQuery('THAT POOR KNOT IS PLAIN AS SAND'); + expect(wordsInCapsContainingOperators.value).toBe('THAT POOR KNOT IS PLAIN AS SAND'); + }); + + it('should not treat any operators inside the quoted term as actual operators', () => { + const tokens = parseQuery('"this AND that OR these NOT there || () && ^123"'); + const [quotedTermToken] = tokens; + + expect(tokens).toHaveLength(1); + + expect(quotedTermToken instanceof QuotedTermToken && quotedTermToken.decodedValue || null) + .toBe('this AND that OR these NOT there || () && ^123'); + }); + + describe('QuotedTermToken', () => { + it('should decode and encode quotes and backslash', () => { + const encodedQuote = `"term with \\\" inside of it"`; + const decodedQuote = 'term with " inside of it'; + + expect(QuotedTermToken.decode(encodedQuote)).toBe(decodedQuote); + expect(QuotedTermToken.encode(decodedQuote)).toBe(encodedQuote); + + const encodedBackslash = `"term with \\\\ inside of it"`; + const decodedBackslash = 'term with \\ inside of it'; + + expect(QuotedTermToken.decode(encodedBackslash)).toBe(decodedBackslash); + expect(QuotedTermToken.encode(decodedBackslash)).toBe(encodedBackslash); + }); + + it('should not care for anything else', () => { + const encodedTerm = '"operators: , && || AND OR NOT ! ^ ? *"'; + const decodedTerm = 'operators: , && || AND OR NOT ! ^ ? *'; + + expect(QuotedTermToken.decode(encodedTerm)).toBe(decodedTerm); + expect(QuotedTermToken.encode(decodedTerm)).toBe(encodedTerm); + }); + }); +}); diff --git a/tests/lib/philomena/tag-utils.spec.ts b/tests/lib/philomena/tag-utils.spec.ts new file mode 100644 index 0000000..a322a07 --- /dev/null +++ b/tests/lib/philomena/tag-utils.spec.ts @@ -0,0 +1,76 @@ +import { URL } from 'url'; +import { resolveTagNameFromLink, slugEncodedCharacters } from '$lib/philomena/tag-utils'; + +describe('tag-utils', () => { + const origin = 'https://furbooru.org'; + + describe('resolveTagNameFromLink', () => { + function resolveFromSearchQuery(encodedQuery: string): string | null { + return resolveTagNameFromLink(new URL(`/search?q=${encodedQuery}`, origin)); + } + + describe('Parsing from /search/?q=tag links', () => { + it('should resolve a single tag from /search URLs', () => { + expect(resolveFromSearchQuery('safe')).toBe('safe'); + }); + + it('should return null for queries with multiple comma-separated tags', () => { + // Comma acts as a separator in the query, resulting in multiple tokens + expect(resolveFromSearchQuery('safe, suggestive')).toBe(null); + }); + + it('should return null if query is empty or not a term', () => { + expect(resolveFromSearchQuery('')).toBe(null); + expect(resolveFromSearchQuery('!')).toBe(null); + }); + + it('should properly treat parentheses in the query with single tag', () => { + // Parentheses are operators in the query language, but when inside the tag name, they should still be properly + // working. + expect(resolveFromSearchQuery('experiment (casualties unknown)')).toBe('experiment (casualties unknown)'); + }); + + it('should properly resolve queries with encoded characters', () => { + expect(resolveFromSearchQuery('pok%C3%A9mon')).toBe('pokémon'); + }); + + it('should unquote quoted term', () => { + expect(resolveFromSearchQuery('"experiment (casualties unknown)"')).toBe('experiment (casualties unknown)') + expect(resolveFromSearchQuery('"single tag, really"')).toBe('single tag, really'); + }); + }) + + describe('Parsing from /tags/name links', () => { + function resolveFromTagLink(encodedTagName: string): string | null { + return resolveTagNameFromLink(new URL(`/tags/${encodedTagName}`, origin)); + } + + it('should resolve a single tag', () => { + expect(resolveFromTagLink('safe')).toBe('safe'); + }); + + it('should only read the tag page even if query is provided', () => { + expect(resolveFromTagLink('grotesque?q=explicit')).toBe('grotesque'); + }); + + it('should properly resolve links with encoded characters', () => { + expect(resolveFromTagLink('pok%C3%A9mon')).toBe('pokémon'); + }); + + it('should decoded slug-encoded characters', () => { + // More common example where tag is. + expect(resolveFromTagLink(`namespace-colon-tag+name`)).toBe('namespace:tag name'); + + // Testing the whole list of encoded characters. + for (const [encodedCharacter, decodedCharacter] of slugEncodedCharacters.entries()) { + expect(resolveFromTagLink(`test+symbol${encodedCharacter}without+spaces`)).toBe(`test symbol${decodedCharacter}without spaces`); + expect(resolveFromTagLink(`test+symbol+${encodedCharacter}+with+spaces`)).toBe(`test symbol ${decodedCharacter} with spaces`); + } + }); + }); + + it('should return null for unsupported URLs', () => { + expect(resolveTagNameFromLink(new URL('/pages/example', origin))).toBe(null); + }); + }); +});