Merge pull request #172 from koloml/bugfix/tags-in-forums-with-encoded-text

Fixed wrong parsing of tag links inside forum posts
2026-06-23 18:22:20 +00:00 · 2026-06-13 19:55:32 -04:00
parent b8043bace6 3404877091
commit 2f5c37d21f
4 changed files with 235 additions and 43 deletions
--- a/src/lib/philomena/search/QueryLexer.ts
+++ b/src/lib/philomena/search/QueryLexer.ts
@@ -41,21 +41,27 @@ export class QuotedTermToken extends Token {
  }

  static decode(value: string): string {
-    return value.replace(/\\([\\"])/g, "$1");
+    // Unwrap BEFORE unescaping (a trailing quote counts only if unescaped), so an escaped \" at an edge is kept.
+    const unwrapped = value.replace(/^"/, '').replace(/((?:^|[^\\])(?:\\\\)*)"$/, '$1');
+    return unwrapped.replaceAll(/\\([\\"])/g, "$1");
  }

  static encode(value: string): string {
-    return value.replace(/[\\"]/g, "\\$&");
+    return `"${value.replaceAll(/[\\"]/g, "\\$&")}"`;
  }
 }

 export class TermToken extends Token {
 }

-type MatchResultCarry = {
+interface MatchResultCarry {
  match?: RegExpMatchArray | null
 }

+interface SuccessfulMatchResultCarry {
+  match: RegExpMatchArray;
+}
+
 /**
 * Search query tokenizer. Should mostly work for the cases of parsing and finding the selected term for
 * auto-completion. Follows the rules described in the Philomena booru engine.
@@ -94,26 +100,26 @@ export class QueryLexer {
      }

      if (this.#match(QueryLexer.#negotiationOperator, result)) {
-        tokens.push(new NotToken(this.#index, result.match![0]));
-        this.#index += result.match![0].length;
+        tokens.push(new NotToken(this.#index, result.match[0]));
+        this.#index += result.match[0].length;
        continue;
      }

      if (this.#match(QueryLexer.#andOperator, result)) {
-        tokens.push(new AndToken(this.#index, result.match![0]));
-        this.#index += result.match![0].length;
+        tokens.push(new AndToken(this.#index, result.match[0]));
+        this.#index += result.match[0].length;
        continue;
      }

      if (this.#match(QueryLexer.#orOperator, result)) {
-        tokens.push(new OrToken(this.#index, result.match![0]));
-        this.#index += result.match![0].length;
+        tokens.push(new OrToken(this.#index, result.match[0]));
+        this.#index += result.match[0].length;
        continue;
      }

      if (this.#match(QueryLexer.#notOperator, result)) {
-        tokens.push(new NotToken(this.#index, result.match![0]));
-        this.#index += result.match![0].length;
+        tokens.push(new NotToken(this.#index, result.match[0]));
+        this.#index += result.match[0].length;
        continue;
      }

@@ -130,26 +136,26 @@ export class QueryLexer {
      }

      if (this.#match(QueryLexer.#boostOperator, result)) {
-        tokens.push(new BoostToken(this.#index, result.match![0]));
-        this.#index += result.match![0].length;
+        tokens.push(new BoostToken(this.#index, result.match[0]));
+        this.#index += result.match[0].length;
        continue;
      }

      if (this.#match(QueryLexer.#whitespaces, result)) {
-        this.#index += result.match![0].length;
+        this.#index += result.match[0].length;
        continue;
      }

      if (this.#match(QueryLexer.#quotedText, result)) {
-        tokens.push(new QuotedTermToken(this.#index, result.match![0], result.match![1]));
-        this.#index += result.match![0].length;
+        tokens.push(new QuotedTermToken(this.#index, result.match[0], result.match[1]));
+        this.#index += result.match[0].length;
        continue;
      }

      dirtyText = this.#parseDirtyText(this.#index);

      if (dirtyText) {
-        tokens.push(new TermToken(this.#index, dirtyText));
+        tokens.push(new TermToken(this.#index, dirtyText.trim()));
        this.#index += dirtyText.length;
        continue;
      }
@@ -168,7 +174,7 @@ export class QueryLexer {
   *
   * @return Is there a match?
   */
-  #match(targetRegExp: RegExp, resultCarrier: MatchResultCarry = {}): boolean {
+  #match(targetRegExp: RegExp, resultCarrier: MatchResultCarry = {}): resultCarrier is SuccessfulMatchResultCarry {
    return this.#matchAt(targetRegExp, this.#index, resultCarrier);
  }

@@ -181,9 +187,9 @@ export class QueryLexer {
   *
   * @return Is there a match?
   */
-  #matchAt(targetRegExp: RegExp, index: number, resultCarrier: MatchResultCarry = {}): boolean {
+  #matchAt(targetRegExp: RegExp, index: number, resultCarrier: MatchResultCarry = {}): resultCarrier is SuccessfulMatchResultCarry {
    targetRegExp.lastIndex = index;
-    resultCarrier.match = this.#value.match(targetRegExp);
+    resultCarrier.match = targetRegExp.exec(this.#value); // exec() starts at lastIndex; the sticky (`y`) patterns declared on this class can then only match exactly at `index`

    return resultCarrier.match !== null;
  }
@@ -207,16 +213,10 @@ export class QueryLexer {
        break;
      }

-      if (this.#matchAt(QueryLexer.#dirtyTextContent, index, result)) {
-        resultValue += result.match![0];
-        index += result.match![0].length;
-        continue;
-      }
-
      if (this.#value[index] === QueryLexer.#bracketsOpenCharacter) {
        let bracketsContent = QueryLexer.#bracketsOpenCharacter + this.#parseDirtyText(index + 1);

-        if (this.#value[index + bracketsContent.length + 1] === QueryLexer.#bracketsCloseCharacter) {
+        if (this.#value[index + bracketsContent.length] === QueryLexer.#bracketsCloseCharacter) {
          bracketsContent += QueryLexer.#bracketsCloseCharacter;
        }

@@ -227,22 +227,28 @@ export class QueryLexer {
        continue;
      }

+      if (this.#matchAt(QueryLexer.#dirtyTextContent, index, result)) {
+        resultValue += result.match[0];
+        index += result.match[0].length;
+        continue;
+      }
+
      break;
    }

    return resultValue;
  }

-  static #commaCharacter = ',';
-  static #negotiationOperator = /[!-]/y;
-  static #andOperator = /\s+(?:AND|&&)\s+/y;
-  static #orOperator = /\s+(?:OR|\|\|)\s+/y;
-  static #notOperator = /NOT\s+/y;
-  static #bracketsOpenCharacter = "(";
-  static #bracketsCloseCharacter = ")";
-  static #boostOperator = /\^[+-]?\d+(?:\.\d+)?/y;
-  static #whitespaces = /\s+/y;
-  static #quotedText = /"((?:\\.|[^\\"])+)"/y;
-  static #dirtyTextStopWords = /,|\s+(?:AND|&&|OR|\|\|)\s+|\s+(?:\)|\^[+-]?\d+(?:\.\d+)?)/y;
-  static #dirtyTextContent = /\\.|[^()]/y;
+  static readonly #commaCharacter = ',';
+  static readonly #negotiationOperator = /[!-]/y;
+  static readonly #andOperator = /\s+(?:AND|&&)\s+/y;
+  static readonly #orOperator = /\s+(?:OR|\|\|)\s+/y;
+  static readonly #notOperator = /NOT\s+/y;
+  static readonly #bracketsOpenCharacter = "(";
+  static readonly #bracketsCloseCharacter = ")";
+  static readonly #boostOperator = /\^[+-]?\d+(?:\.\d+)?/y;
+  static readonly #whitespaces = /\s+/y;
+  static readonly #quotedText = /"\s*((?:\\.|[^\\"])+?)\s*"/y;
+  static readonly #dirtyTextStopWords = /,|\s+(?:AND|&&|OR|\|\|)\s+|\s*(?:\)|\^[+-]?\d+(?:\.\d+)?)/y;
+  static readonly #dirtyTextContent = /\\.|[^()]/y;
 }
--- a/src/lib/philomena/tag-utils.ts
+++ b/src/lib/philomena/tag-utils.ts
@@ -42,7 +42,7 @@ const tagLinkRegExp = /\/tags\/(?<encodedTagName>[^/?#]+)/;
 *
 * @see https://github.com/philomena-dev/philomena/blob/6086757b654da8792ae52adb2a2f501ea6c30d12/lib/philomena/slug.ex#L52-L57
 */
-const slugEncodedCharacters: Map<string, string> = new Map([
+export const slugEncodedCharacters: Map<string, string> = new Map([
  ['-dash-', '-'],
  ['-fwslash-', '/'],
  ['-bwslash-', '\\'],
@@ -101,9 +101,8 @@ export function resolveTagNameFromLink(tagLink: URL): string | null {
  }

  return decodeURIComponent(encodedTagName)
-    .replaceAll(/-[a-z]+-/gi, match => slugEncodedCharacters.get(match) ?? match)
-    .replaceAll('-', ' ')
-    .replaceAll('+', ' ');
+    .replaceAll('+', ' ')
+    .replaceAll(/-[a-z]+-/gi, match => slugEncodedCharacters.get(match) ?? match);
 }

 /**
--- a/tests/lib/philomena/search/QueryLexer.spec.ts
+++ b/tests/lib/philomena/search/QueryLexer.spec.ts
@@ -0,0 +1,111 @@
+import {
+  AndToken,
+  BoostToken,
+  GroupEndToken,
+  GroupStartToken,
+  NotToken,
+  OrToken,
+  QueryLexer,
+  QuotedTermToken,
+  TermToken,
+  Token
+} from "$lib/philomena/search/QueryLexer";
+
+describe('QueryLexer', () => {
+  function parseQuery(query: string): Token[] {
+    return new QueryLexer(query).parse();
+  }
+
+  function parseQueryTypes(query: string): (typeof Token)[] {
+    return parseQuery(query)
+      .map(term => (term.constructor as any) as typeof Token);
+  }
+
+  it('should properly parse different kinds of queries', () => {
+    expect(parseQueryTypes('safe')).toEqual([TermToken]);
+    expect(parseQueryTypes('safe^1')).toEqual([TermToken, BoostToken]);
+    expect(parseQueryTypes('safe, avali')).toEqual([TermToken, AndToken, TermToken]);
+    expect(parseQueryTypes('!avali')).toEqual([NotToken, TermToken]);
+    expect(parseQueryTypes('avali || 4 ears')).toEqual([TermToken, OrToken, TermToken]);
+    expect(parseQueryTypes('avali && !4 ears')).toEqual([TermToken, AndToken, NotToken, TermToken]);
+
+    expect(parseQueryTypes('avali AND (NOT 4 ears OR -3 fingers)')).toEqual([
+      TermToken, AndToken, GroupStartToken, NotToken, TermToken, OrToken, NotToken, TermToken, GroupEndToken,
+    ]);
+  });
+
+  it('should not treat parentheses as groups inside the term', () => {
+    expect(parseQueryTypes('!(experiment (casualties unknown) || milky (casualties unknown))')).toEqual([
+      NotToken, GroupStartToken, TermToken, OrToken, TermToken, GroupEndToken,
+    ]);
+  });
+
+  it('should accept any amount of whitespaces between different tokens', () => {
+    expect(parseQueryTypes('!     ( avali     ,  experiment (casualties unknown)   )  &&  safe')).toEqual([
+      NotToken, GroupStartToken, TermToken, AndToken, TermToken, GroupEndToken, AndToken, TermToken,
+    ]);
+  });
+
+  it('should trim whitespaces inside the terms, even in quoted ones', () => {
+    const [termWithSpaces] = parseQuery('  avali  ');
+    expect(termWithSpaces.value).toBe('avali');
+
+    const [quotedTermWithSpaces] = parseQuery('   "  avali  "   ');
+    expect(quotedTermWithSpaces instanceof QuotedTermToken && quotedTermWithSpaces.decodedValue || new Error('Wrong token')).toBe('avali');
+  });
+
+  it('should properly differentiate between word-like operators and parts of tags', () => {
+    expect(parseQueryTypes('safe AND sound')).toEqual([TermToken, AndToken, TermToken]);
+    expect(parseQueryTypes('NOT safe AND dangerous')).toEqual([NotToken, TermToken, AndToken, TermToken]);
+  });
+
+  it('should only detect word-like operators when spaces are in place', () => {
+    // Require whitespace between operator and other tokens
+    expect(parseQueryTypes('NOT safeANDsound')).toEqual([NotToken, TermToken]);
+
+    // Without whitespace on both sides, the operator word is treated as a part of the term
+    expect(parseQuery('safeAND sound')[0].value).toEqual('safeAND sound');
+
+    // All operators should be in all caps, otherwise it's just a term
+    const [lowercaseOperatorWords] = parseQuery('avali are cute and you know it or else');
+    expect(lowercaseOperatorWords.value).toBe('avali are cute and you know it or else');
+
+    // And if it is in caps but is only a part of some word, then it's still just a word
+    const [wordsInCapsContainingOperators] = parseQuery('THAT POOR KNOT IS PLAIN AS SAND');
+    expect(wordsInCapsContainingOperators.value).toBe('THAT POOR KNOT IS PLAIN AS SAND');
+  });
+
+  it('should not treat any operators inside the quoted term as actual operators', () => {
+    const tokens = parseQuery('"this AND that OR these NOT there || () && ^123"');
+    const [quotedTermToken] = tokens;
+
+    expect(tokens).toHaveLength(1);
+
+    expect(quotedTermToken instanceof QuotedTermToken && quotedTermToken.decodedValue || null)
+      .toBe('this AND that OR these NOT there || () && ^123');
+  });
+
+  describe('QuotedTermToken', () => {
+    it('should decode and encode quotes and backslash', () => {
+      const encodedQuote = `"term with \\\" inside of it"`;
+      const decodedQuote = 'term with " inside of it';
+
+      expect(QuotedTermToken.decode(encodedQuote)).toBe(decodedQuote);
+      expect(QuotedTermToken.encode(decodedQuote)).toBe(encodedQuote);
+
+      const encodedBackslash = `"term with \\\\ inside of it"`;
+      const decodedBackslash = 'term with \\ inside of it';
+
+      expect(QuotedTermToken.decode(encodedBackslash)).toBe(decodedBackslash);
+      expect(QuotedTermToken.encode(decodedBackslash)).toBe(encodedBackslash);
+    });
+
+    it('should not care for anything else', () => {
+      const encodedTerm = '"operators: , && || AND OR NOT ! ^ ? *"';
+      const decodedTerm = 'operators: , && || AND OR NOT ! ^ ? *';
+
+      expect(QuotedTermToken.decode(encodedTerm)).toBe(decodedTerm);
+      expect(QuotedTermToken.encode(decodedTerm)).toBe(encodedTerm);
+    });
+  });
+});
--- a/tests/lib/philomena/tag-utils.spec.ts
+++ b/tests/lib/philomena/tag-utils.spec.ts
@@ -0,0 +1,76 @@
+import { URL } from 'url';
+import { resolveTagNameFromLink, slugEncodedCharacters } from '$lib/philomena/tag-utils';
+
+describe('tag-utils', () => {
+  const origin = 'https://furbooru.org';
+
+  describe('resolveTagNameFromLink', () => {
+    function resolveFromSearchQuery(encodedQuery: string): string | null {
+      return resolveTagNameFromLink(new URL(`/search?q=${encodedQuery}`, origin));
+    }
+
+    describe('Parsing from /search/?q=tag links', () => {
+      it('should resolve a single tag from /search URLs', () => {
+        expect(resolveFromSearchQuery('safe')).toBe('safe');
+      });
+
+      it('should return null for queries with multiple comma-separated tags', () => {
+        // Comma acts as a separator in the query, resulting in multiple tokens
+        expect(resolveFromSearchQuery('safe, suggestive')).toBe(null);
+      });
+
+      it('should return null if query is empty or not a term', () => {
+        expect(resolveFromSearchQuery('')).toBe(null);
+        expect(resolveFromSearchQuery('!')).toBe(null);
+      });
+
+      it('should properly treat parentheses in the query with single tag', () => {
+        // Parentheses are grouping operators in the query language, but inside a tag name they must be kept as a part
+        // of the term.
+        expect(resolveFromSearchQuery('experiment (casualties unknown)')).toBe('experiment (casualties unknown)');
+      });
+
+      it('should properly resolve queries with encoded characters', () => {
+        expect(resolveFromSearchQuery('pok%C3%A9mon')).toBe('pokémon');
+      });
+
+      it('should unquote quoted term', () => {
+        expect(resolveFromSearchQuery('"experiment (casualties unknown)"')).toBe('experiment (casualties unknown)')
+        expect(resolveFromSearchQuery('"single tag, really"')).toBe('single tag, really');
+      });
+    })
+
+    describe('Parsing from /tags/name links', () => {
+      function resolveFromTagLink(encodedTagName: string): string | null {
+        return resolveTagNameFromLink(new URL(`/tags/${encodedTagName}`, origin));
+      }
+
+      it('should resolve a single tag', () => {
+        expect(resolveFromTagLink('safe')).toBe('safe');
+      });
+
+      it('should only read the tag page even if query is provided', () => {
+        expect(resolveFromTagLink('grotesque?q=explicit')).toBe('grotesque');
+      });
+
+      it('should properly resolve links with encoded characters', () => {
+        expect(resolveFromTagLink('pok%C3%A9mon')).toBe('pokémon');
+      });
+
+      it('should decoded slug-encoded characters', () => {
+        // A more common example: a namespaced tag whose name contains a space.
+        expect(resolveFromTagLink(`namespace-colon-tag+name`)).toBe('namespace:tag name');
+
+        // Testing the whole list of encoded characters.
+        for (const [encodedCharacter, decodedCharacter] of slugEncodedCharacters.entries()) {
+          expect(resolveFromTagLink(`test+symbol${encodedCharacter}without+spaces`)).toBe(`test symbol${decodedCharacter}without spaces`);
+          expect(resolveFromTagLink(`test+symbol+${encodedCharacter}+with+spaces`)).toBe(`test symbol ${decodedCharacter} with spaces`);
+        }
+      });
+    });
+
+    it('should return null for unsupported URLs', () => {
+      expect(resolveTagNameFromLink(new URL('/pages/example', origin))).toBe(null);
+    });
+  });
+});