summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSavagePeanut <sourcehut@lazytapir.com>2023-09-03 15:10:36 -0500
committerSavagePeanut <sourcehut@lazytapir.com>2023-09-03 15:10:36 -0500
commit0431079e517b2fa6292ce289e3a6e25a6493a855 (patch)
treefdcfd183f769e468fce6d01b35b2d18d5b600f75
parent2a88ed30e27f25bed0b72bf5b31aa59b83f2b1d7 (diff)
start telegram support
-rw-r--r--src/lib.rs108
-rw-r--r--tests/test_style_parser.py276
2 files changed, 101 insertions, 283 deletions
diff --git a/src/lib.rs b/src/lib.rs
index 8de599a..d868dda 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,5 @@
use std::collections::HashMap;
+use std::cmp::Ordering;
use pyo3::prelude::*;
@@ -6,6 +7,14 @@ const KEYWORDS: [char; 4] = ['*', '_', '~', '`'];
const NO_SUB_PARSING_KEYWORDS: [char; 1] = ['`'];
const QUOTE_KEYWORDS: [char; 1] = ['>'];
const PLACEHOLDER: &str = "\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}";
+const TELEGRAM_STYLES: &[(&'static str, &'static str)] = &[
+ ("_", "italics"),
+ ("*", "bold"),
+ ("~", "strikethrough"),
+ ("||", "spoiler"),
+ ("`", "code"),
+ ("```", "pre")
+];
#[pyfunction]
fn format_body(body: String, new_tags: HashMap<String, (String, String)>) -> PyResult<String> {
@@ -16,9 +25,8 @@ fn format_body(body: String, new_tags: HashMap<String, (String, String)>) -> PyR
let styles: Vec<(String, usize, usize, usize, usize)> = parse_with_limits(&chars, 0, chars.len() - 1, 0);
let parse_quotes = new_tags.contains_key(&">".to_string());
- let mut tags: Vec<(usize, String, usize)> = vec![];
- for style in styles {
- let (keyword, start, remove_start, end, remove_end) = style;
+ let mut tags: Vec<(usize, String, usize)> = Vec::with_capacity(styles.len() * 2);
+ for (keyword, start, remove_start, end, remove_end) in styles {
if new_tags.contains_key(&keyword) {
let opening_tag = if keyword == "```language" {
new_tags.get(&keyword).unwrap().0.clone()
@@ -30,15 +38,14 @@ fn format_body(body: String, new_tags: HashMap<String, (String, String)>) -> PyR
};
tags.push((start, opening_tag, remove_start));
tags.push((end, new_tags.get(&keyword).unwrap().1.clone(), remove_end));
- } else if keyword == ">>" && parse_quotes {
+ } else if (keyword == ">>" && parse_quotes) || keyword == "```>" {
tags.push((start, "".to_string(), start+1));
}
}
tags.sort_by(|a, b| b.0.cmp(&a.0));
- for tag in tags {
- let (index, tag, end) = tag;
+ for (index, tag, end) in tags {
chars = [chars[..index].to_vec(), tag.chars().collect(), chars[end..].to_vec()].concat();
}
@@ -51,6 +58,92 @@ fn format_body(body: String, new_tags: HashMap<String, (String, String)>) -> PyR
Ok(remove_non_escaped_backslashes(text))
}
+#[pyfunction]
+fn parse_for_telegram(body: String) -> PyResult<(String, Vec<(String, usize, usize, String)>)> {
+ let mut chars: Vec<char> = body.chars().collect();
+ if chars.len() < 1 {
+ return Ok((body, Vec::with_capacity(0)));
+ }
+
+ let styles: Vec<(String, usize, usize, usize, usize)> = parse_with_limits(&chars, 0, chars.len() - 1, 0);
+ let mut remove_tags: Vec<(usize, usize)> = Vec::with_capacity(styles.len() * 2);
+ for (keyword, start, remove_start, end, remove_end) in &styles {
+ if TELEGRAM_STYLES.iter().any(|&(k, _)| k == keyword) {
+ remove_tags.push((*start, *remove_start));
+ remove_tags.push((*end, *remove_end));
+ } else if keyword == "```>" {
+ remove_tags.push((*start, *remove_start));
+ }
+ }
+
+ remove_tags.sort_by(|a, b| b.0.cmp(&a.0));
+
+ for (index, end) in remove_tags {
+ chars = [chars[..index].to_vec(), chars[end..].to_vec()].concat();
+ }
+
+ let mut message_entities: Vec<(bool, usize, String, usize, String)> = Vec::with_capacity(styles.len() * 2);
+ let mut all_indexes: Vec<Vec<usize>> = Vec::with_capacity(styles.len());
+ for (keyword, start, remove_start, end, remove_end) in &styles {
+ if TELEGRAM_STYLES.iter().any(|&(k, _)| k == keyword) {
+ let language = if keyword == "```language" {
+ chars[start+3..remove_start-1]
+ .into_iter()
+ .collect::<String>()
+ } else {
+ "".to_string()
+ };
+ all_indexes.push(vec![*start, *remove_start, *end, *remove_end]);
+ let last_index = all_indexes.len() - 1;
+ message_entities.push((true, last_index, TELEGRAM_STYLES.iter().find(|&&(k, _)| k == keyword).unwrap().1.to_string(), *start, language));
+ message_entities.push((false, last_index, "".to_string(), *end, "".to_string()));
+ } else if keyword == "```>" {
+ all_indexes.push(vec![0, 0, *start, 1]);
+ message_entities.push((false, all_indexes.len() - 1, "".to_string(), *start, "".to_string()));
+ }
+ }
+ message_entities.sort_by(sort_message_entities);
+
+ let formatted_text = chars.into_iter().collect::<String>();
+ let utf16_lengths: Vec<usize> = utf8_to_utf16_length(&formatted_text);
+
+ let mut offset = 0;
+ for (is_start, index, _, _, _) in &message_entities {
+ let indexes = &mut all_indexes[*index];
+ if *is_start {
+ indexes[0] -= offset;
+ offset += indexes[1];
+ } else {
+ indexes[2] -= offset;
+ offset += indexes[3];
+ }
+ }
+ Ok((
+ formatted_text,
+ message_entities.into_iter()
+ .filter(|(is_start, _, _, _, _)| { *is_start } )
+ .map(|(_, index, format, _, language)| { (format, utf16_lengths[all_indexes[index][0]], utf16_lengths[all_indexes[index][2]], language) })
+ .collect()
+ ))
+}
+
+fn sort_message_entities(first: &(bool, usize, String, usize, String), second: &(bool, usize, String, usize, String)) -> Ordering {
+ return first.3.cmp(&second.3);
+}
+
+fn utf8_to_utf16_length(utf8_str: &str) -> Vec<usize> {
+ let mut utf16_lengths = Vec::with_capacity(utf8_str.len());
+
+ let mut length = 0;
+ for byte in utf8_str.as_bytes() {
+ if (byte & 0xc0) != 0x80 {
+ length += if *byte >= 0xf0 { 2 } else { 1 };
+ }
+ utf16_lengths.push(length);
+ }
+ utf16_lengths
+}
+
fn remove_non_escaped_backslashes(text: String) -> String {
let tmp_string = text.replace("\\\\", PLACEHOLDER);
let tmp_string = tmp_string.replace("\\", "");
@@ -172,7 +265,7 @@ fn parse_quotes_in_code_block(chars: &Vec<char>, start: usize, end: usize, depth
let c = chars[index];
if QUOTE_KEYWORDS.contains(&c) {
if is_nested_quote(chars, index, depth) {
- quotes.push((">>".to_string(), index, index + 1, index + 1, index + 1));
+ quotes.push(("```>".to_string(), index, index + 1, index + 1, index + 1));
}
index += 1;
continue;
@@ -331,5 +424,6 @@ fn preceeded_by_backslash(chars: &Vec<char>, index: usize, start: usize) -> bool
#[pymodule]
fn slidge_style_parser(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(format_body, m)?)?;
+ m.add_function(wrap_pyfunction!(parse_for_telegram, m)?)?;
Ok(())
}
diff --git a/tests/test_style_parser.py b/tests/test_style_parser.py
deleted file mode 100644
index 8f39580..0000000
--- a/tests/test_style_parser.py
+++ /dev/null
@@ -1,276 +0,0 @@
-from slidge_style_parser import format_body
-
-MATRIX_FORMATS = {
- "_": ("<em>", "</em>"),
- "*": ("<strong>", "</strong>"),
- "~": ("<strike>", "</strike>"),
- "`": ("<code>", "</code>"),
- "```": ("<pre><code>", "</code></pre>"),
- "```language": ("<pre><code class=\"language-{}\">", "</code></pre>"),
- ">": ("<blockquote>", "</blockquote>"),
- "||": ("<span data-mx-spoiler>", "</span>"),
- "\n": ("<br>", "")
-}
-
-def test_basic():
- test = "_underline_"
- formatted_body = "<em>underline</em>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "*bold*"
- formatted_body = "<strong>bold</strong>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "~strikethrough~"
- formatted_body = "<strike>strikethrough</strike>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "`code span`"
- formatted_body = "<code>code span</code>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = """
- ```python
- def test_basic():
- test = "_underline_"
- formatted_body = "<em>underline</em>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
- ```
- """
- formatted_body = test = """<pre><code class="language-python">def test_basic():<br> test = "_underline_"<br> formatted_body = "<em>underline</em>"<br> assert(format_body(test, MATRIX_FORMATS) == (test, formatted_body))</pre></code><br>"""
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "```\ncode block\n```"
- formatted_body = "<pre><code>code block</code></pre>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "||this message contains a spoiler||"
- formatted_body = "<span data-mx-spoiler>this message contains a spoiler</span>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
-def test_quotes():
- test = ">single"
- formatted_body = "<blockquote>single</blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">single arrow ->"
- formatted_body = "<blockquote>single arrow -></blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">single\n>grouped"
- formatted_body = "<blockquote>single<br>grouped</blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">>double"
- formatted_body = "<blockquote><blockquote>double</blockquote></blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">>double\n>>double"
- formatted_body = "<blockquote><blockquote>double<br>double</blockquote></blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">>double\n&>not quote"
- formatted_body = "<blockquote><blockquote>double</blockquote></blockquote><br>&>not quote"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">>double\n>grouped single"
- formatted_body = "<blockquote><blockquote>double</blockquote><br>grouped single</blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">>>tripple\n>single\n>>double"
- formatted_body = "<blockquote><blockquote><blockquote>tripple</blockquote></blockquote><br>single<br><blockquote>double</blockquote></blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
-def test_code_blocks():
- test = "```\nhacker\ncode\n```"
- formatted_body = "<pre><code>hacker<br>code</code></pre>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "```python\nhacker code\n```"
- formatted_body = "<pre><code class=\"language-python\">hacker code</code></pre>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "```python\nhacker code\n```\nnormal text"
- formatted_body = "<pre><code class=\"language-python\">hacker code</code></pre><br>normal text"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">```java\n>why are you quoting a code block\n>```"
- formatted_body = "<blockquote><pre><code class=\"language-java\">why are you quoting a code block</code></pre></blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">>```\n>>double quote code block\n>single quote not in code block\nnormal text"
- formatted_body = "<blockquote><blockquote><pre><code>double quote code block</code></pre></blockquote><br>single quote not in code block</blockquote><br>normal text"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">```\n>please stop trying to break my parser ;-;"
- formatted_body = "<blockquote><pre><code>please stop trying to break my parser ;-;</code></pre></blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">>```\n>>>>double quote code block\n>single quote not in code block\nnormal text"
- formatted_body = "<blockquote><blockquote><pre><code>>>double quote code block</code></pre></blockquote><br>single quote not in code block</blockquote><br>normal text"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "_```_ignored\ninvalid code block\n```"
- formatted_body = "<em>```</em>ignored<br>invalid code block<br>```"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
-
-def test_escaped():
- test = "\\_no underline_"
- formatted_body = "_no underline_"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "\\\\_no underline_"
- formatted_body = "\\_no underline_"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">>>tripple\n\\>none\n>>double"
- formatted_body = "<blockquote><blockquote><blockquote>tripple</blockquote></blockquote></blockquote><br>>none<br><blockquote><blockquote>double</blockquote></blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
-def test_nested():
- test = "`*~_code span_~*`"
- formatted_body = "<code>*~_code span_~*</code>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "*_~`code span`~_*"
- formatted_body = "<strong><em><strike><code>code span</code></strike></em></strong>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">*_~`code span`~_*"
- formatted_body = "<blockquote><strong><em><strike><code>code span</code></strike></em></strong></blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "*bold star >*< star bold*"
- formatted_body = "<strong>bold star >*< star bold</strong>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "*_bold*_"
- formatted_body = "<strong>_bold</strong>_"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "__underlined__"
- formatted_body = "<em><em>underlined</em></em>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
-def test_no_changes():
- test = ""
- formatted_body = ""
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "~~ empty `````` styles **"
- formatted_body = "~~ empty `````` styles **"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "this is not an empty string"
- formatted_body = "this is not an empty string"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "arrow ->"
- formatted_body = "arrow ->"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = " > no quote"
- formatted_body = " > no quote"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "_not underlined"
- formatted_body = "_not underlined"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "|not a spoiler|"
- formatted_body = "|not a spoiler|"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "||\nalso\nnot\na\nspoiler||"
- formatted_body = "||<br>also<br>not<br>a<br>spoiler||"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "`no code\nblock here`"
- formatted_body = "`no code<br>block here`"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "invalid ```\ncode block\n```"
- formatted_body = "invalid ```<br>code block<br>```"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "```\ncode block\ninvalid```"
- formatted_body = "```<br>code block<br>invalid```"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "```\ncode block\n```invalid"
- formatted_body = "```<br>code block<br>```invalid"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
-def test_assorted():
- test = "\n"
- formatted_body = "<br>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "at the ||end||"
- formatted_body = "at the <span data-mx-spoiler>end</span>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "in the ~middle~ here"
- formatted_body = "in the <strike>middle</strike> here"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "_underline_ *bold* ~strikethrough~ >not quote ||spoiler||\n>quote\nnothing\nnothing\n>>>>another quote with ||~_*```four```*_~||"
- formatted_body = "<em>underline</em> <strong>bold</strong> <strike>strikethrough</strike> >not quote <span data-mx-spoiler>spoiler</span><br><blockquote>quote</blockquote><br>nothing<br>nothing<br><blockquote><blockquote><blockquote><blockquote>another quote with <span data-mx-spoiler><strike><em><strong>```four```</strong></em></strike></span></blockquote></blockquote></blockquote></blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">```\n>do be do be dooo ba do be do be do ba\n>>>"
- formatted_body = "<blockquote><pre><code>do be do be dooo ba do be do be do ba<br>>></code></pre></blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "\n\n>```\n>do be do be dooo ba do be do be do ba\na\n\n\naoeu\n"
- formatted_body = "<br><br><blockquote><pre><code>do be do be dooo ba do be do be do ba</code></pre></blockquote><br>a<br><br><br>aoeu<br>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">```\n>do be do be dooo ba do be do be do ba\n>\n>\n>aoeu"
- formatted_body = "<blockquote><pre><code>do be do be dooo ba do be do be do ba<br><br><br>aoeu</code></pre></blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">```\n>code block\n>```invalid end\n"
- formatted_body = "<blockquote><pre><code>code block<br>```invalid end</code></pre></blockquote><br>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "invalid ```\ncode block\n*bold*\n```"
- formatted_body = "invalid ```<br>code block<br><strong>bold</strong><br>```"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
-def test_weird_utf8():
- test = "โค๏ธ๐Ÿ’“๐Ÿ’•๐Ÿ’–๐Ÿ’— ||๐Ÿ’™๐Ÿ’š๐Ÿ’›๐Ÿ’œ๐Ÿ–ค|| ๐Ÿ’๐Ÿ’ž๐Ÿ’Ÿโฃ๏ธ"
- formatted_body = "โค๏ธ๐Ÿ’“๐Ÿ’•๐Ÿ’–๐Ÿ’— <span data-mx-spoiler>๐Ÿ’™๐Ÿ’š๐Ÿ’›๐Ÿ’œ๐Ÿ–ค</span> ๐Ÿ’๐Ÿ’ž๐Ÿ’Ÿโฃ๏ธ"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ง _underline_๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘ฆโ€๐Ÿ‘ง"
- formatted_body = "๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ง <em>underline</em>๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘ฆโ€๐Ÿ‘ง"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "\u202eRight to left"
- formatted_body = "\u202eRight to left"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = ">\u202eRight to left quote?"
- formatted_body = "<blockquote>\u202eRight to left quote?</blockquote>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "_Invisible\u200bseparator_"
- formatted_body = "<em>Invisible\u200bseparator</em>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
- test = "~\u200b~"
- formatted_body = "<strike>\u200b</strike>"
- assert(format_body(test, MATRIX_FORMATS) == formatted_body)
-
-LIMITED_FORMATS = {
- "_": ("<em>", "</em>"),
- "~": ("<strike>", "</strike>"),
- "`": ("<code>", "</code>")
-}
-
-def test_limited():
- test = "_underline_ *bold* ~strikethrough~ >not quote ||spoiler||\n>quote\nnothing\nnothing\n>>>>another quote with ||~_*```four```*_~||"
- formatted_body = "<em>underline</em> *bold* <strike>strikethrough</strike> >not quote ||spoiler||\n>quote\nnothing\nnothing\n>>>>another quote with ||<strike><em>*```four```*</em></strike>||"
- assert(format_body(test, LIMITED_FORMATS) == formatted_body)