split up files

author: SavagePeanut <sourcehut@lazytapir.com> 2023-09-03 15:31:17 -0500
committer: SavagePeanut <sourcehut@lazytapir.com> 2023-09-03 15:31:17 -0500
commit: 651ab7c90f436de60035a138e91561b1848b5725 (patch)
tree: 95e021d0b90e9f6bedcf4169ab1e08db600f4ac4
parent: 13043d6ec8efe62d13336745a0f650ff3e899648 (diff)
4 files changed, 434 insertions, 418 deletions
diff --git a/src/general.rs b/src/general.rs
new file mode 100644
index 0000000..517bf92
--- /dev/null
+++ b/src/general.rs
@@ -0,0 +1,69 @@
+use std::collections::HashMap;
+
+use pyo3::prelude::*;
+
+use crate::parser::parse_with_limits;
+
+// Sentinel (ten zero-width spaces) that stands in for an escaped backslash
+// (two consecutive backslashes) while the remaining backslashes are stripped.
+const PLACEHOLDER: &str = "\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}";
+
+/// Rewrites the style markers found in `body` using the tag pairs in `new_tags`.
+///
+/// `new_tags` maps a keyword reported by `parse_with_limits` (plus the special
+/// keys `">"` and `"\n"`) to an `(opening, closing)` replacement pair. For the
+/// "```language" keyword every `{}` in the opening tag is replaced by the
+/// characters `start + 3 .. remove_start - 1` of the match. If a `"\n"` entry
+/// exists, each newline of the result is replaced by that entry's opening tag.
+/// Finally every backslash that is not itself escaped is removed and a doubled
+/// backslash collapses to a single one.
+#[pyfunction]
+pub fn format_body(body: String, new_tags: HashMap<String, (String, String)>) -> PyResult<String> {
+    let mut chars: Vec<char> = body.chars().collect();
+    if chars.is_empty() {
+        return Ok(body);
+    }
+    let styles: Vec<(String, usize, usize, usize, usize)> = parse_with_limits(&chars, 0, chars.len() - 1, 0);
+    let parse_quotes = new_tags.contains_key(">");
+
+    // Each entry: (first char to replace, replacement text, one past the last char to replace).
+    let mut tags: Vec<(usize, String, usize)> = Vec::with_capacity(styles.len() * 2);
+    for (keyword, start, remove_start, end, remove_end) in styles {
+        if let Some((opening, closing)) = new_tags.get(&keyword) {
+            let opening_tag = if keyword == "```language" {
+                let language: String = chars[start + 3..remove_start - 1].iter().collect();
+                opening.replace("{}", &language)
+            } else {
+                opening.clone()
+            };
+            tags.push((start, opening_tag, remove_start));
+            tags.push((end, closing.clone(), remove_end));
+        } else if (keyword == ">>" && parse_quotes) || keyword == "```>" {
+            // These markers have no tag of their own; the single marker char is dropped.
+            tags.push((start, String::new(), start + 1));
+        }
+    }
+
+    // Apply from the back so the indexes of earlier tags stay valid.
+    tags.sort_by(|a, b| b.0.cmp(&a.0));
+
+    for (index, tag, end) in tags {
+        // In-place replacement; avoids rebuilding the whole vector for every tag.
+        chars.splice(index..end, tag.chars());
+    }
+
+    let text: String = chars.into_iter().collect();
+    let text = match new_tags.get("\n") {
+        Some((line_break, _)) => text.replace('\n', line_break),
+        None => text,
+    };
+
+    Ok(remove_non_escaped_backslashes(text))
+}
+
+/// Drops every backslash that is not itself escaped; a doubled backslash becomes one.
+fn remove_non_escaped_backslashes(text: String) -> String {
+    let tmp_string = text.replace("\\\\", PLACEHOLDER);
+    let tmp_string = tmp_string.replace("\\", "");
+    tmp_string.replace(PLACEHOLDER, "\\")
+}
diff --git a/src/lib.rs b/src/lib.rs
index d868dda..d45d857 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,425 +1,12 @@
-use std::collections::HashMap;
-use std::cmp::Ordering;
-
 use pyo3::prelude::*;
 
-const KEYWORDS: [char; 4] = ['*', '_', '~', '`'];
-const NO_SUB_PARSING_KEYWORDS: [char; 1] = ['`'];
-const QUOTE_KEYWORDS: [char; 1] = ['>'];
-const PLACEHOLDER: &str = "\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}\u{200B}";
-const TELEGRAM_STYLES: &[(&'static str, &'static str)] = &[
-    ("_", "italics"),
-    ("*", "bold"),
-    ("~", "strikethrough"),
-    ("||", "spoiler"),
-    ("`", "code"),
-    ("```", "pre")
-];
-
-#[pyfunction]
-fn format_body(body: String, new_tags: HashMap<String, (String, String)>) -> PyResult<String> {
-    let mut chars: Vec<char> = body.chars().collect();
-    if chars.len() < 1 {
-        return Ok(body);
-    }
-    let styles: Vec<(String, usize, usize, usize, usize)> = parse_with_limits(&chars, 0, chars.len() - 1, 0);
-    let parse_quotes = new_tags.contains_key(&">".to_string());
-
-    let mut tags: Vec<(usize, String, usize)> = Vec::with_capacity(styles.len() * 2);
-    for (keyword, start, remove_start, end, remove_end) in styles {
-        if new_tags.contains_key(&keyword) {
-            let opening_tag = if keyword == "```language" {
-                new_tags.get(&keyword).unwrap().0.clone()
-                .replace("{}", &chars[start+3..remove_start-1]
-                .into_iter()
-                .collect::<String>())
-            } else {
-                new_tags.get(&keyword).unwrap().0.clone()
-            };
-            tags.push((start, opening_tag, remove_start));
-            tags.push((end, new_tags.get(&keyword).unwrap().1.clone(), remove_end));
-        } else if (keyword == ">>" && parse_quotes) || keyword == "```>" {
-            tags.push((start, "".to_string(), start+1));
-        }
-    }
-
-    tags.sort_by(|a, b| b.0.cmp(&a.0));
-
-    for (index, tag, end) in tags {
-        chars = [chars[..index].to_vec(), tag.chars().collect(), chars[end..].to_vec()].concat();
-    }
-
-    let text: String = if new_tags.contains_key("\n") {
-        chars.into_iter().collect::<String>().replace("\n", &new_tags.get(&"\n".to_string()).unwrap().0)
-    } else {
-        chars.into_iter().collect::<String>()
-    };
-
-    Ok(remove_non_escaped_backslashes(text))
-}
-
-#[pyfunction]
-fn parse_for_telegram(body: String) -> PyResult<(String, Vec<(String, usize, usize, String)>)> {
-    let mut chars: Vec<char> = body.chars().collect();
-    if chars.len() < 1 {
-        return Ok((body, Vec::with_capacity(0)));
-    }
-
-    let styles: Vec<(String, usize, usize, usize, usize)> = parse_with_limits(&chars, 0, chars.len() - 1, 0);
-    let mut remove_tags: Vec<(usize, usize)> = Vec::with_capacity(styles.len() * 2);
-    for (keyword, start, remove_start, end, remove_end) in &styles {
-        if TELEGRAM_STYLES.iter().any(|&(k, _)| k == keyword) {
-            remove_tags.push((*start, *remove_start));
-            remove_tags.push((*end, *remove_end));
-        } else if keyword == "```>" {
-            remove_tags.push((*start, *remove_start));
-        }
-    }
-
-    remove_tags.sort_by(|a, b| b.0.cmp(&a.0));
-
-    for (index, end) in remove_tags {
-        chars = [chars[..index].to_vec(), chars[end..].to_vec()].concat();
-    }
-
-    let mut message_entities: Vec<(bool, usize, String, usize, String)> = Vec::with_capacity(styles.len() * 2);
-    let mut all_indexes: Vec<Vec<usize>> = Vec::with_capacity(styles.len());
-    for (keyword, start, remove_start, end, remove_end) in &styles {
-        if TELEGRAM_STYLES.iter().any(|&(k, _)| k == keyword) {
-            let language = if keyword == "```language" {
-                chars[start+3..remove_start-1]
-                .into_iter()
-                .collect::<String>()
-            } else {
-                "".to_string()
-            };
-            all_indexes.push(vec![*start, *remove_start, *end, *remove_end]);
-            let last_index = all_indexes.len() - 1;
-            message_entities.push((true, last_index, TELEGRAM_STYLES.iter().find(|&&(k, _)| k == keyword).unwrap().1.to_string(), *start, language));
-            message_entities.push((false, last_index, "".to_string(), *end, "".to_string()));
-        } else if keyword == "```>" {
-            all_indexes.push(vec![0, 0, *start, 1]);
-            message_entities.push((false, all_indexes.len() - 1, "".to_string(), *start, "".to_string()));
-        }
-    }
-    message_entities.sort_by(sort_message_entities);
-
-    let formatted_text = chars.into_iter().collect::<String>();
-    let utf16_lengths: Vec<usize> = utf8_to_utf16_length(&formatted_text);
-
-    let mut offset = 0;
-    for (is_start, index, _, _, _) in &message_entities {
-        let indexes = &mut all_indexes[*index];
-        if *is_start {
-            indexes[0] -= offset;
-            offset += indexes[1];
-        } else {
-            indexes[2] -= offset;
-            offset += indexes[3];
-        }
-    }
-    Ok((
-        formatted_text,
-        message_entities.into_iter()
-            .filter(|(is_start, _, _, _, _)| { *is_start } )
-            .map(|(_, index, format, _, language)| { (format, utf16_lengths[all_indexes[index][0]], utf16_lengths[all_indexes[index][2]], language) })
-            .collect()
-    ))
-}
-
-fn sort_message_entities(first: &(bool, usize, String, usize, String), second: &(bool, usize, String, usize, String)) -> Ordering {
-    return first.3.cmp(&second.3);
-}
-
-fn utf8_to_utf16_length(utf8_str: &str) -> Vec<usize> {
-    let mut utf16_lengths = Vec::with_capacity(utf8_str.len());
-
-    let mut length = 0;
-    for byte in utf8_str.as_bytes() {
-        if (byte & 0xc0) != 0x80 {
-            length += if *byte >= 0xf0 { 2 } else { 1 };
-        }
-        utf16_lengths.push(length);
-    }
-    utf16_lengths
-}
-
-fn remove_non_escaped_backslashes(text: String) -> String {
-    let tmp_string = text.replace("\\\\", PLACEHOLDER);
-    let tmp_string = tmp_string.replace("\\", "");
-    tmp_string.replace(PLACEHOLDER, "\\")
-}
-
-fn parse_with_limits(chars: &Vec<char>, start: usize, end: usize, depth: usize) -> Vec<(String, usize, usize, usize, usize)> {
-    let mut styles = Vec::new();
-    let mut index = start;
-    let end = end.min(chars.len() - 1);
-
-    while index <= end {
-        if preceeded_by_backslash(chars, index, start) {
-            index += 1;
-            continue;
-        }
-
-        let c = chars[index];
-        if QUOTE_KEYWORDS.contains(&c) {
-            if is_quote_start(chars, index, depth) {
-                let to = seek_end_of_quote(chars, index, end, depth);
-                styles.push((">".to_string(), index, index + 1, to, to));
-                styles.append(&mut parse_with_limits(chars, index + 1, to, depth + 1));
-                index = to;
-                continue;
-            }
-            if is_nested_quote(chars, index, depth) {
-                styles.push((">>".to_string(), index, index + 1, index + 1, index + 1));
-            }
-            index += 1;
-            continue;
-        }
-
-        if c == '`' && is_char_repeating(chars, c, 2, index + 1, end) {
-            let end_of_line = seek_end_of_line(chars, index + 1, end);
-            if end_of_line == end {
-                index += 3;
-                continue;
-            }
-            match seek_end_block(chars, c, end_of_line, end, depth) {
-                Some(to) => {
-                    println!("to {}", to);
-                    if to != index + 3 && is_quote_start(chars, index, depth) {
-                        let keyword = if end_of_line == index + 3 {
-                            "```".to_string()
-                        } else {
-                            "```language".to_string()
-                        };
-                        let remove_end = if depth > 0 && (to == end || to == chars.len()) {
-                            to
-                        } else {
-                            to + 4 + depth
-                        };
-                        styles.push((keyword, index, end_of_line + 1, to, remove_end));
-                        styles.append(&mut parse_quotes_in_code_block(chars, index + 3, to, depth));
-                        index = to;
-                    }
-                }
-                None => ()
-            }
-            index += 3;
-            continue;
-        }
-
-        if !preceeded_by_whitespace(chars, index, start) || followed_by_whitespace(chars, index, end) {
-            index += 1;
-            continue;
-        }
-
-        if c == '|' && is_char_repeating(chars, c, 1, index + 1, end) {
-            match seek_end(chars, c, index + 2, 1, end) {
-                Some(to) => {
-                    if to != index + 2 {
-                        let keyword = "||".to_string();
-                        styles.push((keyword, index, index + 2, to, to + 2));
-                        styles.append(&mut parse_with_limits(chars, index + 2, to - 1, depth));
-                    }
-                    index = to + 2;
-                    continue;
-                }
-                None => ()
-            }
-            index += 2;
-            continue;
-        }
+mod parser;
 
-        if !KEYWORDS.contains(&c) {
-            index += 1;
-            continue;
-        }
+mod telegram;
+use telegram::parse_for_telegram;
 
-        match seek_end(chars, c, index + 1, 0, end) {
-            Some (to) => {
-                if to != index + 1 {
-                    styles.push((c.to_string(), index, index + 1, to, to + 1));
-                    if !NO_SUB_PARSING_KEYWORDS.contains(&c) {
-                        styles.append(&mut parse_with_limits(chars, index + 1, to - 1, depth));
-                    }
-                }
-                index = to;
-            }
-            None => ()
-        }
-        index += 1;
-    }
-    styles
-}
-
-fn parse_quotes_in_code_block(chars: &Vec<char>, start: usize, end: usize, depth: usize) -> Vec<(String, usize, usize, usize, usize)> {
-    let mut quotes = Vec::new();
-    let mut index = start;
-    let end = end.min(chars.len() - 1);
-
-    if depth < 1 {
-        return quotes;
-    }
-
-    while index <= end {
-        let c = chars[index];
-        if QUOTE_KEYWORDS.contains(&c) {
-            if is_nested_quote(chars, index, depth) {
-                quotes.push(("```>".to_string(), index, index + 1, index + 1, index + 1));
-            }
-            index += 1;
-            continue;
-        }
-        index += 1;
-    }
-    quotes
-}
-
-fn is_nested_quote(chars: &Vec<char>, start: usize, depth: usize) -> bool {
-    let mut index = start;
-    let mut count = 0;
-
-    while index > 0 {
-        if chars[index] == '\n' {
-            return true;
-        }
-        if !QUOTE_KEYWORDS.contains(&chars[index]) {
-            return false;
-        }
-        count += 1;
-        if count > depth {
-            return false;
-        }
-        index -= 1;
-    }
-    true
-}
-
-fn is_char_repeating(chars: &Vec<char>, keyword: char, repetitions: usize, index: usize, end: usize) -> bool {
-    (0..repetitions as usize)
-        .all(|i| index + i <= end && chars[index + i] == keyword)
-}
-
-fn preceeded_by_whitespace(chars: &Vec<char>, index: usize, start: usize) -> bool {
-    index == start || chars[index - 1].is_whitespace()
-}
-
-fn followed_by_whitespace(chars: &Vec<char>, index: usize, end: usize) -> bool {
-    index >= end || chars[index + 1].is_whitespace()
-}
-
-fn seek_end(chars: &Vec<char>, keyword: char, start: usize, repetitions: usize, end: usize) -> Option<usize> {
-    for i in start..=end {
-        let c = chars[i];
-        if c == '\n' {
-            return None;
-        }
-        if c == keyword
-            && !chars[i - 1].is_whitespace()
-            && !preceeded_by_backslash(chars, i, start)
-            && is_char_repeating(chars, keyword, repetitions, i + 1, end)
-        {
-            match seek_higher_order_end(chars, c, i + 1, repetitions, end) {
-                Some(higher_order_i) => {
-                    return Some(higher_order_i);
-                }
-                None => {
-                    return Some(i);
-                }
-            }
-        }
-    }
-    None
-}
-
-fn seek_higher_order_end(chars: &Vec<char>, keyword: char, start: usize, repetitions: usize, end: usize) -> Option<usize> {
-    for i in start..=end {
-        let c = chars[i];
-        if c == '\n' {
-            return None;
-        }
-        if c == keyword
-            && chars[i - 1].is_whitespace()
-            && !followed_by_whitespace(chars, i, end)
-            && is_char_repeating(chars, keyword, repetitions, i + 1, end)
-        {
-            return None; // "*bold* *<--- beginning of new bold>*"
-        }
-        if c == keyword
-            && !chars[i - 1].is_whitespace()
-            && followed_by_whitespace(chars, i, end)
-            && !preceeded_by_backslash(chars, i, start)
-            && is_char_repeating(chars, keyword, repetitions, i + 1, end)
-        {
-            return Some(i);
-        }
-    }
-    None
-}
-
-fn seek_end_of_line(chars: &Vec<char>, start: usize, end: usize) -> usize {
-    chars[start..=end]
-        .iter()
-        .enumerate()
-        .find(|&(_, &c)| c == '\n')
-        .map_or(end + 1, |(i, _)| start + i)
-}
-
-fn seek_end_of_quote(chars: &Vec<char>, start: usize, end: usize, depth: usize) -> usize {
-    for i in start..=end {
-        if chars[i] == '\n' {
-            if i + 2 + depth > chars.len() {
-                return i;
-            }
-            if chars[i + 1..=i + 1 + depth].iter().any(|&c| !QUOTE_KEYWORDS.contains(&c)) {
-                return i;
-            }
-        }
-    }
-    end + 1
-}
-
-fn seek_end_block(chars: &Vec<char>, keyword: char, start: usize, end: usize, depth: usize) -> Option<usize> {
-    for i in start..=end {
-        if chars[i] == '\n' {
-            if i + depth == end && chars[i + 1..i + 1 + depth].iter().all(|&c| QUOTE_KEYWORDS.contains(&c)) {
-                continue;
-            }
-            if i + 1 + depth > end {
-                return Some(i);
-            }
-            if seek_end_of_line(chars, i + 1, end) == i + depth + 4
-                && chars[i + 1..i + 1 + depth].iter().all(|&c| QUOTE_KEYWORDS.contains(&c))
-                && chars[i + 1 + depth] == keyword
-                && is_char_repeating(chars, keyword, 2, i + 1 + depth, end)
-            {
-                return Some(i);
-            }
-        }
-    }
-    if end == chars.len() - 1 {
-        if depth == 0 {
-            return None;
-        }
-        return Some(chars.len());
-    }
-    Some(end)
-}
-
-fn is_quote_start(chars: &Vec<char>, index: usize, depth: usize) -> bool {
-    index - depth == 0 || chars[index - 1 - depth] == '\n'
-}
-
-fn preceeded_by_backslash(chars: &Vec<char>, index: usize, start: usize) -> bool {
-    if index == start {
-        return false;
-    }
-    let mut num_backslashes = 0;
-    while index > num_backslashes && chars[index - 1 - num_backslashes] == '\\' {
-        num_backslashes += 1;
-    }
-    num_backslashes % 2 == 1
-}
+mod general;
+use general::format_body;
 
 #[pymodule]
 fn slidge_style_parser(_py: Python, m: &PyModule) -> PyResult<()> {
diff --git a/src/parser.rs b/src/parser.rs
new file mode 100644
index 0000000..8e96a69
--- /dev/null
+++ b/src/parser.rs
@@ -0,0 +1,310 @@
+// Single-character markers that open and close an inline style.
+const KEYWORDS: [char; 4] = ['*', '_', '~', '`'];
+// Markers whose content is not searched for nested styles.
+const NO_SUB_PARSING_KEYWORDS: [char; 1] = ['`'];
+// Characters that mark a quoted line.
+const QUOTE_KEYWORDS: [char; 1] = ['>'];
+
+/// Scans `chars[start..=end]` for style markers (`end` is clamped to the last index).
+///
+/// Every match is returned as `(keyword, start, remove_start, end, remove_end)`:
+/// `start..remove_start` covers the opening marker, `end..remove_end` the closing one.
+/// `depth` is the quote nesting level of the scanned region. `chars` must not be
+/// empty, because `chars.len() - 1` is computed unconditionally.
+pub fn parse_with_limits(chars: &Vec<char>, start: usize, end: usize, depth: usize) -> Vec<(String, usize, usize, usize, usize)> {
+    let mut styles = Vec::new();
+    let mut index = start;
+    let end = end.min(chars.len() - 1);
+
+    while index <= end {
+        if preceeded_by_backslash(chars, index, start) {
+            index += 1;
+            continue;
+        }
+
+        let c = chars[index];
+        if QUOTE_KEYWORDS.contains(&c) {
+            if is_quote_start(chars, index, depth) {
+                let to = seek_end_of_quote(chars, index, end, depth);
+                styles.push((">".to_string(), index, index + 1, to, to));
+                styles.append(&mut parse_with_limits(chars, index + 1, to, depth + 1));
+                index = to;
+                continue;
+            }
+            if is_nested_quote(chars, index, depth) {
+                styles.push((">>".to_string(), index, index + 1, index + 1, index + 1));
+            }
+            index += 1;
+            continue;
+        }
+
+        // Fenced code block: three consecutive backticks.
+        if c == '`' && is_char_repeating(chars, c, 2, index + 1, end) {
+            let end_of_line = seek_end_of_line(chars, index + 1, end);
+            if end_of_line == end {
+                index += 3;
+                continue;
+            }
+            match seek_end_block(chars, c, end_of_line, end, depth) {
+                Some(to) => {
+                    if to != index + 3 && is_quote_start(chars, index, depth) {
+                        let keyword = if end_of_line == index + 3 {
+                            "```".to_string()
+                        } else {
+                            "```language".to_string()
+                        };
+                        let remove_end = if depth > 0 && (to == end || to == chars.len()) {
+                            to
+                        } else {
+                            to + 4 + depth
+                        };
+                        styles.push((keyword, index, end_of_line + 1, to, remove_end));
+                        styles.append(&mut parse_quotes_in_code_block(chars, index + 3, to, depth));
+                        index = to;
+                    }
+                }
+                None => ()
+            }
+            index += 3;
+            continue;
+        }
+
+        // Inline markers must follow whitespace (or the region start) and precede non-whitespace.
+        if !preceeded_by_whitespace(chars, index, start) || followed_by_whitespace(chars, index, end) {
+            index += 1;
+            continue;
+        }
+
+        // Double-pipe marker (`||`).
+        if c == '|' && is_char_repeating(chars, c, 1, index + 1, end) {
+            match seek_end(chars, c, index + 2, 1, end) {
+                Some(to) => {
+                    if to != index + 2 {
+                        let keyword = "||".to_string();
+                        styles.push((keyword, index, index + 2, to, to + 2));
+                        styles.append(&mut parse_with_limits(chars, index + 2, to - 1, depth));
+                    }
+                    index = to + 2;
+                    continue;
+                }
+                None => ()
+            }
+            index += 2;
+            continue;
+        }
+
+        if !KEYWORDS.contains(&c) {
+            index += 1;
+            continue;
+        }
+
+        match seek_end(chars, c, index + 1, 0, end) {
+            Some (to) => {
+                if to != index + 1 {
+                    styles.push((c.to_string(), index, index + 1, to, to + 1));
+                    if !NO_SUB_PARSING_KEYWORDS.contains(&c) {
+                        styles.append(&mut parse_with_limits(chars, index + 1, to - 1, depth));
+                    }
+                }
+                index = to;
+            }
+            None => ()
+        }
+        index += 1;
+    }
+    styles
+}
+
+/// Marks the quote characters found in `chars[start..=end]` with the "```>" keyword.
+/// Returns nothing at depth 0; otherwise one entry per quote char accepted by `is_nested_quote`.
+fn parse_quotes_in_code_block(chars: &Vec<char>, start: usize, end: usize, depth: usize) -> Vec<(String, usize, usize, usize, usize)> {
+    let mut quotes = Vec::new();
+    let mut index = start;
+    let end = end.min(chars.len() - 1);
+
+    if depth < 1 {
+        return quotes;
+    }
+
+    while index <= end {
+        let c = chars[index];
+        if QUOTE_KEYWORDS.contains(&c) {
+            if is_nested_quote(chars, index, depth) {
+                quotes.push(("```>".to_string(), index, index + 1, index + 1, index + 1));
+            }
+            index += 1;
+            continue;
+        }
+        index += 1;
+    }
+    quotes
+}
+
+/// Walks backwards from `start`: true when only quote characters (at most `depth`)
+/// lie between `start` and the preceding newline; index 0 itself is never inspected.
+fn is_nested_quote(chars: &Vec<char>, start: usize, depth: usize) -> bool {
+    let mut index = start;
+    let mut count = 0;
+
+    while index > 0 {
+        if chars[index] == '\n' {
+            return true;
+        }
+        if !QUOTE_KEYWORDS.contains(&chars[index]) {
+            return false;
+        }
+        count += 1;
+        if count > depth {
+            return false;
+        }
+        index -= 1;
+    }
+    true
+}
+
+/// True when the `repetitions` chars starting at `index` all equal `keyword` and none
+/// lies past `end`; trivially true for zero repetitions.
+fn is_char_repeating(chars: &Vec<char>, keyword: char, repetitions: usize, index: usize, end: usize) -> bool {
+    (0..repetitions as usize)
+        .all(|i| index + i <= end && chars[index + i] == keyword)
+}
+
+/// True at the start of the scanned region or when the previous char is whitespace.
+fn preceeded_by_whitespace(chars: &Vec<char>, index: usize, start: usize) -> bool {
+    index == start || chars[index - 1].is_whitespace()
+}
+
+/// True at or past `end`, or when the next char is whitespace.
+fn followed_by_whitespace(chars: &Vec<char>, index: usize, end: usize) -> bool {
+    index >= end || chars[index + 1].is_whitespace()
+}
+
+/// Finds the closing `keyword` for a marker, scanning `start..=end` on the current line.
+/// `repetitions` is how many further `keyword` chars must follow it. A later closer found
+/// by `seek_higher_order_end` takes precedence; `None` if a newline comes first.
+fn seek_end(chars: &Vec<char>, keyword: char, start: usize, repetitions: usize, end: usize) -> Option<usize> {
+    for i in start..=end {
+        let c = chars[i];
+        if c == '\n' {
+            return None;
+        }
+        if c == keyword
+            && !chars[i - 1].is_whitespace()
+            && !preceeded_by_backslash(chars, i, start)
+            && is_char_repeating(chars, keyword, repetitions, i + 1, end)
+        {
+            match seek_higher_order_end(chars, c, i + 1, repetitions, end) {
+                Some(higher_order_i) => {
+                    return Some(higher_order_i);
+                }
+                None => {
+                    return Some(i);
+                }
+            }
+        }
+    }
+    None
+}
+
+/// Looks further along the line for a closer that is followed by whitespace (or `end`).
+/// Gives up (`None`) at a newline or as soon as a new opening marker is seen.
+fn seek_higher_order_end(chars: &Vec<char>, keyword: char, start: usize, repetitions: usize, end: usize) -> Option<usize> {
+    for i in start..=end {
+        let c = chars[i];
+        if c == '\n' {
+            return None;
+        }
+        if c == keyword
+            && chars[i - 1].is_whitespace()
+            && !followed_by_whitespace(chars, i, end)
+            && is_char_repeating(chars, keyword, repetitions, i + 1, end)
+        {
+            return None; // "*bold* *<--- beginning of new bold>*"
+        }
+        if c == keyword
+            && !chars[i - 1].is_whitespace()
+            && followed_by_whitespace(chars, i, end)
+            && !preceeded_by_backslash(chars, i, start)
+            && is_char_repeating(chars, keyword, repetitions, i + 1, end)
+        {
+            return Some(i);
+        }
+    }
+    None
+}
+
+/// Index of the first newline in `chars[start..=end]`, or `end + 1` if there is none.
+fn seek_end_of_line(chars: &Vec<char>, start: usize, end: usize) -> usize {
+    chars[start..=end]
+        .iter()
+        .enumerate()
+        .find(|&(_, &c)| c == '\n')
+        .map_or(end + 1, |(i, _)| start + i)
+}
+
+/// Index of the first newline in `start..=end` that is not followed by `depth + 1` quote
+/// characters (or is too close to the end of `chars`); `end + 1` if the quote never ends.
+fn seek_end_of_quote(chars: &Vec<char>, start: usize, end: usize, depth: usize) -> usize {
+    for i in start..=end {
+        if chars[i] == '\n' {
+            if i + 2 + depth > chars.len() {
+                return i;
+            }
+            if chars[i + 1..=i + 1 + depth].iter().any(|&c| !QUOTE_KEYWORDS.contains(&c)) {
+                return i;
+            }
+        }
+    }
+    end + 1
+}
+
+/// Finds the newline that ends a fenced code block, scanning `start..=end`.
+///
+/// A newline qualifies when fewer than `depth + 1` chars remain after it within `end` (a
+/// trailing line holding only the quote prefix is skipped), or when the next line is the
+/// `depth`-char quote prefix plus a three-char fence (only its first two chars are compared).
+/// Otherwise: `None` at depth 0 or `Some(chars.len())` when `end` is the last index, else `Some(end)`.
+fn seek_end_block(chars: &Vec<char>, keyword: char, start: usize, end: usize, depth: usize) -> Option<usize> {
+    for i in start..=end {
+        if chars[i] == '\n' {
+            if i + depth == end && chars[i + 1..i + 1 + depth].iter().all(|&c| QUOTE_KEYWORDS.contains(&c)) {
+                continue;
+            }
+            if i + 1 + depth > end {
+                return Some(i);
+            }
+            if seek_end_of_line(chars, i + 1, end) == i + depth + 4
+                && chars[i + 1..i + 1 + depth].iter().all(|&c| QUOTE_KEYWORDS.contains(&c))
+                && chars[i + 1 + depth] == keyword
+                && is_char_repeating(chars, keyword, 2, i + 1 + depth, end)
+            {
+                return Some(i);
+            }
+        }
+    }
+    if end == chars.len() - 1 {
+        if depth == 0 {
+            return None;
+        }
+        return Some(chars.len());
+    }
+    Some(end)
+}
+
+/// True when `index` lies exactly `depth` chars after the start of `chars` or after a newline.
+fn is_quote_start(chars: &Vec<char>, index: usize, depth: usize) -> bool {
+    index - depth == 0 || chars[index - 1 - depth] == '\n'
+}
+
+/// True when an odd number of consecutive backslashes directly precedes `index`;
+/// always false when `index == start`.
+fn preceeded_by_backslash(chars: &Vec<char>, index: usize, start: usize) -> bool {
+    if index == start {
+        return false;
+    }
+    let mut num_backslashes = 0;
+    while index > num_backslashes && chars[index - 1 - num_backslashes] == '\\' {
+        num_backslashes += 1;
+    }
+    num_backslashes % 2 == 1
+}
diff --git a/src/telegram.rs b/src/telegram.rs
new file mode 100644
index 0000000..ac6eeb3
--- /dev/null
+++ b/src/telegram.rs
@@ -0,0 +1,116 @@
+use std::cmp::Ordering;
+
+use pyo3::prelude::*;
+
+use crate::parser::parse_with_limits;
+
+// Maps a parser keyword to the style name handed back to the caller.
+const TELEGRAM_STYLES: &[(&'static str, &'static str)] = &[
+    ("_", "italics"),
+    ("*", "bold"),
+    ("~", "strikethrough"),
+    ("||", "spoiler"),
+    ("`", "code"),
+    ("```", "pre")
+];
+
+/// Strips the style markers from `body` and reports where each style applied.
+///
+/// Returns the stripped text and one `(style name, start, end, language)` tuple per
+/// recognised style; `start` and `end` are read from the table built by
+/// `utf8_to_utf16_length`.
+// NOTE(review): "```language" is not listed in TELEGRAM_STYLES, so the language branch
+// below looks unreachable and `language` is always empty -- confirm this is intended.
+// NOTE(review): `offset` adds the stored indexes themselves (`indexes[1]`, `indexes[3]`),
+// and the UTF-16 table has one entry per byte while the indexes count chars -- verify
+// the reported positions against the Python caller before relying on them.
+#[pyfunction]
+pub fn parse_for_telegram(body: String) -> PyResult<(String, Vec<(String, usize, usize, String)>)> {
+    let mut chars: Vec<char> = body.chars().collect();
+    if chars.is_empty() {
+        return Ok((body, Vec::new()));
+    }
+
+    let styles: Vec<(String, usize, usize, usize, usize)> = parse_with_limits(&chars, 0, chars.len() - 1, 0);
+    // (first char to drop, one past the last char to drop) for every marker.
+    let mut remove_tags: Vec<(usize, usize)> = Vec::with_capacity(styles.len() * 2);
+    for (keyword, start, remove_start, end, remove_end) in &styles {
+        if TELEGRAM_STYLES.iter().any(|&(k, _)| k == keyword) {
+            remove_tags.push((*start, *remove_start));
+            remove_tags.push((*end, *remove_end));
+        } else if keyword == "```>" {
+            remove_tags.push((*start, *remove_start));
+        }
+    }
+
+    // Delete from the back so the indexes of earlier markers stay valid.
+    remove_tags.sort_by(|a, b| b.0.cmp(&a.0));
+
+    for (index, end) in remove_tags {
+        chars.drain(index..end);
+    }
+
+    // Entity boundaries: (is_start, slot in `all_indexes`, style name, original index, language).
+    let mut message_entities: Vec<(bool, usize, String, usize, String)> = Vec::with_capacity(styles.len() * 2);
+    let mut all_indexes: Vec<Vec<usize>> = Vec::with_capacity(styles.len());
+    for (keyword, start, remove_start, end, remove_end) in &styles {
+        if TELEGRAM_STYLES.iter().any(|&(k, _)| k == keyword) {
+            let language = if keyword == "```language" {
+                chars[start + 3..remove_start - 1].iter().collect::<String>()
+            } else {
+                String::new()
+            };
+            all_indexes.push(vec![*start, *remove_start, *end, *remove_end]);
+            let last_index = all_indexes.len() - 1;
+            message_entities.push((true, last_index, TELEGRAM_STYLES.iter().find(|&&(k, _)| k == keyword).unwrap().1.to_string(), *start, language));
+            message_entities.push((false, last_index, "".to_string(), *end, "".to_string()));
+        } else if keyword == "```>" {
+            all_indexes.push(vec![0, 0, *start, 1]);
+            message_entities.push((false, all_indexes.len() - 1, "".to_string(), *start, "".to_string()));
+        }
+    }
+    message_entities.sort_by(sort_message_entities);
+
+    let formatted_text = chars.into_iter().collect::<String>();
+    let utf16_lengths: Vec<usize> = utf8_to_utf16_length(&formatted_text);
+
+    let mut offset = 0;
+    for (is_start, index, _, _, _) in &message_entities {
+        let indexes = &mut all_indexes[*index];
+        if *is_start {
+            indexes[0] -= offset;
+            offset += indexes[1];
+        } else {
+            indexes[2] -= offset;
+            offset += indexes[3];
+        }
+    }
+    Ok((
+        formatted_text,
+        message_entities.into_iter()
+            .filter(|(is_start, _, _, _, _)| { *is_start } )
+            .map(|(_, index, format, _, language)| { (format, utf16_lengths[all_indexes[index][0]], utf16_lengths[all_indexes[index][2]], language) })
+            .collect()
+    ))
+}
+
+/// Orders entity boundaries by their position in the original text.
+fn sort_message_entities(first: &(bool, usize, String, usize, String), second: &(bool, usize, String, usize, String)) -> Ordering {
+    first.3.cmp(&second.3)
+}
+
+/// Builds a table with one entry per byte of `utf8_str`: the number of UTF-16 code units
+/// needed for every char that starts at or before that byte.
+fn utf8_to_utf16_length(utf8_str: &str) -> Vec<usize> {
+    let mut utf16_lengths = Vec::with_capacity(utf8_str.len());
+
+    let mut length = 0;
+    for byte in utf8_str.as_bytes() {
+        // A non-continuation byte starts a new char; lead bytes >= 0xf0 need a surrogate pair.
+        if (byte & 0xc0) != 0x80 {
+            length += if *byte >= 0xf0 { 2 } else { 1 };
+        }
+        utf16_lengths.push(length);
+    }
+    utf16_lengths
+}
author	SavagePeanut <sourcehut@lazytapir.com>	2023-09-03 15:31:17 -0500
committer	SavagePeanut <sourcehut@lazytapir.com>	2023-09-03 15:31:17 -0500
commit	651ab7c90f436de60035a138e91561b1848b5725 (patch)
tree	95e021d0b90e9f6bedcf4169ab1e08db600f4ac4
parent	13043d6ec8efe62d13336745a0f650ff3e899648 (diff)