// SPDX-FileCopyrightText: 2023 James Graham // SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL #include "texthandler.h" #include #include #include #include static const QStringList allowedTags = { QStringLiteral("font"), QStringLiteral("del"), QStringLiteral("h1"), QStringLiteral("h2"), QStringLiteral("h3"), QStringLiteral("h4"), QStringLiteral("h5"), QStringLiteral("h6"), QStringLiteral("blockquote"), QStringLiteral("p"), QStringLiteral("a"), QStringLiteral("ul"), QStringLiteral("ol"), QStringLiteral("sup"), QStringLiteral("sub"), QStringLiteral("li"), QStringLiteral("b"), QStringLiteral("i"), QStringLiteral("u"), QStringLiteral("strong"), QStringLiteral("em"), QStringLiteral("strike"), QStringLiteral("code"), QStringLiteral("hr"), QStringLiteral("br"), QStringLiteral("div"), QStringLiteral("table"), QStringLiteral("thead"), QStringLiteral("tbody"), QStringLiteral("tr"), QStringLiteral("th"), QStringLiteral("td"), QStringLiteral("caption"), QStringLiteral("pre"), QStringLiteral("span"), QStringLiteral("img"), QStringLiteral("details"), QStringLiteral("summary")}; static const QHash allowedAttributes = { {QStringLiteral("font"), {QStringLiteral("data-mx-bg-color"), QStringLiteral("data-mx-color"), QStringLiteral("color")}}, {QStringLiteral("span"), {QStringLiteral("data-mx-bg-color"), QStringLiteral("data-mx-color"), QStringLiteral("data-mx-spoiler")}}, {QStringLiteral("a"), {QStringLiteral("name"), QStringLiteral("target"), QStringLiteral("href")}}, {QStringLiteral("img"), {QStringLiteral("width"), QStringLiteral("height"), QStringLiteral("alt"), QStringLiteral("title"), QStringLiteral("src")}}, {QStringLiteral("ol"), {QStringLiteral("start")}}, {QStringLiteral("code"), {QStringLiteral("class")}}}; static const QStringList allowedLinkSchemes = {QStringLiteral("https"), QStringLiteral("http"), QStringLiteral("ftp"), QStringLiteral("mailto"), QStringLiteral("magnet")}; QString TextHandler::data() const { return m_data; } void TextHandler::setData(const QString &string) { m_data = string; m_pos = 0; } QString TextHandler::handleSendText() { m_pos = 0; m_dataBuffer = markdownToHTML(m_data); nextTokenType(); // Strip any disallowed tags/attributes. QString outputString; while (m_pos < m_dataBuffer.length()) { next(); QString nextTokenBuffer = m_nextToken; if (m_nextTokenType == Type::Text || m_nextTokenType == Type::TextCode) { nextTokenBuffer = escapeHtml(nextTokenBuffer); } else if (m_nextTokenType == Type::Tag) { if (!isAllowedTag(getTagType())) { nextTokenBuffer = QString(); } nextTokenBuffer = cleanAttributes(getTagType(), nextTokenBuffer); } outputString.append(nextTokenBuffer); nextTokenType(); } return outputString; } QString TextHandler::handleRecieveRichText(Qt::TextFormat inputFormat, const NeoChatRoom *room, const Quotient::RoomEvent *event, bool stripNewlines) { m_pos = 0; m_dataBuffer = m_data; // Strip mx-reply if present. m_dataBuffer.remove(TextRegex::removeRichReply); // For plain text, convert links, escape html and convert line brakes. if (inputFormat == Qt::PlainText) { m_dataBuffer = escapeHtml(m_dataBuffer); m_dataBuffer.replace(u'\n', QStringLiteral("
")); } // Linkify any plain text urls m_dataBuffer = linkifyUrls(m_dataBuffer); // Apply user style m_dataBuffer.replace(TextRegex::userPill, QStringLiteral(R"(\1)")); // Make all media URLs resolvable. if (room && event) { QRegularExpressionMatchIterator i = TextRegex::mxcImage.globalMatch(m_dataBuffer); while (i.hasNext()) { const QRegularExpressionMatch match = i.next(); #ifdef QUOTIENT_07 const QUrl mediaUrl = room->makeMediaUrl(event->id(), QUrl(QStringLiteral("mxc://") + match.captured(2) + u'/' + match.captured(3))); m_dataBuffer.replace(match.captured(0), QStringLiteral("'); #else auto url = room->connection()->homeserver(); auto base = url.scheme() + QStringLiteral("://") + url.host() + (url.port() != -1 ? ':' + QString::number(url.port()) : QString()); m_dataBuffer.replace(match.captured(0), QStringLiteral("'); #endif } } // Strip any disallowed tags/attributes. QString outputString; nextTokenType(); while (m_pos < m_dataBuffer.length()) { next(); QString nextTokenBuffer = m_nextToken; if (m_nextTokenType == Type::Text || m_nextTokenType == Type::TextCode) { nextTokenBuffer = escapeHtml(nextTokenBuffer); } else if (m_nextTokenType == Type::Tag) { if (!isAllowedTag(getTagType())) { nextTokenBuffer = QString(); } else if ((getTagType() == QStringLiteral("br") && stripNewlines)) { nextTokenBuffer = u' '; } nextTokenBuffer = cleanAttributes(getTagType(), nextTokenBuffer); } outputString.append(nextTokenBuffer); nextTokenType(); } /** * Replace with * Note: is still not a valid tag for the message from the server. We * convert as that is what is needed for Qt::RichText. */ outputString.replace(TextRegex::strikethrough, QStringLiteral("\\1")); return outputString; } QString TextHandler::handleRecievePlainText(Qt::TextFormat inputFormat, const bool &stripNewlines) { m_pos = 0; m_dataBuffer = m_data; // Strip mx-reply if present. m_dataBuffer.remove(TextRegex::removeRichReply); // Escaping then unescaping allows < and > to be maintained in a plain text string // otherwise markdownToHTML will strip what it thinks is a bad html tag entirely. if (inputFormat == Qt::PlainText) { m_dataBuffer = escapeHtml(m_dataBuffer); } /** * This seems counterproductive but by converting any markup which could * arrive (e.g. in a caption body) it can then be stripped by the same code. */ m_dataBuffer = markdownToHTML(m_dataBuffer); if (stripNewlines) { m_dataBuffer.replace(QStringLiteral("
\n"), QStringLiteral(" ")); m_dataBuffer.replace(QStringLiteral("
"), QStringLiteral(" ")); m_dataBuffer.replace(QStringLiteral("
\n"), QStringLiteral(" ")); m_dataBuffer.replace(QStringLiteral("
"), QStringLiteral(" ")); m_dataBuffer.replace(u'\n', QStringLiteral(" ")); m_dataBuffer.replace(u'\u2028', " "); } // Strip all tags/attributes except code blocks which will be escaped. QString outputString; nextTokenType(); while (m_pos < m_dataBuffer.length()) { next(); QString nextTokenBuffer = m_nextToken; if (m_nextTokenType == Type::TextCode) { nextTokenBuffer = unescapeHtml(nextTokenBuffer); } else if (m_nextTokenType == Type::Tag) { nextTokenBuffer = QString(); } outputString.append(nextTokenBuffer); nextTokenType(); } // Escaping then unescaping allows < and > to be maintained in a plain text string // otherwise markdownToHTML will strip what it thinks is a bad html tag entirely. if (inputFormat == Qt::PlainText) { outputString = unescapeHtml(outputString); } outputString = outputString.trimmed(); return outputString; } void TextHandler::next() { QString searchStr; if (m_nextTokenType == Type::Tag) { searchStr = u'>'; } else if (m_nextTokenType == Type::TextCode) { // Anything between code tags is assumed to be plain text searchStr = QStringLiteral(""); } else { searchStr = u'<'; } int tokenEnd = m_dataBuffer.indexOf(searchStr, m_pos + 1); if (tokenEnd == -1) { tokenEnd = m_dataBuffer.length(); } m_nextToken = m_dataBuffer.mid(m_pos, tokenEnd - m_pos + (m_nextTokenType == Type::Tag ? 1 : 0)); m_pos = tokenEnd + (m_nextTokenType == Type::Tag ? 1 : 0); } void TextHandler::nextTokenType() { if (m_pos >= m_dataBuffer.length()) { // This is to stop the function accessing an index outside the length of // m_dataBuffer during the final loop. m_nextTokenType = Type::End; } else if (m_nextTokenType == Type::Tag && getTagType() == QStringLiteral("code") && !isCloseTag() && m_dataBuffer.indexOf(QStringLiteral(""), m_pos) != m_pos) { m_nextTokenType = Type::TextCode; } else if (m_dataBuffer[m_pos] == u'<' && m_dataBuffer[m_pos + 1] != u' ') { m_nextTokenType = Type::Tag; } else { m_nextTokenType = Type::Text; } } QString TextHandler::getTagType() const { const int tagTypeStart = m_nextToken[1] == u'/' ? 2 : 1; const int tagTypeEnd = m_nextToken.indexOf(TextRegex::endTagType, tagTypeStart); return m_nextToken.mid(tagTypeStart, tagTypeEnd - tagTypeStart); } bool TextHandler::isCloseTag() const { return m_nextToken[1] == u'/'; } QString TextHandler::getAttributeType(const QString &string) { if (!string.contains(u'=')) { return string; } const int equalsPos = string.indexOf(u'='); return string.left(equalsPos); } QString TextHandler::getAttributeData(const QString &string) { if (!string.contains(u'=')) { return QStringLiteral(); } const int equalsPos = string.indexOf(u'='); return string.right(string.length() - equalsPos - 1); } bool TextHandler::isAllowedTag(const QString &type) { return allowedTags.contains(type); } bool TextHandler::isAllowedAttribute(const QString &tag, const QString &attribute) { return allowedAttributes[tag].contains(attribute); } bool TextHandler::isAllowedLink(const QString &link, bool isImg) { const QUrl linkUrl = QUrl(link); if (isImg) { #ifdef QUOTIENT_07 return !linkUrl.isRelative() && linkUrl.scheme() == "mxc"; #else return !linkUrl.isRelative() && (linkUrl.scheme() == "mxc" || linkUrl.scheme() == "https"); #endif } else { return !linkUrl.isRelative() && allowedLinkSchemes.contains(linkUrl.scheme()); } } QString TextHandler::cleanAttributes(const QString &tag, const QString &tagString) { int nextAttributeIndex = tagString.indexOf(u' ', 1); if (nextAttributeIndex != -1) { QString outputString = tagString.left(nextAttributeIndex); QString nextAttribute; int nextSpaceIndex; nextAttributeIndex += 1; while (nextAttributeIndex < tagString.length()) { nextSpaceIndex = tagString.indexOf(TextRegex::endTagType, nextAttributeIndex); if (nextSpaceIndex == -1) { nextSpaceIndex = tagString.length(); } nextAttribute = tagString.mid(nextAttributeIndex, nextSpaceIndex - nextAttributeIndex); if (isAllowedAttribute(tag, getAttributeType(nextAttribute))) { if (tag == QStringLiteral("img") && getAttributeType(nextAttribute) == QStringLiteral("src")) { QString attributeData = TextRegex::attributeData.match(getAttributeData(nextAttribute)).captured(1); if (isAllowedLink(attributeData, true)) { outputString.append(u' ' + nextAttribute); } } else if (tag == u'a' && getAttributeType(nextAttribute) == QStringLiteral("href")) { QString attributeData = TextRegex::attributeData.match(getAttributeData(nextAttribute)).captured(1); if (isAllowedLink(attributeData)) { outputString.append(u' ' + nextAttribute); } } else if (tag == QStringLiteral("code") && getAttributeType(nextAttribute) == QStringLiteral("class")) { if (getAttributeData(nextAttribute).remove(u'"').startsWith(QStringLiteral("language-"))) { outputString.append(u' ' + nextAttribute); } } else { outputString.append(u' ' + nextAttribute); } } nextAttributeIndex = nextSpaceIndex + 1; } outputString += u'>'; return outputString; } return tagString; } QString TextHandler::markdownToHTML(const QString &markdown) { const auto str = markdown.toUtf8(); char *tmp_buf = cmark_markdown_to_html(str.constData(), str.size(), CMARK_OPT_HARDBREAKS | CMARK_OPT_UNSAFE); const std::string html(tmp_buf); free(tmp_buf); auto result = QString::fromStdString(html).trimmed(); result.replace(QStringLiteral(""), QString()); return result; } /** * TODO: make this more intelligent currently other characters are not escaped * especially & as this can conflict with the cmark markdown to html conversion * which already escapes characters in code blocks. The < > still need to be handled * when the user manually types in the html. */ QString TextHandler::escapeHtml(QString stringIn) { stringIn.replace(u'<', QStringLiteral("<")); stringIn.replace(u'>', QStringLiteral(">")); return stringIn; } QString TextHandler::unescapeHtml(QString stringIn) { // For those situations where brackets in code block get double escaped stringIn.replace(QStringLiteral("&lt;"), QStringLiteral("<")); stringIn.replace(QStringLiteral("&gt;"), QStringLiteral(">")); stringIn.replace(QStringLiteral("<"), QStringLiteral("<")); stringIn.replace(QStringLiteral(">"), QStringLiteral(">")); stringIn.replace(QStringLiteral("&"), QStringLiteral("&")); stringIn.replace(QStringLiteral("""), QStringLiteral("\"")); return stringIn; } QString TextHandler::linkifyUrls(QString stringIn) { stringIn = stringIn.replace(TextRegex::mxId, QStringLiteral(R"(\1\2)")); stringIn.replace(TextRegex::fullUrl, QStringLiteral(R"(\1)")); stringIn = stringIn.replace(TextRegex::emailAddress, QStringLiteral(R"(\1\2)")); return stringIn; }