Cherrypick Improve Text Handling to 23.04

Improve the handling of text both when sending and receiving.

The main feature is to fix the linked bug (and a host of others that are unreported but similar) which is caused by the fact that we don't properly clean html. This mr does that as per the matrix spec https://spec.matrix.org/v1.5/client-server-api/#mroommessage-msgtypes. So any disallowed tags or attributes are removed and it does the special handling for certain attributes.

Additionally, the functions are designed to cover any other text formatting required, particularly for received strings.

The receive side is covered by 2 functions `handleRecieveRichText` and `handleRecievePlainText`. The rich/plain in the function name refers to the output type not the input type (both can take plain and rich input), so `handleRecieveRichText` is called to get a string suitable to go in a rich text control and `handleRecievePlainText` for a plain control.

The functions also handle the following, some of which was previously handled by `eventToString` in `NeoChatRoom`:
- Strip any reply from the string
- Format any user mentions
- Linkify links in plain strings
- Handle mxc urls in rich text (uses the new `room->makeMediaUrl` functionality from libQuotient)
- `handleRecievePlainText` also deals with markup making `NeoChatRoom->subtitle` redundant

There is also an extensive test suite which defines the behaviour and the best way to review this is probably to look at the tests and decide whether you agree with the expected output given the inputs and/or if there is any missing behaviour.

The final aim especially with the test suite is to give us a framework to make further updates in the future easier and hopefully prevent a new feature breaking old behaviour with the tests.

BUG: 463932 \
BUG: 466330 \
BUG: 466930


(cherry picked from commit f6ba4f2ecd)
This commit is contained in:
James Graham
2023-03-13 18:18:17 +00:00
parent 498cfedfea
commit da1c664f94
16 changed files with 1041 additions and 148 deletions

View File

@@ -24,7 +24,6 @@ add_library(neochat STATIC
models/publicroomlistmodel.cpp
models/userdirectorylistmodel.cpp
models/keywordnotificationrulemodel.cpp
utils.cpp
notificationsmanager.cpp
models/sortfilterroomlistmodel.cpp
chatdocumenthandler.cpp
@@ -47,6 +46,7 @@ add_library(neochat STATIC
models/statemodel.cpp
filetransferpseudojob.cpp
models/searchmodel.cpp
texthandler.cpp
)
add_executable(neochat-app

View File

@@ -20,25 +20,10 @@
#include "neochatroom.h"
#include "neochatuser.h"
#include "roommanager.h"
#include "texthandler.h"
using namespace Quotient;
QString markdownToHTML(const QString &markdown)
{
const auto str = markdown.toUtf8();
char *tmp_buf = cmark_markdown_to_html(str.constData(), str.size(), CMARK_OPT_HARDBREAKS | CMARK_OPT_UNSAFE);
const std::string html(tmp_buf);
free(tmp_buf);
auto result = QString::fromStdString(html).trimmed();
result.replace("<!-- raw HTML omitted -->", "");
return result;
}
ActionsHandler::ActionsHandler(QObject *parent)
: QObject(parent)
{
@@ -169,7 +154,10 @@ void ActionsHandler::handleMessage(const QString &text, QString handledText, con
}
handledText = CustomEmojiModel::instance().preprocessText(handledText);
handledText = markdownToHTML(handledText);
TextHandler textHandler;
textHandler.setData(handledText);
handledText = textHandler.handleSendText();
if (handledText.count("<p>") == 1 && handledText.count("</p>") == 1) {
handledText.remove("<p>");
handledText.remove("</p>");

View File

@@ -50,5 +50,3 @@ private:
QString handleMentions(QString handledText, const bool &isEdit = false);
void handleMessage(const QString &text, QString handledText, const bool &isEdit = false);
};
QString markdownToHTML(const QString &markdown);

View File

@@ -27,7 +27,6 @@
#include <KLocalizedString>
#include "neochatuser.h"
#include "utils.h"
using namespace Quotient;

View File

@@ -417,7 +417,7 @@ QVariant RoomListModel::data(const QModelIndex &index, int role) const
return m_categoryVisibility.value(data(index, CategoryRole).toInt(), true);
}
if (role == SubtitleTextRole) {
return room->subtitleText();
return room->lastEventToString(Qt::PlainText, true);
}
if (role == AvatarImageRole) {
return room->avatar(128);

View File

@@ -47,7 +47,7 @@
#endif
#include "filetransferpseudojob.h"
#include "stickerevent.h"
#include "utils.h"
#include "texthandler.h"
#ifndef Q_OS_ANDROID
#include <KIO/Job>
@@ -257,10 +257,11 @@ bool NeoChatRoom::lastEventIsSpoiler() const
return false;
}
QString NeoChatRoom::lastEventToString() const
QString NeoChatRoom::lastEventToString(Qt::TextFormat format, bool stripNewlines) const
{
if (auto event = lastEvent()) {
return roomMembername(event->senderId()) + (event->isStateEvent() ? " " : ": ") + eventToString(*event);
return roomMembername(event->senderId()) + (event->isStateEvent() ? QLatin1String(" ") : QLatin1String(": "))
+ eventToString(*event, format, stripNewlines);
}
return QLatin1String("");
}
@@ -329,45 +330,6 @@ QDateTime NeoChatRoom::lastActiveTime()
return messageEvents().rbegin()->get()->originTimestamp();
}
QString NeoChatRoom::subtitleText()
{
static const QRegularExpression blockquote("(\r\n\t|\n|\r\t|)> ");
static const QRegularExpression heading("(\r\n\t|\n|\r\t|)\\#{1,6} ");
static const QRegularExpression newlines("(\r\n\t|\n|\r\t|\r\n)");
static const QRegularExpression bold1("(\\*\\*|__)(?=\\S)([^\\r]*\\S)\\1");
static const QRegularExpression bold2("(\\*|_)(?=\\S)([^\\r]*\\S)\\1");
static const QRegularExpression strike1("~~(.*)~~");
static const QRegularExpression strike2("~(.*)~");
static const QRegularExpression del("<del>(.*)</del>");
static const QRegularExpression multileLineCode("```([^```]+)```");
static const QRegularExpression singleLinecode("`([^`]+)`");
QString subtitle = lastEventToString().size() == 0 ? topic() : lastEventToString();
subtitle
// replace blockquote, i.e. '> text'
.replace(blockquote, " ")
// replace headings, i.e. "# text"
.replace(heading, " ")
// replace newlines
.replace(newlines, " ")
// replace '**text**' and '__text__'
.replace(bold1, "\\2")
// replace '*text*' and '_text_'
.replace(bold2, "\\2")
// replace '~~text~~'
.replace(strike1, "\\1")
// replace '~text~'
.replace(strike2, "\\1")
// replace '<del>text</del>'
.replace(del, "\\1")
// replace '```code```'
.replace(multileLineCode, "\\1")
// replace '`code`'
.replace(singleLinecode, "\\1");
return subtitle.size() > 0 ? subtitle : QStringLiteral(" ");
}
int NeoChatRoom::savedTopVisibleIndex() const
{
return firstDisplayedMarker() == historyEdge() ? 0 : int(firstDisplayedMarker() - messageEvents().rbegin());
@@ -451,7 +413,7 @@ QString NeoChatRoom::avatarMediaId() const
return {};
}
QString NeoChatRoom::eventToString(const RoomEvent &evt, Qt::TextFormat format, bool removeReply) const
QString NeoChatRoom::eventToString(const RoomEvent &evt, Qt::TextFormat format, bool stripNewlines) const
{
const bool prettyPrint = (format == Qt::RichText);
@@ -462,53 +424,43 @@ QString NeoChatRoom::eventToString(const RoomEvent &evt, Qt::TextFormat format,
return visit(
#endif
evt,
[this, prettyPrint, removeReply](const RoomMessageEvent &e) {
[this, format, stripNewlines](const RoomMessageEvent &e) {
using namespace MessageEventContent;
// 1. prettyPrint/HTML
if (prettyPrint && e.hasTextContent() && e.mimeType().name() != "text/plain") {
auto htmlBody = static_cast<const TextContent *>(e.content())->body;
if (removeReply) {
htmlBody.remove(utils::removeRichReplyRegex);
}
htmlBody.replace(utils::userPillRegExp, R"(<b class="user-pill">\1</b>)");
htmlBody.replace(utils::strikethroughRegExp, "<s>\\1</s>");
auto url = connection()->homeserver();
auto base = url.scheme() + QStringLiteral("://") + url.host() + (url.port() != -1 ? ':' + QString::number(url.port()) : QString());
htmlBody.replace(utils::mxcImageRegExp, QStringLiteral(R"(<img \1 src="%1/_matrix/media/r0/download/\2/\3" \4 > )").arg(base));
return htmlBody;
}
TextHandler textHandler;
if (e.hasFileContent()) {
auto fileCaption = e.content()->fileInfo()->originalName.toHtmlEscaped();
auto fileCaption = e.content()->fileInfo()->originalName;
if (fileCaption.isEmpty()) {
fileCaption = prettyPrint ? Quotient::prettyPrint(e.plainBody()) : e.plainBody();
fileCaption = e.plainBody();
} else if (e.content()->fileInfo()->originalName != e.plainBody()) {
fileCaption = e.plainBody() + " | " + fileCaption;
}
return !fileCaption.isEmpty() ? fileCaption : i18n("a file");
textHandler.setData(fileCaption);
return !fileCaption.isEmpty() ? textHandler.handleRecievePlainText() : i18n("a file");
}
// 2. prettyPrint/text 3. plainText/HTML 4. plainText/text
QString plainBody;
if (e.hasTextContent() && e.content() && e.mimeType().name() == "text/plain") { // 2/4
plainBody = static_cast<const TextContent *>(e.content())->body;
} else { // 3
plainBody = e.plainBody();
QString body;
if (e.hasTextContent() && e.content()) {
body = static_cast<const TextContent *>(e.content())->body;
} else {
body = e.plainBody();
}
if (prettyPrint) {
if (removeReply) {
plainBody.remove(utils::removeReplyRegex);
}
return Quotient::prettyPrint(plainBody);
textHandler.setData(body);
Qt::TextFormat inputFormat;
if (e.mimeType().name() == "text/plain") {
inputFormat = Qt::PlainText;
} else {
inputFormat = Qt::RichText;
}
if (removeReply) {
return plainBody.remove(utils::removeReplyRegex);
if (format == Qt::RichText) {
return textHandler.handleRecieveRichText(inputFormat, this, &e, stripNewlines);
} else {
return textHandler.handleRecievePlainText(inputFormat, stripNewlines);
}
return plainBody;
},
[](const StickerEvent &e) {
return e.body();

View File

@@ -124,7 +124,7 @@ public:
///
/// \see lastEvent
/// \see lastEventIsSpoiler
[[nodiscard]] QString lastEventToString() const;
[[nodiscard]] QString lastEventToString(Qt::TextFormat format = Qt::PlainText, bool stripNewlines = false) const;
/// Convenient way to check if the last event looks like it has spoilers.
///
@@ -137,12 +137,6 @@ public:
/// \see lastEvent
[[nodiscard]] QDateTime lastActiveTime();
/// Get subtitle text for room
///
/// Fetches last event and removes markdown formatting
/// \see lastEventToString
[[nodiscard]] QString subtitleText();
[[nodiscard]] bool isSpace();
bool isEventHighlighted(const Quotient::RoomEvent *e) const;
@@ -262,7 +256,7 @@ public:
[[nodiscard]] QString avatarMediaId() const;
[[nodiscard]] QString eventToString(const Quotient::RoomEvent &evt, Qt::TextFormat format = Qt::PlainText, bool removeReply = true) const;
[[nodiscard]] QString eventToString(const Quotient::RoomEvent &evt, Qt::TextFormat format = Qt::PlainText, bool stripNewlines = false) const;
[[nodiscard]] QString eventToGenericString(const Quotient::RoomEvent &evt) const;
Q_INVOKABLE [[nodiscard]] bool containsUser(const QString &userID) const;

View File

@@ -22,11 +22,11 @@
#include <jobs/basejob.h>
#include <user.h>
#include "actionshandler.h"
#include "controller.h"
#include "neochatconfig.h"
#include "neochatroom.h"
#include "roommanager.h"
#include "texthandler.h"
#include "windowcontroller.h"
using namespace Quotient;
@@ -85,7 +85,9 @@ void NotificationsManager::postNotification(NeoChatRoom *room,
std::unique_ptr<KNotificationReplyAction> replyAction(new KNotificationReplyAction(i18n("Reply")));
replyAction->setPlaceholderText(i18n("Reply..."));
connect(replyAction.get(), &KNotificationReplyAction::replied, this, [room, replyEventId](const QString &text) {
room->postMessage(text, markdownToHTML(text), RoomMessageEvent::MsgType::Text, replyEventId, QString());
TextHandler textHandler;
textHandler.setData(text);
room->postMessage(text, textHandler.handleSendText(), RoomMessageEvent::MsgType::Text, replyEventId, QString());
});
notification->setReplyAction(std::move(replyAction));
}

View File

@@ -16,25 +16,7 @@ TextEdit {
property bool isEmote: false
property bool isReplyLabel: false
readonly property var linkRegex: /(href=["'])?(\b(https?):\/\/[^\s\<\>\"\'\\\?\:\)\(]+(\(.*?\))*(\?(?=[a-z])[^\s\\\)]+|$)?)/g
property string textMessage: model.display.includes("http")
? model.display.replace(linkRegex, function() {
if (arguments[0].includes("/_matrix/media/r0/download/")) {
return arguments[0];
}
if (arguments[1]) {
return arguments[0];
}
const l = arguments[2];
if ([".", ","].includes(l[l.length-1])) {
const link = l.substring(0, l.length-1);
const leftover = l[l.length-1];
return `<a href="${link}">${link}</a>${leftover}`;
}
return `<a href="${l}">${l}</a>`;
})
: model.display
property string textMessage: model.display
property bool spoilerRevealed: !hasSpoiler.test(textMessage)
ListView.onReused: Qt.binding(() => !hasSpoiler.test(textMessage))
@@ -46,6 +28,7 @@ TextEdit {
Controller.forceRefreshTextDocument(contentLabel.textDocument, contentLabel)
}
onTextChanged: console.log(text)
text: "<style>
table {
width:100%;

378
src/texthandler.cpp Normal file
View File

@@ -0,0 +1,378 @@
// SPDX-FileCopyrightText: 2023 James Graham <james.h.graham@protonmail.com>
// SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
#include "texthandler.h"
#include <QDebug>
#include <QUrl>
#include <util.h>
#include <cmark.h>
// Tag names that isAllowedTag() accepts. The handle* functions drop every tag
// whose name is not in this list.
static const QStringList allowedTags = {
QStringLiteral("font"), QStringLiteral("del"), QStringLiteral("h1"), QStringLiteral("h2"), QStringLiteral("h3"), QStringLiteral("h4"),
QStringLiteral("h5"), QStringLiteral("h6"), QStringLiteral("blockquote"), QStringLiteral("p"), QStringLiteral("a"), QStringLiteral("ul"),
QStringLiteral("ol"), QStringLiteral("sup"), QStringLiteral("sub"), QStringLiteral("li"), QStringLiteral("b"), QStringLiteral("i"),
QStringLiteral("u"), QStringLiteral("strong"), QStringLiteral("em"), QStringLiteral("strike"), QStringLiteral("code"), QStringLiteral("hr"),
QStringLiteral("br"), QStringLiteral("div"), QStringLiteral("table"), QStringLiteral("thead"), QStringLiteral("tbody"), QStringLiteral("tr"),
QStringLiteral("th"), QStringLiteral("td"), QStringLiteral("caption"), QStringLiteral("pre"), QStringLiteral("span"), QStringLiteral("img"),
QStringLiteral("details"), QStringLiteral("summary")};
// Attribute names that isAllowedAttribute() accepts, keyed by tag name. A tag
// with no entry here yields an empty list, so none of its attributes are kept
// by cleanAttributes().
static const QHash<QString, QStringList> allowedAttributes = {
{QStringLiteral("font"), {QStringLiteral("data-mx-bg-color"), QStringLiteral("data-mx-color"), QStringLiteral("color")}},
{QStringLiteral("span"), {QStringLiteral("data-mx-bg-color"), QStringLiteral("data-mx-color"), QStringLiteral("data-mx-spoiler")}},
{QStringLiteral("a"), {QStringLiteral("name"), QStringLiteral("target"), QStringLiteral("href")}},
{QStringLiteral("img"), {QStringLiteral("width"), QStringLiteral("height"), QStringLiteral("alt"), QStringLiteral("title"), QStringLiteral("src")}},
{QStringLiteral("ol"), {QStringLiteral("start")}},
{QStringLiteral("code"), {QStringLiteral("class")}}};
// URL schemes that isAllowedLink() accepts for non-image links (the href of an <a> tag).
static const QStringList allowedLinkSchemes = {QStringLiteral("https"),
QStringLiteral("http"),
QStringLiteral("ftp"),
QStringLiteral("mailto"),
QStringLiteral("magnet")};
// Returns the unmodified input string that was last passed to setData().
QString TextHandler::data() const
{
return m_data;
}
void TextHandler::setData(const QString &string)
{
m_data = string;
m_pos = 0;
}
/**
 * Prepare the stored text for sending.
 *
 * The input is converted from markdown to html and then tokenized. Text (and
 * text inside code tags) has < and > escaped, tags that are not allowed are
 * dropped and allowed tags have their attributes cleaned.
 *
 * @return the cleaned html string.
 */
QString TextHandler::handleSendText()
{
    m_pos = 0;
    m_dataBuffer = markdownToHTML(m_data);

    // Always start tokenizing from a known state: a previous run (or a freshly
    // constructed handler) must not decide how the first token is classified.
    m_nextToken.clear();
    m_nextTokenType = Type::Text;

    // Strip any disallowed tags/attributes.
    QString outputString;
    while (m_pos < m_dataBuffer.length()) {
        // Classify right before reading so the classifier is only ever asked
        // about a position that is inside the buffer (this also covers an
        // empty buffer).
        nextTokenType();
        next();

        QString nextTokenBuffer = m_nextToken;
        if (m_nextTokenType == Type::Text || m_nextTokenType == Type::TextCode) {
            nextTokenBuffer = escapeHtml(nextTokenBuffer);
        } else if (m_nextTokenType == Type::Tag) {
            const QString tagType = getTagType();
            if (isAllowedTag(tagType)) {
                nextTokenBuffer = cleanAttributes(tagType, nextTokenBuffer);
            } else {
                nextTokenBuffer = QString();
            }
        }

        outputString.append(nextTokenBuffer);
    }
    return outputString;
}
/**
 * Convert the stored text of a received message into a string for a rich text control.
 *
 * Steps, in order: remove any <mx-reply> block, escape and convert newlines
 * for plain input, linkify urls, embolden user mentions, rewrite mxc image
 * sources (only when both room and event are given), then drop disallowed
 * tags/attributes and finally turn <del> into <s>.
 *
 * @param inputFormat   format of the stored text (Qt::PlainText input is escaped first).
 * @param room          room used to resolve mxc urls; may be nullptr.
 * @param event         event the text belongs to; may be nullptr.
 * @param stripNewlines when true every <br> tag is replaced by a single space.
 * @return the cleaned rich text string.
 */
QString TextHandler::handleRecieveRichText(Qt::TextFormat inputFormat, const NeoChatRoom *room, const Quotient::RoomEvent *event, bool stripNewlines)
{
    m_pos = 0;
    m_dataBuffer = m_data;

    // Always start tokenizing from a known state: a previous run (or a freshly
    // constructed handler) must not decide how the first token is classified.
    m_nextToken.clear();
    m_nextTokenType = Type::Text;

    // Strip mx-reply if present.
    m_dataBuffer.remove(TextRegex::removeRichReply);

    // For plain text, escape html and convert line breaks.
    if (inputFormat == Qt::PlainText) {
        m_dataBuffer = escapeHtml(m_dataBuffer);
        m_dataBuffer.replace(u'\n', QStringLiteral("<br>"));
    }

    // Linkify any plain text urls
    m_dataBuffer = linkifyUrls(m_dataBuffer);

    // Apply user style
    m_dataBuffer.replace(TextRegex::userPill, QStringLiteral(R"(<b>\1</b>)"));

    // Make all media URLs resolvable.
    if (room && event) {
        QRegularExpressionMatchIterator i = TextRegex::mxcImage.globalMatch(m_dataBuffer);
        while (i.hasNext()) {
            const QRegularExpressionMatch match = i.next();
#ifdef QUOTIENT_07
            const QUrl mediaUrl = room->makeMediaUrl(event->id(), QUrl(QStringLiteral("mxc://") + match.captured(2) + u'/' + match.captured(3)));
            m_dataBuffer.replace(match.captured(0),
                                 QStringLiteral("<img ") + match.captured(1) + QStringLiteral("src=\"") + mediaUrl.toString() + u'"' + match.captured(4)
                                     + u'>');
#else
            auto url = room->connection()->homeserver();
            auto base = url.scheme() + QStringLiteral("://") + url.host() + (url.port() != -1 ? ':' + QString::number(url.port()) : QString());
            m_dataBuffer.replace(match.captured(0),
                                 QStringLiteral("<img ") + match.captured(1) + QStringLiteral("src=\"") + base + QStringLiteral("/_matrix/media/r0/download/")
                                     + match.captured(2) + u'/' + match.captured(3) + u'"' + match.captured(4) + u'>');
#endif
        }
    }

    // Strip any disallowed tags/attributes.
    QString outputString;
    while (m_pos < m_dataBuffer.length()) {
        // Classify right before reading so the classifier is only ever asked
        // about a position that is inside the buffer (this also covers an
        // empty buffer).
        nextTokenType();
        next();

        QString nextTokenBuffer = m_nextToken;
        if (m_nextTokenType == Type::Text || m_nextTokenType == Type::TextCode) {
            nextTokenBuffer = escapeHtml(nextTokenBuffer);
        } else if (m_nextTokenType == Type::Tag) {
            const QString tagType = getTagType();
            if (!isAllowedTag(tagType)) {
                nextTokenBuffer = QString();
            } else if (tagType == QStringLiteral("br") && stripNewlines) {
                nextTokenBuffer = u' ';
            } else {
                nextTokenBuffer = cleanAttributes(tagType, nextTokenBuffer);
            }
        }

        outputString.append(nextTokenBuffer);
    }

    /**
     * Replace <del> with <s>
     * Note: <s> is still not a valid tag for the message from the server. We
     * convert as that is what is needed for Qt::RichText.
     */
    outputString.replace(TextRegex::strikethrough, QStringLiteral("<s>\\1</s>"));
    return outputString;
}
/**
 * Convert the stored text of a received message into a string for a plain text control.
 *
 * Any <mx-reply> block is removed, the text is run through the markdown
 * converter so that markup and html can be stripped by the same tokenizer, and
 * every tag is then dropped. Text inside code tags is unescaped.
 *
 * @param inputFormat   format of the stored text. For Qt::PlainText the < and >
 *                      characters are escaped before conversion and restored
 *                      afterwards so they survive in the output.
 * @param stripNewlines when true, line breaks ("\n", "<br>", "<br/>" and
 *                      "<br />") are replaced by a single space.
 * @return the plain text string.
 */
QString TextHandler::handleRecievePlainText(Qt::TextFormat inputFormat, const bool &stripNewlines)
{
    m_pos = 0;
    m_dataBuffer = m_data;

    // Always start tokenizing from a known state: a previous run (or a freshly
    // constructed handler) must not decide how the first token is classified.
    m_nextToken.clear();
    m_nextTokenType = Type::Text;

    // Strip mx-reply if present.
    m_dataBuffer.remove(TextRegex::removeRichReply);

    if (stripNewlines) {
        // All spellings of a line break tag must become a space, otherwise the
        // tag is simply dropped further down and the surrounding words are
        // glued together.
        m_dataBuffer.replace(QStringLiteral("<br>"), QStringLiteral(" "));
        m_dataBuffer.replace(QStringLiteral("<br/>"), QStringLiteral(" "));
        m_dataBuffer.replace(QStringLiteral("<br />"), QStringLiteral(" "));
        m_dataBuffer.replace(u'\n', QStringLiteral(" "));
    }

    // Escaping then unescaping allows < and > to be maintained in a plain text string
    // otherwise markdownToHTML will strip what it thinks is a bad html tag entirely.
    if (inputFormat == Qt::PlainText) {
        m_dataBuffer = escapeHtml(m_dataBuffer);
    }

    /**
     * This seems counterproductive but by converting any markup which could
     * arrive (e.g. in a caption body) it can then be stripped by the same code.
     */
    m_dataBuffer = markdownToHTML(m_dataBuffer);

    // Strip all tags/attributes except code blocks which will be escaped.
    QString outputString;
    while (m_pos < m_dataBuffer.length()) {
        // Classify right before reading so the classifier is only ever asked
        // about a position that is inside the buffer (this also covers an
        // empty buffer).
        nextTokenType();
        next();

        QString nextTokenBuffer = m_nextToken;
        if (m_nextTokenType == Type::TextCode) {
            nextTokenBuffer = unescapeHtml(nextTokenBuffer);
        } else if (m_nextTokenType == Type::Tag) {
            nextTokenBuffer = QString();
        }

        outputString.append(nextTokenBuffer);
    }

    // Second half of the escape/unescape round trip started before the
    // markdown conversion: restore the < and > of the original plain text.
    if (inputFormat == Qt::PlainText) {
        outputString = unescapeHtml(outputString);
    }
    return outputString;
}
/**
 * Read the token starting at m_pos into m_nextToken and advance m_pos past it.
 *
 * The end of the token depends on m_nextTokenType:
 * - Tag: up to and including the next '>'
 * - TextCode: up to (not including) the next "</code>"
 * - Text: up to (not including) the next '<'
 *
 * If the terminator is missing the token runs to the end of the buffer and
 * m_pos is left exactly at the buffer length (never beyond it), so code that
 * inspects m_pos afterwards cannot index outside the buffer.
 */
void TextHandler::next()
{
    const bool isTag = m_nextTokenType == Type::Tag;

    QString searchStr;
    if (isTag) {
        searchStr = u'>';
    } else if (m_nextTokenType == Type::TextCode) {
        // Anything between code tags is assumed to be plain text
        searchStr = QStringLiteral("</code>");
    } else {
        searchStr = u'<';
    }

    // Search from m_pos + 1: the character at m_pos always belongs to this token.
    int tokenEnd = m_dataBuffer.indexOf(searchStr, m_pos + 1);
    if (tokenEnd == -1) {
        tokenEnd = m_dataBuffer.length();
    } else if (isTag) {
        // The closing '>' is part of the tag token.
        ++tokenEnd;
    }

    m_nextToken = m_dataBuffer.mid(m_pos, tokenEnd - m_pos);
    m_pos = tokenEnd;
}
void TextHandler::nextTokenType()
{
if (m_nextTokenType == Type::Tag && getTagType() == QStringLiteral("code") && !isCloseTag()
&& m_dataBuffer.indexOf(QStringLiteral("</code>"), m_pos) != m_pos) {
m_nextTokenType = Type::TextCode;
} else if (m_dataBuffer[m_pos] == u'<' && m_dataBuffer[m_pos + 1] != u' ') {
m_nextTokenType = Type::Tag;
} else {
m_nextTokenType = Type::Text;
}
}
/**
 * Extract the tag name from the current token, e.g. "a" for both
 * `<a href="...">` and `</a>`.
 *
 * The name ends at the first space or '>' after the opening '<' (or "</").
 * If neither is present the rest of the token is returned. A token that is too
 * short to contain a name yields an empty string.
 */
QString TextHandler::getTagType() const
{
    if (m_nextToken.length() < 2) {
        return QString();
    }

    const int tagTypeStart = m_nextToken.at(1) == u'/' ? 2 : 1;
    const int tagTypeEnd = m_nextToken.indexOf(TextRegex::endTagType, tagTypeStart);
    if (tagTypeEnd == -1) {
        return m_nextToken.mid(tagTypeStart);
    }
    return m_nextToken.mid(tagTypeStart, tagTypeEnd - tagTypeStart);
}
/**
 * Whether the current token is a closing tag, i.e. its second character is '/'.
 *
 * Returns false for a token that has no second character.
 */
bool TextHandler::isCloseTag() const
{
    return m_nextToken.length() > 1 && m_nextToken.at(1) == u'/';
}
/**
 * Return the name part of an attribute string, i.e. everything before the
 * first '='. A string without '=' is returned unchanged.
 */
QString TextHandler::getAttributeType(const QString &string)
{
    const int separator = string.indexOf(u'=');
    return separator == -1 ? string : string.left(separator);
}
/**
 * Return the value part of an attribute string, i.e. everything after the
 * first '=' (surrounding quotes are not removed). A string without '=' yields
 * an empty string.
 */
QString TextHandler::getAttributeData(const QString &string)
{
    const int separator = string.indexOf(u'=');
    if (separator == -1) {
        return QStringLiteral();
    }
    return string.mid(separator + 1);
}
/**
 * Whether a tag name is listed in allowedTags.
 */
bool TextHandler::isAllowedTag(const QString &type)
{
    const bool listed = allowedTags.indexOf(type) != -1;
    return listed;
}
bool TextHandler::isAllowedAttribute(const QString &tag, const QString &attribute)
{
return allowedAttributes[tag].contains(attribute);
}
bool TextHandler::isAllowedLink(const QString &link, bool isImg)
{
const QUrl linkUrl = QUrl(link);
if (isImg) {
#ifdef QUOTIENT_07
return !linkUrl.isRelative() && linkUrl.scheme() == "mxc";
#else
return !linkUrl.isRelative() && (linkUrl.scheme() == "mxc" || linkUrl.scheme() == "https");
#endif
} else {
return !linkUrl.isRelative() && allowedLinkSchemes.contains(linkUrl.scheme());
}
}
QString TextHandler::cleanAttributes(const QString &tag, const QString &tagString)
{
int nextAttributeIndex = tagString.indexOf(u' ', 1);
if (nextAttributeIndex != -1) {
QString outputString = tagString.left(nextAttributeIndex);
QString nextAttribute;
int nextSpaceIndex;
nextAttributeIndex += 1;
while (nextAttributeIndex < tagString.length()) {
nextSpaceIndex = tagString.indexOf(TextRegex::endTagType, nextAttributeIndex);
if (nextSpaceIndex == -1) {
nextSpaceIndex = tagString.length();
}
nextAttribute = tagString.mid(nextAttributeIndex, nextSpaceIndex - nextAttributeIndex);
if (isAllowedAttribute(tag, getAttributeType(nextAttribute))) {
if (tag == QStringLiteral("img") && getAttributeType(nextAttribute) == QStringLiteral("src")) {
QString attributeData = getAttributeData(nextAttribute).remove(u'"');
if (isAllowedLink(attributeData, true)) {
outputString.append(u' ' + nextAttribute);
}
} else if (tag == u'a' && getAttributeType(nextAttribute) == QStringLiteral("href")) {
if (isAllowedLink(getAttributeData(nextAttribute).remove(u'"'))) {
outputString.append(u' ' + nextAttribute);
}
} else if (tag == QStringLiteral("code") && getAttributeType(nextAttribute) == QStringLiteral("class")) {
if (getAttributeData(nextAttribute).remove(u'"').startsWith(QStringLiteral("language-"))) {
outputString.append(u' ' + nextAttribute);
}
} else {
outputString.append(u' ' + nextAttribute);
}
}
nextAttributeIndex = nextSpaceIndex + 1;
}
outputString += u'>';
return outputString;
}
return tagString;
}
/**
 * Convert a markdown string to html using cmark.
 *
 * The conversion uses hard line breaks and passes raw html through
 * (CMARK_OPT_HARDBREAKS | CMARK_OPT_UNSAFE). The result is trimmed and any
 * "<!-- raw HTML omitted -->" placeholder is removed.
 */
QString TextHandler::markdownToHTML(const QString &markdown)
{
    const auto utf8 = markdown.toUtf8();

    // cmark hands back a buffer that has to be released with free().
    char *rendered = cmark_markdown_to_html(utf8.constData(), utf8.size(), CMARK_OPT_HARDBREAKS | CMARK_OPT_UNSAFE);
    QString html = QString::fromUtf8(rendered).trimmed();
    free(rendered);

    html.replace(QStringLiteral("<!-- raw HTML omitted -->"), QString());
    return html;
}
/**
 * Escape the < and > characters of a string as &lt; and &gt;.
 *
 * TODO: make this more intelligent. Other characters are currently not
 * escaped, especially &, as this can conflict with the cmark markdown to html
 * conversion which already escapes characters in code blocks. The < > still
 * need to be handled when the user manually types in the html.
 */
QString TextHandler::escapeHtml(QString stringIn)
{
    return stringIn.replace(u'<', QStringLiteral("&lt;")).replace(u'>', QStringLiteral("&gt;"));
}
/**
 * Turn the entities for <, > and & back into the plain characters.
 *
 * The double escaped forms (&amp;lt; and &amp;gt;) are handled first, for
 * those situations where brackets in a code block got escaped twice. The
 * order of the replacements matters: &amp; has to be resolved last.
 */
QString TextHandler::unescapeHtml(QString stringIn)
{
    return stringIn.replace(QStringLiteral("&amp;lt;"), QStringLiteral("<"))
        .replace(QStringLiteral("&amp;gt;"), QStringLiteral(">"))
        .replace(QStringLiteral("&lt;"), QStringLiteral("<"))
        .replace(QStringLiteral("&gt;"), QStringLiteral(">"))
        .replace(QStringLiteral("&amp;"), QStringLiteral("&"));
}
/**
 * Wrap matrix identifiers, urls and email addresses found in the string in
 * anchor tags.
 *
 * The replacements run in a fixed order: matrix ids (linked via matrix.to),
 * then full urls, then email addresses (linked via mailto:).
 */
QString TextHandler::linkifyUrls(QString stringIn)
{
    const QString matrixIdLink = QStringLiteral(R"(\1<a href="https://matrix.to/#/\2">\2</a>)");
    const QString urlLink = QStringLiteral(R"(<a href="\1">\1</a>)");
    const QString emailLink = QStringLiteral(R"(<a href="mailto:\2">\1\2</a>)");

    stringIn.replace(TextRegex::mxId, matrixIdLink);
    stringIn.replace(TextRegex::fullUrl, urlLink);
    stringIn.replace(TextRegex::emailAddress, emailLink);
    return stringIn;
}

131
src/texthandler.h Normal file
View File

@@ -0,0 +1,131 @@
// SPDX-FileCopyrightText: 2023 James Graham <james.h.graham@protonmail.com>
// SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
#pragma once
#include <QHash>
#include <QRegularExpression>
#include <QString>
#include <QStringList>
#include "neochatroom.h"
// Regular expressions shared by the text handling code.
namespace TextRegex
{
// Matches a single '>' or space.
static const QRegularExpression endTagType{QStringLiteral("(>| )")};
// Matches "> <...>" followed by any text up to the first blank line (dot also matches newlines).
// NOTE(review): presumably the plain text reply fallback - confirm against the matrix spec.
static const QRegularExpression removeReply{QStringLiteral("> <.*?>.*?\\n\\n"), QRegularExpression::DotMatchesEverythingOption};
// Matches an <mx-reply> element including its content (non-greedy, dot also matches newlines).
static const QRegularExpression removeRichReply{QStringLiteral("<mx-reply>.*?</mx-reply>"), QRegularExpression::DotMatchesEverythingOption};
// Matches <pre><code ...>...</code></pre>; group 1 is the content between the code tags.
static const QRegularExpression codePill{QStringLiteral("<pre><code[^>]*>(.*?)</code></pre>"), QRegularExpression::DotMatchesEverythingOption};
// Matches an anchor whose href starts with https://matrix.to/#/@ and contains a ':'; group 1 is the whole anchor element.
static const QRegularExpression userPill{QStringLiteral("(<a href=\"https://matrix.to/#/@.*?:.*?\">.*?</a>)"), QRegularExpression::DotMatchesEverythingOption};
// Matches a <del> element; group 1 is its content.
static const QRegularExpression strikethrough{QStringLiteral("<del>(.*?)</del>"), QRegularExpression::DotMatchesEverythingOption};
// Matches an <img> tag with an mxc:// src. Groups: 1 = text between "<img" and "src", 2 = url part before the
// first '/', 3 = url part after it, 4 = text between the closing quote of src and '>'.
static const QRegularExpression mxcImage{QStringLiteral(R"AAA(<img(.*?)src="mxc:\/\/(.*?)\/(.*?)"(.*?)>)AAA")};
// Matches urls starting with www., http(s):, ftp:, magnet: or matrix: (case insensitive); group 1 is the url.
// The first alternative consumes existing <a>...</a> elements and rejects them via (*SKIP)(*F) so their content is never matched.
// The last character of a match may not be one of ! , . ] ) : so trailing punctuation is left out.
static const QRegularExpression fullUrl(
QStringLiteral(
R"(<a.*?<\/a>(*SKIP)(*F)|\b((www\.(?!\.)(?!(\w|\.|-)+@)|(https?|ftp):(//)?\w|(magnet|matrix):)(&(?![lg]t;)|[^&\s<>'"])+(&(?![lg]t;)|[^&!,.\s<>'"\]):])))"),
QRegularExpression::CaseInsensitiveOption | QRegularExpression::UseUnicodePropertiesOption);
// Matches email addresses outside of existing <a>...</a> elements; group 1 is an optional "mailto:" prefix, group 2 the address.
static const QRegularExpression emailAddress(QStringLiteral(R"(<a.*?<\/a>(*SKIP)(*F)|\b(mailto:)?((\w|\.|-)+@(\w|\.|-)+\.\w+\b))"),
QRegularExpression::CaseInsensitiveOption | QRegularExpression::UseUnicodePropertiesOption);
// Matches an identifier starting with !, # or @ of the form <sigil><localpart>:<domain>[:<port>].
// Group 1 is the character in front of it (start of string, whitespace, bracket, quote, backtick or ';'), group 2 the identifier.
static const QRegularExpression mxId(QStringLiteral(R"((^|[][[:space:](){}`'";])([!#@][-a-z0-9_=#/.]{1,252}:\w(?:\w|\.|-)*\.\w+(?::\d{1,5})?))"),
QRegularExpression::CaseInsensitiveOption | QRegularExpression::UseUnicodePropertiesOption);
}
/**
 * @class TextHandler
 *
 * This class is designed to handle the text of both incoming and outgoing messages.
 *
 * This includes converting markdown to html and removing any html tags that shouldn't
 * be present as per the matrix spec
 * (https://spec.matrix.org/v1.5/client-server-api/#mroommessage-msgtypes).
 */
class TextHandler
{
public:
    /**
     * @brief List of token types
     */
    enum Type {
        Text, /*!< Anything not a tag that doesn't have special handling */
        Tag, /*!< For any generic tag that doesn't have special handling */
        TextCode, /*!< Text between code tags */
    };

    /**
     * @brief Get the string being handled.
     *
     * This is the unmodified string given to setData().
     */
    QString data() const;

    /**
     * @brief Set the string being handled.
     *
     * Setting new data resets the TextHandler.
     *
     * @note The TextHandler doesn't modify the input data variable so the unhandled
     *       text can always be retrieved.
     */
    void setData(const QString &string);

    /**
     * @brief Handle the text for a message that is being sent.
     *
     * The text is converted from markdown to html and any tags or attributes
     * that are not allowed are removed.
     */
    QString handleSendText();

    /**
     * @brief Handle the text as a rich output for a message being received.
     *
     * The function does the following:
     *   - Removes invalid html tags and attributes
     *   - Strips any reply from the message
     *   - Formats user mentions
     *   - Linkifies urls
     *   - Makes mxc image urls resolvable (needs room and event)
     *
     * @note In this case the rich text refers to the output format. The input
     *       can be in either and the parameter inputFormat just needs to be set
     *       appropriately.
     */
    QString handleRecieveRichText(Qt::TextFormat inputFormat = Qt::RichText,
                                  const NeoChatRoom *room = nullptr,
                                  const Quotient::RoomEvent *event = nullptr,
                                  bool stripNewlines = false);

    /**
     * @brief Handle the text as a plain output for a message being received.
     *
     * The function does the following:
     *   - Removes all html tags and attributes (except inside of code tags)
     *   - Strips any reply from the message
     *
     * @note In this case the plain text refers to the output format. The input
     *       can be in either and the parameter inputFormat just needs to be set
     *       appropriately.
     *
     * @warning The output of this function should NEVER be input into a rich text
     *          control. It will try to preserve < and > in the plain string which
     *          could be malicious tags if the control uses rich text format.
     */
    QString handleRecievePlainText(Qt::TextFormat inputFormat = Qt::PlainText, const bool &stripNewlines = false);

private:
    // The unmodified input and the working copy that is tokenized.
    QString m_data;
    QString m_dataBuffer;

    // Tokenizer state. Both values are read before the first token has been
    // produced, so they need a defined starting value.
    int m_pos = 0;
    Type m_nextTokenType = Type::Text;
    QString m_nextToken;

    void next();
    void nextTokenType();

    QString getTagType() const;
    bool isCloseTag() const;
    QString getAttributeType(const QString &string);
    QString getAttributeData(const QString &string);
    bool isAllowedTag(const QString &type);
    bool isAllowedAttribute(const QString &tag, const QString &attribute);
    bool isAllowedLink(const QString &link, bool isImg = false);
    QString cleanAttributes(const QString &tag, const QString &tagString);

    QString markdownToHTML(const QString &markdown);
    QString escapeHtml(QString stringIn);
    QString unescapeHtml(QString stringIn);
    QString linkifyUrls(QString stringIn);
};

View File

@@ -1,4 +0,0 @@
// SPDX-FileCopyrightText: 2018 Black Hat <bhat@encom.eu.org>
// SPDX-License-Identifier: GPL-3.0-only
#include "utils.h"

View File

@@ -1,16 +0,0 @@
// SPDX-FileCopyrightText: 2018 Black Hat <bhat@encom.eu.org>
// SPDX-License-Identifier: GPL-3.0-only
#pragma once
#include <QRegularExpression>
namespace utils
{
static const QRegularExpression removeReplyRegex{"> <.*?>.*?\\n\\n", QRegularExpression::DotMatchesEverythingOption};
static const QRegularExpression removeRichReplyRegex{"<mx-reply>.*?</mx-reply>", QRegularExpression::DotMatchesEverythingOption};
static const QRegularExpression codePillRegExp{"<pre><code[^>]*>(.*?)</code></pre>", QRegularExpression::DotMatchesEverythingOption};
static const QRegularExpression userPillRegExp{"(<a href=\"https://matrix.to/#/@.*?:.*?\">.*?</a>)", QRegularExpression::DotMatchesEverythingOption};
static const QRegularExpression strikethroughRegExp{"<del>(.*?)</del>", QRegularExpression::DotMatchesEverythingOption};
static const QRegularExpression mxcImageRegExp{R"AAA(<img(.*?)src="mxc:\/\/(.*?)\/(.*?)"(.*?)>)AAA"};
}