diff mbox series

[9/9] crypto: qce - switch to using a mutex

Message ID 20241203-crypto-qce-refactor-v1-9-c5901d2dd45c@linaro.org
State New
Headers show
Series crypto: qce - refactor the driver | expand

Commit Message

Bartosz Golaszewski Dec. 3, 2024, 9:19 a.m. UTC
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>

Having switched to workqueue from tasklet, we are no longer limited to
atomic APIs and can now convert the spinlock to a mutex. This, along
with the conversion from tasklet to workqueue grants us ~15% improvement
in cryptsetup benchmarks for AES encryption.

While at it: use guards to simplify locking code.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/crypto/qce/core.c | 46 +++++++++++++++++++++-------------------------
 drivers/crypto/qce/core.h |  3 ++-
 2 files changed, 23 insertions(+), 26 deletions(-)

Comments

Neil Armstrong Dec. 4, 2024, 10:17 a.m. UTC | #1
On 03/12/2024 16:10, Bartosz Golaszewski wrote:
> On Tue, 3 Dec 2024 14:53:21 +0100, neil.armstrong@linaro.org said:
>> On 03/12/2024 10:19, Bartosz Golaszewski wrote:
>>> From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
>>>
>>> Having switched to workqueue from tasklet, we are no longer limited to
>>> atomic APIs and can now convert the spinlock to a mutex. This, along
>>> with the conversion from tasklet to workqueue grants us ~15% improvement
>>> in cryptsetup benchmarks for AES encryption.
>>
>> Can you share on which platforms you did the tests and the results you got ?
>>
> 
> Sure, I tested on sm8650 with the following results (they vary from
> one run to other but are more or less in this range):
> 
> With this series:
> 
> #     Algorithm |       Key |      Encryption |      Decryption
>          aes-cbc        128b        94.1 MiB/s       138.6 MiB/s
>      serpent-cbc        128b               N/A               N/A
>      twofish-cbc        128b               N/A               N/A
>          aes-cbc        256b        94.8 MiB/s       128.5 MiB/s
>      serpent-cbc        256b               N/A               N/A
>      twofish-cbc        256b               N/A               N/A
>          aes-xts        256b       132.9 MiB/s       131.8 MiB/s
>      serpent-xts        256b               N/A               N/A
>      twofish-xts        256b               N/A               N/A
>          aes-xts        512b       122.6 MiB/s       122.4 MiB/s
>      serpent-xts        512b               N/A               N/A
>      twofish-xts        512b               N/A               N/A
> 
> Without it:
> 
> #     Algorithm |       Key |      Encryption |      Decryption
>          aes-cbc        128b        96.4 MiB/s       141.0 MiB/s
>      serpent-cbc        128b               N/A               N/A
>      twofish-cbc        128b               N/A               N/A
>          aes-cbc        256b        67.0 MiB/s        97.8 MiB/s
>      serpent-cbc        256b               N/A               N/A
>      twofish-cbc        256b               N/A               N/A
>          aes-xts        256b       131.7 MiB/s       132.0 MiB/s
>      serpent-xts        256b               N/A               N/A
>      twofish-xts        256b               N/A               N/A
>          aes-xts        512b        93.9 MiB/s        96.8 MiB/s
>      serpent-xts        512b               N/A               N/A
>      twofish-xts        512b               N/A               N/A
> 
> AES-CBC and AES-XTS with shorter keys remain pretty much the same. I'm not
> sure why that is. I also tested on sa8775p but there are no visible
> improvements there. :(

Thanks for the results !

Neil

> 
> Bart
diff mbox series

Patch

diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c
index 6de9f1e23e282..e95e84486d9ae 100644
--- a/drivers/crypto/qce/core.c
+++ b/drivers/crypto/qce/core.c
@@ -3,6 +3,7 @@ 
  * Copyright (c) 2010-2014, The Linux Foundation. All rights reserved.
  */
 
+#include <linux/cleanup.h>
 #include <linux/clk.h>
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
@@ -11,7 +12,6 @@ 
 #include <linux/module.h>
 #include <linux/mod_devicetable.h>
 #include <linux/platform_device.h>
-#include <linux/spinlock.h>
 #include <linux/types.h>
 #include <crypto/algapi.h>
 #include <crypto/internal/hash.h>
@@ -89,34 +89,28 @@  static int qce_handle_queue(struct qce_device *qce,
 			    struct crypto_async_request *req)
 {
 	struct crypto_async_request *async_req, *backlog;
-	unsigned long flags;
 	int ret = 0, err;
 
-	spin_lock_irqsave(&qce->lock, flags);
+	scoped_guard(mutex, &qce->lock) {
+		if (req)
+			ret = crypto_enqueue_request(&qce->queue, req);
 
-	if (req)
-		ret = crypto_enqueue_request(&qce->queue, req);
+		/* busy, do not dequeue request */
+		if (qce->req)
+			return ret;
 
-	/* busy, do not dequeue request */
-	if (qce->req) {
-		spin_unlock_irqrestore(&qce->lock, flags);
-		return ret;
+		backlog = crypto_get_backlog(&qce->queue);
+		async_req = crypto_dequeue_request(&qce->queue);
+		if (async_req)
+			qce->req = async_req;
 	}
 
-	backlog = crypto_get_backlog(&qce->queue);
-	async_req = crypto_dequeue_request(&qce->queue);
-	if (async_req)
-		qce->req = async_req;
-
-	spin_unlock_irqrestore(&qce->lock, flags);
-
 	if (!async_req)
 		return ret;
 
 	if (backlog) {
-		spin_lock_bh(&qce->lock);
-		crypto_request_complete(backlog, -EINPROGRESS);
-		spin_unlock_bh(&qce->lock);
+		scoped_guard(mutex, &qce->lock)
+			crypto_request_complete(backlog, -EINPROGRESS);
 	}
 
 	err = qce_handle_request(async_req);
@@ -133,12 +127,11 @@  static void qce_req_done_work(struct work_struct *work)
 	struct qce_device *qce = container_of(work, struct qce_device,
 					      done_work);
 	struct crypto_async_request *req;
-	unsigned long flags;
 
-	spin_lock_irqsave(&qce->lock, flags);
-	req = qce->req;
-	qce->req = NULL;
-	spin_unlock_irqrestore(&qce->lock, flags);
+	scoped_guard(mutex, &qce->lock) {
+		req = qce->req;
+		qce->req = NULL;
+	}
 
 	if (req)
 		crypto_request_complete(req, qce->result);
@@ -243,7 +236,10 @@  static int qce_crypto_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	spin_lock_init(&qce->lock);
+	ret = devm_mutex_init(qce->dev, &qce->lock);
+	if (ret)
+		return ret;
+
 	INIT_WORK(&qce->done_work, qce_req_done_work);
 	crypto_init_queue(&qce->queue, QCE_QUEUE_LENGTH);
 
diff --git a/drivers/crypto/qce/core.h b/drivers/crypto/qce/core.h
index 39e75a75a4293..eb6fa7a8b64a8 100644
--- a/drivers/crypto/qce/core.h
+++ b/drivers/crypto/qce/core.h
@@ -6,6 +6,7 @@ 
 #ifndef _CORE_H_
 #define _CORE_H_
 
+#include <linux/mutex.h>
 #include <linux/workqueue.h>
 
 #include "dma.h"
@@ -30,7 +31,7 @@ 
  */
 struct qce_device {
 	struct crypto_queue queue;
-	spinlock_t lock;
+	struct mutex lock;
 	struct work_struct done_work;
 	struct crypto_async_request *req;
 	int result;