From ef1371a93fcd7bfb678e33fec58677f718578748 Mon Sep 17 00:00:00 2001
From: chrysos349 <chrysostom349@gmail.com>
Date: Tue, 19 Sep 2023 04:11:27 +0300
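Subject: [PATCH] ccextractor: revbump for tesseract-5.3.3

Add patches/fix-ocr.patch, which:
- accepts Tesseract version strings starting with "5." as well as "4."
  in hardsubx.c and ocr.c;
- probes for tessdata using a path table that also covers
  /usr/share/tesseract-ocr/5/ and /usr/share/tesseract/;
- reads the crop box origin through boxGetGeometry() instead of
  accessing the BOX fields directly;
- casts options->ocrlang to char * when assigning it to lang.

In the template, bump the revision, add tesseract-ocr-devel to
hostmakedepends for cross builds, switch the configure.ac edit from
sed to vsed, and rewrite [lept] to [leptonica] in configure.ac.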
---
 srcpkgs/ccextractor/patches/fix-ocr.patch | 106 ++++++++++++++++++++++
 srcpkgs/ccextractor/template              |  10 +-
 2 files changed, 114 insertions(+), 2 deletions(-)
 create mode 100644 srcpkgs/ccextractor/patches/fix-ocr.patch

diff --git a/srcpkgs/ccextractor/patches/fix-ocr.patch b/srcpkgs/ccextractor/patches/fix-ocr.patch
new file mode 100644
index 00000000000..2681c60aa41
--- /dev/null
+++ b/srcpkgs/ccextractor/patches/fix-ocr.patch
@@ -0,0 +1,106 @@
+--- a/src/lib_ccx/hardsubx.c
++++ b/src/lib_ccx/hardsubx.c
+@@ -221,7 +221,7 @@
+ 	char *pars_values = strdup("/dev/null");
+ 	char *tessdata_path = NULL;
+ 
+-	char *lang = options->ocrlang;
++	char *lang = (char *)options->ocrlang;
+ 	if (!lang)
+ 		lang = "eng"; // English is default language
+ 
+@@ -245,7 +245,7 @@
+ 
+ 	int ret = -1;
+ 
+-	if (!strncmp("4.", TessVersion(), 2))
++	if (!strncmp("4.", TessVersion(), 2) || !strncmp("5.", TessVersion(), 2))
+ 	{
+ 		char tess_path[1024];
+ 		if (ccx_options.ocr_oem < 0)
+--- a/src/lib_ccx/ocr.c
++++ b/src/lib_ccx/ocr.c
+@@ -97,36 +97,22 @@
+ char *probe_tessdata_location(const char *lang)
+ {
+ 	int ret = 0;
+-	char *tessdata_dir_path = getenv("TESSDATA_PREFIX");
+ 
+-	ret = search_language_pack(tessdata_dir_path, lang);
+-	if (!ret)
+-		return tessdata_dir_path;
+-
+-	tessdata_dir_path = "./";
+-	ret = search_language_pack(tessdata_dir_path, lang);
+-	if (!ret)
+-		return tessdata_dir_path;
+-
+-	tessdata_dir_path = "/usr/share/";
+-	ret = search_language_pack(tessdata_dir_path, lang);
+-	if (!ret)
+-		return tessdata_dir_path;
+-
+-	tessdata_dir_path = "/usr/local/share/";
+-	ret = search_language_pack(tessdata_dir_path, lang);
+-	if (!ret)
+-		return tessdata_dir_path;
+-
+-	tessdata_dir_path = "/usr/share/tesseract-ocr/";
+-	ret = search_language_pack(tessdata_dir_path, lang);
+-	if (!ret)
+-		return tessdata_dir_path;
+-
+-	tessdata_dir_path = "/usr/share/tesseract-ocr/4.00/";
+-	ret = search_language_pack(tessdata_dir_path, lang);
+-	if (!ret)
+-		return tessdata_dir_path;
++	const char *paths[] = {
++	    getenv("TESSDATA_PREFIX"),
++	    "./",
++	    "/usr/share/",
++	    "/usr/local/share/",
++	    "/usr/share/tesseract-ocr/",
++	    "/usr/share/tesseract-ocr/4.00/",
++	    "/usr/share/tesseract-ocr/5/",
++	    "/usr/share/tesseract/"};
++
++	for (int i = 0; i < sizeof(paths) / sizeof(paths[0]); i++)
++	{
++		if (!search_language_pack(paths[i], lang))
++			return (char *)paths[i];
++	}
+ 
+ 	return NULL;
+ }
+@@ -174,7 +160,7 @@
+ 	char *pars_values = strdup("tess.log");
+ 
+ 	ctx->api = TessBaseAPICreate();
+-	if (!strncmp("4.", TessVersion(), 2))
++	if (!strncmp("4.", TessVersion(), 2) || !strncmp("5.", TessVersion(), 2))
+ 	{
+ 		char tess_path[1024];
+ 		snprintf(tess_path, 1024, "%s%s%s", tessdata_path, "/", "tessdata");
+@@ -331,6 +317,11 @@
+ 	}
+ 
+ 	BOX *crop_points = ignore_alpha_at_edge(copy->alpha, copy->data, w, h, color_pix, &color_pix_out);
++
++	l_int32 x, y, _w, _h;
++
++	boxGetGeometry(crop_points, &x, &y, &_w, &_h);
++
+ 	// Converting image to grayscale for OCR to avoid issues with transparency
+ 	cpix_gs = pixConvertRGBToGray(cpix, 0.0, 0.0, 0.0);
+ 
+@@ -426,8 +417,8 @@
+ 				{
+ 					for (int j = x1; j <= x2; j++)
+ 					{
+-						if (copy->data[(crop_points->y + i) * w + (crop_points->x + j)] != firstpixel)
+-							histogram[copy->data[(crop_points->y + i) * w + (crop_points->x + j)]]++;
++						if (copy->data[(y + i) * w + (x + j)] != firstpixel)
++							histogram[copy->data[(y + i) * w + (x + j)]]++;
+ 					}
+ 				}
+ 				/* sorted in increasing order of intensity */
diff --git a/srcpkgs/ccextractor/template b/srcpkgs/ccextractor/template
index 9abcd82852b..84059ffd023 100644
--- a/srcpkgs/ccextractor/template
+++ b/srcpkgs/ccextractor/template
@@ -1,7 +1,7 @@
 # Template file for 'ccextractor'
 pkgname=ccextractor
 version=0.93
-revision=1
+revision=2
 build_wrksrc="linux"
 build_style=gnu-configure
 configure_args="--enable-ocr --enable-hardsubx"
@@ -16,8 +16,14 @@ distfiles="https://github.com/CCExtractor/${pkgname}/archive/v${version}.tar.gz"
 checksum=0e66d3e360db1b02a88271af11313ca4c9bbda1b03728e264a44c4c9f77192e3
 CFLAGS="-I${XBPS_CROSS_BASE}/usr/include/tesseract -DPNG_POWERPC_VSX_OPT=0 -fcommon"
 
+if [ "$CROSS_BUILD" ]; then
+	hostmakedepends+=" tesseract-ocr-devel"
+fi
+
 pre_configure() {
-	sed -i -e "s/tesseract --version/tesseract-ocr --version/g" configure.ac
+	vsed -i configure.ac \
+		-e "s/tesseract --version/tesseract-ocr --version/g" \
+		-e "s/\[lept\]/[leptonica]/"
 	./autogen.sh
 }