Add Syriac script support (#13800)

* Add Syriac Language support dictionary

The Syriac Unicode block contains characters for all forms of the Syriac alphabet, including the Estrangela, Serto, Eastern Syriac, and Christian Palestinian Aramaic variants. The script is used to write Literary Syriac, Neo-Aramaic, and Arabic among Syriac-speaking Christians. It was used historically to write Armenian, Persian, Ottoman Turkish, and Malayalam. Like Arabic and Hebrew, the script is written right-to-left (RTL).

https://en.wikipedia.org/wiki/Syriac_(Unicode_block)
https://en.wikipedia.org/wiki/Syriac_language

* Add Syriac script support for training

The Syriac Unicode block contains characters for all forms of the Syriac alphabet, including the Estrangela, Serto, Eastern Syriac, and Christian Palestinian Aramaic variants. The script is used to write Literary Syriac, Neo-Aramaic, and Arabic among Syriac-speaking Christians. It was used historically to write Armenian, Persian, Ottoman Turkish, and Malayalam. Like Arabic and Hebrew, the script is written right-to-left (RTL).

https://en.wikipedia.org/wiki/Syriac_(Unicode_block)
https://en.wikipedia.org/wiki/Syriac_language
pull/13820/head
johnlockejrr 2024-09-01 05:10:42 -07:00 committed by GitHub
parent 6225a90ef0
commit ada310811a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 267 additions and 0 deletions

View File

@ -0,0 +1,110 @@
# Run-wide settings for the Syriac text-recognition ("rec") training config.
# All keys below belong to the Global mapping; they must be indented under it,
# otherwise `Global` parses as null and the settings become top-level keys.
Global:
  use_gpu: true
  epoch_num: 500
  log_smooth_window: 20
  print_batch_step: 10
  # Output directory for saved models.
  save_model_dir: ./output/rec_syriac_lite
  save_epoch_step: 3
  # NOTE(review): presumably [start_iteration, interval_in_iterations];
  # confirm against the trainer that consumes this file.
  eval_batch_step:
    - 0
    - 2000
  cal_metric_during_train: true
  # No pretrained weights or checkpoint are configured.
  pretrained_model: null
  checkpoints: null
  save_inference_dir: null
  use_visualdl: false
  infer_img: null
  # NOTE(review): presumably the Syriac dictionary file added in the same
  # commit (one character per line) — confirm the path.
  character_dict_path: ppocr/utils/dict/syriac_dict.txt
  max_text_length: 25
  infer_mode: false
  use_space_char: true
# Optimizer settings: Adam with a cosine learning-rate schedule and L2
# regularization. `lr` and `regularizer` are nested mappings; each carries its
# own `name`, so they must not share a level with the optimizer's `name`.
Optimizer:
  name: Adam
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Cosine
    learning_rate: 0.001
  regularizer:
    name: L2
    factor: 1.0e-05
# Model definition: a CRNN recognizer built from a MobileNetV3 backbone, an
# RNN sequence-encoder neck and a CTC head. No Transform stage is configured.
# Backbone / Neck / Head are separate nested mappings, each with its own
# `name`; flattening them would produce duplicate keys.
Architecture:
  model_type: rec
  algorithm: CRNN
  Transform: null
  Backbone:
    name: MobileNetV3
    scale: 0.5
    model_name: small
    # NOTE(review): presumably one stride per downsampling stage of the
    # backbone — confirm against the MobileNetV3 implementation.
    small_stride:
      - 1
      - 2
      - 2
      - 2
  Neck:
    name: SequenceEncoder
    encoder_type: rnn
    hidden_size: 48
  Head:
    name: CTCHead
    fc_decay: 1.0e-05
# CTC training loss.
Loss:
  name: CTCLoss

# Decoder that turns CTC head output into label text.
PostProcess:
  name: CTCLabelDecode

# Recognition metric; `acc` is the indicator used as the main score.
Metric:
  name: RecMetric
  main_indicator: acc
# Training data pipeline: dataset definition, ordered per-sample transforms,
# and data-loader settings.
Train:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/
    label_file_list:
      - train_data/syriac_train.txt
    # Transforms are a list of single-key mappings; each operator's arguments
    # are nested under its key (null means no arguments are given).
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: false
      - RecAug: null
      - CTCLabelEncode: null
      - RecResizeImg:
          # NOTE(review): presumably [channels, height, width] — confirm.
          image_shape:
            - 3
            - 32
            - 320
      - KeepKeys:
          keep_keys:
            - image
            - label
            - length
  loader:
    shuffle: true
    batch_size_per_card: 256
    drop_last: true
    num_workers: 8
# Evaluation data pipeline: dataset definition, ordered per-sample transforms,
# and data-loader settings. The transform list has no RecAug step, and the
# loader neither shuffles nor drops the last batch.
Eval:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/
    label_file_list:
      - train_data/syriac_val.txt
    # Transforms are a list of single-key mappings; each operator's arguments
    # are nested under its key (null means no arguments are given).
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: false
      - CTCLabelEncode: null
      - RecResizeImg:
          # NOTE(review): presumably [channels, height, width] — confirm.
          image_shape:
            - 3
            - 32
            - 320
      - KeepKeys:
          keep_keys:
            - image
            - label
            - length
  loader:
    shuffle: false
    drop_last: false
    batch_size_per_card: 256
    num_workers: 8

View File

@ -0,0 +1,157 @@
!
#
$
%
&
'
(
+
,
-
.
/
0
1
2
3
4
5
6
7
8
9
:
?
@
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
_
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
É
é
܀
܁
܂
܃
܄
܅
܆
܇
܈
܉
܊
܋
܌
܍
܏
ܐ
ܑ
ܒ
ܓ
ܔ
ܕ
ܖ
ܗ
ܘ
ܙ
ܚ
ܛ
ܜ
ܝ
ܞ
ܟ
ܠ
ܡ
ܢ
ܣ
ܤ
ܥ
ܦ
ܧ
ܨ
ܩ
ܪ
ܫ
ܬ
ܭ
ܮ
ܯ
ܰ
ܱ
ܲ
ܳ
ܴ
ܵ
ܶ
ܷ
ܸ
ܹ
ܺ
ܻ
ܼ
ܽ
ܾ
ܿ
݀
݁
݂
݃
݄
݅
݆
݇
݈
݉
݊
ݍ
ݎ
ݏ