summaryrefslogtreecommitdiff
path: root/vere/ext/gmp/gen/aarch64-linux/mpn/popcount.s
blob: e0d09b09b1d2d1e6a0fe6f02362815bed1a0a7b2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169




































































	.text
	.align	3
	.globl	__gmpn_popcount
	.type	__gmpn_popcount,@function

// ---------------------------------------------------------------------------
// __gmpn_popcount -- count the 1 bits in an array of 64-bit limbs.
//
// Syntax: GNU as, AArch64 (Advanced SIMD required).
// Presumably the body of GMP's mpn_popcount(ap, n) -- confirm against gmp.h.
//
//   In:    x0 = ap, address of the first limb (advanced as limbs are read)
//          x1 = n,  number of 64-bit limbs
//   Out:   x0 = number of set bits in ap[0 .. n-1]
//   Clobb: x1, x11, v0-v7, flags; on the large-count path additionally
//          x4, x5, x7, x8, x9, x10.  x30 is parked in x8 and restored there.
//          No stack is used.
//
// Method: CNT produces a per-byte bit count, UADALP folds adjacent byte
// counts into sixteen 16-bit counters (v4 and v5).  At .Lsum the two
// registers are added into eight 16-bit counters, which .Le0 widens and
// reduces to a single 64-bit total.
//
// Counter range: every limb lands in either the low or the high 64 bits of
// a vector register and adds at most 16 to each of the four 16-bit lanes of
// that half.  An odd leading limb is loaded into the low half, so after the
// final 16-bit add the low lanes have seen ceil(n/2) limbs.  They must stay
// below 2^16, i.e. ceil(n/2)*16 <= 65535, which holds for n <= 0x1ffe.
// (n = 0x1fff would give 4096*16 = 65536 and wrap the low lanes to zero.)
// Larger counts are split into chunks by the code at .Lgt8k.
// ---------------------------------------------------------------------------
__gmpn_popcount:

	mov	x11, #0x1ffe			// largest n safe for one pass (see above)
	cmp	x1, x11
	b.hi	.Lgt8k				// unsigned n > threshold: chunked path

// Single-pass entry.  Also called with BL from .Lgt8k for the final tail.
.Llt8k:
	movi	v4.16b, #0			// clear 16-bit counters (even registers)
	movi	v5.16b, #0			// clear 16-bit counters (odd registers)

	tbz	x1, #0, .Lxx0			// n even: no single leading limb
	sub	x1, x1, #1
	ld1	{v0.1d}, [x0], #8		// 1 limb; 64-bit load clears v0's high half
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b			// only the low four lanes can grow here

.Lxx0:	tbz	x1, #1, .Lx00			// bit 1 clear: no 2-limb group
	sub	x1, x1, #2
	ld1	{v0.2d}, [x0], #16		// 2 limbs
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b

.Lx00:	tbz	x1, #2, .L000			// bit 2 clear: remaining n is a multiple of 8
	subs	x1, x1, #4
	ld1	{v0.2d,v1.2d}, [x0], #32	// 4 limbs, counted later at .Lsum or .Lmid
	b.ls	.Lsum				// nothing left after these 4: just sum up

	// More limbs follow (a nonzero multiple of 8).  Load 4 more, start
	// counting the 4 already held, and join the loop at its midpoint.
	// The extra -4 keeps x1 biased so that the SUBS in .Lmid borrows
	// exactly when no limbs remain to be loaded.
.Lgt4:	ld1	{v2.2d,v3.2d}, [x0], #32	// 4 limbs
	sub	x1, x1, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	.Lmid

.L000:	subs	x1, x1, #8			// bias count by one 8-limb group
	b.lo	.Le0				// borrow: no limbs left, reduce v4 only

// Entry used by .Lgt8k (via BL) with v4/v5 already cleared and x1 already
// reduced by 8, mirroring the SUBS at .L000.
.Lchu:	ld1	{v2.2d,v3.2d}, [x0], #32	// 4 limbs
	ld1	{v0.2d,v1.2d}, [x0], #32	// 4 limbs
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	x1, x1, #8
	b.lo	.Lend				// those 8 limbs were the last ones

// Main loop, 8 limbs per iteration.  Loads, byte counts (v6/v7) and
// accumulation (v4/v5) belong to different 4-limb groups and are
// interleaved; v6/v7 always hold byte counts not yet accumulated.
.Ltop:	ld1	{v2.2d,v3.2d}, [x0], #32	// 4 limbs
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v1.16b
.Lmid:	ld1	{v0.2d,v1.2d}, [x0], #32	// 4 limbs
	subs	x1, x1, #8
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v3.16b
	b.hs	.Ltop				// no borrow: at least 8 more limbs to load

.Lend:	uadalp	v4.8h,  v6.16b			// flush the pending byte counts
	uadalp	v5.8h,  v7.16b
.Lsum:	cnt	v6.16b, v0.16b			// last 4 limbs are still raw in v0/v1
	cnt	v7.16b, v1.16b
	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
	add	v4.8h, v4.8h, v5.8h		// 16 counters -> 8 (16-bit add, must not wrap)

.Le0:	uaddlp	v4.4s,  v4.8h			// 8 x 16-bit -> 4 x 32-bit
	uaddlp	v4.2d,  v4.4s			// 4 x 32-bit -> 2 x 64-bit
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1			// x0 = total bit count
	ret


// Large counts: process 0x1ff0-limb chunks through .Lchu, then hand the
// remainder (at most the threshold in x11) to .Llt8k, summing the results.
// A chunk puts 0x1ff0/2 = 4088 limbs in each register half, so its 16-bit
// counters peak at 4088*16 = 65408.  The code above touches only x0, x1 and
// v0-v7, so x4, x5, x7-x11 survive the calls.
.Lgt8k:
	mov	x8, x30				// BL below overwrites the return address
	mov	x7, x1				// x7 = limbs still to be counted
	mov	x4, #0				// x4 = running total
	mov	x9, #0x1ff0*8			// chunk size in bytes
	mov	x10, #0x1ff0			// chunk size in limbs

1:	add	x5, x0, x9			// x5 = start of the following chunk
	mov	x1, #0x1ff0-8			// chunk count, pre-biased for the .Lchu entry
	movi	v4.16b, #0			// .Lchu skips the clears done at .Llt8k
	movi	v5.16b, #0
	bl	.Lchu				// returns with x0 = bits in this chunk
	add	x4, x4, x0
	mov	x0, x5				// advance to the next chunk
	sub	x7, x7, x10
	cmp	x7, x11
	b.hi	1b				// remainder still too large for one pass

	mov	x1, x7				// final, single-pass sized remainder
	bl	.Llt8k
	add	x0, x4, x0			// total = chunks + remainder
	mov	x30, x8				// restore the caller's return address
	ret
	.size	__gmpn_popcount,.-__gmpn_popcount