summaryrefslogtreecommitdiffstats
path: root/2020/subtitles/emacsconf-2020--23-incremental-parsing-with-emacs-tree-sitter--tuan-anh-nguyen-autogen.vtt
blob: 62ad5f65d1cdda845d3ca518df900c6dfaec9c7f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
WEBVTT

00:00:01.520 --> 00:00:04.400
Hello, everyone! My name is Tuấn-Anh.

00:00:04.400 --> 00:00:07.200
I've been using Emacs for about 10 years.

00:00:07.200 --> 00:00:09.280
Today, I'm going to talk about tree-sitter,

00:00:09.280 --> 00:00:11.351
a new Emacs package that allows Emacs

00:00:11.351 --> 00:00:17.840
to parse multiple programming languages
in real-time.

00:00:17.840 --> 00:00:21.840
So what is the problem statement?

00:00:21.840 --> 00:00:24.131
In order to support programming
functionalities

00:00:24.131 --> 00:00:25.760
for a particular language,

00:00:25.760 --> 00:00:27.680
a text editor needs to have some degree

00:00:27.680 --> 00:00:29.679
of language understanding.

00:00:29.679 --> 00:00:31.840
Traditionally, text editors have relied

00:00:31.840 --> 00:00:34.960
very heavily on regular expressions for
this.

00:00:34.960 --> 00:00:37.013
Emacs is no different.

00:00:37.013 --> 00:00:40.170
Most language major modes use regular
expressions

00:00:40.170 --> 00:00:42.960
for syntax-highlighting, code navigation,

00:00:42.960 --> 00:00:46.618
folding, indexing, and so on.

00:00:46.618 --> 00:00:50.559
Regular expressions are problematic for
a couple of reasons.

00:00:50.559 --> 00:00:53.778
They're slow and inaccurate.

00:00:53.778 --> 00:00:56.800
They also make the code hard to read and
write.

00:00:56.800 --> 00:01:01.199
Sometimes it's because the regular
expressions themselves are very hairy,

00:01:01.199 --> 00:01:05.199
and sometimes because they are just not
powerful enough.

00:01:05.199 --> 00:01:08.625
Some helper code is usually needed

00:01:08.625 --> 00:01:11.200
to parse more intricate language
features.

00:01:11.200 --> 00:01:16.159
That also illustrates the core problem
with regular expressions,

00:01:16.159 --> 00:01:21.119
in that they are not powerful enough to
parse programming languages.

00:01:21.119 --> 00:01:25.040
An example feature that regular
expressions cannot handle very well

00:01:25.040 --> 00:01:28.320
is string interpolation, which is a very
common feature

00:01:28.320 --> 00:01:31.680
in many modern programming languages.

00:01:31.680 --> 00:01:34.079
It would be much nicer if Emacs somehow

00:01:34.079 --> 00:01:39.520
had structural understanding of source
code, like IDEs do.

00:01:39.520 --> 00:01:41.981
There have been multiple efforts

00:01:41.981 --> 00:01:45.280
to bring this kind of programming
language understanding into Emacs.

00:01:45.280 --> 00:01:47.119
There are language-specific parsers

00:01:47.119 --> 00:01:48.640
written in Elisp

00:01:48.640 --> 00:01:50.675
that can be thought of

00:01:50.675 --> 00:01:51.989
as the next logical step
of the glue code

00:01:51.989 --> 00:01:53.856
on top of regular expressions,

00:01:53.856 --> 00:01:57.356
moving from partial local pattern
recognition

00:01:57.356 --> 00:01:59.840
into a full-fledged parser.

00:01:59.840 --> 00:02:02.023
The most prominent example of this
approach

00:02:02.023 --> 00:02:06.479
is probably the famous js2-mode.

00:02:06.479 --> 00:02:10.080
However, this approach has several issues.

00:02:10.080 --> 00:02:12.606
Parsing is computationally expensive,

00:02:12.606 --> 00:02:16.800
and Emacs Lisp is not good at that kind
of stuff.

00:02:16.800 --> 00:02:19.156
Furthermore, maintenance is very
troublesome.

00:02:19.156 --> 00:02:22.160
In order to work on these parsers,

00:02:22.160 --> 00:02:24.239
first, you have to know Elisp
well enough,

00:02:24.239 --> 00:02:26.606
and then you have to be comfortable with

00:02:26.606 --> 00:02:29.739
writing a recursive descent parser,

00:02:29.739 --> 00:02:34.000
while constantly keeping up with changes
to the language itself,

00:02:34.000 --> 00:02:36.356
which can be evolving very quickly,

00:02:36.356 --> 00:02:39.360
like Javascript, for example.

00:02:39.360 --> 00:02:42.373
Together, these constraints
significantly reduce

00:02:42.373 --> 00:02:45.680
the pool of potential maintainers.

00:02:45.680 --> 00:02:47.760
The biggest issue, though, in my opinion,

00:02:47.760 --> 00:02:52.139
is the lack of a set of generic and
reusable APIs.

00:02:52.139 --> 00:02:54.319
This makes them very hard to use

00:02:54.319 --> 00:02:55.920
for minor modes that want to deal with

00:02:55.920 --> 00:02:59.920
cross-cutting concerns across multiple
languages.

00:02:59.920 --> 00:03:01.760
The other approach which has been

00:03:01.760 --> 00:03:04.319
gaining a lot of momentum
in recent years

00:03:04.319 --> 00:03:06.560
is externalizing language understanding

00:03:06.560 --> 00:03:08.159
to another process,

00:03:08.159 --> 00:03:12.239
also known as language server protocol.

00:03:12.239 --> 00:03:16.560
This second approach is actually a very
interesting one.

00:03:16.560 --> 00:03:18.400
By decoupling language understanding

00:03:18.400 --> 00:03:21.280
from the editing facility itself,

00:03:21.280 --> 00:03:25.120
the LSP servers can attract a lot more
contributors,

00:03:25.120 --> 00:03:27.189
which makes maintenance easier.

00:03:27.189 --> 00:03:32.400
However, they also have several issues
of their own.

00:03:32.400 --> 00:03:34.089
Being a separate process,

00:03:34.089 --> 00:03:37.073
they are usually more
resource-intensive,

00:03:37.073 --> 00:03:39.920
and depending on the language,

00:03:39.920 --> 00:03:42.159
the LSP server itself can bring with it

00:03:42.159 --> 00:03:44.640
a host of additional dependencies

00:03:44.640 --> 00:03:50.640
external to Emacs, which may be messy to
install and manage.

00:03:50.640 --> 00:03:55.120
Furthermore, JSON over RPC has pretty
high latency.

00:03:55.120 --> 00:03:57.840
For one-off tasks like jumping to source

00:03:57.840 --> 00:04:00.879
or on-demand completion, it's great.

00:04:00.879 --> 00:04:03.040
But for things like code highlighting,

00:04:03.040 --> 00:04:06.000
the latency is just too much.

00:04:06.000 --> 00:04:08.319
I was using Rust and I was following the

00:04:08.319 --> 00:04:11.760
community effort to improve its
IDE support,

00:04:11.760 --> 00:04:15.760
hoping to integrate some of that into
Emacs itself.

00:04:15.760 --> 00:04:19.759
Then I heard someone from the community
mention tree-sitter,

00:04:19.759 --> 00:04:23.360
and I decided to check it out.

00:04:23.360 --> 00:04:28.720
Basically, tree-sitter is an incremental
parsing library and a parser generator.

00:04:28.720 --> 00:04:33.040
It was introduced by the Atom editor in
2018.

00:04:33.040 --> 00:04:35.923
Besides Atom, it is also being
integrated

00:04:35.923 --> 00:04:37.623
into the NeoVim editor,

00:04:37.623 --> 00:04:41.040
and Github is using it to power

00:04:41.040 --> 00:04:42.423
their source code analysis

00:04:42.423 --> 00:04:45.840
and navigation features.

00:04:45.840 --> 00:04:48.639
It is written in C and can be compiled

00:04:48.639 --> 00:04:50.623
for all major platforms.

00:04:50.623 --> 00:04:53.120
It can even be compiled

00:04:53.120 --> 00:04:55.323
to WebAssembly to run on the web.

00:04:55.323 --> 00:05:00.800
That's how Github is using it
on their website.

00:05:00.800 --> 00:05:05.840
So why is tree-sitter an interesting
solution to this problem?

00:05:05.840 --> 00:05:10.000
There are multiple features that make it
an attractive option.

00:05:10.000 --> 00:05:11.839
It is designed to be fast.

00:05:11.839 --> 00:05:13.680
By being incremental,

00:05:13.680 --> 00:05:15.680
the initial parse of a typical big file

00:05:15.680 --> 00:05:18.160
can take tens of milliseconds,

00:05:18.160 --> 00:05:20.240
while subsequent incremental parses

00:05:20.240 --> 00:05:22.560
are sub-millisecond.

00:05:22.560 --> 00:05:26.240
It achieves this by using
structural sharing,

00:05:26.240 --> 00:05:29.360
meaning replacing only affected nodes

00:05:29.360 --> 00:05:32.960
in the old tree when it needs to.

00:05:32.960 --> 00:05:37.120
Also, unlike LSP, being in
the same process,

00:05:37.120 --> 00:05:40.639
it has much lower latency.

00:05:40.639 --> 00:05:44.960
Secondly, it provides a uniform
programming interface.

00:05:44.960 --> 00:05:47.039
The same data structures and functions

00:05:47.039 --> 00:05:50.400
work on parse trees of different
languages.

00:05:50.400 --> 00:05:52.160
Syntax nodes of different languages

00:05:52.160 --> 00:05:54.160
differ only by their types

00:05:54.160 --> 00:05:55.723
and their possible child nodes.

00:05:55.723 --> 00:06:02.240
This is a big advantage over
language-specific parsers.

00:06:02.240 --> 00:06:06.880
Thirdly, it's written in self-contained
embeddable C.

00:06:06.880 --> 00:06:11.723
As I mentioned previously, it can even
be compiled to WebAssembly.

00:06:11.723 --> 00:06:16.106
This makes integrating it into various
editors quite easy

00:06:16.106 --> 00:06:22.880
without having to install any external
dependencies.

00:06:22.880 --> 00:06:25.503
One thing that is not mentioned here

00:06:25.503 --> 00:06:28.000
is that being a parser generator,

00:06:28.000 --> 00:06:31.039
its grammars are declarative.

00:06:31.039 --> 00:06:34.880
Together with being editor-independent,

00:06:34.880 --> 00:06:39.139
this makes the pool of potential
contributors much larger.

00:06:39.139 --> 00:06:45.520
So I was convinced that tree-sitter is a
good fit for Emacs.

00:06:45.520 --> 00:06:48.000
Last year, I started writing the bindings

00:06:48.000 --> 00:06:53.280
using dynamic module support introduced
in Emacs 25.

00:06:53.280 --> 00:06:58.479
Dynamic module means there is
platform-specific native code involved,

00:06:58.479 --> 00:07:00.560
but since there are pre-compiled binaries

00:07:00.560 --> 00:07:02.880
for the three major platforms,

00:07:02.880 --> 00:07:04.706
it should work in most places.

00:07:04.706 --> 00:07:09.440
Currently, the core functionalities are
in a pretty good shape.

00:07:09.440 --> 00:07:12.560
Syntax highlighting is working nicely.

00:07:12.560 --> 00:07:16.080
The whole thing is split into three
packages.

00:07:16.080 --> 00:07:20.319
tree-sitter is the main package that
other packages should depend on.

00:07:20.319 --> 00:07:22.800
tree-sitter-langs is the language bundle

00:07:22.800 --> 00:07:24.000
that includes support

00:07:24.000 --> 00:07:27.199
for most common languages.

00:07:27.199 --> 00:07:32.160
And finally, the core APIs are in the
package tsc,

00:07:32.160 --> 00:07:36.160
which stands for tree-sitter-core.

00:07:36.160 --> 00:07:38.800
It is the implicit dependency of the

00:07:38.800 --> 00:07:43.520
tree-sitter package.

00:07:43.520 --> 00:07:47.520
The main package includes the minor mode
tree-sitter-mode.

00:07:47.520 --> 00:07:52.560
This provides the base for other major
or minor modes to build on.

00:07:52.560 --> 00:07:54.839
Using Emacs's change tracking hooks,

00:07:54.839 --> 00:07:57.073
it enables incremental parsing

00:07:57.073 --> 00:08:00.800
and provides a syntax tree that is
always up to date

00:08:00.800 --> 00:08:04.080
after any edits in a buffer.

00:08:04.080 --> 00:08:06.223
There is also a basic debug mode

00:08:06.223 --> 00:08:10.080
that shows the parse tree in
another buffer.

00:08:10.080 --> 00:08:13.360
Here is a quick demo.

00:08:13.360 --> 00:08:15.673
Here I'm in an empty Python buffer

00:08:15.673 --> 00:08:17.520
with tree-sitter enabled.

00:08:17.520 --> 00:08:19.440
I'm going to turn on the debug mode to

00:08:19.440 --> 00:08:26.560
see the parse tree.

00:08:26.560 --> 00:08:28.106
Since the buffer is empty,

00:08:28.106 --> 00:08:30.423
there is only one node in the
syntax tree:

00:08:30.423 --> 00:08:33.279
the top-level module node.

00:08:33.279 --> 00:09:11.040
Let's try typing some code.

00:09:11.040 --> 00:09:14.640
As you can see, as I type into the
Python buffer,

00:09:14.640 --> 00:09:19.120
the syntax tree updates in real time.

00:09:19.120 --> 00:09:22.039
The other minor mode included in the
main package

00:09:22.039 --> 00:09:24.389
is tree-sitter-hl-mode.

00:09:24.389 --> 00:09:26.349
It overrides font-lock mode

00:09:26.349 --> 00:09:28.480
and provides its own set of faces

00:09:28.480 --> 00:09:30.139
and customization options.

00:09:30.139 --> 00:09:32.800
It is query-driven.

00:09:32.800 --> 00:09:36.240
That means instead of regular
expressions,

00:09:36.240 --> 00:09:39.518
it uses a Lisp-like query language

00:09:39.518 --> 00:09:40.320
to map syntax nodes

00:09:40.320 --> 00:09:41.923
to highlighting faces.

00:09:41.923 --> 00:09:45.760
I'm going to open a python file with
small snippets

00:09:45.760 --> 00:09:54.320
that showcase syntax highlighting.

00:09:54.320 --> 00:09:55.920
So this is the default highlighting

00:09:55.920 --> 00:10:00.880
provided by python-mode.

00:10:00.880 --> 00:10:04.640
This is the highlighting enabled
by tree-sitter.

00:10:04.640 --> 00:10:07.680
as you can see string interpolation

00:10:07.680 --> 00:10:11.680
and decorators are highlighted correctly

00:10:11.680 --> 00:10:17.440
function calls are also highlighted

00:10:17.440 --> 00:10:20.240
you can also note that property

00:10:20.240 --> 00:10:21.839
accessors

00:10:21.839 --> 00:10:24.640
and property assignments are highlighted

00:10:24.640 --> 00:10:27.440
differently

00:10:27.440 --> 00:10:29.360
what I like the most about this is that

00:10:29.360 --> 00:10:30.880
new bindings are consistently

00:10:30.880 --> 00:10:32.640
highlighted

00:10:32.640 --> 00:10:36.320
this includes local variables,

00:10:36.320 --> 00:10:39.760
function parameters and property

00:10:39.760 --> 00:10:45.760
mutations

00:10:45.760 --> 00:10:48.000
before going through the tree queries

00:10:48.000 --> 00:10:49.279
and the syntax highlighting

00:10:49.279 --> 00:10:51.680
customization options

00:10:51.680 --> 00:10:53.760
let's take a brief look at the core data

00:10:53.760 --> 00:10:55.040
structures and functions

00:10:55.040 --> 00:10:58.079
that tree-sitter provides

00:10:58.079 --> 00:10:59.839
so parsing is done with the help of a

00:10:59.839 --> 00:11:02.240
generic parser object

00:11:02.240 --> 00:11:04.160
a single parser object can be used to

00:11:04.160 --> 00:11:06.000
parse different languages

00:11:06.000 --> 00:11:08.320
by sending different language objects to

00:11:08.320 --> 00:11:09.279
it

00:11:09.279 --> 00:11:10.880
the language objects themselves are

00:11:10.880 --> 00:11:14.079
loaded from shared libraries

00:11:14.079 --> 00:11:16.079
since tree-sitter-mode already handles

00:11:16.079 --> 00:11:17.360
the parsing part

00:11:17.360 --> 00:11:19.440
we will instead focus on the functions

00:11:19.440 --> 00:11:20.800
that inspect nodes

00:11:20.800 --> 00:11:25.279
in the resulting parse tree

00:11:25.279 --> 00:11:27.200
we can ask tree-sitter what is the

00:11:27.200 --> 00:11:44.240
syntax node at point

00:11:44.240 --> 00:11:47.200
it is an opaque object, so this is not

00:11:47.200 --> 00:11:48.480
very useful

00:11:48.480 --> 00:12:03.760
we can instead ask what is its type

00:12:03.760 --> 00:12:06.560
so its type is the symbol comparison

00:12:06.560 --> 00:12:08.959
operator

00:12:08.959 --> 00:12:11.600
in tree-sitter, there are two kinds of nodes:

00:12:11.600 --> 00:12:13.680
anonymous nodes and named nodes

00:12:13.680 --> 00:12:15.519
anonymous nodes correspond to simple

00:12:15.519 --> 00:12:17.040
grammar elements

00:12:17.040 --> 00:12:19.839
like keywords operators punctuations and

00:12:19.839 --> 00:12:21.279
so on

00:12:21.279 --> 00:12:24.160
named nodes, on the other hand, are grammar

00:12:24.160 --> 00:12:25.920
elements that are interesting enough for

00:12:25.920 --> 00:12:26.639
their own

00:12:26.639 --> 00:12:30.320
to have a name like an identifier an

00:12:30.320 --> 00:12:31.839
expression

00:12:31.839 --> 00:12:35.440
or a function definition

00:12:35.440 --> 00:12:37.760
named node types are symbols, while

00:12:37.760 --> 00:12:42.639
anonymous node types are strings

00:12:42.639 --> 00:12:46.320
for example if we are on this

00:12:46.320 --> 00:12:49.760
comparison operator

00:12:49.760 --> 00:12:55.920
the node type should be a string

00:12:55.920 --> 00:12:57.920
we can also get other information about

00:12:57.920 --> 00:12:58.959
the node

00:12:58.959 --> 00:13:09.680
for example, what is its text

00:13:09.680 --> 00:13:20.800
or where it is in the buffer

00:13:20.800 --> 00:13:43.199
or what is its parent

00:13:43.199 --> 00:13:46.160
there are many other APIs to query

00:13:46.160 --> 00:13:46.839
node

00:13:46.839 --> 00:13:52.639
properties

00:13:52.639 --> 00:13:54.399
tree-sitter allows searching for

00:13:54.399 --> 00:13:58.240
structural patterns within a parse tree

00:13:58.240 --> 00:14:01.440
it does so through a Lisp-like language

00:14:01.440 --> 00:14:03.519
this language supports pattern matching

00:14:03.519 --> 00:14:04.639
by node types

00:14:04.639 --> 00:14:07.760
field names and predicates

00:14:07.760 --> 00:14:10.079
it also allows capturing nodes for

00:14:10.079 --> 00:14:12.639
further processing

00:14:12.639 --> 00:14:37.680
let's try to see some examples

00:14:37.680 --> 00:14:41.040
so in this very simple query we just

00:14:41.040 --> 00:14:43.839
try to highlight all the identifiers in

00:14:43.839 --> 00:14:49.040
the buffer

00:14:49.040 --> 00:14:51.920
this @ sign tells tree-sitter to capture a

00:14:51.920 --> 00:14:53.120
node

00:14:53.120 --> 00:14:55.839
in the context of the query builder it's

00:14:55.839 --> 00:14:57.360
not very important

00:14:57.360 --> 00:15:00.320
but in normal highlighting query this

00:15:00.320 --> 00:15:01.760
will determine

00:15:01.760 --> 00:15:06.639
the face used to highlight the node

00:15:06.639 --> 00:15:08.800
suppose we want to capture all the

00:15:08.800 --> 00:15:10.320
function names

00:15:10.320 --> 00:15:13.519
instead of just any identifier

00:15:13.519 --> 00:15:29.440
you can improve the query like this

00:15:29.440 --> 00:15:31.600
uh this will highlight the whole

00:15:31.600 --> 00:15:32.639
definition

00:15:32.639 --> 00:15:35.519
but we only want to capture the function

00:15:35.519 --> 00:15:36.399
name

00:15:36.399 --> 00:15:39.600
which means the identifier

00:15:39.600 --> 00:15:42.800
here so we

00:15:42.800 --> 00:15:46.320
move the capture to after the identifier

00:15:46.320 --> 00:15:49.600
node

00:15:49.600 --> 00:15:51.759
if we want to capture the class names as

00:15:51.759 --> 00:15:52.959
well

00:15:52.959 --> 00:16:10.079
we just add another pattern

00:16:10.079 --> 00:16:20.320
let's look at a more practical example

00:16:20.320 --> 00:16:22.959
here we can see that single quotes

00:16:22.959 --> 00:16:23.759
strings and

00:16:23.759 --> 00:16:25.600
double-quoted strings are highlighted

00:16:25.600 --> 00:16:27.279
the same

00:16:27.279 --> 00:16:30.399
but in some places

00:16:30.399 --> 00:16:33.440
because of some coding conventions

00:16:33.440 --> 00:16:35.440
it may be desirable to highlight them

00:16:35.440 --> 00:16:37.279
differently for example if

00:16:37.279 --> 00:16:39.680
the string is single quoted we may want

00:16:39.680 --> 00:16:40.880
to highlight it

00:16:40.880 --> 00:16:44.399
as a constant

00:16:44.399 --> 00:16:46.160
let's try to see whether we can

00:16:46.160 --> 00:16:47.600
distinguish these

00:16:47.600 --> 00:16:56.240
two cases

00:16:56.240 --> 00:17:00.639
so here we get all the strings

00:17:00.639 --> 00:17:04.079
if we want to see if it's single quotes

00:17:04.079 --> 00:17:04.559
or

00:17:04.559 --> 00:17:08.799
double quote strings

00:17:08.799 --> 00:17:11.039
we can try looking at the first

00:17:11.039 --> 00:17:12.480
character

00:17:12.480 --> 00:17:15.280
of the string I mean the first character

00:17:15.280 --> 00:17:16.720
of the node

00:17:16.720 --> 00:17:19.360
to check whether it's a single quote or

00:17:19.360 --> 00:17:33.600
a double quote

00:17:33.600 --> 00:17:36.080
yeah, so for that we use

00:17:36.080 --> 00:17:36.799
tree-sitter's

00:17:36.799 --> 00:17:40.160
support for predicate in this case

00:17:40.160 --> 00:17:43.360
we use a match predicate

00:17:43.360 --> 00:17:46.080
to check whether the string where the

00:17:46.080 --> 00:17:46.799
node

00:17:46.799 --> 00:17:50.320
starts with a single quote and with this

00:17:50.320 --> 00:17:51.280
pattern

00:17:51.280 --> 00:17:58.840
we only capture the single quotes

00:17:58.840 --> 00:18:00.400
strings

00:18:00.400 --> 00:18:03.760
let's try to give it a different face

00:18:03.760 --> 00:18:13.039
so we copy the pattern

00:18:13.039 --> 00:18:18.640
and we add this pattern

00:18:18.640 --> 00:18:25.120
pop item only

00:18:25.120 --> 00:18:28.400
but we also want to give the

00:18:28.400 --> 00:18:31.440
capture a different name

00:18:31.440 --> 00:18:40.840
let's say we want to highlight it as a

00:18:40.840 --> 00:18:46.559
keyword

00:18:46.559 --> 00:19:06.320
and now if we refresh the buffer

00:19:06.320 --> 00:19:08.799
we see that single quote strings are

00:19:08.799 --> 00:19:10.320
highlighted as

00:19:10.320 --> 00:19:14.400
keywords

00:19:14.400 --> 00:19:16.400
the highlighting patterns can also be

00:19:16.400 --> 00:19:19.200
set for a single project

00:19:19.200 --> 00:19:23.440
using directory local variable

00:19:23.440 --> 00:19:26.880
for example let's take a look at

00:19:26.880 --> 00:19:35.760
Emacs source code

00:19:35.760 --> 00:19:40.400
so in Emacs C source there are a lot of

00:19:40.400 --> 00:19:43.760
uses of this DEFUN macro

00:19:43.760 --> 00:19:47.679
to define functions

00:19:47.679 --> 00:19:51.200
and you can see

00:19:51.200 --> 00:19:53.520
this is actually the function name but

00:19:53.520 --> 00:19:55.760
it's highlighted as the

00:19:55.760 --> 00:19:59.120
string so what we want

00:19:59.120 --> 00:20:03.679
is to somehow recognize this pattern

00:20:03.679 --> 00:20:07.600
and highlight it

00:20:07.600 --> 00:20:11.280
as highlight this part

00:20:11.280 --> 00:20:14.559
with the function face instead

00:20:14.559 --> 00:20:17.679
in order to do that

00:20:17.679 --> 00:20:20.240
we put a pattern in this project

00:20:20.240 --> 00:20:21.760
directory local

00:20:21.760 --> 00:20:31.760
settings file

00:20:31.760 --> 00:20:34.799
so we can put this pattern in the c

00:20:34.799 --> 00:20:40.159
mode section

00:20:40.159 --> 00:20:48.000
and now if we enable tree-sitter

00:20:48.000 --> 00:20:50.480
you can see that this is the highlighted

00:20:50.480 --> 00:20:53.200
uh

00:20:53.200 --> 00:20:55.520
as a normal function definition so this

00:20:55.520 --> 00:20:56.559
is the function

00:20:56.559 --> 00:21:01.200
face like we wanted

00:21:01.200 --> 00:21:03.760
the pattern for this is actually pretty

00:21:03.760 --> 00:21:07.200
simple

00:21:07.200 --> 00:21:10.720
it's only

00:21:10.720 --> 00:21:14.720
only this part so

00:21:14.720 --> 00:21:17.440
if it's a function call where the name

00:21:17.440 --> 00:21:19.679
of the function is DEFUN

00:21:19.679 --> 00:21:21.600
then we highlight the DEFUN as a

00:21:21.600 --> 00:21:24.240
keyword

00:21:24.240 --> 00:21:27.360
and then the first string element we

00:21:27.360 --> 00:21:28.159
highlighted

00:21:28.159 --> 00:21:35.360
as a function name

00:21:35.360 --> 00:21:37.679
since the language objects are actually

00:21:37.679 --> 00:21:39.280
native code

00:21:39.280 --> 00:21:40.799
they have to be compiled for each

00:21:40.799 --> 00:21:43.440
platform that we want to support

00:21:43.440 --> 00:21:45.600
this will become a big obstacle for

00:21:45.600 --> 00:21:48.159
tree-sitter adoption

00:21:48.159 --> 00:21:50.240
therefore I've created a language bundle

00:21:50.240 --> 00:21:52.960
package, tree-sitter-langs,

00:21:52.960 --> 00:21:54.960
that takes care of pre-compiling the

00:21:54.960 --> 00:21:56.320
grammars the

00:21:56.320 --> 00:21:59.679
most common grammars for all three major

00:21:59.679 --> 00:22:01.600
platforms

00:22:01.600 --> 00:22:04.080
it also takes care of distributing these

00:22:04.080 --> 00:22:05.360
binaries

00:22:05.360 --> 00:22:08.080
and provides some highlighting queries

00:22:08.080 --> 00:22:11.440
for some of the languages

00:22:11.440 --> 00:22:13.760
it should be noted that this package

00:22:13.760 --> 00:22:15.919
should be treated as a temporary

00:22:15.919 --> 00:22:19.919
distribution mechanism only

00:22:19.919 --> 00:22:22.240
to help with bootstrapping tree-sitter's

00:22:22.240 --> 00:22:24.720
adoption

00:22:24.720 --> 00:22:27.760
the plan is that eventually these files

00:22:27.760 --> 00:22:29.760
should be provided by the language major

00:22:29.760 --> 00:22:32.480
modes themselves

00:22:32.480 --> 00:22:35.120
but in order to do that we need better

00:22:35.120 --> 00:22:36.320
tooling

00:22:36.320 --> 00:22:40.240
so we're not there yet

00:22:40.240 --> 00:22:42.559
since the core already works reasonably

00:22:42.559 --> 00:22:43.280
well

00:22:43.280 --> 00:22:44.640
there are several areas that would

00:22:44.640 --> 00:22:46.320
benefit from the community's

00:22:46.320 --> 00:22:49.120
contribution

00:22:49.120 --> 00:22:51.520
so tree-sitter's upstream language

00:22:51.520 --> 00:22:52.640
repositories

00:22:52.640 --> 00:22:54.400
already contain highlighting queries on

00:22:54.400 --> 00:22:55.679
their own

00:22:55.679 --> 00:22:58.480
however they are pretty basic and they

00:22:58.480 --> 00:23:00.480
may not fit well with existing Emacs

00:23:00.480 --> 00:23:02.559
conventions

00:23:02.559 --> 00:23:04.320
therefore the language bundle has its

00:23:04.320 --> 00:23:07.120
own set of highlighting queries

00:23:07.120 --> 00:23:10.559
this requires maintenance until language

00:23:10.559 --> 00:23:11.600
major modes adopt

00:23:11.600 --> 00:23:13.760
tree-sitter and maintain the queries on

00:23:13.760 --> 00:23:16.640
their own

00:23:16.640 --> 00:23:18.480
the queries are actually quite easy to

00:23:18.480 --> 00:23:22.000
write as you've already seen

00:23:22.000 --> 00:23:24.240
you just need to be familiar with the

00:23:24.240 --> 00:23:25.360
language

00:23:25.360 --> 00:23:30.000
familiar enough to come up with sensible

00:23:30.000 --> 00:23:35.200
highlighting patterns

00:23:35.200 --> 00:23:37.600
and if you are a maintainer of a

00:23:37.600 --> 00:23:39.679
language major mode

00:23:39.679 --> 00:23:42.320
you may want to consider integrating

00:23:42.320 --> 00:23:43.360
tree-sitter into

00:23:43.360 --> 00:23:46.960
your mode initially maybe as an

00:23:46.960 --> 00:23:50.080
optional feature the integration is

00:23:50.080 --> 00:23:53.279
actually pretty straightforward

00:23:53.279 --> 00:23:56.640
especially for syntax highlighting

00:23:56.640 --> 00:24:01.520
or alternatively

00:24:01.520 --> 00:24:03.760
you can also try writing a new major

00:24:03.760 --> 00:24:04.640
mode

00:24:04.640 --> 00:24:08.000
from scratch that relies on tree-sitter

00:24:08.000 --> 00:24:12.559
from the very beginning

00:24:12.559 --> 00:24:16.320
the code for such a major mode is

00:24:16.320 --> 00:24:19.679
quite simple for example

00:24:19.679 --> 00:24:23.200
this is the proposed

00:24:23.200 --> 00:24:26.240
wat-mode for WebAssembly

00:24:26.240 --> 00:24:31.039
the code is just

00:24:31.039 --> 00:24:34.559
like one page of code not

00:24:34.559 --> 00:24:39.520
not a lot

00:24:39.520 --> 00:24:42.720
you can also try writing new minor modes

00:24:42.720 --> 00:24:46.559
or writing integration packages

00:24:46.559 --> 00:24:50.080
for example a lot of package a lot of

00:24:50.080 --> 00:24:50.880
packages

00:24:50.880 --> 00:24:54.559
may benefit from tree-sitter integration

00:24:54.559 --> 00:24:58.840
but no one has written the integration

00:24:58.840 --> 00:25:02.960
yet

00:25:02.960 --> 00:25:05.039
if you are interested in tree-sitter you

00:25:05.039 --> 00:25:06.720
can use these links to

00:25:06.720 --> 00:25:10.320
learn more about it I think that's it

00:25:10.320 --> 00:25:11.440
for me today

00:25:11.440 --> 00:25:18.159
I'm happy to answer any questions