summaryrefslogtreecommitdiffstats
path: root/kernel/tools/perf/util/parse-events.c
blob: b48e87693aa56b5ed7c3e3563fb19ab3a15d712c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
#include <linux/hw_breakpoint.h>
#include <linux/err.h>
#include "util.h"
#include "../perf.h"
#include "evlist.h"
#include "evsel.h"
#include "parse-options.h"
#include "parse-events.h"
#include "exec_cmd.h"
#include "string.h"
#include "symbol.h"
#include "cache.h"
#include "header.h"
#include "bpf-loader.h"
#include "debug.h"
#include <api/fs/tracing_path.h>
#include "parse-events-bison.h"
#define YY_EXTRA_TYPE int
#include "parse-events-flex.h"
#include "pmu.h"
#include "thread_map.h"
#include "cpumap.h"
#include "asm/bug.h"

#define MAX_NAME_LEN 100

#ifdef PARSER_DEBUG
extern int parse_events_debug;
#endif
int parse_events_parse(void *data, void *scanner);
static int get_config_terms(struct list_head *head_config,
			    struct list_head *head_terms __maybe_unused);

static struct perf_pmu_event_symbol *perf_pmu_events_list;
/*
 * The variable indicates the number of supported pmu event symbols.
 * 0 means not initialized and ready to init
 * -1 means failed to init, don't try anymore
 * >0 is the number of supported pmu event symbols
 */
static int perf_pmu_events_list_num;

struct event_symbol event_symbols_hw[PERF_COUNT_HW_MAX] = {
	[PERF_COUNT_HW_CPU_CYCLES] = {
		.symbol = "cpu-cycles",
		.alias  = "cycles",
	},
	[PERF_COUNT_HW_INSTRUCTIONS] = {
		.symbol = "instructions",
		.alias  = "",
	},
	[PERF_COUNT_HW_CACHE_REFERENCES] = {
		.symbol = "cache-references",
		.alias  = "",
	},
	[PERF_COUNT_HW_CACHE_MISSES] = {
		.symbol = "cache-misses",
		.alias  = "",
	},
	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = {
		.symbol = "branch-instructions",
		.alias  = "branches",
	},
	[PERF_COUNT_HW_BRANCH_MISSES] = {
		.symbol = "branch-misses",
		.alias  = "",
	},
	[PERF_COUNT_HW_BUS_CYCLES] = {
		.symbol = "bus-cycles",
		.alias  = "",
	},
	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = {
		.symbol = "stalled-cycles-frontend",
		.alias  = "idle-cycles-frontend",
	},
	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = {
		.symbol = "stalled-cycles-backend",
		.alias  = "idle-cycles-backend",
	},
	[PERF_COUNT_HW_REF_CPU_CYCLES] = {
		.symbol = "ref-cycles",
		.alias  = "",
	},
};

struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
	[PERF_COUNT_SW_CPU_CLOCK] = {
		.symbol = "cpu-clock",
		.alias  = "",
	},
	[PERF_COUNT_SW_TASK_CLOCK] = {
		.symbol = "task-clock",
		.alias  = "",
	},
	[PERF_COUNT_SW_PAGE_FAULTS] = {
		.symbol = "page-faults",
		.alias  = "faults",
	},
	[PERF_COUNT_SW_CONTEXT_SWITCHES] = {
		.symbol = "context-switches",
		.alias  = "cs",
	},
	[PERF_COUNT_SW_CPU_MIGRATIONS] = {
		.symbol = "cpu-migrations",
		.alias  = "migrations",
	},
	[PERF_COUNT_SW_PAGE_FAULTS_MIN] = {
		.symbol = "minor-faults",
		.alias  = "",
	},
	[PERF_COUNT_SW_PAGE_FAULTS_MAJ] = {
		.symbol = "major-faults",
		.alias  = "",
	},
	[PERF_COUNT_SW_ALIGNMENT_FAULTS] = {
		.symbol = "alignment-faults",
		.alias  = "",
	},
	[PERF_COUNT_SW_EMULATION_FAULTS] = {
		.symbol = "emulation-faults",
		.alias  = "",
	},
	[PERF_COUNT_SW_DUMMY] = {
		.symbol = "dummy",
		.alias  = "",
	},
	[PERF_COUNT_SW_BPF_OUTPUT] = {
		.symbol = "bpf-output",
		.alias  = "",
	},
};

#define __PERF_EVENT_FIELD(config, name) \
	((config & PERF_EVENT_##name##_MASK) >> PERF_EVENT_##name##_SHIFT)

#define PERF_EVENT_RAW(config)		__PERF_EVENT_FIELD(config, RAW)
#define PERF_EVENT_CONFIG(config)	__PERF_EVENT_FIELD(config, CONFIG)
#define PERF_EVENT_TYPE(config)		__PERF_EVENT_FIELD(config, TYPE)
#define PERF_EVENT_ID(config)		__PERF_EVENT_FIELD(config, EVENT)

#define for_each_subsystem(sys_dir, sys_dirent, sys_next)	       \
	while (!readdir_r(sys_dir, &sys_dirent, &sys_next) && sys_next)	       \
	if (sys_dirent.d_type == DT_DIR &&				       \
	   (strcmp(sys_dirent.d_name, ".")) &&				       \
	   (strcmp(sys_dirent.d_name, "..")))

static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir)
{
	char evt_path[MAXPATHLEN];
	int fd;

	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", tracing_events_path,
			sys_dir->d_name, evt_dir->d_name);
	fd = open(evt_path, O_RDONLY);
	if (fd < 0)
		return -EINVAL;
	close(fd);

	return 0;
}

#define for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next)	       \
	while (!readdir_r(evt_dir, &evt_dirent, &evt_next) && evt_next)        \
	if (evt_dirent.d_type == DT_DIR &&				       \
	   (strcmp(evt_dirent.d_name, ".")) &&				       \
	   (strcmp(evt_dirent.d_name, "..")) &&				       \
	   (!tp_event_has_id(&sys_dirent, &evt_dirent)))

#define MAX_EVENT_LENGTH 512


struct tracepoint_path *tracepoint_id_to_path(u64 config)
{
	struct tracepoint_path *path = NULL;
	DIR *sys_dir, *evt_dir;
	struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
	char id_buf[24];
	int fd;
	u64 id;
	char evt_path[MAXPATHLEN];
	char dir_path[MAXPATHLEN];

	sys_dir = opendir(tracing_events_path);
	if (!sys_dir)
		return NULL;

	for_each_subsystem(sys_dir, sys_dirent, sys_next) {

		snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
			 sys_dirent.d_name);
		evt_dir = opendir(dir_path);
		if (!evt_dir)
			continue;

		for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {

			snprintf(evt_path, MAXPATHLEN, "%s/%s/id", dir_path,
				 evt_dirent.d_name);
			fd = open(evt_path, O_RDONLY);
			if (fd < 0)
				continue;
			if (read(fd, id_buf, sizeof(id_buf)) < 0) {
				close(fd);
				continue;
			}
			close(fd);
			id = atoll(id_buf);
			if (id == config) {
				closedir(evt_dir);
				closedir(sys_dir);
				path = zalloc(sizeof(*path));
				path->system = malloc(MAX_EVENT_LENGTH);
				if (!path->system) {
					free(path);
					return NULL;
				}
				path->name = malloc(MAX_EVENT_LENGTH);
				if (!path->name) {
					zfree(&path->system);
					free(path);
					return NULL;
				}
				strncpy(path->system, sys_dirent.d_name,
					MAX_EVENT_LENGTH);
				strncpy(path->name, evt_dirent.d_name,
					MAX_EVENT_LENGTH);
				return path;
			}
		}
		closedir(evt_dir);
	}

	closedir(sys_dir);
	return NULL;
}

struct tracepoint_path *tracepoint_name_to_path(const char *name)
{
	struct tracepoint_path *path = zalloc(sizeof(*path));
	char *str = strchr(name, ':');

	if (path == NULL || str == NULL) {
		free(path);
		return NULL;
	}

	path->system = strndup(name, str - name);
	path->name = strdup(str+1);

	if (path->system == NULL || path->name == NULL) {
		zfree(&path->system);
		zfree(&path->name);
		free(path);
		path = NULL;
	}

	return path;
}

const char *event_type(int type)
{
	switch (type) {
	case PERF_TYPE_HARDWARE:
		return "hardware";

	case PERF_TYPE_SOFTWARE:
		return "software";

	case PERF_TYPE_TRACEPOINT:
		return "tracepoint";

	case PERF_TYPE_HW_CACHE:
		return "hardware-cache";

	default:
		break;
	}

	return "unknown";
}



static struct perf_evsel *
__add_event(struct list_head *list, int *idx,
	    struct perf_event_attr *attr,
	    char *name, struct cpu_map *cpus,
	    struct list_head *config_terms)
{
	struct perf_evsel *evsel;

	event_attr_init(attr);

	evsel = perf_evsel__new_idx(attr, (*idx)++);
	if (!evsel)
		return NULL;

	evsel->cpus     = cpu_map__get(cpus);
	evsel->own_cpus = cpu_map__get(cpus);

	if (name)
		evsel->name = strdup(name);

	if (config_terms)
		list_splice(config_terms, &evsel->config_terms);

	list_add_tail(&evsel->node, list);
	return evsel;
}

static int add_event(struct list_head *list, int *idx,
		     struct perf_event_attr *attr, char *name,
		     struct list_head *config_terms)
{
	return __add_event(list, idx, attr, name, NULL, config_terms) ? 0 : -ENOMEM;
}

static int parse_aliases(char *str, const char *names[][PERF_EVSEL__MAX_ALIASES], int size)
{
	int i, j;
	int n, longest = -1;

	for (i = 0; i < size; i++) {
		for (j = 0; j < PERF_EVSEL__MAX_ALIASES && names[i][j]; j++) {
			n = strlen(names[i][j]);
			if (n > longest && !strncasecmp(str, names[i][j], n))
				longest = n;
		}
		if (longest > 0)
			return i;
	}

	return -1;
}

int parse_events_add_cache(struct list_head *list, int *idx,
			   char *type, char *op_result1, char *op_result2)
{
	struct perf_event_attr attr;
	char name[MAX_NAME_LEN];
	int cache_type = -1, cache_op = -1, cache_result = -1;
	char *op_result[2] = { op_result1, op_result2 };
	int i, n;

	/*
	 * No fallback - if we cannot get a clear cache type
	 * then bail out:
	 */
	cache_type = parse_aliases(type, perf_evsel__hw_cache,
				   PERF_COUNT_HW_CACHE_MAX);
	if (cache_type == -1)
		return -EINVAL;

	n = snprintf(name, MAX_NAME_LEN, "%s", type);

	for (i = 0; (i < 2) && (op_result[i]); i++) {
		char *str = op_result[i];

		n += snprintf(name + n, MAX_NAME_LEN - n, "-%s", str);

		if (cache_op == -1) {
			cache_op = parse_aliases(str, perf_evsel__hw_cache_op,
						 PERF_COUNT_HW_CACHE_OP_MAX);
			if (cache_op >= 0) {
				if (!perf_evsel__is_cache_op_valid(cache_type, cache_op))
					return -EINVAL;
				continue;
			}
		}

		if (cache_result == -1) {
			cache_result = parse_aliases(str, perf_evsel__hw_cache_result,
						     PERF_COUNT_HW_CACHE_RESULT_MAX);
			if (cache_result >= 0)
				continue;
		}
	}

	/*
	 * Fall back to reads:
	 */
	if (cache_op == -1)
		cache_op = PERF_COUNT_HW_CACHE_OP_READ;

	/*
	 * Fall back to accesses:
	 */
	if (cache_result == -1)
		cache_result = PERF_COUNT_HW_CACHE_RESULT_ACCESS;

	memset(&attr, 0, sizeof(attr));
	attr.config = cache_type | (cache_op << 8) | (cache_result << 16);
	attr.type = PERF_TYPE_HW_CACHE;
	return add_event(list, idx, &attr, name, NULL);
}

static void tracepoint_error(struct parse_events_error *e, int err,
			     char *sys, char *name)
{
	char help[BUFSIZ];

	if (!e)
		return;

	/*
	 * We get error directly from syscall errno ( > 0),
	 * or from encoded pointer's error ( < 0).
	 */
	err = abs(err);

	switch (err) {
	case EACCES:
		e->str = strdup("can't access trace events");
		break;
	case ENOENT:
		e->str = strdup("unknown tracepoint");
		break;
	default:
		e->str = strdup("failed to add tracepoint");
		break;
	}

	tracing_path__strerror_open_tp(err, help, sizeof(help), sys, name);
	e->help = strdup(help);
}

static int add_tracepoint(struct list_head *list, int *idx,
			  char *sys_name, char *evt_name,
			  struct parse_events_error *err,
			  struct list_head *head_config)
{
	struct perf_evsel *evsel;

	evsel = perf_evsel__newtp_idx(sys_name, evt_name, (*idx)++);
	if (IS_ERR(evsel)) {
		tracepoint_error(err, PTR_ERR(evsel), sys_name, evt_name);
		return PTR_ERR(evsel);
	}

	if (head_config) {
		LIST_HEAD(config_terms);

		if (get_config_terms(head_config, &config_terms))
			return -ENOMEM;
		list_splice(&config_terms, &evsel->config_terms);
	}

	list_add_tail(&evsel->node, list);
	return 0;
}

static int add_tracepoint_multi_event(struct list_head *list, int *idx,
				      char *sys_name, char *evt_name,
				      struct parse_events_error *err,
				      struct list_head *head_config)
{
	char evt_path[MAXPATHLEN];
	struct dirent *evt_ent;
	DIR *evt_dir;
	int ret = 0, found = 0;

	snprintf(evt_path, MAXPATHLEN, "%s/%s", tracing_events_path, sys_name);
	evt_dir = opendir(evt_path);
	if (!evt_dir) {
		tracepoint_error(err, errno, sys_name, evt_name);
		return -1;
	}

	while (!ret && (evt_ent = readdir(evt_dir))) {
		if (!strcmp(evt_ent->d_name, ".")
		    || !strcmp(evt_ent->d_name, "..")
		    || !strcmp(evt_ent->d_name, "enable")
		    || !strcmp(evt_ent->d_name, "filter"))
			continue;

		if (!strglobmatch(evt_ent->d_name, evt_name))
			continue;

		found++;

		ret = add_tracepoint(list, idx, sys_name, evt_ent->d_name,
				     err, head_config);
	}

	if (!found) {
		tracepoint_error(err, ENOENT, sys_name, evt_name);
		ret = -1;
	}

	closedir(evt_dir);
	return ret;
}

static int add_tracepoint_event(struct list_head *list, int *idx,
				char *sys_name, char *evt_name,
				struct parse_events_error *err,
				struct list_head *head_config)
{
	return strpbrk(evt_name, "*?") ?
	       add_tracepoint_multi_event(list, idx, sys_name, evt_name,
					  err, head_config) :
	       add_tracepoint(list, idx, sys_name, evt_name,
			      err, head_config);
}

static int add_tracepoint_multi_sys(struct list_head *list, int *idx,
				    char *sys_name, char *evt_name,
				    struct parse_events_error *err,
				    struct list_head *head_config)
{
	struct dirent *events_ent;
	DIR *events_dir;
	int ret = 0;

	events_dir = opendir(tracing_events_path);
	if (!events_dir) {
		tracepoint_error(err, errno, sys_name, evt_name);
		return -1;
	}

	while (!ret && (events_ent = readdir(events_dir))) {
		if (!strcmp(events_ent->d_name, ".")
		    || !strcmp(events_ent->d_name, "..")
		    || !strcmp(events_ent->d_name, "enable")
		    || !strcmp(events_ent->d_name, "header_event")
		    || !strcmp(events_ent->d_name, "header_page"))
			continue;

		if (!strglobmatch(events_ent->d_name, sys_name))
			continue;

		ret = add_tracepoint_event(list, idx, events_ent->d_name,
					   evt_name, err, head_config);
	}

	closedir(events_dir);
	return ret;
}

struct __add_bpf_event_param {
	struct parse_events_evlist *data;
	struct list_head *list;
};

static int add_bpf_event(struct probe_trace_event *tev, int fd,
			 void *_param)
{
	LIST_HEAD(new_evsels);
	struct __add_bpf_event_param *param = _param;
	struct parse_events_evlist *evlist = param->data;
	struct list_head *list = param->list;
	struct perf_evsel *pos;
	int err;

	pr_debug("add bpf event %s:%s and attach bpf program %d\n",
		 tev->group, tev->event, fd);

	err = parse_events_add_tracepoint(&new_evsels, &evlist->idx, tev->group,
					  tev->event, evlist->error, NULL);
	if (err) {
		struct perf_evsel *evsel, *tmp;

		pr_debug("Failed to add BPF event %s:%s\n",
			 tev->group, tev->event);
		list_for_each_entry_safe(evsel, tmp, &new_evsels, node) {
			list_del(&evsel->node);
			perf_evsel__delete(evsel);
		}
		return err;
	}
	pr_debug("adding %s:%s\n", tev->group, tev->event);

	list_for_each_entry(pos, &new_evsels, node) {
		pr_debug("adding %s:%s to %p\n",
			 tev->group, tev->event, pos);
		pos->bpf_fd = fd;
	}
	list_splice(&new_evsels, list);
	return 0;
}

int parse_events_load_bpf_obj(struct parse_events_evlist *data,
			      struct list_head *list,
			      struct bpf_object *obj)
{
	int err;
	char errbuf[BUFSIZ];
	struct __add_bpf_event_param param = {data, list};
	static bool registered_unprobe_atexit = false;

	if (IS_ERR(obj) || !obj) {
		snprintf(errbuf, sizeof(errbuf),
			 "Internal error: load bpf obj with NULL");
		err = -EINVAL;
		goto errout;
	}

	/*
	 * Register atexit handler before calling bpf__probe() so
	 * bpf__probe() don't need to unprobe probe points its already
	 * created when failure.
	 */
	if (!registered_unprobe_atexit) {
		atexit(bpf__clear);
		registered_unprobe_atexit = true;
	}

	err = bpf__probe(obj);
	if (err) {
		bpf__strerror_probe(obj, err, errbuf, sizeof(errbuf));
		goto errout;
	}

	err = bpf__load(obj);
	if (err) {
		bpf__strerror_load(obj, err, errbuf, sizeof(errbuf));
		goto errout;
	}

	err = bpf__foreach_tev(obj, add_bpf_event, &param);
	if (err) {
		snprintf(errbuf, sizeof(errbuf),
			 "Attach events in BPF object failed");
		goto errout;
	}

	return 0;
errout:
	data->error->help = strdup("(add -v to see detail)");
	data->error->str = strdup(errbuf);
	return err;
}

int parse_events_load_bpf(struct parse_events_evlist *data,
			  struct list_head *list,
			  char *bpf_file_name,
			  bool source)
{
	struct bpf_object *obj;

	obj = bpf__prepare_load(bpf_file_name, source);
	if (IS_ERR(obj)) {
		char errbuf[BUFSIZ];
		int err;

		err = PTR_ERR(obj);

		if (err == -ENOTSUP)
			snprintf(errbuf, sizeof(errbuf),
				 "BPF support is not compiled");
		else
			bpf__strerror_prepare_load(bpf_file_name,
						   source,
						   -err, errbuf,
						   sizeof(errbuf));

		data->error->help = strdup("(add -v to see detail)");
		data->error->str = strdup(errbuf);
		return err;
	}

	return parse_events_load_bpf_obj(data, list, obj);
}

static int
parse_breakpoint_type(const char *type, struct perf_event_attr *attr)
{
	int i;

	for (i = 0; i < 3; i++) {
		if (!type || !type[i])
			break;

#define CHECK_SET_TYPE(bit)		\
do {					\
	if (attr->bp_type & bit)	\
		return -EINVAL;		\
	else				\
		attr->bp_type |= bit;	\
} while (0)

		switch (type[i]) {
		case 'r':
			CHECK_SET_TYPE(HW_BREAKPOINT_R);
			break;
		case 'w':
			CHECK_SET_TYPE(HW_BREAKPOINT_W);
			break;
		case 'x':
			CHECK_SET_TYPE(HW_BREAKPOINT_X);
			break;
		default:
			return -EINVAL;
		}
	}

#undef CHECK_SET_TYPE

	if (!attr->bp_type) /* Default */
		attr->bp_type = HW_BREAKPOINT_R | HW_BREAKPOINT_W;

	return 0;
}

int parse_events_add_breakpoint(struct list_head *list, int *idx,
				void *ptr, char *type, u64 len)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.bp_addr = (unsigned long) ptr;

	if (parse_breakpoint_type(type, &attr))
		return -EINVAL;

	/* Provide some defaults if len is not specified */
	if (!len) {
		if (attr.bp_type == HW_BREAKPOINT_X)
			len = sizeof(long);
		else
			len = HW_BREAKPOINT_LEN_4;
	}

	attr.bp_len = len;

	attr.type = PERF_TYPE_BREAKPOINT;
	attr.sample_period = 1;

	return add_event(list, idx, &attr, NULL, NULL);
}

static int check_type_val(struct parse_events_term *term,
			  struct parse_events_error *err,
			  int type)
{
	if (type == term->type_val)
		return 0;

	if (err) {
		err->idx = term->err_val;
		if (type == PARSE_EVENTS__TERM_TYPE_NUM)
			err->str = strdup("expected numeric value");
		else
			err->str = strdup("expected string value");
	}
	return -EINVAL;
}

typedef int config_term_func_t(struct perf_event_attr *attr,
			       struct parse_events_term *term,
			       struct parse_events_error *err);

static int config_term_common(struct perf_event_attr *attr,
			      struct parse_events_term *term,
			      struct parse_events_error *err)
{
#define CHECK_TYPE_VAL(type)						   \
do {									   \
	if (check_type_val(term, err, PARSE_EVENTS__TERM_TYPE_ ## type)) \
		return -EINVAL;						   \
} while (0)

	switch (term->type_term) {
	case PARSE_EVENTS__TERM_TYPE_CONFIG:
		CHECK_TYPE_VAL(NUM);
		attr->config = term->val.num;
		break;
	case PARSE_EVENTS__TERM_TYPE_CONFIG1:
		CHECK_TYPE_VAL(NUM);
		attr->config1 = term->val.num;
		break;
	case PARSE_EVENTS__TERM_TYPE_CONFIG2:
		CHECK_TYPE_VAL(NUM);
		attr->config2 = term->val.num;
		break;
	case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD:
		CHECK_TYPE_VAL(NUM);
		break;
	case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ:
		CHECK_TYPE_VAL(NUM);
		break;
	case PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE:
		/*
		 * TODO uncomment when the field is available
		 * attr->branch_sample_type = term->val.num;
		 */
		break;
	case PARSE_EVENTS__TERM_TYPE_TIME:
		CHECK_TYPE_VAL(NUM);
		if (term->val.num > 1) {
			err->str = strdup("expected 0 or 1");
			err->idx = term->err_val;
			return -EINVAL;
		}
		break;
	case PARSE_EVENTS__TERM_TYPE_CALLGRAPH:
		CHECK_TYPE_VAL(STR);
		break;
	case PARSE_EVENTS__TERM_TYPE_STACKSIZE:
		CHECK_TYPE_VAL(NUM);
		break;
	case PARSE_EVENTS__TERM_TYPE_INHERIT:
		CHECK_TYPE_VAL(NUM);
		break;
	case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
		CHECK_TYPE_VAL(NUM);
		break;
	case PARSE_EVENTS__TERM_TYPE_NAME:
		CHECK_TYPE_VAL(STR);
		break;
	default:
		err->str = strdup("unknown term");
		err->idx = term->err_term;
		err->help = parse_events_formats_error_string(NULL);
		return -EINVAL;
	}

	return 0;
#undef CHECK_TYPE_VAL
}

static int config_term_pmu(struct perf_event_attr *attr,
			   struct parse_events_term *term,
			   struct parse_events_error *err)
{
	if (term->type_term == PARSE_EVENTS__TERM_TYPE_USER)
		/*
		 * Always succeed for sysfs terms, as we dont know
		 * at this point what type they need to have.
		 */
		return 0;
	else
		return config_term_common(attr, term, err);
}

static int config_term_tracepoint(struct perf_event_attr *attr,
				  struct parse_events_term *term,
				  struct parse_events_error *err)
{
	switch (term->type_term) {
	case PARSE_EVENTS__TERM_TYPE_CALLGRAPH:
	case PARSE_EVENTS__TERM_TYPE_STACKSIZE:
	case PARSE_EVENTS__TERM_TYPE_INHERIT:
	case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
		return config_term_common(attr, term, err);
	default:
		if (err) {
			err->idx = term->err_term;
			err->str = strdup("unknown term");
			err->help = strdup("valid terms: call-graph,stack-size\n");
		}
		return -EINVAL;
	}

	return 0;
}

static int config_attr(struct perf_event_attr *attr,
		       struct list_head *head,
		       struct parse_events_error *err,
		       config_term_func_t config_term)
{
	struct parse_events_term *term;

	list_for_each_entry(term, head, list)
		if (config_term(attr, term, err))
			return -EINVAL;

	return 0;
}

static int get_config_terms(struct list_head *head_config,
			    struct list_head *head_terms __maybe_unused)
{
#define ADD_CONFIG_TERM(__type, __name, __val)			\
do {								\
	struct perf_evsel_config_term *__t;			\
								\
	__t = zalloc(sizeof(*__t));				\
	if (!__t)						\
		return -ENOMEM;					\
								\
	INIT_LIST_HEAD(&__t->list);				\
	__t->type       = PERF_EVSEL__CONFIG_TERM_ ## __type;	\
	__t->val.__name = __val;				\
	list_add_tail(&__t->list, head_terms);			\
} while (0)

	struct parse_events_term *term;

	list_for_each_entry(term, head_config, list) {
		switch (term->type_term) {
		case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD:
			ADD_CONFIG_TERM(PERIOD, period, term->val.num);
			break;
		case PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ:
			ADD_CONFIG_TERM(FREQ, freq, term->val.num);
			break;
		case PARSE_EVENTS__TERM_TYPE_TIME:
			ADD_CONFIG_TERM(TIME, time, term->val.num);
			break;
		case PARSE_EVENTS__TERM_TYPE_CALLGRAPH:
			ADD_CONFIG_TERM(CALLGRAPH, callgraph, term->val.str);
			break;
		case PARSE_EVENTS__TERM_TYPE_STACKSIZE:
			ADD_CONFIG_TERM(STACK_USER, stack_user, term->val.num);
			break;
		case PARSE_EVENTS__TERM_TYPE_INHERIT:
			ADD_CONFIG_TERM(INHERIT, inherit, term->val.num ? 1 : 0);
			break;
		case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
			ADD_CONFIG_TERM(INHERIT, inherit, term->val.num ? 0 : 1);
			break;
		default:
			break;
		}
	}
#undef ADD_EVSEL_CONFIG
	return 0;
}

int parse_events_add_tracepoint(struct list_head *list, int *idx,
				char *sys, char *event,
				struct parse_events_error *err,
				struct list_head *head_config)
{
	if (head_config) {
		struct perf_event_attr attr;

		if (config_attr(&attr, head_config, err,
				config_term_tracepoint))
			return -EINVAL;
	}

	if (strpbrk(sys, "*?"))
		return add_tracepoint_multi_sys(list, idx, sys, event,
						err, head_config);
	else
		return add_tracepoint_event(list, idx, sys, event,
					    err, head_config);
}

int parse_events_add_numeric(struct parse_events_evlist *data,
			     struct list_head *list,
			     u32 type, u64 config,
			     struct list_head *head_config)
{
	struct perf_event_attr attr;
	LIST_HEAD(config_terms);

	memset(&attr, 0, sizeof(attr));
	attr.type = type;
	attr.config = config;

	if (head_config) {
		if (config_attr(&attr, head_config, data->error,
				config_term_common))
			return -EINVAL;

		if (get_config_terms(head_config, &config_terms))
			return -ENOMEM;
	}

	return add_event(list, &data->idx, &attr, NULL, &config_terms);
}

static int parse_events__is_name_term(struct parse_events_term *term)
{
	return term->type_term == PARSE_EVENTS__TERM_TYPE_NAME;
}

static char *pmu_event_name(struct list_head *head_terms)
{
	struct parse_events_term *term;

	list_for_each_entry(term, head_terms, list)
		if (parse_events__is_name_term(term))
			return term->val.str;

	return NULL;
}

int parse_events_add_pmu(struct parse_events_evlist *data,
			 struct list_head *list, char *name,
			 struct list_head *head_config)
{
	struct perf_event_attr attr;
	struct perf_pmu_info info;
	struct perf_pmu *pmu;
	struct perf_evsel *evsel;
	LIST_HEAD(config_terms);

	pmu = perf_pmu__find(name);
	if (!pmu)
		return -EINVAL;

	if (pmu->default_config) {
		memcpy(&attr, pmu->default_config,
		       sizeof(struct perf_event_attr));
	} else {
		memset(&attr, 0, sizeof(attr));
	}

	if (!head_config) {
		attr.type = pmu->type;
		evsel = __add_event(list, &data->idx, &attr, NULL, pmu->cpus, NULL);
		return evsel ? 0 : -ENOMEM;
	}

	if (perf_pmu__check_alias(pmu, head_config, &info))
		return -EINVAL;

	/*
	 * Configure hardcoded terms first, no need to check
	 * return value when called with fail == 0 ;)
	 */
	if (config_attr(&attr, head_config, data->error, config_term_pmu))
		return -EINVAL;

	if (get_config_terms(head_config, &config_terms))
		return -ENOMEM;

	if (perf_pmu__config(pmu, &attr, head_config, data->error))
		return -EINVAL;

	evsel = __add_event(list, &data->idx, &attr,
			    pmu_event_name(head_config), pmu->cpus,
			    &config_terms);
	if (evsel) {
		evsel->unit = info.unit;
		evsel->scale = info.scale;
		evsel->per_pkg = info.per_pkg;
		evsel->snapshot = info.snapshot;
	}

	return evsel ? 0 : -ENOMEM;
}

int parse_events__modifier_group(struct list_head *list,
				 char *event_mod)
{
	return parse_events__modifier_event(list, event_mod, true);
}

void parse_events__set_leader(char *name, struct list_head *list)
{
	struct perf_evsel *leader;

	if (list_empty(list)) {
		WARN_ONCE(true, "WARNING: failed to set leader: empty list");
		return;
	}

	__perf_evlist__set_leader(list);
	leader = list_entry(list->next, struct perf_evsel, node);
	leader->group_name = name ? strdup(name) : NULL;
}

/* list_event is assumed to point to malloc'ed memory */
void parse_events_update_lists(struct list_head *list_event,
			       struct list_head *list_all)
{
	/*
	 * Called for single event definition. Update the
	 * 'all event' list, and reinit the 'single event'
	 * list, for next event definition.
	 */
	list_splice_tail(list_event, list_all);
	free(list_event);
}

struct event_modifier {
	int eu;
	int ek;
	int eh;
	int eH;
	int eG;
	int eI;
	int precise;
	int precise_max;
	int exclude_GH;
	int sample_read;
	int pinned;
};

static int get_event_modifier(struct event_modifier *mod, char *str,
			       struct perf_evsel *evsel)
{
	int eu = evsel ? evsel->attr.exclude_user : 0;
	int ek = evsel ? evsel->attr.exclude_kernel : 0;
	int eh = evsel ? evsel->attr.exclude_hv : 0;
	int eH = evsel ? evsel->attr.exclude_host : 0;
	int eG = evsel ? evsel->attr.exclude_guest : 0;
	int eI = evsel ? evsel->attr.exclude_idle : 0;
	int precise = evsel ? evsel->attr.precise_ip : 0;
	int precise_max = 0;
	int sample_read = 0;
	int pinned = evsel ? evsel->attr.pinned : 0;

	int exclude = eu | ek | eh;
	int exclude_GH = evsel ? evsel->exclude_GH : 0;

	memset(mod, 0, sizeof(*mod));

	while (*str) {
		if (*str == 'u') {
			if (!exclude)
				exclude = eu = ek = eh = 1;
			eu = 0;
		} else if (*str == 'k') {
			if (!exclude)
				exclude = eu = ek = eh = 1;
			ek = 0;
		} else if (*str == 'h') {
			if (!exclude)
				exclude = eu = ek = eh = 1;
			eh = 0;
		} else if (*str == 'G') {
			if (!exclude_GH)
				exclude_GH = eG = eH = 1;
			eG = 0;
		} else if (*str == 'H') {
			if (!exclude_GH)
				exclude_GH = eG = eH = 1;
			eH = 0;
		} else if (*str == 'I') {
			eI = 1;
		} else if (*str == 'p') {
			precise++;
			/* use of precise requires exclude_guest */
			if (!exclude_GH)
				eG = 1;
		} else if (*str == 'P') {
			precise_max = 1;
		} else if (*str == 'S') {
			sample_read = 1;
		} else if (*str == 'D') {
			pinned = 1;
		} else
			break;

		++str;
	}

	/*
	 * precise ip:
	 *
	 *  0 - SAMPLE_IP can have arbitrary skid
	 *  1 - SAMPLE_IP must have constant skid
	 *  2 - SAMPLE_IP requested to have 0 skid
	 *  3 - SAMPLE_IP must have 0 skid
	 *
	 *  See also PERF_RECORD_MISC_EXACT_IP
	 */
	if (precise > 3)
		return -EINVAL;

	mod->eu = eu;
	mod->ek = ek;
	mod->eh = eh;
	mod->eH = eH;
	mod->eG = eG;
	mod->eI = eI;
	mod->precise = precise;
	mod->precise_max = precise_max;
	mod->exclude_GH = exclude_GH;
	mod->sample_read = sample_read;
	mod->pinned = pinned;

	return 0;
}

/*
 * Basic modifier sanity check to validate it contains only one
 * instance of any modifier (apart from 'p') present.
 */
static int check_modifier(char *str)
{
	char *p = str;

	/* The sizeof includes 0 byte as well. */
	if (strlen(str) > (sizeof("ukhGHpppPSDI") - 1))
		return -1;

	while (*p) {
		if (*p != 'p' && strchr(p + 1, *p))
			return -1;
		p++;
	}

	return 0;
}

int parse_events__modifier_event(struct list_head *list, char *str, bool add)
{
	struct perf_evsel *evsel;
	struct event_modifier mod;

	if (str == NULL)
		return 0;

	if (check_modifier(str))
		return -EINVAL;

	if (!add && get_event_modifier(&mod, str, NULL))
		return -EINVAL;

	__evlist__for_each(list, evsel) {
		if (add && get_event_modifier(&mod, str, evsel))
			return -EINVAL;

		evsel->attr.exclude_user   = mod.eu;
		evsel->attr.exclude_kernel = mod.ek;
		evsel->attr.exclude_hv     = mod.eh;
		evsel->attr.precise_ip     = mod.precise;
		evsel->attr.exclude_host   = mod.eH;
		evsel->attr.exclude_guest  = mod.eG;
		evsel->attr.exclude_idle   = mod.eI;
		evsel->exclude_GH          = mod.exclude_GH;
		evsel->sample_read         = mod.sample_read;
		evsel->precise_max         = mod.precise_max;

		if (perf_evsel__is_group_leader(evsel))
			evsel->attr.pinned = mod.pinned;
	}

	return 0;
}

int parse_events_name(struct list_head *list, char *name)
{
	struct perf_evsel *evsel;

	__evlist__for_each(list, evsel) {
		if (!evsel->name)
			evsel->name = strdup(name);
	}

	return 0;
}

static int
comp_pmu(const void *p1, const void *p2)
{
	struct perf_pmu_event_symbol *pmu1 = (struct perf_pmu_event_symbol *) p1;
	struct perf_pmu_event_symbol *pmu2 = (struct perf_pmu_event_symbol *) p2;

	return strcmp(pmu1->symbol, pmu2->symbol);
}

static void perf_pmu__parse_cleanup(void)
{
	if (perf_pmu_events_list_num > 0) {
		struct perf_pmu_event_symbol *p;
		int i;

		for (i = 0; i < perf_pmu_events_list_num; i++) {
			p = perf_pmu_events_list + i;
			free(p->symbol);
		}
		free(perf_pmu_events_list);
		perf_pmu_events_list = NULL;
		perf_pmu_events_list_num = 0;
	}
}

#define SET_SYMBOL(str, stype)		\
do {					\
	p->symbol = str;		\
	if (!p->symbol)			\
		goto err;		\
	p->type = stype;		\
} while (0)

/*
 * Read the pmu events list from sysfs
 * Save it into perf_pmu_events_list
 */
static void perf_pmu__parse_init(void)
{

	struct perf_pmu *pmu = NULL;
	struct perf_pmu_alias *alias;
	int len = 0;

	pmu = perf_pmu__find("cpu");
	if ((pmu == NULL) || list_empty(&pmu->aliases)) {
		perf_pmu_events_list_num = -1;
		return;
	}
	list_for_each_entry(alias, &pmu->aliases, list) {
		if (strchr(alias->name, '-'))
			len++;
		len++;
	}
	perf_pmu_events_list = malloc(sizeof(struct perf_pmu_event_symbol) * len);
	if (!perf_pmu_events_list)
		return;
	perf_pmu_events_list_num = len;

	len = 0;
	list_for_each_entry(alias, &pmu->aliases, list) {
		struct perf_pmu_event_symbol *p = perf_pmu_events_list + len;
		char *tmp = strchr(alias->name, '-');

		if (tmp != NULL) {
			SET_SYMBOL(strndup(alias->name, tmp - alias->name),
					PMU_EVENT_SYMBOL_PREFIX);
			p++;
			SET_SYMBOL(strdup(++tmp), PMU_EVENT_SYMBOL_SUFFIX);
			len += 2;
		} else {
			SET_SYMBOL(strdup(alias->name), PMU_EVENT_SYMBOL);
			len++;
		}
	}
	qsort(perf_pmu_events_list, len,
		sizeof(struct perf_pmu_event_symbol), comp_pmu);

	return;
err:
	perf_pmu__parse_cleanup();
}

enum perf_pmu_event_symbol_type
perf_pmu__parse_check(const char *name)
{
	struct perf_pmu_event_symbol p, *r;

	/* scan kernel pmu events from sysfs if needed */
	if (perf_pmu_events_list_num == 0)
		perf_pmu__parse_init();
	/*
	 * name "cpu" could be prefix of cpu-cycles or cpu// events.
	 * cpu-cycles has been handled by hardcode.
	 * So it must be cpu// events, not kernel pmu event.
	 */
	if ((perf_pmu_events_list_num <= 0) || !strcmp(name, "cpu"))
		return PMU_EVENT_SYMBOL_ERR;

	p.symbol = strdup(name);
	r = bsearch(&p, perf_pmu_events_list,
			(size_t) perf_pmu_events_list_num,
			sizeof(struct perf_pmu_event_symbol), comp_pmu);
	free(p.symbol);
	return r ? r->type : PMU_EVENT_SYMBOL_ERR;
}

static int parse_events__scanner(const char *str, void *data, int start_token)
{
	YY_BUFFER_STATE buffer;
	void *scanner;
	int ret;

	ret = parse_events_lex_init_extra(start_token, &scanner);
	if (ret)
		return ret;

	buffer = parse_events__scan_string(str, scanner);

#ifdef PARSER_DEBUG
	parse_events_debug = 1;
#endif
	ret = parse_events_parse(data, scanner);

	parse_events__flush_buffer(buffer, scanner);
	parse_events__delete_buffer(buffer, scanner);
	parse_events_lex_destroy(scanner);
	return ret;
}

/*
 * parse event config string, return a list of event terms.
 */
int parse_events_terms(struct list_head *terms, const char *str)
{
	struct parse_events_terms data = {
		.terms = NULL,
	};
	int ret;

	ret = parse_events__scanner(str, &data, PE_START_TERMS);
	if (!ret) {
		list_splice(data.terms, terms);
		zfree(&data.terms);
		return 0;
	}

	if (data.terms)
		parse_events__free_terms(data.terms);
	return ret;
}

int parse_events(struct perf_evlist *evlist, const char *str,
		 struct parse_events_error *err)
{
	struct parse_events_evlist data = {
		.list  = LIST_HEAD_INIT(data.list),
		.idx   = evlist->nr_entries,
		.error = err,
	};
	int ret;

	ret = parse_events__scanner(str, &data, PE_START_EVENTS);
	perf_pmu__parse_cleanup();
	if (!ret) {
		struct perf_evsel *last;

		if (list_empty(&data.list)) {
			WARN_ONCE(true, "WARNING: event parser found nothing");
			return -1;
		}

		perf_evlist__splice_list_tail(evlist, &data.list);
		evlist->nr_groups += data.nr_groups;
		last = perf_evlist__last(evlist);
		last->cmdline_group_boundary = true;

		return 0;
	}

	/*
	 * There are 2 users - builtin-record and builtin-test objects.
	 * Both call perf_evlist__delete in case of error, so we dont
	 * need to bother.
	 */
	return ret;
}

#define MAX_WIDTH 1000
static int get_term_width(void)
{
	struct winsize ws;

	get_term_dimensions(&ws);
	return ws.ws_col > MAX_WIDTH ? MAX_WIDTH : ws.ws_col;
}

static void parse_events_print_error(struct parse_events_error *err,
				     const char *event)
{
	const char *str = "invalid or unsupported event: ";
	char _buf[MAX_WIDTH];
	char *buf = (char *) event;
	int idx = 0;

	if (err->str) {
		/* -2 for extra '' in the final fprintf */
		int width       = get_term_width() - 2;
		int len_event   = strlen(event);
		int len_str, max_len, cut = 0;

		/*
		 * Maximum error index indent, we will cut
		 * the event string if it's bigger.
		 */
		int max_err_idx = 13;

		/*
		 * Let's be specific with the message when
		 * we have the precise error.
		 */
		str     = "event syntax error: ";
		len_str = strlen(str);
		max_len = width - len_str;

		buf = _buf;

		/* We're cutting from the beggining. */
		if (err->idx > max_err_idx)
			cut = err->idx - max_err_idx;

		strncpy(buf, event + cut, max_len);

		/* Mark cut parts with '..' on both sides. */
		if (cut)
			buf[0] = buf[1] = '.';

		if ((len_event - cut) > max_len) {
			buf[max_len - 1] = buf[max_len - 2] = '.';
			buf[max_len] = 0;
		}

		idx = len_str + err->idx - cut;
	}

	fprintf(stderr, "%s'%s'\n", str, buf);
	if (idx) {
		fprintf(stderr, "%*s\\___ %s\n", idx + 1, "", err->str);
		if (err->help)
			fprintf(stderr, "\n%s\n", err->help);
		free(err->str);
		free(err->help);
	}

	fprintf(stderr, "Run 'perf list' for a list of valid events\n");
}

#undef MAX_WIDTH

int parse_events_option(const struct option *opt, const char *str,
			int unset __maybe_unused)
{
	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;
	struct parse_events_error err = { .idx = 0, };
	int ret = parse_events(evlist, str, &err);

	if (ret)
		parse_events_print_error(&err, str);

	return ret;
}

static int
foreach_evsel_in_last_glob(struct perf_evlist *evlist,
			   int (*func)(struct perf_evsel *evsel,
				       const void *arg),
			   const void *arg)
{
	struct perf_evsel *last = NULL;
	int err;

	/*
	 * Don't return when list_empty, give func a chance to report
	 * error when it found last == NULL.
	 *
	 * So no need to WARN here, let *func do this.
	 */
	if (evlist->nr_entries > 0)
		last = perf_evlist__last(evlist);

	do {
		err = (*func)(last, arg);
		if (err)
			return -1;
		if (!last)
			return 0;

		if (last->node.prev == &evlist->entries)
			return 0;
		last = list_entry(last->node.prev, struct perf_evsel, node);
	} while (!last->cmdline_group_boundary);

	return 0;
}

static int set_filter(struct perf_evsel *evsel, const void *arg)
{
	const char *str = arg;

	if (evsel == NULL || evsel->attr.type != PERF_TYPE_TRACEPOINT) {
		fprintf(stderr,
			"--filter option should follow a -e tracepoint option\n");
		return -1;
	}

	if (perf_evsel__append_filter(evsel, "&&", str) < 0) {
		fprintf(stderr,
			"not enough memory to hold filter string\n");
		return -1;
	}

	return 0;
}

int parse_filter(const struct option *opt, const char *str,
		 int unset __maybe_unused)
{
	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;

	return foreach_evsel_in_last_glob(evlist, set_filter,
					  (const void *)str);
}

static int add_exclude_perf_filter(struct perf_evsel *evsel,
				   const void *arg __maybe_unused)
{
	char new_filter[64];

	if (evsel == NULL || evsel->attr.type != PERF_TYPE_TRACEPOINT) {
		fprintf(stderr,
			"--exclude-perf option should follow a -e tracepoint option\n");
		return -1;
	}

	snprintf(new_filter, sizeof(new_filter), "common_pid != %d", getpid());

	if (perf_evsel__append_filter(evsel, "&&", new_filter) < 0) {
		fprintf(stderr,
			"not enough memory to hold filter string\n");
		return -1;
	}

	return 0;
}

int exclude_perf(const struct option *opt,
		 const char *arg __maybe_unused,
		 int unset __maybe_unused)
{
	struct perf_evlist *evlist = *(struct perf_evlist **)opt->value;

	return foreach_evsel_in_last_glob(evlist, add_exclude_perf_filter,
					  NULL);
}

static const char * const event_type_descriptors[] = {
	"Hardware event",
	"Software event",
	"Tracepoint event",
	"Hardware cache event",
	"Raw hardware event descriptor",
	"Hardware breakpoint",
};

static int cmp_string(const void *a, const void *b)
{
	const char * const *as = a;
	const char * const *bs = b;

	return strcmp(*as, *bs);
}

/*
 * Print the events from <debugfs_mount_point>/tracing/events
 */

void print_tracepoint_events(const char *subsys_glob, const char *event_glob,
			     bool name_only)
{
	DIR *sys_dir, *evt_dir;
	struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
	char evt_path[MAXPATHLEN];
	char dir_path[MAXPATHLEN];
	char **evt_list = NULL;
	unsigned int evt_i = 0, evt_num = 0;
	bool evt_num_known = false;

restart:
	sys_dir = opendir(tracing_events_path);
	if (!sys_dir)
		return;

	if (evt_num_known) {
		evt_list = zalloc(sizeof(char *) * evt_num);
		if (!evt_list)
			goto out_close_sys_dir;
	}

	for_each_subsystem(sys_dir, sys_dirent, sys_next) {
		if (subsys_glob != NULL &&
		    !strglobmatch(sys_dirent.d_name, subsys_glob))
			continue;

		snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
			 sys_dirent.d_name);
		evt_dir = opendir(dir_path);
		if (!evt_dir)
			continue;

		for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
			if (event_glob != NULL &&
			    !strglobmatch(evt_dirent.d_name, event_glob))
				continue;

			if (!evt_num_known) {
				evt_num++;
				continue;
			}

			snprintf(evt_path, MAXPATHLEN, "%s:%s",
				 sys_dirent.d_name, evt_dirent.d_name);

			evt_list[evt_i] = strdup(evt_path);
			if (evt_list[evt_i] == NULL)
				goto out_close_evt_dir;
			evt_i++;
		}
		closedir(evt_dir);
	}
	closedir(sys_dir);

	if (!evt_num_known) {
		evt_num_known = true;
		goto restart;
	}
	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
	evt_i = 0;
	while (evt_i < evt_num) {
		if (name_only) {
			printf("%s ", evt_list[evt_i++]);
			continue;
		}
		printf("  %-50s [%s]\n", evt_list[evt_i++],
				event_type_descriptors[PERF_TYPE_TRACEPOINT]);
	}
	if (evt_num && pager_in_use())
		printf("\n");

out_free:
	evt_num = evt_i;
	for (evt_i = 0; evt_i < evt_num; evt_i++)
		zfree(&evt_list[evt_i]);
	zfree(&evt_list);
	return;

out_close_evt_dir:
	closedir(evt_dir);
out_close_sys_dir:
	closedir(sys_dir);

	printf("FATAL: not enough memory to print %s\n",
			event_type_descriptors[PERF_TYPE_TRACEPOINT]);
	if (evt_list)
		goto out_free;
}

/*
 * Check whether event is in <debugfs_mount_point>/tracing/events
 */

int is_valid_tracepoint(const char *event_string)
{
	DIR *sys_dir, *evt_dir;
	struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
	char evt_path[MAXPATHLEN];
	char dir_path[MAXPATHLEN];

	sys_dir = opendir(tracing_events_path);
	if (!sys_dir)
		return 0;

	for_each_subsystem(sys_dir, sys_dirent, sys_next) {

		snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
			 sys_dirent.d_name);
		evt_dir = opendir(dir_path);
		if (!evt_dir)
			continue;

		for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
			snprintf(evt_path, MAXPATHLEN, "%s:%s",
				 sys_dirent.d_name, evt_dirent.d_name);
			if (!strcmp(evt_path, event_string)) {
				closedir(evt_dir);
				closedir(sys_dir);
				return 1;
			}
		}
		closedir(evt_dir);
	}
	closedir(sys_dir);
	return 0;
}

static bool is_event_supported(u8 type, unsigned config)
{
	bool ret = true;
	int open_return;
	struct perf_evsel *evsel;
	struct perf_event_attr attr = {
		.type = type,
		.config = config,
		.disabled = 1,
	};
	struct {
		struct thread_map map;
		int threads[1];
	} tmap = {
		.map.nr	 = 1,
		.threads = { 0 },
	};

	evsel = perf_evsel__new(&attr);
	if (evsel) {
		open_return = perf_evsel__open(evsel, NULL, &tmap.map);
		ret = open_return >= 0;

		if (open_return == -EACCES) {
			/*
			 * This happens if the paranoid value
			 * /proc/sys/kernel/perf_event_paranoid is set to 2
			 * Re-run with exclude_kernel set; we don't do that
			 * by default as some ARM machines do not support it.
			 *
			 */
			evsel->attr.exclude_kernel = 1;
			ret = perf_evsel__open(evsel, NULL, &tmap.map) >= 0;
		}
		perf_evsel__delete(evsel);
	}

	return ret;
}

int print_hwcache_events(const char *event_glob, bool name_only)
{
	unsigned int type, op, i, evt_i = 0, evt_num = 0;
	char name[64];
	char **evt_list = NULL;
	bool evt_num_known = false;

restart:
	if (evt_num_known) {
		evt_list = zalloc(sizeof(char *) * evt_num);
		if (!evt_list)
			goto out_enomem;
	}

	for (type = 0; type < PERF_COUNT_HW_CACHE_MAX; type++) {
		for (op = 0; op < PERF_COUNT_HW_CACHE_OP_MAX; op++) {
			/* skip invalid cache type */
			if (!perf_evsel__is_cache_op_valid(type, op))
				continue;

			for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) {
				__perf_evsel__hw_cache_type_op_res_name(type, op, i,
									name, sizeof(name));
				if (event_glob != NULL && !strglobmatch(name, event_glob))
					continue;

				if (!is_event_supported(PERF_TYPE_HW_CACHE,
							type | (op << 8) | (i << 16)))
					continue;

				if (!evt_num_known) {
					evt_num++;
					continue;
				}

				evt_list[evt_i] = strdup(name);
				if (evt_list[evt_i] == NULL)
					goto out_enomem;
				evt_i++;
			}
		}
	}

	if (!evt_num_known) {
		evt_num_known = true;
		goto restart;
	}
	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
	evt_i = 0;
	while (evt_i < evt_num) {
		if (name_only) {
			printf("%s ", evt_list[evt_i++]);
			continue;
		}
		printf("  %-50s [%s]\n", evt_list[evt_i++],
				event_type_descriptors[PERF_TYPE_HW_CACHE]);
	}
	if (evt_num && pager_in_use())
		printf("\n");

out_free:
	evt_num = evt_i;
	for (evt_i = 0; evt_i < evt_num; evt_i++)
		zfree(&evt_list[evt_i]);
	zfree(&evt_list);
	return evt_num;

out_enomem:
	printf("FATAL: not enough memory to print %s\n", event_type_descriptors[PERF_TYPE_HW_CACHE]);
	if (evt_list)
		goto out_free;
	return evt_num;
}

void print_symbol_events(const char *event_glob, unsigned type,
				struct event_symbol *syms, unsigned max,
				bool name_only)
{
	unsigned int i, evt_i = 0, evt_num = 0;
	char name[MAX_NAME_LEN];
	char **evt_list = NULL;
	bool evt_num_known = false;

restart:
	if (evt_num_known) {
		evt_list = zalloc(sizeof(char *) * evt_num);
		if (!evt_list)
			goto out_enomem;
		syms -= max;
	}

	for (i = 0; i < max; i++, syms++) {

		if (event_glob != NULL && syms->symbol != NULL &&
		    !(strglobmatch(syms->symbol, event_glob) ||
		      (syms->alias && strglobmatch(syms->alias, event_glob))))
			continue;

		if (!is_event_supported(type, i))
			continue;

		if (!evt_num_known) {
			evt_num++;
			continue;
		}

		if (!name_only && strlen(syms->alias))
			snprintf(name, MAX_NAME_LEN, "%s OR %s", syms->symbol, syms->alias);
		else
			strncpy(name, syms->symbol, MAX_NAME_LEN);

		evt_list[evt_i] = strdup(name);
		if (evt_list[evt_i] == NULL)
			goto out_enomem;
		evt_i++;
	}

	if (!evt_num_known) {
		evt_num_known = true;
		goto restart;
	}
	qsort(evt_list, evt_num, sizeof(char *), cmp_string);
	evt_i = 0;
	while (evt_i < evt_num) {
		if (name_only) {
			printf("%s ", evt_list[evt_i++]);
			continue;
		}
		printf("  %-50s [%s]\n", evt_list[evt_i++], event_type_descriptors[type]);
	}
	if (evt_num && pager_in_use())
		printf("\n");

out_free:
	evt_num = evt_i;
	for (evt_i = 0; evt_i < evt_num; evt_i++)
		zfree(&evt_list[evt_i]);
	zfree(&evt_list);
	return;

out_enomem:
	printf("FATAL: not enough memory to print %s\n", event_type_descriptors[type]);
	if (evt_list)
		goto out_free;
}

/*
 * Print the help text for the event symbols:
 */
void print_events(const char *event_glob, bool name_only)
{
	print_symbol_events(event_glob, PERF_TYPE_HARDWARE,
			    event_symbols_hw, PERF_COUNT_HW_MAX, name_only);

	print_symbol_events(event_glob, PERF_TYPE_SOFTWARE,
			    event_symbols_sw, PERF_COUNT_SW_MAX, name_only);

	print_hwcache_events(event_glob, name_only);

	print_pmu_events(event_glob, name_only);

	if (event_glob != NULL)
		return;

	if (!name_only) {
		printf("  %-50s [%s]\n",
		       "rNNN",
		       event_type_descriptors[PERF_TYPE_RAW]);
		printf("  %-50s [%s]\n",
		       "cpu/t1=v1[,t2=v2,t3 ...]/modifier",
		       event_type_descriptors[PERF_TYPE_RAW]);
		if (pager_in_use())
			printf("   (see 'man perf-list' on how to encode it)\n\n");

		printf("  %-50s [%s]\n",
		       "mem:<addr>[/len][:access]",
			event_type_descriptors[PERF_TYPE_BREAKPOINT]);
		if (pager_in_use())
			printf("\n");
	}

	print_tracepoint_events(NULL, NULL, name_only);
}

int parse_events__is_hardcoded_term(struct parse_events_term *term)
{
	return term->type_term != PARSE_EVENTS__TERM_TYPE_USER;
}

static int new_term(struct parse_events_term **_term, int type_val,
		    int type_term, char *config,
		    char *str, u64 num, int err_term, int err_val)
{
	struct parse_events_term *term;

	term = zalloc(sizeof(*term));
	if (!term)
		return -ENOMEM;

	INIT_LIST_HEAD(&term->list);
	term->type_val  = type_val;
	term->type_term = type_term;
	term->config = config;
	term->err_term = err_term;
	term->err_val  = err_val;

	switch (type_val) {
	case PARSE_EVENTS__TERM_TYPE_NUM:
		term->val.num = num;
		break;
	case PARSE_EVENTS__TERM_TYPE_STR:
		term->val.str = str;
		break;
	default:
		free(term);
		return -EINVAL;
	}

	*_term = term;
	return 0;
}

int parse_events_term__num(struct parse_events_term **term,
			   int type_term, char *config, u64 num,
			   void *loc_term_, void *loc_val_)
{
	YYLTYPE *loc_term = loc_term_;
	YYLTYPE *loc_val = loc_val_;

	return new_term(term, PARSE_EVENTS__TERM_TYPE_NUM, type_term,
			config, NULL, num,
			loc_term ? loc_term->first_column : 0,
			loc_val ? loc_val->first_column : 0);
}

int parse_events_term__str(struct parse_events_term **term,
			   int type_term, char *config, char *str,
			   void *loc_term_, void *loc_val_)
{
	YYLTYPE *loc_term = loc_term_;
	YYLTYPE *loc_val = loc_val_;

	return new_term(term, PARSE_EVENTS__TERM_TYPE_STR, type_term,
			config, str, 0,
			loc_term ? loc_term->first_column : 0,
			loc_val ? loc_val->first_column : 0);
}

int parse_events_term__sym_hw(struct parse_events_term **term,
			      char *config, unsigned idx)
{
	struct event_symbol *sym;

	BUG_ON(idx >= PERF_COUNT_HW_MAX);
	sym = &event_symbols_hw[idx];

	if (config)
		return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
				PARSE_EVENTS__TERM_TYPE_USER, config,
				(char *) sym->symbol, 0, 0, 0);
	else
		return new_term(term, PARSE_EVENTS__TERM_TYPE_STR,
				PARSE_EVENTS__TERM_TYPE_USER,
				(char *) "event", (char *) sym->symbol,
				0, 0, 0);
}

int parse_events_term__clone(struct parse_events_term **new,
			     struct parse_events_term *term)
{
	return new_term(new, term->type_val, term->type_term, term->config,
			term->val.str, term->val.num,
			term->err_term, term->err_val);
}

void parse_events__free_terms(struct list_head *terms)
{
	struct parse_events_term *term, *h;

	list_for_each_entry_safe(term, h, terms, list)
		free(term);
}

void parse_events_evlist_error(struct parse_events_evlist *data,
			       int idx, const char *str)
{
	struct parse_events_error *err = data->error;

	if (!err)
		return;
	err->idx = idx;
	err->str = strdup(str);
	WARN_ONCE(!err->str, "WARNING: failed to allocate error string");
}

/*
 * Return string contains valid config terms of an event.
 * @additional_terms: For terms such as PMU sysfs terms.
 */
char *parse_events_formats_error_string(char *additional_terms)
{
	char *str;
	static const char *static_terms = "config,config1,config2,name,"
					  "period,freq,branch_type,time,"
					  "call-graph,stack-size\n";

	/* valid terms */
	if (additional_terms) {
		if (!asprintf(&str, "valid terms: %s,%s",
			      additional_terms, static_terms))
			goto fail;
	} else {
		if (!asprintf(&str, "valid terms: %s", static_terms))
			goto fail;
	}
	return str;

fail:
	return NULL;
}
eturn 0; } static int cmp_refcount_rec_by_cpos(const void *a, const void *b) { const struct ocfs2_refcount_rec *l = a, *r = b; u64 l_cpos = le64_to_cpu(l->r_cpos); u64 r_cpos = le64_to_cpu(r->r_cpos); if (l_cpos > r_cpos) return 1; if (l_cpos < r_cpos) return -1; return 0; } static void swap_refcount_rec(void *a, void *b, int size) { struct ocfs2_refcount_rec *l = a, *r = b; swap(*l, *r); } /* * The refcount cpos are ordered by their 64bit cpos, * But we will use the low 32 bit to be the e_cpos in the b-tree. * So we need to make sure that this pos isn't intersected with others. * * Note: The refcount block is already sorted by their low 32 bit cpos, * So just try the middle pos first, and we will exit when we find * the good position. */ static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl, u32 *split_pos, int *split_index) { int num_used = le16_to_cpu(rl->rl_used); int delta, middle = num_used / 2; for (delta = 0; delta < middle; delta++) { /* Let's check delta earlier than middle */ if (ocfs2_refcount_rec_no_intersect( &rl->rl_recs[middle - delta - 1], &rl->rl_recs[middle - delta])) { *split_index = middle - delta; break; } /* For even counts, don't walk off the end */ if ((middle + delta + 1) == num_used) continue; /* Now try delta past middle */ if (ocfs2_refcount_rec_no_intersect( &rl->rl_recs[middle + delta], &rl->rl_recs[middle + delta + 1])) { *split_index = middle + delta + 1; break; } } if (delta >= middle) return -ENOSPC; *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]); return 0; } static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, struct buffer_head *new_bh, u32 *split_cpos) { int split_index = 0, num_moved, ret; u32 cpos = 0; struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; struct ocfs2_refcount_list *rl = &rb->rf_records; struct ocfs2_refcount_block *new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; struct ocfs2_refcount_list *new_rl = &new_rb->rf_records; trace_ocfs2_divide_leaf_refcount_block( (unsigned long long)ref_leaf_bh->b_blocknr, le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used)); /* * XXX: Improvement later. * If we know all the high 32 bit cpos is the same, no need to sort. * * In order to make the whole process safe, we do: * 1. sort the entries by their low 32 bit cpos first so that we can * find the split cpos easily. * 2. call ocfs2_insert_extent to insert the new refcount block. * 3. move the refcount rec to the new block. * 4. sort the entries by their 64 bit cpos. * 5. dirty the new_rb and rb. */ sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), sizeof(struct ocfs2_refcount_rec), cmp_refcount_rec_by_low_cpos, swap_refcount_rec); ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index); if (ret) { mlog_errno(ret); return ret; } new_rb->rf_cpos = cpu_to_le32(cpos); /* move refcount records starting from split_index to the new block. */ num_moved = le16_to_cpu(rl->rl_used) - split_index; memcpy(new_rl->rl_recs, &rl->rl_recs[split_index], num_moved * sizeof(struct ocfs2_refcount_rec)); /*ok, remove the entries we just moved over to the other block. */ memset(&rl->rl_recs[split_index], 0, num_moved * sizeof(struct ocfs2_refcount_rec)); /* change old and new rl_used accordingly. */ le16_add_cpu(&rl->rl_used, -num_moved); new_rl->rl_used = cpu_to_le16(num_moved); sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), sizeof(struct ocfs2_refcount_rec), cmp_refcount_rec_by_cpos, swap_refcount_rec); sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used), sizeof(struct ocfs2_refcount_rec), cmp_refcount_rec_by_cpos, swap_refcount_rec); *split_cpos = cpos; return 0; } static int ocfs2_new_leaf_refcount_block(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_alloc_context *meta_ac) { int ret; u16 suballoc_bit_start; u32 num_got, new_cpos; u64 suballoc_loc, blkno; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct ocfs2_refcount_block *root_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; struct buffer_head *new_bh = NULL; struct ocfs2_refcount_block *new_rb; struct ocfs2_extent_tree ref_et; BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)); ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc, &suballoc_bit_start, &num_got, &blkno); if (ret) { mlog_errno(ret); goto out; } new_bh = sb_getblk(sb, blkno); if (new_bh == NULL) { ret = -ENOMEM; mlog_errno(ret); goto out; } ocfs2_set_new_buffer_uptodate(ci, new_bh); ret = ocfs2_journal_access_rb(handle, ci, new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); goto out; } /* Initialize ocfs2_refcount_block. */ new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; memset(new_rb, 0, sb->s_blocksize); strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot); new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc); new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); new_rb->rf_blkno = cpu_to_le64(blkno); new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); new_rb->rf_records.rl_count = cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); new_rb->rf_generation = root_rb->rf_generation; ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos); if (ret) { mlog_errno(ret); goto out; } ocfs2_journal_dirty(handle, ref_leaf_bh); ocfs2_journal_dirty(handle, new_bh); ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh); trace_ocfs2_new_leaf_refcount_block( (unsigned long long)new_bh->b_blocknr, new_cpos); /* Insert the new leaf block with the specific offset cpos. */ ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr, 1, 0, meta_ac); if (ret) mlog_errno(ret); out: brelse(new_bh); return ret; } static int ocfs2_expand_refcount_tree(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_alloc_context *meta_ac) { int ret; struct buffer_head *expand_bh = NULL; if (ref_root_bh == ref_leaf_bh) { /* * the old root bh hasn't been expanded to a b-tree, * so expand it first. */ ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh, &expand_bh, meta_ac); if (ret) { mlog_errno(ret); goto out; } } else { expand_bh = ref_leaf_bh; get_bh(expand_bh); } /* Now add a new refcount block into the tree.*/ ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh, expand_bh, meta_ac); if (ret) mlog_errno(ret); out: brelse(expand_bh); return ret; } /* * Adjust the extent rec in b-tree representing ref_leaf_bh. * * Only called when we have inserted a new refcount rec at index 0 * which means ocfs2_extent_rec.e_cpos may need some change. */ static int ocfs2_adjust_refcount_rec(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_refcount_rec *rec) { int ret = 0, i; u32 new_cpos, old_cpos; struct ocfs2_path *path = NULL; struct ocfs2_extent_tree et; struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; struct ocfs2_extent_list *el; if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) goto out; rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; old_cpos = le32_to_cpu(rb->rf_cpos); new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; if (old_cpos <= new_cpos) goto out; ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); path = ocfs2_new_path_from_et(&et); if (!path) { ret = -ENOMEM; mlog_errno(ret); goto out; } ret = ocfs2_find_path(ci, path, old_cpos); if (ret) { mlog_errno(ret); goto out; } /* * 2 more credits, one for the leaf refcount block, one for * the extent block contains the extent rec. */ ret = ocfs2_extend_trans(handle, 2); if (ret < 0) { mlog_errno(ret); goto out; } ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; } ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path), OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; } /* change the leaf extent block first. */ el = path_leaf_el(path); for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos) break; BUG_ON(i == le16_to_cpu(el->l_next_free_rec)); el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); /* change the r_cpos in the leaf block. */ rb->rf_cpos = cpu_to_le32(new_cpos); ocfs2_journal_dirty(handle, path_leaf_bh(path)); ocfs2_journal_dirty(handle, ref_leaf_bh); out: ocfs2_free_path(path); return ret; } static int ocfs2_insert_refcount_rec(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_refcount_rec *rec, int index, int merge, struct ocfs2_alloc_context *meta_ac) { int ret; struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; struct ocfs2_refcount_list *rf_list = &rb->rf_records; struct buffer_head *new_bh = NULL; BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); if (rf_list->rl_used == rf_list->rl_count) { u64 cpos = le64_to_cpu(rec->r_cpos); u32 len = le32_to_cpu(rec->r_clusters); ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, ref_leaf_bh, meta_ac); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_get_refcount_rec(ci, ref_root_bh, cpos, len, NULL, &index, &new_bh); if (ret) { mlog_errno(ret); goto out; } ref_leaf_bh = new_bh; rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; rf_list = &rb->rf_records; } ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } if (index < le16_to_cpu(rf_list->rl_used)) memmove(&rf_list->rl_recs[index + 1], &rf_list->rl_recs[index], (le16_to_cpu(rf_list->rl_used) - index) * sizeof(struct ocfs2_refcount_rec)); trace_ocfs2_insert_refcount_rec( (unsigned long long)ref_leaf_bh->b_blocknr, index, (unsigned long long)le64_to_cpu(rec->r_cpos), le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount)); rf_list->rl_recs[index] = *rec; le16_add_cpu(&rf_list->rl_used, 1); if (merge) ocfs2_refcount_rec_merge(rb, index); ocfs2_journal_dirty(handle, ref_leaf_bh); if (index == 0) { ret = ocfs2_adjust_refcount_rec(handle, ci, ref_root_bh, ref_leaf_bh, rec); if (ret) mlog_errno(ret); } out: brelse(new_bh); return ret; } /* * Split the refcount_rec indexed by "index" in ref_leaf_bh. * This is much simple than our b-tree code. * split_rec is the new refcount rec we want to insert. * If split_rec->r_refcount > 0, we are changing the refcount(in case we * increase refcount or decrease a refcount to non-zero). * If split_rec->r_refcount == 0, we are punching a hole in current refcount * rec( in case we decrease a refcount to zero). */ static int ocfs2_split_refcount_rec(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_refcount_rec *split_rec, int index, int merge, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret, recs_need; u32 len; struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; struct ocfs2_refcount_list *rf_list = &rb->rf_records; struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index]; struct ocfs2_refcount_rec *tail_rec = NULL; struct buffer_head *new_bh = NULL; BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters), le32_to_cpu(orig_rec->r_refcount), le64_to_cpu(split_rec->r_cpos), le32_to_cpu(split_rec->r_clusters), le32_to_cpu(split_rec->r_refcount)); /* * If we just need to split the header or tail clusters, * no more recs are needed, just split is OK. * Otherwise we at least need one new recs. */ if (!split_rec->r_refcount && (split_rec->r_cpos == orig_rec->r_cpos || le64_to_cpu(split_rec->r_cpos) + le32_to_cpu(split_rec->r_clusters) == le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) recs_need = 0; else recs_need = 1; /* * We need one more rec if we split in the middle and the new rec have * some refcount in it. */ if (split_rec->r_refcount && (split_rec->r_cpos != orig_rec->r_cpos && le64_to_cpu(split_rec->r_cpos) + le32_to_cpu(split_rec->r_clusters) != le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) recs_need++; /* If the leaf block don't have enough record, expand it. */ if (le16_to_cpu(rf_list->rl_used) + recs_need > le16_to_cpu(rf_list->rl_count)) { struct ocfs2_refcount_rec tmp_rec; u64 cpos = le64_to_cpu(orig_rec->r_cpos); len = le32_to_cpu(orig_rec->r_clusters); ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, ref_leaf_bh, meta_ac); if (ret) { mlog_errno(ret); goto out; } /* * We have to re-get it since now cpos may be moved to * another leaf block. */ ret = ocfs2_get_refcount_rec(ci, ref_root_bh, cpos, len, &tmp_rec, &index, &new_bh); if (ret) { mlog_errno(ret); goto out; } ref_leaf_bh = new_bh; rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; rf_list = &rb->rf_records; orig_rec = &rf_list->rl_recs[index]; } ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } /* * We have calculated out how many new records we need and store * in recs_need, so spare enough space first by moving the records * after "index" to the end. */ if (index != le16_to_cpu(rf_list->rl_used) - 1) memmove(&rf_list->rl_recs[index + 1 + recs_need], &rf_list->rl_recs[index + 1], (le16_to_cpu(rf_list->rl_used) - index - 1) * sizeof(struct ocfs2_refcount_rec)); len = (le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)) - (le64_to_cpu(split_rec->r_cpos) + le32_to_cpu(split_rec->r_clusters)); /* * If we have "len", the we will split in the tail and move it * to the end of the space we have just spared. */ if (len) { tail_rec = &rf_list->rl_recs[index + recs_need]; memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); le64_add_cpu(&tail_rec->r_cpos, le32_to_cpu(tail_rec->r_clusters) - len); tail_rec->r_clusters = cpu_to_le32(len); } /* * If the split pos isn't the same as the original one, we need to * split in the head. * * Note: We have the chance that split_rec.r_refcount = 0, * recs_need = 0 and len > 0, which means we just cut the head from * the orig_rec and in that case we have done some modification in * orig_rec above, so the check for r_cpos is faked. */ if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) { len = le64_to_cpu(split_rec->r_cpos) - le64_to_cpu(orig_rec->r_cpos); orig_rec->r_clusters = cpu_to_le32(len); index++; } le16_add_cpu(&rf_list->rl_used, recs_need); if (split_rec->r_refcount) { rf_list->rl_recs[index] = *split_rec; trace_ocfs2_split_refcount_rec_insert( (unsigned long long)ref_leaf_bh->b_blocknr, index, (unsigned long long)le64_to_cpu(split_rec->r_cpos), le32_to_cpu(split_rec->r_clusters), le32_to_cpu(split_rec->r_refcount)); if (merge) ocfs2_refcount_rec_merge(rb, index); } ocfs2_journal_dirty(handle, ref_leaf_bh); out: brelse(new_bh); return ret; } static int __ocfs2_increase_refcount(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, u64 cpos, u32 len, int merge, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret = 0, index; struct buffer_head *ref_leaf_bh = NULL; struct ocfs2_refcount_rec rec; unsigned int set_len = 0; trace_ocfs2_increase_refcount_begin( (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)cpos, len); while (len) { ret = ocfs2_get_refcount_rec(ci, ref_root_bh, cpos, len, &rec, &index, &ref_leaf_bh); if (ret) { mlog_errno(ret); goto out; } set_len = le32_to_cpu(rec.r_clusters); /* * Here we may meet with 3 situations: * * 1. If we find an already existing record, and the length * is the same, cool, we just need to increase the r_refcount * and it is OK. * 2. If we find a hole, just insert it with r_refcount = 1. * 3. If we are in the middle of one extent record, split * it. */ if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos && set_len <= len) { trace_ocfs2_increase_refcount_change( (unsigned long long)cpos, set_len, le32_to_cpu(rec.r_refcount)); ret = ocfs2_change_refcount_rec(handle, ci, ref_leaf_bh, index, merge, 1); if (ret) { mlog_errno(ret); goto out; } } else if (!rec.r_refcount) { rec.r_refcount = cpu_to_le32(1); trace_ocfs2_increase_refcount_insert( (unsigned long long)le64_to_cpu(rec.r_cpos), set_len); ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh, ref_leaf_bh, &rec, index, merge, meta_ac); if (ret) { mlog_errno(ret); goto out; } } else { set_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) + set_len) - cpos; rec.r_cpos = cpu_to_le64(cpos); rec.r_clusters = cpu_to_le32(set_len); le32_add_cpu(&rec.r_refcount, 1); trace_ocfs2_increase_refcount_split( (unsigned long long)le64_to_cpu(rec.r_cpos), set_len, le32_to_cpu(rec.r_refcount)); ret = ocfs2_split_refcount_rec(handle, ci, ref_root_bh, ref_leaf_bh, &rec, index, merge, meta_ac, dealloc); if (ret) { mlog_errno(ret); goto out; } } cpos += set_len; len -= set_len; brelse(ref_leaf_bh); ref_leaf_bh = NULL; } out: brelse(ref_leaf_bh); return ret; } static int ocfs2_remove_refcount_extent(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; struct ocfs2_extent_tree et; BUG_ON(rb->rf_records.rl_used); trace_ocfs2_remove_refcount_extent( (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)ref_leaf_bh->b_blocknr, le32_to_cpu(rb->rf_cpos)); ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos), 1, meta_ac, dealloc); if (ret) { mlog_errno(ret); goto out; } ocfs2_remove_from_cache(ci, ref_leaf_bh); /* * add the freed block to the dealloc so that it will be freed * when we run dealloc. */ ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, le16_to_cpu(rb->rf_suballoc_slot), le64_to_cpu(rb->rf_suballoc_loc), le64_to_cpu(rb->rf_blkno), le16_to_cpu(rb->rf_suballoc_bit)); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; le32_add_cpu(&rb->rf_clusters, -1); /* * check whether we need to restore the root refcount block if * there is no leaf extent block at atll. */ if (!rb->rf_list.l_next_free_rec) { BUG_ON(rb->rf_clusters); trace_ocfs2_restore_refcount_block( (unsigned long long)ref_root_bh->b_blocknr); rb->rf_flags = 0; rb->rf_parent = 0; rb->rf_cpos = 0; memset(&rb->rf_records, 0, sb->s_blocksize - offsetof(struct ocfs2_refcount_block, rf_records)); rb->rf_records.rl_count = cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); } ocfs2_journal_dirty(handle, ref_root_bh); out: return ret; } int ocfs2_increase_refcount(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, u64 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { return __ocfs2_increase_refcount(handle, ci, ref_root_bh, cpos, len, 1, meta_ac, dealloc); } static int ocfs2_decrease_refcount_rec(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, struct buffer_head *ref_leaf_bh, int index, u64 cpos, unsigned int len, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret; struct ocfs2_refcount_block *rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index]; BUG_ON(cpos < le64_to_cpu(rec->r_cpos)); BUG_ON(cpos + len > le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters)); trace_ocfs2_decrease_refcount_rec( (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)cpos, len); if (cpos == le64_to_cpu(rec->r_cpos) && len == le32_to_cpu(rec->r_clusters)) ret = ocfs2_change_refcount_rec(handle, ci, ref_leaf_bh, index, 1, -1); else { struct ocfs2_refcount_rec split = *rec; split.r_cpos = cpu_to_le64(cpos); split.r_clusters = cpu_to_le32(len); le32_add_cpu(&split.r_refcount, -1); ret = ocfs2_split_refcount_rec(handle, ci, ref_root_bh, ref_leaf_bh, &split, index, 1, meta_ac, dealloc); } if (ret) { mlog_errno(ret); goto out; } /* Remove the leaf refcount block if it contains no refcount record. */ if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) { ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh, ref_leaf_bh, meta_ac, dealloc); if (ret) mlog_errno(ret); } out: return ret; } static int __ocfs2_decrease_refcount(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, u64 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc, int delete) { int ret = 0, index = 0; struct ocfs2_refcount_rec rec; unsigned int r_count = 0, r_len; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct buffer_head *ref_leaf_bh = NULL; trace_ocfs2_decrease_refcount( (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)cpos, len, delete); while (len) { ret = ocfs2_get_refcount_rec(ci, ref_root_bh, cpos, len, &rec, &index, &ref_leaf_bh); if (ret) { mlog_errno(ret); goto out; } r_count = le32_to_cpu(rec.r_refcount); BUG_ON(r_count == 0); if (!delete) BUG_ON(r_count > 1); r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) + le32_to_cpu(rec.r_clusters)) - cpos; ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh, ref_leaf_bh, index, cpos, r_len, meta_ac, dealloc); if (ret) { mlog_errno(ret); goto out; } if (le32_to_cpu(rec.r_refcount) == 1 && delete) { ret = ocfs2_cache_cluster_dealloc(dealloc, ocfs2_clusters_to_blocks(sb, cpos), r_len); if (ret) { mlog_errno(ret); goto out; } } cpos += r_len; len -= r_len; brelse(ref_leaf_bh); ref_leaf_bh = NULL; } out: brelse(ref_leaf_bh); return ret; } /* Caller must hold refcount tree lock. */ int ocfs2_decrease_refcount(struct inode *inode, handle_t *handle, u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc, int delete) { int ret; u64 ref_blkno; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); ret = ocfs2_get_refcount_block(inode, &ref_blkno); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, &ref_root_bh); if (ret) { mlog_errno(ret); goto out; } ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh, cpos, len, meta_ac, dealloc, delete); if (ret) mlog_errno(ret); out: brelse(ref_root_bh); return ret; } /* * Mark the already-existing extent at cpos as refcounted for len clusters. * This adds the refcount extent flag. * * If the existing extent is larger than the request, initiate a * split. An attempt will be made at merging with adjacent extents. * * The caller is responsible for passing down meta_ac if we'll need it. */ static int ocfs2_mark_extent_refcounted(struct inode *inode, struct ocfs2_extent_tree *et, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret; trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno, cpos, len, phys); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); goto out; } ret = ocfs2_change_extent_flag(handle, et, cpos, len, phys, meta_ac, dealloc, OCFS2_EXT_REFCOUNTED, 0); if (ret) mlog_errno(ret); out: return ret; } /* * Given some contiguous physical clusters, calculate what we need * for modifying their refcount. */ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, struct ocfs2_caching_info *ci, struct buffer_head *ref_root_bh, u64 start_cpos, u32 clusters, int *meta_add, int *credits) { int ret = 0, index, ref_blocks = 0, recs_add = 0; u64 cpos = start_cpos; struct ocfs2_refcount_block *rb; struct ocfs2_refcount_rec rec; struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; u32 len; while (clusters) { ret = ocfs2_get_refcount_rec(ci, ref_root_bh, cpos, clusters, &rec, &index, &ref_leaf_bh); if (ret) { mlog_errno(ret); goto out; } if (ref_leaf_bh != prev_bh) { /* * Now we encounter a new leaf block, so calculate * whether we need to extend the old leaf. */ if (prev_bh) { rb = (struct ocfs2_refcount_block *) prev_bh->b_data; if (le16_to_cpu(rb->rf_records.rl_used) + recs_add > le16_to_cpu(rb->rf_records.rl_count)) ref_blocks++; } recs_add = 0; *credits += 1; brelse(prev_bh); prev_bh = ref_leaf_bh; get_bh(prev_bh); } trace_ocfs2_calc_refcount_meta_credits_iterate( recs_add, (unsigned long long)cpos, clusters, (unsigned long long)le64_to_cpu(rec.r_cpos), le32_to_cpu(rec.r_clusters), le32_to_cpu(rec.r_refcount), index); len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + le32_to_cpu(rec.r_clusters)) - cpos; /* * We record all the records which will be inserted to the * same refcount block, so that we can tell exactly whether * we need a new refcount block or not. * * If we will insert a new one, this is easy and only happens * during adding refcounted flag to the extent, so we don't * have a chance of spliting. We just need one record. * * If the refcount rec already exists, that would be a little * complicated. we may have to: * 1) split at the beginning if the start pos isn't aligned. * we need 1 more record in this case. * 2) split int the end if the end pos isn't aligned. * we need 1 more record in this case. * 3) split in the middle because of file system fragmentation. * we need 2 more records in this case(we can't detect this * beforehand, so always think of the worst case). */ if (rec.r_refcount) { recs_add += 2; /* Check whether we need a split at the beginning. */ if (cpos == start_cpos && cpos != le64_to_cpu(rec.r_cpos)) recs_add++; /* Check whether we need a split in the end. */ if (cpos + clusters < le64_to_cpu(rec.r_cpos) + le32_to_cpu(rec.r_clusters)) recs_add++; } else recs_add++; brelse(ref_leaf_bh); ref_leaf_bh = NULL; clusters -= len; cpos += len; } if (prev_bh) { rb = (struct ocfs2_refcount_block *)prev_bh->b_data; if (le16_to_cpu(rb->rf_records.rl_used) + recs_add > le16_to_cpu(rb->rf_records.rl_count)) ref_blocks++; *credits += 1; } if (!ref_blocks) goto out; *meta_add += ref_blocks; *credits += ref_blocks; /* * So we may need ref_blocks to insert into the tree. * That also means we need to change the b-tree and add that number * of records since we never merge them. * We need one more block for expansion since the new created leaf * block is also full and needs split. */ rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) { struct ocfs2_extent_tree et; ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); *meta_add += ocfs2_extend_meta_needed(et.et_root_el); *credits += ocfs2_calc_extend_credits(sb, et.et_root_el); } else { *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; *meta_add += 1; } out: trace_ocfs2_calc_refcount_meta_credits( (unsigned long long)start_cpos, clusters, *meta_add, *credits); brelse(ref_leaf_bh); brelse(prev_bh); return ret; } /* * For refcount tree, we will decrease some contiguous clusters * refcount count, so just go through it to see how many blocks * we gonna touch and whether we need to create new blocks. * * Normally the refcount blocks store these refcount should be * contiguous also, so that we can get the number easily. * We will at most add split 2 refcount records and 2 more * refcount blocks, so just check it in a rough way. * * Caller must hold refcount tree lock. */ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u64 refcount_loc, u64 phys_blkno, u32 clusters, int *credits, int *ref_blocks) { int ret; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); goto out; } BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), refcount_loc, &tree); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc, &ref_root_bh); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, &tree->rf_ci, ref_root_bh, start_cpos, clusters, ref_blocks, credits); if (ret) { mlog_errno(ret); goto out; } trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits); out: brelse(ref_root_bh); return ret; } #define MAX_CONTIG_BYTES 1048576 static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb) { return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES); } static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb) { return ~(ocfs2_cow_contig_clusters(sb) - 1); } /* * Given an extent that starts at 'start' and an I/O that starts at 'cpos', * find an offset (start + (n * contig_clusters)) that is closest to cpos * while still being less than or equal to it. * * The goal is to break the extent at a multiple of contig_clusters. */ static inline unsigned int ocfs2_cow_align_start(struct super_block *sb, unsigned int start, unsigned int cpos) { BUG_ON(start > cpos); return start + ((cpos - start) & ocfs2_cow_contig_mask(sb)); } /* * Given a cluster count of len, pad it out so that it is a multiple * of contig_clusters. */ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, unsigned int len) { unsigned int padded = (len + (ocfs2_cow_contig_clusters(sb) - 1)) & ocfs2_cow_contig_mask(sb); /* Did we wrap? */ if (padded < len) padded = UINT_MAX; return padded; } /* * Calculate out the start and number of virtual clusters we need to to CoW. * * cpos is vitual start cluster position we want to do CoW in a * file and write_len is the cluster length. * max_cpos is the place where we want to stop CoW intentionally. * * Normal we will start CoW from the beginning of extent record cotaining cpos. * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we * get good I/O from the resulting extent tree. */ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, struct ocfs2_extent_list *el, u32 cpos, u32 write_len, u32 max_cpos, u32 *cow_start, u32 *cow_len) { int ret = 0; int tree_height = le16_to_cpu(el->l_tree_depth), i; struct buffer_head *eb_bh = NULL; struct ocfs2_extent_block *eb = NULL; struct ocfs2_extent_rec *rec; unsigned int want_clusters, rec_end = 0; int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb); int leaf_clusters; BUG_ON(cpos + write_len > max_cpos); if (tree_height > 0) { ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh); if (ret) { mlog_errno(ret); goto out; } eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; if (el->l_tree_depth) { ret = ocfs2_error(inode->i_sb, "Inode %lu has non zero tree depth in leaf block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); goto out; } } *cow_len = 0; for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { rec = &el->l_recs[i]; if (ocfs2_is_empty_extent(rec)) { mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " "index %d\n", inode->i_ino, i); continue; } if (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters) <= cpos) continue; if (*cow_len == 0) { /* * We should find a refcounted record in the * first pass. */ BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED)); *cow_start = le32_to_cpu(rec->e_cpos); } /* * If we encounter a hole, a non-refcounted record or * pass the max_cpos, stop the search. */ if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) || (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) || (max_cpos <= le32_to_cpu(rec->e_cpos))) break; leaf_clusters = le16_to_cpu(rec->e_leaf_clusters); rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters; if (rec_end > max_cpos) { rec_end = max_cpos; leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos); } /* * How many clusters do we actually need from * this extent? First we see how many we actually * need to complete the write. If that's smaller * than contig_clusters, we try for contig_clusters. */ if (!*cow_len) want_clusters = write_len; else want_clusters = (cpos + write_len) - (*cow_start + *cow_len); if (want_clusters < contig_clusters) want_clusters = contig_clusters; /* * If the write does not cover the whole extent, we * need to calculate how we're going to split the extent. * We try to do it on contig_clusters boundaries. * * Any extent smaller than contig_clusters will be * CoWed in its entirety. */ if (leaf_clusters <= contig_clusters) *cow_len += leaf_clusters; else if (*cow_len || (*cow_start == cpos)) { /* * This extent needs to be CoW'd from its * beginning, so all we have to do is compute * how many clusters to grab. We align * want_clusters to the edge of contig_clusters * to get better I/O. */ want_clusters = ocfs2_cow_align_length(inode->i_sb, want_clusters); if (leaf_clusters < want_clusters) *cow_len += leaf_clusters; else *cow_len += want_clusters; } else if ((*cow_start + contig_clusters) >= (cpos + write_len)) { /* * Breaking off contig_clusters at the front * of the extent will cover our write. That's * easy. */ *cow_len = contig_clusters; } else if ((rec_end - cpos) <= contig_clusters) { /* * Breaking off contig_clusters at the tail of * this extent will cover cpos. */ *cow_start = rec_end - contig_clusters; *cow_len = contig_clusters; } else if ((rec_end - cpos) <= want_clusters) { /* * While we can't fit the entire write in this * extent, we know that the write goes from cpos * to the end of the extent. Break that off. * We try to break it at some multiple of * contig_clusters from the front of the extent. * Failing that (ie, cpos is within * contig_clusters of the front), we'll CoW the * entire extent. */ *cow_start = ocfs2_cow_align_start(inode->i_sb, *cow_start, cpos); *cow_len = rec_end - *cow_start; } else { /* * Ok, the entire write lives in the middle of * this extent. Let's try to slice the extent up * nicely. Optimally, our CoW region starts at * m*contig_clusters from the beginning of the * extent and goes for n*contig_clusters, * covering the entire write. */ *cow_start = ocfs2_cow_align_start(inode->i_sb, *cow_start, cpos); want_clusters = (cpos + write_len) - *cow_start; want_clusters = ocfs2_cow_align_length(inode->i_sb, want_clusters); if (*cow_start + want_clusters <= rec_end) *cow_len = want_clusters; else *cow_len = rec_end - *cow_start; } /* Have we covered our entire write yet? */ if ((*cow_start + *cow_len) >= (cpos + write_len)) break; /* * If we reach the end of the extent block and don't get enough * clusters, continue with the next extent block if possible. */ if (i + 1 == le16_to_cpu(el->l_next_free_rec) && eb && eb->h_next_leaf_blk) { brelse(eb_bh); eb_bh = NULL; ret = ocfs2_read_extent_block(INODE_CACHE(inode), le64_to_cpu(eb->h_next_leaf_blk), &eb_bh); if (ret) { mlog_errno(ret); goto out; } eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; i = -1; } } out: brelse(eb_bh); return ret; } /* * Prepare meta_ac, data_ac and calculate credits when we want to add some * num_clusters in data_tree "et" and change the refcount for the old * clusters(starting form p_cluster) in the refcount tree. * * Note: * 1. since we may split the old tree, so we at most will need num_clusters + 2 * more new leaf records. * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so * just give data_ac = NULL. */ static int ocfs2_lock_refcount_allocators(struct super_block *sb, u32 p_cluster, u32 num_clusters, struct ocfs2_extent_tree *et, struct ocfs2_caching_info *ref_ci, struct buffer_head *ref_root_bh, struct ocfs2_alloc_context **meta_ac, struct ocfs2_alloc_context **data_ac, int *credits) { int ret = 0, meta_add = 0; int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); goto out; } if (num_free_extents < num_clusters + 2) meta_add = ocfs2_extend_meta_needed(et->et_root_el); *credits += ocfs2_calc_extend_credits(sb, et->et_root_el); ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh, p_cluster, num_clusters, &meta_add, credits); if (ret) { mlog_errno(ret); goto out; } trace_ocfs2_lock_refcount_allocators(meta_add, *credits); ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, meta_ac); if (ret) { mlog_errno(ret); goto out; } if (data_ac) { ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters, data_ac); if (ret) mlog_errno(ret); } out: if (ret) { if (*meta_ac) { ocfs2_free_alloc_context(*meta_ac); *meta_ac = NULL; } } return ret; } static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) { BUG_ON(buffer_dirty(bh)); clear_buffer_mapped(bh); return 0; } int ocfs2_duplicate_clusters_by_page(handle_t *handle, struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len) { int ret = 0, partial; struct super_block *sb = inode->i_sb; u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; unsigned int from, to; loff_t offset, end, map_end; struct address_space *mapping = inode->i_mapping; trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, new_cluster, new_len); offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); /* * We only duplicate pages until we reach the page contains i_size - 1. * So trim 'end' to i_size. */ if (end > i_size_read(inode)) end = i_size_read(inode); while (offset < end) { page_index = offset >> PAGE_CACHE_SHIFT; map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; if (map_end > end) map_end = end; /* from, to is the offset within the page. */ from = offset & (PAGE_CACHE_SIZE - 1); to = PAGE_CACHE_SIZE; if (map_end & (PAGE_CACHE_SIZE - 1)) to = map_end & (PAGE_CACHE_SIZE - 1); page = find_or_create_page(mapping, page_index, GFP_NOFS); if (!page) { ret = -ENOMEM; mlog_errno(ret); break; } /* * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page * can't be dirtied before we CoW it out. */ if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); if (!PageUptodate(page)) { ret = block_read_full_page(page, ocfs2_get_block); if (ret) { mlog_errno(ret); goto unlock; } lock_page(page); } if (page_has_buffers(page)) { ret = walk_page_buffers(handle, page_buffers(page), from, to, &partial, ocfs2_clear_cow_buffer); if (ret) { mlog_errno(ret); goto unlock; } } ocfs2_map_and_dirty_page(inode, handle, from, to, page, 0, &new_block); mark_page_accessed(page); unlock: unlock_page(page); page_cache_release(page); page = NULL; offset = map_end; if (ret) break; } return ret; } int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, struct inode *inode, u32 cpos, u32 old_cluster, u32 new_cluster, u32 new_len) { int ret = 0; struct super_block *sb = inode->i_sb; struct ocfs2_caching_info *ci = INODE_CACHE(inode); int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct ocfs2_super *osb = OCFS2_SB(sb); struct buffer_head *old_bh = NULL; struct buffer_head *new_bh = NULL; trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, new_cluster, new_len); for (i = 0; i < blocks; i++, old_block++, new_block++) { new_bh = sb_getblk(osb->sb, new_block); if (new_bh == NULL) { ret = -ENOMEM; mlog_errno(ret); break; } ocfs2_set_new_buffer_uptodate(ci, new_bh); ret = ocfs2_read_block(ci, old_block, &old_bh, NULL); if (ret) { mlog_errno(ret); break; } ret = ocfs2_journal_access(handle, ci, new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); break; } memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); ocfs2_journal_dirty(handle, new_bh); brelse(new_bh); brelse(old_bh); new_bh = NULL; old_bh = NULL; } brelse(new_bh); brelse(old_bh); return ret; } static int ocfs2_clear_ext_refcount(handle_t *handle, struct ocfs2_extent_tree *et, u32 cpos, u32 p_cluster, u32 len, unsigned int ext_flags, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret, index; struct ocfs2_extent_rec replace_rec; struct ocfs2_path *path = NULL; struct ocfs2_extent_list *el; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); u64 ino = ocfs2_metadata_cache_owner(et->et_ci); trace_ocfs2_clear_ext_refcount((unsigned long long)ino, cpos, len, p_cluster, ext_flags); memset(&replace_rec, 0, sizeof(replace_rec)); replace_rec.e_cpos = cpu_to_le32(cpos); replace_rec.e_leaf_clusters = cpu_to_le16(len); replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb, p_cluster)); replace_rec.e_flags = ext_flags; replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED; path = ocfs2_new_path_from_et(et); if (!path) { ret = -ENOMEM; mlog_errno(ret); goto out; } ret = ocfs2_find_path(et->et_ci, path, cpos); if (ret) { mlog_errno(ret); goto out; } el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ret = ocfs2_error(sb, "Inode %llu has an extent at cpos %u which can no longer be found\n", (unsigned long long)ino, cpos); goto out; } ret = ocfs2_split_extent(handle, et, path, index, &replace_rec, meta_ac, dealloc); if (ret) mlog_errno(ret); out: ocfs2_free_path(path); return ret; } static int ocfs2_replace_clusters(handle_t *handle, struct ocfs2_cow_context *context, u32 cpos, u32 old, u32 new, u32 len, unsigned int ext_flags) { int ret; struct ocfs2_caching_info *ci = context->data_et.et_ci; u64 ino = ocfs2_metadata_cache_owner(ci); trace_ocfs2_replace_clusters((unsigned long long)ino, cpos, old, new, len, ext_flags); /*If the old clusters is unwritten, no need to duplicate. */ if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { ret = context->cow_duplicate_clusters(handle, context->inode, cpos, old, new, len); if (ret) { mlog_errno(ret); goto out; } } ret = ocfs2_clear_ext_refcount(handle, &context->data_et, cpos, new, len, ext_flags, context->meta_ac, &context->dealloc); if (ret) mlog_errno(ret); out: return ret; } int ocfs2_cow_sync_writeback(struct super_block *sb, struct inode *inode, u32 cpos, u32 num_clusters) { int ret = 0; loff_t offset, end, map_end; pgoff_t page_index; struct page *page; if (ocfs2_should_order_data(inode)) return 0; offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); ret = filemap_fdatawrite_range(inode->i_mapping, offset, end - 1); if (ret < 0) { mlog_errno(ret); return ret; } while (offset < end) { page_index = offset >> PAGE_CACHE_SHIFT; map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT; if (map_end > end) map_end = end; page = find_or_create_page(inode->i_mapping, page_index, GFP_NOFS); BUG_ON(!page); wait_on_page_writeback(page); if (PageError(page)) { ret = -EIO; mlog_errno(ret); } else mark_page_accessed(page); unlock_page(page); page_cache_release(page); page = NULL; offset = map_end; if (ret) break; } return ret; } static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, unsigned int *extent_flags) { return ocfs2_get_clusters(context->inode, v_cluster, p_cluster, num_clusters, extent_flags); } static int ocfs2_make_clusters_writable(struct super_block *sb, struct ocfs2_cow_context *context, u32 cpos, u32 p_cluster, u32 num_clusters, unsigned int e_flags) { int ret, delete, index, credits = 0; u32 new_bit, new_len, orig_num_clusters; unsigned int set_len; struct ocfs2_super *osb = OCFS2_SB(sb); handle_t *handle; struct buffer_head *ref_leaf_bh = NULL; struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci; struct ocfs2_refcount_rec rec; trace_ocfs2_make_clusters_writable(cpos, p_cluster, num_clusters, e_flags); ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, &context->data_et, ref_ci, context->ref_root_bh, &context->meta_ac, &context->data_ac, &credits); if (ret) { mlog_errno(ret); return ret; } if (context->post_refcount) credits += context->post_refcount->credits; credits += context->extra_credits; handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out; } orig_num_clusters = num_clusters; while (num_clusters) { ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, p_cluster, num_clusters, &rec, &index, &ref_leaf_bh); if (ret) { mlog_errno(ret); goto out_commit; } BUG_ON(!rec.r_refcount); set_len = min((u64)p_cluster + num_clusters, le64_to_cpu(rec.r_cpos) + le32_to_cpu(rec.r_clusters)) - p_cluster; /* * There are many different situation here. * 1. If refcount == 1, remove the flag and don't COW. * 2. If refcount > 1, allocate clusters. * Here we may not allocate r_len once at a time, so continue * until we reach num_clusters. */ if (le32_to_cpu(rec.r_refcount) == 1) { delete = 0; ret = ocfs2_clear_ext_refcount(handle, &context->data_et, cpos, p_cluster, set_len, e_flags, context->meta_ac, &context->dealloc); if (ret) { mlog_errno(ret); goto out_commit; } } else { delete = 1; ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, set_len, &new_bit, &new_len); if (ret) { mlog_errno(ret); goto out_commit; } ret = ocfs2_replace_clusters(handle, context, cpos, p_cluster, new_bit, new_len, e_flags); if (ret) { mlog_errno(ret); goto out_commit; } set_len = new_len; } ret = __ocfs2_decrease_refcount(handle, ref_ci, context->ref_root_bh, p_cluster, set_len, context->meta_ac, &context->dealloc, delete); if (ret) { mlog_errno(ret); goto out_commit; } cpos += set_len; p_cluster += set_len; num_clusters -= set_len; brelse(ref_leaf_bh); ref_leaf_bh = NULL; } /* handle any post_cow action. */ if (context->post_refcount && context->post_refcount->func) { ret = context->post_refcount->func(context->inode, handle, context->post_refcount->para); if (ret) { mlog_errno(ret); goto out_commit; } } /* * Here we should write the new page out first if we are * in write-back mode. */ if (context->get_clusters == ocfs2_di_get_clusters) { ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos, orig_num_clusters); if (ret) mlog_errno(ret); } out_commit: ocfs2_commit_trans(osb, handle); out: if (context->data_ac) { ocfs2_free_alloc_context(context->data_ac); context->data_ac = NULL; } if (context->meta_ac) { ocfs2_free_alloc_context(context->meta_ac); context->meta_ac = NULL; } brelse(ref_leaf_bh); return ret; } static int ocfs2_replace_cow(struct ocfs2_cow_context *context) { int ret = 0; struct inode *inode = context->inode; u32 cow_start = context->cow_start, cow_len = context->cow_len; u32 p_cluster, num_clusters; unsigned int ext_flags; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); } ocfs2_init_dealloc_ctxt(&context->dealloc); while (cow_len) { ret = context->get_clusters(context, cow_start, &p_cluster, &num_clusters, &ext_flags); if (ret) { mlog_errno(ret); break; } BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED)); if (cow_len < num_clusters) num_clusters = cow_len; ret = ocfs2_make_clusters_writable(inode->i_sb, context, cow_start, p_cluster, num_clusters, ext_flags); if (ret) { mlog_errno(ret); break; } cow_len -= num_clusters; cow_start += num_clusters; } if (ocfs2_dealloc_has_cluster(&context->dealloc)) { ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &context->dealloc); } return ret; } /* * Starting at cpos, try to CoW write_len clusters. Don't CoW * past max_cpos. This will stop when it runs into a hole or an * unrefcounted extent. */ static int ocfs2_refcount_cow_hunk(struct inode *inode, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { int ret; u32 cow_start = 0, cow_len = 0; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *ref_tree; struct ocfs2_cow_context *context = NULL; BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, cpos, write_len, max_cpos, &cow_start, &cow_len); if (ret) { mlog_errno(ret); goto out; } trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno, cpos, write_len, max_cpos, cow_start, cow_len); BUG_ON(cow_len == 0); context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); if (!context) { ret = -ENOMEM; mlog_errno(ret); goto out; } ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), 1, &ref_tree, &ref_root_bh); if (ret) { mlog_errno(ret); goto out; } context->inode = inode; context->cow_start = cow_start; context->cow_len = cow_len; context->ref_tree = ref_tree; context->ref_root_bh = ref_root_bh; context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; context->get_clusters = ocfs2_di_get_clusters; ocfs2_init_dinode_extent_tree(&context->data_et, INODE_CACHE(inode), di_bh); ret = ocfs2_replace_cow(context); if (ret) mlog_errno(ret); /* * truncate the extent map here since no matter whether we meet with * any error during the action, we shouldn't trust cached extent map * any more. */ ocfs2_extent_map_trunc(inode, cow_start); ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_root_bh); out: kfree(context); return ret; } /* * CoW any and all clusters between cpos and cpos+write_len. * Don't CoW past max_cpos. If this returns successfully, all * clusters between cpos and cpos+write_len are safe to modify. */ int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { int ret = 0; u32 p_cluster, num_clusters; unsigned int ext_flags; while (write_len) { ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, &ext_flags); if (ret) { mlog_errno(ret); break; } if (write_len < num_clusters) num_clusters = write_len; if (ext_flags & OCFS2_EXT_REFCOUNTED) { ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, num_clusters, max_cpos); if (ret) { mlog_errno(ret); break; } } write_len -= num_clusters; cpos += num_clusters; } return ret; } static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, unsigned int *extent_flags) { struct inode *inode = context->inode; struct ocfs2_xattr_value_root *xv = context->cow_object; return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster, num_clusters, &xv->xr_list, extent_flags); } /* * Given a xattr value root, calculate the most meta/credits we need for * refcount tree change if we truncate it to 0. */ int ocfs2_refcounted_xattr_delete_need(struct inode *inode, struct ocfs2_caching_info *ref_ci, struct buffer_head *ref_root_bh, struct ocfs2_xattr_value_root *xv, int *meta_add, int *credits) { int ret = 0, index, ref_blocks = 0; u32 p_cluster, num_clusters; u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters); struct ocfs2_refcount_block *rb; struct ocfs2_refcount_rec rec; struct buffer_head *ref_leaf_bh = NULL; while (cpos < clusters) { ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, &num_clusters, &xv->xr_list, NULL); if (ret) { mlog_errno(ret); goto out; } cpos += num_clusters; while (num_clusters) { ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh, p_cluster, num_clusters, &rec, &index, &ref_leaf_bh); if (ret) { mlog_errno(ret); goto out; } BUG_ON(!rec.r_refcount); rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; /* * We really don't know whether the other clusters is in * this refcount block or not, so just take the worst * case that all the clusters are in this block and each * one will split a refcount rec, so totally we need * clusters * 2 new refcount rec. */ if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 > le16_to_cpu(rb->rf_records.rl_count)) ref_blocks++; *credits += 1; brelse(ref_leaf_bh); ref_leaf_bh = NULL; if (num_clusters <= le32_to_cpu(rec.r_clusters)) break; else num_clusters -= le32_to_cpu(rec.r_clusters); p_cluster += num_clusters; } } *meta_add += ref_blocks; if (!ref_blocks) goto out; rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; else { struct ocfs2_extent_tree et; ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh); *credits += ocfs2_calc_extend_credits(inode->i_sb, et.et_root_el); } out: brelse(ref_leaf_bh); return ret; } /* * Do CoW for xattr. */ int ocfs2_refcount_cow_xattr(struct inode *inode, struct ocfs2_dinode *di, struct ocfs2_xattr_value_buf *vb, struct ocfs2_refcount_tree *ref_tree, struct buffer_head *ref_root_bh, u32 cpos, u32 write_len, struct ocfs2_post_refcount *post) { int ret; struct ocfs2_xattr_value_root *xv = vb->vb_xv; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_cow_context *context = NULL; u32 cow_start, cow_len; BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, cpos, write_len, UINT_MAX, &cow_start, &cow_len); if (ret) { mlog_errno(ret); goto out; } BUG_ON(cow_len == 0); context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); if (!context) { ret = -ENOMEM; mlog_errno(ret); goto out; } context->inode = inode; context->cow_start = cow_start; context->cow_len = cow_len; context->ref_tree = ref_tree; context->ref_root_bh = ref_root_bh; context->cow_object = xv; context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd; /* We need the extra credits for duplicate_clusters by jbd. */ context->extra_credits = ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len; context->get_clusters = ocfs2_xattr_value_get_clusters; context->post_refcount = post; ocfs2_init_xattr_value_extent_tree(&context->data_et, INODE_CACHE(inode), vb); ret = ocfs2_replace_cow(context); if (ret) mlog_errno(ret); out: kfree(context); return ret; } /* * Insert a new extent into refcount tree and mark a extent rec * as refcounted in the dinode tree. */ int ocfs2_add_refcount_flag(struct inode *inode, struct ocfs2_extent_tree *data_et, struct ocfs2_caching_info *ref_ci, struct buffer_head *ref_root_bh, u32 cpos, u32 p_cluster, u32 num_clusters, struct ocfs2_cached_dealloc_ctxt *dealloc, struct ocfs2_post_refcount *post) { int ret; handle_t *handle; int credits = 1, ref_blocks = 0; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_alloc_context *meta_ac = NULL; ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, ref_ci, ref_root_bh, p_cluster, num_clusters, &ref_blocks, &credits); if (ret) { mlog_errno(ret); goto out; } trace_ocfs2_add_refcount_flag(ref_blocks, credits); if (ref_blocks) { ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), ref_blocks, &meta_ac); if (ret) { mlog_errno(ret); goto out; } } if (post) credits += post->credits; handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out; } ret = ocfs2_mark_extent_refcounted(inode, data_et, handle, cpos, num_clusters, p_cluster, meta_ac, dealloc); if (ret) { mlog_errno(ret); goto out_commit; } ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, p_cluster, num_clusters, 0, meta_ac, dealloc); if (ret) { mlog_errno(ret); goto out_commit; } if (post && post->func) { ret = post->func(inode, handle, post->para); if (ret) mlog_errno(ret); } out_commit: ocfs2_commit_trans(osb, handle); out: if (meta_ac) ocfs2_free_alloc_context(meta_ac); return ret; } static int ocfs2_change_ctime(struct inode *inode, struct buffer_head *di_bh) { int ret; handle_t *handle; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out; } ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } inode->i_ctime = CURRENT_TIME; di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); ocfs2_journal_dirty(handle, di_bh); out_commit: ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: return ret; } static int ocfs2_attach_refcount_tree(struct inode *inode, struct buffer_head *di_bh) { int ret, data_changed = 0; struct buffer_head *ref_root_bh = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_refcount_tree *ref_tree; unsigned int ext_flags; loff_t size; u32 cpos, num_clusters, clusters, p_cluster; struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_extent_tree di_et; ocfs2_init_dealloc_ctxt(&dealloc); if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { ret = ocfs2_create_refcount_tree(inode, di_bh); if (ret) { mlog_errno(ret); goto out; } } BUG_ON(!di->i_refcount_loc); ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), 1, &ref_tree, &ref_root_bh); if (ret) { mlog_errno(ret); goto out; } if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) goto attach_xattr; ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh); size = i_size_read(inode); clusters = ocfs2_clusters_for_bytes(inode->i_sb, size); cpos = 0; while (cpos < clusters) { ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, &ext_flags); if (ret) { mlog_errno(ret); goto unlock; } if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { ret = ocfs2_add_refcount_flag(inode, &di_et, &ref_tree->rf_ci, ref_root_bh, cpos, p_cluster, num_clusters, &dealloc, NULL); if (ret) { mlog_errno(ret); goto unlock; } data_changed = 1; } cpos += num_clusters; } attach_xattr: if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh, &ref_tree->rf_ci, ref_root_bh, &dealloc); if (ret) { mlog_errno(ret); goto unlock; } } if (data_changed) { ret = ocfs2_change_ctime(inode, di_bh); if (ret) mlog_errno(ret); } unlock: ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_root_bh); if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) { ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &dealloc); } out: /* * Empty the extent map so that we may get the right extent * record from the disk. */ ocfs2_extent_map_trunc(inode, 0); return ret; } static int ocfs2_add_refcounted_extent(struct inode *inode, struct ocfs2_extent_tree *et, struct ocfs2_caching_info *ref_ci, struct buffer_head *ref_root_bh, u32 cpos, u32 p_cluster, u32 num_clusters, unsigned int ext_flags, struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret; handle_t *handle; int credits = 0; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_alloc_context *meta_ac = NULL; ret = ocfs2_lock_refcount_allocators(inode->i_sb, p_cluster, num_clusters, et, ref_ci, ref_root_bh, &meta_ac, NULL, &credits); if (ret) { mlog_errno(ret); goto out; } handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out; } ret = ocfs2_insert_extent(handle, et, cpos, ocfs2_clusters_to_blocks(inode->i_sb, p_cluster), num_clusters, ext_flags, meta_ac); if (ret) { mlog_errno(ret); goto out_commit; } ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, p_cluster, num_clusters, meta_ac, dealloc); if (ret) mlog_errno(ret); out_commit: ocfs2_commit_trans(osb, handle); out: if (meta_ac) ocfs2_free_alloc_context(meta_ac); return ret; } static int ocfs2_duplicate_inline_data(struct inode *s_inode, struct buffer_head *s_bh, struct inode *t_inode, struct buffer_head *t_bh) { int ret; handle_t *handle; struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data; BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out; } ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } t_di->id2.i_data.id_count = s_di->id2.i_data.id_count; memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data, le16_to_cpu(s_di->id2.i_data.id_count)); spin_lock(&OCFS2_I(t_inode)->ip_lock); OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL; t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features); spin_unlock(&OCFS2_I(t_inode)->ip_lock); ocfs2_journal_dirty(handle, t_bh); out_commit: ocfs2_commit_trans(osb, handle); out: return ret; } static int ocfs2_duplicate_extent_list(struct inode *s_inode, struct inode *t_inode, struct buffer_head *t_bh, struct ocfs2_caching_info *ref_ci, struct buffer_head *ref_root_bh, struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret = 0; u32 p_cluster, num_clusters, clusters, cpos; loff_t size; unsigned int ext_flags; struct ocfs2_extent_tree et; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh); size = i_size_read(s_inode); clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size); cpos = 0; while (cpos < clusters) { ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, &num_clusters, &ext_flags); if (ret) { mlog_errno(ret); goto out; } if (p_cluster) { ret = ocfs2_add_refcounted_extent(t_inode, &et, ref_ci, ref_root_bh, cpos, p_cluster, num_clusters, ext_flags, dealloc); if (ret) { mlog_errno(ret); goto out; } } cpos += num_clusters; } out: return ret; } /* * change the new file's attributes to the src. * * reflink creates a snapshot of a file, that means the attributes * must be identical except for three exceptions - nlink, ino, and ctime. */ static int ocfs2_complete_reflink(struct inode *s_inode, struct buffer_head *s_bh, struct inode *t_inode, struct buffer_head *t_bh, bool preserve) { int ret; handle_t *handle; struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data; loff_t size = i_size_read(s_inode); handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb), OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); return ret; } ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } spin_lock(&OCFS2_I(t_inode)->ip_lock); OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters; OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr; OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; spin_unlock(&OCFS2_I(t_inode)->ip_lock); i_size_write(t_inode, size); t_inode->i_blocks = s_inode->i_blocks; di->i_xattr_inline_size = s_di->i_xattr_inline_size; di->i_clusters = s_di->i_clusters; di->i_size = s_di->i_size; di->i_dyn_features = s_di->i_dyn_features; di->i_attr = s_di->i_attr; if (preserve) { t_inode->i_uid = s_inode->i_uid; t_inode->i_gid = s_inode->i_gid; t_inode->i_mode = s_inode->i_mode; di->i_uid = s_di->i_uid; di->i_gid = s_di->i_gid; di->i_mode = s_di->i_mode; /* * update time. * we want mtime to appear identical to the source and * update ctime. */ t_inode->i_ctime = CURRENT_TIME; di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); t_inode->i_mtime = s_inode->i_mtime; di->i_mtime = s_di->i_mtime; di->i_mtime_nsec = s_di->i_mtime_nsec; } ocfs2_journal_dirty(handle, t_bh); out_commit: ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle); return ret; } static int ocfs2_create_reflink_node(struct inode *s_inode, struct buffer_head *s_bh, struct inode *t_inode, struct buffer_head *t_bh, bool preserve) { int ret; struct buffer_head *ref_root_bh = NULL; struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); struct ocfs2_refcount_block *rb; struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; struct ocfs2_refcount_tree *ref_tree; ocfs2_init_dealloc_ctxt(&dealloc); ret = ocfs2_set_refcount_tree(t_inode, t_bh, le64_to_cpu(di->i_refcount_loc)); if (ret) { mlog_errno(ret); goto out; } if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh); if (ret) mlog_errno(ret); goto out; } ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), 1, &ref_tree, &ref_root_bh); if (ret) { mlog_errno(ret); goto out; } rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, &ref_tree->rf_ci, ref_root_bh, &dealloc); if (ret) { mlog_errno(ret); goto out_unlock_refcount; } out_unlock_refcount: ocfs2_unlock_refcount_tree(osb, ref_tree, 1); brelse(ref_root_bh); out: if (ocfs2_dealloc_has_cluster(&dealloc)) { ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &dealloc); } return ret; } static int __ocfs2_reflink(struct dentry *old_dentry, struct buffer_head *old_bh, struct inode *new_inode, bool preserve) { int ret; struct inode *inode = d_inode(old_dentry); struct buffer_head *new_bh = NULL; if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) { ret = -EINVAL; mlog_errno(ret); goto out; } ret = filemap_fdatawrite(inode->i_mapping); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_attach_refcount_tree(inode, old_bh); if (ret) { mlog_errno(ret); goto out; } mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, OI_LS_REFLINK_TARGET); if (ret) { mlog_errno(ret); goto out_unlock; } ret = ocfs2_create_reflink_node(inode, old_bh, new_inode, new_bh, preserve); if (ret) { mlog_errno(ret); goto inode_unlock; } if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) { ret = ocfs2_reflink_xattrs(inode, old_bh, new_inode, new_bh, preserve); if (ret) { mlog_errno(ret); goto inode_unlock; } } ret = ocfs2_complete_reflink(inode, old_bh, new_inode, new_bh, preserve); if (ret) mlog_errno(ret); inode_unlock: ocfs2_inode_unlock(new_inode, 1); brelse(new_bh); out_unlock: mutex_unlock(&new_inode->i_mutex); out: if (!ret) { ret = filemap_fdatawait(inode->i_mapping); if (ret) mlog_errno(ret); } return ret; } static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, bool preserve) { int error; struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; struct posix_acl *default_acl, *acl; umode_t mode; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; mode = inode->i_mode; error = posix_acl_create(dir, &mode, &default_acl, &acl); if (error) { mlog_errno(error); return error; } error = ocfs2_create_inode_in_orphan(dir, mode, &new_orphan_inode); if (error) { mlog_errno(error); goto out; } error = ocfs2_rw_lock(inode, 1); if (error) { mlog_errno(error); goto out; } error = ocfs2_inode_lock(inode, &old_bh, 1); if (error) { mlog_errno(error); ocfs2_rw_unlock(inode, 1); goto out; } down_write(&OCFS2_I(inode)->ip_xattr_sem); down_write(&OCFS2_I(inode)->ip_alloc_sem); error = __ocfs2_reflink(old_dentry, old_bh, new_orphan_inode, preserve); up_write(&OCFS2_I(inode)->ip_alloc_sem); up_write(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 1); ocfs2_rw_unlock(inode, 1); brelse(old_bh); if (error) { mlog_errno(error); goto out; } /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, &new_dentry->d_name, default_acl, acl); if (error) mlog_errno(error); } out: if (default_acl) posix_acl_release(default_acl); if (acl) posix_acl_release(acl); if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); if (error) mlog_errno(error); } if (new_orphan_inode) { /* * We need to open_unlock the inode no matter whether we * succeed or not, so that other nodes can delete it later. */ ocfs2_open_unlock(new_orphan_inode); if (error) iput(new_orphan_inode); } return error; } /* * Below here are the bits used by OCFS2_IOC_REFLINK() to fake * sys_reflink(). This will go away when vfs_reflink() exists in * fs/namei.c. */ /* copied from may_create in VFS. */ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; return inode_permission(dir, MAY_WRITE | MAY_EXEC); } /** * ocfs2_vfs_reflink - Create a reference-counted link * * @old_dentry: source dentry + inode * @dir: directory to create the target * @new_dentry: target dentry * @preserve: if true, preserve all file attributes */ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, bool preserve) { struct inode *inode = d_inode(old_dentry); int error; if (!inode) return -ENOENT; error = ocfs2_may_create(dir, new_dentry); if (error) return error; if (dir->i_sb != inode->i_sb) return -EXDEV; /* * A reflink to an append-only or immutable file cannot be created. */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; /* Only regular files can be reflinked. */ if (!S_ISREG(inode->i_mode)) return -EPERM; /* * If the caller wants to preserve ownership, they require the * rights to do so. */ if (preserve) { if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN)) return -EPERM; if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN)) return -EPERM; } /* * If the caller is modifying any aspect of the attributes, they * are not creating a snapshot. They need read permission on the * file. */ if (!preserve) { error = inode_permission(inode, MAY_READ); if (error) return error; } mutex_lock(&inode->i_mutex); error = dquot_initialize(dir); if (!error) error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); mutex_unlock(&inode->i_mutex); if (!error) fsnotify_create(dir, new_dentry); return error; } /* * Most codes are copied from sys_linkat. */ int ocfs2_reflink_ioctl(struct inode *inode, const char __user *oldname, const char __user *newname, bool preserve) { struct dentry *new_dentry; struct path old_path, new_path; int error; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; error = user_path_at(AT_FDCWD, oldname, 0, &old_path); if (error) { mlog_errno(error); return error; } new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) { mlog_errno(error); goto out; } error = -EXDEV; if (old_path.mnt != new_path.mnt) { mlog_errno(error); goto out_dput; } error = ocfs2_vfs_reflink(old_path.dentry, d_inode(new_path.dentry), new_dentry, preserve); out_dput: done_path_create(&new_path, new_dentry); out: path_put(&old_path); return error; }