/*
 * TI VPFE capture Driver
 *
 * Copyright (C) 2013 - 2014 Texas Instruments, Inc.
 *
 * Benoit Parrot <bparrot@ti.com>
 * Lad, Prabhakar <prabhakar.csengg@gmail.com>
 *
 * This program is free software; you may redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/module.h>
#include <linux/pinctrl/consumer.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/videodev2.h>

#include <media/v4l2-common.h>
#include <media/v4l2-ctrls.h>
#include <media/v4l2-event.h>
#include <media/v4l2-of.h>

#include "am437x-vpfe.h"

#define VPFE_MODULE_NAME	"vpfe"
#define VPFE_VERSION		"0.1.0"

static int debug;
module_param(debug, int, 0644);
MODULE_PARM_DESC(debug, "Debug level 0-8");

#define vpfe_dbg(level, dev, fmt, arg...)	\
		v4l2_dbg(level, debug, &dev->v4l2_dev, fmt, ##arg)
#define vpfe_info(dev, fmt, arg...)	\
		v4l2_info(&dev->v4l2_dev, fmt, ##arg)
#define vpfe_err(dev, fmt, arg...)	\
		v4l2_err(&dev->v4l2_dev, fmt, ##arg)

/* standard information */
struct vpfe_standard {
	v4l2_std_id std_id;
	unsigned int width;
	unsigned int height;
	struct v4l2_fract pixelaspect;
	int frame_format;
};

static const struct vpfe_standard vpfe_standards[] = {
	{V4L2_STD_525_60, 720, 480, {11, 10}, 1},
	{V4L2_STD_625_50, 720, 576, {54, 59}, 1},
};

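/*
 * struct bus_format - per-bus-width format information
 * @width: Bits per pixel (when transferred over a bus)
 * @bpp: Bytes per pixel (when stored in memory)
 */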
struct bus_format {
	unsigned int width;
	unsigned int bpp;
};

/*
 * struct vpfe_fmt - VPFE media bus format information
 * @name: V4L2 format description
 * @fourcc: V4L2 pixel format FCC identifier
 * @code: V4L2 media bus format code
 * @l: bus format used when the bus is 10 bits wide (bits per pixel on
 *	the bus and bytes per pixel in memory)
 * @s: bus format used when the data is shifted down to 8 bits per pixel
 * @supported: Indicates format supported by subdev
 * @index: enumeration index used by VIDIOC_ENUM_FMT
 */
struct vpfe_fmt {
	const char *name;
	u32 fourcc;
	u32 code;
	struct bus_format l;
	struct bus_format s;
	bool supported;
	u32 index;
};

static struct vpfe_fmt formats[] = {
	{
		.name		= "YUV 4:2:2 packed, YCbYCr",
		.fourcc		= V4L2_PIX_FMT_YUYV,
		.code		= MEDIA_BUS_FMT_YUYV8_2X8,
		.l.width	= 10,
		.l.bpp		= 4,
		.s.width	= 8,
		.s.bpp		= 2,
		.supported	= false,
	}, {
		.name		= "YUV 4:2:2 packed, CbYCrY",
		.fourcc		= V4L2_PIX_FMT_UYVY,
		.code		= MEDIA_BUS_FMT_UYVY8_2X8,
		.l.width	= 10,
		.l.bpp		= 4,
		.s.width	= 8,
		.s.bpp		= 2,
		.supported	= false,
	}, {
		.name		= "YUV 4:2:2 packed, YCrYCb",
		.fourcc		= V4L2_PIX_FMT_YVYU,
		.code		= MEDIA_BUS_FMT_YVYU8_2X8,
		.l.width	= 10,
		.l.bpp		= 4,
		.s.width	= 8,
		.s.bpp		= 2,
		.supported	= false,
	}, {
		.name		= "YUV 4:2:2 packed, CrYCbY",
		.fourcc		= V4L2_PIX_FMT_VYUY,
		.code		= MEDIA_BUS_FMT_VYUY8_2X8,
		.l.width	= 10,
		.l.bpp		= 4,
		.s.width	= 8,
		.s.bpp		= 2,
		.supported	= false,
	}, {
		.name		= "RAW8 BGGR",
		.fourcc		= V4L2_PIX_FMT_SBGGR8,
		.code		= MEDIA_BUS_FMT_SBGGR8_1X8,
		.l.width	= 10,
		.l.bpp		= 2,
		.s.width	= 8,
		.s.bpp		= 1,
		.supported	= false,
	}, {
		.name		= "RAW8 GBRG",
		.fourcc		= V4L2_PIX_FMT_SGBRG8,
		.code		= MEDIA_BUS_FMT_SGBRG8_1X8,
		.l.width	= 10,
		.l.bpp		= 2,
		.s.width	= 8,
		.s.bpp		= 1,
		.supported	= false,
	}, {
		.name		= "RAW8 GRBG",
		.fourcc		= V4L2_PIX_FMT_SGRBG8,
		.code		= MEDIA_BUS_FMT_SGRBG8_1X8,
		.l.width	= 10,
		.l.bpp		= 2,
		.s.width	= 8,
		.s.bpp		= 1,
		.supported	= false,
	}, {
		.name		= "RAW8 RGGB",
		.fourcc		= V4L2_PIX_FMT_SRGGB8,
		.code		= MEDIA_BUS_FMT_SRGGB8_1X8,
		.l.width	= 10,
		.l.bpp		= 2,
		.s.width	= 8,
		.s.bpp		= 1,
		.supported	= false,
	}, {
		.name		= "RGB565 (LE)",
		.fourcc		= V4L2_PIX_FMT_RGB565,
		.code		= MEDIA_BUS_FMT_RGB565_2X8_LE,
		.l.width	= 10,
		.l.bpp		= 4,
		.s.width	= 8,
		.s.bpp		= 2,
		.supported	= false,
	}, {
		.name		= "RGB565 (BE)",
		.fourcc		= V4L2_PIX_FMT_RGB565X,
		.code		= MEDIA_BUS_FMT_RGB565_2X8_BE,
		.l.width	= 10,
		.l.bpp		= 4,
		.s.width	= 8,
		.s.bpp		= 2,
		.supported	= false,
	},
};

static int
__vpfe_get_format(struct vpfe_device *vpfe,
		  struct v4l2_format *format, unsigned int *bpp);

static struct vpfe_fmt *find_format_by_code(unsigned int code)
{
	struct vpfe_fmt *fmt;
	unsigned int k;

	for (k = 0; k < ARRAY_SIZE(formats); k++) {
		fmt = &formats[k];
		if (fmt->code == code)
			return fmt;
	}

	return NULL;
}

static struct vpfe_fmt *find_format_by_pix(unsigned int pixelformat)
{
	struct vpfe_fmt *fmt;
	unsigned int k;

	for (k = 0; k < ARRAY_SIZE(formats); k++) {
		fmt = &formats[k];
		if (fmt->fourcc == pixelformat)
			return fmt;
	}

	return NULL;
}

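/*
 * mbus_to_pix: convert a media bus format into the matching v4l2 pix
 * format and compute the bytes per pixel for the configured bus width.
 */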
static void
mbus_to_pix(struct vpfe_device *vpfe,
	    const struct v4l2_mbus_framefmt *mbus,
	    struct v4l2_pix_format *pix, unsigned int *bpp)
{
	struct vpfe_subdev_info *sdinfo = vpfe->current_subdev;
	unsigned int bus_width = sdinfo->vpfe_param.bus_width;
	struct vpfe_fmt *fmt;

	fmt = find_format_by_code(mbus->code);
	if (WARN_ON(fmt == NULL)) {
		pr_err("Invalid mbus code set\n");
		*bpp = 1;
		return;
	}

	memset(pix, 0, sizeof(*pix));
	v4l2_fill_pix_format(pix, mbus);
	pix->pixelformat = fmt->fourcc;
	*bpp = (bus_width == 10) ?  fmt->l.bpp : fmt->s.bpp;

	/* pitch must be aligned to 32 bytes */
	pix->bytesperline = ALIGN(pix->width * *bpp, 32);
	pix->sizeimage = pix->bytesperline * pix->height;
}

static void pix_to_mbus(struct vpfe_device *vpfe,
			struct v4l2_pix_format *pix_fmt,
			struct v4l2_mbus_framefmt *mbus_fmt)
{
	struct vpfe_fmt *fmt;

	fmt = find_format_by_pix(pix_fmt->pixelformat);
	if (!fmt) {
		/* default to first entry */
		vpfe_dbg(3, vpfe, "Invalid pixel code: %x, default used instead\n",
			pix_fmt->pixelformat);
		fmt = &formats[0];
	}

	memset(mbus_fmt, 0, sizeof(*mbus_fmt));
	v4l2_fill_mbus_format(mbus_fmt, pix_fmt, fmt->code);
}

/*
 * Print a four-character-code (FOURCC).
 * Note: returns a pointer to a static buffer, so the result is only valid
 * until the next call.
 */
static char *print_fourcc(u32 fmt)
{
	static char code[5];

	code[0] = (unsigned char)(fmt & 0xff);
	code[1] = (unsigned char)((fmt >> 8) & 0xff);
	code[2] = (unsigned char)((fmt >> 16) & 0xff);
	code[3] = (unsigned char)((fmt >> 24) & 0xff);
	code[4] = '\0';

	return code;
}

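/* Return true when both v4l2 formats describe the same capture format */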
static int
cmp_v4l2_format(const struct v4l2_format *lhs, const struct v4l2_format *rhs)
{
	return lhs->type == rhs->type &&
		lhs->fmt.pix.width == rhs->fmt.pix.width &&
		lhs->fmt.pix.height == rhs->fmt.pix.height &&
		lhs->fmt.pix.pixelformat == rhs->fmt.pix.pixelformat &&
		lhs->fmt.pix.field == rhs->fmt.pix.field &&
		lhs->fmt.pix.colorspace == rhs->fmt.pix.colorspace &&
		lhs->fmt.pix.ycbcr_enc == rhs->fmt.pix.ycbcr_enc &&
		lhs->fmt.pix.quantization == rhs->fmt.pix.quantization;
}

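/* Read/write helpers for the memory-mapped VPFE (CCDC) registers */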
static inline u32 vpfe_reg_read(struct vpfe_ccdc *ccdc, u32 offset)
{
	return ioread32(ccdc->ccdc_cfg.base_addr + offset);
}

static inline void vpfe_reg_write(struct vpfe_ccdc *ccdc, u32 val, u32 offset)
{
	iowrite32(val, ccdc->ccdc_cfg.base_addr + offset);
}

static inline struct vpfe_device *to_vpfe(struct vpfe_ccdc *ccdc)
{
	return container_of(ccdc, struct vpfe_device, ccdc);
}

static inline struct vpfe_cap_buffer *to_vpfe_buffer(struct vb2_buffer *vb)
{
	return container_of(vb, struct vpfe_cap_buffer, vb);
}

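/* Start/stop the CCDC by writing the enable bit to the PCR register */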
static inline void vpfe_pcr_enable(struct vpfe_ccdc *ccdc, int flag)
{
	vpfe_reg_write(ccdc, !!flag, VPFE_PCR);
}

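/* Enable/disable the VPFE module as a whole via the CONFIG register */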
static void vpfe_config_enable(struct vpfe_ccdc *ccdc, int flag)
{
	unsigned int cfg;

	if (!flag) {
		cfg = vpfe_reg_read(ccdc, VPFE_CONFIG);
		cfg &= ~(VPFE_CONFIG_EN_ENABLE << VPFE_CONFIG_EN_SHIFT);
	} else {
		cfg = VPFE_CONFIG_EN_ENABLE << VPFE_CONFIG_EN_SHIFT;
	}

	vpfe_reg_write(ccdc, cfg, VPFE_CONFIG);
}

static void vpfe_ccdc_setwin(struct vpfe_ccdc *ccdc,
			     struct v4l2_rect *image_win,
			     enum ccdc_frmfmt frm_fmt,
			     int bpp)
{
	int horz_start, horz_nr_pixels;
	int vert_start, vert_nr_lines;
	int val, mid_img;

	/*
	 * ppc - per pixel count. Indicates how many pixels per cell are
	 * output to SDRAM. For example, for ycbcr it is one y and one c,
	 * so 2; for raw capture it is 1.
	 */
	horz_start = image_win->left * bpp;
	horz_nr_pixels = (image_win->width * bpp) - 1;
	vpfe_reg_write(ccdc, (horz_start << VPFE_HORZ_INFO_SPH_SHIFT) |
				horz_nr_pixels, VPFE_HORZ_INFO);

	vert_start = image_win->top;

	if (frm_fmt == CCDC_FRMFMT_INTERLACED) {
		vert_nr_lines = (image_win->height >> 1) - 1;
		vert_start >>= 1;
		/* Since first line doesn't have any data */
		vert_start += 1;
		/* configure VDINT0 */
		val = (vert_start << VPFE_VDINT_VDINT0_SHIFT);
	} else {
		/* Since first line doesn't have any data */
		vert_start += 1;
		vert_nr_lines = image_win->height - 1;
		/*
		 * configure VDINT0 and VDINT1. VDINT1 will be at half
		 * of image height
		 */
		mid_img = vert_start + (image_win->height / 2);
		val = (vert_start << VPFE_VDINT_VDINT0_SHIFT) |
				(mid_img & VPFE_VDINT_VDINT1_MASK);
	}

	vpfe_reg_write(ccdc, val, VPFE_VDINT);

	vpfe_reg_write(ccdc, (vert_start << VPFE_VERT_START_SLV0_SHIFT) |
				vert_start, VPFE_VERT_START);
	vpfe_reg_write(ccdc, vert_nr_lines, VPFE_VERT_LINES);
}

static void vpfe_reg_dump(struct vpfe_ccdc *ccdc)
{
	struct vpfe_device *vpfe = to_vpfe(ccdc);

	vpfe_dbg(3, vpfe, "ALAW: 0x%x\n", vpfe_reg_read(ccdc, VPFE_ALAW));
	vpfe_dbg(3, vpfe, "CLAMP: 0x%x\n", vpfe_reg_read(ccdc, VPFE_CLAMP));
	vpfe_dbg(3, vpfe, "DCSUB: 0x%x\n", vpfe_reg_read(ccdc, VPFE_DCSUB));
	vpfe_dbg(3, vpfe, "BLKCMP: 0x%x\n", vpfe_reg_read(ccdc, VPFE_BLKCMP));
	vpfe_dbg(3, vpfe, "COLPTN: 0x%x\n", vpfe_reg_read(ccdc, VPFE_COLPTN));
	vpfe_dbg(3, vpfe, "SDOFST: 0x%x\n", vpfe_reg_read(ccdc, VPFE_SDOFST));
	vpfe_dbg(3, vpfe, "SYN_MODE: 0x%x\n",
		 vpfe_reg_read(ccdc, VPFE_SYNMODE));
	vpfe_dbg(3, vpfe, "HSIZE_OFF: 0x%x\n",
		 vpfe_reg_read(ccdc, VPFE_HSIZE_OFF));
	vpfe_dbg(3, vpfe, "HORZ_INFO: 0x%x\n",
		 vpfe_reg_read(ccdc, VPFE_HORZ_INFO));
	vpfe_dbg(3, vpfe, "VERT_START: 0x%x\n",
		 vpfe_reg_read(ccdc, VPFE_VERT_START));
	vpfe_dbg(3, vpfe, "VERT_LINES: 0x%x\n",
		 vpfe_reg_read(ccdc, VPFE_VERT_LINES));
}

static int
vpfe_ccdc_validate_param(struct vpfe_ccdc *ccdc,
			 struct vpfe_ccdc_config_params_raw *ccdcparam)
{
	struct vpfe_device *vpfe = to_vpfe(ccdc);
	u8 max_gamma, max_data;

	if (!ccdcparam->alaw.enable)
		return 0;

	max_gamma = ccdc_gamma_width_max_bit(ccdcparam->alaw.gamma_wd);
	max_data = ccdc_data_size_max_bit(ccdcparam->data_sz);

	if (ccdcparam->alaw.gamma_wd > VPFE_CCDC_GAMMA_BITS_09_0 ||
	    ccdcparam->alaw.gamma_wd < VPFE_CCDC_GAMMA_BITS_15_6 ||
	    max_gamma > max_data) {
		vpfe_dbg(1, vpfe, "Invalid data line select\n");
		return -EINVAL;
	}

	return 0;
}

static void
vpfe_ccdc_update_raw_params(struct vpfe_ccdc *ccdc,
			    struct vpfe_ccdc_config_params_raw *raw_params)
{
	struct vpfe_ccdc_config_params_raw *config_params =
				&ccdc->ccdc_cfg.bayer.config_params;

	/* Copy the validated user parameters into the ccdc configuration */
	*config_params = *raw_params;
}

/*
 * vpfe_ccdc_restore_defaults()
 * This function will write defaults to all CCDC registers
 */
static void vpfe_ccdc_restore_defaults(struct vpfe_ccdc *ccdc)
{
	int i;

	/* Disable CCDC */
	vpfe_pcr_enable(ccdc, 0);

	/* set all registers to default value */
	for (i = 4; i <= 0x94; i += 4)
		vpfe_reg_write(ccdc, 0,  i);

	vpfe_reg_write(ccdc, VPFE_NO_CULLING, VPFE_CULLING);
	vpfe_reg_write(ccdc, VPFE_CCDC_GAMMA_BITS_11_2, VPFE_ALAW);
}

static int vpfe_ccdc_close(struct vpfe_ccdc *ccdc, struct device *dev)
{
	int dma_cntl, i, pcr;

	/* If the CCDC module is still busy wait for it to be done */
	for (i = 0; i < 10; i++) {
		usleep_range(5000, 6000);
		pcr = vpfe_reg_read(ccdc, VPFE_PCR);
		if (!pcr)
			break;

		/* make sure it is disabled */
		vpfe_pcr_enable(ccdc, 0);
	}

	/* Disable CCDC by resetting all register to default POR values */
	vpfe_ccdc_restore_defaults(ccdc);

	/*
	 * If the DMA_CNTL overflow bit is set, clear it. It appears to take
	 * a while (~20 ms) for this to become quiescent.
	 */
	for (i = 0; i < 10; i++) {
		dma_cntl = vpfe_reg_read(ccdc, VPFE_DMA_CNTL);
		if (!(dma_cntl & VPFE_DMA_CNTL_OVERFLOW))
			break;

		/* Clear the overflow bit */
		vpfe_reg_write(ccdc, dma_cntl, VPFE_DMA_CNTL);
		usleep_range(5000, 6000);
	}

	/* Disable the module at the CONFIG level */
	vpfe_config_enable(ccdc, 0);

	pm_runtime_put_sync(dev);

	return 0;
}

static int vpfe_ccdc_set_params(struct vpfe_ccdc *ccdc, void __user *params)
{
	struct vpfe_device *vpfe = container_of(ccdc, struct vpfe_device, ccdc);
	struct vpfe_ccdc_config_params_raw raw_params;
	int x;

	if (ccdc->ccdc_cfg.if_type != VPFE_RAW_BAYER)
		return -EINVAL;

	x = copy_from_user(&raw_params, params, sizeof(raw_params));
	if (x) {
		vpfe_dbg(1, vpfe,
			"vpfe_ccdc_set_params: error in copying ccdc params, %d\n",
			x);
		return -EFAULT;
	}

	if (!vpfe_ccdc_validate_param(ccdc, &raw_params)) {
		vpfe_ccdc_update_raw_params(ccdc, &raw_params);
		return 0;
	}

	return -EINVAL;
}

/*
 * vpfe_ccdc_config_ycbcr()
 * This function will configure CCDC for YCbCr video capture
 */
static void vpfe_ccdc_config_ycbcr(struct vpfe_ccdc *ccdc)
{
	struct vpfe_device *vpfe = container_of(ccdc, struct vpfe_device, ccdc);
	struct ccdc_params_ycbcr *params = &ccdc->ccdc_cfg.ycbcr;
	u32 syn_mode;

	vpfe_dbg(3, vpfe, "vpfe_ccdc_config_ycbcr:\n");
	/*
	 * first restore the CCDC registers to default values
	 * This is important since we assume default values to be set in
	 * a lot of registers that we didn't touch
	 */
	vpfe_ccdc_restore_defaults(ccdc);

	/*
	 * configure pixel format, frame format, configure video frame
	 * format, enable output to SDRAM, enable internal timing generator
	 * and 8bit pack mode
	 */
	syn_mode = (((params->pix_fmt & VPFE_SYN_MODE_INPMOD_MASK) <<
		    VPFE_SYN_MODE_INPMOD_SHIFT) |
		    ((params->frm_fmt & VPFE_SYN_FLDMODE_MASK) <<
		    VPFE_SYN_FLDMODE_SHIFT) | VPFE_VDHDEN_ENABLE |
		    VPFE_WEN_ENABLE | VPFE_DATA_PACK_ENABLE);

	/* setup BT.656 sync mode */
	if (params->bt656_enable) {
		vpfe_reg_write(ccdc, VPFE_REC656IF_BT656_EN, VPFE_REC656IF);

		/*
		 * configure the FID, VD, HD pin polarity,
		 * fld,hd pol positive, vd negative, 8-bit data
		 */
		syn_mode |= VPFE_SYN_MODE_VD_POL_NEGATIVE;
		if (ccdc->ccdc_cfg.if_type == VPFE_BT656_10BIT)
			syn_mode |= VPFE_SYN_MODE_10BITS;
		else
			syn_mode |= VPFE_SYN_MODE_8BITS;
	} else {
		/* y/c external sync mode */
		syn_mode |= (((params->fid_pol & VPFE_FID_POL_MASK) <<
			     VPFE_FID_POL_SHIFT) |
			     ((params->hd_pol & VPFE_HD_POL_MASK) <<
			     VPFE_HD_POL_SHIFT) |
			     ((params->vd_pol & VPFE_VD_POL_MASK) <<
			     VPFE_VD_POL_SHIFT));
	}
	vpfe_reg_write(ccdc, syn_mode, VPFE_SYNMODE);

	/* configure video window */
	vpfe_ccdc_setwin(ccdc, &params->win,
			 params->frm_fmt, params->bytesperpixel);

	/*
	 * configure the order of y cb cr in SDRAM, and disable latch
	 * internal register on vsync
	 */
	if (ccdc->ccdc_cfg.if_type == VPFE_BT656_10BIT)
		vpfe_reg_write(ccdc,
			       (params->pix_order << VPFE_CCDCFG_Y8POS_SHIFT) |
			       VPFE_LATCH_ON_VSYNC_DISABLE |
			       VPFE_CCDCFG_BW656_10BIT, VPFE_CCDCFG);
	else
		vpfe_reg_write(ccdc,
			       (params->pix_order << VPFE_CCDCFG_Y8POS_SHIFT) |
			       VPFE_LATCH_ON_VSYNC_DISABLE, VPFE_CCDCFG);

	/*
	 * configure the horizontal line offset. This must be on a
	 * 32 byte boundary, so clear the 5 LSBs.
	 */
	vpfe_reg_write(ccdc, params->bytesperline, VPFE_HSIZE_OFF);

	/* configure the memory line offset */
	if (params->buf_type == CCDC_BUFTYPE_FLD_INTERLEAVED)
		/* two fields are interleaved in memory */
		vpfe_reg_write(ccdc, VPFE_SDOFST_FIELD_INTERLEAVED,
			       VPFE_SDOFST);
}

static void
vpfe_ccdc_config_black_clamp(struct vpfe_ccdc *ccdc,
			     struct vpfe_ccdc_black_clamp *bclamp)
{
	u32 val;

	if (!bclamp->enable) {
		/* configure DCSub */
		val = (bclamp->dc_sub) & VPFE_BLK_DC_SUB_MASK;
		vpfe_reg_write(ccdc, val, VPFE_DCSUB);
		vpfe_reg_write(ccdc, VPFE_CLAMP_DEFAULT_VAL, VPFE_CLAMP);
		return;
	}
	/*
	 * Configure the gain, start pixel, number of lines to be averaged,
	 * number of pixels per line to be averaged, and enable black clamping
	 */
	val = ((bclamp->sgain & VPFE_BLK_SGAIN_MASK) |
	       ((bclamp->start_pixel & VPFE_BLK_ST_PXL_MASK) <<
		VPFE_BLK_ST_PXL_SHIFT) |
	       ((bclamp->sample_ln & VPFE_BLK_SAMPLE_LINE_MASK) <<
		VPFE_BLK_SAMPLE_LINE_SHIFT) |
	       ((bclamp->sample_pixel & VPFE_BLK_SAMPLE_LN_MASK) <<
		VPFE_BLK_SAMPLE_LN_SHIFT) | VPFE_BLK_CLAMP_ENABLE);
	vpfe_reg_write(ccdc, val, VPFE_CLAMP);
	/* If black clamping is enabled then make dcsub 0 */
	vpfe_reg_write(ccdc, VPFE_DCSUB_DEFAULT_VAL, VPFE_DCSUB);
}

static void
vpfe_ccdc_config_black_compense(struct vpfe_ccdc *ccdc,
				struct vpfe_ccdc_black_compensation *bcomp)
{
	u32 val;

	val = ((bcomp->b & VPFE_BLK_COMP_MASK) |
	      ((bcomp->gb & VPFE_BLK_COMP_MASK) <<
	       VPFE_BLK_COMP_GB_COMP_SHIFT) |
	      ((bcomp->gr & VPFE_BLK_COMP_MASK) <<
	       VPFE_BLK_COMP_GR_COMP_SHIFT) |
	      ((bcomp->r & VPFE_BLK_COMP_MASK) <<
	       VPFE_BLK_COMP_R_COMP_SHIFT));
	vpfe_reg_write(ccdc, val, VPFE_BLKCMP);
}

/*
 * vpfe_ccdc_config_raw()
 * This function will configure CCDC for Raw capture mode
 */
static void vpfe_ccdc_config_raw(struct vpfe_ccdc *ccdc)
{
	struct vpfe_device *vpfe = container_of(ccdc, struct vpfe_device, ccdc);
	struct vpfe_ccdc_config_params_raw *config_params =
				&ccdc->ccdc_cfg.bayer.config_params;
	struct ccdc_params_raw *params = &ccdc->ccdc_cfg.bayer;
	unsigned int syn_mode;
	unsigned int val;

	vpfe_dbg(3, vpfe, "vpfe_ccdc_config_raw:\n");

	/* Reset CCDC */
	vpfe_ccdc_restore_defaults(ccdc);

	/* Disable latching function registers on VSYNC  */
	vpfe_reg_write(ccdc, VPFE_LATCH_ON_VSYNC_DISABLE, VPFE_CCDCFG);

	/*
	 * Configure the vertical sync polarity(SYN_MODE.VDPOL),
	 * horizontal sync polarity (SYN_MODE.HDPOL), frame id polarity
	 * (SYN_MODE.FLDPOL), frame format(progressive or interlace),
	 * data size(SYNMODE.DATSIZ), &pixel format (Input mode), output
	 * SDRAM, enable internal timing generator
	 */
	syn_mode = (((params->vd_pol & VPFE_VD_POL_MASK) << VPFE_VD_POL_SHIFT) |
		   ((params->hd_pol & VPFE_HD_POL_MASK) << VPFE_HD_POL_SHIFT) |
		   ((params->fid_pol & VPFE_FID_POL_MASK) <<
		   VPFE_FID_POL_SHIFT) | ((params->frm_fmt &
		   VPFE_FRM_FMT_MASK) << VPFE_FRM_FMT_SHIFT) |
		   ((config_params->data_sz & VPFE_DATA_SZ_MASK) <<
		   VPFE_DATA_SZ_SHIFT) | ((params->pix_fmt &
		   VPFE_PIX_FMT_MASK) << VPFE_PIX_FMT_SHIFT) |
		   VPFE_WEN_ENABLE | VPFE_VDHDEN_ENABLE);

	/* Enable and configure aLaw register if needed */
	if (config_params->alaw.enable) {
		val = ((config_params->alaw.gamma_wd &
		      VPFE_ALAW_GAMMA_WD_MASK) | VPFE_ALAW_ENABLE);
		vpfe_reg_write(ccdc, val, VPFE_ALAW);
		vpfe_dbg(3, vpfe, "\nWriting 0x%x to ALAW...\n", val);
	}

	/* Configure video window */
	vpfe_ccdc_setwin(ccdc, &params->win, params->frm_fmt,
			 params->bytesperpixel);

	/* Configure Black Clamp */
	vpfe_ccdc_config_black_clamp(ccdc, &config_params->blk_clamp);

	/* Configure Black level compensation */
	vpfe_ccdc_config_black_compense(ccdc, &config_params->blk_comp);

	/* If data size is 8 bit then pack the data */
	if ((config_params->data_sz == VPFE_CCDC_DATA_8BITS) ||
	    config_params->alaw.enable)
		syn_mode |= VPFE_DATA_PACK_ENABLE;

	/*
	 * Configure Horizontal offset register. If pack 8 is enabled then
	 * 1 pixel will take 1 byte
	 */
	vpfe_reg_write(ccdc, params->bytesperline, VPFE_HSIZE_OFF);

	vpfe_dbg(3, vpfe, "Writing %d (%x) to HSIZE_OFF\n",
		params->bytesperline, params->bytesperline);

	/* Set value for SDOFST */
	if (params->frm_fmt == CCDC_FRMFMT_INTERLACED) {
		if (params->image_invert_enable) {
			/* For interlace inverse mode */
			vpfe_reg_write(ccdc, VPFE_INTERLACED_IMAGE_INVERT,
				   VPFE_SDOFST);
		} else {
			/* For interlace non inverse mode */
			vpfe_reg_write(ccdc, VPFE_INTERLACED_NO_IMAGE_INVERT,
				   VPFE_SDOFST);
		}
	} else if (params->frm_fmt == CCDC_FRMFMT_PROGRESSIVE) {
		vpfe_reg_write(ccdc, VPFE_PROGRESSIVE_NO_IMAGE_INVERT,
			   VPFE_SDOFST);
	}

	vpfe_reg_write(ccdc, syn_mode, VPFE_SYNMODE);

	vpfe_reg_dump(ccdc);
}

static inline int
vpfe_ccdc_set_buftype(struct vpfe_ccdc *ccdc,
		      enum ccdc_buftype buf_type)
{
	if (ccdc->ccdc_cfg.if_type == VPFE_RAW_BAYER)
		ccdc->ccdc_cfg.bayer.buf_type = buf_type;
	else
		ccdc->ccdc_cfg.ycbcr.buf_type = buf_type;

	return 0;
}

static inline enum ccdc_buftype vpfe_ccdc_get_buftype(struct vpfe_ccdc *ccdc)
{
	if (ccdc->ccdc_cfg.if_type == VPFE_RAW_BAYER)
		return ccdc->ccdc_cfg.bayer.buf_type;

	return ccdc->ccdc_cfg.ycbcr.buf_type;
}

static int vpfe_ccdc_set_pixel_format(struct vpfe_ccdc *ccdc, u32 pixfmt)
{
	struct vpfe_device *vpfe = container_of(ccdc, struct vpfe_device, ccdc);

	vpfe_dbg(1, vpfe, "vpfe_ccdc_set_pixel_format: if_type: %d, pixfmt:%s\n",
		 ccdc->ccdc_cfg.if_type, print_fourcc(pixfmt));

	if (ccdc->ccdc_cfg.if_type == VPFE_RAW_BAYER) {
		ccdc->ccdc_cfg.bayer.pix_fmt = CCDC_PIXFMT_RAW;
		/*
		 * Need to clear it in case it was left on
		 * after the last capture.
		 */
		ccdc->ccdc_cfg.bayer.config_params.alaw.enable = 0;

		switch (pixfmt) {
		case V4L2_PIX_FMT_SBGGR8:
			ccdc->ccdc_cfg.bayer.config_params.alaw.enable = 1;
			break;

		case V4L2_PIX_FMT_YUYV:
		case V4L2_PIX_FMT_UYVY:
		case V4L2_PIX_FMT_YUV420:
		case V4L2_PIX_FMT_NV12:
		case V4L2_PIX_FMT_RGB565X:
			break;

		case V4L2_PIX_FMT_SBGGR16:
		default:
			return -EINVAL;
		}
	} else {
		switch (pixfmt) {
		case V4L2_PIX_FMT_YUYV:
			ccdc->ccdc_cfg.ycbcr.pix_order = CCDC_PIXORDER_YCBYCR;
			break;

		case V4L2_PIX_FMT_UYVY:
			ccdc->ccdc_cfg.ycbcr.pix_order = CCDC_PIXORDER_CBYCRY;
			break;

		default:
			return -EINVAL;
		}
	}

	return 0;
}

static u32 vpfe_ccdc_get_pixel_format(struct vpfe_ccdc *ccdc)
{
	u32 pixfmt;

	if (ccdc->ccdc_cfg.if_type == VPFE_RAW_BAYER) {
		pixfmt = V4L2_PIX_FMT_YUYV;
	} else {
		if (ccdc->ccdc_cfg.ycbcr.pix_order == CCDC_PIXORDER_YCBYCR)
			pixfmt = V4L2_PIX_FMT_YUYV;
		else
			pixfmt = V4L2_PIX_FMT_UYVY;
	}

	return pixfmt;
}

static int
vpfe_ccdc_set_image_window(struct vpfe_ccdc *ccdc,
			   struct v4l2_rect *win, unsigned int bpp)
{
	if (ccdc->ccdc_cfg.if_type == VPFE_RAW_BAYER) {
		ccdc->ccdc_cfg.bayer.win = *win;
		ccdc->ccdc_cfg.bayer.bytesperpixel = bpp;
		ccdc->ccdc_cfg.bayer.bytesperline = ALIGN(win->width * bpp, 32);
	} else {
		ccdc->ccdc_cfg.ycbcr.win = *win;
		ccdc->ccdc_cfg.ycbcr.bytesperpixel = bpp;
		ccdc->ccdc_cfg.ycbcr.bytesperline = ALIGN(win->width * bpp, 32);
	}

	return 0;
}

static inline void
vpfe_ccdc_get_image_window(struct vpfe_ccdc *ccdc,
			   struct v4l2_rect *win)
{
	if (ccdc->ccdc_cfg.if_type == VPFE_RAW_BAYER)
		*win = ccdc->ccdc_cfg.bayer.win;
	else
		*win = ccdc->ccdc_cfg.ycbcr.win;
}

static inline unsigned int vpfe_ccdc_get_line_length(struct vpfe_ccdc *ccdc)
{
	if (ccdc->ccdc_cfg.if_type == VPFE_RAW_BAYER)
		return ccdc->ccdc_cfg.bayer.bytesperline;

	return ccdc->ccdc_cfg.ycbcr.bytesperline;
}

static inline int
vpfe_ccdc_set_frame_format(struct vpfe_ccdc *ccdc,
			   enum ccdc_frmfmt frm_fmt)
{
	if (ccdc->ccdc_cfg.if_type == VPFE_RAW_BAYER)
		ccdc->ccdc_cfg.bayer.frm_fmt = frm_fmt;
	else
		ccdc->ccdc_cfg.ycbcr.frm_fmt = frm_fmt;

	return 0;
}

static inline enum ccdc_frmfmt
vpfe_ccdc_get_frame_format(struct vpfe_ccdc *ccdc)
{
	if (ccdc->ccdc_cfg.if_type == VPFE_RAW_BAYER)
		return ccdc->ccdc_cfg.bayer.frm_fmt;

	return ccdc->ccdc_cfg.ycbcr.frm_fmt;
}

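/* Read back the field id (FID) currently reported in SYN_MODE */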
static inline int vpfe_ccdc_getfid(struct vpfe_ccdc *ccdc)
{
	return (vpfe_reg_read(ccdc, VPFE_SYNMODE) >> 15) & 1;
}

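/*
 * Set the SDRAM start address for capture DMA; the CCDC requires
 * 32-byte alignment, hence the mask.
 */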
static inline void vpfe_set_sdr_addr(struct vpfe_ccdc *ccdc, unsigned long addr)
{
	vpfe_reg_write(ccdc, addr & 0xffffffe0, VPFE_SDR_ADDR);
}

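/*
 * Store the interface (bus) parameters reported for the current subdev
 * into the ccdc configuration.
 */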
static int vpfe_ccdc_set_hw_if_params(struct vpfe_ccdc *ccdc,
				      struct vpfe_hw_if_param *params)
{
	struct vpfe_device *vpfe = container_of(ccdc, struct vpfe_device, ccdc);

	ccdc->ccdc_cfg.if_type = params->if_type;

	switch (params->if_type) {
	case VPFE_BT656:
	case VPFE_YCBCR_SYNC_16:
	case VPFE_YCBCR_SYNC_8:
	case VPFE_BT656_10BIT:
		ccdc->ccdc_cfg.ycbcr.vd_pol = params->vdpol;
		ccdc->ccdc_cfg.ycbcr.hd_pol = params->hdpol;
		break;

	case VPFE_RAW_BAYER:
		ccdc->ccdc_cfg.bayer.vd_pol = params->vdpol;
		ccdc->ccdc_cfg.bayer.hd_pol = params->hdpol;
		if (params->bus_width == 10)
			ccdc->ccdc_cfg.bayer.config_params.data_sz =
				VPFE_CCDC_DATA_10BITS;
		else
			ccdc->ccdc_cfg.bayer.config_params.data_sz =
				VPFE_CCDC_DATA_8BITS;
		vpfe_dbg(1, vpfe, "params.bus_width: %d\n",
			params->bus_width);
		vpfe_dbg(1, vpfe, "config_params.data_sz: %d\n",
			ccdc->ccdc_cfg.bayer.config_params.data_sz);
		break;

	default:
		return -EINVAL;
	}

	return 0;
}

static void vpfe_clear_intr(struct vpfe_ccdc *ccdc, int vdint)
{
	unsigned int vpfe_int_status;

	vpfe_int_status = vpfe_reg_read(ccdc, VPFE_IRQ_STS);

	switch (vdint) {
	/* VD0 interrupt */
	case VPFE_VDINT0:
		vpfe_int_status &= ~VPFE_VDINT0;
		vpfe_int_status |= VPFE_VDINT0;
		break;

	/* VD1 interrupt */
	case VPFE_VDINT1:
		vpfe_int_status &= ~VPFE_VDINT1;
		vpfe_int_status |= VPFE_VDINT1;
		break;

	/* VD2 interrupt */
	case VPFE_VDINT2:
		vpfe_int_status &= ~VPFE_VDINT2;
		vpfe_int_status |= VPFE_VDINT2;
		break;

	/* Clear all interrupts */
	default:
		vpfe_int_status &= ~(VPFE_VDINT0 |
				VPFE_VDINT1 |
				VPFE_VDINT2);
		vpfe_int_status |= (VPFE_VDINT0 |
				VPFE_VDINT1 |
				VPFE_VDINT2);
		break;
	}
	/* Clear specific VDINT from the status register */
	vpfe_reg_write(ccdc, vpfe_int_status, VPFE_IRQ_STS);

	vpfe_int_status = vpfe_reg_read(ccdc, VPFE_IRQ_STS);

	/* Acknowledge that we are done with all interrupts */
	vpfe_reg_write(ccdc, 1, VPFE_IRQ_EOI);
}

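/* Initialize the ccdc configuration with defaults for both interface types */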
static void vpfe_ccdc_config_defaults(struct vpfe_ccdc *ccdc)
{
	ccdc->ccdc_cfg.if_type = VPFE_RAW_BAYER;

	ccdc->ccdc_cfg.ycbcr.pix_fmt = CCDC_PIXFMT_YCBCR_8BIT;
	ccdc->ccdc_cfg.ycbcr.frm_fmt = CCDC_FRMFMT_INTERLACED;
	ccdc->ccdc_cfg.ycbcr.fid_pol = VPFE_PINPOL_POSITIVE;
	ccdc->ccdc_cfg.ycbcr.vd_pol = VPFE_PINPOL_POSITIVE;
	ccdc->ccdc_cfg.ycbcr.hd_pol = VPFE_PINPOL_POSITIVE;
	ccdc->ccdc_cfg.ycbcr.pix_order = CCDC_PIXORDER_CBYCRY;
	ccdc->ccdc_cfg.ycbcr.buf_type = CCDC_BUFTYPE_FLD_INTERLEAVED;

	ccdc->ccdc_cfg.ycbcr.win.left = 0;
	ccdc->ccdc_cfg.ycbcr.win.top = 0;
	ccdc->ccdc_cfg.ycbcr.win.width = 720;
	ccdc->ccdc_cfg.ycbcr.win.height = 576;
	ccdc->ccdc_cfg.ycbcr.bt656_enable = 1;

	ccdc->ccdc_cfg.bayer.pix_fmt = CCDC_PIXFMT_RAW;
	ccdc->ccdc_cfg.bayer.frm_fmt = CCDC_FRMFMT_PROGRESSIVE;
	ccdc->ccdc_cfg.bayer.fid_pol = VPFE_PINPOL_POSITIVE;
	ccdc->ccdc_cfg.bayer.vd_pol = VPFE_PINPOL_POSITIVE;
	ccdc->ccdc_cfg.bayer.hd_pol = VPFE_PINPOL_POSITIVE;

	ccdc->ccdc_cfg.bayer.win.left = 0;
	ccdc->ccdc_cfg.bayer.win.top = 0;
	ccdc->ccdc_cfg.bayer.win.width = 800;
	ccdc->ccdc_cfg.bayer.win.height = 600;
	ccdc->ccdc_cfg.bayer.config_params.data_sz = VPFE_CCDC_DATA_8BITS;
	ccdc->ccdc_cfg.bayer.config_params.alaw.gamma_wd =
						VPFE_CCDC_GAMMA_BITS_09_0;
}

/*
 * vpfe_get_ccdc_image_format - Get image parameters based on CCDC settings
 */
static int vpfe_get_ccdc_image_format(struct vpfe_device *vpfe,
				      struct v4l2_format *f)
{
	struct v4l2_rect image_win;
	enum ccdc_buftype buf_type;
	enum ccdc_frmfmt frm_fmt;

	memset(f, 0, sizeof(*f));
	f->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
	vpfe_ccdc_get_image_window(&vpfe->ccdc, &image_win);
	f->fmt.pix.width = image_win.width;
	f->fmt.pix.height = image_win.height;
	f->fmt.pix.bytesperline = vpfe_ccdc_get_line_length(&vpfe->ccdc);
	f->fmt.pix.sizeimage = f->fmt.pix.bytesperline *
				f->fmt.pix.height;
	buf_type = vpfe_ccdc_get_buftype(&vpfe->ccdc);
	f->fmt.pix.pixelformat = vpfe_ccdc_get_pixel_format(&vpfe->ccdc);
	frm_fmt = vpfe_ccdc_get_frame_format(&vpfe->ccdc);

	if (frm_fmt == CCDC_FRMFMT_PROGRESSIVE) {
		f->fmt.pix.field = V4L2_FIELD_NONE;
	} else if (frm_fmt == CCDC_FRMFMT_INTERLACED) {
		if (buf_type == CCDC_BUFTYPE_FLD_INTERLEAVED) {
			f->fmt.pix.field = V4L2_FIELD_INTERLACED;
		} else if (buf_type == CCDC_BUFTYPE_FLD_SEPARATED) {
			f->fmt.pix.field = V4L2_FIELD_SEQ_TB;
		} else {
			vpfe_err(vpfe, "Invalid buf_type\n");
			return -EINVAL;
		}
	} else {
		vpfe_err(vpfe, "Invalid frm_fmt\n");
		return -EINVAL;
	}
	return 0;
}

static int vpfe_config_ccdc_image_format(struct vpfe_device *vpfe)
{
	enum ccdc_frmfmt frm_fmt = CCDC_FRMFMT_INTERLACED;
	int ret;

	vpfe_dbg(2, vpfe, "vpfe_config_ccdc_image_format\n");

	vpfe_dbg(1, vpfe, "pixelformat: %s\n",
		print_fourcc(vpfe->fmt.fmt.pix.pixelformat));

	if (vpfe_ccdc_set_pixel_format(&vpfe->ccdc,
			vpfe->fmt.fmt.pix.pixelformat) < 0) {
		vpfe_err(vpfe, "couldn't set pix format in ccdc\n");
		return -EINVAL;
	}

	/* configure the image window */
	vpfe_ccdc_set_image_window(&vpfe->ccdc, &vpfe->crop, vpfe->bpp);

	switch (vpfe->fmt.fmt.pix.field) {
	case V4L2_FIELD_INTERLACED:
		/* do nothing, since it is default */
		ret = vpfe_ccdc_set_buftype(
				&vpfe->ccdc,
				CCDC_BUFTYPE_FLD_INTERLEAVED);
		break;

	case V4L2_FIELD_NONE:
		frm_fmt = CCDC_FRMFMT_PROGRESSIVE;
		/* buffer type only applicable for interlaced scan */
		break;

	case V4L2_FIELD_SEQ_TB:
		ret = vpfe_ccdc_set_buftype(
				&vpfe->ccdc,
				CCDC_BUFTYPE_FLD_SEPARATED);
		break;

	default:
		return -EINVAL;
	}

	if (ret)
		return ret;

	return vpfe_ccdc_set_frame_format(&vpfe->ccdc, frm_fmt);
}

/*
 * vpfe_config_image_format()
 * For a given standard, this function sets up the default pix format and
 * crop values in the vpfe device and ccdc. It first starts with default
 * values based on the standard table, then queries the sub device for its
 * format and overrides the values based on that. It sets the crop values
 * to match the scan resolution starting at 0,0, and finally calls
 * vpfe_config_ccdc_image_format() to set the values in the ccdc.
 */
static int vpfe_config_image_format(struct vpfe_device *vpfe,
				    v4l2_std_id std_id)
{
	struct v4l2_pix_format *pix = &vpfe->fmt.fmt.pix;
	int i, ret;

	for (i = 0; i < ARRAY_SIZE(vpfe_standards); i++) {
		if (vpfe_standards[i].std_id & std_id) {
			vpfe->std_info.active_pixels =
					vpfe_standards[i].width;
			vpfe->std_info.active_lines =
					vpfe_standards[i].height;
			vpfe->std_info.frame_format =
					vpfe_standards[i].frame_format;
			vpfe->std_index = i;

			break;
		}
	}

	if (i ==  ARRAY_SIZE(vpfe_standards)) {
		vpfe_err(vpfe, "standard not supported\n");
		return -EINVAL;
	}

	vpfe->crop.top = vpfe->crop.left = 0;
	vpfe->crop.width = vpfe->std_info.active_pixels;
	vpfe->crop.height = vpfe->std_info.active_lines;
	pix->width = vpfe->crop.width;
	pix->height = vpfe->crop.height;
	pix->pixelformat = V4L2_PIX_FMT_YUYV;

	/* first field and frame format based on standard frame format */
	if (vpfe->std_info.frame_format)
		pix->field = V4L2_FIELD_INTERLACED;
	else
		pix->field = V4L2_FIELD_NONE;

	ret = __vpfe_get_format(vpfe, &vpfe->fmt, &vpfe->bpp);
	if (ret)
		return ret;

	/* Update the crop window based on found values */
	vpfe->crop.width = pix->width;
	vpfe->crop.height = pix->height;

	return vpfe_config_ccdc_image_format(vpfe);
}

static int vpfe_initialize_device(struct vpfe_device *vpfe)
{
	struct vpfe_subdev_info *sdinfo;
	int ret;

	sdinfo = &vpfe->cfg->sub_devs[0];
	sdinfo->sd = vpfe->sd[0];
	vpfe->current_input = 0;
	vpfe->std_index = 0;
	/* Configure the default format information */
	ret = vpfe_config_image_format(vpfe,
				       vpfe_standards[vpfe->std_index].std_id);
	if (ret)
		return ret;

	pm_runtime_get_sync(vpfe->pdev);

	vpfe_config_enable(&vpfe->ccdc, 1);

	vpfe_ccdc_restore_defaults(&vpfe->ccdc);

	/* Clear all VPFE interrupts */
	vpfe_clear_intr(&vpfe->ccdc, -1);

	return ret;
}

/*
 * vpfe_release : This function is based on the vb2_fop_release
 * helper function.
 * It has been augmented to handle module power management,
 * by disabling/enabling h/w module fcntl clock when necessary.
 */
static int vpfe_release(struct file *file)
{
	struct vpfe_device *vpfe = video_drvdata(file);
	int ret;

	mutex_lock(&vpfe->lock);

	if (v4l2_fh_is_singular_file(file))
		vpfe_ccdc_close(&vpfe->ccdc, vpfe->pdev);
	ret = _vb2_fop_release(file, NULL);

	mutex_unlock(&vpfe->lock);

	return ret;
}

/*
 * vpfe_open : This function is based on the v4l2_fh_open helper function.
 * It has been augmented to handle module power management,
 * by disabling/enabling h/w module fcntl clock when necessary.
 */
static int vpfe_open(struct file *file)
{
	struct vpfe_device *vpfe = video_drvdata(file);
	int ret;

	mutex_lock(&vpfe->lock);

	ret = v4l2_fh_open(file);
	if (ret) {
		vpfe_err(vpfe, "v4l2_fh_open failed\n");
		goto unlock;
	}

	if (!v4l2_fh_is_singular_file(file))
		goto unlock;

	if (vpfe_initialize_device(vpfe)) {
		v4l2_fh_release(file);
		ret = -ENODEV;
	}

unlock:
	mutex_unlock(&vpfe->lock);
	return ret;
}

/**
 * vpfe_schedule_next_buffer: set next buffer address for capture
 * @vpfe : ptr to vpfe device
 *
 * This function will get next buffer from the dma queue and
 * set the buffer address in the vpfe register for capture.
 * the buffer is marked active
 *
 * Assumes caller is holding vpfe->dma_queue_lock already
 */
static inline void vpfe_schedule_next_buffer(struct vpfe_device *vpfe)
{
	vpfe->next_frm = list_entry(vpfe->dma_queue.next,
				    struct vpfe_cap_buffer, list);
	list_del(&vpfe->next_frm->list);

	vpfe_set_sdr_addr(&vpfe->ccdc,
		       vb2_dma_contig_plane_dma_addr(&vpfe->next_frm->vb, 0));
}

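/*
 * vpfe_schedule_bottom_field: program the CCDC DMA address to the start
 * of the bottom field, which lies field_off bytes into the current buffer.
 */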
static inline void vpfe_schedule_bottom_field(struct vpfe_device *vpfe)
{
	unsigned long addr;

	addr = vb2_dma_contig_plane_dma_addr(&vpfe->next_frm->vb, 0) +
					vpfe->field_off;

	vpfe_set_sdr_addr(&vpfe->ccdc, addr);
}

/*
 * vpfe_process_buffer_complete: process a completed buffer
 * @vpfe : ptr to vpfe device
 *
 * This function time stamps the buffer and marks it as DONE. It also
 * wakes up any process waiting on the queue and sets the next buffer
 * as the current one.
 */
static inline void vpfe_process_buffer_complete(struct vpfe_device *vpfe)
{
	v4l2_get_timestamp(&vpfe->cur_frm->vb.v4l2_buf.timestamp);
	vpfe->cur_frm->vb.v4l2_buf.field = vpfe->fmt.fmt.pix.field;
	vpfe->cur_frm->vb.v4l2_buf.sequence = vpfe->sequence++;
	vb2_buffer_done(&vpfe->cur_frm->vb, VB2_BUF_STATE_DONE);
	vpfe->cur_frm = vpfe->next_frm;
}

/*
 * vpfe_isr : ISR handler for vpfe capture (VINT0)
 * @irq: irq number
 * @dev_id: dev_id ptr
 *
 * It changes the status of the captured buffer, takes the next buffer from
 * the queue and sets its address in the VPFE registers.
 */
static irqreturn_t vpfe_isr(int irq, void *dev)
{
	struct vpfe_device *vpfe = (struct vpfe_device *)dev;
	enum v4l2_field field;
	int intr_status;
	int fid;

	intr_status = vpfe_reg_read(&vpfe->ccdc, VPFE_IRQ_STS);

	if (intr_status & VPFE_VDINT0) {
		field = vpfe->fmt.fmt.pix.field;

		if (field == V4L2_FIELD_NONE) {
			/* handle progressive frame capture */
			if (vpfe->cur_frm != vpfe->next_frm)
				vpfe_process_buffer_complete(vpfe);
			goto next_intr;
		}

		/*
		 * interlaced or TB capture: check which field
		 * we are in at the hardware level
		 */
		fid = vpfe_ccdc_getfid(&vpfe->ccdc);

		/* switch the software maintained field id */
		vpfe->field ^= 1;
		if (fid == vpfe->field) {
			/* we are in sync here, continue */
			if (fid == 0) {
				/*
				 * One frame is just being captured. If the
				 * next frame is available, release the
				 * current frame and move on
				 */
				if (vpfe->cur_frm != vpfe->next_frm)
					vpfe_process_buffer_complete(vpfe);
				/*
				 * based on whether the two fields are stored
				 * interleave or separately in memory,
				 * reconfigure the CCDC memory address
				 */
				if (field == V4L2_FIELD_SEQ_TB)
					vpfe_schedule_bottom_field(vpfe);

				goto next_intr;
			}
			/*
			 * If one field is just being captured, configure
			 * the next frame: get the next frame from the dma
			 * queue. If no frame is available, hold on to the
			 * current buffer.
			 */
			spin_lock(&vpfe->dma_queue_lock);
			if (!list_empty(&vpfe->dma_queue) &&
			    vpfe->cur_frm == vpfe->next_frm)
				vpfe_schedule_next_buffer(vpfe);
			spin_unlock(&vpfe->dma_queue_lock);
		} else if (fid == 0) {
			/*
			 * out of sync. Recover from any hardware out-of-sync;
			 * we may lose one frame.
			 */
			vpfe->field = fid;
		}
	}

next_intr:
	if (intr_status & VPFE_VDINT1) {
		spin_lock(&vpfe->dma_queue_lock);
		if (vpfe->fmt.fmt.pix.field == V4L2_FIELD_NONE &&
		    !list_empty(&vpfe->dma_queue) &&
		    vpfe->cur_frm == vpfe->next_frm)
			vpfe_schedule_next_buffer(vpfe);
		spin_unlock(&vpfe->dma_queue_lock);
	}

	vpfe_clear_intr(&vpfe->ccdc, intr_status);

	return IRQ_HANDLED;
}

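/*
 * Frame interrupt enable/disable helpers: VDINT0 is always used, and
 * VDINT1 is used in addition for progressive capture.
 */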
static inline void vpfe_detach_irq(struct vpfe_device *vpfe)
{
	unsigned int intr = VPFE_VDINT0;
	enum ccdc_frmfmt frame_format;

	frame_format = vpfe_ccdc_get_frame_format(&vpfe->ccdc);
	if (frame_format == CCDC_FRMFMT_PROGRESSIVE)
		intr |= VPFE_VDINT1;

	vpfe_reg_write(&vpfe->ccdc, intr, VPFE_IRQ_EN_CLR);
}

static inline void vpfe_attach_irq(struct vpfe_device *vpfe)
{
	unsigned int intr = VPFE_VDINT0;
	enum ccdc_frmfmt frame_format;

	frame_format = vpfe_ccdc_get_frame_format(&vpfe->ccdc);
	if (frame_format == CCDC_FRMFMT_PROGRESSIVE)
		intr |= VPFE_VDINT1;

	vpfe_reg_write(&vpfe->ccdc, intr, VPFE_IRQ_EN_SET);
}

static int vpfe_querycap(struct file *file, void  *priv,
			 struct v4l2_capability *cap)
{
	struct vpfe_device *vpfe = video_drvdata(file);

	vpfe_dbg(2, vpfe, "vpfe_querycap\n");

	strlcpy(cap->driver, VPFE_MODULE_NAME, sizeof(cap->driver));
	strlcpy(cap->card, "TI AM437x VPFE", sizeof(cap->card));
	snprintf(cap->bus_info, sizeof(cap->bus_info),
			"platform:%s", vpfe->v4l2_dev.name);
	cap->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING |
			    V4L2_CAP_READWRITE;
	cap->capabilities = cap->device_caps | V4L2_CAP_DEVICE_CAPS;

	return 0;
}

/* get the format set at output pad of the adjacent subdev */
static int __vpfe_get_format(struct vpfe_device *vpfe,
			     struct v4l2_format *format, unsigned int *bpp)
{
	struct v4l2_mbus_framefmt mbus_fmt;
	struct vpfe_subdev_info *sdinfo;
	struct v4l2_subdev_format fmt;
	int ret;

	sdinfo = vpfe->current_subdev;
	if (!sdinfo->sd)
		return -EINVAL;

	fmt.which = V4L2_SUBDEV_FORMAT_ACTIVE;
	fmt.pad = 0;

	ret = v4l2_subdev_call(sdinfo->sd, pad, get_fmt, NULL, &fmt);
	if (ret && ret != -ENOIOCTLCMD && ret != -ENODEV)
		return ret;

	if (!ret) {
		v4l2_fill_pix_format(&format->fmt.pix, &fmt.format);
		mbus_to_pix(vpfe, &fmt.format, &format->fmt.pix, bpp);
	} else {
		ret = v4l2_device_call_until_err(&vpfe->v4l2_dev,
						 sdinfo->grp_id,
						 video, g_mbus_fmt,
						 &mbus_fmt);
		if (ret && ret != -ENOIOCTLCMD && ret != -ENODEV)
			return ret;
		v4l2_fill_pix_format(&format->fmt.pix, &mbus_fmt);
		mbus_to_pix(vpfe, &mbus_fmt, &format->fmt.pix, bpp);
	}

	format->type = vpfe->fmt.type;

	vpfe_dbg(1, vpfe,
		 "%s size %dx%d (%s) bytesperline = %d, size = %d, bpp = %d\n",
		 __func__, format->fmt.pix.width, format->fmt.pix.height,
		 print_fourcc(format->fmt.pix.pixelformat),
		 format->fmt.pix.bytesperline, format->fmt.pix.sizeimage, *bpp);

	return 0;
}

/* set the format at output pad of the adjacent subdev */
static int __vpfe_set_format(struct vpfe_device *vpfe,
			     struct v4l2_format *format, unsigned int *bpp)
{
	struct v4l2_mbus_framefmt mbus_fmt;
	struct vpfe_subdev_info *sdinfo;
	struct v4l2_subdev_format fmt;
	int ret;

	vpfe_dbg(2, vpfe, "__vpfe_set_format\n");

	sdinfo = vpfe->current_subdev;
	if (!sdinfo->sd)
		return -EINVAL;

	fmt.which = V4L2_SUBDEV_FORMAT_ACTIVE;
	fmt.pad = 0;

	pix_to_mbus(vpfe, &format->fmt.pix, &fmt.format);

	ret = v4l2_subdev_call(sdinfo->sd, pad, set_fmt, NULL, &fmt);
	if (ret && ret != -ENOIOCTLCMD && ret != -ENODEV)
		return ret;

	if (!ret) {
		v4l2_fill_pix_format(&format->fmt.pix, &fmt.format);
		mbus_to_pix(vpfe, &fmt.format, &format->fmt.pix, bpp);
	} else {
		ret = v4l2_device_call_until_err(&vpfe->v4l2_dev,
						 sdinfo->grp_id,
						 video, s_mbus_fmt,
						 &mbus_fmt);
		if (ret && ret != -ENOIOCTLCMD && ret != -ENODEV)
			return ret;

		v4l2_fill_pix_format(&format->fmt.pix, &mbus_fmt);
		mbus_to_pix(vpfe, &mbus_fmt, &format->fmt.pix, bpp);
	}

	format->type = vpfe->fmt.type;

	vpfe_dbg(1, vpfe,
		 "%s size %dx%d (%s) bytesperline = %d, size = %d, bpp = %d\n",
		 __func__,  format->fmt.pix.width, format->fmt.pix.height,
		 print_fourcc(format->fmt.pix.pixelformat),
		 format->fmt.pix.bytesperline, format->fmt.pix.sizeimage, *bpp);

	return 0;
}

static int vpfe_g_fmt(struct file *file, void *priv,
		      struct v4l2_format *fmt)
{
	struct vpfe_device *vpfe = video_drvdata(file);

	vpfe_dbg(2, vpfe, "vpfe_g_fmt\n");

	*fmt = vpfe->fmt;

	return 0;
}

static int vpfe_enum_fmt(struct file *file, void  *priv,
			 struct v4l2_fmtdesc *f)
{
	struct vpfe_device *vpfe = video_drvdata(file);
	struct vpfe_subdev_info *sdinfo;
	struct vpfe_fmt *fmt = NULL;
	unsigned int k;

	vpfe_dbg(2, vpfe, "vpfe_enum_format index:%d\n",
		f->index);

	sdinfo = vpfe->current_subdev;
	if (!sdinfo->sd)
		return -EINVAL;

	if (f->index >= ARRAY_SIZE(formats))
		return -EINVAL;

	for (k = 0; k < ARRAY_SIZE(formats); k++) {
		if (formats[k].index == f->index) {
			fmt = &formats[k];
			break;
		}
	}
	if (!fmt)
		return -EINVAL;

	strncpy(f->description, fmt->name, sizeof(f->description) - 1);
	f->pixelformat = fmt->fourcc;
	f->type = vpfe->fmt.type;

	vpfe_dbg(1, vpfe, "vpfe_enum_format: mbus index: %d code: %x pixelformat: %s [%s]\n",
		f->index, fmt->code, print_fourcc(fmt->fourcc), fmt->name);

	return 0;
}

static int vpfe_try_fmt(struct file *file, void *priv,
			struct v4l2_format *fmt)
{
	struct vpfe_device *vpfe = video_drvdata(file);
	unsigned int bpp;

	vpfe_dbg(2, vpfe, "vpfe_try_fmt\n");

	return __vpfe_get_format(vpfe, fmt, &bpp);
}

static int vpfe_s_fmt(struct file *file, void *priv,
		      struct v4l2_format *fmt)
{
	struct vpfe_device *vpfe = video_drvdata(file);
	struct v4l2_format format;
	unsigned int bpp;
	int ret;

	vpfe_dbg(2, vpfe, "vpfe_s_fmt\n");

	/* If streaming is started, return error */
	if (vb2_is_busy(&vpfe->buffer_queue)) {
		vpfe_err(vpfe, "%s device busy\n", __func__);
		return -EBUSY;
	}

	/* Get the current subdev (sensor) format to compare against */
	ret = __vpfe_get_format(vpfe, &format, &bpp);
	if (ret)
		return ret;

	if (!cmp_v4l2_format(fmt, &format)) {
		/* Sensor format is different from the requested format
		 * so we need to change it
		 */
		ret = __vpfe_set_format(vpfe, fmt, &bpp);
		if (ret)
			return ret;
	} else /* Just make sure all of the fields are consistent */
		*fmt = format;

	/* First detach any IRQ if currently attached */
	vpfe_detach_irq(vpfe);
	vpfe->fmt = *fmt;
	vpfe->bpp = bpp;

	/* Update the crop window based on found values */
	vpfe->crop.width = fmt->fmt.pix.width;
	vpfe->crop.height = fmt->fmt.pix.height;

	/* set image capture parameters in the ccdc */
	return vpfe_config_ccdc_image_format(vpfe);
}

static int vpfe_enum_size(struct file *file, void  *priv,
			  struct v4l2_frmsizeenum *fsize)
{
	struct vpfe_device *vpfe = video_drvdata(file);
	struct v4l2_subdev_frame_size_enum fse;
	struct vpfe_subdev_info *sdinfo;
	struct v4l2_mbus_framefmt mbus;
	struct v4l2_pix_format pix;
	struct vpfe_fmt *fmt;
	int ret;

	vpfe_dbg(2, vpfe, "vpfe_enum_size\n");

	/* check for valid format */
	fmt = find_format_by_pix(fsize->pixel_format);
	if (!fmt) {
		vpfe_dbg(3, vpfe, "Invalid pixel code: %x, default used instead\n",
			fsize->pixel_format);
		return -EINVAL;
	}

	memset(fsize->reserved, 0x0, sizeof(fsize->reserved));

	sdinfo = vpfe->current_subdev;
	if (!sdinfo->sd)
		return -EINVAL;

	memset(&pix, 0x0, sizeof(pix));
	/* Construct pix from parameter and use default for the rest */
	pix.pixelformat = fsize->pixel_format;
	pix.width = 640;
	pix.height = 480;
	pix.colorspace = V4L2_COLORSPACE_SRGB;
	pix.field = V4L2_FIELD_NONE;
	pix_to_mbus(vpfe, &pix, &mbus);

	memset(&fse, 0x0, sizeof(fse));
	fse.index = fsize->index;
	fse.pad = 0;
	fse.code = mbus.code;
	fse.which = V4L2_SUBDEV_FORMAT_ACTIVE;
	ret = v4l2_subdev_call(sdinfo->sd, pad, enum_frame_size, NULL, &fse);
	if (ret)
		return -EINVAL;

	vpfe_dbg(1, vpfe, "vpfe_enum_size: index: %d code: %x W:[%d,%d] H:[%d,%d]\n",
		fse.index, fse.code, fse.min_width, fse.max_width,
		fse.min_height, fse.max_height);

	fsize->type = V4L2_FRMSIZE_TYPE_DISCRETE;
	fsize->discrete.width = fse.max_width;
	fsize->discrete.height = fse.max_height;

	vpfe_dbg(1, vpfe, "vpfe_enum_size: index: %d pixformat: %s size: %dx%d\n",
		fsize->index, print_fourcc(fsize->pixel_format),
		fsize->discrete.width, fsize->discrete.height);

	return 0;
}

/*
 * vpfe_get_subdev_input_index - Get subdev index and subdev input index for a
 * given app input index
 */
static int
vpfe_get_subdev_input_index(struct vpfe_device *vpfe,
			    int *subdev_index,
			    int *subdev_input_index,
			    int app_input_index)
{
	struct vpfe_config *cfg = vpfe->cfg;
	struct vpfe_subdev_info *sdinfo;
	int i, j = 0;

	for (i = 0; i < ARRAY_SIZE(vpfe->cfg->asd); i++) {
		sdinfo = &cfg->sub_devs[i];
		if (app_input_index < (j + 1)) {
			*subdev_index = i;
			*subdev_input_index = app_input_index - j;
			return 0;
		}
		j++;
	}
	return -EINVAL;
}

/*
 * vpfe_get_app_input_index - Get the application input index for the current
 * sub device input. The driver stores the input index of the current sub
 * device and translates it when the application requests the current input.
 */
static int vpfe_get_app_input_index(struct vpfe_device *vpfe,
				    int *app_input_index)
{
	struct vpfe_config *cfg = vpfe->cfg;
	struct vpfe_subdev_info *sdinfo;
	struct i2c_client *client;
	struct i2c_client *curr_client;
	int i, j = 0;

	curr_client = v4l2_get_subdevdata(vpfe->current_subdev->sd);
	for (i = 0; i < ARRAY_SIZE(vpfe->cfg->asd); i++) {
		sdinfo = &cfg->sub_devs[i];
		client = v4l2_get_subdevdata(sdinfo->sd);
		if (client->addr == curr_client->addr &&
		    client->adapter->nr == curr_client->adapter->nr) {
			if (vpfe->current_input >= 1)
				return -1;
			*app_input_index = j + vpfe->current_input;
			return 0;
		}
		j++;
	}
	return -EINVAL;
}

static int vpfe_enum_input(struct file *file, void *priv,
			   struct v4l2_input *inp)
{
	struct vpfe_device *vpfe = video_drvdata(file);
	struct vpfe_subdev_info *sdinfo;
	int subdev, index;

	vpfe_dbg(2, vpfe, "vpfe_enum_input\n");

	if (vpfe_get_subdev_input_index(vpfe, &subdev, &index,
					inp->index) < 0) {
		vpfe_dbg(1, vpfe,
			"input information not found for the subdev\n");
		return -EINVAL;
	}
	sdinfo = &vpfe->cfg->sub_devs[subdev];
	*inp = sdinfo->inputs[index];

	return 0;
}

static int vpfe_g_input(struct file *file, void *priv, unsigned int *index)
{
	struct vpfe_device *vpfe = video_drvdata(file);

	vpfe_dbg(2, vpfe, "vpfe_g_input\n");

	return vpfe_get_app_input_index(vpfe, index);
}

/* Assumes caller is holding vpfe_dev->lock */
static int vpfe_set_input(struct vpfe_device *vpfe, unsigned int index)
{
	int subdev_index = 0, inp_index = 0;
	struct vpfe_subdev_info *sdinfo;
	struct vpfe_route *route;
	u32 input, output;
	int ret;

	vpfe_dbg(2, vpfe, "vpfe_set_input: index: %d\n", index);

	/* If streaming is started, return error */
	if (vb2_is_busy(&vpfe->buffer_queue)) {
		vpfe_err(vpfe, "%s device busy\n", __func__);
		return -EBUSY;
	}
	ret = vpfe_get_subdev_input_index(vpfe,
					  &subdev_index,
					  &inp_index,
					  index);
	if (ret < 0) {
		vpfe_err(vpfe, "invalid input index: %d\n", index);
		goto get_out;
	}

	sdinfo = &vpfe->cfg->sub_devs[subdev_index];
	sdinfo->sd = vpfe->sd[subdev_index];
	route = &sdinfo->routes[inp_index];
	if (route && sdinfo->can_route) {
		input = route->input;
		output = route->output;
		if (sdinfo->sd) {
			ret = v4l2_subdev_call(sdinfo->sd, video,
					s_routing, input, output, 0);
			if (ret) {
				vpfe_err(vpfe, "s_routing failed\n");
				ret = -EINVAL;
				goto get_out;
			}
		}

	}

	vpfe->current_subdev = sdinfo;
	if (sdinfo->sd)
		vpfe->v4l2_dev.ctrl_handler = sdinfo->sd->ctrl_handler;
	vpfe->current_input = index;
	vpfe->std_index = 0;

	/* set the bus/interface parameter for the sub device in ccdc */
	ret = vpfe_ccdc_set_hw_if_params(&vpfe->ccdc, &sdinfo->vpfe_param);
	if (ret)
		return ret;

	/* set the default image parameters in the device */
	return vpfe_config_image_format(vpfe,
					vpfe_standards[vpfe->std_index].std_id);

get_out:
	return ret;
}

static int vpfe_s_input(struct file *file, void *priv, unsigned int index)
{
	struct vpfe_device *vpfe = video_drvdata(file);

	vpfe_dbg(2, vpfe,
		"vpfe_s_input: index: %d\n", index);

	return vpfe_set_input(vpfe, index);
}

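/* VIDIOC_QUERYSTD handler: delegate standard detection to the decoder */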
static int vpfe_querystd(struct file *file, void *priv, v4l2_std_id *std_id)
{
	struct vpfe_device *vpfe = video_drvdata(file);
	struct vpfe_subdev_info *sdinfo;

	vpfe_dbg(2, vpfe, "vpfe_querystd\n");

	sdinfo = vpfe->current_subdev;
	if (!(sdinfo->inputs[0].capabilities & V4L2_IN_CAP_STD))
		return -ENODATA;

	/* Call querystd function of decoder device */
	return v4l2_device_call_until_err(&vpfe->v4l2_dev, sdinfo->grp_id,
					 video, querystd, std_id);
}

static int vpfe_s_std(struct file *file, void *priv, v4l2_std_id std_id)
{
	struct vpfe_device *vpfe = video_drvdata(file);
	struct vpfe_subdev_info *sdinfo;
	int ret;

	vpfe_dbg(2, vpfe, "vpfe_s_std\n");

	sdinfo = vpfe->current_subdev;
	if (!(sdinfo->inputs[0].capabilities & V4L2_IN_CAP_STD))
		return -ENODATA;

	/* If streaming is started, return error */
	if (vb2_is_busy(&vpfe->buffer_queue)) {
		vpfe_err(vpfe, "%s device busy\n", __func__);
		return -EBUSY;
	}

	ret = v4l2_device_call_until_err(&vpfe->v4l2_dev, sdinfo->grp_id,
					 video, s_std, std_id);
	if (ret < 0) {
		vpfe_err(vpfe, "Failed to set standard\n");
		return ret;
	}
	return vpfe_config_image_format(vpfe, std_id);
}

static int vpfe_g_std(struct file *file, void *priv, v4l2_std_id *std_id)
{
	struct vpfe_device *vpfe = video_drvdata(file);
	struct vpfe_subdev_info *sdinfo;

	vpfe_dbg(2, vpfe, "vpfe_g_std\n");

	sdinfo = vpfe->current_subdev;
	if (!(sdinfo->inputs[0].capabilities & V4L2_IN_CAP_STD))
		return -ENODATA;

	*std_id = vpfe_standards[vpfe->std_index].std_id;

	return 0;
}

/*
 * vpfe_calculate_offsets : This function calculates the buffer offset
 * for the top and bottom fields
 */
static void vpfe_calculate_offsets(struct vpfe_device *vpfe)
{
	struct v4l2_rect image_win;

	vpfe_dbg(2, vpfe, "vpfe_calculate_offsets\n");

	vpfe_ccdc_get_image_window(&vpfe->ccdc, &image_win);
	vpfe->field_off = image_win.height * image_win.width;
}

/*
 * vpfe_queue_setup - Callback function for buffer setup.
 * @vq: vb2_queue ptr
 * @fmt: v4l2 format of the requested buffers, may be NULL
 * @nbuffers: ptr to number of buffers requested by application
 * @nplanes: contains number of distinct video planes needed to hold a frame
 * @sizes[]: contains the size (in bytes) of each plane.
 * @alloc_ctxs: ptr to allocation context
 *
 * This callback function is called from VIDIOC_REQBUFS and
 * VIDIOC_CREATE_BUFS to adjust the buffer count and buffer size.
 */
static int vpfe_queue_setup(struct vb2_queue *vq,
			    const struct v4l2_format *fmt,
			    unsigned int *nbuffers, unsigned int *nplanes,
			    unsigned int sizes[], void *alloc_ctxs[])
{
	struct vpfe_device *vpfe = vb2_get_drv_priv(vq);

	if (fmt && fmt->fmt.pix.sizeimage < vpfe->fmt.fmt.pix.sizeimage)
		return -EINVAL;

	if (vq->num_buffers + *nbuffers < 3)
		*nbuffers = 3 - vq->num_buffers;

	*nplanes = 1;
	sizes[0] = fmt ? fmt->fmt.pix.sizeimage : vpfe->fmt.fmt.pix.sizeimage;
	alloc_ctxs[0] = vpfe->alloc_ctx;

	vpfe_dbg(1, vpfe,
		"nbuffers=%d, size=%u\n", *nbuffers, sizes[0]);

	/* Calculate field offset */
	vpfe_calculate_offsets(vpfe);

	return 0;
}

/*
 * vpfe_buffer_prepare : callback function for buffer prepare
 * @vb: ptr to vb2_buffer
 *
 * This is the callback function for buffer prepare when vb2_qbuf()
 * is called. It sets the plane payload and buffer field and checks that
 * the payload fits within the allocated plane size.
 */
static int vpfe_buffer_prepare(struct vb2_buffer *vb)
{
	struct vpfe_device *vpfe = vb2_get_drv_priv(vb->vb2_queue);

	vb2_set_plane_payload(vb, 0, vpfe->fmt.fmt.pix.sizeimage);

	if (vb2_get_plane_payload(vb, 0) > vb2_plane_size(vb, 0))
		return -EINVAL;

	vb->v4l2_buf.field = vpfe->fmt.fmt.pix.field;

	return 0;
}

/*
 * vpfe_buffer_queue : Callback function to add buffer to DMA queue
 * @vb: ptr to vb2_buffer
 */
static void vpfe_buffer_queue(struct vb2_buffer *vb)
{
	struct vpfe_device *vpfe = vb2_get_drv_priv(vb->vb2_queue);
	struct vpfe_cap_buffer *buf = to_vpfe_buffer(vb);
	unsigned long flags = 0;

	/* add the buffer to the DMA queue */
	spin_lock_irqsave(&vpfe->dma_queue_lock, flags);
	list_add_tail(&buf->list, &vpfe->dma_queue);
	spin_unlock_irqrestore(&vpfe->dma_queue_lock, flags);
}

/*
 * vpfe_start_streaming : Starts the DMA engine for streaming
 * @vq: ptr to vb2_queue
 * @count: number of buffers
 */
static int vpfe_start_streaming(struct vb2_queue *vq, unsigned int count)
{
	struct vpfe_device *vpfe = vb2_get_drv_priv(vq);
	struct vpfe_cap_buffer *buf, *tmp;
	struct vpfe_subdev_info *sdinfo;
	unsigned long flags;
	unsigned long addr;
	int ret;

	spin_lock_irqsave(&vpfe->dma_queue_lock, flags);

	vpfe->field = 0;
	vpfe->sequence = 0;

	sdinfo = vpfe->current_subdev;

	vpfe_attach_irq(vpfe);

	if (vpfe->ccdc.ccdc_cfg.if_type == VPFE_RAW_BAYER)
		vpfe_ccdc_config_raw(&vpfe->ccdc);
	else
		vpfe_ccdc_config_ycbcr(&vpfe->ccdc);

	/* Get the next frame from the buffer queue */
	vpfe->next_frm = list_entry(vpfe->dma_queue.next,
				    struct vpfe_cap_buffer, list);
	vpfe->cur_frm = vpfe->next_frm;
	/* Remove buffer from the buffer queue */
	list_del(&vpfe->cur_frm->list);
	spin_unlock_irqrestore(&vpfe->dma_queue_lock, flags);

	addr = vb2_dma_contig_plane_dma_addr(&vpfe->cur_frm->vb, 0);

	vpfe_set_sdr_addr(&vpfe->ccdc, (unsigned long)(addr));

	vpfe_pcr_enable(&vpfe->ccdc, 1);

	ret = v4l2_subdev_call(sdinfo->sd, video, s_stream, 1);
	if (ret < 0) {
		vpfe_err(vpfe, "Error starting streaming on the sub device\n");
		goto err;
	}

	return 0;

err:
	list_for_each_entry_safe(buf, tmp, &vpfe->dma_queue, list) {
		list_del(&buf->list);
		vb2_buffer_done(&buf->vb, VB2_BUF_STATE_QUEUED);
	}

	return ret;
}

/*
 * vpfe_stop_streaming : Stop the DMA engine
 * @vq: ptr to vb2_queue
 *
 * This callback stops the DMA engine and releases any remaining
 * buffers in the DMA queue.
 */
static void vpfe_stop_streaming(struct vb2_queue *vq)
{
	struct vpfe_device *vpfe = vb2_get_drv_priv(vq);
	struct vpfe_subdev_info *sdinfo;
	unsigned long flags;
	int ret;

	vpfe_pcr_enable(&vpfe->ccdc, 0);

	vpfe_detach_irq(vpfe);

	sdinfo = vpfe->current_subdev;
	ret = v4l2_subdev_call(sdinfo->sd, video, s_stream, 0);
	if (ret && ret != -ENOIOCTLCMD && ret != -ENODEV)
		vpfe_dbg(1, vpfe, "stream off failed in subdev\n");

	/* release all active buffers */
	spin_lock_irqsave(&vpfe->dma_queue_lock, flags);
	if (vpfe->cur_frm == vpfe->next_frm) {
		vb2_buffer_done(&vpfe->cur_frm->vb, VB2_BUF_STATE_ERROR);
	} else {
		if (vpfe->cur_frm != NULL)
			vb2_buffer_done(&vpfe->cur_frm->vb,
					VB2_BUF_STATE_ERROR);
		if (vpfe->next_frm != NULL)
			vb2_buffer_done(&vpfe->next_frm->vb,
					VB2_BUF_STATE_ERROR);
	}

	while (!list_empty(&vpfe->dma_queue)) {
		vpfe->next_frm = list_entry(vpfe->dma_queue.next,
						struct vpfe_cap_buffer, list);
		list_del(&vpfe->next_frm->list);
		vb2_buffer_done(&vpfe->next_frm->vb, VB2_BUF_STATE_ERROR);
	}
	spin_unlock_irqrestore(&vpfe->dma_queue_lock, flags);
}

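/*
 * VIDIOC_CROPCAP handler: report the bounds, default rectangle and pixel
 * aspect for the currently selected standard.
 */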
static int vpfe_cropcap(struct file *file, void *priv,
			struct v4l2_cropcap *crop)
{
	struct vpfe_device *vpfe = video_drvdata(file);

	vpfe_dbg(2, vpfe, "vpfe_cropcap\n");

	if (vpfe->std_index >= ARRAY_SIZE(vpfe_standards))
		return -EINVAL;

	memset(crop, 0, sizeof(struct v4l2_cropcap));

	crop->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
	crop->defrect.width = vpfe_standards[vpfe->std_index].width;
	crop->bounds.width = crop->defrect.width;
	crop->defrect.height = vpfe_standards[vpfe->std_index].height;
	crop->bounds.height = crop->defrect.height;
	crop->pixelaspect = vpfe_standards[vpfe->std_index].pixelaspect;

	return 0;
}

static int
vpfe_g_selection(struct file *file, void *fh, struct v4l2_selection *s)
{
	struct vpfe_device *vpfe = video_drvdata(file);

	switch (s->target) {
	case V4L2_SEL_TGT_CROP_BOUNDS:
	case V4L2_SEL_TGT_CROP_DEFAULT:
		s->r.left = s->r.top = 0;
		s->r.width = vpfe->crop.width;
		s->r.height = vpfe->crop.height;
		break;

	case V4L2_SEL_TGT_CROP:
		s->r = vpfe->crop;
		break;

	default:
		return -EINVAL;
	}

	return 0;
}

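/* Return 1 if rectangle a is entirely contained within rectangle b */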
static int enclosed_rectangle(struct v4l2_rect *a, struct v4l2_rect *b)
{
	if (a->left < b->left || a->top < b->top)
		return 0;

	if (a->left + a->width > b->left + b->width)
		return 0;

	if (a->top + a->height > b->top + b->height)
		return 0;

	return 1;
}

static int
vpfe_s_selection(struct file *file, void *fh, struct v4l2_selection *s)
{
	struct vpfe_device *vpfe = video_drvdata(file);
	struct v4l2_rect cr = vpfe->crop;
	struct v4l2_rect r = s->r;

	/* If streaming is started, return error */
	if (vb2_is_busy(&vpfe->buffer_queue)) {
		vpfe_err(vpfe, "%s device busy\n", __func__);
		return -EBUSY;
	}

	if (s->type != V4L2_BUF_TYPE_VIDEO_CAPTURE ||
			s->target != V4L2_SEL_TGT_CROP)
		return -EINVAL;

	v4l_bound_align_image(&r.width, 0, cr.width, 0,
			      &r.height, 0, cr.height, 0, 0);

	r.left = clamp_t(unsigned int, r.left, 0, cr.width - r.width);
	r.top  = clamp_t(unsigned int, r.top, 0, cr.height - r.height);

	if (s->flags & V4L2_SEL_FLAG_LE && !enclosed_rectangle(&r, &s->r))
		return -ERANGE;

	if (s->flags & V4L2_SEL_FLAG_GE && !enclosed_rectangle(&s->r, &r))
		return -ERANGE;

	s->r = vpfe->crop = r;

	vpfe_ccdc_set_image_window(&vpfe->ccdc, &r, vpfe->bpp);
	vpfe->fmt.fmt.pix.width = r.width;
	vpfe->fmt.fmt.pix.height = r.height;
	vpfe->fmt.fmt.pix.bytesperline = vpfe_ccdc_get_line_length(&vpfe->ccdc);
	vpfe->fmt.fmt.pix.sizeimage = vpfe->fmt.fmt.pix.bytesperline *
						vpfe->fmt.fmt.pix.height;

	vpfe_dbg(1, vpfe, "cropped (%d,%d)/%dx%d of %dx%d\n",
		 r.left, r.top, r.width, r.height, cr.width, cr.height);

	return 0;
}

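/*
 * Driver specific ioctl handler: VIDIOC_AM437X_CCDC_CFG passes raw CCDC
 * configuration parameters from user space while the queue is idle.
 */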
static long vpfe_ioctl_default(struct file *file, void *priv,
			       bool valid_prio, unsigned int cmd, void *param)
{
	struct vpfe_device *vpfe = video_drvdata(file);
	int ret;

	vpfe_dbg(2, vpfe, "vpfe_ioctl_default\n");

	if (!valid_prio) {
		vpfe_err(vpfe, "%s priority check failed\n", __func__);
		return -EBUSY;
	}

	/* If streaming is started, return error */
	if (vb2_is_busy(&vpfe->buffer_queue)) {
		vpfe_err(vpfe, "%s device busy\n", __func__);
		return -EBUSY;
	}

	switch (cmd) {
	case VIDIOC_AM437X_CCDC_CFG:
		ret = vpfe_ccdc_set_params(&vpfe->ccdc, (void __user *)param);
		if (ret) {
			vpfe_dbg(2, vpfe,
				"Error setting parameters in CCDC\n");
			return ret;
		}
		ret = vpfe_get_ccdc_image_format(vpfe,
						 &vpfe->fmt);
		if (ret < 0) {
			vpfe_dbg(2, vpfe,
				"Invalid image format at CCDC\n");
			return ret;
		}
		break;

	default:
		ret = -ENOTTY;
		break;
	}

	return ret;
}

static const struct vb2_ops vpfe_video_qops = {
	.wait_prepare		= vb2_ops_wait_prepare,
	.wait_finish		= vb2_ops_wait_finish,
	.queue_setup		= vpfe_queue_setup,
	.buf_prepare		= vpfe_buffer_prepare,
	.buf_queue		= vpfe_buffer_queue,
	.start_streaming	= vpfe_start_streaming,
	.stop_streaming		= vpfe_stop_streaming,
};

/* vpfe capture driver file operations */
static const struct v4l2_file_operations vpfe_fops = {
	.owner		= THIS_MODULE,
	.open		= vpfe_open,
	.release	= vpfe_release,
	.read		= vb2_fop_read,
	.poll		= vb2_fop_poll,
	.unlocked_ioctl	= video_ioctl2,
	.mmap		= vb2_fop_mmap,
};

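/*
 * The ioctl table below plugs these handlers into the standard V4L2
 * capture flow. A minimal user space sketch (illustrative only; the
 * device node name is assumed and error handling is omitted):
 *
 *	int fd = open("/dev/video0", O_RDWR);
 *	struct v4l2_format fmt = { .type = V4L2_BUF_TYPE_VIDEO_CAPTURE };
 *	ioctl(fd, VIDIOC_G_FMT, &fmt);
 *	ioctl(fd, VIDIOC_S_FMT, &fmt);
 *	struct v4l2_requestbuffers req = {
 *		.count = 3,
 *		.type = V4L2_BUF_TYPE_VIDEO_CAPTURE,
 *		.memory = V4L2_MEMORY_MMAP,
 *	};
 *	ioctl(fd, VIDIOC_REQBUFS, &req);
 *	followed by mmap and VIDIOC_QBUF for each buffer, then VIDIOC_STREAMON
 */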
/* vpfe capture ioctl operations */
static const struct v4l2_ioctl_ops vpfe_ioctl_ops = {
	.vidioc_querycap		= vpfe_querycap,
	.vidioc_enum_fmt_vid_cap	= vpfe_enum_fmt,
	.vidioc_g_fmt_vid_cap		= vpfe_g_fmt,
	.vidioc_s_fmt_vid_cap		= vpfe_s_fmt,
	.vidioc_try_fmt_vid_cap		= vpfe_try_fmt,

	.vidioc_enum_framesizes		= vpfe_enum_size,

	.vidioc_enum_input		= vpfe_enum_input,
	.vidioc_g_input			= vpfe_g_input,
	.vidioc_s_input			= vpfe_s_input,

	.vidioc_querystd		= vpfe_querystd,
	.vidioc_s_std			= vpfe_s_std,
	.vidioc_g_std			= vpfe_g_std,

	.vidioc_reqbufs			= vb2_ioctl_reqbufs,
	.vidioc_create_bufs		= vb2_ioctl_create_bufs,
	.vidioc_prepare_buf		= vb2_ioctl_prepare_buf,
	.vidioc_querybuf		= vb2_ioctl_querybuf,
	.vidioc_qbuf			= vb2_ioctl_qbuf,
	.vidioc_dqbuf			= vb2_ioctl_dqbuf,
	.vidioc_expbuf			= vb2_ioctl_expbuf,
	.vidioc_streamon		= vb2_ioctl_streamon,
	.vidioc_streamoff		= vb2_ioctl_streamoff,

	.vidioc_log_status		= v4l2_ctrl_log_status,
	.vidioc_subscribe_event		= v4l2_ctrl_subscribe_event,
	.vidioc_unsubscribe_event	= v4l2_event_unsubscribe,

	.vidioc_cropcap			= vpfe_cropcap,
	.vidioc_g_selection		= vpfe_g_selection,
	.vidioc_s_selection		= vpfe_s_selection,

	.vidioc_default			= vpfe_ioctl_default,
};

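/*
 * vpfe_async_bound : match a newly probed sub device against the parsed
 * endpoint list and record which media bus formats it supports.
 */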
static int
vpfe_async_bound(struct v4l2_async_notifier *notifier,
		 struct v4l2_subdev *subdev,
		 struct v4l2_async_subdev *asd)
{
	struct vpfe_device *vpfe = container_of(notifier->v4l2_dev,
					       struct vpfe_device, v4l2_dev);
	struct v4l2_subdev_mbus_code_enum mbus_code;
	struct vpfe_subdev_info *sdinfo;
	bool found = false;
	int i, j;

	vpfe_dbg(1, vpfe, "vpfe_async_bound\n");

	for (i = 0; i < ARRAY_SIZE(vpfe->cfg->asd); i++) {
		if (vpfe->cfg->asd[i]->match.of.node == asd->match.of.node) {
			sdinfo = &vpfe->cfg->sub_devs[i];
			vpfe->sd[i] = subdev;
			vpfe->sd[i]->grp_id = sdinfo->grp_id;
			found = true;
			break;
		}
	}

	if (!found) {
		vpfe_info(vpfe, "sub device (%s) not matched\n", subdev->name);
		return -EINVAL;
	}

	vpfe->video_dev.tvnorms |= sdinfo->inputs[0].std;

	/* setup the supported formats & indexes */
	for (j = 0, i = 0; ; ++j) {
		struct vpfe_fmt *fmt;
		int ret;

		memset(&mbus_code, 0, sizeof(mbus_code));
		mbus_code.index = j;
		mbus_code.which = V4L2_SUBDEV_FORMAT_ACTIVE;
		ret = v4l2_subdev_call(subdev, pad, enum_mbus_code,
			       NULL, &mbus_code);
		if (ret)
			break;

		fmt = find_format_by_code(mbus_code.code);
		if (!fmt)
			continue;

		fmt->supported = true;
		fmt->index = i++;
	}

	return 0;
}

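/*
 * vpfe_probe_complete : called once all async sub devices have bound;
 * sets up the vb2 queue and registers the video device node.
 */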
static int vpfe_probe_complete(struct vpfe_device *vpfe)
{
	struct video_device *vdev;
	struct vb2_queue *q;
	int err;

	spin_lock_init(&vpfe->dma_queue_lock);
	mutex_init(&vpfe->lock);

	vpfe->fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;

	/* set first sub device as current one */
	vpfe->current_subdev = &vpfe->cfg->sub_devs[0];
	vpfe->v4l2_dev.ctrl_handler = vpfe->sd[0]->ctrl_handler;

	err = vpfe_set_input(vpfe, 0);
	if (err)
		goto probe_out;

	/* Initialize videobuf2 queue as per the buffer type */
	vpfe->alloc_ctx = vb2_dma_contig_init_ctx(vpfe->pdev);
	if (IS_ERR(vpfe->alloc_ctx)) {
		vpfe_err(vpfe, "Failed to get the context\n");
		err = PTR_ERR(vpfe->alloc_ctx);
		goto probe_out;
	}

	q = &vpfe->buffer_queue;
	q->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
	q->io_modes = VB2_MMAP | VB2_DMABUF | VB2_READ;
	q->drv_priv = vpfe;
	q->ops = &vpfe_video_qops;
	q->mem_ops = &vb2_dma_contig_memops;
	q->buf_struct_size = sizeof(struct vpfe_cap_buffer);
	q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
	q->lock = &vpfe->lock;
	q->min_buffers_needed = 1;

	err = vb2_queue_init(q);
	if (err) {
		vpfe_err(vpfe, "vb2_queue_init() failed\n");
		vb2_dma_contig_cleanup_ctx(vpfe->alloc_ctx);
		goto probe_out;
	}

	INIT_LIST_HEAD(&vpfe->dma_queue);

	vdev = &vpfe->video_dev;
	strlcpy(vdev->name, VPFE_MODULE_NAME, sizeof(vdev->name));
	vdev->release = video_device_release_empty;
	vdev->fops = &vpfe_fops;
	vdev->ioctl_ops = &vpfe_ioctl_ops;
	vdev->v4l2_dev = &vpfe->v4l2_dev;
	vdev->vfl_dir = VFL_DIR_RX;
	vdev->queue = q;
	vdev->lock = &vpfe->lock;
	video_set_drvdata(vdev, vpfe);
	err = video_register_device(&vpfe->video_dev, VFL_TYPE_GRABBER, -1);
	if (err) {
		vpfe_err(vpfe,
			"Unable to register video device.\n");
		goto probe_out;
	}

	return 0;

probe_out:
	v4l2_device_unregister(&vpfe->v4l2_dev);
	return err;
}

static int vpfe_async_complete(struct v4l2_async_notifier *notifier)
{
	struct vpfe_device *vpfe = container_of(notifier->v4l2_dev,
					struct vpfe_device, v4l2_dev);

	return vpfe_probe_complete(vpfe);
}

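/*
 * vpfe_get_pdata : build the platform configuration. On DT systems each
 * port endpoint is parsed for its interface type, bus width and sync
 * polarities, and an async match entry is created for the remote
 * sensor/decoder node.
 */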
static struct vpfe_config *
vpfe_get_pdata(struct platform_device *pdev)
{
	struct device_node *endpoint = NULL;
	struct v4l2_of_endpoint bus_cfg;
	struct vpfe_subdev_info *sdinfo;
	struct vpfe_config *pdata;
	unsigned int flags;
	unsigned int i;
	int err;

	dev_dbg(&pdev->dev, "vpfe_get_pdata\n");

	if (!IS_ENABLED(CONFIG_OF) || !pdev->dev.of_node)
		return pdev->dev.platform_data;

	pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
	if (!pdata)
		return NULL;

	for (i = 0; ; i++) {
		struct device_node *rem;

		endpoint = of_graph_get_next_endpoint(pdev->dev.of_node,
						      endpoint);
		if (!endpoint)
			break;

		sdinfo = &pdata->sub_devs[i];
		sdinfo->grp_id = 0;

		/* we only support camera */
		sdinfo->inputs[0].index = i;
		strcpy(sdinfo->inputs[0].name, "Camera");
		sdinfo->inputs[0].type = V4L2_INPUT_TYPE_CAMERA;
		sdinfo->inputs[0].std = V4L2_STD_ALL;
		sdinfo->inputs[0].capabilities = V4L2_IN_CAP_STD;

		sdinfo->can_route = 0;
		sdinfo->routes = NULL;

		of_property_read_u32(endpoint, "ti,am437x-vpfe-interface",
				     &sdinfo->vpfe_param.if_type);
		if (sdinfo->vpfe_param.if_type < 0 ||
			sdinfo->vpfe_param.if_type > 4) {
			sdinfo->vpfe_param.if_type = VPFE_RAW_BAYER;
		}

		err = v4l2_of_parse_endpoint(endpoint, &bus_cfg);
		if (err) {
			dev_err(&pdev->dev, "Could not parse the endpoint\n");
			goto done;
		}

		sdinfo->vpfe_param.bus_width = bus_cfg.bus.parallel.bus_width;

		if (sdinfo->vpfe_param.bus_width < 8 ||
			sdinfo->vpfe_param.bus_width > 16) {
			dev_err(&pdev->dev, "Invalid bus width.\n");
			goto done;
		}

		flags = bus_cfg.bus.parallel.flags;

		if (flags & V4L2_MBUS_HSYNC_ACTIVE_HIGH)
			sdinfo->vpfe_param.hdpol = 1;

		if (flags & V4L2_MBUS_VSYNC_ACTIVE_HIGH)
			sdinfo->vpfe_param.vdpol = 1;

		rem = of_graph_get_remote_port_parent(endpoint);
		if (!rem) {
			dev_err(&pdev->dev, "Remote device at %s not found\n",
				endpoint->full_name);
			goto done;
		}

		pdata->asd[i] = devm_kzalloc(&pdev->dev,
					     sizeof(struct v4l2_async_subdev),
					     GFP_KERNEL);
		if (!pdata->asd[i]) {
			of_node_put(rem);
			pdata = NULL;
			goto done;
		}

		pdata->asd[i]->match_type = V4L2_ASYNC_MATCH_OF;
		pdata->asd[i]->match.of.node = rem;
		of_node_put(rem);
	}

	of_node_put(endpoint);
	return pdata;

done:
	of_node_put(endpoint);
	return NULL;
}

/*
 * vpfe_probe : This function registers the device with the V4L2
 * framework and initializes the fields of each device object
 */
static int vpfe_probe(struct platform_device *pdev)
{
	struct vpfe_config *vpfe_cfg = vpfe_get_pdata(pdev);
	struct vpfe_device *vpfe;
	struct vpfe_ccdc *ccdc;
	struct resource	*res;
	int ret;

	if (!vpfe_cfg) {
		dev_err(&pdev->dev, "No platform data\n");
		return -EINVAL;
	}

	vpfe = devm_kzalloc(&pdev->dev, sizeof(*vpfe), GFP_KERNEL);
	if (!vpfe)
		return -ENOMEM;

	vpfe->pdev = &pdev->dev;
	vpfe->cfg = vpfe_cfg;
	ccdc = &vpfe->ccdc;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	ccdc->ccdc_cfg.base_addr = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(ccdc->ccdc_cfg.base_addr))
		return PTR_ERR(ccdc->ccdc_cfg.base_addr);

	vpfe->irq = platform_get_irq(pdev, 0);
	if (vpfe->irq <= 0) {
		dev_err(&pdev->dev, "No IRQ resource\n");
		return -ENODEV;
	}

	ret = devm_request_irq(vpfe->pdev, vpfe->irq, vpfe_isr, 0,
			       "vpfe_capture0", vpfe);
	if (ret) {
		dev_err(&pdev->dev, "Unable to request interrupt\n");
		return -EINVAL;
	}

	ret = v4l2_device_register(&pdev->dev, &vpfe->v4l2_dev);
	if (ret) {
		vpfe_err(vpfe,
			"Unable to register v4l2 device.\n");
		return ret;
	}

	/* set the driver data in platform device */
	platform_set_drvdata(pdev, vpfe);
	/* Enabling module functional clock */
	pm_runtime_enable(&pdev->dev);

	/* for now just enable it here instead of waiting for the open */
	pm_runtime_get_sync(&pdev->dev);

	vpfe_ccdc_config_defaults(ccdc);

	pm_runtime_put_sync(&pdev->dev);

	vpfe->sd = devm_kzalloc(&pdev->dev, sizeof(struct v4l2_subdev *) *
				ARRAY_SIZE(vpfe->cfg->asd), GFP_KERNEL);
	if (!vpfe->sd) {
		ret = -ENOMEM;
		goto probe_out_v4l2_unregister;
	}

	vpfe->notifier.subdevs = vpfe->cfg->asd;
	vpfe->notifier.num_subdevs = ARRAY_SIZE(vpfe->cfg->asd);
	vpfe->notifier.bound = vpfe_async_bound;
	vpfe->notifier.complete = vpfe_async_complete;
	ret = v4l2_async_notifier_register(&vpfe->v4l2_dev,
						&vpfe->notifier);
	if (ret) {
		vpfe_err(vpfe, "Error registering async notifier\n");
		ret = -EINVAL;
		goto probe_out_v4l2_unregister;
	}

	return 0;

probe_out_v4l2_unregister:
	v4l2_device_unregister(&vpfe->v4l2_dev);
	return ret;
}

/*
 * vpfe_remove : This function unregisters the device from the V4L2 framework
 */
static int vpfe_remove(struct platform_device *pdev)
{
	struct vpfe_device *vpfe = platform_get_drvdata(pdev);

	vpfe_dbg(2, vpfe, "vpfe_remove\n");

	pm_runtime_disable(&pdev->dev);

	v4l2_async_notifier_unregister(&vpfe->notifier);
	video_unregister_device(&vpfe->video_dev);
	v4l2_device_unregister(&vpfe->v4l2_dev);

	return 0;
}

#ifdef CONFIG_PM_SLEEP

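/* Save the CCDC register context so it can be restored on resume */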
static void vpfe_save_context(struct vpfe_ccdc *ccdc)
{
	ccdc->ccdc_ctx[VPFE_PCR >> 2] = vpfe_reg_read(ccdc, VPFE_PCR);
	ccdc->ccdc_ctx[VPFE_SYNMODE >> 2] = vpfe_reg_read(ccdc, VPFE_SYNMODE);
	ccdc->ccdc_ctx[VPFE_SDOFST >> 2] = vpfe_reg_read(ccdc, VPFE_SDOFST);
	ccdc->ccdc_ctx[VPFE_SDR_ADDR >> 2] = vpfe_reg_read(ccdc, VPFE_SDR_ADDR);
	ccdc->ccdc_ctx[VPFE_CLAMP >> 2] = vpfe_reg_read(ccdc, VPFE_CLAMP);
	ccdc->ccdc_ctx[VPFE_DCSUB >> 2] = vpfe_reg_read(ccdc, VPFE_DCSUB);
	ccdc->ccdc_ctx[VPFE_COLPTN >> 2] = vpfe_reg_read(ccdc, VPFE_COLPTN);
	ccdc->ccdc_ctx[VPFE_BLKCMP >> 2] = vpfe_reg_read(ccdc, VPFE_BLKCMP);
	ccdc->ccdc_ctx[VPFE_VDINT >> 2] = vpfe_reg_read(ccdc, VPFE_VDINT);
	ccdc->ccdc_ctx[VPFE_ALAW >> 2] = vpfe_reg_read(ccdc, VPFE_ALAW);
	ccdc->ccdc_ctx[VPFE_REC656IF >> 2] = vpfe_reg_read(ccdc, VPFE_REC656IF);
	ccdc->ccdc_ctx[VPFE_CCDCFG >> 2] = vpfe_reg_read(ccdc, VPFE_CCDCFG);
	ccdc->ccdc_ctx[VPFE_CULLING >> 2] = vpfe_reg_read(ccdc, VPFE_CULLING);
	ccdc->ccdc_ctx[VPFE_HD_VD_WID >> 2] = vpfe_reg_read(ccdc,
							    VPFE_HD_VD_WID);
	ccdc->ccdc_ctx[VPFE_PIX_LINES >> 2] = vpfe_reg_read(ccdc,
							    VPFE_PIX_LINES);
	ccdc->ccdc_ctx[VPFE_HORZ_INFO >> 2] = vpfe_reg_read(ccdc,
							    VPFE_HORZ_INFO);
	ccdc->ccdc_ctx[VPFE_VERT_START >> 2] = vpfe_reg_read(ccdc,
							     VPFE_VERT_START);
	ccdc->ccdc_ctx[VPFE_VERT_LINES >> 2] = vpfe_reg_read(ccdc,
							     VPFE_VERT_LINES);
	ccdc->ccdc_ctx[VPFE_HSIZE_OFF >> 2] = vpfe_reg_read(ccdc,
							    VPFE_HSIZE_OFF);
}

static int vpfe_suspend(struct device *dev)
{
	struct platform_device *pdev = to_platform_device(dev);
	struct vpfe_device *vpfe = platform_get_drvdata(pdev);
	struct vpfe_ccdc *ccdc = &vpfe->ccdc;

	/* if streaming has not started we don't care */
	if (!vb2_start_streaming_called(&vpfe->buffer_queue))
		return 0;

	pm_runtime_get_sync(dev);
	vpfe_config_enable(ccdc, 1);

	/* Save VPFE context */
	vpfe_save_context(ccdc);

	/* Disable CCDC */
	vpfe_pcr_enable(ccdc, 0);
	vpfe_config_enable(ccdc, 0);

	/* Disable both master and slave clock */
	pm_runtime_put_sync(dev);

	/* Select sleep pin state */
	pinctrl_pm_select_sleep_state(dev);

	return 0;
}

static void vpfe_restore_context(struct vpfe_ccdc *ccdc)
{
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_SYNMODE >> 2], VPFE_SYNMODE);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_CULLING >> 2], VPFE_CULLING);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_SDOFST >> 2], VPFE_SDOFST);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_SDR_ADDR >> 2], VPFE_SDR_ADDR);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_CLAMP >> 2], VPFE_CLAMP);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_DCSUB >> 2], VPFE_DCSUB);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_COLPTN >> 2], VPFE_COLPTN);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_BLKCMP >> 2], VPFE_BLKCMP);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_VDINT >> 2], VPFE_VDINT);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_ALAW >> 2], VPFE_ALAW);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_REC656IF >> 2], VPFE_REC656IF);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_CCDCFG >> 2], VPFE_CCDCFG);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_PCR >> 2], VPFE_PCR);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_HD_VD_WID >> 2],
						VPFE_HD_VD_WID);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_PIX_LINES >> 2],
						VPFE_PIX_LINES);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_HORZ_INFO >> 2],
						VPFE_HORZ_INFO);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_VERT_START >> 2],
						VPFE_VERT_START);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_VERT_LINES >> 2],
						VPFE_VERT_LINES);
	vpfe_reg_write(ccdc, ccdc->ccdc_ctx[VPFE_HSIZE_OFF >> 2],
						VPFE_HSIZE_OFF);
}

static int vpfe_resume(struct device *dev)
{
	struct platform_device *pdev = to_platform_device(dev);
	struct vpfe_device *vpfe = platform_get_drvdata(pdev);
	struct vpfe_ccdc *ccdc = &vpfe->ccdc;

	/* if streaming has not started we don't care */
	if (!vb2_start_streaming_called(&vpfe->buffer_queue))
		return 0;

	/* Enable both master and slave clock */
	pm_runtime_get_sync(dev);
	vpfe_config_enable(ccdc, 1);

	/* Restore VPFE context */
	vpfe_restore_context(ccdc);

	vpfe_config_enable(ccdc, 0);
	pm_runtime_put_sync(dev);

	/* Select default pin state */
	pinctrl_pm_select_default_state(dev);

	return 0;
}

#endif

static SIMPLE_DEV_PM_OPS(vpfe_pm_ops, vpfe_suspend, vpfe_resume);

static const struct of_device_id vpfe_of_match[] = {
	{ .compatible = "ti,am437x-vpfe", },
	{ /* sentinel */ },
};
MODULE_DEVICE_TABLE(of, vpfe_of_match);

static struct platform_driver vpfe_driver = {
	.probe		= vpfe_probe,
	.remove		= vpfe_remove,
	.driver = {
		.name	= VPFE_MODULE_NAME,
		.pm	= &vpfe_pm_ops,
		.of_match_table = of_match_ptr(vpfe_of_match),
	},
};

module_platform_driver(vpfe_driver);

MODULE_AUTHOR("Texas Instruments");
MODULE_DESCRIPTION("TI AM437x VPFE driver");
MODULE_LICENSE("GPL");
MODULE_VERSION(VPFE_VERSION);
"n">zone_to_nid(local_zone), zone_to_nid(zone)) < RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) { return NULL; } static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, nodemask_t *allowednodes) { return 1; } static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) { } static void zlc_clear_zones_full(struct zonelist *zonelist) { } static bool zone_local(struct zone *local_zone, struct zone *zone) { return true; } static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; } #endif /* CONFIG_NUMA */ static void reset_alloc_batches(struct zone *preferred_zone) { struct zone *zone = preferred_zone->zone_pgdat->node_zones; do { mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); } while (zone++ != preferred_zone); } /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. */ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { struct zonelist *zonelist = ac->zonelist; struct zoneref *z; struct page *page = NULL; struct zone *zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE); int nr_fair_skipped = 0; bool zonelist_rescan; zonelist_scan: zonelist_rescan = false; /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, ac->nodemask) { unsigned long mark; if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed(zone, gfp_mask)) continue; /* * Distribute pages in proportion to the individual * zone size to ensure fair page aging. The zone a * page was allocated in should have no effect on the * time the page has in memory before being reclaimed. */ if (alloc_flags & ALLOC_FAIR) { if (!zone_local(ac->preferred_zone, zone)) break; if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { nr_fair_skipped++; continue; } } /* * When allocating a page cache page for writing, we * want to get it from a zone that is within its dirty * limit, such that no single zone holds more than its * proportional share of globally allowed dirty pages. * The dirty limits take into account the zone's * lowmem reserves and high watermark so that kswapd * should be able to balance it without having to * write pages from its LRU list. * * This may look like it could increase pressure on * lower zones by failing allocations in higher zones * before they are full. But the pages that do spill * over are limited as the lower zones are protected * by this very same mechanism. It should not become * a practical burden to them. * * XXX: For now, allow allocations to potentially * exceed the per-zone dirty limit in the slowpath * (ALLOC_WMARK_LOW unset) before going into reclaim, * which is important when on a NUMA setup the allowed * zones are together not big enough to reach the * global limit. 
The proper fix for these situations * will require awareness of zones in the * dirty-throttling and the flusher threads. */ if (consider_zone_dirty && !zone_dirty_ok(zone)) continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, ac->classzone_idx, alloc_flags)) { int ret; /* Checked here to keep the fast path fast */ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; if (IS_ENABLED(CONFIG_NUMA) && !did_zlc_setup && nr_online_nodes > 1) { /* * we do zlc_setup if there are multiple nodes * and before considering the first zone allowed * by the cpuset. */ allowednodes = zlc_setup(zonelist, alloc_flags); zlc_active = 1; did_zlc_setup = 1; } if (zone_reclaim_mode == 0 || !zone_allows_reclaim(ac->preferred_zone, zone)) goto this_zone_full; /* * As we may have just activated ZLC, check if the first * eligible zone has failed zone_reclaim recently. */ if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; ret = zone_reclaim(zone, gfp_mask, order); switch (ret) { case ZONE_RECLAIM_NOSCAN: /* did not scan */ continue; case ZONE_RECLAIM_FULL: /* scanned but unreclaimable */ continue; default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, ac->classzone_idx, alloc_flags)) goto try_this_zone; /* * Failed to reclaim enough to meet watermark. * Only mark the zone full if checking the min * watermark or if we failed to reclaim just * 1<<order pages or else the page allocator * fastpath will prematurely mark zones full * when the watermark is between the low and * min watermarks. */ if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || ret == ZONE_RECLAIM_SOME) goto this_zone_full; continue; } } try_this_zone: page = buffered_rmqueue(ac->preferred_zone, zone, order, gfp_mask, ac->migratetype); if (page) { if (prep_new_page(page, order, gfp_mask, alloc_flags)) goto try_this_zone; return page; } this_zone_full: if (IS_ENABLED(CONFIG_NUMA) && zlc_active) zlc_mark_zone_full(zonelist, z); } /* * The first pass makes sure allocations are spread fairly within the * local node. However, the local node might have free pages left * after the fairness batches are exhausted, and remote zones haven't * even been considered yet. Try once more without fairness, and * include remote zones now, before entering the slowpath and waking * kswapd: prefer spilling to a remote zone over swapping locally. */ if (alloc_flags & ALLOC_FAIR) { alloc_flags &= ~ALLOC_FAIR; if (nr_fair_skipped) { zonelist_rescan = true; reset_alloc_batches(ac->preferred_zone); } if (nr_online_nodes > 1) zonelist_rescan = true; } if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { /* Disable zlc cache for second zonelist scan */ zlc_active = 0; zonelist_rescan = true; } if (zonelist_rescan) goto zonelist_scan; return NULL; } /* * Large machines with many possible nodes should not always dump per-node * meminfo in irq context. */ static inline bool should_suppress_show_mem(void) { bool ret = false; #if NODES_SHIFT > 8 ret = in_interrupt(); #endif return ret; } static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 
{ unsigned int filter = SHOW_MEM_FILTER_NODES; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || debug_guardpage_minorder() > 0) return; /* * This documents exceptions given to allocations in certain * contexts that are allowed to allocate outside current's set * of allowed nodes. */ if (!(gfp_mask & __GFP_NOMEMALLOC)) if (test_thread_flag(TIF_MEMDIE) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) filter &= ~SHOW_MEM_FILTER_NODES; if (fmt) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_warn("%pV", &vaf); va_end(args); } pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", current->comm, order, gfp_mask); dump_stack(); if (!should_suppress_show_mem()) show_mem(filter); } static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress, unsigned long pages_reclaimed) { /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) return 0; /* Always retry if specifically requested */ if (gfp_mask & __GFP_NOFAIL) return 1; /* * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim * making forward progress without invoking OOM. Suspend also disables * storage devices so kswapd will not help. Bail if we are suspending. */ if (!did_some_progress && pm_suspended_storage()) return 0; /* * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER * means __GFP_NOFAIL, but that may not be true in other * implementations. */ if (order <= PAGE_ALLOC_COSTLY_ORDER) return 1; /* * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is * specified, then we retry until we no longer reclaim any pages * (above), or we've reclaimed an order of pages at least as * large as the allocation's order. In both cases, if the * allocation still fails, we stop retrying. */ if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) return 1; return 0; } static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) { struct page *page; *did_some_progress = 0; /* * Acquire the per-zone oom lock for each zone. If that * fails, somebody else is making progress for us. */ if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { *did_some_progress = 1; schedule_timeout_uninterruptible(1); return NULL; } /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if * we're still under heavy pressure. */ page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); if (page) goto out; if (!(gfp_mask & __GFP_NOFAIL)) { /* Coredumps can quickly deplete all memory reserves */ if (current->flags & PF_DUMPCORE) goto out; /* The OOM killer will not help higher order allocs */ if (order > PAGE_ALLOC_COSTLY_ORDER) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; /* The OOM killer does not compensate for light reclaim */ if (!(gfp_mask & __GFP_FS)) { /* * XXX: Page reclaim didn't yield anything, * and the OOM killer can't be invoked, but * keep looping as per should_alloc_retry(). 
*/ *did_some_progress = 1; goto out; } /* The OOM killer may not free memory on a specific node */ if (gfp_mask & __GFP_THISNODE) goto out; } /* Exhausted what can be done so it's blamo time */ if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) *did_some_progress = 1; out: oom_zonelist_unlock(ac->zonelist, gfp_mask); return page; } #ifdef CONFIG_COMPACTION /* Try memory compaction for high-order allocations before reclaim */ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { unsigned long compact_result; struct page *page; if (!order) return NULL; current->flags |= PF_MEMALLOC; compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, mode, contended_compaction); current->flags &= ~PF_MEMALLOC; switch (compact_result) { case COMPACT_DEFERRED: *deferred_compaction = true; /* fall-through */ case COMPACT_SKIPPED: return NULL; default: break; } /* * At least in one zone compaction wasn't deferred or skipped, so let's * count a compaction stall */ count_vm_event(COMPACTSTALL); page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); if (page) { struct zone *zone = page_zone(page); zone->compact_blockskip_flush = false; compaction_defer_reset(zone, order, true); count_vm_event(COMPACTSUCCESS); return page; } /* * It's bad if compaction run occurs and fails. The most likely reason * is that pages exist, but not enough to satisfy watermarks. */ count_vm_event(COMPACTFAIL); cond_resched(); return NULL; } #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { return NULL; } #endif /* CONFIG_COMPACTION */ /* Perform direct synchronous page reclaim */ static int __perform_reclaim(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac) { struct reclaim_state reclaim_state; int progress; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); current->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; progress = try_to_free_pages(ac->zonelist, order, gfp_mask, ac->nodemask); current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); current->flags &= ~PF_MEMALLOC; cond_resched(); return progress; } /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac, unsigned long *did_some_progress) { struct page *page = NULL; bool drained = false; *did_some_progress = __perform_reclaim(gfp_mask, order, ac); if (unlikely(!(*did_some_progress))) return NULL; /* After successful reclaim, reconsider all zones for allocation */ if (IS_ENABLED(CONFIG_NUMA)) zlc_clear_zones_full(ac->zonelist); retry: page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); /* * If an allocation failed after direct reclaim, it could be because * pages are pinned on the per-cpu lists. 
Drain them and try again */ if (!page && !drained) { drain_all_pages(NULL); drained = true; goto retry; } return page; } /* * This is called in the allocator slow-path if the allocation request is of * sufficient urgency to ignore watermarks and take other desperate measures */ static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac) { struct page *page; do { page = get_page_from_freelist(gfp_mask, order, ALLOC_NO_WATERMARKS, ac); if (!page && gfp_mask & __GFP_NOFAIL) wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; } static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); } static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); /* * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); if (atomic) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. */ if (!(gfp_mask & __GFP_NOMEMALLOC)) alloc_flags |= ALLOC_HARDER; /* * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the * comment for __cpuset_node_allowed(). */ alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { if (gfp_mask & __GFP_MEMALLOC) alloc_flags |= ALLOC_NO_WATERMARKS; else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) alloc_flags |= ALLOC_NO_WATERMARKS; else if (!in_interrupt() && ((current->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } #ifdef CONFIG_CMA if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif return alloc_flags; } bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) { return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); } static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; bool deferred_compaction = false; int contended_compaction = COMPACT_CONTENDED_NONE; /* * In the slowpath, we sanity check order to avoid ever trying to * reclaim >= MAX_ORDER areas which will never succeed. Callers may * be using allocators in order of preference for an area that is * too large. */ if (order >= MAX_ORDER) { WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); return NULL; } /* * If this allocation cannot block and it is for a specific node, then * fail early. There's no need to wakeup kswapd or retry for a * speculative node-specific allocation. 
*/ if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) goto nopage; retry: if (!(gfp_mask & __GFP_NO_KSWAPD)) wake_all_kswapds(order, ac); /* * OK, we're below the kswapd watermark and have kicked background * reclaim. Now things get more complex, so set up alloc_flags according * to how we want to proceed. */ alloc_flags = gfp_to_alloc_flags(gfp_mask); /* * Find the true preferred zone if the allocation is unconstrained by * cpusets. */ if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { struct zoneref *preferred_zoneref; preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, NULL, &ac->preferred_zone); ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); } /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); if (page) goto got_pg; /* Allocate without watermarks if the context allows */ if (alloc_flags & ALLOC_NO_WATERMARKS) { /* * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds * the allocation is high priority and these type of * allocations are system rather than user orientated */ ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); page = __alloc_pages_high_priority(gfp_mask, order, ac); if (page) { goto got_pg; } } /* Atomic allocations - we can't balance anything */ if (!wait) { /* * All existing users of the deprecated __GFP_NOFAIL are * blockable, so warn of any new users that actually allow this * type of allocation to fail. */ WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); goto nopage; } /* Avoid recursion of direct reclaim */ if (current->flags & PF_MEMALLOC) goto nopage; /* Avoid allocations with no watermarks from looping endlessly */ if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; /* * Try direct compaction. The first pass is asynchronous. Subsequent * attempts after direct reclaim are synchronous */ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, migration_mode, &contended_compaction, &deferred_compaction); if (page) goto got_pg; /* Checks for THP-specific high-order allocations */ if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { /* * If compaction is deferred for high-order allocations, it is * because sync compaction recently failed. If this is the case * and the caller requested a THP allocation, we do not want * to heavily disrupt the system, so we fail the allocation * instead of entering direct reclaim. */ if (deferred_compaction) goto nopage; /* * In all zones where compaction was attempted (and not * deferred or skipped), lock contention has been detected. * For THP allocation we do not want to disrupt the others * so we fallback to base pages instead. */ if (contended_compaction == COMPACT_CONTENDED_LOCK) goto nopage; /* * If compaction was aborted due to need_resched(), we do not * want to further increase allocation latency, unless it is * khugepaged trying to collapse. */ if (contended_compaction == COMPACT_CONTENDED_SCHED && !(current->flags & PF_KTHREAD)) goto nopage; } /* * It can become very expensive to allocate transparent hugepages at * fault, so use asynchronous memory compaction for THP unless it is * khugepaged trying to collapse. 
*/ if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || (current->flags & PF_KTHREAD)) migration_mode = MIGRATE_SYNC_LIGHT; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, &did_some_progress); if (page) goto got_pg; /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, did_some_progress, pages_reclaimed)) { /* * If we fail to make progress by freeing individual * pages, but the allocation wants us to keep going, * start OOM killing tasks. */ if (!did_some_progress) { page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); if (page) goto got_pg; if (!did_some_progress) goto nopage; } /* Wait for some write requests to complete then retry */ wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); goto retry; } else { /* * High-order allocations do not necessarily loop after * direct reclaim and reclaim/compaction depends on compaction * being called after reclaim so call directly if necessary */ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, migration_mode, &contended_compaction, &deferred_compaction); if (page) goto got_pg; } nopage: warn_alloc_failed(gfp_mask, order, NULL); got_pg: return page; } /* * This is the 'heart' of the zoned buddy allocator. */ struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { struct zoneref *preferred_zoneref; struct page *page = NULL; unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), .nodemask = nodemask, .migratetype = gfpflags_to_migratetype(gfp_mask), }; gfp_mask &= gfp_allowed_mask; lockdep_trace_alloc(gfp_mask); might_sleep_if(gfp_mask & __GFP_WAIT); if (should_fail_alloc_page(gfp_mask, order)) return NULL; /* * Check the zones suitable for the gfp_mask contain at least one * valid zone. It's possible to have an empty zonelist as a result * of __GFP_THISNODE and a memoryless node */ if (unlikely(!zonelist->_zonerefs->zone)) return NULL; if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); /* We set it here, as __alloc_pages_slowpath might have changed it */ ac.zonelist = zonelist; /* The preferred zone is used for statistics later */ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, ac.nodemask ? : &cpuset_current_mems_allowed, &ac.preferred_zone); if (!ac.preferred_zone) goto out; ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); /* First allocation attempt */ alloc_mask = gfp_mask|__GFP_HARDWALL; page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (unlikely(!page)) { /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not * complete. */ alloc_mask = memalloc_noio_flags(gfp_mask); page = __alloc_pages_slowpath(alloc_mask, order, &ac); } if (kmemcheck_enabled && page) kmemcheck_pagealloc_alloc(page, order, gfp_mask); trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); out: /* * When updating a task's mems_allowed, it is possible to race with * parallel threads in such a way that an allocation can fail while * the mask is being updated. 
If a page allocation is about to fail, * check if the cpuset changed during allocation and if so, retry. */ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); /* * Common helper functions. */ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) { struct page *page; /* * __get_free_pages() returns a 32-bit address, which cannot represent * a highmem page */ VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); page = alloc_pages(gfp_mask, order); if (!page) return 0; return (unsigned long) page_address(page); } EXPORT_SYMBOL(__get_free_pages); unsigned long get_zeroed_page(gfp_t gfp_mask) { return __get_free_pages(gfp_mask | __GFP_ZERO, 0); } EXPORT_SYMBOL(get_zeroed_page); void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) free_hot_cold_page(page, false); else __free_pages_ok(page, order); } } EXPORT_SYMBOL(__free_pages); void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); __free_pages(virt_to_page((void *)addr), order); } } EXPORT_SYMBOL(free_pages); /* * alloc_kmem_pages charges newly allocated pages to the kmem resource counter * of the current memory cgroup. * * It should be used when the caller would like to use kmalloc, but since the * allocation is large, it has to fall back to the page allocator. */ struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) { struct page *page; struct mem_cgroup *memcg = NULL; if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) return NULL; page = alloc_pages(gfp_mask, order); memcg_kmem_commit_charge(page, memcg, order); return page; } struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { struct page *page; struct mem_cgroup *memcg = NULL; if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) return NULL; page = alloc_pages_node(nid, gfp_mask, order); memcg_kmem_commit_charge(page, memcg, order); return page; } /* * __free_kmem_pages and free_kmem_pages will free pages allocated with * alloc_kmem_pages. */ void __free_kmem_pages(struct page *page, unsigned int order) { memcg_kmem_uncharge_pages(page, order); __free_pages(page, order); } void free_kmem_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); __free_kmem_pages(virt_to_page((void *)addr), order); } } static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) { if (addr) { unsigned long alloc_end = addr + (PAGE_SIZE << order); unsigned long used = addr + PAGE_ALIGN(size); split_page(virt_to_page((void *)addr), order); while (used < alloc_end) { free_page(used); used += PAGE_SIZE; } } return (void *)addr; } /** * alloc_pages_exact - allocate an exact number physically-contiguous pages. * @size: the number of bytes to allocate * @gfp_mask: GFP flags for the allocation * * This function is similar to alloc_pages(), except that it allocates the * minimum number of pages to satisfy the request. alloc_pages() can only * allocate memory in power-of-two pages. * * This function is also limited by MAX_ORDER. * * Memory allocated by this function must be released by free_pages_exact(). 
*/ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) { unsigned int order = get_order(size); unsigned long addr; addr = __get_free_pages(gfp_mask, order); return make_alloc_exact(addr, order, size); } EXPORT_SYMBOL(alloc_pages_exact); /** * alloc_pages_exact_nid - allocate an exact number of physically-contiguous * pages on a node. * @nid: the preferred node ID where memory should be allocated * @size: the number of bytes to allocate * @gfp_mask: GFP flags for the allocation * * Like alloc_pages_exact(), but try to allocate on node nid first before falling * back. * Note this is not alloc_pages_exact_node() which allocates on a specific node, * but is not exact. */ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); if (!p) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); } /** * free_pages_exact - release memory allocated via alloc_pages_exact() * @virt: the value returned by alloc_pages_exact. * @size: size of allocation, same value as passed to alloc_pages_exact(). * * Release the memory allocated by a previous call to alloc_pages_exact. */ void free_pages_exact(void *virt, size_t size) { unsigned long addr = (unsigned long)virt; unsigned long end = addr + PAGE_ALIGN(size); while (addr < end) { free_page(addr); addr += PAGE_SIZE; } } EXPORT_SYMBOL(free_pages_exact); /** * nr_free_zone_pages - count number of pages beyond high watermark * @offset: The zone index of the highest zone * * nr_free_zone_pages() counts the number of pages which are beyond the * high watermark within all zones at or below a given zone index. For each * zone, the number of pages is calculated as: * managed_pages - high_pages */ static unsigned long nr_free_zone_pages(int offset) { struct zoneref *z; struct zone *zone; /* Just pick one node, since fallback list is circular */ unsigned long sum = 0; struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); for_each_zone_zonelist(zone, z, zonelist, offset) { unsigned long size = zone->managed_pages; unsigned long high = high_wmark_pages(zone); if (size > high) sum += size - high; } return sum; } /** * nr_free_buffer_pages - count number of pages beyond high watermark * * nr_free_buffer_pages() counts the number of pages which are beyond the high * watermark within ZONE_DMA and ZONE_NORMAL. */ unsigned long nr_free_buffer_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_USER)); } EXPORT_SYMBOL_GPL(nr_free_buffer_pages); /** * nr_free_pagecache_pages - count number of pages beyond high watermark * * nr_free_pagecache_pages() counts the number of pages which are beyond the * high watermark within all zones.
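* * The result is what build_all_zonelists() stores in vm_total_pages, which in turn decides whether grouping free pages by mobility is worth enabling.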
*/ unsigned long nr_free_pagecache_pages(void) { return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); } static inline void show_node(struct zone *zone) { if (IS_ENABLED(CONFIG_NUMA)) printk("Node %d ", zone_to_nid(zone)); } void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; val->sharedram = global_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; val->freehigh = nr_free_highpages(); val->mem_unit = PAGE_SIZE; } EXPORT_SYMBOL(si_meminfo); #ifdef CONFIG_NUMA void si_meminfo_node(struct sysinfo *val, int nid) { int zone_type; /* needs to be signed */ unsigned long managed_pages = 0; pg_data_t *pgdat = NODE_DATA(nid); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], NR_FREE_PAGES); #else val->totalhigh = 0; val->freehigh = 0; #endif val->mem_unit = PAGE_SIZE; } #endif /* * Determine whether the node should be displayed or not, depending on whether * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). */ bool skip_free_areas_node(unsigned int flags, int nid) { bool ret = false; unsigned int cpuset_mems_cookie; if (!(flags & SHOW_MEM_FILTER_NODES)) goto out; do { cpuset_mems_cookie = read_mems_allowed_begin(); ret = !node_isset(nid, cpuset_current_mems_allowed); } while (read_mems_allowed_retry(cpuset_mems_cookie)); out: return ret; } #define K(x) ((x) << (PAGE_SHIFT-10)) static void show_migration_types(unsigned char type) { static const char types[MIGRATE_TYPES] = { [MIGRATE_UNMOVABLE] = 'U', [MIGRATE_RECLAIMABLE] = 'E', [MIGRATE_MOVABLE] = 'M', [MIGRATE_RESERVE] = 'R', #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C', #endif #ifdef CONFIG_MEMORY_ISOLATION [MIGRATE_ISOLATE] = 'I', #endif }; char tmp[MIGRATE_TYPES + 1]; char *p = tmp; int i; for (i = 0; i < MIGRATE_TYPES; i++) { if (type & (1 << i)) *p++ = types[i]; } *p = '\0'; printk("(%s) ", tmp); } /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the * memory on each free list with the exception of the first item on the list. * * Bits in @filter: * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. 
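* * The global summary below prints raw page counts, while the per-zone report converts them to kilobytes with the K() macro defined above.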
*/ void show_free_areas(unsigned int filter) { unsigned long free_pcp = 0; int cpu; struct zone *zone; for_each_populated_zone(zone) { if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; for_each_online_cpu(cpu) free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; } printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_ISOLATED_ANON), global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_FILE), global_page_state(NR_ISOLATED_FILE), global_page_state(NR_UNEVICTABLE), global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), global_page_state(NR_FREE_PAGES), free_pcp, global_page_state(NR_FREE_CMA_PAGES)); for_each_populated_zone(zone) { int i; if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; free_pcp = 0; for_each_online_cpu(cpu) free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; show_node(zone); printk("%s" " free:%lukB" " min:%lukB" " low:%lukB" " high:%lukB" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" " isolated(anon):%lukB" " isolated(file):%lukB" " present:%lukB" " managed:%lukB" " mlocked:%lukB" " dirty:%lukB" " writeback:%lukB" " mapped:%lukB" " shmem:%lukB" " slab_reclaimable:%lukB" " slab_unreclaimable:%lukB" " kernel_stack:%lukB" " pagetables:%lukB" " unstable:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" " free_cma:%lukB" " writeback_tmp:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" "\n", zone->name, K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), K(zone_page_state(zone, NR_ACTIVE_ANON)), K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), K(zone_page_state(zone, NR_UNEVICTABLE)), K(zone_page_state(zone, NR_ISOLATED_ANON)), K(zone_page_state(zone, NR_ISOLATED_FILE)), K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), K(zone_page_state(zone, NR_FILE_DIRTY)), K(zone_page_state(zone, NR_WRITEBACK)), K(zone_page_state(zone, NR_FILE_MAPPED)), K(zone_page_state(zone, NR_SHMEM)), K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), zone_page_state(zone, NR_KERNEL_STACK) * THREAD_SIZE / 1024, K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->pageset->pcp.count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), K(zone_page_state(zone, NR_PAGES_SCANNED)), (!zone_reclaimable(zone) ? 
"yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) printk(" %ld", zone->lowmem_reserve[i]); printk("\n"); } for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; unsigned char types[MAX_ORDER]; if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; show_node(zone); printk("%s: ", zone->name); spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { struct free_area *area = &zone->free_area[order]; int type; nr[order] = area->nr_free; total += nr[order] << order; types[order] = 0; for (type = 0; type < MIGRATE_TYPES; type++) { if (!list_empty(&area->free_list[type])) types[order] |= 1 << type; } } spin_unlock_irqrestore(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { printk("%lu*%lukB ", nr[order], K(1UL) << order); if (nr[order]) show_migration_types(types[order]); } printk("= %lukB\n", K(total)); } hugetlb_show_meminfo(); printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); show_swap_cache_info(); } static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) { zoneref->zone = zone; zoneref->zone_idx = zone_idx(zone); } /* * Builds allocation fallback zone lists. * * Add all populated zones of a node to the zonelist. */ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int nr_zones) { struct zone *zone; enum zone_type zone_type = MAX_NR_ZONES; do { zone_type--; zone = pgdat->node_zones + zone_type; if (populated_zone(zone)) { zoneref_set_zone(zone, &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); } } while (zone_type); return nr_zones; } /* * zonelist_order: * 0 = automatic detection of better ordering. * 1 = order by ([node] distance, -zonetype) * 2 = order by (-zonetype, [node] distance) * * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create * the same zonelist. So only NUMA can configure this param. */ #define ZONELIST_ORDER_DEFAULT 0 #define ZONELIST_ORDER_NODE 1 #define ZONELIST_ORDER_ZONE 2 /* zonelist order in the kernel. * set_zonelist_order() will set this to NODE or ZONE. */ static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; #ifdef CONFIG_NUMA /* The value user specified ....changed by config */ static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; /* string for sysctl */ #define NUMA_ZONELIST_ORDER_LEN 16 char numa_zonelist_order[16] = "default"; /* * interface for configure zonelist ordering. * command line option "numa_zonelist_order" * = "[dD]efault - default, automatic configuration. 
* = "[nN]ode - order by node locality, then by zone within node * = "[zZ]one - order by zone, then by locality within zone */ static int __parse_numa_zonelist_order(char *s) { if (*s == 'd' || *s == 'D') { user_zonelist_order = ZONELIST_ORDER_DEFAULT; } else if (*s == 'n' || *s == 'N') { user_zonelist_order = ZONELIST_ORDER_NODE; } else if (*s == 'z' || *s == 'Z') { user_zonelist_order = ZONELIST_ORDER_ZONE; } else { printk(KERN_WARNING "Ignoring invalid numa_zonelist_order value: " "%s\n", s); return -EINVAL; } return 0; } static __init int setup_numa_zonelist_order(char *s) { int ret; if (!s) return 0; ret = __parse_numa_zonelist_order(s); if (ret == 0) strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); return ret; } early_param("numa_zonelist_order", setup_numa_zonelist_order); /* * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { char saved_string[NUMA_ZONELIST_ORDER_LEN]; int ret; static DEFINE_MUTEX(zl_order_mutex); mutex_lock(&zl_order_mutex); if (write) { if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { ret = -EINVAL; goto out; } strcpy(saved_string, (char *)table->data); } ret = proc_dostring(table, write, buffer, length, ppos); if (ret) goto out; if (write) { int oldval = user_zonelist_order; ret = __parse_numa_zonelist_order((char *)table->data); if (ret) { /* * bogus value. restore saved string */ strncpy((char *)table->data, saved_string, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; } else if (oldval != user_zonelist_order) { mutex_lock(&zonelists_mutex); build_all_zonelists(NULL, NULL); mutex_unlock(&zonelists_mutex); } } out: mutex_unlock(&zl_order_mutex); return ret; } #define MAX_NODE_LOAD (nr_online_nodes) static int node_load[MAX_NUMNODES]; /** * find_next_best_node - find the next node that should appear in a given node's fallback list * @node: node whose fallback list we're appending * @used_node_mask: nodemask_t of already used nodes * * We use a number of factors to determine which is the next node that should * appear on a given node's fallback list. The node should not have appeared * already in @node's fallback list, and it should be the next closest node * according to the distance array (which contains arbitrary distance values * from each node to each node in the system), and should also prefer nodes * with no CPUs, since presumably they'll have very little allocation pressure * on them otherwise. * It returns -1 if no node is found. 
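* * Lower scores win: node distance dominates, nodes with CPUs are penalised by * PENALTY_FOR_NODE_WITH_CPUS so headless nodes are preferred, and node_load only breaks * ties between otherwise comparable candidates.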
*/ static int find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; int best_node = NUMA_NO_NODE; const struct cpumask *tmp = cpumask_of_node(0); /* Use the local node if we haven't already */ if (!node_isset(node, *used_node_mask)) { node_set(node, *used_node_mask); return node; } for_each_node_state(n, N_MEMORY) { /* Don't want a node to appear more than once */ if (node_isset(n, *used_node_mask)) continue; /* Use the distance array to find the distance */ val = node_distance(node, n); /* Penalize nodes under us ("prefer the next node") */ val += (n < node); /* Give preference to headless and unused nodes */ tmp = cpumask_of_node(n); if (!cpumask_empty(tmp)) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ val *= (MAX_NODE_LOAD*MAX_NUMNODES); val += node_load[n]; if (val < min_val) { min_val = val; best_node = n; } } if (best_node >= 0) node_set(best_node, *used_node_mask); return best_node; } /* * Build zonelists ordered by node and zones within node. * This results in maximum locality--normal zone overflows into local * DMA zone, if any--but risks exhausting DMA zone. */ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) { int j; struct zonelist *zonelist; zonelist = &pgdat->node_zonelists[0]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; j = build_zonelists_node(NODE_DATA(node), zonelist, j); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } /* * Build gfp_thisnode zonelists */ static void build_thisnode_zonelists(pg_data_t *pgdat) { int j; struct zonelist *zonelist; zonelist = &pgdat->node_zonelists[1]; j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } /* * Build zonelists ordered by zone and nodes within zones. * This results in conserving DMA zone[s] until all Normal memory is * exhausted, but results in overflowing to remote node while memory * may still exist in local DMA zone. */ static int node_order[MAX_NUMNODES]; static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) { int pos, j, node; int zone_type; /* needs to be signed */ struct zone *z; struct zonelist *zonelist; zonelist = &pgdat->node_zonelists[0]; pos = 0; for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { for (j = 0; j < nr_nodes; j++) { node = node_order[j]; z = &NODE_DATA(node)->node_zones[zone_type]; if (populated_zone(z)) { zoneref_set_zone(z, &zonelist->_zonerefs[pos++]); check_highest_zone(zone_type); } } } zonelist->_zonerefs[pos].zone = NULL; zonelist->_zonerefs[pos].zone_idx = 0; } #if defined(CONFIG_64BIT) /* * Devices that require DMA32/DMA are relatively rare and do not justify a * penalty to every machine in case the specialised case applies. Default * to Node-ordering on 64-bit NUMA machines */ static int default_zonelist_order(void) { return ZONELIST_ORDER_NODE; } #else /* * On 32-bit, the Normal zone needs to be preserved for allocations accessible * by the kernel. If processes running on node 0 deplete the low memory zone * then reclaim will occur more frequently, increasing stalls and potentially * making it easier to OOM if a large percentage of the zone is under writeback or * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. * Hence, default to zone ordering on 32-bit.
*/ static int default_zonelist_order(void) { return ZONELIST_ORDER_ZONE; } #endif /* CONFIG_64BIT */ static void set_zonelist_order(void) { if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) current_zonelist_order = default_zonelist_order(); else current_zonelist_order = user_zonelist_order; } static void build_zonelists(pg_data_t *pgdat) { int j, node, load; enum zone_type i; nodemask_t used_mask; int local_node, prev_node; struct zonelist *zonelist; int order = current_zonelist_order; /* initialize zonelists */ for (i = 0; i < MAX_ZONELISTS; i++) { zonelist = pgdat->node_zonelists + i; zonelist->_zonerefs[0].zone = NULL; zonelist->_zonerefs[0].zone_idx = 0; } /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; load = nr_online_nodes; prev_node = local_node; nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); j = 0; while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { /* * We don't want to pressure a particular node. * So adding penalty to the first node in same * distance group to make it round-robin. */ if (node_distance(local_node, node) != node_distance(local_node, prev_node)) node_load[node] = load; prev_node = node; load--; if (order == ZONELIST_ORDER_NODE) build_zonelists_in_node_order(pgdat, node); else node_order[j++] = node; /* remember order */ } if (order == ZONELIST_ORDER_ZONE) { /* calculate node order -- i.e., DMA last! */ build_zonelists_in_zone_order(pgdat, j); } build_thisnode_zonelists(pgdat); } /* Construct the zonelist performance cache - see further mmzone.h */ static void build_zonelist_cache(pg_data_t *pgdat) { struct zonelist *zonelist; struct zonelist_cache *zlc; struct zoneref *z; zonelist = &pgdat->node_zonelists[0]; zonelist->zlcache_ptr = zlc = &zonelist->zlcache; bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); for (z = zonelist->_zonerefs; z->zone; z++) zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); } #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * Return node id of node used for "local" allocations. * I.e., first node id of first zone in arg node's generic zonelist. * Used for initializing percpu 'numa_mem', which is used primarily * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. */ int local_memory_node(int node) { struct zone *zone; (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), NULL, &zone); return zone->node; } #endif #else /* CONFIG_NUMA */ static void set_zonelist_order(void) { current_zonelist_order = ZONELIST_ORDER_ZONE; } static void build_zonelists(pg_data_t *pgdat) { int node, local_node; enum zone_type j; struct zonelist *zonelist; local_node = pgdat->node_id; zonelist = &pgdat->node_zonelists[0]; j = build_zonelists_node(pgdat, zonelist, 0); /* * Now we build the zonelist so that it contains the zones * of all the other nodes. 
* We don't want to pressure a particular node, so when * building the zones for node N, we make sure that the * zones coming right after the local ones are those from * node N+1 (modulo N) */ for (node = local_node + 1; node < MAX_NUMNODES; node++) { if (!node_online(node)) continue; j = build_zonelists_node(NODE_DATA(node), zonelist, j); } for (node = 0; node < local_node; node++) { if (!node_online(node)) continue; j = build_zonelists_node(NODE_DATA(node), zonelist, j); } zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ static void build_zonelist_cache(pg_data_t *pgdat) { pgdat->node_zonelists[0].zlcache_ptr = NULL; } #endif /* CONFIG_NUMA */ /* * Boot pageset table. One per cpu which is going to be used for all * zones and all nodes. The parameters will be set in such a way * that an item put on a list will immediately be handed over to * the buddy list. This is safe since pageset manipulation is done * with interrupts disabled. * * The boot_pagesets must be kept even after bootup is complete for * unused processors and/or zones. They do play a role for bootstrapping * hotplugged processors. * * zoneinfo_show() and maybe other functions do * not check if the processor is online before following the pageset pointer. * Other parts of the kernel may not check if the zone is available. */ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static void setup_zone_pageset(struct zone *zone); /* * Global mutex to protect against size modification of zonelists * as well as to serialize pageset setup for the new populated zone. */ DEFINE_MUTEX(zonelists_mutex); /* return values int ....just for stop_machine() */ static int __build_all_zonelists(void *data) { int nid; int cpu; pg_data_t *self = data; #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); #endif if (self && !node_online(self->node_id)) { build_zonelists(self); build_zonelist_cache(self); } for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); build_zonelist_cache(pgdat); } /* * Initialize the boot_pagesets that are going to be used * for bootstrapping processors. The real pagesets for * each zone will be allocated later when the per cpu * allocator is available. * * boot_pagesets are used also for bootstrapping offline * cpus if the system is already booted because the pagesets * are needed to initialize allocators on a specific cpu too. * F.e. the percpu allocator needs the page allocator which * needs the percpu allocator in order to allocate its pagesets * (a chicken-egg dilemma). */ for_each_possible_cpu(cpu) { setup_pageset(&per_cpu(boot_pageset, cpu), 0); #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * We now know the "local memory node" for each node-- * i.e., the node of the first zone in the generic zonelist. * Set up numa_mem percpu variable for on-line cpus. During * boot, only the boot cpu should be on-line; we'll init the * secondary cpus' numa_mem as they come on-line. During * node/memory hotplug, we'll fixup all on-line cpus. */ if (cpu_online(cpu)) set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); #endif } return 0; } static noinline void __init build_all_zonelists_init(void) { __build_all_zonelists(NULL); mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } /* * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. 
* * __ref due to (1) call of __meminit annotated setup_zone_pageset * [we're only called with non-NULL zone through __meminit paths] and * (2) call of __init annotated helper build_all_zonelists_init * [protected by SYSTEM_BOOTING]. */ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) { set_zonelist_order(); if (system_state == SYSTEM_BOOTING) { build_all_zonelists_init(); } else { #ifdef CONFIG_MEMORY_HOTPLUG if (zone) setup_zone_pageset(zone); #endif /* we have to stop all cpus to guarantee there is no user of zonelist */ stop_machine(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); /* * Disable grouping by mobility if the number of pages in the * system is too low to allow the mechanism to work. It would be * more accurate, but expensive to check per-zone. This check is * made on memory-hotadd so a system can start with mobility * disabled and enable it later */ if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) page_group_by_mobility_disabled = 1; else page_group_by_mobility_disabled = 0; pr_info("Built %i zonelists in %s order, mobility grouping %s. " "Total pages: %ld\n", nr_online_nodes, zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); #ifdef CONFIG_NUMA pr_info("Policy zone: %s\n", zone_names[policy_zone]); #endif } /* * Helper functions to size the waitqueue hash table. * Essentially these want to choose hash table sizes sufficiently * large so that collisions trying to wait on pages are rare. * But in fact, the number of active page waitqueues on typical * systems is ridiculously low, less than 200. So this is even * conservative, even though it seems large. * * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to * waitqueues, i.e. the size of the waitq table given the number of pages. */ #define PAGES_PER_WAITQUEUE 256 #ifndef CONFIG_MEMORY_HOTPLUG static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) { unsigned long size = 1; pages /= PAGES_PER_WAITQUEUE; while (size < pages) size <<= 1; /* * Once we have dozens or even hundreds of threads sleeping * on IO we've got bigger problems than wait queue collision. * Limit the size of the wait table to a reasonable size. */ size = min(size, 4096UL); return max(size, 4UL); } #else /* * A zone's size might be changed by hot-add, so it is not possible to determine * a suitable size for its wait_table. So we use the maximum size now. * * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: * * i386 (preemption config) : 4096 x 16 = 64Kbyte. * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. * * The maximum entries are prepared when a zone's memory is (512K + 256) pages * or more by the traditional way. (See above). It equals: * * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. * ia64(16K page size) : = ( 8G + 4M)byte. * powerpc (64K page size) : = (32G +16M)byte. */ static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) { return 4096UL; } #endif /* * This is an integer logarithm so that shifts can be used later * to extract the more random high bits from the multiplicative * hash function before the remainder is taken. 
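* * For example, a power-of-two table of 256 entries gives wait_table_bits() == 8, since * ffz(~256) finds the first zero bit of ~256 at bit position 8.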
*/ static inline unsigned long wait_table_bits(unsigned long size) { return ffz(~size); } /* * Check if a pageblock contains reserved pages */ static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; for (pfn = start_pfn; pfn < end_pfn; pfn++) { if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) return 1; } return 0; } /* * Mark a number of pageblocks as MIGRATE_RESERVE. The number * of blocks reserved is based on min_wmark_pages(zone). The memory within * the reserve will tend to store contiguous free pages. Setting min_free_kbytes * higher will lead to a bigger reserve which will get freed as contiguous * blocks as reclaim kicks in */ static void setup_zone_migrate_reserve(struct zone *zone) { unsigned long start_pfn, pfn, end_pfn, block_end_pfn; struct page *page; unsigned long block_migratetype; int reserve; int old_reserve; /* * Get the start pfn, end pfn and the number of blocks to reserve * We have to be careful to be aligned to pageblock_nr_pages to * make sure that we always check pfn_valid for the first page in * the block. */ start_pfn = zone->zone_start_pfn; end_pfn = zone_end_pfn(zone); start_pfn = roundup(start_pfn, pageblock_nr_pages); reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; /* * Reserve blocks are generally in place to help high-order atomic * allocations that are short-lived. A min_free_kbytes value that * would result in more than 2 reserve blocks for atomic allocations * is assumed to be in place to help anti-fragmentation for the * future allocation of hugepages at runtime. */ reserve = min(2, reserve); old_reserve = zone->nr_migrate_reserve_block; /* When memory hot-add, we almost always need to do nothing */ if (reserve == old_reserve) return; zone->nr_migrate_reserve_block = reserve; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); /* Watch out for overlapping nodes */ if (page_to_nid(page) != zone_to_nid(zone)) continue; block_migratetype = get_pageblock_migratetype(page); /* Only test what is necessary when the reserves are not met */ if (reserve > 0) { /* * Blocks with reserved pages will never free, skip * them. */ block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); if (pageblock_is_reserved(pfn, block_end_pfn)) continue; /* If this block is reserved, account for it */ if (block_migratetype == MIGRATE_RESERVE) { reserve--; continue; } /* Suitable for reserving if this block is movable */ if (block_migratetype == MIGRATE_MOVABLE) { set_pageblock_migratetype(page, MIGRATE_RESERVE); move_freepages_block(zone, page, MIGRATE_RESERVE); reserve--; continue; } } else if (!old_reserve) { /* * At boot time we don't need to scan the whole zone * for turning off MIGRATE_RESERVE. */ break; } /* * If the reserve is met and this is a previous reserved block, * take it back */ if (block_migratetype == MIGRATE_RESERVE) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); move_freepages_block(zone, page, MIGRATE_MOVABLE); } } } /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. 
*/ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { struct page *page; unsigned long end_pfn = start_pfn + size; unsigned long pfn; struct zone *z; if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; z = &NODE_DATA(nid)->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s * handed to this function. They do not * exist on hotplugged memory. */ if (context == MEMMAP_EARLY) { if (!early_pfn_valid(pfn)) continue; if (!early_pfn_in_nid(pfn, nid)) continue; } page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); page_cpupid_reset_last(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations * to reserve their blocks rather than leaking throughout * the address space during boot when many long-lived * kernel allocations are made. Later some blocks near * the start are marked MIGRATE_RESERVE by * setup_zone_migrate_reserve() * * bitmap is created for zone's valid pfn range. but memmap * can be created for invalid pages (for alignment) * check here not to call set_pageblock_migratetype() against * pfn out of zone. */ if ((z->zone_start_pfn <= pfn) && (pfn < zone_end_pfn(z)) && !(pfn & (pageblock_nr_pages - 1))) set_pageblock_migratetype(page, MIGRATE_MOVABLE); INIT_LIST_HEAD(&page->lru); #ifdef WANT_PAGE_VIRTUAL /* The shift won't overflow because ZONE_NORMAL is below 4G. */ if (!is_highmem_idx(zone)) set_page_address(page, __va(pfn << PAGE_SHIFT)); #endif } } static void __meminit zone_init_free_lists(struct zone *zone) { unsigned int order, t; for_each_migratetype_order(order, t) { INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); zone->free_area[order].nr_free = 0; } } #ifndef __HAVE_ARCH_MEMMAP_INIT #define memmap_init(size, nid, zone, start_pfn) \ memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU int batch; /* * The per-cpu-pages pools are set to around 1000th of the * size of the zone. But no more than 1/2 of a meg. * * OK, so we don't know how big the cache is. So guess. */ batch = zone->managed_pages / 1024; if (batch * PAGE_SIZE > 512 * 1024) batch = (512 * 1024) / PAGE_SIZE; batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; /* * Clamp the batch to a 2^n - 1 value. Having a power * of 2 value was found to be more likely to have * suboptimal cache aliasing properties in some cases. * * For example if 2 tasks are alternately allocating * batches of pages, one task can end up with a lot * of pages of one half of the possible page colors * and the other with pages of the other colors. */ batch = rounddown_pow_of_two(batch + batch/2) - 1; return batch; #else /* The deferral and batching of frees should be suppressed under NOMMU * conditions. * * The problem is that NOMMU needs to be able to allocate large chunks * of contiguous memory as there's no hardware page translation to * assemble apparent contiguous memory from discontiguous pages. * * Queueing large contiguous runs of pages for batching, however, * causes the pages to actually be freed in smaller chunks. 
As there * can be a significant delay between the individual batches being * recycled, this leads to the once large chunks of space being * fragmented and becoming unavailable for high-order allocations. */ return 0; #endif } /* * pcp->high and pcp->batch values are related and dependent on one another: * ->batch must never be higher than ->high. * The following function updates them in a safe manner without read side * locking. * * Any new users of pcp->batch and pcp->high should ensure they can cope with * those fields changing asynchronously (according to the above rule). * * mutex_is_locked(&pcp_batch_high_lock) required when calling this function * outside of boot time (or some other assurance that no concurrent updaters * exist). */ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, unsigned long batch) { /* start with a fail safe value for batch */ pcp->batch = 1; smp_wmb(); /* Update high, then batch, in order */ pcp->high = high; smp_wmb(); pcp->batch = batch; } /* a companion to pageset_set_high() */ static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) { pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); } static void pageset_init(struct per_cpu_pageset *p) { struct per_cpu_pages *pcp; int migratetype; memset(p, 0, sizeof(*p)); pcp = &p->pcp; pcp->count = 0; for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) { pageset_init(p); pageset_set_batch(p, batch); } /* * pageset_set_high() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. */ static void pageset_set_high(struct per_cpu_pageset *p, unsigned long high) { unsigned long batch = max(1UL, high / 4); if ((high / 4) > (PAGE_SHIFT * 8)) batch = PAGE_SHIFT * 8; pageset_update(&p->pcp, high, batch); } static void pageset_set_high_and_batch(struct zone *zone, struct per_cpu_pageset *pcp) { if (percpu_pagelist_fraction) pageset_set_high(pcp, (zone->managed_pages / percpu_pagelist_fraction)); else pageset_set_batch(pcp, zone_batchsize(zone)); } static void __meminit zone_pageset_init(struct zone *zone, int cpu) { struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); pageset_init(pcp); pageset_set_high_and_batch(zone, pcp); } static void __meminit setup_zone_pageset(struct zone *zone) { int cpu; zone->pageset = alloc_percpu(struct per_cpu_pageset); for_each_possible_cpu(cpu) zone_pageset_init(zone, cpu); } /* * Allocate per cpu pagesets and initialize them. * Before this call only boot pagesets were available. */ void __init setup_per_cpu_pageset(void) { struct zone *zone; for_each_populated_zone(zone) setup_zone_pageset(zone); } static noinline __init_refok int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; size_t alloc_size; /* * The per-page waitqueue mechanism uses hashed waitqueues * per zone. */ zone->wait_table_hash_nr_entries = wait_table_hash_nr_entries(zone_size_pages); zone->wait_table_bits = wait_table_bits(zone->wait_table_hash_nr_entries); alloc_size = zone->wait_table_hash_nr_entries * sizeof(wait_queue_head_t); if (!slab_is_available()) { zone->wait_table = (wait_queue_head_t *) memblock_virt_alloc_node_nopanic( alloc_size, zone->zone_pgdat->node_id); } else { /* * This case means that a zone whose size was 0 gets new memory * via memory hot-add. * But it may be the case that a new node was hot-added.
In * this case vmalloc() will not be able to use this new node's * memory - this wait_table must be initialized to use this new * node itself as well. * To use this new node's memory, further consideration will be * necessary. */ zone->wait_table = vmalloc(alloc_size); } if (!zone->wait_table) return -ENOMEM; for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) init_waitqueue_head(zone->wait_table + i); return 0; } static __meminit void zone_pcp_init(struct zone *zone) { /* * per cpu subsystem is not up at this point. The following code * relies on the ability of the linker to provide the * offset of a (static) per cpu variable into the per cpu area. */ zone->pageset = &boot_pageset; if (populated_zone(zone)) printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", zone->name, zone->present_pages, zone_batchsize(zone)); } int __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long size, enum memmap_context context) { struct pglist_data *pgdat = zone->zone_pgdat; int ret; ret = zone_wait_table_init(zone, size); if (ret) return ret; pgdat->nr_zones = zone_idx(zone) + 1; zone->zone_start_pfn = zone_start_pfn; mminit_dprintk(MMINIT_TRACE, "memmap_init", "Initialising map node %d zone %lu pfns %lu -> %lu\n", pgdat->node_id, (unsigned long)zone_idx(zone), zone_start_pfn, (zone_start_pfn + size)); zone_init_free_lists(zone); return 0; } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. */ int __meminit __early_pfn_to_nid(unsigned long pfn) { unsigned long start_pfn, end_pfn; int nid; /* * NOTE: The following SMP-unsafe globals are only used early in boot * when the kernel is running single-threaded. */ static unsigned long __meminitdata last_start_pfn, last_end_pfn; static int __meminitdata last_nid; if (last_start_pfn <= pfn && pfn < last_end_pfn) return last_nid; nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); if (nid != -1) { last_start_pfn = start_pfn; last_end_pfn = end_pfn; last_nid = nid; } return nid; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ int __meminit early_pfn_to_nid(unsigned long pfn) { int nid; nid = __early_pfn_to_nid(pfn); if (nid >= 0) return nid; /* just returns 0 */ return 0; } #ifdef CONFIG_NODES_SPAN_OTHER_NODES bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { int nid; nid = __early_pfn_to_nid(pfn); if (nid >= 0 && nid != node) return false; return true; } #endif /** * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid * * If an architecture guarantees that all ranges registered contain no holes * and may be freed, this function may be used instead of calling * memblock_free_early_nid() manually. */ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { unsigned long start_pfn, end_pfn; int i, this_nid; for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { start_pfn = min(start_pfn, max_low_pfn); end_pfn = min(end_pfn, max_low_pfn); if (start_pfn < end_pfn) memblock_free_early_nid(PFN_PHYS(start_pfn), (end_pfn - start_pfn) << PAGE_SHIFT, this_nid); } } /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
* * If an architecture guarantees that all ranges registered contain no holes and may * be freed, this function may be used instead of calling memory_present() manually. */ void __init sparse_memory_present_with_active_regions(int nid) { unsigned long start_pfn, end_pfn; int i, this_nid; for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) memory_present(this_nid, start_pfn, end_pfn); } /** * get_pfn_range_for_nid - Return the start and end page frames for a node * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. * @start_pfn: Passed by reference. On return, it will have the node start_pfn. * @end_pfn: Passed by reference. On return, it will have the node end_pfn. * * It returns the start and end page frame of a node based on information * provided by memblock_set_node(). If called for a node * with no available memory, a warning is printed and the start and end * PFNs will be 0. */ void __meminit get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) { unsigned long this_start_pfn, this_end_pfn; int i; *start_pfn = -1UL; *end_pfn = 0; for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { *start_pfn = min(*start_pfn, this_start_pfn); *end_pfn = max(*end_pfn, this_end_pfn); } if (*start_pfn == -1UL) *start_pfn = 0; } /* * This finds a zone that can be used for ZONE_MOVABLE pages. The * assumption is made that zones within a node are ordered in monotonic * increasing memory addresses so that the "highest" populated zone is used */ static void __init find_usable_zone_for_movable(void) { int zone_index; for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { if (zone_index == ZONE_MOVABLE) continue; if (arch_zone_highest_possible_pfn[zone_index] > arch_zone_lowest_possible_pfn[zone_index]) break; } VM_BUG_ON(zone_index == -1); movable_zone = zone_index; } /* * The zone ranges provided by the architecture do not include ZONE_MOVABLE * because it is sized independent of architecture. Unlike the other zones, * the starting point for ZONE_MOVABLE is not fixed. It may be different * in each node depending on the size of each node and how evenly kernelcore * is distributed. This helper function adjusts the zone ranges * provided by the architecture for a given node by using the end of the * highest usable zone for ZONE_MOVABLE. 
This preserves the assumption that * zones within a node are in order of monotonically increasing memory addresses */ static void __meminit adjust_zone_range_for_zone_movable(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zone_start_pfn, unsigned long *zone_end_pfn) { /* Only adjust if ZONE_MOVABLE is on this node */ if (zone_movable_pfn[nid]) { /* Size ZONE_MOVABLE */ if (zone_type == ZONE_MOVABLE) { *zone_start_pfn = zone_movable_pfn[nid]; *zone_end_pfn = min(node_end_pfn, arch_zone_highest_possible_pfn[movable_zone]); /* Adjust for ZONE_MOVABLE starting within this range */ } else if (*zone_start_pfn < zone_movable_pfn[nid] && *zone_end_pfn > zone_movable_pfn[nid]) { *zone_end_pfn = zone_movable_pfn[nid]; /* Check if this whole range is within ZONE_MOVABLE */ } else if (*zone_start_pfn >= zone_movable_pfn[nid]) *zone_start_pfn = *zone_end_pfn; } } /* * Return the number of pages a zone spans in a node, including holes * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() */ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *ignored) { unsigned long zone_start_pfn, zone_end_pfn; /* Get the start and end of the zone */ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, &zone_start_pfn, &zone_end_pfn); /* Check that this node has pages within the zone's required range */ if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) return 0; /* Move the zone boundaries inside the node if necessary */ zone_end_pfn = min(zone_end_pfn, node_end_pfn); zone_start_pfn = max(zone_start_pfn, node_start_pfn); /* Return the spanned pages */ return zone_end_pfn - zone_start_pfn; } /* * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. */ unsigned long __meminit __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { unsigned long nr_absent = range_end_pfn - range_start_pfn; unsigned long start_pfn, end_pfn; int i; for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); nr_absent -= end_pfn - start_pfn; } return nr_absent; } /** * absent_pages_in_range - Return number of page frames in holes within a range * @start_pfn: The start PFN to start searching for holes * @end_pfn: The end PFN to stop searching for holes * * It returns the number of page frames in memory holes within a range.
*/ unsigned long __init absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn) { return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); } /* Return the number of page frames in holes in a zone on a node */ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *ignored) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long zone_start_pfn, zone_end_pfn; zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, &zone_start_pfn, &zone_end_pfn); return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); } #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zones_size) { return zones_size[zone_type]; } static inline unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zholes_size) { if (!zholes_size) return 0; return zholes_size[zone_type]; } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zones_size, unsigned long *zholes_size) { unsigned long realtotalpages, totalpages = 0; enum zone_type i; for (i = 0; i < MAX_NR_ZONES; i++) totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, zones_size); pgdat->node_spanned_pages = totalpages; realtotalpages = totalpages; for (i = 0; i < MAX_NR_ZONES; i++) realtotalpages -= zone_absent_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, zholes_size); pgdat->node_present_pages = realtotalpages; printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); } #ifndef CONFIG_SPARSEMEM /* * Calculate the size of the zone->blockflags rounded to an unsigned long * Start by making sure zonesize is a multiple of pageblock_order by rounding * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally * round what is now in bits to nearest long in bits, then return it in * bytes. 
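* * Worked example (assuming 4KiB pages, pageblock_order == 9 and NR_PAGEBLOCK_BITS == 4): a * 1GiB zone spans 262144 pages, i.e. 512 pageblocks, which need 512 * 4 = 2048 bits, so * usemap_size() returns 256 bytes.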
*/ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) { unsigned long usemapsize; zonesize += zone_start_pfn & (pageblock_nr_pages-1); usemapsize = roundup(zonesize, pageblock_nr_pages); usemapsize = usemapsize >> pageblock_order; usemapsize *= NR_PAGEBLOCK_BITS; usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); return usemapsize / 8; } static void __init setup_usemap(struct pglist_data *pgdat, struct zone *zone, unsigned long zone_start_pfn, unsigned long zonesize) { unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; if (usemapsize) zone->pageblock_flags = memblock_virt_alloc_node_nopanic(usemapsize, pgdat->node_id); } #else static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, unsigned long zone_start_pfn, unsigned long zonesize) {} #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __paginginit set_pageblock_order(void) { unsigned int order; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) return; if (HPAGE_SHIFT > PAGE_SHIFT) order = HUGETLB_PAGE_ORDER; else order = MAX_ORDER - 1; /* * Assume the largest contiguous order of interest is a huge page. * This value may be variable depending on boot parameters on IA64 and * powerpc. */ pageblock_order = order; } #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ /* * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() * is unused as pageblock_order is set at compile-time. See * include/linux/pageblock-flags.h for the values of pageblock_order based on * the kernel config */ void __paginginit set_pageblock_order(void) { } #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, unsigned long present_pages) { unsigned long pages = spanned_pages; /* * Provide a more accurate estimation if there are holes within * the zone and SPARSEMEM is in use. If there are holes within the * zone, each populated memory region may cost us one or two extra * memmap pages due to alignment because memmap pages for each * populated region may not be naturally aligned on a page boundary. * So the (present_pages >> 4) heuristic is a tradeoff for that. */ if (spanned_pages > present_pages + (present_pages >> 4) && IS_ENABLED(CONFIG_SPARSEMEM)) pages = present_pages; return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; } /* * Set up the zone data structures: * - mark all pages reserved * - mark all memory queues empty * - clear the memory bitmaps * * NOTE: pgdat should get zeroed by caller.
*/ static void __paginginit free_area_init_core(struct pglist_data *pgdat, unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zones_size, unsigned long *zholes_size) { enum zone_type j; int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; int ret; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING spin_lock_init(&pgdat->numabalancing_migrate_lock); pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies; #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_ext_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; size = zone_spanned_pages_in_node(nid, j, node_start_pfn, node_end_pfn, zones_size); realsize = freesize = size - zone_absent_pages_in_node(nid, j, node_start_pfn, node_end_pfn, zholes_size); /* * Adjust freesize so that it accounts for how much memory * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ memmap_pages = calc_memmap_size(size, realsize); if (!is_highmem_idx(j)) { if (freesize >= memmap_pages) { freesize -= memmap_pages; if (memmap_pages) printk(KERN_DEBUG " %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else printk(KERN_WARNING " %s zone: %lu pages exceeds freesize %lu\n", zone_names[j], memmap_pages, freesize); } /* Account for reserved pages */ if (j == 0 && freesize > dma_reserve) { freesize -= dma_reserve; printk(KERN_DEBUG " %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); } if (!is_highmem_idx(j)) nr_kernel_pages += freesize; /* Charge for highmem memmap if there are enough kernel pages */ else if (nr_kernel_pages > memmap_pages * 2) nr_kernel_pages -= memmap_pages; nr_all_pages += freesize; zone->spanned_pages = size; zone->present_pages = realsize; /* * Set an approximate value for lowmem here, it will be adjusted * when the bootmem allocator frees pages into the buddy system. * And all highmem pages will be managed by the buddy system. */ zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) / 100; zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); zone_seqlock_init(zone); zone->zone_pgdat = pgdat; zone_pcp_init(zone); /* For bootup, initialized properly in watermark setup */ mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); lruvec_init(&zone->lruvec); if (!size) continue; set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY); BUG_ON(ret); memmap_init(size, nid, j, zone_start_pfn); zone_start_pfn += size; } } static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) { /* Skip empty nodes */ if (!pgdat->node_spanned_pages) return; #ifdef CONFIG_FLAT_NODE_MEM_MAP /* ia64 gets its own node_mem_map, before this, without bootmem */ if (!pgdat->node_mem_map) { unsigned long size, start, end; struct page *map; /* * The zone's endpoints aren't required to be MAX_ORDER * aligned but the node_mem_map endpoints must be in order * for the buddy allocator to function correctly. 
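* For example (illustrative, MAX_ORDER_NR_PAGES == 1024): a node spanning pfns 1500-9000 gets * a node_mem_map covering pfns 1024-9216, so the map's endpoints are MAX_ORDER aligned even * though the node's are not.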
*/ start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifndef CONFIG_NEED_MULTIPLE_NODES /* * With no DISCONTIG, the global mem_map is just set as node 0's */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP if (page_to_pfn(mem_map) != pgdat->node_start_pfn) mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif #endif /* CONFIG_FLAT_NODE_MEM_MAP */ } void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { pg_data_t *pgdat = NODE_DATA(nid); unsigned long start_pfn = 0; unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); alloc_node_mem_map(pgdat); #ifdef CONFIG_FLAT_NODE_MEM_MAP printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", nid, (unsigned long)pgdat, (unsigned long)pgdat->node_mem_map); #endif free_area_init_core(pgdat, start_pfn, end_pfn, zones_size, zholes_size); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #if MAX_NUMNODES > 1 /* * Figure out the number of possible node ids. */ void __init setup_nr_node_ids(void) { unsigned int node; unsigned int highest = 0; for_each_node_mask(node, node_possible_map) highest = node; nr_node_ids = highest + 1; } #endif /** * node_map_pfn_alignment - determine the maximum internode alignment * * This function should be called after node map is populated and sorted. * It calculates the maximum power of two alignment which can distinguish * all the nodes. * * For example, if all nodes are 1GiB and aligned to 1GiB, the return value * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is * shifted, 1GiB is enough and this function will indicate so. * * This is used to test whether pfn -> nid mapping of the chosen memory * model has fine enough granularity to avoid incorrect mapping for the * populated node map. * * Returns the determined alignment in pfn's. 0 if there is no alignment * requirement (single node). */ unsigned long __init node_map_pfn_alignment(void) { unsigned long accl_mask = 0, last_end = 0; unsigned long start, end, mask; int last_nid = -1; int i, nid; for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { if (!start || last_nid < 0 || last_nid == nid) { last_nid = nid; last_end = end; continue; } /* * Start with a mask granular enough to pin-point to the * start pfn and tick off bits one-by-one until it becomes * too coarse to separate the current node from the last. 
*/ mask = ~((1 << __ffs(start)) - 1); while (mask && last_end <= (start & (mask << 1))) mask <<= 1; /* accumulate all internode masks */ accl_mask |= mask; } /* convert mask to number of pages */ return ~accl_mask + 1; } /* Find the lowest pfn for a node */ static unsigned long __init find_min_pfn_for_node(int nid) { unsigned long min_pfn = ULONG_MAX; unsigned long start_pfn; int i; for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) min_pfn = min(min_pfn, start_pfn); if (min_pfn == ULONG_MAX) { printk(KERN_WARNING "Could not find start_pfn for node %d\n", nid); return 0; } return min_pfn; } /** * find_min_pfn_with_active_regions - Find the minimum PFN registered * * It returns the minimum PFN based on information provided via * memblock_set_node(). */ unsigned long __init find_min_pfn_with_active_regions(void) { return find_min_pfn_for_node(MAX_NUMNODES); } /* * early_calculate_totalpages() * Sum pages in active regions for movable zone. * Populate N_MEMORY for calculating usable_nodes. */ static unsigned long __init early_calculate_totalpages(void) { unsigned long totalpages = 0; unsigned long start_pfn, end_pfn; int i, nid; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { unsigned long pages = end_pfn - start_pfn; totalpages += pages; if (pages) node_set_state(nid, N_MEMORY); } return totalpages; } /* * Find the PFN the Movable zone begins in each node. Kernel memory * is spread evenly between nodes as long as the nodes have enough * memory. When they don't, some nodes will have more kernelcore than * others */ static void __init find_zone_movable_pfns_for_nodes(void) { int i, nid; unsigned long usable_startpfn; unsigned long kernelcore_node, kernelcore_remaining; /* save the state before borrow the nodemask */ nodemask_t saved_node_state = node_states[N_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_MEMORY]); struct memblock_region *r; /* Need to find movable_zone earlier when movable_node is specified. */ find_usable_zone_for_movable(); /* * If movable_node is specified, ignore kernelcore and movablecore * options. */ if (movable_node_is_enabled()) { for_each_memblock(memory, r) { if (!memblock_is_hotpluggable(r)) continue; nid = r->nid; usable_startpfn = PFN_DOWN(r->base); zone_movable_pfn[nid] = zone_movable_pfn[nid] ? min(usable_startpfn, zone_movable_pfn[nid]) : usable_startpfn; } goto out2; } /* * If movablecore=nn[KMG] was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore * and movablecore are specified, then the value of kernelcore * will be used for required_kernelcore if it's greater than * what movablecore would have allowed. 
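	 * For example (illustrative): with totalpages equivalent to 1024MB
	 * and movablecore=256M, corepages below works out to the remaining
	 * 768MB worth of pages, and required_kernelcore is raised to at
	 * least that.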
*/ if (required_movablecore) { unsigned long corepages; /* * Round-up so that ZONE_MOVABLE is at least as large as what * was requested by the user */ required_movablecore = roundup(required_movablecore, MAX_ORDER_NR_PAGES); corepages = totalpages - required_movablecore; required_kernelcore = max(required_kernelcore, corepages); } /* If kernelcore was not specified, there is no ZONE_MOVABLE */ if (!required_kernelcore) goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; restart: /* Spread kernelcore memory as evenly as possible throughout nodes */ kernelcore_node = required_kernelcore / usable_nodes; for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; /* * Recalculate kernelcore_node if the division per node * now exceeds what is necessary to satisfy the requested * amount of memory for the kernel */ if (required_kernelcore < kernelcore_node) kernelcore_node = required_kernelcore / usable_nodes; /* * As the map is walked, we track how much memory is usable * by the kernel using kernelcore_remaining. When it is * 0, the rest of the node is usable by ZONE_MOVABLE */ kernelcore_remaining = kernelcore_node; /* Go through each range of PFNs within this node */ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { unsigned long size_pages; start_pfn = max(start_pfn, zone_movable_pfn[nid]); if (start_pfn >= end_pfn) continue; /* Account for what is only usable for kernelcore */ if (start_pfn < usable_startpfn) { unsigned long kernel_pages; kernel_pages = min(end_pfn, usable_startpfn) - start_pfn; kernelcore_remaining -= min(kernel_pages, kernelcore_remaining); required_kernelcore -= min(kernel_pages, required_kernelcore); /* Continue if range is now fully accounted */ if (end_pfn <= usable_startpfn) { /* * Push zone_movable_pfn to the end so * that if we have to rebalance * kernelcore across nodes, we will * not double account here */ zone_movable_pfn[nid] = end_pfn; continue; } start_pfn = usable_startpfn; } /* * The usable PFN range for ZONE_MOVABLE is from * start_pfn->end_pfn. Calculate size_pages as the * number of pages used as kernelcore */ size_pages = end_pfn - start_pfn; if (size_pages > kernelcore_remaining) size_pages = kernelcore_remaining; zone_movable_pfn[nid] = start_pfn + size_pages; /* * Some kernelcore has been met, update counts and * break if the kernelcore for this node has been * satisfied */ required_kernelcore -= min(required_kernelcore, size_pages); kernelcore_remaining -= size_pages; if (!kernelcore_remaining) break; } } /* * If there is still required_kernelcore, we do another pass with one * less node in the count. This will push zone_movable_pfn[nid] further * along on the nodes that still have memory until kernelcore is * satisfied */ usable_nodes--; if (usable_nodes && required_kernelcore > usable_nodes) goto restart; out2: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); out: /* restore the node_state */ node_states[N_MEMORY] = saved_node_state; } /* Any regular or high memory on that node ? 
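 * If so, mark the node N_HIGH_MEMORY (and N_NORMAL_MEMORY when a zone at or
 * below ZONE_NORMAL is populated); see check_for_memory() below.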
*/ static void check_for_memory(pg_data_t *pgdat, int nid) { enum zone_type zone_type; if (N_MEMORY == N_NORMAL_MEMORY) return; for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; if (populated_zone(zone)) { node_set_state(nid, N_HIGH_MEMORY); if (N_NORMAL_MEMORY != N_HIGH_MEMORY && zone_type <= ZONE_NORMAL) node_set_state(nid, N_NORMAL_MEMORY); break; } } } /** * free_area_init_nodes - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. * Using the page ranges provided by memblock_set_node(), the size of each * zone in each node and their holes is calculated. If the maximum PFN * between two adjacent zones match, it is assumed that the zone is empty. * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed * that arch_max_dma32_pfn has no pages. It is also assumed that a zone * starts where the previous one ended. For example, ZONE_DMA32 starts * at arch_max_dma_pfn. */ void __init free_area_init_nodes(unsigned long *max_zone_pfn) { unsigned long start_pfn, end_pfn; int i, nid; /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, sizeof(arch_zone_lowest_possible_pfn)); memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn)); arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; for (i = 1; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; arch_zone_lowest_possible_pfn[i] = arch_zone_highest_possible_pfn[i-1]; arch_zone_highest_possible_pfn[i] = max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); } arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; /* Find the PFNs that ZONE_MOVABLE begins at in each node */ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); find_zone_movable_pfns_for_nodes(); /* Print out the zone ranges */ pr_info("Zone ranges:\n"); for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; pr_info(" %-8s ", zone_names[i]); if (arch_zone_lowest_possible_pfn[i] == arch_zone_highest_possible_pfn[i]) pr_cont("empty\n"); else pr_cont("[mem %#018Lx-%#018Lx]\n", (u64)arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, ((u64)arch_zone_highest_possible_pfn[i] << PAGE_SHIFT) - 1); } /* Print out the PFNs ZONE_MOVABLE begins at in each node */ pr_info("Movable zone start for each node\n"); for (i = 0; i < MAX_NUMNODES; i++) { if (zone_movable_pfn[i]) pr_info(" Node %d: %#018Lx\n", i, (u64)zone_movable_pfn[i] << PAGE_SHIFT); } /* Print out the early node map */ pr_info("Early memory node ranges\n"); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); free_area_init_node(nid, NULL, find_min_pfn_for_node(nid), NULL); /* Any memory on that node */ if (pgdat->node_present_pages) node_set_state(nid, N_MEMORY); check_for_memory(pgdat, nid); } } static int __init cmdline_parse_core(char *p, unsigned long *core) { unsigned long long coremem; if (!p) return -EINVAL; coremem = memparse(p, &p); *core = coremem >> PAGE_SHIFT; /* Paranoid check that UL is enough for the coremem value */ WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 
return 0; } /* * kernelcore=size sets the amount of memory for use for allocations that * cannot be reclaimed or migrated. */ static int __init cmdline_parse_kernelcore(char *p) { return cmdline_parse_core(p, &required_kernelcore); } /* * movablecore=size sets the amount of memory for use for allocations that * can be reclaimed or migrated. */ static int __init cmdline_parse_movablecore(char *p) { return cmdline_parse_core(p, &required_movablecore); } early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ void adjust_managed_page_count(struct page *page, long count) { spin_lock(&managed_page_count_lock); page_zone(page)->managed_pages += count; totalram_pages += count; #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) totalhigh_pages += count; #endif spin_unlock(&managed_page_count_lock); } EXPORT_SYMBOL(adjust_managed_page_count); unsigned long free_reserved_area(void *start, void *end, int poison, char *s) { void *pos; unsigned long pages = 0; start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { if ((unsigned int)poison <= 0xFF) memset(pos, poison, PAGE_SIZE); free_reserved_page(virt_to_page(pos)); } if (pages && s) pr_info("Freeing %s memory: %ldK (%p - %p)\n", s, pages << (PAGE_SHIFT - 10), start, end); return pages; } EXPORT_SYMBOL(free_reserved_area); #ifdef CONFIG_HIGHMEM void free_highmem_page(struct page *page) { __free_reserved_page(page); totalram_pages++; page_zone(page)->managed_pages++; totalhigh_pages++; } #endif void __init mem_init_print_info(const char *str) { unsigned long physpages, codesize, datasize, rosize, bss_size; unsigned long init_code_size, init_data_size; physpages = get_num_physpages(); codesize = _etext - _stext; datasize = _edata - _sdata; rosize = __end_rodata - __start_rodata; bss_size = __bss_stop - __bss_start; init_data_size = __init_end - __init_begin; init_code_size = _einittext - _sinittext; /* * Detect special cases and adjust section sizes accordingly: * 1) .init.* may be embedded into .data sections * 2) .init.text.* may be out of [__init_begin, __init_end], * please refer to arch/tile/kernel/vmlinux.lds.S. * 3) .rodata.* may be embedded into .text or .data sections. */ #define adj_init_size(start, end, size, pos, adj) \ do { \ if (start <= pos && pos < end && size > adj) \ size -= adj; \ } while (0) adj_init_size(__init_begin, __init_end, init_data_size, _sinittext, init_code_size); adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); #undef adj_init_size pr_info("Memory: %luK/%luK available " "(%luK kernel code, %luK rwdata, %luK rodata, " "%luK init, %luK bss, %luK reserved, %luK cma-reserved" #ifdef CONFIG_HIGHMEM ", %luK highmem" #endif "%s%s)\n", nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10), totalcma_pages << (PAGE_SHIFT-10), #ifdef CONFIG_HIGHMEM totalhigh_pages << (PAGE_SHIFT-10), #endif str ? ", " : "", str ? 
str : ""); } /** * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved * * The per-cpu batchsize and zone watermarks are determined by present_pages. * In the DMA zone, a significant percentage may be consumed by kernel image * and other unfreeable allocations which can skew the watermarks badly. This * function may optionally be used to account for unfreeable pages in the * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and * smaller per-cpu batchsize. */ void __init set_dma_reserve(unsigned long new_dma_reserve) { dma_reserve = new_dma_reserve; } void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } static int page_alloc_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { int cpu = (unsigned long)hcpu; if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { lru_add_drain_cpu(cpu); drain_pages(cpu); /* * Spill the event counters of the dead processor * into the current processors event counters. * This artificially elevates the count of the current * processor. */ vm_events_fold_cpu(cpu); /* * Zero the differential counters of the dead processor * so that the vm statistics are consistent. * * This is only okay since the processor is dead and cannot * race with what we are doing. */ cpu_vm_stats_fold(cpu); } return NOTIFY_OK; } void __init page_alloc_init(void) { hotcpu_notifier(page_alloc_cpu_notify, 0); local_irq_lock_init(pa_lock); } /* * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio * or min_free_kbytes changes. */ static void calculate_totalreserve_pages(void) { struct pglist_data *pgdat; unsigned long reserve_pages = 0; enum zone_type i, j; for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; long max = 0; /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { if (zone->lowmem_reserve[j] > max) max = zone->lowmem_reserve[j]; } /* we treat the high watermark as reserved pages. */ max += high_wmark_pages(zone); if (max > zone->managed_pages) max = zone->managed_pages; reserve_pages += max; /* * Lowmem reserves are not available to * GFP_HIGHUSER page cache allocations and * kswapd tries to balance zones to their high * watermark. As a result, neither should be * regarded as dirtyable memory, to prevent a * situation where reclaim has to clean pages * in order to balance the zones. */ zone->dirty_balance_reserve = max; } } dirty_balance_reserve = reserve_pages; totalreserve_pages = reserve_pages; } /* * setup_per_zone_lowmem_reserve - called whenever * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone * has a correct pages reserved value, so an adequate number of * pages are left in the zone after a successful __alloc_pages(). 
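 * For example (illustrative): with sysctl_lowmem_reserve_ratio[DMA] == 256,
 * an allocation that targets a higher zone but falls back to ZONE_DMA must
 * leave roughly (managed pages of the zones above it) / 256 pages of
 * ZONE_DMA free, as computed below.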
*/ static void setup_per_zone_lowmem_reserve(void) { struct pglist_data *pgdat; enum zone_type j, idx; for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long managed_pages = zone->managed_pages; zone->lowmem_reserve[j] = 0; idx = j; while (idx) { struct zone *lower_zone; idx--; if (sysctl_lowmem_reserve_ratio[idx] < 1) sysctl_lowmem_reserve_ratio[idx] = 1; lower_zone = pgdat->node_zones + idx; lower_zone->lowmem_reserve[j] = managed_pages / sysctl_lowmem_reserve_ratio[idx]; managed_pages += lower_zone->managed_pages; } } } /* update totalreserve_pages */ calculate_totalreserve_pages(); } static void __setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; struct zone *zone; unsigned long flags; /* Calculate total number of !ZONE_HIGHMEM pages */ for_each_zone(zone) { if (!is_highmem(zone)) lowmem_pages += zone->managed_pages; } for_each_zone(zone) { u64 tmp; spin_lock_irqsave(&zone->lock, flags); tmp = (u64)pages_min * zone->managed_pages; do_div(tmp, lowmem_pages); if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't * need highmem pages, so cap pages_min to a small * value here. * * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) * deltas control asynch page reclaim, and so should * not be capped for highmem. */ unsigned long min_pages; min_pages = zone->managed_pages / 1024; min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); zone->watermark[WMARK_MIN] = min_pages; } else { /* * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ zone->watermark[WMARK_MIN] = tmp; } zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); __mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } /* update totalreserve_pages */ calculate_totalreserve_pages(); } /** * setup_per_zone_wmarks - called when min_free_kbytes changes * or when memory is hot-{added|removed} * * Ensures that the watermark[min,low,high] values for each zone are set * correctly with respect to min_free_kbytes. */ void setup_per_zone_wmarks(void) { mutex_lock(&zonelists_mutex); __setup_per_zone_wmarks(); mutex_unlock(&zonelists_mutex); } /* * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance * to be referenced again before it is swapped out. * * The inactive_anon ratio is the target ratio of ACTIVE_ANON to * INACTIVE_ANON pages on this zone's LRU, maintained by the * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of * the anonymous pages are kept on the inactive list. * * total target max * memory ratio inactive anon * ------------------------------------- * 10MB 1 5MB * 100MB 1 50MB * 1GB 3 250MB * 10GB 10 0.9GB * 100GB 31 3GB * 1TB 101 10GB * 10TB 320 32GB */ static void __meminit calculate_zone_inactive_ratio(struct zone *zone) { unsigned int gb, ratio; /* Zone size in gigabytes */ gb = zone->managed_pages >> (30 - PAGE_SHIFT); if (gb) ratio = int_sqrt(10 * gb); else ratio = 1; zone->inactive_ratio = ratio; } static void __meminit setup_per_zone_inactive_ratio(void) { struct zone *zone; for_each_zone(zone) calculate_zone_inactive_ratio(zone); } /* * Initialise min_free_kbytes. 
 *
 * For small machines we want it small (128k min). For large machines
 * we want it large (64MB max). But it is not linear, because network
 * bandwidth does not increase linearly with machine size. We use
 *
 *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
 *
 * which yields
 *
 * 16MB:	512k
 * 32MB:	724k
 * 64MB:	1024k
 * 128MB:	1448k
 * 256MB:	2048k
 * 512MB:	2896k
 * 1024MB:	4096k
 * 2048MB:	5792k
 * 4096MB:	8192k
 * 8192MB:	11584k
 * 16384MB:	16384k
 */
int __meminit init_per_zone_wmark_min(void)
{
	unsigned long lowmem_kbytes;
	int new_min_free_kbytes;

	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);

	if (new_min_free_kbytes > user_min_free_kbytes) {
		min_free_kbytes = new_min_free_kbytes;
		if (min_free_kbytes < 128)
			min_free_kbytes = 128;
		if (min_free_kbytes > 65536)
			min_free_kbytes = 65536;
	} else {
		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
				new_min_free_kbytes, user_min_free_kbytes);
	}
	setup_per_zone_wmarks();
	refresh_zone_stat_thresholds();
	setup_per_zone_lowmem_reserve();
	setup_per_zone_inactive_ratio();
	return 0;
}
module_init(init_per_zone_wmark_min)

/*
 * min_free_kbytes_sysctl_handler - just a wrapper around
 * proc_dointvec_minmax() so that we can call two helper functions
 * whenever min_free_kbytes changes.
 */
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	int rc;

	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;

	if (write) {
		user_min_free_kbytes = min_free_kbytes;
		setup_per_zone_wmarks();
	}
	return 0;
}

#ifdef CONFIG_NUMA
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	struct zone *zone;
	int rc;

	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;

	for_each_zone(zone)
		zone->min_unmapped_pages = (zone->managed_pages *
				sysctl_min_unmapped_ratio) / 100;
	return 0;
}

int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	struct zone *zone;
	int rc;

	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;

	for_each_zone(zone)
		zone->min_slab_pages = (zone->managed_pages *
				sysctl_min_slab_ratio) / 100;
	return 0;
}
#endif

/*
 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
 * proc_dointvec_minmax() so that we can call setup_per_zone_lowmem_reserve()
 * whenever sysctl_lowmem_reserve_ratio changes.
 *
 * The reserve ratio has no relation to the minimum watermarks; it only
 * makes sense as a function of the boot time zone sizes.
 */
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec_minmax(table, write, buffer, length, ppos);
	setup_per_zone_lowmem_reserve();
	return 0;
}

/*
 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
 * cpu.  It is the fraction of total pages in each zone that a hot per cpu
 * pagelist can have before it gets flushed back to buddy allocator.
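 * For example (illustrative): writing 8 to this sysctl sets each CPU's
 * pcp->high for a zone to roughly zone->managed_pages / 8; see
 * pageset_set_high_and_batch() used by the handler below.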
*/ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int old_percpu_pagelist_fraction; int ret; mutex_lock(&pcp_batch_high_lock); old_percpu_pagelist_fraction = percpu_pagelist_fraction; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || ret < 0) goto out; /* Sanity checking to avoid pcp imbalance */ if (percpu_pagelist_fraction && percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { percpu_pagelist_fraction = old_percpu_pagelist_fraction; ret = -EINVAL; goto out; } /* No change? */ if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) goto out; for_each_populated_zone(zone) { unsigned int cpu; for_each_possible_cpu(cpu) pageset_set_high_and_batch(zone, per_cpu_ptr(zone->pageset, cpu)); } out: mutex_unlock(&pcp_batch_high_lock); return ret; } int hashdist = HASHDIST_DEFAULT; #ifdef CONFIG_NUMA static int __init set_hashdist(char *str) { if (!str) return 0; hashdist = simple_strtoul(str, &str, 0); return 1; } __setup("hashdist=", set_hashdist); #endif /* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 * quantity of entries * - limit is the number of hash buckets, not the total allocation size */ void *__init alloc_large_system_hash(const char *tablename, unsigned long bucketsize, unsigned long numentries, int scale, int flags, unsigned int *_hash_shift, unsigned int *_hash_mask, unsigned long low_limit, unsigned long high_limit) { unsigned long long max = high_limit; unsigned long log2qty, size; void *table = NULL; /* allow the kernel cmdline to have a say */ if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; /* It isn't necessary when PAGE_SIZE >= 1MB */ if (PAGE_SHIFT < 20) numentries = round_up(numentries, (1<<20)/PAGE_SIZE); /* limit to 1 bucket per 2^scale bytes of low memory */ if (scale > PAGE_SHIFT) numentries >>= (scale - PAGE_SHIFT); else numentries <<= (PAGE_SHIFT - scale); /* Make sure we've got at least a 0-order allocation.. 
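		 * i.e. at least one full page worth of buckets, unless
		 * HASH_SMALL explicitly allows a table bounded by
		 * *_hash_shift instead.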
*/ if (unlikely(flags & HASH_SMALL)) { /* Makes no sense without HASH_EARLY */ WARN_ON(!(flags & HASH_EARLY)); if (!(numentries >> *_hash_shift)) { numentries = 1UL << *_hash_shift; BUG_ON(!numentries); } } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) numentries = PAGE_SIZE / bucketsize; } numentries = roundup_pow_of_two(numentries); /* limit allocation size to 1/16 total memory by default */ if (max == 0) { max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; do_div(max, bucketsize); } max = min(max, 0x80000000ULL); if (numentries < low_limit) numentries = low_limit; if (numentries > max) numentries = max; log2qty = ilog2(numentries); do { size = bucketsize << log2qty; if (flags & HASH_EARLY) table = memblock_virt_alloc_nopanic(size, 0); else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { /* * If bucketsize is not a power-of-two, we may free * some pages at the end of hash table which * alloc_pages_exact() automatically does */ if (get_order(size) < MAX_ORDER) { table = alloc_pages_exact(size, GFP_ATOMIC); kmemleak_alloc(table, size, 1, GFP_ATOMIC); } } } while (!table && size > PAGE_SIZE && --log2qty); if (!table) panic("Failed to allocate %s hash table\n", tablename); printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", tablename, (1UL << log2qty), ilog2(size) - PAGE_SHIFT, size); if (_hash_shift) *_hash_shift = log2qty; if (_hash_mask) *_hash_mask = (1 << log2qty) - 1; return table; } /* Return a pointer to the bitmap storing bits affecting a block of pages */ static inline unsigned long *get_pageblock_bitmap(struct zone *zone, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM return __pfn_to_section(pfn)->pageblock_flags; #else return zone->pageblock_flags; #endif /* CONFIG_SPARSEMEM */ } static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM pfn &= (PAGES_PER_SECTION-1); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #else pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #endif /* CONFIG_SPARSEMEM */ } /** * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages * @page: The page within the block of interest * @pfn: The target page frame number * @end_bitidx: The last bit of interest to retrieve * @mask: mask of bits that the caller is interested in * * Return: pageblock_bits flags */ unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, unsigned long end_bitidx, unsigned long mask) { struct zone *zone; unsigned long *bitmap; unsigned long bitidx, word_bitidx; unsigned long word; zone = page_zone(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); word_bitidx = bitidx / BITS_PER_LONG; bitidx &= (BITS_PER_LONG-1); word = bitmap[word_bitidx]; bitidx += end_bitidx; return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; } /** * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest * @flags: The flags to set * @pfn: The target page frame number * @end_bitidx: The last bit of interest * @mask: mask of bits that the caller is interested in */ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, unsigned long pfn, unsigned long end_bitidx, unsigned long mask) { struct zone *zone; unsigned long *bitmap; unsigned long bitidx, word_bitidx; unsigned long old_word, word; BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); zone = 
	page_zone(page);
	bitmap = get_pageblock_bitmap(zone, pfn);
	bitidx = pfn_to_bitidx(zone, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);

	bitidx += end_bitidx;
	mask <<= (BITS_PER_LONG - bitidx - 1);
	flags <<= (BITS_PER_LONG - bitidx - 1);

	word = READ_ONCE(bitmap[word_bitidx]);
	for (;;) {
		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
		if (word == old_word)
			break;
		word = old_word;
	}
}

/*
 * This function checks whether the pageblock includes unmovable pages or not.
 * If @count is not zero, up to @count unmovable pages are tolerated.
 *
 * A PageLRU check without isolation or lru_lock could race so that a
 * MIGRATE_MOVABLE block might include unmovable pages.  This function
 * therefore cannot be expected to be exact.
 */
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
			 bool skip_hwpoisoned_pages)
{
	unsigned long pfn, iter, found;
	int mt;

	/*
	 * To avoid noisy data, lru_add_drain_all() should be called first.
	 * A ZONE_MOVABLE zone never contains unmovable pages.
	 */
	if (zone_idx(zone) == ZONE_MOVABLE)
		return false;
	mt = get_pageblock_migratetype(page);
	if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
		return false;

	pfn = page_to_pfn(page);
	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
		unsigned long check = pfn + iter;

		if (!pfn_valid_within(check))
			continue;

		page = pfn_to_page(check);

		/*
		 * Hugepages are not in LRU lists, but they're movable.
		 * We need not scan over tail pages because we don't
		 * handle each tail page individually in migration.
		 */
		if (PageHuge(page)) {
			iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
			continue;
		}

		/*
		 * We can't use page_count() without pinning the page
		 * because another CPU can free the compound page.
		 * This check already skips compound tails of THP
		 * because their page->_count is zero at all times.
		 */
		if (!atomic_read(&page->_count)) {
			if (PageBuddy(page))
				iter += (1 << page_order(page)) - 1;
			continue;
		}

		/*
		 * The HWPoisoned page may not be in the buddy system,
		 * and page_count() is not 0.
		 */
		if (skip_hwpoisoned_pages && PageHWPoison(page))
			continue;

		if (!PageLRU(page))
			found++;
		/*
		 * RECLAIMABLE pages would also need to be checked here,
		 * but memory offlining does not call shrink_node_slabs()
		 * yet; this still needs to be fixed.
		 */
		/*
		 * If the page is not RAM, page_count() should be 0, and no
		 * further check is needed: it is a _used_, non-movable page.
		 *
		 * The problematic thing here is PG_reserved pages.  PG_reserved
		 * is set on both memory hole pages and _used_ kernel pages
		 * at boot.
		 */
		if (found > count)
			return true;
	}
	return false;
}

bool is_pageblock_removable_nolock(struct page *page)
{
	struct zone *zone;
	unsigned long pfn;

	/*
	 * We have to be careful here because we are iterating over memory
	 * sections which are not zone aware, so we might end up outside of
	 * the zone but still within the section.
	 * We also have to take care about the node.  If the node is offline,
	 * its NODE_DATA will be NULL - see page_zone.
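	 * Hence the node_online() and zone_spans_pfn() checks below, done
	 * before asking has_unmovable_pages().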
	 */
	if (!node_online(page_to_nid(page)))
		return false;

	zone = page_zone(page);
	pfn = page_to_pfn(page);
	if (!zone_spans_pfn(zone, pfn))
		return false;

	return !has_unmovable_pages(zone, page, 0, true);
}

#ifdef CONFIG_CMA

static unsigned long pfn_max_align_down(unsigned long pfn)
{
	return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
			     pageblock_nr_pages) - 1);
}

static unsigned long pfn_max_align_up(unsigned long pfn)
{
	return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
				pageblock_nr_pages));
}

/* [start, end) must belong to a single zone. */
static int __alloc_contig_migrate_range(struct compact_control *cc,
					unsigned long start, unsigned long end)
{
	/* This function is based on compact_zone() from compaction.c. */
	unsigned long nr_reclaimed;
	unsigned long pfn = start;
	unsigned int tries = 0;
	int ret = 0;

	migrate_prep();

	while (pfn < end || !list_empty(&cc->migratepages)) {
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		if (list_empty(&cc->migratepages)) {
			cc->nr_migratepages = 0;
			pfn = isolate_migratepages_range(cc, pfn, end);
			if (!pfn) {
				ret = -EINTR;
				break;
			}
			tries = 0;
		} else if (++tries == 5) {
			ret = ret < 0 ? ret : -EBUSY;
			break;
		}

		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
							&cc->migratepages);
		cc->nr_migratepages -= nr_reclaimed;

		ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
				    NULL, 0, cc->mode, MR_CMA);
	}
	if (ret < 0) {
		putback_movable_pages(&cc->migratepages);
		return ret;
	}
	return 0;
}

/**
 * alloc_contig_range() -- tries to allocate given range of pages
 * @start:	start PFN to allocate
 * @end:	one-past-the-last PFN to allocate
 * @migratetype:	migratetype of the underlying pageblocks (either
 *			#MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
 *			in range must have the same migratetype and it must
 *			be either of the two.
 *
 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
 * aligned, however it's the caller's responsibility to guarantee that
 * we are the only thread that changes the migrate type of the pageblocks
 * the pages fall in.
 *
 * The PFN range must belong to a single zone.
 *
 * Returns zero on success or a negative error code.  On success all
 * pages whose PFN is in [start, end) are allocated for the caller and
 * need to be freed with free_contig_range().
 */
int alloc_contig_range(unsigned long start, unsigned long end,
		       unsigned migratetype)
{
	unsigned long outer_start, outer_end;
	int ret = 0, order;

	struct compact_control cc = {
		.nr_migratepages = 0,
		.order = -1,
		.zone = page_zone(pfn_to_page(start)),
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
	};
	INIT_LIST_HEAD(&cc.migratepages);

	/*
	 * What we do here is we mark all pageblocks in range as
	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
	 * have different sizes, and due to the way the page allocator
	 * works, we align the range to the biggest of the two so
	 * that the page allocator won't try to merge buddies from
	 * different pageblocks and change MIGRATE_ISOLATE to some
	 * other migration type.
	 *
	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
	 * migrate the pages from an unaligned range (ie. pages that
	 * we are interested in).  This will put all the pages in
	 * range back to the page allocator as MIGRATE_ISOLATE.
	 *
	 * When this is done, we take the pages in range from the page
	 * allocator, removing them from the buddy system.  This way the
	 * page allocator will never consider using them.
	 *
	 * This lets us mark the pageblocks back as
	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
	 * aligned range but not in the unaligned, original range are
	 * put back to the page allocator so that buddy can use them.
	 */

	ret = start_isolate_page_range(pfn_max_align_down(start),
				       pfn_max_align_up(end), migratetype,
				       false);
	if (ret)
		return ret;

	ret = __alloc_contig_migrate_range(&cc, start, end);
	if (ret)
		goto done;

	/*
	 * Pages from [start, end) are within MAX_ORDER_NR_PAGES
	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
	 * more, all pages in [start, end) are free in the page allocator.
	 * What we are going to do is to allocate all pages from
	 * [start, end) (that is, remove them from the page allocator).
	 *
	 * The only problem is that pages at the beginning and at the
	 * end of the interesting range may not be aligned with pages that
	 * the page allocator holds, ie. they can be part of higher order
	 * pages.  Because of this, we reserve the bigger range and
	 * once this is done free the pages we are not interested in.
	 *
	 * We don't have to hold zone->lock here because the pages are
	 * isolated thus they won't get removed from buddy.
	 */

	lru_add_drain_all();
	drain_all_pages(cc.zone);

	order = 0;
	outer_start = start;
	while (!PageBuddy(pfn_to_page(outer_start))) {
		if (++order >= MAX_ORDER) {
			ret = -EBUSY;
			goto done;
		}
		outer_start &= ~0UL << order;
	}

	/* Make sure the range is really isolated. */
	if (test_pages_isolated(outer_start, end, false)) {
		pr_info("%s: [%lx, %lx) PFNs busy\n",
			__func__, outer_start, end);
		ret = -EBUSY;
		goto done;
	}

	/* Grab isolated pages from freelists. */
	outer_end = isolate_freepages_range(&cc, outer_start, end);
	if (!outer_end) {
		ret = -EBUSY;
		goto done;
	}

	/* Free head and tail (if any) */
	if (start != outer_start)
		free_contig_range(outer_start, start - outer_start);
	if (end != outer_end)
		free_contig_range(end, outer_end - end);

done:
	undo_isolate_page_range(pfn_max_align_down(start),
				pfn_max_align_up(end), migratetype);
	return ret;
}

void free_contig_range(unsigned long pfn, unsigned nr_pages)
{
	unsigned int count = 0;

	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		count += page_count(page) != 1;
		__free_page(page);
	}
	WARN(count != 0, "%d pages are still in use!\n", count);
}
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * The zone indicated has a new number of managed_pages; batch sizes and percpu
 * page high values need to be recalculated.
 */
void __meminit zone_pcp_update(struct zone *zone)
{
	unsigned cpu;

	mutex_lock(&pcp_batch_high_lock);
	for_each_possible_cpu(cpu)
		pageset_set_high_and_batch(zone,
				per_cpu_ptr(zone->pageset, cpu));
	mutex_unlock(&pcp_batch_high_lock);
}
#endif

void zone_pcp_reset(struct zone *zone)
{
	unsigned long flags;
	int cpu;
	struct per_cpu_pageset *pset;

	/* avoid races with drain_pages() */
	local_lock_irqsave(pa_lock, flags);
	if (zone->pageset != &boot_pageset) {
		for_each_online_cpu(cpu) {
			pset = per_cpu_ptr(zone->pageset, cpu);
			drain_zonestat(zone, pset);
		}
		free_percpu(zone->pageset);
		zone->pageset = &boot_pageset;
	}
	local_unlock_irqrestore(pa_lock, flags);
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * All pages in the range must be isolated before calling this.
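 * Each page is taken off the buddy free lists and marked PG_reserved so
 * it cannot be allocated again while the range is being offlined.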
 */
void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	struct page *page;
	struct zone *zone;
	unsigned int order, i;
	unsigned long pfn;
	unsigned long flags;

	/* find the first valid pfn */
	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		if (pfn_valid(pfn))
			break;
	if (pfn == end_pfn)
		return;
	zone = page_zone(pfn_to_page(pfn));
	spin_lock_irqsave(&zone->lock, flags);
	pfn = start_pfn;
	while (pfn < end_pfn) {
		if (!pfn_valid(pfn)) {
			pfn++;
			continue;
		}
		page = pfn_to_page(pfn);
		/*
		 * The HWPoisoned page may not be in the buddy system,
		 * and page_count() is not 0.
		 */
		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
			pfn++;
			SetPageReserved(page);
			continue;
		}

		BUG_ON(page_count(page));
		BUG_ON(!PageBuddy(page));
		order = page_order(page);
#ifdef CONFIG_DEBUG_VM
		printk(KERN_INFO "remove from free list %lx %d %lx\n",
		       pfn, 1 << order, end_pfn);
#endif
		list_del(&page->lru);
		rmv_page_order(page);
		zone->free_area[order].nr_free--;
		for (i = 0; i < (1 << order); i++)
			SetPageReserved((page+i));
		pfn += (1 << order);
	}
	spin_unlock_irqrestore(&zone->lock, flags);
}
#endif

#ifdef CONFIG_MEMORY_FAILURE
bool is_free_buddy_page(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long pfn = page_to_pfn(page);
	unsigned long flags;
	unsigned int order;

	spin_lock_irqsave(&zone->lock, flags);
	for (order = 0; order < MAX_ORDER; order++) {
		struct page *page_head = page - (pfn & ((1 << order) - 1));

		if (PageBuddy(page_head) && page_order(page_head) >= order)
			break;
	}
	spin_unlock_irqrestore(&zone->lock, flags);

	return order < MAX_ORDER;
}
#endif