summaryrefslogtreecommitdiffstats
path: root/kernel/net/sched/sch_choke.c
blob: 0a08c860eee4f35905907ce1eab38d6152c4487c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_dissector.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
   maintains no flow state. The difference from RED is an additional step
   during the enqueuing process. If average queue size is over the
   low threshold (qmin), a packet is chosen at random from the queue.
   If both the new and chosen packet are from the same flow, both
   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
   needs to access packets in queue randomly. It has a minimal class
   interface to allow overriding the builtin flow classifier with
   filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004

 */

/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)

struct choke_sched_data {
/* Parameters */
	u32		 limit;
	unsigned char	 flags;

	struct red_parms parms;

/* Variables */
	struct red_vars  vars;
	struct tcf_proto __rcu *filter_list;
	struct {
		u32	prob_drop;	/* Early probability drops */
		u32	prob_mark;	/* Early probability marks */
		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
		u32	pdrop;          /* Drops due to queue limits */
		u32	other;          /* Drops due to drop() calls */
		u32	matched;	/* Drops to flow match */
	} stats;

	unsigned int	 head;
	unsigned int	 tail;

	unsigned int	 tab_mask; /* size - 1 */

	struct sk_buff **tab;
};

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
	return (q->tail - q->head) & q->tab_mask;
}

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
	do {
		q->head = (q->head + 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
	do {
		q->tail = (q->tail - 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->tail] == NULL);
}

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = q->tab[idx];

	q->tab[idx] = NULL;

	if (idx == q->head)
		choke_zap_head_holes(q);
	if (idx == q->tail)
		choke_zap_tail_holes(q);

	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
	qdisc_drop(skb, sch);
	--sch->q.qlen;
}

struct choke_skb_cb {
	u16			classid;
	u8			keys_valid;
	struct			flow_keys_digest keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
	choke_skb_cb(skb)->classid = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
	return choke_skb_cb(skb)->classid;
}

/*
 * Compare flow of two packets
 *  Returns true only if source and destination address and port match.
 *          false for special cases
 */
static bool choke_match_flow(struct sk_buff *skb1,
			     struct sk_buff *skb2)
{
	struct flow_keys temp;

	if (skb1->protocol != skb2->protocol)
		return false;

	if (!choke_skb_cb(skb1)->keys_valid) {
		choke_skb_cb(skb1)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb1, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
	}

	if (!choke_skb_cb(skb2)->keys_valid) {
		choke_skb_cb(skb2)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb2, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
	}

	return !memcmp(&choke_skb_cb(skb1)->keys,
		       &choke_skb_cb(skb2)->keys,
		       sizeof(choke_skb_cb(skb1)->keys));
}

/*
 * Classify flow using either:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. use TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
			   struct Qdisc *sch, int *qerr)

{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	fl = rcu_dereference_bh(q->filter_list);
	result = tc_classify(skb, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return false;
		}
#endif
		choke_set_classid(skb, TC_H_MIN(res.classid));
		return true;
	}

	return false;
}

/*
 * Select a packet at random from queue
 * HACK: since queue can have holes from previous deletion; retry several
 *   times to find a random skb but then just give up and return the head
 * Will return NULL if queue is empty (q->head == q->tail)
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
					 unsigned int *pidx)
{
	struct sk_buff *skb;
	int retrys = 3;

	do {
		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
		skb = q->tab[*pidx];
		if (skb)
			return skb;
	} while (--retrys > 0);

	return q->tab[*pidx = q->head];
}

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
			       struct sk_buff *nskb,
			       unsigned int *pidx)
{
	struct sk_buff *oskb;

	if (q->head == q->tail)
		return false;

	oskb = choke_peek_random(q, pidx);
	if (rcu_access_pointer(q->filter_list))
		return choke_get_classid(nskb) == choke_get_classid(oskb);

	return choke_match_flow(oskb, nskb);
}

static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	struct choke_sched_data *q = qdisc_priv(sch);
	const struct red_parms *p = &q->parms;

	if (rcu_access_pointer(q->filter_list)) {
		/* If using external classifiers, get result and record it. */
		if (!choke_classify(skb, sch, &ret))
			goto other_drop;	/* Packet was eaten by filter */
	}

	choke_skb_cb(skb)->keys_valid = 0;
	/* Compute average queue usage (see RED) */
	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
	if (red_is_idling(&q->vars))
		red_end_of_idle_period(&q->vars);

	/* Is queue small? */
	if (q->vars.qavg <= p->qth_min)
		q->vars.qcount = -1;
	else {
		unsigned int idx;

		/* Draw a packet at random from queue and compare flow */
		if (choke_match_random(q, skb, &idx)) {
			q->stats.matched++;
			choke_drop_by_idx(sch, idx);
			goto congestion_drop;
		}

		/* Queue is large, always mark/drop */
		if (q->vars.qavg > p->qth_max) {
			q->vars.qcount = -1;

			qdisc_qstats_overlimit(sch);
			if (use_harddrop(q) || !use_ecn(q) ||
			    !INET_ECN_set_ce(skb)) {
				q->stats.forced_drop++;
				goto congestion_drop;
			}

			q->stats.forced_mark++;
		} else if (++q->vars.qcount) {
			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
				q->vars.qcount = 0;
				q->vars.qR = red_random(p);

				qdisc_qstats_overlimit(sch);
				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
					q->stats.prob_drop++;
					goto congestion_drop;
				}

				q->stats.prob_mark++;
			}
		} else
			q->vars.qR = red_random(p);
	}

	/* Admit new packet */
	if (sch->q.qlen < q->limit) {
		q->tab[q->tail] = skb;
		q->tail = (q->tail + 1) & q->tab_mask;
		++sch->q.qlen;
		qdisc_qstats_backlog_inc(sch, skb);
		return NET_XMIT_SUCCESS;
	}

	q->stats.pdrop++;
	return qdisc_drop(skb, sch);

congestion_drop:
	qdisc_drop(skb, sch);
	return NET_XMIT_CN;

other_drop:
	if (ret & __NET_XMIT_BYPASS)
		qdisc_qstats_drop(sch);
	kfree_skb(skb);
	return ret;
}

static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (q->head == q->tail) {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
		return NULL;
	}

	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}

static unsigned int choke_drop(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	len = qdisc_queue_drop(sch);
	if (len > 0)
		q->stats.other++;
	else {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
	}

	return len;
}

static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	while (q->head != q->tail) {
		struct sk_buff *skb = q->tab[q->head];

		q->head = (q->head + 1) & q->tab_mask;
		if (!skb)
			continue;
		qdisc_qstats_backlog_dec(sch, skb);
		--sch->q.qlen;
		qdisc_drop(skb, sch);
	}

	memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
	q->head = q->tail = 0;
	red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
};


static void choke_free(void *addr)
{
	kvfree(addr);
}

static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;
	u32 max_P;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
			       GFP_KERNEL | __GFP_NOWARN);
		if (!ntab)
			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
		if (!ntab)
			return -ENOMEM;

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;
			unsigned dropped = 0;

			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				dropped += qdisc_pkt_len(skb);
				qdisc_qstats_backlog_dec(sch, skb);
				--sch->q.qlen;
				qdisc_drop(skb, sch);
			}
			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	q->flags = ctl->flags;
	q->limit = ctl->limit;

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      nla_data(tb[TCA_CHOKE_STAB]),
		      max_P);
	red_set_vars(&q->vars);

	if (q->head == q->tail)
		red_end_of_idle_period(&q->vars);

	sch_tree_unlock(sch);
	choke_free(old);
	return 0;
}

static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
	return choke_change(sch, opt);
}

static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts = NULL;
	struct tc_red_qopt opt = {
		.limit		= q->limit,
		.flags		= q->flags,
		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
		.Wlog		= q->parms.Wlog,
		.Plog		= q->parms.Plog,
		.Scell_log	= q->parms.Scell_log,
	};

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
		goto nla_put_failure;
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= q->stats.prob_drop + q->stats.forced_drop,
		.marked	= q->stats.prob_mark + q->stats.forced_mark,
		.pdrop	= q->stats.pdrop,
		.other	= q->stats.other,
		.matched = q->stats.matched,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	choke_free(q->tab);
}

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.drop		=	choke_drop,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");
ass="n">flags; if ((bar & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) { flags = bar & ~PCI_BASE_ADDRESS_IO_MASK; flags |= IORESOURCE_IO; return flags; } flags = bar & ~PCI_BASE_ADDRESS_MEM_MASK; flags |= IORESOURCE_MEM; if (flags & PCI_BASE_ADDRESS_MEM_PREFETCH) flags |= IORESOURCE_PREFETCH; mem_type = bar & PCI_BASE_ADDRESS_MEM_TYPE_MASK; switch (mem_type) { case PCI_BASE_ADDRESS_MEM_TYPE_32: break; case PCI_BASE_ADDRESS_MEM_TYPE_1M: /* 1M mem BAR treated as 32-bit BAR */ break; case PCI_BASE_ADDRESS_MEM_TYPE_64: flags |= IORESOURCE_MEM_64; break; default: /* mem unknown type treated as 32-bit BAR */ break; } return flags; } #define PCI_COMMAND_DECODE_ENABLE (PCI_COMMAND_MEMORY | PCI_COMMAND_IO) /** * pci_read_base - read a PCI BAR * @dev: the PCI device * @type: type of the BAR * @res: resource buffer to be filled in * @pos: BAR position in the config space * * Returns 1 if the BAR is 64-bit, or 0 if 32-bit. */ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, struct resource *res, unsigned int pos) { u32 l, sz, mask; u64 l64, sz64, mask64; u16 orig_cmd; struct pci_bus_region region, inverted_region; mask = type ? PCI_ROM_ADDRESS_MASK : ~0; /* No printks while decoding is disabled! */ if (!dev->mmio_always_on) { pci_read_config_word(dev, PCI_COMMAND, &orig_cmd); if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) { pci_write_config_word(dev, PCI_COMMAND, orig_cmd & ~PCI_COMMAND_DECODE_ENABLE); } } res->name = pci_name(dev); pci_read_config_dword(dev, pos, &l); pci_write_config_dword(dev, pos, l | mask); pci_read_config_dword(dev, pos, &sz); pci_write_config_dword(dev, pos, l); /* * All bits set in sz means the device isn't working properly. * If the BAR isn't implemented, all bits must be 0. If it's a * memory BAR or a ROM, bit 0 must be clear; if it's an io BAR, bit * 1 must be clear. */ if (sz == 0xffffffff) sz = 0; /* * I don't know how l can have all bits set. Copied from old code. * Maybe it fixes a bug on some ancient platform. */ if (l == 0xffffffff) l = 0; if (type == pci_bar_unknown) { res->flags = decode_bar(dev, l); res->flags |= IORESOURCE_SIZEALIGN; if (res->flags & IORESOURCE_IO) { l64 = l & PCI_BASE_ADDRESS_IO_MASK; sz64 = sz & PCI_BASE_ADDRESS_IO_MASK; mask64 = PCI_BASE_ADDRESS_IO_MASK & (u32)IO_SPACE_LIMIT; } else { l64 = l & PCI_BASE_ADDRESS_MEM_MASK; sz64 = sz & PCI_BASE_ADDRESS_MEM_MASK; mask64 = (u32)PCI_BASE_ADDRESS_MEM_MASK; } } else { res->flags |= (l & IORESOURCE_ROM_ENABLE); l64 = l & PCI_ROM_ADDRESS_MASK; sz64 = sz & PCI_ROM_ADDRESS_MASK; mask64 = (u32)PCI_ROM_ADDRESS_MASK; } if (res->flags & IORESOURCE_MEM_64) { pci_read_config_dword(dev, pos + 4, &l); pci_write_config_dword(dev, pos + 4, ~0); pci_read_config_dword(dev, pos + 4, &sz); pci_write_config_dword(dev, pos + 4, l); l64 |= ((u64)l << 32); sz64 |= ((u64)sz << 32); mask64 |= ((u64)~0 << 32); } if (!dev->mmio_always_on && (orig_cmd & PCI_COMMAND_DECODE_ENABLE)) pci_write_config_word(dev, PCI_COMMAND, orig_cmd); if (!sz64) goto fail; sz64 = pci_size(l64, sz64, mask64); if (!sz64) { dev_info(&dev->dev, FW_BUG "reg 0x%x: invalid BAR (can't size)\n", pos); goto fail; } if (res->flags & IORESOURCE_MEM_64) { if ((sizeof(pci_bus_addr_t) < 8 || sizeof(resource_size_t) < 8) && sz64 > 0x100000000ULL) { res->flags |= IORESOURCE_UNSET | IORESOURCE_DISABLED; res->start = 0; res->end = 0; dev_err(&dev->dev, "reg 0x%x: can't handle BAR larger than 4GB (size %#010llx)\n", pos, (unsigned long long)sz64); goto out; } if ((sizeof(pci_bus_addr_t) < 8) && l) { /* Above 32-bit boundary; try to reallocate */ res->flags |= IORESOURCE_UNSET; res->start = 0; res->end = sz64; dev_info(&dev->dev, "reg 0x%x: can't handle BAR above 4GB (bus address %#010llx)\n", pos, (unsigned long long)l64); goto out; } } region.start = l64; region.end = l64 + sz64; pcibios_bus_to_resource(dev->bus, res, &region); pcibios_resource_to_bus(dev->bus, &inverted_region, res); /* * If "A" is a BAR value (a bus address), "bus_to_resource(A)" is * the corresponding resource address (the physical address used by * the CPU. Converting that resource address back to a bus address * should yield the original BAR value: * * resource_to_bus(bus_to_resource(A)) == A * * If it doesn't, CPU accesses to "bus_to_resource(A)" will not * be claimed by the device. */ if (inverted_region.start != region.start) { res->flags |= IORESOURCE_UNSET; res->start = 0; res->end = region.end - region.start; dev_info(&dev->dev, "reg 0x%x: initial BAR value %#010llx invalid\n", pos, (unsigned long long)region.start); } goto out; fail: res->flags = 0; out: if (res->flags) dev_printk(KERN_DEBUG, &dev->dev, "reg 0x%x: %pR\n", pos, res); return (res->flags & IORESOURCE_MEM_64) ? 1 : 0; } static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom) { unsigned int pos, reg; for (pos = 0; pos < howmany; pos++) { struct resource *res = &dev->resource[pos]; reg = PCI_BASE_ADDRESS_0 + (pos << 2); pos += __pci_read_base(dev, pci_bar_unknown, res, reg); } if (rom) { struct resource *res = &dev->resource[PCI_ROM_RESOURCE]; dev->rom_base_reg = rom; res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH | IORESOURCE_READONLY | IORESOURCE_SIZEALIGN; __pci_read_base(dev, pci_bar_mem32, res, rom); } } static void pci_read_bridge_io(struct pci_bus *child) { struct pci_dev *dev = child->self; u8 io_base_lo, io_limit_lo; unsigned long io_mask, io_granularity, base, limit; struct pci_bus_region region; struct resource *res; io_mask = PCI_IO_RANGE_MASK; io_granularity = 0x1000; if (dev->io_window_1k) { /* Support 1K I/O space granularity */ io_mask = PCI_IO_1K_RANGE_MASK; io_granularity = 0x400; } res = child->resource[0]; pci_read_config_byte(dev, PCI_IO_BASE, &io_base_lo); pci_read_config_byte(dev, PCI_IO_LIMIT, &io_limit_lo); base = (io_base_lo & io_mask) << 8; limit = (io_limit_lo & io_mask) << 8; if ((io_base_lo & PCI_IO_RANGE_TYPE_MASK) == PCI_IO_RANGE_TYPE_32) { u16 io_base_hi, io_limit_hi; pci_read_config_word(dev, PCI_IO_BASE_UPPER16, &io_base_hi); pci_read_config_word(dev, PCI_IO_LIMIT_UPPER16, &io_limit_hi); base |= ((unsigned long) io_base_hi << 16); limit |= ((unsigned long) io_limit_hi << 16); } if (base <= limit) { res->flags = (io_base_lo & PCI_IO_RANGE_TYPE_MASK) | IORESOURCE_IO; region.start = base; region.end = limit + io_granularity - 1; pcibios_bus_to_resource(dev->bus, res, &region); dev_printk(KERN_DEBUG, &dev->dev, " bridge window %pR\n", res); } } static void pci_read_bridge_mmio(struct pci_bus *child) { struct pci_dev *dev = child->self; u16 mem_base_lo, mem_limit_lo; unsigned long base, limit; struct pci_bus_region region; struct resource *res; res = child->resource[1]; pci_read_config_word(dev, PCI_MEMORY_BASE, &mem_base_lo); pci_read_config_word(dev, PCI_MEMORY_LIMIT, &mem_limit_lo); base = ((unsigned long) mem_base_lo & PCI_MEMORY_RANGE_MASK) << 16; limit = ((unsigned long) mem_limit_lo & PCI_MEMORY_RANGE_MASK) << 16; if (base <= limit) { res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM; region.start = base; region.end = limit + 0xfffff; pcibios_bus_to_resource(dev->bus, res, &region); dev_printk(KERN_DEBUG, &dev->dev, " bridge window %pR\n", res); } } static void pci_read_bridge_mmio_pref(struct pci_bus *child) { struct pci_dev *dev = child->self; u16 mem_base_lo, mem_limit_lo; u64 base64, limit64; pci_bus_addr_t base, limit; struct pci_bus_region region; struct resource *res; res = child->resource[2]; pci_read_config_word(dev, PCI_PREF_MEMORY_BASE, &mem_base_lo); pci_read_config_word(dev, PCI_PREF_MEMORY_LIMIT, &mem_limit_lo); base64 = (mem_base_lo & PCI_PREF_RANGE_MASK) << 16; limit64 = (mem_limit_lo & PCI_PREF_RANGE_MASK) << 16; if ((mem_base_lo & PCI_PREF_RANGE_TYPE_MASK) == PCI_PREF_RANGE_TYPE_64) { u32 mem_base_hi, mem_limit_hi; pci_read_config_dword(dev, PCI_PREF_BASE_UPPER32, &mem_base_hi); pci_read_config_dword(dev, PCI_PREF_LIMIT_UPPER32, &mem_limit_hi); /* * Some bridges set the base > limit by default, and some * (broken) BIOSes do not initialize them. If we find * this, just assume they are not being used. */ if (mem_base_hi <= mem_limit_hi) { base64 |= (u64) mem_base_hi << 32; limit64 |= (u64) mem_limit_hi << 32; } } base = (pci_bus_addr_t) base64; limit = (pci_bus_addr_t) limit64; if (base != base64) { dev_err(&dev->dev, "can't handle bridge window above 4GB (bus address %#010llx)\n", (unsigned long long) base64); return; } if (base <= limit) { res->flags = (mem_base_lo & PCI_PREF_RANGE_TYPE_MASK) | IORESOURCE_MEM | IORESOURCE_PREFETCH; if (res->flags & PCI_PREF_RANGE_TYPE_64) res->flags |= IORESOURCE_MEM_64; region.start = base; region.end = limit + 0xfffff; pcibios_bus_to_resource(dev->bus, res, &region); dev_printk(KERN_DEBUG, &dev->dev, " bridge window %pR\n", res); } } void pci_read_bridge_bases(struct pci_bus *child) { struct pci_dev *dev = child->self; struct resource *res; int i; if (pci_is_root_bus(child)) /* It's a host bus, nothing to read */ return; dev_info(&dev->dev, "PCI bridge to %pR%s\n", &child->busn_res, dev->transparent ? " (subtractive decode)" : ""); pci_bus_remove_resources(child); for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) child->resource[i] = &dev->resource[PCI_BRIDGE_RESOURCES+i]; pci_read_bridge_io(child); pci_read_bridge_mmio(child); pci_read_bridge_mmio_pref(child); if (dev->transparent) { pci_bus_for_each_resource(child->parent, res, i) { if (res && res->flags) { pci_bus_add_resource(child, res, PCI_SUBTRACTIVE_DECODE); dev_printk(KERN_DEBUG, &dev->dev, " bridge window %pR (subtractive decode)\n", res); } } } } static struct pci_bus *pci_alloc_bus(struct pci_bus *parent) { struct pci_bus *b; b = kzalloc(sizeof(*b), GFP_KERNEL); if (!b) return NULL; INIT_LIST_HEAD(&b->node); INIT_LIST_HEAD(&b->children); INIT_LIST_HEAD(&b->devices); INIT_LIST_HEAD(&b->slots); INIT_LIST_HEAD(&b->resources); b->max_bus_speed = PCI_SPEED_UNKNOWN; b->cur_bus_speed = PCI_SPEED_UNKNOWN; #ifdef CONFIG_PCI_DOMAINS_GENERIC if (parent) b->domain_nr = parent->domain_nr; #endif return b; } static void pci_release_host_bridge_dev(struct device *dev) { struct pci_host_bridge *bridge = to_pci_host_bridge(dev); if (bridge->release_fn) bridge->release_fn(bridge); pci_free_resource_list(&bridge->windows); kfree(bridge); } static struct pci_host_bridge *pci_alloc_host_bridge(struct pci_bus *b) { struct pci_host_bridge *bridge; bridge = kzalloc(sizeof(*bridge), GFP_KERNEL); if (!bridge) return NULL; INIT_LIST_HEAD(&bridge->windows); bridge->bus = b; return bridge; } static const unsigned char pcix_bus_speed[] = { PCI_SPEED_UNKNOWN, /* 0 */ PCI_SPEED_66MHz_PCIX, /* 1 */ PCI_SPEED_100MHz_PCIX, /* 2 */ PCI_SPEED_133MHz_PCIX, /* 3 */ PCI_SPEED_UNKNOWN, /* 4 */ PCI_SPEED_66MHz_PCIX_ECC, /* 5 */ PCI_SPEED_100MHz_PCIX_ECC, /* 6 */ PCI_SPEED_133MHz_PCIX_ECC, /* 7 */ PCI_SPEED_UNKNOWN, /* 8 */ PCI_SPEED_66MHz_PCIX_266, /* 9 */ PCI_SPEED_100MHz_PCIX_266, /* A */ PCI_SPEED_133MHz_PCIX_266, /* B */ PCI_SPEED_UNKNOWN, /* C */ PCI_SPEED_66MHz_PCIX_533, /* D */ PCI_SPEED_100MHz_PCIX_533, /* E */ PCI_SPEED_133MHz_PCIX_533 /* F */ }; const unsigned char pcie_link_speed[] = { PCI_SPEED_UNKNOWN, /* 0 */ PCIE_SPEED_2_5GT, /* 1 */ PCIE_SPEED_5_0GT, /* 2 */ PCIE_SPEED_8_0GT, /* 3 */ PCI_SPEED_UNKNOWN, /* 4 */ PCI_SPEED_UNKNOWN, /* 5 */ PCI_SPEED_UNKNOWN, /* 6 */ PCI_SPEED_UNKNOWN, /* 7 */ PCI_SPEED_UNKNOWN, /* 8 */ PCI_SPEED_UNKNOWN, /* 9 */ PCI_SPEED_UNKNOWN, /* A */ PCI_SPEED_UNKNOWN, /* B */ PCI_SPEED_UNKNOWN, /* C */ PCI_SPEED_UNKNOWN, /* D */ PCI_SPEED_UNKNOWN, /* E */ PCI_SPEED_UNKNOWN /* F */ }; void pcie_update_link_speed(struct pci_bus *bus, u16 linksta) { bus->cur_bus_speed = pcie_link_speed[linksta & PCI_EXP_LNKSTA_CLS]; } EXPORT_SYMBOL_GPL(pcie_update_link_speed); static unsigned char agp_speeds[] = { AGP_UNKNOWN, AGP_1X, AGP_2X, AGP_4X, AGP_8X }; static enum pci_bus_speed agp_speed(int agp3, int agpstat) { int index = 0; if (agpstat & 4) index = 3; else if (agpstat & 2) index = 2; else if (agpstat & 1) index = 1; else goto out; if (agp3) { index += 2; if (index == 5) index = 0; } out: return agp_speeds[index]; } static void pci_set_bus_speed(struct pci_bus *bus) { struct pci_dev *bridge = bus->self; int pos; pos = pci_find_capability(bridge, PCI_CAP_ID_AGP); if (!pos) pos = pci_find_capability(bridge, PCI_CAP_ID_AGP3); if (pos) { u32 agpstat, agpcmd; pci_read_config_dword(bridge, pos + PCI_AGP_STATUS, &agpstat); bus->max_bus_speed = agp_speed(agpstat & 8, agpstat & 7); pci_read_config_dword(bridge, pos + PCI_AGP_COMMAND, &agpcmd); bus->cur_bus_speed = agp_speed(agpstat & 8, agpcmd & 7); } pos = pci_find_capability(bridge, PCI_CAP_ID_PCIX); if (pos) { u16 status; enum pci_bus_speed max; pci_read_config_word(bridge, pos + PCI_X_BRIDGE_SSTATUS, &status); if (status & PCI_X_SSTATUS_533MHZ) { max = PCI_SPEED_133MHz_PCIX_533; } else if (status & PCI_X_SSTATUS_266MHZ) { max = PCI_SPEED_133MHz_PCIX_266; } else if (status & PCI_X_SSTATUS_133MHZ) { if ((status & PCI_X_SSTATUS_VERS) == PCI_X_SSTATUS_V2) max = PCI_SPEED_133MHz_PCIX_ECC; else max = PCI_SPEED_133MHz_PCIX; } else { max = PCI_SPEED_66MHz_PCIX; } bus->max_bus_speed = max; bus->cur_bus_speed = pcix_bus_speed[ (status & PCI_X_SSTATUS_FREQ) >> 6]; return; } if (pci_is_pcie(bridge)) { u32 linkcap; u16 linksta; pcie_capability_read_dword(bridge, PCI_EXP_LNKCAP, &linkcap); bus->max_bus_speed = pcie_link_speed[linkcap & PCI_EXP_LNKCAP_SLS]; pcie_capability_read_word(bridge, PCI_EXP_LNKSTA, &linksta); pcie_update_link_speed(bus, linksta); } } static struct irq_domain *pci_host_bridge_msi_domain(struct pci_bus *bus) { struct irq_domain *d; /* * Any firmware interface that can resolve the msi_domain * should be called from here. */ d = pci_host_bridge_of_msi_domain(bus); return d; } static void pci_set_bus_msi_domain(struct pci_bus *bus) { struct irq_domain *d; struct pci_bus *b; /* * The bus can be a root bus, a subordinate bus, or a virtual bus * created by an SR-IOV device. Walk up to the first bridge device * found or derive the domain from the host bridge. */ for (b = bus, d = NULL; !d && !pci_is_root_bus(b); b = b->parent) { if (b->self) d = dev_get_msi_domain(&b->self->dev); } if (!d) d = pci_host_bridge_msi_domain(b); dev_set_msi_domain(&bus->dev, d); } static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent, struct pci_dev *bridge, int busnr) { struct pci_bus *child; int i; int ret; /* * Allocate a new bus, and inherit stuff from the parent.. */ child = pci_alloc_bus(parent); if (!child) return NULL; child->parent = parent; child->ops = parent->ops; child->msi = parent->msi; child->sysdata = parent->sysdata; child->bus_flags = parent->bus_flags; /* initialize some portions of the bus device, but don't register it * now as the parent is not properly set up yet. */ child->dev.class = &pcibus_class; dev_set_name(&child->dev, "%04x:%02x", pci_domain_nr(child), busnr); /* * Set up the primary, secondary and subordinate * bus numbers. */ child->number = child->busn_res.start = busnr; child->primary = parent->busn_res.start; child->busn_res.end = 0xff; if (!bridge) { child->dev.parent = parent->bridge; goto add_dev; } child->self = bridge; child->bridge = get_device(&bridge->dev); child->dev.parent = child->bridge; pci_set_bus_of_node(child); pci_set_bus_speed(child); /* Set up default resource pointers and names.. */ for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) { child->resource[i] = &bridge->resource[PCI_BRIDGE_RESOURCES+i]; child->resource[i]->name = child->name; } bridge->subordinate = child; add_dev: pci_set_bus_msi_domain(child); ret = device_register(&child->dev); WARN_ON(ret < 0); pcibios_add_bus(child); /* Create legacy_io and legacy_mem files for this bus */ pci_create_legacy_files(child); return child; } struct pci_bus *pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, int busnr) { struct pci_bus *child; child = pci_alloc_child_bus(parent, dev, busnr); if (child) { down_write(&pci_bus_sem); list_add_tail(&child->node, &parent->children); up_write(&pci_bus_sem); } return child; } EXPORT_SYMBOL(pci_add_new_bus); static void pci_enable_crs(struct pci_dev *pdev) { u16 root_cap = 0; /* Enable CRS Software Visibility if supported */ pcie_capability_read_word(pdev, PCI_EXP_RTCAP, &root_cap); if (root_cap & PCI_EXP_RTCAP_CRSVIS) pcie_capability_set_word(pdev, PCI_EXP_RTCTL, PCI_EXP_RTCTL_CRSSVE); } /* * If it's a bridge, configure it and scan the bus behind it. * For CardBus bridges, we don't scan behind as the devices will * be handled by the bridge driver itself. * * We need to process bridges in two passes -- first we scan those * already configured by the BIOS and after we are done with all of * them, we proceed to assigning numbers to the remaining buses in * order to avoid overlaps between old and new bus numbers. */ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass) { struct pci_bus *child; int is_cardbus = (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS); u32 buses, i, j = 0; u16 bctl; u8 primary, secondary, subordinate; int broken = 0; pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses); primary = buses & 0xFF; secondary = (buses >> 8) & 0xFF; subordinate = (buses >> 16) & 0xFF; dev_dbg(&dev->dev, "scanning [bus %02x-%02x] behind bridge, pass %d\n", secondary, subordinate, pass); if (!primary && (primary != bus->number) && secondary && subordinate) { dev_warn(&dev->dev, "Primary bus is hard wired to 0\n"); primary = bus->number; } /* Check if setup is sensible at all */ if (!pass && (primary != bus->number || secondary <= bus->number || secondary > subordinate)) { dev_info(&dev->dev, "bridge configuration invalid ([bus %02x-%02x]), reconfiguring\n", secondary, subordinate); broken = 1; } /* Disable MasterAbortMode during probing to avoid reporting of bus errors (in some architectures) */ pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl); pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT); pci_enable_crs(dev); if ((secondary || subordinate) && !pcibios_assign_all_busses() && !is_cardbus && !broken) { unsigned int cmax; /* * Bus already configured by firmware, process it in the first * pass and just note the configuration. */ if (pass) goto out; /* * The bus might already exist for two reasons: Either we are * rescanning the bus or the bus is reachable through more than * one bridge. The second case can happen with the i450NX * chipset. */ child = pci_find_bus(pci_domain_nr(bus), secondary); if (!child) { child = pci_add_new_bus(bus, dev, secondary); if (!child) goto out; child->primary = primary; pci_bus_insert_busn_res(child, secondary, subordinate); child->bridge_ctl = bctl; } cmax = pci_scan_child_bus(child); if (cmax > subordinate) dev_warn(&dev->dev, "bridge has subordinate %02x but max busn %02x\n", subordinate, cmax); /* subordinate should equal child->busn_res.end */ if (subordinate > max) max = subordinate; } else { /* * We need to assign a number to this bus which we always * do in the second pass. */ if (!pass) { if (pcibios_assign_all_busses() || broken || is_cardbus) /* Temporarily disable forwarding of the configuration cycles on all bridges in this bus segment to avoid possible conflicts in the second pass between two bridges programmed with overlapping bus ranges. */ pci_write_config_dword(dev, PCI_PRIMARY_BUS, buses & ~0xffffff); goto out; } /* Clear errors */ pci_write_config_word(dev, PCI_STATUS, 0xffff); /* Prevent assigning a bus number that already exists. * This can happen when a bridge is hot-plugged, so in * this case we only re-scan this bus. */ child = pci_find_bus(pci_domain_nr(bus), max+1); if (!child) { child = pci_add_new_bus(bus, dev, max+1); if (!child) goto out; pci_bus_insert_busn_res(child, max+1, 0xff); } max++; buses = (buses & 0xff000000) | ((unsigned int)(child->primary) << 0) | ((unsigned int)(child->busn_res.start) << 8) | ((unsigned int)(child->busn_res.end) << 16); /* * yenta.c forces a secondary latency timer of 176. * Copy that behaviour here. */ if (is_cardbus) { buses &= ~0xff000000; buses |= CARDBUS_LATENCY_TIMER << 24; } /* * We need to blast all three values with a single write. */ pci_write_config_dword(dev, PCI_PRIMARY_BUS, buses); if (!is_cardbus) { child->bridge_ctl = bctl; max = pci_scan_child_bus(child); } else { /* * For CardBus bridges, we leave 4 bus numbers * as cards with a PCI-to-PCI bridge can be * inserted later. */ for (i = 0; i < CARDBUS_RESERVE_BUSNR; i++) { struct pci_bus *parent = bus; if (pci_find_bus(pci_domain_nr(bus), max+i+1)) break; while (parent->parent) { if ((!pcibios_assign_all_busses()) && (parent->busn_res.end > max) && (parent->busn_res.end <= max+i)) { j = 1; } parent = parent->parent; } if (j) { /* * Often, there are two cardbus bridges * -- try to leave one valid bus number * for each one. */ i /= 2; break; } } max += i; } /* * Set the subordinate bus number to its real value. */ pci_bus_update_busn_res_end(child, max); pci_write_config_byte(dev, PCI_SUBORDINATE_BUS, max); } sprintf(child->name, (is_cardbus ? "PCI CardBus %04x:%02x" : "PCI Bus %04x:%02x"), pci_domain_nr(bus), child->number); /* Has only triggered on CardBus, fixup is in yenta_socket */ while (bus->parent) { if ((child->busn_res.end > bus->busn_res.end) || (child->number > bus->busn_res.end) || (child->number < bus->number) || (child->busn_res.end < bus->number)) { dev_info(&child->dev, "%pR %s hidden behind%s bridge %s %pR\n", &child->busn_res, (bus->number > child->busn_res.end && bus->busn_res.end < child->number) ? "wholly" : "partially", bus->self->transparent ? " transparent" : "", dev_name(&bus->dev), &bus->busn_res); } bus = bus->parent; } out: pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl); return max; } EXPORT_SYMBOL(pci_scan_bridge); /* * Read interrupt line and base address registers. * The architecture-dependent code can tweak these, of course. */ static void pci_read_irq(struct pci_dev *dev) { unsigned char irq; pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &irq); dev->pin = irq; if (irq) pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq); dev->irq = irq; } void set_pcie_port_type(struct pci_dev *pdev) { int pos; u16 reg16; int type; struct pci_dev *parent; pos = pci_find_capability(pdev, PCI_CAP_ID_EXP); if (!pos) return; pdev->pcie_cap = pos; pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16); pdev->pcie_flags_reg = reg16; pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16); pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD; /* * A Root Port is always the upstream end of a Link. No PCIe * component has two Links. Two Links are connected by a Switch * that has a Port on each Link and internal logic to connect the * two Ports. */ type = pci_pcie_type(pdev); if (type == PCI_EXP_TYPE_ROOT_PORT) pdev->has_secondary_link = 1; else if (type == PCI_EXP_TYPE_UPSTREAM || type == PCI_EXP_TYPE_DOWNSTREAM) { parent = pci_upstream_bridge(pdev); /* * Usually there's an upstream device (Root Port or Switch * Downstream Port), but we can't assume one exists. */ if (parent && !parent->has_secondary_link) pdev->has_secondary_link = 1; } } void set_pcie_hotplug_bridge(struct pci_dev *pdev) { u32 reg32; pcie_capability_read_dword(pdev, PCI_EXP_SLTCAP, &reg32); if (reg32 & PCI_EXP_SLTCAP_HPC) pdev->is_hotplug_bridge = 1; } /** * pci_ext_cfg_is_aliased - is ext config space just an alias of std config? * @dev: PCI device * * PCI Express to PCI/PCI-X Bridge Specification, rev 1.0, 4.1.4 says that * when forwarding a type1 configuration request the bridge must check that * the extended register address field is zero. The bridge is not permitted * to forward the transactions and must handle it as an Unsupported Request. * Some bridges do not follow this rule and simply drop the extended register * bits, resulting in the standard config space being aliased, every 256 * bytes across the entire configuration space. Test for this condition by * comparing the first dword of each potential alias to the vendor/device ID. * Known offenders: * ASM1083/1085 PCIe-to-PCI Reversible Bridge (1b21:1080, rev 01 & 03) * AMD/ATI SBx00 PCI to PCI Bridge (1002:4384, rev 40) */ static bool pci_ext_cfg_is_aliased(struct pci_dev *dev) { #ifdef CONFIG_PCI_QUIRKS int pos; u32 header, tmp; pci_read_config_dword(dev, PCI_VENDOR_ID, &header); for (pos = PCI_CFG_SPACE_SIZE; pos < PCI_CFG_SPACE_EXP_SIZE; pos += PCI_CFG_SPACE_SIZE) { if (pci_read_config_dword(dev, pos, &tmp) != PCIBIOS_SUCCESSFUL || header != tmp) return false; } return true; #else return false; #endif } /** * pci_cfg_space_size - get the configuration space size of the PCI device. * @dev: PCI device * * Regular PCI devices have 256 bytes, but PCI-X 2 and PCI Express devices * have 4096 bytes. Even if the device is capable, that doesn't mean we can * access it. Maybe we don't have a way to generate extended config space * accesses, or the device is behind a reverse Express bridge. So we try * reading the dword at 0x100 which must either be 0 or a valid extended * capability header. */ static int pci_cfg_space_size_ext(struct pci_dev *dev) { u32 status; int pos = PCI_CFG_SPACE_SIZE; if (pci_read_config_dword(dev, pos, &status) != PCIBIOS_SUCCESSFUL) goto fail; if (status == 0xffffffff || pci_ext_cfg_is_aliased(dev)) goto fail; return PCI_CFG_SPACE_EXP_SIZE; fail: return PCI_CFG_SPACE_SIZE; } int pci_cfg_space_size(struct pci_dev *dev) { int pos; u32 status; u16 class; class = dev->class >> 8; if (class == PCI_CLASS_BRIDGE_HOST) return pci_cfg_space_size_ext(dev); if (!pci_is_pcie(dev)) { pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); if (!pos) goto fail; pci_read_config_dword(dev, pos + PCI_X_STATUS, &status); if (!(status & (PCI_X_STATUS_266MHZ | PCI_X_STATUS_533MHZ))) goto fail; } return pci_cfg_space_size_ext(dev); fail: return PCI_CFG_SPACE_SIZE; } #define LEGACY_IO_RESOURCE (IORESOURCE_IO | IORESOURCE_PCI_FIXED) void pci_msi_setup_pci_dev(struct pci_dev *dev) { /* * Disable the MSI hardware to avoid screaming interrupts * during boot. This is the power on reset default so * usually this should be a noop. */ dev->msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI); if (dev->msi_cap) pci_msi_set_enable(dev, 0); dev->msix_cap = pci_find_capability(dev, PCI_CAP_ID_MSIX); if (dev->msix_cap) pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); } /** * pci_setup_device - fill in class and map information of a device * @dev: the device structure to fill * * Initialize the device structure with information about the device's * vendor,class,memory and IO-space addresses,IRQ lines etc. * Called at initialisation of the PCI subsystem and by CardBus services. * Returns 0 on success and negative if unknown type of device (not normal, * bridge or CardBus). */ int pci_setup_device(struct pci_dev *dev) { u32 class; u8 hdr_type; int pos = 0; struct pci_bus_region region; struct resource *res; if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type)) return -EIO; dev->sysdata = dev->bus->sysdata; dev->dev.parent = dev->bus->bridge; dev->dev.bus = &pci_bus_type; dev->hdr_type = hdr_type & 0x7f; dev->multifunction = !!(hdr_type & 0x80); dev->error_state = pci_channel_io_normal; set_pcie_port_type(dev); pci_dev_assign_slot(dev); /* Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer) set this higher, assuming the system even supports it. */ dev->dma_mask = 0xffffffff; dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(dev->bus), dev->bus->number, PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn)); pci_read_config_dword(dev, PCI_CLASS_REVISION, &class); dev->revision = class & 0xff; dev->class = class >> 8; /* upper 3 bytes */ dev_printk(KERN_DEBUG, &dev->dev, "[%04x:%04x] type %02x class %#08x\n", dev->vendor, dev->device, dev->hdr_type, dev->class); /* need to have dev->class ready */ dev->cfg_size = pci_cfg_space_size(dev); /* "Unknown power state" */ dev->current_state = PCI_UNKNOWN; pci_msi_setup_pci_dev(dev); /* Early fixups, before probing the BARs */ pci_fixup_device(pci_fixup_early, dev); /* device class may be changed after fixup */ class = dev->class >> 8; switch (dev->hdr_type) { /* header type */ case PCI_HEADER_TYPE_NORMAL: /* standard header */ if (class == PCI_CLASS_BRIDGE_PCI) goto bad; pci_read_irq(dev); pci_read_bases(dev, 6, PCI_ROM_ADDRESS); pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor); pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device); /* * Do the ugly legacy mode stuff here rather than broken chip * quirk code. Legacy mode ATA controllers have fixed * addresses. These are not always echoed in BAR0-3, and * BAR0-3 in a few cases contain junk! */ if (class == PCI_CLASS_STORAGE_IDE) { u8 progif; pci_read_config_byte(dev, PCI_CLASS_PROG, &progif); if ((progif & 1) == 0) { region.start = 0x1F0; region.end = 0x1F7; res = &dev->resource[0]; res->flags = LEGACY_IO_RESOURCE; pcibios_bus_to_resource(dev->bus, res, &region); dev_info(&dev->dev, "legacy IDE quirk: reg 0x10: %pR\n", res); region.start = 0x3F6; region.end = 0x3F6; res = &dev->resource[1]; res->flags = LEGACY_IO_RESOURCE; pcibios_bus_to_resource(dev->bus, res, &region); dev_info(&dev->dev, "legacy IDE quirk: reg 0x14: %pR\n", res); } if ((progif & 4) == 0) { region.start = 0x170; region.end = 0x177; res = &dev->resource[2]; res->flags = LEGACY_IO_RESOURCE; pcibios_bus_to_resource(dev->bus, res, &region); dev_info(&dev->dev, "legacy IDE quirk: reg 0x18: %pR\n", res); region.start = 0x376; region.end = 0x376; res = &dev->resource[3]; res->flags = LEGACY_IO_RESOURCE; pcibios_bus_to_resource(dev->bus, res, &region); dev_info(&dev->dev, "legacy IDE quirk: reg 0x1c: %pR\n", res); } } break; case PCI_HEADER_TYPE_BRIDGE: /* bridge header */ if (class != PCI_CLASS_BRIDGE_PCI) goto bad; /* The PCI-to-PCI bridge spec requires that subtractive decoding (i.e. transparent) bridge must have programming interface code of 0x01. */ pci_read_irq(dev); dev->transparent = ((dev->class & 0xff) == 1); pci_read_bases(dev, 2, PCI_ROM_ADDRESS1); set_pcie_hotplug_bridge(dev); pos = pci_find_capability(dev, PCI_CAP_ID_SSVID); if (pos) { pci_read_config_word(dev, pos + PCI_SSVID_VENDOR_ID, &dev->subsystem_vendor); pci_read_config_word(dev, pos + PCI_SSVID_DEVICE_ID, &dev->subsystem_device); } break; case PCI_HEADER_TYPE_CARDBUS: /* CardBus bridge header */ if (class != PCI_CLASS_BRIDGE_CARDBUS) goto bad; pci_read_irq(dev); pci_read_bases(dev, 1, 0); pci_read_config_word(dev, PCI_CB_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor); pci_read_config_word(dev, PCI_CB_SUBSYSTEM_ID, &dev->subsystem_device); break; default: /* unknown header */ dev_err(&dev->dev, "unknown header type %02x, ignoring device\n", dev->hdr_type); return -EIO; bad: dev_err(&dev->dev, "ignoring class %#08x (doesn't match header type %02x)\n", dev->class, dev->hdr_type); dev->class = PCI_CLASS_NOT_DEFINED << 8; } /* We found a fine healthy device, go go go... */ return 0; } static void pci_configure_mps(struct pci_dev *dev) { struct pci_dev *bridge = pci_upstream_bridge(dev); int mps, p_mps, rc; if (!pci_is_pcie(dev) || !bridge || !pci_is_pcie(bridge)) return; mps = pcie_get_mps(dev); p_mps = pcie_get_mps(bridge); if (mps == p_mps) return; if (pcie_bus_config == PCIE_BUS_TUNE_OFF) { dev_warn(&dev->dev, "Max Payload Size %d, but upstream %s set to %d; if necessary, use \"pci=pcie_bus_safe\" and report a bug\n", mps, pci_name(bridge), p_mps); return; } /* * Fancier MPS configuration is done later by * pcie_bus_configure_settings() */ if (pcie_bus_config != PCIE_BUS_DEFAULT) return; rc = pcie_set_mps(dev, p_mps); if (rc) { dev_warn(&dev->dev, "can't set Max Payload Size to %d; if necessary, use \"pci=pcie_bus_safe\" and report a bug\n", p_mps); return; } dev_info(&dev->dev, "Max Payload Size set to %d (was %d, max %d)\n", p_mps, mps, 128 << dev->pcie_mpss); } static struct hpp_type0 pci_default_type0 = { .revision = 1, .cache_line_size = 8, .latency_timer = 0x40, .enable_serr = 0, .enable_perr = 0, }; static void program_hpp_type0(struct pci_dev *dev, struct hpp_type0 *hpp) { u16 pci_cmd, pci_bctl; if (!hpp) hpp = &pci_default_type0; if (hpp->revision > 1) { dev_warn(&dev->dev, "PCI settings rev %d not supported; using defaults\n", hpp->revision); hpp = &pci_default_type0; } pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpp->cache_line_size); pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpp->latency_timer); pci_read_config_word(dev, PCI_COMMAND, &pci_cmd); if (hpp->enable_serr) pci_cmd |= PCI_COMMAND_SERR; if (hpp->enable_perr) pci_cmd |= PCI_COMMAND_PARITY; pci_write_config_word(dev, PCI_COMMAND, pci_cmd); /* Program bridge control value */ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { pci_write_config_byte(dev, PCI_SEC_LATENCY_TIMER, hpp->latency_timer); pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &pci_bctl); if (hpp->enable_serr) pci_bctl |= PCI_BRIDGE_CTL_SERR; if (hpp->enable_perr) pci_bctl |= PCI_BRIDGE_CTL_PARITY; pci_write_config_word(dev, PCI_BRIDGE_CONTROL, pci_bctl); } } static void program_hpp_type1(struct pci_dev *dev, struct hpp_type1 *hpp) { if (hpp) dev_warn(&dev->dev, "PCI-X settings not supported\n"); } static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) { int pos; u32 reg32; if (!hpp) return; if (hpp->revision > 1) { dev_warn(&dev->dev, "PCIe settings rev %d not supported\n", hpp->revision); return; } /* * Don't allow _HPX to change MPS or MRRS settings. We manage * those to make sure they're consistent with the rest of the * platform. */ hpp->pci_exp_devctl_and |= PCI_EXP_DEVCTL_PAYLOAD | PCI_EXP_DEVCTL_READRQ; hpp->pci_exp_devctl_or &= ~(PCI_EXP_DEVCTL_PAYLOAD | PCI_EXP_DEVCTL_READRQ); /* Initialize Device Control Register */ pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL, ~hpp->pci_exp_devctl_and, hpp->pci_exp_devctl_or); /* Initialize Link Control Register */ if (pcie_cap_has_lnkctl(dev)) pcie_capability_clear_and_set_word(dev, PCI_EXP_LNKCTL, ~hpp->pci_exp_lnkctl_and, hpp->pci_exp_lnkctl_or); /* Find Advanced Error Reporting Enhanced Capability */ pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); if (!pos) return; /* Initialize Uncorrectable Error Mask Register */ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, &reg32); reg32 = (reg32 & hpp->unc_err_mask_and) | hpp->unc_err_mask_or; pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_MASK, reg32); /* Initialize Uncorrectable Error Severity Register */ pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &reg32); reg32 = (reg32 & hpp->unc_err_sever_and) | hpp->unc_err_sever_or; pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, reg32); /* Initialize Correctable Error Mask Register */ pci_read_config_dword(dev, pos + PCI_ERR_COR_MASK, &reg32); reg32 = (reg32 & hpp->cor_err_mask_and) | hpp->cor_err_mask_or; pci_write_config_dword(dev, pos + PCI_ERR_COR_MASK, reg32); /* Initialize Advanced Error Capabilities and Control Register */ pci_read_config_dword(dev, pos + PCI_ERR_CAP, &reg32); reg32 = (reg32 & hpp->adv_err_cap_and) | hpp->adv_err_cap_or; pci_write_config_dword(dev, pos + PCI_ERR_CAP, reg32); /* * FIXME: The following two registers are not supported yet. * * o Secondary Uncorrectable Error Severity Register * o Secondary Uncorrectable Error Mask Register */ } static void pci_configure_device(struct pci_dev *dev) { struct hotplug_params hpp; int ret; pci_configure_mps(dev); memset(&hpp, 0, sizeof(hpp)); ret = pci_get_hp_params(dev, &hpp); if (ret) return; program_hpp_type2(dev, hpp.t2); program_hpp_type1(dev, hpp.t1); program_hpp_type0(dev, hpp.t0); } static void pci_release_capabilities(struct pci_dev *dev) { pci_vpd_release(dev); pci_iov_release(dev); pci_free_cap_save_buffers(dev); } /** * pci_release_dev - free a pci device structure when all users of it are finished. * @dev: device that's been disconnected * * Will be called only by the device core when all users of this pci device are * done. */ static void pci_release_dev(struct device *dev) { struct pci_dev *pci_dev; pci_dev = to_pci_dev(dev); pci_release_capabilities(pci_dev); pci_release_of_node(pci_dev); pcibios_release_device(pci_dev); pci_bus_put(pci_dev->bus); kfree(pci_dev->driver_override); kfree(pci_dev); } struct pci_dev *pci_alloc_dev(struct pci_bus *bus) { struct pci_dev *dev; dev = kzalloc(sizeof(struct pci_dev), GFP_KERNEL); if (!dev) return NULL; INIT_LIST_HEAD(&dev->bus_list); dev->dev.type = &pci_dev_type; dev->bus = pci_bus_get(bus); return dev; } EXPORT_SYMBOL(pci_alloc_dev); bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *l, int crs_timeout) { int delay = 1; if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, l)) return false; /* some broken boards return 0 or ~0 if a slot is empty: */ if (*l == 0xffffffff || *l == 0x00000000 || *l == 0x0000ffff || *l == 0xffff0000) return false; /* * Configuration Request Retry Status. Some root ports return the * actual device ID instead of the synthetic ID (0xFFFF) required * by the PCIe spec. Ignore the device ID and only check for * (vendor id == 1). */ while ((*l & 0xffff) == 0x0001) { if (!crs_timeout) return false; msleep(delay); delay *= 2; if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, l)) return false; /* Card hasn't responded in 60 seconds? Must be stuck. */ if (delay > crs_timeout) { printk(KERN_WARNING "pci %04x:%02x:%02x.%d: not responding\n", pci_domain_nr(bus), bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn)); return false; } } return true; } EXPORT_SYMBOL(pci_bus_read_dev_vendor_id); /* * Read the config data for a PCI device, sanity-check it * and fill in the dev structure... */ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) { struct pci_dev *dev; u32 l; if (!pci_bus_read_dev_vendor_id(bus, devfn, &l, 60*1000)) return NULL; dev = pci_alloc_dev(bus); if (!dev) return NULL; dev->devfn = devfn; dev->vendor = l & 0xffff; dev->device = (l >> 16) & 0xffff; pci_set_of_node(dev); if (pci_setup_device(dev)) { pci_bus_put(dev->bus); kfree(dev); return NULL; } return dev; } static void pci_init_capabilities(struct pci_dev *dev) { /* Enhanced Allocation */ pci_ea_init(dev); /* MSI/MSI-X list */ pci_msi_init_pci_dev(dev); /* Buffers for saving PCIe and PCI-X capabilities */ pci_allocate_cap_save_buffers(dev); /* Power Management */ pci_pm_init(dev); /* Vital Product Data */ pci_vpd_pci22_init(dev); /* Alternative Routing-ID Forwarding */ pci_configure_ari(dev); /* Single Root I/O Virtualization */ pci_iov_init(dev); /* Address Translation Services */ pci_ats_init(dev); /* Enable ACS P2P upstream forwarding */ pci_enable_acs(dev); pci_cleanup_aer_error_status_regs(dev); } /* * This is the equivalent of pci_host_bridge_msi_domain that acts on * devices. Firmware interfaces that can select the MSI domain on a * per-device basis should be called from here. */ static struct irq_domain *pci_dev_msi_domain(struct pci_dev *dev) { struct irq_domain *d; /* * If a domain has been set through the pcibios_add_device * callback, then this is the one (platform code knows best). */ d = dev_get_msi_domain(&dev->dev); if (d) return d; /* * Let's see if we have a firmware interface able to provide * the domain. */ d = pci_msi_get_device_domain(dev); if (d) return d; return NULL; } static void pci_set_msi_domain(struct pci_dev *dev) { struct irq_domain *d; /* * If the platform or firmware interfaces cannot supply a * device-specific MSI domain, then inherit the default domain * from the host bridge itself. */ d = pci_dev_msi_domain(dev); if (!d) d = dev_get_msi_domain(&dev->bus->dev); dev_set_msi_domain(&dev->dev, d); } /** * pci_dma_configure - Setup DMA configuration * @dev: ptr to pci_dev struct of the PCI device * * Function to update PCI devices's DMA configuration using the same * info from the OF node or ACPI node of host bridge's parent (if any). */ static void pci_dma_configure(struct pci_dev *dev) { struct device *bridge = pci_get_host_bridge_device(dev); if (IS_ENABLED(CONFIG_OF) && bridge->parent && bridge->parent->of_node) { of_dma_configure(&dev->dev, bridge->parent->of_node); } else if (has_acpi_companion(bridge)) { struct acpi_device *adev = to_acpi_device_node(bridge->fwnode); enum dev_dma_attr attr = acpi_get_dma_attr(adev); if (attr == DEV_DMA_NOT_SUPPORTED) dev_warn(&dev->dev, "DMA not supported.\n"); else arch_setup_dma_ops(&dev->dev, 0, 0, NULL, attr == DEV_DMA_COHERENT); } pci_put_host_bridge_device(bridge); } void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) { int ret; pci_configure_device(dev); device_initialize(&dev->dev); dev->dev.release = pci_release_dev; set_dev_node(&dev->dev, pcibus_to_node(bus)); dev->dev.dma_mask = &dev->dma_mask; dev->dev.dma_parms = &dev->dma_parms; dev->dev.coherent_dma_mask = 0xffffffffull; pci_dma_configure(dev); pci_set_dma_max_seg_size(dev, 65536); pci_set_dma_seg_boundary(dev, 0xffffffff); /* Fix up broken headers */ pci_fixup_device(pci_fixup_header, dev); /* moved out from quirk header fixup code */ pci_reassigndev_resource_alignment(dev); /* Clear the state_saved flag. */ dev->state_saved = false; /* Initialize various capabilities */ pci_init_capabilities(dev); /* * Add the device to our list of discovered devices * and the bus list for fixup functions, etc. */ down_write(&pci_bus_sem); list_add_tail(&dev->bus_list, &bus->devices); up_write(&pci_bus_sem); ret = pcibios_add_device(dev); WARN_ON(ret < 0); /* Setup MSI irq domain */ pci_set_msi_domain(dev); /* Notifier could use PCI capabilities */ dev->match_driver = false; ret = device_add(&dev->dev); WARN_ON(ret < 0); } struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn) { struct pci_dev *dev; dev = pci_get_slot(bus, devfn); if (dev) { pci_dev_put(dev); return dev; } dev = pci_scan_device(bus, devfn); if (!dev) return NULL; pci_device_add(dev, bus); return dev; } EXPORT_SYMBOL(pci_scan_single_device); static unsigned next_fn(struct pci_bus *bus, struct pci_dev *dev, unsigned fn) { int pos; u16 cap = 0; unsigned next_fn; if (pci_ari_enabled(bus)) { if (!dev) return 0; pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ARI); if (!pos) return 0; pci_read_config_word(dev, pos + PCI_ARI_CAP, &cap); next_fn = PCI_ARI_CAP_NFN(cap); if (next_fn <= fn) return 0; /* protect against malformed list */ return next_fn; } /* dev may be NULL for non-contiguous multifunction devices */ if (!dev || dev->multifunction) return (fn + 1) % 8; return 0; } static int only_one_child(struct pci_bus *bus) { struct pci_dev *parent = bus->self; if (!parent || !pci_is_pcie(parent)) return 0; if (pci_pcie_type(parent) == PCI_EXP_TYPE_ROOT_PORT) return 1; if (parent->has_secondary_link && !pci_has_flag(PCI_SCAN_ALL_PCIE_DEVS)) return 1; return 0; } /** * pci_scan_slot - scan a PCI slot on a bus for devices. * @bus: PCI bus to scan * @devfn: slot number to scan (must have zero function.) * * Scan a PCI slot on the specified PCI bus for devices, adding * discovered devices to the @bus->devices list. New devices * will not have is_added set. * * Returns the number of new devices found. */ int pci_scan_slot(struct pci_bus *bus, int devfn) { unsigned fn, nr = 0; struct pci_dev *dev; if (only_one_child(bus) && (devfn > 0)) return 0; /* Already scanned the entire slot */ dev = pci_scan_single_device(bus, devfn); if (!dev) return 0; if (!dev->is_added) nr++; for (fn = next_fn(bus, dev, 0); fn > 0; fn = next_fn(bus, dev, fn)) { dev = pci_scan_single_device(bus, devfn + fn); if (dev) { if (!dev->is_added) nr++; dev->multifunction = 1; } } /* only one slot has pcie device */ if (bus->self && nr) pcie_aspm_init_link_state(bus->self); return nr; } EXPORT_SYMBOL(pci_scan_slot); static int pcie_find_smpss(struct pci_dev *dev, void *data) { u8 *smpss = data; if (!pci_is_pcie(dev)) return 0; /* * We don't have a way to change MPS settings on devices that have * drivers attached. A hot-added device might support only the minimum * MPS setting (MPS=128). Therefore, if the fabric contains a bridge * where devices may be hot-added, we limit the fabric MPS to 128 so * hot-added devices will work correctly. * * However, if we hot-add a device to a slot directly below a Root * Port, it's impossible for there to be other existing devices below * the port. We don't limit the MPS in this case because we can * reconfigure MPS on both the Root Port and the hot-added device, * and there are no other devices involved. * * Note that this PCIE_BUS_SAFE path assumes no peer-to-peer DMA. */ if (dev->is_hotplug_bridge && pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT) *smpss = 0; if (*smpss > dev->pcie_mpss) *smpss = dev->pcie_mpss; return 0; } static void pcie_write_mps(struct pci_dev *dev, int mps) { int rc; if (pcie_bus_config == PCIE_BUS_PERFORMANCE) { mps = 128 << dev->pcie_mpss; if (pci_pcie_type(dev) != PCI_EXP_TYPE_ROOT_PORT && dev->bus->self) /* For "Performance", the assumption is made that * downstream communication will never be larger than * the MRRS. So, the MPS only needs to be configured * for the upstream communication. This being the case, * walk from the top down and set the MPS of the child * to that of the parent bus. * * Configure the device MPS with the smaller of the * device MPSS or the bridge MPS (which is assumed to be * properly configured at this point to the largest * allowable MPS based on its parent bus). */ mps = min(mps, pcie_get_mps(dev->bus->self)); } rc = pcie_set_mps(dev, mps); if (rc) dev_err(&dev->dev, "Failed attempting to set the MPS\n"); } static void pcie_write_mrrs(struct pci_dev *dev) { int rc, mrrs; /* In the "safe" case, do not configure the MRRS. There appear to be * issues with setting MRRS to 0 on a number of devices. */ if (pcie_bus_config != PCIE_BUS_PERFORMANCE) return; /* For Max performance, the MRRS must be set to the largest supported * value. However, it cannot be configured larger than the MPS the * device or the bus can support. This should already be properly * configured by a prior call to pcie_write_mps. */ mrrs = pcie_get_mps(dev); /* MRRS is a R/W register. Invalid values can be written, but a * subsequent read will verify if the value is acceptable or not. * If the MRRS value provided is not acceptable (e.g., too large), * shrink the value until it is acceptable to the HW. */ while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) { rc = pcie_set_readrq(dev, mrrs); if (!rc) break; dev_warn(&dev->dev, "Failed attempting to set the MRRS\n"); mrrs /= 2; } if (mrrs < 128) dev_err(&dev->dev, "MRRS was unable to be configured with a safe value. If problems are experienced, try running with pci=pcie_bus_safe\n"); } static int pcie_bus_configure_set(struct pci_dev *dev, void *data) { int mps, orig_mps; if (!pci_is_pcie(dev)) return 0; if (pcie_bus_config == PCIE_BUS_TUNE_OFF || pcie_bus_config == PCIE_BUS_DEFAULT) return 0; mps = 128 << *(u8 *)data; orig_mps = pcie_get_mps(dev); pcie_write_mps(dev, mps); pcie_write_mrrs(dev); dev_info(&dev->dev, "Max Payload Size set to %4d/%4d (was %4d), Max Read Rq %4d\n", pcie_get_mps(dev), 128 << dev->pcie_mpss, orig_mps, pcie_get_readrq(dev)); return 0; } /* pcie_bus_configure_settings requires that pci_walk_bus work in a top-down, * parents then children fashion. If this changes, then this code will not * work as designed. */ void pcie_bus_configure_settings(struct pci_bus *bus) { u8 smpss = 0; if (!bus->self) return; if (!pci_is_pcie(bus->self)) return; /* FIXME - Peer to peer DMA is possible, though the endpoint would need * to be aware of the MPS of the destination. To work around this, * simply force the MPS of the entire system to the smallest possible. */ if (pcie_bus_config == PCIE_BUS_PEER2PEER) smpss = 0; if (pcie_bus_config == PCIE_BUS_SAFE) { smpss = bus->self->pcie_mpss; pcie_find_smpss(bus->self, &smpss); pci_walk_bus(bus, pcie_find_smpss, &smpss); } pcie_bus_configure_set(bus->self, &smpss); pci_walk_bus(bus, pcie_bus_configure_set, &smpss); } EXPORT_SYMBOL_GPL(pcie_bus_configure_settings); unsigned int pci_scan_child_bus(struct pci_bus *bus) { unsigned int devfn, pass, max = bus->busn_res.start; struct pci_dev *dev; dev_dbg(&bus->dev, "scanning bus\n"); /* Go find them, Rover! */ for (devfn = 0; devfn < 0x100; devfn += 8) pci_scan_slot(bus, devfn); /* Reserve buses for SR-IOV capability. */ max += pci_iov_bus_range(bus); /* * After performing arch-dependent fixup of the bus, look behind * all PCI-to-PCI bridges on this bus. */ if (!bus->is_added) { dev_dbg(&bus->dev, "fixups for bus\n"); pcibios_fixup_bus(bus); bus->is_added = 1; } for (pass = 0; pass < 2; pass++) list_for_each_entry(dev, &bus->devices, bus_list) { if (pci_is_bridge(dev)) max = pci_scan_bridge(bus, dev, max, pass); } /* * We've scanned the bus and so we know all about what's on * the other side of any bridges that may be on this bus plus * any devices. * * Return how far we've got finding sub-buses. */ dev_dbg(&bus->dev, "bus scan returning with max=%02x\n", max); return max; } EXPORT_SYMBOL_GPL(pci_scan_child_bus); /** * pcibios_root_bridge_prepare - Platform-specific host bridge setup. * @bridge: Host bridge to set up. * * Default empty implementation. Replace with an architecture-specific setup * routine, if necessary. */ int __weak pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) { return 0; } void __weak pcibios_add_bus(struct pci_bus *bus) { } void __weak pcibios_remove_bus(struct pci_bus *bus) { } struct pci_bus *pci_create_root_bus(struct device *parent, int bus, struct pci_ops *ops, void *sysdata, struct list_head *resources) { int error; struct pci_host_bridge *bridge; struct pci_bus *b, *b2; struct resource_entry *window, *n; struct resource *res; resource_size_t offset; char bus_addr[64]; char *fmt; b = pci_alloc_bus(NULL); if (!b) return NULL; b->sysdata = sysdata; b->ops = ops; b->number = b->busn_res.start = bus; pci_bus_assign_domain_nr(b, parent); b2 = pci_find_bus(pci_domain_nr(b), bus); if (b2) { /* If we already got to this bus through a different bridge, ignore it */ dev_dbg(&b2->dev, "bus already known\n"); goto err_out; } bridge = pci_alloc_host_bridge(b); if (!bridge) goto err_out; bridge->dev.parent = parent; bridge->dev.release = pci_release_host_bridge_dev; dev_set_name(&bridge->dev, "pci%04x:%02x", pci_domain_nr(b), bus); error = pcibios_root_bridge_prepare(bridge); if (error) { kfree(bridge); goto err_out; } error = device_register(&bridge->dev); if (error) { put_device(&bridge->dev); goto err_out; } b->bridge = get_device(&bridge->dev); device_enable_async_suspend(b->bridge); pci_set_bus_of_node(b); pci_set_bus_msi_domain(b); if (!parent) set_dev_node(b->bridge, pcibus_to_node(b)); b->dev.class = &pcibus_class; b->dev.parent = b->bridge; dev_set_name(&b->dev, "%04x:%02x", pci_domain_nr(b), bus); error = device_register(&b->dev); if (error) goto class_dev_reg_err; pcibios_add_bus(b); /* Create legacy_io and legacy_mem files for this bus */ pci_create_legacy_files(b); if (parent) dev_info(parent, "PCI host bridge to bus %s\n", dev_name(&b->dev)); else printk(KERN_INFO "PCI host bridge to bus %s\n", dev_name(&b->dev)); /* Add initial resources to the bus */ resource_list_for_each_entry_safe(window, n, resources) { list_move_tail(&window->node, &bridge->windows); res = window->res; offset = window->offset; if (res->flags & IORESOURCE_BUS) pci_bus_insert_busn_res(b, bus, res->end); else pci_bus_add_resource(b, res, 0); if (offset) { if (resource_type(res) == IORESOURCE_IO) fmt = " (bus address [%#06llx-%#06llx])"; else fmt = " (bus address [%#010llx-%#010llx])"; snprintf(bus_addr, sizeof(bus_addr), fmt, (unsigned long long) (res->start - offset), (unsigned long long) (res->end - offset)); } else bus_addr[0] = '\0'; dev_info(&b->dev, "root bus resource %pR%s\n", res, bus_addr); } down_write(&pci_bus_sem); list_add_tail(&b->node, &pci_root_buses); up_write(&pci_bus_sem); return b; class_dev_reg_err: put_device(&bridge->dev); device_unregister(&bridge->dev); err_out: kfree(b); return NULL; } EXPORT_SYMBOL_GPL(pci_create_root_bus); int pci_bus_insert_busn_res(struct pci_bus *b, int bus, int bus_max) { struct resource *res = &b->busn_res; struct resource *parent_res, *conflict; res->start = bus; res->end = bus_max; res->flags = IORESOURCE_BUS; if (!pci_is_root_bus(b)) parent_res = &b->parent->busn_res; else { parent_res = get_pci_domain_busn_res(pci_domain_nr(b)); res->flags |= IORESOURCE_PCI_FIXED; } conflict = request_resource_conflict(parent_res, res); if (conflict) dev_printk(KERN_DEBUG, &b->dev, "busn_res: can not insert %pR under %s%pR (conflicts with %s %pR)\n", res, pci_is_root_bus(b) ? "domain " : "", parent_res, conflict->name, conflict); return conflict == NULL; } int pci_bus_update_busn_res_end(struct pci_bus *b, int bus_max) { struct resource *res = &b->busn_res; struct resource old_res = *res; resource_size_t size; int ret; if (res->start > bus_max) return -EINVAL; size = bus_max - res->start + 1; ret = adjust_resource(res, res->start, size); dev_printk(KERN_DEBUG, &b->dev, "busn_res: %pR end %s updated to %02x\n", &old_res, ret ? "can not be" : "is", bus_max); if (!ret && !res->parent) pci_bus_insert_busn_res(b, res->start, res->end); return ret; } void pci_bus_release_busn_res(struct pci_bus *b) { struct resource *res = &b->busn_res; int ret; if (!res->flags || !res->parent) return; ret = release_resource(res); dev_printk(KERN_DEBUG, &b->dev, "busn_res: %pR %s released\n", res, ret ? "can not be" : "is"); } struct pci_bus *pci_scan_root_bus_msi(struct device *parent, int bus, struct pci_ops *ops, void *sysdata, struct list_head *resources, struct msi_controller *msi) { struct resource_entry *window; bool found = false; struct pci_bus *b; int max; resource_list_for_each_entry(window, resources) if (window->res->flags & IORESOURCE_BUS) { found = true; break; } b = pci_create_root_bus(parent, bus, ops, sysdata, resources); if (!b) return NULL; b->msi = msi; if (!found) { dev_info(&b->dev, "No busn resource found for root bus, will use [bus %02x-ff]\n", bus); pci_bus_insert_busn_res(b, bus, 255); } max = pci_scan_child_bus(b); if (!found) pci_bus_update_busn_res_end(b, max); return b; } struct pci_bus *pci_scan_root_bus(struct device *parent, int bus, struct pci_ops *ops, void *sysdata, struct list_head *resources) { return pci_scan_root_bus_msi(parent, bus, ops, sysdata, resources, NULL); } EXPORT_SYMBOL(pci_scan_root_bus); struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops, void *sysdata) { LIST_HEAD(resources); struct pci_bus *b; pci_add_resource(&resources, &ioport_resource); pci_add_resource(&resources, &iomem_resource); pci_add_resource(&resources, &busn_resource); b = pci_create_root_bus(NULL, bus, ops, sysdata, &resources); if (b) { pci_scan_child_bus(b); } else { pci_free_resource_list(&resources); } return b; } EXPORT_SYMBOL(pci_scan_bus); /** * pci_rescan_bus_bridge_resize - scan a PCI bus for devices. * @bridge: PCI bridge for the bus to scan * * Scan a PCI bus and child buses for new devices, add them, * and enable them, resizing bridge mmio/io resource if necessary * and possible. The caller must ensure the child devices are already * removed for resizing to occur. * * Returns the max number of subordinate bus discovered. */ unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge) { unsigned int max; struct pci_bus *bus = bridge->subordinate; max = pci_scan_child_bus(bus); pci_assign_unassigned_bridge_resources(bridge); pci_bus_add_devices(bus); return max; } /** * pci_rescan_bus - scan a PCI bus for devices. * @bus: PCI bus to scan * * Scan a PCI bus and child buses for new devices, adds them, * and enables them. * * Returns the max number of subordinate bus discovered. */ unsigned int pci_rescan_bus(struct pci_bus *bus) { unsigned int max; max = pci_scan_child_bus(bus); pci_assign_unassigned_bus_resources(bus); pci_bus_add_devices(bus); return max; } EXPORT_SYMBOL_GPL(pci_rescan_bus); /* * pci_rescan_bus(), pci_rescan_bus_bridge_resize() and PCI device removal * routines should always be executed under this mutex. */ static DEFINE_MUTEX(pci_rescan_remove_lock); void pci_lock_rescan_remove(void) { mutex_lock(&pci_rescan_remove_lock); } EXPORT_SYMBOL_GPL(pci_lock_rescan_remove); void pci_unlock_rescan_remove(void) { mutex_unlock(&pci_rescan_remove_lock); } EXPORT_SYMBOL_GPL(pci_unlock_rescan_remove); static int __init pci_sort_bf_cmp(const struct device *d_a, const struct device *d_b) { const struct pci_dev *a = to_pci_dev(d_a); const struct pci_dev *b = to_pci_dev(d_b); if (pci_domain_nr(a->bus) < pci_domain_nr(b->bus)) return -1; else if (pci_domain_nr(a->bus) > pci_domain_nr(b->bus)) return 1; if (a->bus->number < b->bus->number) return -1; else if (a->bus->number > b->bus->number) return 1; if (a->devfn < b->devfn) return -1; else if (a->devfn > b->devfn) return 1; return 0; } void __init pci_sort_breadthfirst(void) { bus_sort_breadthfirst(&pci_bus_type, &pci_sort_bf_cmp); }